From 877674b72c5aef7a5f28c7fd5b9f5e861e0cc529 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 16:49:45 +0800
Subject: [PATCH 001/132] Add PyTorch NPU full test CI workflow

- Add Dockerfile for pytorch-npu-builder with CANN 9.0.0-beta.2
- Add build-docker-image.yml for scheduled/manual image build
- Add _build.yml for PyTorch and torch_npu compilation
- Add _collect.yml for pytest case collection and sharding
- Add _test.yml for test execution with subprocess isolation
- Add npu-full-test.yml as main orchestration workflow
- Add scripts: collect_all_cases.py, run_npu_test_shard.py, generate_report.py
- Add CLAUDE.md with complete design documentation

Key features:
- Docker image pass-through via needs.build.outputs.docker-image
- Case-level sharding for load balancing
- Per-case subprocess execution for crash isolation
- Distributed (serial) vs Regular (32 workers) test execution

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/docker/pytorch-npu-builder.Dockerfile |  28 +
 .github/scripts/collect_all_cases.py          | 208 +++++
 .github/scripts/generate_report.py            | 172 ++++
 .github/scripts/run_npu_test_shard.py         | 203 +++++
 .github/workflows/_build.yml                  | 188 ++--
 .github/workflows/_collect.yml                | 104 +++
 .github/workflows/_test.yml                   |  87 ++
 .github/workflows/build-docker-image.yml      |  62 ++
 .github/workflows/npu-full-test.yml           | 125 +++
 CLAUDE.md                                     | 857 ++++++++++++++++++
 10 files changed, 1951 insertions(+), 83 deletions(-)
 create mode 100644 .github/docker/pytorch-npu-builder.Dockerfile
 create mode 100644 .github/scripts/collect_all_cases.py
 create mode 100644 .github/scripts/generate_report.py
 create mode 100644 .github/scripts/run_npu_test_shard.py
 create mode 100644 .github/workflows/_collect.yml
 create mode 100644 .github/workflows/_test.yml
 create mode 100644 .github/workflows/build-docker-image.yml
 create mode 100644 .github/workflows/npu-full-test.yml
 create mode 100644 CLAUDE.md

diff --git a/.github/docker/pytorch-npu-builder.Dockerfile b/.github/docker/pytorch-npu-builder.Dockerfile
new file mode 100644
index 0000000000..51faaa64d9
--- /dev/null
+++ b/.github/docker/pytorch-npu-builder.Dockerfile
@@ -0,0 +1,28 @@
+# 基于 PyTorch manylinux builder 镜像
+FROM ghcr.io/pytorch/manylinux-builder:aarch64
+
+# 设置工作目录
+WORKDIR /root
+
+# 安装 CANN 9.0.0-beta.2
+RUN mkdir -p cann && cd cann && \
+    curl -O https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-toolkit_9.0.0-beta.2_linux-aarch64.run && \
+    curl -O https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-A3-ops_9.0.0-beta.2_linux-aarch64.run && \
+    curl -O https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-nnal_9.0.0-beta.2_linux-aarch64.run && \
+    chmod +x Ascend-cann*.run && \
+    ./Ascend-cann-toolkit*.run --full --quiet --install-path=/usr/local/Ascend && \
+    ./Ascend-cann-A3*.run --install --quiet --install-path=/usr/local/Ascend && \
+    ./Ascend-cann-nnal*.run --install --quiet --install-path=/usr/local/Ascend && \
+    rm -rf cann
+
+# 设置环境变量
+ENV CANN_PATH=/usr/local/Ascend/cann
+ENV NNAL_PATH=/usr/local/Ascend/nnal
+ENV ASCEND_HOME=/usr/local/Ascend
+
+# 添加 CANN 环境初始化脚本
+RUN printf '#!/bin/bash\nsource /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true\nsource /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true\n' > /etc/profile.d/cann_env.sh && \
+    chmod +x /etc/profile.d/cann_env.sh
+
+# 预安装 pytest 等测试依赖
+RUN pip3.11 install pytest pytest-timeout pytest-xdist hypothesis pyyaml zstandard
\ No newline at end of file
diff --git a/.github/scripts/collect_all_cases.py b/.github/scripts/collect_all_cases.py
new file mode 100644
index 0000000000..2b0e3914b7
--- /dev/null
+++ b/.github/scripts/collect_all_cases.py
@@ -0,0 +1,208 @@
+#!/usr/bin/env python3
+"""
+Collect all test cases from PyTorch test directory and shard them.
+
+This script scans the test directory, collects all test cases using pytest --collect-only,
+classifies them as distributed or regular tests, and shards them for parallel execution.
+"""
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import Dict, List, Set, Tuple
+
+
+DISTRIBUTED_TEST_DIRS = [
+    "distributed",
+    "distributed/algorithms/nn",
+]
+
+
+def is_distributed_test(test_file: str) -> bool:
+    """Check if a test file is in distributed test directories."""
+    for dir_prefix in DISTRIBUTED_TEST_DIRS:
+        if test_file.startswith(dir_prefix):
+            return True
+    return False
+
+
+def collect_cases_from_file(test_dir: Path, test_file: str, parallel: int = 1) -> List[str]:
+    """Collect test cases from a single test file using pytest --collect-only."""
+    full_path = test_dir / test_file
+    if not full_path.exists():
+        return []
+
+    try:
+        result = subprocess.run(
+            ["pytest", "--collect-only", "-q", str(full_path)],
+            capture_output=True,
+            text=True,
+            timeout=60,
+            cwd=str(test_dir),
+            env={**os.environ, "PYTEST_ADDOPTS": ""}
+        )
+
+        cases = []
+        for line in result.stdout.splitlines():
+            # Parse pytest output format: "test_file.py::TestClass::test_method"
+            if "::" in line and not line.startswith("="):
+                case_id = line.strip()
+                if case_id and not case_id.startswith("<"):
+                    cases.append(case_id)
+
+        return cases
+    except subprocess.TimeoutExpired:
+        print(f"Timeout collecting cases from {test_file}")
+        return []
+    except Exception as e:
+        print(f"Error collecting cases from {test_file}: {e}")
+        return []
+
+
+def discover_test_files(test_dir: Path) -> List[str]:
+    """Discover all test_*.py files in the test directory."""
+    test_files = []
+    for py_file in test_dir.rglob("test_*.py"):
+        rel_path = str(py_file.relative_to(test_dir))
+        test_files.append(rel_path)
+    return sorted(test_files)
+
+
+def shard_cases(cases: List[str], num_shards: int) -> List[List[str]]:
+    """Shard cases evenly across shards."""
+    shards = [[] for _ in range(num_shards)]
+    for i, case in enumerate(cases):
+        shard_idx = i % num_shards
+        shards[shard_idx].append(case)
+    return shards
+
+
+def collect_all_cases(
+    test_dir: str,
+    distributed_shards: int,
+    regular_shards: int,
+    output_dir: str,
+    parallel: int = 1
+) -> Dict:
+    """Collect all test cases and shard them."""
+    test_dir_path = Path(test_dir)
+    if not test_dir_path.exists():
+        raise FileNotFoundError(f"Test directory not found: {test_dir}")
+
+    output_dir_path = Path(output_dir)
+    output_dir_path.mkdir(parents=True, exist_ok=True)
+
+    print(f"Discovering test files in {test_dir}...")
+    test_files = discover_test_files(test_dir_path)
+    print(f"Found {len(test_files)} test files")
+
+    distributed_files = [f for f in test_files if is_distributed_test(f)]
+    regular_files = [f for f in test_files if not is_distributed_test(f)]
+
+    print(f"Distributed test files: {len(distributed_files)}")
+    print(f"Regular test files: {len(regular_files)}")
+
+    # Collect cases in parallel
+    print("Collecting distributed cases...")
+    distributed_cases = []
+    with ThreadPoolExecutor(max_workers=parallel) as executor:
+        futures = {
+            executor.submit(collect_cases_from_file, test_dir_path, f, parallel): f
+            for f in distributed_files
+        }
+        for future in as_completed(futures):
+            file = futures[future]
+            cases = future.result()
+            distributed_cases.extend(cases)
+
+    print("Collecting regular cases...")
+    regular_cases = []
+    with ThreadPoolExecutor(max_workers=parallel) as executor:
+        futures = {
+            executor.submit(collect_cases_from_file, test_dir_path, f, parallel): f
+            for f in regular_files
+        }
+        for future in as_completed(futures):
+            file = futures[future]
+            cases = future.result()
+            regular_cases.extend(cases)
+
+    print(f"Total distributed cases: {len(distributed_cases)}")
+    print(f"Total regular cases: {len(regular_cases)}")
+
+    # Shard cases
+    distributed_sharded = shard_cases(distributed_cases, distributed_shards)
+    regular_sharded = shard_cases(regular_cases, regular_shards)
+
+    # Save shards to JSON files
+    for i, shard in enumerate(distributed_sharded, 1):
+        shard_file = output_dir_path / f"distributed_cases_shard_{i}.json"
+        with open(shard_file, "w") as f:
+            json.dump({
+                "shard_index": i,
+                "total_shards": distributed_shards,
+                "cases": shard,
+                "count": len(shard)
+            }, f, indent=2)
+        print(f"Saved distributed shard {i} with {len(shard)} cases to {shard_file}")
+
+    for i, shard in enumerate(regular_sharded, 1):
+        shard_file = output_dir_path / f"regular_cases_shard_{i}.json"
+        with open(shard_file, "w") as f:
+            json.dump({
+                "shard_index": i,
+                "total_shards": regular_shards,
+                "cases": shard,
+                "count": len(shard)
+            }, f, indent=2)
+        print(f"Saved regular shard {i} with {len(shard)} cases to {shard_file}")
+
+    # Save summary
+    summary = {
+        "total_cases": len(distributed_cases) + len(regular_cases),
+        "distributed_cases": len(distributed_cases),
+        "regular_cases": len(regular_cases),
+        "distributed_shards": distributed_shards,
+        "regular_shards": regular_shards,
+        "distributed_files": len(distributed_files),
+        "regular_files": len(regular_files),
+    }
+
+    summary_file = output_dir_path / "cases_collection_summary.json"
+    with open(summary_file, "w") as f:
+        json.dump(summary, f, indent=2)
+    print(f"Saved summary to {summary_file}")
+
+    return summary
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Collect and shard PyTorch test cases")
+    parser.add_argument("--test-dir", required=True, help="PyTorch test directory path")
+    parser.add_argument("--distributed-shards", type=int, default=2, help="Number of distributed test shards")
+    parser.add_argument("--regular-shards", type=int, default=5, help="Number of regular test shards")
+    parser.add_argument("--output-dir", required=True, help="Output directory for shard JSON files")
+    parser.add_argument("--parallel", type=int, default=1, help="Number of parallel collectors")
+
+    args = parser.parse_args()
+
+    summary = collect_all_cases(
+        test_dir=args.test_dir,
+        distributed_shards=args.distributed_shards,
+        regular_shards=args.regular_shards,
+        output_dir=args.output_dir,
+        parallel=args.parallel
+    )
+
+    print("\nCollection Summary:")
+    print(f"  Total cases: {summary['total_cases']}")
+    print(f"  Distributed cases: {summary['distributed_cases']} ({summary['distributed_shards']} shards)")
+    print(f"  Regular cases: {summary['regular_cases']} ({summary['regular_shards']} shards)")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/.github/scripts/generate_report.py b/.github/scripts/generate_report.py
new file mode 100644
index 0000000000..f96a5dce8c
--- /dev/null
+++ b/.github/scripts/generate_report.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+"""
+Generate consolidated test report from all shard results.
+
+Reads all shard result JSON files and generates a Markdown summary report
+and a detailed JSON report.
+"""
+
+import argparse
+import json
+import os
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List
+
+
+def load_shard_results(reports_root: str) -> List[Dict]:
+    """Load all shard result JSON files from the reports directory."""
+    reports_path = Path(reports_root)
+    if not reports_path.exists():
+        raise FileNotFoundError(f"Reports directory not found: {reports_root}")
+
+    results = []
+    for result_file in reports_path.rglob("*_results.json"):
+        with open(result_file) as f:
+            data = json.load(f)
+            results.append(data)
+
+    return results
+
+
+def aggregate_stats(shard_results: List[Dict]) -> Dict:
+    """Aggregate statistics from all shard results."""
+    total_stats = {
+        "passed": 0,
+        "failed": 0,
+        "error": 0,
+        "skipped": 0,
+        "timeout": 0,
+        "crashed": 0,
+        "xfail": 0,
+        "xpass": 0,
+        "unknown": 0,
+    }
+
+    total_cases = 0
+    for shard in shard_results:
+        total_cases += shard.get("total_cases", 0)
+        stats = shard.get("stats", {})
+        for key in total_stats:
+            total_stats[key] += stats.get(key, 0)
+
+    total_stats["total_cases"] = total_cases
+    total_stats["pass_rate"] = (
+        round(total_stats["passed"] / total_cases * 100, 2) if total_cases > 0 else 0
+    )
+
+    return total_stats
+
+
+def generate_markdown_report(stats: Dict, shard_results: List[Dict]) -> str:
+    """Generate a Markdown summary report."""
+    lines = [
+        "# PyTorch NPU Test Report",
+        "",
+        f"**Generated:** {stats.get('timestamp', 'N/A')}",
+        "",
+        "## Summary",
+        "",
+        "| Metric | Count |",
+        "|--------|-------|",
+        f"| Total Cases | {stats['total_cases']} |",
+        f"| Passed | {stats['passed']} |",
+        f"| Failed | {stats['failed']} |",
+        f"| Error | {stats['error']} |",
+        f"| Timeout | {stats['timeout']} |",
+        f"| Crashed | {stats['crashed']} |",
+        f"| Skipped | {stats['skipped']} |",
+        f"| XFail | {stats['xfail']} |",
+        f"| XPass | {stats['xpass']} |",
+        f"| Pass Rate | {stats['pass_rate']}% |",
+        "",
+        "## Shard Details",
+        "",
+    ]
+
+    for shard in sorted(shard_results, key=lambda x: x.get("shard_index", 0)):
+        shard_idx = shard.get("shard_index", "?")
+        shard_stats = shard.get("stats", {})
+        lines.append(f"### Shard {shard_idx}")
+        lines.append("")
+        lines.append(f"- Total cases: {shard.get('total_cases', 0)}")
+        lines.append(f"- Passed: {shard_stats.get('passed', 0)}")
+        lines.append(f"- Failed: {shard_stats.get('failed', 0)}")
+        lines.append(f"- Error: {shard_stats.get('error', 0)}")
+        lines.append(f"- Timeout: {shard_stats.get('timeout', 0)}")
+        lines.append(f"- Crashed: {shard_stats.get('crashed', 0)}")
+        lines.append("")
+
+    # Add failed cases section
+    failed_cases = []
+    for shard in shard_results:
+        for result in shard.get("results", []):
+            if result.get("status") in ["failed", "error", "timeout", "crashed"]:
+                failed_cases.append({
+                    "case_id": result.get("case_id", "?"),
+                    "status": result.get("status", "?"),
+                    "duration": result.get("duration", 0),
+                })
+
+    if failed_cases:
+        lines.append("## Failed Cases")
+        lines.append("")
+        lines.append("| Case ID | Status | Duration |")
+        lines.append("|---------|--------|----------|")
+        for case in failed_cases[:100]:  # Limit to first 100 for readability
+            lines.append(f"| {case['case_id']} | {case['status']} | {case['duration']}s |")
+
+        if len(failed_cases) > 100:
+            lines.append(f"\n*Showing first 100 of {len(failed_cases)} failed cases*")
+
+    return "\n".join(lines)
+
+
+def generate_json_report(stats: Dict, shard_results: List[Dict]) -> Dict:
+    """Generate a detailed JSON report."""
+    report = {
+        "summary": stats,
+        "shards": shard_results,
+        "failed_cases": [],
+    }
+
+    for shard in shard_results:
+        for result in shard.get("results", []):
+            if result.get("status") in ["failed", "error", "timeout", "crashed"]:
+                report["failed_cases"].append(result)
+
+    return report
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate consolidated test report")
+    parser.add_argument("--reports-root", required=True, help="Root directory with shard results")
+    parser.add_argument("--output-markdown", required=True, help="Output Markdown file path")
+    parser.add_argument("--output-json", required=True, help="Output JSON file path")
+
+    args = parser.parse_args()
+
+    shard_results = load_shard_results(args.reports_root)
+    print(f"Loaded {len(shard_results)} shard results")
+
+    stats = aggregate_stats(shard_results)
+    stats["timestamp"] = datetime.utcnow().isoformat()
+
+    print(f"Total cases: {stats['total_cases']}")
+    print(f"Pass rate: {stats['pass_rate']}%")
+
+    # Generate Markdown report
+    markdown = generate_markdown_report(stats, shard_results)
+    with open(args.output_markdown, "w") as f:
+        f.write(markdown)
+    print(f"Markdown report saved to {args.output_markdown}")
+
+    # Generate JSON report
+    json_report = generate_json_report(stats, shard_results)
+    with open(args.output_json, "w") as f:
+        json.dump(json_report, f, indent=2)
+    print(f"JSON report saved to {args.output_json}")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/.github/scripts/run_npu_test_shard.py b/.github/scripts/run_npu_test_shard.py
new file mode 100644
index 0000000000..d6efcedf20
--- /dev/null
+++ b/.github/scripts/run_npu_test_shard.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+"""
+Run NPU test cases from a shard JSON file.
+
+Each test case runs in an independent subprocess for crash isolation.
+Results are collected and saved to JSON files.
+"""
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Optional
+
+
+def run_single_case(
+    case_id: str,
+    test_dir: Path,
+    timeout: int,
+    verbose: bool = False
+) -> Dict:
+    """Run a single test case in a subprocess."""
+    result = {
+        "case_id": case_id,
+        "status": "unknown",
+        "duration": 0,
+        "output": "",
+        "error": "",
+        "timestamp": datetime.utcnow().isoformat(),
+    }
+
+    # Parse case_id to get test file path
+    parts = case_id.split("::")
+    test_file = parts[0]
+
+    # Add test file parent directory to PYTHONPATH for sibling imports
+    test_file_path = test_dir / test_file
+    parent_dir = str(test_file_path.parent)
+
+    env = os.environ.copy()
+    existing_pythonpath = env.get("PYTHONPATH", "")
+    env["PYTHONPATH"] = f"{parent_dir}:{existing_pythonpath}" if existing_pythonpath else parent_dir
+
+    cmd = ["pytest", "-v", "--timeout=300", "-x", case_id]
+
+    if verbose:
+        print(f"Running: {case_id}")
+
+    start_time = time.time()
+    try:
+        proc = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            timeout=timeout,
+            cwd=str(test_dir),
+            env=env
+        )
+        duration = time.time() - start_time
+        result["duration"] = round(duration, 2)
+        result["output"] = proc.stdout
+        result["error"] = proc.stderr
+
+        if proc.returncode == 0:
+            result["status"] = "passed"
+        elif proc.returncode == 1:
+            result["status"] = "failed"
+        elif proc.returncode == 2:
+            result["status"] = "error"
+        elif proc.returncode == 3:
+            result["status"] = "skipped"
+        elif proc.returncode == 4:
+            result["status"] = "xfail"
+        elif proc.returncode == 5:
+            result["status"] = "xpass"
+        else:
+            result["status"] = f"unknown_exit_{proc.returncode}"
+
+        if verbose:
+            print(f"  [{result['status']}] {duration:.2f}s")
+
+    except subprocess.TimeoutExpired:
+        result["status"] = "timeout"
+        result["duration"] = timeout
+        result["error"] = f"Test timed out after {timeout} seconds"
+        if verbose:
+            print(f"  [timeout] {timeout}s")
+
+    except Exception as e:
+        result["status"] = "crashed"
+        result["error"] = str(e)
+        if verbose:
+            print(f"  [crashed] {e}")
+
+    return result
+
+
+def run_shard(
+    cases_json: str,
+    test_dir: str,
+    report_dir: str,
+    timeout: int,
+    max_workers: int,
+    verbose: bool = False
+) -> Dict:
+    """Run all cases from a shard JSON file."""
+    cases_file = Path(cases_json)
+    if not cases_file.exists():
+        raise FileNotFoundError(f"Cases JSON file not found: {cases_json}")
+
+    with open(cases_file) as f:
+        shard_data = json.load(f)
+
+    cases = shard_data.get("cases", [])
+    shard_index = shard_data.get("shard_index", 1)
+    total_shards = shard_data.get("total_shards", 1)
+
+    print(f"Loaded shard {shard_index}/{total_shards} with {len(cases)} cases")
+
+    test_dir_path = Path(test_dir)
+    if not test_dir_path.exists():
+        raise FileNotFoundError(f"Test directory not found: {test_dir}")
+
+    report_dir_path = Path(report_dir)
+    report_dir_path.mkdir(parents=True, exist_ok=True)
+
+    results = []
+    stats = {
+        "passed": 0,
+        "failed": 0,
+        "error": 0,
+        "skipped": 0,
+        "timeout": 0,
+        "crashed": 0,
+        "xfail": 0,
+        "xpass": 0,
+        "unknown": 0,
+    }
+
+    print(f"Running tests with {max_workers} workers...")
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = {
+            executor.submit(run_single_case, case, test_dir_path, timeout, verbose): case
+            for case in cases
+        }
+
+        for future in as_completed(futures):
+            case = futures[future]
+            result = future.result()
+            results.append(result)
+
+            status = result["status"]
+            if status in stats:
+                stats[status] += 1
+            else:
+                stats["unknown"] += 1
+
+    # Save results
+    results_file = report_dir_path / f"shard_{shard_index}_results.json"
+    with open(results_file, "w") as f:
+        json.dump({
+            "shard_index": shard_index,
+            "total_shards": total_shards,
+            "total_cases": len(cases),
+            "stats": stats,
+            "results": results,
+        }, f, indent=2)
+
+    print(f"\nShard {shard_index} Results saved to {results_file}")
+    print(f"Statistics: {stats}")
+
+    return {"stats": stats, "results_file": str(results_file)}
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Run NPU test shard")
+    parser.add_argument("--cases-json", required=True, help="JSON file with test cases")
+    parser.add_argument("--test-dir", required=True, help="PyTorch test directory")
+    parser.add_argument("--report-dir", required=True, help="Directory to save results")
+    parser.add_argument("--timeout", type=int, default=300, help="Timeout per test case (seconds, default 300)")
+    parser.add_argument("--max-workers", type=int, default=1, help="Number of parallel workers")
+    parser.add_argument("--verbose", action="store_true", help="Print verbose output")
+
+    args = parser.parse_args()
+
+    run_shard(
+        cases_json=args.cases_json,
+        test_dir=args.test_dir,
+        report_dir=args.report_dir,
+        timeout=args.timeout,
+        max_workers=args.max_workers,
+        verbose=args.verbose
+    )
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml
index 55a0b52dcf..b9122bd688 100644
--- a/.github/workflows/_build.yml
+++ b/.github/workflows/_build.yml
@@ -3,104 +3,126 @@ name: Build PyTorch and torch_npu
 on:
   workflow_call:
     inputs:
-      upstream_repo:
+      pytorch_branch:
         required: true
         type: string
-        description: The upstream repository full name (owner/repo)
-      upstream_sha:
+        default: 'main'
+      python_version:
         required: true
         type: string
-        description: The upstream commit SHA to build
-      upstream_fork_repo:
-        required: false
-        type: string
-        default: ''
-        description: The fork repository full name for PR from fork (owner/repo)
-      downstream_repo:
+        default: '3.11'
+      docker_image_tag:
         required: true
         type: string
-        description: The downstream repository full name (owner/repo)
+        description: 'Docker image tag with timestamp'
+    outputs:
+      docker-image:
+        description: 'Full Docker image URL'
+        value: ${{ jobs.build.outputs.docker-image }}
+      torch-wheel:
+        description: 'PyTorch wheel artifact name'
+        value: 'torch-wheel-main'
+      torch-npu-wheel:
+        description: 'torch_npu wheel artifact name'
+        value: 'torch-npu-wheel-main'
+      test-src:
+        description: 'Test source artifact name'
+        value: 'test-src-main'
+
+env:
+  REGISTRY: ghcr.io
 
 jobs:
   build:
-    runs-on: ubuntu-latest
+    runs-on: linux-aarch64-a3-16
+    outputs:
+      docker-image: ${{ steps.set_image.outputs.docker-image }}
+
+    container:
+      image: ${{ env.REGISTRY }}/${{ github.repository_owner }}/pytorch-npu-builder:${{ inputs.docker_image_tag }}
+      options: --user root
+
     steps:
-      # Step 1: Checkout upstream PyTorch PR code
-      # Use fork repo if available (PR from fork), otherwise use upstream repo (push or PR from same repo)
-      - name: Checkout upstream PyTorch PR
-        uses: actions/checkout@v6
-        with:
-          repository: ${{ inputs.upstream_fork_repo != '' && inputs.upstream_fork_repo || inputs.upstream_repo }}
-          ref: ${{ inputs.upstream_sha }}
-          submodules: recursive
-          path: pytorch
+      - name: Set Docker image URL
+        id: set_image
+        run: |
+          DOCKER_IMAGE="${{ env.REGISTRY }}/${{ github.repository_owner }}/pytorch-npu-builder:${{ inputs.docker_image_tag }}"
+          echo "docker-image=${DOCKER_IMAGE}" >> $GITHUB_OUTPUT
+          echo "Using Docker image: ${DOCKER_IMAGE}"
 
-      # Step 2: Build PyTorch
-      - name: Build PyTorch
+      - name: Setup CANN environment
         run: |
-          echo "============================================"
-          echo "Building PyTorch from upstream PR..."
-          if [ -n "${{ inputs.upstream_fork_repo }}" ]; then
-            echo "Fork Repository: ${{ inputs.upstream_fork_repo }}"
-          else
-            echo "Repository: ${{ inputs.upstream_repo }}"
-          fi
-          echo "Commit SHA: ${{ inputs.upstream_sha }}"
-          echo "============================================"
-          # TODO: Add actual PyTorch build commands
-          # Example:
-          # cd pytorch
-          # pip install -r requirements.txt
-          # python setup.py develop
-          echo "[SIMULATED] PyTorch build completed successfully!"
-
-      # Step 3: Install PyTorch wheel
-      - name: Install PyTorch wheel
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+      - name: Clone upstream PyTorch main
         run: |
-          echo "============================================"
-          echo "Installing PyTorch wheel..."
-          echo "============================================"
-          # TODO: Add actual install commands
-          # Example:
-          # pip install pytorch/dist/torch*.whl
-          echo "[SIMULATED] PyTorch wheel installed successfully!"
-
-      # Step 4: Checkout downstream repo (torch_npu) master
-      - name: Checkout downstream repo (torch_npu)
-        uses: actions/checkout@v6
+          git clone --depth=1 --branch ${{ inputs.pytorch_branch }} \
+            https://github.com/pytorch/pytorch.git pytorch-src
+          PYTORCH_SHA=$(cd pytorch-src && git rev-parse HEAD)
+          echo "pytorch_sha=${PYTORCH_SHA}"
+
+      - name: Checkout torch_npu
+        uses: actions/checkout@v4
         with:
-          repository: ${{ inputs.downstream_repo }}
-          ref: master
+          path: torch_npu-src
           submodules: recursive
-          path: torch_npu
 
-      # Step 5: Build torch_npu
-      - name: Build torch_npu
+      - name: Build PyTorch wheel
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+
+          cd pytorch-src
+          pip${{ inputs.python_version }} install pyyaml setuptools wheel cmake ninja
+
+          export MAX_JOBS=40
+          export USE_CUDA=0
+          export USE_CUDNN=0
+          export CMAKE_BUILD_TYPE=Release
+
+          python${{ inputs.python_version }} setup.py build bdist_wheel
+
+          echo "PyTorch wheel built:"
+          ls -la dist/
+
+      - name: Build torch_npu wheel
         run: |
-          echo "============================================"
-          echo "Building torch_npu from downstream master..."
-          echo "Repository: ${{ inputs.downstream_repo }}"
-          echo "Branch: master"
-          echo "============================================"
-          # TODO: Add actual torch_npu build commands
-          # Example:
-          # cd torch_npu
-          # pip install -r requirements.txt
-          # bash ci/build.sh
-          echo "[SIMULATED] torch_npu build completed successfully!"
-
-      - name: Summary
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          pip${{ inputs.python_version }} install pytorch-src/dist/*.whl
+
+          cd torch_npu-src
+          pip${{ inputs.python_version }} install pyyaml setuptools wheel cmake ninja
+
+          export MAX_JOBS=40
+          bash ci/build.sh --python=${{ inputs.python_version }}
+
+          echo "torch_npu wheel built:"
+          ls -la dist/
+
+      - name: Package test source
         run: |
-          echo "============================================"
-          echo "Build Summary"
-          echo "============================================"
-          if [ -n "${{ inputs.upstream_fork_repo }}" ]; then
-            echo "1. PyTorch fork PR: ${{ inputs.upstream_fork_repo }}@${{ inputs.upstream_sha }}"
-          else
-            echo "1. PyTorch upstream: ${{ inputs.upstream_repo }}@${{ inputs.upstream_sha }}"
-          fi
-          echo "2. PyTorch build: SUCCESS (simulated)"
-          echo "3. PyTorch install: SUCCESS (simulated)"
-          echo "4. torch_npu master: ${{ inputs.downstream_repo }}"
-          echo "5. torch_npu build: SUCCESS (simulated)"
-          echo "============================================"
\ No newline at end of file
+          tar -czf test-src.tar.gz pytorch-src
+          ls -la test-src.tar.gz
+
+      - name: Upload PyTorch wheel
+        uses: actions/upload-artifact@v4
+        with:
+          name: torch-wheel-main
+          path: pytorch-src/dist/*.whl
+          retention-days: 7
+
+      - name: Upload torch_npu wheel
+        uses: actions/upload-artifact@v4
+        with:
+          name: torch-npu-wheel-main
+          path: torch_npu-src/dist/*.whl
+          retention-days: 7
+
+      - name: Upload test source
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-src-main
+          path: test-src.tar.gz
+          retention-days: 7
\ No newline at end of file
diff --git a/.github/workflows/_collect.yml b/.github/workflows/_collect.yml
new file mode 100644
index 0000000000..b23febfe39
--- /dev/null
+++ b/.github/workflows/_collect.yml
@@ -0,0 +1,104 @@
+name: Collect Test Cases
+
+on:
+  workflow_call:
+    inputs:
+      docker-image:
+        required: true
+        type: string
+      distributed-shards:
+        required: false
+        type: number
+        default: 2
+      regular-shards:
+        required: false
+        type: number
+        default: 5
+    outputs:
+      distributed_matrix:
+        description: 'Distributed shards matrix'
+        value: ${{ jobs.collect.outputs.distributed_matrix }}
+      regular_matrix:
+        description: 'Regular shards matrix'
+        value: ${{ jobs.collect.outputs.regular_matrix }}
+      total_cases:
+        description: 'Total test cases count'
+        value: ${{ jobs.collect.outputs.total_cases }}
+
+env:
+  PYTHON_VERSION: '3.11'
+
+jobs:
+  collect:
+    runs-on: linux-aarch64-a3-16
+    timeout-minutes: 60
+    outputs:
+      distributed_matrix: ${{ steps.collect.outputs.distributed_matrix }}
+      regular_matrix: ${{ steps.collect.outputs.regular_matrix }}
+      total_cases: ${{ steps.collect.outputs.total_cases }}
+
+    container:
+      image: ${{ inputs.docker-image }}
+      options: --user root
+
+    steps:
+      - name: Checkout scripts
+        uses: actions/checkout@v4
+        with:
+          sparse-checkout: .github/scripts
+
+      - name: Download wheels
+        uses: actions/download-artifact@v4
+        with:
+          name: torch-wheel-main
+          path: wheels
+
+      - name: Download torch_npu wheel
+        uses: actions/download-artifact@v4
+        with:
+          name: torch-npu-wheel-main
+          path: wheels
+
+      - name: Download test source
+        uses: actions/download-artifact@v4
+        with:
+          name: test-src-main
+
+      - name: Extract test source
+        run: tar -xzf test-src.tar.gz
+
+      - name: Install wheels
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          pip${{ env.PYTHON_VERSION }} install wheels/*.whl
+
+      - name: Collect all test cases
+        id: collect
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          cd pytorch-src
+
+          python${{ env.PYTHON_VERSION }} ../.github/scripts/collect_all_cases.py \
+            --test-dir test \
+            --distributed-shards ${{ inputs.distributed-shards }} \
+            --regular-shards ${{ inputs.regular-shards }} \
+            --output-dir cases_shards \
+            --parallel 16
+
+          TOTAL=$(python${{ env.PYTHON_VERSION }} -c "import json; d=json.load(open('cases_shards/cases_collection_summary.json')); print(d['total_cases'])")
+          echo "total_cases=${TOTAL}" >> $GITHUB_OUTPUT
+
+          # Generate shard matrices
+          python${{ env.PYTHON_VERSION }} -c "
+          import json
+          d_shards = ${{ inputs.distributed-shards }}
+          r_shards = ${{ inputs.regular-shards }}
+          print(f'distributed_matrix={json.dumps(list(range(1, d_shards + 1)))}')
+          print(f'regular_matrix={json.dumps(list(range(1, r_shards + 1)))}')
+          " >> $GITHUB_OUTPUT
+
+      - name: Upload cases shard JSONs
+        uses: actions/upload-artifact@v4
+        with:
+          name: cases-shards
+          path: pytorch-src/cases_shards/
\ No newline at end of file
diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
new file mode 100644
index 0000000000..ee882795b7
--- /dev/null
+++ b/.github/workflows/_test.yml
@@ -0,0 +1,87 @@
+name: Run NPU Tests
+
+on:
+  workflow_call:
+    inputs:
+      docker-image:
+        required: true
+        type: string
+      test-type:
+        required: true
+        type: string
+      shard-index:
+        required: true
+        type: number
+      max-workers:
+        required: false
+        type: number
+        default: 1
+
+env:
+  PYTHON_VERSION: '3.11'
+
+jobs:
+  test:
+    runs-on: linux-aarch64-a3-16
+    timeout-minutes: 1200
+
+    container:
+      image: ${{ inputs.docker-image }}
+      options: --user root
+
+    steps:
+      - name: Checkout scripts
+        uses: actions/checkout@v4
+        with:
+          sparse-checkout: .github/scripts
+
+      - name: Download torch wheel
+        uses: actions/download-artifact@v4
+        with:
+          name: torch-wheel-main
+          path: wheels
+
+      - name: Download torch_npu wheel
+        uses: actions/download-artifact@v4
+        with:
+          name: torch-npu-wheel-main
+          path: wheels
+
+      - name: Download test source
+        uses: actions/download-artifact@v4
+        with:
+          name: test-src-main
+
+      - name: Download cases shard
+        uses: actions/download-artifact@v4
+        with:
+          name: cases-shards
+          path: cases-shards
+
+      - name: Extract test source
+        run: tar -xzf test-src.tar.gz
+
+      - name: Install wheels
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          pip${{ env.PYTHON_VERSION }} install wheels/*.whl
+
+      - name: Run ${{ inputs.test-type }} shard ${{ inputs.shard-index }}
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          python .github/scripts/run_npu_test_shard.py \
+            --cases-json cases-shards/${{ inputs.test-type }}_cases_shard_${{ inputs.shard-index }}.json \
+            --test-dir pytorch-src/test \
+            --report-dir test-reports \
+            --max-workers ${{ inputs.max-workers }} \
+            --verbose
+
+      - name: Upload test reports
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-reports-${{ inputs.test-type }}-${{ inputs.shard-index }}
+          path: test-reports/
+          retention-days: 30
\ No newline at end of file
diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml
new file mode 100644
index 0000000000..2ebd204242
--- /dev/null
+++ b/.github/workflows/build-docker-image.yml
@@ -0,0 +1,62 @@
+name: Build Docker Image
+
+on:
+  push:
+    branches: [main, master]
+    paths:
+      - '.github/docker/pytorch-npu-builder.Dockerfile'
+      - '.github/workflows/build-docker-image.yml'
+  schedule:
+    - cron: '0 2 * * 0'  # UTC 02:00, Beijing 10:00, every Sunday
+  workflow_dispatch:
+    inputs:
+      cann_version:
+        description: 'CANN version'
+        default: '9.0.0-beta.2'
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: pytorch-npu-builder
+
+jobs:
+  build:
+    runs-on: linux-aarch64-a3-16
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Generate timestamp tag
+        id: tag
+        run: |
+          TIMESTAMP=$(date +%Y%m%d)
+          echo "tag=manylinux-cann${{ inputs.cann_version || '9.0.0-beta.2' }}-${TIMESTAMP}" >> $GITHUB_OUTPUT
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: .github/docker
+          file: .github/docker/pytorch-npu-builder.Dockerfile
+          push: true
+          tags: |
+            ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}
+            ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:latest
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+
+      - name: Output image URL
+        run: |
+          echo "Built image: ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}"
\ No newline at end of file
diff --git a/.github/workflows/npu-full-test.yml b/.github/workflows/npu-full-test.yml
new file mode 100644
index 0000000000..882ac26b63
--- /dev/null
+++ b/.github/workflows/npu-full-test.yml
@@ -0,0 +1,125 @@
+name: PyTorch NPU Full Test
+
+on:
+  push:
+    branches: [main, master]
+    paths:
+      - '.github/workflows/**'
+      - '.github/scripts/**'
+      - '.github/docker/**'
+  schedule:
+    - cron: '0 22 * * 1'  # UTC 22:00, Beijing 06:00, every Monday
+  workflow_dispatch:
+    inputs:
+      docker_image_tag:
+        description: 'Docker image tag (e.g., manylinux-cann9.0-20260428)'
+        default: 'latest'
+      pytorch_branch:
+        description: 'PyTorch branch to build'
+        default: 'main'
+      distributed_shards:
+        description: 'Distributed test shards'
+        default: 2
+        type: number
+      regular_shards:
+        description: 'Regular test shards'
+        default: 5
+        type: number
+
+env:
+  REGISTRY: ghcr.io
+  PYTHON_VERSION: '3.11'
+
+jobs:
+  build:
+    uses: ./.github/workflows/_build.yml
+    with:
+      pytorch_branch: ${{ inputs.pytorch_branch || 'main' }}
+      python_version: '3.11'
+      docker_image_tag: ${{ inputs.docker_image_tag || 'latest' }}
+
+  collect_cases:
+    needs: build
+    uses: ./.github/workflows/_collect.yml
+    with:
+      docker-image: ${{ needs.build.outputs.docker-image }}
+      distributed-shards: ${{ inputs.distributed_shards || 2 }}
+      regular-shards: ${{ inputs.regular_shards || 5 }}
+
+  test_distributed:
+    needs:
+      - build
+      - collect_cases
+    strategy:
+      matrix:
+        shard: ${{ fromJson(needs.collect_cases.outputs.distributed_matrix) }}
+      fail-fast: false
+      max-parallel: 2
+    uses: ./.github/workflows/_test.yml
+    with:
+      docker-image: ${{ needs.build.outputs.docker-image }}
+      test-type: distributed
+      shard-index: ${{ matrix.shard }}
+      max-workers: 1
+
+  test_regular:
+    needs:
+      - build
+      - collect_cases
+    strategy:
+      matrix:
+        shard: ${{ fromJson(needs.collect_cases.outputs.regular_matrix) }}
+      fail-fast: false
+      max-parallel: 5
+    uses: ./.github/workflows/_test.yml
+    with:
+      docker-image: ${{ needs.build.outputs.docker-image }}
+      test-type: regular
+      shard-index: ${{ matrix.shard }}
+      max-workers: 32
+
+  report:
+    needs:
+      - test_distributed
+      - test_regular
+    runs-on: ubuntu-latest
+    if: always()
+
+    steps:
+      - name: Checkout scripts
+        uses: actions/checkout@v4
+        with:
+          sparse-checkout: .github/scripts
+
+      - name: Download all test reports
+        uses: actions/download-artifact@v4
+        with:
+          pattern: test-reports-*
+          path: all-reports
+          merge-multiple: false
+
+      - name: Generate consolidated report
+        run: |
+          python .github/scripts/generate_report.py \
+            --reports-root all-reports \
+            --output-markdown report.md \
+            --output-json report.json
+
+      - name: Upload final report
+        uses: actions/upload-artifact@v4
+        with:
+          name: npu-full-test-report
+          path: |
+            report.md
+            report.json
+          retention-days: 30
+
+      - name: Package all logs
+        run: tar -czf all-test-logs.tar.gz all-reports/
+
+      - name: Upload logs artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: all-test-logs
+          path: all-test-logs.tar.gz
+          retention-days: 30
\ No newline at end of file
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000000..12c69a7edc
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,857 @@
+# Torch-NPU 编译和上游 PyTorch 测试用例拉起流程设计
+
+## Context
+
+设计一个完整的 CI/CD 流程，用于：
+1. 拉取上游 PyTorch main 分支代码并编译
+2. 安装上游 PyTorch wheel 包
+3. 编译 torch-npu
+4. 扫描 PyTorch test 目录下的所有测试用例
+5. 每个用例独立进程执行全量测试
+6. 记录测试结果并上传日志制品
+
+参考 PR #117 (https://github.com/Ascend/pytorch/pull/117) 的实现方案，简化 patch 和白名单逻辑。
+
+**关键改进**：采用上游 PyTorch 社区的最佳实践 —— **构建和测试使用同一个 Docker 镜像 URL**，通过 Job 依赖传递机制保证环境绝对一致。
+
+---
+
+## 方案确认
+
+| 项目 | 选择 |
+|------|------|
+| 基础镜像 | `pytorch/manylinux-builder:aarch64` |
+| 镜像托管 | GitHub Container Registry (ghcr.io) |
+| CANN安装 | 单独镜像构建流程，定期构建推送 |
+| Runner规格 | 统一使用 `linux-aarch64-a3-16` |
+| 镜像标签 | 带时间戳版本号，如 `manylinux-cann9.0-20260428` |
+| Dockerfile位置 | 项目仓库 `.github/docker/` 目录 |
+
+---
+
+## 整体架构
+
+```
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                           GitHub Actions Workflow                            │
+├─────────────────────────────────────────────────────────────────────────────┤
+│                                                                              │
+│  ┌──────────────────────────────────────────────────────────────────────┐   │
+│  │           阶段1: 镜像构建 (单独 workflow，定期运行)                    │   │
+│  │  Workflow: build-docker-image.yml                                    │   │
+│  │  Runner: linux-aarch64-a3-16                                         │   │
+│  │  基础镜像: pytorch/manylinux-builder:aarch64                         │   │
+│  │  安装 CANN 9.0.0-beta.2                                               │   │
+│  │  推送: ghcr.io/${{ github.repository_owner }}/pytorch-npu-builder    │   │
+│  │  标签: manylinux-cann9.0-${{ timestamp }}                            │   │
+│  └──────────────────────────────────────────────────────────────────────┘   │
+│                              │                                              │
+│                              │ 镜像已预构建                                  │
+│                              ▼                                              │
+│  ┌──────────────────────────────────────────────────────────────────────┐   │
+│  │           阶段2: 统一构建 (PyTorch + torch_npu)                       │   │
+│  │  Workflow: _build.yml                                                │   │
+│  │  Runner: linux-aarch64-a3-16                                         │   │
+│  │  Container: ghcr.io/xxx/pytorch-npu-builder:manylinux-cann9.0-xxx    │   │
+│  │  - Clone 上游 PyTorch main                                           │   │
+│  │  - 编译 PyTorch wheel                                                │   │
+│  │  - Checkout torch_npu                                                │   │
+│  │  - 编译 torch_npu wheel                                              │   │
+│  │  - 打包测试源码                                                      │   │
+│  │  Outputs: docker-image, torch-wheel, torch-npu-wheel, test-src       │   │
+│  └──────────────────────────────────────────────────────────────────────┘   │
+│                              │                                              │
+│                              │ docker-image URL 传递                        │
+│                              ▼                                              │
+│  ┌──────────────────────────────────────────────────────────────────────┐   │
+│  │           阶段3: 用例收集                                             │   │
+│  │  Job: collect_cases                                                  │   │
+│  │  Runner: linux-aarch64-a3-16                                         │   │
+│  │  Container: SAME Docker 镜像                                         │   │
+│  │  - 安装 torch + torch_npu                                            │   │
+│  │  - pytest --collect-only 收集所有用例                                │   │
+│  │  - 按用例分片 (distributed/regular)                                  │   │
+│  │  Outputs: distributed_matrix, regular_matrix, total_cases            │   │
+│  └──────────────────────────────────────────────────────────────────────┘   │
+│                              │                                              │
+│              ┌───────────────┴───────────────┐                              │
+│              │ docker-image URL 传递         │                              │
+│              ▼                               ▼                              │
+│  ┌─────────────────────────┐   ┌─────────────────────────────────┐         │
+│  │  阶段4: Distributed测试 │   │      阶段4: Regular测试          │         │
+│  │  Runner: a3-16          │   │      Runner: a3-16               │         │
+│  │  Container: SAME镜像    │   │      Container: SAME镜像         │         │
+│  │  串行执行               │   │      并发执行 (32 workers)       │         │
+│  │  2 shards               │   │      5 shards                    │         │
+│  └─────────────────────────┘   └─────────────────────────────────┘         │
+│              │                               │                              │
+│              └───────────────┬───────────────┘                              │
+│                              ▼                                              │
+│  ┌──────────────────────────────────────────────────────────────────────┐   │
+│  │           阶段5: 报告生成                                             │   │
+│  │  - 汇总所有 shard 结果                                               │   │
+│  │  - 生成 Markdown + JSON 报告                                        │   │
+│  │  - 压缩上传制品                                                      │   │
+│  └──────────────────────────────────────────────────────────────────────┘   │
+│                                                                              │
+└─────────────────────────────────────────────────────────────────────────────┘
+```
+
+---
+
+## 核心设计：Docker 镜像传递机制
+
+### 上游 PyTorch 社区的做法
+
+在 PyTorch 的 CI 中，Build Job 和 Test Job 使用**同一个 Docker 镜像**：
+
+```yaml
+# Build Job 计算并输出完整的镜像 URL
+linux-build:
+  outputs:
+    docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
+
+# Test Job 通过 needs 获取同一个镜像
+linux-test:
+  needs: linux-build
+  container:
+    image: ${{ needs.linux-build.outputs.docker-image }}
+```
+
+这种做法的优势：
+1. **环境绝对一致** - 构建和测试使用同一镜像
+2. **避免重复安装** - 镜像中预装依赖
+3. **减少 artifact 传递** - 直接挂载构建产物
+4. **可追溯性** - 镜像 URL 带 SHA256 哈希，确保版本锁定
+
+---
+
+## 文件结构
+
+```
+.github/
+├── docker/
+│   └── pytorch-npu-builder.Dockerfile   # 基于 manylinux + CANN 的构建镜像
+│
+├── workflows/
+│   ├── build-docker-image.yml           # 镜像构建 workflow (定期运行)
+│   ├── npu-full-test.yml                # 主 workflow
+│   ├── _build.yml                       # 统一构建 workflow (可复用)
+│   └── _test.yml                        # 测试执行 workflow (可复用)
+│
+├── scripts/
+│   ├── collect_all_cases.py             # 收集所有测试用例
+│   ├── run_npu_test_shard.py            # 执行测试分片
+│   ├── parse_test_results.py            # 解析测试结果
+│   └── generate_report.py               # 生成汇总报告
+```
+
+---
+
+## 阶段1: Docker 镜像构建
+
+### Dockerfile: pytorch-npu-builder.Dockerfile
+
+```dockerfile
+# 基于 PyTorch manylinux builder 镜像
+FROM ghcr.io/pytorch/manylinux-builder:aarch64
+
+# 设置工作目录
+WORKDIR /root
+
+# 安装 CANN 9.0.0-beta.2
+RUN mkdir -p cann && cd cann && \
+    curl -O https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-toolkit_9.0.0-beta.2_linux-aarch64.run && \
+    curl -O https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-A3-ops_9.0.0-beta.2_linux-aarch64.run && \
+    curl -O https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-nnal_9.0.0-beta.2_linux-aarch64.run && \
+    chmod +x Ascend-cann*.run && \
+    ./Ascend-cann-toolkit*.run --full --quiet --install-path=/usr/local/Ascend && \
+    ./Ascend-cann-A3*.run --install --quiet --install-path=/usr/local/Ascend && \
+    ./Ascend-cann-nnal*.run --install --quiet --install-path=/usr/local/Ascend && \
+    rm -rf cann
+
+# 设置环境变量
+ENV CANN_PATH=/usr/local/Ascend/cann
+ENV NNAL_PATH=/usr/local/Ascend/nnal
+ENV ASCEND_HOME=/usr/local/Ascend
+
+# 添加 CANN 环境初始化脚本
+RUN echo '#!/bin/bash\n\
+source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true\n\
+source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true' > /etc/profile.d/cann_env.sh && \
+    chmod +x /etc/profile.d/cann_env.sh
+
+# 预安装 pytest 等测试依赖
+RUN pip3.11 install pytest pytest-timeout pytest-xdist hypothesis pyyaml zstandard
+```
+
+### Workflow: build-docker-image.yml
+
+```yaml
+name: Build Docker Image
+
+on:
+  schedule:
+    - cron: '0 2 * * 0'  # UTC 02:00, Beijing 10:00, every Sunday
+  workflow_dispatch:
+    inputs:
+      cann_version:
+        description: 'CANN version'
+        default: '9.0.0-beta.2'
+      force_build:
+        description: 'Force rebuild even if image exists'
+        default: false
+        type: boolean
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: pytorch-npu-builder
+
+jobs:
+  build:
+    runs-on: linux-aarch64-a3-16
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Generate timestamp tag
+        id: tag
+        run: |
+          TIMESTAMP=$(date +%Y%m%d)
+          echo "tag=manylinux-cann${{ inputs.cann_version || '9.0.0-beta.2' }}-${TIMESTAMP}" >> $GITHUB_OUTPUT
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: .github/docker
+          file: .github/docker/pytorch-npu-builder.Dockerfile
+          push: true
+          tags: |
+            ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}
+            ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:latest
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+
+      - name: Output image URL
+        run: |
+          echo "Built image: ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}"
+```
+
+---
+
+## 阶段2: 统一构建 (_build.yml)
+
+```yaml
+name: Build PyTorch and torch_npu
+
+on:
+  workflow_call:
+    inputs:
+      pytorch_branch:
+        required: true
+        type: string
+        default: 'main'
+      python_version:
+        required: true
+        type: string
+        default: '3.11'
+      docker_image_tag:
+        required: true
+        type: string
+        description: 'Docker image tag with timestamp'
+    outputs:
+      docker-image:
+        description: 'Full Docker image URL'
+        value: ${{ jobs.build.outputs.docker-image }}
+      torch-wheel:
+        description: 'PyTorch wheel artifact name'
+        value: 'torch-wheel-main'
+      torch-npu-wheel:
+        description: 'torch_npu wheel artifact name'
+        value: 'torch-npu-wheel-main'
+      test-src:
+        description: 'Test source artifact name'
+        value: 'test-src-main'
+
+env:
+  REGISTRY: ghcr.io
+
+jobs:
+  build:
+    runs-on: linux-aarch64-a3-16
+    outputs:
+      docker-image: ${{ steps.set_image.outputs.docker-image }}
+    
+    container:
+      image: ${{ env.REGISTRY }}/${{ github.repository_owner }}/pytorch-npu-builder:${{ inputs.docker_image_tag }}
+      options: --user root
+    
+    steps:
+      - name: Set Docker image URL
+        id: set_image
+        run: |
+          DOCKER_IMAGE="${{ env.REGISTRY }}/${{ github.repository_owner }}/pytorch-npu-builder:${{ inputs.docker_image_tag }}"
+          echo "docker-image=${DOCKER_IMAGE}" >> $GITHUB_OUTPUT
+          echo "Using Docker image: ${DOCKER_IMAGE}"
+
+      - name: Setup CANN environment
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+      - name: Clone upstream PyTorch main
+        run: |
+          git clone --depth=1 --branch ${{ inputs.pytorch_branch }} \
+            https://github.com/pytorch/pytorch.git pytorch-src
+          PYTORCH_SHA=$(cd pytorch-src && git rev-parse HEAD)
+          echo "pytorch_sha=${PYTORCH_SHA}"
+
+      - name: Checkout torch_npu
+        uses: actions/checkout@v4
+        with:
+          path: torch_npu-src
+          submodules: recursive
+
+      - name: Build PyTorch wheel
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          
+          cd pytorch-src
+          pip${{ inputs.python_version }} install pyyaml setuptools wheel cmake ninja
+          
+          export MAX_JOBS=40
+          export USE_CUDA=0
+          export USE_CUDNN=0
+          export CMAKE_BUILD_TYPE=Release
+          
+          python${{ inputs.python_version }} setup.py build bdist_wheel
+          
+          echo "PyTorch wheel built:"
+          ls -la dist/
+
+      - name: Build torch_npu wheel
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+          
+          # 安装刚编译的 PyTorch
+          pip${{ inputs.python_version }} install pytorch-src/dist/*.whl
+          
+          cd torch_npu-src
+          pip${{ inputs.python_version }} install pyyaml setuptools wheel cmake ninja
+          
+          export MAX_JOBS=40
+          bash ci/build.sh --python=${{ inputs.python_version }}
+          
+          echo "torch_npu wheel built:"
+          ls -la dist/
+
+      - name: Package test source
+        run: |
+          tar -czf test-src.tar.gz pytorch-src
+          ls -la test-src.tar.gz
+
+      - name: Upload PyTorch wheel
+        uses: actions/upload-artifact@v4
+        with:
+          name: torch-wheel-main
+          path: pytorch-src/dist/*.whl
+          retention-days: 7
+
+      - name: Upload torch_npu wheel
+        uses: actions/upload-artifact@v4
+        with:
+          name: torch-npu-wheel-main
+          path: torch_npu-src/dist/*.whl
+          retention-days: 7
+
+      - name: Upload test source
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-src-main
+          path: test-src.tar.gz
+          retention-days: 7
+```
+
+---
+
+## 阶段3: 用例收集 (collect_cases Job)
+
+```yaml
+collect_cases:
+  needs: build
+  runs-on: linux-aarch64-a3-16
+  timeout-minutes: 60
+  
+  # 使用 Build Job 输出的同一个 Docker 镜像
+  container:
+    image: ${{ needs.build.outputs.docker-image }}
+    options: --user root
+  
+  outputs:
+    distributed_matrix: ${{ steps.collect.outputs.distributed_matrix }}
+    regular_matrix: ${{ steps.collect.outputs.regular_matrix }}
+    total_cases: ${{ steps.collect.outputs.total_cases }}
+
+  steps:
+    - name: Checkout scripts
+      uses: actions/checkout@v4
+      with:
+        sparse-checkout: .github/scripts
+
+    - name: Download wheels
+      uses: actions/download-artifact@v4
+      with:
+        name: torch-wheel-main
+        path: wheels
+
+    - name: Download torch_npu wheel
+      uses: actions/download-artifact@v4
+      with:
+        name: torch-npu-wheel-main
+        path: wheels
+
+    - name: Download test source
+      uses: actions/download-artifact@v4
+      with:
+        name: test-src-main
+
+    - name: Extract test source
+      run: tar -xzf test-src.tar.gz
+
+    - name: Install wheels
+      run: |
+        source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+        pip3.11 install wheels/*.whl
+
+    - name: Collect all test cases
+      id: collect
+      run: |
+        source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+        cd pytorch-src
+        
+        python3.11 ../.github/scripts/collect_all_cases.py \
+          --test-dir test \
+          --distributed-shards 2 \
+          --regular-shards 5 \
+          --output-dir cases_shards \
+          --parallel 16
+        
+        TOTAL=$(python3 -c "import json; d=json.load(open('cases_shards/cases_collection_summary.json')); print(d['total_cases'])")
+        echo "total_cases=${TOTAL}" >> $GITHUB_OUTPUT
+        
+        echo "distributed_matrix=[1,2]" >> $GITHUB_OUTPUT
+        echo "regular_matrix=[1,2,3,4,5]" >> $GITHUB_OUTPUT
+
+    - name: Upload cases shard JSONs
+      uses: actions/upload-artifact@v4
+      with:
+        name: cases-shards
+        path: pytorch-src/cases_shards/
+```
+
+---
+
+## 阶段4: 测试执行 (_test.yml)
+
+```yaml
+name: Run NPU Tests
+
+on:
+  workflow_call:
+    inputs:
+      docker-image:
+        required: true
+        type: string
+      test-type:
+        required: true
+        type: string
+      shard-index:
+        required: true
+        type: number
+      max-workers:
+        required: false
+        type: number
+        default: 1
+      timeout:
+        required: false
+        type: number
+        default: 1200
+
+jobs:
+  test:
+    runs-on: linux-aarch64-a3-16
+    timeout-minutes: 1200
+    
+    # 使用 Build Job 输出的同一个 Docker 镜像
+    container:
+      image: ${{ inputs.docker-image }}
+      options: --user root
+
+    steps:
+      - name: Checkout scripts
+        uses: actions/checkout@v4
+        with:
+          sparse-checkout: .github/scripts
+
+      - name: Download torch wheel
+        uses: actions/download-artifact@v4
+        with:
+          name: torch-wheel-main
+          path: wheels
+
+      - name: Download torch_npu wheel
+        uses: actions/download-artifact@v4
+        with:
+          name: torch-npu-wheel-main
+          path: wheels
+
+      - name: Download test source
+        uses: actions/download-artifact@v4
+        with:
+          name: test-src-main
+
+      - name: Download cases shard
+        uses: actions/download-artifact@v4
+        with:
+          name: cases-shards
+          path: cases-shards
+
+      - name: Extract test source
+        run: tar -xzf test-src.tar.gz
+
+      - name: Install wheels
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          pip3.11 install wheels/*.whl
+
+      - name: Run ${{ inputs.test-type }} shard ${{ inputs.shard-index }}
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+          
+          python .github/scripts/run_npu_test_shard.py \
+            --cases-json cases-shards/${{ inputs.test-type }}_cases_shard_${{ inputs.shard-index }}.json \
+            --test-dir pytorch-src/test \
+            --report-dir test-reports \
+            --timeout ${{ inputs.timeout }} \
+            --max-workers ${{ inputs.max-workers }} \
+            --verbose
+
+      - name: Upload test reports
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-reports-${{ inputs.test-type }}-${{ inputs.shard-index }}
+          path: test-reports/
+          retention-days: 30
+```
+
+---
+
+## 主 Workflow: npu-full-test.yml
+
+```yaml
+name: PyTorch NPU Full Test
+
+on:
+  push:
+    branches: [main, master]
+    paths:
+      - '.github/workflows/**'
+      - '.github/scripts/**'
+      - '.github/docker/**'
+  schedule:
+    - cron: '0 22 * * 1'  # UTC 22:00, Beijing 06:00, every Monday
+  workflow_dispatch:
+    inputs:
+      docker_image_tag:
+        description: 'Docker image tag (e.g., manylinux-cann9.0-20260428)'
+        default: 'latest'
+      pytorch_branch:
+        description: 'PyTorch branch to build'
+        default: 'main'
+      distributed_shards:
+        description: 'Distributed test shards'
+        default: '2'
+      regular_shards:
+        description: 'Regular test shards'
+        default: '5'
+
+env:
+  REGISTRY: ghcr.io
+  PYTHON_VERSION: '3.11'
+
+jobs:
+  build:
+    uses: ./.github/workflows/_build.yml
+    with:
+      pytorch_branch: ${{ inputs.pytorch_branch || 'main' }}
+      python_version: '3.11'
+      docker_image_tag: ${{ inputs.docker_image_tag || 'latest' }}
+
+  collect_cases:
+    needs: build
+    uses: ./.github/workflows/_collect.yml
+    with:
+      docker-image: ${{ needs.build.outputs.docker-image }}
+
+  test_distributed:
+    needs: 
+      - build
+      - collect_cases
+    strategy:
+      matrix:
+        shard: ${{ fromJson(needs.collect_cases.outputs.distributed_matrix) }}
+      fail-fast: false
+      max-parallel: 2
+    uses: ./.github/workflows/_test.yml
+    with:
+      docker-image: ${{ needs.build.outputs.docker-image }}
+      test-type: distributed
+      shard-index: ${{ matrix.shard }}
+      max-workers: 1  # Distributed 串行执行
+      timeout: 1200
+
+  test_regular:
+    needs: 
+      - build
+      - collect_cases
+    strategy:
+      matrix:
+        shard: ${{ fromJson(needs.collect_cases.outputs.regular_matrix) }}
+      fail-fast: false
+      max-parallel: 5
+    uses: ./.github/workflows/_test.yml
+    with:
+      docker-image: ${{ needs.build.outputs.docker-image }}
+      test-type: regular
+      shard-index: ${{ matrix.shard }}
+      max-workers: 32  # Regular 并发执行
+      timeout: 1200
+
+  report:
+    needs: 
+      - test_distributed
+      - test_regular
+    runs-on: ubuntu-latest
+    if: always()
+    
+    steps:
+      - name: Checkout scripts
+        uses: actions/checkout@v4
+        with:
+          sparse-checkout: .github/scripts
+
+      - name: Download all test reports
+        uses: actions/download-artifact@v4
+        with:
+          pattern: test-reports-*
+          path: all-reports
+          merge-multiple: false
+
+      - name: Generate consolidated report
+        run: |
+          python .github/scripts/generate_report.py \
+            --reports-root all-reports \
+            --output-markdown report.md \
+            --output-json report.json
+
+      - name: Upload final report
+        uses: actions/upload-artifact@v4
+        with:
+          name: npu-full-test-report
+          path: |
+            report.md
+            report.json
+          retention-days: 30
+
+      - name: Package all logs
+        run: tar -czf all-test-logs.tar.gz all-reports/
+
+      - name: Upload logs artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: all-test-logs
+          path: all-test-logs.tar.gz
+          retention-days: 30
+```
+
+---
+
+## 关键脚本设计
+
+### collect_all_cases.py (用例收集)
+
+核心功能：
+1. 扫描 test 目录下所有 test_*.py 文件
+2. 分类为 distributed/regular 类型
+3. 通过 pytest --collect-only 收集每个文件的用例
+4. 将用例均分到 shards
+5. 保存 shard JSON 文件
+
+关键实现参考 PR #117 的 collect_all_cases.py。
+
+### run_npu_test_shard.py (测试执行)
+
+核心功能：
+1. 加载 shard JSON 中的用例列表
+2. 每个用例启动独立 pytest subprocess (崩溃隔离)
+3. 并发执行 (ThreadPoolExecutor，max_workers 参数控制)
+4. 收集结果：passed/failed/error/crashed/timeout
+5. 保存每个用例的结果 JSON
+
+关键实现参考 PR #117 的 run_npu_test_shard.py。
+
+### generate_report.py (报告生成)
+
+核心功能：
+1. 读取所有 shard 的 cases JSON
+2. 统计汇总：总用例数、passed/failed/crashed/timeout
+3. 生成 Markdown 报告表格
+4. 生成 JSON 详细报告
+
+---
+
+## 关键设计要点
+
+### 1. Docker 镜像传递机制
+
+与上游 PyTorch 一致的做法：
+
+```yaml
+# Build Job 输出镜像 URL
+outputs:
+  docker-image: ${{ steps.set_image.outputs.docker-image }}
+
+# Test Job 使用同一个镜像
+container:
+  image: ${{ needs.build.outputs.docker-image }}
+```
+
+优势：
+- 构建和测试环境完全一致
+- 避免 artifact 传递导致的环境差异
+- 镜像标签带时间戳，便于追溯
+
+### 2. CANN 环境初始化
+
+在 Dockerfile 中预装 CANN，并在镜像中添加环境初始化脚本：
+
+```bash
+source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+```
+
+每个 job 开始时调用此脚本确保环境正确。
+
+### 3. 用例级分片
+
+- pytest --collect-only 预收集所有用例
+- 按用例数量均分到 shards，实现负载均衡
+- 遏制某些 shard 因包含大文件而过载
+
+### 4. 每用例独立进程执行
+
+- 每个 pytest case 在独立 subprocess 中运行
+- 崩溃隔离：一个 case crash 不影响其他 case
+- 支持超时控制
+- 结果独立记录
+
+### 5. Distributed vs Regular 区分
+
+- Distributed: 需要 HCCL 分布式环境，串行执行 (max-workers=1)
+- Regular: 单机测试，并发执行 (max-workers=32)
+
+### 6. PYTHONPATH 处理
+
+某些测试文件导入同级模块：
+- 执行前动态添加测试文件父目录到 PYTHONPATH
+
+### 7. Runner 统一使用 a3-16
+
+- Build、Collect、Test 都使用 linux-aarch64-a3-16
+- 简化配置，统一环境
+- 16-card NPU 支持分布式测试
+
+---
+
+## 验证方案
+
+### 步骤1: 验证 Docker 镜像构建
+
+```bash
+# 本地构建测试
+cd .github/docker
+docker build -t pytorch-npu-builder:test -f pytorch-npu-builder.Dockerfile .
+
+# 验证 CANN 安装
+docker run --rm pytorch-npu-builder:test \
+  bash -c "source /usr/local/Ascend/cann/set_env.sh && echo CANN installed"
+```
+
+### 步骤2: 验证构建流程
+
+触发 workflow_dispatch，使用少量配置：
+- docker_image_tag: latest (或手动构建的标签)
+- pytorch_branch: main
+- distributed_shards: 1
+- regular_shards: 1
+
+### 步骤3: 验证用例收集和执行
+
+检查 collect_cases job 输出：
+- cases_collection_summary.json 内容
+- shard JSON 文件数量和内容
+
+检查 test job 输出：
+- cases JSON 结果统计
+- 失败用例日志
+
+### 步骤4: 验证报告生成
+
+检查 report job 输出：
+- report.md Markdown 报告
+- report.json 详细数据
+- all-test-logs.tar.gz 日志制品
+
+---
+
+## 预估工作量
+
+| 任务 | 预估时间 |
+|------|----------|
+| 创建 Dockerfile | 1 小时 |
+| 创建 build-docker-image.yml | 1 小时 |
+| 创建 _build.yml | 2 小时 |
+| 创建 _test.yml | 2 小时 |
+| 创建 npu-full-test.yml | 1 小时 |
+| 开发 collect_all_cases.py | 3 小时 |
+| 开发 run_npu_test_shard.py | 4 小时 |
+| 开发 generate_report.py | 2 小时 |
+| 本地镜像构建验证 | 2 小时 |
+| CI 调试和优化 | 4 小时 |
+| **总计** | **~20 小时** |
+
+---
+
+## 待确认事项
+
+1. **镜像仓库权限**: ghcr.io 推送需要 `packages: write` 权限，确认仓库已启用
+2. **Runner 访问**: linux-aarch64-a3-16 runner 确认可用
+3. **CANN 版本**: 当前使用 9.0.0-beta.2，是否需要支持多版本
+4. **镜像更新频率**: 每周构建一次是否合适，或需要手动触发机制
\ No newline at end of file

From 1a2f87a11a0e7f1b41319e2546f29ba0359604ec Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 16:59:19 +0800
Subject: [PATCH 002/132] Update workflow triggers and names
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Rename npu-full-test.yml to "PyTorch NPU Full Test(main 分支)"
- Add pull_request trigger to build-docker-image.yml for Dockerfile and workflow changes

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/build-docker-image.yml | 4 ++++
 .github/workflows/npu-full-test.yml      | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml
index 2ebd204242..6876cf6e01 100644
--- a/.github/workflows/build-docker-image.yml
+++ b/.github/workflows/build-docker-image.yml
@@ -6,6 +6,10 @@ on:
     paths:
       - '.github/docker/pytorch-npu-builder.Dockerfile'
       - '.github/workflows/build-docker-image.yml'
+  pull_request:
+    paths:
+      - '.github/docker/pytorch-npu-builder.Dockerfile'
+      - '.github/workflows/build-docker-image.yml'
   schedule:
     - cron: '0 2 * * 0'  # UTC 02:00, Beijing 10:00, every Sunday
   workflow_dispatch:
diff --git a/.github/workflows/npu-full-test.yml b/.github/workflows/npu-full-test.yml
index 882ac26b63..753a375cb7 100644
--- a/.github/workflows/npu-full-test.yml
+++ b/.github/workflows/npu-full-test.yml
@@ -1,4 +1,4 @@
-name: PyTorch NPU Full Test
+name: PyTorch NPU Full Test(main 分支)
 
 on:
   push:

From e691ad9f954591ec09eec638157438aaa4080728 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 17:05:51 +0800
Subject: [PATCH 003/132] Use GitHub ARM runner for Docker image build

- Change runner to ubuntu-22.04-arm (supports Docker)
- Skip login and push for pull_request events (only build test)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/build-docker-image.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml
index 6876cf6e01..3e7a6b277a 100644
--- a/.github/workflows/build-docker-image.yml
+++ b/.github/workflows/build-docker-image.yml
@@ -24,7 +24,7 @@ env:
 
 jobs:
   build:
-    runs-on: linux-aarch64-a3-16
+    runs-on: ubuntu-22.04-arm
     permissions:
       contents: read
       packages: write
@@ -37,6 +37,7 @@ jobs:
         uses: docker/setup-buildx-action@v3
 
       - name: Log in to Container Registry
+        if: github.event_name != 'pull_request'
         uses: docker/login-action@v3
         with:
           registry: ${{ env.REGISTRY }}
@@ -54,7 +55,7 @@ jobs:
         with:
           context: .github/docker
           file: .github/docker/pytorch-npu-builder.Dockerfile
-          push: true
+          push: ${{ github.event_name != 'pull_request' }}
           tags: |
             ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}
             ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:latest

From 4c0885140eeedcc83f509b1f2285a25c7896ae34 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 17:08:15 +0800
Subject: [PATCH 004/132] Fix Docker image tag lowercase requirement

- Convert repository_owner to lowercase using tr command
- Docker requires all image names to be lowercase

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/build-docker-image.yml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml
index 3e7a6b277a..263381d54c 100644
--- a/.github/workflows/build-docker-image.yml
+++ b/.github/workflows/build-docker-image.yml
@@ -48,7 +48,9 @@ jobs:
         id: tag
         run: |
           TIMESTAMP=$(date +%Y%m%d)
+          REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
           echo "tag=manylinux-cann${{ inputs.cann_version || '9.0.0-beta.2' }}-${TIMESTAMP}" >> $GITHUB_OUTPUT
+          echo "repo_owner=${REPO_OWNER}" >> $GITHUB_OUTPUT
 
       - name: Build and push Docker image
         uses: docker/build-push-action@v5
@@ -57,11 +59,11 @@ jobs:
           file: .github/docker/pytorch-npu-builder.Dockerfile
           push: ${{ github.event_name != 'pull_request' }}
           tags: |
-            ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}
-            ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:latest
+            ${{ env.REGISTRY }}/${{ steps.tag.outputs.repo_owner }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}
+            ${{ env.REGISTRY }}/${{ steps.tag.outputs.repo_owner }}/${{ env.IMAGE_NAME }}:latest
           cache-from: type=gha
           cache-to: type=gha,mode=max
 
       - name: Output image URL
         run: |
-          echo "Built image: ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}"
\ No newline at end of file
+          echo "Built image: ${{ env.REGISTRY }}/${{ steps.tag.outputs.repo_owner }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}"
\ No newline at end of file

From 659aa6a2b883d80039fb77aed7b558eb15291820 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 17:17:52 +0800
Subject: [PATCH 005/132] Use PyPA manylinux_2_28_aarch64 base image

- Change base image from ghcr.io/pytorch/manylinux-builder (private)
  to quay.io/pypa/manylinux_2_28_aarch64 (public, matches PyTorch main)
- Add necessary OS packages matching PyTorch's Dockerfile
- Set Python 3.11 from manylinux as default (PATH=/opt/python/cp311-cp311/bin)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/docker/pytorch-npu-builder.Dockerfile | 40 +++++++++++++++++--
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/.github/docker/pytorch-npu-builder.Dockerfile b/.github/docker/pytorch-npu-builder.Dockerfile
index 51faaa64d9..38b038afe6 100644
--- a/.github/docker/pytorch-npu-builder.Dockerfile
+++ b/.github/docker/pytorch-npu-builder.Dockerfile
@@ -1,5 +1,35 @@
-# 基于 PyTorch manylinux builder 镜像
-FROM ghcr.io/pytorch/manylinux-builder:aarch64
+# 基于 PyPA manylinux 2_28 aarch64 镜像 (与 PyTorch 主干一致)
+FROM quay.io/pypa/manylinux_2_28_aarch64
+
+# 安装必要的 OS 包
+RUN yum -y install epel-release && \
+    yum -y update && \
+    yum install -y \
+        autoconf \
+        automake \
+        bison \
+        bzip2 \
+        curl \
+        diffutils \
+        file \
+        git \
+        less \
+        libffi-devel \
+        libgomp \
+        make \
+        openssl-devel \
+        patch \
+        perl \
+        unzip \
+        util-linux \
+        wget \
+        which \
+        xz \
+        yasm \
+        zstd \
+        sudo && \
+    yum install -y --enablerepo=powertools ninja-build && \
+    rm -rf /var/cache/yum
 
 # 设置工作目录
 WORKDIR /root
@@ -24,5 +54,9 @@ ENV ASCEND_HOME=/usr/local/Ascend
 RUN printf '#!/bin/bash\nsource /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true\nsource /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true\n' > /etc/profile.d/cann_env.sh && \
     chmod +x /etc/profile.d/cann_env.sh
 
+# 设置 Python 3.11 为默认版本
+ENV PYTHON_VERSION=3.11
+ENV PATH=/opt/python/cp311-cp311/bin:$PATH
+
 # 预安装 pytest 等测试依赖
-RUN pip3.11 install pytest pytest-timeout pytest-xdist hypothesis pyyaml zstandard
\ No newline at end of file
+RUN pip install pytest pytest-timeout pytest-xdist hypothesis pyyaml zstandard
\ No newline at end of file

From 14b76c446a5093726da574e8dffb6a96553a9486 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 17:21:35 +0800
Subject: [PATCH 006/132] Align Dockerfile with PyTorch official manywheel
 image

- Add gcc-toolset-13 for modern GCC toolchain (required for PyTorch build)
- Add language environment variables (LC_ALL, LANG, LANGUAGE)
- Add git safe.directory config for bind-mounted repos
- Remove pytest dependencies (install at test time, not in build image)
- Match PyTorch's .ci/docker/manywheel/Dockerfile_2_28_aarch64

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/docker/pytorch-npu-builder.Dockerfile | 27 ++++++++++++++-----
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/.github/docker/pytorch-npu-builder.Dockerfile b/.github/docker/pytorch-npu-builder.Dockerfile
index 38b038afe6..46e6347a80 100644
--- a/.github/docker/pytorch-npu-builder.Dockerfile
+++ b/.github/docker/pytorch-npu-builder.Dockerfile
@@ -1,7 +1,14 @@
 # 基于 PyPA manylinux 2_28 aarch64 镜像 (与 PyTorch 主干一致)
 FROM quay.io/pypa/manylinux_2_28_aarch64
 
-# 安装必要的 OS 包
+ARG GCCTOOLSET_VERSION=13
+
+# Language variables
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
+
+# 安装必要的 OS 包 (与 PyTorch 官方 Dockerfile 一致)
 RUN yum -y install epel-release && \
     yum -y update && \
     yum install -y \
@@ -27,10 +34,21 @@ RUN yum -y install epel-release && \
         xz \
         yasm \
         zstd \
-        sudo && \
+        sudo \
+        gcc-toolset-${GCCTOOLSET_VERSION}-gcc \
+        gcc-toolset-${GCCTOOLSET_VERSION}-gcc-c++ \
+        gcc-toolset-${GCCTOOLSET_VERSION}-gcc-gfortran \
+        gcc-toolset-${GCCTOOLSET_VERSION}-gdb && \
     yum install -y --enablerepo=powertools ninja-build && \
     rm -rf /var/cache/yum
 
+# 确保使用正确的 devtoolset
+ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+# git 2.36+ 需要配置 safe.directory
+RUN git config --global --add safe.directory "*"
+
 # 设置工作目录
 WORKDIR /root
 
@@ -56,7 +74,4 @@ RUN printf '#!/bin/bash\nsource /usr/local/Ascend/cann/set_env.sh 2>/dev/null ||
 
 # 设置 Python 3.11 为默认版本
 ENV PYTHON_VERSION=3.11
-ENV PATH=/opt/python/cp311-cp311/bin:$PATH
-
-# 预安装 pytest 等测试依赖
-RUN pip install pytest pytest-timeout pytest-xdist hypothesis pyyaml zstandard
\ No newline at end of file
+ENV PATH=/opt/python/cp311-cp311/bin:$PATH
\ No newline at end of file

From 3908f29b5066608857e7f48332cfed2bd7eb2d0d Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 17:49:40 +0800
Subject: [PATCH 007/132] Move Python PATH setup before CANN installation

CANN toolkit installation requires Python environment.
Previously Python PATH was set after CANN installation,
causing cann-ge-compiler install failure (exit code 4).

Now Python 3.11 PATH is configured before CANN installation.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/docker/pytorch-npu-builder.Dockerfile | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/docker/pytorch-npu-builder.Dockerfile b/.github/docker/pytorch-npu-builder.Dockerfile
index 46e6347a80..75885be334 100644
--- a/.github/docker/pytorch-npu-builder.Dockerfile
+++ b/.github/docker/pytorch-npu-builder.Dockerfile
@@ -49,6 +49,10 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/op
 # git 2.36+ 需要配置 safe.directory
 RUN git config --global --add safe.directory "*"
 
+# 设置 Python 3.11 为默认版本 (CANN 安装需要 Python 环境)
+ENV PYTHON_VERSION=3.11
+ENV PATH=/opt/python/cp311-cp311/bin:$PATH
+
 # 设置工作目录
 WORKDIR /root
 
@@ -70,8 +74,4 @@ ENV ASCEND_HOME=/usr/local/Ascend
 
 # 添加 CANN 环境初始化脚本
 RUN printf '#!/bin/bash\nsource /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true\nsource /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true\n' > /etc/profile.d/cann_env.sh && \
-    chmod +x /etc/profile.d/cann_env.sh
-
-# 设置 Python 3.11 为默认版本
-ENV PYTHON_VERSION=3.11
-ENV PATH=/opt/python/cp311-cp311/bin:$PATH
\ No newline at end of file
+    chmod +x /etc/profile.d/cann_env.sh
\ No newline at end of file

From 204ab4940ac86fd9b8014611a4257d621e14f358 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 18:13:27 +0800
Subject: [PATCH 008/132] Source CANN environment before nnal installation

NNAL package (atb) requires CANN environment variables to be set.
Add 'source /usr/local/Ascend/cann/set_env.sh' before nnal install.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/docker/pytorch-npu-builder.Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/docker/pytorch-npu-builder.Dockerfile b/.github/docker/pytorch-npu-builder.Dockerfile
index 75885be334..a4fa701667 100644
--- a/.github/docker/pytorch-npu-builder.Dockerfile
+++ b/.github/docker/pytorch-npu-builder.Dockerfile
@@ -64,6 +64,7 @@ RUN mkdir -p cann && cd cann && \
     chmod +x Ascend-cann*.run && \
     ./Ascend-cann-toolkit*.run --full --quiet --install-path=/usr/local/Ascend && \
     ./Ascend-cann-A3*.run --install --quiet --install-path=/usr/local/Ascend && \
+    source /usr/local/Ascend/cann/set_env.sh && \
     ./Ascend-cann-nnal*.run --install --quiet --install-path=/usr/local/Ascend && \
     rm -rf cann
 

From 9445b3331d4a1f7cc9116d53813111b027d7f6b0 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 18:55:22 +0800
Subject: [PATCH 009/132] Fix Docker image URL lowercase in _build.yml

Use 'ascend' (lowercase) instead of github.repository_owner
to match the actual image name pushed to ghcr.io.

Docker requires image names to be lowercase.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml
index b9122bd688..b9cf54de12 100644
--- a/.github/workflows/_build.yml
+++ b/.github/workflows/_build.yml
@@ -39,14 +39,14 @@ jobs:
       docker-image: ${{ steps.set_image.outputs.docker-image }}
 
     container:
-      image: ${{ env.REGISTRY }}/${{ github.repository_owner }}/pytorch-npu-builder:${{ inputs.docker_image_tag }}
+      image: ${{ env.REGISTRY }}/ascend/pytorch-npu-builder:${{ inputs.docker_image_tag }}
       options: --user root
 
     steps:
       - name: Set Docker image URL
         id: set_image
         run: |
-          DOCKER_IMAGE="${{ env.REGISTRY }}/${{ github.repository_owner }}/pytorch-npu-builder:${{ inputs.docker_image_tag }}"
+          DOCKER_IMAGE="${{ env.REGISTRY }}/ascend/pytorch-npu-builder:${{ inputs.docker_image_tag }}"
           echo "docker-image=${DOCKER_IMAGE}" >> $GITHUB_OUTPUT
           echo "Using Docker image: ${DOCKER_IMAGE}"
 

From 8b134942a87c7fa612ad7c2286e167020556b305 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 18:57:57 +0800
Subject: [PATCH 010/132] Add pull_request trigger to npu-full-test workflow

Trigger when workflow files, scripts, or docker files are modified:
- .github/workflows/** (workflow files)
- .github/scripts/** (Python scripts)
- .github/docker/** (Dockerfile)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/npu-full-test.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/npu-full-test.yml b/.github/workflows/npu-full-test.yml
index 753a375cb7..5f0708f5da 100644
--- a/.github/workflows/npu-full-test.yml
+++ b/.github/workflows/npu-full-test.yml
@@ -7,6 +7,11 @@ on:
       - '.github/workflows/**'
       - '.github/scripts/**'
       - '.github/docker/**'
+  pull_request:
+    paths:
+      - '.github/workflows/**'
+      - '.github/scripts/**'
+      - '.github/docker/**'
   schedule:
     - cron: '0 22 * * 1'  # UTC 22:00, Beijing 06:00, every Monday
   workflow_dispatch:

From 2a6337756aa8a28c6b662f5b8c9f8fe2a9a65241 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 19:01:25 +0800
Subject: [PATCH 011/132] Remove env.REGISTRY from reusable workflow _build.yml

In reusable workflows (workflow_call), the env context is not
available in container configuration. Replace env.REGISTRY with
hardcoded ghcr.io value.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build.yml | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml
index b9cf54de12..781cf0d467 100644
--- a/.github/workflows/_build.yml
+++ b/.github/workflows/_build.yml
@@ -29,9 +29,6 @@ on:
         description: 'Test source artifact name'
         value: 'test-src-main'
 
-env:
-  REGISTRY: ghcr.io
-
 jobs:
   build:
     runs-on: linux-aarch64-a3-16
@@ -39,14 +36,14 @@ jobs:
       docker-image: ${{ steps.set_image.outputs.docker-image }}
 
     container:
-      image: ${{ env.REGISTRY }}/ascend/pytorch-npu-builder:${{ inputs.docker_image_tag }}
+      image: ghcr.io/ascend/pytorch-npu-builder:${{ inputs.docker_image_tag }}
       options: --user root
 
     steps:
       - name: Set Docker image URL
         id: set_image
         run: |
-          DOCKER_IMAGE="${{ env.REGISTRY }}/ascend/pytorch-npu-builder:${{ inputs.docker_image_tag }}"
+          DOCKER_IMAGE="ghcr.io/ascend/pytorch-npu-builder:${{ inputs.docker_image_tag }}"
           echo "docker-image=${DOCKER_IMAGE}" >> $GITHUB_OUTPUT
           echo "Using Docker image: ${DOCKER_IMAGE}"
 

From 50e5dd4d5739f129223b210d9fe163dd633ab3d0 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 19:26:01 +0800
Subject: [PATCH 012/132] Add container credentials for private ghcr.io image
 pull

ghcr.io images are private by default and require authentication.
Add credentials configuration to all reusable workflows that pull
the pytorch-npu-builder image:
- _build.yml
- _collect.yml
- _test.yml

Uses github.actor and GITHUB_TOKEN for authentication.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build.yml   | 3 +++
 .github/workflows/_collect.yml | 3 +++
 .github/workflows/_test.yml    | 3 +++
 3 files changed, 9 insertions(+)

diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml
index 781cf0d467..a7bd9ddfba 100644
--- a/.github/workflows/_build.yml
+++ b/.github/workflows/_build.yml
@@ -38,6 +38,9 @@ jobs:
     container:
       image: ghcr.io/ascend/pytorch-npu-builder:${{ inputs.docker_image_tag }}
       options: --user root
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
 
     steps:
       - name: Set Docker image URL
diff --git a/.github/workflows/_collect.yml b/.github/workflows/_collect.yml
index b23febfe39..7d32ffb5d8 100644
--- a/.github/workflows/_collect.yml
+++ b/.github/workflows/_collect.yml
@@ -40,6 +40,9 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
       options: --user root
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
 
     steps:
       - name: Checkout scripts
diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
index ee882795b7..797bd38417 100644
--- a/.github/workflows/_test.yml
+++ b/.github/workflows/_test.yml
@@ -28,6 +28,9 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
       options: --user root
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
 
     steps:
       - name: Checkout scripts

From 32cf137dabd4a19a76cec35836de9110290dce1e Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 19:28:57 +0800
Subject: [PATCH 013/132] Remove pull_request trigger from build-docker-image
 workflow

Also remove related PR conditional checks since they are no longer needed:
- Remove 'if: github.event_name != 'pull_request' from login step
- Change 'push: ${{ github.event_name != 'pull_request' }}' to 'push: true'

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/build-docker-image.yml | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml
index 263381d54c..b6fb77ec57 100644
--- a/.github/workflows/build-docker-image.yml
+++ b/.github/workflows/build-docker-image.yml
@@ -6,10 +6,6 @@ on:
     paths:
       - '.github/docker/pytorch-npu-builder.Dockerfile'
       - '.github/workflows/build-docker-image.yml'
-  pull_request:
-    paths:
-      - '.github/docker/pytorch-npu-builder.Dockerfile'
-      - '.github/workflows/build-docker-image.yml'
   schedule:
     - cron: '0 2 * * 0'  # UTC 02:00, Beijing 10:00, every Sunday
   workflow_dispatch:
@@ -37,7 +33,6 @@ jobs:
         uses: docker/setup-buildx-action@v3
 
       - name: Log in to Container Registry
-        if: github.event_name != 'pull_request'
         uses: docker/login-action@v3
         with:
           registry: ${{ env.REGISTRY }}
@@ -57,7 +52,7 @@ jobs:
         with:
           context: .github/docker
           file: .github/docker/pytorch-npu-builder.Dockerfile
-          push: ${{ github.event_name != 'pull_request' }}
+          push: true
           tags: |
             ${{ env.REGISTRY }}/${{ steps.tag.outputs.repo_owner }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}
             ${{ env.REGISTRY }}/${{ steps.tag.outputs.repo_owner }}/${{ env.IMAGE_NAME }}:latest

From fceb4c6a5c95d7b5df5dbd13dde05b5681178c3e Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 19:34:41 +0800
Subject: [PATCH 014/132] Add step to set package visibility to public

Use GitHub API to set the container package visibility to public
after pushing the image. This allows anyone to pull the image
without authentication.

PATCH /orgs/{org}/packages/container/{package_name}
with {"visibility":"public"}

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/build-docker-image.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml
index b6fb77ec57..7512b0d8c2 100644
--- a/.github/workflows/build-docker-image.yml
+++ b/.github/workflows/build-docker-image.yml
@@ -59,6 +59,15 @@ jobs:
           cache-from: type=gha
           cache-to: type=gha,mode=max
 
+      - name: Set package visibility to public
+        run: |
+          curl -X PATCH \
+            -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
+            -H "Accept: application/vnd.github.v3+json" \
+            "https://api.github.com/orgs/${{ steps.tag.outputs.repo_owner }}/packages/container/${{ env.IMAGE_NAME }}" \
+            -d '{"visibility":"public"}'
+          echo "Package visibility set to public"
+
       - name: Output image URL
         run: |
           echo "Built image: ${{ env.REGISTRY }}/${{ steps.tag.outputs.repo_owner }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}"
\ No newline at end of file

From 9e50cddcf8456aed23475afe324b225ea77993c9 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 19:36:47 +0800
Subject: [PATCH 015/132] Restore pull_request trigger for build-docker-image
 workflow

- Add pull_request trigger for Dockerfile and workflow changes
- PR builds only test build, not push (push: ${{ github.event_name != 'pull_request' }})
- Skip visibility step for PR events

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/build-docker-image.yml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml
index 7512b0d8c2..5a872e1a8e 100644
--- a/.github/workflows/build-docker-image.yml
+++ b/.github/workflows/build-docker-image.yml
@@ -6,6 +6,10 @@ on:
     paths:
       - '.github/docker/pytorch-npu-builder.Dockerfile'
       - '.github/workflows/build-docker-image.yml'
+  pull_request:
+    paths:
+      - '.github/docker/pytorch-npu-builder.Dockerfile'
+      - '.github/workflows/build-docker-image.yml'
   schedule:
     - cron: '0 2 * * 0'  # UTC 02:00, Beijing 10:00, every Sunday
   workflow_dispatch:
@@ -52,7 +56,7 @@ jobs:
         with:
           context: .github/docker
           file: .github/docker/pytorch-npu-builder.Dockerfile
-          push: true
+          push: ${{ github.event_name != 'pull_request' }}
           tags: |
             ${{ env.REGISTRY }}/${{ steps.tag.outputs.repo_owner }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}
             ${{ env.REGISTRY }}/${{ steps.tag.outputs.repo_owner }}/${{ env.IMAGE_NAME }}:latest
@@ -60,6 +64,7 @@ jobs:
           cache-to: type=gha,mode=max
 
       - name: Set package visibility to public
+        if: github.event_name != 'pull_request'
         run: |
           curl -X PATCH \
             -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \

From 58555da705274731d10d4148017826b85868f003 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 19:37:55 +0800
Subject: [PATCH 016/132] Remove container credentials from workflows

Since the Docker image is now set to public visibility,
authentication is no longer required to pull the image.

Remove credentials configuration from:
- _build.yml
- _collect.yml
- _test.yml

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build.yml   | 3 ---
 .github/workflows/_collect.yml | 3 ---
 .github/workflows/_test.yml    | 3 ---
 3 files changed, 9 deletions(-)

diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml
index a7bd9ddfba..781cf0d467 100644
--- a/.github/workflows/_build.yml
+++ b/.github/workflows/_build.yml
@@ -38,9 +38,6 @@ jobs:
     container:
       image: ghcr.io/ascend/pytorch-npu-builder:${{ inputs.docker_image_tag }}
       options: --user root
-      credentials:
-        username: ${{ github.actor }}
-        password: ${{ secrets.GITHUB_TOKEN }}
 
     steps:
       - name: Set Docker image URL
diff --git a/.github/workflows/_collect.yml b/.github/workflows/_collect.yml
index 7d32ffb5d8..b23febfe39 100644
--- a/.github/workflows/_collect.yml
+++ b/.github/workflows/_collect.yml
@@ -40,9 +40,6 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
       options: --user root
-      credentials:
-        username: ${{ github.actor }}
-        password: ${{ secrets.GITHUB_TOKEN }}
 
     steps:
       - name: Checkout scripts
diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
index 797bd38417..ee882795b7 100644
--- a/.github/workflows/_test.yml
+++ b/.github/workflows/_test.yml
@@ -28,9 +28,6 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
       options: --user root
-      credentials:
-        username: ${{ github.actor }}
-        password: ${{ secrets.GITHUB_TOKEN }}
 
     steps:
       - name: Checkout scripts

From c41ed0d1504e507d8484cd287cc1413c1c654c7a Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 19:40:46 +0800
Subject: [PATCH 017/132] Remove pull_request trigger from npu-full-test
 workflow

Full test workflow is resource-intensive and should only run on:
- Push to main/master branches
- Scheduled (weekly)
- Manual trigger

Not suitable for PR validation.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/npu-full-test.yml | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/.github/workflows/npu-full-test.yml b/.github/workflows/npu-full-test.yml
index 5f0708f5da..753a375cb7 100644
--- a/.github/workflows/npu-full-test.yml
+++ b/.github/workflows/npu-full-test.yml
@@ -7,11 +7,6 @@ on:
       - '.github/workflows/**'
       - '.github/scripts/**'
       - '.github/docker/**'
-  pull_request:
-    paths:
-      - '.github/workflows/**'
-      - '.github/scripts/**'
-      - '.github/docker/**'
   schedule:
     - cron: '0 22 * * 1'  # UTC 22:00, Beijing 06:00, every Monday
   workflow_dispatch:

From 958f4ba14e4268c4bc2c53ac321eb783f6f66d04 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 19:46:10 +0800
Subject: [PATCH 018/132] Enable push and public visibility for PR events (test
 phase)

For testing purposes:
- Set push: true (remove PR exclusion)
- Remove if condition from visibility step

This allows PR to also push and set image to public.

Note: Will revert to production config after testing.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/build-docker-image.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml
index 5a872e1a8e..7b0c013da4 100644
--- a/.github/workflows/build-docker-image.yml
+++ b/.github/workflows/build-docker-image.yml
@@ -56,7 +56,7 @@ jobs:
         with:
           context: .github/docker
           file: .github/docker/pytorch-npu-builder.Dockerfile
-          push: ${{ github.event_name != 'pull_request' }}
+          push: true
           tags: |
             ${{ env.REGISTRY }}/${{ steps.tag.outputs.repo_owner }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}
             ${{ env.REGISTRY }}/${{ steps.tag.outputs.repo_owner }}/${{ env.IMAGE_NAME }}:latest
@@ -64,7 +64,6 @@ jobs:
           cache-to: type=gha,mode=max
 
       - name: Set package visibility to public
-        if: github.event_name != 'pull_request'
         run: |
           curl -X PATCH \
             -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \

From 9fcb9f6e5afc32aeba96972b473117e9fdf76de2 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 20:20:48 +0800
Subject: [PATCH 019/132] Change container registry from ghcr.io to quay.io

- Registry: quay.io
- Organization: kerer
- Image: pytorch-npu-builder

Changes:
- Use QUAY_USERNAME and QUAY_PASSWORD secrets for authentication
- Remove ghcr.io specific settings (visibility API call, packages:write permission)
- Update _build.yml to use quay.io/kerer/pytorch-npu-builder

Required secrets:
- QUAY_USERNAME: Robot account username (e.g., kerer+robot_name)
- QUAY_PASSWORD: Robot account token/password

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build.yml             |  4 ++--
 .github/workflows/build-docker-image.yml | 28 ++++++++----------------
 2 files changed, 11 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml
index 781cf0d467..1bc04eebda 100644
--- a/.github/workflows/_build.yml
+++ b/.github/workflows/_build.yml
@@ -36,14 +36,14 @@ jobs:
       docker-image: ${{ steps.set_image.outputs.docker-image }}
 
     container:
-      image: ghcr.io/ascend/pytorch-npu-builder:${{ inputs.docker_image_tag }}
+      image: quay.io/kerer/pytorch-npu-builder:${{ inputs.docker_image_tag }}
       options: --user root
 
     steps:
       - name: Set Docker image URL
         id: set_image
         run: |
-          DOCKER_IMAGE="ghcr.io/ascend/pytorch-npu-builder:${{ inputs.docker_image_tag }}"
+          DOCKER_IMAGE="quay.io/kerer/pytorch-npu-builder:${{ inputs.docker_image_tag }}"
           echo "docker-image=${DOCKER_IMAGE}" >> $GITHUB_OUTPUT
           echo "Using Docker image: ${DOCKER_IMAGE}"
 
diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml
index 7b0c013da4..043c3ea645 100644
--- a/.github/workflows/build-docker-image.yml
+++ b/.github/workflows/build-docker-image.yml
@@ -19,15 +19,15 @@ on:
         default: '9.0.0-beta.2'
 
 env:
-  REGISTRY: ghcr.io
+  REGISTRY: quay.io
   IMAGE_NAME: pytorch-npu-builder
+  QUAY_ORG: kerer
 
 jobs:
   build:
     runs-on: ubuntu-22.04-arm
     permissions:
       contents: read
-      packages: write
 
     steps:
       - name: Checkout repository
@@ -36,20 +36,18 @@ jobs:
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
 
-      - name: Log in to Container Registry
+      - name: Log in to Quay.io Container Registry
         uses: docker/login-action@v3
         with:
           registry: ${{ env.REGISTRY }}
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
+          username: ${{ secrets.QUAY_USERNAME }}
+          password: ${{ secrets.QUAY_PASSWORD }}
 
       - name: Generate timestamp tag
         id: tag
         run: |
           TIMESTAMP=$(date +%Y%m%d)
-          REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
           echo "tag=manylinux-cann${{ inputs.cann_version || '9.0.0-beta.2' }}-${TIMESTAMP}" >> $GITHUB_OUTPUT
-          echo "repo_owner=${REPO_OWNER}" >> $GITHUB_OUTPUT
 
       - name: Build and push Docker image
         uses: docker/build-push-action@v5
@@ -58,20 +56,12 @@ jobs:
           file: .github/docker/pytorch-npu-builder.Dockerfile
           push: true
           tags: |
-            ${{ env.REGISTRY }}/${{ steps.tag.outputs.repo_owner }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}
-            ${{ env.REGISTRY }}/${{ steps.tag.outputs.repo_owner }}/${{ env.IMAGE_NAME }}:latest
+            ${{ env.REGISTRY }}/${{ env.QUAY_ORG }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}
+            ${{ env.REGISTRY }}/${{ env.QUAY_ORG }}/${{ env.IMAGE_NAME }}:latest
           cache-from: type=gha
           cache-to: type=gha,mode=max
 
-      - name: Set package visibility to public
-        run: |
-          curl -X PATCH \
-            -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
-            -H "Accept: application/vnd.github.v3+json" \
-            "https://api.github.com/orgs/${{ steps.tag.outputs.repo_owner }}/packages/container/${{ env.IMAGE_NAME }}" \
-            -d '{"visibility":"public"}'
-          echo "Package visibility set to public"
-
       - name: Output image URL
         run: |
-          echo "Built image: ${{ env.REGISTRY }}/${{ steps.tag.outputs.repo_owner }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}"
\ No newline at end of file
+          echo "Built image: ${{ env.REGISTRY }}/${{ env.QUAY_ORG }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}"
+          echo "Built image: ${{ env.REGISTRY }}/${{ env.QUAY_ORG }}/${{ env.IMAGE_NAME }}:latest"
\ No newline at end of file

From 98194c20be8e72b9194a0770466e245ff33df8d1 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 20:33:29 +0800
Subject: [PATCH 020/132] Simplify workflow trigger to push on dev_master only

- Remove pull_request trigger
- Remove schedule trigger
- Remove workflow_dispatch trigger
- Keep only push on dev_master branch
- Use env.CANN_VERSION instead of input parameter

Trigger conditions:
- Push to dev_master branch
- Changes to Dockerfile or workflow file

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/build-docker-image.yml | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml
index 043c3ea645..c965cddad1 100644
--- a/.github/workflows/build-docker-image.yml
+++ b/.github/workflows/build-docker-image.yml
@@ -2,26 +2,16 @@ name: Build Docker Image
 
 on:
   push:
-    branches: [main, master]
+    branches: [dev_master]
     paths:
       - '.github/docker/pytorch-npu-builder.Dockerfile'
       - '.github/workflows/build-docker-image.yml'
-  pull_request:
-    paths:
-      - '.github/docker/pytorch-npu-builder.Dockerfile'
-      - '.github/workflows/build-docker-image.yml'
-  schedule:
-    - cron: '0 2 * * 0'  # UTC 02:00, Beijing 10:00, every Sunday
-  workflow_dispatch:
-    inputs:
-      cann_version:
-        description: 'CANN version'
-        default: '9.0.0-beta.2'
 
 env:
   REGISTRY: quay.io
   IMAGE_NAME: pytorch-npu-builder
   QUAY_ORG: kerer
+  CANN_VERSION: '9.0.0-beta.2'
 
 jobs:
   build:
@@ -47,7 +37,7 @@ jobs:
         id: tag
         run: |
           TIMESTAMP=$(date +%Y%m%d)
-          echo "tag=manylinux-cann${{ inputs.cann_version || '9.0.0-beta.2' }}-${TIMESTAMP}" >> $GITHUB_OUTPUT
+          echo "tag=manylinux-cann${{ env.CANN_VERSION }}-${TIMESTAMP}" >> $GITHUB_OUTPUT
 
       - name: Build and push Docker image
         uses: docker/build-push-action@v5

From a6fb9e2be105bef5f61cbe8972987fecb5734ae3 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 20:35:38 +0800
Subject: [PATCH 021/132] Change image name from pytorch-npu-builder to pytorch

Target registry: quay.io/kerer/pytorch

- build-docker-image.yml: IMAGE_NAME=pytorch
- _build.yml: Update container image reference

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build.yml             | 4 ++--
 .github/workflows/build-docker-image.yml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml
index 1bc04eebda..b058cfb703 100644
--- a/.github/workflows/_build.yml
+++ b/.github/workflows/_build.yml
@@ -36,14 +36,14 @@ jobs:
       docker-image: ${{ steps.set_image.outputs.docker-image }}
 
     container:
-      image: quay.io/kerer/pytorch-npu-builder:${{ inputs.docker_image_tag }}
+      image: quay.io/kerer/pytorch:${{ inputs.docker_image_tag }}
       options: --user root
 
     steps:
       - name: Set Docker image URL
         id: set_image
         run: |
-          DOCKER_IMAGE="quay.io/kerer/pytorch-npu-builder:${{ inputs.docker_image_tag }}"
+          DOCKER_IMAGE="quay.io/kerer/pytorch:${{ inputs.docker_image_tag }}"
           echo "docker-image=${DOCKER_IMAGE}" >> $GITHUB_OUTPUT
           echo "Using Docker image: ${DOCKER_IMAGE}"
 
diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml
index c965cddad1..109ccb3b6b 100644
--- a/.github/workflows/build-docker-image.yml
+++ b/.github/workflows/build-docker-image.yml
@@ -9,7 +9,7 @@ on:
 
 env:
   REGISTRY: quay.io
-  IMAGE_NAME: pytorch-npu-builder
+  IMAGE_NAME: pytorch
   QUAY_ORG: kerer
   CANN_VERSION: '9.0.0-beta.2'
 

From ed1c8f5627f44605a0a1d438ab27c45240eda377 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 20:38:53 +0800
Subject: [PATCH 022/132] Add environment: quay to access environment secrets

Environment secrets require specifying the environment name
in the job configuration to be accessible.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/build-docker-image.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml
index 109ccb3b6b..836e57468b 100644
--- a/.github/workflows/build-docker-image.yml
+++ b/.github/workflows/build-docker-image.yml
@@ -16,6 +16,7 @@ env:
 jobs:
   build:
     runs-on: ubuntu-22.04-arm
+    environment: quay
     permissions:
       contents: read
 

From 8f81fb709b0bd49d5edf95fb1fbe158862001806 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 20:39:52 +0800
Subject: [PATCH 023/132] Set environment to QUAY_USERNAME to access secrets

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/build-docker-image.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml
index 836e57468b..c406458b00 100644
--- a/.github/workflows/build-docker-image.yml
+++ b/.github/workflows/build-docker-image.yml
@@ -16,7 +16,7 @@ env:
 jobs:
   build:
     runs-on: ubuntu-22.04-arm
-    environment: quay
+    environment: QUAY_USERNAME
     permissions:
       contents: read
 

From 9bf616e14e64288779ab585880778d429f45073b Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 28 Apr 2026 21:26:29 +0800
Subject: [PATCH 024/132] Change workflow trigger to pull_request and use
 quay.io registry

- Trigger on PR to main/master instead of push and schedule
- Change REGISTRY from ghcr.io to quay.io

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/npu-full-test.yml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/npu-full-test.yml b/.github/workflows/npu-full-test.yml
index 753a375cb7..a99a624ce9 100644
--- a/.github/workflows/npu-full-test.yml
+++ b/.github/workflows/npu-full-test.yml
@@ -1,14 +1,12 @@
 name: PyTorch NPU Full Test(main 分支)
 
 on:
-  push:
+  pull_request:
     branches: [main, master]
     paths:
       - '.github/workflows/**'
       - '.github/scripts/**'
       - '.github/docker/**'
-  schedule:
-    - cron: '0 22 * * 1'  # UTC 22:00, Beijing 06:00, every Monday
   workflow_dispatch:
     inputs:
       docker_image_tag:
@@ -27,7 +25,7 @@ on:
         type: number
 
 env:
-  REGISTRY: ghcr.io
+  REGISTRY: quay.io
   PYTHON_VERSION: '3.11'
 
 jobs:

From bd329c89d4c0039c656d4c1c22d5f61b18e7142b Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@users.noreply.gitcode.com>
Date: Tue, 28 Apr 2026 23:59:01 +0800
Subject: [PATCH 025/132] Add workflow to verify pulling quay.io docker image

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/verify-docker-image.yml | 47 +++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 .github/workflows/verify-docker-image.yml

diff --git a/.github/workflows/verify-docker-image.yml b/.github/workflows/verify-docker-image.yml
new file mode 100644
index 0000000000..fec8435da5
--- /dev/null
+++ b/.github/workflows/verify-docker-image.yml
@@ -0,0 +1,47 @@
+name: Verify Docker Image Pull
+
+on:
+  workflow_dispatch:
+    inputs:
+      docker_image:
+        description: 'Docker image to verify'
+        default: 'quay.io/kerer/pytorch:manylinux-cann9.0.0-beta.2-20260428'
+        type: string
+
+jobs:
+  verify-image:
+    runs-on: linux-aarch64-a3-16
+    timeout-minutes: 30
+
+    steps:
+      - name: Pull Docker image
+        run: |
+          echo "Pulling image: ${{ inputs.docker_image }}"
+          docker pull ${{ inputs.docker_image }}
+
+      - name: Verify image exists
+        run: |
+          docker images ${{ inputs.docker_image }}
+
+      - name: Check CANN environment
+        run: |
+          docker run --rm ${{ inputs.docker_image }} \
+            bash -c "source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true && \
+                     echo 'CANN_PATH: $CANN_PATH' && \
+                     ls -la /usr/local/Ascend/cann/ | head -20"
+
+      - name: Check Python version
+        run: |
+          docker run --rm ${{ inputs.docker_image }} \
+            bash -c "python3.11 --version && pip3.11 list | grep -E 'torch|pytest'"
+
+      - name: Verify CANN toolkit installation
+        run: |
+          docker run --rm ${{ inputs.docker_image }} \
+            bash -c "ls -la /usr/local/Ascend/ && \
+                     cat /usr/local/Ascend/cann/version.info 2>/dev/null || echo 'version.info not found'"
+
+      - name: Clean up
+        if: always()
+        run: |
+          docker rmi ${{ inputs.docker_image }} 2>/dev/null || true
\ No newline at end of file

From ab578e26e17583161f154083a581d90e864371ac Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@users.noreply.gitcode.com>
Date: Wed, 29 Apr 2026 00:01:40 +0800
Subject: [PATCH 026/132] Add push trigger to verify-docker-image workflow

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/verify-docker-image.yml | 27 +++++++++++++++++------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/verify-docker-image.yml b/.github/workflows/verify-docker-image.yml
index fec8435da5..93a0aa204d 100644
--- a/.github/workflows/verify-docker-image.yml
+++ b/.github/workflows/verify-docker-image.yml
@@ -1,6 +1,10 @@
 name: Verify Docker Image Pull
 
 on:
+  push:
+    branches: [dev_master]
+    paths:
+      - '.github/workflows/verify-docker-image.yml'
   workflow_dispatch:
     inputs:
       docker_image:
@@ -8,40 +12,49 @@ on:
         default: 'quay.io/kerer/pytorch:manylinux-cann9.0.0-beta.2-20260428'
         type: string
 
+env:
+  DEFAULT_IMAGE: 'quay.io/kerer/pytorch:manylinux-cann9.0.0-beta.2-20260428'
+
 jobs:
   verify-image:
     runs-on: linux-aarch64-a3-16
     timeout-minutes: 30
 
     steps:
+      - name: Set image name
+        id: image
+        run: |
+          IMAGE="${{ inputs.docker_image || env.DEFAULT_IMAGE }}"
+          echo "image=${IMAGE}" >> $GITHUB_OUTPUT
+          echo "Using image: ${IMAGE}"
+
       - name: Pull Docker image
         run: |
-          echo "Pulling image: ${{ inputs.docker_image }}"
-          docker pull ${{ inputs.docker_image }}
+          docker pull ${{ steps.image.outputs.image }}
 
       - name: Verify image exists
         run: |
-          docker images ${{ inputs.docker_image }}
+          docker images ${{ steps.image.outputs.image }}
 
       - name: Check CANN environment
         run: |
-          docker run --rm ${{ inputs.docker_image }} \
+          docker run --rm ${{ steps.image.outputs.image }} \
             bash -c "source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true && \
                      echo 'CANN_PATH: $CANN_PATH' && \
                      ls -la /usr/local/Ascend/cann/ | head -20"
 
       - name: Check Python version
         run: |
-          docker run --rm ${{ inputs.docker_image }} \
+          docker run --rm ${{ steps.image.outputs.image }} \
             bash -c "python3.11 --version && pip3.11 list | grep -E 'torch|pytest'"
 
       - name: Verify CANN toolkit installation
         run: |
-          docker run --rm ${{ inputs.docker_image }} \
+          docker run --rm ${{ steps.image.outputs.image }} \
             bash -c "ls -la /usr/local/Ascend/ && \
                      cat /usr/local/Ascend/cann/version.info 2>/dev/null || echo 'version.info not found'"
 
       - name: Clean up
         if: always()
         run: |
-          docker rmi ${{ inputs.docker_image }} 2>/dev/null || true
\ No newline at end of file
+          docker rmi ${{ steps.image.outputs.image }} 2>/dev/null || true
\ No newline at end of file

From 3abe5a8420a402c8b8ca37fec33e1b24656dc7af Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@users.noreply.gitcode.com>
Date: Wed, 29 Apr 2026 00:03:35 +0800
Subject: [PATCH 027/132] Use GitHub free ubuntu-22.04-arm runner for
 verification

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/verify-docker-image.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/verify-docker-image.yml b/.github/workflows/verify-docker-image.yml
index 93a0aa204d..6495336978 100644
--- a/.github/workflows/verify-docker-image.yml
+++ b/.github/workflows/verify-docker-image.yml
@@ -17,7 +17,7 @@ env:
 
 jobs:
   verify-image:
-    runs-on: linux-aarch64-a3-16
+    runs-on: ubuntu-22.04-arm
     timeout-minutes: 30
 
     steps:

From bde1eb1ca143d7aacdb4ef6ef0dc4b1e4932ebc4 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@users.noreply.gitcode.com>
Date: Wed, 29 Apr 2026 00:17:49 +0800
Subject: [PATCH 028/132] Fix grep failure when no torch/pytest packages
 installed

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/verify-docker-image.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/verify-docker-image.yml b/.github/workflows/verify-docker-image.yml
index 6495336978..ba3e4f2d04 100644
--- a/.github/workflows/verify-docker-image.yml
+++ b/.github/workflows/verify-docker-image.yml
@@ -46,7 +46,7 @@ jobs:
       - name: Check Python version
         run: |
           docker run --rm ${{ steps.image.outputs.image }} \
-            bash -c "python3.11 --version && pip3.11 list | grep -E 'torch|pytest'"
+            bash -c "python3.11 --version && pip3.11 list | grep -E 'torch|pytest' || echo 'No torch/pytest packages found'"
 
       - name: Verify CANN toolkit installation
         run: |

From 4c27b0039578d3988c98caa51b698ec0f5767c15 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@users.noreply.gitcode.com>
Date: Wed, 29 Apr 2026 00:27:38 +0800
Subject: [PATCH 029/132] Rename workflow and add test dependencies
 installation

- Rename npu-full-test.yml to npu-sync-test.yml
- Add timeout-minutes: 240 to _build.yml
- Add test dependencies installation step in _collect.yml and _test.yml
- Add --timeout 300 to run_npu_test_shard.py
- Remove torch/pytest check from verify-docker-image.yml

Test dependencies: pytest, pytest-timeout, pytest-xdist, hypothesis,
numpy, expecttest, pyyaml, zstandard, packaging

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build.yml                               | 1 +
 .github/workflows/_collect.yml                             | 6 ++++++
 .github/workflows/_test.yml                                | 7 +++++++
 .github/workflows/{npu-full-test.yml => npu-sync-test.yml} | 2 +-
 .github/workflows/verify-docker-image.yml                  | 2 +-
 5 files changed, 16 insertions(+), 2 deletions(-)
 rename .github/workflows/{npu-full-test.yml => npu-sync-test.yml} (98%)

diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml
index b058cfb703..0f4febe676 100644
--- a/.github/workflows/_build.yml
+++ b/.github/workflows/_build.yml
@@ -32,6 +32,7 @@ on:
 jobs:
   build:
     runs-on: linux-aarch64-a3-16
+    timeout-minutes: 240
     outputs:
       docker-image: ${{ steps.set_image.outputs.docker-image }}
 
diff --git a/.github/workflows/_collect.yml b/.github/workflows/_collect.yml
index b23febfe39..6233d8f4cf 100644
--- a/.github/workflows/_collect.yml
+++ b/.github/workflows/_collect.yml
@@ -72,6 +72,12 @@ jobs:
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
           pip${{ env.PYTHON_VERSION }} install wheels/*.whl
 
+      - name: Install test dependencies
+        run: |
+          pip${{ env.PYTHON_VERSION }} install \
+            pytest pytest-timeout pytest-xdist \
+            hypothesis numpy expecttest pyyaml zstandard packaging
+
       - name: Collect all test cases
         id: collect
         run: |
diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
index ee882795b7..b80970eff0 100644
--- a/.github/workflows/_test.yml
+++ b/.github/workflows/_test.yml
@@ -66,6 +66,12 @@ jobs:
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
           pip${{ env.PYTHON_VERSION }} install wheels/*.whl
 
+      - name: Install test dependencies
+        run: |
+          pip${{ env.PYTHON_VERSION }} install \
+            pytest pytest-timeout pytest-xdist \
+            hypothesis numpy expecttest pyyaml zstandard packaging
+
       - name: Run ${{ inputs.test-type }} shard ${{ inputs.shard-index }}
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
@@ -75,6 +81,7 @@ jobs:
             --cases-json cases-shards/${{ inputs.test-type }}_cases_shard_${{ inputs.shard-index }}.json \
             --test-dir pytorch-src/test \
             --report-dir test-reports \
+            --timeout 300 \
             --max-workers ${{ inputs.max-workers }} \
             --verbose
 
diff --git a/.github/workflows/npu-full-test.yml b/.github/workflows/npu-sync-test.yml
similarity index 98%
rename from .github/workflows/npu-full-test.yml
rename to .github/workflows/npu-sync-test.yml
index a99a624ce9..f618e43dc7 100644
--- a/.github/workflows/npu-full-test.yml
+++ b/.github/workflows/npu-sync-test.yml
@@ -1,4 +1,4 @@
-name: PyTorch NPU Full Test(main 分支)
+name: PyTorch NPU Sync Test
 
 on:
   pull_request:
diff --git a/.github/workflows/verify-docker-image.yml b/.github/workflows/verify-docker-image.yml
index ba3e4f2d04..5b29d15a36 100644
--- a/.github/workflows/verify-docker-image.yml
+++ b/.github/workflows/verify-docker-image.yml
@@ -46,7 +46,7 @@ jobs:
       - name: Check Python version
         run: |
           docker run --rm ${{ steps.image.outputs.image }} \
-            bash -c "python3.11 --version && pip3.11 list | grep -E 'torch|pytest' || echo 'No torch/pytest packages found'"
+            bash -c "python3.11 --version"
 
       - name: Verify CANN toolkit installation
         run: |

From c5beeb7ab46202243c3762080d4b052703290c54 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@users.noreply.gitcode.com>
Date: Wed, 29 Apr 2026 00:36:02 +0800
Subject: [PATCH 030/132] Use requirements-ci.txt for test dependencies (follow
 upstream PyTorch)

- Add comprehensive test dependencies based on upstream .ci/docker/requirements-ci.txt
- Include pytest plugins: pytest-xdist, pytest-flakefinder, pytest-rerunfailures, pytest-subtests, pytest-timeout
- Add test utilities: hypothesis, expecttest, parameterized
- Add scientific packages: numpy, scipy, scikit-image, pillow
- Add ONNX support: onnx, onnxscript, protobuf
- Update _collect.yml and _test.yml to use pip install -r .ci/docker/requirements-ci.txt

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .ci/docker/requirements-ci.txt | 56 ++++++++++++++++++++++++++++++----
 .github/workflows/_collect.yml |  8 ++---
 .github/workflows/_test.yml    |  8 ++---
 3 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
index 8602d4d0fa..e0968f422c 100644
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@@ -1,7 +1,51 @@
-# Python dependencies required for unit tests
+# Python dependencies required for NPU tests
+# Based on upstream PyTorch .ci/docker/requirements-ci.txt
 
-mypy==1.9.0
-# Pin MyPy version because new errors are likely to appear with each release
-#Description: linter
-#Pinned versions: 1.9.0
-#test that import: test_typing.py, test_type_hints.py
+# pytest and plugins
+pytest==7.3.2
+pytest-xdist==3.3.1
+pytest-flakefinder==1.1.0
+pytest-rerunfailures>=10.3
+pytest-subtests==0.13.1
+pytest-timeout>=2.3.1
+xdoctest==1.3.0
+
+# test utilities
+hypothesis==6.56.4
+expecttest==0.3.0
+parameterized==0.8.1
+
+# numpy (version per Python version)
+numpy==1.26.2; python_version >= "3.11" and python_version < "3.14"
+
+# scientific packages
+scipy==1.14.1; python_version > "3.11" and python_version < "3.14"
+scikit-image==0.22.0
+pillow==12.1.1
+pywavelets==1.7.0; python_version >= "3.12"
+
+# core utilities
+networkx==2.8.8
+optree==0.13.0; python_version < "3.14"
+opt-einsum==3.3
+filelock==3.20.3
+sympy==1.13.3
+
+# build/serialization
+pyyaml==6.0.3
+packaging==24.0
+typing-extensions==4.12.2; python_version < "3.14"
+pyzstd
+setuptools>=70.1.0,<82
+zstandard
+
+# ONNX support
+onnx==1.20.0
+onnxscript==0.6.2
+protobuf==6.33.5
+
+# misc
+psutil
+jinja2==3.1.6
+tqdm>=4.66.0
+click
\ No newline at end of file
diff --git a/.github/workflows/_collect.yml b/.github/workflows/_collect.yml
index 6233d8f4cf..b01028cb7e 100644
--- a/.github/workflows/_collect.yml
+++ b/.github/workflows/_collect.yml
@@ -45,7 +45,9 @@ jobs:
       - name: Checkout scripts
         uses: actions/checkout@v4
         with:
-          sparse-checkout: .github/scripts
+          sparse-checkout: |
+            .github/scripts
+            .ci/docker
 
       - name: Download wheels
         uses: actions/download-artifact@v4
@@ -74,9 +76,7 @@ jobs:
 
       - name: Install test dependencies
         run: |
-          pip${{ env.PYTHON_VERSION }} install \
-            pytest pytest-timeout pytest-xdist \
-            hypothesis numpy expecttest pyyaml zstandard packaging
+          pip${{ env.PYTHON_VERSION }} install -r .ci/docker/requirements-ci.txt
 
       - name: Collect all test cases
         id: collect
diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
index b80970eff0..851a7bb1f2 100644
--- a/.github/workflows/_test.yml
+++ b/.github/workflows/_test.yml
@@ -33,7 +33,9 @@ jobs:
       - name: Checkout scripts
         uses: actions/checkout@v4
         with:
-          sparse-checkout: .github/scripts
+          sparse-checkout: |
+            .github/scripts
+            .ci/docker
 
       - name: Download torch wheel
         uses: actions/download-artifact@v4
@@ -68,9 +70,7 @@ jobs:
 
       - name: Install test dependencies
         run: |
-          pip${{ env.PYTHON_VERSION }} install \
-            pytest pytest-timeout pytest-xdist \
-            hypothesis numpy expecttest pyyaml zstandard packaging
+          pip${{ env.PYTHON_VERSION }} install -r .ci/docker/requirements-ci.txt
 
       - name: Run ${{ inputs.test-type }} shard ${{ inputs.shard-index }}
         run: |

From f6b47c5735d9093567ae0a7950bf4649d0ac3e59 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 09:25:01 +0800
Subject: [PATCH 031/132] Use PyTorch requirements-build.txt and add
 distributed build support

- Use upstream PyTorch requirements-build.txt for dependency installation
- Add USE_DISTRIBUTED=1 to enable distributed training backend support
- Add USE_OPENMP=1 and USE_MKLDNN=0 for proper NPU build configuration
- Add missing build dependencies: numpy, packaging, requests, six, typing-extensions

This fixes the 910B3 network interface configuration error during PyTorch build
by ensuring all required dependencies are installed and distributed support is enabled.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build.yml | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml
index 0f4febe676..548f66273e 100644
--- a/.github/workflows/_build.yml
+++ b/.github/workflows/_build.yml
@@ -71,12 +71,19 @@ jobs:
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
 
           cd pytorch-src
-          pip${{ inputs.python_version }} install pyyaml setuptools wheel cmake ninja
 
+          # 安装构建依赖（使用上游 PyTorch requirements-build.txt）
+          pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
+          pip${{ inputs.python_version }} install -r requirements-build.txt
+
+          # 设置构建环境变量
           export MAX_JOBS=40
           export USE_CUDA=0
           export USE_CUDNN=0
+          export USE_DISTRIBUTED=1  # 启用分布式支持
           export CMAKE_BUILD_TYPE=Release
+          export USE_OPENMP=1       # 启用 OpenMP
+          export USE_MKLDNN=0       # 禁用 MKLDNN (NPU 版本)
 
           python${{ inputs.python_version }} setup.py build bdist_wheel
 
@@ -88,10 +95,14 @@ jobs:
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
           source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
 
+          # 安装刚编译的 PyTorch
           pip${{ inputs.python_version }} install pytorch-src/dist/*.whl
 
           cd torch_npu-src
-          pip${{ inputs.python_version }} install pyyaml setuptools wheel cmake ninja
+
+          # 安装构建依赖（与 PyTorch 构建保持一致）
+          pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
+          pip${{ inputs.python_version }} install cmake ninja numpy packaging pyyaml requests six typing-extensions
 
           export MAX_JOBS=40
           bash ci/build.sh --python=${{ inputs.python_version }}

From dd4e72fcb7c6885dc0cc755aa4ce3320c6172f8a Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 09:44:42 +0800
Subject: [PATCH 032/132] Restore original _build.yml and create new
 _build_torch_npu.yml with cache

- Restore _build.yml to original version (keep codebase unchanged)
- Create new _build_torch_npu.yml with:
  - PyTorch build cache based on SHA + requirements-build.txt hash
  - torch_npu build cache based on torch_npu SHA + PyTorch SHA
  - Use requirements-build.txt for dependency installation
  - Set USE_DISTRIBUTED=1 (required by torch_npu distributed module)
  - Set USE_OPENMP=1 (torch_npu CMakeLists.txt default)
  - Set USE_MKLDNN=0 (NPU doesn't need Intel CPU library)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build.yml           |  15 +-
 .github/workflows/_build_torch_npu.yml | 228 +++++++++++++++++++++++++
 2 files changed, 230 insertions(+), 13 deletions(-)
 create mode 100644 .github/workflows/_build_torch_npu.yml

diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml
index 548f66273e..0f4febe676 100644
--- a/.github/workflows/_build.yml
+++ b/.github/workflows/_build.yml
@@ -71,19 +71,12 @@ jobs:
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
 
           cd pytorch-src
+          pip${{ inputs.python_version }} install pyyaml setuptools wheel cmake ninja
 
-          # 安装构建依赖（使用上游 PyTorch requirements-build.txt）
-          pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
-          pip${{ inputs.python_version }} install -r requirements-build.txt
-
-          # 设置构建环境变量
           export MAX_JOBS=40
           export USE_CUDA=0
           export USE_CUDNN=0
-          export USE_DISTRIBUTED=1  # 启用分布式支持
           export CMAKE_BUILD_TYPE=Release
-          export USE_OPENMP=1       # 启用 OpenMP
-          export USE_MKLDNN=0       # 禁用 MKLDNN (NPU 版本)
 
           python${{ inputs.python_version }} setup.py build bdist_wheel
 
@@ -95,14 +88,10 @@ jobs:
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
           source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
 
-          # 安装刚编译的 PyTorch
           pip${{ inputs.python_version }} install pytorch-src/dist/*.whl
 
           cd torch_npu-src
-
-          # 安装构建依赖（与 PyTorch 构建保持一致）
-          pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
-          pip${{ inputs.python_version }} install cmake ninja numpy packaging pyyaml requests six typing-extensions
+          pip${{ inputs.python_version }} install pyyaml setuptools wheel cmake ninja
 
           export MAX_JOBS=40
           bash ci/build.sh --python=${{ inputs.python_version }}
diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
new file mode 100644
index 0000000000..40742fed43
--- /dev/null
+++ b/.github/workflows/_build_torch_npu.yml
@@ -0,0 +1,228 @@
+name: Build PyTorch and torch_npu (with cache)
+
+on:
+  workflow_call:
+    inputs:
+      pytorch_branch:
+        required: true
+        type: string
+        default: 'main'
+      python_version:
+        required: true
+        type: string
+        default: '3.11'
+      docker_image_tag:
+        required: true
+        type: string
+        description: 'Docker image tag with timestamp'
+    outputs:
+      docker-image:
+        description: 'Full Docker image URL'
+        value: ${{ jobs.build.outputs.docker-image }}
+      torch-wheel:
+        description: 'PyTorch wheel artifact name'
+        value: 'torch-wheel-main'
+      torch-npu-wheel:
+        description: 'torch_npu wheel artifact name'
+        value: 'torch-npu-wheel-main'
+      test-src:
+        description: 'Test source artifact name'
+        value: 'test-src-main'
+
+env:
+  # 缓存版本号，当需要强制刷新缓存时修改此值
+  CACHE_VERSION: 'v1'
+
+jobs:
+  build:
+    runs-on: linux-aarch64-a3-16
+    timeout-minutes: 240
+    outputs:
+      docker-image: ${{ steps.set_image.outputs.docker-image }}
+
+    container:
+      image: quay.io/kerer/pytorch:${{ inputs.docker_image_tag }}
+      options: --user root
+
+    steps:
+      - name: Set Docker image URL
+        id: set_image
+        run: |
+          DOCKER_IMAGE="quay.io/kerer/pytorch:${{ inputs.docker_image_tag }}"
+          echo "docker-image=${DOCKER_IMAGE}" >> $GITHUB_OUTPUT
+          echo "Using Docker image: ${DOCKER_IMAGE}"
+
+      - name: Setup CANN environment
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+      - name: Clone upstream PyTorch main
+        id: clone_pytorch
+        run: |
+          git clone --depth=1 --branch ${{ inputs.pytorch_branch }} \
+            https://github.com/pytorch/pytorch.git pytorch-src
+          PYTORCH_SHA=$(cd pytorch-src && git rev-parse HEAD)
+          echo "pytorch_sha=${PYTORCH_SHA}" >> $GITHUB_OUTPUT
+          echo "Cloned PyTorch commit: ${PYTORCH_SHA}"
+
+      - name: Checkout torch_npu
+        uses: actions/checkout@v4
+        with:
+          path: torch_npu-src
+          submodules: recursive
+
+      # ==================== PyTorch 构建缓存 ====================
+      - name: Get PyTorch build cache key
+        id: pytorch_cache_key
+        run: |
+          # 缓存键基于：PyTorch SHA + requirements-build.txt 内容 + 缓存版本
+          PYTORCH_SHA="${{ steps.clone_pytorch.outputs.pytorch_sha }}"
+          REQUIREMENTS_HASH=$(cd pytorch-src && sha256sum requirements-build.txt | cut -d' ' -f1)
+          echo "cache_key=${{ env.CACHE_VERSION }}-pytorch-${PYTORCH_SHA}-${REQUIREMENTS_HASH}" >> $GITHUB_OUTPUT
+
+      - name: Restore PyTorch build cache
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            pytorch-src/build
+            pytorch-src/dist
+            ~/.cache/pip
+          key: ${{ steps.pytorch_cache_key.outputs.cache_key }}
+          restore-keys: |
+            ${{ env.CACHE_VERSION }}-pytorch-
+
+      - name: Check if PyTorch wheel exists
+        id: check_pytorch_wheel
+        run: |
+          if ls pytorch-src/dist/*.whl 1>/dev/null 2>&1; then
+            echo "wheel_exists=true" >> $GITHUB_OUTPUT
+            echo "Found cached PyTorch wheel:"
+            ls -la pytorch-src/dist/
+          else
+            echo "wheel_exists=false" >> $GITHUB_OUTPUT
+            echo "No cached wheel found, will build PyTorch"
+          fi
+
+      - name: Build PyTorch wheel
+        if: steps.check_pytorch_wheel.outputs.wheel_exists != 'true'
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+
+          cd pytorch-src
+
+          # 安装构建依赖（使用上游 PyTorch requirements-build.txt）
+          pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
+          pip${{ inputs.python_version }} install -r requirements-build.txt
+
+          # 设置构建环境变量
+          export MAX_JOBS=40
+          export USE_CUDA=0
+          export USE_CUDNN=0
+          export USE_DISTRIBUTED=1  # 启用分布式支持（torch_npu 必需）
+          export CMAKE_BUILD_TYPE=Release
+          export USE_OPENMP=1       # 启用 OpenMP（torch_npu CMakeLists.txt 默认使用）
+          export USE_MKLDNN=0       # 禁用 MKLDNN（NPU 不需要 Intel CPU 库）
+
+          python${{ inputs.python_version }} setup.py build bdist_wheel
+
+          echo "PyTorch wheel built:"
+          ls -la dist/
+
+      - name: Save PyTorch build cache
+        if: steps.check_pytorch_wheel.outputs.wheel_exists != 'true' && always()
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            pytorch-src/build
+            pytorch-src/dist
+            ~/.cache/pip
+          key: ${{ steps.pytorch_cache_key.outputs.cache_key }}
+
+      # ==================== torch_npu 构建缓存 ====================
+      - name: Get torch_npu build cache key
+        id: torch_npu_cache_key
+        run: |
+          # 缓存键基于：torch_npu 仓库 SHA + PyTorch SHA + 缓存版本
+          TORCH_NPU_SHA=$(cd torch_npu-src && git rev-parse HEAD)
+          PYTORCH_SHA="${{ steps.clone_pytorch.outputs.pytorch_sha }}"
+          echo "cache_key=${{ env.CACHE_VERSION }}-torch_npu-${TORCH_NPU_SHA}-${PYTORCH_SHA}" >> $GITHUB_OUTPUT
+
+      - name: Restore torch_npu build cache
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            torch_npu-src/build
+            torch_npu-src/dist
+          key: ${{ steps.torch_npu_cache_key.outputs.cache_key }}
+          restore-keys: |
+            ${{ env.CACHE_VERSION }}-torch_npu-
+
+      - name: Check if torch_npu wheel exists
+        id: check_torch_npu_wheel
+        run: |
+          if ls torch_npu-src/dist/*.whl 1>/dev/null 2>&1; then
+            echo "wheel_exists=true" >> $GITHUB_OUTPUT
+            echo "Found cached torch_npu wheel:"
+            ls -la torch_npu-src/dist/
+          else
+            echo "wheel_exists=false" >> $GITHUB_OUTPUT
+            echo "No cached wheel found, will build torch_npu"
+          fi
+
+      - name: Build torch_npu wheel
+        if: steps.check_torch_npu_wheel.outputs.wheel_exists != 'true'
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          # 安装刚编译的 PyTorch
+          pip${{ inputs.python_version }} install pytorch-src/dist/*.whl
+
+          cd torch_npu-src
+
+          # 安装构建依赖（与 PyTorch 构建保持一致）
+          pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
+          pip${{ inputs.python_version }} install cmake ninja numpy packaging pyyaml requests six typing-extensions
+
+          export MAX_JOBS=40
+          bash ci/build.sh --python=${{ inputs.python_version }}
+
+          echo "torch_npu wheel built:"
+          ls -la dist/
+
+      - name: Save torch_npu build cache
+        if: steps.check_torch_npu_wheel.outputs.wheel_exists != 'true' && always()
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            torch_npu-src/build
+            torch_npu-src/dist
+          key: ${{ steps.torch_npu_cache_key.outputs.cache_key }}
+
+      # ==================== 打包和上传 ====================
+      - name: Package test source
+        run: |
+          tar -czf test-src.tar.gz pytorch-src
+          ls -la test-src.tar.gz
+
+      - name: Upload PyTorch wheel
+        uses: actions/upload-artifact@v4
+        with:
+          name: torch-wheel-main
+          path: pytorch-src/dist/*.whl
+          retention-days: 7
+
+      - name: Upload torch_npu wheel
+        uses: actions/upload-artifact@v4
+        with:
+          name: torch-npu-wheel-main
+          path: torch_npu-src/dist/*.whl
+          retention-days: 7
+
+      - name: Upload test source
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-src-main
+          path: test-src.tar.gz
+          retention-days: 7
\ No newline at end of file

From 3c18e4f702d5314b3c7fe83271f04b8596dc533b Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 09:50:23 +0800
Subject: [PATCH 033/132] Fix cache key ordering and add proper restore-keys
 for fallback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Reorder cache key: {low_frequency}-{high_frequency} format
  - PyTorch: v1-pytorch-{requirements_hash}-{SHA}
  - torch_npu: v1-torch_npu-{torch_npu_SHA}-{pytorch_SHA}

- Add proper restore-keys for graceful fallback:
  - PyTorch: {requirements_hash}- → empty (pip cache reuse)
  - torch_npu: {torch_npu_SHA}- → empty (build cache reuse)

- Fix wheel reuse logic: only reuse wheel on complete cache match
  - Complete match: same SHA → reuse wheel
  - Partial match: different SHA → rebuild wheel but reuse build/pip cache

- Clean up old wheels from partial cache matches to avoid version confusion

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml | 65 ++++++++++++++++++++------
 1 file changed, 50 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index 40742fed43..d7c15e34d4 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -76,12 +76,15 @@ jobs:
       - name: Get PyTorch build cache key
         id: pytorch_cache_key
         run: |
-          # 缓存键基于：PyTorch SHA + requirements-build.txt 内容 + 缓存版本
+          # 缓存键顺序：低频变化在前，高频变化在后
+          # requirements-build.txt 变化频率 < PyTorch commit SHA
           PYTORCH_SHA="${{ steps.clone_pytorch.outputs.pytorch_sha }}"
           REQUIREMENTS_HASH=$(cd pytorch-src && sha256sum requirements-build.txt | cut -d' ' -f1)
-          echo "cache_key=${{ env.CACHE_VERSION }}-pytorch-${PYTORCH_SHA}-${REQUIREMENTS_HASH}" >> $GITHUB_OUTPUT
+          echo "cache_key=${{ env.CACHE_VERSION }}-pytorch-${REQUIREMENTS_HASH}-${PYTORCH_SHA}" >> $GITHUB_OUTPUT
+          echo "partial_key=${{ env.CACHE_VERSION }}-pytorch-${REQUIREMENTS_HASH}-" >> $GITHUB_OUTPUT
 
       - name: Restore PyTorch build cache
+        id: restore_pytorch_cache
         uses: actions/cache/restore@v4
         with:
           path: |
@@ -90,18 +93,33 @@ jobs:
             ~/.cache/pip
           key: ${{ steps.pytorch_cache_key.outputs.cache_key }}
           restore-keys: |
+            ${{ steps.pytorch_cache_key.outputs.partial_key }}
             ${{ env.CACHE_VERSION }}-pytorch-
 
-      - name: Check if PyTorch wheel exists
+      - name: Check if PyTorch wheel exists (complete match only)
         id: check_pytorch_wheel
         run: |
-          if ls pytorch-src/dist/*.whl 1>/dev/null 2>&1; then
-            echo "wheel_exists=true" >> $GITHUB_OUTPUT
-            echo "Found cached PyTorch wheel:"
-            ls -la pytorch-src/dist/
+          # 只有完整匹配缓存（key 完全匹配）才能复用 wheel
+          # 降级匹配的缓存可能来自不同 commit，不能复用 wheel
+          CACHE_KEY="${{ steps.pytorch_cache_key.outputs.cache_key }}"
+          RESTORED_KEY="${{ steps.restore_pytorch_cache.outputs.cache-hit == 'true' && steps.restore_pytorch_cache.outputs.cache-matched-key || '' }}"
+
+          if [[ "$CACHE_KEY" == "$RESTORED_KEY" ]]; then
+            # 完整匹配，检查 wheel 是否存在
+            if ls pytorch-src/dist/*.whl 1>/dev/null 2>&1; then
+              echo "wheel_exists=true" >> $GITHUB_OUTPUT
+              echo "Found cached PyTorch wheel (complete match):"
+              ls -la pytorch-src/dist/
+            else
+              echo "wheel_exists=false" >> $GITHUB_OUTPUT
+              echo "Complete match but no wheel found, will rebuild"
+            fi
           else
+            # 降级匹配或未命中缓存，不能复用 wheel，需要重新构建
             echo "wheel_exists=false" >> $GITHUB_OUTPUT
-            echo "No cached wheel found, will build PyTorch"
+            echo "Cache hit type: ${RESTORED_KEY:-'miss'} (not complete match, will rebuild)"
+            # 清理可能存在的旧 wheel，避免版本混淆
+            rm -rf pytorch-src/dist/*.whl 2>/dev/null || true
           fi
 
       - name: Build PyTorch wheel
@@ -143,12 +161,15 @@ jobs:
       - name: Get torch_npu build cache key
         id: torch_npu_cache_key
         run: |
-          # 缓存键基于：torch_npu 仓库 SHA + PyTorch SHA + 缓存版本
+          # 缓存键顺序：低频变化在前，高频变化在后
+          # torch_npu SHA 变化频率 < PyTorch SHA 变化频率
           TORCH_NPU_SHA=$(cd torch_npu-src && git rev-parse HEAD)
           PYTORCH_SHA="${{ steps.clone_pytorch.outputs.pytorch_sha }}"
           echo "cache_key=${{ env.CACHE_VERSION }}-torch_npu-${TORCH_NPU_SHA}-${PYTORCH_SHA}" >> $GITHUB_OUTPUT
+          echo "partial_key=${{ env.CACHE_VERSION }}-torch_npu-${TORCH_NPU_SHA}-" >> $GITHUB_OUTPUT
 
       - name: Restore torch_npu build cache
+        id: restore_torch_npu_cache
         uses: actions/cache/restore@v4
         with:
           path: |
@@ -156,18 +177,32 @@ jobs:
             torch_npu-src/dist
           key: ${{ steps.torch_npu_cache_key.outputs.cache_key }}
           restore-keys: |
+            ${{ steps.torch_npu_cache_key.outputs.partial_key }}
             ${{ env.CACHE_VERSION }}-torch_npu-
 
-      - name: Check if torch_npu wheel exists
+      - name: Check if torch_npu wheel exists (complete match only)
         id: check_torch_npu_wheel
         run: |
-          if ls torch_npu-src/dist/*.whl 1>/dev/null 2>&1; then
-            echo "wheel_exists=true" >> $GITHUB_OUTPUT
-            echo "Found cached torch_npu wheel:"
-            ls -la torch_npu-src/dist/
+          # 只有完整匹配缓存才能复用 wheel（依赖的 PyTorch 版本必须相同）
+          CACHE_KEY="${{ steps.torch_npu_cache_key.outputs.cache_key }}"
+          RESTORED_KEY="${{ steps.restore_torch_npu_cache.outputs.cache-hit == 'true' && steps.restore_torch_npu_cache.outputs.cache-matched-key || '' }}"
+
+          if [[ "$CACHE_KEY" == "$RESTORED_KEY" ]]; then
+            # 完整匹配，检查 wheel 是否存在
+            if ls torch_npu-src/dist/*.whl 1>/dev/null 2>&1; then
+              echo "wheel_exists=true" >> $GITHUB_OUTPUT
+              echo "Found cached torch_npu wheel (complete match):"
+              ls -la torch_npu-src/dist/
+            else
+              echo "wheel_exists=false" >> $GITHUB_OUTPUT
+              echo "Complete match but no wheel found, will rebuild"
+            fi
           else
+            # 降级匹配或未命中缓存，不能复用 wheel
             echo "wheel_exists=false" >> $GITHUB_OUTPUT
-            echo "No cached wheel found, will build torch_npu"
+            echo "Cache hit type: ${RESTORED_KEY:-'miss'} (not complete match, will rebuild)"
+            # 清理可能存在的旧 wheel
+            rm -rf torch_npu-src/dist/*.whl 2>/dev/null || true
           fi
 
       - name: Build torch_npu wheel

From e9e6d98b35fa3ae7b3084358a5576835c65cbe29 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 09:54:47 +0800
Subject: [PATCH 034/132] Fix cache save condition and optimize tar packaging

- Change cache save condition from `always()` to `success()`
  - Prevents saving corrupted build artifacts when build fails
  - Corrupted cache would cause subsequent builds to fail repeatedly

- Optimize tar packaging by excluding build/dist directories
  - Exclude pytorch-src/build (build artifacts, tests don't need)
  - Exclude pytorch-src/dist (wheel already uploaded as artifact)
  - Significantly reduces artifact size (build dir can be several GB)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index d7c15e34d4..d7be9cce52 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -148,7 +148,8 @@ jobs:
           ls -la dist/
 
       - name: Save PyTorch build cache
-        if: steps.check_pytorch_wheel.outputs.wheel_exists != 'true' && always()
+        # 只在构建成功时保存缓存（避免保存损坏的中间文件）
+        if: steps.check_pytorch_wheel.outputs.wheel_exists != 'true' && success()
         uses: actions/cache/save@v4
         with:
           path: |
@@ -227,7 +228,8 @@ jobs:
           ls -la dist/
 
       - name: Save torch_npu build cache
-        if: steps.check_torch_npu_wheel.outputs.wheel_exists != 'true' && always()
+        # 只在构建成功时保存缓存（避免保存损坏的中间文件）
+        if: steps.check_torch_npu_wheel.outputs.wheel_exists != 'true' && success()
         uses: actions/cache/save@v4
         with:
           path: |
@@ -238,7 +240,12 @@ jobs:
       # ==================== 打包和上传 ====================
       - name: Package test source
         run: |
-          tar -czf test-src.tar.gz pytorch-src
+          # 排除 build 目录（编译产物，测试不需要）
+          # 排除 dist 目录（wheel 包已单独上传）
+          tar -czf test-src.tar.gz \
+            --exclude='pytorch-src/build' \
+            --exclude='pytorch-src/dist' \
+            pytorch-src
           ls -la test-src.tar.gz
 
       - name: Upload PyTorch wheel

From a84335f854f5de24d6dbd3c252ed6d8268221feb Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 10:32:05 +0800
Subject: [PATCH 035/132] Add cache verification steps for debugging and
 monitoring

Added detailed logging for cache status at key points:

- Display cache restore status (cache-hit, matched-key, directory sizes)
- Display pip cache info after restore and after install
- Display cache save status with saved directory sizes

This helps verify:
- Whether cache was successfully restored (complete/partial match)
- pip cache effectiveness (packages loaded from cache vs downloaded)
- Build cache size saved (for future optimization)
- Cache key matching logic working correctly

Example output:
  === GitHub Actions Cache Restore Status ===
  Cache hit: true
  Matched key: v1-pytorch-abc123-def456
  pip cache size: 256M

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml | 87 ++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index d7be9cce52..8ada1eb471 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -96,6 +96,36 @@ jobs:
             ${{ steps.pytorch_cache_key.outputs.partial_key }}
             ${{ env.CACHE_VERSION }}-pytorch-
 
+      - name: Display cache restore status
+        run: |
+          echo "=== GitHub Actions Cache Restore Status ==="
+          echo "Cache hit: ${{ steps.restore_pytorch_cache.outputs.cache-hit }}"
+          echo "Matched key: ${{ steps.restore_pytorch_cache.outputs.cache-matched-key }}"
+          echo "Requested key: ${{ steps.pytorch_cache_key.outputs.cache_key }}"
+          echo ""
+          echo "=== Restored Directory Sizes ==="
+          if [ -d "pytorch-src/build" ]; then
+            BUILD_SIZE=$(du -sh pytorch-src/build 2>/dev/null | cut -f1)
+            echo "pytorch-src/build: ${BUILD_SIZE}"
+          else
+            echo "pytorch-src/build: not found"
+          fi
+          if [ -d "pytorch-src/dist" ]; then
+            DIST_SIZE=$(du -sh pytorch-src/dist 2>/dev/null | cut -f1)
+            echo "pytorch-src/dist: ${DIST_SIZE}"
+          else
+            echo "pytorch-src/dist: not found"
+          fi
+          echo ""
+          echo "=== pip Cache Status ==="
+          pip${{ inputs.python_version }} cache info 2>/dev/null || echo "pip cache info not available"
+          if [ -d ~/.cache/pip ]; then
+            PIP_SIZE=$(du -sh ~/.cache/pip 2>/dev/null | cut -f1)
+            echo "pip cache size: ${PIP_SIZE}"
+          else
+            echo "pip cache directory: not found"
+          fi
+
       - name: Check if PyTorch wheel exists (complete match only)
         id: check_pytorch_wheel
         run: |
@@ -133,6 +163,13 @@ jobs:
           pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
           pip${{ inputs.python_version }} install -r requirements-build.txt
 
+          # 显示 pip 安装后的缓存状态
+          echo ""
+          echo "=== pip Cache After Install ==="
+          pip${{ inputs.python_version }} cache info 2>/dev/null || echo "pip cache info not available"
+          PIP_SIZE=$(du -sh ~/.cache/pip 2>/dev/null | cut -f1)
+          echo "pip cache size: ${PIP_SIZE}"
+
           # 设置构建环境变量
           export MAX_JOBS=40
           export USE_CUDA=0
@@ -158,6 +195,23 @@ jobs:
             ~/.cache/pip
           key: ${{ steps.pytorch_cache_key.outputs.cache_key }}
 
+      - name: Display PyTorch cache save status
+        if: steps.check_pytorch_wheel.outputs.wheel_exists != 'true' && success()
+        run: |
+          echo "=== PyTorch Cache Saved ==="
+          echo "Cache key: ${{ steps.pytorch_cache_key.outputs.cache_key }}"
+          echo ""
+          echo "=== Saved Directory Sizes ==="
+          BUILD_SIZE=$(du -sh pytorch-src/build 2>/dev/null | cut -f1)
+          echo "pytorch-src/build: ${BUILD_SIZE}"
+          DIST_SIZE=$(du -sh pytorch-src/dist 2>/dev/null | cut -f1)
+          echo "pytorch-src/dist: ${DIST_SIZE}"
+          PIP_SIZE=$(du -sh ~/.cache/pip 2>/dev/null | cut -f1)
+          echo "pip cache: ${PIP_SIZE}"
+          echo ""
+          echo "=== pip Cache Info ==="
+          pip${{ inputs.python_version }} cache info 2>/dev/null || echo "pip cache info not available"
+
       # ==================== torch_npu 构建缓存 ====================
       - name: Get torch_npu build cache key
         id: torch_npu_cache_key
@@ -181,6 +235,27 @@ jobs:
             ${{ steps.torch_npu_cache_key.outputs.partial_key }}
             ${{ env.CACHE_VERSION }}-torch_npu-
 
+      - name: Display torch_npu cache restore status
+        run: |
+          echo "=== torch_npu Cache Restore Status ==="
+          echo "Cache hit: ${{ steps.restore_torch_npu_cache.outputs.cache-hit }}"
+          echo "Matched key: ${{ steps.restore_torch_npu_cache.outputs.cache-matched-key }}"
+          echo "Requested key: ${{ steps.torch_npu_cache_key.outputs.cache_key }}"
+          echo ""
+          echo "=== Restored Directory Sizes ==="
+          if [ -d "torch_npu-src/build" ]; then
+            BUILD_SIZE=$(du -sh torch_npu-src/build 2>/dev/null | cut -f1)
+            echo "torch_npu-src/build: ${BUILD_SIZE}"
+          else
+            echo "torch_npu-src/build: not found"
+          fi
+          if [ -d "torch_npu-src/dist" ]; then
+            DIST_SIZE=$(du -sh torch_npu-src/dist 2>/dev/null | cut -f1)
+            echo "torch_npu-src/dist: ${DIST_SIZE}"
+          else
+            echo "torch_npu-src/dist: not found"
+          fi
+
       - name: Check if torch_npu wheel exists (complete match only)
         id: check_torch_npu_wheel
         run: |
@@ -237,6 +312,18 @@ jobs:
             torch_npu-src/dist
           key: ${{ steps.torch_npu_cache_key.outputs.cache_key }}
 
+      - name: Display torch_npu cache save status
+        if: steps.check_torch_npu_wheel.outputs.wheel_exists != 'true' && success()
+        run: |
+          echo "=== torch_npu Cache Saved ==="
+          echo "Cache key: ${{ steps.torch_npu_cache_key.outputs.cache_key }}"
+          echo ""
+          echo "=== Saved Directory Sizes ==="
+          BUILD_SIZE=$(du -sh torch_npu-src/build 2>/dev/null | cut -f1)
+          echo "torch_npu-src/build: ${BUILD_SIZE}"
+          DIST_SIZE=$(du -sh torch_npu-src/dist 2>/dev/null | cut -f1)
+          echo "torch_npu-src/dist: ${DIST_SIZE}"
+
       # ==================== 打包和上传 ====================
       - name: Package test source
         run: |

From 50a63c0faad2b1e92fa0c3329f7a2b3f794b7a68 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 10:36:09 +0800
Subject: [PATCH 036/132] Restore _build.yml to upstream Ascend/pytorch master
 version

Revert all modifications to _build.yml to keep it consistent with
upstream Ascend/pytorch repository.

All build logic changes have been moved to the new file:
- .github/workflows/_build_torch_npu.yml (with cache support)

Original _build.yml remains as the upstream template (TODO/simulated version).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build.yml | 186 ++++++++++++++++-------------------
 1 file changed, 83 insertions(+), 103 deletions(-)

diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml
index 0f4febe676..55a0b52dcf 100644
--- a/.github/workflows/_build.yml
+++ b/.github/workflows/_build.yml
@@ -3,124 +3,104 @@ name: Build PyTorch and torch_npu
 on:
   workflow_call:
     inputs:
-      pytorch_branch:
+      upstream_repo:
         required: true
         type: string
-        default: 'main'
-      python_version:
+        description: The upstream repository full name (owner/repo)
+      upstream_sha:
         required: true
         type: string
-        default: '3.11'
-      docker_image_tag:
+        description: The upstream commit SHA to build
+      upstream_fork_repo:
+        required: false
+        type: string
+        default: ''
+        description: The fork repository full name for PR from fork (owner/repo)
+      downstream_repo:
         required: true
         type: string
-        description: 'Docker image tag with timestamp'
-    outputs:
-      docker-image:
-        description: 'Full Docker image URL'
-        value: ${{ jobs.build.outputs.docker-image }}
-      torch-wheel:
-        description: 'PyTorch wheel artifact name'
-        value: 'torch-wheel-main'
-      torch-npu-wheel:
-        description: 'torch_npu wheel artifact name'
-        value: 'torch-npu-wheel-main'
-      test-src:
-        description: 'Test source artifact name'
-        value: 'test-src-main'
+        description: The downstream repository full name (owner/repo)
 
 jobs:
   build:
-    runs-on: linux-aarch64-a3-16
-    timeout-minutes: 240
-    outputs:
-      docker-image: ${{ steps.set_image.outputs.docker-image }}
-
-    container:
-      image: quay.io/kerer/pytorch:${{ inputs.docker_image_tag }}
-      options: --user root
-
+    runs-on: ubuntu-latest
     steps:
-      - name: Set Docker image URL
-        id: set_image
-        run: |
-          DOCKER_IMAGE="quay.io/kerer/pytorch:${{ inputs.docker_image_tag }}"
-          echo "docker-image=${DOCKER_IMAGE}" >> $GITHUB_OUTPUT
-          echo "Using Docker image: ${DOCKER_IMAGE}"
+      # Step 1: Checkout upstream PyTorch PR code
+      # Use fork repo if available (PR from fork), otherwise use upstream repo (push or PR from same repo)
+      - name: Checkout upstream PyTorch PR
+        uses: actions/checkout@v6
+        with:
+          repository: ${{ inputs.upstream_fork_repo != '' && inputs.upstream_fork_repo || inputs.upstream_repo }}
+          ref: ${{ inputs.upstream_sha }}
+          submodules: recursive
+          path: pytorch
 
-      - name: Setup CANN environment
+      # Step 2: Build PyTorch
+      - name: Build PyTorch
         run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-      - name: Clone upstream PyTorch main
+          echo "============================================"
+          echo "Building PyTorch from upstream PR..."
+          if [ -n "${{ inputs.upstream_fork_repo }}" ]; then
+            echo "Fork Repository: ${{ inputs.upstream_fork_repo }}"
+          else
+            echo "Repository: ${{ inputs.upstream_repo }}"
+          fi
+          echo "Commit SHA: ${{ inputs.upstream_sha }}"
+          echo "============================================"
+          # TODO: Add actual PyTorch build commands
+          # Example:
+          # cd pytorch
+          # pip install -r requirements.txt
+          # python setup.py develop
+          echo "[SIMULATED] PyTorch build completed successfully!"
+
+      # Step 3: Install PyTorch wheel
+      - name: Install PyTorch wheel
         run: |
-          git clone --depth=1 --branch ${{ inputs.pytorch_branch }} \
-            https://github.com/pytorch/pytorch.git pytorch-src
-          PYTORCH_SHA=$(cd pytorch-src && git rev-parse HEAD)
-          echo "pytorch_sha=${PYTORCH_SHA}"
-
-      - name: Checkout torch_npu
-        uses: actions/checkout@v4
+          echo "============================================"
+          echo "Installing PyTorch wheel..."
+          echo "============================================"
+          # TODO: Add actual install commands
+          # Example:
+          # pip install pytorch/dist/torch*.whl
+          echo "[SIMULATED] PyTorch wheel installed successfully!"
+
+      # Step 4: Checkout downstream repo (torch_npu) master
+      - name: Checkout downstream repo (torch_npu)
+        uses: actions/checkout@v6
         with:
-          path: torch_npu-src
+          repository: ${{ inputs.downstream_repo }}
+          ref: master
           submodules: recursive
+          path: torch_npu
 
-      - name: Build PyTorch wheel
+      # Step 5: Build torch_npu
+      - name: Build torch_npu
         run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-
-          cd pytorch-src
-          pip${{ inputs.python_version }} install pyyaml setuptools wheel cmake ninja
-
-          export MAX_JOBS=40
-          export USE_CUDA=0
-          export USE_CUDNN=0
-          export CMAKE_BUILD_TYPE=Release
-
-          python${{ inputs.python_version }} setup.py build bdist_wheel
-
-          echo "PyTorch wheel built:"
-          ls -la dist/
-
-      - name: Build torch_npu wheel
+          echo "============================================"
+          echo "Building torch_npu from downstream master..."
+          echo "Repository: ${{ inputs.downstream_repo }}"
+          echo "Branch: master"
+          echo "============================================"
+          # TODO: Add actual torch_npu build commands
+          # Example:
+          # cd torch_npu
+          # pip install -r requirements.txt
+          # bash ci/build.sh
+          echo "[SIMULATED] torch_npu build completed successfully!"
+
+      - name: Summary
         run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-          pip${{ inputs.python_version }} install pytorch-src/dist/*.whl
-
-          cd torch_npu-src
-          pip${{ inputs.python_version }} install pyyaml setuptools wheel cmake ninja
-
-          export MAX_JOBS=40
-          bash ci/build.sh --python=${{ inputs.python_version }}
-
-          echo "torch_npu wheel built:"
-          ls -la dist/
-
-      - name: Package test source
-        run: |
-          tar -czf test-src.tar.gz pytorch-src
-          ls -la test-src.tar.gz
-
-      - name: Upload PyTorch wheel
-        uses: actions/upload-artifact@v4
-        with:
-          name: torch-wheel-main
-          path: pytorch-src/dist/*.whl
-          retention-days: 7
-
-      - name: Upload torch_npu wheel
-        uses: actions/upload-artifact@v4
-        with:
-          name: torch-npu-wheel-main
-          path: torch_npu-src/dist/*.whl
-          retention-days: 7
-
-      - name: Upload test source
-        uses: actions/upload-artifact@v4
-        with:
-          name: test-src-main
-          path: test-src.tar.gz
-          retention-days: 7
\ No newline at end of file
+          echo "============================================"
+          echo "Build Summary"
+          echo "============================================"
+          if [ -n "${{ inputs.upstream_fork_repo }}" ]; then
+            echo "1. PyTorch fork PR: ${{ inputs.upstream_fork_repo }}@${{ inputs.upstream_sha }}"
+          else
+            echo "1. PyTorch upstream: ${{ inputs.upstream_repo }}@${{ inputs.upstream_sha }}"
+          fi
+          echo "2. PyTorch build: SUCCESS (simulated)"
+          echo "3. PyTorch install: SUCCESS (simulated)"
+          echo "4. torch_npu master: ${{ inputs.downstream_repo }}"
+          echo "5. torch_npu build: SUCCESS (simulated)"
+          echo "============================================"
\ No newline at end of file

From c4589733849dcb20ad4a466f95e3eac54303816f Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 10:39:48 +0800
Subject: [PATCH 037/132] Add git clone proxy support for faster repository
 access

- Add GH_PROXY_URL environment variable for proxy configuration
  - Default: https://gh-proxy.test.osinfra.cn
  - Set to empty string to disable proxy

- Configure git URL rewrite for PyTorch clone:
  - PyTorch clone: ${GH_PROXY_URL}/https://github.com/pytorch/pytorch.git

- Configure git URL rewrite for actions/checkout submodules:
  - url."${GH_PROXY_URL}/https://github.com/".insteadOf "https://github.com/"
  - url."${GH_PROXY_URL}/https://gitlab.com/".insteadOf "https://gitlab.com/"
  - This speeds up submodule clone for torch_npu

Speed improvement estimate:
- Direct clone: 2-5 minutes (slow from China network)
- Proxy clone: 10-30 seconds (cached by proxy server)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index 8ada1eb471..9e2c2f26d2 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -32,6 +32,8 @@ on:
 env:
   # 缓存版本号，当需要强制刷新缓存时修改此值
   CACHE_VERSION: 'v1'
+  # GitHub 代理 URL（用于加速 git clone，留空则不使用代理）
+  GH_PROXY_URL: 'https://gh-proxy.test.osinfra.cn'
 
 jobs:
   build:
@@ -60,12 +62,31 @@ jobs:
       - name: Clone upstream PyTorch main
         id: clone_pytorch
         run: |
+          # 使用代理加速 git clone（如果配置了 GH_PROXY_URL）
+          PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
+          if [ -n "${{ env.GH_PROXY_URL }}" ]; then
+            PYTORCH_REPO="${{ env.GH_PROXY_URL }}/${PYTORCH_REPO}"
+            echo "Using proxy: ${PYTORCH_REPO}"
+          fi
+
           git clone --depth=1 --branch ${{ inputs.pytorch_branch }} \
-            https://github.com/pytorch/pytorch.git pytorch-src
+            "${PYTORCH_REPO}" pytorch-src
           PYTORCH_SHA=$(cd pytorch-src && git rev-parse HEAD)
           echo "pytorch_sha=${PYTORCH_SHA}" >> $GITHUB_OUTPUT
           echo "Cloned PyTorch commit: ${PYTORCH_SHA}"
 
+      - name: Configure git proxy for faster clone
+        run: |
+          # 配置 git URL rewrite 来使用代理（加速 submodules clone）
+          if [ -n "${{ env.GH_PROXY_URL }}" ]; then
+            git config --global url."${{ env.GH_PROXY_URL }}/https://github.com/".insteadOf "https://github.com/"
+            git config --global url."${{ env.GH_PROXY_URL }}/https://gitlab.com/".insteadOf "https://gitlab.com/"
+            echo "Git proxy configured:"
+            git config --global --list | grep url
+          else
+            echo "No proxy configured, using direct connection"
+          fi
+
       - name: Checkout torch_npu
         uses: actions/checkout@v4
         with:

From 754ae85ff2609b102865456c21d294409248ffab Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 10:42:13 +0800
Subject: [PATCH 038/132] Fix npu-sync-test.yml to call _build_torch_npu.yml
 instead of _build.yml

- _build.yml was restored to upstream version (requires upstream_repo, upstream_sha)
- _build_torch_npu.yml is the new build workflow with cache support
- Update the workflow call to use the correct file

Fixes workflow validation error:
  Input upstream_repo is required, but not provided while calling
  Input upstream_sha is required, but not provided while calling

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/npu-sync-test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/npu-sync-test.yml b/.github/workflows/npu-sync-test.yml
index f618e43dc7..ec9cc3b67b 100644
--- a/.github/workflows/npu-sync-test.yml
+++ b/.github/workflows/npu-sync-test.yml
@@ -30,7 +30,7 @@ env:
 
 jobs:
   build:
-    uses: ./.github/workflows/_build.yml
+    uses: ./.github/workflows/_build_torch_npu.yml
     with:
       pytorch_branch: ${{ inputs.pytorch_branch || 'main' }}
       python_version: '3.11'

From 3c1480fa2eb01a35041d8318cde232d224718d96 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 10:47:56 +0800
Subject: [PATCH 039/132] Remove wheel caching logic (wheel is rarely reused)

Simplified cache strategy:
- Cache only build directory (CMake incremental build)
- Cache only pip cache (pip install from cache)
- Remove dist directory from cache (wheel almost never reused)

Reason: PyTorch main branch has new commits every day, so:
- Complete cache match (same SHA) rarely happens
- Partial cache match can reuse build/pip cache but not wheel
- Wheel cache wastes ~200MB space with almost no benefit

Changes:
- Remove check_pytorch_wheel and check_torch_npu_wheel steps
- Remove dist from cache path for both PyTorch and torch_npu
- Simplify build condition (no wheel_exists check)
- Simplify cache save condition (just success())
- Remove dist size display from cache status output

Cache benefits after simplification:
- pip install: ~5-10 seconds (vs ~2-5 minutes download)
- CMake build: incremental compile (vs full rebuild)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml | 109 +++++--------------------
 1 file changed, 19 insertions(+), 90 deletions(-)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index 9e2c2f26d2..a181744f39 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -94,6 +94,7 @@ jobs:
           submodules: recursive
 
       # ==================== PyTorch 构建缓存 ====================
+      # 只缓存 build 目录和 pip cache，不缓存 wheel（wheel 几乎不会复用）
       - name: Get PyTorch build cache key
         id: pytorch_cache_key
         run: |
@@ -108,18 +109,19 @@ jobs:
         id: restore_pytorch_cache
         uses: actions/cache/restore@v4
         with:
+          # 只缓存 build 目录（CMake 构建缓存）和 pip cache
+          # 不缓存 dist 目录（wheel 包几乎不会复用，每次 commit 都需要重新构建）
           path: |
             pytorch-src/build
-            pytorch-src/dist
             ~/.cache/pip
           key: ${{ steps.pytorch_cache_key.outputs.cache_key }}
           restore-keys: |
             ${{ steps.pytorch_cache_key.outputs.partial_key }}
             ${{ env.CACHE_VERSION }}-pytorch-
 
-      - name: Display cache restore status
+      - name: Display PyTorch cache restore status
         run: |
-          echo "=== GitHub Actions Cache Restore Status ==="
+          echo "=== PyTorch Cache Restore Status ==="
           echo "Cache hit: ${{ steps.restore_pytorch_cache.outputs.cache-hit }}"
           echo "Matched key: ${{ steps.restore_pytorch_cache.outputs.cache-matched-key }}"
           echo "Requested key: ${{ steps.pytorch_cache_key.outputs.cache_key }}"
@@ -131,12 +133,6 @@ jobs:
           else
             echo "pytorch-src/build: not found"
           fi
-          if [ -d "pytorch-src/dist" ]; then
-            DIST_SIZE=$(du -sh pytorch-src/dist 2>/dev/null | cut -f1)
-            echo "pytorch-src/dist: ${DIST_SIZE}"
-          else
-            echo "pytorch-src/dist: not found"
-          fi
           echo ""
           echo "=== pip Cache Status ==="
           pip${{ inputs.python_version }} cache info 2>/dev/null || echo "pip cache info not available"
@@ -147,34 +143,7 @@ jobs:
             echo "pip cache directory: not found"
           fi
 
-      - name: Check if PyTorch wheel exists (complete match only)
-        id: check_pytorch_wheel
-        run: |
-          # 只有完整匹配缓存（key 完全匹配）才能复用 wheel
-          # 降级匹配的缓存可能来自不同 commit，不能复用 wheel
-          CACHE_KEY="${{ steps.pytorch_cache_key.outputs.cache_key }}"
-          RESTORED_KEY="${{ steps.restore_pytorch_cache.outputs.cache-hit == 'true' && steps.restore_pytorch_cache.outputs.cache-matched-key || '' }}"
-
-          if [[ "$CACHE_KEY" == "$RESTORED_KEY" ]]; then
-            # 完整匹配，检查 wheel 是否存在
-            if ls pytorch-src/dist/*.whl 1>/dev/null 2>&1; then
-              echo "wheel_exists=true" >> $GITHUB_OUTPUT
-              echo "Found cached PyTorch wheel (complete match):"
-              ls -la pytorch-src/dist/
-            else
-              echo "wheel_exists=false" >> $GITHUB_OUTPUT
-              echo "Complete match but no wheel found, will rebuild"
-            fi
-          else
-            # 降级匹配或未命中缓存，不能复用 wheel，需要重新构建
-            echo "wheel_exists=false" >> $GITHUB_OUTPUT
-            echo "Cache hit type: ${RESTORED_KEY:-'miss'} (not complete match, will rebuild)"
-            # 清理可能存在的旧 wheel，避免版本混淆
-            rm -rf pytorch-src/dist/*.whl 2>/dev/null || true
-          fi
-
       - name: Build PyTorch wheel
-        if: steps.check_pytorch_wheel.outputs.wheel_exists != 'true'
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
 
@@ -207,17 +176,16 @@ jobs:
 
       - name: Save PyTorch build cache
         # 只在构建成功时保存缓存（避免保存损坏的中间文件）
-        if: steps.check_pytorch_wheel.outputs.wheel_exists != 'true' && success()
+        if: success()
         uses: actions/cache/save@v4
         with:
           path: |
             pytorch-src/build
-            pytorch-src/dist
             ~/.cache/pip
           key: ${{ steps.pytorch_cache_key.outputs.cache_key }}
 
       - name: Display PyTorch cache save status
-        if: steps.check_pytorch_wheel.outputs.wheel_exists != 'true' && success()
+        if: success()
         run: |
           echo "=== PyTorch Cache Saved ==="
           echo "Cache key: ${{ steps.pytorch_cache_key.outputs.cache_key }}"
@@ -225,15 +193,11 @@ jobs:
           echo "=== Saved Directory Sizes ==="
           BUILD_SIZE=$(du -sh pytorch-src/build 2>/dev/null | cut -f1)
           echo "pytorch-src/build: ${BUILD_SIZE}"
-          DIST_SIZE=$(du -sh pytorch-src/dist 2>/dev/null | cut -f1)
-          echo "pytorch-src/dist: ${DIST_SIZE}"
           PIP_SIZE=$(du -sh ~/.cache/pip 2>/dev/null | cut -f1)
           echo "pip cache: ${PIP_SIZE}"
-          echo ""
-          echo "=== pip Cache Info ==="
-          pip${{ inputs.python_version }} cache info 2>/dev/null || echo "pip cache info not available"
 
       # ==================== torch_npu 构建缓存 ====================
+      # 只缓存 build 目录，不缓存 wheel
       - name: Get torch_npu build cache key
         id: torch_npu_cache_key
         run: |
@@ -248,9 +212,9 @@ jobs:
         id: restore_torch_npu_cache
         uses: actions/cache/restore@v4
         with:
-          path: |
-            torch_npu-src/build
-            torch_npu-src/dist
+          # 只缓存 build 目录（CMake 构建缓存）
+          # 不缓存 dist 目录（wheel 包依赖特定 PyTorch 版本，几乎不会复用）
+          path: torch_npu-src/build
           key: ${{ steps.torch_npu_cache_key.outputs.cache_key }}
           restore-keys: |
             ${{ steps.torch_npu_cache_key.outputs.partial_key }}
@@ -270,40 +234,8 @@ jobs:
           else
             echo "torch_npu-src/build: not found"
           fi
-          if [ -d "torch_npu-src/dist" ]; then
-            DIST_SIZE=$(du -sh torch_npu-src/dist 2>/dev/null | cut -f1)
-            echo "torch_npu-src/dist: ${DIST_SIZE}"
-          else
-            echo "torch_npu-src/dist: not found"
-          fi
-
-      - name: Check if torch_npu wheel exists (complete match only)
-        id: check_torch_npu_wheel
-        run: |
-          # 只有完整匹配缓存才能复用 wheel（依赖的 PyTorch 版本必须相同）
-          CACHE_KEY="${{ steps.torch_npu_cache_key.outputs.cache_key }}"
-          RESTORED_KEY="${{ steps.restore_torch_npu_cache.outputs.cache-hit == 'true' && steps.restore_torch_npu_cache.outputs.cache-matched-key || '' }}"
-
-          if [[ "$CACHE_KEY" == "$RESTORED_KEY" ]]; then
-            # 完整匹配，检查 wheel 是否存在
-            if ls torch_npu-src/dist/*.whl 1>/dev/null 2>&1; then
-              echo "wheel_exists=true" >> $GITHUB_OUTPUT
-              echo "Found cached torch_npu wheel (complete match):"
-              ls -la torch_npu-src/dist/
-            else
-              echo "wheel_exists=false" >> $GITHUB_OUTPUT
-              echo "Complete match but no wheel found, will rebuild"
-            fi
-          else
-            # 降级匹配或未命中缓存，不能复用 wheel
-            echo "wheel_exists=false" >> $GITHUB_OUTPUT
-            echo "Cache hit type: ${RESTORED_KEY:-'miss'} (not complete match, will rebuild)"
-            # 清理可能存在的旧 wheel
-            rm -rf torch_npu-src/dist/*.whl 2>/dev/null || true
-          fi
 
       - name: Build torch_npu wheel
-        if: steps.check_torch_npu_wheel.outputs.wheel_exists != 'true'
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
           source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
@@ -313,7 +245,7 @@ jobs:
 
           cd torch_npu-src
 
-          # 安装构建依赖（与 PyTorch 构建保持一致）
+          # 安装构建依赖
           pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
           pip${{ inputs.python_version }} install cmake ninja numpy packaging pyyaml requests six typing-extensions
 
@@ -324,17 +256,15 @@ jobs:
           ls -la dist/
 
       - name: Save torch_npu build cache
-        # 只在构建成功时保存缓存（避免保存损坏的中间文件）
-        if: steps.check_torch_npu_wheel.outputs.wheel_exists != 'true' && success()
+        # 只在构建成功时保存缓存
+        if: success()
         uses: actions/cache/save@v4
         with:
-          path: |
-            torch_npu-src/build
-            torch_npu-src/dist
+          path: torch_npu-src/build
           key: ${{ steps.torch_npu_cache_key.outputs.cache_key }}
 
       - name: Display torch_npu cache save status
-        if: steps.check_torch_npu_wheel.outputs.wheel_exists != 'true' && success()
+        if: success()
         run: |
           echo "=== torch_npu Cache Saved ==="
           echo "Cache key: ${{ steps.torch_npu_cache_key.outputs.cache_key }}"
@@ -342,17 +272,16 @@ jobs:
           echo "=== Saved Directory Sizes ==="
           BUILD_SIZE=$(du -sh torch_npu-src/build 2>/dev/null | cut -f1)
           echo "torch_npu-src/build: ${BUILD_SIZE}"
-          DIST_SIZE=$(du -sh torch_npu-src/dist 2>/dev/null | cut -f1)
-          echo "torch_npu-src/dist: ${DIST_SIZE}"
 
       # ==================== 打包和上传 ====================
       - name: Package test source
         run: |
-          # 排除 build 目录（编译产物，测试不需要）
-          # 排除 dist 目录（wheel 包已单独上传）
+          # 排除 build 和 dist 目录（编译产物，测试不需要）
           tar -czf test-src.tar.gz \
             --exclude='pytorch-src/build' \
             --exclude='pytorch-src/dist' \
+            --exclude='torch_npu-src/build' \
+            --exclude='torch_npu-src/dist' \
             pytorch-src
           ls -la test-src.tar.gz
 

From ab9272ab34906c43df11439f5623d517ed4f4be5 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 10:54:00 +0800
Subject: [PATCH 040/132] Simplify cache strategy: only pip cache and ccache

Cache strategy simplified to:
1. pip cache (~/.cache/pip) - based on requirements-build.txt hash
2. ccache (~/.cache/ccache) - compiler cache for C/C++ objects

Removed:
- pytorch-src/build cache (CMake build artifacts, replaced by ccache)
- torch_npu-src/build cache (CMake build artifacts, replaced by ccache)

Added:
- ccache installation and configuration (5G max size, compression enabled)
- ccache statistics display before/after each build
- CMAKE_C_COMPILER_LAUNCHER and CMAKE_CXX_COMPILER_LAUNCHER for CMake

Benefits:
- pip install: ~5-10 seconds (vs ~2-5 minutes download)
- ccache: reuses compiled objects (vs full recompile, ~30-60 minutes)
- Smaller cache footprint (ccache only stores object files, not full build)

Cache keys:
- pip: v1-pip-{requirements_hash} (shared across all builds)
- ccache: v1-{requirements_hash}-{pytorch_sha}-{torch_npu_sha}

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml | 203 +++++++++++--------------
 1 file changed, 91 insertions(+), 112 deletions(-)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index a181744f39..8bd820abc5 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -93,148 +93,112 @@ jobs:
           path: torch_npu-src
           submodules: recursive
 
-      # ==================== PyTorch 构建缓存 ====================
-      # 只缓存 build 目录和 pip cache，不缓存 wheel（wheel 几乎不会复用）
-      - name: Get PyTorch build cache key
-        id: pytorch_cache_key
+      # ==================== 缓存配置 ====================
+      - name: Get cache key
+        id: cache_key
         run: |
-          # 缓存键顺序：低频变化在前，高频变化在后
-          # requirements-build.txt 变化频率 < PyTorch commit SHA
+          # 缓存键基于：requirements-build.txt + PyTorch SHA + torch_npu SHA
           PYTORCH_SHA="${{ steps.clone_pytorch.outputs.pytorch_sha }}"
+          TORCH_NPU_SHA=$(cd torch_npu-src && git rev-parse HEAD)
           REQUIREMENTS_HASH=$(cd pytorch-src && sha256sum requirements-build.txt | cut -d' ' -f1)
-          echo "cache_key=${{ env.CACHE_VERSION }}-pytorch-${REQUIREMENTS_HASH}-${PYTORCH_SHA}" >> $GITHUB_OUTPUT
-          echo "partial_key=${{ env.CACHE_VERSION }}-pytorch-${REQUIREMENTS_HASH}-" >> $GITHUB_OUTPUT
+          echo "cache_key=${{ env.CACHE_VERSION }}-${REQUIREMENTS_HASH}-${PYTORCH_SHA}-${TORCH_NPU_SHA}" >> $GITHUB_OUTPUT
+          echo "partial_key=${{ env.CACHE_VERSION }}-${REQUIREMENTS_HASH}-${PYTORCH_SHA}-" >> $GITHUB_OUTPUT
+          echo "pip_key=${{ env.CACHE_VERSION }}-pip-${REQUIREMENTS_HASH}" >> $GITHUB_OUTPUT
 
-      - name: Restore PyTorch build cache
-        id: restore_pytorch_cache
+      - name: Restore pip cache
         uses: actions/cache/restore@v4
         with:
-          # 只缓存 build 目录（CMake 构建缓存）和 pip cache
-          # 不缓存 dist 目录（wheel 包几乎不会复用，每次 commit 都需要重新构建）
-          path: |
-            pytorch-src/build
-            ~/.cache/pip
-          key: ${{ steps.pytorch_cache_key.outputs.cache_key }}
+          path: ~/.cache/pip
+          key: ${{ steps.cache_key.outputs.pip_key }}
           restore-keys: |
-            ${{ steps.pytorch_cache_key.outputs.partial_key }}
-            ${{ env.CACHE_VERSION }}-pytorch-
+            ${{ env.CACHE_VERSION }}-pip-
 
-      - name: Display PyTorch cache restore status
+      - name: Restore ccache
+        uses: actions/cache/restore@v4
+        with:
+          path: ~/.cache/ccache
+          key: ${{ steps.cache_key.outputs.cache_key }}
+          restore-keys: |
+            ${{ steps.cache_key.outputs.partial_key }}
+            ${{ env.CACHE_VERSION }}-
+
+      - name: Setup ccache
         run: |
-          echo "=== PyTorch Cache Restore Status ==="
-          echo "Cache hit: ${{ steps.restore_pytorch_cache.outputs.cache-hit }}"
-          echo "Matched key: ${{ steps.restore_pytorch_cache.outputs.cache-matched-key }}"
-          echo "Requested key: ${{ steps.pytorch_cache_key.outputs.cache_key }}"
+          # 安装 ccache（manylinux 镜像没有预装）
+          yum install -y ccache
+
+          # 配置 ccache
+          ccache --set-config=cache_dir=~/.cache/ccache
+          ccache --set-config=max_size=5G
+          ccache --set-config=compression=true
+
+          # 设置 ccache 环境变量（让 CMake 使用 ccache）
+          export CC="ccache gcc"
+          export CXX="ccache g++"
+          export CMAKE_C_COMPILER_LAUNCHER=ccache
+          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+
+          echo "CCACHE_DIR=~/.cache/ccache" >> $GITHUB_ENV
+          echo "CC=ccache gcc" >> $GITHUB_ENV
+          echo "CXX=ccache g++" >> $GITHUB_ENV
+
+          echo "=== ccache Configuration ==="
+          ccache --show-config
+
           echo ""
-          echo "=== Restored Directory Sizes ==="
-          if [ -d "pytorch-src/build" ]; then
-            BUILD_SIZE=$(du -sh pytorch-src/build 2>/dev/null | cut -f1)
-            echo "pytorch-src/build: ${BUILD_SIZE}"
-          else
-            echo "pytorch-src/build: not found"
-          fi
+          echo "=== ccache Statistics (before build) ==="
+          ccache --show-stats
+
+      - name: Display cache restore status
+        run: |
+          echo "=== Cache Restore Status ==="
           echo ""
-          echo "=== pip Cache Status ==="
+          echo "=== pip Cache ==="
           pip${{ inputs.python_version }} cache info 2>/dev/null || echo "pip cache info not available"
           if [ -d ~/.cache/pip ]; then
             PIP_SIZE=$(du -sh ~/.cache/pip 2>/dev/null | cut -f1)
             echo "pip cache size: ${PIP_SIZE}"
           else
-            echo "pip cache directory: not found"
+            echo "pip cache: not found"
           fi
+          echo ""
+          echo "=== ccache ==="
+          ccache --show-stats || echo "ccache not available"
 
+      # ==================== 构建 PyTorch ====================
       - name: Build PyTorch wheel
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
 
           cd pytorch-src
 
-          # 安装构建依赖（使用上游 PyTorch requirements-build.txt）
+          # 安装构建依赖
           pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
           pip${{ inputs.python_version }} install -r requirements-build.txt
 
-          # 显示 pip 安装后的缓存状态
-          echo ""
-          echo "=== pip Cache After Install ==="
-          pip${{ inputs.python_version }} cache info 2>/dev/null || echo "pip cache info not available"
-          PIP_SIZE=$(du -sh ~/.cache/pip 2>/dev/null | cut -f1)
-          echo "pip cache size: ${PIP_SIZE}"
-
           # 设置构建环境变量
           export MAX_JOBS=40
           export USE_CUDA=0
           export USE_CUDNN=0
           export USE_DISTRIBUTED=1  # 启用分布式支持（torch_npu 必需）
           export CMAKE_BUILD_TYPE=Release
-          export USE_OPENMP=1       # 启用 OpenMP（torch_npu CMakeLists.txt 默认使用）
-          export USE_MKLDNN=0       # 禁用 MKLDNN（NPU 不需要 Intel CPU 库）
+          export USE_OPENMP=1       # 启用 OpenMP
+          export USE_MKLDNN=0       # 禁用 MKLDNN
+
+          # ccache 配置（已在上一步设置）
+          export CMAKE_C_COMPILER_LAUNCHER=ccache
+          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
 
           python${{ inputs.python_version }} setup.py build bdist_wheel
 
           echo "PyTorch wheel built:"
           ls -la dist/
 
-      - name: Save PyTorch build cache
-        # 只在构建成功时保存缓存（避免保存损坏的中间文件）
-        if: success()
-        uses: actions/cache/save@v4
-        with:
-          path: |
-            pytorch-src/build
-            ~/.cache/pip
-          key: ${{ steps.pytorch_cache_key.outputs.cache_key }}
-
-      - name: Display PyTorch cache save status
-        if: success()
-        run: |
-          echo "=== PyTorch Cache Saved ==="
-          echo "Cache key: ${{ steps.pytorch_cache_key.outputs.cache_key }}"
           echo ""
-          echo "=== Saved Directory Sizes ==="
-          BUILD_SIZE=$(du -sh pytorch-src/build 2>/dev/null | cut -f1)
-          echo "pytorch-src/build: ${BUILD_SIZE}"
-          PIP_SIZE=$(du -sh ~/.cache/pip 2>/dev/null | cut -f1)
-          echo "pip cache: ${PIP_SIZE}"
-
-      # ==================== torch_npu 构建缓存 ====================
-      # 只缓存 build 目录，不缓存 wheel
-      - name: Get torch_npu build cache key
-        id: torch_npu_cache_key
-        run: |
-          # 缓存键顺序：低频变化在前，高频变化在后
-          # torch_npu SHA 变化频率 < PyTorch SHA 变化频率
-          TORCH_NPU_SHA=$(cd torch_npu-src && git rev-parse HEAD)
-          PYTORCH_SHA="${{ steps.clone_pytorch.outputs.pytorch_sha }}"
-          echo "cache_key=${{ env.CACHE_VERSION }}-torch_npu-${TORCH_NPU_SHA}-${PYTORCH_SHA}" >> $GITHUB_OUTPUT
-          echo "partial_key=${{ env.CACHE_VERSION }}-torch_npu-${TORCH_NPU_SHA}-" >> $GITHUB_OUTPUT
-
-      - name: Restore torch_npu build cache
-        id: restore_torch_npu_cache
-        uses: actions/cache/restore@v4
-        with:
-          # 只缓存 build 目录（CMake 构建缓存）
-          # 不缓存 dist 目录（wheel 包依赖特定 PyTorch 版本，几乎不会复用）
-          path: torch_npu-src/build
-          key: ${{ steps.torch_npu_cache_key.outputs.cache_key }}
-          restore-keys: |
-            ${{ steps.torch_npu_cache_key.outputs.partial_key }}
-            ${{ env.CACHE_VERSION }}-torch_npu-
-
-      - name: Display torch_npu cache restore status
-        run: |
-          echo "=== torch_npu Cache Restore Status ==="
-          echo "Cache hit: ${{ steps.restore_torch_npu_cache.outputs.cache-hit }}"
-          echo "Matched key: ${{ steps.restore_torch_npu_cache.outputs.cache-matched-key }}"
-          echo "Requested key: ${{ steps.torch_npu_cache_key.outputs.cache_key }}"
-          echo ""
-          echo "=== Restored Directory Sizes ==="
-          if [ -d "torch_npu-src/build" ]; then
-            BUILD_SIZE=$(du -sh torch_npu-src/build 2>/dev/null | cut -f1)
-            echo "torch_npu-src/build: ${BUILD_SIZE}"
-          else
-            echo "torch_npu-src/build: not found"
-          fi
+          echo "=== ccache Statistics (after PyTorch build) ==="
+          ccache --show-stats
 
+      # ==================== 构建 torch_npu ====================
       - name: Build torch_npu wheel
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
@@ -250,33 +214,48 @@ jobs:
           pip${{ inputs.python_version }} install cmake ninja numpy packaging pyyaml requests six typing-extensions
 
           export MAX_JOBS=40
+          export CMAKE_C_COMPILER_LAUNCHER=ccache
+          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+
           bash ci/build.sh --python=${{ inputs.python_version }}
 
           echo "torch_npu wheel built:"
           ls -la dist/
 
-      - name: Save torch_npu build cache
-        # 只在构建成功时保存缓存
+          echo ""
+          echo "=== ccache Statistics (after torch_npu build) ==="
+          ccache --show-stats
+
+      # ==================== 保存缓存 ====================
+      - name: Save pip cache
         if: success()
         uses: actions/cache/save@v4
         with:
-          path: torch_npu-src/build
-          key: ${{ steps.torch_npu_cache_key.outputs.cache_key }}
+          path: ~/.cache/pip
+          key: ${{ steps.cache_key.outputs.pip_key }}
 
-      - name: Display torch_npu cache save status
+      - name: Save ccache
+        if: success()
+        uses: actions/cache/save@v4
+        with:
+          path: ~/.cache/ccache
+          key: ${{ steps.cache_key.outputs.cache_key }}
+
+      - name: Display cache save status
         if: success()
         run: |
-          echo "=== torch_npu Cache Saved ==="
-          echo "Cache key: ${{ steps.torch_npu_cache_key.outputs.cache_key }}"
+          echo "=== Cache Saved ==="
+          echo "pip key: ${{ steps.cache_key.outputs.pip_key }}"
+          echo "ccache key: ${{ steps.cache_key.outputs.cache_key }}"
           echo ""
-          echo "=== Saved Directory Sizes ==="
-          BUILD_SIZE=$(du -sh torch_npu-src/build 2>/dev/null | cut -f1)
-          echo "torch_npu-src/build: ${BUILD_SIZE}"
+          PIP_SIZE=$(du -sh ~/.cache/pip 2>/dev/null | cut -f1)
+          CCACHE_SIZE=$(du -sh ~/.cache/ccache 2>/dev/null | cut -f1)
+          echo "pip cache: ${PIP_SIZE}"
+          echo "ccache: ${CCACHE_SIZE}"
 
       # ==================== 打包和上传 ====================
       - name: Package test source
         run: |
-          # 排除 build 和 dist 目录（编译产物，测试不需要）
           tar -czf test-src.tar.gz \
             --exclude='pytorch-src/build' \
             --exclude='pytorch-src/dist' \

From 785bae09e23b0f366b005abd8c17ddf6a978d4cd Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 11:04:37 +0800
Subject: [PATCH 041/132] Disable torchair build due to PyTorch API
 compatibility issue

torchair compilation fails with:
  error: 'six' has not been declared
  auto tuple = six::isTuple(obj);

Cause: Upstream PyTorch main branch changed internal C++ API
- six::isTuple() is no longer available in new PyTorch headers
- torchair depends on old PyTorch internal API

Solution: Temporarily disable torchair build with --disable_torchair
- Wait for torch_npu/torchair to adapt to new PyTorch API
- Core torch_npu functionality still works without torchair

Impact: torchair is an optional component for dynamo optimization,
       core NPU operations are unaffected.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index 8bd820abc5..2dd9d08bf8 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -217,7 +217,10 @@ jobs:
           export CMAKE_C_COMPILER_LAUNCHER=ccache
           export CMAKE_CXX_COMPILER_LAUNCHER=ccache
 
-          bash ci/build.sh --python=${{ inputs.python_version }}
+          # 禁用 torchair 构建（上游 PyTorch main API 变化导致兼容性问题）
+          # torchair 依赖 PyTorch 内部的 six::isTuple() API，该 API 在新版已移除
+          # 临时禁用，等待 torch_npu/torchair 适配新版 PyTorch API
+          bash ci/build.sh --python=${{ inputs.python_version }} --disable_torchair
 
           echo "torch_npu wheel built:"
           ls -la dist/

From 9070d5012defc5e4ad5f887bf13e8086a8f91714 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 11:07:40 +0800
Subject: [PATCH 042/132] Remove pip cache, keep only ccache (fix numpy not
 found error)

Problem: pip cache restore caused "No matching distribution found for numpy"
- pip cache may have corrupted state or incorrect metadata
- Restored cache interferes with pip's package discovery

Solution: Remove pip cache, use only ccache
- pip download is fast anyway (few seconds)
- ccache is the real speedup (saves 30-60 minutes compile time)

Changes:
- Remove pip cache restore/save steps
- Update CACHE_VERSION to v2 (force fresh start)
- Keep only ccache for compiler object caching

Cache strategy now:
- ccache only: ~/.cache/ccache (5G max, compression enabled)
- Based on: requirements-build.txt + pytorch_sha + torch_npu_sha

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml | 90 +++++++-------------------
 1 file changed, 24 insertions(+), 66 deletions(-)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index 2dd9d08bf8..4fd68fa0bc 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -31,7 +31,7 @@ on:
 
 env:
   # 缓存版本号，当需要强制刷新缓存时修改此值
-  CACHE_VERSION: 'v1'
+  CACHE_VERSION: 'v2'
   # GitHub 代理 URL（用于加速 git clone，留空则不使用代理）
   GH_PROXY_URL: 'https://gh-proxy.test.osinfra.cn'
 
@@ -93,34 +93,28 @@ jobs:
           path: torch_npu-src
           submodules: recursive
 
-      # ==================== 缓存配置 ====================
-      - name: Get cache key
-        id: cache_key
+      # ==================== ccache 缓存配置 ====================
+      # 只使用 ccache 缓存，不使用 pip 缓存
+      # pip 缓存可能导致依赖查找问题，且 pip 下载已经很快（几秒）
+      # ccache 是真正加速编译的关键（可节省 30-60 分钟）
+      - name: Get ccache key
+        id: ccache_key
         run: |
-          # 缓存键基于：requirements-build.txt + PyTorch SHA + torch_npu SHA
+          # ccache 缓存键基于：requirements-build.txt + PyTorch SHA + torch_npu SHA
           PYTORCH_SHA="${{ steps.clone_pytorch.outputs.pytorch_sha }}"
           TORCH_NPU_SHA=$(cd torch_npu-src && git rev-parse HEAD)
           REQUIREMENTS_HASH=$(cd pytorch-src && sha256sum requirements-build.txt | cut -d' ' -f1)
-          echo "cache_key=${{ env.CACHE_VERSION }}-${REQUIREMENTS_HASH}-${PYTORCH_SHA}-${TORCH_NPU_SHA}" >> $GITHUB_OUTPUT
-          echo "partial_key=${{ env.CACHE_VERSION }}-${REQUIREMENTS_HASH}-${PYTORCH_SHA}-" >> $GITHUB_OUTPUT
-          echo "pip_key=${{ env.CACHE_VERSION }}-pip-${REQUIREMENTS_HASH}" >> $GITHUB_OUTPUT
-
-      - name: Restore pip cache
-        uses: actions/cache/restore@v4
-        with:
-          path: ~/.cache/pip
-          key: ${{ steps.cache_key.outputs.pip_key }}
-          restore-keys: |
-            ${{ env.CACHE_VERSION }}-pip-
+          echo "cache_key=${{ env.CACHE_VERSION }}-ccache-${REQUIREMENTS_HASH}-${PYTORCH_SHA}-${TORCH_NPU_SHA}" >> $GITHUB_OUTPUT
+          echo "partial_key=${{ env.CACHE_VERSION }}-ccache-${REQUIREMENTS_HASH}-${PYTORCH_SHA}-" >> $GITHUB_OUTPUT
 
       - name: Restore ccache
         uses: actions/cache/restore@v4
         with:
           path: ~/.cache/ccache
-          key: ${{ steps.cache_key.outputs.cache_key }}
+          key: ${{ steps.ccache_key.outputs.cache_key }}
           restore-keys: |
-            ${{ steps.cache_key.outputs.partial_key }}
-            ${{ env.CACHE_VERSION }}-
+            ${{ steps.ccache_key.outputs.partial_key }}
+            ${{ env.CACHE_VERSION }}-ccache-
 
       - name: Setup ccache
         run: |
@@ -132,12 +126,7 @@ jobs:
           ccache --set-config=max_size=5G
           ccache --set-config=compression=true
 
-          # 设置 ccache 环境变量（让 CMake 使用 ccache）
-          export CC="ccache gcc"
-          export CXX="ccache g++"
-          export CMAKE_C_COMPILER_LAUNCHER=ccache
-          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
-
+          # 设置 ccache 环境变量
           echo "CCACHE_DIR=~/.cache/ccache" >> $GITHUB_ENV
           echo "CC=ccache gcc" >> $GITHUB_ENV
           echo "CXX=ccache g++" >> $GITHUB_ENV
@@ -149,22 +138,6 @@ jobs:
           echo "=== ccache Statistics (before build) ==="
           ccache --show-stats
 
-      - name: Display cache restore status
-        run: |
-          echo "=== Cache Restore Status ==="
-          echo ""
-          echo "=== pip Cache ==="
-          pip${{ inputs.python_version }} cache info 2>/dev/null || echo "pip cache info not available"
-          if [ -d ~/.cache/pip ]; then
-            PIP_SIZE=$(du -sh ~/.cache/pip 2>/dev/null | cut -f1)
-            echo "pip cache size: ${PIP_SIZE}"
-          else
-            echo "pip cache: not found"
-          fi
-          echo ""
-          echo "=== ccache ==="
-          ccache --show-stats || echo "ccache not available"
-
       # ==================== 构建 PyTorch ====================
       - name: Build PyTorch wheel
         run: |
@@ -172,7 +145,7 @@ jobs:
 
           cd pytorch-src
 
-          # 安装构建依赖
+          # 安装构建依赖（不使用 pip 缓存，避免依赖查找问题）
           pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
           pip${{ inputs.python_version }} install -r requirements-build.txt
 
@@ -180,12 +153,10 @@ jobs:
           export MAX_JOBS=40
           export USE_CUDA=0
           export USE_CUDNN=0
-          export USE_DISTRIBUTED=1  # 启用分布式支持（torch_npu 必需）
+          export USE_DISTRIBUTED=1
           export CMAKE_BUILD_TYPE=Release
-          export USE_OPENMP=1       # 启用 OpenMP
-          export USE_MKLDNN=0       # 禁用 MKLDNN
-
-          # ccache 配置（已在上一步设置）
+          export USE_OPENMP=1
+          export USE_MKLDNN=0
           export CMAKE_C_COMPILER_LAUNCHER=ccache
           export CMAKE_CXX_COMPILER_LAUNCHER=ccache
 
@@ -218,8 +189,6 @@ jobs:
           export CMAKE_CXX_COMPILER_LAUNCHER=ccache
 
           # 禁用 torchair 构建（上游 PyTorch main API 变化导致兼容性问题）
-          # torchair 依赖 PyTorch 内部的 six::isTuple() API，该 API 在新版已移除
-          # 临时禁用，等待 torch_npu/torchair 适配新版 PyTorch API
           bash ci/build.sh --python=${{ inputs.python_version }} --disable_torchair
 
           echo "torch_npu wheel built:"
@@ -229,32 +198,21 @@ jobs:
           echo "=== ccache Statistics (after torch_npu build) ==="
           ccache --show-stats
 
-      # ==================== 保存缓存 ====================
-      - name: Save pip cache
-        if: success()
-        uses: actions/cache/save@v4
-        with:
-          path: ~/.cache/pip
-          key: ${{ steps.cache_key.outputs.pip_key }}
-
+      # ==================== 保存 ccache 缓存 ====================
       - name: Save ccache
         if: success()
         uses: actions/cache/save@v4
         with:
           path: ~/.cache/ccache
-          key: ${{ steps.cache_key.outputs.cache_key }}
+          key: ${{ steps.ccache_key.outputs.cache_key }}
 
-      - name: Display cache save status
+      - name: Display ccache save status
         if: success()
         run: |
-          echo "=== Cache Saved ==="
-          echo "pip key: ${{ steps.cache_key.outputs.pip_key }}"
-          echo "ccache key: ${{ steps.cache_key.outputs.cache_key }}"
-          echo ""
-          PIP_SIZE=$(du -sh ~/.cache/pip 2>/dev/null | cut -f1)
+          echo "=== ccache Saved ==="
+          echo "Key: ${{ steps.ccache_key.outputs.cache_key }}"
           CCACHE_SIZE=$(du -sh ~/.cache/ccache 2>/dev/null | cut -f1)
-          echo "pip cache: ${PIP_SIZE}"
-          echo "ccache: ${CCACHE_SIZE}"
+          echo "ccache size: ${CCACHE_SIZE}"
 
       # ==================== 打包和上传 ====================
       - name: Package test source

From 76a089952b0dc2956aa923514504c143801273c8 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 11:34:46 +0800
Subject: [PATCH 043/132] Fix ccache configuration: use symbolic links instead
 of CC environment variable

Problem: CMake passed compiler flags directly to ccache instead of gcc
  /usr/bin/ccache -DCAFFE2_PERF_WITH_SVE=1 ...
  ccache: invalid option -- 'D'

Cause: Setting CC="ccache gcc" caused CMake to invoke ccache directly
with compiler flags, but ccache doesn't accept -D options.

Solution: Use symbolic links to let ccache masquerade as gcc/g++
  ln -sf /usr/bin/ccache /usr/local/bin/gcc
  ln -sf /usr/bin/ccache /usr/local/bin/g++

This way:
- CMake calls /usr/local/bin/gcc
- gcc is actually a symlink to ccache
- ccache intercepts the call, caches results, then calls real gcc

Removed:
- CC/CXX environment variables
- CMAKE_C_COMPILER_LAUNCHER settings (redundant with symlinks)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index 4fd68fa0bc..1503cd8728 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -126,14 +126,26 @@ jobs:
           ccache --set-config=max_size=5G
           ccache --set-config=compression=true
 
-          # 设置 ccache 环境变量
+          # 使用符号链接方式让 ccache 模拟 gcc/g++
+          # 这是推荐的方式，避免 CMake 直接把编译选项传给 ccache
+          # CMake 会调用 gcc，但实际上通过符号链接调用 ccache
+          mkdir -p /usr/local/bin
+          ln -sf /usr/bin/ccache /usr/local/bin/gcc
+          ln -sf /usr/bin/ccache /usr/local/bin/g++
+          ln -sf /usr/bin/ccache /usr/local/bin/cc
+          ln -sf /usr/bin/ccache /usr/local/bin/c++
+
+          # 设置 PATH 优先使用符号链接
+          echo "PATH=/usr/local/bin:$PATH" >> $GITHUB_ENV
           echo "CCACHE_DIR=~/.cache/ccache" >> $GITHUB_ENV
-          echo "CC=ccache gcc" >> $GITHUB_ENV
-          echo "CXX=ccache g++" >> $GITHUB_ENV
 
           echo "=== ccache Configuration ==="
           ccache --show-config
 
+          echo ""
+          echo "=== Symbolic Links ==="
+          ls -la /usr/local/bin/gcc /usr/local/bin/g++
+
           echo ""
           echo "=== ccache Statistics (before build) ==="
           ccache --show-stats
@@ -157,8 +169,6 @@ jobs:
           export CMAKE_BUILD_TYPE=Release
           export USE_OPENMP=1
           export USE_MKLDNN=0
-          export CMAKE_C_COMPILER_LAUNCHER=ccache
-          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
 
           python${{ inputs.python_version }} setup.py build bdist_wheel
 
@@ -185,8 +195,6 @@ jobs:
           pip${{ inputs.python_version }} install cmake ninja numpy packaging pyyaml requests six typing-extensions
 
           export MAX_JOBS=40
-          export CMAKE_C_COMPILER_LAUNCHER=ccache
-          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
 
           # 禁用 torchair 构建（上游 PyTorch main API 变化导致兼容性问题）
           bash ci/build.sh --python=${{ inputs.python_version }} --disable_torchair

From 64ecb1347521a257aa9a90565b588dc24bebe2de Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 12:12:10 +0800
Subject: [PATCH 044/132] Increase build parallelism: MAX_JOBS=80, ccache
 max_size=20G

- Increase MAX_JOBS from 40 to 80 for both PyTorch and torch_npu builds
- Increase ccache max_size from 5G to 20G to accommodate more cache

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index 1503cd8728..ca33dd3226 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -123,7 +123,7 @@ jobs:
 
           # 配置 ccache
           ccache --set-config=cache_dir=~/.cache/ccache
-          ccache --set-config=max_size=5G
+          ccache --set-config=max_size=20G
           ccache --set-config=compression=true
 
           # 使用符号链接方式让 ccache 模拟 gcc/g++
@@ -162,7 +162,7 @@ jobs:
           pip${{ inputs.python_version }} install -r requirements-build.txt
 
           # 设置构建环境变量
-          export MAX_JOBS=40
+          export MAX_JOBS=80
           export USE_CUDA=0
           export USE_CUDNN=0
           export USE_DISTRIBUTED=1
@@ -194,7 +194,7 @@ jobs:
           pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
           pip${{ inputs.python_version }} install cmake ninja numpy packaging pyyaml requests six typing-extensions
 
-          export MAX_JOBS=40
+          export MAX_JOBS=80
 
           # 禁用 torchair 构建（上游 PyTorch main API 变化导致兼容性问题）
           bash ci/build.sh --python=${{ inputs.python_version }} --disable_torchair

From a83b36f8621d739d52bd3602f2299ff74ca4bb91 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 12:16:09 +0800
Subject: [PATCH 045/132] Increase build parallelism: MAX_JOBS=128

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index ca33dd3226..7ea9627a30 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -162,7 +162,7 @@ jobs:
           pip${{ inputs.python_version }} install -r requirements-build.txt
 
           # 设置构建环境变量
-          export MAX_JOBS=80
+          export MAX_JOBS=128
           export USE_CUDA=0
           export USE_CUDNN=0
           export USE_DISTRIBUTED=1
@@ -194,7 +194,7 @@ jobs:
           pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
           pip${{ inputs.python_version }} install cmake ninja numpy packaging pyyaml requests six typing-extensions
 
-          export MAX_JOBS=80
+          export MAX_JOBS=128
 
           # 禁用 torchair 构建（上游 PyTorch main API 变化导致兼容性问题）
           bash ci/build.sh --python=${{ inputs.python_version }} --disable_torchair

From 6554007623a84ec11e223332411ffd34c77f3261 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 13:16:55 +0800
Subject: [PATCH 046/132] Add pip cache to accelerate dependency download

- Add pip cache restore before build (key based on requirements-build.txt hash)
- Add pip cache save after successful build
- pip cache accelerates dependency download, does not affect build result

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml | 45 ++++++++++++++++++++++----
 1 file changed, 38 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index 7ea9627a30..e1abcaf4c3 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -93,9 +93,29 @@ jobs:
           path: torch_npu-src
           submodules: recursive
 
+      # ==================== pip 缓存配置 ====================
+      # pip 缓存加速依赖下载，不影响构建结果
+      # 缓存键基于 requirements-build.txt hash（依赖变化频率低）
+      - name: Get pip cache key
+        id: pip_key
+        run: |
+          REQUIREMENTS_HASH=$(cd pytorch-src && sha256sum requirements-build.txt | cut -d' ' -f1)
+          echo "cache_key=${{ env.CACHE_VERSION }}-pip-${{ inputs.python_version }}-${REQUIREMENTS_HASH}" >> $GITHUB_OUTPUT
+
+      - name: Restore pip cache
+        uses: actions/cache/restore@v4
+        with:
+          path: ~/.cache/pip
+          key: ${{ steps.pip_key.outputs.cache_key }}
+          restore-keys: |
+            ${{ env.CACHE_VERSION }}-pip-${{ inputs.python_version }}-
+            ${{ env.CACHE_VERSION }}-pip-
+
+      - name: Setup pip cache directory
+        run: |
+          mkdir -p ~/.cache/pip
+
       # ==================== ccache 缓存配置 ====================
-      # 只使用 ccache 缓存，不使用 pip 缓存
-      # pip 缓存可能导致依赖查找问题，且 pip 下载已经很快（几秒）
       # ccache 是真正加速编译的关键（可节省 30-60 分钟）
       - name: Get ccache key
         id: ccache_key
@@ -157,7 +177,7 @@ jobs:
 
           cd pytorch-src
 
-          # 安装构建依赖（不使用 pip 缓存，避免依赖查找问题）
+          # 安装构建依赖（pip 缓存已恢复，加速下载）
           pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
           pip${{ inputs.python_version }} install -r requirements-build.txt
 
@@ -206,7 +226,14 @@ jobs:
           echo "=== ccache Statistics (after torch_npu build) ==="
           ccache --show-stats
 
-      # ==================== 保存 ccache 缓存 ====================
+      # ==================== 保存缓存 ====================
+      - name: Save pip cache
+        if: success()
+        uses: actions/cache/save@v4
+        with:
+          path: ~/.cache/pip
+          key: ${{ steps.pip_key.outputs.cache_key }}
+
       - name: Save ccache
         if: success()
         uses: actions/cache/save@v4
@@ -214,11 +241,15 @@ jobs:
           path: ~/.cache/ccache
           key: ${{ steps.ccache_key.outputs.cache_key }}
 
-      - name: Display ccache save status
+      - name: Display cache save status
         if: success()
         run: |
-          echo "=== ccache Saved ==="
-          echo "Key: ${{ steps.ccache_key.outputs.cache_key }}"
+          echo "=== Cache Saved ==="
+          echo "pip cache key: ${{ steps.pip_key.outputs.cache_key }}"
+          PIP_CACHE_SIZE=$(du -sh ~/.cache/pip 2>/dev/null | cut -f1)
+          echo "pip cache size: ${PIP_CACHE_SIZE}"
+          echo ""
+          echo "ccache key: ${{ steps.ccache_key.outputs.cache_key }}"
           CCACHE_SIZE=$(du -sh ~/.cache/ccache 2>/dev/null | cut -f1)
           echo "ccache size: ${CCACHE_SIZE}"
 

From 30868616530da091c78258952b27c3849ca93dd5 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 13:55:48 +0800
Subject: [PATCH 047/132] Add pip cache to _test.yml for test dependencies

- Add pip cache restore before installing test dependencies
- Cache key based on requirements-ci.txt hash
- Save pip cache after test run (even if tests fail)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_test.yml | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
index 851a7bb1f2..5a7ce7ebf2 100644
--- a/.github/workflows/_test.yml
+++ b/.github/workflows/_test.yml
@@ -19,6 +19,7 @@ on:
 
 env:
   PYTHON_VERSION: '3.11'
+  CACHE_VERSION: 'v2'
 
 jobs:
   test:
@@ -37,6 +38,27 @@ jobs:
             .github/scripts
             .ci/docker
 
+      # ==================== pip 缓存配置 ====================
+      # pip 缓存加速测试依赖下载
+      - name: Get pip cache key
+        id: pip_key
+        run: |
+          REQUIREMENTS_HASH=$(sha256sum .ci/docker/requirements-ci.txt | cut -d' ' -f1)
+          echo "cache_key=${{ env.CACHE_VERSION }}-pip-test-${{ env.PYTHON_VERSION }}-${REQUIREMENTS_HASH}" >> $GITHUB_OUTPUT
+
+      - name: Restore pip cache
+        uses: actions/cache/restore@v4
+        with:
+          path: ~/.cache/pip
+          key: ${{ steps.pip_key.outputs.cache_key }}
+          restore-keys: |
+            ${{ env.CACHE_VERSION }}-pip-test-${{ env.PYTHON_VERSION }}-
+            ${{ env.CACHE_VERSION }}-pip-test-
+
+      - name: Setup pip cache directory
+        run: |
+          mkdir -p ~/.cache/pip
+
       - name: Download torch wheel
         uses: actions/download-artifact@v4
         with:
@@ -85,6 +107,14 @@ jobs:
             --max-workers ${{ inputs.max-workers }} \
             --verbose
 
+      # ==================== 保存 pip 缓存 ====================
+      - name: Save pip cache
+        if: always()
+        uses: actions/cache/save@v4
+        with:
+          path: ~/.cache/pip
+          key: ${{ steps.pip_key.outputs.cache_key }}
+
       - name: Upload test reports
         if: always()
         uses: actions/upload-artifact@v4

From c81036655e25ff36d86dcdf9c0c721b22ac35fbe Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 14:10:03 +0800
Subject: [PATCH 048/132] Always save cache regardless of build result

- Change cache save condition from `if: success()` to `if: always()`
- Cache is useful even when build fails (e.g., partial compilation progress)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index e1abcaf4c3..5e4f1a7afd 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -228,21 +228,21 @@ jobs:
 
       # ==================== 保存缓存 ====================
       - name: Save pip cache
-        if: success()
+        if: always()
         uses: actions/cache/save@v4
         with:
           path: ~/.cache/pip
           key: ${{ steps.pip_key.outputs.cache_key }}
 
       - name: Save ccache
-        if: success()
+        if: always()
         uses: actions/cache/save@v4
         with:
           path: ~/.cache/ccache
           key: ${{ steps.ccache_key.outputs.cache_key }}
 
       - name: Display cache save status
-        if: success()
+        if: always()
         run: |
           echo "=== Cache Saved ==="
           echo "pip cache key: ${{ steps.pip_key.outputs.cache_key }}"

From 7bfbf93aa39f5a662996a3799062d0215803e52e Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 14:30:14 +0800
Subject: [PATCH 049/132] Add PyPI cache URL to accelerate pip downloads

- Add PYPI_CACHE_URL environment variable pointing to nginx pypi cache
- Configure pip index-url before installing dependencies
- Reference: https://github.com/sgl-project/sglang/blob/main/.github/workflows/pr-test-npu.yml

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml | 13 +++++++++++++
 .github/workflows/_test.yml            | 13 +++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index 5e4f1a7afd..4ae8c41859 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -34,6 +34,8 @@ env:
   CACHE_VERSION: 'v2'
   # GitHub 代理 URL（用于加速 git clone，留空则不使用代理）
   GH_PROXY_URL: 'https://gh-proxy.test.osinfra.cn'
+  # PyPI 缓存 URL（用于加速 pip 下载）
+  PYPI_CACHE_URL: 'http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple'
 
 jobs:
   build:
@@ -115,6 +117,17 @@ jobs:
         run: |
           mkdir -p ~/.cache/pip
 
+      - name: Configure pip index URL
+        run: |
+          # 配置 pip 使用 PyPI 缓存加速下载
+          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
+            pip${{ inputs.python_version }} config set global.index-url ${{ env.PYPI_CACHE_URL }}
+            pip${{ inputs.python_version }} config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local"
+            echo "pip index-url configured: ${{ env.PYPI_CACHE_URL }}"
+          else
+            echo "No PyPI cache URL configured, using default"
+          fi
+
       # ==================== ccache 缓存配置 ====================
       # ccache 是真正加速编译的关键（可节省 30-60 分钟）
       - name: Get ccache key
diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
index 5a7ce7ebf2..8f8961e2fa 100644
--- a/.github/workflows/_test.yml
+++ b/.github/workflows/_test.yml
@@ -20,6 +20,8 @@ on:
 env:
   PYTHON_VERSION: '3.11'
   CACHE_VERSION: 'v2'
+  # PyPI 缓存 URL（用于加速 pip 下载）
+  PYPI_CACHE_URL: 'http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple'
 
 jobs:
   test:
@@ -59,6 +61,17 @@ jobs:
         run: |
           mkdir -p ~/.cache/pip
 
+      - name: Configure pip index URL
+        run: |
+          # 配置 pip 使用 PyPI 缓存加速下载
+          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
+            pip${{ env.PYTHON_VERSION }} config set global.index-url ${{ env.PYPI_CACHE_URL }}
+            pip${{ env.PYTHON_VERSION }} config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local"
+            echo "pip index-url configured: ${{ env.PYPI_CACHE_URL }}"
+          else
+            echo "No PyPI cache URL configured, using default"
+          fi
+
       - name: Download torch wheel
         uses: actions/download-artifact@v4
         with:

From 891621b4f2ef38a2e0e515061022d3d6fa553ada Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 16:05:58 +0800
Subject: [PATCH 050/132] Fix Jinja2 compatibility issue and add pip cache to
 _collect.yml

- Add PYPI_CACHE_URL and pip cache configuration to _collect.yml
- Add "Upgrade pip and setuptools" step before installing wheels
  (fixes Jinja2 2.6 compatibility issue with new setuptools)
- Also add upgrade step to _build_torch_npu.yml and _test.yml

The Jinja2 2.6 version from PyPI cache is incompatible with setuptools>=70,
which removed the deprecated 'Feature' API. Upgrading pip/setuptools first
ensures proper version resolution.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml |  5 +++
 .github/workflows/_collect.yml         | 47 ++++++++++++++++++++++++++
 .github/workflows/_test.yml            |  5 +++
 3 files changed, 57 insertions(+)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index 4ae8c41859..f1f6bda14c 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -128,6 +128,11 @@ jobs:
             echo "No PyPI cache URL configured, using default"
           fi
 
+      - name: Upgrade pip and setuptools
+        run: |
+          # 先升级 pip 和 setuptools，避免旧版包兼容性问题
+          pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
+
       # ==================== ccache 缓存配置 ====================
       # ccache 是真正加速编译的关键（可节省 30-60 分钟）
       - name: Get ccache key
diff --git a/.github/workflows/_collect.yml b/.github/workflows/_collect.yml
index b01028cb7e..f26503b395 100644
--- a/.github/workflows/_collect.yml
+++ b/.github/workflows/_collect.yml
@@ -27,6 +27,9 @@ on:
 
 env:
   PYTHON_VERSION: '3.11'
+  CACHE_VERSION: 'v2'
+  # PyPI 缓存 URL（用于加速 pip 下载）
+  PYPI_CACHE_URL: 'http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple'
 
 jobs:
   collect:
@@ -49,6 +52,42 @@ jobs:
             .github/scripts
             .ci/docker
 
+      # ==================== pip 缓存配置 ====================
+      - name: Get pip cache key
+        id: pip_key
+        run: |
+          REQUIREMENTS_HASH=$(sha256sum .ci/docker/requirements-ci.txt | cut -d' ' -f1)
+          echo "cache_key=${{ env.CACHE_VERSION }}-pip-collect-${{ env.PYTHON_VERSION }}-${REQUIREMENTS_HASH}" >> $GITHUB_OUTPUT
+
+      - name: Restore pip cache
+        uses: actions/cache/restore@v4
+        with:
+          path: ~/.cache/pip
+          key: ${{ steps.pip_key.outputs.cache_key }}
+          restore-keys: |
+            ${{ env.CACHE_VERSION }}-pip-collect-${{ env.PYTHON_VERSION }}-
+            ${{ env.CACHE_VERSION }}-pip-collect-
+
+      - name: Setup pip cache directory
+        run: |
+          mkdir -p ~/.cache/pip
+
+      - name: Configure pip index URL
+        run: |
+          # 配置 pip 使用 PyPI 缓存加速下载
+          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
+            pip${{ env.PYTHON_VERSION }} config set global.index-url ${{ env.PYPI_CACHE_URL }}
+            pip${{ env.PYTHON_VERSION }} config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local"
+            echo "pip index-url configured: ${{ env.PYPI_CACHE_URL }}"
+          else
+            echo "No PyPI cache URL configured, using default"
+          fi
+
+      - name: Upgrade pip and setuptools
+        run: |
+          # 先升级 pip 和 setuptools，避免旧版包兼容性问题
+          pip${{ env.PYTHON_VERSION }} install --upgrade pip setuptools wheel
+
       - name: Download wheels
         uses: actions/download-artifact@v4
         with:
@@ -78,6 +117,14 @@ jobs:
         run: |
           pip${{ env.PYTHON_VERSION }} install -r .ci/docker/requirements-ci.txt
 
+      # ==================== 保存 pip 缓存 ====================
+      - name: Save pip cache
+        if: always()
+        uses: actions/cache/save@v4
+        with:
+          path: ~/.cache/pip
+          key: ${{ steps.pip_key.outputs.cache_key }}
+
       - name: Collect all test cases
         id: collect
         run: |
diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
index 8f8961e2fa..fbc648926d 100644
--- a/.github/workflows/_test.yml
+++ b/.github/workflows/_test.yml
@@ -72,6 +72,11 @@ jobs:
             echo "No PyPI cache URL configured, using default"
           fi
 
+      - name: Upgrade pip and setuptools
+        run: |
+          # 先升级 pip 和 setuptools，避免旧版包兼容性问题
+          pip${{ env.PYTHON_VERSION }} install --upgrade pip setuptools wheel
+
       - name: Download torch wheel
         uses: actions/download-artifact@v4
         with:

From 7d69c097a48bbfa30f937bcae86d49ea0bd503bb Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 17:48:00 +0800
Subject: [PATCH 051/132] Optimize test source package: only package test
 directory

- Package only pytorch-src/test instead of entire pytorch-src
- Reduces artifact size from GBs to ~tens of MBs
- Update _collect.yml and _test.yml to use new directory structure
- Extract test-src.tar.gz now produces test/ directory directly

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml | 8 ++------
 .github/workflows/_collect.yml         | 5 ++---
 .github/workflows/_test.yml            | 2 +-
 3 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index f1f6bda14c..fe0570f6d3 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -274,12 +274,8 @@ jobs:
       # ==================== 打包和上传 ====================
       - name: Package test source
         run: |
-          tar -czf test-src.tar.gz \
-            --exclude='pytorch-src/build' \
-            --exclude='pytorch-src/dist' \
-            --exclude='torch_npu-src/build' \
-            --exclude='torch_npu-src/dist' \
-            pytorch-src
+          # 只打包测试目录，不需要整个 PyTorch 源码
+          tar -czf test-src.tar.gz pytorch-src/test
           ls -la test-src.tar.gz
 
       - name: Upload PyTorch wheel
diff --git a/.github/workflows/_collect.yml b/.github/workflows/_collect.yml
index f26503b395..aaa447757a 100644
--- a/.github/workflows/_collect.yml
+++ b/.github/workflows/_collect.yml
@@ -129,9 +129,8 @@ jobs:
         id: collect
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          cd pytorch-src
 
-          python${{ env.PYTHON_VERSION }} ../.github/scripts/collect_all_cases.py \
+          python${{ env.PYTHON_VERSION }} .github/scripts/collect_all_cases.py \
             --test-dir test \
             --distributed-shards ${{ inputs.distributed-shards }} \
             --regular-shards ${{ inputs.regular-shards }} \
@@ -154,4 +153,4 @@ jobs:
         uses: actions/upload-artifact@v4
         with:
           name: cases-shards
-          path: pytorch-src/cases_shards/
\ No newline at end of file
+          path: cases_shards/
\ No newline at end of file
diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
index fbc648926d..9d3776cea3 100644
--- a/.github/workflows/_test.yml
+++ b/.github/workflows/_test.yml
@@ -119,7 +119,7 @@ jobs:
 
           python .github/scripts/run_npu_test_shard.py \
             --cases-json cases-shards/${{ inputs.test-type }}_cases_shard_${{ inputs.shard-index }}.json \
-            --test-dir pytorch-src/test \
+            --test-dir test \
             --report-dir test-reports \
             --timeout 300 \
             --max-workers ${{ inputs.max-workers }} \

From 931120fb88c30c9680b85d63edf5704ced998052 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 18:03:19 +0800
Subject: [PATCH 052/132] Add verbose logging to collect_all_cases.py

- Add --verbose flag to print detailed collection progress
- Print each file's collection result (OK/WARN/ERROR/SKIP/TIMEOUT)
- Print pytest stderr when collection fails or returns 0 cases
- Print error summary at the end with top 10 failed files

This helps diagnose why cases collection returns 0 cases.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/scripts/collect_all_cases.py | 91 +++++++++++++++++++++++-----
 .github/workflows/_collect.yml       |  3 +-
 2 files changed, 79 insertions(+), 15 deletions(-)

diff --git a/.github/scripts/collect_all_cases.py b/.github/scripts/collect_all_cases.py
index 2b0e3914b7..4744d2d467 100644
--- a/.github/scripts/collect_all_cases.py
+++ b/.github/scripts/collect_all_cases.py
@@ -30,11 +30,18 @@ def is_distributed_test(test_file: str) -> bool:
     return False
 
 
-def collect_cases_from_file(test_dir: Path, test_file: str, parallel: int = 1) -> List[str]:
-    """Collect test cases from a single test file using pytest --collect-only."""
+def collect_cases_from_file(test_dir: Path, test_file: str, parallel: int = 1, verbose: bool = False) -> Tuple[List[str], str]:
+    """Collect test cases from a single test file using pytest --collect-only.
+
+    Returns:
+        Tuple of (cases list, error message or empty string)
+    """
     full_path = test_dir / test_file
     if not full_path.exists():
-        return []
+        error = f"File not found: {full_path}"
+        if verbose:
+            print(f"[SKIP] {test_file}: {error}")
+        return [], error
 
     try:
         result = subprocess.run(
@@ -54,13 +61,38 @@ def collect_cases_from_file(test_dir: Path, test_file: str, parallel: int = 1) -
                 if case_id and not case_id.startswith("<"):
                     cases.append(case_id)
 
-        return cases
+        # Check for errors
+        error_msg = ""
+        if result.returncode != 0:
+            error_msg = result.stderr.strip() if result.stderr else "Unknown error"
+            if verbose:
+                print(f"[ERROR] {test_file}: pytest returned {result.returncode}")
+                if result.stderr:
+                    print(f"  stderr: {result.stderr[:200]}")
+        elif len(cases) == 0:
+            # No cases collected, might be import error or empty file
+            if result.stderr:
+                error_msg = result.stderr.strip()
+                if verbose:
+                    print(f"[WARN] {test_file}: 0 cases collected, stderr: {result.stderr[:200]}")
+            else:
+                if verbose:
+                    print(f"[WARN] {test_file}: 0 cases collected (possibly empty or all skipped)")
+        else:
+            if verbose:
+                print(f"[OK] {test_file}: {len(cases)} cases collected")
+
+        return cases, error_msg
     except subprocess.TimeoutExpired:
-        print(f"Timeout collecting cases from {test_file}")
-        return []
+        error = "Timeout after 60s"
+        if verbose:
+            print(f"[TIMEOUT] {test_file}: {error}")
+        return [], error
     except Exception as e:
-        print(f"Error collecting cases from {test_file}: {e}")
-        return []
+        error = str(e)
+        if verbose:
+            print(f"[EXCEPTION] {test_file}: {error}")
+        return [], error
 
 
 def discover_test_files(test_dir: Path) -> List[str]:
@@ -86,7 +118,8 @@ def collect_all_cases(
     distributed_shards: int,
     regular_shards: int,
     output_dir: str,
-    parallel: int = 1
+    parallel: int = 1,
+    verbose: bool = False
 ) -> Dict:
     """Collect all test cases and shard them."""
     test_dir_path = Path(test_dir)
@@ -106,34 +139,62 @@ def collect_all_cases(
     print(f"Distributed test files: {len(distributed_files)}")
     print(f"Regular test files: {len(regular_files)}")
 
+    if verbose:
+        print("\n=== Collecting distributed cases ===")
+
     # Collect cases in parallel
     print("Collecting distributed cases...")
     distributed_cases = []
+    distributed_errors = {}
     with ThreadPoolExecutor(max_workers=parallel) as executor:
         futures = {
-            executor.submit(collect_cases_from_file, test_dir_path, f, parallel): f
+            executor.submit(collect_cases_from_file, test_dir_path, f, parallel, verbose): f
             for f in distributed_files
         }
         for future in as_completed(futures):
             file = futures[future]
-            cases = future.result()
+            cases, error = future.result()
             distributed_cases.extend(cases)
+            if error:
+                distributed_errors[file] = error
+
+    if verbose:
+        print("\n=== Collecting regular cases ===")
 
     print("Collecting regular cases...")
     regular_cases = []
+    regular_errors = {}
     with ThreadPoolExecutor(max_workers=parallel) as executor:
         futures = {
-            executor.submit(collect_cases_from_file, test_dir_path, f, parallel): f
+            executor.submit(collect_cases_from_file, test_dir_path, f, parallel, verbose): f
             for f in regular_files
         }
         for future in as_completed(futures):
             file = futures[future]
-            cases = future.result()
+            cases, error = future.result()
             regular_cases.extend(cases)
+            if error:
+                regular_errors[file] = error
 
     print(f"Total distributed cases: {len(distributed_cases)}")
     print(f"Total regular cases: {len(regular_cases)}")
 
+    # Print summary of errors if any
+    if distributed_errors or regular_errors:
+        print("\n=== Collection Errors Summary ===")
+        if distributed_errors:
+            print(f"Distributed files with errors: {len(distributed_errors)}")
+            for file, error in sorted(distributed_errors.items())[:10]:
+                print(f"  {file}: {error[:100]}")
+            if len(distributed_errors) > 10:
+                print(f"  ... and {len(distributed_errors) - 10} more")
+        if regular_errors:
+            print(f"Regular files with errors: {len(regular_errors)}")
+            for file, error in sorted(regular_errors.items())[:10]:
+                print(f"  {file}: {error[:100]}")
+            if len(regular_errors) > 10:
+                print(f"  ... and {len(regular_errors) - 10} more")
+
     # Shard cases
     distributed_sharded = shard_cases(distributed_cases, distributed_shards)
     regular_sharded = shard_cases(regular_cases, regular_shards)
@@ -187,6 +248,7 @@ def main():
     parser.add_argument("--regular-shards", type=int, default=5, help="Number of regular test shards")
     parser.add_argument("--output-dir", required=True, help="Output directory for shard JSON files")
     parser.add_argument("--parallel", type=int, default=1, help="Number of parallel collectors")
+    parser.add_argument("--verbose", action="store_true", help="Print detailed collection progress")
 
     args = parser.parse_args()
 
@@ -195,7 +257,8 @@ def main():
         distributed_shards=args.distributed_shards,
         regular_shards=args.regular_shards,
         output_dir=args.output_dir,
-        parallel=args.parallel
+        parallel=args.parallel,
+        verbose=args.verbose
     )
 
     print("\nCollection Summary:")
diff --git a/.github/workflows/_collect.yml b/.github/workflows/_collect.yml
index aaa447757a..ac3a7a90e7 100644
--- a/.github/workflows/_collect.yml
+++ b/.github/workflows/_collect.yml
@@ -135,7 +135,8 @@ jobs:
             --distributed-shards ${{ inputs.distributed-shards }} \
             --regular-shards ${{ inputs.regular-shards }} \
             --output-dir cases_shards \
-            --parallel 16
+            --parallel 16 \
+            --verbose
 
           TOTAL=$(python${{ env.PYTHON_VERSION }} -c "import json; d=json.load(open('cases_shards/cases_collection_summary.json')); print(d['total_cases'])")
           echo "total_cases=${TOTAL}" >> $GITHUB_OUTPUT

From 6f572b81b4221999e1b0f7a3a5efcb9e66acfd2a Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 18:09:30 +0800
Subject: [PATCH 053/132] Add test-collect.yml workflow and remove PR trigger
 from npu-sync-test.yml

- Create new test-collect.yml workflow for testing collect stage
- Triggered by PR events on workflow/scripts paths
- Downloads artifacts from specified workflow run (default: 25097789439)
- Uses dawidd6/action-download-artifact@v3 for cross-repo artifact download
- Remove pull_request trigger from npu-sync-test.yml (keep only workflow_dispatch)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/npu-sync-test.yml |   6 -
 .github/workflows/test-collect.yml  | 171 ++++++++++++++++++++++++++++
 2 files changed, 171 insertions(+), 6 deletions(-)
 create mode 100644 .github/workflows/test-collect.yml

diff --git a/.github/workflows/npu-sync-test.yml b/.github/workflows/npu-sync-test.yml
index ec9cc3b67b..0c81b4172a 100644
--- a/.github/workflows/npu-sync-test.yml
+++ b/.github/workflows/npu-sync-test.yml
@@ -1,12 +1,6 @@
 name: PyTorch NPU Sync Test
 
 on:
-  pull_request:
-    branches: [main, master]
-    paths:
-      - '.github/workflows/**'
-      - '.github/scripts/**'
-      - '.github/docker/**'
   workflow_dispatch:
     inputs:
       docker_image_tag:
diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
new file mode 100644
index 0000000000..d3bfc1d22f
--- /dev/null
+++ b/.github/workflows/test-collect.yml
@@ -0,0 +1,171 @@
+name: Test Collect Stage
+
+on:
+  pull_request:
+    branches: [main, master]
+    paths:
+      - '.github/workflows/**'
+      - '.github/scripts/**'
+  workflow_dispatch:
+    inputs:
+      run_id:
+        description: 'Workflow run ID to download artifacts from'
+        required: true
+        default: '25097789439'
+      distributed_shards:
+        description: 'Distributed test shards'
+        default: 2
+        type: number
+      regular_shards:
+        description: 'Regular test shards'
+        default: 5
+        type: number
+
+env:
+  PYTHON_VERSION: '3.11'
+  CACHE_VERSION: 'v2'
+  PYPI_CACHE_URL: 'http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple'
+
+jobs:
+  collect:
+    runs-on: linux-aarch64-a3-16
+    timeout-minutes: 60
+    outputs:
+      distributed_matrix: ${{ steps.collect.outputs.distributed_matrix }}
+      regular_matrix: ${{ steps.collect.outputs.regular_matrix }}
+      total_cases: ${{ steps.collect.outputs.total_cases }}
+
+    container:
+      image: quay.io/kerer/pytorch:latest
+      options: --user root
+
+    steps:
+      - name: Checkout scripts
+        uses: actions/checkout@v4
+        with:
+          sparse-checkout: |
+            .github/scripts
+            .ci/docker
+
+      # ==================== pip 缓存配置 ====================
+      - name: Get pip cache key
+        id: pip_key
+        run: |
+          REQUIREMENTS_HASH=$(sha256sum .ci/docker/requirements-ci.txt | cut -d' ' -f1)
+          echo "cache_key=${{ env.CACHE_VERSION }}-pip-collect-${{ env.PYTHON_VERSION }}-${REQUIREMENTS_HASH}" >> $GITHUB_OUTPUT
+
+      - name: Restore pip cache
+        uses: actions/cache/restore@v4
+        with:
+          path: ~/.cache/pip
+          key: ${{ steps.pip_key.outputs.cache_key }}
+          restore-keys: |
+            ${{ env.CACHE_VERSION }}-pip-collect-${{ env.PYTHON_VERSION }}-
+            ${{ env.CACHE_VERSION }}-pip-collect-
+
+      - name: Setup pip cache directory
+        run: |
+          mkdir -p ~/.cache/pip
+
+      - name: Configure pip index URL
+        run: |
+          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
+            pip${{ env.PYTHON_VERSION }} config set global.index-url ${{ env.PYPI_CACHE_URL }}
+            pip${{ env.PYTHON_VERSION }} config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local"
+            echo "pip index-url configured: ${{ env.PYPI_CACHE_URL }}"
+          else
+            echo "No PyPI cache URL configured, using default"
+          fi
+
+      - name: Upgrade pip and setuptools
+        run: |
+          pip${{ env.PYTHON_VERSION }} install --upgrade pip setuptools wheel
+
+      # ==================== Download artifacts from previous workflow run ====================
+      - name: Download artifacts from workflow run
+        uses: dawidd6/action-download-artifact@v3
+        with:
+          run_id: ${{ inputs.run_id || '25097789439' }}
+          name: torch-wheel-main
+          path: wheels
+          repo: Ascend/pytorch
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          skip_unpack: false
+
+      - name: Download torch_npu wheel from workflow run
+        uses: dawidd6/action-download-artifact@v3
+        with:
+          run_id: ${{ inputs.run_id || '25097789439' }}
+          name: torch-npu-wheel-main
+          path: wheels
+          repo: Ascend/pytorch
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Download test source from workflow run
+        uses: dawidd6/action-download-artifact@v3
+        with:
+          run_id: ${{ inputs.run_id || '25097789439' }}
+          name: test-src-main
+          repo: Ascend/pytorch
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract test source
+        run: tar -xzf test-src.tar.gz
+
+      - name: Install wheels
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          pip${{ env.PYTHON_VERSION }} install wheels/*.whl
+
+      - name: Install test dependencies
+        run: |
+          pip${{ env.PYTHON_VERSION }} install -r .ci/docker/requirements-ci.txt
+
+      # ==================== 保存 pip 缓存 ====================
+      - name: Save pip cache
+        if: always()
+        uses: actions/cache/save@v4
+        with:
+          path: ~/.cache/pip
+          key: ${{ steps.pip_key.outputs.cache_key }}
+
+      - name: Collect all test cases
+        id: collect
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+
+          python${{ env.PYTHON_VERSION }} .github/scripts/collect_all_cases.py \
+            --test-dir test \
+            --distributed-shards ${{ inputs.distributed_shards || 2 }} \
+            --regular-shards ${{ inputs.regular_shards || 5 }} \
+            --output-dir cases_shards \
+            --parallel 16 \
+            --verbose
+
+          TOTAL=$(python${{ env.PYTHON_VERSION }} -c "import json; d=json.load(open('cases_shards/cases_collection_summary.json')); print(d['total_cases'])")
+          echo "total_cases=${TOTAL}" >> $GITHUB_OUTPUT
+
+          # Generate shard matrices
+          python${{ env.PYTHON_VERSION }} -c "
+          import json
+          d_shards = ${{ inputs.distributed_shards || 2 }}
+          r_shards = ${{ inputs.regular_shards || 5 }}
+          print(f'distributed_matrix={json.dumps(list(range(1, d_shards + 1)))}')
+          print(f'regular_matrix={json.dumps(list(range(1, r_shards + 1)))}')
+          " >> $GITHUB_OUTPUT
+
+      - name: Upload cases shard JSONs
+        uses: actions/upload-artifact@v4
+        with:
+          name: cases-shards
+          path: cases_shards/
+
+      - name: Print collection summary
+        if: always()
+        run: |
+          echo "=== Collection Summary ==="
+          if [ -f cases_shards/cases_collection_summary.json ]; then
+            cat cases_shards/cases_collection_summary.json
+          else
+            echo "Summary file not found"
+          fi
\ No newline at end of file

From 2be0d11a2638e753b045ada8e9e7981c355d0086 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 18:30:04 +0800
Subject: [PATCH 054/132] Clone PyTorch test source directly instead of
 downloading artifact

- Add GH_PROXY_URL env var for git clone acceleration
- Replace artifact download with git clone --depth=1 (no submodules)
- Update test-dir from 'test' to 'pytorch-src/test'
- Faster than downloading large artifact file

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index d3bfc1d22f..92cffc3663 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -25,6 +25,7 @@ env:
   PYTHON_VERSION: '3.11'
   CACHE_VERSION: 'v2'
   PYPI_CACHE_URL: 'http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple'
+  GH_PROXY_URL: 'https://gh-proxy.test.osinfra.cn'
 
 jobs:
   collect:
@@ -101,16 +102,17 @@ jobs:
           repo: Ascend/pytorch
           github_token: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Download test source from workflow run
-        uses: dawidd6/action-download-artifact@v3
-        with:
-          run_id: ${{ inputs.run_id || '25097789439' }}
-          name: test-src-main
-          repo: Ascend/pytorch
-          github_token: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Extract test source
-        run: tar -xzf test-src.tar.gz
+      # ==================== Clone PyTorch for test source ====================
+      - name: Clone PyTorch test source
+        run: |
+          # 使用代理加速 git clone，只获取测试用例（浅克隆，无 submodules）
+          PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
+          if [ -n "${{ env.GH_PROXY_URL }}" ]; then
+            PYTORCH_REPO="${{ env.GH_PROXY_URL }}/${PYTORCH_REPO}"
+            echo "Using proxy: ${PYTORCH_REPO}"
+          fi
+          git clone --depth=1 "${PYTORCH_REPO}" pytorch-src
+          echo "Cloned PyTorch test directory"
 
       - name: Install wheels
         run: |
@@ -135,7 +137,7 @@ jobs:
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
 
           python${{ env.PYTHON_VERSION }} .github/scripts/collect_all_cases.py \
-            --test-dir test \
+            --test-dir pytorch-src/test \
             --distributed-shards ${{ inputs.distributed_shards || 2 }} \
             --regular-shards ${{ inputs.regular_shards || 5 }} \
             --output-dir cases_shards \

From bb4c7f41ec1f348bf107c832e9a454ee80739eb2 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 18:49:11 +0800
Subject: [PATCH 055/132] Print full error logs in collect_all_cases.py verbose
 mode

- Remove truncation of stderr output (was limited to 200 chars)
- Print each error line separately for better readability
- Error summary shows first 5 lines per file (instead of 100 chars)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/scripts/collect_all_cases.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/.github/scripts/collect_all_cases.py b/.github/scripts/collect_all_cases.py
index 4744d2d467..cb803f988d 100644
--- a/.github/scripts/collect_all_cases.py
+++ b/.github/scripts/collect_all_cases.py
@@ -68,13 +68,18 @@ def collect_cases_from_file(test_dir: Path, test_file: str, parallel: int = 1, v
             if verbose:
                 print(f"[ERROR] {test_file}: pytest returned {result.returncode}")
                 if result.stderr:
-                    print(f"  stderr: {result.stderr[:200]}")
+                    # 打印完整 stderr，不截断
+                    for line in result.stderr.splitlines():
+                        print(f"  {line}")
         elif len(cases) == 0:
             # No cases collected, might be import error or empty file
             if result.stderr:
                 error_msg = result.stderr.strip()
                 if verbose:
-                    print(f"[WARN] {test_file}: 0 cases collected, stderr: {result.stderr[:200]}")
+                    print(f"[WARN] {test_file}: 0 cases collected")
+                    # 打印完整 stderr
+                    for line in result.stderr.splitlines():
+                        print(f"  {line}")
             else:
                 if verbose:
                     print(f"[WARN] {test_file}: 0 cases collected (possibly empty or all skipped)")
@@ -185,15 +190,20 @@ def collect_all_cases(
         if distributed_errors:
             print(f"Distributed files with errors: {len(distributed_errors)}")
             for file, error in sorted(distributed_errors.items())[:10]:
-                print(f"  {file}: {error[:100]}")
+                # 打印完整错误信息
+                print(f"  {file}:")
+                for line in error.splitlines()[:5]:  # 只打印前5行避免过长
+                    print(f"    {line}")
             if len(distributed_errors) > 10:
-                print(f"  ... and {len(distributed_errors) - 10} more")
+                print(f"  ... and {len(distributed_errors) - 10} more files")
         if regular_errors:
             print(f"Regular files with errors: {len(regular_errors)}")
             for file, error in sorted(regular_errors.items())[:10]:
-                print(f"  {file}: {error[:100]}")
+                print(f"  {file}:")
+                for line in error.splitlines()[:5]:
+                    print(f"    {line}")
             if len(regular_errors) > 10:
-                print(f"  ... and {len(regular_errors) - 10} more")
+                print(f"  ... and {len(regular_errors) - 10} more files")
 
     # Shard cases
     distributed_sharded = shard_cases(distributed_cases, distributed_shards)

From 5836fc1d90d6acbf0611f70bcef64aec6101e1f7 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 18:56:07 +0800
Subject: [PATCH 056/132] Add run_test.py execution step to test-collect.yml

- Add step to get test list from run_test.py --help output
- Parse TESTS list from help message epilog
- Iterate through all tests and run them using run_test.py -i
- Print detailed logs for each test, continue on failure
- Show summary of passed/failed tests at the end
- Set 120 minutes timeout for test execution

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 97 +++++++++++++++++++++++++++++-
 1 file changed, 96 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index 92cffc3663..1bac9dbae1 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -170,4 +170,99 @@ jobs:
             cat cases_shards/cases_collection_summary.json
           else
             echo "Summary file not found"
-          fi
\ No newline at end of file
+          fi
+
+      # ==================== 使用 run_test.py 运行测试 ====================
+      - name: Get test list from run_test.py
+        id: get_tests
+        run: |
+          cd pytorch-src
+          # 获取 --help 输出，从中解析测试列表
+          python${{ env.PYTHON_VERSION }} test/run_test.py --help > /tmp/run_test_help.txt 2>&1 || true
+          cat /tmp/run_test_help.txt
+
+          # 解析测试列表（从 "where TESTS is any of:" 后面获取）
+          python${{ env.PYTHON_VERSION }} -c "
+          import re
+          with open('/tmp/run_test_help.txt') as f:
+              content = f.read()
+          # 查找测试列表
+          match = re.search(r'where TESTS is any of:\s*(.+)', content)
+          if match:
+              tests_str = match.group(1)
+              # 解析逗号分隔的测试名称
+              tests = [t.strip() for t in tests_str.split(',')]
+              # 保存到文件
+              with open('/tmp/test_list.txt', 'w') as f:
+                  for t in tests:
+                      if t:
+                          f.write(t + '\n')
+              print(f'Found {len([t for t in tests if t])} tests')
+              print('Tests:', tests[:20], '...')
+          else:
+              print('Could not find TESTS list in help output')
+              # 如果解析失败，使用默认测试列表
+              default_tests = ['test_nn', 'test_torch', 'test_autograd', 'test_distributions', 'test_indexing', 'test_type_promotion']
+              with open('/tmp/test_list.txt', 'w') as f:
+                  for t in default_tests:
+                      f.write(t + '\n')
+              print(f'Using default test list: {default_tests}')
+          "
+
+          # 输出测试数量
+          TEST_COUNT=$(wc -l < /tmp/test_list.txt)
+          echo "test_count=${TEST_COUNT}" >> $GITHUB_OUTPUT
+          echo "=== Tests to run (${TEST_COUNT} tests) ==="
+          cat /tmp/test_list.txt
+
+      - name: Run tests using run_test.py
+        timeout-minutes: 120
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          cd pytorch-src
+
+          echo "=== Starting test execution ==="
+          TEST_COUNT=0
+          PASSED=0
+          FAILED=0
+
+          while IFS= read -r test_name; do
+            if [ -z "$test_name" ]; then
+              continue
+            fi
+
+            TEST_COUNT=$((TEST_COUNT + 1))
+            echo ""
+            echo "============================================"
+            echo "Test ${TEST_COUNT}: ${test_name}"
+            echo "============================================"
+
+            # 运行测试，捕获输出，即使失败也继续
+            set +e
+            python${{ env.PYTHON_VERSION }} test/run_test.py -i "${test_name}" -v 2>&1 | tee /tmp/test_output_${TEST_COUNT}.log
+            TEST_EXIT_CODE=${PIPESTATUS[0]}
+            set -e
+
+            # 检查结果
+            if [ $TEST_EXIT_CODE -eq 0 ]; then
+              echo ">>> ${test_name}: PASSED"
+              PASSED=$((PASSED + 1))
+            else
+              echo ">>> ${test_name}: FAILED (exit code: ${TEST_EXIT_CODE})"
+              FAILED=$((FAILED + 1))
+            fi
+
+            echo ""
+            echo "--------------------------------------------"
+            echo "Summary so far: ${PASSED} passed, ${FAILED} failed out of ${TEST_COUNT} tests"
+            echo "--------------------------------------------"
+
+          done < /tmp/test_list.txt
+
+          echo ""
+          echo "============================================"
+          echo "=== FINAL SUMMARY ==="
+          echo "Total tests run: ${TEST_COUNT}"
+          echo "Passed: ${PASSED}"
+          echo "Failed: ${FAILED}"
+          echo "============================================"
\ No newline at end of file

From f24a4affb53c33653ccf556162246759e02bbaf5 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 19:18:23 +0800
Subject: [PATCH 057/132] Run run_test.py from test directory to avoid torch
 import conflict

- Change from 'cd pytorch-src' to 'cd pytorch-src/test'
- Remove 'test/' prefix from run_test.py path (already in test dir)
- Avoids source torch directory conflicting with installed torch library

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index 1bac9dbae1..ceceb3423a 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -176,9 +176,11 @@ jobs:
       - name: Get test list from run_test.py
         id: get_tests
         run: |
-          cd pytorch-src
+          # 切换到 test 目录执行，避免根目录 torch 与安装的 torch 库冲突
+          cd pytorch-src/test
+
           # 获取 --help 输出，从中解析测试列表
-          python${{ env.PYTHON_VERSION }} test/run_test.py --help > /tmp/run_test_help.txt 2>&1 || true
+          python${{ env.PYTHON_VERSION }} run_test.py --help > /tmp/run_test_help.txt 2>&1 || true
           cat /tmp/run_test_help.txt
 
           # 解析测试列表（从 "where TESTS is any of:" 后面获取）
@@ -219,7 +221,9 @@ jobs:
         timeout-minutes: 120
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          cd pytorch-src
+
+          # 切换到 test 目录执行，避免根目录 torch 与安装的 torch 库冲突
+          cd pytorch-src/test
 
           echo "=== Starting test execution ==="
           TEST_COUNT=0
@@ -239,7 +243,7 @@ jobs:
 
             # 运行测试，捕获输出，即使失败也继续
             set +e
-            python${{ env.PYTHON_VERSION }} test/run_test.py -i "${test_name}" -v 2>&1 | tee /tmp/test_output_${TEST_COUNT}.log
+            python${{ env.PYTHON_VERSION }} run_test.py -i "${test_name}" -v 2>&1 | tee /tmp/test_output_${TEST_COUNT}.log
             TEST_EXIT_CODE=${PIPESTATUS[0]}
             set -e
 

From 616484e0c2c3da1a223713396c873516018c338f Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 19:23:13 +0800
Subject: [PATCH 058/132] Add NPU verification and environment setup to
 test-collect.yml

- Add Verify NPU availability step after installing wheels
- Show npu-smi info before running tests
- Import both CANN and nnal/atb environment variables before:
  - Collect all test cases
  - Get test list from run_test.py
  - Run tests using run_test.py
- Print torch/torch_npu version and NPU availability info

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 32 ++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index ceceb3423a..16cae31eb9 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -119,6 +119,27 @@ jobs:
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
           pip${{ env.PYTHON_VERSION }} install wheels/*.whl
 
+      - name: Verify NPU availability
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          echo "=== NPU-SMI Info ==="
+          npu-smi info || echo "npu-smi not available"
+
+          PYTHON=python${{ env.PYTHON_VERSION }}
+          $PYTHON -c "
+          import torch
+          print(f'torch: {torch.__version__}')
+          import torch_npu
+          print(f'torch_npu: {torch_npu.__version__}')
+          print(f'NPU available: {torch.npu.is_available()}')
+          print(f'NPU count: {torch.npu.device_count()}')
+          if torch.npu.is_available():
+              print(f'NPU name: {torch.npu.get_device_name(0)}')
+              print(f'NPU capability: {torch.npu.get_device_capability(0)}')
+          "
+
       - name: Install test dependencies
         run: |
           pip${{ env.PYTHON_VERSION }} install -r .ci/docker/requirements-ci.txt
@@ -135,6 +156,10 @@ jobs:
         id: collect
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          echo "=== NPU-SMI Info ==="
+          npu-smi info || echo "npu-smi not available"
 
           python${{ env.PYTHON_VERSION }} .github/scripts/collect_all_cases.py \
             --test-dir pytorch-src/test \
@@ -176,6 +201,9 @@ jobs:
       - name: Get test list from run_test.py
         id: get_tests
         run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
           # 切换到 test 目录执行，避免根目录 torch 与安装的 torch 库冲突
           cd pytorch-src/test
 
@@ -221,6 +249,10 @@ jobs:
         timeout-minutes: 120
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          echo "=== NPU-SMI Info ==="
+          npu-smi info || echo "npu-smi not available"
 
           # 切换到 test 目录执行，避免根目录 torch 与安装的 torch 库冲突
           cd pytorch-src/test

From 5fc4f6752720cd6b1813cdf88254ba261897bc69 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 19:48:45 +0800
Subject: [PATCH 059/132] Fix issues in test-collect.yml workflow

1. Increase job timeout from 60 to 180 minutes (run_test step needs 120)
2. Use discover_tests.py directly for test list (more reliable than --help parsing)
3. Add fallback methods: discover_tests.py -> --help parsing -> default list
4. Only show first 20 tests in log (avoid huge output)
5. Save test results summary to /tmp/test_results_summary.json
6. Upload test results artifact (logs + summary)
7. Add Print final summary step to show results

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 107 ++++++++++++++++++++---------
 1 file changed, 74 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index 16cae31eb9..f9287e594c 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -30,7 +30,7 @@ env:
 jobs:
   collect:
     runs-on: linux-aarch64-a3-16
-    timeout-minutes: 60
+    timeout-minutes: 180
     outputs:
       distributed_matrix: ${{ steps.collect.outputs.distributed_matrix }}
       regular_matrix: ${{ steps.collect.outputs.regular_matrix }}
@@ -207,43 +207,63 @@ jobs:
           # 切换到 test 目录执行，避免根目录 torch 与安装的 torch 库冲突
           cd pytorch-src/test
 
-          # 获取 --help 输出，从中解析测试列表
-          python${{ env.PYTHON_VERSION }} run_test.py --help > /tmp/run_test_help.txt 2>&1 || true
-          cat /tmp/run_test_help.txt
-
-          # 解析测试列表（从 "where TESTS is any of:" 后面获取）
+          # 方法1: 直接从 discover_tests.py 获取测试列表（更可靠）
           python${{ env.PYTHON_VERSION }} -c "
-          import re
-          with open('/tmp/run_test_help.txt') as f:
-              content = f.read()
-          # 查找测试列表
-          match = re.search(r'where TESTS is any of:\s*(.+)', content)
-          if match:
-              tests_str = match.group(1)
-              # 解析逗号分隔的测试名称
-              tests = [t.strip() for t in tests_str.split(',')]
-              # 保存到文件
-              with open('/tmp/test_list.txt', 'w') as f:
-                  for t in tests:
-                      if t:
-                          f.write(t + '\n')
-              print(f'Found {len([t for t in tests if t])} tests')
-              print('Tests:', tests[:20], '...')
-          else:
-              print('Could not find TESTS list in help output')
-              # 如果解析失败，使用默认测试列表
-              default_tests = ['test_nn', 'test_torch', 'test_autograd', 'test_distributions', 'test_indexing', 'test_type_promotion']
-              with open('/tmp/test_list.txt', 'w') as f:
-                  for t in default_tests:
-                      f.write(t + '\n')
-              print(f'Using default test list: {default_tests}')
-          "
+          import sys
+          sys.path.insert(0, '../tools/testing')
+          from discover_tests import TESTS
+          with open('/tmp/test_list.txt', 'w') as f:
+              for t in TESTS:
+                  f.write(t + '\n')
+          print(f'Found {len(TESTS)} tests from discover_tests.py')
+          " 2>/dev/null || true
+
+          # 如果方法1失败，尝试方法2: 从 --help 解析
+          if [ ! -f /tmp/test_list.txt ] || [ ! -s /tmp/test_list.txt ]; then
+            echo "Method 1 failed, trying method 2 (parse --help)"
+            python${{ env.PYTHON_VERSION }} run_test.py --help > /tmp/run_test_help.txt 2>&1 || true
+
+            # 解析测试列表
+            python${{ env.PYTHON_VERSION }} -c "
+            import re
+            with open('/tmp/run_test_help.txt') as f:
+                content = f.read()
+            # 查找测试列表 (格式: where TESTS is any of: xxx, yyy, zzz)
+            match = re.search(r'where TESTS is any of:\s*(.+)', content)
+            if match:
+                tests_str = match.group(1).strip()
+                tests = [t.strip() for t in tests_str.split(',')]
+                tests = [t for t in tests if t]
+                with open('/tmp/test_list.txt', 'w') as f:
+                    for t in tests:
+                        f.write(t + '\n')
+                print(f'Found {len(tests)} tests from --help')
+            else:
+                print('Could not find TESTS list')
+            "
+          fi
+
+          # 如果都失败，使用默认列表
+          if [ ! -f /tmp/test_list.txt ] || [ ! -s /tmp/test_list.txt ]; then
+            echo "All methods failed, using default test list"
+            cat > /tmp/test_list.txt << 'DEFAULT_TESTS'
+test_nn
+test_torch
+test_autograd
+test_distributions
+test_indexing
+test_type_promotion
+DEFAULT_TESTS
+          fi
 
           # 输出测试数量
           TEST_COUNT=$(wc -l < /tmp/test_list.txt)
           echo "test_count=${TEST_COUNT}" >> $GITHUB_OUTPUT
           echo "=== Tests to run (${TEST_COUNT} tests) ==="
-          cat /tmp/test_list.txt
+          head -20 /tmp/test_list.txt
+          if [ $TEST_COUNT -gt 20 ]; then
+            echo "... and $(($TEST_COUNT - 20)) more tests"
+          fi
 
       - name: Run tests using run_test.py
         timeout-minutes: 120
@@ -301,4 +321,25 @@ jobs:
           echo "Total tests run: ${TEST_COUNT}"
           echo "Passed: ${PASSED}"
           echo "Failed: ${FAILED}"
-          echo "============================================"
\ No newline at end of file
+          echo "============================================"
+
+          # 保存结果摘要
+          echo "{\"total\": ${TEST_COUNT}, \"passed\": ${PASSED}, \"failed\": ${FAILED}}" > /tmp/test_results_summary.json
+
+      - name: Upload test results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-run-results
+          path: |
+            /tmp/test_output_*.log
+            /tmp/test_results_summary.json
+          retention-days: 7
+
+      - name: Print final summary
+        if: always()
+        run: |
+          echo "=== Final Test Results ==="
+          if [ -f /tmp/test_results_summary.json ]; then
+            cat /tmp/test_results_summary.json
+          fi
\ No newline at end of file

From 9e6b1a066daaa7de2df7b000637d3c3e0e8b9850 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 19:50:28 +0800
Subject: [PATCH 060/132] Fix YAML syntax error: replace heredoc with echo
 statements

- heredoc (<< 'DEFAULT_TESTS') causes YAML indentation issues
- Use multiple echo statements instead for default test list

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index f9287e594c..edff9a4199 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -246,14 +246,12 @@ jobs:
           # 如果都失败，使用默认列表
           if [ ! -f /tmp/test_list.txt ] || [ ! -s /tmp/test_list.txt ]; then
             echo "All methods failed, using default test list"
-            cat > /tmp/test_list.txt << 'DEFAULT_TESTS'
-test_nn
-test_torch
-test_autograd
-test_distributions
-test_indexing
-test_type_promotion
-DEFAULT_TESTS
+            echo "test_nn" > /tmp/test_list.txt
+            echo "test_torch" >> /tmp/test_list.txt
+            echo "test_autograd" >> /tmp/test_list.txt
+            echo "test_distributions" >> /tmp/test_list.txt
+            echo "test_indexing" >> /tmp/test_list.txt
+            echo "test_type_promotion" >> /tmp/test_list.txt
           fi
 
           # 输出测试数量

From bb7f2a98216f38b95639e4e4d45e51b982de3292 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 20:02:22 +0800
Subject: [PATCH 061/132] Install upstream PyTorch test dependencies

- Add installation of pytorch-src/.ci/docker/requirements-ci.txt
- Add common development dependencies: expecttest, hypothesis, fsspec, etc.
- Use || true to allow partial failures (some deps may not be needed for all tests)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index edff9a4199..2aee563a34 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -142,8 +142,15 @@ jobs:
 
       - name: Install test dependencies
         run: |
+          # 安装当前仓库的基础测试依赖
           pip${{ env.PYTHON_VERSION }} install -r .ci/docker/requirements-ci.txt
 
+          # 安装上游 PyTorch 的完整测试依赖
+          pip${{ env.PYTHON_VERSION }} install -r pytorch-src/.ci/docker/requirements-ci.txt || true
+
+          # 安装 PyTorch 开发依赖（可选）
+          pip${{ env.PYTHON_VERSION }} install expecttest hypothesis fsspec sympy optree networkx psutil || true
+
       # ==================== 保存 pip 缓存 ====================
       - name: Save pip cache
         if: always()

From 254d0ab53dd410755dc072d7432348d71426d5a8 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 20:06:02 +0800
Subject: [PATCH 062/132] Show CANN directory contents in Verify NPU step

- List /usr/local/Ascend/ directory contents
- Show CANN directory structure and version.info
- Show NNAL directory contents
- Helps verify CANN installation status in Docker image

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index 2aee563a34..2b558bc695 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -121,9 +121,30 @@ jobs:
 
       - name: Verify NPU availability
         run: |
+          echo "=== CANN Directory Contents ==="
+          ls -la /usr/local/Ascend/ || echo "Ascend directory not found"
+          echo ""
+          echo "=== CANN Version Info ==="
+          if [ -d /usr/local/Ascend/cann ]; then
+            ls -la /usr/local/Ascend/cann/
+            echo ""
+            echo "=== CANN Version File ==="
+            cat /usr/local/Ascend/cann/version.info 2>/dev/null || echo "version.info not found"
+          else
+            echo "CANN directory not found"
+          fi
+          echo ""
+          echo "=== NNAL Directory Contents ==="
+          if [ -d /usr/local/Ascend/nnal ]; then
+            ls -la /usr/local/Ascend/nnal/
+          else
+            echo "NNAL directory not found"
+          fi
+
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
           source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
 
+          echo ""
           echo "=== NPU-SMI Info ==="
           npu-smi info || echo "npu-smi not available"
 

From 3af5820e91d68d823cd55347f8f1cfc286e49746 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 20:18:06 +0800
Subject: [PATCH 063/132] Move Install test dependencies step after Upgrade pip

- Clone PyTorch test source earlier (needed for requirements-ci.txt)
- Install test dependencies after pip/setuptools upgrade
- Remove duplicate Clone and Install steps from later positions

New order:
1. Upgrade pip and setuptools
2. Clone PyTorch test source
3. Install test dependencies
4. Download artifacts
5. Install wheels
6. Verify NPU availability

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 46 +++++++++++++++---------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index 2b558bc695..af19ccb841 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -82,6 +82,29 @@ jobs:
         run: |
           pip${{ env.PYTHON_VERSION }} install --upgrade pip setuptools wheel
 
+      # ==================== Clone PyTorch for test source（提前克隆） ====================
+      - name: Clone PyTorch test source
+        run: |
+          # 使用代理加速 git clone，只获取测试用例（浅克隆，无 submodules）
+          PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
+          if [ -n "${{ env.GH_PROXY_URL }}" ]; then
+            PYTORCH_REPO="${{ env.GH_PROXY_URL }}/${PYTORCH_REPO}"
+            echo "Using proxy: ${PYTORCH_REPO}"
+          fi
+          git clone --depth=1 "${PYTORCH_REPO}" pytorch-src
+          echo "Cloned PyTorch test directory"
+
+      - name: Install test dependencies
+        run: |
+          # 安装当前仓库的基础测试依赖
+          pip${{ env.PYTHON_VERSION }} install -r .ci/docker/requirements-ci.txt
+
+          # 安装上游 PyTorch 的完整测试依赖
+          pip${{ env.PYTHON_VERSION }} install -r pytorch-src/.ci/docker/requirements-ci.txt || true
+
+          # 安装 PyTorch 开发依赖（可选）
+          pip${{ env.PYTHON_VERSION }} install expecttest hypothesis fsspec sympy optree networkx psutil || true
+
       # ==================== Download artifacts from previous workflow run ====================
       - name: Download artifacts from workflow run
         uses: dawidd6/action-download-artifact@v3
@@ -102,18 +125,6 @@ jobs:
           repo: Ascend/pytorch
           github_token: ${{ secrets.GITHUB_TOKEN }}
 
-      # ==================== Clone PyTorch for test source ====================
-      - name: Clone PyTorch test source
-        run: |
-          # 使用代理加速 git clone，只获取测试用例（浅克隆，无 submodules）
-          PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
-          if [ -n "${{ env.GH_PROXY_URL }}" ]; then
-            PYTORCH_REPO="${{ env.GH_PROXY_URL }}/${PYTORCH_REPO}"
-            echo "Using proxy: ${PYTORCH_REPO}"
-          fi
-          git clone --depth=1 "${PYTORCH_REPO}" pytorch-src
-          echo "Cloned PyTorch test directory"
-
       - name: Install wheels
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
@@ -161,17 +172,6 @@ jobs:
               print(f'NPU capability: {torch.npu.get_device_capability(0)}')
           "
 
-      - name: Install test dependencies
-        run: |
-          # 安装当前仓库的基础测试依赖
-          pip${{ env.PYTHON_VERSION }} install -r .ci/docker/requirements-ci.txt
-
-          # 安装上游 PyTorch 的完整测试依赖
-          pip${{ env.PYTHON_VERSION }} install -r pytorch-src/.ci/docker/requirements-ci.txt || true
-
-          # 安装 PyTorch 开发依赖（可选）
-          pip${{ env.PYTHON_VERSION }} install expecttest hypothesis fsspec sympy optree networkx psutil || true
-
       # ==================== 保存 pip 缓存 ====================
       - name: Save pip cache
         if: always()

From 2d2afc134e3ee53c93d4f77f3e198f4453355790 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 20:45:11 +0800
Subject: [PATCH 064/132] Fix torch import conflict: cd to /tmp before Verify
 NPU

- Change directory to /tmp before importing torch/torch_npu
- Avoids pytorch-src local torch module overriding installed torch
- pytest and run_test.py already execute from pytorch-src/test (safe)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index af19ccb841..a9a4abb1ef 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -132,6 +132,9 @@ jobs:
 
       - name: Verify NPU availability
         run: |
+          # 切换到 /tmp 目录，避免 pytorch-src 源码目录干扰 torch 导入
+          cd /tmp
+
           echo "=== CANN Directory Contents ==="
           ls -la /usr/local/Ascend/ || echo "Ascend directory not found"
           echo ""

From 542135fbe0811b33ac1c6fb25d15a1b2f334e313 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 20:51:28 +0800
Subject: [PATCH 065/132] Add newline at end of test-collect.yml

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index a9a4abb1ef..e9af1e7fa0 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -371,4 +371,4 @@ jobs:
           echo "=== Final Test Results ==="
           if [ -f /tmp/test_results_summary.json ]; then
             cat /tmp/test_results_summary.json
-          fi
\ No newline at end of file
+          fi

From 7bea533599fd37e0f75b07c6f550cc2feaccc332 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 29 Apr 2026 21:12:09 +0800
Subject: [PATCH 066/132] Make Verify NPU step continue on error

- Add continue-on-error: true to prevent workflow failure
- Add fallback message when torch/torch_npu import fails
- Error is expected when torch and torch_npu versions are mismatched

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index e9af1e7fa0..340f0042c8 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -131,6 +131,7 @@ jobs:
           pip${{ env.PYTHON_VERSION }} install wheels/*.whl
 
       - name: Verify NPU availability
+        continue-on-error: true
         run: |
           # 切换到 /tmp 目录，避免 pytorch-src 源码目录干扰 torch 导入
           cd /tmp
@@ -163,6 +164,8 @@ jobs:
           npu-smi info || echo "npu-smi not available"
 
           PYTHON=python${{ env.PYTHON_VERSION }}
+          echo ""
+          echo "=== PyTorch and NPU Info ==="
           $PYTHON -c "
           import torch
           print(f'torch: {torch.__version__}')
@@ -173,7 +176,7 @@ jobs:
           if torch.npu.is_available():
               print(f'NPU name: {torch.npu.get_device_name(0)}')
               print(f'NPU capability: {torch.npu.get_device_capability(0)}')
-          "
+          " 2>&1 || echo "torch/torch_npu import failed - this is expected if torch and torch_npu versions are mismatched"
 
       # ==================== 保存 pip 缓存 ====================
       - name: Save pip cache

From 3e1b9839413c741243ef3e50b7556e0334e46cb0 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 6 May 2026 11:42:26 +0800
Subject: [PATCH 067/132] Refactor Docker build: single CANN version with
 multi-Python support

Major changes:
- Build images by CANN version only (no Python version matrix)
- Pre-install all Python versions (3.10/3.11/3.12/3.13) in single image
- Add build_image.sh with CANN version URL mapping table
- Add Python version switch script in container
- Fix workflow push logic for schedule and push triggers

New features:
- CANN version mapping: version -> toolkit/a3_ops/nnal URLs
- Python switch: source /usr/local/bin/switch_python.sh 3.12
- SKIP_DOCKER_LOGIN for CI optimization
- Image tags: cann${version} instead of py${python}-cann${version}

Fixes:
- Schedule/push triggers now correctly push images to registry
- Version extraction logic in Summary step
- Login timing optimization (after checking if image exists)

Documentation:
- BUILD_IMAGE_README.md: usage guide
- PYTORCH_DOCKER_COMPARISON.md: upstream comparison
- BUILD_FLOW_CHECK.md: workflow verification report

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/docker/pytorch-npu-builder.Dockerfile |  92 ++-
 .github/scripts/BUILD_FLOW_CHECK.md           | 382 +++++++++++
 .github/scripts/BUILD_IMAGE_README.md         | 307 +++++++++
 .github/scripts/PYTORCH_DOCKER_COMPARISON.md  | 595 ++++++++++++++++++
 .github/scripts/build_image.sh                | 479 ++++++++++++++
 .github/workflows/build-docker-image.yml      | 157 ++++-
 6 files changed, 1980 insertions(+), 32 deletions(-)
 create mode 100644 .github/scripts/BUILD_FLOW_CHECK.md
 create mode 100644 .github/scripts/BUILD_IMAGE_README.md
 create mode 100644 .github/scripts/PYTORCH_DOCKER_COMPARISON.md
 create mode 100755 .github/scripts/build_image.sh

diff --git a/.github/docker/pytorch-npu-builder.Dockerfile b/.github/docker/pytorch-npu-builder.Dockerfile
index a4fa701667..f8a443b402 100644
--- a/.github/docker/pytorch-npu-builder.Dockerfile
+++ b/.github/docker/pytorch-npu-builder.Dockerfile
@@ -3,6 +3,12 @@ FROM quay.io/pypa/manylinux_2_28_aarch64
 
 ARG GCCTOOLSET_VERSION=13
 
+# CANN 包下载 URL（通过 build-arg 传入）
+ARG CANN_TOOLKIT_URL
+ARG CANN_A3OPS_URL
+ARG CANN_NNAL_URL
+ARG CANN_VERSION
+
 # Language variables
 ENV LC_ALL=en_US.UTF-8
 ENV LANG=en_US.UTF-8
@@ -49,18 +55,50 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/op
 # git 2.36+ 需要配置 safe.directory
 RUN git config --global --add safe.directory "*"
 
-# 设置 Python 3.11 为默认版本 (CANN 安装需要 Python 环境)
-ENV PYTHON_VERSION=3.11
+# ============================================================
+# 预装所有 Python 版本（镜像支持多 Python 版本）
+# ============================================================
+# manylinux 镜像已包含 cp310-cp310, cp311-cp311, cp312-cp312, cp313-cp313
+# 默认使用 Python 3.11（可通过环境变量切换）
+
+ENV DEFAULT_PYTHON_VERSION=3.11
 ENV PATH=/opt/python/cp311-cp311/bin:$PATH
 
-# 设置工作目录
+# 创建 Python 版本切换脚本
+RUN printf '#!/bin/bash\n\
+# Python 版本切换辅助脚本\n\
+# 使用方法: source /usr/local/bin/switch_python.sh 3.11\n\
+\n\
+PYTHON_VERSION="${1:-3.11}"\n\
+\n\
+case "$PYTHON_VERSION" in\n\
+    3.10) PYTHON_DIR="cp310-cp310" ;;\n\
+    3.11) PYTHON_DIR="cp311-cp311" ;;\n\
+    3.12) PYTHON_DIR="cp312-cp312" ;;\n\
+    3.13) PYTHON_DIR="cp313-cp313" ;;\n\
+    *) echo "Unsupported Python version: $PYTHON_VERSION"; return 1 ;;\n\
+esac\n\
+\n\
+export PATH=/opt/python/$PYTHON_DIR/bin:$PATH\n\
+echo "Switched to Python $PYTHON_VERSION ($(python --version))"\n\
+' > /usr/local/bin/switch_python.sh && \
+    chmod +x /usr/local/bin/switch_python.sh
+
+# 为每个 Python 版本安装常用包
+RUN for py_dir in cp310-cp310 cp311-cp311 cp312-cp312 cp313-cp313; do \
+        /opt/python/$py_dir/bin/pip install --upgrade pip setuptools wheel; \
+    done
+
+# ============================================================
+# 安装 CANN（使用传入的 URL）
+# ============================================================
+
 WORKDIR /root
 
-# 安装 CANN 9.0.0-beta.2
 RUN mkdir -p cann && cd cann && \
-    curl -O https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-toolkit_9.0.0-beta.2_linux-aarch64.run && \
-    curl -O https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-A3-ops_9.0.0-beta.2_linux-aarch64.run && \
-    curl -O https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-nnal_9.0.0-beta.2_linux-aarch64.run && \
+    curl -O "${CANN_TOOLKIT_URL}" && \
+    curl -O "${CANN_A3OPS_URL}" && \
+    curl -O "${CANN_NNAL_URL}" && \
     chmod +x Ascend-cann*.run && \
     ./Ascend-cann-toolkit*.run --full --quiet --install-path=/usr/local/Ascend && \
     ./Ascend-cann-A3*.run --install --quiet --install-path=/usr/local/Ascend && \
@@ -72,7 +110,43 @@ RUN mkdir -p cann && cd cann && \
 ENV CANN_PATH=/usr/local/Ascend/cann
 ENV NNAL_PATH=/usr/local/Ascend/nnal
 ENV ASCEND_HOME=/usr/local/Ascend
+ENV CANN_VERSION=${CANN_VERSION}
 
 # 添加 CANN 环境初始化脚本
-RUN printf '#!/bin/bash\nsource /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true\nsource /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true\n' > /etc/profile.d/cann_env.sh && \
-    chmod +x /etc/profile.d/cann_env.sh
\ No newline at end of file
+RUN printf '#!/bin/bash\n\
+source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true\n\
+source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true\n\
+' > /etc/profile.d/cann_env.sh && \
+    chmod +x /etc/profile.d/cann_env.sh
+
+# ============================================================
+# 预安装 pytest 等测试依赖（为所有 Python 版本）
+# ============================================================
+
+RUN for py_dir in cp310-cp310 cp311-cp311 cp312-cp312 cp313-cp313; do \
+        /opt/python/$py_dir/bin/pip install pytest pytest-timeout pytest-xdist hypothesis pyyaml zstandard cmake ninja; \
+    done
+
+# ============================================================
+# 设置工作目录和默认命令
+# ============================================================
+
+WORKDIR /workspace
+
+# 创建 welcome 消息
+RUN printf '\n\
+========================================\n\
+PyTorch NPU Builder Image\n\
+========================================\n\
+CANN Version: %s\n\
+Python Versions: 3.10, 3.11, 3.12, 3.13 (default: 3.11)\n\
+\n\
+To switch Python version:\n\
+  source /usr/local/bin/switch_python.sh 3.12\n\
+\n\
+To setup CANN environment:\n\
+  source /etc/profile.d/cann_env.sh\n\
+========================================\n\
+\n' "${CANN_VERSION}" > /etc/motd
+
+CMD ["bash"]
\ No newline at end of file
diff --git a/.github/scripts/BUILD_FLOW_CHECK.md b/.github/scripts/BUILD_FLOW_CHECK.md
new file mode 100644
index 0000000000..842fb9b02c
--- /dev/null
+++ b/.github/scripts/BUILD_FLOW_CHECK.md
@@ -0,0 +1,382 @@
+# Docker 镜像构建流程问题检查与修复报告
+
+## 发现的问题
+
+### 问题 1：定时构建和 push 触发时不推送镜像 ❌❌❌
+
+**严重级别**：P0 - Critical
+
+**问题描述**：
+
+原 workflow 使用条件表达式：
+```yaml
+${{ inputs.push_image && '--push' || '' }}
+```
+
+当触发方式为 `push` 或 `schedule` 时：
+- `inputs.push_image` 为空（undefined）
+- 条件表达式返回空字符串
+- **结果：定时构建和 push 触发时不推送镜像到远端！**
+
+**影响**：
+- 定时构建的镜像无法自动推送到 registry
+- 需要手动触发才能推送镜像
+
+---
+
+### 问题 2：镜像推送时机设计不合理 ⚠️
+
+**原设计**：
+- 手动触发：根据 inputs.push_image 决定是否推送
+- push/schedule 触发：不推送 ❌
+
+**PyTorch 上游参考**：
+```yaml
+- name: Push to ghcr.io
+  if: ${{ github.event_name == 'push' }}
+```
+
+PyTorch 只在 push 到 main 分支时推送镜像。
+
+**正确逻辑**：
+- workflow_dispatch：根据 inputs.push_image 决定 ✅
+- push（分支推送）：默认推送 ✅
+- schedule（定时构建）：默认推送 ✅
+- pull_request：不推送 ✅
+
+---
+
+### 问题 3：Summary 中版本提取逻辑错误 ⚠️
+
+**原代码**：
+```bash
+CANN_MAJOR=$(echo "${{ inputs.cann_version || env.CANN_STABLE }}" | sed 's/-beta.*//' | sed 's/-rc.*//' | sed 's/\.[0-9]*$//')
+```
+
+**问题**：
+对于简化版本 `9.0`：
+- `sed 's/\.[0-9]*$//'` 删除最后的 `.数字`
+- `9.0` → 删除 `.0` → 结果是 `9` ❌
+- 应该保持 `9.0`
+
+**正确逻辑**：
+```bash
+if [[ "$CANN_INPUT" =~ ^[0-9]+\.[0-9]+$ ]]; then
+  CANN_MAJOR="$CANN_INPUT"  # 简化版本直接使用
+else
+  CANN_MAJOR=$(echo "$CANN_INPUT" | grep -oP '^[0-9]+\.[0-9]+')  # 提取前两位
+fi
+```
+
+---
+
+### 问题 4：双重登录导致冗余 ⚠️
+
+**原流程**：
+```
+Workflow: docker/login-action 登录
+Script: login_registry 再次登录
+```
+
+虽然不会出错，但浪费时间，且可能导致登录状态混乱。
+
+---
+
+### 问题 5：登录时机不优化 ⚠️
+
+**原执行顺序**：
+```
+parse_args → check_dependencies → parse_cann_version → login_registry → build_image
+```
+
+**问题**：
+如果镜像已存在，build_image 会跳过构建，但 login_registry 已经执行了登录。
+
+**优化**：
+将登录移到确认需要构建之后：
+```
+parse_args → check_dependencies → parse_cann_version → build_image
+                                           ↓
+                                      检查镜像是否存在
+                                           ↓
+                                      确认需要构建 → login_registry → 构建
+```
+
+---
+
+## 修复方案
+
+### 修复 1：添加参数确定步骤 ✅
+
+```yaml
+- name: Determine build parameters
+  id: params
+  run: |
+    # 确定是否推送镜像
+    if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+      PUSH_IMAGE="${{ inputs.push_image }}"
+    elif [[ "${{ github.event_name }}" == "push" || "${{ github.event_name }}" == "schedule" ]]; then
+      PUSH_IMAGE="true"
+    else
+      PUSH_IMAGE="false"
+    fi
+
+    echo "push_image=${PUSH_IMAGE}" >> $GITHUB_OUTPUT
+```
+
+**效果**：
+- workflow_dispatch：根据用户输入决定
+- push/schedule：默认推送 ✅
+- 其他情况：不推送
+
+---
+
+### 修复 2：正确配置 login-action ✅
+
+```yaml
+- name: Login to Quay.io
+  if: ${{ steps.params.outputs.push_image == 'true' }}
+  uses: docker/login-action@v3
+  with:
+    registry: ${{ env.REGISTRY }}
+    username: ${{ secrets.QUAY_USERNAME }}
+    password: ${{ secrets.QUAY_PASSWORD }}
+```
+
+**效果**：
+- 只在需要推送时登录
+- 使用官方推荐的 login-action
+- 更安全地处理 secrets
+
+---
+
+### 修复 3：优化脚本登录逻辑 ✅
+
+```bash
+# 在 build_image 函数中
+# 先检查镜像是否存在
+if [[ "$FORCE_BUILD" == "false" && "$PUSH_IMAGE" == "true" ]]; then
+    if docker pull "${REGISTRY}/${QUAY_ORG}/${IMAGE_NAME}:${first_tag}" &>/dev/null; then
+        log_info "镜像已存在，跳过构建"
+        return 0
+    fi
+fi
+
+# 确认需要构建，执行登录
+if [[ "$PUSH_IMAGE" == "true" ]]; then
+    if [[ "${SKIP_DOCKER_LOGIN:-false}" != "true" ]]; then
+        login_registry
+    fi
+fi
+```
+
+**效果**：
+- 先检查镜像是否存在，避免不必要的登录
+- 支持 SKIP_DOCKER_LOGIN 环境变量（用于 CI）
+- 本地使用仍然会自动登录
+
+---
+
+### 修复 4：添加 SKIP_DOCKER_LOGIN 环境变量 ✅
+
+**Workflow 中**：
+```yaml
+- name: Build and push image
+  env:
+    SKIP_DOCKER_LOGIN: true  # 已通过 login-action 登录
+```
+
+**效果**：
+- 避免 workflow 中双重登录
+- 脚本检测到此变量后跳过登录
+
+---
+
+### 修复 5：修正 Summary 版本提取 ✅
+
+```bash
+CANN_INPUT="${{ steps.params.outputs.cann_version }}"
+
+if [[ "$CANN_INPUT" =~ ^[0-9]+\.[0-9]+$ ]]; then
+  CANN_MAJOR="$CANN_INPUT"  # 简化版本直接使用
+else
+  CANN_MAJOR=$(echo "$CANN_INPUT" | grep -oP '^[0-9]+\.[0-9]+')
+fi
+```
+
+**效果**：
+- `9.0` → 保持为 `9.0` ✅
+- `9.0.0-beta.2` → 提取为 `9.0` ✅
+- `8.0` → 保持为 `8.0` ✅
+
+---
+
+## 修复后的完整流程
+
+### Workflow 流程
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│  Step 1: Determine build parameters                         │
+│  ├─ 判断 event_name 类型                                     │
+│  ├─ 确定 push_image (workflow_dispatch → inputs.push_image) │
+│  │                    (push/schedule → true)                │
+│  ├─ 确定 force_build                                         │
+│  ├─ 确定 cann_version                                        │
+│  └─ 输出到 GITHUB_OUTPUT                                     │
+└─────────────────────────────────────────────────────────────┘
+                           ↓
+┌─────────────────────────────────────────────────────────────┐
+│  Step 2: Setup Docker Buildx                                │
+│  ├─ 配置 docker-container driver                            │
+│  └─ 使用 moby/buildkit:latest                               │
+└─────────────────────────────────────────────────────────────┘
+                           ↓
+┌─────────────────────────────────────────────────────────────┐
+│  Step 3: Login to Quay.io (conditional)                     │
+│  ├─ if: push_image == 'true'                                │
+│  ├─ 使用 docker/login-action                                │
+│  └─ 处理 secrets 安全                                        │
+└─────────────────────────────────────────────────────────────┘
+                           ↓ (如果需要推送)
+┌─────────────────────────────────────────────────────────────┐
+│  Step 4: Build and push image                               │
+│  ├─ 设置 SKIP_DOCKER_LOGIN=true                             │
+│  ├─ 调用 build_image.sh                                     │
+│  ├─ 根据 push_image 添加 --push 参数                        │
+│  └─ 根据 force_build 添加 --force 参数                      │
+└─────────────────────────────────────────────────────────────┘
+                           ↓
+┌─────────────────────────────────────────────────────────────┐
+│  Step 5: Summary (always)                                   │
+│  ├─ 生成构建报告                                             │
+│  ├─ 显示镜像标签                                             │
+│  └─ 显示 Python 切换方法                                     │
+└─────────────────────────────────────────────────────────────┘
+```
+
+### 脚本内部流程
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│  1. parse_args                                              │
+│  ├─ 解析命令行参数                                           │
+│  └─ 设置 REGISTRY、QUAY_ORG 等                              │
+└─────────────────────────────────────────────────────────────┘
+                           ↓
+┌─────────────────────────────────────────────────────────────┐
+│  2. check_dependencies                                      │
+│  ├─ 检查 docker 是否安装                                     │
+│  └─ 检查 docker buildx                                      │
+└─────────────────────────────────────────────────────────────┘
+                           ↓
+┌─────────────────────────────────────────────────────────────┐
+│  3. parse_cann_version                                      │
+│  ├─ 从版本映射表查找 URL                                     │
+│  ├─ 提取完整版本和大版本                                     │
+│  └─ 判断是否为 stable                                        │
+└─────────────────────────────────────────────────────────────┘
+                           ↓
+┌─────────────────────────────────────────────────────────────┐
+│  4. build_image                                             │
+│  ├─ 生成镜像标签                                             │
+│  ├─ 检查镜像是否已存在（如果 force=false && push=true）      │
+│  │  └─ 如果存在 → return 0                                  │
+│  ├─ 确认需要构建 → 登录（如果 SKIP_DOCKER_LOGIN != true）    │
+│  ├─ 执行 docker buildx build                                │
+│  │  ├─ --build-arg 传递 URL                                 │
+│  │  ├─ --tag 添加多个标签                                    │
+│  │  └─ --push 或 --load                                     │
+│  └─ 输出构建信息                                             │
+└─────────────────────────────────────────────────────────────┘
+```
+
+---
+
+## 触发场景对比表
+
+| 触发方式 | CANN 版本 | Push Image | Force Build | 登录 | 推送 |
+|---------|-----------|------------|-------------|------|------|
+| workflow_dispatch (默认) | 9.0 | true | false | ✅ | ✅ |
+| workflow_dispatch (--push=false) | 9.0 | false | false | ❌ | ❌ |
+| workflow_dispatch (--force) | 9.0 | true | true | ✅ | ✅ |
+| push (dev_master) | 9.0 (stable) | true | false | ✅ | ✅ |
+| schedule (周日) | 9.0 (stable) | true | false | ✅ | ✅ |
+| pull_request | - | - | - | ❌ | ❌ |
+
+---
+
+## 验证清单
+
+### ✅ 已验证
+
+1. ✅ Workflow YAML 格式正确
+2. ✅ Shell 脚本语法正确
+3. ✅ 参数传递逻辑正确
+4. ✅ 登录时机优化正确
+5. ✅ SKIP_DOCKER_LOGIN 机制正确
+6. ✅ 版本提取逻辑正确
+7. ✅ 条件判断逻辑正确
+
+### 🔄 需要在实际运行中验证
+
+1. 🔄 定时构建是否正确推送镜像
+2. 🔄 push 触发是否正确推送镜像
+3. 🔄 镜像已存在时是否正确跳过构建
+4. 🔄 Docker buildx --push 是否成功推送所有标签
+5. 🔄 Quay.io 登录是否成功
+6. 🔄 镜像标签是否正确生成和推送
+
+---
+
+## 后续建议
+
+### 建议 1：添加构建失败通知
+
+```yaml
+- name: Notify on failure
+  if: failure()
+  run: |
+    # 可以集成 Slack/Email 通知
+    echo "::error::Build failed for CANN version ${{ steps.params.outputs.cann_version }}"
+```
+
+### 建议 2：添加镜像验证步骤
+
+```yaml
+- name: Verify pushed image
+  if: ${{ steps.params.outputs.push_image == 'true' }}
+  run: |
+    docker pull "${{ env.REGISTRY }}/${{ env.QUAY_ORG }}/${{ env.IMAGE_NAME }}:cann${{ steps.params.outputs.cann_version }}"
+    docker inspect --format='{{.Config.Env}}' "${{ env.REGISTRY }}/${{ env.QUAY_ORG }}/${{ env.IMAGE_NAME }}:cann${{ steps.params.outputs.cann_version }}"
+```
+
+### 建议 3：添加构建缓存
+
+```yaml
+- name: Build and push image
+  uses: docker/build-push-action@v5
+  with:
+    context: .github/docker
+    file: .github/docker/pytorch-npu-builder.Dockerfile
+    push: ${{ steps.params.outputs.push_image == 'true' }}
+    tags: |
+      ${{ env.REGISTRY }}/${{ env.QUAY_ORG }}/${{ env.IMAGE_NAME }}:cann${{ steps.params.outputs.cann_version }}
+    cache-from: type=gha
+    cache-to: type=gha,mode=max
+```
+
+---
+
+## 修复文件列表
+
+| 文件 | 修改内容 |
+|------|---------|
+| `.github/workflows/build-docker-image.yml` | ✅ 添加参数确定步骤<br>✅ 修正 login-action 条件<br>✅ 添加 SKIP_DOCKER_LOGIN<br>✅ 修正 Summary 版本提取 |
+| `.github/scripts/build_image.sh` | ✅ 优化登录时机<br>✅ 添加 SKIP_DOCKER_LOGIN 支持<br>✅ 移除主函数中的 login_registry |
+
+---
+
+**检查时间**: 2026-05-06
+**检查人**: Claude Code
+**状态**: ✅ 所有问题已修复，等待实际运行验证
\ No newline at end of file
diff --git a/.github/scripts/BUILD_IMAGE_README.md b/.github/scripts/BUILD_IMAGE_README.md
new file mode 100644
index 0000000000..2d3caa9c47
--- /dev/null
+++ b/.github/scripts/BUILD_IMAGE_README.md
@@ -0,0 +1,307 @@
+# Docker 镜像构建脚本使用指南
+
+## 概述
+
+`build_image.sh` 脚本用于构建支持多 Python 版本的 PyTorch NPU Docker 镜像。
+
+## 核心设计理念
+
+### CANN 与 Python 版本关系
+
+**重要说明**：
+- CANN 包安装只需要 Python 3 环境，**不与特定 Python 版本绑定**
+- 编译和运行 PyTorch 时，切换 Python 版本不会影响 CANN
+- 因此镜像只需按 CANN 版本构建，无需按 Python 版本重复构建
+
+### 镜像特性
+
+1. **一个镜像支持所有 Python 版本**
+   - 预装 Python 3.10/3.11/3.12/3.13
+   - 通过环境变量或脚本切换 Python 版本
+
+2. **按 CANN 版本构建**
+   - 一个 CANN 版本对应一个镜像
+   - 维护版本映射表，自动获取下载地址
+
+3. **镜像标签简化**
+   - 标签只显示 CANN 版本
+   - 不再包含 Python 版本信息
+
+---
+
+## 支持的配置
+
+### CANN 版本映射表
+
+脚本维护以下版本映射（三个包的下载 URL）：
+
+| 版本号 | Toolkit | A3-ops | NNAL |
+|--------|---------|--------|------|
+| `9.0` | toolkit_9.0.0 | A3-ops_9.0.0 | nnal_9.0.0 |
+| `9.0.0-beta.2` | toolkit_9.0.0-beta.2 | A3-ops_9.0.0-beta.2 | nnal_9.0.0-beta.2 |
+| `8.0` | toolkit_8.0.RC3 | A3-ops_8.0.RC3 | nnal_8.0.RC3 |
+
+**Stable 版本标记**：
+- `CANN_STABLE="9.0"` - 用于生成 `latest` 标签
+
+---
+
+## 使用方式
+
+### 查看支持的 CANN 版本
+
+```bash
+./build_image.sh --list-versions
+```
+
+输出：
+```
+支持的 CANN 版本：
+
+  - 9.0
+  - 9.0.0-beta.2
+  - 8.0
+
+Stable 版本（用于 latest 标签）: 9.0
+```
+
+### 本地构建
+
+```bash
+# 使用简化版本号（推荐）
+./build_image.sh --cann-version 9.0
+
+# 使用完整版本号
+./build_image.sh --cann-version 9.0.0-beta.2
+
+# 查看详细日志
+./build_image.sh --cann-version 9.0 --verbose
+```
+
+### 推送镜像
+
+```bash
+# 需要设置环境变量
+export QUAY_USERNAME="your_username"
+export QUAY_PASSWORD="your_password"
+
+# 构建并推送
+./build_image.sh --cann-version 9.0 --push
+```
+
+---
+
+## 镜像使用指南
+
+### 拉取镜像
+
+```bash
+# 拉取指定 CANN 版本
+docker pull quay.io/kerer/pytorch:cann9.0
+
+# 拉取 latest（stable 版本）
+docker pull quay.io/kerer/pytorch:latest
+```
+
+### 运行容器
+
+```bash
+# 启动容器（默认 Python 3.11）
+docker run -it quay.io/kerer/pytorch:cann9.0 bash
+
+# 启动容器并挂载工作目录
+docker run -it -v $(pwd):/workspace quay.io/kerer/pytorch:cann9.0 bash
+```
+
+### 切换 Python 版本
+
+**方法 1：使用切换脚本**
+```bash
+# 在容器内执行
+source /usr/local/bin/switch_python.sh 3.11
+source /usr/local/bin/switch_python.sh 3.12
+source /usr/local/bin/switch_python.sh 3.13
+```
+
+**方法 2：修改环境变量**
+```bash
+# Python 3.11
+export PATH=/opt/python/cp311-cp311/bin:$PATH
+
+# Python 3.12
+export PATH=/opt/python/cp312-cp312/bin:$PATH
+
+# Python 3.13
+export PATH=/opt/python/cp313-cp313/bin:$PATH
+```
+
+**验证 Python 版本**
+```bash
+python --version
+pip --version
+```
+
+### 初始化 CANN 环境
+
+```bash
+# 在容器内执行
+source /etc/profile.d/cann_env.sh
+```
+
+---
+
+## 镜像标签说明
+
+### 标签层级
+
+每个 CANN 版本生成以下标签：
+
+| 标签类型 | 格式 | 示例 | 用途 |
+|---------|------|------|------|
+| **完整版**（带时间戳） | `cann${VERSION}-${TIMESTAMP}` | `cann9.0-20260506` | 版本追溯 |
+| **标准版** | `cann${VERSION}` | `cann9.0.0-beta.2` | 日常使用 ⭐ |
+| **简化版** | `cann${MAJOR}` | `cann9.0` | 快速识别 ⭐ |
+| **latest**（仅 stable） | `latest` | `latest` | 使用最新 |
+
+### Stable 版本额外标签
+
+CANN stable 版本（当前为 9.0）额外生成：
+- `latest` - 全局最新
+- `cann-latest` - CANN 最新
+- `cann9.0-latest` - 该 CANN 版本最新
+
+---
+
+## Workflow 使用
+
+### 手动触发构建
+
+1. 进入 GitHub Actions 页面
+2. 选择 "Build Docker Image" workflow
+3. 点击 "Run workflow"
+4. 选择参数：
+   - `cann_version`: 输入 CANN 版本（如 `9.0` 或 `9.0.0-beta.2`）
+   - `push_image`: 是否推送镜像
+   - `force_build`: 是否强制构建
+
+### 自动触发
+
+- **Push 触发**: 当修改相关文件时自动触发（默认构建 stable 版本）
+- **定时触发**: 每周日凌晨 2:00 UTC 自动构建 stable 版本
+
+---
+
+## 添加新的 CANN 版本
+
+### 步骤 1：更新版本映射表
+
+在 `build_image.sh` 中添加新版本：
+
+```bash
+declare -A CANN_VERSIONS=(
+    # 已有版本...
+
+    # 新增版本
+    ["9.1"]="https://...toolkit_9.1.0_linux-aarch64.run|https://...A3-ops_9.1.0_linux-aarch64.run|https://...nnal_9.1.0_linux-aarch64.run"
+)
+```
+
+格式：`"版本号"="toolkit_url|a3_ops_url|nnal_url"`
+
+### 步骤 2：更新 Stable 版本（可选）
+
+如果新版本成为 stable，更新：
+
+```bash
+CANN_STABLE="9.1"
+```
+
+---
+
+## 与 PyTorch 上游对比
+
+### 关键差异
+
+| 维度 | PyTorch CUDA | torch-npu CANN |
+|------|-------------|---------------|
+| **构建策略** | 按 CUDA + Python 版本矩阵 | 只按 CANN 版本 ⭐ |
+| **镜像数量** | 多个（每种组合一个） | 少量（每个 CANN 一个） |
+| **Python 切换** | 不同镜像 | 同一镜像切换环境变量 ⭐ |
+| **版本映射** | 简化版 → 完整版 | URL 映射表 ⭐ |
+
+### 优势
+
+1. **镜像数量减少**：1 个 CANN 版本 = 1 个镜像（而非 4 个）
+2. **灵活性更高**：无需预判 Python 版本需求
+3. **维护更简单**：只需维护 CANN 版本映射表
+
+---
+
+## 常见问题
+
+### Q1: 为什么一个镜像支持多个 Python 版本？
+
+A: CANN 安装只需要 Python 3 环境，不绑定特定版本。切换 Python 版本不影响 CANN 功能。
+
+### Q2: 如何在 CI 中使用特定 Python 版本？
+
+A: 在容器内执行切换脚本：
+```bash
+source /usr/local/bin/switch_python.sh 3.12
+```
+
+或在 Dockerfile/脚本中修改 PATH：
+```bash
+export PATH=/opt/python/cp312-cp312/bin:$PATH
+```
+
+### Q3: 如何验证 CANN 是否正常工作？
+
+A:
+```bash
+source /etc/profile.d/cann_env.sh
+python -c "import torch; import torch_npu; print(torch_npu.npu.is_available())"
+```
+
+### Q4: 不同 CANN 版本有什么区别？
+
+A:
+- **9.0**: Stable 版本，推荐用于生产环境
+- **9.0.0-beta.2**: Beta 版本，包含最新特性
+- **8.0**: 旧版本，用于兼容性测试
+
+---
+
+## 脚本参数详解
+
+| 参数 | 说明 | 默认值 |
+|------|------|--------|
+| `--cann-version` | CANN 版本号 | 必需 |
+| `--registry` | Docker registry | `quay.io` |
+| `--quay-org` | Quay.io 组织 | `kerer` |
+| `--image-name` | 镜像名称 | `pytorch` |
+| `--push` | 推送镜像 | 不推送 |
+| `--force` | 强制构建 | 不强制 |
+| `--verbose` | 详细日志 | 不显示 |
+| `--list-versions` | 显示版本列表 | - |
+
+---
+
+## 更新日志
+
+### 2026-05-06 重构
+
+**主要变更**：
+1. ❌ 移除 Python 版本参数（不再按 Python 构建镜像）
+2. ✅ 预装所有 Python 版本（3.10/3.11/3.12/3.13）
+3. ✅ 添加 Python 版本切换脚本
+4. ✅ 只按 CANN 版本构建镜像
+5. ✅ 维护 CANN 包 URL 映射表
+
+**镜像标签变化**：
+- 原：`py3.11-cann9.0`
+- 新：`cann9.0`
+
+---
+
+**生成时间**: 2026-05-06
\ No newline at end of file
diff --git a/.github/scripts/PYTORCH_DOCKER_COMPARISON.md b/.github/scripts/PYTORCH_DOCKER_COMPARISON.md
new file mode 100644
index 0000000000..d8318de1b9
--- /dev/null
+++ b/.github/scripts/PYTORCH_DOCKER_COMPARISON.md
@@ -0,0 +1,595 @@
+# PyTorch 上游镜像构建逻辑对比分析
+
+## 对比概述
+
+对比上游 PyTorch 仓库和当前 torch-npu 项目的镜像构建策略，分析相似点和差异。
+
+**对应关系**：
+- PyTorch CUDA → torch-npu CANN
+- PyTorch cuDNN → torch-npu NNAL/A3-ops
+- PyTorch Python 版本 → torch-npu Python 版本
+
+---
+
+## 一、镜像命名策略对比
+
+### PyTorch 上游命名
+
+**CI 镜像命名**（用于内部测试）：
+```
+pytorch-linux-jammy-cuda13.0-cudnn9-py3.10-clang18
+pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
+pytorch-linux-jammy-py3.11-clang18          # CPU 版本
+```
+
+特点：
+- 格式：`pytorch-linux-{OS}-{CUDA}-{cuDNN}-py{Python}-{Compiler}`
+- CUDA 版本格式：`cuda13.0`（去掉小版本号）
+- cuDNN 版本格式：`cudnn9`（只保留大版本）
+- Python 版本格式：`py3.10` 或 `py3`（默认最新）
+
+**官方发布镜像命名**（用户使用）：
+```
+ghcr.io/pytorch/pytorch:2.5.1-cuda12.1-cudnn9-runtime
+ghcr.io/pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel
+ghcr.io/pytorch/pytorch:2.5.1-runtime          # CPU 版本
+```
+
+特点：
+- 格式：`{PyTorch版本}-cuda{CUDA简版}-cudnn{cuDNN}-{类型}`
+- 镜像类型：`runtime`（运行时）vs `devel`（开发）
+- CUDA 版本简化：`cuda12.1`（去掉补丁版本）
+
+**Nightly 镜像额外标签**：
+```
+ghcr.io/pytorch/pytorch-nightly:2.5.0.dev20250101-cuda12.1-cudnn9-runtime
+ghcr.io/pytorch/pytorch-nightly:{git_commit}-cu121
+ghcr.io/pytorch/pytorch-nightly:latest          # Stable CUDA 的 latest
+```
+
+---
+
+### torch-npu 当前命名
+
+**当前标签格式**：
+```
+quay.io/kerer/pytorch:py3.11-cann9.0.0-beta.2-20260506   # 完整版
+quay.io/kerer/pytorch:py3.11-cann9.0.0-beta.2            # 标准版
+quay.io/kerer/pytorch:py3.11-cann9.0.0                   # 大版本简化
+quay.io/kerer/pytorch:py3.11-latest                      # Python latest
+quay.io/kerer/pytorch:latest                            # 全局 latest
+```
+
+---
+
+### 命名策略对比表
+
+| 维度 | PyTorch 上游 | torch-npu 当前 | 建议 |
+|------|-------------|---------------|------|
+| **前缀** | 无前缀 | `py` 前缀 | ✅ torch-npu 更直观 |
+| **Python 版本** | `py3.10` 或 `py3` | `py3.11` | ✅ 相似，都用 py 前缀 |
+| **CUDA/CANN** | `cuda13.0` | `cann9.0.0-beta.2` | ⚠️ PyTorch 更简化 |
+| **cuDNN/NNAL** | `cudnn9` | 未包含 | ⚠️ torch-npu 可添加 |
+| **镜像类型** | `runtime`/`devel` | 无区分 | ⚠️ 可考虑添加 |
+| **时间戳** | nightly 包含 | 完整版包含 | ✅ 相似 |
+| **latest 位置** | `latest` 无后缀 | `latest` 或 `py3.11-latest` | ✅ torch-npu 更细致 |
+
+---
+
+## 二、版本管理策略对比
+
+### PyTorch CUDA 版本矩阵
+
+**支持的 CUDA 版本**：
+```python
+CUDA_ARCHES = ["12.6", "13.0", "13.2"]
+CUDA_STABLE = "13.0"    # 稳定版本
+
+CUDA_ARCHES_FULL_VERSION = {
+    "12.6": "12.6.3",
+    "13.0": "13.0.2",
+    "13.2": "13.2.1",
+}
+
+CUDA_ARCHES_CUDNN_VERSION = {
+    "12.6": "9",
+    "13.0": "9",
+    "13.2": "9",
+}
+```
+
+**特点**：
+- 维护简化版本 → 完整版本映射表
+- 每个 CUDA 版本对应固定的 cuDNN 版本
+- 明确标记稳定版本（用于 latest 标签）
+
+---
+
+### torch-npu CANN 版本管理
+
+**当前实现**：
+```bash
+DEFAULT_CANN_VERSION="9.0.0-beta.2"
+DEFAULT_CANN_DATE="20260330"
+
+# 提取大版本号
+cann_major=$(echo "$CANN_VERSION" | sed 's/-beta.*//' | sed 's/-rc.*//')
+```
+
+**差异**：
+- ❌ 没有版本映射表（简化版本 → 完整版本）
+- ❌ 没有 stable 版本标记
+- ❌ CANN 日期硬编码在参数中
+
+---
+
+### 建议改进：版本映射表
+
+创建类似 PyTorch 的版本映射配置：
+
+```bash
+# 在 build_image.sh 中添加版本映射
+declare -A CANN_ARCHES_FULL_VERSION=(
+    ["9.0"]="9.0.0"
+    ["8.0"]="8.0.RC3"
+)
+
+declare -A CANN_ARCHES_DATE=(
+    ["9.0"]="20260330"
+    ["8.0"]="20250101"
+)
+
+declare -A CANN_ARCHES_NNAL_VERSION=(
+    ["9.0"]="9.0.0"   # 对应 cuDNN
+    ["8.0"]="8.0"
+)
+
+CANN_STABLE="9.0"  # 稳定版本
+```
+
+---
+
+## 三、构建组织方式对比
+
+### PyTorch 构建架构
+
+**文件组织**：
+```
+pytorch/pytorch/
+├── Dockerfile                           # 用户发布镜像
+├── docker.Makefile                      # 构建脚本
+├── .ci/docker/                          # CI 镜像目录
+│   ├── ubuntu/Dockerfile                # Ubuntu 基础镜像
+│   ├── common/                          # 公共安装脚本
+│   │   ├── install_cuda.sh              # CUDA 安装脚本
+│   │   ├── install_conda.sh             # Conda 安装脚本
+│   │   └── install_gcc.sh               # GCC 安装脚本
+│   ├── requirements-ci.txt              # CI 依赖
+│   └── ci_commit_pins/                  # 版本锁定
+│       ├── triton.txt                   # Triton 版本
+│       ├── nccl*                        # NCCL 版本
+│       └── jax.txt                      # JAX 版本
+├── .github/workflows/
+│   ├── docker-builds.yml                # CI 镜像构建
+│   ├── docker-release.yml               # 发布镜像构建
+│   └── docker-cache-rocm.yml            # ROCm 缓存
+└── .github/scripts/
+    ├── generate_docker_release_matrix.py  # 矩阵生成
+    └── generate_binary_build_matrix.py    # 二进制矩阵
+```
+
+**关键特点**：
+1. **分层组织**：CI 镜像和发布镜像分离
+2. **公共脚本**：`common/` 目录下有各种安装脚本
+3. **版本锁定**：`ci_commit_pins/` 目录锁定所有依赖版本
+4. **矩阵生成**：Python 脚本动态生成构建矩阵
+
+---
+
+### torch-npu 构建架构
+
+**当前文件组织**：
+```
+ascend-pytorch/
+├── .github/
+│   ├── docker/
+│   │   └── pytorch-npu-builder.Dockerfile   # 单一 Dockerfile
+│   ├── scripts/
+│   │   ├── build_image.sh                   # 构建脚本
+│   │   └── BUILD_IMAGE_README.md            # 文档
+│   └── workflows/
+│       └── build-docker-image.yml           # Workflow
+```
+
+**对比差异**：
+
+| 组织方式 | PyTorch 上游 | torch-npu 当前 | 建议 |
+|---------|-------------|---------------|------|
+| **CI vs 发布分离** | ✅ 分离 | ❌ 单一 Dockerfile | ⚠️ 可考虑分离 |
+| **公共安装脚本** | ✅ `common/` 目录 | ❌ 直接在 Dockerfile | ⚠️ 建议拆分 |
+| **版本锁定文件** | ✅ `ci_commit_pins/` | ❌ 硬编码参数 | ⚠️ 强烈建议 |
+| **矩阵生成脚本** | ✅ Python 脚本 | ✅ Shell 脚本 | ✅ 相似 |
+| **文档完整性** | ❌ 较少 | ✅ README 文档 | ✅ torch-npu 更好 |
+
+---
+
+## 四、Workflow 设计对比
+
+### PyTorch docker-builds.yml
+
+**触发条件**：
+```yaml
+on:
+  workflow_dispatch:
+  pull_request:
+    paths:
+      - .ci/docker/**
+      - .github/workflows/docker-builds.yml
+  push:
+    branches: [main, release/*]
+    paths:
+      - .ci/docker/**
+      - .github/workflows/docker-builds.yml
+  schedule:
+    - cron: 1 3 * * 3   # 每周三 UTC 03:01
+```
+
+**Matrix 策略**：
+```yaml
+matrix:
+  docker-image-name: [
+    pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
+    pytorch-linux-jammy-cuda13.0-cudnn9-py3.12-gcc11-vllm,
+    pytorch-linux-jammy-py3.10-clang18,      # CPU 版本
+    # ... 30+ 种镜像配置
+  ]
+  include:
+    - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc13
+      runner: linux.arm64.m7g.4xlarge       # ARM64 特定 runner
+```
+
+**镜像推送**：
+```yaml
+# 推送到 ECR（AWS）
+- name: Build docker image
+  uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+  with:
+    docker-image-name: ci-image:${{ matrix.docker-image-name }}
+    always-rebuild: true
+    push: true
+
+# 推送到 ghcr.io（公共）
+- name: Push to https://ghcr.io/
+  if: ${{ github.event_name == 'push' }}
+  run: |
+    ghcr_image="ghcr.io/pytorch/ci-image"
+    tag=${ECR_DOCKER_IMAGE##*:}
+    docker tag "${ECR_DOCKER_IMAGE}" "${ghcr_image}:${tag}"
+    docker push "${ghcr_image}:${tag}"
+    # Also push a tag without the hash
+    docker tag "${ECR_DOCKER_IMAGE}" "${ghcr_image}:${{ matrix.docker-image-name }}"
+    docker push "${ghcr_image}:${{ matrix.docker-image-name }}"
+```
+
+---
+
+### torch-npu build-docker-image.yml
+
+**当前实现**：
+```yaml
+on:
+  push:
+    branches: [dev_master]
+    paths:
+      - '.github/docker/**'
+      - '.github/workflows/**'
+  schedule:
+    - cron: '0 2 * * 0'   # 每周日 UTC 02:00
+  workflow_dispatch:
+    inputs:
+      python_version: ['all', '3.10', '3.11', '3.12', '3.13']
+      cann_version: '9.0.0-beta.2'
+      push_image: true
+
+matrix:
+  python: ['3.10', '3.11', '3.12', '3.13']
+```
+
+---
+
+### Workflow 对比表
+
+| 设计要点 | PyTorch 上游 | torch-npu 当前 | 建议 |
+|---------|-------------|---------------|------|
+| **触发路径** | `.ci/docker/**` | `.github/docker/**` | ✅ 相似 |
+| **定时构建** | 每周三 | 每周日 | ✅ 合理 |
+| **Matrix 配置** | 硬编码镜像名列表 | Python 版本列表 | ⚠️ PyTorch 更详细 |
+| **多 Registry** | ECR + ghcr.io | 单一 quay.io | ⚠️ 可考虑多 Registry |
+| **推送策略** | Hash tag + Name tag | 多层级标签 | ✅ torch-npu 更细致 |
+| **手动触发** | 无参数 | 多参数输入 | ✅ torch-npu 更灵活 |
+
+---
+
+## 五、CUDA/CANN 安装方式对比
+
+### PyTorch CUDA 安装
+
+**install_cuda.sh 脚本**（分离式）：
+```bash
+# 调用方式
+ARG CUDA_VERSION
+COPY ./common/install_cuda.sh install_cuda.sh
+RUN bash ./install_cuda.sh ${CUDA_VERSION}
+
+# Dockerfile 中的环境变量
+ENV DESIRED_CUDA ${CUDA_VERSION}
+ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
+```
+
+**特点**：
+- CUDA 安装逻辑独立在 `install_cuda.sh` 中
+- Dockerfile 只负责调用脚本
+- 版本通过 ARG 参数传递
+
+---
+
+### torch-npu CANN 安装
+
+**当前实现**（嵌入式）：
+```dockerfile
+ARG CANN_VERSION
+ARG CANN_DATE
+
+RUN mkdir -p cann && cd cann && \
+    curl -O https://.../Ascend-cann-toolkit_${CANN_VERSION}_linux-aarch64.run && \
+    curl -O https://.../Ascend-cann-A3-ops_${CANN_VERSION}_linux-aarch64.run && \
+    curl -O https://.../Ascend-cann-nnal_${CANN_VERSION}_linux-aarch64.run && \
+    chmod +x Ascend-cann*.run && \
+    ./Ascend-cann-toolkit*.run --full --quiet --install-path=/usr/local/Ascend && \
+    # ...
+```
+
+**对比差异**：
+
+| 安装方式 | PyTorch 上游 | torch-npu 当前 | 建议 |
+|---------|-------------|---------------|------|
+| **脚本分离** | ✅ `install_cuda.sh` | ❌ 嵌入 Dockerfile | ⚠️ 建议拆分 |
+| **依赖安装** | ✅ NCCL/cuSPARSE 等独立脚本 | ❌ 混在一起 | ⚠️ 建议拆分 |
+| **版本管理** | ✅ 参数传递 + 环境变量 | ✅ 参数传递 | ✅ 相似 |
+| **安装路径** | `/usr/local/cuda` | `/usr/local/Ascend` | ✅ 合理 |
+
+---
+
+## 六、镜像类型对比
+
+### PyTorch 镜像类型
+
+**两种镜像类型**：
+```dockerfile
+# runtime 镜像（精简）
+FROM official as runtime
+# 只包含运行时必需的组件
+
+# devel 镜像（完整）
+FROM official as dev
+# 包含开发工具、编译器等
+```
+
+**docker.Makefile 定义**：
+```makefile
+runtime-image: DOCKER_TAG := $(PYTORCH_VERSION)-cuda$(CUDA_VERSION_SHORT)-cudnn$(CUDNN_VERSION)-runtime
+devel-image: DOCKER_TAG := $(PYTORCH_VERSION)-cuda$(CUDA_VERSION_SHORT)-cudnn$(CUDNN_VERSION)-devel
+```
+
+---
+
+### torch-npu 当前状态
+
+**单一镜像类型**：
+- 当前只有一种镜像，包含构建和运行时所有工具
+- 没有区分 runtime 和 devel
+
+**建议**：
+```dockerfile
+# 可以添加多阶段构建
+FROM base as runtime
+# 只包含 CANN runtime + Python
+
+FROM runtime as devel
+# 添加编译工具、调试工具等
+```
+
+---
+
+## 七、关键差异总结
+
+### 相似点 ✅
+
+1. **使用 ARG 参数化版本**
+   - 都通过 `ARG CUDA_VERSION` / `ARG CANN_VERSION` 传递版本
+   - 都支持多 Python 版本
+
+2. **Matrix 策略构建**
+   - 都使用 GitHub Actions matrix 并行构建
+   - 都支持定时构建和手动触发
+
+3. **Registry 推送**
+   - 都推送到公共 Registry
+   - 都生成多层级标签
+
+4. **版本简化处理**
+   - 都从完整版本提取简化版本
+   - 都有 latest 标签策略
+
+---
+
+### 差异点 ⚠️
+
+| 差异 | PyTorch 上游优势 | torch-npu 待改进 |
+|------|-----------------|----------------|
+| **文件组织** | CI/发布分离，公共脚本目录 | 单一 Dockerfile，建议拆分 |
+| **版本管理** | 版本映射表，stable 标记 | 硬编码日期，建议映射表 |
+| **版本锁定** | `ci_commit_pins/` 目录锁定所有依赖 | 无版本锁定文件 |
+| **镜像类型** | runtime/devel 分离 | 单一镜像，可考虑分离 |
+| **多 Registry** | ECR（私有）+ ghcr.io（公共） | 单一 quay.io |
+| **依赖分离** | CUDA/NCCL/cuSPARSE 独立脚本 | CANN 组件混在一起 |
+
+---
+
+## 八、改进建议优先级
+
+### P0（必须改进）
+
+1. **创建版本锁定文件**
+   ```
+   .github/docker/cann_versions.txt
+   .github/docker/nnal_versions.txt
+   .github/docker/a3_ops_versions.txt
+   ```
+
+2. **创建版本映射表**
+   ```bash
+   # 在 build_image.sh 中
+   declare -A CANN_VERSIONS=(
+       ["9.0"]="9.0.0|20260330"
+       ["8.0"]="8.0.RC3|20250101"
+   )
+   ```
+
+---
+
+### P1（建议改进）
+
+1. **拆分安装脚本**
+   ```
+   .github/scripts/docker/
+   ├── install_cann.sh
+   ├── install_nnal.sh
+   ├── install_a3_ops.sh
+   └── common_utils.sh
+   ```
+
+2. **添加镜像类型区分**
+   ```dockerfile
+   FROM base as runtime   # 精简镜像
+   FROM runtime as devel  # 完整镜像
+   ```
+
+3. **添加 stable 版本标记**
+   ```bash
+   CANN_STABLE="9.0"  # 用于生成 latest 标签
+   ```
+
+---
+
+### P2（可选改进）
+
+1. **多 Registry 支持**
+   - AWS ECR（私有缓存）
+   - ghcr.io（公共发布）
+
+2. **CI/发布镜像分离**
+   - CI 镜像：包含测试工具
+   - 发布镜像：精简运行时
+
+---
+
+## 九、标签命名建议调整
+
+### 当前 torch-npu 标签（保持）
+
+```
+✅ py3.11-cann9.0.0-beta.2          # 标准版
+✅ py3.11-latest                     # Python latest
+✅ latest                            # 全局 latest
+```
+
+### 建议新增标签
+
+```
+新增：py3.11-cann9.0-runtime        # 镜像类型标记
+新增：py3.11-cann9.0-devel          # 开发镜像
+新增：cann9.0-stable                # Stable 版本标记
+新增：2.5.1-py3.11-cann9.0-runtime  # 包含 PyTorch 版本（可选）
+```
+
+---
+
+## 十、代码示例：版本映射实现
+
+### 建议在 build_image.sh 中添加
+
+```bash
+#!/bin/bash
+
+# CANN 版本映射表（类似 PyTorch）
+declare -A CANN_ARCHES=(
+    ["9.0"]="9.0.0"
+    ["8.0"]="8.0.RC3"
+)
+
+declare -A CANN_ARCHES_DATE=(
+    ["9.0"]="20260330"
+    ["8.0"]="20250101"
+)
+
+declare -A CANN_ARCHES_NNAL=(
+    ["9.0"]="9.0.0"
+    ["8.0"]="8.0"
+)
+
+CANN_STABLE="9.0"  # Stable 版本（用于 latest）
+
+# 解析版本参数
+parse_cann_version() {
+    local input="$1"
+
+    # 如果输入是简化版本（如 "9.0"），查找完整版本
+    if [[ -v CANN_ARCHES[$input] ]]; then
+        CANN_VERSION="${CANN_ARCHES[$input]}"
+        CANN_DATE="${CANN_ARCHES_DATE[$input]}"
+        NNAL_VERSION="${CANN_ARCHES_NNAL[$input]}"
+        CANN_MAJOR="$input"
+    else
+        # 如果输入是完整版本（如 "9.0.0-beta.2"），提取简化版本
+        CANN_VERSION="$input"
+        CANN_MAJOR=$(echo "$input" | sed 's/-beta.*//' | sed 's/-rc.*//' | sed 's/\.[0-9]*$//')
+        CANN_DATE="${CANN_ARCHES_DATE[$CANN_MAJOR]:-DEFAULT_CANN_DATE}"
+        NNAL_VERSION="${CANN_ARCHES_NNAL[$CANN_MAJOR]:-$CANN_VERSION}"
+    fi
+
+    # 判断是否为 stable 版本
+    IS_STABLE=$([[ "$CANN_MAJOR" == "$CANN_STABLE" ]] && echo "true" || echo "false")
+}
+```
+
+---
+
+## 结论
+
+### 总体评价
+
+✅ **相似度高**：torch-npu 的设计思路与 PyTorch 上游基本一致，都采用了参数化构建、Matrix 策略、多版本支持等现代 CI/CD 最佳实践。
+
+⚠️ **待改进点**：
+1. 版本管理缺乏映射表和锁定文件
+2. 安装脚本未拆分，维护性较弱
+3. 镜像类型未区分 runtime/devel
+4. 缺少 stable 版本标记
+
+### 下一步行动
+
+**建议按照优先级顺序改进**：
+1. P0：创建版本锁定文件和映射表
+2. P1：拆分安装脚本，添加镜像类型
+3. P2：考虑多 Registry 和 CI/发布分离
+
+**标签命名**：
+- 当前命名策略已经很好，符合 PyTorch 风格
+- 可以考虑添加 runtime/devel 类型标记
+- 建议添加 stable 版本的 latest 标签
+
+---
+
+**生成时间**: 2026-05-06
+**对比版本**: PyTorch upstream main branch (2026-05-06)
\ No newline at end of file
diff --git a/.github/scripts/build_image.sh b/.github/scripts/build_image.sh
new file mode 100755
index 0000000000..bfb797be71
--- /dev/null
+++ b/.github/scripts/build_image.sh
@@ -0,0 +1,479 @@
+#!/bin/bash
+#
+# build_image.sh - 构建 PyTorch NPU Docker 镜像
+#
+# 功能：按 CANN 版本构建镜像，镜像预装多 Python 版本，通过环境变量切换
+#
+# 使用方式：
+#   ./build_image.sh --cann-version 9.0
+#   ./build_image.sh --cann-version 9.0.0-beta.2 --push
+#   ./build_image.sh --list-versions     # 查看支持的 CANN 版本
+#
+
+set -euo pipefail
+
+# ============================================================
+# CANN 版本映射表
+# 每个版本对应三个包的下载 URL
+# ============================================================
+
+declare -A CANN_VERSIONS=(
+    # 版本号 -> toolkit|a3_ops|nnal 的 URL
+    ["9.0"]="https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-toolkit_9.0.0_linux-aarch64.run|https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-A3-ops_9.0.0_linux-aarch64.run|https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-nnal_9.0.0_linux-aarch64.run"
+
+    ["9.0.0-beta.2"]="https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-toolkit_9.0.0-beta.2_linux-aarch64.run|https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-A3-ops_9.0.0-beta.2_linux-aarch64.run|https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-nnal_9.0.0-beta.2_linux-aarch64.run"
+
+    ["8.0"]="https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20250101/Ascend-cann-toolkit_8.0.RC3_linux-aarch64.run|https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20250101/Ascend-cann-A3-ops_8.0.RC3_linux-aarch64.run|https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20250101/Ascend-cann-nnal_8.0.RC3_linux-aarch64.run"
+)
+
+# Stable 版本标记（用于 latest 标签）
+CANN_STABLE="9.0"
+
+# 预装的 Python 版本列表
+PYTHON_VERSIONS=("3.10" "3.11" "3.12" "3.13")
+
+# manylinux 对应的 Python 目录名映射
+declare -A PYTHON_DIR_MAP=(
+    ["3.10"]="cp310-cp310"
+    ["3.11"]="cp311-cp311"
+    ["3.12"]="cp312-cp312"
+    ["3.13"]="cp313-cp313"
+)
+
+# ============================================================
+# 默认配置
+# ============================================================
+
+DEFAULT_REGISTRY="quay.io"
+DEFAULT_QUAY_ORG="kerer"
+DEFAULT_IMAGE_NAME="pytorch"
+
+# 参数变量
+CANN_VERSION_INPUT=""
+REGISTRY=""
+QUAY_ORG=""
+IMAGE_NAME=""
+PUSH_IMAGE=false
+FORCE_BUILD=false
+VERBOSE=false
+LIST_VERSIONS=false
+
+# ============================================================
+# 日志函数
+# ============================================================
+
+log_info() {
+    echo "[INFO] $1"
+}
+
+log_error() {
+    echo "[ERROR] $1" >&2
+}
+
+log_verbose() {
+    if [[ "$VERBOSE" == "true" ]]; then
+        echo "[VERBOSE] $1"
+    fi
+}
+
+# ============================================================
+# 显示帮助信息
+# ============================================================
+
+show_help() {
+    cat << EOF
+用法: $0 [OPTIONS]
+
+构建支持不同 CANN 版本的 PyTorch NPU Docker 镜像。
+
+镜像特性：
+  - 预装多个 Python 版本 (3.10/3.11/3.12/3.13)
+  - 通过环境变量切换 Python 版本
+  - 按 CANN 版本构建镜像
+
+CANN 参数:
+  --cann-version VERSION      CANN 版本号（支持简化版或完整版）
+                              简化版: 9.0, 8.0
+                              完整版: 9.0.0-beta.2
+  --list-versions             显示支持的 CANN 版本列表
+
+镜像参数:
+  --registry REGISTRY         Docker registry 地址 (默认: quay.io)
+  --quay-org ORG              Quay.io 组织名 (默认: kerer)
+  --image-name NAME           镜像名称 (默认: pytorch)
+
+构建选项:
+  --push                      构建后推送镜像到 registry
+  --force                     强制构建，即使镜像已存在
+  --verbose                   显示详细日志
+
+Python 版本切换（运行时）:
+  镜像预装多个 Python 版本，使用时通过环境变量切换：
+  export PATH=/opt/python/cp311-cp311/bin:\$PATH  # 使用 Python 3.11
+  export PATH=/opt/python/cp312-cp312/bin:\$PATH  # 使用 Python 3.12
+
+示例:
+  $0 --cann-version 9.0
+  $0 --cann-version 9.0.0-beta.2 --push
+  $0 --list-versions
+
+支持的 CANN 版本：
+$(show_supported_versions)
+
+EOF
+}
+
+show_supported_versions() {
+    echo "简化版本      完整版本"
+    echo "-----------   ----------------"
+    for version in "${!CANN_VERSIONS[@]}"; do
+        if [[ ! "$version" =~ -beta ]] && [[ ! "$version" =~ -rc ]]; then
+            echo "$version       (完整版见映射表)"
+        fi
+    done
+    echo ""
+    echo "完整版本示例："
+    echo "  9.0.0-beta.2"
+    echo "  8.0.RC3"
+}
+
+# ============================================================
+# 解析命令行参数
+# ============================================================
+
+parse_args() {
+    while [[ $# -gt 0 ]]; do
+        case "$1" in
+            --cann-version)
+                CANN_VERSION_INPUT="$2"
+                shift 2
+                ;;
+            --registry)
+                REGISTRY="$2"
+                shift 2
+                ;;
+            --quay-org)
+                QUAY_ORG="$2"
+                shift 2
+                ;;
+            --image-name)
+                IMAGE_NAME="$2"
+                shift 2
+                ;;
+            --push)
+                PUSH_IMAGE=true
+                shift
+                ;;
+            --force)
+                FORCE_BUILD=true
+                shift
+                ;;
+            --verbose)
+                VERBOSE=true
+                shift
+                ;;
+            --list-versions)
+                LIST_VERSIONS=true
+                shift
+                ;;
+            -h|--help)
+                show_help
+                exit 0
+                ;;
+            *)
+                log_error "未知参数: $1"
+                show_help
+                exit 1
+                ;;
+        esac
+    done
+
+    # 设置默认值
+    REGISTRY="${REGISTRY:-$DEFAULT_REGISTRY}"
+    QUAY_ORG="${QUAY_ORG:-$DEFAULT_QUAY_ORG}"
+    IMAGE_NAME="${IMAGE_NAME:-$DEFAULT_IMAGE_NAME}"
+
+    # 显示版本列表
+    if [[ "$LIST_VERSIONS" == "true" ]]; then
+        echo "支持的 CANN 版本："
+        echo ""
+        for version in "${!CANN_VERSIONS[@]}"; do
+            echo "  - $version"
+        done
+        echo ""
+        echo "Stable 版本（用于 latest 标签）: $CANN_STABLE"
+        exit 0
+    fi
+
+    # 验证参数
+    if [[ -z "$CANN_VERSION_INPUT" ]]; then
+        log_error "必须指定 --cann-version 或使用 --list-versions"
+        show_help
+        exit 1
+    fi
+}
+
+# ============================================================
+# 解析 CANN 版本
+# ============================================================
+
+parse_cann_version() {
+    local input="$CANN_VERSION_INPUT"
+
+    log_verbose "解析 CANN 版本: $input"
+
+    # 检查版本是否在映射表中
+    if [[ ! -v CANN_VERSIONS[$input] ]]; then
+        log_error "不支持的 CANN 版本: $input"
+        log_info "支持的版本: ${!CANN_VERSIONS[*]}"
+        log_info "使用 --list-versions 查看完整列表"
+        exit 1
+    fi
+
+    # 解析 URL
+    local urls="${CANN_VERSIONS[$input]}"
+    CANN_TOOLKIT_URL=$(echo "$urls" | cut -d'|' -f1)
+    CANN_A3OPS_URL=$(echo "$urls" | cut -d'|' -f2)
+    CANN_NNAL_URL=$(echo "$urls" | cut -d'|' -f3)
+
+    # 提取版本号（去掉 beta/rc 后缀）
+    CANN_VERSION_FULL="$input"
+    CANN_VERSION_MAJOR=$(echo "$input" | sed 's/-beta.*//' | sed 's/-rc.*//')
+
+    # 判断是否为 stable 版本
+    if [[ "$CANN_VERSION_MAJOR" == "$CANN_STABLE" ]]; then
+        IS_STABLE="true"
+    else
+        IS_STABLE="false"
+    fi
+
+    log_verbose "Toolkit URL: $CANN_TOOLKIT_URL"
+    log_verbose "A3-ops URL: $CANN_A3OPS_URL"
+    log_verbose "NNAL URL: $CANN_NNAL_URL"
+    log_verbose "Full version: $CANN_VERSION_FULL"
+    log_verbose "Major version: $CANN_VERSION_MAJOR"
+    log_verbose "Is stable: $IS_STABLE"
+}
+
+# ============================================================
+# 生成镜像标签
+# ============================================================
+
+generate_tags() {
+    local timestamp=$(date +%Y%m%d)
+    local tags=()
+
+    # 提取大版本号（去掉 patch 号，但保留 beta/rc）
+    # 例如：9.0.0-beta.2 → 9.0，9.0 → 9.0，8.0.RC3 → 8.0
+    local cann_major
+
+    # 如果版本号已经是简化格式（没有第二个点），则保持原样
+    if [[ "$CANN_VERSION_FULL" =~ ^[0-9]+\.[0-9]+$ ]]; then
+        cann_major="$CANN_VERSION_FULL"
+    else
+        # 提取前两位数字（去掉 patch 号和 beta/rc 后缀）
+        cann_major=$(echo "$CANN_VERSION_FULL" | grep -oP '^[0-9]+\.[0-9]+')
+    fi
+
+    # 1. 完整版本标签（带时间戳）- 用于追溯
+    tags+=("cann${CANN_VERSION_FULL}-${timestamp}")
+
+    # 2. 标准版本标签（无时间戳）- 用于日常使用
+    # 如果输入已经是简化版本，则跳过完整版本标签，避免重复
+    if [[ "$CANN_VERSION_FULL" != "$cann_major" ]]; then
+        tags+=("cann${CANN_VERSION_FULL}")
+    fi
+
+    # 3. 大版本简化标签 - 用于快速识别
+    tags+=("cann${cann_major}")
+
+    # 4. latest 标签（仅 stable 版本）
+    if [[ "$IS_STABLE" == "true" ]]; then
+        tags+=("latest")
+        tags+=("cann-latest")
+        tags+=("cann${cann_major}-latest")
+    fi
+
+    # 输出所有标签
+    for tag in "${tags[@]}"; do
+        echo "$tag"
+    done
+}
+
+# ============================================================
+# 构建镜像
+# ============================================================
+
+build_image() {
+    log_info "=========================================="
+    log_info "构建镜像: CANN $CANN_VERSION_FULL"
+    log_info "=========================================="
+
+    log_info "预装 Python 版本: ${PYTHON_VERSIONS[*]}"
+
+    # 生成镜像标签
+    local tags=$(generate_tags)
+    local tag_args=""
+    while IFS= read -r tag; do
+        tag_args+=" --tag ${REGISTRY}/${QUAY_ORG}/${IMAGE_NAME}:${tag}"
+    done <<< "$tags"
+
+    log_info "镜像标签:"
+    while IFS= read -r tag; do
+        log_info "  - ${REGISTRY}/${QUAY_ORG}/${IMAGE_NAME}:${tag}"
+    done <<< "$tags"
+
+    # 检查镜像是否已存在（除非强制构建）
+    if [[ "$FORCE_BUILD" == "false" && "$PUSH_IMAGE" == "true" ]]; then
+        local first_tag=$(echo "$tags" | head -n1)
+        if docker pull "${REGISTRY}/${QUAY_ORG}/${IMAGE_NAME}:${first_tag}" &>/dev/null; then
+            log_info "镜像已存在，跳过构建: ${REGISTRY}/${QUAY_ORG}/${IMAGE_NAME}:${first_tag}"
+            return 0
+        fi
+    fi
+
+    # 确认需要构建，执行登录（如果需要推送）
+    # 如果环境变量 SKIP_DOCKER_LOGIN=true，则跳过（用于 CI，已通过 login-action 登录）
+    if [[ "$PUSH_IMAGE" == "true" ]]; then
+        if [[ "${SKIP_DOCKER_LOGIN:-false}" != "true" ]]; then
+            login_registry
+        else
+            log_verbose "跳过登录（SKIP_DOCKER_LOGIN=true）"
+        fi
+    fi
+
+    # Dockerfile 路径
+    local dockerfile_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)/.github/docker"
+    local dockerfile="${dockerfile_dir}/pytorch-npu-builder.Dockerfile"
+
+    if [[ ! -f "$dockerfile" ]]; then
+        log_error "Dockerfile 不存在: $dockerfile"
+        exit 1
+    fi
+
+    log_verbose "Dockerfile: $dockerfile"
+
+    # 构建参数
+    local build_args="
+        --build-arg CANN_TOOLKIT_URL=${CANN_TOOLKIT_URL}
+        --build-arg CANN_A3OPS_URL=${CANN_A3OPS_URL}
+        --build-arg CANN_NNAL_URL=${CANN_NNAL_URL}
+        --build-arg CANN_VERSION=${CANN_VERSION_FULL}
+    "
+
+    # 构建命令
+    local build_cmd="docker buildx build \
+        ${build_args} \
+        ${tag_args} \
+        --file ${dockerfile} \
+        --platform linux/arm64 \
+        ${dockerfile_dir}"
+
+    if [[ "$PUSH_IMAGE" == "true" ]]; then
+        build_cmd+=" --push"
+    else
+        build_cmd+=" --load"
+    fi
+
+    log_verbose "构建命令: $build_cmd"
+
+    # 执行构建
+    log_info "开始构建..."
+    if ! eval "$build_cmd"; then
+        log_error "构建失败"
+        return 1
+    fi
+
+    log_info "构建成功"
+
+    # 输出构建信息
+    echo ""
+    log_info "构建信息:"
+    log_info "  CANN 版本: $CANN_VERSION_FULL"
+    log_info "  CANN 大版本: $CANN_VERSION_MAJOR"
+    log_info "  Stable: $IS_STABLE"
+    log_info "  预装 Python: ${PYTHON_VERSIONS[*]}"
+    log_info "  镜像地址:"
+    while IFS= read -r tag; do
+        log_info "    ${REGISTRY}/${QUAY_ORG}/${IMAGE_NAME}:${tag}"
+    done <<< "$tags"
+
+    echo ""
+    log_info "使用方法："
+    log_info "  docker run -it ${REGISTRY}/${QUAY_ORG}/${IMAGE_NAME}:cann${CANN_VERSION_MAJOR} bash"
+    log_info "  # 切换 Python 版本："
+    log_info "  export PATH=/opt/python/cp311-cp311/bin:\$PATH  # Python 3.11"
+    log_info "  export PATH=/opt/python/cp312-cp312/bin:\$PATH  # Python 3.12"
+    echo ""
+
+    return 0
+}
+
+# ============================================================
+# 检查依赖
+# ============================================================
+
+check_dependencies() {
+    log_verbose "检查依赖..."
+
+    # 检查 docker
+    if ! command -v docker &>/dev/null; then
+        log_error "未安装 docker"
+        exit 1
+    fi
+
+    # 检查 docker buildx
+    if ! docker buildx version &>/dev/null; then
+        log_error "docker buildx 不可用"
+        exit 1
+    fi
+
+    log_verbose "依赖检查通过"
+}
+
+# ============================================================
+# 登录 registry
+# ============================================================
+
+login_registry() {
+    if [[ "$PUSH_IMAGE" == "true" ]]; then
+        log_info "登录 Registry: $REGISTRY"
+
+        case "$REGISTRY" in
+            quay.io)
+                if [[ -z "${QUAY_USERNAME:-}" || -z "${QUAY_PASSWORD:-}" ]]; then
+                    log_error "需要设置环境变量 QUAY_USERNAME 和 QUAY_PASSWORD"
+                    exit 1
+                fi
+                docker login quay.io -u "$QUAY_USERNAME" --password-stdin <<< "$QUAY_PASSWORD"
+                ;;
+            ghcr.io)
+                if [[ -z "${GITHUB_TOKEN:-}" ]]; then
+                    log_error "需要设置环境变量 GITHUB_TOKEN"
+                    exit 1
+                fi
+                echo "$GITHUB_TOKEN" | docker login ghcr.io -u "${GITHUB_ACTOR:-}" --password-stdin
+                ;;
+            *)
+                log_error "不支持的 registry: $REGISTRY"
+                exit 1
+                ;;
+        esac
+
+        log_info "登录成功"
+    fi
+}
+
+# ============================================================
+# 主函数
+# ============================================================
+
+main() {
+    parse_args "$@"
+    check_dependencies
+    parse_cann_version
+    build_image
+}
+
+# 执行主函数
+main "$@"
\ No newline at end of file
diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml
index c406458b00..7df92cd072 100644
--- a/.github/workflows/build-docker-image.yml
+++ b/.github/workflows/build-docker-image.yml
@@ -6,12 +6,32 @@ on:
     paths:
       - '.github/docker/pytorch-npu-builder.Dockerfile'
       - '.github/workflows/build-docker-image.yml'
+      - '.github/scripts/build_image.sh'
+  schedule:
+    - cron: '0 2 * * 0'  # UTC 02:00, Beijing 10:00, every Sunday
+  workflow_dispatch:
+    inputs:
+      cann_version:
+        description: 'CANN version (e.g., 9.0, 9.0.0-beta.2, 8.0)'
+        required: true
+        default: '9.0'
+        type: string
+      push_image:
+        description: 'Push image to registry'
+        required: true
+        default: true
+        type: boolean
+      force_build:
+        description: 'Force rebuild even if image exists'
+        required: false
+        default: false
+        type: boolean
 
 env:
   REGISTRY: quay.io
-  IMAGE_NAME: pytorch
   QUAY_ORG: kerer
-  CANN_VERSION: '9.0.0-beta.2'
+  IMAGE_NAME: pytorch
+  CANN_STABLE: '9.0'
 
 jobs:
   build:
@@ -24,35 +44,126 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v4
 
-      - name: Set up Docker Buildx
+      - name: Make script executable
+        run: chmod +x .github/scripts/build_image.sh
+
+      - name: Determine build parameters
+        id: params
+        run: |
+          # 确定是否推送镜像
+          # 规则：手动触发根据 inputs.push_image 决定
+          #       push/schedule 触发默认推送（除非是 PR）
+          if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+            PUSH_IMAGE="${{ inputs.push_image }}"
+          elif [[ "${{ github.event_name }}" == "push" || "${{ github.event_name }}" == "schedule" ]]; then
+            PUSH_IMAGE="true"
+          else
+            PUSH_IMAGE="false"
+          fi
+
+          echo "push_image=${PUSH_IMAGE}" >> $GITHUB_OUTPUT
+
+          # 确定是否强制构建
+          FORCE_BUILD="${{ inputs.force_build || 'false' }}"
+          echo "force_build=${FORCE_BUILD}" >> $GITHUB_OUTPUT
+
+          # 确定 CANN 版本
+          CANN_VERSION="${{ inputs.cann_version || env.CANN_STABLE }}"
+          echo "cann_version=${CANN_VERSION}" >> $GITHUB_OUTPUT
+
+          echo "Build parameters:"
+          echo "  Event: ${{ github.event_name }}"
+          echo "  CANN version: ${CANN_VERSION}"
+          echo "  Push image: ${PUSH_IMAGE}"
+          echo "  Force build: ${FORCE_BUILD}"
+
+      - name: Setup Docker Buildx
         uses: docker/setup-buildx-action@v3
+        with:
+          driver: docker-container
+          driver-opts: image=moby/buildkit:latest
 
-      - name: Log in to Quay.io Container Registry
+      - name: Login to Quay.io
+        if: ${{ steps.params.outputs.push_image == 'true' }}
         uses: docker/login-action@v3
         with:
           registry: ${{ env.REGISTRY }}
           username: ${{ secrets.QUAY_USERNAME }}
           password: ${{ secrets.QUAY_PASSWORD }}
 
-      - name: Generate timestamp tag
-        id: tag
+      - name: Build and push image
+        env:
+          QUAY_USERNAME: ${{ secrets.QUAY_USERNAME }}
+          QUAY_PASSWORD: ${{ secrets.QUAY_PASSWORD }}
+          SKIP_DOCKER_LOGIN: true  # 已通过 login-action 登录，跳过脚本中的登录
         run: |
-          TIMESTAMP=$(date +%Y%m%d)
-          echo "tag=manylinux-cann${{ env.CANN_VERSION }}-${TIMESTAMP}" >> $GITHUB_OUTPUT
+          # 构建参数
+          ARGS=""
+          if [[ "${{ steps.params.outputs.push_image }}" == "true" ]]; then
+            ARGS+=" --push"
+          fi
+          if [[ "${{ steps.params.outputs.force_build }}" == "true" ]]; then
+            ARGS+=" --force"
+          fi
 
-      - name: Build and push Docker image
-        uses: docker/build-push-action@v5
-        with:
-          context: .github/docker
-          file: .github/docker/pytorch-npu-builder.Dockerfile
-          push: true
-          tags: |
-            ${{ env.REGISTRY }}/${{ env.QUAY_ORG }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}
-            ${{ env.REGISTRY }}/${{ env.QUAY_ORG }}/${{ env.IMAGE_NAME }}:latest
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-
-      - name: Output image URL
+          .github/scripts/build_image.sh \
+            --cann-version "${{ steps.params.outputs.cann_version }}" \
+            --registry "${{ env.REGISTRY }}" \
+            --quay-org "${{ env.QUAY_ORG }}" \
+            --image-name "${{ env.IMAGE_NAME }}" \
+            ${ARGS} \
+            --verbose
+
+      - name: Summary
+        if: always()
+        run: |
+          echo "## Docker Image Build Summary" >> $GITHUB_STEP_SUMMARY
+          echo "" >> $GITHUB_STEP_SUMMARY
+          echo "### Build Details" >> $GITHUB_STEP_SUMMARY
+          echo "" >> $GITHUB_STEP_SUMMARY
+          echo "| Property | Value |" >> $GITHUB_STEP_SUMMARY
+          echo "|----------|-------|" >> $GITHUB_STEP_SUMMARY
+          echo "| **Trigger** | ${{ github.event_name }} |" >> $GITHUB_STEP_SUMMARY
+          echo "| **CANN Version** | ${{ steps.params.outputs.cann_version }} |" >> $GITHUB_STEP_SUMMARY
+          echo "| **Python Versions** | 3.10, 3.11, 3.12, 3.13 (pre-installed) |" >> $GITHUB_STEP_SUMMARY
+          echo "| **Registry** | ${{ env.REGISTRY }}/${{ env.QUAY_ORG }}/${{ env.IMAGE_NAME }} |" >> $GITHUB_STEP_SUMMARY
+          echo "| **Push Enabled** | ${{ steps.params.outputs.push_image }} |" >> $GITHUB_STEP_SUMMARY
+          echo "" >> $GITHUB_STEP_SUMMARY
+
+          # 提取 CANN 大版本号（正确处理简化版本）
+          CANN_INPUT="${{ steps.params.outputs.cann_version }}"
+
+          # 如果是简化版本（如 9.0），直接使用
+          # 如果是完整版本（如 9.0.0-beta.2），提取前两位数字
+          if [[ "$CANN_INPUT" =~ ^[0-9]+\.[0-9]+$ ]]; then
+            CANN_MAJOR="$CANN_INPUT"
+          else
+            CANN_MAJOR=$(echo "$CANN_INPUT" | grep -oP '^[0-9]+\.[0-9]+')
+          fi
+
+          echo "### Image Tags" >> $GITHUB_STEP_SUMMARY
+          echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
+          echo "quay.io/kerer/pytorch:cann${{ steps.params.outputs.cann_version }}" >> $GITHUB_STEP_SUMMARY
+          echo "quay.io/kerer/pytorch:cann${CANN_MAJOR}" >> $GITHUB_STEP_SUMMARY
+          if [[ "${CANN_MAJOR}" == "${{ env.CANN_STABLE }}" ]]; then
+            echo "quay.io/kerer/pytorch:latest" >> $GITHUB_STEP_SUMMARY
+            echo "quay.io/kerer/pytorch:cann-latest" >> $GITHUB_STEP_SUMMARY
+          fi
+          echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
+
+          echo "" >> $GITHUB_STEP_SUMMARY
+          echo "### Python Version Switch" >> $GITHUB_STEP_SUMMARY
+          echo "\`\`\`bash" >> $GITHUB_STEP_SUMMARY
+          echo "# Inside container:" >> $GITHUB_STEP_SUMMARY
+          echo "source /usr/local/bin/switch_python.sh 3.11" >> $GITHUB_STEP_SUMMARY
+          echo "source /usr/local/bin/switch_python.sh 3.12" >> $GITHUB_STEP_SUMMARY
+          echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
+
+          echo "" >> $GITHUB_STEP_SUMMARY
+          echo "### Build time: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" >> $GITHUB_STEP_SUMMARY
+
+      - name: Cleanup on failure
+        if: failure()
         run: |
-          echo "Built image: ${{ env.REGISTRY }}/${{ env.QUAY_ORG }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}"
-          echo "Built image: ${{ env.REGISTRY }}/${{ env.QUAY_ORG }}/${{ env.IMAGE_NAME }}:latest"
\ No newline at end of file
+          echo "::error::Build failed for CANN version ${{ steps.params.outputs.cann_version }}"
+          echo "Check the build logs for details"
\ No newline at end of file

From 81474d7ee46d96c2f8353ffc990e60c78b1116ae Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 6 May 2026 11:47:01 +0800
Subject: [PATCH 068/132] Fix Dockerfile path calculation in build script

Problem: Path contained double .github/.github/docker due to incorrect
relative path handling in GitHub Actions runner environment.

Solution: Use git rev-parse --show-toplevel to reliably get project root,
fallback to relative path calculation if not in git repo.

Added debug logging for script_dir, project_root, and dockerfile_dir.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/scripts/build_image.sh | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/build_image.sh b/.github/scripts/build_image.sh
index bfb797be71..c41cc1c3bb 100755
--- a/.github/scripts/build_image.sh
+++ b/.github/scripts/build_image.sh
@@ -343,9 +343,25 @@ build_image() {
     fi
 
     # Dockerfile 路径
-    local dockerfile_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)/.github/docker"
+    # 使用 git 获取项目根目录（更可靠）
+    local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+    local project_root
+
+    # 尝试使用 git 获取项目根目录
+    if git rev-parse --show-toplevel &>/dev/null; then
+        project_root="$(git rev-parse --show-toplevel)"
+    else
+        # 如果不在 git 仓库中，从脚本目录向上推导
+        project_root="$(cd "${script_dir}/.." && pwd)"
+    fi
+
+    local dockerfile_dir="${project_root}/.github/docker"
     local dockerfile="${dockerfile_dir}/pytorch-npu-builder.Dockerfile"
 
+    log_verbose "Script dir: ${script_dir}"
+    log_verbose "Project root: ${project_root}"
+    log_verbose "Dockerfile dir: ${dockerfile_dir}"
+
     if [[ ! -f "$dockerfile" ]]; then
         log_error "Dockerfile 不存在: $dockerfile"
         exit 1

From 96cbeffc44b125f723690675424108cea30d789e Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 6 May 2026 11:48:38 +0800
Subject: [PATCH 069/132] Fix docker buildx command formatting issue

Problem: Multi-line string with newlines and indentation caused shell
to interpret each line as separate command when using eval.

Solution: Use single-line format for build_args and build_cmd to avoid
newline interpretation issues.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/scripts/build_image.sh | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/.github/scripts/build_image.sh b/.github/scripts/build_image.sh
index c41cc1c3bb..090b474351 100755
--- a/.github/scripts/build_image.sh
+++ b/.github/scripts/build_image.sh
@@ -369,21 +369,11 @@ build_image() {
 
     log_verbose "Dockerfile: $dockerfile"
 
-    # 构建参数
-    local build_args="
-        --build-arg CANN_TOOLKIT_URL=${CANN_TOOLKIT_URL}
-        --build-arg CANN_A3OPS_URL=${CANN_A3OPS_URL}
-        --build-arg CANN_NNAL_URL=${CANN_NNAL_URL}
-        --build-arg CANN_VERSION=${CANN_VERSION_FULL}
-    "
-
-    # 构建命令
-    local build_cmd="docker buildx build \
-        ${build_args} \
-        ${tag_args} \
-        --file ${dockerfile} \
-        --platform linux/arm64 \
-        ${dockerfile_dir}"
+    # 构建参数（单行格式，避免换行符问题）
+    local build_args="--build-arg CANN_TOOLKIT_URL=${CANN_TOOLKIT_URL} --build-arg CANN_A3OPS_URL=${CANN_A3OPS_URL} --build-arg CANN_NNAL_URL=${CANN_NNAL_URL} --build-arg CANN_VERSION=${CANN_VERSION_FULL}"
+
+    # 构建命令（单行格式）
+    local build_cmd="docker buildx build ${build_args} ${tag_args} --file ${dockerfile} --platform linux/arm64 ${dockerfile_dir}"
 
     if [[ "$PUSH_IMAGE" == "true" ]]; then
         build_cmd+=" --push"

From e5a5727db336c40e4ddd6fdb4ece9ca31c32703f Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 6 May 2026 11:55:52 +0800
Subject: [PATCH 070/132] Fix CANN package URLs: use 9.0.0-beta.2 for stable
 version

Problem: OBS doesn't have Ascend-cann-toolkit_9.0.0_linux-aarch64.run
         (only 9.0.0-beta.2 exists)

Solution: Update version mapping to point both "9.0" and "9.0.0-beta.2"
          to the actual existing beta.2 package URLs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/scripts/build_image.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/build_image.sh b/.github/scripts/build_image.sh
index 090b474351..25763c688e 100755
--- a/.github/scripts/build_image.sh
+++ b/.github/scripts/build_image.sh
@@ -19,7 +19,8 @@ set -euo pipefail
 
 declare -A CANN_VERSIONS=(
     # 版本号 -> toolkit|a3_ops|nnal 的 URL
-    ["9.0"]="https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-toolkit_9.0.0_linux-aarch64.run|https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-A3-ops_9.0.0_linux-aarch64.run|https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-nnal_9.0.0_linux-aarch64.run"
+    # 注意：OBS 上当前只有 9.0.0-beta.2 版本的包
+    ["9.0"]="https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-toolkit_9.0.0-beta.2_linux-aarch64.run|https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-A3-ops_9.0.0-beta.2_linux-aarch64.run|https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-nnal_9.0.0-beta.2_linux-aarch64.run"
 
     ["9.0.0-beta.2"]="https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-toolkit_9.0.0-beta.2_linux-aarch64.run|https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-A3-ops_9.0.0-beta.2_linux-aarch64.run|https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-nnal_9.0.0-beta.2_linux-aarch64.run"
 

From 0f54c7c123b0c273716f72124b3c7ae22702aa99 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 6 May 2026 12:50:31 +0800
Subject: [PATCH 071/132] Add PR trigger for npu-sync-test workflow

Trigger on pull_request events (opened/synchronize/reopened) targeting
main, master, or release branches. Retain manual dispatch option.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/npu-sync-test.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/npu-sync-test.yml b/.github/workflows/npu-sync-test.yml
index 0c81b4172a..4f104fb89b 100644
--- a/.github/workflows/npu-sync-test.yml
+++ b/.github/workflows/npu-sync-test.yml
@@ -1,6 +1,12 @@
 name: PyTorch NPU Sync Test
 
 on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+    branches:
+      - main
+      - master
+      - 'release/**'
   workflow_dispatch:
     inputs:
       docker_image_tag:

From 9d0893d541d07bf7a5edc1f7f20349de81569601 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 6 May 2026 14:08:23 +0800
Subject: [PATCH 072/132] Update default Docker image tag to
 manylinux-cann9.0.0-beta.2-20260428

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/npu-sync-test.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/npu-sync-test.yml b/.github/workflows/npu-sync-test.yml
index 4f104fb89b..1f6c16b88e 100644
--- a/.github/workflows/npu-sync-test.yml
+++ b/.github/workflows/npu-sync-test.yml
@@ -10,8 +10,8 @@ on:
   workflow_dispatch:
     inputs:
       docker_image_tag:
-        description: 'Docker image tag (e.g., manylinux-cann9.0-20260428)'
-        default: 'latest'
+        description: 'Docker image tag (e.g., manylinux-cann9.0.0-beta.2-20260428)'
+        default: 'manylinux-cann9.0.0-beta.2-20260428'
       pytorch_branch:
         description: 'PyTorch branch to build'
         default: 'main'

From fd440d8dc4a8b37e4ec09d8f2d49c21a491fad34 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 6 May 2026 18:08:46 +0800
Subject: [PATCH 073/132] Fix PyTorch 2.11+ API compatibility and optimize
 build workflow

PyTorch 2.11+ compatibility fixes:
- Add support for new DTensor register_single_dim_strategy API
- Fix SavedVariable::unpack parameter type change (std::shared_ptr -> c10::intrusive_ptr)
- Add version-based conditional compilation for backward compatibility

Workflow optimization:
- Configure git proxy before clone for faster submodule downloads
- Use --recurse-submodules to download all submodules in one pass

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml        |  31 ++---
 torch_npu/_compat/distributed.py              |  26 +++++
 .../distributed/tensor/_pointwise_ops.py      | 108 ++++++++++++++++--
 torchnpugen/autograd/templates/Functions.h    |  36 ++++++
 4 files changed, 178 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index fe0570f6d3..8f56d5f7c2 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -61,7 +61,19 @@ jobs:
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
           source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
 
-      - name: Clone upstream PyTorch main
+      - name: Configure git proxy for faster clone
+        run: |
+          # 配置 git URL rewrite 来使用代理（加速 clone 和 submodules）
+          if [ -n "${{ env.GH_PROXY_URL }}" ]; then
+            git config --global url."${{ env.GH_PROXY_URL }}/https://github.com/".insteadOf "https://github.com/"
+            git config --global url."${{ env.GH_PROXY_URL }}/https://gitlab.com/".insteadOf "https://gitlab.com/"
+            echo "Git proxy configured:"
+            git config --global --list | grep url
+          else
+            echo "No proxy configured, using direct connection"
+          fi
+
+      - name: Clone upstream PyTorch with submodules
         id: clone_pytorch
         run: |
           # 使用代理加速 git clone（如果配置了 GH_PROXY_URL）
@@ -71,23 +83,14 @@ jobs:
             echo "Using proxy: ${PYTORCH_REPO}"
           fi
 
-          git clone --depth=1 --branch ${{ inputs.pytorch_branch }} \
+          # --recurse-submodules 同时下载所有 submodules（使用已配置的 git proxy）
+          git clone --depth=1 --recurse-submodules --branch ${{ inputs.pytorch_branch }} \
             "${PYTORCH_REPO}" pytorch-src
           PYTORCH_SHA=$(cd pytorch-src && git rev-parse HEAD)
           echo "pytorch_sha=${PYTORCH_SHA}" >> $GITHUB_OUTPUT
           echo "Cloned PyTorch commit: ${PYTORCH_SHA}"
-
-      - name: Configure git proxy for faster clone
-        run: |
-          # 配置 git URL rewrite 来使用代理（加速 submodules clone）
-          if [ -n "${{ env.GH_PROXY_URL }}" ]; then
-            git config --global url."${{ env.GH_PROXY_URL }}/https://github.com/".insteadOf "https://github.com/"
-            git config --global url."${{ env.GH_PROXY_URL }}/https://gitlab.com/".insteadOf "https://gitlab.com/"
-            echo "Git proxy configured:"
-            git config --global --list | grep url
-          else
-            echo "No proxy configured, using direct connection"
-          fi
+          echo "Submodules downloaded:"
+          ls -la pytorch-src/third_party/ | head -20
 
       - name: Checkout torch_npu
         uses: actions/checkout@v4
diff --git a/torch_npu/_compat/distributed.py b/torch_npu/_compat/distributed.py
index a40a66d664..6d71a3bd3b 100644
--- a/torch_npu/_compat/distributed.py
+++ b/torch_npu/_compat/distributed.py
@@ -7,3 +7,29 @@
     from torch.distributed.tensor._ops.utils import register_op_strategy, register_prop_rule
 else:
     from torch.distributed.tensor._ops.registration import register_op_strategy, register_prop_rule
+
+# COMPAT(>= 2.11): pointwise_strategy removed in PyTorch 2.11
+#   New API uses register_single_dim_strategy and _register_single_dim_pointwise
+# CAN REMOVE else branch when MIN_SUPPORTED >= (2, 11)
+if CURRENT_VERSION >= (2, 11):
+    try:
+        from torch.distributed.tensor._ops.single_dim_strategy import (
+            register_single_dim_strategy,
+            _ShardingPlaceholder,
+        )
+        # pointwise_strategy is no longer available, use register_single_dim_strategy instead
+        pointwise_strategy = None
+    except ImportError:
+        # Fallback for older PyTorch versions without single_dim_strategy
+        pointwise_strategy = None
+        register_single_dim_strategy = None
+        _ShardingPlaceholder = None
+else:
+    try:
+        from torch.distributed.tensor._ops._pointwise_ops import pointwise_strategy
+        register_single_dim_strategy = None
+        _ShardingPlaceholder = None
+    except ImportError:
+        pointwise_strategy = None
+        register_single_dim_strategy = None
+        _ShardingPlaceholder = None
diff --git a/torch_npu/distributed/tensor/_pointwise_ops.py b/torch_npu/distributed/tensor/_pointwise_ops.py
index 931ca340f8..a4b2413618 100644
--- a/torch_npu/distributed/tensor/_pointwise_ops.py
+++ b/torch_npu/distributed/tensor/_pointwise_ops.py
@@ -1,13 +1,15 @@
-
 import torch
 from torch.distributed.tensor._op_schema import OpSchema, RuntimeSchemaInfo
-from torch_npu._compat.distributed import register_op_strategy
-from torch.distributed.tensor._ops._pointwise_ops import pointwise_strategy
-
+from torch_npu._compat.distributed import (
+    register_op_strategy,
+    pointwise_strategy,
+    register_single_dim_strategy,
+    _ShardingPlaceholder,
+)
+from torch_npu._compat.version import CURRENT_VERSION
 
 npu = torch.ops.npu
 
-
 custom_pointwise_ops = {
     npu.npu_dtype_cast.default: 0,
     npu._npu_dtype_cast.default: 0,
@@ -16,12 +18,100 @@
 }
 
 
+def _custom_pointwise_strategy_new(
+    op: torch._ops.OpOverload,
+    args_schema,
+    kwargs_schema,
+):
+    """
+    New strategy function for PyTorch 2.11+ using register_single_dim_strategy.
+    Returns placements list following the new API format: list[list[Placement | _ShardingPlaceholder]]
+
+    For pointwise ops, all tensors should shard on the same dimension.
+    This matches the behavior of pointwise_strategy(linearity=0) in older PyTorch.
+    """
+    from torch.distributed.tensor.placement_types import Replicate
+    from torch.distributed.tensor._dtensor_spec import TensorMeta
+
+    # Get tensor arguments
+    tensor_args = [arg for arg in args_schema if isinstance(arg, TensorMeta)]
+    if not tensor_args:
+        # No tensor args, return replicate strategy
+        return [[Replicate()]]
+
+    # Get common shape (broadcasted)
+    common_shape = torch.broadcast_shapes(*[arg.shape for arg in tensor_args])
+
+    num_outputs = 1  # Most pointwise ops have single output
+    placements = []
+
+    # For each dimension, create a sharding strategy
+    for dim_idx in range(len(common_shape)):
+        strategy = [_ShardingPlaceholder(dim_idx)] * num_outputs
+        for arg in tensor_args:
+            # Map common dim to arg dim (handling broadcast)
+            common_dim_to_arg_dim = _infer_broadcast_dims_map(common_shape, arg.shape)
+            if common_dim_to_arg_dim[dim_idx] >= 0:
+                strategy.append(_ShardingPlaceholder(common_dim_to_arg_dim[dim_idx]))
+            else:
+                strategy.append(Replicate())
+        placements.append(strategy)
+
+    # Add replicate strategy
+    replicate_strategy = [Replicate()] * (num_outputs + len(tensor_args))
+    placements.insert(0, replicate_strategy)
+
+    return placements
+
+
+def _infer_broadcast_dims_map(common_shape, arg_shape):
+    """Map dimensions from common shape to argument shape for broadcast handling."""
+    # Simple implementation: map matching dimensions, -1 for broadcast dimensions
+    result = []
+    arg_ndim = len(arg_shape)
+    common_ndim = len(common_shape)
+
+    for common_idx in range(common_ndim):
+        # Calculate corresponding arg index
+        arg_idx = common_idx - (common_ndim - arg_ndim)
+        if arg_idx >= 0 and arg_idx < arg_ndim:
+            # Check if dimension matches (not broadcast)
+            if arg_shape[arg_idx] == common_shape[common_idx] or arg_shape[arg_idx] == 1:
+                result.append(arg_idx)
+            else:
+                result.append(-1)
+        else:
+            result.append(-1)
+
+    return result
+
+
 def custom_pointwise_strategy(op_schema: OpSchema):
+    """Legacy strategy function for PyTorch < 2.11."""
     op_type = custom_pointwise_ops.get(op_schema.op, -1)
     return pointwise_strategy(op_schema, linearity=op_type)
 
 
-for op in custom_pointwise_ops:
-    register_op_strategy(
-        op, schema_info=RuntimeSchemaInfo(static_kwargkey=["out"])
-    )(custom_pointwise_strategy)
+# Register strategies based on PyTorch version
+if CURRENT_VERSION >= (2, 11):
+    # Use new register_single_dim_strategy API for PyTorch 2.11+
+    if register_single_dim_strategy is not None:
+        for op in custom_pointwise_ops:
+            register_single_dim_strategy(
+                op,
+                schema_info=RuntimeSchemaInfo(static_kwargkey=["out"])
+            )(_custom_pointwise_strategy_new)
+    else:
+        # Fallback if register_single_dim_strategy is not available
+        # This should not happen in PyTorch 2.11+, but we keep it for safety
+        pass
+else:
+    # Use legacy pointwise_strategy for PyTorch < 2.11
+    if pointwise_strategy is not None:
+        for op in custom_pointwise_ops:
+            register_op_strategy(
+                op, schema_info=RuntimeSchemaInfo(static_kwargkey=["out"])
+            )(custom_pointwise_strategy)
+    else:
+        # Fallback if pointwise_strategy is not available
+        pass
\ No newline at end of file
diff --git a/torchnpugen/autograd/templates/Functions.h b/torchnpugen/autograd/templates/Functions.h
index bb822779e6..d732e524c9 100644
--- a/torchnpugen/autograd/templates/Functions.h
+++ b/torchnpugen/autograd/templates/Functions.h
@@ -13,6 +13,20 @@
 
 #include <c10/core/SymIntArrayRef.h>
 
+// COMPAT(>= 2.11): SavedVariable::unpack parameter type changed from std::shared_ptr<Node>
+// to c10::intrusive_ptr<Node> in PyTorch 2.11
+// We detect the change by checking if TORCH_VERSION_MAJOR >= 2 and TORCH_VERSION_MINOR >= 11
+#if defined(TORCH_VERSION_MAJOR) && defined(TORCH_VERSION_MINOR)
+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 11
+#define USE_INTRUSIVE_PTR_FOR_UNPACK 1
+#else
+#define USE_INTRUSIVE_PTR_FOR_UNPACK 0
+#endif
+#else
+// Fallback: assume new API if version macros not defined (likely newer PyTorch)
+#define USE_INTRUSIVE_PTR_FOR_UNPACK 1
+#endif
+
 using namespace torch::autograd;
 
 namespace at_npu { namespace autograd { namespace generated {
@@ -27,6 +41,27 @@ using at::ScalarType;
 using c10::optional;
 using c10::fmap;
 
+#if USE_INTRUSIVE_PTR_FOR_UNPACK
+inline std::vector<Tensor> unpack_list(at::ArrayRef<SavedVariable> xs, c10::intrusive_ptr<Node> saved_for = nullptr)
+{
+    // NB: we must explicitly do the conversion in the lambda, otherwise template
+    // deduction will give a Tensor of Variable which is not convertible
+    return fmap(xs, [&saved_for](const SavedVariable& x) {
+        return static_cast<Tensor>(x.unpack(saved_for));
+    });
+}
+
+inline c10::List<c10::optional<Tensor>> unpack_opt_list(at::ArrayRef<SavedVariable> xs, c10::intrusive_ptr<Node> saved_for = nullptr)
+{
+    torch::List<c10::optional<Tensor>> result;
+    result.reserve(xs.size());
+    for (const SavedVariable& v : xs) {
+        auto var = v.unpack(saved_for);
+        result.push_back(var.defined() ? c10::optional<Tensor>(var) : c10::nullopt);
+    }
+    return result;
+}
+#else
 inline std::vector<Tensor> unpack_list(at::ArrayRef<SavedVariable> xs, std::shared_ptr<Node> saved_for = nullptr)
 {
     // NB: we must explicitly do the conversion in the lambda, otherwise template
@@ -46,6 +81,7 @@ inline c10::List<c10::optional<Tensor>> unpack_opt_list(at::ArrayRef<SavedVariab
     }
     return result;
 }
+#endif
 
 
 struct TypeAndSize {

From ba7ecf180f26d8577f396768f2c0356f07c6f51e Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 6 May 2026 18:56:20 +0800
Subject: [PATCH 074/132] Fix _collect.yml: add Verify NPU step and improve
 dependency installation

- Add Verify NPU availability step to check CANN, NNAL, and NPU status
- Add upstream PyTorch test dependencies installation
- Add PyTorch development dependencies (expecttest, hypothesis, etc.)
- Fix --test-dir parameter to use pytorch-src/test (correct path after extraction)
- Add NNAL environment sourcing in collect step
- Add npu-smi info output before collection

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_collect.yml | 62 +++++++++++++++++++++++++++++++++-
 1 file changed, 61 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/_collect.yml b/.github/workflows/_collect.yml
index ac3a7a90e7..81c7d22dbf 100644
--- a/.github/workflows/_collect.yml
+++ b/.github/workflows/_collect.yml
@@ -115,8 +115,64 @@ jobs:
 
       - name: Install test dependencies
         run: |
+          # 安装当前仓库的基础测试依赖
           pip${{ env.PYTHON_VERSION }} install -r .ci/docker/requirements-ci.txt
 
+          # 安装上游 PyTorch 的完整测试依赖（从已下载的源码）
+          pip${{ env.PYTHON_VERSION }} install -r pytorch-src/.ci/docker/requirements-ci.txt || true
+
+          # 安装 PyTorch 开发依赖（可选）
+          pip${{ env.PYTHON_VERSION }} install expecttest hypothesis fsspec sympy optree networkx psutil || true
+
+      # ==================== Verify NPU availability ====================
+      - name: Verify NPU availability
+        continue-on-error: true
+        run: |
+          # 切换到 /tmp 目录，避免 pytorch-src 源码目录干扰 torch 导入
+          cd /tmp
+
+          echo "=== CANN Directory Contents ==="
+          ls -la /usr/local/Ascend/ || echo "Ascend directory not found"
+          echo ""
+          echo "=== CANN Version Info ==="
+          if [ -d /usr/local/Ascend/cann ]; then
+            ls -la /usr/local/Ascend/cann/
+            echo ""
+            echo "=== CANN Version File ==="
+            cat /usr/local/Ascend/cann/version.info 2>/dev/null || echo "version.info not found"
+          else
+            echo "CANN directory not found"
+          fi
+          echo ""
+          echo "=== NNAL Directory Contents ==="
+          if [ -d /usr/local/Ascend/nnal ]; then
+            ls -la /usr/local/Ascend/nnal/
+          else
+            echo "NNAL directory not found"
+          fi
+
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          echo ""
+          echo "=== NPU-SMI Info ==="
+          npu-smi info || echo "npu-smi not available"
+
+          PYTHON=python${{ env.PYTHON_VERSION }}
+          echo ""
+          echo "=== PyTorch and NPU Info ==="
+          $PYTHON -c "
+          import torch
+          print(f'torch: {torch.__version__}')
+          import torch_npu
+          print(f'torch_npu: {torch_npu.__version__}')
+          print(f'NPU available: {torch.npu.is_available()}')
+          print(f'NPU count: {torch.npu.device_count()}')
+          if torch.npu.is_available():
+              print(f'NPU name: {torch.npu.get_device_name(0)}')
+              print(f'NPU capability: {torch.npu.get_device_capability(0)}')
+          " 2>&1 || echo "torch/torch_npu import failed - this is expected if torch and torch_npu versions are mismatched"
+
       # ==================== 保存 pip 缓存 ====================
       - name: Save pip cache
         if: always()
@@ -129,9 +185,13 @@ jobs:
         id: collect
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          echo "=== NPU-SMI Info ==="
+          npu-smi info || echo "npu-smi not available"
 
           python${{ env.PYTHON_VERSION }} .github/scripts/collect_all_cases.py \
-            --test-dir test \
+            --test-dir pytorch-src/test \
             --distributed-shards ${{ inputs.distributed-shards }} \
             --regular-shards ${{ inputs.regular-shards }} \
             --output-dir cases_shards \

From 7a557fa0451c67cb305509dfd99b005360b15c5c Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 6 May 2026 19:04:38 +0800
Subject: [PATCH 075/132] Split torch_npu build step into two separate steps

- Step 1: Install PyTorch wheel and build dependencies
  - Source CANN/NNAL environment
  - Install built PyTorch wheel
  - Verify PyTorch installation
  - Install torch_npu build dependencies (cmake, ninja, numpy, etc.)
  - Show ccache stats before build

- Step 2: Build torch_npu wheel
  - Source CANN/NNAL environment
  - Set MAX_JOBS=128
  - Run ci/build.sh with --disable_torchair
  - Show build results and ccache stats

This separation improves clarity and makes debugging easier.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index 8f56d5f7c2..2aca4adb71 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -221,20 +221,37 @@ jobs:
           ccache --show-stats
 
       # ==================== 构建 torch_npu ====================
-      - name: Build torch_npu wheel
+      - name: Install PyTorch wheel and build dependencies
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
           source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
 
-          # 安装刚编译的 PyTorch
+          echo "=== Installing built PyTorch wheel ==="
           pip${{ inputs.python_version }} install pytorch-src/dist/*.whl
 
-          cd torch_npu-src
+          echo ""
+          echo "=== Verifying PyTorch installation ==="
+          python${{ inputs.python_version }} -c "import torch; print(f'torch version: {torch.__version__}')"
 
-          # 安装构建依赖
+          echo ""
+          echo "=== Installing torch_npu build dependencies ==="
           pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
           pip${{ inputs.python_version }} install cmake ninja numpy packaging pyyaml requests six typing-extensions
 
+          cd torch_npu-src
+
+          # 显示 ccache 统计（依赖安装阶段）
+          echo ""
+          echo "=== ccache Statistics (before torch_npu build) ==="
+          ccache --show-stats
+
+      - name: Build torch_npu wheel
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          cd torch_npu-src
+
           export MAX_JOBS=128
 
           # 禁用 torchair 构建（上游 PyTorch main API 变化导致兼容性问题）

From 8ff7b58f2e16534c0a45fd1917aaf668d15d4913 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 6 May 2026 19:05:04 +0800
Subject: [PATCH 076/132] Remove pull_request trigger from test-collect.yml

Only keep workflow_dispatch trigger to allow manual execution.
This workflow depends on artifacts from previous build workflow,
so it cannot be triggered directly by pull_request events.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index 340f0042c8..705bf678e0 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -1,11 +1,6 @@
 name: Test Collect Stage
 
 on:
-  pull_request:
-    branches: [main, master]
-    paths:
-      - '.github/workflows/**'
-      - '.github/scripts/**'
   workflow_dispatch:
     inputs:
       run_id:

From 9ef8e06dfe51b911596daf3cd062d35ce0cc815b Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 6 May 2026 19:24:20 +0800
Subject: [PATCH 077/132] Fix ccache configuration and improve cache hit rate

1. Fix max_size configuration:
   - Write config directly to ~/.cache/ccache/ccache.conf file
   - This ensures priority over /etc/ccache.conf secondary config
   - Set max_size = 20G (was showing 5G due to secondary config override)
   - Add compression_level = 6 for better balance

2. Fix cache key strategy:
   - Remove PyTorch SHA from cache key (it changes every time due to --depth=1 clone)
   - Use torch_npu SHA + requirements-build.txt hash as cache key
   - Add base_key for broader cache restoration

3. Ensure ccache is actually used:
   - Set CC and CXX environment variables to use ccache symlinks
   - Clear stats before PyTorch build (--zero-stats)
   - Export CC/CXX in both PyTorch and torch_npu build steps

4. Add verification:
   - Show config file contents after setup
   - This helps debug if max_size is correctly applied

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml | 49 ++++++++++++++++++++------
 1 file changed, 39 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index 2aca4adb71..66d7b3a18c 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -138,15 +138,20 @@ jobs:
 
       # ==================== ccache 缓存配置 ====================
       # ccache 是真正加速编译的关键（可节省 30-60 分钟）
+      # 注意：PyTorch 每次 clone 都是新 commit，所以缓存键不包含 PyTorch SHA
+      # 我们依赖 torch_npu SHA 和 requirements-build.txt hash 作为缓存键
       - name: Get ccache key
         id: ccache_key
         run: |
-          # ccache 缓存键基于：requirements-build.txt + PyTorch SHA + torch_npu SHA
-          PYTORCH_SHA="${{ steps.clone_pytorch.outputs.pytorch_sha }}"
+          # ccache 缓存键：torch_npu SHA + requirements hash
+          # PyTorch SHA 每次都变化（--depth=1 clone 最新），所以不包含在缓存键中
           TORCH_NPU_SHA=$(cd torch_npu-src && git rev-parse HEAD)
           REQUIREMENTS_HASH=$(cd pytorch-src && sha256sum requirements-build.txt | cut -d' ' -f1)
-          echo "cache_key=${{ env.CACHE_VERSION }}-ccache-${REQUIREMENTS_HASH}-${PYTORCH_SHA}-${TORCH_NPU_SHA}" >> $GITHUB_OUTPUT
-          echo "partial_key=${{ env.CACHE_VERSION }}-ccache-${REQUIREMENTS_HASH}-${PYTORCH_SHA}-" >> $GITHUB_OUTPUT
+          echo "cache_key=${{ env.CACHE_VERSION }}-ccache-${REQUIREMENTS_HASH}-${TORCH_NPU_SHA}" >> $GITHUB_OUTPUT
+          # partial_key 用于恢复同版本 requirements 的缓存（不同 torch_npu 版本）
+          echo "partial_key=${{ env.CACHE_VERSION }}-ccache-${REQUIREMENTS_HASH}-" >> $GITHUB_OUTPUT
+          # base_key 用于恢复同 CACHE_VERSION 的所有缓存
+          echo "base_key=${{ env.CACHE_VERSION }}-ccache-" >> $GITHUB_OUTPUT
 
       - name: Restore ccache
         uses: actions/cache/restore@v4
@@ -155,21 +160,26 @@ jobs:
           key: ${{ steps.ccache_key.outputs.cache_key }}
           restore-keys: |
             ${{ steps.ccache_key.outputs.partial_key }}
-            ${{ env.CACHE_VERSION }}-ccache-
+            ${{ steps.ccache_key.outputs.base_key }}
 
       - name: Setup ccache
         run: |
           # 安装 ccache（manylinux 镜像没有预装）
           yum install -y ccache
 
-          # 配置 ccache
-          ccache --set-config=cache_dir=~/.cache/ccache
-          ccache --set-config=max_size=20G
-          ccache --set-config=compression=true
+          # 创建 ccache 配置目录
+          mkdir -p ~/.cache/ccache
+
+          # 直接写入配置文件（确保优先级高于 /etc/ccache.conf）
+          cat > ~/.cache/ccache/ccache.conf << 'EOF'
+          max_size = 20G
+          cache_dir = ~/.cache/ccache
+          compression = true
+          compression_level = 6
+          EOF
 
           # 使用符号链接方式让 ccache 模拟 gcc/g++
           # 这是推荐的方式，避免 CMake 直接把编译选项传给 ccache
-          # CMake 会调用 gcc，但实际上通过符号链接调用 ccache
           mkdir -p /usr/local/bin
           ln -sf /usr/bin/ccache /usr/local/bin/gcc
           ln -sf /usr/bin/ccache /usr/local/bin/g++
@@ -180,9 +190,17 @@ jobs:
           echo "PATH=/usr/local/bin:$PATH" >> $GITHUB_ENV
           echo "CCACHE_DIR=~/.cache/ccache" >> $GITHUB_ENV
 
+          # 设置编译器环境变量，确保 CMake/Ninja 使用 ccache
+          echo "CC=/usr/local/bin/gcc" >> $GITHUB_ENV
+          echo "CXX=/usr/local/bin/g++" >> $GITHUB_ENV
+
           echo "=== ccache Configuration ==="
           ccache --show-config
 
+          echo ""
+          echo "=== Config File Contents ==="
+          cat ~/.cache/ccache/ccache.conf
+
           echo ""
           echo "=== Symbolic Links ==="
           ls -la /usr/local/bin/gcc /usr/local/bin/g++
@@ -211,6 +229,13 @@ jobs:
           export USE_OPENMP=1
           export USE_MKLDNN=0
 
+          # 确保使用 ccache（CMake 会检测 CC/CXX 环境变量）
+          export CC=/usr/local/bin/gcc
+          export CXX=/usr/local/bin/g++
+
+          # 清除 ccache 统计（开始新的构建）
+          ccache --zero-stats
+
           python${{ inputs.python_version }} setup.py build bdist_wheel
 
           echo "PyTorch wheel built:"
@@ -254,6 +279,10 @@ jobs:
 
           export MAX_JOBS=128
 
+          # 确保使用 ccache
+          export CC=/usr/local/bin/gcc
+          export CXX=/usr/local/bin/g++
+
           # 禁用 torchair 构建（上游 PyTorch main API 变化导致兼容性问题）
           bash ci/build.sh --python=${{ inputs.python_version }} --disable_torchair
 

From 68e5ed9dfab5a93b86cd80e753bec7fbde7fb764 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 6 May 2026 19:30:45 +0800
Subject: [PATCH 078/132] Fix VariableFallbackKernel.cpp for PyTorch 2.11+
 autograd API changes

PyTorch 2.11+ API changes:
- torch::autograd::deleteNode is removed (intrusive_ptr handles cleanup)
- Node uses c10::intrusive_ptr instead of std::shared_ptr
- set_history requires c10::intrusive_ptr<Node> parameter

Changes:
- Add version detection via CURRENT_VERSION macro
- For PyTorch 2.11+: use c10::intrusive_ptr and c10::make_intrusive
- For PyTorch < 2.11: keep std::shared_ptr and deleteNode
- Add conditional compilation (#if USE_INTRUSIVE_PTR_FOR_NODE)

This fixes compilation errors:
- 'deleteNode' is not a member of 'torch::autograd'
- no matching function for call to 'set_history'

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../csrc/aten/VariableFallbackKernel.cpp      | 138 ++++++++++++++++++
 1 file changed, 138 insertions(+)

diff --git a/torch_npu/csrc/aten/VariableFallbackKernel.cpp b/torch_npu/csrc/aten/VariableFallbackKernel.cpp
index 25d96107e4..b3bcc8872f 100644
--- a/torch_npu/csrc/aten/VariableFallbackKernel.cpp
+++ b/torch_npu/csrc/aten/VariableFallbackKernel.cpp
@@ -10,6 +10,15 @@
 
 #include "torch_npu/csrc/core/npu/NPUException.h"
 
+// COMPAT(>= 2.11): Handle PyTorch autograd API changes
+#include <torch_npu/_compat/version.h>
+#if CURRENT_VERSION >= (2, 11)
+// In PyTorch 2.11+, Node uses c10::intrusive_ptr and deleteNode is removed
+#define USE_INTRUSIVE_PTR_FOR_NODE 1
+#else
+#define USE_INTRUSIVE_PTR_FOR_NODE 0
+#endif
+
 /*
  * This file implements a variable fallback kernel for custom operators.
  * Since tensors always have the Autograd set, but custom operators
@@ -100,6 +109,134 @@ auto WarnNotImplemented::apply(variable_list&& inputs) -> variable_list
     return output;
 }
 
+#if USE_INTRUSIVE_PTR_FOR_NODE
+// PyTorch 2.11+ uses c10::intrusive_ptr for Node
+static void npuBasicAutogradNotImplementedFallbackImpl(
+    const c10::OperatorHandle& op,
+    c10::DispatchKeySet dispatch_keys,
+    torch::jit::Stack* stack)
+{
+    const auto& schema = op.schema();
+    const auto& op_name = schema.operator_name().name;
+    const auto num_arguments = schema.arguments().size();
+    const auto num_returns = schema.returns().size();
+    const auto stack_start = stack->size() - num_arguments;
+
+    if (torch::autograd::getAutogradFallbackMode() == torch::autograd::AutogradFallbackMode::Nothing) {
+        op.redispatchBoxed(dispatch_keys & c10::after_autograd_keyset, stack);
+        return;
+    }
+    TORCH_INTERNAL_ASSERT(
+        torch::autograd::getAutogradFallbackMode() == torch::autograd::AutogradFallbackMode::Warn);
+
+    bool any_input_requires_grad = false;
+    _foreach_tensor(
+        [&](size_t _, size_t idx_arg, const at::Tensor& t) {
+            if (t.requires_grad()) {
+            any_input_requires_grad = true;
+            }
+        },
+        stack,
+        stack_start,
+        num_arguments);
+    // Optimization: TLS access can be slow. So we only check if it necessary
+    // by putting it after the requires_grad checks.
+    any_input_requires_grad = any_input_requires_grad && at::GradMode::is_enabled();
+
+    c10::intrusive_ptr<WarnNotImplemented> grad_fn;
+    if (any_input_requires_grad) {
+        // NB: It is standard to collect edges from all tensors
+        // (see generated/VariableTypeEverything.cpp for examples)
+        std::vector<const at::Tensor*> all_tensors_on_stack;
+        _foreach_tensor(
+            [&](size_t _, size_t idx_arg, const at::Tensor& t) {
+            all_tensors_on_stack.push_back(&t);
+            },
+            stack,
+            stack_start,
+            num_arguments);
+        // In PyTorch 2.11+, use intrusive_ptr directly (no deleteNode needed)
+        grad_fn = c10::make_intrusive<WarnNotImplemented>(
+            op_name, all_tensors_on_stack.size());
+        grad_fn->set_next_edges(torch::autograd::collect_next_edges(all_tensors_on_stack));
+    }
+
+    op.redispatchBoxed(dispatch_keys & c10::after_autograd_keyset, stack);
+
+    if (any_input_requires_grad) {
+        // NB: if the operator mutates any inputs in-place and does not return them
+        // as outputs, we are unable to lazily raise a warning. This is OK because
+        // we don't expect many existing operators to do this because of the amount
+        // of technical expertise necessary (you would need to manually register an
+        // autograd kernel without using autograd.Function)
+        _foreach_tensor(
+            [&](size_t _, size_t idx_ret, const at::Tensor& t) {
+            if (!torch::autograd::isDifferentiableType(t.scalar_type())) {
+                return;
+            }
+            const bool is_mutable_output =
+                schema.is_aliasing({c10::SchemaArgType::output, idx_ret}) &&
+                schema.is_mutable({c10::SchemaArgType::output, idx_ret});
+
+            // If the post-autograd implementation returns Tensors that require
+            // grad, then we install a hook that will warn during the backwards.
+            //
+            // NB: If the operation is inplace and the inputs were views,
+            // it is possible that the history was rebased and the hook will
+            // not warn in all places where it should. That is, the following
+            // won't warn:
+            // >>> x = torch.randn(3, 3, requires_grad=True)
+            // >>> z = x.clone()
+            // >>> w = z[0]
+            // >>> k = w[0]
+            // >>> y = op(k)
+            // >>> torch.autograd.grad(z.sum(), w)
+            if (t.requires_grad()) {
+                t.register_hook([op_name](const at::Tensor& grad) {
+                warnAutogradNotImplemented(op_name);
+                });
+                // If history is rebased, then we will attempt to warn
+                // on the view's base. This will catch most cases (because
+                // users typically call .backward() and backprop through
+                // the entire program).
+                if (t.is_view() && is_mutable_output) {
+                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+                auto& base = const_cast<at::TensorBase&>(t._base());
+                if (base.requires_grad()) {
+                    // Can only register_hook on tensors that require grad.
+                    base.register_hook([op_name](const at::TensorBase& grad) {
+                    warnAutogradNotImplemented(op_name);
+                    });
+                }
+                }
+                return;
+            }
+
+            // If the post-autograd implementation returns any Tensors that
+            // don't require grad, then we install the WarnNotImplemented grad_fn.
+            // This grad_fn warns in backward and returns undefined tensor
+            // gradients.
+            //
+            // NOTE [autograd fallback and in-place operations]
+            // If the schema says the output is mutable, and the output
+            // is an input, and the input is a view Tensor, then...
+            // we're not sure if set_history is OK to do, so we just skip
+            // adding the grad_fn. Builtin operators do rebase_history here,
+            // but custom operators may have multiple Tensor(a!) returns,
+            // rebase_history assumes single Tensor(a!) return, and in general
+            // custom ops don't have a good in-place story.
+            if (!is_mutable_output) {
+                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+                torch::autograd::set_history(const_cast<at::Tensor&>(t), grad_fn);
+            }
+            },
+            stack,
+            stack->size() - num_returns,
+            num_returns);
+    }
+}
+#else
+// PyTorch < 2.11 uses std::shared_ptr for Node
 static void npuBasicAutogradNotImplementedFallbackImpl(
     const c10::OperatorHandle& op,
     c10::DispatchKeySet dispatch_keys,
@@ -224,6 +361,7 @@ static void npuBasicAutogradNotImplementedFallbackImpl(
             num_returns);
     }
 }
+#endif
 
 // Register fallthrough for Autograd backends dispatch keys
 // NB: But not the private use ones; maybe the extension wants

From 0b1645b94c30457837b11c2398a9dfd6efb336dd Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 6 May 2026 19:41:05 +0800
Subject: [PATCH 079/132] Fix ccache and pip cache paths: use absolute paths
 instead of ~

Root cause analysis:
- ~ (tilde) doesn't expand in $GITHUB_ENV or GitHub Actions cache paths
- CCACHE_DIR=~/.cache/ccache resulted in ccache using default config
- max_size showed 5G instead of configured 20G because config file wasn't read

Changes:
1. Replace all ~ paths with absolute paths /root/.cache/...
   - pip cache: ~/.cache/pip -> /root/.cache/pip
   - ccache: ~/.cache/ccache -> /root/.cache/ccache

2. Fix CCACHE_DIR environment variable:
   - Old: echo "CCACHE_DIR=~/.cache/ccache" >> $GITHUB_ENV (not expanded)
   - New: echo "CCACHE_DIR=$CCACHE_DIR_PATH" >> $GITHUB_ENV (expanded)

3. Fix ccache config file:
   - Old: cache_dir = ~/.cache/ccache
   - New: cache_dir = /root/.cache/ccache

4. Ensure CCACHE_DIR is set in all build steps:
   - PyTorch build: export CCACHE_DIR=/root/.cache/ccache
   - torch_npu build: export CCACHE_DIR=/root/.cache/ccache
   - ccache --show-stats calls: CCACHE_DIR=/root/.cache/ccache ccache --show-stats

5. Add verification output:
   - Show cache directory contents after setup
   - This helps debug if config is correctly applied

Expected results after fix:
- max_size should show 20G (not 5G)
- Cache should be properly restored from GitHub Actions cache
- Cache hit rate should improve on subsequent builds with same torch_npu SHA

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml | 44 +++++++++++++++-----------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index 66d7b3a18c..70a6c1d739 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -110,7 +110,7 @@ jobs:
       - name: Restore pip cache
         uses: actions/cache/restore@v4
         with:
-          path: ~/.cache/pip
+          path: /root/.cache/pip
           key: ${{ steps.pip_key.outputs.cache_key }}
           restore-keys: |
             ${{ env.CACHE_VERSION }}-pip-${{ inputs.python_version }}-
@@ -118,7 +118,7 @@ jobs:
 
       - name: Setup pip cache directory
         run: |
-          mkdir -p ~/.cache/pip
+          mkdir -p /root/.cache/pip
 
       - name: Configure pip index URL
         run: |
@@ -156,7 +156,7 @@ jobs:
       - name: Restore ccache
         uses: actions/cache/restore@v4
         with:
-          path: ~/.cache/ccache
+          path: /root/.cache/ccache
           key: ${{ steps.ccache_key.outputs.cache_key }}
           restore-keys: |
             ${{ steps.ccache_key.outputs.partial_key }}
@@ -167,19 +167,19 @@ jobs:
           # 安装 ccache（manylinux 镜像没有预装）
           yum install -y ccache
 
-          # 创建 ccache 配置目录
-          mkdir -p ~/.cache/ccache
+          # 创建 ccache 配置目录（使用绝对路径）
+          CCACHE_DIR_PATH="/root/.cache/ccache"
+          mkdir -p "$CCACHE_DIR_PATH"
 
-          # 直接写入配置文件（确保优先级高于 /etc/ccache.conf）
-          cat > ~/.cache/ccache/ccache.conf << 'EOF'
+          # 直接写入配置文件（使用绝对路径）
+          cat > "$CCACHE_DIR_PATH/ccache.conf" << 'EOF'
           max_size = 20G
-          cache_dir = ~/.cache/ccache
+          cache_dir = /root/.cache/ccache
           compression = true
           compression_level = 6
           EOF
 
           # 使用符号链接方式让 ccache 模拟 gcc/g++
-          # 这是推荐的方式，避免 CMake 直接把编译选项传给 ccache
           mkdir -p /usr/local/bin
           ln -sf /usr/bin/ccache /usr/local/bin/gcc
           ln -sf /usr/bin/ccache /usr/local/bin/g++
@@ -188,18 +188,24 @@ jobs:
 
           # 设置 PATH 优先使用符号链接
           echo "PATH=/usr/local/bin:$PATH" >> $GITHUB_ENV
-          echo "CCACHE_DIR=~/.cache/ccache" >> $GITHUB_ENV
+
+          # 设置 CCACHE_DIR（使用绝对路径，不使用 ~）
+          echo "CCACHE_DIR=$CCACHE_DIR_PATH" >> $GITHUB_ENV
 
           # 设置编译器环境变量，确保 CMake/Ninja 使用 ccache
           echo "CC=/usr/local/bin/gcc" >> $GITHUB_ENV
           echo "CXX=/usr/local/bin/g++" >> $GITHUB_ENV
 
           echo "=== ccache Configuration ==="
-          ccache --show-config
+          CCACHE_DIR="$CCACHE_DIR_PATH" ccache --show-config
 
           echo ""
           echo "=== Config File Contents ==="
-          cat ~/.cache/ccache/ccache.conf
+          cat "$CCACHE_DIR_PATH/ccache.conf"
+
+          echo ""
+          echo "=== Cache Directory ==="
+          ls -la "$CCACHE_DIR_PATH/"
 
           echo ""
           echo "=== Symbolic Links ==="
@@ -207,7 +213,7 @@ jobs:
 
           echo ""
           echo "=== ccache Statistics (before build) ==="
-          ccache --show-stats
+          CCACHE_DIR="$CCACHE_DIR_PATH" ccache --show-stats
 
       # ==================== 构建 PyTorch ====================
       - name: Build PyTorch wheel
@@ -232,6 +238,7 @@ jobs:
           # 确保使用 ccache（CMake 会检测 CC/CXX 环境变量）
           export CC=/usr/local/bin/gcc
           export CXX=/usr/local/bin/g++
+          export CCACHE_DIR=/root/.cache/ccache
 
           # 清除 ccache 统计（开始新的构建）
           ccache --zero-stats
@@ -268,7 +275,7 @@ jobs:
           # 显示 ccache 统计（依赖安装阶段）
           echo ""
           echo "=== ccache Statistics (before torch_npu build) ==="
-          ccache --show-stats
+          CCACHE_DIR=/root/.cache/ccache ccache --show-stats
 
       - name: Build torch_npu wheel
         run: |
@@ -282,6 +289,7 @@ jobs:
           # 确保使用 ccache
           export CC=/usr/local/bin/gcc
           export CXX=/usr/local/bin/g++
+          export CCACHE_DIR=/root/.cache/ccache
 
           # 禁用 torchair 构建（上游 PyTorch main API 变化导致兼容性问题）
           bash ci/build.sh --python=${{ inputs.python_version }} --disable_torchair
@@ -298,14 +306,14 @@ jobs:
         if: always()
         uses: actions/cache/save@v4
         with:
-          path: ~/.cache/pip
+          path: /root/.cache/pip
           key: ${{ steps.pip_key.outputs.cache_key }}
 
       - name: Save ccache
         if: always()
         uses: actions/cache/save@v4
         with:
-          path: ~/.cache/ccache
+          path: /root/.cache/ccache
           key: ${{ steps.ccache_key.outputs.cache_key }}
 
       - name: Display cache save status
@@ -313,11 +321,11 @@ jobs:
         run: |
           echo "=== Cache Saved ==="
           echo "pip cache key: ${{ steps.pip_key.outputs.cache_key }}"
-          PIP_CACHE_SIZE=$(du -sh ~/.cache/pip 2>/dev/null | cut -f1)
+          PIP_CACHE_SIZE=$(du -sh /root/.cache/pip 2>/dev/null | cut -f1)
           echo "pip cache size: ${PIP_CACHE_SIZE}"
           echo ""
           echo "ccache key: ${{ steps.ccache_key.outputs.cache_key }}"
-          CCACHE_SIZE=$(du -sh ~/.cache/ccache 2>/dev/null | cut -f1)
+          CCACHE_SIZE=$(du -sh /root/.cache/ccache 2>/dev/null | cut -f1)
           echo "ccache size: ${CCACHE_SIZE}"
 
       # ==================== 打包和上传 ====================

From 61695b3578871b123d3f003d6a22389f17cc7676 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 6 May 2026 20:17:18 +0800
Subject: [PATCH 080/132] Fix VariableFallbackKernel.cpp: use PyTorch version
 macros instead of Python version file

Root cause:
- C++ code cannot include Python version.py file
- fatal error: torch_npu/_compat/version.h: No such file or directory
- torch_npu/_compat/version.py is a Python module, not a C++ header

Solution:
- Use PyTorch's built-in C++ version macros (TORCH_VERSION_MAJOR, TORCH_VERSION_MINOR)
- These macros are defined in torch/csrc/api/include/torch/version.h
- Available in all PyTorch C++ builds

Changes:
- Remove: #include <torch_npu/_compat/version.h> (wrong - Python file)
- Add: Use TORCH_VERSION_MAJOR and TORCH_VERSION_MINOR macros
- Add fallback: Assume newer PyTorch if version macros not defined

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 torch_npu/csrc/aten/VariableFallbackKernel.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/torch_npu/csrc/aten/VariableFallbackKernel.cpp b/torch_npu/csrc/aten/VariableFallbackKernel.cpp
index b3bcc8872f..caa4b70fcb 100644
--- a/torch_npu/csrc/aten/VariableFallbackKernel.cpp
+++ b/torch_npu/csrc/aten/VariableFallbackKernel.cpp
@@ -11,13 +11,18 @@
 #include "torch_npu/csrc/core/npu/NPUException.h"
 
 // COMPAT(>= 2.11): Handle PyTorch autograd API changes
-#include <torch_npu/_compat/version.h>
-#if CURRENT_VERSION >= (2, 11)
-// In PyTorch 2.11+, Node uses c10::intrusive_ptr and deleteNode is removed
+// PyTorch provides version macros that we can use in C++ code
+// TORCH_VERSION_MAJOR, TORCH_VERSION_MINOR are defined in torch/csrc/api/include/torch/version.h
+#if defined(TORCH_VERSION_MAJOR) && defined(TORCH_VERSION_MINOR)
+#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 11
 #define USE_INTRUSIVE_PTR_FOR_NODE 1
 #else
 #define USE_INTRUSIVE_PTR_FOR_NODE 0
 #endif
+#else
+// Fallback: assume newer PyTorch if version macros not defined
+#define USE_INTRUSIVE_PTR_FOR_NODE 1
+#endif
 
 /*
  * This file implements a variable fallback kernel for custom operators.

From 9f582bad32fa735a30b955d2a3fa7ee0a590ef24 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 7 May 2026 10:46:06 +0800
Subject: [PATCH 081/132] Revert torch_npu/ and torchnpugen/autograd/templates/
 to upstream/master state

After merging upstream/master, restore these directories to upstream/master
state, keeping other modifications from dev_master branch.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 torch_npu/_compat/distributed.py              |  26 ----
 .../csrc/aten/VariableFallbackKernel.cpp      | 143 ------------------
 .../distributed/tensor/_pointwise_ops.py      | 141 +++--------------
 torchnpugen/autograd/templates/Functions.h    |  36 -----
 4 files changed, 20 insertions(+), 326 deletions(-)

diff --git a/torch_npu/_compat/distributed.py b/torch_npu/_compat/distributed.py
index 6d71a3bd3b..a40a66d664 100644
--- a/torch_npu/_compat/distributed.py
+++ b/torch_npu/_compat/distributed.py
@@ -7,29 +7,3 @@
     from torch.distributed.tensor._ops.utils import register_op_strategy, register_prop_rule
 else:
     from torch.distributed.tensor._ops.registration import register_op_strategy, register_prop_rule
-
-# COMPAT(>= 2.11): pointwise_strategy removed in PyTorch 2.11
-#   New API uses register_single_dim_strategy and _register_single_dim_pointwise
-# CAN REMOVE else branch when MIN_SUPPORTED >= (2, 11)
-if CURRENT_VERSION >= (2, 11):
-    try:
-        from torch.distributed.tensor._ops.single_dim_strategy import (
-            register_single_dim_strategy,
-            _ShardingPlaceholder,
-        )
-        # pointwise_strategy is no longer available, use register_single_dim_strategy instead
-        pointwise_strategy = None
-    except ImportError:
-        # Fallback for older PyTorch versions without single_dim_strategy
-        pointwise_strategy = None
-        register_single_dim_strategy = None
-        _ShardingPlaceholder = None
-else:
-    try:
-        from torch.distributed.tensor._ops._pointwise_ops import pointwise_strategy
-        register_single_dim_strategy = None
-        _ShardingPlaceholder = None
-    except ImportError:
-        pointwise_strategy = None
-        register_single_dim_strategy = None
-        _ShardingPlaceholder = None
diff --git a/torch_npu/csrc/aten/VariableFallbackKernel.cpp b/torch_npu/csrc/aten/VariableFallbackKernel.cpp
index caa4b70fcb..25d96107e4 100644
--- a/torch_npu/csrc/aten/VariableFallbackKernel.cpp
+++ b/torch_npu/csrc/aten/VariableFallbackKernel.cpp
@@ -10,20 +10,6 @@
 
 #include "torch_npu/csrc/core/npu/NPUException.h"
 
-// COMPAT(>= 2.11): Handle PyTorch autograd API changes
-// PyTorch provides version macros that we can use in C++ code
-// TORCH_VERSION_MAJOR, TORCH_VERSION_MINOR are defined in torch/csrc/api/include/torch/version.h
-#if defined(TORCH_VERSION_MAJOR) && defined(TORCH_VERSION_MINOR)
-#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 11
-#define USE_INTRUSIVE_PTR_FOR_NODE 1
-#else
-#define USE_INTRUSIVE_PTR_FOR_NODE 0
-#endif
-#else
-// Fallback: assume newer PyTorch if version macros not defined
-#define USE_INTRUSIVE_PTR_FOR_NODE 1
-#endif
-
 /*
  * This file implements a variable fallback kernel for custom operators.
  * Since tensors always have the Autograd set, but custom operators
@@ -114,134 +100,6 @@ auto WarnNotImplemented::apply(variable_list&& inputs) -> variable_list
     return output;
 }
 
-#if USE_INTRUSIVE_PTR_FOR_NODE
-// PyTorch 2.11+ uses c10::intrusive_ptr for Node
-static void npuBasicAutogradNotImplementedFallbackImpl(
-    const c10::OperatorHandle& op,
-    c10::DispatchKeySet dispatch_keys,
-    torch::jit::Stack* stack)
-{
-    const auto& schema = op.schema();
-    const auto& op_name = schema.operator_name().name;
-    const auto num_arguments = schema.arguments().size();
-    const auto num_returns = schema.returns().size();
-    const auto stack_start = stack->size() - num_arguments;
-
-    if (torch::autograd::getAutogradFallbackMode() == torch::autograd::AutogradFallbackMode::Nothing) {
-        op.redispatchBoxed(dispatch_keys & c10::after_autograd_keyset, stack);
-        return;
-    }
-    TORCH_INTERNAL_ASSERT(
-        torch::autograd::getAutogradFallbackMode() == torch::autograd::AutogradFallbackMode::Warn);
-
-    bool any_input_requires_grad = false;
-    _foreach_tensor(
-        [&](size_t _, size_t idx_arg, const at::Tensor& t) {
-            if (t.requires_grad()) {
-            any_input_requires_grad = true;
-            }
-        },
-        stack,
-        stack_start,
-        num_arguments);
-    // Optimization: TLS access can be slow. So we only check if it necessary
-    // by putting it after the requires_grad checks.
-    any_input_requires_grad = any_input_requires_grad && at::GradMode::is_enabled();
-
-    c10::intrusive_ptr<WarnNotImplemented> grad_fn;
-    if (any_input_requires_grad) {
-        // NB: It is standard to collect edges from all tensors
-        // (see generated/VariableTypeEverything.cpp for examples)
-        std::vector<const at::Tensor*> all_tensors_on_stack;
-        _foreach_tensor(
-            [&](size_t _, size_t idx_arg, const at::Tensor& t) {
-            all_tensors_on_stack.push_back(&t);
-            },
-            stack,
-            stack_start,
-            num_arguments);
-        // In PyTorch 2.11+, use intrusive_ptr directly (no deleteNode needed)
-        grad_fn = c10::make_intrusive<WarnNotImplemented>(
-            op_name, all_tensors_on_stack.size());
-        grad_fn->set_next_edges(torch::autograd::collect_next_edges(all_tensors_on_stack));
-    }
-
-    op.redispatchBoxed(dispatch_keys & c10::after_autograd_keyset, stack);
-
-    if (any_input_requires_grad) {
-        // NB: if the operator mutates any inputs in-place and does not return them
-        // as outputs, we are unable to lazily raise a warning. This is OK because
-        // we don't expect many existing operators to do this because of the amount
-        // of technical expertise necessary (you would need to manually register an
-        // autograd kernel without using autograd.Function)
-        _foreach_tensor(
-            [&](size_t _, size_t idx_ret, const at::Tensor& t) {
-            if (!torch::autograd::isDifferentiableType(t.scalar_type())) {
-                return;
-            }
-            const bool is_mutable_output =
-                schema.is_aliasing({c10::SchemaArgType::output, idx_ret}) &&
-                schema.is_mutable({c10::SchemaArgType::output, idx_ret});
-
-            // If the post-autograd implementation returns Tensors that require
-            // grad, then we install a hook that will warn during the backwards.
-            //
-            // NB: If the operation is inplace and the inputs were views,
-            // it is possible that the history was rebased and the hook will
-            // not warn in all places where it should. That is, the following
-            // won't warn:
-            // >>> x = torch.randn(3, 3, requires_grad=True)
-            // >>> z = x.clone()
-            // >>> w = z[0]
-            // >>> k = w[0]
-            // >>> y = op(k)
-            // >>> torch.autograd.grad(z.sum(), w)
-            if (t.requires_grad()) {
-                t.register_hook([op_name](const at::Tensor& grad) {
-                warnAutogradNotImplemented(op_name);
-                });
-                // If history is rebased, then we will attempt to warn
-                // on the view's base. This will catch most cases (because
-                // users typically call .backward() and backprop through
-                // the entire program).
-                if (t.is_view() && is_mutable_output) {
-                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
-                auto& base = const_cast<at::TensorBase&>(t._base());
-                if (base.requires_grad()) {
-                    // Can only register_hook on tensors that require grad.
-                    base.register_hook([op_name](const at::TensorBase& grad) {
-                    warnAutogradNotImplemented(op_name);
-                    });
-                }
-                }
-                return;
-            }
-
-            // If the post-autograd implementation returns any Tensors that
-            // don't require grad, then we install the WarnNotImplemented grad_fn.
-            // This grad_fn warns in backward and returns undefined tensor
-            // gradients.
-            //
-            // NOTE [autograd fallback and in-place operations]
-            // If the schema says the output is mutable, and the output
-            // is an input, and the input is a view Tensor, then...
-            // we're not sure if set_history is OK to do, so we just skip
-            // adding the grad_fn. Builtin operators do rebase_history here,
-            // but custom operators may have multiple Tensor(a!) returns,
-            // rebase_history assumes single Tensor(a!) return, and in general
-            // custom ops don't have a good in-place story.
-            if (!is_mutable_output) {
-                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
-                torch::autograd::set_history(const_cast<at::Tensor&>(t), grad_fn);
-            }
-            },
-            stack,
-            stack->size() - num_returns,
-            num_returns);
-    }
-}
-#else
-// PyTorch < 2.11 uses std::shared_ptr for Node
 static void npuBasicAutogradNotImplementedFallbackImpl(
     const c10::OperatorHandle& op,
     c10::DispatchKeySet dispatch_keys,
@@ -366,7 +224,6 @@ static void npuBasicAutogradNotImplementedFallbackImpl(
             num_returns);
     }
 }
-#endif
 
 // Register fallthrough for Autograd backends dispatch keys
 // NB: But not the private use ones; maybe the extension wants
diff --git a/torch_npu/distributed/tensor/_pointwise_ops.py b/torch_npu/distributed/tensor/_pointwise_ops.py
index e1db47c0a7..49cafcd28c 100644
--- a/torch_npu/distributed/tensor/_pointwise_ops.py
+++ b/torch_npu/distributed/tensor/_pointwise_ops.py
@@ -1,17 +1,13 @@
+
 import torch
 from torch.distributed.tensor._op_schema import OpSchema, RuntimeSchemaInfo
-from torch_npu._compat.distributed import (
-    register_op_strategy,
-    pointwise_strategy,
-    register_single_dim_strategy,
-    _ShardingPlaceholder,
-)
-from torch_npu._compat.version import CURRENT_VERSION
+from torch_npu._compat.distributed import register_op_strategy
+from torch.distributed.tensor._ops._pointwise_ops import pointwise_strategy
 
 aten = torch.ops.aten
 npu = torch.ops.npu
 
-# Linear pointwise ops from upstream
+
 custom_linear_pointwise_ops = {
     npu.npu_dtype_cast.default: 0,
     npu._npu_dtype_cast.default: 0,
@@ -19,7 +15,18 @@
     npu._npu_dtype_cast_backward.default: 0,
 }
 
-# Additional pointwise ops from upstream (use linearity=0)
+
+def custom_linear_pointwise_strategy(op_schema: OpSchema):
+    op_type = custom_linear_pointwise_ops.get(op_schema.op, -1)
+    return pointwise_strategy(op_schema, linearity=op_type)
+
+
+for op in custom_linear_pointwise_ops:
+    register_op_strategy(
+        op, schema_info=RuntimeSchemaInfo(static_kwargkey=["out"])
+    )(custom_linear_pointwise_strategy)
+
+
 custom_pointwise_ops = [
     # please keep the entries below alphabetically sorted
     # native ops
@@ -35,115 +42,7 @@
 ]
 
 
-def _custom_pointwise_strategy_new(
-    op: torch._ops.OpOverload,
-    args_schema,
-    kwargs_schema,
-):
-    """
-    New strategy function for PyTorch 2.11+ using register_single_dim_strategy.
-    Returns placements list following the new API format: list[list[Placement | _ShardingPlaceholder]]
-
-    For pointwise ops, all tensors should shard on the same dimension.
-    This matches the behavior of pointwise_strategy(linearity=0) in older PyTorch.
-    """
-    from torch.distributed.tensor.placement_types import Replicate
-    from torch.distributed.tensor._dtensor_spec import TensorMeta
-
-    # Get tensor arguments
-    tensor_args = [arg for arg in args_schema if isinstance(arg, TensorMeta)]
-    if not tensor_args:
-        # No tensor args, return replicate strategy
-        return [[Replicate()]]
-
-    # Get common shape (broadcasted)
-    common_shape = torch.broadcast_shapes(*[arg.shape for arg in tensor_args])
-
-    num_outputs = 1  # Most pointwise ops have single output
-    placements = []
-
-    # For each dimension, create a sharding strategy
-    for dim_idx in range(len(common_shape)):
-        strategy = [_ShardingPlaceholder(dim_idx)] * num_outputs
-        for arg in tensor_args:
-            # Map common dim to arg dim (handling broadcast)
-            common_dim_to_arg_dim = _infer_broadcast_dims_map(common_shape, arg.shape)
-            if common_dim_to_arg_dim[dim_idx] >= 0:
-                strategy.append(_ShardingPlaceholder(common_dim_to_arg_dim[dim_idx]))
-            else:
-                strategy.append(Replicate())
-        placements.append(strategy)
-
-    # Add replicate strategy
-    replicate_strategy = [Replicate()] * (num_outputs + len(tensor_args))
-    placements.insert(0, replicate_strategy)
-
-    return placements
-
-
-def _infer_broadcast_dims_map(common_shape, arg_shape):
-    """Map dimensions from common shape to argument shape for broadcast handling."""
-    # Simple implementation: map matching dimensions, -1 for broadcast dimensions
-    result = []
-    arg_ndim = len(arg_shape)
-    common_ndim = len(common_shape)
-
-    for common_idx in range(common_ndim):
-        # Calculate corresponding arg index
-        arg_idx = common_idx - (common_ndim - arg_ndim)
-        if arg_idx >= 0 and arg_idx < arg_ndim:
-            # Check if dimension matches (not broadcast)
-            if arg_shape[arg_idx] == common_shape[common_idx] or arg_shape[arg_idx] == 1:
-                result.append(arg_idx)
-            else:
-                result.append(-1)
-        else:
-            result.append(-1)
-
-    return result
-
-
-def custom_linear_pointwise_strategy(op_schema: OpSchema):
-    """Legacy strategy function for PyTorch < 2.11."""
-    op_type = custom_linear_pointwise_ops.get(op_schema.op, -1)
-    return pointwise_strategy(op_schema, linearity=op_type)
-
-
-# Register strategies based on PyTorch version
-if CURRENT_VERSION >= (2, 11):
-    # Use new register_single_dim_strategy API for PyTorch 2.11+
-    if register_single_dim_strategy is not None:
-        # Register linear pointwise ops
-        for op in custom_linear_pointwise_ops:
-            register_single_dim_strategy(
-                op,
-                schema_info=RuntimeSchemaInfo(static_kwargkey=["out"])
-            )(_custom_pointwise_strategy_new)
-
-        # Register additional pointwise ops
-        for op in custom_pointwise_ops:
-            register_single_dim_strategy(
-                op,
-                schema_info=RuntimeSchemaInfo(static_kwargkey=["out"])
-            )(_custom_pointwise_strategy_new)
-    else:
-        # Fallback if register_single_dim_strategy is not available
-        # This should not happen in PyTorch 2.11+, but we keep it for safety
-        pass
-else:
-    # Use legacy pointwise_strategy for PyTorch < 2.11
-    if pointwise_strategy is not None:
-        # Register linear pointwise ops
-        for op in custom_linear_pointwise_ops:
-            register_op_strategy(
-                op, schema_info=RuntimeSchemaInfo(static_kwargkey=["out"])
-            )(custom_linear_pointwise_strategy)
-
-        # Register additional pointwise ops
-        for op in custom_pointwise_ops:
-            register_op_strategy(
-                op, schema_info=RuntimeSchemaInfo(static_kwargkey=["out"])
-            )(pointwise_strategy)
-    else:
-        # Fallback if pointwise_strategy is not available
-        pass
\ No newline at end of file
+for op in custom_pointwise_ops:
+    register_op_strategy(
+        op, schema_info=RuntimeSchemaInfo(static_kwargkey=["out"])
+    )(pointwise_strategy)
diff --git a/torchnpugen/autograd/templates/Functions.h b/torchnpugen/autograd/templates/Functions.h
index d732e524c9..bb822779e6 100644
--- a/torchnpugen/autograd/templates/Functions.h
+++ b/torchnpugen/autograd/templates/Functions.h
@@ -13,20 +13,6 @@
 
 #include <c10/core/SymIntArrayRef.h>
 
-// COMPAT(>= 2.11): SavedVariable::unpack parameter type changed from std::shared_ptr<Node>
-// to c10::intrusive_ptr<Node> in PyTorch 2.11
-// We detect the change by checking if TORCH_VERSION_MAJOR >= 2 and TORCH_VERSION_MINOR >= 11
-#if defined(TORCH_VERSION_MAJOR) && defined(TORCH_VERSION_MINOR)
-#if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 11
-#define USE_INTRUSIVE_PTR_FOR_UNPACK 1
-#else
-#define USE_INTRUSIVE_PTR_FOR_UNPACK 0
-#endif
-#else
-// Fallback: assume new API if version macros not defined (likely newer PyTorch)
-#define USE_INTRUSIVE_PTR_FOR_UNPACK 1
-#endif
-
 using namespace torch::autograd;
 
 namespace at_npu { namespace autograd { namespace generated {
@@ -41,27 +27,6 @@ using at::ScalarType;
 using c10::optional;
 using c10::fmap;
 
-#if USE_INTRUSIVE_PTR_FOR_UNPACK
-inline std::vector<Tensor> unpack_list(at::ArrayRef<SavedVariable> xs, c10::intrusive_ptr<Node> saved_for = nullptr)
-{
-    // NB: we must explicitly do the conversion in the lambda, otherwise template
-    // deduction will give a Tensor of Variable which is not convertible
-    return fmap(xs, [&saved_for](const SavedVariable& x) {
-        return static_cast<Tensor>(x.unpack(saved_for));
-    });
-}
-
-inline c10::List<c10::optional<Tensor>> unpack_opt_list(at::ArrayRef<SavedVariable> xs, c10::intrusive_ptr<Node> saved_for = nullptr)
-{
-    torch::List<c10::optional<Tensor>> result;
-    result.reserve(xs.size());
-    for (const SavedVariable& v : xs) {
-        auto var = v.unpack(saved_for);
-        result.push_back(var.defined() ? c10::optional<Tensor>(var) : c10::nullopt);
-    }
-    return result;
-}
-#else
 inline std::vector<Tensor> unpack_list(at::ArrayRef<SavedVariable> xs, std::shared_ptr<Node> saved_for = nullptr)
 {
     // NB: we must explicitly do the conversion in the lambda, otherwise template
@@ -81,7 +46,6 @@ inline c10::List<c10::optional<Tensor>> unpack_opt_list(at::ArrayRef<SavedVariab
     }
     return result;
 }
-#endif
 
 
 struct TypeAndSize {

From 7eb3e7a13679cba0bce28f48f38ef5a6b0b4ce1d Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 7 May 2026 10:56:41 +0800
Subject: [PATCH 082/132] Build PyTorch from specific commit
 fccc94ae83f61fe26559abc999797297196bac29

Pin PyTorch source to a specific commit instead of using latest main branch.
This ensures reproducible builds and avoids API compatibility issues.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index 70a6c1d739..430a207f0f 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -83,14 +83,21 @@ jobs:
             echo "Using proxy: ${PYTORCH_REPO}"
           fi
 
-          # --recurse-submodules 同时下载所有 submodules（使用已配置的 git proxy）
-          git clone --depth=1 --recurse-submodules --branch ${{ inputs.pytorch_branch }} \
-            "${PYTORCH_REPO}" pytorch-src
-          PYTORCH_SHA=$(cd pytorch-src && git rev-parse HEAD)
+          # 克隆指定 commit: fccc94ae83f61fe26559abc999797297196bac29
+          # 先浅克隆，再 fetch 指定 commit，最后 checkout
+          git clone --depth=1 "${PYTORCH_REPO}" pytorch-src
+          cd pytorch-src
+          git fetch --depth=1 origin fccc94ae83f61fe26559abc999797297196bac29
+          git checkout fccc94ae83f61fe26559abc999797297196bac29
+
+          # 初始化 submodules
+          git submodule update --init --recursive
+
+          PYTORCH_SHA=$(git rev-parse HEAD)
           echo "pytorch_sha=${PYTORCH_SHA}" >> $GITHUB_OUTPUT
           echo "Cloned PyTorch commit: ${PYTORCH_SHA}"
           echo "Submodules downloaded:"
-          ls -la pytorch-src/third_party/ | head -20
+          ls -la third_party/ | head -20
 
       - name: Checkout torch_npu
         uses: actions/checkout@v4

From 3c8db92770f3f92c4948c75cc8f8432dab276257 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 7 May 2026 10:58:34 +0800
Subject: [PATCH 083/132] Fix test source directory path in _test.yml

The test source is packaged as pytorch-src/test and extracted to the same
directory structure. Changed --test-dir from 'test' to 'pytorch-src/test'
to correctly locate the test files.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
index 9d3776cea3..fbc648926d 100644
--- a/.github/workflows/_test.yml
+++ b/.github/workflows/_test.yml
@@ -119,7 +119,7 @@ jobs:
 
           python .github/scripts/run_npu_test_shard.py \
             --cases-json cases-shards/${{ inputs.test-type }}_cases_shard_${{ inputs.shard-index }}.json \
-            --test-dir test \
+            --test-dir pytorch-src/test \
             --report-dir test-reports \
             --timeout 300 \
             --max-workers ${{ inputs.max-workers }} \

From ab408c0906d5bd1270fa92af37f17f4b508d1160 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 7 May 2026 11:07:47 +0800
Subject: [PATCH 084/132] Package PyTorch source with build artifacts for
 testing

Changes:
- Package entire pytorch-src directory including build artifacts (.so files)
- Exclude unnecessary files: .git, CMakeFiles, .o files, dist/*.whl
- Rename artifact from test-src-main to pytorch-src-main
- Update _collect.yml and _test.yml to use new artifact name

This ensures test jobs have access to compiled libraries that may be
needed for certain tests.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_build_torch_npu.yml | 38 +++++++++++++++++++++-----
 .github/workflows/_collect.yml         |  8 +++---
 .github/workflows/_test.yml            |  8 +++---
 3 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
index 430a207f0f..515d474b13 100644
--- a/.github/workflows/_build_torch_npu.yml
+++ b/.github/workflows/_build_torch_npu.yml
@@ -336,11 +336,35 @@ jobs:
           echo "ccache size: ${CCACHE_SIZE}"
 
       # ==================== 打包和上传 ====================
-      - name: Package test source
+      - name: Package PyTorch source and build artifacts
         run: |
-          # 只打包测试目录，不需要整个 PyTorch 源码
-          tar -czf test-src.tar.gz pytorch-src/test
-          ls -la test-src.tar.gz
+          # 打包整个 pytorch-src 目录（包含测试源码和编译产物）
+          # 排除不必要的文件以减小体积：
+          # - .git 目录（最占空间）
+          # - build/ 目录中的编译中间产物（CMakeFiles, .o 文件等）
+          # - dist/*.whl（已单独上传为 artifact）
+
+          echo "=== PyTorch source directory size ==="
+          du -sh pytorch-src/
+
+          echo ""
+          echo "=== Build artifacts location ==="
+          ls -la pytorch-src/build/lib.*/torch/*.so 2>/dev/null | head -5 || echo "No .so files found in build/lib"
+          ls -la pytorch-src/torch/_C.so 2>/dev/null || echo "No _C.so in torch/"
+
+          echo ""
+          echo "=== Creating archive (excluding large unnecessary files) ==="
+          tar -czf pytorch-src.tar.gz \
+            --exclude='pytorch-src/.git' \
+            --exclude='pytorch-src/build/CMakeFiles' \
+            --exclude='pytorch-src/build/*.o' \
+            --exclude='pytorch-src/build/**/*.o' \
+            --exclude='pytorch-src/dist/*.whl' \
+            pytorch-src
+
+          echo ""
+          echo "=== Archive size ==="
+          ls -lh pytorch-src.tar.gz
 
       - name: Upload PyTorch wheel
         uses: actions/upload-artifact@v4
@@ -356,9 +380,9 @@ jobs:
           path: torch_npu-src/dist/*.whl
           retention-days: 7
 
-      - name: Upload test source
+      - name: Upload PyTorch source and build artifacts
         uses: actions/upload-artifact@v4
         with:
-          name: test-src-main
-          path: test-src.tar.gz
+          name: pytorch-src-main
+          path: pytorch-src.tar.gz
           retention-days: 7
\ No newline at end of file
diff --git a/.github/workflows/_collect.yml b/.github/workflows/_collect.yml
index 81c7d22dbf..d2abb6cc97 100644
--- a/.github/workflows/_collect.yml
+++ b/.github/workflows/_collect.yml
@@ -100,13 +100,13 @@ jobs:
           name: torch-npu-wheel-main
           path: wheels
 
-      - name: Download test source
+      - name: Download PyTorch source and build artifacts
         uses: actions/download-artifact@v4
         with:
-          name: test-src-main
+          name: pytorch-src-main
 
-      - name: Extract test source
-        run: tar -xzf test-src.tar.gz
+      - name: Extract PyTorch source
+        run: tar -xzf pytorch-src.tar.gz
 
       - name: Install wheels
         run: |
diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
index fbc648926d..91f61dec00 100644
--- a/.github/workflows/_test.yml
+++ b/.github/workflows/_test.yml
@@ -89,10 +89,10 @@ jobs:
           name: torch-npu-wheel-main
           path: wheels
 
-      - name: Download test source
+      - name: Download PyTorch source and build artifacts
         uses: actions/download-artifact@v4
         with:
-          name: test-src-main
+          name: pytorch-src-main
 
       - name: Download cases shard
         uses: actions/download-artifact@v4
@@ -100,8 +100,8 @@ jobs:
           name: cases-shards
           path: cases-shards
 
-      - name: Extract test source
-        run: tar -xzf test-src.tar.gz
+      - name: Extract PyTorch source
+        run: tar -xzf pytorch-src.tar.gz
 
       - name: Install wheels
         run: |

From 5089254c9233abd6cd8a5d4ea4f296c752aa1648 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 7 May 2026 14:25:59 +0800
Subject: [PATCH 085/132] Move PR trigger from npu-sync-test.yml to
 test-collect.yml

- Remove pull_request trigger from npu-sync-test.yml (only manual workflow_dispatch)
- Add pull_request trigger to test-collect.yml for main/master/release branches

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/npu-sync-test.yml | 6 ------
 .github/workflows/test-collect.yml  | 6 ++++++
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/npu-sync-test.yml b/.github/workflows/npu-sync-test.yml
index 1f6c16b88e..29ae49d936 100644
--- a/.github/workflows/npu-sync-test.yml
+++ b/.github/workflows/npu-sync-test.yml
@@ -1,12 +1,6 @@
 name: PyTorch NPU Sync Test
 
 on:
-  pull_request:
-    types: [opened, synchronize, reopened]
-    branches:
-      - main
-      - master
-      - 'release/**'
   workflow_dispatch:
     inputs:
       docker_image_tag:
diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index 705bf678e0..fa6566e41e 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -1,6 +1,12 @@
 name: Test Collect Stage
 
 on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+    branches:
+      - main
+      - master
+      - 'release/**'
   workflow_dispatch:
     inputs:
       run_id:

From ca08d316a2533ff4c69d3ffa9f350c9df2fffd77 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 7 May 2026 14:28:09 +0800
Subject: [PATCH 086/132] Use fixed workflow run for test-collect.yml artifacts

Changes:
- Download artifacts from fixed run: https://github.com/Ascend/pytorch/actions/runs/25473494025
- Remove git clone PyTorch step (test source now from artifact)
- Add pytorch-src-main artifact download (contains test source + build artifacts)
- Fix run_id to 25473494025 for all artifact downloads

Artifacts downloaded:
- torch-wheel-main: PyTorch wheel
- torch-npu-wheel-main: torch_npu wheel
- pytorch-src-main: PyTorch source + build artifacts + test source

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 54 +++++++++++++++---------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index fa6566e41e..4786f2530f 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -83,34 +83,12 @@ jobs:
         run: |
           pip${{ env.PYTHON_VERSION }} install --upgrade pip setuptools wheel
 
-      # ==================== Clone PyTorch for test source（提前克隆） ====================
-      - name: Clone PyTorch test source
-        run: |
-          # 使用代理加速 git clone，只获取测试用例（浅克隆，无 submodules）
-          PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
-          if [ -n "${{ env.GH_PROXY_URL }}" ]; then
-            PYTORCH_REPO="${{ env.GH_PROXY_URL }}/${PYTORCH_REPO}"
-            echo "Using proxy: ${PYTORCH_REPO}"
-          fi
-          git clone --depth=1 "${PYTORCH_REPO}" pytorch-src
-          echo "Cloned PyTorch test directory"
-
-      - name: Install test dependencies
-        run: |
-          # 安装当前仓库的基础测试依赖
-          pip${{ env.PYTHON_VERSION }} install -r .ci/docker/requirements-ci.txt
-
-          # 安装上游 PyTorch 的完整测试依赖
-          pip${{ env.PYTHON_VERSION }} install -r pytorch-src/.ci/docker/requirements-ci.txt || true
-
-          # 安装 PyTorch 开发依赖（可选）
-          pip${{ env.PYTHON_VERSION }} install expecttest hypothesis fsspec sympy optree networkx psutil || true
-
-      # ==================== Download artifacts from previous workflow run ====================
-      - name: Download artifacts from workflow run
+      # ==================== Download artifacts from Ascend/pytorch workflow run ====================
+      # 从固定 workflow run 下载: https://github.com/Ascend/pytorch/actions/runs/25473494025
+      - name: Download torch wheel from workflow run
         uses: dawidd6/action-download-artifact@v3
         with:
-          run_id: ${{ inputs.run_id || '25097789439' }}
+          run_id: 25473494025
           name: torch-wheel-main
           path: wheels
           repo: Ascend/pytorch
@@ -120,12 +98,34 @@ jobs:
       - name: Download torch_npu wheel from workflow run
         uses: dawidd6/action-download-artifact@v3
         with:
-          run_id: ${{ inputs.run_id || '25097789439' }}
+          run_id: 25473494025
           name: torch-npu-wheel-main
           path: wheels
           repo: Ascend/pytorch
           github_token: ${{ secrets.GITHUB_TOKEN }}
 
+      - name: Download PyTorch source and build artifacts from workflow run
+        uses: dawidd6/action-download-artifact@v3
+        with:
+          run_id: 25473494025
+          name: pytorch-src-main
+          repo: Ascend/pytorch
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract PyTorch source
+        run: tar -xzf pytorch-src.tar.gz
+
+      - name: Install test dependencies
+        run: |
+          # 安装当前仓库的基础测试依赖
+          pip${{ env.PYTHON_VERSION }} install -r .ci/docker/requirements-ci.txt
+
+          # 安装上游 PyTorch 的完整测试依赖
+          pip${{ env.PYTHON_VERSION }} install -r pytorch-src/.ci/docker/requirements-ci.txt || true
+
+          # 安装 PyTorch 开发依赖（可选）
+          pip${{ env.PYTHON_VERSION }} install expecttest hypothesis fsspec sympy optree networkx psutil || true
+
       - name: Install wheels
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true

From 4d322824bef1a20eb57db8fdb3d165bb11c7a8c0 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 7 May 2026 14:32:40 +0800
Subject: [PATCH 087/132] Fix issues in test-collect.yml

Changes:
1. Remove unused run_id input (now hardcoded to 25473494025)
2. Remove unused GH_PROXY_URL env variable (no git clone needed)
3. Fix install order: Install wheels first, then test dependencies
   (pytorch requirements-ci.txt may depend on torch)
4. Add comment explaining fixed workflow run artifacts

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index 4786f2530f..bd39ce6d25 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -9,10 +9,6 @@ on:
       - 'release/**'
   workflow_dispatch:
     inputs:
-      run_id:
-        description: 'Workflow run ID to download artifacts from'
-        required: true
-        default: '25097789439'
       distributed_shards:
         description: 'Distributed test shards'
         default: 2
@@ -26,7 +22,6 @@ env:
   PYTHON_VERSION: '3.11'
   CACHE_VERSION: 'v2'
   PYPI_CACHE_URL: 'http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple'
-  GH_PROXY_URL: 'https://gh-proxy.test.osinfra.cn'
 
 jobs:
   collect:
@@ -84,7 +79,8 @@ jobs:
           pip${{ env.PYTHON_VERSION }} install --upgrade pip setuptools wheel
 
       # ==================== Download artifacts from Ascend/pytorch workflow run ====================
-      # 从固定 workflow run 下载: https://github.com/Ascend/pytorch/actions/runs/25473494025
+      # 固定 workflow run: https://github.com/Ascend/pytorch/actions/runs/25473494025
+      # 包含: torch wheel, torch_npu wheel, pytorch 源码 + 编译产物
       - name: Download torch wheel from workflow run
         uses: dawidd6/action-download-artifact@v3
         with:
@@ -115,22 +111,22 @@ jobs:
       - name: Extract PyTorch source
         run: tar -xzf pytorch-src.tar.gz
 
+      - name: Install wheels
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          pip${{ env.PYTHON_VERSION }} install wheels/*.whl
+
       - name: Install test dependencies
         run: |
           # 安装当前仓库的基础测试依赖
           pip${{ env.PYTHON_VERSION }} install -r .ci/docker/requirements-ci.txt
 
-          # 安装上游 PyTorch 的完整测试依赖
+          # 安装上游 PyTorch 的完整测试依赖（需要先安装 torch）
           pip${{ env.PYTHON_VERSION }} install -r pytorch-src/.ci/docker/requirements-ci.txt || true
 
           # 安装 PyTorch 开发依赖（可选）
           pip${{ env.PYTHON_VERSION }} install expecttest hypothesis fsspec sympy optree networkx psutil || true
 
-      - name: Install wheels
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          pip${{ env.PYTHON_VERSION }} install wheels/*.whl
-
       - name: Verify NPU availability
         continue-on-error: true
         run: |

From 290dd710d20dfb5d2aa6e2ffda8c073b25e47076 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 7 May 2026 15:17:05 +0800
Subject: [PATCH 088/132] Update workflow run ID to 25473829132 for artifact
 downloads

Use artifacts from: https://github.com/Ascend/pytorch/actions/runs/25473829132

Changed artifacts:
- torch-wheel-main
- torch-npu-wheel-main
- pytorch-src-main

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index bd39ce6d25..dfaf21f4ec 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -79,12 +79,12 @@ jobs:
           pip${{ env.PYTHON_VERSION }} install --upgrade pip setuptools wheel
 
       # ==================== Download artifacts from Ascend/pytorch workflow run ====================
-      # 固定 workflow run: https://github.com/Ascend/pytorch/actions/runs/25473494025
+      # 固定 workflow run: https://github.com/Ascend/pytorch/actions/runs/25473829132
       # 包含: torch wheel, torch_npu wheel, pytorch 源码 + 编译产物
       - name: Download torch wheel from workflow run
         uses: dawidd6/action-download-artifact@v3
         with:
-          run_id: 25473494025
+          run_id: 25473829132
           name: torch-wheel-main
           path: wheels
           repo: Ascend/pytorch
@@ -94,7 +94,7 @@ jobs:
       - name: Download torch_npu wheel from workflow run
         uses: dawidd6/action-download-artifact@v3
         with:
-          run_id: 25473494025
+          run_id: 25473829132
           name: torch-npu-wheel-main
           path: wheels
           repo: Ascend/pytorch
@@ -103,7 +103,7 @@ jobs:
       - name: Download PyTorch source and build artifacts from workflow run
         uses: dawidd6/action-download-artifact@v3
         with:
-          run_id: 25473494025
+          run_id: 25473829132
           name: pytorch-src-main
           repo: Ascend/pytorch
           github_token: ${{ secrets.GITHUB_TOKEN }}

From 42cf3597cfeec314316a66224800026dfcadc43d Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 7 May 2026 15:30:36 +0800
Subject: [PATCH 089/132] Add debug logging for test case collection path
 issues

Add comprehensive debug output to diagnose pytest file not found errors:
- Print working directory and path details in collect_all_cases.py
- Print pytorch-src directory structure after extraction in test-collect.yml
- Focus on distributed/_composable/fsdp path investigation

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/scripts/collect_all_cases.py | 65 +++++++++++++++++++++++++++-
 .github/workflows/test-collect.yml   | 46 ++++++++++++++++++++
 2 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/collect_all_cases.py b/.github/scripts/collect_all_cases.py
index cb803f988d..65deb56255 100644
--- a/.github/scripts/collect_all_cases.py
+++ b/.github/scripts/collect_all_cases.py
@@ -37,15 +37,42 @@ def collect_cases_from_file(test_dir: Path, test_file: str, parallel: int = 1, v
         Tuple of (cases list, error message or empty string)
     """
     full_path = test_dir / test_file
+
+    # ===== DEBUG: Print path details for first few files =====
+    if verbose and "distributed/_composable/fsdp" in test_file:
+        print("\n=== DEBUG: collect_cases_from_file ===")
+        print(f"test_file: {test_file}")
+        print(f"test_dir: {test_dir}")
+        print(f"test_dir.resolve(): {test_dir.resolve()}")
+        print(f"full_path: {full_path}")
+        print(f"full_path.resolve(): {full_path.resolve()}")
+        print(f"full_path.exists(): {full_path.exists()}")
+        print(f"cwd for pytest: {test_dir}")
+        print("=" * 50)
+
     if not full_path.exists():
         error = f"File not found: {full_path}"
         if verbose:
             print(f"[SKIP] {test_file}: {error}")
+            # DEBUG: Show what files exist in similar location
+            parent = full_path.parent
+            if parent.exists():
+                print(f"  DEBUG: Files in {parent}:")
+                for item in sorted(parent.iterdir())[:10]:
+                    print(f"    {item.name}")
+            else:
+                print(f"  DEBUG: Parent directory {parent} does not exist")
         return [], error
 
     try:
+        # ===== DEBUG: Print pytest command for specific files =====
+        pytest_cmd = ["pytest", "--collect-only", "-q", str(full_path)]
+        if verbose and "distributed/_composable/fsdp" in test_file:
+            print(f"\nDEBUG: pytest command: {pytest_cmd}")
+            print(f"DEBUG: cwd: {str(test_dir)}")
+
         result = subprocess.run(
-            ["pytest", "--collect-only", "-q", str(full_path)],
+            pytest_cmd,
             capture_output=True,
             text=True,
             timeout=60,
@@ -131,6 +158,42 @@ def collect_all_cases(
     if not test_dir_path.exists():
         raise FileNotFoundError(f"Test directory not found: {test_dir}")
 
+    # ===== DEBUG: Print directory structure and paths =====
+    print("\n=== DEBUG: Directory Structure and Paths ===")
+    print(f"Current working directory: {os.getcwd()}")
+    print(f"test_dir argument: {test_dir}")
+    print(f"test_dir_path: {test_dir_path}")
+    print(f"test_dir_path.resolve(): {test_dir_path.resolve()}")
+    print(f"test_dir_path exists: {test_dir_path.exists()}")
+    print(f"test_dir_path is absolute: {test_dir_path.is_absolute()}")
+
+    # List top-level directories in test_dir
+    if test_dir_path.exists():
+        print(f"\nTop-level items in test_dir_path:")
+        for item in sorted(test_dir_path.iterdir())[:20]:
+            print(f"  {item.name} {'[DIR]' if item.is_dir() else '[FILE]'}")
+
+        # Check distributed directory specifically
+        distributed_path = test_dir_path / "distributed"
+        if distributed_path.exists():
+            print(f"\ndistributed/ directory exists: {distributed_path}")
+            print(f"Contents of distributed/ (first 20 items):")
+            for item in sorted(distributed_path.iterdir())[:20]:
+                print(f"  {item.name} {'[DIR]' if item.is_dir() else '[FILE]'}")
+
+            # Check _composable/fsdp specifically
+            fsdp_path = distributed_path / "_composable" / "fsdp"
+            if fsdp_path.exists():
+                print(f"\n_fsdp path exists: {fsdp_path}")
+                print(f"Contents of fsdp/ (first 10 items):")
+                for item in sorted(fsdp_path.iterdir())[:10]:
+                    print(f"  {item.name} {'[DIR]' if item.is_dir() else '[FILE]'}")
+            else:
+                print(f"\nfsdp path NOT found: {fsdp_path}")
+        else:
+            print(f"\ndistributed/ directory NOT found")
+    print("=" * 50 + "\n")
+
     output_dir_path = Path(output_dir)
     output_dir_path.mkdir(parents=True, exist_ok=True)
 
diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index dfaf21f4ec..440185ab6c 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -111,6 +111,52 @@ jobs:
       - name: Extract PyTorch source
         run: tar -xzf pytorch-src.tar.gz
 
+      - name: DEBUG - Print pytorch-src directory structure
+        run: |
+          echo "=== Current working directory ==="
+          pwd
+          echo ""
+          echo "=== Top-level files in current directory ==="
+          ls -la
+          echo ""
+          echo "=== pytorch-src directory exists? ==="
+          ls -la pytorch-src/ || echo "pytorch-src directory not found"
+          echo ""
+          echo "=== pytorch-src/test directory exists? ==="
+          ls -la pytorch-src/test/ || echo "pytorch-src/test directory not found"
+          echo ""
+          echo "=== pytorch-src/test subdirectories (first 20) ==="
+          if [ -d pytorch-src/test ]; then
+            ls -la pytorch-src/test/ | head -25
+          fi
+          echo ""
+          echo "=== pytorch-src/test/distributed directory ==="
+          if [ -d pytorch-src/test/distributed ]; then
+            ls -la pytorch-src/test/distributed/ | head -20
+            echo ""
+            echo "=== pytorch-src/test/distributed/_composable directory ==="
+            if [ -d pytorch-src/test/distributed/_composable ]; then
+              ls -la pytorch-src/test/distributed/_composable/
+              echo ""
+              echo "=== pytorch-src/test/distributed/_composable/fsdp directory ==="
+              if [ -d pytorch-src/test/distributed/_composable/fsdp ]; then
+                ls -la pytorch-src/test/distributed/_composable/fsdp/ | head -15
+              else
+                echo "fsdp directory NOT found"
+              fi
+            else
+              echo "_composable directory NOT found"
+            fi
+          else
+            echo "distributed directory NOT found"
+          fi
+          echo ""
+          echo "=== Find test_fully_shard files ==="
+          find pytorch-src/test -name "test_fully_shard*.py" 2>/dev/null || echo "No test_fully_shard files found"
+          echo ""
+          echo "=== Check if pytorch-src.tar.gz still exists ==="
+          ls -la pytorch-src.tar.gz || echo "pytorch-src.tar.gz not found after extraction"
+
       - name: Install wheels
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true

From 92c456c2281a5408f11bcdbd31248df55ccfecf3 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 7 May 2026 15:37:13 +0800
Subject: [PATCH 090/132] Change PyTorch source download to use GitHub commit
 with proxy

- Download source from commit fccc94ae83f61fe26559abc999797297196bac29
- Use gh-proxy.test.osinfra.cn proxy for faster download
- No submodules needed for test case execution only
- Auto-rename extracted directory to pytorch-src

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 42 ++++++++++++++++++++++++------
 1 file changed, 34 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index 440185ab6c..be5fd0a50c 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -100,16 +100,42 @@ jobs:
           repo: Ascend/pytorch
           github_token: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Download PyTorch source and build artifacts from workflow run
-        uses: dawidd6/action-download-artifact@v3
-        with:
-          run_id: 25473829132
-          name: pytorch-src-main
-          repo: Ascend/pytorch
-          github_token: ${{ secrets.GITHUB_TOKEN }}
+      - name: Download PyTorch source from GitHub
+        run: |
+          PYTORCH_COMMIT=fccc94ae83f61fe26559abc999797297196bac29
+          PYTORCH_REPO=pytorch/pytorch
+          PROXY_URL=https://gh-proxy.test.osinfra.cn
+
+          echo "Downloading PyTorch source from commit: ${PYTORCH_COMMIT}"
+          echo "Using proxy: ${PROXY_URL}"
+
+          # Download source tarball via proxy (no submodules needed for testing)
+          curl -L "${PROXY_URL}/https://github.com/${PYTORCH_REPO}/archive/${PYTORCH_COMMIT}.tar.gz" \
+            -o pytorch-src.tar.gz
+
+          echo "Downloaded pytorch-src.tar.gz:"
+          ls -la pytorch-src.tar.gz
 
       - name: Extract PyTorch source
-        run: tar -xzf pytorch-src.tar.gz
+        run: |
+          # Extract source tarball
+          tar -xzf pytorch-src.tar.gz
+
+          # Rename directory to pytorch-src (tarball extracts as pytorch-fccc94ae...)
+          mv pytorch-${PYTORCH_COMMIT:-fccc94ae83f61fe26559abc999797297196bac29} pytorch-src 2>/dev/null || \
+            mv pytorch-pytorch-${PYTORCH_COMMIT:-fccc94ae83f61fe26559abc999797297196bac29} pytorch-src 2>/dev/null || \
+            true
+
+          # If still not renamed, find and rename the extracted directory
+          if [ ! -d pytorch-src ]; then
+            EXTRACTED_DIR=$(find . -maxdepth 1 -type d -name "pytorch-*" | head -1)
+            if [ -n "$EXTRACTED_DIR" ]; then
+              mv "$EXTRACTED_DIR" pytorch-src
+            fi
+          fi
+
+          echo "Extracted to pytorch-src directory:"
+          ls -la pytorch-src/ | head -10
 
       - name: DEBUG - Print pytorch-src directory structure
         run: |

From 3648c3d48b268f5990e580ce4560ea0a066b3f14 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 7 May 2026 16:00:44 +0800
Subject: [PATCH 091/132] Fix pytest path argument for test case collection

- Use test_file (relative path) as pytest argument instead of full_path
- When pytest cwd is test_dir, argument should be relative to cwd
- Example: pytest --collect-only -q dynamo/test_regional_inductor.py
- Previously used full_path causing double path prefix error

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/scripts/collect_all_cases.py | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/.github/scripts/collect_all_cases.py b/.github/scripts/collect_all_cases.py
index 65deb56255..49ad0fbdc4 100644
--- a/.github/scripts/collect_all_cases.py
+++ b/.github/scripts/collect_all_cases.py
@@ -33,21 +33,28 @@ def is_distributed_test(test_file: str) -> bool:
 def collect_cases_from_file(test_dir: Path, test_file: str, parallel: int = 1, verbose: bool = False) -> Tuple[List[str], str]:
     """Collect test cases from a single test file using pytest --collect-only.
 
+    Args:
+        test_dir: The test directory path (used as cwd for pytest)
+        test_file: Relative path to test file from test_dir (e.g., "dynamo/test_regional_inductor.py")
+        parallel: Number of parallel collectors (not used in this function)
+        verbose: Print debug information
+
     Returns:
         Tuple of (cases list, error message or empty string)
     """
     full_path = test_dir / test_file
 
-    # ===== DEBUG: Print path details for first few files =====
-    if verbose and "distributed/_composable/fsdp" in test_file:
+    # ===== DEBUG: Print path details for specific files =====
+    if verbose and ("distributed/_composable/fsdp" in test_file or "dynamo" in test_file):
         print("\n=== DEBUG: collect_cases_from_file ===")
-        print(f"test_file: {test_file}")
+        print(f"test_file (relative): {test_file}")
         print(f"test_dir: {test_dir}")
         print(f"test_dir.resolve(): {test_dir.resolve()}")
-        print(f"full_path: {full_path}")
+        print(f"full_path (test_dir / test_file): {full_path}")
         print(f"full_path.resolve(): {full_path.resolve()}")
         print(f"full_path.exists(): {full_path.exists()}")
-        print(f"cwd for pytest: {test_dir}")
+        print(f"cwd for pytest: {str(test_dir)}")
+        print(f"pytest arg (should be relative): {test_file}")
         print("=" * 50)
 
     if not full_path.exists():
@@ -65,9 +72,11 @@ def collect_cases_from_file(test_dir: Path, test_file: str, parallel: int = 1, v
         return [], error
 
     try:
-        # ===== DEBUG: Print pytest command for specific files =====
-        pytest_cmd = ["pytest", "--collect-only", "-q", str(full_path)]
-        if verbose and "distributed/_composable/fsdp" in test_file:
+        # Use test_file (relative path) as pytest argument since cwd is test_dir
+        # pytest command should be: pytest --collect-only -q dynamo/test_regional_inductor.py
+        # NOT: pytest --collect-only -q pytorch-src/test/dynamo/test_regional_inductor.py
+        pytest_cmd = ["pytest", "--collect-only", "-q", test_file]
+        if verbose and ("distributed/_composable/fsdp" in test_file or "dynamo" in test_file):
             print(f"\nDEBUG: pytest command: {pytest_cmd}")
             print(f"DEBUG: cwd: {str(test_dir)}")
 

From 7c288d87bc1022a9a33b316e6850ae41c10b80f9 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 7 May 2026 17:09:29 +0800
Subject: [PATCH 092/132] Simplify test-collect.yml and add timing information

- Remove fallback methods for test list (method 2 and default list)
- Only use discover_tests.py to get test list
- Redirect test output to file only (not to console)
- Add per-test and total timing statistics
- Show last 20 lines of log for failed tests only
- Include duration_seconds in test results summary JSON

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 68 +++++++++++-------------------
 1 file changed, 24 insertions(+), 44 deletions(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index be5fd0a50c..e281bf09fe 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -321,42 +321,6 @@ jobs:
           print(f'Found {len(TESTS)} tests from discover_tests.py')
           " 2>/dev/null || true
 
-          # 如果方法1失败，尝试方法2: 从 --help 解析
-          if [ ! -f /tmp/test_list.txt ] || [ ! -s /tmp/test_list.txt ]; then
-            echo "Method 1 failed, trying method 2 (parse --help)"
-            python${{ env.PYTHON_VERSION }} run_test.py --help > /tmp/run_test_help.txt 2>&1 || true
-
-            # 解析测试列表
-            python${{ env.PYTHON_VERSION }} -c "
-            import re
-            with open('/tmp/run_test_help.txt') as f:
-                content = f.read()
-            # 查找测试列表 (格式: where TESTS is any of: xxx, yyy, zzz)
-            match = re.search(r'where TESTS is any of:\s*(.+)', content)
-            if match:
-                tests_str = match.group(1).strip()
-                tests = [t.strip() for t in tests_str.split(',')]
-                tests = [t for t in tests if t]
-                with open('/tmp/test_list.txt', 'w') as f:
-                    for t in tests:
-                        f.write(t + '\n')
-                print(f'Found {len(tests)} tests from --help')
-            else:
-                print('Could not find TESTS list')
-            "
-          fi
-
-          # 如果都失败，使用默认列表
-          if [ ! -f /tmp/test_list.txt ] || [ ! -s /tmp/test_list.txt ]; then
-            echo "All methods failed, using default test list"
-            echo "test_nn" > /tmp/test_list.txt
-            echo "test_torch" >> /tmp/test_list.txt
-            echo "test_autograd" >> /tmp/test_list.txt
-            echo "test_distributions" >> /tmp/test_list.txt
-            echo "test_indexing" >> /tmp/test_list.txt
-            echo "test_type_promotion" >> /tmp/test_list.txt
-          fi
-
           # 输出测试数量
           TEST_COUNT=$(wc -l < /tmp/test_list.txt)
           echo "test_count=${TEST_COUNT}" >> $GITHUB_OUTPUT
@@ -382,6 +346,7 @@ jobs:
           TEST_COUNT=0
           PASSED=0
           FAILED=0
+          START_TIME=$(date +%s)
 
           while IFS= read -r test_name; do
             if [ -z "$test_name" ]; then
@@ -389,43 +354,58 @@ jobs:
             fi
 
             TEST_COUNT=$((TEST_COUNT + 1))
+            TEST_START_TIME=$(date +%s)
             echo ""
             echo "============================================"
             echo "Test ${TEST_COUNT}: ${test_name}"
             echo "============================================"
 
-            # 运行测试，捕获输出，即使失败也继续
+            # 运行测试，捕获输出到文件但不打印到控制台
             set +e
-            python${{ env.PYTHON_VERSION }} run_test.py -i "${test_name}" -v 2>&1 | tee /tmp/test_output_${TEST_COUNT}.log
-            TEST_EXIT_CODE=${PIPESTATUS[0]}
+            python${{ env.PYTHON_VERSION }} run_test.py -i "${test_name}" -v > /tmp/test_output_${TEST_COUNT}.log 2>&1
+            TEST_EXIT_CODE=$?
             set -e
 
+            TEST_END_TIME=$(date +%s)
+            TEST_DURATION=$((TEST_END_TIME - TEST_START_TIME))
+
             # 检查结果
             if [ $TEST_EXIT_CODE -eq 0 ]; then
-              echo ">>> ${test_name}: PASSED"
+              echo ">>> ${test_name}: PASSED (${TEST_DURATION}s)"
               PASSED=$((PASSED + 1))
             else
-              echo ">>> ${test_name}: FAILED (exit code: ${TEST_EXIT_CODE})"
+              echo ">>> ${test_name}: FAILED (exit code: ${TEST_EXIT_CODE}, ${TEST_DURATION}s)"
               FAILED=$((FAILED + 1))
+              # 打印失败测试的最后几行日志
+              echo "--- Last 20 lines of log ---"
+              tail -20 /tmp/test_output_${TEST_COUNT}.log
+              echo "--- End of log preview ---"
             fi
 
+            CURRENT_TIME=$(date +%s)
+            TOTAL_DURATION=$((CURRENT_TIME - START_TIME))
             echo ""
             echo "--------------------------------------------"
-            echo "Summary so far: ${PASSED} passed, ${FAILED} failed out of ${TEST_COUNT} tests"
+            echo "Summary: ${PASSED} passed, ${FAILED} failed, ${TEST_COUNT} total"
+            echo "Elapsed: ${TOTAL_DURATION}s ($(date -d "@${START_TIME}" +%H:%M:%S) - $(date +%H:%M:%S))"
             echo "--------------------------------------------"
 
           done < /tmp/test_list.txt
 
+          END_TIME=$(date +%s)
+          TOTAL_DURATION=$((END_TIME - START_TIME))
+
           echo ""
           echo "============================================"
           echo "=== FINAL SUMMARY ==="
           echo "Total tests run: ${TEST_COUNT}"
           echo "Passed: ${PASSED}"
           echo "Failed: ${FAILED}"
+          echo "Total time: ${TOTAL_DURATION}s ($(date -d "@${START_TIME}" +%H:%M:%S) - $(date -d "@${END_TIME}" +%H:%M:%S))"
           echo "============================================"
 
-          # 保存结果摘要
-          echo "{\"total\": ${TEST_COUNT}, \"passed\": ${PASSED}, \"failed\": ${FAILED}}" > /tmp/test_results_summary.json
+          # 保存结果摘要（包含时间信息）
+          echo "{\"total\": ${TEST_COUNT}, \"passed\": ${PASSED}, \"failed\": ${FAILED}, \"duration_seconds\": ${TOTAL_DURATION}}" > /tmp/test_results_summary.json
 
       - name: Upload test results
         if: always()

From 210387243802ae7dc81a10fead184ee895864f31 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 7 May 2026 17:24:06 +0800
Subject: [PATCH 093/132] Increase test execution timeout to 20 hours

- Change timeout-minutes from 120 to 1200 (20 hours)
- Allow longer test runs for full test suite execution

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index e281bf09fe..878cdb0968 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -331,7 +331,7 @@ jobs:
           fi
 
       - name: Run tests using run_test.py
-        timeout-minutes: 120
+        timeout-minutes: 1200  # 20 hours
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
           source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true

From 5785218663febdd61d28595402a8ab4e31b6b874 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 7 May 2026 17:34:43 +0800
Subject: [PATCH 094/132] Remove debug step for pytorch-src directory structure

- Remove DEBUG - Print pytorch-src directory structure step
- Path issue has been resolved, debug output no longer needed

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 46 ------------------------------
 1 file changed, 46 deletions(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index 878cdb0968..bbe65794ac 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -137,52 +137,6 @@ jobs:
           echo "Extracted to pytorch-src directory:"
           ls -la pytorch-src/ | head -10
 
-      - name: DEBUG - Print pytorch-src directory structure
-        run: |
-          echo "=== Current working directory ==="
-          pwd
-          echo ""
-          echo "=== Top-level files in current directory ==="
-          ls -la
-          echo ""
-          echo "=== pytorch-src directory exists? ==="
-          ls -la pytorch-src/ || echo "pytorch-src directory not found"
-          echo ""
-          echo "=== pytorch-src/test directory exists? ==="
-          ls -la pytorch-src/test/ || echo "pytorch-src/test directory not found"
-          echo ""
-          echo "=== pytorch-src/test subdirectories (first 20) ==="
-          if [ -d pytorch-src/test ]; then
-            ls -la pytorch-src/test/ | head -25
-          fi
-          echo ""
-          echo "=== pytorch-src/test/distributed directory ==="
-          if [ -d pytorch-src/test/distributed ]; then
-            ls -la pytorch-src/test/distributed/ | head -20
-            echo ""
-            echo "=== pytorch-src/test/distributed/_composable directory ==="
-            if [ -d pytorch-src/test/distributed/_composable ]; then
-              ls -la pytorch-src/test/distributed/_composable/
-              echo ""
-              echo "=== pytorch-src/test/distributed/_composable/fsdp directory ==="
-              if [ -d pytorch-src/test/distributed/_composable/fsdp ]; then
-                ls -la pytorch-src/test/distributed/_composable/fsdp/ | head -15
-              else
-                echo "fsdp directory NOT found"
-              fi
-            else
-              echo "_composable directory NOT found"
-            fi
-          else
-            echo "distributed directory NOT found"
-          fi
-          echo ""
-          echo "=== Find test_fully_shard files ==="
-          find pytorch-src/test -name "test_fully_shard*.py" 2>/dev/null || echo "No test_fully_shard files found"
-          echo ""
-          echo "=== Check if pytorch-src.tar.gz still exists ==="
-          ls -la pytorch-src.tar.gz || echo "pytorch-src.tar.gz not found after extraction"
-
       - name: Install wheels
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true

From d9d7db2c1730395e26bfb298ee2d790fea25eed8 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 7 May 2026 18:56:46 +0800
Subject: [PATCH 095/132] Implement matrix sharding strategy for test execution

Replace sequential test execution with 6 parallel shards using
run_test.py --shard parameter. Each shard runs independently with
crash isolation and uploads separate test artifacts.

Changes:
- Use run_test.py --dry-run for test counting
- Add matrix strategy with 6 shards max-parallel
- Execute tests with --shard <id> 6 parameter
- Upload separate artifacts per shard

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 320 +++++++++++++++++------------
 1 file changed, 192 insertions(+), 128 deletions(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index bbe65794ac..5116296287 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -28,9 +28,8 @@ jobs:
     runs-on: linux-aarch64-a3-16
     timeout-minutes: 180
     outputs:
-      distributed_matrix: ${{ steps.collect.outputs.distributed_matrix }}
-      regular_matrix: ${{ steps.collect.outputs.regular_matrix }}
-      total_cases: ${{ steps.collect.outputs.total_cases }}
+      test_matrix: ${{ steps.get_tests.outputs.test_matrix }}
+      total_tests: ${{ steps.get_tests.outputs.total_tests }}
 
     container:
       image: quay.io/kerer/pytorch:latest
@@ -209,83 +208,159 @@ jobs:
           path: ~/.cache/pip
           key: ${{ steps.pip_key.outputs.cache_key }}
 
-      - name: Collect all test cases
-        id: collect
+      # ==================== 使用 run_test.py 运行测试 ====================
+      - name: Get test list from run_test.py
+        id: get_tests
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
           source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
 
-          echo "=== NPU-SMI Info ==="
-          npu-smi info || echo "npu-smi not available"
+          # 切换到 test 目录执行，避免根目录 torch 与安装的 torch 库冲突
+          cd pytorch-src/test
 
-          python${{ env.PYTHON_VERSION }} .github/scripts/collect_all_cases.py \
-            --test-dir pytorch-src/test \
-            --distributed-shards ${{ inputs.distributed_shards || 2 }} \
-            --regular-shards ${{ inputs.regular_shards || 5 }} \
-            --output-dir cases_shards \
-            --parallel 16 \
-            --verbose
-
-          TOTAL=$(python${{ env.PYTHON_VERSION }} -c "import json; d=json.load(open('cases_shards/cases_collection_summary.json')); print(d['total_cases'])")
-          echo "total_cases=${TOTAL}" >> $GITHUB_OUTPUT
-
-          # Generate shard matrices
-          python${{ env.PYTHON_VERSION }} -c "
-          import json
-          d_shards = ${{ inputs.distributed_shards || 2 }}
-          r_shards = ${{ inputs.regular_shards || 5 }}
-          print(f'distributed_matrix={json.dumps(list(range(1, d_shards + 1)))}')
-          print(f'regular_matrix={json.dumps(list(range(1, r_shards + 1)))}')
-          " >> $GITHUB_OUTPUT
-
-      - name: Upload cases shard JSONs
-        uses: actions/upload-artifact@v4
+          # 获取测试总数（使用 dry-run 模式）
+          TEST_COUNT=$(python${{ env.PYTHON_VERSION }} run_test.py --dry-run 2>&1 | grep -o "Running [0-9]* items" | grep -o "[0-9]*" || echo "0")
+
+          echo "test_count=${TEST_COUNT}" >> $GITHUB_OUTPUT
+          echo "=== Tests to run (${TEST_COUNT} tests) ==="
+
+          # 输出矩阵配置 (6 shards)
+          echo "test_matrix=[1,2,3,4,5,6]" >> $GITHUB_OUTPUT
+          echo "total_tests=${TEST_COUNT}" >> $GITHUB_OUTPUT
+
+  # ==================== 矩阵分片测试 ====================
+  test:
+    needs: collect
+    runs-on: linux-aarch64-a3-16
+    timeout-minutes: 1200  # 20 hours per shard
+
+    strategy:
+      matrix:
+        shard: ${{ fromJson(needs.collect.outputs.test_matrix) }}
+      fail-fast: false
+      max-parallel: 6
+
+    container:
+      image: quay.io/kerer/pytorch:latest
+      options: --user root
+
+    steps:
+      - name: Checkout scripts
+        uses: actions/checkout@v4
         with:
-          name: cases-shards
-          path: cases_shards/
+          sparse-checkout: |
+            .github/scripts
+            .ci/docker
 
-      - name: Print collection summary
-        if: always()
+      # ==================== pip 缓存配置 ====================
+      - name: Get pip cache key
+        id: pip_key
         run: |
-          echo "=== Collection Summary ==="
-          if [ -f cases_shards/cases_collection_summary.json ]; then
-            cat cases_shards/cases_collection_summary.json
+          REQUIREMENTS_HASH=$(sha256sum .ci/docker/requirements-ci.txt | cut -d' ' -f1)
+          echo "cache_key=${{ env.CACHE_VERSION }}-pip-test-${{ env.PYTHON_VERSION }}-${REQUIREMENTS_HASH}" >> $GITHUB_OUTPUT
+
+      - name: Restore pip cache
+        uses: actions/cache/restore@v4
+        with:
+          path: ~/.cache/pip
+          key: ${{ steps.pip_key.outputs.cache_key }}
+          restore-keys: |
+            ${{ env.CACHE_VERSION }}-pip-test-${{ env.PYTHON_VERSION }}-
+            ${{ env.CACHE_VERSION }}-pip-test-
+
+      - name: Setup pip cache directory
+        run: |
+          mkdir -p ~/.cache/pip
+
+      - name: Configure pip index URL
+        run: |
+          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
+            pip${{ env.PYTHON_VERSION }} config set global.index-url ${{ env.PYPI_CACHE_URL }}
+            pip${{ env.PYTHON_VERSION }} config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local"
+            echo "pip index-url configured: ${{ env.PYPI_CACHE_URL }}"
           else
-            echo "Summary file not found"
+            echo "No PyPI cache URL configured, using default"
           fi
 
-      # ==================== 使用 run_test.py 运行测试 ====================
-      - name: Get test list from run_test.py
-        id: get_tests
+      - name: Upgrade pip and setuptools
         run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+          pip${{ env.PYTHON_VERSION }} install --upgrade pip setuptools wheel
 
-          # 切换到 test 目录执行，避免根目录 torch 与安装的 torch 库冲突
-          cd pytorch-src/test
+      # ==================== Download artifacts ====================
+      - name: Download torch wheel from workflow run
+        uses: dawidd6/action-download-artifact@v3
+        with:
+          run_id: 25473829132
+          name: torch-wheel-main
+          path: wheels
+          repo: Ascend/pytorch
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          skip_unpack: false
 
-          # 方法1: 直接从 discover_tests.py 获取测试列表（更可靠）
-          python${{ env.PYTHON_VERSION }} -c "
-          import sys
-          sys.path.insert(0, '../tools/testing')
-          from discover_tests import TESTS
-          with open('/tmp/test_list.txt', 'w') as f:
-              for t in TESTS:
-                  f.write(t + '\n')
-          print(f'Found {len(TESTS)} tests from discover_tests.py')
-          " 2>/dev/null || true
-
-          # 输出测试数量
-          TEST_COUNT=$(wc -l < /tmp/test_list.txt)
-          echo "test_count=${TEST_COUNT}" >> $GITHUB_OUTPUT
-          echo "=== Tests to run (${TEST_COUNT} tests) ==="
-          head -20 /tmp/test_list.txt
-          if [ $TEST_COUNT -gt 20 ]; then
-            echo "... and $(($TEST_COUNT - 20)) more tests"
+      - name: Download torch_npu wheel from workflow run
+        uses: dawidd6/action-download-artifact@v3
+        with:
+          run_id: 25473829132
+          name: torch-npu-wheel-main
+          path: wheels
+          repo: Ascend/pytorch
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Download PyTorch source from GitHub
+        run: |
+          PYTORCH_COMMIT=fccc94ae83f61fe26559abc999797297196bac29
+          PYTORCH_REPO=pytorch/pytorch
+          PROXY_URL=https://gh-proxy.test.osinfra.cn
+
+          echo "Downloading PyTorch source from commit: ${PYTORCH_COMMIT}"
+          echo "Using proxy: ${PROXY_URL}"
+
+          # Download source tarball via proxy (no submodules needed for testing)
+          curl -L "${PROXY_URL}/https://github.com/${PYTORCH_REPO}/archive/${PYTORCH_COMMIT}.tar.gz" \
+            -o pytorch-src.tar.gz
+
+          echo "Downloaded pytorch-src.tar.gz:"
+          ls -la pytorch-src.tar.gz
+
+      - name: Extract PyTorch source
+        run: |
+          # Extract source tarball
+          tar -xzf pytorch-src.tar.gz
+
+          # Rename directory to pytorch-src (tarball extracts as pytorch-fccc94ae...)
+          mv pytorch-${PYTORCH_COMMIT:-fccc94ae83f61fe26559abc999797297196bac29} pytorch-src 2>/dev/null || \
+            mv pytorch-pytorch-${PYTORCH_COMMIT:-fccc94ae83f61fe26559abc999797297196bac29} pytorch-src 2>/dev/null || \
+            true
+
+          # If still not renamed, find and rename the extracted directory
+          if [ ! -d pytorch-src ]; then
+            EXTRACTED_DIR=$(find . -maxdepth 1 -type d -name "pytorch-*" | head -1)
+            if [ -n "$EXTRACTED_DIR" ]; then
+              mv "$EXTRACTED_DIR" pytorch-src
+            fi
           fi
 
-      - name: Run tests using run_test.py
-        timeout-minutes: 1200  # 20 hours
+          echo "Extracted to pytorch-src directory:"
+          ls -la pytorch-src/ | head -10
+
+      - name: Install wheels
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          pip${{ env.PYTHON_VERSION }} install wheels/*.whl
+
+      - name: Install test dependencies
+        run: |
+          # 安装当前仓库的基础测试依赖
+          pip${{ env.PYTHON_VERSION }} install -r .ci/docker/requirements-ci.txt
+
+          # 安装上游 PyTorch 的完整测试依赖（需要先安装 torch）
+          pip${{ env.PYTHON_VERSION }} install -r pytorch-src/.ci/docker/requirements-ci.txt || true
+
+          # 安装 PyTorch 开发依赖（可选）
+          pip${{ env.PYTHON_VERSION }} install expecttest hypothesis fsspec sympy optree networkx psutil || true
+
+      - name: Verify NPU availability
+        continue-on-error: true
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
           source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
@@ -293,88 +368,77 @@ jobs:
           echo "=== NPU-SMI Info ==="
           npu-smi info || echo "npu-smi not available"
 
-          # 切换到 test 目录执行，避免根目录 torch 与安装的 torch 库冲突
-          cd pytorch-src/test
+          PYTHON=python${{ env.PYTHON_VERSION }}
+          echo ""
+          echo "=== PyTorch and NPU Info ==="
+          $PYTHON -c "
+          import torch
+          print(f'torch: {torch.__version__}')
+          import torch_npu
+          print(f'torch_npu: {torch_npu.__version__}')
+          print(f'NPU available: {torch.npu.is_available()}')
+          print(f'NPU count: {torch.npu.device_count()}')
+          " 2>&1 || echo "torch/torch_npu import failed"
 
-          echo "=== Starting test execution ==="
-          TEST_COUNT=0
-          PASSED=0
-          FAILED=0
-          START_TIME=$(date +%s)
+      # ==================== 保存 pip 缓存 ====================
+      - name: Save pip cache
+        if: always()
+        uses: actions/cache/save@v4
+        with:
+          path: ~/.cache/pip
+          key: ${{ steps.pip_key.outputs.cache_key }}
 
-          while IFS= read -r test_name; do
-            if [ -z "$test_name" ]; then
-              continue
-            fi
+      # ==================== 运行分片测试 ====================
+      - name: Run shard ${{ matrix.shard }} tests
+        timeout-minutes: 1200
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
 
-            TEST_COUNT=$((TEST_COUNT + 1))
-            TEST_START_TIME=$(date +%s)
-            echo ""
-            echo "============================================"
-            echo "Test ${TEST_COUNT}: ${test_name}"
-            echo "============================================"
-
-            # 运行测试，捕获输出到文件但不打印到控制台
-            set +e
-            python${{ env.PYTHON_VERSION }} run_test.py -i "${test_name}" -v > /tmp/test_output_${TEST_COUNT}.log 2>&1
-            TEST_EXIT_CODE=$?
-            set -e
-
-            TEST_END_TIME=$(date +%s)
-            TEST_DURATION=$((TEST_END_TIME - TEST_START_TIME))
-
-            # 检查结果
-            if [ $TEST_EXIT_CODE -eq 0 ]; then
-              echo ">>> ${test_name}: PASSED (${TEST_DURATION}s)"
-              PASSED=$((PASSED + 1))
-            else
-              echo ">>> ${test_name}: FAILED (exit code: ${TEST_EXIT_CODE}, ${TEST_DURATION}s)"
-              FAILED=$((FAILED + 1))
-              # 打印失败测试的最后几行日志
-              echo "--- Last 20 lines of log ---"
-              tail -20 /tmp/test_output_${TEST_COUNT}.log
-              echo "--- End of log preview ---"
-            fi
+          echo "=== Running Shard ${{ matrix.shard }} of 6 ==="
+          echo "=== NPU-SMI Info ==="
+          npu-smi info || echo "npu-smi not available"
 
-            CURRENT_TIME=$(date +%s)
-            TOTAL_DURATION=$((CURRENT_TIME - START_TIME))
-            echo ""
-            echo "--------------------------------------------"
-            echo "Summary: ${PASSED} passed, ${FAILED} failed, ${TEST_COUNT} total"
-            echo "Elapsed: ${TOTAL_DURATION}s ($(date -d "@${START_TIME}" +%H:%M:%S) - $(date +%H:%M:%S))"
-            echo "--------------------------------------------"
+          # 切换到 test 目录执行
+          cd pytorch-src/test
 
-          done < /tmp/test_list.txt
+          # 使用 run_test.py 的分片功能
+          # --shard <shard_id> <num_shards>
+          # shard_id: 1-6 (当前分片ID)
+          # num_shards: 6 (总分片数)
+          START_TIME=$(date +%s)
+
+          python${{ env.PYTHON_VERSION }} run_test.py \
+            --shard ${{ matrix.shard }} 6 \
+            --verbose \
+            --exclude-jit-executor \
+            --exclude-distributed-tests \
+            2>&1 | tee /tmp/test_output_shard_${{ matrix.shard }}.log
 
           END_TIME=$(date +%s)
-          TOTAL_DURATION=$((END_TIME - START_TIME))
+          DURATION=$((END_TIME - START_TIME))
 
           echo ""
           echo "============================================"
-          echo "=== FINAL SUMMARY ==="
-          echo "Total tests run: ${TEST_COUNT}"
-          echo "Passed: ${PASSED}"
-          echo "Failed: ${FAILED}"
-          echo "Total time: ${TOTAL_DURATION}s ($(date -d "@${START_TIME}" +%H:%M:%S) - $(date -d "@${END_TIME}" +%H:%M:%S))"
+          echo "=== Shard ${{ matrix.shard }} Completed ==="
+          echo "Duration: ${DURATION}s ($(date -d "@${START_TIME}" +%H:%M:%S) - $(date -d "@${END_TIME}" +%H:%M:%S))"
           echo "============================================"
 
-          # 保存结果摘要（包含时间信息）
-          echo "{\"total\": ${TEST_COUNT}, \"passed\": ${PASSED}, \"failed\": ${FAILED}, \"duration_seconds\": ${TOTAL_DURATION}}" > /tmp/test_results_summary.json
-
-      - name: Upload test results
+      - name: Upload test results for shard ${{ matrix.shard }}
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: test-run-results
+          name: test-results-shard-${{ matrix.shard }}
           path: |
-            /tmp/test_output_*.log
-            /tmp/test_results_summary.json
+            /tmp/test_output_shard_${{ matrix.shard }}.log
+            pytorch-src/test/test-reports/
           retention-days: 7
 
-      - name: Print final summary
+      - name: Print shard summary
         if: always()
         run: |
-          echo "=== Final Test Results ==="
-          if [ -f /tmp/test_results_summary.json ]; then
-            cat /tmp/test_results_summary.json
+          echo "=== Shard ${{ matrix.shard }} Summary ==="
+          if [ -f /tmp/test_output_shard_${{ matrix.shard }}.log ]; then
+            echo "Last 50 lines of output:"
+            tail -50 /tmp/test_output_shard_${{ matrix.shard }}.log
           fi

From 6581d058b266e10a9f8d5cc446af0c446f635d4f Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 7 May 2026 19:15:18 +0800
Subject: [PATCH 096/132] Fix test collection: use discover_tests.py instead of
 dry-run

run_test.py --dry-run does not output "Running X items" format,
so grep parsing failed and returned 0 tests. Revert to the
original reliable method using discover_tests.TESTS list.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index 5116296287..7d1d4c5d81 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -218,11 +218,25 @@ jobs:
           # 切换到 test 目录执行，避免根目录 torch 与安装的 torch 库冲突
           cd pytorch-src/test
 
-          # 获取测试总数（使用 dry-run 模式）
-          TEST_COUNT=$(python${{ env.PYTHON_VERSION }} run_test.py --dry-run 2>&1 | grep -o "Running [0-9]* items" | grep -o "[0-9]*" || echo "0")
-
+          # 直接从 discover_tests.py 获取测试列表（更可靠）
+          python${{ env.PYTHON_VERSION }} -c "
+          import sys
+          sys.path.insert(0, '../tools/testing')
+          from discover_tests import TESTS
+          with open('/tmp/test_list.txt', 'w') as f:
+              for t in TESTS:
+                  f.write(t + '\n')
+          print(f'Found {len(TESTS)} tests from discover_tests.py')
+          " 2>/dev/null || true
+
+          # 输出测试数量
+          TEST_COUNT=$(wc -l < /tmp/test_list.txt)
           echo "test_count=${TEST_COUNT}" >> $GITHUB_OUTPUT
           echo "=== Tests to run (${TEST_COUNT} tests) ==="
+          head -20 /tmp/test_list.txt
+          if [ $TEST_COUNT -gt 20 ]; then
+            echo "... and $(($TEST_COUNT - 20)) more tests"
+          fi
 
           # 输出矩阵配置 (6 shards)
           echo "test_matrix=[1,2,3,4,5,6]" >> $GITHUB_OUTPUT

From cf2d3f5afd6618948ec238e89c4c0563c0258006 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 7 May 2026 19:24:19 +0800
Subject: [PATCH 097/132] Implement manual sharding: split test list into 6
 JSON shards

Changes:
- Collect job: Get TESTS list from discover_tests.py and split
  into 6 equal shards, each saved as shard_1.json to shard_6.json
- Upload shard JSONs as artifact for test jobs
- Test job: Download corresponding shard JSON, execute each test
  individually using run_test.py -i <test_name> -v
- Each test runs in separate subprocess for crash isolation
- Upload individual test logs per shard

This approach provides better control and visibility than
run_test.py --shard parameter.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 178 +++++++++++++++++++++++------
 1 file changed, 141 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index 7d1d4c5d81..817f994020 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -208,8 +208,8 @@ jobs:
           path: ~/.cache/pip
           key: ${{ steps.pip_key.outputs.cache_key }}
 
-      # ==================== 使用 run_test.py 运行测试 ====================
-      - name: Get test list from run_test.py
+      # ==================== 使用 discover_tests.py 获取测试列表并分片 ====================
+      - name: Get test list and create shards
         id: get_tests
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
@@ -218,29 +218,71 @@ jobs:
           # 切换到 test 目录执行，避免根目录 torch 与安装的 torch 库冲突
           cd pytorch-src/test
 
-          # 直接从 discover_tests.py 获取测试列表（更可靠）
+          # 创建分片 JSON 文件目录
+          mkdir -p /tmp/shards
+
+          # 从 discover_tests.py 获取测试列表并分片
           python${{ env.PYTHON_VERSION }} -c "
           import sys
+          import json
           sys.path.insert(0, '../tools/testing')
           from discover_tests import TESTS
-          with open('/tmp/test_list.txt', 'w') as f:
-              for t in TESTS:
-                  f.write(t + '\n')
-          print(f'Found {len(TESTS)} tests from discover_tests.py')
-          " 2>/dev/null || true
-
-          # 输出测试数量
-          TEST_COUNT=$(wc -l < /tmp/test_list.txt)
-          echo "test_count=${TEST_COUNT}" >> $GITHUB_OUTPUT
-          echo "=== Tests to run (${TEST_COUNT} tests) ==="
-          head -20 /tmp/test_list.txt
-          if [ $TEST_COUNT -gt 20 ]; then
-            echo "... and $(($TEST_COUNT - 20)) more tests"
-          fi
+
+          NUM_SHARDS = 6
+
+          # 平均分片
+          tests_per_shard = len(TESTS) // NUM_SHARDS
+          remainder = len(TESTS) % NUM_SHARDS
+
+          shard_data = {}
+          start_idx = 0
+
+          for shard_id in range(1, NUM_SHARDS + 1):
+              # 前几个 shard 多分配余数
+              extra = 1 if shard_id <= remainder else 0
+              end_idx = start_idx + tests_per_shard + extra
+              shard_tests = TESTS[start_idx:end_idx]
+
+              shard_data[shard_id] = {
+                  'shard_id': shard_id,
+                  'total_shards': NUM_SHARDS,
+                  'tests': shard_tests,
+                  'test_count': len(shard_tests)
+              }
+
+              # 写入 JSON 文件
+              with open(f'/tmp/shards/shard_{shard_id}.json', 'w') as f:
+                  json.dump(shard_data[shard_id], f, indent=2)
+
+              print(f'Shard {shard_id}: {len(shard_tests)} tests')
+              start_idx = end_idx
+
+          # 写入汇总信息
+          summary = {
+              'total_tests': len(TESTS),
+              'num_shards': NUM_SHARDS,
+              'shard_distribution': {k: v['test_count'] for k, v in shard_data.items()}
+          }
+          with open('/tmp/shards/summary.json', 'w') as f:
+              json.dump(summary, f, indent=2)
+
+          print(f'Total tests: {len(TESTS)}')
+          "
 
           # 输出矩阵配置 (6 shards)
           echo "test_matrix=[1,2,3,4,5,6]" >> $GITHUB_OUTPUT
-          echo "total_tests=${TEST_COUNT}" >> $GITHUB_OUTPUT
+          echo "total_tests=$(cat /tmp/shards/summary.json | grep -o '\"total_tests\": [0-9]*' | grep -o '[0-9]*')" >> $GITHUB_OUTPUT
+
+          # 显示分片分布
+          echo "=== Shard Distribution ==="
+          cat /tmp/shards/summary.json
+
+      - name: Upload shard JSONs
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-shards
+          path: /tmp/shards/
+          retention-days: 1
 
   # ==================== 矩阵分片测试 ====================
   test:
@@ -403,6 +445,12 @@ jobs:
           key: ${{ steps.pip_key.outputs.cache_key }}
 
       # ==================== 运行分片测试 ====================
+      - name: Download shard JSON
+        uses: actions/download-artifact@v4
+        with:
+          name: test-shards
+          path: shards
+
       - name: Run shard ${{ matrix.shard }} tests
         timeout-minutes: 1200
         run: |
@@ -413,46 +461,102 @@ jobs:
           echo "=== NPU-SMI Info ==="
           npu-smi info || echo "npu-smi not available"
 
-          # 切换到 test 目录执行
+          # 切换到 test 目录执行，避免根目录 torch 与安装的 torch 库冲突
           cd pytorch-src/test
 
-          # 使用 run_test.py 的分片功能
-          # --shard <shard_id> <num_shards>
-          # shard_id: 1-6 (当前分片ID)
-          # num_shards: 6 (总分片数)
+          # 从 JSON 文件获取测试列表
+          TEST_LIST=$(cat ../shards/shard_${{ matrix.shard }}.json | python${{ env.PYTHON_VERSION }} -c "import sys, json; data = json.load(sys.stdin); print('\n'.join(data['tests']))")
+
+          # 保存到临时文件
+          echo "$TEST_LIST" > /tmp/test_list_shard_${{ matrix.shard }}.txt
+
+          TEST_COUNT_IN_SHARD=$(wc -l < /tmp/test_list_shard_${{ matrix.shard }}.txt)
+          echo "=== Shard ${{ matrix.shard }} contains ${TEST_COUNT_IN_SHARD} tests ==="
+          head -10 /tmp/test_list_shard_${{ matrix.shard }}.txt
+          if [ $TEST_COUNT_IN_SHARD -gt 10 ]; then
+            echo "... and $(($TEST_COUNT_IN_SHARD - 10)) more tests"
+          fi
+
+          # 开始执行测试
+          echo "=== Starting test execution ==="
+          TEST_COUNT=0
+          PASSED=0
+          FAILED=0
           START_TIME=$(date +%s)
 
-          python${{ env.PYTHON_VERSION }} run_test.py \
-            --shard ${{ matrix.shard }} 6 \
-            --verbose \
-            --exclude-jit-executor \
-            --exclude-distributed-tests \
-            2>&1 | tee /tmp/test_output_shard_${{ matrix.shard }}.log
+          while IFS= read -r test_name; do
+            if [ -z "$test_name" ]; then
+              continue
+            fi
+
+            TEST_COUNT=$((TEST_COUNT + 1))
+            TEST_START_TIME=$(date +%s)
+            echo ""
+            echo "============================================"
+            echo "Test ${TEST_COUNT}/${TEST_COUNT_IN_SHARD}: ${test_name}"
+            echo "============================================"
+
+            # 运行测试，捕获输出到文件但不打印到控制台
+            set +e
+            python${{ env.PYTHON_VERSION }} run_test.py -i "${test_name}" -v > /tmp/test_output_${{ matrix.shard }}_${TEST_COUNT}.log 2>&1
+            TEST_EXIT_CODE=$?
+            set -e
+
+            TEST_END_TIME=$(date +%s)
+            TEST_DURATION=$((TEST_END_TIME - TEST_START_TIME))
+
+            # 检查结果
+            if [ $TEST_EXIT_CODE -eq 0 ]; then
+              echo ">>> ${test_name}: PASSED (${TEST_DURATION}s)"
+              PASSED=$((PASSED + 1))
+            else
+              echo ">>> ${test_name}: FAILED (exit code: ${TEST_EXIT_CODE}, ${TEST_DURATION}s)"
+              FAILED=$((FAILED + 1))
+              # 打印失败测试的最后几行日志
+              echo "--- Last 20 lines of log ---"
+              tail -20 /tmp/test_output_${{ matrix.shard }}_${TEST_COUNT}.log
+              echo "--- End of log preview ---"
+            fi
+
+            CURRENT_TIME=$(date +%s)
+            TOTAL_DURATION=$((CURRENT_TIME - START_TIME))
+            echo ""
+            echo "--------------------------------------------"
+            echo "Shard ${{ matrix.shard }} Summary: ${PASSED} passed, ${FAILED} failed, ${TEST_COUNT}/${TEST_COUNT_IN_SHARD} done"
+            echo "Elapsed: ${TOTAL_DURATION}s ($(date -d "@${START_TIME}" +%H:%M:%S) - $(date +%H:%M:%S))"
+            echo "--------------------------------------------"
+
+          done < /tmp/test_list_shard_${{ matrix.shard }}.txt
 
           END_TIME=$(date +%s)
-          DURATION=$((END_TIME - START_TIME))
+          TOTAL_DURATION=$((END_TIME - START_TIME))
 
           echo ""
           echo "============================================"
-          echo "=== Shard ${{ matrix.shard }} Completed ==="
-          echo "Duration: ${DURATION}s ($(date -d "@${START_TIME}" +%H:%M:%S) - $(date -d "@${END_TIME}" +%H:%M:%S))"
+          echo "=== SHARD ${{ matrix.shard }} FINAL SUMMARY ==="
+          echo "Total tests run: ${TEST_COUNT}"
+          echo "Passed: ${PASSED}"
+          echo "Failed: ${FAILED}"
+          echo "Total time: ${TOTAL_DURATION}s ($(date -d "@${START_TIME}" +%H:%M:%S) - $(date -d "@${END_TIME}" +%H:%M:%S))"
           echo "============================================"
 
+          # 保存结果摘要（包含时间信息）
+          echo "{\"shard_id\": ${{ matrix.shard }}, \"total\": ${TEST_COUNT}, \"passed\": ${PASSED}, \"failed\": ${FAILED}, \"duration_seconds\": ${TOTAL_DURATION}}" > /tmp/test_results_summary_shard_${{ matrix.shard }}.json
+
       - name: Upload test results for shard ${{ matrix.shard }}
         if: always()
         uses: actions/upload-artifact@v4
         with:
           name: test-results-shard-${{ matrix.shard }}
           path: |
-            /tmp/test_output_shard_${{ matrix.shard }}.log
-            pytorch-src/test/test-reports/
+            /tmp/test_output_${{ matrix.shard }}_*.log
+            /tmp/test_results_summary_shard_${{ matrix.shard }}.json
           retention-days: 7
 
       - name: Print shard summary
         if: always()
         run: |
           echo "=== Shard ${{ matrix.shard }} Summary ==="
-          if [ -f /tmp/test_output_shard_${{ matrix.shard }}.log ]; then
-            echo "Last 50 lines of output:"
-            tail -50 /tmp/test_output_shard_${{ matrix.shard }}.log
+          if [ -f /tmp/test_results_summary_shard_${{ matrix.shard }}.json ]; then
+            cat /tmp/test_results_summary_shard_${{ matrix.shard }}.json
           fi

From ab209556aad868759374079599b1a0a105dd6cbd Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 7 May 2026 19:57:52 +0800
Subject: [PATCH 098/132] Fix shard JSON path: read before cd to test directory

The download-artifact puts shards in workspace root, but the
script was reading ../shards/ after cd pytorch-src/test.

Fix:
- Read shard JSON in workspace root before cd
- Add verification step to show shard files
- Use /tmp for test list (absolute path)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index 817f994020..7192a1b03f 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -451,6 +451,14 @@ jobs:
           name: test-shards
           path: shards
 
+      - name: Verify shard files
+        run: |
+          echo "=== Checking shard files ==="
+          ls -la shards/
+          echo ""
+          echo "=== Shard ${{ matrix.shard }} content preview ==="
+          cat shards/shard_${{ matrix.shard }}.json | head -20
+
       - name: Run shard ${{ matrix.shard }} tests
         timeout-minutes: 1200
         run: |
@@ -461,11 +469,8 @@ jobs:
           echo "=== NPU-SMI Info ==="
           npu-smi info || echo "npu-smi not available"
 
-          # 切换到 test 目录执行，避免根目录 torch 与安装的 torch 库冲突
-          cd pytorch-src/test
-
-          # 从 JSON 文件获取测试列表
-          TEST_LIST=$(cat ../shards/shard_${{ matrix.shard }}.json | python${{ env.PYTHON_VERSION }} -c "import sys, json; data = json.load(sys.stdin); print('\n'.join(data['tests']))")
+          # 从 JSON 文件获取测试列表（在工作目录根目录）
+          TEST_LIST=$(cat shards/shard_${{ matrix.shard }}.json | python${{ env.PYTHON_VERSION }} -c "import sys, json; data = json.load(sys.stdin); print('\n'.join(data['tests']))")
 
           # 保存到临时文件
           echo "$TEST_LIST" > /tmp/test_list_shard_${{ matrix.shard }}.txt
@@ -477,6 +482,9 @@ jobs:
             echo "... and $(($TEST_COUNT_IN_SHARD - 10)) more tests"
           fi
 
+          # 切换到 test 目录执行，避免根目录 torch 与安装的 torch 库冲突
+          cd pytorch-src/test
+
           # 开始执行测试
           echo "=== Starting test execution ==="
           TEST_COUNT=0

From 951f6e8f92aec6a4df4130f1360bcd5756f68a34 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Fri, 8 May 2026 09:09:04 +0800
Subject: [PATCH 099/132] Add timeout control and crash detection for test
 execution

Improvements:
- Add single test timeout (30 minutes) using timeout command
- Distinguish between PASSED, FAILED, TIMEOUT, and CRASHED
- timeout exit code 124 = test timed out
- exit code >= 128 = crash (signal killed process)
- exit code 137 = OOM killed, 139 = segfault, etc.
- Add crash detection with signal number reporting
- Update summary to show all status categories

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 36 +++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index 7192a1b03f..2fb5db06d6 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -490,8 +490,13 @@ jobs:
           TEST_COUNT=0
           PASSED=0
           FAILED=0
+          TIMEOUT=0
+          CRASHED=0
           START_TIME=$(date +%s)
 
+          # 单测试超时：30分钟
+          SINGLE_TEST_TIMEOUT=1800
+
           while IFS= read -r test_name; do
             if [ -z "$test_name" ]; then
               continue
@@ -504,25 +509,38 @@ jobs:
             echo "Test ${TEST_COUNT}/${TEST_COUNT_IN_SHARD}: ${test_name}"
             echo "============================================"
 
-            # 运行测试，捕获输出到文件但不打印到控制台
+            # 运行测试，捕获输出到文件，带超时控制
             set +e
-            python${{ env.PYTHON_VERSION }} run_test.py -i "${test_name}" -v > /tmp/test_output_${{ matrix.shard }}_${TEST_COUNT}.log 2>&1
+            timeout ${SINGLE_TEST_TIMEOUT}s python${{ env.PYTHON_VERSION }} run_test.py -i "${test_name}" -v > /tmp/test_output_${{ matrix.shard }}_${TEST_COUNT}.log 2>&1
             TEST_EXIT_CODE=$?
             set -e
 
             TEST_END_TIME=$(date +%s)
             TEST_DURATION=$((TEST_END_TIME - TEST_START_TIME))
 
-            # 检查结果
+            # 检查结果（区分失败、超时、崩溃）
+            # timeout exit codes: 124=timeout, 137=OOM killed, 139=segfault, etc.
             if [ $TEST_EXIT_CODE -eq 0 ]; then
               echo ">>> ${test_name}: PASSED (${TEST_DURATION}s)"
               PASSED=$((PASSED + 1))
+            elif [ $TEST_EXIT_CODE -eq 124 ]; then
+              echo ">>> ${test_name}: TIMEOUT (${TEST_DURATION}s, killed after ${SINGLE_TEST_TIMEOUT}s)"
+              TIMEOUT=$((TIMEOUT + 1))
+              echo "--- Last 20 lines of log ---"
+              tail -20 /tmp/test_output_${{ matrix.shard }}_${TEST_COUNT}.log || echo "(log file may be empty)"
+              echo "--- End of log preview ---"
             else
-              echo ">>> ${test_name}: FAILED (exit code: ${TEST_EXIT_CODE}, ${TEST_DURATION}s)"
-              FAILED=$((FAILED + 1))
+              # 区分普通失败和可能的崩溃
+              if [ $TEST_EXIT_CODE -ge 128 ]; then
+                echo ">>> ${test_name}: CRASHED (exit code: ${TEST_EXIT_CODE}, signal: $(($TEST_EXIT_CODE - 128)), ${TEST_DURATION}s)"
+                CRASHED=$((CRASHED + 1))
+              else
+                echo ">>> ${test_name}: FAILED (exit code: ${TEST_EXIT_CODE}, ${TEST_DURATION}s)"
+                FAILED=$((FAILED + 1))
+              fi
               # 打印失败测试的最后几行日志
               echo "--- Last 20 lines of log ---"
-              tail -20 /tmp/test_output_${{ matrix.shard }}_${TEST_COUNT}.log
+              tail -20 /tmp/test_output_${{ matrix.shard }}_${TEST_COUNT}.log || echo "(log file may be empty)"
               echo "--- End of log preview ---"
             fi
 
@@ -530,7 +548,7 @@ jobs:
             TOTAL_DURATION=$((CURRENT_TIME - START_TIME))
             echo ""
             echo "--------------------------------------------"
-            echo "Shard ${{ matrix.shard }} Summary: ${PASSED} passed, ${FAILED} failed, ${TEST_COUNT}/${TEST_COUNT_IN_SHARD} done"
+            echo "Shard ${{ matrix.shard }} Summary: ${PASSED} passed, ${FAILED} failed, ${TIMEOUT} timeout, ${CRASHED} crashed, ${TEST_COUNT}/${TEST_COUNT_IN_SHARD} done"
             echo "Elapsed: ${TOTAL_DURATION}s ($(date -d "@${START_TIME}" +%H:%M:%S) - $(date +%H:%M:%S))"
             echo "--------------------------------------------"
 
@@ -545,11 +563,13 @@ jobs:
           echo "Total tests run: ${TEST_COUNT}"
           echo "Passed: ${PASSED}"
           echo "Failed: ${FAILED}"
+          echo "Timeout: ${TIMEOUT}"
+          echo "Crashed: ${CRASHED}"
           echo "Total time: ${TOTAL_DURATION}s ($(date -d "@${START_TIME}" +%H:%M:%S) - $(date -d "@${END_TIME}" +%H:%M:%S))"
           echo "============================================"
 
           # 保存结果摘要（包含时间信息）
-          echo "{\"shard_id\": ${{ matrix.shard }}, \"total\": ${TEST_COUNT}, \"passed\": ${PASSED}, \"failed\": ${FAILED}, \"duration_seconds\": ${TOTAL_DURATION}}" > /tmp/test_results_summary_shard_${{ matrix.shard }}.json
+          echo "{\"shard_id\": ${{ matrix.shard }}, \"total\": ${TEST_COUNT}, \"passed\": ${PASSED}, \"failed\": ${FAILED}, \"timeout\": ${TIMEOUT}, \"crashed\": ${CRASHED}, \"duration_seconds\": ${TOTAL_DURATION}}" > /tmp/test_results_summary_shard_${{ matrix.shard }}.json
 
       - name: Upload test results for shard ${{ matrix.shard }}
         if: always()

From 85379e192c335d4d6235b7a7557a8e6d7b99b2c4 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Fri, 8 May 2026 09:19:16 +0800
Subject: [PATCH 100/132] Remove single test timeout, keep crash detection

Job timeout-minutes: 1200 controls overall execution time.
Single test timeout is not needed.

Keep crash detection: exit code >= 128 indicates signal kill
(137=OOM, 139=segfault, etc.)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 34 ++++++++++--------------------
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index 2fb5db06d6..ac961ddfcc 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -490,13 +490,9 @@ jobs:
           TEST_COUNT=0
           PASSED=0
           FAILED=0
-          TIMEOUT=0
           CRASHED=0
           START_TIME=$(date +%s)
 
-          # 单测试超时：30分钟
-          SINGLE_TEST_TIMEOUT=1800
-
           while IFS= read -r test_name; do
             if [ -z "$test_name" ]; then
               continue
@@ -509,36 +505,29 @@ jobs:
             echo "Test ${TEST_COUNT}/${TEST_COUNT_IN_SHARD}: ${test_name}"
             echo "============================================"
 
-            # 运行测试，捕获输出到文件，带超时控制
+            # 运行测试，捕获输出到文件
             set +e
-            timeout ${SINGLE_TEST_TIMEOUT}s python${{ env.PYTHON_VERSION }} run_test.py -i "${test_name}" -v > /tmp/test_output_${{ matrix.shard }}_${TEST_COUNT}.log 2>&1
+            python${{ env.PYTHON_VERSION }} run_test.py -i "${test_name}" -v > /tmp/test_output_${{ matrix.shard }}_${TEST_COUNT}.log 2>&1
             TEST_EXIT_CODE=$?
             set -e
 
             TEST_END_TIME=$(date +%s)
             TEST_DURATION=$((TEST_END_TIME - TEST_START_TIME))
 
-            # 检查结果（区分失败、超时、崩溃）
-            # timeout exit codes: 124=timeout, 137=OOM killed, 139=segfault, etc.
+            # 检查结果（区分失败和崩溃）
+            # exit code >= 128 表示被信号杀死（如 137=OOM, 139=segfault）
             if [ $TEST_EXIT_CODE -eq 0 ]; then
               echo ">>> ${test_name}: PASSED (${TEST_DURATION}s)"
               PASSED=$((PASSED + 1))
-            elif [ $TEST_EXIT_CODE -eq 124 ]; then
-              echo ">>> ${test_name}: TIMEOUT (${TEST_DURATION}s, killed after ${SINGLE_TEST_TIMEOUT}s)"
-              TIMEOUT=$((TIMEOUT + 1))
+            elif [ $TEST_EXIT_CODE -ge 128 ]; then
+              echo ">>> ${test_name}: CRASHED (exit code: ${TEST_EXIT_CODE}, signal: $(($TEST_EXIT_CODE - 128)), ${TEST_DURATION}s)"
+              CRASHED=$((CRASHED + 1))
               echo "--- Last 20 lines of log ---"
               tail -20 /tmp/test_output_${{ matrix.shard }}_${TEST_COUNT}.log || echo "(log file may be empty)"
               echo "--- End of log preview ---"
             else
-              # 区分普通失败和可能的崩溃
-              if [ $TEST_EXIT_CODE -ge 128 ]; then
-                echo ">>> ${test_name}: CRASHED (exit code: ${TEST_EXIT_CODE}, signal: $(($TEST_EXIT_CODE - 128)), ${TEST_DURATION}s)"
-                CRASHED=$((CRASHED + 1))
-              else
-                echo ">>> ${test_name}: FAILED (exit code: ${TEST_EXIT_CODE}, ${TEST_DURATION}s)"
-                FAILED=$((FAILED + 1))
-              fi
-              # 打印失败测试的最后几行日志
+              echo ">>> ${test_name}: FAILED (exit code: ${TEST_EXIT_CODE}, ${TEST_DURATION}s)"
+              FAILED=$((FAILED + 1))
               echo "--- Last 20 lines of log ---"
               tail -20 /tmp/test_output_${{ matrix.shard }}_${TEST_COUNT}.log || echo "(log file may be empty)"
               echo "--- End of log preview ---"
@@ -548,7 +537,7 @@ jobs:
             TOTAL_DURATION=$((CURRENT_TIME - START_TIME))
             echo ""
             echo "--------------------------------------------"
-            echo "Shard ${{ matrix.shard }} Summary: ${PASSED} passed, ${FAILED} failed, ${TIMEOUT} timeout, ${CRASHED} crashed, ${TEST_COUNT}/${TEST_COUNT_IN_SHARD} done"
+            echo "Shard ${{ matrix.shard }} Summary: ${PASSED} passed, ${FAILED} failed, ${CRASHED} crashed, ${TEST_COUNT}/${TEST_COUNT_IN_SHARD} done"
             echo "Elapsed: ${TOTAL_DURATION}s ($(date -d "@${START_TIME}" +%H:%M:%S) - $(date +%H:%M:%S))"
             echo "--------------------------------------------"
 
@@ -563,13 +552,12 @@ jobs:
           echo "Total tests run: ${TEST_COUNT}"
           echo "Passed: ${PASSED}"
           echo "Failed: ${FAILED}"
-          echo "Timeout: ${TIMEOUT}"
           echo "Crashed: ${CRASHED}"
           echo "Total time: ${TOTAL_DURATION}s ($(date -d "@${START_TIME}" +%H:%M:%S) - $(date -d "@${END_TIME}" +%H:%M:%S))"
           echo "============================================"
 
           # 保存结果摘要（包含时间信息）
-          echo "{\"shard_id\": ${{ matrix.shard }}, \"total\": ${TEST_COUNT}, \"passed\": ${PASSED}, \"failed\": ${FAILED}, \"timeout\": ${TIMEOUT}, \"crashed\": ${CRASHED}, \"duration_seconds\": ${TOTAL_DURATION}}" > /tmp/test_results_summary_shard_${{ matrix.shard }}.json
+          echo "{\"shard_id\": ${{ matrix.shard }}, \"total\": ${TEST_COUNT}, \"passed\": ${PASSED}, \"failed\": ${FAILED}, \"crashed\": ${CRASHED}, \"duration_seconds\": ${TOTAL_DURATION}}" > /tmp/test_results_summary_shard_${{ matrix.shard }}.json
 
       - name: Upload test results for shard ${{ matrix.shard }}
         if: always()

From 3da78947eed519cd8062f75cb9c2d3650e023d78 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Fri, 8 May 2026 12:20:53 +0800
Subject: [PATCH 101/132] Remove PR trigger from test-collect.yml

Only keep workflow_dispatch for manual triggering.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/test-collect.yml | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
index ac961ddfcc..a53086dcc4 100644
--- a/.github/workflows/test-collect.yml
+++ b/.github/workflows/test-collect.yml
@@ -1,12 +1,6 @@
 name: Test Collect Stage
 
 on:
-  pull_request:
-    types: [opened, synchronize, reopened]
-    branches:
-      - main
-      - master
-      - 'release/**'
   workflow_dispatch:
     inputs:
       distributed_shards:

From 867eb10207ddcb39b14517a80b935abc7fbba613 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Fri, 8 May 2026 12:21:36 +0800
Subject: [PATCH 102/132] Add PR trigger to npu-sync-test.yml

Trigger on PR events targeting main, master, or release branches.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/npu-sync-test.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/npu-sync-test.yml b/.github/workflows/npu-sync-test.yml
index 29ae49d936..1f6c16b88e 100644
--- a/.github/workflows/npu-sync-test.yml
+++ b/.github/workflows/npu-sync-test.yml
@@ -1,6 +1,12 @@
 name: PyTorch NPU Sync Test
 
 on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+    branches:
+      - main
+      - master
+      - 'release/**'
   workflow_dispatch:
     inputs:
       docker_image_tag:

From 3e4e5d68a32a5c344d892f4d1056aa175e8e4a5d Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Fri, 8 May 2026 16:19:35 +0800
Subject: [PATCH 103/132] Add test progress logging in run_npu_test_shard.py

Before execution, print:
- Test index/total: "Test 6/93911: pytest -v --timeout=300 -x case_id"
- Running case: "Running: case_id"
After execution, print result: "[status] duration"

This provides clear progress visibility during test execution.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/scripts/run_npu_test_shard.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/.github/scripts/run_npu_test_shard.py b/.github/scripts/run_npu_test_shard.py
index d6efcedf20..582a56773d 100644
--- a/.github/scripts/run_npu_test_shard.py
+++ b/.github/scripts/run_npu_test_shard.py
@@ -22,6 +22,8 @@ def run_single_case(
     case_id: str,
     test_dir: Path,
     timeout: int,
+    test_index: int = 0,
+    total_tests: int = 0,
     verbose: bool = False
 ) -> Dict:
     """Run a single test case in a subprocess."""
@@ -48,7 +50,10 @@ def run_single_case(
 
     cmd = ["pytest", "-v", "--timeout=300", "-x", case_id]
 
+    # Print progress and command before execution
     if verbose:
+        if test_index > 0 and total_tests > 0:
+            print(f"Test {test_index}/{total_tests}: { ' '.join(cmd)}")
         print(f"Running: {case_id}")
 
     start_time = time.time()
@@ -144,14 +149,16 @@ def run_shard(
 
     print(f"Running tests with {max_workers} workers...")
 
+    total_tests = len(cases)
+
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         futures = {
-            executor.submit(run_single_case, case, test_dir_path, timeout, verbose): case
-            for case in cases
+            executor.submit(run_single_case, case, test_dir_path, timeout, idx, total_tests, verbose): (case, idx)
+            for idx, case in enumerate(cases, start=1)
         }
 
         for future in as_completed(futures):
-            case = futures[future]
+            case, idx = futures[future]
             result = future.result()
             results.append(result)
 

From 736176ed3c4e3df64f5dd6e3b36c452b4027ac45 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Fri, 8 May 2026 16:27:07 +0800
Subject: [PATCH 104/132] Simplify test results JSON: only keep case_id,
 duration, status

Remove output, error, timestamp fields to reduce file size.
Results JSON is uploaded to artifacts via _test.yml.

Format:
{
  "results": [
    {"case_id": "...", "duration": 10.5, "status": "passed"},
    ...
  ]
}

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/scripts/run_npu_test_shard.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/run_npu_test_shard.py b/.github/scripts/run_npu_test_shard.py
index 582a56773d..21b702f9ff 100644
--- a/.github/scripts/run_npu_test_shard.py
+++ b/.github/scripts/run_npu_test_shard.py
@@ -168,7 +168,17 @@ def run_shard(
             else:
                 stats["unknown"] += 1
 
-    # Save results
+    # Save results - simplified format with only case_id, duration, status
+    simplified_results = [
+        {
+            "case_id": r["case_id"],
+            "duration": r["duration"],
+            "status": r["status"],
+        }
+        for r in results
+    ]
+
+    # Save simplified results JSON
     results_file = report_dir_path / f"shard_{shard_index}_results.json"
     with open(results_file, "w") as f:
         json.dump({
@@ -176,7 +186,7 @@ def run_shard(
             "total_shards": total_shards,
             "total_cases": len(cases),
             "stats": stats,
-            "results": results,
+            "results": simplified_results,
         }, f, indent=2)
 
     print(f"\nShard {shard_index} Results saved to {results_file}")

From e17c62f187ea474b077dd15156d3d97ffd5b8f35 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Fri, 8 May 2026 16:30:24 +0800
Subject: [PATCH 105/132] Restore full results JSON with all fields

Keep complete test results including:
- case_id, duration, status (core fields)
- output, error, timestamp (diagnostic fields)

Reverts previous simplified format change.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/scripts/run_npu_test_shard.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/.github/scripts/run_npu_test_shard.py b/.github/scripts/run_npu_test_shard.py
index 21b702f9ff..3c04a63b2a 100644
--- a/.github/scripts/run_npu_test_shard.py
+++ b/.github/scripts/run_npu_test_shard.py
@@ -168,17 +168,7 @@ def run_shard(
             else:
                 stats["unknown"] += 1
 
-    # Save results - simplified format with only case_id, duration, status
-    simplified_results = [
-        {
-            "case_id": r["case_id"],
-            "duration": r["duration"],
-            "status": r["status"],
-        }
-        for r in results
-    ]
-
-    # Save simplified results JSON
+    # Save results JSON (contains case_id, duration, status, output, error, timestamp)
     results_file = report_dir_path / f"shard_{shard_index}_results.json"
     with open(results_file, "w") as f:
         json.dump({
@@ -186,7 +176,7 @@ def run_shard(
             "total_shards": total_shards,
             "total_cases": len(cases),
             "stats": stats,
-            "results": simplified_results,
+            "results": results,
         }, f, indent=2)
 
     print(f"\nShard {shard_index} Results saved to {results_file}")

From 6f8e0a6d97a3a98778eeb9a372711fb452a5020d Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Sat, 9 May 2026 17:53:36 +0800
Subject: [PATCH 106/132] Simplify workflow parameters: remove pytorch_version
 and docker_image_tag

- Remove pytorch_version, use only pytorch_ref for build and artifact naming
- Remove docker_image_tag, use only docker_image (full URL)
- Remove artifact_suffix in build workflow, use pytorch_ref directly
- Simplify cache keys by removing pytorch_version dependency
- Update default pytorch_ref to fccc94ae83f61fe26559abc999797297196bac29
- Update default torch_npu_ref to master

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .github/scripts/collect_all_cases.py          |  667 ++---
 .github/scripts/discover_test_files.py        |  341 +++
 .../scripts/generate_npu_full_test_report.py  | 1276 +++++++++
 .github/scripts/parse_test_results.py         |  807 ++++++
 .github/scripts/run_npu_test_shard.py         | 2304 ++++++++++++++++-
 .../workflows/_torch-npu-upstream-build.yml   |  417 +++
 .../workflows/_torch-npu-upstream-collect.yml |  246 ++
 .../workflows/_torch-npu-upstream-prepare.yml |   49 +
 .../workflows/_torch-npu-upstream-report.yml  |  106 +
 .../_torch-npu-upstream-test-custom.yml       |  170 ++
 .../_torch-npu-upstream-test-dist.yml         |  226 ++
 .../_torch-npu-upstream-test-regular.yml      |  229 ++
 .../workflows/_torch-npu-upstream-test.yml    |  143 +
 .../torch-npu-upstream-test-trigger.yml       |   70 +
 14 files changed, 6613 insertions(+), 438 deletions(-)
 create mode 100644 .github/scripts/discover_test_files.py
 create mode 100644 .github/scripts/generate_npu_full_test_report.py
 create mode 100644 .github/scripts/parse_test_results.py
 create mode 100644 .github/workflows/_torch-npu-upstream-build.yml
 create mode 100644 .github/workflows/_torch-npu-upstream-collect.yml
 create mode 100644 .github/workflows/_torch-npu-upstream-prepare.yml
 create mode 100644 .github/workflows/_torch-npu-upstream-report.yml
 create mode 100644 .github/workflows/_torch-npu-upstream-test-custom.yml
 create mode 100644 .github/workflows/_torch-npu-upstream-test-dist.yml
 create mode 100644 .github/workflows/_torch-npu-upstream-test-regular.yml
 create mode 100644 .github/workflows/_torch-npu-upstream-test.yml
 create mode 100644 .github/workflows/torch-npu-upstream-test-trigger.yml

diff --git a/.github/scripts/collect_all_cases.py b/.github/scripts/collect_all_cases.py
index 49ad0fbdc4..9e15b047ee 100644
--- a/.github/scripts/collect_all_cases.py
+++ b/.github/scripts/collect_all_cases.py
@@ -1,9 +1,23 @@
 #!/usr/bin/env python3
 """
-Collect all test cases from PyTorch test directory and shard them.
-
-This script scans the test directory, collects all test cases using pytest --collect-only,
-classifies them as distributed or regular tests, and shards them for parallel execution.
+Collect all test cases and split into shards.
+
+This script runs in prepare job (once) to:
+1. Discover test files by type (distributed/regular)
+2. Collect all test cases via pytest --collect-only
+3. Split cases evenly into N shards
+4. Output shard JSON files for each type
+5. Save collection error logs for failed files
+
+Usage:
+    python collect_all_cases.py \
+        --test-dir /path/to/pytorch/test \
+        --case-paths-config /path/to/case_paths_ci.yml \
+        --distributed-shards 2 \
+        --regular-shards 5 \
+        --output-dir /path/to/output \
+        --error-log-dir /path/to/error_logs \
+        --parallel 16
 """
 
 import argparse
@@ -13,340 +27,401 @@
 import sys
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
-from typing import Dict, List, Set, Tuple
+from typing import Dict, List, Tuple
 
+# Import discover_test_files module
+import discover_test_files
 
-DISTRIBUTED_TEST_DIRS = [
-    "distributed",
-    "distributed/algorithms/nn",
-]
 
+def get_test_file_parent_dir(test_file: str, test_dir: Path) -> Path:
+    """
+    Get the parent directory of a test file.
 
-def is_distributed_test(test_file: str) -> bool:
-    """Check if a test file is in distributed test directories."""
-    for dir_prefix in DISTRIBUTED_TEST_DIRS:
-        if test_file.startswith(dir_prefix):
-            return True
-    return False
+    This directory should be added to PYTHONPATH to enable
+    imports of sibling modules (e.g., model_registry.py).
 
+    Args:
+        test_file: Test file path (e.g., "test/distributed/pipelining/test_backward.py")
+        test_dir: Path to PyTorch test directory
 
-def collect_cases_from_file(test_dir: Path, test_file: str, parallel: int = 1, verbose: bool = False) -> Tuple[List[str], str]:
-    """Collect test cases from a single test file using pytest --collect-only.
+    Returns:
+        Path to the test file's parent directory
+    """
+    if test_file.startswith("test/"):
+        test_file_rel = test_file[5:]
+    else:
+        test_file_rel = test_file
+
+    test_file_path = Path(test_file_rel)
+    return test_dir / test_file_path.parent
 
-    Args:
-        test_dir: The test directory path (used as cwd for pytest)
-        test_file: Relative path to test file from test_dir (e.g., "dynamo/test_regional_inductor.py")
-        parallel: Number of parallel collectors (not used in this function)
-        verbose: Print debug information
+
+def collect_cases_for_file(test_file: str, test_dir: Path) -> Tuple[str, List[str], bool, str]:
+    """
+    Collect test cases from a single file.
+
+    Adds test file's parent directory to PYTHONPATH to enable
+    imports of sibling modules (e.g., 'from model_registry import MLPModule').
 
     Returns:
-        Tuple of (cases list, error message or empty string)
+        Tuple of (test_file, nodeids, success, error_message)
+        - test_file: Original test file path
+        - nodeids: List of collected test case nodeids
+        - success: True if collection succeeded without errors
+        - error_message: Error details if collection failed, empty string otherwise
     """
-    full_path = test_dir / test_file
-
-    # ===== DEBUG: Print path details for specific files =====
-    if verbose and ("distributed/_composable/fsdp" in test_file or "dynamo" in test_file):
-        print("\n=== DEBUG: collect_cases_from_file ===")
-        print(f"test_file (relative): {test_file}")
-        print(f"test_dir: {test_dir}")
-        print(f"test_dir.resolve(): {test_dir.resolve()}")
-        print(f"full_path (test_dir / test_file): {full_path}")
-        print(f"full_path.resolve(): {full_path.resolve()}")
-        print(f"full_path.exists(): {full_path.exists()}")
-        print(f"cwd for pytest: {str(test_dir)}")
-        print(f"pytest arg (should be relative): {test_file}")
-        print("=" * 50)
-
-    if not full_path.exists():
-        error = f"File not found: {full_path}"
-        if verbose:
-            print(f"[SKIP] {test_file}: {error}")
-            # DEBUG: Show what files exist in similar location
-            parent = full_path.parent
-            if parent.exists():
-                print(f"  DEBUG: Files in {parent}:")
-                for item in sorted(parent.iterdir())[:10]:
-                    print(f"    {item.name}")
-            else:
-                print(f"  DEBUG: Parent directory {parent} does not exist")
-        return [], error
+    if test_file.startswith("test/"):
+        test_file_rel = test_file[5:]
+    else:
+        test_file_rel = test_file
+
+    # Extract display name (remove test/ prefix and .py suffix)
+    display_name = test_file_rel
+    if display_name.endswith(".py"):
+        display_name = display_name[:-3]
+
+    # Get test file's parent directory for PYTHONPATH
+    test_file_dir = get_test_file_parent_dir(test_file, test_dir)
+
+    # Build environment with test file directory in PYTHONPATH
+    env = os.environ.copy()
+    existing_pythonpath = env.get("PYTHONPATH", "")
+    env["PYTHONPATH"] = str(test_file_dir) + (":" + existing_pythonpath if existing_pythonpath else "")
+
+    command = [
+        sys.executable,
+        "-m",
+        "pytest",
+        "--collect-only",
+        "--quiet",
+        test_file_rel,
+    ]
 
     try:
-        # Use test_file (relative path) as pytest argument since cwd is test_dir
-        # pytest command should be: pytest --collect-only -q dynamo/test_regional_inductor.py
-        # NOT: pytest --collect-only -q pytorch-src/test/dynamo/test_regional_inductor.py
-        pytest_cmd = ["pytest", "--collect-only", "-q", test_file]
-        if verbose and ("distributed/_composable/fsdp" in test_file or "dynamo" in test_file):
-            print(f"\nDEBUG: pytest command: {pytest_cmd}")
-            print(f"DEBUG: cwd: {str(test_dir)}")
-
         result = subprocess.run(
-            pytest_cmd,
+            command,
+            cwd=str(test_dir),
+            env=env,
             capture_output=True,
             text=True,
-            timeout=60,
-            cwd=str(test_dir),
-            env={**os.environ, "PYTEST_ADDOPTS": ""}
+            encoding="utf-8",
+            errors="replace",
+            timeout=120,
         )
 
-        cases = []
+        nodeids = []
         for line in result.stdout.splitlines():
-            # Parse pytest output format: "test_file.py::TestClass::test_method"
-            if "::" in line and not line.startswith("="):
-                case_id = line.strip()
-                if case_id and not case_id.startswith("<"):
-                    cases.append(case_id)
-
-        # Check for errors
-        error_msg = ""
-        if result.returncode != 0:
-            error_msg = result.stderr.strip() if result.stderr else "Unknown error"
-            if verbose:
-                print(f"[ERROR] {test_file}: pytest returned {result.returncode}")
-                if result.stderr:
-                    # 打印完整 stderr，不截断
-                    for line in result.stderr.splitlines():
-                        print(f"  {line}")
-        elif len(cases) == 0:
-            # No cases collected, might be import error or empty file
-            if result.stderr:
-                error_msg = result.stderr.strip()
-                if verbose:
-                    print(f"[WARN] {test_file}: 0 cases collected")
-                    # 打印完整 stderr
-                    for line in result.stderr.splitlines():
-                        print(f"  {line}")
-            else:
-                if verbose:
-                    print(f"[WARN] {test_file}: 0 cases collected (possibly empty or all skipped)")
+            if "::" in line and not line.strip().startswith("<"):
+                nodeids.append(line.strip())
+
+        # Check for collection errors based on pytest exit codes:
+        #   0: all passed (success)
+        #   2: pytest error (includes collection errors like ImportError)
+        #   3: all skipped (success)
+        #   4: command line error (error)
+        #   5: no tests collected (ERROR - test file should have cases)
+        # Key insight: if a test file is selected for execution, it should have cases.
+        # returncode 5 means 0 cases collected, which indicates a problem.
+        if result.returncode in (0, 3):
+            # Normal: passed or skipped
+            return (test_file, nodeids, True, "")
         else:
-            if verbose:
-                print(f"[OK] {test_file}: {len(cases)} cases collected")
+            # returncode 2, 4, 5: real collection error
+            # returncode 5 specifically means no tests collected - a problem for selected files
+            error_msg = result.stdout.strip()
+            if result.stderr.strip():
+                error_msg += "\n--- stderr ---\n" + result.stderr.strip()
+            return (test_file, nodeids, False, error_msg)
 
-        return cases, error_msg
     except subprocess.TimeoutExpired:
-        error = "Timeout after 60s"
-        if verbose:
-            print(f"[TIMEOUT] {test_file}: {error}")
-        return [], error
+        error_msg = f"TIMEOUT: Collection took >120s for {display_name}"
+        return (test_file, [], False, error_msg)
     except Exception as e:
-        error = str(e)
-        if verbose:
-            print(f"[EXCEPTION] {test_file}: {error}")
-        return [], error
-
-
-def discover_test_files(test_dir: Path) -> List[str]:
-    """Discover all test_*.py files in the test directory."""
-    test_files = []
-    for py_file in test_dir.rglob("test_*.py"):
-        rel_path = str(py_file.relative_to(test_dir))
-        test_files.append(rel_path)
-    return sorted(test_files)
-
-
-def shard_cases(cases: List[str], num_shards: int) -> List[List[str]]:
-    """Shard cases evenly across shards."""
-    shards = [[] for _ in range(num_shards)]
-    for i, case in enumerate(cases):
-        shard_idx = i % num_shards
-        shards[shard_idx].append(case)
-    return shards
+        error_msg = f"ERROR: {e}"
+        return (test_file, [], False, error_msg)
 
 
 def collect_all_cases(
-    test_dir: str,
-    distributed_shards: int,
-    regular_shards: int,
-    output_dir: str,
-    parallel: int = 1,
-    verbose: bool = False
-) -> Dict:
-    """Collect all test cases and shard them."""
-    test_dir_path = Path(test_dir)
-    if not test_dir_path.exists():
-        raise FileNotFoundError(f"Test directory not found: {test_dir}")
-
-    # ===== DEBUG: Print directory structure and paths =====
-    print("\n=== DEBUG: Directory Structure and Paths ===")
-    print(f"Current working directory: {os.getcwd()}")
-    print(f"test_dir argument: {test_dir}")
-    print(f"test_dir_path: {test_dir_path}")
-    print(f"test_dir_path.resolve(): {test_dir_path.resolve()}")
-    print(f"test_dir_path exists: {test_dir_path.exists()}")
-    print(f"test_dir_path is absolute: {test_dir_path.is_absolute()}")
-
-    # List top-level directories in test_dir
-    if test_dir_path.exists():
-        print(f"\nTop-level items in test_dir_path:")
-        for item in sorted(test_dir_path.iterdir())[:20]:
-            print(f"  {item.name} {'[DIR]' if item.is_dir() else '[FILE]'}")
-
-        # Check distributed directory specifically
-        distributed_path = test_dir_path / "distributed"
-        if distributed_path.exists():
-            print(f"\ndistributed/ directory exists: {distributed_path}")
-            print(f"Contents of distributed/ (first 20 items):")
-            for item in sorted(distributed_path.iterdir())[:20]:
-                print(f"  {item.name} {'[DIR]' if item.is_dir() else '[FILE]'}")
-
-            # Check _composable/fsdp specifically
-            fsdp_path = distributed_path / "_composable" / "fsdp"
-            if fsdp_path.exists():
-                print(f"\n_fsdp path exists: {fsdp_path}")
-                print(f"Contents of fsdp/ (first 10 items):")
-                for item in sorted(fsdp_path.iterdir())[:10]:
-                    print(f"  {item.name} {'[DIR]' if item.is_dir() else '[FILE]'}")
-            else:
-                print(f"\nfsdp path NOT found: {fsdp_path}")
-        else:
-            print(f"\ndistributed/ directory NOT found")
-    print("=" * 50 + "\n")
-
-    output_dir_path = Path(output_dir)
-    output_dir_path.mkdir(parents=True, exist_ok=True)
+    test_files: List[str],
+    test_dir: Path,
+    error_log_dir: Path,
+    parallel: int = 16,
+) -> List[Dict]:
+    """
+    Collect all cases from all files.
 
-    print(f"Discovering test files in {test_dir}...")
-    test_files = discover_test_files(test_dir_path)
-    print(f"Found {len(test_files)} test files")
+    Args:
+        test_files: List of test file paths
+        test_dir: Path to PyTorch test directory
+        error_log_dir: Directory to save error logs for failed collections
+        parallel: Number of parallel workers
 
-    distributed_files = [f for f in test_files if is_distributed_test(f)]
-    regular_files = [f for f in test_files if not is_distributed_test(f)]
+    Returns:
+        List of dicts with nodeid and file for each collected case
+    """
+    all_cases = []
+    failed_files = []  # Track files with collection errors for logging
 
-    print(f"Distributed test files: {len(distributed_files)}")
-    print(f"Regular test files: {len(regular_files)}")
+    print(f"Collecting cases from {len(test_files)} files with {parallel} workers...")
+    print("=" * 60)
 
-    if verbose:
-        print("\n=== Collecting distributed cases ===")
+    # Create error log directory
+    error_log_dir.mkdir(parents=True, exist_ok=True)
 
-    # Collect cases in parallel
-    print("Collecting distributed cases...")
-    distributed_cases = []
-    distributed_errors = {}
-    with ThreadPoolExecutor(max_workers=parallel) as executor:
-        futures = {
-            executor.submit(collect_cases_from_file, test_dir_path, f, parallel, verbose): f
-            for f in distributed_files
-        }
-        for future in as_completed(futures):
-            file = futures[future]
-            cases, error = future.result()
-            distributed_cases.extend(cases)
-            if error:
-                distributed_errors[file] = error
-
-    if verbose:
-        print("\n=== Collecting regular cases ===")
-
-    print("Collecting regular cases...")
-    regular_cases = []
-    regular_errors = {}
     with ThreadPoolExecutor(max_workers=parallel) as executor:
         futures = {
-            executor.submit(collect_cases_from_file, test_dir_path, f, parallel, verbose): f
-            for f in regular_files
+            executor.submit(collect_cases_for_file, f, test_dir): f
+            for f in test_files
         }
+
+        completed = 0
+        successful_count = 0
+        failed_count = 0
+        total_cases = 0
+
         for future in as_completed(futures):
-            file = futures[future]
-            cases, error = future.result()
-            regular_cases.extend(cases)
-            if error:
-                regular_errors[file] = error
-
-    print(f"Total distributed cases: {len(distributed_cases)}")
-    print(f"Total regular cases: {len(regular_cases)}")
-
-    # Print summary of errors if any
-    if distributed_errors or regular_errors:
-        print("\n=== Collection Errors Summary ===")
-        if distributed_errors:
-            print(f"Distributed files with errors: {len(distributed_errors)}")
-            for file, error in sorted(distributed_errors.items())[:10]:
-                # 打印完整错误信息
-                print(f"  {file}:")
-                for line in error.splitlines()[:5]:  # 只打印前5行避免过长
-                    print(f"    {line}")
-            if len(distributed_errors) > 10:
-                print(f"  ... and {len(distributed_errors) - 10} more files")
-        if regular_errors:
-            print(f"Regular files with errors: {len(regular_errors)}")
-            for file, error in sorted(regular_errors.items())[:10]:
-                print(f"  {file}:")
-                for line in error.splitlines()[:5]:
-                    print(f"    {line}")
-            if len(regular_errors) > 10:
-                print(f"  ... and {len(regular_errors) - 10} more files")
-
-    # Shard cases
-    distributed_sharded = shard_cases(distributed_cases, distributed_shards)
-    regular_sharded = shard_cases(regular_cases, regular_shards)
-
-    # Save shards to JSON files
-    for i, shard in enumerate(distributed_sharded, 1):
-        shard_file = output_dir_path / f"distributed_cases_shard_{i}.json"
-        with open(shard_file, "w") as f:
-            json.dump({
-                "shard_index": i,
-                "total_shards": distributed_shards,
-                "cases": shard,
-                "count": len(shard)
-            }, f, indent=2)
-        print(f"Saved distributed shard {i} with {len(shard)} cases to {shard_file}")
-
-    for i, shard in enumerate(regular_sharded, 1):
-        shard_file = output_dir_path / f"regular_cases_shard_{i}.json"
-        with open(shard_file, "w") as f:
-            json.dump({
-                "shard_index": i,
-                "total_shards": regular_shards,
-                "cases": shard,
-                "count": len(shard)
-            }, f, indent=2)
-        print(f"Saved regular shard {i} with {len(shard)} cases to {shard_file}")
-
-    # Save summary
-    summary = {
-        "total_cases": len(distributed_cases) + len(regular_cases),
-        "distributed_cases": len(distributed_cases),
-        "regular_cases": len(regular_cases),
-        "distributed_shards": distributed_shards,
-        "regular_shards": regular_shards,
-        "distributed_files": len(distributed_files),
-        "regular_files": len(regular_files),
+            test_file, nodeids, success, error_msg = future.result()
+            completed += 1
+
+            # Extract display name for logging
+            if test_file.startswith("test/"):
+                display_name = test_file[5:]
+            else:
+                display_name = test_file
+            if display_name.endswith(".py"):
+                display_name = display_name[:-3]
+
+            if success:
+                successful_count += 1
+                # Print concise log for successful files
+                print(f"  {display_name}: {len(nodeids)} cases")
+                for nodeid in nodeids:
+                    all_cases.append({
+                        "nodeid": nodeid,
+                        "file": test_file,
+                    })
+            else:
+                failed_count += 1
+                # Print concise log for failed files
+                print(f"  [FAILED] {display_name}: {len(nodeids)} cases")
+                # Save error details to log file
+                failed_files.append({
+                    "file": display_name,
+                    "error": error_msg,
+                    "cases": len(nodeids),
+                    "test_file": test_file,
+                })
+                # Still add any cases that were collected despite errors
+                for nodeid in nodeids:
+                    all_cases.append({
+                        "nodeid": nodeid,
+                        "file": test_file,
+                    })
+
+            # Update total cases count for progress display
+            total_cases += len(nodeids)
+
+            # Print progress summary every 100 files
+            if completed % 100 == 0:
+                print(f"  [Progress: {completed}/{len(test_files)} files, {successful_count} ok, {failed_count} failed, {total_cases} cases]")
+
+    print("=" * 60)
+
+    # Save error logs to files
+    if failed_files:
+        save_error_logs(failed_files, error_log_dir)
+
+    # Final summary
+    print(f"Collection complete: {len(all_cases)} cases from {successful_count}/{len(test_files)} files")
+    if failed_count > 0:
+        print(f"  WARNING: {failed_count} files had collection errors (logs saved to {error_log_dir})")
+
+    return all_cases
+
+
+def save_error_logs(failed_files: List[Dict], error_log_dir: Path) -> None:
+    """
+    Save collection error logs to individual files and create a summary.
+
+    Args:
+        failed_files: List of dicts with file, error, cases info
+        error_log_dir: Directory to save error logs
+    """
+    print(f"Saving error logs for {len(failed_files)} failed files...")
+
+    # Save individual error log files
+    for failed in failed_files:
+        # Create safe filename from display name (replace / with _)
+        safe_name = failed['file'].replace('/', '_')
+        log_file = error_log_dir / f"{safe_name}.log"
+
+        # Write error log
+        with open(log_file, 'w', encoding='utf-8') as f:
+            f.write(f"File: {failed['file']}\n")
+            f.write(f"Cases collected: {failed['cases']}\n")
+            f.write(f"Test file path: {failed['test_file']}\n")
+            f.write("=" * 80 + "\n")
+            f.write("Collection Error:\n")
+            f.write("=" * 80 + "\n")
+            f.write(failed['error'])
+            f.write("\n")
+
+    # Save summary JSON
+    summary_file = error_log_dir / "collection_errors_summary.json"
+    summary_data = {
+        "total_failed": len(failed_files),
+        "failed_files": [
+            {
+                "file": f['file'],
+                "cases": f['cases'],
+                "test_file": f['test_file'],
+                "log_file": f"{f['file'].replace('/', '_')}.log",
+            }
+            for f in failed_files
+        ],
     }
+    summary_file.write_text(json.dumps(summary_data, indent=2), encoding='utf-8')
+
+    print(f"  Error logs saved to {error_log_dir}")
+    print(f"  Summary: {summary_file}")
+
+
+def split_cases_into_shards(cases: List[Dict], num_shards: int) -> List[List[Dict]]:
+    """Split cases evenly into shards."""
+    total = len(cases)
+    base_size = total // num_shards
+    remainder = total % num_shards
+
+    shards = []
+    start = 0
+    for i in range(num_shards):
+        size = base_size + (1 if i < remainder else 0)
+        shards.append(cases[start:start + size])
+        start += size
+
+    return shards
 
-    summary_file = output_dir_path / "cases_collection_summary.json"
-    with open(summary_file, "w") as f:
-        json.dump(summary, f, indent=2)
-    print(f"Saved summary to {summary_file}")
 
-    return summary
+def save_shards(
+    cases: List[Dict],
+    num_shards: int,
+    test_type: str,
+    output_dir: Path,
+) -> Dict:
+    """Save shard JSONs and return summary."""
+    shards = split_cases_into_shards(cases, num_shards)
+
+    print(f"\nSaving {test_type} shards...")
+    for i, shard_cases in enumerate(shards, 1):
+        shard_file = output_dir / f"{test_type}_cases_shard_{i}.json"
+        shard_data = {
+            "shard": i,
+            "num_shards": num_shards,
+            "test_type": test_type,
+            "total_cases": len(shard_cases),
+            "cases": shard_cases,
+        }
+        shard_file.write_text(json.dumps(shard_data, indent=2), encoding="utf-8")
+        print(f"  Shard {i}: {len(shard_cases)} cases -> {shard_file}")
+
+    return {
+        "test_type": test_type,
+        "num_shards": num_shards,
+        "total_cases": len(cases),
+        "shard_sizes": [len(s) for s in shards],
+    }
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Collect and shard PyTorch test cases")
-    parser.add_argument("--test-dir", required=True, help="PyTorch test directory path")
-    parser.add_argument("--distributed-shards", type=int, default=2, help="Number of distributed test shards")
-    parser.add_argument("--regular-shards", type=int, default=5, help="Number of regular test shards")
-    parser.add_argument("--output-dir", required=True, help="Output directory for shard JSON files")
-    parser.add_argument("--parallel", type=int, default=1, help="Number of parallel collectors")
-    parser.add_argument("--verbose", action="store_true", help="Print detailed collection progress")
-
-    args = parser.parse_args()
-
-    summary = collect_all_cases(
-        test_dir=args.test_dir,
-        distributed_shards=args.distributed_shards,
-        regular_shards=args.regular_shards,
-        output_dir=args.output_dir,
-        parallel=args.parallel,
-        verbose=args.verbose
+    args = parse_args()
+
+    test_dir = Path(args.test_dir).resolve()
+    output_dir = Path(args.output_dir).resolve()
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Error log directory for failed collections
+    error_log_dir = Path(args.error_log_dir).resolve() if args.error_log_dir else output_dir / "collection_errors"
+    error_log_dir.mkdir(parents=True, exist_ok=True)
+
+    summaries = []
+
+    # ========================================
+    # Step 1: Collect distributed test cases
+    # ========================================
+    print("=" * 80)
+    print("Collecting distributed test cases")
+    print("=" * 80)
+
+    dist_files, dist_meta = discover_test_files.discover_test_files(
+        test_dir=test_dir,
+        test_type="distributed",
+        case_paths_config=args.case_paths_config,
     )
+    print(f"Found {len(dist_files)} distributed test files")
+
+    dist_cases = collect_all_cases(dist_files, test_dir, error_log_dir / "distributed", args.parallel)
+    print(f"Total distributed cases: {len(dist_cases)}")
+
+    dist_summary = save_shards(dist_cases, args.distributed_shards, "distributed", output_dir)
+    summaries.append(dist_summary)
 
-    print("\nCollection Summary:")
-    print(f"  Total cases: {summary['total_cases']}")
-    print(f"  Distributed cases: {summary['distributed_cases']} ({summary['distributed_shards']} shards)")
-    print(f"  Regular cases: {summary['regular_cases']} ({summary['regular_shards']} shards)")
+    # ========================================
+    # Step 2: Collect regular test cases
+    # ========================================
+    print("\n" + "=" * 80)
+    print("Collecting regular test cases")
+    print("=" * 80)
+
+    reg_files, reg_meta = discover_test_files.discover_test_files(
+        test_dir=test_dir,
+        test_type="regular",
+        case_paths_config=args.case_paths_config,
+    )
+    print(f"Found {len(reg_files)} regular test files")
+
+    reg_cases = collect_all_cases(reg_files, test_dir, error_log_dir / "regular", args.parallel)
+    print(f"Total regular cases: {len(reg_cases)}")
+
+    reg_summary = save_shards(reg_cases, args.regular_shards, "regular", output_dir)
+    summaries.append(reg_summary)
+
+    # ========================================
+    # Step 3: Save overall summary
+    # ========================================
+    overall_summary = {
+        "distributed": {
+            "cases_summary": dist_summary,
+            "discovery_metadata": dist_meta,
+        },
+        "regular": {
+            "cases_summary": reg_summary,
+            "discovery_metadata": reg_meta,
+        },
+        "total_cases": len(dist_cases) + len(reg_cases),
+        "total_files_scanned": dist_meta.get("total_files", 0) + reg_meta.get("total_files", 0),
+    }
+    summary_file = output_dir / "cases_collection_summary.json"
+    summary_file.write_text(json.dumps(overall_summary, indent=2), encoding="utf-8")
+    print(f"\nOverall summary saved to {summary_file}")
+
+    print("\n" + "=" * 80)
+    print("Collection Complete")
+    print("=" * 80)
+    print(f"Distributed: {len(dist_cases)} cases -> {args.distributed_shards} shards (serial execution)")
+    print(f"Regular: {len(reg_cases)} cases -> {args.regular_shards} shards (parallel execution)")
+    print(f"Total: {len(dist_cases) + len(reg_cases)} cases")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Collect and shard test cases")
+    parser.add_argument("--test-dir", required=True, help="PyTorch test directory")
+    parser.add_argument("--case-paths-config", help="case_paths_ci.yml path")
+    parser.add_argument("--distributed-shards", type=int, default=2, help="Distributed test shards")
+    parser.add_argument("--regular-shards", type=int, default=5, help="Regular test shards")
+    parser.add_argument("--output-dir", required=True, help="Output directory for shard JSONs")
+    parser.add_argument("--error-log-dir", help="Output directory for collection error logs (default: output-dir/collection_errors)")
+    parser.add_argument("--parallel", type=int, default=16, help="Parallel collection workers")
+    return parser.parse_args()
 
 
 if __name__ == "__main__":
diff --git a/.github/scripts/discover_test_files.py b/.github/scripts/discover_test_files.py
new file mode 100644
index 0000000000..0b6e6167f3
--- /dev/null
+++ b/.github/scripts/discover_test_files.py
@@ -0,0 +1,341 @@
+#!/usr/bin/env python3
+"""
+Discover test files for PyTorch NPU testing.
+
+This script integrates 3 steps:
+    Step 1: Test file discovery (scan all test_*.py)
+    Step 2: Shard type filtering (distributed/regular)
+    Step 3: Whitelist/blacklist filtering (case_paths_ci.yml)
+
+Output: Sorted list of test file paths (with 'test/' prefix)
+
+Usage:
+    python discover_test_files.py \
+        --test-dir /path/to/pytorch/test \
+        --test-type distributed \
+        --case-paths-config /path/to/case_paths_ci.yml \
+        --output /path/to/output_file.txt
+
+    # Or output to stdout:
+    python discover_test_files.py \
+        --test-dir /path/to/pytorch/test \
+        --test-type regular \
+        --case-paths-config /path/to/case_paths_ci.yml
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+try:
+    import yaml  # type: ignore
+except ImportError:
+    yaml = None  # type: ignore
+
+
+# ==============================================================================
+# Path Normalization Functions
+# ==============================================================================
+
+
+def normalize_path(value: str) -> str:
+    """Normalize path: convert backslashes, remove ./ prefix."""
+    normalized = value.replace("\\", "/").strip()
+    while normalized.startswith("./"):
+        normalized = normalized[2:]
+    return normalized.strip("/")
+
+
+def normalize_rule_path(rule: str) -> str:
+    """Normalize rule path: ensure it has 'test/' prefix."""
+    normalized = normalize_path(rule)
+    if not normalized:
+        return ""
+    if normalized == "test" or normalized.startswith("test/"):
+        return normalized.rstrip("/")
+    return f"test/{normalized}".rstrip("/")
+
+
+# ==============================================================================
+# YAML Parsing Functions
+# ==============================================================================
+
+
+def parse_simple_yaml_lists(raw_text: str) -> Dict[str, List[str]]:
+    """Parse YAML file for whitelist/blacklist without yaml library."""
+    parsed = {"whitelist": [], "blacklist": []}
+    current_key = None
+
+    for raw_line in raw_text.splitlines():
+        without_comment = raw_line.split("#", 1)[0].rstrip()
+        if not without_comment.strip():
+            continue
+
+        stripped = without_comment.lstrip()
+        if not raw_line.startswith((" ", "\t")) and stripped.endswith(":"):
+            key = stripped[:-1].strip()
+            current_key = key if key in parsed else None
+            continue
+
+        if current_key and stripped.startswith("- "):
+            value = stripped[2:].strip().strip("\"'")
+            if value:
+                parsed[current_key].append(value)
+
+    return parsed
+
+
+def coerce_rule_list(value, key: str) -> List[str]:
+    """Validate and normalize rule list."""
+    if value is None:
+        return []
+    if not isinstance(value, list):
+        raise ValueError(f"Expected '{key}' to be a list, got {type(value).__name__}")
+
+    normalized_values = []
+    for item in value:
+        if not isinstance(item, str):
+            raise ValueError(f"Expected every '{key}' entry to be a string, got {type(item).__name__}")
+        normalized = normalize_rule_path(item)
+        if normalized:
+            normalized_values.append(normalized)
+    return normalized_values
+
+
+def load_case_path_rules(config_file: Optional[str]) -> Tuple[str, List[str], List[str]]:
+    """Load whitelist/blacklist rules from case_paths_ci.yml."""
+    if not config_file:
+        return "", [], []
+
+    config_path = Path(config_file).resolve()
+    if not config_path.exists():
+        raise FileNotFoundError(f"case_paths_ci config not found: {config_path}")
+
+    raw_text = config_path.read_text(encoding="utf-8")
+
+    if yaml is not None:
+        payload = yaml.safe_load(raw_text) or {}
+    else:
+        payload = parse_simple_yaml_lists(raw_text)
+
+    if not isinstance(payload, dict):
+        raise ValueError(f"Expected a YAML object in {config_path}, got {type(payload).__name__}")
+
+    whitelist = coerce_rule_list(payload.get("whitelist"), "whitelist")
+    blacklist = coerce_rule_list(payload.get("blacklist"), "blacklist")
+    return str(config_path), whitelist, blacklist
+
+
+# ==============================================================================
+# Test File Discovery (Step 1)
+# ==============================================================================
+
+
+def discover_raw_test_files(test_dir: Path) -> List[str]:
+    """Scan all test_*.py files in test directory."""
+    files = []
+    for test_file in test_dir.rglob("test_*.py"):
+        rel_path = test_file.relative_to(test_dir).as_posix()
+        files.append(f"test/{rel_path}")
+    return sorted(files)
+
+
+# ==============================================================================
+# Type Filtering (Step 2)
+# ==============================================================================
+
+
+def filter_tests_by_type(test_files: List[str], test_type: str) -> Tuple[List[str], List[str]]:
+    """Filter test files by test type (distributed/regular)."""
+    if test_type == "distributed":
+        selected = [f for f in test_files if f.startswith("test/distributed/")]
+        excluded = [f for f in test_files if not f.startswith("test/distributed/")]
+    else:
+        selected = [f for f in test_files if not f.startswith("test/distributed/")]
+        excluded = [f for f in test_files if f.startswith("test/distributed/")]
+    return selected, excluded
+
+
+# ==============================================================================
+# Path Rules Filtering (Step 3)
+# ==============================================================================
+
+
+def path_matches_rule(test_path: str, rule: str) -> bool:
+    """Check if test path matches a rule (supports glob patterns)."""
+    import fnmatch
+
+    normalized_path = normalize_path(test_path)
+    normalized_rule = normalize_rule_path(rule)
+    if not normalized_rule:
+        return False
+
+    if any(char in normalized_rule for char in "*?[]"):
+        return fnmatch.fnmatch(normalized_path, normalized_rule)
+
+    return normalized_path == normalized_rule or normalized_path.startswith(f"{normalized_rule}/")
+
+
+def apply_case_path_rules(
+    test_files: List[str], whitelist: List[str], blacklist: List[str]
+) -> Tuple[List[str], List[str]]:
+    """Apply whitelist and blacklist rules to filter test files."""
+    # Apply whitelist (if empty, select all)
+    if whitelist:
+        selected = [path for path in test_files if any(path_matches_rule(path, rule) for rule in whitelist)]
+    else:
+        selected = list(test_files)
+
+    # Apply blacklist
+    if blacklist:
+        selected = [path for path in selected if not any(path_matches_rule(path, rule) for rule in blacklist)]
+
+    selected_set = set(selected)
+    excluded = [path for path in test_files if path not in selected_set]
+    return selected, excluded
+
+
+# ==============================================================================
+# Main Discovery Function
+# ==============================================================================
+
+
+def discover_test_files(
+    test_dir: Path,
+    test_type: str,
+    case_paths_config: Optional[str],
+) -> Tuple[List[str], Dict]:
+    """
+    Execute all 3 steps to discover test files.
+
+    Returns:
+        Tuple of (selected_files, metadata_dict)
+    """
+    # Step 1: Discover all test files
+    all_test_files = discover_raw_test_files(test_dir)
+    total_count = len(all_test_files)
+
+    # Step 2: Filter by test type
+    type_selected, type_excluded = filter_tests_by_type(all_test_files, test_type)
+
+    # Step 3: Apply whitelist/blacklist rules
+    config_path, whitelist, blacklist = load_case_path_rules(case_paths_config)
+    rules_selected, rules_excluded = apply_case_path_rules(type_selected, whitelist, blacklist)
+
+    # Metadata for reporting
+    metadata = {
+        "test_dir": str(test_dir),
+        "test_type": test_type,
+        "total_files": total_count,
+        "type_selected": len(type_selected),
+        "type_excluded": len(type_excluded),
+        "whitelist_entries": len(whitelist),
+        "blacklist_entries": len(blacklist),
+        "rules_selected": len(rules_selected),
+        "rules_excluded": len(rules_excluded),
+        "case_paths_config": config_path,
+    }
+
+    return rules_selected, metadata
+
+
+# ==============================================================================
+# CLI Interface
+# ==============================================================================
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Discover test files for PyTorch NPU testing",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__,
+    )
+    parser.add_argument(
+        "--test-dir",
+        type=str,
+        required=True,
+        help="Path to the PyTorch test directory",
+    )
+    parser.add_argument(
+        "--test-type",
+        type=str,
+        choices=["distributed", "regular"],
+        default="regular",
+        help="Test type: 'distributed' for distributed tests, 'regular' for other tests",
+    )
+    parser.add_argument(
+        "--case-paths-config",
+        type=str,
+        help="Path to case_paths_ci.yml for file-level whitelist/blacklist control",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        help="Output file path for test file list (default: stdout)",
+    )
+    parser.add_argument(
+        "--metadata-output",
+        type=str,
+        help="Output file path for metadata JSON (optional)",
+    )
+    parser.add_argument(
+        "--verbose",
+        "-v",
+        action="store_true",
+        help="Print verbose output including metadata",
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    test_dir = Path(args.test_dir).resolve()
+    if not test_dir.is_dir():
+        raise FileNotFoundError(f"Test directory not found: {test_dir}")
+
+    # Execute discovery
+    selected_files, metadata = discover_test_files(
+        test_dir=test_dir,
+        test_type=args.test_type,
+        case_paths_config=args.case_paths_config,
+    )
+
+    # Output test file list
+    output_content = "\n".join(selected_files) + ("\n" if selected_files else "")
+
+    if args.output:
+        output_path = Path(args.output).resolve()
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        output_path.write_text(output_content, encoding="utf-8")
+        if args.verbose:
+            print(f"Written {len(selected_files)} test files to: {output_path}")
+    else:
+        sys.stdout.write(output_content)
+
+    # Output metadata
+    if args.metadata_output:
+        metadata_path = Path(args.metadata_output).resolve()
+        metadata_path.parent.mkdir(parents=True, exist_ok=True)
+        metadata_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
+        if args.verbose:
+            print(f"Written metadata to: {metadata_path}")
+
+    # Verbose summary
+    if args.verbose:
+        print(f"\nDiscovery Summary:")
+        print(f"  Test directory: {test_dir}")
+        print(f"  Test type: {args.test_type}")
+        print(f"  Total files scanned: {metadata['total_files']}")
+        print(f"  After type filter: {metadata['type_selected']} selected, {metadata['type_excluded']} excluded")
+        if args.case_paths_config:
+            print(f"  Whitelist entries: {metadata['whitelist_entries']}")
+            print(f"  Blacklist entries: {metadata['blacklist_entries']}")
+            print(f"  After rules filter: {metadata['rules_selected']} selected, {metadata['rules_excluded']} excluded")
+        print(f"  Final selected files: {len(selected_files)}")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/.github/scripts/generate_npu_full_test_report.py b/.github/scripts/generate_npu_full_test_report.py
new file mode 100644
index 0000000000..569edc481f
--- /dev/null
+++ b/.github/scripts/generate_npu_full_test_report.py
@@ -0,0 +1,1276 @@
+#!/usr/bin/env python3
+"""
+Generate a consolidated markdown/json report for the NPU full test workflow.
+"""
+
+import argparse
+import json
+import re
+import xml.etree.ElementTree as ET
+from collections import Counter
+from pathlib import Path
+from typing import Dict, List, Tuple, Optional
+
+# Import aggregation function from parse_test_results.py
+import parse_test_results
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Generate consolidated NPU full test report")
+    parser.add_argument("--reports-root", required=True, help="Root directory containing shard report files")
+    parser.add_argument("--output-markdown", required=True, help="Path to write markdown report")
+    parser.add_argument("--output-json", required=True, help="Path to write JSON report")
+    parser.add_argument("--pytorch-version", required=True, help="PyTorch version string")
+    parser.add_argument("--torch-npu-whl", required=True, help="torch_npu wheel URL")
+    parser.add_argument("--patch-count", default="N/A", help="Applied patch count")
+    parser.add_argument("--shard-matrix-json", required=True, help="JSON array of requested shard ids")
+    parser.add_argument("--docker-image", default="N/A", help="Docker image used for test execution")
+    parser.add_argument("--runner", default="N/A", help="Runner machine type")
+    parser.add_argument("--special-reports-root", help="Root directory containing special test report files")
+    parser.add_argument("--expected-special-tests-json", default="[]", help="JSON array of expected special test names")
+    parser.add_argument("--cases-summary", help="Path to cases_collection_summary.json for file discovery stats")
+    return parser.parse_args()
+
+
+def load_json_file(path: Path) -> Dict:
+    """Load JSON file with error handling for malformed/truncated files."""
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            return json.load(f)
+    except json.JSONDecodeError as e:
+        print(f"Warning: Invalid JSON in {path}: {e}")
+        # Read file content to diagnose truncation
+        try:
+            with open(path, "r", encoding="utf-8") as f:
+                content = f.read()
+            print(f"  File size: {len(content)} bytes")
+            # Show context around error position
+            error_pos = e.pos if hasattr(e, 'pos') else 0
+            start = max(0, error_pos - 100)
+            end = min(len(content), error_pos + 100)
+            print(f"  Context around error (pos {error_pos}): ...{content[start:end]}...")
+        except Exception:
+            pass
+        return {}
+    except Exception as e:
+        print(f"Warning: Failed to load {path}: {e}")
+        return {}
+
+
+def parse_junit_xml_testsuites(xml_path: Path) -> List[Dict]:
+    """
+    Parse JUnit XML file and extract per-testsuite statistics.
+
+    Each testsuite represents a test file with its own stats:
+    - name: test file name
+    - tests: total test cases
+    - failures: failed test cases
+    - errors: error test cases
+    - skipped: skipped test cases
+    - time: execution time in seconds
+
+    Returns a list of testsuite statistics.
+    """
+    testsuites = []
+
+    if not xml_path.exists():
+        return testsuites
+
+    try:
+        tree = ET.parse(xml_path)
+        root = tree.getroot()
+
+        # Handle both <testsuites> and <testsuite> as root
+        if root.tag == "testsuites":
+            for testsuite in root.findall("testsuite"):
+                stats = parse_testsuite_element(testsuite)
+                if stats:
+                    testsuites.append(stats)
+        elif root.tag == "testsuite":
+            stats = parse_testsuite_element(root)
+            if stats:
+                testsuites.append(stats)
+
+    except ET.ParseError as e:
+        print(f"Warning: Failed to parse XML {xml_path}: {e}")
+    except Exception as e:
+        print(f"Warning: Error reading XML {xml_path}: {e}")
+
+    return testsuites
+
+
+def parse_testsuite_element(testsuite: ET.Element) -> Optional[Dict]:
+    """Parse a single testsuite element and return its statistics."""
+    try:
+        name = testsuite.get("name", "unknown")
+        tests = int(testsuite.get("tests", 0))
+        failures = int(testsuite.get("failures", 0))
+        errors = int(testsuite.get("errors", 0))
+        skipped = int(testsuite.get("skipped", 0))
+        time = float(testsuite.get("time", 0.0))
+        passed = tests - failures - errors - skipped
+
+        return {
+            "name": name,
+            "tests": tests,
+            "passed": passed,
+            "failures": failures,
+            "errors": errors,
+            "skipped": skipped,
+            "time": time,
+        }
+    except (ValueError, TypeError):
+        return None
+
+
+def extract_test_identifier(test_path: str) -> str:
+    """
+    Extract a test identifier from a test file path.
+
+    Converts paths like:
+    - "test/distributed/_composable/fsdp/test_fully_shard_autograd.py"
+    To:
+    - "distributed._composable.fsdp.test_fully_shard_autograd"
+
+    This matches the testsuite naming convention used by pytest/run_test.py.
+    """
+    # Remove 'test/' prefix if present
+    path = test_path
+    if path.startswith("test/"):
+        path = path[5:]
+    # Remove '.py' suffix
+    if path.endswith(".py"):
+        path = path[:-3]
+    # Convert path separators to dots
+    path = path.replace("/", ".").replace("\\", ".")
+    return path
+
+
+def aggregate_testsuite_stats_for_shard(
+    reports_root: Path,
+    shard_type: str,
+    shard: int,
+    planned_files: List[str],
+    missing_files_list: List[str] = None
+) -> List[Dict]:
+    """
+    Aggregate all testsuite statistics for a specific shard.
+
+    The test execution generates XML files named `shard_{type}-{shard}_pytest*.xml`.
+    Each XML file contains testcases with `file` attribute indicating the test file.
+
+    Args:
+        reports_root: Root directory containing all merged report files
+        shard_type: Shard type ("distributed" or "regular")
+        shard: Shard number to aggregate for
+        planned_files: List of test file paths planned for this shard
+        missing_files_list: List of test file paths that crashed and didn't generate XML
+
+    Returns:
+        List of testsuite statistics for tests belonging to this shard.
+        Missing files are included with status="MISSING" and tests=0.
+    """
+    if missing_files_list is None:
+        missing_files_list = []
+
+    all_testsuites = {}
+    # Map from test identifier -> aggregated stats
+
+    # Build set of test identifiers from planned files
+    planned_identifiers = set()
+    for planned in planned_files:
+        identifier = extract_test_identifier(planned)
+        if identifier:
+            planned_identifiers.add(identifier)
+
+    # Also include just the test file names for simpler matching
+    planned_test_names = set()
+    for planned in planned_files:
+        name = Path(planned).name.replace(".py", "")
+        planned_test_names.add(name)
+
+    print(f"DEBUG: planned_files count={len(planned_files)}, planned_identifiers count={len(planned_identifiers)}")
+    if planned_identifiers:
+        print(f"DEBUG: First 3 planned_identifiers: {list(planned_identifiers)[:3]}")
+
+    # Convert shard_type to file prefix ("distributed" -> "dist", "regular" -> "reg")
+    type_prefix = "dist" if shard_type == "distributed" else "reg"
+
+    # Debug: List all files in reports_root
+    print(f"DEBUG aggregate_testsuite_stats_for_shard: shard_type={shard_type}, shard={shard}")
+    print(f"DEBUG: reports_root={reports_root}, exists={reports_root.exists()}")
+    if reports_root.exists():
+        all_xml_files = list(reports_root.rglob("*.xml"))
+        print(f"DEBUG: Total XML files in reports_root (rglob): {len(all_xml_files)}")
+        matching_xml_files = list(reports_root.rglob(f"shard_{type_prefix}-{shard}_pytest*.xml"))
+        print(f"DEBUG: Matching XML files for shard_{type_prefix}-{shard}_pytest*.xml (rglob): {len(matching_xml_files)}")
+        for xf in matching_xml_files[:5]:
+            print(f"DEBUG:   - {xf.relative_to(reports_root)}")
+
+    # Find all XML files for this shard: shard_{type}-{shard}_pytest*.xml
+    # Use rglob to search recursively (files may be in subdirectories due to artifact merge)
+    for xml_path in reports_root.rglob(f"shard_{type_prefix}-{shard}_pytest*.xml"):
+        # Parse testcase elements and aggregate by file attribute
+        test_file_stats = aggregate_testcases_by_file(xml_path, planned_identifiers, planned_test_names)
+        for test_id, stats in test_file_stats.items():
+            if test_id in all_testsuites:
+                # Merge with existing stats
+                existing = all_testsuites[test_id]
+                existing["tests"] += stats["tests"]
+                existing["passed"] += stats["passed"]
+                existing["failures"] += stats["failures"]
+                existing["errors"] += stats["errors"]
+                existing["skipped"] += stats["skipped"]
+                existing["time"] += stats["time"]
+            else:
+                all_testsuites[test_id] = stats
+
+    # Also check nested directories for Phase 1 style XMLs (run_test.py output)
+    phase1_patterns = [
+        "junit",
+        "pytorch-test-src/test/test-reports/python-pytest",
+    ]
+
+    for phase1_pattern in phase1_patterns:
+        phase1_base = reports_root / phase1_pattern
+        if not phase1_base.exists():
+            continue
+
+        for test_dir in phase1_base.iterdir():
+            if not test_dir.is_dir():
+                continue
+            test_identifier = test_dir.name
+            matched = False
+            for planned_id in planned_identifiers:
+                if test_identifier == planned_id or test_identifier.startswith(planned_id) or planned_id.startswith(test_identifier):
+                    matched = True
+                    break
+            if not matched:
+                for test_name in planned_test_names:
+                    if test_identifier.endswith(test_name) or test_name in test_identifier:
+                        matched = True
+                        break
+            if not matched:
+                continue
+
+            if test_identifier not in all_testsuites:
+                all_testsuites[test_identifier] = {
+                    "name": test_identifier,
+                    "tests": 0,
+                    "passed": 0,
+                    "failures": 0,
+                    "errors": 0,
+                    "skipped": 0,
+                    "time": 0.0,
+                }
+            aggregated = all_testsuites[test_identifier]
+
+            for xml_file in test_dir.glob("*.xml"):
+                testsuites = parse_junit_xml_testsuites(xml_file)
+                for ts in testsuites:
+                    aggregated["tests"] += ts.get("tests", 0)
+                    aggregated["passed"] += ts.get("passed", 0)
+                    aggregated["failures"] += ts.get("failures", 0)
+                    aggregated["errors"] += ts.get("errors", 0)
+                    aggregated["skipped"] += ts.get("skipped", 0)
+                    aggregated["time"] += ts.get("time", 0.0)
+
+    # Add missing files (crashed without generating XML) to the result
+    # These files show as "MISSING" in the test file details
+    for missing_file in missing_files_list:
+        missing_identifier = extract_test_identifier(missing_file)
+        if missing_identifier and missing_identifier not in all_testsuites:
+            all_testsuites[missing_identifier] = {
+                "name": missing_identifier,
+                "tests": 0,
+                "passed": 0,
+                "failures": 0,
+                "errors": 0,
+                "skipped": 0,
+                "time": 0.0,
+                "status": "MISSING",  # Special status for crashed files
+            }
+
+    # Convert to list and sort by name
+    result = list(all_testsuites.values())
+    result.sort(key=lambda x: x["name"])
+
+    return result
+
+
+def aggregate_testcases_by_file(xml_path: Path, planned_identifiers: set, planned_test_names: set) -> Dict[str, Dict]:
+    """
+    Parse XML file and aggregate testcase statistics by file attribute.
+
+    Used for XMLs where testsuite name is generic "pytest".
+    If planned_identifiers is empty, accept all testcases.
+    """
+    result = {}
+    debug_count = 0
+
+    try:
+        tree = ET.parse(xml_path)
+        root = tree.getroot()
+
+        # Find all testcase elements
+        testcases = root.findall(".//testcase")
+        print(f"DEBUG aggregate_testcases_by_file: {xml_path.name}, testcases={len(testcases)}, planned_ids={len(planned_identifiers)}")
+
+        for testcase in testcases:
+            file_attr = testcase.get("file", "")
+            classname_attr = testcase.get("classname", "")
+
+            # Extract test identifier from file attribute or classname
+            test_identifier = None
+
+            if file_attr:
+                # e.g., "distributed/fsdp/test_fsdp_sharded_grad_scaler.py"
+                test_identifier = extract_test_identifier("test/" + file_attr) if not file_attr.startswith("test/") else extract_test_identifier(file_attr)
+                if debug_count < 3:
+                    print(f"DEBUG: file_attr='{file_attr}' -> test_identifier='{test_identifier}'")
+                    debug_count += 1
+            elif classname_attr:
+                # classname format: "test.distributed._composable.fsdp.test_fully_shard_comm.TestFullyShardCollectiveOps"
+                # The last part is the class name, need to extract the module path
+                # e.g., extract "test.distributed._composable.fsdp.test_fully_shard_comm" (module name)
+                parts = classname_attr.split(".")
+                if len(parts) > 1:
+                    # Remove the last part (class name like TestFullyShardCollectiveOps)
+                    # Keep everything before the class name
+                    module_parts = parts[:-1]
+                    classname_attr = ".".join(module_parts)
+                # Convert to match planned_identifiers format (dot-separated, no test/ prefix)
+                # planned_identifiers format: "distributed._composable.fsdp.test_fully_shard_comm"
+                test_identifier = classname_attr
+                # Remove 'test.' prefix if present to match planned_identifiers
+                if test_identifier.startswith("test."):
+                    test_identifier = test_identifier[5:]
+                if debug_count < 3:
+                    print(f"DEBUG: classname_attr='{classname_attr}' -> test_identifier='{test_identifier}'")
+                    debug_count += 1
+
+            if not test_identifier:
+                continue
+
+            # If planned_identifiers is empty, accept all testcases
+            # Otherwise, check if this test belongs to planned files
+            if planned_identifiers or planned_test_names:
+                matched = False
+                for planned_id in planned_identifiers:
+                    if test_identifier == planned_id or test_identifier.startswith(planned_id) or planned_id.startswith(test_identifier):
+                        matched = True
+                        break
+                if not matched:
+                    for test_name in planned_test_names:
+                        if test_identifier.endswith(test_name) or test_name in test_identifier:
+                            matched = True
+                            break
+                if not matched:
+                    continue
+
+            # Initialize stats for this test file
+            if test_identifier not in result:
+                result[test_identifier] = {
+                    "name": test_identifier,
+                    "tests": 0,
+                    "passed": 0,
+                    "failures": 0,
+                    "errors": 0,
+                    "skipped": 0,
+                    "time": 0.0,
+                }
+
+            # Count testcase
+            stats = result[test_identifier]
+            stats["tests"] += 1
+
+            # Determine outcome
+            failure = testcase.find("failure")
+            error = testcase.find("error")
+            skipped = testcase.find("skipped")
+
+            if failure is not None:
+                stats["failures"] += 1
+            elif error is not None:
+                stats["errors"] += 1
+            elif skipped is not None:
+                stats["skipped"] += 1
+            else:
+                stats["passed"] += 1
+
+            # Add time
+            time_str = testcase.get("time", "0")
+            try:
+                stats["time"] += float(time_str)
+            except ValueError:
+                pass
+
+    except ET.ParseError as e:
+        print(f"Warning: Failed to parse XML {xml_path}: {e}")
+    except Exception as e:
+        print(f"Warning: Error reading XML {xml_path}: {e}")
+
+    return result
+
+
+def parse_requested_shards(raw: str) -> List[Tuple[str, int]]:
+    """
+    Parse shard identifiers from JSON array.
+
+    Supports formats:
+    - Integers: [1, 2, 3] -> [("regular", 1), ("regular", 2), ("regular", 3)]
+    - Type-prefixed: ["dist-1", "reg-2"] -> [("distributed", 1), ("regular", 2)]
+
+    Returns list of (shard_type, shard_number) tuples.
+    """
+    try:
+        value = json.loads(raw)
+    except json.JSONDecodeError:
+        return []
+
+    if not isinstance(value, list):
+        return []
+
+    result = []
+    for item in value:
+        try:
+            if isinstance(item, str):
+                # Parse type-prefixed format: "dist-1", "reg-2"
+                if "-" in item:
+                    type_prefix, num_str = item.split("-", 1)
+                    if type_prefix == "dist":
+                        shard_type = "distributed"
+                    elif type_prefix == "reg":
+                        shard_type = "regular"
+                    else:
+                        # Unknown prefix, skip
+                        continue
+                    shard_num = int(num_str)
+                    result.append((shard_type, shard_num))
+                else:
+                    # String without prefix, try to parse as int
+                    shard_num = int(item)
+                    result.append(("regular", shard_num))
+            elif isinstance(item, int):
+                # Plain integer, assume "regular" type
+                result.append(("regular", item))
+        except (TypeError, ValueError):
+            continue
+    # Sort by type then number
+    return sorted(set(result), key=lambda x: (x[0], x[1]))
+
+
+def parse_expected_special_tests(raw: str) -> List[str]:
+    try:
+        value = json.loads(raw)
+    except json.JSONDecodeError:
+        return []
+
+    if not isinstance(value, list):
+        return []
+
+    result = []
+    for item in value:
+        if isinstance(item, str) and item:
+            result.append(item)
+    return sorted(set(result))
+
+
+def load_text_lines(path: Path) -> List[str]:
+    with open(path, "r", encoding="utf-8") as f:
+        return [line.strip() for line in f if line.strip()]
+
+
+def get_int_value(payload: Dict, *keys: str) -> int:
+    for key in keys:
+        if key not in payload:
+            continue
+        try:
+            return int(payload.get(key, 0))
+        except (TypeError, ValueError):
+            continue
+    return 0
+
+
+def get_selected_test_entries(info: Dict) -> int:
+    return get_int_value(info, "selected_test_entries", "upstream_selected_tests")
+
+
+def get_selected_test_files(info: Dict) -> int:
+    return get_int_value(info, "selected_test_files", "upstream_selected_file_tests")
+
+
+def get_path_filtered_out_files(info: Dict) -> int:
+    return get_int_value(info, "path_filtered_out_files", "excluded_test_files")
+
+
+def get_unhandled_special_tests(info: Dict) -> int:
+    return get_int_value(info, "unhandled_special_tests", "upstream_unhandled_tests")
+
+
+def discover_shard_files(
+    reports_root: Path,
+) -> Tuple[
+    Dict[Tuple[str, int], Path],  # stats_files
+    Dict[Tuple[str, int], Path],  # info_files
+    Dict[Tuple[str, int], Path],  # plan_files
+    Dict[Tuple[str, int], Path],  # excluded_files
+    Dict[Tuple[str, int], Path],  # unhandled_files
+    Dict[Tuple[str, int], Path],  # xml_files
+    Dict[Tuple[str, int], Path],  # missing_files
+    Dict[Tuple[str, int], Path],  # cases_files
+]:
+    """
+    Discover all shard report files in the reports directory.
+
+    Returns dicts keyed by (shard_type, shard_number) tuples.
+
+    File name format: shard_{type}-{number}_{suffix}
+    Examples:
+    - shard_dist-1_stats.json
+    - shard_reg-1_info.json
+    - shard_dist-1_cases.json  (case-level results)
+    """
+    stats_files = {}
+    info_files = {}
+    plan_files = {}
+    excluded_files = {}
+    unhandled_files = {}
+    xml_files = {}
+    missing_files = {}
+    cases_files = {}
+
+    def parse_shard_filename(path: Path, suffix_pattern: str) -> Tuple[str, int]:
+        """
+        Parse shard type and number from filename.
+
+        Filename format: shard_{type}-{number}_{suffix}
+        e.g., shard_dist-1_stats.json -> ("distributed", 1)
+        e.g., shard_reg-2_planned_test_files.txt -> ("regular", 2)
+        """
+        stem = path.stem  # filename without extension
+        # Match pattern: shard_{type}-{number}_{suffix}
+        match = re.match(r"shard_(dist|reg)-(\d+)_" + suffix_pattern, stem)
+        if match:
+            type_prefix = match.group(1)
+            shard_num = int(match.group(2))
+            if type_prefix == "dist":
+                return ("distributed", shard_num)
+            elif type_prefix == "reg":
+                return ("regular", shard_num)
+        return None
+
+    for path in reports_root.rglob("shard_*_stats.json"):
+        key = parse_shard_filename(path, "stats")
+        if key:
+            stats_files[key] = path
+
+    for path in reports_root.rglob("shard_*_info.json"):
+        key = parse_shard_filename(path, "info")
+        if key:
+            info_files[key] = path
+
+    for path in reports_root.rglob("shard_*_planned_test_files.txt"):
+        key = parse_shard_filename(path, "planned_test_files")
+        if key:
+            plan_files[key] = path
+
+    for path in reports_root.rglob("shard_*_excluded_test_files.txt"):
+        key = parse_shard_filename(path, "excluded_test_files")
+        if key:
+            excluded_files[key] = path
+
+    for path in reports_root.rglob("shard_*_unhandled_upstream_tests.txt"):
+        key = parse_shard_filename(path, "unhandled_upstream_tests")
+        if key:
+            unhandled_files[key] = path
+
+    # Discover XML files for per-test-file statistics
+    for path in reports_root.rglob("shard_*_pytest*.xml"):
+        # XML filename: shard_{type}-{number}_pytest{suffix}.xml
+        stem = path.stem
+        match = re.match(r"shard_(dist|reg)-(\d+)_pytest", stem)
+        if match:
+            type_prefix = match.group(1)
+            shard_num = int(match.group(2))
+            if type_prefix == "dist":
+                key = ("distributed", shard_num)
+            elif type_prefix == "reg":
+                key = ("regular", shard_num)
+            xml_files[key] = path
+
+    # Discover missing files list (files that crashed and didn't generate XML)
+    for path in reports_root.rglob("shard_*_missing_files.txt"):
+        key = parse_shard_filename(path, "missing_files")
+        if key:
+            missing_files[key] = path
+
+    # Discover case-level results files
+    for path in reports_root.rglob("shard_*_cases.json"):
+        key = parse_shard_filename(path, "cases")
+        if key:
+            cases_files[key] = path
+
+    return stats_files, info_files, plan_files, excluded_files, unhandled_files, xml_files, missing_files, cases_files
+
+
+def get_shard_status(stats: Dict, present: bool) -> str:
+    if not present:
+        return "MISSING"
+    if stats.get("crashed"):
+        return "CRASHED"
+    if stats.get("timed_out"):
+        return "TIMEOUT"
+    if stats.get("incomplete"):
+        return "INCOMPLETE"
+    if stats.get("errors", 0) > 0:
+        return "ERROR"
+    if stats.get("failed", 0) > 0:
+        return "FAILED"
+    if stats.get("total", 0) == 0:
+        return "NO TESTS"
+    return "PASSED"
+
+
+def get_overall_status(status_counts: Counter) -> str:
+    if status_counts["MISSING"] > 0:
+        return "FAILED"
+    if any(status_counts[key] > 0 for key in ("CRASHED", "TIMEOUT", "INCOMPLETE", "ERROR", "FAILED")):
+        return "FAILED"
+    if status_counts["PASSED"] > 0:
+        return "PASSED"
+    return "NO TESTS"
+
+
+def format_duration(seconds: float) -> str:
+    seconds = float(seconds)
+    hours = int(seconds // 3600)
+    minutes = int((seconds % 3600) // 60)
+    secs = seconds % 60
+    if hours > 0:
+        return f"{hours}h {minutes}m {secs:.1f}s"
+    if minutes > 0:
+        return f"{minutes}m {secs:.1f}s"
+    return f"{secs:.1f}s"
+
+
+def build_note(stats: Dict) -> str:
+    notes = []
+    if stats.get("crash_signal"):
+        notes.append(stats["crash_signal"])
+    if stats.get("timed_out"):
+        notes.append("overall timeout")
+    if stats.get("incomplete"):
+        notes.append("no junit xml")
+    if stats.get("error_message"):
+        notes.append(stats["error_message"])
+    return "; ".join(notes)
+
+
+def sanitize_markdown_cell(value: str) -> str:
+    return value.replace("|", "\\|").replace("\n", "<br>")
+
+
+def format_planned_files_cell(planned_files: List[str]) -> str:
+    if not planned_files:
+        return "-"
+    return "<br>".join(sanitize_markdown_cell(path) for path in planned_files)
+
+
+def format_testsuite_detail(stats: Dict) -> str:
+    """
+    Format a single testsuite's stats for display.
+
+    Format: "test_file.py: 5 passed, 2 failed, 1 error, 0 skipped, 3.2s"
+    Or for missing files: "test_file.py: MISSING (crashed, no report)"
+    """
+    name = sanitize_markdown_cell(stats.get("name", "unknown"))
+
+    # Check for MISSING status (file crashed without generating report)
+    if stats.get("status") == "MISSING":
+        return f"{name}: MISSING (crashed, no report)"
+    passed = stats.get("passed", 0)
+    failures = stats.get("failures", 0)
+    errors = stats.get("errors", 0)
+    skipped = stats.get("skipped", 0)
+    time = stats.get("time", 0.0)
+
+    # Build stats parts (comma-separated)
+    stats_parts = []
+    if passed > 0:
+        stats_parts.append(f"{passed} passed")
+    if failures > 0:
+        stats_parts.append(f"{failures} failed")
+    if errors > 0:
+        stats_parts.append(f"{errors} error")
+    if skipped > 0:
+        stats_parts.append(f"{skipped} skipped")
+    stats_parts.append(format_duration_short(time))
+
+    # Format: "name: stats1, stats2, ..."
+    stats_str = ", ".join(stats_parts)
+    return f"{name}: {stats_str}"
+
+
+def format_duration_short(seconds: float) -> str:
+    """Format duration in a compact form for testsuite display."""
+    seconds = float(seconds)
+    if seconds >= 60:
+        minutes = int(seconds // 60)
+        secs = seconds % 60
+        return f"{minutes}m{secs:.0f}s"
+    return f"{seconds:.1f}s"
+
+
+def format_testsuite_details_cell(testsuites: List[Dict]) -> str:
+    """
+    Format all testsuite stats for a shard into a single cell.
+
+    Each testsuite is displayed on a separate line with its stats.
+    """
+    if not testsuites:
+        return "-"
+
+    lines = []
+    for ts in testsuites:
+        lines.append(format_testsuite_detail(ts))
+
+    return "<br>".join(lines)
+
+
+def format_summary_note(note: str) -> str:
+    cleaned = (note or "").strip()
+    if not cleaned or cleaned == "pytest exited with code 1":
+        return "-"
+    return sanitize_markdown_cell(cleaned)
+
+
+def format_scope_list(items: List[str]) -> List[str]:
+    if not items:
+        return ["- None"]
+    return [f"- {sanitize_markdown_cell(item)}" for item in items]
+
+
+def render_table(headers: List[str], rows: List[List[str]]) -> List[str]:
+    lines = [
+        "| " + " | ".join(headers) + " |",
+        "| " + " | ".join(["---"] * len(headers)) + " |",
+    ]
+    for row in rows:
+        lines.append("| " + " | ".join(row) + " |")
+    return lines
+
+
+def discover_special_test_files(reports_root: Path | None) -> Dict[str, Path]:
+    if reports_root is None or not reports_root.exists():
+        return {}
+
+    special_files = {}
+    for path in reports_root.rglob("special_test_*.json"):
+        try:
+            payload = load_json_file(path)
+        except Exception:
+            continue
+        name = payload.get("name")
+        if isinstance(name, str) and name:
+            special_files[name] = path
+    return special_files
+
+
+def main():
+    args = parse_args()
+    reports_root = Path(args.reports_root)
+    output_markdown = Path(args.output_markdown)
+    output_json = Path(args.output_json)
+    requested_shards = parse_requested_shards(args.shard_matrix_json)
+    expected_special_tests = parse_expected_special_tests(args.expected_special_tests_json)
+    special_reports_root = Path(args.special_reports_root) if args.special_reports_root else None
+
+    # Load cases collection summary for file discovery stats
+    cases_summary_data = None
+    file_discovery_stats = {
+        "total_files_scanned": 0,
+        "distributed_files_before_filter": 0,
+        "distributed_files_after_filter": 0,
+        "regular_files_before_filter": 0,
+        "regular_files_after_filter": 0,
+    }
+    if args.cases_summary:
+        cases_summary_path = Path(args.cases_summary)
+        if cases_summary_path.exists():
+            cases_summary_data = load_json_file(cases_summary_path)
+            # Extract file discovery stats from metadata
+            if cases_summary_data:
+                file_discovery_stats["total_files_scanned"] = cases_summary_data.get("total_files_scanned", 0)
+                dist_meta = cases_summary_data.get("distributed", {}).get("discovery_metadata", {})
+                reg_meta = cases_summary_data.get("regular", {}).get("discovery_metadata", {})
+                file_discovery_stats["distributed_files_before_filter"] = dist_meta.get("type_selected", 0)
+                file_discovery_stats["distributed_files_after_filter"] = dist_meta.get("rules_selected", 0)
+                file_discovery_stats["regular_files_before_filter"] = reg_meta.get("type_selected", 0)
+                file_discovery_stats["regular_files_after_filter"] = reg_meta.get("rules_selected", 0)
+
+    stats_files, info_files, plan_files, excluded_files, unhandled_files, xml_files, missing_files_paths, cases_files = discover_shard_files(reports_root)
+    special_test_files = discover_special_test_files(special_reports_root)
+    shard_ids = requested_shards or sorted(set(stats_files) | set(info_files) | set(cases_files))
+
+    status_counts = Counter()
+    totals = {
+        "total": 0,
+        "passed": 0,
+        "failed": 0,
+        "skipped": 0,
+        "errors": 0,
+        "duration": 0.0,
+        "discovered_test_files": 0,
+        "selected_test_entries": 0,
+        "selected_test_files": 0,
+        "path_filtered_out_files": 0,
+        "planned_files": 0,
+        "junit_generated_shards": 0,
+        "junit_xml_files": 0,
+        "zero_item_test_files": 0,
+        "startup_failures": 0,
+        "import_failures": 0,
+        "test_failures": 0,
+        "missing_files": 0,
+        "total_cases": 0,
+        "case_passed": 0,
+        "case_failed": 0,
+        "case_errors": 0,
+        "case_crashed": 0,
+        "case_timeout": 0,
+    }
+    shard_rows = []
+    unique_planned_files = set()
+    unique_excluded_files = set()
+    unique_unhandled_tests = set()
+    unique_missing_files = set()
+    selection_modes = set()
+    cases_results = {}  # Store case-level results for each shard
+
+    for shard_type, shard_num in shard_ids:
+        shard_key = (shard_type, shard_num)
+        stats_path = stats_files.get(shard_key)
+        info_path = info_files.get(shard_key)
+        plan_path = plan_files.get(shard_key)
+        excluded_path = excluded_files.get(shard_key)
+        unhandled_path = unhandled_files.get(shard_key)
+        missing_path = missing_files_paths.get(shard_key)
+        cases_path = cases_files.get(shard_key)
+        stats = load_json_file(stats_path) if stats_path else {}
+        info = load_json_file(info_path) if info_path else {}
+        selected_test_entries = get_selected_test_entries(info)
+        selected_test_files = get_selected_test_files(info)
+        path_filtered_out_files = get_path_filtered_out_files(info)
+        unhandled_special_tests = get_unhandled_special_tests(info)
+        planned_files = load_text_lines(plan_path) if plan_path else []
+        excluded_test_files = load_text_lines(excluded_path) if excluded_path else []
+        unhandled_tests = load_text_lines(unhandled_path) if unhandled_path else []
+        missing_files_list = load_text_lines(missing_path) if missing_path else []
+
+        # Load case-level results if available
+        cases_data = load_json_file(cases_path) if cases_path else {}
+        if cases_data:
+            cases_results[shard_key] = cases_data
+            # Override stats with case-level data
+            stats["total"] = cases_data.get("total_cases", 0)
+            stats["passed"] = cases_data.get("passed", 0)
+            stats["failed"] = cases_data.get("failed", 0)
+            stats["errors"] = cases_data.get("errors", 0)
+            stats["skipped"] = cases_data.get("skipped", 0)
+            stats["crashed"] = cases_data.get("crashed", 0)
+            stats["timeout"] = cases_data.get("timeout", 0)
+            stats["duration"] = cases_data.get("duration", 0.0)
+            # Update totals
+            totals["total_cases"] += cases_data.get("total_cases", 0)
+            totals["case_passed"] += cases_data.get("passed", 0)
+            totals["case_failed"] += cases_data.get("failed", 0)
+            totals["case_errors"] += cases_data.get("errors", 0)
+            totals["case_crashed"] += cases_data.get("crashed", 0)
+            totals["case_timeout"] += cases_data.get("timeout", 0)
+
+        present = bool(stats_path or cases_path)
+
+        # Parse ALL XML files to get per-test-file statistics
+        # This includes Phase 1 (run_test.py) and Phase 2 (pytest fallback) results
+        # Filter by planned test files to ensure we only include tests for this shard
+        # Include missing files that crashed without generating reports
+        testsuite_stats = aggregate_testsuite_stats_for_shard(reports_root, shard_type, shard_num, planned_files, missing_files_list)
+
+        # If testsuite_stats has entries, aggregate their totals and override incomplete status
+        has_phase1_xmls = len(testsuite_stats) > 0
+        if has_phase1_xmls:
+            # Aggregate stats from Phase 1 XMLs
+            xml_totals = {
+                "tests": 0,
+                "passed": 0,
+                "failures": 0,
+                "errors": 0,
+                "skipped": 0,
+                "time": 0.0,
+            }
+            for ts in testsuite_stats:
+                xml_totals["tests"] += ts.get("tests", 0)
+                xml_totals["passed"] += ts.get("passed", 0)
+                xml_totals["failures"] += ts.get("failures", 0)
+                xml_totals["errors"] += ts.get("errors", 0)
+                xml_totals["skipped"] += ts.get("skipped", 0)
+                xml_totals["time"] += ts.get("time", 0.0)
+
+            # Use XML data to fill stats if:
+            # 1. stats.json doesn't exist (stats is empty) but we have XML data
+            # 2. stats.json exists but is incomplete and we have XML data to override
+            # This ensures per-file isolation mode shards get correct totals even without stats.json
+            if xml_totals["tests"] > 0:
+                # Always fill stats from XML if stats is empty or incomplete
+                if not stats or stats.get("incomplete"):
+                    stats["incomplete"] = False
+                    stats["total"] = xml_totals["tests"]
+                    stats["passed"] = xml_totals["passed"]
+                    stats["failed"] = xml_totals["failures"]
+                    stats["errors"] = xml_totals["errors"]
+                    stats["skipped"] = xml_totals["skipped"]
+                    stats["duration"] = xml_totals["time"]
+                    # Mark as present if we have XML data (even without stats.json)
+                    if not present:
+                        present = True
+
+        unique_planned_files.update(planned_files)
+        unique_excluded_files.update(excluded_test_files)
+        unique_unhandled_tests.update(unhandled_tests)
+        unique_missing_files.update(missing_files_list)
+        if info.get("selection_mode"):
+            selection_modes.add(str(info.get("selection_mode")))
+
+        status = get_shard_status(stats, present)
+        status_counts[status] += 1
+
+        totals["total"] += int(stats.get("total", 0))
+        totals["passed"] += int(stats.get("passed", 0))
+        totals["failed"] += int(stats.get("failed", 0))
+        totals["skipped"] += int(stats.get("skipped", 0))
+        totals["errors"] += int(stats.get("errors", 0))
+        totals["duration"] += float(stats.get("duration", 0.0))
+        totals["discovered_test_files"] = max(
+            totals["discovered_test_files"], int(info.get("total_files", 0))
+        )
+        totals["selected_test_entries"] = max(totals["selected_test_entries"], selected_test_entries)
+        totals["selected_test_files"] = max(totals["selected_test_files"], selected_test_files)
+        totals["path_filtered_out_files"] = max(totals["path_filtered_out_files"], path_filtered_out_files)
+        totals["planned_files"] += int(info.get("shard_files", 0))
+        totals["junit_generated_shards"] += 1 if info.get("junit_generated") else 0
+        totals["junit_xml_files"] += int(info.get("junit_xml_files", 0) or stats.get("junit_xml_files", 0))
+        totals["zero_item_test_files"] += int(info.get("zero_item_test_files", 0) or stats.get("zero_item_test_files", 0))
+        totals["startup_failures"] += int(info.get("startup_failures", 0) or stats.get("startup_failures", 0))
+        totals["import_failures"] += int(info.get("import_failures", 0) or stats.get("import_failures", 0))
+        totals["test_failures"] += int(info.get("test_failures", 0) or stats.get("test_failures", 0))
+        totals["missing_files"] += len(missing_files_list)
+
+        # Convert shard_type to display prefix ("distributed" -> "dist", "regular" -> "reg")
+        shard_prefix = "dist" if shard_type == "distributed" else "reg"
+        shard_rows.append(
+            {
+                "shard": f"{shard_prefix}-{shard_num}",  # "dist-1" or "reg-1"
+                "shard_type": shard_type,
+                "shard_num": shard_num,
+                "status": status,
+                "total": int(stats.get("total", 0)),
+                "passed": int(stats.get("passed", 0)),
+                "failed": int(stats.get("failed", 0)),
+                "skipped": int(stats.get("skipped", 0)),
+                "errors": int(stats.get("errors", 0)),
+                "crashed": int(stats.get("crashed", 0)),
+                "timeout": int(stats.get("timeout", 0)),
+                "duration": float(stats.get("duration", 0.0)),
+                "planned_files": int(info.get("shard_files", 0)),
+                "discovered_test_files": int(info.get("total_files", 0)),
+                "selected_test_entries": selected_test_entries,
+                "selected_test_files": selected_test_files,
+                "unhandled_special_tests": unhandled_special_tests,
+                "planned_file_names": planned_files,
+                "path_filtered_out_files": path_filtered_out_files,
+                "disabled_matched": int(info.get("disabled_count_matched", 0)),
+                "disabled_deselected": int(info.get("disabled_count_deselected", 0)),
+                "junit_generated": bool(info.get("junit_generated", stats.get("junit_generated", False))),
+                "junit_xml_files": int(info.get("junit_xml_files", stats.get("junit_xml_files", 0))),
+                "zero_item_test_files": int(info.get("zero_item_test_files", stats.get("zero_item_test_files", 0))),
+                "startup_failures": int(info.get("startup_failures", stats.get("startup_failures", 0))),
+                "import_failures": int(info.get("import_failures", stats.get("import_failures", 0))),
+                "test_failures": int(info.get("test_failures", stats.get("test_failures", 0))),
+                "note": build_note(stats),
+                "testsuite_stats": testsuite_stats,  # Per-test-file statistics
+            }
+        )
+
+    overall_status = get_overall_status(status_counts)
+    whl_name = Path(args.torch_npu_whl).name
+    received_reports = len(stats_files)
+    expected_reports = len(shard_ids)
+    unique_planned_count = len(unique_planned_files)
+    excluded_test_files_list = sorted(unique_excluded_files)
+    unhandled_tests_list = sorted(unique_unhandled_tests)
+    not_covered_by_requested_shards = max(
+        totals["selected_test_files"] - unique_planned_count,
+        0,
+    )
+    selection_mode_display = ", ".join(sorted(selection_modes)) if selection_modes else "-"
+    include_selected_entries = totals["selected_test_entries"] > 0
+    include_unhandled_tests = bool(unhandled_tests_list)
+
+    # Show all shards in the detail table
+    sorted_shards = sorted(shard_rows, key=lambda row: (row["shard_type"], row["shard_num"]))
+    slowest = sorted(shard_rows, key=lambda row: row["duration"], reverse=True)[:20]
+    special_test_names = expected_special_tests or sorted(special_test_files)
+    special_test_rows = []
+    special_status_counts = Counter()
+
+    for test_name in special_test_names:
+        payload = load_json_file(special_test_files[test_name]) if test_name in special_test_files else {}
+        status = str(payload.get("status", "MISSING"))
+        special_status_counts[status] += 1
+        special_test_rows.append(
+            {
+                "name": test_name,
+                "group": str(payload.get("group", "-")),
+                "status": status,
+                "duration": float(payload.get("duration", 0.0)),
+                "returncode": payload.get("returncode", "-"),
+                "note": str(payload.get("note", "") or "-"),
+            }
+        )
+
+    if any(row["status"] != "PASSED" for row in special_test_rows):
+        overall_status = "FAILED"
+
+    include_special_tests = bool(special_test_names or special_test_rows)
+
+    # Build Selection row content based on available data
+    if cases_summary_data:
+        # Use file discovery stats from cases_collection_summary.json
+        total_scanned = file_discovery_stats["total_files_scanned"]
+        dist_before = file_discovery_stats["distributed_files_before_filter"]
+        dist_after = file_discovery_stats["distributed_files_after_filter"]
+        reg_before = file_discovery_stats["regular_files_before_filter"]
+        reg_after = file_discovery_stats["regular_files_after_filter"]
+        total_after_filter = dist_after + reg_after
+        selection_content = (
+            f"扫描发现 {total_scanned} 个测试文件; "
+            f"黑白名单过滤后 {total_after_filter} 个文件 "
+            f"(distributed: {dist_before} -> {dist_after}, regular: {reg_before} -> {reg_after})"
+        )
+    else:
+        # Fallback to original selection mode display
+        selection_content = (
+            f"{selection_mode_display}; "
+            f"{totals['selected_test_files']} selected, "
+            f"{totals['path_filtered_out_files']} filtered out"
+        )
+
+    overview_rows = [
+        ["Overall result", overall_status],
+        ["PyTorch", f"`v{args.pytorch_version}`"],
+        ["torch_npu", f"`{whl_name}`"],
+        ["Patches applied", str(args.patch_count)],
+        ["Docker image", f"`{args.docker_image}`"],
+        ["Runner", f"`{args.runner}`"],
+        ["Shards", f"{received_reports} / {expected_reports} reported"],
+        ["Selection", selection_content],
+        [
+            "Tests",
+            (
+                f"{totals['total']} total; {totals['passed']} passed; {totals['failed']} failed; "
+                f"{totals['errors']} errors; {totals['skipped']} skipped"
+            ),
+        ],
+        ["Duration", format_duration(totals["duration"])],
+    ]
+    if totals["missing_files"] > 0:
+        overview_rows.append(["Missing files", f"{totals['missing_files']} crashed without report"])
+    if include_special_tests:
+        overview_rows.append(["Special tests expected", str(len(special_test_names))])
+
+    # Add case-level statistics if available (replace Tests row with case-level data)
+    if totals["total_cases"] > 0:
+        overview_rows.append([
+            "Case-level stats",
+            (
+                f"{totals['total_cases']} cases; "
+                f"{totals['case_passed']} passed; "
+                f"{totals['case_failed']} failed; "
+                f"{totals['case_errors']} errors; "
+                f"{totals['case_crashed']} crashed; "
+                f"{totals['case_timeout']} timeout"
+            ),
+        ])
+
+    markdown_lines = [
+        "# PyTorch NPU Full Test Summary",
+        "",
+        "## Overview",
+    ]
+    markdown_lines.extend(
+        render_table(
+            ["Item", "Value"],
+            overview_rows,
+        )
+    )
+
+    # Add case-level statistics table if available
+    if cases_results:
+        markdown_lines.extend(["", "## 用例级执行统计"])
+        markdown_lines.extend(
+            render_table(
+                ["Shard", "总用例", "通过", "失败", "错误", "崩溃", "超时", "Duration"],
+                [
+                    [
+                        f"{row['shard']}",
+                        str(row["total"]),
+                        str(row["passed"]),
+                        str(row["failed"]),
+                        str(row["errors"]),
+                        str(row.get("crashed", 0)),
+                        str(row.get("timeout", 0)),
+                        format_duration(row["duration"]),
+                    ]
+                    for row in sorted_shards
+                    if (row["shard_type"], row["shard_num"]) in cases_results
+                ],
+            )
+        )
+
+        # Add file-level statistics table
+        file_stats = parse_test_results.aggregate_all_cases_by_file(cases_results)
+
+        if file_stats:
+            # Sort files by total cases descending
+            sorted_files = sorted(
+                file_stats.values(),
+                key=lambda x: (-x["total"], x["file"])
+            )
+
+            markdown_lines.extend(["", "## 测试文件结果汇总"])
+
+            file_rows = []
+            for fs in sorted_files:  # Show all files
+                failed_total = fs["failed"] + fs["errors"] + fs["crashed"] + fs["timeout"]
+                fail_rate = f"{(failed_total / fs['total'] * 100):.1f}%" if fs["total"] > 0 else "0%"
+                file_rows.append([
+                    sanitize_markdown_cell(fs["file"]),
+                    str(fs["total"]),
+                    str(fs["passed"]),
+                    str(fs["failed"]),
+                    str(fs["errors"]),
+                    str(fs["crashed"]),
+                    str(fs["timeout"]),
+                    fail_rate,
+                ])
+
+            markdown_lines.extend(
+                render_table(
+                    ["测试文件", "总用例", "通过", "失败", "错误", "崩溃", "超时", "失败率"],
+                    file_rows,
+                )
+            )
+
+    if include_unhandled_tests:
+        markdown_lines.extend(["", "## Unhandled Special Tests"])
+        markdown_lines.extend(format_scope_list(unhandled_tests_list))
+    if include_special_tests:
+        markdown_lines.extend(["", "## Special Test Results"])
+        markdown_lines.extend(
+            render_table(
+                ["Test", "Group", "Status", "Duration", "Return Code", "Note"],
+                [
+                    [
+                        row["name"],
+                        row["group"],
+                        row["status"],
+                        format_duration(row["duration"]),
+                        str(row["returncode"]),
+                        sanitize_markdown_cell(row["note"]),
+                    ]
+                    for row in special_test_rows
+                ] or [["-", "-", "-", "0.0s", "-", "-"]],
+            )
+        )
+
+    report_json = {
+        "overall_status": overall_status,
+        "requested_shards": shard_ids,
+        "reports_collected": received_reports,
+        "patch_count": args.patch_count,
+        "pytorch_version": args.pytorch_version,
+        "torch_npu_whl": whl_name,
+        "docker_image": args.docker_image,
+        "runner": args.runner,
+        "status_counts": dict(status_counts),
+        "totals": totals,
+        "file_discovery_stats": file_discovery_stats,
+        "execution_scope": {
+            "selection_mode": sorted(selection_modes),
+            "selected_test_entries": totals["selected_test_entries"],
+            "selected_test_files": totals["selected_test_files"],
+            "path_filtered_out_files": totals["path_filtered_out_files"],
+            "unique_planned_test_files": unique_planned_count,
+            "files_not_covered_by_requested_shards": not_covered_by_requested_shards,
+            "excluded_test_files": excluded_test_files_list,
+            "unhandled_special_tests": unhandled_tests_list,
+            "missing_files": sorted(unique_missing_files),
+        },
+        "failure_breakdown": {
+            "startup_failures": totals["startup_failures"],
+            "import_failures": totals["import_failures"],
+            "test_failures": totals["test_failures"],
+            "missing_files": totals["missing_files"],
+        },
+        "shards": shard_rows,
+        "failed_shards": [row for row in shard_rows if row["status"] not in ("PASSED", "NO TESTS")],
+        "slowest_shards": slowest,
+    }
+
+    # Add full cases summary if available
+    if cases_summary_data:
+        report_json["cases_collection_summary"] = cases_summary_data
+
+    # Add case-level results if available
+    if cases_results:
+        report_json["cases_results"] = {
+            "total_cases": totals["total_cases"],
+            "passed": totals["case_passed"],
+            "failed": totals["case_failed"],
+            "errors": totals["case_errors"],
+            "crashed": totals["case_crashed"],
+            "timeout": totals["case_timeout"],
+            "shards": {
+                f"{shard_type}-{shard_num}": data
+                for (shard_type, shard_num), data in cases_results.items()
+            },
+        }
+
+        # Add file-level aggregation
+        file_stats = parse_test_results.aggregate_all_cases_by_file(cases_results)
+        report_json["file_level_stats"] = dict(sorted(
+            file_stats.items(),
+            key=lambda x: (-x[1]["total"], x[0])
+        ))
+
+        # Add list of files with failures
+        failed_files = parse_test_results.get_files_with_failures(file_stats)
+        report_json["files_with_failures"] = failed_files
+
+    if include_special_tests:
+        report_json["special_tests"] = {
+            "expected": special_test_names,
+            "status_counts": dict(special_status_counts),
+            "results": special_test_rows,
+        }
+
+    output_markdown.write_text("\n".join(markdown_lines) + "\n", encoding="utf-8")
+    output_json.write_text(json.dumps(report_json, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
+
+    print(f"Generated markdown report: {output_markdown}")
+    print(f"Generated json report: {output_json}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/scripts/parse_test_results.py b/.github/scripts/parse_test_results.py
new file mode 100644
index 0000000000..0dfb34e8e1
--- /dev/null
+++ b/.github/scripts/parse_test_results.py
@@ -0,0 +1,807 @@
+#!/usr/bin/env python3
+"""
+Parse test results from JUnit XML files and pytest logs.
+
+This script provides utilities for:
+    - Parsing JUnit XML reports
+    - Aggregating test statistics
+    - Analyzing pytest log files
+    - Generating result reports (JSON, text)
+
+Usage as module:
+    from parse_test_results import (
+        parse_junit_xml,
+        aggregate_junit_stats,
+        analyze_pytest_log,
+        finalize_stats,
+        save_stats_file,
+        save_info_file,
+        print_stats_summary,
+    )
+
+Usage as CLI:
+    python parse_test_results.py \
+        --report-dir test-reports \
+        --shard 1 \
+        --shard-type distributed \
+        --output-dir parsed-results
+"""
+
+import argparse
+import json
+import os
+import re
+import signal
+import sys
+import xml.etree.ElementTree as ET
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+
+# ==============================================================================
+# JUnit XML Parsing
+# ==============================================================================
+
+
+def parse_junit_xml(xml_file: str) -> Dict:
+    """
+    Parse a single JUnit XML file and extract test statistics.
+
+    Args:
+        xml_file: Path to JUnit XML file
+
+    Returns:
+        Dict with keys: total, passed, failed, skipped, errors, duration
+    """
+    stats = {
+        "total": 0,
+        "passed": 0,
+        "failed": 0,
+        "skipped": 0,
+        "errors": 0,
+        "duration": 0.0,
+    }
+
+    if not os.path.exists(xml_file):
+        return stats
+
+    try:
+        tree = ET.parse(xml_file)
+        root = tree.getroot()
+        for testsuite in root.iter("testsuite"):
+            stats["total"] += int(testsuite.get("tests", 0))
+            stats["failed"] += int(testsuite.get("failures", 0))
+            stats["skipped"] += int(testsuite.get("skipped", 0))
+            stats["errors"] += int(testsuite.get("errors", 0))
+            stats["duration"] += float(testsuite.get("time", 0))
+        stats["passed"] = stats["total"] - stats["failed"] - stats["skipped"] - stats["errors"]
+    except Exception as exc:
+        print(f"Warning: Failed to parse XML report {xml_file}: {exc}")
+
+    return stats
+
+
+def aggregate_junit_stats(report_roots: List[Path], pattern: str = "*.xml") -> Dict:
+    """
+    Aggregate statistics from multiple JUnit XML files.
+
+    Args:
+        report_roots: List of directories to search for XML files
+        pattern: Glob pattern for XML files (default: "*.xml")
+
+    Returns:
+        Dict with aggregated stats: total, passed, failed, skipped, errors, duration
+    """
+    totals = {
+        "total": 0,
+        "passed": 0,
+        "failed": 0,
+        "skipped": 0,
+        "errors": 0,
+        "duration": 0.0,
+    }
+
+    seen_files = set()
+    for report_root in report_roots:
+        if not report_root.exists():
+            continue
+        for xml_file in report_root.rglob(pattern):
+            try:
+                resolved = str(xml_file.resolve())
+            except OSError:
+                resolved = str(xml_file)
+            if resolved in seen_files:
+                continue
+            seen_files.add(resolved)
+
+            stats = parse_junit_xml(str(xml_file))
+            for key in totals:
+                totals[key] += stats[key]
+
+    totals["xml_files_count"] = len(seen_files)
+    return totals
+
+
+def parse_shard_xml_files(report_dir: Path, shard: int, shard_type: str = "regular") -> Dict:
+    """
+    Parse all JUnit XML files for a specific shard.
+
+    Args:
+        report_dir: Directory containing test reports
+        shard: Shard number
+        shard_type: "distributed" or "regular"
+
+    Returns:
+        Dict with aggregated stats for the shard
+    """
+    prefix = get_shard_type_prefix(shard_type)
+    xml_pattern = f"shard_{prefix}-{shard}_pytest*.xml"
+
+    xml_files = sorted(report_dir.glob(xml_pattern))
+    if not xml_files:
+        return {
+            "total": 0,
+            "passed": 0,
+            "failed": 0,
+            "skipped": 0,
+            "errors": 0,
+            "duration": 0.0,
+            "junit_generated": False,
+            "junit_xml_files": 0,
+        }
+
+    stats = aggregate_junit_stats([report_dir], xml_pattern)
+    stats["junit_generated"] = True
+    stats["junit_xml_files"] = len(xml_files)
+    return stats
+
+
+# ==============================================================================
+# Log Analysis
+# ==============================================================================
+
+
+def analyze_pytest_log(log_file: Path, returncode: int) -> Dict:
+    """
+    Analyze pytest log file for failure patterns.
+
+    Args:
+        log_file: Path to pytest log file
+        returncode: pytest process return code
+
+    Returns:
+        Dict with: zero_item_test_files, startup_failures, import_failures, test_failures
+    """
+    metrics = {
+        "zero_item_test_files": 0,
+        "startup_failures": 0,
+        "import_failures": 0,
+        "test_failures": 0,
+    }
+
+    if not log_file.exists():
+        return metrics
+
+    try:
+        content = log_file.read_text(encoding="utf-8", errors="replace")
+    except OSError:
+        return metrics
+
+    # Detect "no tests collected" scenarios
+    if returncode == 5 or "collected 0 items" in content or "no tests ran" in content:
+        metrics["zero_item_test_files"] = 1
+
+    # Count import errors
+    metrics["import_failures"] = len(
+        re.findall(r"^ImportError while importing test module", content, flags=re.MULTILINE)
+    )
+
+    # Count collection errors (excluding import errors)
+    collection_errors = len(re.findall(r"^ERROR collecting ", content, flags=re.MULTILINE))
+    metrics["startup_failures"] = max(collection_errors - metrics["import_failures"], 0)
+
+    return metrics
+
+
+# ==============================================================================
+# Stats Processing
+# ==============================================================================
+
+
+def create_empty_stats() -> Dict:
+    """Create empty statistics dictionary."""
+    return {
+        "total": 0,
+        "passed": 0,
+        "failed": 0,
+        "skipped": 0,
+        "errors": 0,
+        "duration": 0.0,
+        "junit_generated": False,
+        "junit_xml_files": 0,
+        "zero_item_test_files": 0,
+        "startup_failures": 0,
+        "import_failures": 0,
+        "test_failures": 0,
+    }
+
+
+def create_shard_info(shard: int, num_shards: int, timestamp: str) -> Dict:
+    """Create shard info dictionary template."""
+    return {
+        "shard": shard,
+        "num_shards": num_shards,
+        "selection_mode": "pytest_direct",
+        "total_files": 0,
+        "selected_test_files": 0,
+        "shard_files": 0,
+        "path_filtered_out_files": 0,
+        "excluded_test_files": 0,
+        "disabled_count": 0,
+        "whitelist_entries": 0,
+        "blacklist_entries": 0,
+        "junit_generated": False,
+        "junit_xml_files": 0,
+        "zero_item_test_files": 0,
+        "startup_failures": 0,
+        "import_failures": 0,
+        "test_failures": 0,
+        "timestamp": timestamp,
+    }
+
+
+def finalize_stats(base_stats: Dict, returncode: int, duration: float, error_message: str = "") -> Dict:
+    """
+    Finalize statistics with returncode and duration.
+
+    Args:
+        base_stats: Base statistics dict
+        returncode: Process return code
+        duration: Execution duration in seconds
+        error_message: Optional error message
+
+    Returns:
+        Finalized stats dict
+    """
+    stats = dict(base_stats)
+    stats["duration"] = max(float(stats.get("duration", 0.0)), duration)
+
+    if returncode != 0:
+        stats["returncode"] = returncode
+
+        # Handle signal crashes (negative returncode)
+        if returncode < 0:
+            signal_num = abs(returncode)
+            try:
+                signal_name = signal.Signals(signal_num).name
+            except ValueError:
+                signal_name = f"SIG{signal_num}"
+            stats["crashed"] = True
+            stats["crash_signal"] = signal_name
+
+        # Mark incomplete if no tests
+        if stats.get("total", 0) == 0:
+            stats["errors"] = max(stats.get("errors", 0), 1)
+            stats["incomplete"] = True
+
+        if error_message:
+            stats["error_message"] = error_message
+    else:
+        stats["returncode"] = 0
+
+    return stats
+
+
+def get_shard_status(stats: Dict, has_xml: bool) -> str:
+    """
+    Determine shard status from stats.
+
+    Args:
+        stats: Statistics dict
+        has_xml: Whether XML files were generated
+
+    Returns:
+        Status string: MISSING, CRASHED, TIMEOUT, ERROR, FAILED, NO_TESTS, PASSED
+    """
+    if not has_xml:
+        return "MISSING"
+
+    if stats.get("crashed"):
+        return "CRASHED"
+
+    if stats.get("timed_out"):
+        return "TIMEOUT"
+
+    if stats.get("incomplete"):
+        return "INCOMPLETE"
+
+    if stats.get("errors", 0) > 0:
+        return "ERROR"
+
+    if stats.get("failed", 0) > 0:
+        return "FAILED"
+
+    if stats.get("total", 0) == 0:
+        return "NO_TESTS"
+
+    return "PASSED"
+
+
+# ==============================================================================
+# Utility Functions
+# ==============================================================================
+
+
+def get_shard_type_prefix(shard_type: str) -> str:
+    """Convert shard type to short prefix for file naming."""
+    return "dist" if shard_type == "distributed" else "reg"
+
+
+def get_shard_log_file(report_dir: Path, shard: int, shard_type: str = "regular") -> Path:
+    """Get path for shard log file."""
+    prefix = get_shard_type_prefix(shard_type)
+    return report_dir / f"test_shard_{prefix}-{shard}.log"
+
+
+def load_disabled_testcases_count(json_file: str) -> int:
+    """Count entries in disabled_testcases.json."""
+    if not json_file or not os.path.exists(json_file):
+        return 0
+
+    with open(json_file, encoding="utf-8") as f:
+        data = json.load(f)
+
+    if isinstance(data, (dict, list)):
+        return len(data)
+    return 0
+
+
+# ==============================================================================
+# File Save Functions
+# ==============================================================================
+
+
+def save_stats_file(report_dir: str, shard: int, stats: Dict, shard_type: str = "regular") -> str:
+    """Save statistics to JSON file."""
+    os.makedirs(report_dir, exist_ok=True)
+    prefix = get_shard_type_prefix(shard_type)
+    stats_file = os.path.join(report_dir, f"shard_{prefix}-{shard}_stats.json")
+    with open(stats_file, "w", encoding="utf-8") as f:
+        json.dump(stats, f, indent=2)
+    return stats_file
+
+
+def save_info_file(report_dir: str, shard: int, info: Dict, shard_type: str = "regular") -> str:
+    """Save info to JSON file."""
+    os.makedirs(report_dir, exist_ok=True)
+    prefix = get_shard_type_prefix(shard_type)
+    info_file = os.path.join(report_dir, f"shard_{prefix}-{shard}_info.json")
+    with open(info_file, "w", encoding="utf-8") as f:
+        json.dump(info, f, indent=2)
+    return info_file
+
+
+def save_test_plan_file(report_dir: str, shard: int, planned_tests: List[str], shard_type: str = "regular") -> str:
+    """Save planned test files list."""
+    os.makedirs(report_dir, exist_ok=True)
+    prefix = get_shard_type_prefix(shard_type)
+    plan_file = os.path.join(report_dir, f"shard_{prefix}-{shard}_planned_test_files.txt")
+    with open(plan_file, "w", encoding="utf-8") as f:
+        for target in planned_tests:
+            f.write(f"{target}\n")
+    return plan_file
+
+
+def save_excluded_test_files_file(report_dir: str, shard: int, excluded_files: List[str], shard_type: str = "regular") -> str:
+    """Save excluded test files list."""
+    os.makedirs(report_dir, exist_ok=True)
+    prefix = get_shard_type_prefix(shard_type)
+    excluded_file = os.path.join(report_dir, f"shard_{prefix}-{shard}_excluded_test_files.txt")
+    with open(excluded_file, "w", encoding="utf-8") as f:
+        for target in excluded_files:
+            f.write(f"{target}\n")
+    return excluded_file
+
+
+def save_missing_files_file(report_dir: str, shard: int, missing_files: List[str], shard_type: str = "regular") -> str:
+    """Save missing files list (crashed files without XML)."""
+    os.makedirs(report_dir, exist_ok=True)
+    prefix = get_shard_type_prefix(shard_type)
+    missing_file = os.path.join(report_dir, f"shard_{prefix}-{shard}_missing_files.txt")
+    with open(missing_file, "w", encoding="utf-8") as f:
+        for file_path in missing_files:
+            f.write(f"{file_path}\n")
+    return missing_file
+
+
+def save_cases_file(report_dir: str, shard: int, cases_data: Dict, shard_type: str = "regular") -> str:
+    """Save case-level results to JSON file."""
+    os.makedirs(report_dir, exist_ok=True)
+    prefix = get_shard_type_prefix(shard_type)
+    cases_file = os.path.join(report_dir, f"shard_{prefix}-{shard}_cases.json")
+    with open(cases_file, "w", encoding="utf-8") as f:
+        json.dump(cases_data, f, indent=2, ensure_ascii=False)
+    return cases_file
+
+
+def load_cases_file(report_dir: Path, shard: int, shard_type: str = "regular") -> Dict:
+    """Load case-level results from JSON file."""
+    prefix = get_shard_type_prefix(shard_type)
+    cases_file = report_dir / f"shard_{prefix}-{shard}_cases.json"
+    if not cases_file.exists():
+        return {}
+    try:
+        with open(cases_file, encoding="utf-8") as f:
+            return json.load(f)
+    except Exception as e:
+        print(f"Warning: Failed to load cases file {cases_file}: {e}")
+        return {}
+
+
+# ==============================================================================
+# Case Aggregation by File
+# ==============================================================================
+
+
+def aggregate_cases_by_file(cases_list: List[Dict]) -> Dict[str, Dict]:
+    """
+    Aggregate case results by test file.
+
+    This function groups test cases by their source file and computes
+    statistics (passed, failed, errors, etc.) per file. It also collects
+    detailed failure information for reporting.
+
+    Args:
+        cases_list: List of case result dicts with "nodeid", "file", "status" keys
+
+    Returns:
+        Dict mapping test file path -> aggregated stats
+        Each entry contains:
+            - file: test file path
+            - total: total cases in file
+            - passed, failed, errors, crashed, timeout, skipped: counts
+            - failed_cases: list of failed/error/crashed/timeout cases with details
+            - duration: total execution time for file
+    """
+    file_stats = {}
+
+    for case in cases_list:
+        test_file = case.get("file", "unknown")
+        if not test_file:
+            # Try to extract file from nodeid
+            nodeid = case.get("nodeid", "")
+            if "::" in nodeid:
+                test_file = nodeid.split("::")[0]
+            else:
+                test_file = "unknown"
+
+        status = case.get("status", "error")
+        duration = case.get("duration", 0.0)
+
+        if test_file not in file_stats:
+            file_stats[test_file] = {
+                "file": test_file,
+                "total": 0,
+                "passed": 0,
+                "failed": 0,
+                "errors": 0,
+                "crashed": 0,
+                "timeout": 0,
+                "skipped": 0,
+                "failed_cases": [],
+                "duration": 0.0,
+            }
+
+        stats = file_stats[test_file]
+        stats["total"] += 1
+        stats["duration"] += duration
+
+        if status == "passed":
+            stats["passed"] += 1
+        elif status == "failed":
+            stats["failed"] += 1
+            stats["failed_cases"].append({
+                "nodeid": case.get("nodeid"),
+                "status": "failed",
+                "message": case.get("message", ""),
+                "duration": duration,
+            })
+        elif status == "error":
+            stats["errors"] += 1
+            stats["failed_cases"].append({
+                "nodeid": case.get("nodeid"),
+                "status": "error",
+                "message": case.get("message", ""),
+                "duration": duration,
+            })
+        elif status == "crashed":
+            stats["crashed"] += 1
+            stats["failed_cases"].append({
+                "nodeid": case.get("nodeid"),
+                "status": "crashed",
+                "message": case.get("message", ""),
+                "duration": duration,
+            })
+        elif status == "timeout":
+            stats["timeout"] += 1
+            stats["failed_cases"].append({
+                "nodeid": case.get("nodeid"),
+                "status": "timeout",
+                "message": f"Timeout after {duration}s",
+                "duration": duration,
+            })
+        elif status == "skipped":
+            stats["skipped"] += 1
+
+    return file_stats
+
+
+def aggregate_all_cases_by_file(cases_results: Dict) -> Dict[str, Dict]:
+    """
+    Aggregate all cases from multiple shards by test file.
+
+    Args:
+        cases_results: Dict mapping shard_key -> cases_data (from shard_*_cases.json)
+
+    Returns:
+        Dict mapping test file -> aggregated stats across all shards
+    """
+    all_file_stats = {}
+
+    for shard_key, cases_data in cases_results.items():
+        shard_cases = cases_data.get("cases", [])
+        file_stats = aggregate_cases_by_file(shard_cases)
+
+        for test_file, stats in file_stats.items():
+            if test_file not in all_file_stats:
+                all_file_stats[test_file] = {
+                    "file": test_file,
+                    "total": 0,
+                    "passed": 0,
+                    "failed": 0,
+                    "errors": 0,
+                    "crashed": 0,
+                    "timeout": 0,
+                    "skipped": 0,
+                    "failed_cases": [],
+                    "duration": 0.0,
+                }
+
+            existing = all_file_stats[test_file]
+            existing["total"] += stats["total"]
+            existing["passed"] += stats["passed"]
+            existing["failed"] += stats["failed"]
+            existing["errors"] += stats["errors"]
+            existing["crashed"] += stats["crashed"]
+            existing["timeout"] += stats["timeout"]
+            existing["skipped"] += stats["skipped"]
+            existing["duration"] += stats["duration"]
+            existing["failed_cases"].extend(stats["failed_cases"])
+
+    # Sort failed_cases within each file
+    for test_file in all_file_stats:
+        all_file_stats[test_file]["failed_cases"].sort(
+            key=lambda x: x.get("nodeid", "")
+        )
+
+    return all_file_stats
+
+
+def get_files_with_failures(file_stats: Dict[str, Dict]) -> List[Dict]:
+    """
+    Get list of test files that have failures/errors/crashes/timeout.
+
+    Args:
+        file_stats: Dict from aggregate_all_cases_by_file()
+
+    Returns:
+        List of file stats dicts sorted by file name, only including files with failures
+    """
+    failed_files = []
+    for test_file, stats in file_stats.items():
+        if stats["failed"] > 0 or stats["errors"] > 0 or stats["crashed"] > 0 or stats["timeout"] > 0:
+            failed_files.append(stats)
+
+    failed_files.sort(key=lambda x: x["file"])
+    return failed_files
+
+
+# ==============================================================================
+# Summary Printing
+# ==============================================================================
+
+
+def print_stats_summary(shard: int, stats: Dict, shard_type: str = "regular") -> None:
+    """Print statistics summary to stdout."""
+    prefix = get_shard_type_prefix(shard_type)
+    print(f"\n{'=' * 60}")
+    print(f"Test Results for Shard {prefix}-{shard}")
+    print(f"{'=' * 60}")
+    print(f"Total:   {stats['total']}")
+    print(f"Passed:  {stats['passed']}")
+    print(f"Failed:  {stats['failed']}")
+    print(f"Skipped: {stats['skipped']}")
+    print(f"Errors:  {stats['errors']}")
+    print(f"Duration: {stats['duration']:.2f}s")
+    if stats.get("missing_files_count"):
+        print(f"Missing files: {stats['missing_files_count']}")
+    if stats.get("crashed"):
+        print(f"Crash signal: {stats.get('crash_signal', 'unknown')}")
+    print(f"{'=' * 60}")
+
+
+def create_result_summary(stats: Dict, shard: int, shard_type: str) -> str:
+    """Create a formatted result summary string."""
+    prefix = get_shard_type_prefix(shard_type)
+    status = get_shard_status(stats, stats.get("junit_generated", False))
+
+    lines = [
+        f"Shard {prefix}-{shard} Results:",
+        f"  Status: {status}",
+        f"  Total: {stats.get('total', 0)}",
+        f"  Passed: {stats.get('passed', 0)}",
+        f"  Failed: {stats.get('failed', 0)}",
+        f"  Errors: {stats.get('errors', 0)}",
+        f"  Duration: {stats.get('duration', 0.0):.2f}s",
+    ]
+
+    if stats.get("missing_files_count"):
+        lines.append(f"  Missing: {stats['missing_files_count']}")
+
+    return "\n".join(lines)
+
+
+# ==============================================================================
+# High-Level Parsing Functions
+# ==============================================================================
+
+
+def parse_shard_results(
+    report_dir: Path,
+    shard: int,
+    shard_type: str,
+    returncode: int,
+    duration: float,
+    missing_files: List[str] = None,
+) -> Tuple[Dict, Dict]:
+    """
+    Parse all results for a shard and return (stats, log_metrics).
+
+    This is the main entry point for result parsing.
+
+    Args:
+        report_dir: Directory containing test reports
+        shard: Shard number
+        shard_type: "distributed" or "regular"
+        returncode: pytest process return code
+        duration: Execution duration
+        missing_files: List of files that crashed (no XML generated)
+
+    Returns:
+        Tuple of (stats_dict, log_metrics_dict)
+    """
+    missing_files = missing_files or []
+
+    # Parse JUnit XML files
+    stats = parse_shard_xml_files(report_dir, shard, shard_type)
+
+    # Add per-file isolation metadata
+    stats["per_file_isolation"] = True
+    stats["missing_files_count"] = len(missing_files)
+
+    # Analyze log file
+    log_file = get_shard_log_file(report_dir, shard, shard_type)
+    log_metrics = analyze_pytest_log(log_file, returncode)
+
+    # Finalize stats
+    stats = finalize_stats(stats, returncode, duration)
+
+    # Merge log metrics
+    log_metrics["test_failures"] = stats.get("failed", 0) + stats.get("errors", 0)
+    log_metrics["missing_files_count"] = len(missing_files)
+    stats.update(log_metrics)
+
+    # Handle returncode=5 (no tests collected) as success
+    if returncode == 5 and stats.get("total", 0) == 0:
+        stats["returncode"] = 0
+
+    return stats, log_metrics
+
+
+def generate_shard_reports(
+    report_dir: str,
+    shard: int,
+    shard_type: str,
+    stats: Dict,
+    info: Dict,
+    missing_files: List[str] = None,
+) -> Dict[str, str]:
+    """
+    Generate all report files for a shard.
+
+    Args:
+        report_dir: Output directory
+        shard: Shard number
+        shard_type: "distributed" or "regular"
+        stats: Statistics dict
+        info: Info dict
+        missing_files: List of missing/crashed files
+
+    Returns:
+        Dict mapping report type to file path
+    """
+    report_files = {}
+
+    # Save stats
+    report_files["stats"] = save_stats_file(report_dir, shard, stats, shard_type)
+
+    # Save info
+    report_files["info"] = save_info_file(report_dir, shard, info, shard_type)
+
+    # Save missing files if any
+    if missing_files:
+        report_files["missing"] = save_missing_files_file(report_dir, shard, missing_files, shard_type)
+
+    return report_files
+
+
+# ==============================================================================
+# CLI Interface
+# ==============================================================================
+
+
+def parse_args():
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(description="Parse test results from JUnit XML files")
+    parser.add_argument("--report-dir", type=str, required=True, help="Directory containing test reports")
+    parser.add_argument("--shard", type=int, required=True, help="Shard number")
+    parser.add_argument(
+        "--shard-type",
+        type=str,
+        choices=["distributed", "regular"],
+        default="regular",
+        help="Shard type",
+    )
+    parser.add_argument("--returncode", type=int, default=0, help="pytest return code")
+    parser.add_argument("--duration", type=float, default=0.0, help="Execution duration in seconds")
+    parser.add_argument("--output-stats", type=str, help="Output file for stats JSON")
+    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+    return parser.parse_args()
+
+
+def main():
+    """CLI entry point."""
+    args = parse_args()
+
+    report_dir = Path(args.report_dir).resolve()
+    if not report_dir.exists():
+        print(f"Error: Report directory not found: {report_dir}")
+        sys.exit(1)
+
+    # Parse results
+    stats, log_metrics = parse_shard_results(
+        report_dir=report_dir,
+        shard=args.shard,
+        shard_type=args.shard_type,
+        returncode=args.returncode,
+        duration=args.duration,
+    )
+
+    # Output
+    if args.output_stats:
+        output_path = Path(args.output_stats)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        output_path.write_text(json.dumps(stats, indent=2), encoding="utf-8")
+        print(f"Stats saved to: {output_path}")
+
+    if args.verbose:
+        print(json.dumps(stats, indent=2))
+
+    print_stats_summary(args.shard, stats, args.shard_type)
+
+    # Exit with appropriate code
+    sys.exit(stats.get("returncode", 0))
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/.github/scripts/run_npu_test_shard.py b/.github/scripts/run_npu_test_shard.py
index 3c04a63b2a..2bb5686216 100644
--- a/.github/scripts/run_npu_test_shard.py
+++ b/.github/scripts/run_npu_test_shard.py
@@ -1,210 +1,2230 @@
 #!/usr/bin/env python3
 """
-Run NPU test cases from a shard JSON file.
+Run a shard of patched upstream PyTorch tests via per-case isolation pytest execution.
 
-Each test case runs in an independent subprocess for crash isolation.
-Results are collected and saved to JSON files.
+This script focuses on:
+    - Test discovery (via discover_test_files.py)
+    - Shard assignment (Step 4)
+    - Per-case isolation execution (serial or concurrent subprocess)
+
+Result parsing is handled by parse_test_results.py.
+
+Test types:
+    - distributed: NPU distributed tests (test/distributed/*) - serial execution
+    - regular: All other tests - concurrent execution (max 4 workers by default)
+
+Each shard executes tests in per-case isolation mode:
+    - First collect all test cases via pytest --collect-only
+    - Each case runs in its own pytest subprocess
+    - NPU kernel crashes won't cascade to other cases (each case isolated)
+    - Results recorded in cases.json file
+
+Execution modes:
+    - Serial: One case at a time (for distributed tests)
+    - Concurrent: Up to max_workers subprocesses running simultaneously (for regular tests)
+
+Usage:
+    python run_npu_test_shard.py \
+        --shard 1 \
+        --num-shards 50 \
+        --test-type distributed \
+        --test-dir /path/to/pytorch/test \
+        --case-paths-config /path/to/case_paths_ci.yml \
+        --disabled-testcases /path/to/disabled_testcases.json \
+        --report-dir test-reports \
+        --timeout 1200 \
+        --max-workers 4 \
+        --verbose
 """
 
 import argparse
+import dataclasses
+import importlib.util
 import json
 import os
+import signal
 import subprocess
 import sys
-import time
+import threading
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime
 from pathlib import Path
-from typing import Dict, List, Optional
+from queue import Queue
+from time import monotonic
+from typing import Dict, List, Optional, Tuple
+
+
+# ==============================================================================
+# Import Result Parser Module
+# ==============================================================================
+
+
+def load_parse_test_results_module(script_dir: Path):
+    """Load parse_test_results module dynamically."""
+    module_path = script_dir / "parse_test_results.py"
+    if not module_path.exists():
+        raise FileNotFoundError(f"parse_test_results.py not found at {module_path}")
+
+    spec = importlib.util.spec_from_file_location("parse_test_results", str(module_path))
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+# ==============================================================================
+# Data Classes
+# ==============================================================================
+
+
+@dataclasses.dataclass
+class DiscoveryResult:
+    """Result from discover_test_files.py."""
+    test_files: List[str]
+    metadata: Dict
+    total_files: int
+
+
+@dataclasses.dataclass
+class ShardAssignmentResult:
+    """Result of Step 4: Shard assignment."""
+    shard: int
+    num_shards: int
+    planned_tests: List[str]
+    planned_count: int
+
+
+@dataclasses.dataclass
+class ShardPlanResult:
+    """Complete result of discovery + shard assignment."""
+    discovery: DiscoveryResult
+    shard_assignment: ShardAssignmentResult
+
+    def get_planned_tests(self) -> List[str]:
+        return self.shard_assignment.planned_tests
+
+    def to_info_dict(self) -> Dict:
+        return {
+            "total_files": self.discovery.metadata.get("total_files", 0),
+            "test_type": self.discovery.metadata.get("test_type", "regular"),
+            "type_selected_files": self.discovery.metadata.get("type_selected", 0),
+            "type_excluded_files": self.discovery.metadata.get("type_excluded", 0),
+            "whitelist_entries": self.discovery.metadata.get("whitelist_entries", 0),
+            "blacklist_entries": self.discovery.metadata.get("blacklist_entries", 0),
+            "rules_selected": self.discovery.metadata.get("rules_selected", 0),
+            "rules_excluded": self.discovery.metadata.get("rules_excluded", 0),
+            "shard": self.shard_assignment.shard,
+            "num_shards": self.shard_assignment.num_shards,
+            "shard_files": self.shard_assignment.planned_count,
+        }
+
+
+# ==============================================================================
+# Concurrent Execution Data Classes
+# ==============================================================================
+
+
+@dataclasses.dataclass
+class CaseExecutionTask:
+    """Task for concurrent case execution."""
+    case_idx: int
+    nodeid: str
+    test_file: str
+    file_idx: int
+
+
+@dataclasses.dataclass
+class ConcurrentExecutionConfig:
+    """Configuration for concurrent execution."""
+    max_workers: int = 4
+    per_case_timeout: int = 1200
+    verbose: bool = False
+
+
+# ==============================================================================
+# Failed Case Log Saving Functions
+# ==============================================================================
+
+
+def sanitize_nodeid_for_filename(nodeid: str) -> str:
+    """
+    Convert nodeid to a safe filename.
+
+    Replaces special characters with underscores and truncates if too long.
+    """
+    # Replace special characters
+    safe_name = nodeid.replace("::", "_").replace("/", "_").replace("\\", "_")
+    safe_name = safe_name.replace("(", "_").replace(")", "_").replace("[", "_").replace("]", "_")
+    safe_name = safe_name.replace("*", "_").replace("?", "_").replace(" ", "_")
+    safe_name = safe_name.replace(".", "_")
+
+    # Remove leading underscores and collapse multiple underscores
+    while safe_name.startswith("_"):
+        safe_name = safe_name[1:]
+    while "__" in safe_name:
+        safe_name = safe_name.replace("__", "_")
+
+    # Truncate if too long (max 200 chars)
+    if len(safe_name) > 200:
+        safe_name = safe_name[:200]
+
+    return safe_name or "unknown_case"
+
+
+def save_failed_case_log(
+    report_dir: Path,
+    shard: int,
+    shard_type: str,
+    nodeid: str,
+    case_idx: int,
+    status: str,
+    stdout: str,
+    stderr: str,
+    duration: float,
+    returncode: int,
+    command: str,
+) -> Path:
+    """
+    Save complete execution log for a failed case.
+
+    Creates a dedicated log file containing:
+    - Case metadata (nodeid, status, duration, returncode)
+    - Full stdout and stderr output
+    - Execution command
+
+    Returns:
+        Path to the saved log file
+    """
+    # Only save for failed/error/crashed/timeout cases
+    if status not in ("failed", "error", "crashed", "timeout"):
+        return None
+
+    # Create failed cases log directory
+    failed_logs_dir = report_dir / "failed_cases_logs"
+    failed_logs_dir.mkdir(parents=True, exist_ok=True)
+
+    # Generate safe filename
+    safe_name = sanitize_nodeid_for_filename(nodeid)
+    prefix = "dist" if shard_type == "distributed" else "reg"
+    log_filename = f"{prefix}-{shard}_{case_idx}_{safe_name}.log"
+    log_path = failed_logs_dir / log_filename
+
+    # Write log content
+    content_lines = [
+        "=" * 80,
+        f"FAILED CASE LOG",
+        "=" * 80,
+        f"Shard: {prefix}-{shard}",
+        f"Case Index: {case_idx}",
+        f"Nodeid: {nodeid}",
+        f"Status: {status}",
+        f"Duration: {duration:.2f}s",
+        f"Return Code: {returncode}",
+        f"Command: {command}",
+        "=" * 80,
+        "",
+        "STDOUT:",
+        "-" * 80,
+        stdout or "(empty)",
+        "",
+        "STDERR:",
+        "-" * 80,
+        stderr or "(empty)",
+        "",
+        "=" * 80,
+    ]
+
+    log_path.write_text("\n".join(content_lines), encoding="utf-8")
+    return log_path
+
+
+class ConcurrentResultAggregator:
+    """Thread-safe result aggregator for concurrent execution."""
+
+    def __init__(self):
+        self._lock = threading.Lock()
+        self._cases_list: List[Dict] = []
+        self._worst_returncode: int = 0
+        self._passed_count: int = 0
+        self._failed_count: int = 0
+        self._error_count: int = 0
+        self._skipped_count: int = 0
+        self._crashed_count: int = 0
+        self._timeout_count: int = 0
+        self._total_cases: int = 0
+
+    def add_case_result(self, case_result: Dict) -> None:
+        """Thread-safe add case result."""
+        with self._lock:
+            self._cases_list.append(case_result)
+            self._total_cases += 1
+
+            status = case_result.get("status", "error")
+            if status == "passed":
+                self._passed_count += 1
+            elif status == "failed":
+                self._failed_count += 1
+            elif status == "skipped":
+                self._skipped_count += 1
+            elif status == "crashed":
+                self._crashed_count += 1
+            elif status == "timeout":
+                self._timeout_count += 1
+            elif status == "error":
+                self._error_count += 1
+            else:
+                # Unknown status treated as error
+                self._error_count += 1
 
+            # Track worst returncode (ignore skipped/no_tests)
+            rc = case_result.get("returncode", 1)
+            if rc != 0 and rc != 3 and rc != 5:
+                if self._worst_returncode == 0:
+                    self._worst_returncode = rc
 
-def run_single_case(
-    case_id: str,
+    def get_sorted_cases(self) -> List[Dict]:
+        """Get cases sorted by case_idx."""
+        with self._lock:
+            return sorted(self._cases_list, key=lambda x: x.get("case_idx", 0))
+
+    def get_summary(self) -> Dict:
+        """Get execution summary."""
+        with self._lock:
+            return {
+                "total_cases": self._total_cases,
+                "passed_count": self._passed_count,
+                "failed_count": self._failed_count,
+                "error_count": self._error_count,
+                "skipped_count": self._skipped_count,
+                "crashed_count": self._crashed_count,
+                "timeout_count": self._timeout_count,
+                "worst_returncode": self._worst_returncode,
+            }
+
+
+class ProgressTracker:
+    """Thread-safe progress tracker with real-time output."""
+
+    def __init__(self, total_tasks: int):
+        self._total_tasks = total_tasks
+        self._completed_tasks = 0
+        self._lock = threading.Lock()
+        self._start_time = monotonic()
+
+    def mark_completed(self, nodeid: str, status: str, duration: float) -> None:
+        """Mark task completed and print progress."""
+        with self._lock:
+            self._completed_tasks += 1
+            elapsed = monotonic() - self._start_time
+            progress_pct = (self._completed_tasks / self._total_tasks) * 100
+
+            # Status indicator
+            status_icon = {
+                "passed": "[PASS]",
+                "failed": "[FAIL]",
+                "error": "[ERR]",
+                "crashed": "[CRASH]",
+                "timeout": "[TIMEOUT]",
+                "skipped": "[SKIP]",
+            }.get(status, "[?]")
+
+            # Truncate nodeid for display
+            display_nodeid = nodeid[:60] + "..." if len(nodeid) > 60 else nodeid
+
+            print(f"[{self._completed_tasks}/{self._total_tasks}] {progress_pct:.1f}% "
+                  f"{status_icon} {display_nodeid} ({duration:.1f}s) "
+                  f"[elapsed: {elapsed:.0f}s]", flush=True)
+
+    def get_progress(self) -> Tuple[int, int]:
+        """Get current progress."""
+        with self._lock:
+            return self._completed_tasks, self._total_tasks
+
+
+def get_signal_name(signal_num: int) -> str:
+    """Convert signal number to human-readable name."""
+    try:
+        name = signal.Signals(signal_num).name
+        return f"{name}({signal_num})"
+    except ValueError:
+        return f"SIG{signal_num}"
+
+
+# ==============================================================================
+# Discovery Integration
+# ==============================================================================
+
+
+def load_discover_module(script_dir: Path):
+    """Load discover_test_files module dynamically."""
+    module_path = script_dir / "discover_test_files.py"
+    if not module_path.exists():
+        raise FileNotFoundError(f"discover_test_files.py not found at {module_path}")
+
+    spec = importlib.util.spec_from_file_location("discover_test_files", str(module_path))
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+def run_discovery(
+    test_dir: Path,
+    test_type: str,
+    case_paths_config: Optional[str],
+    discover_module,
+) -> DiscoveryResult:
+    """Run test discovery and return DiscoveryResult."""
+    test_files, metadata = discover_module.discover_test_files(
+        test_dir=test_dir,
+        test_type=test_type,
+        case_paths_config=case_paths_config,
+    )
+
+    return DiscoveryResult(
+        test_files=test_files,
+        metadata=metadata,
+        total_files=len(test_files),
+    )
+
+
+# ==============================================================================
+# Shard Assignment (Step 4)
+# ==============================================================================
+
+
+def select_shard_files(test_files: List[str], shard: int, num_shards: int) -> List[str]:
+    """
+    Select test files for a shard using contiguous range-based selection.
+
+    Args:
+        test_files: List of test file paths, already sorted alphabetically
+        shard: Shard number (1-indexed, 1 <= shard <= num_shards)
+        num_shards: Total number of shards
+
+    Returns:
+        List of test files assigned to this shard
+    """
+    if not test_files:
+        return []
+
+    shard_index = shard - 1
+    total_files = len(test_files)
+
+    base_size = total_files // num_shards
+    remainder = total_files % num_shards
+
+    if shard_index < remainder:
+        start = shard_index * (base_size + 1)
+        end = start + base_size + 1
+    else:
+        start = remainder * (base_size + 1) + (shard_index - remainder) * base_size
+        end = start + base_size
+
+    return test_files[start:end]
+
+
+def assign_shard(discovery_result: DiscoveryResult, shard: int, num_shards: int) -> ShardAssignmentResult:
+    """Assign test files to a specific shard."""
+    planned_tests = select_shard_files(discovery_result.test_files, shard, num_shards)
+    return ShardAssignmentResult(
+        shard=shard,
+        num_shards=num_shards,
+        planned_tests=planned_tests,
+        planned_count=len(planned_tests),
+    )
+
+
+# ==============================================================================
+# Complete Test Planning
+# ==============================================================================
+
+
+def plan_shard_tests(
     test_dir: Path,
+    shard: int,
+    num_shards: int,
+    test_type: str,
+    case_paths_config: Optional[str],
+    discover_module,
+) -> ShardPlanResult:
+    """Complete test planning: discovery + shard assignment."""
+    discovery_result = run_discovery(test_dir, test_type, case_paths_config, discover_module)
+    shard_assignment_result = assign_shard(discovery_result, shard, num_shards)
+
+    return ShardPlanResult(
+        discovery=discovery_result,
+        shard_assignment=shard_assignment_result,
+    )
+
+
+def create_test_plan_summary(result: ShardPlanResult) -> str:
+    """Create human-readable summary."""
+    lines = [
+        "=" * 60,
+        "Test Planning Summary",
+        "=" * 60,
+        f"Discovery (Steps 1-3): {result.discovery.metadata.get('total_files', 0)} files scanned",
+        f"  Test type: {result.discovery.metadata.get('test_type', 'regular')}",
+        f"  Type filter: {result.discovery.metadata.get('type_selected', 0)} selected",
+        f"  Rules filter: {result.discovery.metadata.get('rules_selected', 0)} after whitelist/blacklist",
+        f"Shard Assignment (Step 4): {result.shard_assignment.planned_count} files for shard {result.shard_assignment.shard}/{result.shard_assignment.num_shards}",
+        "=" * 60,
+    ]
+    return "\n".join(lines)
+
+
+# ==============================================================================
+# Utility Functions
+# ==============================================================================
+
+
+def strip_test_prefix_and_suffix(test_path: str) -> str:
+    """Remove 'test/' prefix and '.py' suffix from path."""
+    path = test_path
+    if path.startswith("test/"):
+        path = path[5:]
+    if path.endswith(".py"):
+        path = path[:-3]
+    return path
+
+
+def load_installed_torch_root() -> str:
+    """Get installed torch root directory."""
+    try:
+        import torch
+        return str(Path(torch.__file__).resolve().parent.parent)
+    except Exception as exc:
+        print(f"Warning: Failed to import torch: {exc}")
+        return ""
+
+
+# ==============================================================================
+# Case Collection
+# ==============================================================================
+
+
+def collect_test_cases(test_file: str, test_dir: Path, env: Dict) -> List[str]:
+    """
+    Collect all test cases from a test file via pytest --collect-only.
+
+    Adds test file's parent directory to PYTHONPATH to enable
+    imports of sibling modules (e.g., 'from model_registry import MLPModule').
+
+    Args:
+        test_file: Test file path (e.g., "test/test_autograd.py")
+        test_dir: Path to PyTorch test directory
+        env: Environment dict for subprocess (will be modified for this call)
+
+    Returns:
+        List of case nodeids (e.g., ["test_autograd.py::TestAutograd::test_grad"])
+    """
+    # Strip test/ prefix if present
+    original_test_file = test_file
+    if test_file.startswith("test/"):
+        test_file = test_file[5:]
+
+    # Get test file's parent directory for PYTHONPATH
+    test_file_path = Path(test_file)
+    test_file_dir = test_dir / test_file_path.parent
+
+    # Build per-file environment with test file directory in PYTHONPATH
+    file_env = env.copy()
+    existing_pythonpath = file_env.get("PYTHONPATH", "")
+    file_env["PYTHONPATH"] = str(test_file_dir) + (":" + existing_pythonpath if existing_pythonpath else "")
+
+    command = [
+        sys.executable,
+        "-m",
+        "pytest",
+        "--collect-only",
+        "--quiet",
+        test_file,
+    ]
+
+    try:
+        result = subprocess.run(
+            command,
+            cwd=str(test_dir),
+            env=file_env,  # Use per-file environment with test file directory in PYTHONPATH
+            capture_output=True,
+            text=True,
+            encoding="utf-8",
+            errors="replace",
+            timeout=60,  # Collection timeout
+        )
+
+        # Check for collection errors based on pytest exit codes:
+        #   0: all passed (success)
+        #   2: pytest error (includes collection errors like ImportError)
+        #   3: all skipped (success)
+        #   4: command line error (error)
+        #   5: no tests collected (ERROR - test file should have cases)
+        # Key insight: if a test file is selected for execution, it should have cases.
+        # returncode 5 means 0 cases collected, which indicates a problem.
+        stdout_content = result.stdout.strip()
+
+        if result.returncode not in (0, 3):
+            # returncode 2, 4, 5: real collection error
+            # returncode 5 specifically means no tests collected - a problem for selected files
+            print(f"    WARNING: Collection errors for {test_file}:")
+            # Print relevant lines from stdout (pytest collection errors are in stdout)
+            stdout_lines = stdout_content.splitlines()
+            for line in stdout_lines[-20:]:
+                if line.strip():
+                    print(f"      {line[:200]}")
+            # Also print stderr if relevant
+            if result.stderr:
+                stderr_lines = result.stderr.strip().splitlines()
+                for line in stderr_lines[-10:]:
+                    if line.strip():
+                        print(f"      {line[:200]}")
+
+        # Parse nodeids from output
+        nodeids = []
+        for line in result.stdout.splitlines():
+            # pytest --collect-only outputs nodeids like:
+            # <Function test_grad>
+            # or with verbose:
+            # test_autograd.py::TestAutograd::test_grad
+            if "::" in line:
+                # Extract nodeid (remove leading spaces and markers)
+                nodeid = line.strip()
+                # Remove pytest markers like <Function ...>
+                if nodeid.startswith("<"):
+                    continue
+                nodeids.append(nodeid)
+
+        return nodeids
+
+    except subprocess.TimeoutExpired:
+        print(f"WARNING: Collection timeout for {test_file}")
+        return []
+    except Exception as e:
+        print(f"WARNING: Collection failed for {test_file}: {e}")
+        return []
+
+
+# ==============================================================================
+# Case Execution
+# ==============================================================================
+
+
+def run_single_test_case(
+    case_nodeid: str,
+    test_dir: Path,
+    env: Dict,
     timeout: int,
-    test_index: int = 0,
-    total_tests: int = 0,
-    verbose: bool = False
+    verbose: bool,
+    report_dir: Path = None,
+    shard: int = 0,
+    shard_type: str = "regular",
+    case_idx: int = 0,
+    test_file: str = "",
 ) -> Dict:
-    """Run a single test case in a subprocess."""
-    result = {
-        "case_id": case_id,
-        "status": "unknown",
-        "duration": 0,
-        "output": "",
-        "error": "",
-        "timestamp": datetime.utcnow().isoformat(),
-    }
+    """
+    Run a single test case in isolated subprocess.
+
+    Adds test file's parent directory to PYTHONPATH to enable
+    imports of sibling modules (e.g., 'from model_registry import MLPModule').
+
+    Args:
+        case_nodeid: Test case nodeid (e.g., "test_autograd.py::TestAutograd::test_grad")
+        test_dir: Path to PyTorch test directory
+        env: Environment dict for subprocess (will be modified for this call)
+        timeout: Per-case timeout in seconds
+        verbose: Verbose output
+        test_file: Test file path for PYTHONPATH calculation
+
+    Returns:
+        Dict with: nodeid, status, duration, returncode, message, command
+    """
+    start_time = monotonic()
+
+    # Preserve original nodeid for result reporting
+    original_nodeid = case_nodeid
+
+    # Strip test/ prefix from nodeid if present (pytest --collect-only outputs with test/ prefix)
+    # When cwd is test_dir, the path should be relative to test_dir, not include test/
+    if case_nodeid.startswith("test/"):
+        case_nodeid = case_nodeid[5:]
+
+    # Build per-case environment with test file directory in PYTHONPATH
+    case_env = env.copy()
+    if test_file:
+        if test_file.startswith("test/"):
+            test_file_rel = test_file[5:]
+        else:
+            test_file_rel = test_file
 
-    # Parse case_id to get test file path
-    parts = case_id.split("::")
-    test_file = parts[0]
+        test_file_path = Path(test_file_rel)
+        test_file_dir = test_dir / test_file_path.parent
 
-    # Add test file parent directory to PYTHONPATH for sibling imports
-    test_file_path = test_dir / test_file
-    parent_dir = str(test_file_path.parent)
+        existing_pythonpath = case_env.get("PYTHONPATH", "")
+        case_env["PYTHONPATH"] = str(test_file_dir) + (":" + existing_pythonpath if existing_pythonpath else "")
 
-    env = os.environ.copy()
-    existing_pythonpath = env.get("PYTHONPATH", "")
-    env["PYTHONPATH"] = f"{parent_dir}:{existing_pythonpath}" if existing_pythonpath else parent_dir
+    command = [
+        sys.executable,
+        "-m",
+        "pytest",
+        "--color=no",
+        "-ra",
+        "--tb=short",
+        case_nodeid,
+    ]
 
-    cmd = ["pytest", "-v", "--timeout=300", "-x", case_id]
+    if timeout > 0:
+        command.append(f"--timeout={timeout}")
 
-    # Print progress and command before execution
     if verbose:
-        if test_index > 0 and total_tests > 0:
-            print(f"Test {test_index}/{total_tests}: { ' '.join(cmd)}")
-        print(f"Running: {case_id}")
+        command.append("-vv")
+    else:
+        command.append("-v")
+
+    # Print command to log
+    command_str = " ".join(command)
+    print(f"    Command: {command_str}")
 
-    start_time = time.time()
     try:
-        proc = subprocess.run(
-            cmd,
+        result = subprocess.run(
+            command,
+            cwd=str(test_dir),
+            env=case_env,  # Use per-case environment with test file directory in PYTHONPATH
             capture_output=True,
             text=True,
-            timeout=timeout,
+            encoding="utf-8",
+            errors="replace",
+            timeout=timeout + 10,  # Extra buffer for timeout handling
+        )
+
+        duration = monotonic() - start_time
+        returncode = result.returncode
+
+        # Determine status
+        if returncode == 0:
+            status = "passed"
+        elif returncode == 1:
+            status = "failed"
+        elif returncode == 2:
+            status = "error"
+        elif returncode == 3:
+            status = "skipped"
+        elif returncode == 4:
+            status = "error"  # usage error
+        elif returncode == 5:
+            status = "no_tests"
+        elif returncode < 0:
+            status = "crashed"
+        else:
+            status = "error"
+
+        # Extract error message from output
+        message = ""
+        if status in ("failed", "error", "crashed"):
+            # Extract last meaningful lines from stderr/stdout
+            output = result.stderr + result.stdout
+            lines = output.splitlines()
+            error_lines = [l for l in lines[-20:] if l.strip()]
+            message = "\n".join(error_lines[-5:])[:500]  # Limit message length
+
+        case_result = {
+            "nodeid": original_nodeid,
+            "status": status,
+            "duration": duration,
+            "returncode": returncode,
+            "message": message,
+            "command": command_str,
+        }
+
+        # Save failed case log to file
+        if status in ("failed", "error", "crashed") and report_dir:
+            save_failed_case_log(
+                report_dir=report_dir,
+                shard=shard,
+                shard_type=shard_type,
+                nodeid=original_nodeid,
+                case_idx=case_idx,
+                status=status,
+                stdout=result.stdout,
+                stderr=result.stderr,
+                duration=duration,
+                returncode=returncode,
+                command=command_str,
+            )
+
+        return case_result
+
+    except subprocess.TimeoutExpired:
+        duration = monotonic() - start_time
+        case_result = {
+            "nodeid": original_nodeid,
+            "status": "timeout",
+            "duration": duration,
+            "returncode": -1,
+            "message": f"Timeout after {timeout}s",
+            "command": command_str,
+        }
+
+        # Save timeout case log
+        if report_dir:
+            save_failed_case_log(
+                report_dir=report_dir,
+                shard=shard,
+                shard_type=shard_type,
+                nodeid=original_nodeid,
+                case_idx=case_idx,
+                status="timeout",
+                stdout="(process timed out, no output captured)",
+                stderr="(process timed out, no output captured)",
+                duration=duration,
+                returncode=-1,
+                command=command_str,
+            )
+
+        return case_result
+
+    except Exception as e:
+        duration = monotonic() - start_time
+        case_result = {
+            "nodeid": original_nodeid,
+            "status": "error",
+            "duration": duration,
+            "returncode": 1,
+            "message": str(e)[:500],
+            "command": command_str,
+        }
+
+        # Save error case log
+        if report_dir:
+            save_failed_case_log(
+                report_dir=report_dir,
+                shard=shard,
+                shard_type=shard_type,
+                nodeid=original_nodeid,
+                case_idx=case_idx,
+                status="error",
+                stdout="(exception occurred before execution)",
+                stderr=str(e),
+                duration=duration,
+                returncode=1,
+                command=command_str,
+            )
+
+        return case_result
+
+
+# ==============================================================================
+# Concurrent Case Execution
+# ==============================================================================
+
+
+def run_single_case_concurrent(
+    task: CaseExecutionTask,
+    test_dir: Path,
+    merged_env: Dict[str, str],
+    config: ConcurrentExecutionConfig,
+    result_aggregator: ConcurrentResultAggregator,
+    progress_tracker: ProgressTracker,
+    log_queue: Queue,
+    report_dir: Path,
+    shard: int,
+    shard_type: str,
+) -> Dict:
+    """
+    Execute a single test case in subprocess (for concurrent execution).
+
+    This function runs in ThreadPoolExecutor threads. Each call spawns
+    an independent subprocess for the test case. Core dumps and crashes
+    in the subprocess do NOT affect the main Python process or other
+    concurrent tasks.
+
+    CRITICAL: This function must catch ALL exceptions and return a result
+    dict. It should NEVER raise exceptions to ThreadPoolExecutor level.
+
+    Args:
+        task: Case execution task with nodeid and metadata
+        test_dir: PyTorch test directory
+        merged_env: Environment variables
+        config: Execution configuration
+        result_aggregator: Thread-safe result collector
+        progress_tracker: Thread-safe progress tracker
+        log_queue: Queue for log messages
+
+    Returns:
+        Dict with case result (never raises exception)
+    """
+    start_time = monotonic()
+    original_nodeid = task.nodeid
+    case_nodeid = task.nodeid
+
+    # Strip test/ prefix for pytest execution
+    if case_nodeid.startswith("test/"):
+        case_nodeid = case_nodeid[5:]
+
+    command = [
+        sys.executable,
+        "-m",
+        "pytest",
+        "--color=no",
+        "-ra",
+        "--tb=short",
+        case_nodeid,
+    ]
+
+    if config.per_case_timeout > 0:
+        command.append(f"--timeout={config.per_case_timeout}")
+
+    if config.verbose:
+        command.append("-vv")
+    else:
+        command.append("-v")
+
+    command_str = " ".join(command)
+
+    # Build per-case environment with test file directory in PYTHONPATH
+    # This enables imports of sibling modules (e.g., 'from model_registry import MLPModule')
+    case_env = merged_env.copy()
+    test_file = task.test_file
+    if test_file.startswith("test/"):
+        test_file_rel = test_file[5:]
+    else:
+        test_file_rel = test_file
+
+    test_file_path = Path(test_file_rel)
+    test_file_dir = test_dir / test_file_path.parent
+
+    existing_pythonpath = case_env.get("PYTHONPATH", "")
+    case_env["PYTHONPATH"] = str(test_file_dir) + (":" + existing_pythonpath if existing_pythonpath else "")
+
+    # Print start log to stdout (before execution)
+    # Truncate nodeid for display
+    display_nodeid = original_nodeid[:70] + "..." if len(original_nodeid) > 70 else original_nodeid
+    print(f"[{task.case_idx}] Starting: {display_nodeid}", flush=True)
+
+    # Log start
+    log_queue.put({
+        "type": "case_start",
+        "case_idx": task.case_idx,
+        "nodeid": original_nodeid,
+        "file": task.test_file,
+        "command": command_str,
+    })
+
+    # Execute subprocess - CRITICAL: catch ALL exceptions
+    try:
+        result = subprocess.run(
+            command,
             cwd=str(test_dir),
-            env=env
+            env=case_env,  # Use per-case environment with test file directory in PYTHONPATH
+            capture_output=True,
+            text=True,
+            encoding="utf-8",
+            errors="replace",
+            timeout=config.per_case_timeout + 30,  # Extra buffer
         )
-        duration = time.time() - start_time
-        result["duration"] = round(duration, 2)
-        result["output"] = proc.stdout
-        result["error"] = proc.stderr
-
-        if proc.returncode == 0:
-            result["status"] = "passed"
-        elif proc.returncode == 1:
-            result["status"] = "failed"
-        elif proc.returncode == 2:
-            result["status"] = "error"
-        elif proc.returncode == 3:
-            result["status"] = "skipped"
-        elif proc.returncode == 4:
-            result["status"] = "xfail"
-        elif proc.returncode == 5:
-            result["status"] = "xpass"
+
+        duration = monotonic() - start_time
+        returncode = result.returncode
+
+        # Determine status (including crashed with negative returncode)
+        if returncode == 0:
+            status = "passed"
+        elif returncode == 1:
+            status = "failed"
+        elif returncode == 2:
+            status = "error"
+        elif returncode == 3:
+            status = "skipped"
+        elif returncode == 4:
+            status = "error"
+        elif returncode == 5:
+            status = "no_tests"
+        elif returncode < 0:
+            # Core dump or signal crash
+            status = "crashed"
+            signal_name = get_signal_name(abs(returncode))
         else:
-            result["status"] = f"unknown_exit_{proc.returncode}"
+            status = "error"
+
+        # Extract error message
+        message = ""
+        if status in ("failed", "error", "crashed"):
+            output = result.stderr + result.stdout
+            lines = output.splitlines()
+            error_lines = [l for l in lines[-20:] if l.strip()]
+            message = "\n".join(error_lines[-5:])[:500]
+            if status == "crashed":
+                message = f"Process crashed with {signal_name}\n{message}"
+
+        case_result = {
+            "nodeid": original_nodeid,
+            "status": status,
+            "duration": duration,
+            "returncode": returncode,
+            "message": message,
+            "command": command_str,
+            "file": task.test_file,
+            "case_idx": task.case_idx,
+        }
 
-        if verbose:
-            print(f"  [{result['status']}] {duration:.2f}s")
+        # Save failed case log to file
+        if status in ("failed", "error", "crashed"):
+            save_failed_case_log(
+                report_dir=report_dir,
+                shard=shard,
+                shard_type=shard_type,
+                nodeid=original_nodeid,
+                case_idx=task.case_idx,
+                status=status,
+                stdout=result.stdout,
+                stderr=result.stderr,
+                duration=duration,
+                returncode=returncode,
+                command=command_str,
+            )
 
     except subprocess.TimeoutExpired:
-        result["status"] = "timeout"
-        result["duration"] = timeout
-        result["error"] = f"Test timed out after {timeout} seconds"
-        if verbose:
-            print(f"  [timeout] {timeout}s")
+        # Timeout - return result, don't raise
+        duration = monotonic() - start_time
+        case_result = {
+            "nodeid": original_nodeid,
+            "status": "timeout",
+            "duration": duration,
+            "returncode": -1,
+            "message": f"Timeout after {config.per_case_timeout}s",
+            "command": command_str,
+            "file": task.test_file,
+            "case_idx": task.case_idx,
+        }
+
+        # Save timeout case log
+        save_failed_case_log(
+            report_dir=report_dir,
+            shard=shard,
+            shard_type=shard_type,
+            nodeid=original_nodeid,
+            case_idx=task.case_idx,
+            status="timeout",
+            stdout="(process timed out, no output captured)",
+            stderr="(process timed out, no output captured)",
+            duration=duration,
+            returncode=-1,
+            command=command_str,
+        )
 
     except Exception as e:
-        result["status"] = "crashed"
-        result["error"] = str(e)
-        if verbose:
-            print(f"  [crashed] {e}")
+        # Any other exception - return result, don't raise
+        duration = monotonic() - start_time
+        case_result = {
+            "nodeid": original_nodeid,
+            "status": "error",
+            "duration": duration,
+            "returncode": 1,
+            "message": f"Unexpected error: {str(e)[:200]}",
+            "command": command_str,
+            "file": task.test_file,
+            "case_idx": task.case_idx,
+        }
+
+        # Save error case log
+        save_failed_case_log(
+            report_dir=report_dir,
+            shard=shard,
+            shard_type=shard_type,
+            nodeid=original_nodeid,
+            case_idx=task.case_idx,
+            status="error",
+            stdout="(exception occurred before execution)",
+            stderr=str(e),
+            duration=duration,
+            returncode=1,
+            command=command_str,
+        )
+
+    # Log finish
+    log_queue.put({
+        "type": "case_finish",
+        "case_idx": task.case_idx,
+        "nodeid": original_nodeid,
+        "status": case_result["status"],
+        "duration": case_result["duration"],
+        "message": case_result["message"][:200] if case_result["message"] else "",
+    })
+
+    # Update aggregator (thread-safe)
+    result_aggregator.add_case_result(case_result)
+
+    # Update progress (thread-safe)
+    progress_tracker.mark_completed(original_nodeid, case_result["status"], duration)
+
+    return case_result
 
-    return result
 
+def log_writer_thread(log_queue: Queue, log_file: Path, stop_event: threading.Event) -> None:
+    """
+    Background thread for writing logs.
 
-def run_shard(
-    cases_json: str,
-    test_dir: str,
-    report_dir: str,
+    Ensures thread-safe log file writes while concurrent tasks run.
+    """
+    with log_file.open("w", encoding="utf-8") as log_handle:
+        while not stop_event.is_set() or not log_queue.empty():
+            try:
+                log_entry = log_queue.get(timeout=0.5)
+            except:
+                continue
+
+            if log_entry.get("type") == "header":
+                log_handle.write(log_entry.get("content", ""))
+                log_handle.flush()
+            elif log_entry.get("type") == "case_start":
+                log_handle.write(f"\n[{log_entry['case_idx']}] {log_entry['nodeid']}\n")
+                log_handle.write(f"  File: {log_entry.get('file', '')}\n")
+                log_handle.write(f"  Command: {log_entry.get('command', '')}\n")
+                log_handle.flush()
+            elif log_entry.get("type") == "case_finish":
+                status_str = log_entry.get("status", "")
+                duration_str = f"{log_entry.get('duration', 0):.2f}s"
+                log_handle.write(f"  Status: {status_str}, Duration: {duration_str}\n")
+                if log_entry.get("message"):
+                    log_handle.write(f"  Message: {log_entry['message']}\n")
+                log_handle.flush()
+            elif log_entry.get("type") == "summary":
+                log_handle.write(log_entry.get("content", ""))
+                log_handle.flush()
+
+
+def run_tests_with_concurrent_isolation(
+    planned_tests: List[str],
+    shard: int,
+    test_dir: Path,
+    report_dir: Path,
+    env_updates: Dict[str, str],
     timeout: int,
+    verbose: bool,
+    shard_type: str,
     max_workers: int,
-    verbose: bool = False
-) -> Dict:
-    """Run all cases from a shard JSON file."""
-    cases_file = Path(cases_json)
-    if not cases_file.exists():
-        raise FileNotFoundError(f"Cases JSON file not found: {cases_json}")
+    result_module,
+) -> Tuple[int, float, List[Dict]]:
+    """
+    Execute tests with concurrent per-case isolation.
 
-    with open(cases_file) as f:
-        shard_data = json.load(f)
+    Each test case runs in its own pytest subprocess for crash isolation.
+    Up to max_workers subprocesses execute concurrently via ThreadPoolExecutor.
 
-    cases = shard_data.get("cases", [])
-    shard_index = shard_data.get("shard_index", 1)
-    total_shards = shard_data.get("total_shards", 1)
+    Core dumps in subprocess do NOT affect:
+    - The main Python process
+    - Other concurrent subprocesses
+    - Pending tasks in the queue
 
-    print(f"Loaded shard {shard_index}/{total_shards} with {len(cases)} cases")
+    Args:
+        planned_tests: List of test file paths
+        shard: Shard number
+        test_dir: PyTorch test directory
+        report_dir: Report output directory
+        env_updates: Environment variable updates
+        timeout: Per-case timeout in seconds
+        verbose: Verbose output
+        shard_type: "distributed" or "regular"
+        max_workers: Maximum concurrent subprocesses (default: 4)
+        result_module: parse_test_results module
 
-    test_dir_path = Path(test_dir)
-    if not test_dir_path.exists():
-        raise FileNotFoundError(f"Test directory not found: {test_dir}")
+    Returns:
+        Tuple of (worst_returncode, duration, cases_list_sorted)
+    """
+    start = monotonic()
+    log_file = result_module.get_shard_log_file(report_dir, shard, shard_type)
 
-    report_dir_path = Path(report_dir)
-    report_dir_path.mkdir(parents=True, exist_ok=True)
+    merged_env = os.environ.copy()
+    merged_env.update(env_updates)
 
-    results = []
-    stats = {
-        "passed": 0,
-        "failed": 0,
-        "error": 0,
-        "skipped": 0,
-        "timeout": 0,
-        "crashed": 0,
-        "xfail": 0,
-        "xpass": 0,
-        "unknown": 0,
-    }
+    config = ConcurrentExecutionConfig(
+        max_workers=max_workers,
+        per_case_timeout=timeout,
+        verbose=verbose,
+    )
+
+    # Thread-safe result aggregator
+    result_aggregator = ConcurrentResultAggregator()
+
+    # Log queue and writer thread
+    log_queue = Queue()
+    stop_event = threading.Event()
+    log_thread = threading.Thread(
+        target=log_writer_thread,
+        args=(log_queue, log_file, stop_event),
+        daemon=True,
+    )
 
-    print(f"Running tests with {max_workers} workers...")
+    # Write log header
+    log_queue.put({
+        "type": "header",
+        "content": (
+            "=" * 80 + "\n"
+            f"Concurrent per-case isolation pytest execution ({shard_type} shard)\n"
+            "=" * 80 + "\n"
+            f"Total test files: {len(planned_tests)}\n"
+            f"Max concurrent workers: {max_workers}\n"
+            "Execution mode: concurrent subprocess, each case isolated\n"
+            "=" * 80 + "\n\n"
+        ),
+    })
 
-    total_tests = len(cases)
+    log_thread.start()
+
+    print(f"\n{'=' * 80}")
+    print(f"Concurrent per-case isolation mode: {len(planned_tests)} files")
+    print(f"Execution mode: {max_workers} workers concurrent, each case in subprocess")
+    print(f"{'=' * 80}\n")
+
+    # Phase 1: Collect all test cases (serial, as parsing test files)
+    all_tasks: List[CaseExecutionTask] = []
+    case_idx = 0
+
+    print("Phase 1: Collecting test cases...")
+    for file_idx, test_file in enumerate(planned_tests, 1):
+        test_name = strip_test_prefix_and_suffix(test_file)
+        print(f"\n  [File {file_idx}/{len(planned_tests)}] Collecting: {test_name}")
+
+        case_nodeids = collect_test_cases(test_file, test_dir, merged_env)
+
+        if not case_nodeids:
+            print(f"    No cases collected")
+            continue
+
+        print(f"    Collected {len(case_nodeids)} cases")
+
+        for nodeid in case_nodeids:
+            case_idx += 1
+            all_tasks.append(CaseExecutionTask(
+                case_idx=case_idx,
+                nodeid=nodeid,
+                test_file=test_file,
+                file_idx=file_idx,
+            ))
+
+    total_cases = len(all_tasks)
+    print(f"\n{'=' * 80}")
+    print(f"Phase 2: Concurrent execution with {max_workers} workers")
+    print(f"Total cases to execute: {total_cases}")
+    print(f"{'=' * 80}\n")
+
+    # Phase 2: Concurrent execution via ThreadPoolExecutor
+    progress_tracker = ProgressTracker(total_cases)
 
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        futures = {
-            executor.submit(run_single_case, case, test_dir_path, timeout, idx, total_tests, verbose): (case, idx)
-            for idx, case in enumerate(cases, start=1)
+        # Submit all tasks
+        future_to_task = {
+            executor.submit(
+                run_single_case_concurrent,
+                task,
+                test_dir,
+                merged_env,
+                config,
+                result_aggregator,
+                progress_tracker,
+                log_queue,
+                report_dir,
+                shard,
+                shard_type,
+            ): task
+            for task in all_tasks
         }
 
-        for future in as_completed(futures):
-            case, idx = futures[future]
-            result = future.result()
-            results.append(result)
+        # Wait for completion (as_completed gives results as they finish)
+        for future in as_completed(future_to_task):
+            task = future_to_task[future]
+            try:
+                # Result already collected in aggregator
+                _ = future.result()
+            except Exception as e:
+                # Should never happen (run_single_case_concurrent catches all)
+                # But as safety, create error result
+                case_result = {
+                    "nodeid": task.nodeid,
+                    "status": "error",
+                    "duration": 0.0,
+                    "returncode": 1,
+                    "message": f"Future error: {str(e)[:200]}",
+                    "file": task.test_file,
+                    "case_idx": task.case_idx,
+                }
+                result_aggregator.add_case_result(case_result)
+                progress_tracker.mark_completed(task.nodeid, "error", 0.0)
+
+    # Stop log thread
+    elapsed = monotonic() - start
+    summary = result_aggregator.get_summary()
+
+    log_queue.put({
+        "type": "summary",
+        "content": (
+            f"\n{'=' * 80}\n"
+            f"Summary: {summary['total_cases']} cases executed\n"
+            f"  Passed: {summary['passed_count']}\n"
+            f"  Failed: {summary['failed_count']}\n"
+            f"  Errors: {summary['error_count']}\n"
+            f"  Crashed: {summary['crashed_count']}\n"
+            f"  Timeout: {summary['timeout_count']}\n"
+            f"  Skipped: {summary['skipped_count']}\n"
+            f"  Duration: {elapsed:.2f}s\n"
+            f"  Concurrent workers: {max_workers}\n"
+            f"{'=' * 80}\n"
+        ),
+    })
+
+    stop_event.set()
+    log_thread.join(timeout=5)
+
+    # Print final summary
+    print(f"\n{'=' * 80}", flush=True)
+    print(f"Summary: {summary['total_cases']} cases executed", flush=True)
+    print(f"  Passed: {summary['passed_count']}", flush=True)
+    print(f"  Failed: {summary['failed_count']}", flush=True)
+    print(f"  Errors: {summary['error_count']}", flush=True)
+    print(f"  Crashed: {summary['crashed_count']}", flush=True)
+    print(f"  Timeout: {summary['timeout_count']}", flush=True)
+    print(f"  Skipped: {summary['skipped_count']}", flush=True)
+    print(f"  Duration: {elapsed:.2f}s", flush=True)
+    print(f"{'=' * 80}", flush=True)
+
+    return summary["worst_returncode"], elapsed, result_aggregator.get_sorted_cases()
+
+
+def run_tests_with_case_isolation(
+    planned_tests: List[str],
+    shard: int,
+    test_dir: Path,
+    report_dir: Path,
+    env_updates: Dict[str, str],
+    timeout: int,
+    verbose: bool,
+    shard_type: str,
+    result_module,
+) -> Tuple[int, float, List[Dict]]:
+    """
+    Execute tests with per-case isolation (strict serial execution).
+
+    Each test case runs in its own pytest subprocess for crash isolation.
+    No parallel execution - strict serial processing.
+
+    Returns:
+        Tuple of (worst_returncode, duration, cases_list)
+    """
+    start = monotonic()
+    prefix = result_module.get_shard_type_prefix(shard_type)
+    log_file = result_module.get_shard_log_file(report_dir, shard, shard_type)
+
+    merged_env = os.environ.copy()
+    merged_env.update(env_updates)
+
+    cases_list = []
+    worst_returncode = 0
+
+    with log_file.open("w", encoding="utf-8") as log_handle:
+        log_handle.write("=" * 80 + "\n")
+        log_handle.write(f"Per-case isolation pytest execution ({shard_type} shard)\n")
+        log_handle.write("=" * 80 + "\n")
+        log_handle.write(f"Total test files: {len(planned_tests)}\n")
+        log_handle.write("Execution mode: strict serial, each case in own process\n")
+        log_handle.write("=" * 80 + "\n\n")
+        log_handle.flush()
+
+        print(f"\n{'=' * 80}")
+        print(f"Per-case isolation mode: {len(planned_tests)} files")
+        print("Execution mode: strict serial, each case in own process")
+        print(f"{'=' * 80}\n")
+
+        total_cases = 0
+        case_idx = 0
+
+        for file_idx, test_file in enumerate(planned_tests, 1):
+            test_name = strip_test_prefix_and_suffix(test_file)
+
+            log_handle.write(f"\n{'=' * 80}\n")
+            log_handle.write(f"[File {file_idx}/{len(planned_tests)}] {test_name}\n")
+            log_handle.write(f"{'=' * 80}\n")
+            log_handle.flush()
+
+            print(f"\n[File {file_idx}/{len(planned_tests)}] {test_name}")
+            print("  Collecting test cases...")
+
+            # Collect cases for this file
+            case_nodeids = collect_test_cases(test_file, test_dir, merged_env)
+
+            if not case_nodeids:
+                log_handle.write(f"  No cases collected\n")
+                print(f"    No cases collected")
+                continue
+
+            log_handle.write(f"  Collected {len(case_nodeids)} cases\n")
+            log_handle.flush()
+            print(f"    Collected {len(case_nodeids)} cases")
+
+            # Execute each case serially
+            for nodeid in case_nodeids:
+                case_idx += 1
+                total_cases += 1
+
+                log_handle.write(f"\n  [{case_idx}] {nodeid}\n")
+                log_handle.flush()
+
+                print(f"    [{case_idx}] {nodeid}")
+
+                # Run single case
+                case_result = run_single_test_case(
+                    nodeid,
+                    test_dir,
+                    merged_env,
+                    timeout,
+                    verbose,
+                    report_dir,
+                    shard,
+                    shard_type,
+                    case_idx,
+                    test_file,
+                )
+
+                # Add file info
+                case_result["file"] = test_file
+
+                # Log result
+                status_str = case_result["status"]
+                duration_str = f"{case_result['duration']:.2f}s"
+                command_str = case_result.get("command", "")
+                message = case_result.get("message", "")
+                log_handle.write(f"    Command: {command_str}\n")
+                log_handle.write(f"    Status: {status_str}, Duration: {duration_str}\n")
+                if message:
+                    log_handle.write(f"    Message: {message[:500]}\n")
+                log_handle.flush()
+
+                print(f"      {status_str} ({duration_str})")
+                # Print error message for failed/error cases to stdout
+                if status_str in ("failed", "error", "crashed", "timeout") and message:
+                    # Print first few lines of error message
+                    msg_lines = message.splitlines()[:5]
+                    for msg_line in msg_lines:
+                        if msg_line.strip():
+                            print(f"        {msg_line[:200]}")
+
+                cases_list.append(case_result)
+
+                # Track worst returncode
+                rc = case_result["returncode"]
+                if rc != 0 and rc != 3 and rc != 5:  # Ignore skipped/no_tests
+                    if worst_returncode == 0:
+                        worst_returncode = rc
+
+        # Summary
+        elapsed = monotonic() - start
+
+        passed_count = sum(1 for c in cases_list if c["status"] == "passed")
+        failed_count = sum(1 for c in cases_list if c["status"] == "failed")
+        error_count = sum(1 for c in cases_list if c["status"] == "error")
+        crashed_count = sum(1 for c in cases_list if c["status"] == "crashed")
+        timeout_count = sum(1 for c in cases_list if c["status"] == "timeout")
+        skipped_count = sum(1 for c in cases_list if c["status"] == "skipped")
+
+        log_handle.write(f"\n{'=' * 80}\n")
+        log_handle.write(f"Summary: {total_cases} cases executed\n")
+        log_handle.write(f"  Passed: {passed_count}\n")
+        log_handle.write(f"  Failed: {failed_count}\n")
+        log_handle.write(f"  Errors: {error_count}\n")
+        log_handle.write(f"  Crashed: {crashed_count}\n")
+        log_handle.write(f"  Timeout: {timeout_count}\n")
+        log_handle.write(f"  Skipped: {skipped_count}\n")
+        log_handle.write(f"  Duration: {elapsed:.2f}s\n")
+        log_handle.write(f"{'=' * 80}\n")
+        log_handle.flush()
+
+        print(f"\n{'=' * 80}")
+        print(f"Summary: {total_cases} cases executed")
+        print(f"  Passed: {passed_count}, Failed: {failed_count}, Errors: {error_count}, Crashed: {crashed_count}, Timeout: {timeout_count}")
+        print(f"  Duration: {elapsed:.2f}s")
+        print(f"{'=' * 80}")
+
+    return worst_returncode, elapsed, cases_list
+
+
+def run_tests_with_tasks_concurrent(
+    tasks: List[CaseExecutionTask],
+    shard: int,
+    test_dir: Path,
+    report_dir: Path,
+    env_updates: Dict[str, str],
+    timeout: int,
+    verbose: bool,
+    shard_type: str,
+    max_workers: int,
+    result_module,
+) -> Tuple[int, float, List[Dict]]:
+    """
+    Execute pre-collected test cases with concurrent per-case isolation.
+
+    This function takes CaseExecutionTask objects directly (pre-collected cases)
+    and executes them concurrently without the file-level case collection phase.
+
+    Args:
+        tasks: List of CaseExecutionTask objects (pre-collected cases)
+        shard: Shard number
+        test_dir: PyTorch test directory
+        report_dir: Report output directory
+        env_updates: Environment variable updates
+        timeout: Per-case timeout in seconds
+        verbose: Verbose output
+        shard_type: "distributed" or "regular"
+        max_workers: Maximum concurrent subprocesses
+        result_module: parse_test_results module
+
+    Returns:
+        Tuple of (worst_returncode, duration, cases_list_sorted)
+    """
+    start = monotonic()
+    log_file = result_module.get_shard_log_file(report_dir, shard, shard_type)
+
+    merged_env = os.environ.copy()
+    merged_env.update(env_updates)
+
+    config = ConcurrentExecutionConfig(
+        max_workers=max_workers,
+        per_case_timeout=timeout,
+        verbose=verbose,
+    )
+
+    # Thread-safe result aggregator
+    result_aggregator = ConcurrentResultAggregator()
+
+    # Log queue and writer thread
+    log_queue = Queue()
+    stop_event = threading.Event()
+    log_thread = threading.Thread(
+        target=log_writer_thread,
+        args=(log_queue, log_file, stop_event),
+        daemon=True,
+    )
+
+    # Write log header
+    log_queue.put({
+        "type": "header",
+        "content": (
+            "=" * 80 + "\n"
+            f"Pre-collected cases concurrent execution ({shard_type} shard)\n"
+            "=" * 80 + "\n"
+            f"Total cases: {len(tasks)}\n"
+            f"Max concurrent workers: {max_workers}\n"
+            "Execution mode: concurrent subprocess, each case isolated\n"
+            "=" * 80 + "\n\n"
+        ),
+    })
+
+    log_thread.start()
+
+    print(f"\n{'=' * 80}", flush=True)
+    print(f"Pre-collected cases: {len(tasks)} cases", flush=True)
+    print(f"Execution mode: {max_workers} workers concurrent, each case in subprocess", flush=True)
+    print(f"{'=' * 80}\n", flush=True)
+
+    total_cases = len(tasks)
+    print(f"Phase 1: Executing {total_cases} pre-collected cases...", flush=True)
+
+    # Phase 2: Concurrent execution via ThreadPoolExecutor
+    progress_tracker = ProgressTracker(total_cases)
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        # Submit all tasks
+        future_to_task = {
+            executor.submit(
+                run_single_case_concurrent,
+                task,
+                test_dir,
+                merged_env,
+                config,
+                result_aggregator,
+                progress_tracker,
+                log_queue,
+                report_dir,
+                shard,
+                shard_type,
+            ): task
+            for task in tasks
+        }
+
+        # Wait for completion (as_completed gives results as they finish)
+        for future in as_completed(future_to_task):
+            task = future_to_task[future]
+            try:
+                # Result already collected in aggregator
+                _ = future.result()
+            except Exception as e:
+                # Should never happen (run_single_case_concurrent catches all)
+                # But as safety, create error result
+                case_result = {
+                    "nodeid": task.nodeid,
+                    "status": "error",
+                    "duration": 0.0,
+                    "returncode": 1,
+                    "message": f"Future error: {str(e)[:200]}",
+                    "file": task.test_file,
+                    "case_idx": task.case_idx,
+                }
+                result_aggregator.add_case_result(case_result)
+                progress_tracker.mark_completed(task.nodeid, "error", 0.0)
+
+    # Stop log thread
+    elapsed = monotonic() - start
+    summary = result_aggregator.get_summary()
+
+    log_queue.put({
+        "type": "summary",
+        "content": (
+            f"\n{'=' * 80}\n"
+            f"Summary: {summary['total_cases']} cases executed\n"
+            f"  Passed: {summary['passed_count']}\n"
+            f"  Failed: {summary['failed_count']}\n"
+            f"  Errors: {summary['error_count']}\n"
+            f"  Crashed: {summary['crashed_count']}\n"
+            f"  Timeout: {summary['timeout_count']}\n"
+            f"  Skipped: {summary['skipped_count']}\n"
+            f"  Duration: {elapsed:.2f}s\n"
+            f"  Concurrent workers: {max_workers}\n"
+            f"{'=' * 80}\n"
+        ),
+    })
+
+    stop_event.set()
+    log_thread.join(timeout=5)
+
+    # Print final summary
+    print(f"\n{'=' * 80}", flush=True)
+    print(f"Summary: {summary['total_cases']} cases executed", flush=True)
+    print(f"  Passed: {summary['passed_count']}", flush=True)
+    print(f"  Failed: {summary['failed_count']}", flush=True)
+    print(f"  Errors: {summary['error_count']}", flush=True)
+    print(f"  Crashed: {summary['crashed_count']}", flush=True)
+    print(f"  Timeout: {summary['timeout_count']}", flush=True)
+    print(f"  Skipped: {summary['skipped_count']}", flush=True)
+    print(f"  Duration: {elapsed:.2f}s", flush=True)
+    print(f"{'=' * 80}", flush=True)
+
+    return summary["worst_returncode"], elapsed, result_aggregator.get_sorted_cases()
+
+
+def build_execution_env(
+    test_dir: Path,
+    script_dir: Path,
+    disabled_testcases_file: str,
+    shard: int,
+    shard_type: str,
+) -> Dict[str, str]:
+    """Build environment variables for test execution."""
+    repo_root = test_dir.parent
+    pythonpath_parts = [str(script_dir)]
+
+    torch_path = load_installed_torch_root()
+    if torch_path:
+        pythonpath_parts.append(torch_path)
+
+    pythonpath_parts.extend([str(repo_root), str(test_dir)])
+
+    existing_pythonpath = os.environ.get("PYTHONPATH", "")
+    if existing_pythonpath:
+        pythonpath_parts.append(existing_pythonpath)
+
+    updates = {
+        "PYTHONPATH": os.pathsep.join(pythonpath_parts),
+        "PYTORCH_TEST_NPU": "1",
+        "TORCH_DEVICE_BACKEND_AUTOLOAD": "1",
+        "NO_TD": "1",
+        "PYTHONUNBUFFERED": "1",
+        # Note: Do NOT set CI=true here, as some test files have conditional
+        # test generation logic like:
+        #   if not (IS_CI and torch.cuda.is_available()):
+        #       globals().update(generate_tests(...))
+        # Setting CI=true would prevent test case generation in those files.
+    }
+
+    # Use PyTorch's built-in DISABLED_TESTS_FILE mechanism for skipping test cases
+    if disabled_testcases_file:
+        # The disabled_testcases.json format is similar to .pytorch-disabled-tests.json
+        # Set DISABLED_TESTS_FILE to use PyTorch's built-in skip mechanism
+        updates["DISABLED_TESTS_FILE"] = os.path.abspath(disabled_testcases_file)
+
+    return updates
+
+
+def clean_existing_junit_xml(report_dir: Path) -> None:
+    """Clean existing JUnit XML files."""
+    if not report_dir.exists():
+        return
+    for xml_file in report_dir.rglob("*.xml"):
+        xml_file.unlink(missing_ok=True)
+
 
-            status = result["status"]
-            if status in stats:
-                stats[status] += 1
+def remove_existing_file(path: Path) -> None:
+    """Remove existing file."""
+    path.unlink(missing_ok=True)
+
+
+# ==============================================================================
+# Test Files Input Parser
+# ==============================================================================
+
+
+def parse_test_files_input(test_files_str: str, test_dir: Path) -> List[str]:
+    """
+    Parse comma-separated test file input and return standardized test file paths.
+
+    Args:
+        test_files_str: Comma-separated test file paths (e.g., "test_meta.py,test_nn.py")
+        test_dir: Path to PyTorch test directory
+
+    Returns:
+        List of standardized test file paths (e.g., ["test/test_meta.py", "test/test_nn.py"])
+
+    Raises:
+        FileNotFoundError: If any specified test file does not exist
+    """
+    files = [f.strip() for f in test_files_str.split(",") if f.strip()]
+    result = []
+
+    for f in files:
+        # Normalize path format: ensure starts with "test/"
+        if not f.startswith("test/"):
+            f = "test/" + f
+
+        # Remove leading "test/" prefix if it's duplicated
+        if f.startswith("test/test/"):
+            f = f[5:]
+
+        # Verify file exists
+        full_path = test_dir.parent / f
+        if not full_path.exists():
+            # Try with .py extension if not provided
+            if not f.endswith(".py"):
+                f_with_ext = f + ".py"
+                full_path_with_ext = test_dir.parent / f_with_ext
+                if full_path_with_ext.exists():
+                    f = f_with_ext
+                    full_path = full_path_with_ext
+                else:
+                    raise FileNotFoundError(f"Test file not found: {f} or {f_with_ext}")
             else:
-                stats["unknown"] += 1
+                raise FileNotFoundError(f"Test file not found: {f}")
 
-    # Save results JSON (contains case_id, duration, status, output, error, timestamp)
-    results_file = report_dir_path / f"shard_{shard_index}_results.json"
-    with open(results_file, "w") as f:
-        json.dump({
-            "shard_index": shard_index,
-            "total_shards": total_shards,
-            "total_cases": len(cases),
-            "stats": stats,
-            "results": results,
-        }, f, indent=2)
+        result.append(f)
 
-    print(f"\nShard {shard_index} Results saved to {results_file}")
-    print(f"Statistics: {stats}")
+    return result
 
-    return {"stats": stats, "results_file": str(results_file)}
 
+# ==============================================================================
+# CLI
+# ==============================================================================
 
-def main():
-    parser = argparse.ArgumentParser(description="Run NPU test shard")
-    parser.add_argument("--cases-json", required=True, help="JSON file with test cases")
-    parser.add_argument("--test-dir", required=True, help="PyTorch test directory")
-    parser.add_argument("--report-dir", required=True, help="Directory to save results")
-    parser.add_argument("--timeout", type=int, default=300, help="Timeout per test case (seconds, default 300)")
-    parser.add_argument("--max-workers", type=int, default=1, help="Number of parallel workers")
-    parser.add_argument("--verbose", action="store_true", help="Print verbose output")
 
+def parse_args():
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(
+        description="Run PyTorch NPU tests for a shard via per-case isolation"
+    )
+    parser.add_argument("--test-files", type=str, help="Comma-separated test file paths to run directly (skip shard assignment, e.g., 'test_meta.py,test_nn.py')")
+    parser.add_argument("--cases-json", type=str, help="Path to pre-collected cases JSON file (skip case collection, use test_type from JSON)")
+    parser.add_argument("--shard", type=int, help="Shard number (1-indexed, required if --test-files/--cases-json not set)")
+    parser.add_argument("--num-shards", type=int, help="Total number of shards (required if --test-files/--cases-json not set)")
+    parser.add_argument(
+        "--test-type",
+        type=str,
+        choices=["distributed", "regular"],
+        default="regular",
+        help="Test type (ignored if --test-files is set). distributed uses serial execution, regular uses concurrent.",
+    )
+    parser.add_argument("--test-dir", type=str, required=True, help="Path to PyTorch test directory")
+    parser.add_argument("--disabled-testcases", type=str, help="Path to disabled_testcases.json")
+    parser.add_argument("--case-paths-config", type=str, help="Path to case_paths_ci.yml")
+    parser.add_argument("--report-dir", type=str, default="test-reports", help="Directory for reports")
+    parser.add_argument("--timeout", type=int, default=1200, help="Per-case timeout in seconds (default: 1200 = 20 minutes)")
+    parser.add_argument(
+        "--max-workers",
+        type=int,
+        default=4,
+        help="Maximum concurrent workers for regular tests (default: 4). Each worker runs one pytest subprocess.",
+    )
+    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
     args = parser.parse_args()
 
-    run_shard(
-        cases_json=args.cases_json,
-        test_dir=args.test_dir,
-        report_dir=args.report_dir,
-        timeout=args.timeout,
-        max_workers=args.max_workers,
-        verbose=args.verbose
+    # Validate required arguments based on mode
+    if not args.test_files and not args.cases_json:
+        if not args.shard or not args.num_shards:
+            parser.error("--shard and --num-shards are required when --test-files/--cases-json is not set")
+
+    # Validate max_workers
+    if args.max_workers < 1:
+        parser.error("--max-workers must be at least 1")
+    if args.max_workers > 128:
+        print(f"WARNING: --max-workers={args.max_workers} is very high, may cause resource contention")
+
+    return args
+
+
+def main():
+    """Main entry point."""
+    args = parse_args()
+
+    # Resolve paths
+    test_dir = Path(args.test_dir).resolve()
+    if not test_dir.is_dir():
+        raise FileNotFoundError(f"Test directory not found: {test_dir}")
+
+    repo_root = test_dir.parent
+    script_dir = Path(__file__).resolve().parent
+    report_dir = Path(args.report_dir).resolve()
+    report_dir.mkdir(parents=True, exist_ok=True)
+
+    # Load modules
+    discover_module = load_discover_module(script_dir)
+    result_module = load_parse_test_results_module(script_dir)
+
+    timestamp = datetime.now().isoformat()
+
+    # ==========================================================================
+    # Mode: Direct execution of specified test files
+    # ==========================================================================
+    if args.test_files:
+        print("=" * 80)
+        print("Custom Test Files Execution Mode")
+        print("=" * 80)
+
+        # Parse test files input
+        planned_tests = parse_test_files_input(args.test_files, test_dir)
+
+        # Use fixed shard number for custom mode
+        shard = 1
+        num_shards = 1
+        shard_type = "custom"
+
+        print(f"Test files specified: {len(planned_tests)}")
+        print(f"Test directory: {test_dir}")
+        print(f"Execution mode: concurrent ({args.max_workers} workers, per-case subprocess isolation)")
+        if args.disabled_testcases:
+            disabled_count = result_module.load_disabled_testcases_count(args.disabled_testcases)
+            print(f"Disabled testcase entries: {disabled_count}")
+        print(f"\n{'=' * 80}\n")
+
+        for index, target in enumerate(planned_tests, 1):
+            display_name = strip_test_prefix_and_suffix(target)
+            print(f"  [{index:03d}] {display_name}")
+
+        # Create info dict for custom mode
+        info = result_module.create_shard_info(shard, num_shards, timestamp)
+        info["selection_mode"] = "custom_files"
+        info["shard_type"] = shard_type
+        info["shard_files"] = len(planned_tests)
+        info["total_files"] = len(planned_tests)
+        info["selected_test_files"] = len(planned_tests)
+        if args.disabled_testcases:
+            info["disabled_count"] = result_module.load_disabled_testcases_count(args.disabled_testcases)
+
+        # Save test plan
+        result_module.save_test_plan_file(str(report_dir), shard, planned_tests, shard_type)
+
+        # Clean old files
+        clean_existing_junit_xml(report_dir)
+        remove_existing_file(result_module.get_shard_log_file(report_dir, shard, shard_type))
+
+        # Build execution env
+        env_updates = build_execution_env(
+            test_dir, script_dir, args.disabled_testcases, shard, shard_type
+        )
+
+        # Execute tests (custom mode uses concurrent execution by default)
+        cases_list = []
+        if planned_tests:
+            # Custom mode: concurrent execution for efficiency
+            returncode, duration, cases_list = run_tests_with_concurrent_isolation(
+                planned_tests,
+                shard,
+                test_dir,
+                report_dir,
+                env_updates,
+                args.timeout,
+                args.verbose,
+                shard_type,
+                args.max_workers,
+                result_module,
+            )
+            info["per_case_isolation"] = True
+            info["concurrent_workers"] = args.max_workers
+            info["returncode"] = returncode
+            info["duration"] = duration
+
+        # Build cases.json data
+        passed_count = sum(1 for c in cases_list if c["status"] == "passed")
+        failed_count = sum(1 for c in cases_list if c["status"] == "failed")
+        error_count = sum(1 for c in cases_list if c["status"] == "error")
+        crashed_count = sum(1 for c in cases_list if c["status"] == "crashed")
+        timeout_count = sum(1 for c in cases_list if c["status"] == "timeout")
+        skipped_count = sum(1 for c in cases_list if c["status"] == "skipped")
+
+        cases_data = {
+            "shard": shard,
+            "shard_type": shard_type,
+            "execution_mode": "concurrent",
+            "concurrent_workers": args.max_workers,
+            "total_cases": len(cases_list),
+            "passed": passed_count,
+            "failed": failed_count,
+            "errors": error_count,
+            "skipped": skipped_count,
+            "crashed": crashed_count,
+            "timeout": timeout_count,
+            "duration": duration,
+            "cases": cases_list,
+        }
+
+        # Save cases.json
+        result_module.save_cases_file(str(report_dir), shard, cases_data, shard_type)
+
+        # Save info and stats
+        result_module.save_info_file(str(report_dir), shard, info, shard_type)
+
+        stats = {
+            "total": len(cases_list),
+            "passed": passed_count,
+            "failed": failed_count,
+            "skipped": skipped_count,
+            "errors": error_count,
+            "duration": duration,
+            "returncode": returncode,
+            "per_case_isolation": True,
+        }
+
+        result_module.save_stats_file(str(report_dir), shard, stats, shard_type)
+
+        # Print summary
+        result_module.print_stats_summary(shard, stats, shard_type)
+
+        # Exit with 0 to allow step to succeed and report generation to proceed
+        # The actual test results are recorded in cases.json
+        sys.exit(0)
+
+    # ==========================================================================
+    # Mode: Pre-collected cases JSON execution
+    # ==========================================================================
+    if args.cases_json:
+        print("=" * 80)
+        print("Pre-collected Cases Execution Mode")
+        print("=" * 80)
+
+        cases_file = Path(args.cases_json).resolve()
+        if not cases_file.exists():
+            raise FileNotFoundError(f"Cases JSON file not found: {cases_file}")
+
+        cases_data = json.loads(cases_file.read_text(encoding="utf-8"))
+
+        shard = cases_data["shard"]
+        num_shards = cases_data["num_shards"]
+        shard_type = cases_data.get("test_type", "regular")
+        planned_cases = cases_data["cases"]
+        total_cases = len(planned_cases)
+
+        print(f"Cases JSON: {cases_file}")
+        print(f"Shard: {shard}/{num_shards}")
+        print(f"Test type: {shard_type}")
+        print(f"Total cases: {total_cases}")
+        print(f"Test directory: {test_dir}")
+
+        # Execution mode based on test_type
+        if shard_type == "distributed":
+            print(f"Execution mode: SERIAL (per-case subprocess isolation)")
+        else:
+            print(f"Execution mode: CONCURRENT ({args.max_workers} workers, per-case subprocess isolation)")
+
+        if args.disabled_testcases:
+            disabled_count = result_module.load_disabled_testcases_count(args.disabled_testcases)
+            print(f"Disabled testcase entries: {disabled_count}")
+
+        print(f"\n{'=' * 80}\n")
+
+        # Create info dict for cases-json mode
+        info = result_module.create_shard_info(shard, num_shards, timestamp)
+        info["selection_mode"] = "cases_json"
+        info["shard_type"] = shard_type
+        info["cases_json_file"] = str(cases_file)
+        info["total_cases"] = total_cases
+        info["per_case_isolation"] = True
+        if args.disabled_testcases:
+            info["disabled_count"] = result_module.load_disabled_testcases_count(args.disabled_testcases)
+
+        # Clean old files
+        clean_existing_junit_xml(report_dir)
+        remove_existing_file(result_module.get_shard_log_file(report_dir, shard, shard_type))
+
+        # Build execution env
+        env_updates = build_execution_env(
+            test_dir, script_dir, args.disabled_testcases, shard, shard_type
+        )
+
+        # Convert cases to CaseExecutionTask format
+        tasks = []
+        for i, case in enumerate(planned_cases, 1):
+            tasks.append(CaseExecutionTask(
+                case_idx=i,
+                nodeid=case["nodeid"],
+                test_file=case.get("file", ""),
+                file_idx=0,
+            ))
+
+        # Execute tests based on shard_type
+        cases_list = []
+        if tasks:
+            # Determine execution mode and worker count
+            if shard_type == "distributed":
+                # Distributed: serial execution (1 worker)
+                effective_workers = 1
+                print(f"\nExecution mode: SERIAL (distributed tests require sequential execution)")
+            else:
+                # Regular: concurrent execution
+                effective_workers = args.max_workers
+                print(f"\nExecution mode: CONCURRENT ({effective_workers} workers)")
+
+            # Execute tasks directly using the new function
+            returncode, duration, cases_list = run_tests_with_tasks_concurrent(
+                tasks,
+                shard,
+                test_dir,
+                report_dir,
+                env_updates,
+                args.timeout,
+                args.verbose,
+                shard_type,
+                effective_workers,
+                result_module,
+            )
+            info["execution_mode"] = "serial" if effective_workers == 1 else "concurrent"
+            info["concurrent_workers"] = effective_workers
+
+            info["returncode"] = returncode
+            info["duration"] = duration
+        else:
+            print("No cases to execute.")
+            returncode = 0
+            duration = 0.0
+
+        # Build cases.json data
+        passed_count = sum(1 for c in cases_list if c["status"] == "passed")
+        failed_count = sum(1 for c in cases_list if c["status"] == "failed")
+        error_count = sum(1 for c in cases_list if c["status"] == "error")
+        skipped_count = sum(1 for c in cases_list if c["status"] == "skipped")
+        crashed_count = sum(1 for c in cases_list if c["status"] == "crashed")
+        timeout_count = sum(1 for c in cases_list if c["status"] == "timeout")
+
+        output_cases_data = {
+            "shard": shard,
+            "shard_type": shard_type,
+            "execution_mode": info.get("execution_mode", "unknown"),
+            "concurrent_workers": info.get("concurrent_workers", 1),
+            "total_cases": len(cases_list),
+            "passed": passed_count,
+            "failed": failed_count,
+            "errors": error_count,
+            "skipped": skipped_count,
+            "crashed": crashed_count,
+            "timeout": timeout_count,
+            "duration": duration,
+            "cases": cases_list,
+        }
+
+        # Save cases.json
+        result_module.save_cases_file(str(report_dir), shard, output_cases_data, shard_type)
+
+        # Save info and stats
+        result_module.save_info_file(str(report_dir), shard, info, shard_type)
+
+        stats = {
+            "total": len(cases_list),
+            "passed": passed_count,
+            "failed": failed_count,
+            "skipped": skipped_count,
+            "errors": error_count,
+            "crashed": crashed_count,
+            "timeout": timeout_count,
+            "duration": duration,
+            "returncode": returncode,
+            "per_case_isolation": True,
+        }
+
+        result_module.save_stats_file(str(report_dir), shard, stats, shard_type)
+
+        # Print summary
+        result_module.print_stats_summary(shard, stats, shard_type)
+
+        # Exit with 0 to allow step to succeed and report generation to proceed
+        # The actual test results are recorded in cases.json
+        sys.exit(0)
+
+    # ==========================================================================
+    # Mode: Shard-based execution (original logic)
+    # ==========================================================================
+
+    # Validate shard number
+    if args.shard < 1 or args.shard > args.num_shards:
+        raise ValueError(f"Invalid shard {args.shard}; expected 1 <= shard <= {args.num_shards}")
+
+    shard_type = args.test_type
+    timestamp = datetime.now().isoformat()
+
+    # ==========================================================================
+    # Execute test planning
+    # ==========================================================================
+    plan_result = plan_shard_tests(
+        test_dir=test_dir,
+        shard=args.shard,
+        num_shards=args.num_shards,
+        test_type=shard_type,
+        case_paths_config=args.case_paths_config,
+        discover_module=discover_module,
+    )
+    planned_tests = plan_result.get_planned_tests()
+
+    # ==========================================================================
+    # Create info dict
+    # ==========================================================================
+    info = result_module.create_shard_info(args.shard, args.num_shards, timestamp)
+    info.update(plan_result.to_info_dict())
+    info["shard_type"] = shard_type
+    info["disabled_count"] = result_module.load_disabled_testcases_count(args.disabled_testcases)
+    info["selected_test_files"] = plan_result.discovery.metadata.get("rules_selected", 0)
+    info["excluded_test_files"] = plan_result.discovery.metadata.get("rules_excluded", 0)
+    info["shard_files"] = plan_result.shard_assignment.planned_count
+
+    if args.case_paths_config:
+        info["path_rules_file"] = args.case_paths_config
+
+    # Save test plan
+    result_module.save_test_plan_file(str(report_dir), args.shard, planned_tests, shard_type)
+
+    # Save excluded files (not assigned to this shard)
+    all_selected = plan_result.discovery.test_files
+    excluded_for_shard = [f for f in all_selected if f not in planned_tests]
+    result_module.save_excluded_test_files_file(str(report_dir), args.shard, excluded_for_shard, shard_type)
+
+    # Print summary
+    print(create_test_plan_summary(plan_result))
+    print(f"\nRepository root: {repo_root}")
+    print(f"Test directory: {test_dir}")
+    if shard_type == "distributed":
+        print("Execution mode: SERIAL (per-case subprocess isolation)")
+    else:
+        print(f"Execution mode: CONCURRENT ({args.max_workers} workers, per-case subprocess isolation)")
+    if args.case_paths_config:
+        print(f"Case path rules: {args.case_paths_config}")
+    print(f"Disabled testcase entries: {info['disabled_count']}")
+    print(f"\n{'=' * 80}\n")
+
+    for index, target in enumerate(planned_tests, 1):
+        display_name = strip_test_prefix_and_suffix(target)
+        print(f"  [{index:03d}] {display_name}")
+
+    # Clean old files
+    clean_existing_junit_xml(report_dir)
+    remove_existing_file(result_module.get_shard_log_file(report_dir, args.shard, shard_type))
+
+    # Build execution env
+    env_updates = build_execution_env(
+        test_dir, script_dir, args.disabled_testcases, args.shard, shard_type
     )
 
+    # ==========================================================================
+    # Execute tests - choose execution mode based on shard_type
+    # ==========================================================================
+    # distributed tests: serial execution (each case in own process, one at a time)
+    # regular tests: concurrent execution (max_workers subprocesses simultaneously)
+    cases_list = []
+    if planned_tests:
+        if shard_type == "distributed":
+            # Distributed tests: serial execution for stability
+            print("\nExecution mode: SERIAL (distributed tests require sequential execution)")
+            returncode, duration, cases_list = run_tests_with_case_isolation(
+                planned_tests,
+                args.shard,
+                test_dir,
+                report_dir,
+                env_updates,
+                args.timeout,
+                args.verbose,
+                shard_type,
+                result_module,
+            )
+            info["execution_mode"] = "serial"
+        else:
+            # Regular tests: concurrent execution for efficiency
+            print(f"\nExecution mode: CONCURRENT ({args.max_workers} workers)")
+            returncode, duration, cases_list = run_tests_with_concurrent_isolation(
+                planned_tests,
+                args.shard,
+                test_dir,
+                report_dir,
+                env_updates,
+                args.timeout,
+                args.verbose,
+                shard_type,
+                args.max_workers,
+                result_module,
+            )
+            info["execution_mode"] = "concurrent"
+            info["concurrent_workers"] = args.max_workers
+        info["per_case_isolation"] = True
+    else:
+        print("No test files assigned to this shard after file-level filtering.")
+        returncode = 0
+        duration = 0.0
+
+    # Build cases.json data
+    passed_count = sum(1 for c in cases_list if c["status"] == "passed")
+    failed_count = sum(1 for c in cases_list if c["status"] == "failed")
+    error_count = sum(1 for c in cases_list if c["status"] == "error")
+    crashed_count = sum(1 for c in cases_list if c["status"] == "crashed")
+    timeout_count = sum(1 for c in cases_list if c["status"] == "timeout")
+    skipped_count = sum(1 for c in cases_list if c["status"] == "skipped")
+
+    cases_data = {
+        "shard": args.shard,
+        "shard_type": shard_type,
+        "execution_mode": info.get("execution_mode", "serial"),
+        "concurrent_workers": info.get("concurrent_workers", 1),
+        "total_cases": len(cases_list),
+        "passed": passed_count,
+        "failed": failed_count,
+        "errors": error_count,
+        "skipped": skipped_count,
+        "crashed": crashed_count,
+        "timeout": timeout_count,
+        "duration": duration,
+        "cases": cases_list,
+    }
+
+    # Save cases.json
+    result_module.save_cases_file(str(report_dir), args.shard, cases_data, shard_type)
+
+    # ==========================================================================
+    # Generate reports
+    # ==========================================================================
+    stats = {
+        "total": len(cases_list),
+        "passed": passed_count,
+        "failed": failed_count,
+        "skipped": skipped_count,
+        "errors": error_count,
+        "duration": duration,
+        "returncode": returncode,
+        "per_case_isolation": True,
+    }
+
+    result_module.save_info_file(str(report_dir), args.shard, info, shard_type)
+    result_module.save_stats_file(str(report_dir), args.shard, stats, shard_type)
+    result_module.print_stats_summary(args.shard, stats, shard_type)
+
+    # Exit with 0 to allow step to succeed and report generation to proceed
+    # The actual test results are recorded in cases.json
+    sys.exit(0)
+
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
diff --git a/.github/workflows/_torch-npu-upstream-build.yml b/.github/workflows/_torch-npu-upstream-build.yml
new file mode 100644
index 0000000000..e34fcce8d5
--- /dev/null
+++ b/.github/workflows/_torch-npu-upstream-build.yml
@@ -0,0 +1,417 @@
+name: Build PyTorch and torch_npu (with cache)
+
+on:
+  workflow_call:
+    inputs:
+      pytorch_ref:
+        required: true
+        type: string
+        description: 'PyTorch branch, tag, or commit SHA to build'
+      torch_npu_ref:
+        required: true
+        type: string
+        description: 'torch_npu branch, tag, or commit SHA to build'
+      python_version:
+        required: true
+        type: string
+        default: '3.11'
+      docker_image:
+        required: true
+        type: string
+        default: 'quay.io/kerer/pytorch:manylinux-cann9.0.0-beta.2-20260428'
+        description: 'Docker image URL to use for build'
+    outputs:
+      docker-image:
+        description: 'Full Docker image URL'
+        value: ${{ inputs.docker_image }}
+      torch-wheel:
+        description: 'PyTorch wheel artifact name'
+        value: 'torch-wheel-main'
+      torch-npu-wheel:
+        description: 'torch_npu wheel artifact name'
+        value: 'torch-npu-wheel-main'
+      pytorch-src:
+        description: 'PyTorch source and test code artifact name'
+        value: 'pytorch-src-main'
+
+env:
+  # 缓存版本号，当需要强制刷新缓存时修改此值
+  CACHE_VERSION: 'v2'
+  # GitHub 代理 URL（用于加速 git clone，留空则不使用代理）
+  GH_PROXY_URL: 'https://gh-proxy.test.osinfra.cn'
+  # PyPI 缓存 URL（用于加速 pip 下载）
+  PYPI_CACHE_URL: 'http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple'
+
+jobs:
+  build:
+    runs-on: linux-aarch64-a3-16
+    timeout-minutes: 240
+
+    container:
+      image: ${{ inputs.docker_image }}
+      options: --user root
+
+    steps:
+      - name: Display Docker image
+        run: |
+          echo "Using Docker image: ${{ inputs.docker_image }}"
+
+      - name: Setup CANN environment
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+      - name: Configure git proxy for faster clone
+        run: |
+          # 配置 git URL rewrite 来使用代理（加速 clone 和 submodules）
+          if [ -n "${{ env.GH_PROXY_URL }}" ]; then
+            git config --global url."${{ env.GH_PROXY_URL }}/https://github.com/".insteadOf "https://github.com/"
+            git config --global url."${{ env.GH_PROXY_URL }}/https://gitlab.com/".insteadOf "https://gitlab.com/"
+            echo "Git proxy configured:"
+            git config --global --list | grep url
+          else
+            echo "No proxy configured, using direct connection"
+          fi
+
+      - name: Clone upstream PyTorch with submodules
+        id: clone_pytorch
+        run: |
+          # 使用代理加速 git clone（如果配置了 GH_PROXY_URL）
+          PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
+          if [ -n "${{ env.GH_PROXY_URL }}" ]; then
+            PYTORCH_REPO="${{ env.GH_PROXY_URL }}/${PYTORCH_REPO}"
+            echo "Using proxy: ${PYTORCH_REPO}"
+          fi
+
+          # 克隆指定 ref（branch, tag, 或 commit）
+          PYTORCH_REF="${{ inputs.pytorch_ref }}"
+          echo "Cloning PyTorch with ref: ${PYTORCH_REF}"
+
+          # 先浅克隆，再 fetch 指定 ref，最后 checkout
+          git clone --depth=1 "${PYTORCH_REPO}" pytorch-src
+          cd pytorch-src
+          git fetch --depth=1 origin "${PYTORCH_REF}"
+          git checkout "${PYTORCH_REF}"
+
+          # 初始化 submodules
+          git submodule update --init --recursive
+
+          PYTORCH_SHA=$(git rev-parse HEAD)
+          PYTORCH_SHA_SHORT=$(git rev-parse --short HEAD)
+          echo "pytorch_sha=${PYTORCH_SHA}" >> $GITHUB_OUTPUT
+          echo "pytorch_sha_short=${PYTORCH_SHA_SHORT}" >> $GITHUB_OUTPUT
+          echo "Cloned PyTorch commit: ${PYTORCH_SHA}"
+          echo "Submodules downloaded:"
+          ls -la third_party/ | head -20
+
+      - name: Checkout torch_npu
+        uses: actions/checkout@v4
+        with:
+          repository: kerer-ai/pytorch
+          ref: ${{ inputs.torch_npu_ref }}
+          path: torch_npu-src
+          submodules: recursive
+
+      # ==================== pip 缓存配置 ====================
+      # pip 缓存加速依赖下载，不影响构建结果
+      # 缓存键基于 requirements-build.txt hash（依赖变化频率低）
+      - name: Get pip cache key
+        id: pip_key
+        run: |
+          REQUIREMENTS_HASH=$(cd pytorch-src && sha256sum requirements-build.txt | cut -d' ' -f1)
+          echo "cache_key=${{ env.CACHE_VERSION }}-pip-${{ inputs.python_version }}-${REQUIREMENTS_HASH}" >> $GITHUB_OUTPUT
+
+      - name: Restore pip cache
+        uses: actions/cache/restore@v4
+        with:
+          path: /root/.cache/pip
+          key: ${{ steps.pip_key.outputs.cache_key }}
+          restore-keys: |
+            ${{ env.CACHE_VERSION }}-pip-${{ inputs.python_version }}-
+            ${{ env.CACHE_VERSION }}-pip-
+
+      - name: Setup pip cache directory
+        run: |
+          mkdir -p /root/.cache/pip
+
+      - name: Configure pip index URL
+        run: |
+          # 配置 pip 使用 PyPI 缓存加速下载
+          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
+            pip${{ inputs.python_version }} config set global.index-url ${{ env.PYPI_CACHE_URL }}
+            pip${{ inputs.python_version }} config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local"
+            echo "pip index-url configured: ${{ env.PYPI_CACHE_URL }}"
+          else
+            echo "No PyPI cache URL configured, using default"
+          fi
+
+      - name: Upgrade pip and setuptools
+        run: |
+          # 先升级 pip 和 setuptools，避免旧版包兼容性问题
+          pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
+
+      # ==================== ccache 缓存配置 ====================
+      # ccache 是真正加速编译的关键（可节省 30-60 分钟）
+      # 注意：PyTorch 每次 clone 都是新 commit，所以缓存键不包含 PyTorch SHA
+      # 我们依赖 torch_npu SHA 和 requirements-build.txt hash 作为缓存键
+      - name: Get ccache key
+        id: ccache_key
+        run: |
+          # ccache 缓存键：torch_npu SHA + requirements hash
+          # PyTorch SHA 每次都变化（--depth=1 clone 最新），所以不包含在缓存键中
+          TORCH_NPU_SHA=$(cd torch_npu-src && git rev-parse HEAD)
+          REQUIREMENTS_HASH=$(cd pytorch-src && sha256sum requirements-build.txt | cut -d' ' -f1)
+          echo "cache_key=${{ env.CACHE_VERSION }}-ccache-${REQUIREMENTS_HASH}-${TORCH_NPU_SHA}" >> $GITHUB_OUTPUT
+          # partial_key 用于恢复同版本 requirements 的缓存（不同 torch_npu 版本）
+          echo "partial_key=${{ env.CACHE_VERSION }}-ccache-${REQUIREMENTS_HASH}-" >> $GITHUB_OUTPUT
+          # base_key 用于恢复同 CACHE_VERSION 的所有缓存
+          echo "base_key=${{ env.CACHE_VERSION }}-ccache-" >> $GITHUB_OUTPUT
+
+      - name: Restore ccache
+        uses: actions/cache/restore@v4
+        with:
+          path: /root/.cache/ccache
+          key: ${{ steps.ccache_key.outputs.cache_key }}
+          restore-keys: |
+            ${{ steps.ccache_key.outputs.partial_key }}
+            ${{ steps.ccache_key.outputs.base_key }}
+
+      - name: Setup ccache
+        run: |
+          # 安装 ccache（manylinux 镜像没有预装）
+          yum install -y ccache
+
+          # 创建 ccache 配置目录（使用绝对路径）
+          CCACHE_DIR_PATH="/root/.cache/ccache"
+          mkdir -p "$CCACHE_DIR_PATH"
+
+          # 直接写入配置文件（使用绝对路径）
+          cat > "$CCACHE_DIR_PATH/ccache.conf" << 'EOF'
+          max_size = 20G
+          cache_dir = /root/.cache/ccache
+          compression = true
+          compression_level = 6
+          EOF
+
+          # 使用符号链接方式让 ccache 模拟 gcc/g++
+          mkdir -p /usr/local/bin
+          ln -sf /usr/bin/ccache /usr/local/bin/gcc
+          ln -sf /usr/bin/ccache /usr/local/bin/g++
+          ln -sf /usr/bin/ccache /usr/local/bin/cc
+          ln -sf /usr/bin/ccache /usr/local/bin/c++
+
+          # 设置 PATH 优先使用符号链接
+          echo "PATH=/usr/local/bin:$PATH" >> $GITHUB_ENV
+
+          # 设置 CCACHE_DIR（使用绝对路径，不使用 ~）
+          echo "CCACHE_DIR=$CCACHE_DIR_PATH" >> $GITHUB_ENV
+
+          # 设置编译器环境变量，确保 CMake/Ninja 使用 ccache
+          echo "CC=/usr/local/bin/gcc" >> $GITHUB_ENV
+          echo "CXX=/usr/local/bin/g++" >> $GITHUB_ENV
+
+          echo "=== ccache Configuration ==="
+          CCACHE_DIR="$CCACHE_DIR_PATH" ccache --show-config
+
+          echo ""
+          echo "=== Config File Contents ==="
+          cat "$CCACHE_DIR_PATH/ccache.conf"
+
+          echo ""
+          echo "=== Cache Directory ==="
+          ls -la "$CCACHE_DIR_PATH/"
+
+          echo ""
+          echo "=== Symbolic Links ==="
+          ls -la /usr/local/bin/gcc /usr/local/bin/g++
+
+          echo ""
+          echo "=== ccache Statistics (before build) ==="
+          CCACHE_DIR="$CCACHE_DIR_PATH" ccache --show-stats
+
+      # ==================== 构建 PyTorch ====================
+      - name: Build PyTorch wheel
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+
+          cd pytorch-src
+
+          # 安装构建依赖（pip 缓存已恢复，加速下载）
+          pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
+          pip${{ inputs.python_version }} install -r requirements-build.txt
+
+          # 设置构建环境变量
+          export MAX_JOBS=128
+          export USE_CUDA=0
+          export USE_CUDNN=0
+          export USE_DISTRIBUTED=1
+          export CMAKE_BUILD_TYPE=Release
+          export USE_OPENMP=1
+          export USE_MKLDNN=0
+
+          # 确保使用 ccache（CMake 会检测 CC/CXX 环境变量）
+          export CC=/usr/local/bin/gcc
+          export CXX=/usr/local/bin/g++
+          export CCACHE_DIR=/root/.cache/ccache
+
+          # 清除 ccache 统计（开始新的构建）
+          ccache --zero-stats
+
+          python${{ inputs.python_version }} setup.py build bdist_wheel
+
+          echo "PyTorch wheel built:"
+          ls -la dist/
+
+          echo ""
+          echo "=== ccache Statistics (after PyTorch build) ==="
+          ccache --show-stats
+
+      # ==================== 构建 torch_npu ====================
+      - name: Install PyTorch wheel and build dependencies
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          echo "=== Installing built PyTorch wheel ==="
+          pip${{ inputs.python_version }} install pytorch-src/dist/*.whl
+
+          echo ""
+          echo "=== Verifying PyTorch installation ==="
+          python${{ inputs.python_version }} -c "import torch; print(f'torch version: {torch.__version__}')"
+
+          echo ""
+          echo "=== Installing torch_npu build dependencies ==="
+          pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
+          pip${{ inputs.python_version }} install cmake ninja numpy packaging pyyaml requests six typing-extensions
+
+          cd torch_npu-src
+
+          # 显示 ccache 统计（依赖安装阶段）
+          echo ""
+          echo "=== ccache Statistics (before torch_npu build) ==="
+          CCACHE_DIR=/root/.cache/ccache ccache --show-stats
+
+      - name: Build torch_npu wheel
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          cd torch_npu-src
+
+          export MAX_JOBS=128
+
+          # 确保使用 ccache
+          export CC=/usr/local/bin/gcc
+          export CXX=/usr/local/bin/g++
+          export CCACHE_DIR=/root/.cache/ccache
+
+          # 禁用 torchair 构建（上游 PyTorch main API 变化导致兼容性问题）
+          bash ci/build.sh --python=${{ inputs.python_version }} --disable_torchair
+
+          echo "torch_npu wheel built:"
+          ls -la dist/
+
+          echo ""
+          echo "=== ccache Statistics (after torch_npu build) ==="
+          ccache --show-stats
+
+      # ==================== 保存缓存 ====================
+      - name: Save pip cache
+        if: always()
+        uses: actions/cache/save@v4
+        with:
+          path: /root/.cache/pip
+          key: ${{ steps.pip_key.outputs.cache_key }}
+
+      - name: Save ccache
+        if: always()
+        uses: actions/cache/save@v4
+        with:
+          path: /root/.cache/ccache
+          key: ${{ steps.ccache_key.outputs.cache_key }}
+
+      - name: Display cache save status
+        if: always()
+        run: |
+          echo "=== Cache Saved ==="
+          echo "pip cache key: ${{ steps.pip_key.outputs.cache_key }}"
+          PIP_CACHE_SIZE=$(du -sh /root/.cache/pip 2>/dev/null | cut -f1)
+          echo "pip cache size: ${PIP_CACHE_SIZE}"
+          echo ""
+          echo "ccache key: ${{ steps.ccache_key.outputs.cache_key }}"
+          CCACHE_SIZE=$(du -sh /root/.cache/ccache 2>/dev/null | cut -f1)
+          echo "ccache size: ${CCACHE_SIZE}"
+
+      # ==================== 打包和上传 ====================
+      - name: Package PyTorch source and build artifacts
+        run: |
+          # 打包整个 pytorch-src 目录（包含测试源码和编译产物）
+          # 排除不必要的文件以减小体积：
+          # - .git 目录（最占空间）
+          # - build/ 目录中的编译中间产物（CMakeFiles, .o 文件等）
+          # - dist/*.whl（已单独上传为 artifact）
+
+          echo "=== PyTorch source directory size ==="
+          du -sh pytorch-src/
+
+          echo ""
+          echo "=== Build artifacts location ==="
+          ls -la pytorch-src/build/lib.*/torch/*.so 2>/dev/null | head -5 || echo "No .so files found in build/lib"
+          ls -la pytorch-src/torch/_C.so 2>/dev/null || echo "No _C.so in torch/"
+
+          echo ""
+          echo "=== Creating archive (excluding large unnecessary files) ==="
+          tar -czf pytorch-src.tar.gz \
+            --exclude='pytorch-src/.git' \
+            --exclude='pytorch-src/build/CMakeFiles' \
+            --exclude='pytorch-src/build/*.o' \
+            --exclude='pytorch-src/build/**/*.o' \
+            --exclude='pytorch-src/dist/*.whl' \
+            pytorch-src
+
+          echo ""
+          echo "=== Archive size ==="
+          ls -lh pytorch-src.tar.gz
+
+      - name: Upload PyTorch wheel
+        uses: actions/upload-artifact@v4
+        with:
+          name: torch-wheel-main
+          path: pytorch-src/dist/*.whl
+          retention-days: 7
+
+      - name: Upload torch_npu wheel
+        uses: actions/upload-artifact@v4
+        with:
+          name: torch-npu-wheel-main
+          path: torch_npu-src/dist/*.whl
+          retention-days: 7
+
+      - name: Upload PyTorch source and build artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: pytorch-src-main
+          path: pytorch-src.tar.gz
+          retention-days: 7
+
+      # ==================== 长期制品保存 ====================
+      - name: Upload torch wheel (long-term)
+        uses: actions/upload-artifact@v4
+        with:
+          name: torch-wheel-${{ inputs.pytorch_ref }}
+          path: pytorch-src/dist/*.whl
+          retention-days: 30
+
+      - name: Upload torch_npu wheel (long-term)
+        uses: actions/upload-artifact@v4
+        with:
+          name: torch-npu-wheel-${{ inputs.pytorch_ref }}
+          path: torch_npu-src/dist/*.whl
+          retention-days: 30
+
+      - name: Upload PyTorch source (long-term)
+        uses: actions/upload-artifact@v4
+        with:
+          name: pytorch-src-${{ inputs.pytorch_ref }}
+          path: pytorch-src.tar.gz
+          retention-days: 30
\ No newline at end of file
diff --git a/.github/workflows/_torch-npu-upstream-collect.yml b/.github/workflows/_torch-npu-upstream-collect.yml
new file mode 100644
index 0000000000..ede2996862
--- /dev/null
+++ b/.github/workflows/_torch-npu-upstream-collect.yml
@@ -0,0 +1,246 @@
+name: Torch NPU Upstream Collect
+
+on:
+  workflow_call:
+    inputs:
+      python_version:
+        required: true
+        type: string
+        description: Python version to use
+      torch_wheel_artifact:
+        required: true
+        type: string
+        description: Name of the torch wheel artifact from build
+      torch_npu_wheel_artifact:
+        required: true
+        type: string
+        description: Name of the torch_npu wheel artifact from build
+      pytorch_src_artifact:
+        required: true
+        type: string
+        description: Name of the pytorch source artifact from build
+      docker_image:
+        required: true
+        type: string
+        description: Docker image to use
+      distributed_shards:
+        required: false
+        type: string
+        default: '2'
+        description: Number of shards for distributed tests
+      regular_shards:
+        required: false
+        type: string
+        default: '5'
+        description: Number of shards for regular tests
+    outputs:
+      distributed_matrix:
+        description: Distributed shard matrix JSON
+        value: ${{ jobs.collect.outputs.distributed_matrix }}
+      regular_matrix:
+        description: Regular shard matrix JSON
+        value: ${{ jobs.collect.outputs.regular_matrix }}
+      distributed_shards:
+        description: Number of distributed shards
+        value: ${{ jobs.collect.outputs.distributed_shards }}
+      regular_shards:
+        description: Number of regular shards
+        value: ${{ jobs.collect.outputs.regular_shards }}
+      total_cases:
+        description: Total number of test cases
+        value: ${{ jobs.collect.outputs.total_cases }}
+
+jobs:
+  collect:
+    runs-on: linux-aarch64-a3-16
+    timeout-minutes: 60
+    container:
+      image: ${{ inputs.docker_image }}
+      options: --user root
+    outputs:
+      distributed_matrix: ${{ steps.collect_and_shard.outputs.distributed_matrix }}
+      regular_matrix: ${{ steps.collect_and_shard.outputs.regular_matrix }}
+      distributed_shards: ${{ steps.collect_and_shard.outputs.distributed_shards }}
+      regular_shards: ${{ steps.collect_and_shard.outputs.regular_shards }}
+      total_cases: ${{ steps.collect_and_shard.outputs.total_cases }}
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          repository: kerer-ai/pytorch
+          ref: v2.7.1_dev
+          fetch-depth: 1
+          path: ascend_pytorch
+
+      - name: Setup cache directories
+        run: |
+          mkdir -p /github/home/.cache/pip
+          chmod -R 777 /github/home/.cache
+
+      - name: Cache pip
+        uses: actions/cache@v4
+        with:
+          path: /github/home/.cache/pip
+          key: pip-arm-collect-py${{ inputs.python_version }}
+          restore-keys: |
+            pip-arm-collect-py${{ inputs.python_version }}-
+            pip-arm-collect-
+
+      - name: Download built torch wheel
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ inputs.torch_wheel_artifact }}
+          path: torch-wheel-artifact
+
+      - name: Download built torch_npu wheel
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ inputs.torch_npu_wheel_artifact }}
+          path: torch-npu-wheel-artifact
+
+      - name: Download PyTorch source and test code
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ inputs.pytorch_src_artifact }}
+          path: pytorch-src-artifact
+
+      - name: Extract PyTorch source
+        run: |
+          tar -xzf pytorch-src-artifact/pytorch-src.tar.gz
+
+      - name: Install built PyTorch and torch_npu
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          PIP=pip${{ inputs.python_version }}
+          PYTHON=python${{ inputs.python_version }}
+          export PIP_CACHE_DIR=/github/home/.cache/pip
+
+          $PIP install --upgrade pip
+
+          # Install built torch wheel
+          TORCH_WHL=$(ls torch-wheel-artifact/*.whl | head -1)
+          $PIP install "${TORCH_WHL}"
+
+          # Install built torch_npu wheel
+          TORCH_NPU_WHL=$(ls torch-npu-wheel-artifact/*.whl | head -1)
+          $PIP install "${TORCH_NPU_WHL}"
+
+          echo "Installed PyTorch and torch_npu from built wheels"
+          echo "torch: ${TORCH_WHL}"
+          echo "torch_npu: ${TORCH_NPU_WHL}"
+
+      - name: Verify NPU availability
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          PYTHON=python${{ inputs.python_version }}
+          $PYTHON -c "
+          import torch
+          print(f'torch: {torch.__version__}')
+          import torch_npu
+          print(f'torch_npu: {torch_npu.__version__}')
+          print(f'NPU available: {torch.npu.is_available()}')
+          print(f'NPU count: {torch.npu.device_count()}')
+          "
+
+      - name: Install test dependencies
+        run: |
+          PIP=pip${{ inputs.python_version }}
+          cd pytorch-src
+
+          $PIP install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
+          $PIP install pytest-rerunfailures pytest-flakefinder
+          $PIP install 'pytest-subtests==0.13.1' 'xdoctest==1.1.0' 'pulp>=2.9'
+
+          if [ -f requirements.txt ]; then
+            $PIP install -r requirements.txt || true
+          fi
+
+      - name: Collect all test cases and shard
+        id: collect_and_shard
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          PYTHON=python${{ inputs.python_version }}
+          cd pytorch-src
+
+          # Case-level sharding
+          DISTRIBUTED_SHARDS='${{ inputs.distributed_shards }}'
+          REGULAR_SHARDS='${{ inputs.regular_shards }}'
+
+          echo "=== Collecting all test cases ==="
+          echo "Distributed shards: ${DISTRIBUTED_SHARDS}"
+          echo "Regular shards: ${REGULAR_SHARDS}"
+
+          $PYTHON ../ascend_pytorch/.github/scripts/collect_all_cases.py \
+            --test-dir test \
+            --case-paths-config test_upstream/case_paths_ci.yml \
+            --distributed-shards ${DISTRIBUTED_SHARDS} \
+            --regular-shards ${REGULAR_SHARDS} \
+            --output-dir cases_shards \
+            --error-log-dir collection_errors \
+            --parallel 16 \
+            2>&1 | tee /tmp/collect_cases.log
+
+          # Verify output
+          echo "=== Generated shard files ==="
+          ls -la cases_shards/
+
+          echo "=== Collection summary ==="
+          cat cases_shards/cases_collection_summary.json
+
+          # Extract total cases from summary
+          TOTAL_CASES=$(python3 -c "import json; d=json.load(open('cases_shards/cases_collection_summary.json')); print(d['total_cases'])")
+
+          # Build shard matrices
+          DIST_SHARDS=$(seq 1 ${DISTRIBUTED_SHARDS} | tr '\n' ',' | sed 's/,$//')
+          REG_SHARDS=$(seq 1 ${REGULAR_SHARDS} | tr '\n' ',' | sed 's/,$//')
+
+          echo "distributed_matrix=[${DIST_SHARDS}]" >> $GITHUB_OUTPUT
+          echo "distributed_shards=${DISTRIBUTED_SHARDS}" >> $GITHUB_OUTPUT
+          echo "regular_matrix=[${REG_SHARDS}]" >> $GITHUB_OUTPUT
+          echo "regular_shards=${REGULAR_SHARDS}" >> $GITHUB_OUTPUT
+          echo "total_cases=${TOTAL_CASES}" >> $GITHUB_OUTPUT
+
+          echo "=== Shard configuration ==="
+          echo "Distributed tests: ${DISTRIBUTED_SHARDS} shards (case-level, serial execution, linux-aarch64-a3-16)"
+          echo "Regular tests: ${REGULAR_SHARDS} shards (case-level, 64 workers, linux-aarch64-a3-16)"
+          echo "Total cases: ${TOTAL_CASES}"
+
+          # Package error logs if any
+          if [ -d "collection_errors" ] && [ "$(ls -A collection_errors 2>/dev/null)" ]; then
+            echo "=== Packaging collection error logs ==="
+            tar -czf collection_errors.tar.gz collection_errors/
+            echo "Error logs packaged: collection_errors.tar.gz"
+            ls -la collection_errors.tar.gz
+          fi
+
+      - name: Upload cases shard JSONs
+        uses: actions/upload-artifact@v4
+        with:
+          name: cases-shards
+          path: pytorch-src/cases_shards/
+          retention-days: 7
+
+      - name: Upload collection error logs
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: collection-error-logs
+          path: pytorch-src/collection_errors.tar.gz
+          if-no-files-found: ignore
+          retention-days: 30
+
+      - name: Upload collect logs
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: collect-cases-logs
+          path: /tmp/collect_cases.log
+          if-no-files-found: warn
+          retention-days: 30
\ No newline at end of file
diff --git a/.github/workflows/_torch-npu-upstream-prepare.yml b/.github/workflows/_torch-npu-upstream-prepare.yml
new file mode 100644
index 0000000000..69210a5570
--- /dev/null
+++ b/.github/workflows/_torch-npu-upstream-prepare.yml
@@ -0,0 +1,49 @@
+name: Torch NPU Upstream Prepare
+
+on:
+  workflow_call:
+    inputs:
+      pytorch_version:
+        required: true
+        type: string
+        description: PyTorch version to clone for test source
+      prepared_test_src_artifact:
+        required: true
+        type: string
+        description: Name of the artifact for prepared test source
+    
+jobs:
+  prepare:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          repository: kerer-ai/pytorch
+          ref: v2.7.1_dev
+          fetch-depth: 1
+
+      - name: Clone PyTorch v${{ inputs.pytorch_version }} (for test source)
+        run: |
+          git clone --depth=1 --branch v${{ inputs.pytorch_version }} \
+            https://github.com/pytorch/pytorch.git pytorch-test-src
+
+      - name: Package prepared test source
+        run: |
+          tar -czf pytorch-test-src.tar.gz pytorch-test-src
+
+      - name: Upload prepared test source
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ inputs.prepared_test_src_artifact }}
+          path: pytorch-test-src.tar.gz
+          retention-days: 7
+
+      - name: Upload prepare logs
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: prepare-logs
+          path: /tmp/*.log
+          if-no-files-found: warn
+          retention-days: 30
\ No newline at end of file
diff --git a/.github/workflows/_torch-npu-upstream-report.yml b/.github/workflows/_torch-npu-upstream-report.yml
new file mode 100644
index 0000000000..db88c80775
--- /dev/null
+++ b/.github/workflows/_torch-npu-upstream-report.yml
@@ -0,0 +1,106 @@
+name: Torch NPU Upstream Report
+
+on:
+  workflow_call:
+    inputs:
+      python_version:
+        required: true
+        type: string
+        description: Python version to use
+      torch_npu_wheel_name:
+        required: false
+        type: string
+        default: 'source-build.whl'
+        description: Name of the torch_npu wheel file
+      docker_image:
+        required: true
+        type: string
+        description: Docker image used for tests
+      distributed_matrix:
+        required: false
+        type: string
+        default: '[]'
+        description: Distributed shard matrix JSON
+      regular_matrix:
+        required: false
+        type: string
+        default: '[]'
+        description: Regular shard matrix JSON
+
+jobs:
+  generate_report:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          repository: kerer-ai/pytorch
+          ref: v2.7.1_dev
+          fetch-depth: 1
+
+      - name: Setup Python ${{ inputs.python_version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ inputs.python_version }}
+
+      - name: Download distributed shard reports
+        uses: actions/download-artifact@v4
+        with:
+          pattern: test-reports-dist-*
+          path: all-test-reports
+          merge-multiple: true
+
+      - name: Download regular shard reports
+        uses: actions/download-artifact@v4
+        with:
+          pattern: test-reports-reg-*
+          path: all-test-reports
+          merge-multiple: true
+
+      - name: Download custom test reports
+        uses: actions/download-artifact@v4
+        with:
+          name: test-reports-custom
+          path: all-test-reports
+          merge-multiple: true
+        continue-on-error: true
+
+      - name: Download cases collection summary
+        uses: actions/download-artifact@v4
+        with:
+          name: cases-shards
+          path: cases-shards
+        continue-on-error: true
+
+      - name: Generate consolidated summary
+        run: |
+          PYTHON=python
+          REPORT_MD=npu-full-test-summary.md
+          REPORT_JSON=npu-full-test-summary.json
+
+          # Combine both shard matrices for reporting
+          DIST_MATRIX='${{ inputs.distributed_matrix }}'
+          REG_MATRIX='${{ inputs.regular_matrix }}'
+          COMBINED_MATRIX=$(python3 -c "import sys,json; dist=json.loads('${DIST_MATRIX}'); reg=json.loads('${REG_MATRIX}'); print(json.dumps(['dist-'+str(s) for s in dist]+['reg-'+str(s) for s in reg]))")
+
+          $PYTHON .github/scripts/generate_npu_full_test_report.py \
+            --reports-root all-test-reports \
+            --output-markdown ${REPORT_MD} \
+            --output-json ${REPORT_JSON} \
+            --torch-npu-whl "${{ inputs.torch_npu_wheel_name }}" \
+            --shard-matrix-json "${COMBINED_MATRIX}" \
+            --docker-image "${{ inputs.docker_image }}" \
+            --runner "linux-aarch64-a3-16 (distributed, serial), linux-aarch64-a3-16 (regular, 64 workers), linux-aarch64-a3-8 (custom)" \
+            --cases-summary cases-shards/cases_collection_summary.json
+
+          cat ${REPORT_MD} >> $GITHUB_STEP_SUMMARY
+
+      - name: Upload consolidated summary
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: npu-full-test-summary
+          path: |
+            npu-full-test-summary.md
+            npu-full-test-summary.json
+          retention-days: 30
\ No newline at end of file
diff --git a/.github/workflows/_torch-npu-upstream-test-custom.yml b/.github/workflows/_torch-npu-upstream-test-custom.yml
new file mode 100644
index 0000000000..f6bf949870
--- /dev/null
+++ b/.github/workflows/_torch-npu-upstream-test-custom.yml
@@ -0,0 +1,170 @@
+name: Torch NPU Upstream Test Custom
+
+on:
+  workflow_call:
+    inputs:
+      python_version:
+        required: true
+        type: string
+        description: Python version to use
+      torch_wheel_artifact:
+        required: true
+        type: string
+        description: Name of the torch wheel artifact from build
+      torch_npu_wheel_artifact:
+        required: true
+        type: string
+        description: Name of the torch_npu wheel artifact from build
+      pytorch_src_artifact:
+        required: true
+        type: string
+        description: Name of the pytorch source artifact from build
+      docker_image:
+        required: true
+        type: string
+        description: Docker image to use
+      test_files:
+        required: true
+        type: string
+        description: Test files to run (comma-separated)
+
+jobs:
+  run_tests:
+    runs-on: linux-aarch64-a3-16
+    timeout-minutes: 1200
+    container:
+      image: ${{ inputs.docker_image }}
+      options: --user root
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          repository: kerer-ai/pytorch
+          ref: v2.7.1_dev
+          fetch-depth: 1
+          path: ascend_pytorch
+
+      - name: Setup cache directories
+        run: |
+          mkdir -p /github/home/.cache/pip
+
+      - name: Download built torch wheel
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ inputs.torch_wheel_artifact }}
+          path: torch-wheel-artifact
+
+      - name: Download built torch_npu wheel
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ inputs.torch_npu_wheel_artifact }}
+          path: torch-npu-wheel-artifact
+
+      - name: Download PyTorch source and test code
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ inputs.pytorch_src_artifact }}
+          path: pytorch-src-artifact
+
+      - name: Extract PyTorch source
+        run: |
+          tar -xzf pytorch-src-artifact/pytorch-src.tar.gz
+
+      - name: Install built PyTorch and torch_npu
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          PIP=pip${{ inputs.python_version }}
+          export PIP_CACHE_DIR=/github/home/.cache/pip
+
+          $PIP install --upgrade pip
+
+          # Install built torch wheel
+          TORCH_WHL=$(ls torch-wheel-artifact/*.whl | head -1)
+          $PIP install "${TORCH_WHL}"
+
+          # Install built torch_npu wheel
+          TORCH_NPU_WHL=$(ls torch-npu-wheel-artifact/*.whl | head -1)
+          $PIP install "${TORCH_NPU_WHL}"
+
+          echo "Installed PyTorch and torch_npu from built wheels"
+
+      - name: Verify NPU availability
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          python${{ inputs.python_version }} - <<'PY'
+          import torch
+          import torch_npu
+          print(f'torch: {torch.__version__}')
+          print(f'torch_npu: {torch_npu.__version__}')
+          print(f'NPU available: {torch.npu.is_available()}')
+          print(f'NPU count: {torch.npu.device_count()}')
+          PY
+
+      - name: Install test dependencies
+        run: |
+          pip${{ inputs.python_version }} install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
+          pip${{ inputs.python_version }} install pytest-rerunfailures pytest-flakefinder pytest-subtests==0.13.1 xdoctest==1.1.0 pulp>=2.9
+          pip${{ inputs.python_version }} install -r pytorch-src/requirements.txt || true
+
+      - name: Run custom test files
+        id: run_tests
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          REPORT_DIR=test-reports
+          mkdir -p ${REPORT_DIR}
+
+          # Custom test files: per-case isolation execution
+          python${{ inputs.python_version }} ascend_pytorch/.github/scripts/run_npu_test_shard.py \
+            --test-files "${{ inputs.test_files }}" \
+            --test-dir pytorch-src/test \
+            --disabled-testcases pytorch-src/test_upstream/disabled_testcases.json \
+            --report-dir ${REPORT_DIR} \
+            --timeout 1200 \
+            --verbose \
+            2>&1 | tee /tmp/test_custom.log
+
+          TEST_STATUS=${PIPESTATUS[0]}
+          echo "status=${TEST_STATUS}" >> $GITHUB_OUTPUT
+          # Don't exit with test status - let step succeed to allow report generation
+
+      - name: Package and upload test reports
+        if: always()
+        run: |
+          # Package failed cases logs into compressed archive
+          if [ -d "test-reports/failed_cases_logs" ]; then
+            echo "=== Compressing failed cases logs ==="
+            tar -czf test-reports/failed_cases_logs.tar.gz -C test-reports failed_cases_logs
+            rm -rf test-reports/failed_cases_logs
+            echo "Failed cases logs compressed"
+          fi
+
+      - name: Upload test reports
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-reports-custom
+          path: test-reports/
+          retention-days: 30
+
+      - name: Compress and upload error logs
+        if: failure()
+        run: |
+          mkdir -p error-logs
+          cp /tmp/test_custom.log error-logs/ 2>/dev/null || true
+          tar -czf error-logs-custom.tar.gz error-logs/
+          echo "Error logs compressed"
+
+      - name: Upload error logs
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: error-logs-custom
+          path: error-logs-custom.tar.gz
+          retention-days: 30
\ No newline at end of file
diff --git a/.github/workflows/_torch-npu-upstream-test-dist.yml b/.github/workflows/_torch-npu-upstream-test-dist.yml
new file mode 100644
index 0000000000..1788dbe313
--- /dev/null
+++ b/.github/workflows/_torch-npu-upstream-test-dist.yml
@@ -0,0 +1,226 @@
+name: Torch NPU Upstream Test Distributed
+
+on:
+  workflow_call:
+    inputs:
+      python_version:
+        required: true
+        type: string
+        description: Python version to use
+      torch_wheel_artifact:
+        required: true
+        type: string
+        description: Name of the torch wheel artifact from build
+      torch_npu_wheel_artifact:
+        required: true
+        type: string
+        description: Name of the torch_npu wheel artifact from build
+      pytorch_src_artifact:
+        required: true
+        type: string
+        description: Name of the pytorch source artifact from build
+      docker_image:
+        required: true
+        type: string
+        description: Docker image to use
+      distributed_matrix:
+        required: true
+        type: string
+        description: Distributed shard matrix JSON
+      distributed_shards:
+        required: true
+        type: string
+        description: Number of distributed shards
+
+jobs:
+  run_tests:
+    runs-on: linux-aarch64-a3-16
+    timeout-minutes: 1200
+    container:
+      image: ${{ inputs.docker_image }}
+      options: --user root
+    strategy:
+      matrix:
+        shard: ${{ fromJson(inputs.distributed_matrix) }}
+      fail-fast: false
+      max-parallel: 2
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          repository: kerer-ai/pytorch
+          ref: v2.7.1_dev
+          fetch-depth: 1
+          path: ascend_pytorch
+
+      - name: Setup cache directories
+        run: |
+          mkdir -p /github/home/.cache/pip
+          chmod -R 777 /github/home/.cache
+
+      - name: Cache pip
+        uses: actions/cache@v4
+        with:
+          path: /github/home/.cache/pip
+          key: pip-arm-dist-py${{ inputs.python_version }}
+          restore-keys: |
+            pip-arm-dist-py${{ inputs.python_version }}-
+
+      - name: Download built torch wheel
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ inputs.torch_wheel_artifact }}
+          path: torch-wheel-artifact
+
+      - name: Download built torch_npu wheel
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ inputs.torch_npu_wheel_artifact }}
+          path: torch-npu-wheel-artifact
+
+      - name: Download PyTorch source and test code
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ inputs.pytorch_src_artifact }}
+          path: pytorch-src-artifact
+
+      - name: Extract PyTorch source
+        run: |
+          tar -xzf pytorch-src-artifact/pytorch-src.tar.gz
+
+      - name: Install built PyTorch and torch_npu
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          PIP=pip${{ inputs.python_version }}
+          PYTHON=python${{ inputs.python_version }}
+          export PIP_CACHE_DIR=/github/home/.cache/pip
+
+          $PIP install --upgrade pip
+
+          # Install built torch wheel
+          TORCH_WHL=$(ls torch-wheel-artifact/*.whl | head -1)
+          $PIP install "${TORCH_WHL}"
+
+          # Install built torch_npu wheel
+          TORCH_NPU_WHL=$(ls torch-npu-wheel-artifact/*.whl | head -1)
+          $PIP install "${TORCH_NPU_WHL}"
+
+          echo "Installed PyTorch and torch_npu from built wheels"
+          echo "torch: ${TORCH_WHL}"
+          echo "torch_npu: ${TORCH_NPU_WHL}"
+
+      - name: Verify NPU availability
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          PYTHON=python${{ inputs.python_version }}
+          $PYTHON -c "
+          import torch
+          print(f'torch: {torch.__version__}')
+          import torch_npu
+          print(f'torch_npu: {torch_npu.__version__}')
+          print(f'NPU available: {torch.npu.is_available()}')
+          print(f'NPU count: {torch.npu.device_count()}')
+          if torch.npu.is_available():
+              print(f'NPU name: {torch.npu.get_device_name(0)}')
+          "
+
+      - name: Install test dependencies
+        run: |
+          PIP=pip${{ inputs.python_version }}
+          cd pytorch-src
+
+          $PIP install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
+          $PIP install pytest-rerunfailures pytest-flakefinder
+          $PIP install 'pytest-subtests==0.13.1' 'xdoctest==1.1.0' 'pulp>=2.9'
+
+          if [ -f requirements.txt ]; then
+            $PIP install -r requirements.txt || true
+          fi
+
+      - name: Download cases shard JSONs
+        uses: actions/download-artifact@v4
+        with:
+          name: cases-shards
+          path: cases-shards
+
+      - name: Run distributed shard ${{ matrix.shard }}/${{ inputs.distributed_shards }}
+        id: run_test
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          PYTHON=python${{ inputs.python_version }}
+          REPORT_DIR=test-reports
+          CASES_JSON="cases-shards/distributed_cases_shard_${{ matrix.shard }}.json"
+
+          mkdir -p ${REPORT_DIR}
+
+          # Get case count from JSON
+          TOTAL_CASES=$(python3 -c "import json; d=json.load(open('${CASES_JSON}')); print(d['total_cases'])")
+
+          echo "=== Distributed Shard ${{ matrix.shard }} (Case-level) ==="
+          echo "Total cases: ${TOTAL_CASES}"
+          echo "Runner: linux-aarch64-a3-16 (16-card NPU)"
+          echo "Execution mode: SERIAL"
+
+          # Distributed tests: pre-collected cases, serial execution
+          set +e
+          $PYTHON ascend_pytorch/.github/scripts/run_npu_test_shard.py \
+            --cases-json "${CASES_JSON}" \
+            --test-dir pytorch-src/test \
+            --disabled-testcases pytorch-src/test_upstream/disabled_testcases.json \
+            --report-dir ${REPORT_DIR} \
+            --timeout 1200 \
+            --verbose \
+            2>&1 | tee /tmp/test_shard_dist_${{ matrix.shard }}.log
+
+          TEST_STATUS=${PIPESTATUS[0]}
+          set -e
+          echo "status=${TEST_STATUS}" >> $GITHUB_OUTPUT
+          # Don't exit with test status - let step succeed to allow report generation
+
+      - name: Package and upload test reports
+        if: always()
+        run: |
+          # Package failed cases logs into compressed archive
+          if [ -d "test-reports/failed_cases_logs" ]; then
+            echo "=== Compressing failed cases logs ==="
+            tar -czf test-reports/failed_cases_logs.tar.gz -C test-reports failed_cases_logs
+            rm -rf test-reports/failed_cases_logs
+            echo "Failed cases logs compressed: $(ls -lh test-reports/failed_cases_logs.tar.gz)"
+          fi
+
+          # Package shard_cases.json
+          if [ -f "test-reports/shard_dist-${{ matrix.shard }}_cases.json" ]; then
+            echo "Cases JSON exists: $(ls -lh test-reports/shard_dist-${{ matrix.shard }}_cases.json)"
+          fi
+
+      - name: Upload test reports
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-reports-dist-${{ matrix.shard }}
+          path: test-reports/
+          retention-days: 30
+
+      - name: Compress and upload error logs
+        if: failure()
+        run: |
+          # Only upload logs when tests failed
+          mkdir -p error-logs
+          cp /tmp/test_shard_dist_${{ matrix.shard }}.log error-logs/ 2>/dev/null || true
+          tar -czf error-logs-dist-${{ matrix.shard }}.tar.gz error-logs/
+          echo "Error logs compressed: $(ls -lh error-logs-dist-${{ matrix.shard }}.tar.gz)"
+
+      - name: Upload error logs
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: error-logs-dist-${{ matrix.shard }}
+          path: error-logs-dist-${{ matrix.shard }}.tar.gz
+          retention-days: 30
\ No newline at end of file
diff --git a/.github/workflows/_torch-npu-upstream-test-regular.yml b/.github/workflows/_torch-npu-upstream-test-regular.yml
new file mode 100644
index 0000000000..dd1c054e35
--- /dev/null
+++ b/.github/workflows/_torch-npu-upstream-test-regular.yml
@@ -0,0 +1,229 @@
+name: Torch NPU Upstream Test Regular
+
+on:
+  workflow_call:
+    inputs:
+      python_version:
+        required: true
+        type: string
+        description: Python version to use
+      torch_wheel_artifact:
+        required: true
+        type: string
+        description: Name of the torch wheel artifact from build
+      torch_npu_wheel_artifact:
+        required: true
+        type: string
+        description: Name of the torch_npu wheel artifact from build
+      pytorch_src_artifact:
+        required: true
+        type: string
+        description: Name of the pytorch source artifact from build
+      docker_image:
+        required: true
+        type: string
+        description: Docker image to use
+      regular_matrix:
+        required: true
+        type: string
+        description: Regular shard matrix JSON
+      regular_shards:
+        required: true
+        type: string
+        description: Number of regular shards
+
+jobs:
+  run_tests:
+    runs-on: linux-aarch64-a3-16
+    timeout-minutes: 1200
+    container:
+      image: ${{ inputs.docker_image }}
+      options: --user root
+    strategy:
+      matrix:
+        shard: ${{ fromJson(inputs.regular_matrix) }}
+      fail-fast: false
+      max-parallel: 5
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          repository: kerer-ai/pytorch
+          ref: v2.7.1_dev
+          fetch-depth: 1
+          path: ascend_pytorch
+
+      - name: Setup cache directories
+        run: |
+          mkdir -p /github/home/.cache/pip
+          chmod -R 777 /github/home/.cache
+
+      - name: Cache pip
+        uses: actions/cache@v4
+        with:
+          path: /github/home/.cache/pip
+          key: pip-arm-regular-py${{ inputs.python_version }}
+          restore-keys: |
+            pip-arm-regular-py${{ inputs.python_version }}-
+
+      - name: Download built torch wheel
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ inputs.torch_wheel_artifact }}
+          path: torch-wheel-artifact
+
+      - name: Download built torch_npu wheel
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ inputs.torch_npu_wheel_artifact }}
+          path: torch-npu-wheel-artifact
+
+      - name: Download PyTorch source and test code
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ inputs.pytorch_src_artifact }}
+          path: pytorch-src-artifact
+
+      - name: Extract PyTorch source
+        run: |
+          tar -xzf pytorch-src-artifact/pytorch-src.tar.gz
+
+      - name: Install built PyTorch and torch_npu
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          PIP=pip${{ inputs.python_version }}
+          PYTHON=python${{ inputs.python_version }}
+          export PIP_CACHE_DIR=/github/home/.cache/pip
+
+          $PIP install --upgrade pip
+
+          # Install built torch wheel
+          TORCH_WHL=$(ls torch-wheel-artifact/*.whl | head -1)
+          $PIP install "${TORCH_WHL}"
+
+          # Install built torch_npu wheel
+          TORCH_NPU_WHL=$(ls torch-npu-wheel-artifact/*.whl | head -1)
+          $PIP install "${TORCH_NPU_WHL}"
+
+          echo "Installed PyTorch and torch_npu from built wheels"
+          echo "torch: ${TORCH_WHL}"
+          echo "torch_npu: ${TORCH_NPU_WHL}"
+
+      - name: Verify NPU availability
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          PYTHON=python${{ inputs.python_version }}
+          $PYTHON -c "
+          import torch
+          print(f'torch: {torch.__version__}')
+          import torch_npu
+          print(f'torch_npu: {torch_npu.__version__}')
+          print(f'NPU available: {torch.npu.is_available()}')
+          print(f'NPU count: {torch.npu.device_count()}')
+          if torch.npu.is_available():
+              print(f'NPU name: {torch.npu.get_device_name(0)}')
+          "
+
+      - name: Install test dependencies
+        run: |
+          PIP=pip${{ inputs.python_version }}
+          cd pytorch-src
+
+          $PIP install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
+          $PIP install pytest-rerunfailures pytest-flakefinder
+          $PIP install 'pytest-subtests==0.13.1' 'xdoctest==1.1.0' 'pulp>=2.9'
+
+          if [ -f requirements.txt ]; then
+            $PIP install -r requirements.txt || true
+          fi
+
+      - name: Download cases shard JSONs
+        uses: actions/download-artifact@v4
+        with:
+          name: cases-shards
+          path: cases-shards
+
+      - name: Run regular shard ${{ matrix.shard }}/${{ inputs.regular_shards }}
+        id: run_test
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          PYTHON=python${{ inputs.python_version }}
+          REPORT_DIR=test-reports
+          CASES_JSON="cases-shards/regular_cases_shard_${{ matrix.shard }}.json"
+
+          mkdir -p ${REPORT_DIR}
+
+          # Get case count from JSON
+          TOTAL_CASES=$(python3 -c "import json; d=json.load(open('${CASES_JSON}')); print(d['total_cases'])")
+
+          echo "=== Regular Shard ${{ matrix.shard }} (Case-level) ==="
+          echo "Total cases: ${TOTAL_CASES}"
+          echo "Runner: linux-aarch64-a3-16 (16-card NPU)"
+          echo "Execution mode: CONCURRENT (64 workers)"
+
+          # Regular tests: pre-collected cases, 64 concurrent workers
+          set +e
+          $PYTHON ascend_pytorch/.github/scripts/run_npu_test_shard.py \
+            --cases-json "${CASES_JSON}" \
+            --test-dir pytorch-src/test \
+            --disabled-testcases pytorch-src/test_upstream/disabled_testcases.json \
+            --report-dir ${REPORT_DIR} \
+            --timeout 1200 \
+            --max-workers 64 \
+            --verbose \
+            2>&1 | tee /tmp/test_shard_reg_${{ matrix.shard }}.log
+
+          TEST_STATUS=${PIPESTATUS[0]}
+          set -e
+          echo "status=${TEST_STATUS}" >> $GITHUB_OUTPUT
+          # Don't exit with test status - let step succeed to allow report generation
+
+      - name: Package and upload test reports
+        if: always()
+        run: |
+          # Package failed cases logs into compressed archive
+          if [ -d "test-reports/failed_cases_logs" ]; then
+            echo "=== Compressing failed cases logs ==="
+            FAILED_LOGS_COUNT=$(find test-reports/failed_cases_logs -type f | wc -l)
+            echo "Found ${FAILED_LOGS_COUNT} failed case log files"
+            tar -czf test-reports/failed_cases_logs.tar.gz -C test-reports failed_cases_logs
+            rm -rf test-reports/failed_cases_logs
+            echo "Failed cases logs compressed: $(ls -lh test-reports/failed_cases_logs.tar.gz)"
+          fi
+
+          # Package shard_cases.json
+          if [ -f "test-reports/shard_reg-${{ matrix.shard }}_cases.json" ]; then
+            echo "Cases JSON exists: $(ls -lh test-reports/shard_reg-${{ matrix.shard }}_cases.json)"
+          fi
+
+      - name: Upload test reports
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-reports-reg-${{ matrix.shard }}
+          path: test-reports/
+          retention-days: 30
+
+      - name: Compress and upload error logs
+        if: failure()
+        run: |
+          # Only upload logs when tests failed
+          mkdir -p error-logs
+          cp /tmp/test_shard_reg_${{ matrix.shard }}.log error-logs/ 2>/dev/null || true
+          tar -czf error-logs-reg-${{ matrix.shard }}.tar.gz error-logs/
+          echo "Error logs compressed: $(ls -lh error-logs-reg-${{ matrix.shard }}.tar.gz)"
+
+      - name: Upload error logs
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: error-logs-reg-${{ matrix.shard }}
+          path: error-logs-reg-${{ matrix.shard }}.tar.gz
+          retention-days: 30
\ No newline at end of file
diff --git a/.github/workflows/_torch-npu-upstream-test.yml b/.github/workflows/_torch-npu-upstream-test.yml
new file mode 100644
index 0000000000..678e80f9ce
--- /dev/null
+++ b/.github/workflows/_torch-npu-upstream-test.yml
@@ -0,0 +1,143 @@
+name: Torch NPU Upstream Test
+
+on:
+  workflow_call:
+    inputs:
+      python_version:
+        required: true
+        type: string
+        description: Python version to use
+      pytorch_ref:
+        required: false
+        type: string
+        default: 'fccc94ae83f61fe26559abc999797297196bac29'
+        description: PyTorch branch, tag, or commit SHA to build
+      torch_npu_ref:
+        required: false
+        type: string
+        default: 'master'
+        description: torch_npu branch, tag, or commit SHA to build
+      docker_image:
+        required: false
+        type: string
+        default: 'quay.io/kerer/pytorch:manylinux-cann9.0.0-beta.2-20260428'
+        description: Docker image to use for all jobs
+      distributed_shards:
+        required: false
+        type: string
+        default: '2'
+        description: Number of shards for distributed tests
+      regular_shards:
+        required: false
+        type: string
+        default: '5'
+        description: Number of shards for regular tests
+      test_files:
+        required: false
+        type: string
+        default: ''
+        description: Test files to run directly (comma-separated)
+
+defaults:
+  run:
+    shell: bash
+
+jobs:
+  # ============================================================================
+  # 1. Build PyTorch and torch_npu Wheels
+  # ============================================================================
+  build:
+    uses: ./.github/workflows/_torch-npu-upstream-build.yml
+    with:
+      pytorch_ref: ${{ inputs.pytorch_ref }}
+      torch_npu_ref: ${{ inputs.torch_npu_ref }}
+      python_version: ${{ inputs.python_version }}
+      docker_image: ${{ inputs.docker_image }}
+
+  # ============================================================================
+  # 2. Collect Test Cases (only when test_files is empty)
+  # ============================================================================
+  collect_cases:
+    needs:
+      - build
+    if: ${{ inputs.test_files == '' }}
+    uses: ./.github/workflows/_torch-npu-upstream-collect.yml
+    with:
+      python_version: ${{ inputs.python_version }}
+      torch_wheel_artifact: ${{ needs.build.outputs.torch-wheel }}
+      torch_npu_wheel_artifact: ${{ needs.build.outputs.torch-npu-wheel }}
+      pytorch_src_artifact: ${{ needs.build.outputs.pytorch-src }}
+      docker_image: ${{ inputs.docker_image }}
+      distributed_shards: ${{ inputs.distributed_shards }}
+      regular_shards: ${{ inputs.regular_shards }}
+
+  # ============================================================================
+  # 3. Run Distributed Tests (only when test_files is empty)
+  # ============================================================================
+  test_distributed:
+    needs:
+      - build
+      - collect_cases
+    if: ${{ inputs.test_files == '' }}
+    uses: ./.github/workflows/_torch-npu-upstream-test-dist.yml
+    with:
+      python_version: ${{ inputs.python_version }}
+      torch_wheel_artifact: ${{ needs.build.outputs.torch-wheel }}
+      torch_npu_wheel_artifact: ${{ needs.build.outputs.torch-npu-wheel }}
+      pytorch_src_artifact: ${{ needs.build.outputs.pytorch-src }}
+      docker_image: ${{ inputs.docker_image }}
+      distributed_matrix: ${{ needs.collect_cases.outputs.distributed_matrix }}
+      distributed_shards: ${{ needs.collect_cases.outputs.distributed_shards }}
+
+  # ============================================================================
+  # 4. Run Regular Tests (only when test_files is empty)
+  # ============================================================================
+  test_regular:
+    needs:
+      - build
+      - collect_cases
+    if: ${{ inputs.test_files == '' }}
+    uses: ./.github/workflows/_torch-npu-upstream-test-regular.yml
+    with:
+      python_version: ${{ inputs.python_version }}
+      torch_wheel_artifact: ${{ needs.build.outputs.torch-wheel }}
+      torch_npu_wheel_artifact: ${{ needs.build.outputs.torch-npu-wheel }}
+      pytorch_src_artifact: ${{ needs.build.outputs.pytorch-src }}
+      docker_image: ${{ inputs.docker_image }}
+      regular_matrix: ${{ needs.collect_cases.outputs.regular_matrix }}
+      regular_shards: ${{ needs.collect_cases.outputs.regular_shards }}
+
+  # ============================================================================
+  # 5. Run Custom Tests (only when test_files is provided)
+  # ============================================================================
+  test_custom:
+    needs:
+      - build
+    if: ${{ inputs.test_files != '' }}
+    uses: ./.github/workflows/_torch-npu-upstream-test-custom.yml
+    with:
+      python_version: ${{ inputs.python_version }}
+      torch_wheel_artifact: ${{ needs.build.outputs.torch-wheel }}
+      torch_npu_wheel_artifact: ${{ needs.build.outputs.torch-npu-wheel }}
+      pytorch_src_artifact: ${{ needs.build.outputs.pytorch-src }}
+      docker_image: ${{ inputs.docker_image }}
+      test_files: ${{ inputs.test_files }}
+
+  # ============================================================================
+  # 6. Generate Test Report
+  # ============================================================================
+  report:
+    needs:
+      - build
+      - collect_cases
+      - test_distributed
+      - test_regular
+      - test_custom
+    if: always() && needs.build.result == 'success'
+    uses: ./.github/workflows/_torch-npu-upstream-report.yml
+    with:
+      python_version: ${{ inputs.python_version }}
+      torch_npu_wheel_name: ${{ needs.build.outputs.torch-npu-wheel }}
+      docker_image: ${{ inputs.docker_image }}
+      distributed_matrix: ${{ needs.collect_cases.outputs.distributed_matrix || '[]' }}
+      regular_matrix: ${{ needs.collect_cases.outputs.regular_matrix || '[]' }}
\ No newline at end of file
diff --git a/.github/workflows/torch-npu-upstream-test-trigger.yml b/.github/workflows/torch-npu-upstream-test-trigger.yml
new file mode 100644
index 0000000000..4bd8d793a4
--- /dev/null
+++ b/.github/workflows/torch-npu-upstream-test-trigger.yml
@@ -0,0 +1,70 @@
+name: Torch NPU Upstream Main Test Trigger
+
+on:
+  push:
+    branches:
+      - main
+      - master
+      - 'release/**'
+    paths:
+      - '.github/workflows/torch-npu-upstream-test-trigger.yml'
+      - '.github/workflows/_torch-npu-upstream*.yml'
+      - '.github/scripts/*.py'
+      - 'test_upstream/**'
+  pull_request:
+    paths:
+      - '.github/workflows/torch-npu-upstream-test-trigger.yml'
+      - '.github/workflows/_torch-npu-upstream*.yml'
+      - '.github/scripts/*.py'
+      - 'test_upstream/**'
+  schedule:
+    - cron: '0 22 * * 1'  # UTC 22:00 (Beijing time next day 06:00), every Monday
+  workflow_dispatch:
+    inputs:
+      python_version:
+        description: 'Python version (default 3.11)'
+        required: false
+        default: '3.11'
+        type: string
+      pytorch_ref:
+        description: 'PyTorch branch, tag, or commit SHA to build (default fccc94ae83f61fe26559abc999797297196bac29)'
+        required: false
+        default: 'fccc94ae83f61fe26559abc999797297196bac29'
+        type: string
+      torch_npu_ref:
+        description: 'torch_npu branch, tag, or commit SHA to build (default master)'
+        required: false
+        default: 'master'
+        type: string
+      docker_image:
+        description: 'Docker image to use for all jobs (default quay.io/kerer/pytorch:manylinux-cann9.0.0-beta.2-20260428)'
+        required: false
+        default: 'quay.io/kerer/pytorch:manylinux-cann9.0.0-beta.2-20260428'
+        type: string
+      distributed_shards:
+        description: 'Number of shards for distributed tests (default 2)'
+        required: false
+        default: '2'
+        type: string
+      regular_shards:
+        description: 'Number of shards for regular tests (default 5)'
+        required: false
+        default: '5'
+        type: string
+      test_files:
+        description: 'Test files to run directly (comma-separated, e.g., "test_meta.py,test_nn.py"). Skip shard assignment if set.'
+        required: false
+        default: ''
+        type: string
+
+jobs:
+  trigger_test:
+    uses: ./.github/workflows/_torch-npu-upstream-test.yml
+    with:
+      python_version: ${{ github.event.inputs.python_version || '3.11' }}
+      pytorch_ref: ${{ github.event.inputs.pytorch_ref || 'fccc94ae83f61fe26559abc999797297196bac29' }}
+      torch_npu_ref: ${{ github.event.inputs.torch_npu_ref || 'master' }}
+      docker_image: ${{ github.event.inputs.docker_image || 'quay.io/kerer/pytorch:manylinux-cann9.0.0-beta.2-20260428' }}
+      distributed_shards: ${{ github.event.inputs.distributed_shards || '2' }}
+      regular_shards: ${{ github.event.inputs.regular_shards || '5' }}
+      test_files: ${{ github.event.inputs.test_files || '' }}
\ No newline at end of file

From 3afc0d2ceeb243dedd918bb890acf7b170b16e90 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Sat, 9 May 2026 17:56:49 +0800
Subject: [PATCH 107/132] Remove deprecated workflow files and documentation

- Remove old _build_torch_npu.yml, npu-sync-test.yml, test-collect.yml
- Remove verify-docker-image.yml workflow
- Remove generate_report.py (replaced by generate_npu_full_test_report.py)
- Remove BUILD_IMAGE_README.md and PYTORCH_DOCKER_COMPARISON.md docs

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .github/scripts/BUILD_IMAGE_README.md        | 307 ----------
 .github/scripts/PYTORCH_DOCKER_COMPARISON.md | 595 -------------------
 .github/scripts/generate_report.py           | 172 ------
 .github/workflows/_build_torch_npu.yml       | 388 ------------
 .github/workflows/npu-sync-test.yml          | 123 ----
 .github/workflows/test-collect.yml           | 572 ------------------
 .github/workflows/verify-docker-image.yml    |  60 --
 7 files changed, 2217 deletions(-)
 delete mode 100644 .github/scripts/BUILD_IMAGE_README.md
 delete mode 100644 .github/scripts/PYTORCH_DOCKER_COMPARISON.md
 delete mode 100644 .github/scripts/generate_report.py
 delete mode 100644 .github/workflows/_build_torch_npu.yml
 delete mode 100644 .github/workflows/npu-sync-test.yml
 delete mode 100644 .github/workflows/test-collect.yml
 delete mode 100644 .github/workflows/verify-docker-image.yml

diff --git a/.github/scripts/BUILD_IMAGE_README.md b/.github/scripts/BUILD_IMAGE_README.md
deleted file mode 100644
index 2d3caa9c47..0000000000
--- a/.github/scripts/BUILD_IMAGE_README.md
+++ /dev/null
@@ -1,307 +0,0 @@
-# Docker 镜像构建脚本使用指南
-
-## 概述
-
-`build_image.sh` 脚本用于构建支持多 Python 版本的 PyTorch NPU Docker 镜像。
-
-## 核心设计理念
-
-### CANN 与 Python 版本关系
-
-**重要说明**：
-- CANN 包安装只需要 Python 3 环境，**不与特定 Python 版本绑定**
-- 编译和运行 PyTorch 时，切换 Python 版本不会影响 CANN
-- 因此镜像只需按 CANN 版本构建，无需按 Python 版本重复构建
-
-### 镜像特性
-
-1. **一个镜像支持所有 Python 版本**
-   - 预装 Python 3.10/3.11/3.12/3.13
-   - 通过环境变量或脚本切换 Python 版本
-
-2. **按 CANN 版本构建**
-   - 一个 CANN 版本对应一个镜像
-   - 维护版本映射表，自动获取下载地址
-
-3. **镜像标签简化**
-   - 标签只显示 CANN 版本
-   - 不再包含 Python 版本信息
-
----
-
-## 支持的配置
-
-### CANN 版本映射表
-
-脚本维护以下版本映射（三个包的下载 URL）：
-
-| 版本号 | Toolkit | A3-ops | NNAL |
-|--------|---------|--------|------|
-| `9.0` | toolkit_9.0.0 | A3-ops_9.0.0 | nnal_9.0.0 |
-| `9.0.0-beta.2` | toolkit_9.0.0-beta.2 | A3-ops_9.0.0-beta.2 | nnal_9.0.0-beta.2 |
-| `8.0` | toolkit_8.0.RC3 | A3-ops_8.0.RC3 | nnal_8.0.RC3 |
-
-**Stable 版本标记**：
-- `CANN_STABLE="9.0"` - 用于生成 `latest` 标签
-
----
-
-## 使用方式
-
-### 查看支持的 CANN 版本
-
-```bash
-./build_image.sh --list-versions
-```
-
-输出：
-```
-支持的 CANN 版本：
-
-  - 9.0
-  - 9.0.0-beta.2
-  - 8.0
-
-Stable 版本（用于 latest 标签）: 9.0
-```
-
-### 本地构建
-
-```bash
-# 使用简化版本号（推荐）
-./build_image.sh --cann-version 9.0
-
-# 使用完整版本号
-./build_image.sh --cann-version 9.0.0-beta.2
-
-# 查看详细日志
-./build_image.sh --cann-version 9.0 --verbose
-```
-
-### 推送镜像
-
-```bash
-# 需要设置环境变量
-export QUAY_USERNAME="your_username"
-export QUAY_PASSWORD="your_password"
-
-# 构建并推送
-./build_image.sh --cann-version 9.0 --push
-```
-
----
-
-## 镜像使用指南
-
-### 拉取镜像
-
-```bash
-# 拉取指定 CANN 版本
-docker pull quay.io/kerer/pytorch:cann9.0
-
-# 拉取 latest（stable 版本）
-docker pull quay.io/kerer/pytorch:latest
-```
-
-### 运行容器
-
-```bash
-# 启动容器（默认 Python 3.11）
-docker run -it quay.io/kerer/pytorch:cann9.0 bash
-
-# 启动容器并挂载工作目录
-docker run -it -v $(pwd):/workspace quay.io/kerer/pytorch:cann9.0 bash
-```
-
-### 切换 Python 版本
-
-**方法 1：使用切换脚本**
-```bash
-# 在容器内执行
-source /usr/local/bin/switch_python.sh 3.11
-source /usr/local/bin/switch_python.sh 3.12
-source /usr/local/bin/switch_python.sh 3.13
-```
-
-**方法 2：修改环境变量**
-```bash
-# Python 3.11
-export PATH=/opt/python/cp311-cp311/bin:$PATH
-
-# Python 3.12
-export PATH=/opt/python/cp312-cp312/bin:$PATH
-
-# Python 3.13
-export PATH=/opt/python/cp313-cp313/bin:$PATH
-```
-
-**验证 Python 版本**
-```bash
-python --version
-pip --version
-```
-
-### 初始化 CANN 环境
-
-```bash
-# 在容器内执行
-source /etc/profile.d/cann_env.sh
-```
-
----
-
-## 镜像标签说明
-
-### 标签层级
-
-每个 CANN 版本生成以下标签：
-
-| 标签类型 | 格式 | 示例 | 用途 |
-|---------|------|------|------|
-| **完整版**（带时间戳） | `cann${VERSION}-${TIMESTAMP}` | `cann9.0-20260506` | 版本追溯 |
-| **标准版** | `cann${VERSION}` | `cann9.0.0-beta.2` | 日常使用 ⭐ |
-| **简化版** | `cann${MAJOR}` | `cann9.0` | 快速识别 ⭐ |
-| **latest**（仅 stable） | `latest` | `latest` | 使用最新 |
-
-### Stable 版本额外标签
-
-CANN stable 版本（当前为 9.0）额外生成：
-- `latest` - 全局最新
-- `cann-latest` - CANN 最新
-- `cann9.0-latest` - 该 CANN 版本最新
-
----
-
-## Workflow 使用
-
-### 手动触发构建
-
-1. 进入 GitHub Actions 页面
-2. 选择 "Build Docker Image" workflow
-3. 点击 "Run workflow"
-4. 选择参数：
-   - `cann_version`: 输入 CANN 版本（如 `9.0` 或 `9.0.0-beta.2`）
-   - `push_image`: 是否推送镜像
-   - `force_build`: 是否强制构建
-
-### 自动触发
-
-- **Push 触发**: 当修改相关文件时自动触发（默认构建 stable 版本）
-- **定时触发**: 每周日凌晨 2:00 UTC 自动构建 stable 版本
-
----
-
-## 添加新的 CANN 版本
-
-### 步骤 1：更新版本映射表
-
-在 `build_image.sh` 中添加新版本：
-
-```bash
-declare -A CANN_VERSIONS=(
-    # 已有版本...
-
-    # 新增版本
-    ["9.1"]="https://...toolkit_9.1.0_linux-aarch64.run|https://...A3-ops_9.1.0_linux-aarch64.run|https://...nnal_9.1.0_linux-aarch64.run"
-)
-```
-
-格式：`"版本号"="toolkit_url|a3_ops_url|nnal_url"`
-
-### 步骤 2：更新 Stable 版本（可选）
-
-如果新版本成为 stable，更新：
-
-```bash
-CANN_STABLE="9.1"
-```
-
----
-
-## 与 PyTorch 上游对比
-
-### 关键差异
-
-| 维度 | PyTorch CUDA | torch-npu CANN |
-|------|-------------|---------------|
-| **构建策略** | 按 CUDA + Python 版本矩阵 | 只按 CANN 版本 ⭐ |
-| **镜像数量** | 多个（每种组合一个） | 少量（每个 CANN 一个） |
-| **Python 切换** | 不同镜像 | 同一镜像切换环境变量 ⭐ |
-| **版本映射** | 简化版 → 完整版 | URL 映射表 ⭐ |
-
-### 优势
-
-1. **镜像数量减少**：1 个 CANN 版本 = 1 个镜像（而非 4 个）
-2. **灵活性更高**：无需预判 Python 版本需求
-3. **维护更简单**：只需维护 CANN 版本映射表
-
----
-
-## 常见问题
-
-### Q1: 为什么一个镜像支持多个 Python 版本？
-
-A: CANN 安装只需要 Python 3 环境，不绑定特定版本。切换 Python 版本不影响 CANN 功能。
-
-### Q2: 如何在 CI 中使用特定 Python 版本？
-
-A: 在容器内执行切换脚本：
-```bash
-source /usr/local/bin/switch_python.sh 3.12
-```
-
-或在 Dockerfile/脚本中修改 PATH：
-```bash
-export PATH=/opt/python/cp312-cp312/bin:$PATH
-```
-
-### Q3: 如何验证 CANN 是否正常工作？
-
-A:
-```bash
-source /etc/profile.d/cann_env.sh
-python -c "import torch; import torch_npu; print(torch_npu.npu.is_available())"
-```
-
-### Q4: 不同 CANN 版本有什么区别？
-
-A:
-- **9.0**: Stable 版本，推荐用于生产环境
-- **9.0.0-beta.2**: Beta 版本，包含最新特性
-- **8.0**: 旧版本，用于兼容性测试
-
----
-
-## 脚本参数详解
-
-| 参数 | 说明 | 默认值 |
-|------|------|--------|
-| `--cann-version` | CANN 版本号 | 必需 |
-| `--registry` | Docker registry | `quay.io` |
-| `--quay-org` | Quay.io 组织 | `kerer` |
-| `--image-name` | 镜像名称 | `pytorch` |
-| `--push` | 推送镜像 | 不推送 |
-| `--force` | 强制构建 | 不强制 |
-| `--verbose` | 详细日志 | 不显示 |
-| `--list-versions` | 显示版本列表 | - |
-
----
-
-## 更新日志
-
-### 2026-05-06 重构
-
-**主要变更**：
-1. ❌ 移除 Python 版本参数（不再按 Python 构建镜像）
-2. ✅ 预装所有 Python 版本（3.10/3.11/3.12/3.13）
-3. ✅ 添加 Python 版本切换脚本
-4. ✅ 只按 CANN 版本构建镜像
-5. ✅ 维护 CANN 包 URL 映射表
-
-**镜像标签变化**：
-- 原：`py3.11-cann9.0`
-- 新：`cann9.0`
-
----
-
-**生成时间**: 2026-05-06
\ No newline at end of file
diff --git a/.github/scripts/PYTORCH_DOCKER_COMPARISON.md b/.github/scripts/PYTORCH_DOCKER_COMPARISON.md
deleted file mode 100644
index d8318de1b9..0000000000
--- a/.github/scripts/PYTORCH_DOCKER_COMPARISON.md
+++ /dev/null
@@ -1,595 +0,0 @@
-# PyTorch 上游镜像构建逻辑对比分析
-
-## 对比概述
-
-对比上游 PyTorch 仓库和当前 torch-npu 项目的镜像构建策略，分析相似点和差异。
-
-**对应关系**：
-- PyTorch CUDA → torch-npu CANN
-- PyTorch cuDNN → torch-npu NNAL/A3-ops
-- PyTorch Python 版本 → torch-npu Python 版本
-
----
-
-## 一、镜像命名策略对比
-
-### PyTorch 上游命名
-
-**CI 镜像命名**（用于内部测试）：
-```
-pytorch-linux-jammy-cuda13.0-cudnn9-py3.10-clang18
-pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
-pytorch-linux-jammy-py3.11-clang18          # CPU 版本
-```
-
-特点：
-- 格式：`pytorch-linux-{OS}-{CUDA}-{cuDNN}-py{Python}-{Compiler}`
-- CUDA 版本格式：`cuda13.0`（去掉小版本号）
-- cuDNN 版本格式：`cudnn9`（只保留大版本）
-- Python 版本格式：`py3.10` 或 `py3`（默认最新）
-
-**官方发布镜像命名**（用户使用）：
-```
-ghcr.io/pytorch/pytorch:2.5.1-cuda12.1-cudnn9-runtime
-ghcr.io/pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel
-ghcr.io/pytorch/pytorch:2.5.1-runtime          # CPU 版本
-```
-
-特点：
-- 格式：`{PyTorch版本}-cuda{CUDA简版}-cudnn{cuDNN}-{类型}`
-- 镜像类型：`runtime`（运行时）vs `devel`（开发）
-- CUDA 版本简化：`cuda12.1`（去掉补丁版本）
-
-**Nightly 镜像额外标签**：
-```
-ghcr.io/pytorch/pytorch-nightly:2.5.0.dev20250101-cuda12.1-cudnn9-runtime
-ghcr.io/pytorch/pytorch-nightly:{git_commit}-cu121
-ghcr.io/pytorch/pytorch-nightly:latest          # Stable CUDA 的 latest
-```
-
----
-
-### torch-npu 当前命名
-
-**当前标签格式**：
-```
-quay.io/kerer/pytorch:py3.11-cann9.0.0-beta.2-20260506   # 完整版
-quay.io/kerer/pytorch:py3.11-cann9.0.0-beta.2            # 标准版
-quay.io/kerer/pytorch:py3.11-cann9.0.0                   # 大版本简化
-quay.io/kerer/pytorch:py3.11-latest                      # Python latest
-quay.io/kerer/pytorch:latest                            # 全局 latest
-```
-
----
-
-### 命名策略对比表
-
-| 维度 | PyTorch 上游 | torch-npu 当前 | 建议 |
-|------|-------------|---------------|------|
-| **前缀** | 无前缀 | `py` 前缀 | ✅ torch-npu 更直观 |
-| **Python 版本** | `py3.10` 或 `py3` | `py3.11` | ✅ 相似，都用 py 前缀 |
-| **CUDA/CANN** | `cuda13.0` | `cann9.0.0-beta.2` | ⚠️ PyTorch 更简化 |
-| **cuDNN/NNAL** | `cudnn9` | 未包含 | ⚠️ torch-npu 可添加 |
-| **镜像类型** | `runtime`/`devel` | 无区分 | ⚠️ 可考虑添加 |
-| **时间戳** | nightly 包含 | 完整版包含 | ✅ 相似 |
-| **latest 位置** | `latest` 无后缀 | `latest` 或 `py3.11-latest` | ✅ torch-npu 更细致 |
-
----
-
-## 二、版本管理策略对比
-
-### PyTorch CUDA 版本矩阵
-
-**支持的 CUDA 版本**：
-```python
-CUDA_ARCHES = ["12.6", "13.0", "13.2"]
-CUDA_STABLE = "13.0"    # 稳定版本
-
-CUDA_ARCHES_FULL_VERSION = {
-    "12.6": "12.6.3",
-    "13.0": "13.0.2",
-    "13.2": "13.2.1",
-}
-
-CUDA_ARCHES_CUDNN_VERSION = {
-    "12.6": "9",
-    "13.0": "9",
-    "13.2": "9",
-}
-```
-
-**特点**：
-- 维护简化版本 → 完整版本映射表
-- 每个 CUDA 版本对应固定的 cuDNN 版本
-- 明确标记稳定版本（用于 latest 标签）
-
----
-
-### torch-npu CANN 版本管理
-
-**当前实现**：
-```bash
-DEFAULT_CANN_VERSION="9.0.0-beta.2"
-DEFAULT_CANN_DATE="20260330"
-
-# 提取大版本号
-cann_major=$(echo "$CANN_VERSION" | sed 's/-beta.*//' | sed 's/-rc.*//')
-```
-
-**差异**：
-- ❌ 没有版本映射表（简化版本 → 完整版本）
-- ❌ 没有 stable 版本标记
-- ❌ CANN 日期硬编码在参数中
-
----
-
-### 建议改进：版本映射表
-
-创建类似 PyTorch 的版本映射配置：
-
-```bash
-# 在 build_image.sh 中添加版本映射
-declare -A CANN_ARCHES_FULL_VERSION=(
-    ["9.0"]="9.0.0"
-    ["8.0"]="8.0.RC3"
-)
-
-declare -A CANN_ARCHES_DATE=(
-    ["9.0"]="20260330"
-    ["8.0"]="20250101"
-)
-
-declare -A CANN_ARCHES_NNAL_VERSION=(
-    ["9.0"]="9.0.0"   # 对应 cuDNN
-    ["8.0"]="8.0"
-)
-
-CANN_STABLE="9.0"  # 稳定版本
-```
-
----
-
-## 三、构建组织方式对比
-
-### PyTorch 构建架构
-
-**文件组织**：
-```
-pytorch/pytorch/
-├── Dockerfile                           # 用户发布镜像
-├── docker.Makefile                      # 构建脚本
-├── .ci/docker/                          # CI 镜像目录
-│   ├── ubuntu/Dockerfile                # Ubuntu 基础镜像
-│   ├── common/                          # 公共安装脚本
-│   │   ├── install_cuda.sh              # CUDA 安装脚本
-│   │   ├── install_conda.sh             # Conda 安装脚本
-│   │   └── install_gcc.sh               # GCC 安装脚本
-│   ├── requirements-ci.txt              # CI 依赖
-│   └── ci_commit_pins/                  # 版本锁定
-│       ├── triton.txt                   # Triton 版本
-│       ├── nccl*                        # NCCL 版本
-│       └── jax.txt                      # JAX 版本
-├── .github/workflows/
-│   ├── docker-builds.yml                # CI 镜像构建
-│   ├── docker-release.yml               # 发布镜像构建
-│   └── docker-cache-rocm.yml            # ROCm 缓存
-└── .github/scripts/
-    ├── generate_docker_release_matrix.py  # 矩阵生成
-    └── generate_binary_build_matrix.py    # 二进制矩阵
-```
-
-**关键特点**：
-1. **分层组织**：CI 镜像和发布镜像分离
-2. **公共脚本**：`common/` 目录下有各种安装脚本
-3. **版本锁定**：`ci_commit_pins/` 目录锁定所有依赖版本
-4. **矩阵生成**：Python 脚本动态生成构建矩阵
-
----
-
-### torch-npu 构建架构
-
-**当前文件组织**：
-```
-ascend-pytorch/
-├── .github/
-│   ├── docker/
-│   │   └── pytorch-npu-builder.Dockerfile   # 单一 Dockerfile
-│   ├── scripts/
-│   │   ├── build_image.sh                   # 构建脚本
-│   │   └── BUILD_IMAGE_README.md            # 文档
-│   └── workflows/
-│       └── build-docker-image.yml           # Workflow
-```
-
-**对比差异**：
-
-| 组织方式 | PyTorch 上游 | torch-npu 当前 | 建议 |
-|---------|-------------|---------------|------|
-| **CI vs 发布分离** | ✅ 分离 | ❌ 单一 Dockerfile | ⚠️ 可考虑分离 |
-| **公共安装脚本** | ✅ `common/` 目录 | ❌ 直接在 Dockerfile | ⚠️ 建议拆分 |
-| **版本锁定文件** | ✅ `ci_commit_pins/` | ❌ 硬编码参数 | ⚠️ 强烈建议 |
-| **矩阵生成脚本** | ✅ Python 脚本 | ✅ Shell 脚本 | ✅ 相似 |
-| **文档完整性** | ❌ 较少 | ✅ README 文档 | ✅ torch-npu 更好 |
-
----
-
-## 四、Workflow 设计对比
-
-### PyTorch docker-builds.yml
-
-**触发条件**：
-```yaml
-on:
-  workflow_dispatch:
-  pull_request:
-    paths:
-      - .ci/docker/**
-      - .github/workflows/docker-builds.yml
-  push:
-    branches: [main, release/*]
-    paths:
-      - .ci/docker/**
-      - .github/workflows/docker-builds.yml
-  schedule:
-    - cron: 1 3 * * 3   # 每周三 UTC 03:01
-```
-
-**Matrix 策略**：
-```yaml
-matrix:
-  docker-image-name: [
-    pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
-    pytorch-linux-jammy-cuda13.0-cudnn9-py3.12-gcc11-vllm,
-    pytorch-linux-jammy-py3.10-clang18,      # CPU 版本
-    # ... 30+ 种镜像配置
-  ]
-  include:
-    - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc13
-      runner: linux.arm64.m7g.4xlarge       # ARM64 特定 runner
-```
-
-**镜像推送**：
-```yaml
-# 推送到 ECR（AWS）
-- name: Build docker image
-  uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
-  with:
-    docker-image-name: ci-image:${{ matrix.docker-image-name }}
-    always-rebuild: true
-    push: true
-
-# 推送到 ghcr.io（公共）
-- name: Push to https://ghcr.io/
-  if: ${{ github.event_name == 'push' }}
-  run: |
-    ghcr_image="ghcr.io/pytorch/ci-image"
-    tag=${ECR_DOCKER_IMAGE##*:}
-    docker tag "${ECR_DOCKER_IMAGE}" "${ghcr_image}:${tag}"
-    docker push "${ghcr_image}:${tag}"
-    # Also push a tag without the hash
-    docker tag "${ECR_DOCKER_IMAGE}" "${ghcr_image}:${{ matrix.docker-image-name }}"
-    docker push "${ghcr_image}:${{ matrix.docker-image-name }}"
-```
-
----
-
-### torch-npu build-docker-image.yml
-
-**当前实现**：
-```yaml
-on:
-  push:
-    branches: [dev_master]
-    paths:
-      - '.github/docker/**'
-      - '.github/workflows/**'
-  schedule:
-    - cron: '0 2 * * 0'   # 每周日 UTC 02:00
-  workflow_dispatch:
-    inputs:
-      python_version: ['all', '3.10', '3.11', '3.12', '3.13']
-      cann_version: '9.0.0-beta.2'
-      push_image: true
-
-matrix:
-  python: ['3.10', '3.11', '3.12', '3.13']
-```
-
----
-
-### Workflow 对比表
-
-| 设计要点 | PyTorch 上游 | torch-npu 当前 | 建议 |
-|---------|-------------|---------------|------|
-| **触发路径** | `.ci/docker/**` | `.github/docker/**` | ✅ 相似 |
-| **定时构建** | 每周三 | 每周日 | ✅ 合理 |
-| **Matrix 配置** | 硬编码镜像名列表 | Python 版本列表 | ⚠️ PyTorch 更详细 |
-| **多 Registry** | ECR + ghcr.io | 单一 quay.io | ⚠️ 可考虑多 Registry |
-| **推送策略** | Hash tag + Name tag | 多层级标签 | ✅ torch-npu 更细致 |
-| **手动触发** | 无参数 | 多参数输入 | ✅ torch-npu 更灵活 |
-
----
-
-## 五、CUDA/CANN 安装方式对比
-
-### PyTorch CUDA 安装
-
-**install_cuda.sh 脚本**（分离式）：
-```bash
-# 调用方式
-ARG CUDA_VERSION
-COPY ./common/install_cuda.sh install_cuda.sh
-RUN bash ./install_cuda.sh ${CUDA_VERSION}
-
-# Dockerfile 中的环境变量
-ENV DESIRED_CUDA ${CUDA_VERSION}
-ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
-```
-
-**特点**：
-- CUDA 安装逻辑独立在 `install_cuda.sh` 中
-- Dockerfile 只负责调用脚本
-- 版本通过 ARG 参数传递
-
----
-
-### torch-npu CANN 安装
-
-**当前实现**（嵌入式）：
-```dockerfile
-ARG CANN_VERSION
-ARG CANN_DATE
-
-RUN mkdir -p cann && cd cann && \
-    curl -O https://.../Ascend-cann-toolkit_${CANN_VERSION}_linux-aarch64.run && \
-    curl -O https://.../Ascend-cann-A3-ops_${CANN_VERSION}_linux-aarch64.run && \
-    curl -O https://.../Ascend-cann-nnal_${CANN_VERSION}_linux-aarch64.run && \
-    chmod +x Ascend-cann*.run && \
-    ./Ascend-cann-toolkit*.run --full --quiet --install-path=/usr/local/Ascend && \
-    # ...
-```
-
-**对比差异**：
-
-| 安装方式 | PyTorch 上游 | torch-npu 当前 | 建议 |
-|---------|-------------|---------------|------|
-| **脚本分离** | ✅ `install_cuda.sh` | ❌ 嵌入 Dockerfile | ⚠️ 建议拆分 |
-| **依赖安装** | ✅ NCCL/cuSPARSE 等独立脚本 | ❌ 混在一起 | ⚠️ 建议拆分 |
-| **版本管理** | ✅ 参数传递 + 环境变量 | ✅ 参数传递 | ✅ 相似 |
-| **安装路径** | `/usr/local/cuda` | `/usr/local/Ascend` | ✅ 合理 |
-
----
-
-## 六、镜像类型对比
-
-### PyTorch 镜像类型
-
-**两种镜像类型**：
-```dockerfile
-# runtime 镜像（精简）
-FROM official as runtime
-# 只包含运行时必需的组件
-
-# devel 镜像（完整）
-FROM official as dev
-# 包含开发工具、编译器等
-```
-
-**docker.Makefile 定义**：
-```makefile
-runtime-image: DOCKER_TAG := $(PYTORCH_VERSION)-cuda$(CUDA_VERSION_SHORT)-cudnn$(CUDNN_VERSION)-runtime
-devel-image: DOCKER_TAG := $(PYTORCH_VERSION)-cuda$(CUDA_VERSION_SHORT)-cudnn$(CUDNN_VERSION)-devel
-```
-
----
-
-### torch-npu 当前状态
-
-**单一镜像类型**：
-- 当前只有一种镜像，包含构建和运行时所有工具
-- 没有区分 runtime 和 devel
-
-**建议**：
-```dockerfile
-# 可以添加多阶段构建
-FROM base as runtime
-# 只包含 CANN runtime + Python
-
-FROM runtime as devel
-# 添加编译工具、调试工具等
-```
-
----
-
-## 七、关键差异总结
-
-### 相似点 ✅
-
-1. **使用 ARG 参数化版本**
-   - 都通过 `ARG CUDA_VERSION` / `ARG CANN_VERSION` 传递版本
-   - 都支持多 Python 版本
-
-2. **Matrix 策略构建**
-   - 都使用 GitHub Actions matrix 并行构建
-   - 都支持定时构建和手动触发
-
-3. **Registry 推送**
-   - 都推送到公共 Registry
-   - 都生成多层级标签
-
-4. **版本简化处理**
-   - 都从完整版本提取简化版本
-   - 都有 latest 标签策略
-
----
-
-### 差异点 ⚠️
-
-| 差异 | PyTorch 上游优势 | torch-npu 待改进 |
-|------|-----------------|----------------|
-| **文件组织** | CI/发布分离，公共脚本目录 | 单一 Dockerfile，建议拆分 |
-| **版本管理** | 版本映射表，stable 标记 | 硬编码日期，建议映射表 |
-| **版本锁定** | `ci_commit_pins/` 目录锁定所有依赖 | 无版本锁定文件 |
-| **镜像类型** | runtime/devel 分离 | 单一镜像，可考虑分离 |
-| **多 Registry** | ECR（私有）+ ghcr.io（公共） | 单一 quay.io |
-| **依赖分离** | CUDA/NCCL/cuSPARSE 独立脚本 | CANN 组件混在一起 |
-
----
-
-## 八、改进建议优先级
-
-### P0（必须改进）
-
-1. **创建版本锁定文件**
-   ```
-   .github/docker/cann_versions.txt
-   .github/docker/nnal_versions.txt
-   .github/docker/a3_ops_versions.txt
-   ```
-
-2. **创建版本映射表**
-   ```bash
-   # 在 build_image.sh 中
-   declare -A CANN_VERSIONS=(
-       ["9.0"]="9.0.0|20260330"
-       ["8.0"]="8.0.RC3|20250101"
-   )
-   ```
-
----
-
-### P1（建议改进）
-
-1. **拆分安装脚本**
-   ```
-   .github/scripts/docker/
-   ├── install_cann.sh
-   ├── install_nnal.sh
-   ├── install_a3_ops.sh
-   └── common_utils.sh
-   ```
-
-2. **添加镜像类型区分**
-   ```dockerfile
-   FROM base as runtime   # 精简镜像
-   FROM runtime as devel  # 完整镜像
-   ```
-
-3. **添加 stable 版本标记**
-   ```bash
-   CANN_STABLE="9.0"  # 用于生成 latest 标签
-   ```
-
----
-
-### P2（可选改进）
-
-1. **多 Registry 支持**
-   - AWS ECR（私有缓存）
-   - ghcr.io（公共发布）
-
-2. **CI/发布镜像分离**
-   - CI 镜像：包含测试工具
-   - 发布镜像：精简运行时
-
----
-
-## 九、标签命名建议调整
-
-### 当前 torch-npu 标签（保持）
-
-```
-✅ py3.11-cann9.0.0-beta.2          # 标准版
-✅ py3.11-latest                     # Python latest
-✅ latest                            # 全局 latest
-```
-
-### 建议新增标签
-
-```
-新增：py3.11-cann9.0-runtime        # 镜像类型标记
-新增：py3.11-cann9.0-devel          # 开发镜像
-新增：cann9.0-stable                # Stable 版本标记
-新增：2.5.1-py3.11-cann9.0-runtime  # 包含 PyTorch 版本（可选）
-```
-
----
-
-## 十、代码示例：版本映射实现
-
-### 建议在 build_image.sh 中添加
-
-```bash
-#!/bin/bash
-
-# CANN 版本映射表（类似 PyTorch）
-declare -A CANN_ARCHES=(
-    ["9.0"]="9.0.0"
-    ["8.0"]="8.0.RC3"
-)
-
-declare -A CANN_ARCHES_DATE=(
-    ["9.0"]="20260330"
-    ["8.0"]="20250101"
-)
-
-declare -A CANN_ARCHES_NNAL=(
-    ["9.0"]="9.0.0"
-    ["8.0"]="8.0"
-)
-
-CANN_STABLE="9.0"  # Stable 版本（用于 latest）
-
-# 解析版本参数
-parse_cann_version() {
-    local input="$1"
-
-    # 如果输入是简化版本（如 "9.0"），查找完整版本
-    if [[ -v CANN_ARCHES[$input] ]]; then
-        CANN_VERSION="${CANN_ARCHES[$input]}"
-        CANN_DATE="${CANN_ARCHES_DATE[$input]}"
-        NNAL_VERSION="${CANN_ARCHES_NNAL[$input]}"
-        CANN_MAJOR="$input"
-    else
-        # 如果输入是完整版本（如 "9.0.0-beta.2"），提取简化版本
-        CANN_VERSION="$input"
-        CANN_MAJOR=$(echo "$input" | sed 's/-beta.*//' | sed 's/-rc.*//' | sed 's/\.[0-9]*$//')
-        CANN_DATE="${CANN_ARCHES_DATE[$CANN_MAJOR]:-DEFAULT_CANN_DATE}"
-        NNAL_VERSION="${CANN_ARCHES_NNAL[$CANN_MAJOR]:-$CANN_VERSION}"
-    fi
-
-    # 判断是否为 stable 版本
-    IS_STABLE=$([[ "$CANN_MAJOR" == "$CANN_STABLE" ]] && echo "true" || echo "false")
-}
-```
-
----
-
-## 结论
-
-### 总体评价
-
-✅ **相似度高**：torch-npu 的设计思路与 PyTorch 上游基本一致，都采用了参数化构建、Matrix 策略、多版本支持等现代 CI/CD 最佳实践。
-
-⚠️ **待改进点**：
-1. 版本管理缺乏映射表和锁定文件
-2. 安装脚本未拆分，维护性较弱
-3. 镜像类型未区分 runtime/devel
-4. 缺少 stable 版本标记
-
-### 下一步行动
-
-**建议按照优先级顺序改进**：
-1. P0：创建版本锁定文件和映射表
-2. P1：拆分安装脚本，添加镜像类型
-3. P2：考虑多 Registry 和 CI/发布分离
-
-**标签命名**：
-- 当前命名策略已经很好，符合 PyTorch 风格
-- 可以考虑添加 runtime/devel 类型标记
-- 建议添加 stable 版本的 latest 标签
-
----
-
-**生成时间**: 2026-05-06
-**对比版本**: PyTorch upstream main branch (2026-05-06)
\ No newline at end of file
diff --git a/.github/scripts/generate_report.py b/.github/scripts/generate_report.py
deleted file mode 100644
index f96a5dce8c..0000000000
--- a/.github/scripts/generate_report.py
+++ /dev/null
@@ -1,172 +0,0 @@
-#!/usr/bin/env python3
-"""
-Generate consolidated test report from all shard results.
-
-Reads all shard result JSON files and generates a Markdown summary report
-and a detailed JSON report.
-"""
-
-import argparse
-import json
-import os
-from datetime import datetime
-from pathlib import Path
-from typing import Dict, List
-
-
-def load_shard_results(reports_root: str) -> List[Dict]:
-    """Load all shard result JSON files from the reports directory."""
-    reports_path = Path(reports_root)
-    if not reports_path.exists():
-        raise FileNotFoundError(f"Reports directory not found: {reports_root}")
-
-    results = []
-    for result_file in reports_path.rglob("*_results.json"):
-        with open(result_file) as f:
-            data = json.load(f)
-            results.append(data)
-
-    return results
-
-
-def aggregate_stats(shard_results: List[Dict]) -> Dict:
-    """Aggregate statistics from all shard results."""
-    total_stats = {
-        "passed": 0,
-        "failed": 0,
-        "error": 0,
-        "skipped": 0,
-        "timeout": 0,
-        "crashed": 0,
-        "xfail": 0,
-        "xpass": 0,
-        "unknown": 0,
-    }
-
-    total_cases = 0
-    for shard in shard_results:
-        total_cases += shard.get("total_cases", 0)
-        stats = shard.get("stats", {})
-        for key in total_stats:
-            total_stats[key] += stats.get(key, 0)
-
-    total_stats["total_cases"] = total_cases
-    total_stats["pass_rate"] = (
-        round(total_stats["passed"] / total_cases * 100, 2) if total_cases > 0 else 0
-    )
-
-    return total_stats
-
-
-def generate_markdown_report(stats: Dict, shard_results: List[Dict]) -> str:
-    """Generate a Markdown summary report."""
-    lines = [
-        "# PyTorch NPU Test Report",
-        "",
-        f"**Generated:** {stats.get('timestamp', 'N/A')}",
-        "",
-        "## Summary",
-        "",
-        "| Metric | Count |",
-        "|--------|-------|",
-        f"| Total Cases | {stats['total_cases']} |",
-        f"| Passed | {stats['passed']} |",
-        f"| Failed | {stats['failed']} |",
-        f"| Error | {stats['error']} |",
-        f"| Timeout | {stats['timeout']} |",
-        f"| Crashed | {stats['crashed']} |",
-        f"| Skipped | {stats['skipped']} |",
-        f"| XFail | {stats['xfail']} |",
-        f"| XPass | {stats['xpass']} |",
-        f"| Pass Rate | {stats['pass_rate']}% |",
-        "",
-        "## Shard Details",
-        "",
-    ]
-
-    for shard in sorted(shard_results, key=lambda x: x.get("shard_index", 0)):
-        shard_idx = shard.get("shard_index", "?")
-        shard_stats = shard.get("stats", {})
-        lines.append(f"### Shard {shard_idx}")
-        lines.append("")
-        lines.append(f"- Total cases: {shard.get('total_cases', 0)}")
-        lines.append(f"- Passed: {shard_stats.get('passed', 0)}")
-        lines.append(f"- Failed: {shard_stats.get('failed', 0)}")
-        lines.append(f"- Error: {shard_stats.get('error', 0)}")
-        lines.append(f"- Timeout: {shard_stats.get('timeout', 0)}")
-        lines.append(f"- Crashed: {shard_stats.get('crashed', 0)}")
-        lines.append("")
-
-    # Add failed cases section
-    failed_cases = []
-    for shard in shard_results:
-        for result in shard.get("results", []):
-            if result.get("status") in ["failed", "error", "timeout", "crashed"]:
-                failed_cases.append({
-                    "case_id": result.get("case_id", "?"),
-                    "status": result.get("status", "?"),
-                    "duration": result.get("duration", 0),
-                })
-
-    if failed_cases:
-        lines.append("## Failed Cases")
-        lines.append("")
-        lines.append("| Case ID | Status | Duration |")
-        lines.append("|---------|--------|----------|")
-        for case in failed_cases[:100]:  # Limit to first 100 for readability
-            lines.append(f"| {case['case_id']} | {case['status']} | {case['duration']}s |")
-
-        if len(failed_cases) > 100:
-            lines.append(f"\n*Showing first 100 of {len(failed_cases)} failed cases*")
-
-    return "\n".join(lines)
-
-
-def generate_json_report(stats: Dict, shard_results: List[Dict]) -> Dict:
-    """Generate a detailed JSON report."""
-    report = {
-        "summary": stats,
-        "shards": shard_results,
-        "failed_cases": [],
-    }
-
-    for shard in shard_results:
-        for result in shard.get("results", []):
-            if result.get("status") in ["failed", "error", "timeout", "crashed"]:
-                report["failed_cases"].append(result)
-
-    return report
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Generate consolidated test report")
-    parser.add_argument("--reports-root", required=True, help="Root directory with shard results")
-    parser.add_argument("--output-markdown", required=True, help="Output Markdown file path")
-    parser.add_argument("--output-json", required=True, help="Output JSON file path")
-
-    args = parser.parse_args()
-
-    shard_results = load_shard_results(args.reports_root)
-    print(f"Loaded {len(shard_results)} shard results")
-
-    stats = aggregate_stats(shard_results)
-    stats["timestamp"] = datetime.utcnow().isoformat()
-
-    print(f"Total cases: {stats['total_cases']}")
-    print(f"Pass rate: {stats['pass_rate']}%")
-
-    # Generate Markdown report
-    markdown = generate_markdown_report(stats, shard_results)
-    with open(args.output_markdown, "w") as f:
-        f.write(markdown)
-    print(f"Markdown report saved to {args.output_markdown}")
-
-    # Generate JSON report
-    json_report = generate_json_report(stats, shard_results)
-    with open(args.output_json, "w") as f:
-        json.dump(json_report, f, indent=2)
-    print(f"JSON report saved to {args.output_json}")
-
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/.github/workflows/_build_torch_npu.yml b/.github/workflows/_build_torch_npu.yml
deleted file mode 100644
index 515d474b13..0000000000
--- a/.github/workflows/_build_torch_npu.yml
+++ /dev/null
@@ -1,388 +0,0 @@
-name: Build PyTorch and torch_npu (with cache)
-
-on:
-  workflow_call:
-    inputs:
-      pytorch_branch:
-        required: true
-        type: string
-        default: 'main'
-      python_version:
-        required: true
-        type: string
-        default: '3.11'
-      docker_image_tag:
-        required: true
-        type: string
-        description: 'Docker image tag with timestamp'
-    outputs:
-      docker-image:
-        description: 'Full Docker image URL'
-        value: ${{ jobs.build.outputs.docker-image }}
-      torch-wheel:
-        description: 'PyTorch wheel artifact name'
-        value: 'torch-wheel-main'
-      torch-npu-wheel:
-        description: 'torch_npu wheel artifact name'
-        value: 'torch-npu-wheel-main'
-      test-src:
-        description: 'Test source artifact name'
-        value: 'test-src-main'
-
-env:
-  # 缓存版本号，当需要强制刷新缓存时修改此值
-  CACHE_VERSION: 'v2'
-  # GitHub 代理 URL（用于加速 git clone，留空则不使用代理）
-  GH_PROXY_URL: 'https://gh-proxy.test.osinfra.cn'
-  # PyPI 缓存 URL（用于加速 pip 下载）
-  PYPI_CACHE_URL: 'http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple'
-
-jobs:
-  build:
-    runs-on: linux-aarch64-a3-16
-    timeout-minutes: 240
-    outputs:
-      docker-image: ${{ steps.set_image.outputs.docker-image }}
-
-    container:
-      image: quay.io/kerer/pytorch:${{ inputs.docker_image_tag }}
-      options: --user root
-
-    steps:
-      - name: Set Docker image URL
-        id: set_image
-        run: |
-          DOCKER_IMAGE="quay.io/kerer/pytorch:${{ inputs.docker_image_tag }}"
-          echo "docker-image=${DOCKER_IMAGE}" >> $GITHUB_OUTPUT
-          echo "Using Docker image: ${DOCKER_IMAGE}"
-
-      - name: Setup CANN environment
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-      - name: Configure git proxy for faster clone
-        run: |
-          # 配置 git URL rewrite 来使用代理（加速 clone 和 submodules）
-          if [ -n "${{ env.GH_PROXY_URL }}" ]; then
-            git config --global url."${{ env.GH_PROXY_URL }}/https://github.com/".insteadOf "https://github.com/"
-            git config --global url."${{ env.GH_PROXY_URL }}/https://gitlab.com/".insteadOf "https://gitlab.com/"
-            echo "Git proxy configured:"
-            git config --global --list | grep url
-          else
-            echo "No proxy configured, using direct connection"
-          fi
-
-      - name: Clone upstream PyTorch with submodules
-        id: clone_pytorch
-        run: |
-          # 使用代理加速 git clone（如果配置了 GH_PROXY_URL）
-          PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
-          if [ -n "${{ env.GH_PROXY_URL }}" ]; then
-            PYTORCH_REPO="${{ env.GH_PROXY_URL }}/${PYTORCH_REPO}"
-            echo "Using proxy: ${PYTORCH_REPO}"
-          fi
-
-          # 克隆指定 commit: fccc94ae83f61fe26559abc999797297196bac29
-          # 先浅克隆，再 fetch 指定 commit，最后 checkout
-          git clone --depth=1 "${PYTORCH_REPO}" pytorch-src
-          cd pytorch-src
-          git fetch --depth=1 origin fccc94ae83f61fe26559abc999797297196bac29
-          git checkout fccc94ae83f61fe26559abc999797297196bac29
-
-          # 初始化 submodules
-          git submodule update --init --recursive
-
-          PYTORCH_SHA=$(git rev-parse HEAD)
-          echo "pytorch_sha=${PYTORCH_SHA}" >> $GITHUB_OUTPUT
-          echo "Cloned PyTorch commit: ${PYTORCH_SHA}"
-          echo "Submodules downloaded:"
-          ls -la third_party/ | head -20
-
-      - name: Checkout torch_npu
-        uses: actions/checkout@v4
-        with:
-          path: torch_npu-src
-          submodules: recursive
-
-      # ==================== pip 缓存配置 ====================
-      # pip 缓存加速依赖下载，不影响构建结果
-      # 缓存键基于 requirements-build.txt hash（依赖变化频率低）
-      - name: Get pip cache key
-        id: pip_key
-        run: |
-          REQUIREMENTS_HASH=$(cd pytorch-src && sha256sum requirements-build.txt | cut -d' ' -f1)
-          echo "cache_key=${{ env.CACHE_VERSION }}-pip-${{ inputs.python_version }}-${REQUIREMENTS_HASH}" >> $GITHUB_OUTPUT
-
-      - name: Restore pip cache
-        uses: actions/cache/restore@v4
-        with:
-          path: /root/.cache/pip
-          key: ${{ steps.pip_key.outputs.cache_key }}
-          restore-keys: |
-            ${{ env.CACHE_VERSION }}-pip-${{ inputs.python_version }}-
-            ${{ env.CACHE_VERSION }}-pip-
-
-      - name: Setup pip cache directory
-        run: |
-          mkdir -p /root/.cache/pip
-
-      - name: Configure pip index URL
-        run: |
-          # 配置 pip 使用 PyPI 缓存加速下载
-          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
-            pip${{ inputs.python_version }} config set global.index-url ${{ env.PYPI_CACHE_URL }}
-            pip${{ inputs.python_version }} config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local"
-            echo "pip index-url configured: ${{ env.PYPI_CACHE_URL }}"
-          else
-            echo "No PyPI cache URL configured, using default"
-          fi
-
-      - name: Upgrade pip and setuptools
-        run: |
-          # 先升级 pip 和 setuptools，避免旧版包兼容性问题
-          pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
-
-      # ==================== ccache 缓存配置 ====================
-      # ccache 是真正加速编译的关键（可节省 30-60 分钟）
-      # 注意：PyTorch 每次 clone 都是新 commit，所以缓存键不包含 PyTorch SHA
-      # 我们依赖 torch_npu SHA 和 requirements-build.txt hash 作为缓存键
-      - name: Get ccache key
-        id: ccache_key
-        run: |
-          # ccache 缓存键：torch_npu SHA + requirements hash
-          # PyTorch SHA 每次都变化（--depth=1 clone 最新），所以不包含在缓存键中
-          TORCH_NPU_SHA=$(cd torch_npu-src && git rev-parse HEAD)
-          REQUIREMENTS_HASH=$(cd pytorch-src && sha256sum requirements-build.txt | cut -d' ' -f1)
-          echo "cache_key=${{ env.CACHE_VERSION }}-ccache-${REQUIREMENTS_HASH}-${TORCH_NPU_SHA}" >> $GITHUB_OUTPUT
-          # partial_key 用于恢复同版本 requirements 的缓存（不同 torch_npu 版本）
-          echo "partial_key=${{ env.CACHE_VERSION }}-ccache-${REQUIREMENTS_HASH}-" >> $GITHUB_OUTPUT
-          # base_key 用于恢复同 CACHE_VERSION 的所有缓存
-          echo "base_key=${{ env.CACHE_VERSION }}-ccache-" >> $GITHUB_OUTPUT
-
-      - name: Restore ccache
-        uses: actions/cache/restore@v4
-        with:
-          path: /root/.cache/ccache
-          key: ${{ steps.ccache_key.outputs.cache_key }}
-          restore-keys: |
-            ${{ steps.ccache_key.outputs.partial_key }}
-            ${{ steps.ccache_key.outputs.base_key }}
-
-      - name: Setup ccache
-        run: |
-          # 安装 ccache（manylinux 镜像没有预装）
-          yum install -y ccache
-
-          # 创建 ccache 配置目录（使用绝对路径）
-          CCACHE_DIR_PATH="/root/.cache/ccache"
-          mkdir -p "$CCACHE_DIR_PATH"
-
-          # 直接写入配置文件（使用绝对路径）
-          cat > "$CCACHE_DIR_PATH/ccache.conf" << 'EOF'
-          max_size = 20G
-          cache_dir = /root/.cache/ccache
-          compression = true
-          compression_level = 6
-          EOF
-
-          # 使用符号链接方式让 ccache 模拟 gcc/g++
-          mkdir -p /usr/local/bin
-          ln -sf /usr/bin/ccache /usr/local/bin/gcc
-          ln -sf /usr/bin/ccache /usr/local/bin/g++
-          ln -sf /usr/bin/ccache /usr/local/bin/cc
-          ln -sf /usr/bin/ccache /usr/local/bin/c++
-
-          # 设置 PATH 优先使用符号链接
-          echo "PATH=/usr/local/bin:$PATH" >> $GITHUB_ENV
-
-          # 设置 CCACHE_DIR（使用绝对路径，不使用 ~）
-          echo "CCACHE_DIR=$CCACHE_DIR_PATH" >> $GITHUB_ENV
-
-          # 设置编译器环境变量，确保 CMake/Ninja 使用 ccache
-          echo "CC=/usr/local/bin/gcc" >> $GITHUB_ENV
-          echo "CXX=/usr/local/bin/g++" >> $GITHUB_ENV
-
-          echo "=== ccache Configuration ==="
-          CCACHE_DIR="$CCACHE_DIR_PATH" ccache --show-config
-
-          echo ""
-          echo "=== Config File Contents ==="
-          cat "$CCACHE_DIR_PATH/ccache.conf"
-
-          echo ""
-          echo "=== Cache Directory ==="
-          ls -la "$CCACHE_DIR_PATH/"
-
-          echo ""
-          echo "=== Symbolic Links ==="
-          ls -la /usr/local/bin/gcc /usr/local/bin/g++
-
-          echo ""
-          echo "=== ccache Statistics (before build) ==="
-          CCACHE_DIR="$CCACHE_DIR_PATH" ccache --show-stats
-
-      # ==================== 构建 PyTorch ====================
-      - name: Build PyTorch wheel
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-
-          cd pytorch-src
-
-          # 安装构建依赖（pip 缓存已恢复，加速下载）
-          pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
-          pip${{ inputs.python_version }} install -r requirements-build.txt
-
-          # 设置构建环境变量
-          export MAX_JOBS=128
-          export USE_CUDA=0
-          export USE_CUDNN=0
-          export USE_DISTRIBUTED=1
-          export CMAKE_BUILD_TYPE=Release
-          export USE_OPENMP=1
-          export USE_MKLDNN=0
-
-          # 确保使用 ccache（CMake 会检测 CC/CXX 环境变量）
-          export CC=/usr/local/bin/gcc
-          export CXX=/usr/local/bin/g++
-          export CCACHE_DIR=/root/.cache/ccache
-
-          # 清除 ccache 统计（开始新的构建）
-          ccache --zero-stats
-
-          python${{ inputs.python_version }} setup.py build bdist_wheel
-
-          echo "PyTorch wheel built:"
-          ls -la dist/
-
-          echo ""
-          echo "=== ccache Statistics (after PyTorch build) ==="
-          ccache --show-stats
-
-      # ==================== 构建 torch_npu ====================
-      - name: Install PyTorch wheel and build dependencies
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-          echo "=== Installing built PyTorch wheel ==="
-          pip${{ inputs.python_version }} install pytorch-src/dist/*.whl
-
-          echo ""
-          echo "=== Verifying PyTorch installation ==="
-          python${{ inputs.python_version }} -c "import torch; print(f'torch version: {torch.__version__}')"
-
-          echo ""
-          echo "=== Installing torch_npu build dependencies ==="
-          pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
-          pip${{ inputs.python_version }} install cmake ninja numpy packaging pyyaml requests six typing-extensions
-
-          cd torch_npu-src
-
-          # 显示 ccache 统计（依赖安装阶段）
-          echo ""
-          echo "=== ccache Statistics (before torch_npu build) ==="
-          CCACHE_DIR=/root/.cache/ccache ccache --show-stats
-
-      - name: Build torch_npu wheel
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-          cd torch_npu-src
-
-          export MAX_JOBS=128
-
-          # 确保使用 ccache
-          export CC=/usr/local/bin/gcc
-          export CXX=/usr/local/bin/g++
-          export CCACHE_DIR=/root/.cache/ccache
-
-          # 禁用 torchair 构建（上游 PyTorch main API 变化导致兼容性问题）
-          bash ci/build.sh --python=${{ inputs.python_version }} --disable_torchair
-
-          echo "torch_npu wheel built:"
-          ls -la dist/
-
-          echo ""
-          echo "=== ccache Statistics (after torch_npu build) ==="
-          ccache --show-stats
-
-      # ==================== 保存缓存 ====================
-      - name: Save pip cache
-        if: always()
-        uses: actions/cache/save@v4
-        with:
-          path: /root/.cache/pip
-          key: ${{ steps.pip_key.outputs.cache_key }}
-
-      - name: Save ccache
-        if: always()
-        uses: actions/cache/save@v4
-        with:
-          path: /root/.cache/ccache
-          key: ${{ steps.ccache_key.outputs.cache_key }}
-
-      - name: Display cache save status
-        if: always()
-        run: |
-          echo "=== Cache Saved ==="
-          echo "pip cache key: ${{ steps.pip_key.outputs.cache_key }}"
-          PIP_CACHE_SIZE=$(du -sh /root/.cache/pip 2>/dev/null | cut -f1)
-          echo "pip cache size: ${PIP_CACHE_SIZE}"
-          echo ""
-          echo "ccache key: ${{ steps.ccache_key.outputs.cache_key }}"
-          CCACHE_SIZE=$(du -sh /root/.cache/ccache 2>/dev/null | cut -f1)
-          echo "ccache size: ${CCACHE_SIZE}"
-
-      # ==================== 打包和上传 ====================
-      - name: Package PyTorch source and build artifacts
-        run: |
-          # 打包整个 pytorch-src 目录（包含测试源码和编译产物）
-          # 排除不必要的文件以减小体积：
-          # - .git 目录（最占空间）
-          # - build/ 目录中的编译中间产物（CMakeFiles, .o 文件等）
-          # - dist/*.whl（已单独上传为 artifact）
-
-          echo "=== PyTorch source directory size ==="
-          du -sh pytorch-src/
-
-          echo ""
-          echo "=== Build artifacts location ==="
-          ls -la pytorch-src/build/lib.*/torch/*.so 2>/dev/null | head -5 || echo "No .so files found in build/lib"
-          ls -la pytorch-src/torch/_C.so 2>/dev/null || echo "No _C.so in torch/"
-
-          echo ""
-          echo "=== Creating archive (excluding large unnecessary files) ==="
-          tar -czf pytorch-src.tar.gz \
-            --exclude='pytorch-src/.git' \
-            --exclude='pytorch-src/build/CMakeFiles' \
-            --exclude='pytorch-src/build/*.o' \
-            --exclude='pytorch-src/build/**/*.o' \
-            --exclude='pytorch-src/dist/*.whl' \
-            pytorch-src
-
-          echo ""
-          echo "=== Archive size ==="
-          ls -lh pytorch-src.tar.gz
-
-      - name: Upload PyTorch wheel
-        uses: actions/upload-artifact@v4
-        with:
-          name: torch-wheel-main
-          path: pytorch-src/dist/*.whl
-          retention-days: 7
-
-      - name: Upload torch_npu wheel
-        uses: actions/upload-artifact@v4
-        with:
-          name: torch-npu-wheel-main
-          path: torch_npu-src/dist/*.whl
-          retention-days: 7
-
-      - name: Upload PyTorch source and build artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: pytorch-src-main
-          path: pytorch-src.tar.gz
-          retention-days: 7
\ No newline at end of file
diff --git a/.github/workflows/npu-sync-test.yml b/.github/workflows/npu-sync-test.yml
deleted file mode 100644
index 1f6c16b88e..0000000000
--- a/.github/workflows/npu-sync-test.yml
+++ /dev/null
@@ -1,123 +0,0 @@
-name: PyTorch NPU Sync Test
-
-on:
-  pull_request:
-    types: [opened, synchronize, reopened]
-    branches:
-      - main
-      - master
-      - 'release/**'
-  workflow_dispatch:
-    inputs:
-      docker_image_tag:
-        description: 'Docker image tag (e.g., manylinux-cann9.0.0-beta.2-20260428)'
-        default: 'manylinux-cann9.0.0-beta.2-20260428'
-      pytorch_branch:
-        description: 'PyTorch branch to build'
-        default: 'main'
-      distributed_shards:
-        description: 'Distributed test shards'
-        default: 2
-        type: number
-      regular_shards:
-        description: 'Regular test shards'
-        default: 5
-        type: number
-
-env:
-  REGISTRY: quay.io
-  PYTHON_VERSION: '3.11'
-
-jobs:
-  build:
-    uses: ./.github/workflows/_build_torch_npu.yml
-    with:
-      pytorch_branch: ${{ inputs.pytorch_branch || 'main' }}
-      python_version: '3.11'
-      docker_image_tag: ${{ inputs.docker_image_tag || 'latest' }}
-
-  collect_cases:
-    needs: build
-    uses: ./.github/workflows/_collect.yml
-    with:
-      docker-image: ${{ needs.build.outputs.docker-image }}
-      distributed-shards: ${{ inputs.distributed_shards || 2 }}
-      regular-shards: ${{ inputs.regular_shards || 5 }}
-
-  test_distributed:
-    needs:
-      - build
-      - collect_cases
-    strategy:
-      matrix:
-        shard: ${{ fromJson(needs.collect_cases.outputs.distributed_matrix) }}
-      fail-fast: false
-      max-parallel: 2
-    uses: ./.github/workflows/_test.yml
-    with:
-      docker-image: ${{ needs.build.outputs.docker-image }}
-      test-type: distributed
-      shard-index: ${{ matrix.shard }}
-      max-workers: 1
-
-  test_regular:
-    needs:
-      - build
-      - collect_cases
-    strategy:
-      matrix:
-        shard: ${{ fromJson(needs.collect_cases.outputs.regular_matrix) }}
-      fail-fast: false
-      max-parallel: 5
-    uses: ./.github/workflows/_test.yml
-    with:
-      docker-image: ${{ needs.build.outputs.docker-image }}
-      test-type: regular
-      shard-index: ${{ matrix.shard }}
-      max-workers: 32
-
-  report:
-    needs:
-      - test_distributed
-      - test_regular
-    runs-on: ubuntu-latest
-    if: always()
-
-    steps:
-      - name: Checkout scripts
-        uses: actions/checkout@v4
-        with:
-          sparse-checkout: .github/scripts
-
-      - name: Download all test reports
-        uses: actions/download-artifact@v4
-        with:
-          pattern: test-reports-*
-          path: all-reports
-          merge-multiple: false
-
-      - name: Generate consolidated report
-        run: |
-          python .github/scripts/generate_report.py \
-            --reports-root all-reports \
-            --output-markdown report.md \
-            --output-json report.json
-
-      - name: Upload final report
-        uses: actions/upload-artifact@v4
-        with:
-          name: npu-full-test-report
-          path: |
-            report.md
-            report.json
-          retention-days: 30
-
-      - name: Package all logs
-        run: tar -czf all-test-logs.tar.gz all-reports/
-
-      - name: Upload logs artifact
-        uses: actions/upload-artifact@v4
-        with:
-          name: all-test-logs
-          path: all-test-logs.tar.gz
-          retention-days: 30
\ No newline at end of file
diff --git a/.github/workflows/test-collect.yml b/.github/workflows/test-collect.yml
deleted file mode 100644
index a53086dcc4..0000000000
--- a/.github/workflows/test-collect.yml
+++ /dev/null
@@ -1,572 +0,0 @@
-name: Test Collect Stage
-
-on:
-  workflow_dispatch:
-    inputs:
-      distributed_shards:
-        description: 'Distributed test shards'
-        default: 2
-        type: number
-      regular_shards:
-        description: 'Regular test shards'
-        default: 5
-        type: number
-
-env:
-  PYTHON_VERSION: '3.11'
-  CACHE_VERSION: 'v2'
-  PYPI_CACHE_URL: 'http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple'
-
-jobs:
-  collect:
-    runs-on: linux-aarch64-a3-16
-    timeout-minutes: 180
-    outputs:
-      test_matrix: ${{ steps.get_tests.outputs.test_matrix }}
-      total_tests: ${{ steps.get_tests.outputs.total_tests }}
-
-    container:
-      image: quay.io/kerer/pytorch:latest
-      options: --user root
-
-    steps:
-      - name: Checkout scripts
-        uses: actions/checkout@v4
-        with:
-          sparse-checkout: |
-            .github/scripts
-            .ci/docker
-
-      # ==================== pip 缓存配置 ====================
-      - name: Get pip cache key
-        id: pip_key
-        run: |
-          REQUIREMENTS_HASH=$(sha256sum .ci/docker/requirements-ci.txt | cut -d' ' -f1)
-          echo "cache_key=${{ env.CACHE_VERSION }}-pip-collect-${{ env.PYTHON_VERSION }}-${REQUIREMENTS_HASH}" >> $GITHUB_OUTPUT
-
-      - name: Restore pip cache
-        uses: actions/cache/restore@v4
-        with:
-          path: ~/.cache/pip
-          key: ${{ steps.pip_key.outputs.cache_key }}
-          restore-keys: |
-            ${{ env.CACHE_VERSION }}-pip-collect-${{ env.PYTHON_VERSION }}-
-            ${{ env.CACHE_VERSION }}-pip-collect-
-
-      - name: Setup pip cache directory
-        run: |
-          mkdir -p ~/.cache/pip
-
-      - name: Configure pip index URL
-        run: |
-          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
-            pip${{ env.PYTHON_VERSION }} config set global.index-url ${{ env.PYPI_CACHE_URL }}
-            pip${{ env.PYTHON_VERSION }} config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local"
-            echo "pip index-url configured: ${{ env.PYPI_CACHE_URL }}"
-          else
-            echo "No PyPI cache URL configured, using default"
-          fi
-
-      - name: Upgrade pip and setuptools
-        run: |
-          pip${{ env.PYTHON_VERSION }} install --upgrade pip setuptools wheel
-
-      # ==================== Download artifacts from Ascend/pytorch workflow run ====================
-      # 固定 workflow run: https://github.com/Ascend/pytorch/actions/runs/25473829132
-      # 包含: torch wheel, torch_npu wheel, pytorch 源码 + 编译产物
-      - name: Download torch wheel from workflow run
-        uses: dawidd6/action-download-artifact@v3
-        with:
-          run_id: 25473829132
-          name: torch-wheel-main
-          path: wheels
-          repo: Ascend/pytorch
-          github_token: ${{ secrets.GITHUB_TOKEN }}
-          skip_unpack: false
-
-      - name: Download torch_npu wheel from workflow run
-        uses: dawidd6/action-download-artifact@v3
-        with:
-          run_id: 25473829132
-          name: torch-npu-wheel-main
-          path: wheels
-          repo: Ascend/pytorch
-          github_token: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Download PyTorch source from GitHub
-        run: |
-          PYTORCH_COMMIT=fccc94ae83f61fe26559abc999797297196bac29
-          PYTORCH_REPO=pytorch/pytorch
-          PROXY_URL=https://gh-proxy.test.osinfra.cn
-
-          echo "Downloading PyTorch source from commit: ${PYTORCH_COMMIT}"
-          echo "Using proxy: ${PROXY_URL}"
-
-          # Download source tarball via proxy (no submodules needed for testing)
-          curl -L "${PROXY_URL}/https://github.com/${PYTORCH_REPO}/archive/${PYTORCH_COMMIT}.tar.gz" \
-            -o pytorch-src.tar.gz
-
-          echo "Downloaded pytorch-src.tar.gz:"
-          ls -la pytorch-src.tar.gz
-
-      - name: Extract PyTorch source
-        run: |
-          # Extract source tarball
-          tar -xzf pytorch-src.tar.gz
-
-          # Rename directory to pytorch-src (tarball extracts as pytorch-fccc94ae...)
-          mv pytorch-${PYTORCH_COMMIT:-fccc94ae83f61fe26559abc999797297196bac29} pytorch-src 2>/dev/null || \
-            mv pytorch-pytorch-${PYTORCH_COMMIT:-fccc94ae83f61fe26559abc999797297196bac29} pytorch-src 2>/dev/null || \
-            true
-
-          # If still not renamed, find and rename the extracted directory
-          if [ ! -d pytorch-src ]; then
-            EXTRACTED_DIR=$(find . -maxdepth 1 -type d -name "pytorch-*" | head -1)
-            if [ -n "$EXTRACTED_DIR" ]; then
-              mv "$EXTRACTED_DIR" pytorch-src
-            fi
-          fi
-
-          echo "Extracted to pytorch-src directory:"
-          ls -la pytorch-src/ | head -10
-
-      - name: Install wheels
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          pip${{ env.PYTHON_VERSION }} install wheels/*.whl
-
-      - name: Install test dependencies
-        run: |
-          # 安装当前仓库的基础测试依赖
-          pip${{ env.PYTHON_VERSION }} install -r .ci/docker/requirements-ci.txt
-
-          # 安装上游 PyTorch 的完整测试依赖（需要先安装 torch）
-          pip${{ env.PYTHON_VERSION }} install -r pytorch-src/.ci/docker/requirements-ci.txt || true
-
-          # 安装 PyTorch 开发依赖（可选）
-          pip${{ env.PYTHON_VERSION }} install expecttest hypothesis fsspec sympy optree networkx psutil || true
-
-      - name: Verify NPU availability
-        continue-on-error: true
-        run: |
-          # 切换到 /tmp 目录，避免 pytorch-src 源码目录干扰 torch 导入
-          cd /tmp
-
-          echo "=== CANN Directory Contents ==="
-          ls -la /usr/local/Ascend/ || echo "Ascend directory not found"
-          echo ""
-          echo "=== CANN Version Info ==="
-          if [ -d /usr/local/Ascend/cann ]; then
-            ls -la /usr/local/Ascend/cann/
-            echo ""
-            echo "=== CANN Version File ==="
-            cat /usr/local/Ascend/cann/version.info 2>/dev/null || echo "version.info not found"
-          else
-            echo "CANN directory not found"
-          fi
-          echo ""
-          echo "=== NNAL Directory Contents ==="
-          if [ -d /usr/local/Ascend/nnal ]; then
-            ls -la /usr/local/Ascend/nnal/
-          else
-            echo "NNAL directory not found"
-          fi
-
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-          echo ""
-          echo "=== NPU-SMI Info ==="
-          npu-smi info || echo "npu-smi not available"
-
-          PYTHON=python${{ env.PYTHON_VERSION }}
-          echo ""
-          echo "=== PyTorch and NPU Info ==="
-          $PYTHON -c "
-          import torch
-          print(f'torch: {torch.__version__}')
-          import torch_npu
-          print(f'torch_npu: {torch_npu.__version__}')
-          print(f'NPU available: {torch.npu.is_available()}')
-          print(f'NPU count: {torch.npu.device_count()}')
-          if torch.npu.is_available():
-              print(f'NPU name: {torch.npu.get_device_name(0)}')
-              print(f'NPU capability: {torch.npu.get_device_capability(0)}')
-          " 2>&1 || echo "torch/torch_npu import failed - this is expected if torch and torch_npu versions are mismatched"
-
-      # ==================== 保存 pip 缓存 ====================
-      - name: Save pip cache
-        if: always()
-        uses: actions/cache/save@v4
-        with:
-          path: ~/.cache/pip
-          key: ${{ steps.pip_key.outputs.cache_key }}
-
-      # ==================== 使用 discover_tests.py 获取测试列表并分片 ====================
-      - name: Get test list and create shards
-        id: get_tests
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-          # 切换到 test 目录执行，避免根目录 torch 与安装的 torch 库冲突
-          cd pytorch-src/test
-
-          # 创建分片 JSON 文件目录
-          mkdir -p /tmp/shards
-
-          # 从 discover_tests.py 获取测试列表并分片
-          python${{ env.PYTHON_VERSION }} -c "
-          import sys
-          import json
-          sys.path.insert(0, '../tools/testing')
-          from discover_tests import TESTS
-
-          NUM_SHARDS = 6
-
-          # 平均分片
-          tests_per_shard = len(TESTS) // NUM_SHARDS
-          remainder = len(TESTS) % NUM_SHARDS
-
-          shard_data = {}
-          start_idx = 0
-
-          for shard_id in range(1, NUM_SHARDS + 1):
-              # 前几个 shard 多分配余数
-              extra = 1 if shard_id <= remainder else 0
-              end_idx = start_idx + tests_per_shard + extra
-              shard_tests = TESTS[start_idx:end_idx]
-
-              shard_data[shard_id] = {
-                  'shard_id': shard_id,
-                  'total_shards': NUM_SHARDS,
-                  'tests': shard_tests,
-                  'test_count': len(shard_tests)
-              }
-
-              # 写入 JSON 文件
-              with open(f'/tmp/shards/shard_{shard_id}.json', 'w') as f:
-                  json.dump(shard_data[shard_id], f, indent=2)
-
-              print(f'Shard {shard_id}: {len(shard_tests)} tests')
-              start_idx = end_idx
-
-          # 写入汇总信息
-          summary = {
-              'total_tests': len(TESTS),
-              'num_shards': NUM_SHARDS,
-              'shard_distribution': {k: v['test_count'] for k, v in shard_data.items()}
-          }
-          with open('/tmp/shards/summary.json', 'w') as f:
-              json.dump(summary, f, indent=2)
-
-          print(f'Total tests: {len(TESTS)}')
-          "
-
-          # 输出矩阵配置 (6 shards)
-          echo "test_matrix=[1,2,3,4,5,6]" >> $GITHUB_OUTPUT
-          echo "total_tests=$(cat /tmp/shards/summary.json | grep -o '\"total_tests\": [0-9]*' | grep -o '[0-9]*')" >> $GITHUB_OUTPUT
-
-          # 显示分片分布
-          echo "=== Shard Distribution ==="
-          cat /tmp/shards/summary.json
-
-      - name: Upload shard JSONs
-        uses: actions/upload-artifact@v4
-        with:
-          name: test-shards
-          path: /tmp/shards/
-          retention-days: 1
-
-  # ==================== 矩阵分片测试 ====================
-  test:
-    needs: collect
-    runs-on: linux-aarch64-a3-16
-    timeout-minutes: 1200  # 20 hours per shard
-
-    strategy:
-      matrix:
-        shard: ${{ fromJson(needs.collect.outputs.test_matrix) }}
-      fail-fast: false
-      max-parallel: 6
-
-    container:
-      image: quay.io/kerer/pytorch:latest
-      options: --user root
-
-    steps:
-      - name: Checkout scripts
-        uses: actions/checkout@v4
-        with:
-          sparse-checkout: |
-            .github/scripts
-            .ci/docker
-
-      # ==================== pip 缓存配置 ====================
-      - name: Get pip cache key
-        id: pip_key
-        run: |
-          REQUIREMENTS_HASH=$(sha256sum .ci/docker/requirements-ci.txt | cut -d' ' -f1)
-          echo "cache_key=${{ env.CACHE_VERSION }}-pip-test-${{ env.PYTHON_VERSION }}-${REQUIREMENTS_HASH}" >> $GITHUB_OUTPUT
-
-      - name: Restore pip cache
-        uses: actions/cache/restore@v4
-        with:
-          path: ~/.cache/pip
-          key: ${{ steps.pip_key.outputs.cache_key }}
-          restore-keys: |
-            ${{ env.CACHE_VERSION }}-pip-test-${{ env.PYTHON_VERSION }}-
-            ${{ env.CACHE_VERSION }}-pip-test-
-
-      - name: Setup pip cache directory
-        run: |
-          mkdir -p ~/.cache/pip
-
-      - name: Configure pip index URL
-        run: |
-          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
-            pip${{ env.PYTHON_VERSION }} config set global.index-url ${{ env.PYPI_CACHE_URL }}
-            pip${{ env.PYTHON_VERSION }} config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local"
-            echo "pip index-url configured: ${{ env.PYPI_CACHE_URL }}"
-          else
-            echo "No PyPI cache URL configured, using default"
-          fi
-
-      - name: Upgrade pip and setuptools
-        run: |
-          pip${{ env.PYTHON_VERSION }} install --upgrade pip setuptools wheel
-
-      # ==================== Download artifacts ====================
-      - name: Download torch wheel from workflow run
-        uses: dawidd6/action-download-artifact@v3
-        with:
-          run_id: 25473829132
-          name: torch-wheel-main
-          path: wheels
-          repo: Ascend/pytorch
-          github_token: ${{ secrets.GITHUB_TOKEN }}
-          skip_unpack: false
-
-      - name: Download torch_npu wheel from workflow run
-        uses: dawidd6/action-download-artifact@v3
-        with:
-          run_id: 25473829132
-          name: torch-npu-wheel-main
-          path: wheels
-          repo: Ascend/pytorch
-          github_token: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Download PyTorch source from GitHub
-        run: |
-          PYTORCH_COMMIT=fccc94ae83f61fe26559abc999797297196bac29
-          PYTORCH_REPO=pytorch/pytorch
-          PROXY_URL=https://gh-proxy.test.osinfra.cn
-
-          echo "Downloading PyTorch source from commit: ${PYTORCH_COMMIT}"
-          echo "Using proxy: ${PROXY_URL}"
-
-          # Download source tarball via proxy (no submodules needed for testing)
-          curl -L "${PROXY_URL}/https://github.com/${PYTORCH_REPO}/archive/${PYTORCH_COMMIT}.tar.gz" \
-            -o pytorch-src.tar.gz
-
-          echo "Downloaded pytorch-src.tar.gz:"
-          ls -la pytorch-src.tar.gz
-
-      - name: Extract PyTorch source
-        run: |
-          # Extract source tarball
-          tar -xzf pytorch-src.tar.gz
-
-          # Rename directory to pytorch-src (tarball extracts as pytorch-fccc94ae...)
-          mv pytorch-${PYTORCH_COMMIT:-fccc94ae83f61fe26559abc999797297196bac29} pytorch-src 2>/dev/null || \
-            mv pytorch-pytorch-${PYTORCH_COMMIT:-fccc94ae83f61fe26559abc999797297196bac29} pytorch-src 2>/dev/null || \
-            true
-
-          # If still not renamed, find and rename the extracted directory
-          if [ ! -d pytorch-src ]; then
-            EXTRACTED_DIR=$(find . -maxdepth 1 -type d -name "pytorch-*" | head -1)
-            if [ -n "$EXTRACTED_DIR" ]; then
-              mv "$EXTRACTED_DIR" pytorch-src
-            fi
-          fi
-
-          echo "Extracted to pytorch-src directory:"
-          ls -la pytorch-src/ | head -10
-
-      - name: Install wheels
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          pip${{ env.PYTHON_VERSION }} install wheels/*.whl
-
-      - name: Install test dependencies
-        run: |
-          # 安装当前仓库的基础测试依赖
-          pip${{ env.PYTHON_VERSION }} install -r .ci/docker/requirements-ci.txt
-
-          # 安装上游 PyTorch 的完整测试依赖（需要先安装 torch）
-          pip${{ env.PYTHON_VERSION }} install -r pytorch-src/.ci/docker/requirements-ci.txt || true
-
-          # 安装 PyTorch 开发依赖（可选）
-          pip${{ env.PYTHON_VERSION }} install expecttest hypothesis fsspec sympy optree networkx psutil || true
-
-      - name: Verify NPU availability
-        continue-on-error: true
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-          echo "=== NPU-SMI Info ==="
-          npu-smi info || echo "npu-smi not available"
-
-          PYTHON=python${{ env.PYTHON_VERSION }}
-          echo ""
-          echo "=== PyTorch and NPU Info ==="
-          $PYTHON -c "
-          import torch
-          print(f'torch: {torch.__version__}')
-          import torch_npu
-          print(f'torch_npu: {torch_npu.__version__}')
-          print(f'NPU available: {torch.npu.is_available()}')
-          print(f'NPU count: {torch.npu.device_count()}')
-          " 2>&1 || echo "torch/torch_npu import failed"
-
-      # ==================== 保存 pip 缓存 ====================
-      - name: Save pip cache
-        if: always()
-        uses: actions/cache/save@v4
-        with:
-          path: ~/.cache/pip
-          key: ${{ steps.pip_key.outputs.cache_key }}
-
-      # ==================== 运行分片测试 ====================
-      - name: Download shard JSON
-        uses: actions/download-artifact@v4
-        with:
-          name: test-shards
-          path: shards
-
-      - name: Verify shard files
-        run: |
-          echo "=== Checking shard files ==="
-          ls -la shards/
-          echo ""
-          echo "=== Shard ${{ matrix.shard }} content preview ==="
-          cat shards/shard_${{ matrix.shard }}.json | head -20
-
-      - name: Run shard ${{ matrix.shard }} tests
-        timeout-minutes: 1200
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-          echo "=== Running Shard ${{ matrix.shard }} of 6 ==="
-          echo "=== NPU-SMI Info ==="
-          npu-smi info || echo "npu-smi not available"
-
-          # 从 JSON 文件获取测试列表（在工作目录根目录）
-          TEST_LIST=$(cat shards/shard_${{ matrix.shard }}.json | python${{ env.PYTHON_VERSION }} -c "import sys, json; data = json.load(sys.stdin); print('\n'.join(data['tests']))")
-
-          # 保存到临时文件
-          echo "$TEST_LIST" > /tmp/test_list_shard_${{ matrix.shard }}.txt
-
-          TEST_COUNT_IN_SHARD=$(wc -l < /tmp/test_list_shard_${{ matrix.shard }}.txt)
-          echo "=== Shard ${{ matrix.shard }} contains ${TEST_COUNT_IN_SHARD} tests ==="
-          head -10 /tmp/test_list_shard_${{ matrix.shard }}.txt
-          if [ $TEST_COUNT_IN_SHARD -gt 10 ]; then
-            echo "... and $(($TEST_COUNT_IN_SHARD - 10)) more tests"
-          fi
-
-          # 切换到 test 目录执行，避免根目录 torch 与安装的 torch 库冲突
-          cd pytorch-src/test
-
-          # 开始执行测试
-          echo "=== Starting test execution ==="
-          TEST_COUNT=0
-          PASSED=0
-          FAILED=0
-          CRASHED=0
-          START_TIME=$(date +%s)
-
-          while IFS= read -r test_name; do
-            if [ -z "$test_name" ]; then
-              continue
-            fi
-
-            TEST_COUNT=$((TEST_COUNT + 1))
-            TEST_START_TIME=$(date +%s)
-            echo ""
-            echo "============================================"
-            echo "Test ${TEST_COUNT}/${TEST_COUNT_IN_SHARD}: ${test_name}"
-            echo "============================================"
-
-            # 运行测试，捕获输出到文件
-            set +e
-            python${{ env.PYTHON_VERSION }} run_test.py -i "${test_name}" -v > /tmp/test_output_${{ matrix.shard }}_${TEST_COUNT}.log 2>&1
-            TEST_EXIT_CODE=$?
-            set -e
-
-            TEST_END_TIME=$(date +%s)
-            TEST_DURATION=$((TEST_END_TIME - TEST_START_TIME))
-
-            # 检查结果（区分失败和崩溃）
-            # exit code >= 128 表示被信号杀死（如 137=OOM, 139=segfault）
-            if [ $TEST_EXIT_CODE -eq 0 ]; then
-              echo ">>> ${test_name}: PASSED (${TEST_DURATION}s)"
-              PASSED=$((PASSED + 1))
-            elif [ $TEST_EXIT_CODE -ge 128 ]; then
-              echo ">>> ${test_name}: CRASHED (exit code: ${TEST_EXIT_CODE}, signal: $(($TEST_EXIT_CODE - 128)), ${TEST_DURATION}s)"
-              CRASHED=$((CRASHED + 1))
-              echo "--- Last 20 lines of log ---"
-              tail -20 /tmp/test_output_${{ matrix.shard }}_${TEST_COUNT}.log || echo "(log file may be empty)"
-              echo "--- End of log preview ---"
-            else
-              echo ">>> ${test_name}: FAILED (exit code: ${TEST_EXIT_CODE}, ${TEST_DURATION}s)"
-              FAILED=$((FAILED + 1))
-              echo "--- Last 20 lines of log ---"
-              tail -20 /tmp/test_output_${{ matrix.shard }}_${TEST_COUNT}.log || echo "(log file may be empty)"
-              echo "--- End of log preview ---"
-            fi
-
-            CURRENT_TIME=$(date +%s)
-            TOTAL_DURATION=$((CURRENT_TIME - START_TIME))
-            echo ""
-            echo "--------------------------------------------"
-            echo "Shard ${{ matrix.shard }} Summary: ${PASSED} passed, ${FAILED} failed, ${CRASHED} crashed, ${TEST_COUNT}/${TEST_COUNT_IN_SHARD} done"
-            echo "Elapsed: ${TOTAL_DURATION}s ($(date -d "@${START_TIME}" +%H:%M:%S) - $(date +%H:%M:%S))"
-            echo "--------------------------------------------"
-
-          done < /tmp/test_list_shard_${{ matrix.shard }}.txt
-
-          END_TIME=$(date +%s)
-          TOTAL_DURATION=$((END_TIME - START_TIME))
-
-          echo ""
-          echo "============================================"
-          echo "=== SHARD ${{ matrix.shard }} FINAL SUMMARY ==="
-          echo "Total tests run: ${TEST_COUNT}"
-          echo "Passed: ${PASSED}"
-          echo "Failed: ${FAILED}"
-          echo "Crashed: ${CRASHED}"
-          echo "Total time: ${TOTAL_DURATION}s ($(date -d "@${START_TIME}" +%H:%M:%S) - $(date -d "@${END_TIME}" +%H:%M:%S))"
-          echo "============================================"
-
-          # 保存结果摘要（包含时间信息）
-          echo "{\"shard_id\": ${{ matrix.shard }}, \"total\": ${TEST_COUNT}, \"passed\": ${PASSED}, \"failed\": ${FAILED}, \"crashed\": ${CRASHED}, \"duration_seconds\": ${TOTAL_DURATION}}" > /tmp/test_results_summary_shard_${{ matrix.shard }}.json
-
-      - name: Upload test results for shard ${{ matrix.shard }}
-        if: always()
-        uses: actions/upload-artifact@v4
-        with:
-          name: test-results-shard-${{ matrix.shard }}
-          path: |
-            /tmp/test_output_${{ matrix.shard }}_*.log
-            /tmp/test_results_summary_shard_${{ matrix.shard }}.json
-          retention-days: 7
-
-      - name: Print shard summary
-        if: always()
-        run: |
-          echo "=== Shard ${{ matrix.shard }} Summary ==="
-          if [ -f /tmp/test_results_summary_shard_${{ matrix.shard }}.json ]; then
-            cat /tmp/test_results_summary_shard_${{ matrix.shard }}.json
-          fi
diff --git a/.github/workflows/verify-docker-image.yml b/.github/workflows/verify-docker-image.yml
deleted file mode 100644
index 5b29d15a36..0000000000
--- a/.github/workflows/verify-docker-image.yml
+++ /dev/null
@@ -1,60 +0,0 @@
-name: Verify Docker Image Pull
-
-on:
-  push:
-    branches: [dev_master]
-    paths:
-      - '.github/workflows/verify-docker-image.yml'
-  workflow_dispatch:
-    inputs:
-      docker_image:
-        description: 'Docker image to verify'
-        default: 'quay.io/kerer/pytorch:manylinux-cann9.0.0-beta.2-20260428'
-        type: string
-
-env:
-  DEFAULT_IMAGE: 'quay.io/kerer/pytorch:manylinux-cann9.0.0-beta.2-20260428'
-
-jobs:
-  verify-image:
-    runs-on: ubuntu-22.04-arm
-    timeout-minutes: 30
-
-    steps:
-      - name: Set image name
-        id: image
-        run: |
-          IMAGE="${{ inputs.docker_image || env.DEFAULT_IMAGE }}"
-          echo "image=${IMAGE}" >> $GITHUB_OUTPUT
-          echo "Using image: ${IMAGE}"
-
-      - name: Pull Docker image
-        run: |
-          docker pull ${{ steps.image.outputs.image }}
-
-      - name: Verify image exists
-        run: |
-          docker images ${{ steps.image.outputs.image }}
-
-      - name: Check CANN environment
-        run: |
-          docker run --rm ${{ steps.image.outputs.image }} \
-            bash -c "source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true && \
-                     echo 'CANN_PATH: $CANN_PATH' && \
-                     ls -la /usr/local/Ascend/cann/ | head -20"
-
-      - name: Check Python version
-        run: |
-          docker run --rm ${{ steps.image.outputs.image }} \
-            bash -c "python3.11 --version"
-
-      - name: Verify CANN toolkit installation
-        run: |
-          docker run --rm ${{ steps.image.outputs.image }} \
-            bash -c "ls -la /usr/local/Ascend/ && \
-                     cat /usr/local/Ascend/cann/version.info 2>/dev/null || echo 'version.info not found'"
-
-      - name: Clean up
-        if: always()
-        run: |
-          docker rmi ${{ steps.image.outputs.image }} 2>/dev/null || true
\ No newline at end of file

From 95ecec61a0c916bc5233ffee9a5827713f8f370f Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Sat, 9 May 2026 17:57:15 +0800
Subject: [PATCH 108/132] Remove remaining deprecated files

- Remove BUILD_FLOW_CHECK.md documentation
- Remove _collect.yml workflow (replaced by _torch-npu-upstream-collect.yml)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .github/scripts/BUILD_FLOW_CHECK.md | 382 ----------------------------
 .github/workflows/_collect.yml      | 217 ----------------
 .github/workflows/_test.yml         | 142 -----------
 3 files changed, 741 deletions(-)
 delete mode 100644 .github/scripts/BUILD_FLOW_CHECK.md
 delete mode 100644 .github/workflows/_collect.yml
 delete mode 100644 .github/workflows/_test.yml

diff --git a/.github/scripts/BUILD_FLOW_CHECK.md b/.github/scripts/BUILD_FLOW_CHECK.md
deleted file mode 100644
index 842fb9b02c..0000000000
--- a/.github/scripts/BUILD_FLOW_CHECK.md
+++ /dev/null
@@ -1,382 +0,0 @@
-# Docker 镜像构建流程问题检查与修复报告
-
-## 发现的问题
-
-### 问题 1：定时构建和 push 触发时不推送镜像 ❌❌❌
-
-**严重级别**：P0 - Critical
-
-**问题描述**：
-
-原 workflow 使用条件表达式：
-```yaml
-${{ inputs.push_image && '--push' || '' }}
-```
-
-当触发方式为 `push` 或 `schedule` 时：
-- `inputs.push_image` 为空（undefined）
-- 条件表达式返回空字符串
-- **结果：定时构建和 push 触发时不推送镜像到远端！**
-
-**影响**：
-- 定时构建的镜像无法自动推送到 registry
-- 需要手动触发才能推送镜像
-
----
-
-### 问题 2：镜像推送时机设计不合理 ⚠️
-
-**原设计**：
-- 手动触发：根据 inputs.push_image 决定是否推送
-- push/schedule 触发：不推送 ❌
-
-**PyTorch 上游参考**：
-```yaml
-- name: Push to ghcr.io
-  if: ${{ github.event_name == 'push' }}
-```
-
-PyTorch 只在 push 到 main 分支时推送镜像。
-
-**正确逻辑**：
-- workflow_dispatch：根据 inputs.push_image 决定 ✅
-- push（分支推送）：默认推送 ✅
-- schedule（定时构建）：默认推送 ✅
-- pull_request：不推送 ✅
-
----
-
-### 问题 3：Summary 中版本提取逻辑错误 ⚠️
-
-**原代码**：
-```bash
-CANN_MAJOR=$(echo "${{ inputs.cann_version || env.CANN_STABLE }}" | sed 's/-beta.*//' | sed 's/-rc.*//' | sed 's/\.[0-9]*$//')
-```
-
-**问题**：
-对于简化版本 `9.0`：
-- `sed 's/\.[0-9]*$//'` 删除最后的 `.数字`
-- `9.0` → 删除 `.0` → 结果是 `9` ❌
-- 应该保持 `9.0`
-
-**正确逻辑**：
-```bash
-if [[ "$CANN_INPUT" =~ ^[0-9]+\.[0-9]+$ ]]; then
-  CANN_MAJOR="$CANN_INPUT"  # 简化版本直接使用
-else
-  CANN_MAJOR=$(echo "$CANN_INPUT" | grep -oP '^[0-9]+\.[0-9]+')  # 提取前两位
-fi
-```
-
----
-
-### 问题 4：双重登录导致冗余 ⚠️
-
-**原流程**：
-```
-Workflow: docker/login-action 登录
-Script: login_registry 再次登录
-```
-
-虽然不会出错，但浪费时间，且可能导致登录状态混乱。
-
----
-
-### 问题 5：登录时机不优化 ⚠️
-
-**原执行顺序**：
-```
-parse_args → check_dependencies → parse_cann_version → login_registry → build_image
-```
-
-**问题**：
-如果镜像已存在，build_image 会跳过构建，但 login_registry 已经执行了登录。
-
-**优化**：
-将登录移到确认需要构建之后：
-```
-parse_args → check_dependencies → parse_cann_version → build_image
-                                           ↓
-                                      检查镜像是否存在
-                                           ↓
-                                      确认需要构建 → login_registry → 构建
-```
-
----
-
-## 修复方案
-
-### 修复 1：添加参数确定步骤 ✅
-
-```yaml
-- name: Determine build parameters
-  id: params
-  run: |
-    # 确定是否推送镜像
-    if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
-      PUSH_IMAGE="${{ inputs.push_image }}"
-    elif [[ "${{ github.event_name }}" == "push" || "${{ github.event_name }}" == "schedule" ]]; then
-      PUSH_IMAGE="true"
-    else
-      PUSH_IMAGE="false"
-    fi
-
-    echo "push_image=${PUSH_IMAGE}" >> $GITHUB_OUTPUT
-```
-
-**效果**：
-- workflow_dispatch：根据用户输入决定
-- push/schedule：默认推送 ✅
-- 其他情况：不推送
-
----
-
-### 修复 2：正确配置 login-action ✅
-
-```yaml
-- name: Login to Quay.io
-  if: ${{ steps.params.outputs.push_image == 'true' }}
-  uses: docker/login-action@v3
-  with:
-    registry: ${{ env.REGISTRY }}
-    username: ${{ secrets.QUAY_USERNAME }}
-    password: ${{ secrets.QUAY_PASSWORD }}
-```
-
-**效果**：
-- 只在需要推送时登录
-- 使用官方推荐的 login-action
-- 更安全地处理 secrets
-
----
-
-### 修复 3：优化脚本登录逻辑 ✅
-
-```bash
-# 在 build_image 函数中
-# 先检查镜像是否存在
-if [[ "$FORCE_BUILD" == "false" && "$PUSH_IMAGE" == "true" ]]; then
-    if docker pull "${REGISTRY}/${QUAY_ORG}/${IMAGE_NAME}:${first_tag}" &>/dev/null; then
-        log_info "镜像已存在，跳过构建"
-        return 0
-    fi
-fi
-
-# 确认需要构建，执行登录
-if [[ "$PUSH_IMAGE" == "true" ]]; then
-    if [[ "${SKIP_DOCKER_LOGIN:-false}" != "true" ]]; then
-        login_registry
-    fi
-fi
-```
-
-**效果**：
-- 先检查镜像是否存在，避免不必要的登录
-- 支持 SKIP_DOCKER_LOGIN 环境变量（用于 CI）
-- 本地使用仍然会自动登录
-
----
-
-### 修复 4：添加 SKIP_DOCKER_LOGIN 环境变量 ✅
-
-**Workflow 中**：
-```yaml
-- name: Build and push image
-  env:
-    SKIP_DOCKER_LOGIN: true  # 已通过 login-action 登录
-```
-
-**效果**：
-- 避免 workflow 中双重登录
-- 脚本检测到此变量后跳过登录
-
----
-
-### 修复 5：修正 Summary 版本提取 ✅
-
-```bash
-CANN_INPUT="${{ steps.params.outputs.cann_version }}"
-
-if [[ "$CANN_INPUT" =~ ^[0-9]+\.[0-9]+$ ]]; then
-  CANN_MAJOR="$CANN_INPUT"  # 简化版本直接使用
-else
-  CANN_MAJOR=$(echo "$CANN_INPUT" | grep -oP '^[0-9]+\.[0-9]+')
-fi
-```
-
-**效果**：
-- `9.0` → 保持为 `9.0` ✅
-- `9.0.0-beta.2` → 提取为 `9.0` ✅
-- `8.0` → 保持为 `8.0` ✅
-
----
-
-## 修复后的完整流程
-
-### Workflow 流程
-
-```
-┌─────────────────────────────────────────────────────────────┐
-│  Step 1: Determine build parameters                         │
-│  ├─ 判断 event_name 类型                                     │
-│  ├─ 确定 push_image (workflow_dispatch → inputs.push_image) │
-│  │                    (push/schedule → true)                │
-│  ├─ 确定 force_build                                         │
-│  ├─ 确定 cann_version                                        │
-│  └─ 输出到 GITHUB_OUTPUT                                     │
-└─────────────────────────────────────────────────────────────┘
-                           ↓
-┌─────────────────────────────────────────────────────────────┐
-│  Step 2: Setup Docker Buildx                                │
-│  ├─ 配置 docker-container driver                            │
-│  └─ 使用 moby/buildkit:latest                               │
-└─────────────────────────────────────────────────────────────┘
-                           ↓
-┌─────────────────────────────────────────────────────────────┐
-│  Step 3: Login to Quay.io (conditional)                     │
-│  ├─ if: push_image == 'true'                                │
-│  ├─ 使用 docker/login-action                                │
-│  └─ 处理 secrets 安全                                        │
-└─────────────────────────────────────────────────────────────┘
-                           ↓ (如果需要推送)
-┌─────────────────────────────────────────────────────────────┐
-│  Step 4: Build and push image                               │
-│  ├─ 设置 SKIP_DOCKER_LOGIN=true                             │
-│  ├─ 调用 build_image.sh                                     │
-│  ├─ 根据 push_image 添加 --push 参数                        │
-│  └─ 根据 force_build 添加 --force 参数                      │
-└─────────────────────────────────────────────────────────────┘
-                           ↓
-┌─────────────────────────────────────────────────────────────┐
-│  Step 5: Summary (always)                                   │
-│  ├─ 生成构建报告                                             │
-│  ├─ 显示镜像标签                                             │
-│  └─ 显示 Python 切换方法                                     │
-└─────────────────────────────────────────────────────────────┘
-```
-
-### 脚本内部流程
-
-```
-┌─────────────────────────────────────────────────────────────┐
-│  1. parse_args                                              │
-│  ├─ 解析命令行参数                                           │
-│  └─ 设置 REGISTRY、QUAY_ORG 等                              │
-└─────────────────────────────────────────────────────────────┘
-                           ↓
-┌─────────────────────────────────────────────────────────────┐
-│  2. check_dependencies                                      │
-│  ├─ 检查 docker 是否安装                                     │
-│  └─ 检查 docker buildx                                      │
-└─────────────────────────────────────────────────────────────┘
-                           ↓
-┌─────────────────────────────────────────────────────────────┐
-│  3. parse_cann_version                                      │
-│  ├─ 从版本映射表查找 URL                                     │
-│  ├─ 提取完整版本和大版本                                     │
-│  └─ 判断是否为 stable                                        │
-└─────────────────────────────────────────────────────────────┘
-                           ↓
-┌─────────────────────────────────────────────────────────────┐
-│  4. build_image                                             │
-│  ├─ 生成镜像标签                                             │
-│  ├─ 检查镜像是否已存在（如果 force=false && push=true）      │
-│  │  └─ 如果存在 → return 0                                  │
-│  ├─ 确认需要构建 → 登录（如果 SKIP_DOCKER_LOGIN != true）    │
-│  ├─ 执行 docker buildx build                                │
-│  │  ├─ --build-arg 传递 URL                                 │
-│  │  ├─ --tag 添加多个标签                                    │
-│  │  └─ --push 或 --load                                     │
-│  └─ 输出构建信息                                             │
-└─────────────────────────────────────────────────────────────┘
-```
-
----
-
-## 触发场景对比表
-
-| 触发方式 | CANN 版本 | Push Image | Force Build | 登录 | 推送 |
-|---------|-----------|------------|-------------|------|------|
-| workflow_dispatch (默认) | 9.0 | true | false | ✅ | ✅ |
-| workflow_dispatch (--push=false) | 9.0 | false | false | ❌ | ❌ |
-| workflow_dispatch (--force) | 9.0 | true | true | ✅ | ✅ |
-| push (dev_master) | 9.0 (stable) | true | false | ✅ | ✅ |
-| schedule (周日) | 9.0 (stable) | true | false | ✅ | ✅ |
-| pull_request | - | - | - | ❌ | ❌ |
-
----
-
-## 验证清单
-
-### ✅ 已验证
-
-1. ✅ Workflow YAML 格式正确
-2. ✅ Shell 脚本语法正确
-3. ✅ 参数传递逻辑正确
-4. ✅ 登录时机优化正确
-5. ✅ SKIP_DOCKER_LOGIN 机制正确
-6. ✅ 版本提取逻辑正确
-7. ✅ 条件判断逻辑正确
-
-### 🔄 需要在实际运行中验证
-
-1. 🔄 定时构建是否正确推送镜像
-2. 🔄 push 触发是否正确推送镜像
-3. 🔄 镜像已存在时是否正确跳过构建
-4. 🔄 Docker buildx --push 是否成功推送所有标签
-5. 🔄 Quay.io 登录是否成功
-6. 🔄 镜像标签是否正确生成和推送
-
----
-
-## 后续建议
-
-### 建议 1：添加构建失败通知
-
-```yaml
-- name: Notify on failure
-  if: failure()
-  run: |
-    # 可以集成 Slack/Email 通知
-    echo "::error::Build failed for CANN version ${{ steps.params.outputs.cann_version }}"
-```
-
-### 建议 2：添加镜像验证步骤
-
-```yaml
-- name: Verify pushed image
-  if: ${{ steps.params.outputs.push_image == 'true' }}
-  run: |
-    docker pull "${{ env.REGISTRY }}/${{ env.QUAY_ORG }}/${{ env.IMAGE_NAME }}:cann${{ steps.params.outputs.cann_version }}"
-    docker inspect --format='{{.Config.Env}}' "${{ env.REGISTRY }}/${{ env.QUAY_ORG }}/${{ env.IMAGE_NAME }}:cann${{ steps.params.outputs.cann_version }}"
-```
-
-### 建议 3：添加构建缓存
-
-```yaml
-- name: Build and push image
-  uses: docker/build-push-action@v5
-  with:
-    context: .github/docker
-    file: .github/docker/pytorch-npu-builder.Dockerfile
-    push: ${{ steps.params.outputs.push_image == 'true' }}
-    tags: |
-      ${{ env.REGISTRY }}/${{ env.QUAY_ORG }}/${{ env.IMAGE_NAME }}:cann${{ steps.params.outputs.cann_version }}
-    cache-from: type=gha
-    cache-to: type=gha,mode=max
-```
-
----
-
-## 修复文件列表
-
-| 文件 | 修改内容 |
-|------|---------|
-| `.github/workflows/build-docker-image.yml` | ✅ 添加参数确定步骤<br>✅ 修正 login-action 条件<br>✅ 添加 SKIP_DOCKER_LOGIN<br>✅ 修正 Summary 版本提取 |
-| `.github/scripts/build_image.sh` | ✅ 优化登录时机<br>✅ 添加 SKIP_DOCKER_LOGIN 支持<br>✅ 移除主函数中的 login_registry |
-
----
-
-**检查时间**: 2026-05-06
-**检查人**: Claude Code
-**状态**: ✅ 所有问题已修复，等待实际运行验证
\ No newline at end of file
diff --git a/.github/workflows/_collect.yml b/.github/workflows/_collect.yml
deleted file mode 100644
index d2abb6cc97..0000000000
--- a/.github/workflows/_collect.yml
+++ /dev/null
@@ -1,217 +0,0 @@
-name: Collect Test Cases
-
-on:
-  workflow_call:
-    inputs:
-      docker-image:
-        required: true
-        type: string
-      distributed-shards:
-        required: false
-        type: number
-        default: 2
-      regular-shards:
-        required: false
-        type: number
-        default: 5
-    outputs:
-      distributed_matrix:
-        description: 'Distributed shards matrix'
-        value: ${{ jobs.collect.outputs.distributed_matrix }}
-      regular_matrix:
-        description: 'Regular shards matrix'
-        value: ${{ jobs.collect.outputs.regular_matrix }}
-      total_cases:
-        description: 'Total test cases count'
-        value: ${{ jobs.collect.outputs.total_cases }}
-
-env:
-  PYTHON_VERSION: '3.11'
-  CACHE_VERSION: 'v2'
-  # PyPI 缓存 URL（用于加速 pip 下载）
-  PYPI_CACHE_URL: 'http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple'
-
-jobs:
-  collect:
-    runs-on: linux-aarch64-a3-16
-    timeout-minutes: 60
-    outputs:
-      distributed_matrix: ${{ steps.collect.outputs.distributed_matrix }}
-      regular_matrix: ${{ steps.collect.outputs.regular_matrix }}
-      total_cases: ${{ steps.collect.outputs.total_cases }}
-
-    container:
-      image: ${{ inputs.docker-image }}
-      options: --user root
-
-    steps:
-      - name: Checkout scripts
-        uses: actions/checkout@v4
-        with:
-          sparse-checkout: |
-            .github/scripts
-            .ci/docker
-
-      # ==================== pip 缓存配置 ====================
-      - name: Get pip cache key
-        id: pip_key
-        run: |
-          REQUIREMENTS_HASH=$(sha256sum .ci/docker/requirements-ci.txt | cut -d' ' -f1)
-          echo "cache_key=${{ env.CACHE_VERSION }}-pip-collect-${{ env.PYTHON_VERSION }}-${REQUIREMENTS_HASH}" >> $GITHUB_OUTPUT
-
-      - name: Restore pip cache
-        uses: actions/cache/restore@v4
-        with:
-          path: ~/.cache/pip
-          key: ${{ steps.pip_key.outputs.cache_key }}
-          restore-keys: |
-            ${{ env.CACHE_VERSION }}-pip-collect-${{ env.PYTHON_VERSION }}-
-            ${{ env.CACHE_VERSION }}-pip-collect-
-
-      - name: Setup pip cache directory
-        run: |
-          mkdir -p ~/.cache/pip
-
-      - name: Configure pip index URL
-        run: |
-          # 配置 pip 使用 PyPI 缓存加速下载
-          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
-            pip${{ env.PYTHON_VERSION }} config set global.index-url ${{ env.PYPI_CACHE_URL }}
-            pip${{ env.PYTHON_VERSION }} config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local"
-            echo "pip index-url configured: ${{ env.PYPI_CACHE_URL }}"
-          else
-            echo "No PyPI cache URL configured, using default"
-          fi
-
-      - name: Upgrade pip and setuptools
-        run: |
-          # 先升级 pip 和 setuptools，避免旧版包兼容性问题
-          pip${{ env.PYTHON_VERSION }} install --upgrade pip setuptools wheel
-
-      - name: Download wheels
-        uses: actions/download-artifact@v4
-        with:
-          name: torch-wheel-main
-          path: wheels
-
-      - name: Download torch_npu wheel
-        uses: actions/download-artifact@v4
-        with:
-          name: torch-npu-wheel-main
-          path: wheels
-
-      - name: Download PyTorch source and build artifacts
-        uses: actions/download-artifact@v4
-        with:
-          name: pytorch-src-main
-
-      - name: Extract PyTorch source
-        run: tar -xzf pytorch-src.tar.gz
-
-      - name: Install wheels
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          pip${{ env.PYTHON_VERSION }} install wheels/*.whl
-
-      - name: Install test dependencies
-        run: |
-          # 安装当前仓库的基础测试依赖
-          pip${{ env.PYTHON_VERSION }} install -r .ci/docker/requirements-ci.txt
-
-          # 安装上游 PyTorch 的完整测试依赖（从已下载的源码）
-          pip${{ env.PYTHON_VERSION }} install -r pytorch-src/.ci/docker/requirements-ci.txt || true
-
-          # 安装 PyTorch 开发依赖（可选）
-          pip${{ env.PYTHON_VERSION }} install expecttest hypothesis fsspec sympy optree networkx psutil || true
-
-      # ==================== Verify NPU availability ====================
-      - name: Verify NPU availability
-        continue-on-error: true
-        run: |
-          # 切换到 /tmp 目录，避免 pytorch-src 源码目录干扰 torch 导入
-          cd /tmp
-
-          echo "=== CANN Directory Contents ==="
-          ls -la /usr/local/Ascend/ || echo "Ascend directory not found"
-          echo ""
-          echo "=== CANN Version Info ==="
-          if [ -d /usr/local/Ascend/cann ]; then
-            ls -la /usr/local/Ascend/cann/
-            echo ""
-            echo "=== CANN Version File ==="
-            cat /usr/local/Ascend/cann/version.info 2>/dev/null || echo "version.info not found"
-          else
-            echo "CANN directory not found"
-          fi
-          echo ""
-          echo "=== NNAL Directory Contents ==="
-          if [ -d /usr/local/Ascend/nnal ]; then
-            ls -la /usr/local/Ascend/nnal/
-          else
-            echo "NNAL directory not found"
-          fi
-
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-          echo ""
-          echo "=== NPU-SMI Info ==="
-          npu-smi info || echo "npu-smi not available"
-
-          PYTHON=python${{ env.PYTHON_VERSION }}
-          echo ""
-          echo "=== PyTorch and NPU Info ==="
-          $PYTHON -c "
-          import torch
-          print(f'torch: {torch.__version__}')
-          import torch_npu
-          print(f'torch_npu: {torch_npu.__version__}')
-          print(f'NPU available: {torch.npu.is_available()}')
-          print(f'NPU count: {torch.npu.device_count()}')
-          if torch.npu.is_available():
-              print(f'NPU name: {torch.npu.get_device_name(0)}')
-              print(f'NPU capability: {torch.npu.get_device_capability(0)}')
-          " 2>&1 || echo "torch/torch_npu import failed - this is expected if torch and torch_npu versions are mismatched"
-
-      # ==================== 保存 pip 缓存 ====================
-      - name: Save pip cache
-        if: always()
-        uses: actions/cache/save@v4
-        with:
-          path: ~/.cache/pip
-          key: ${{ steps.pip_key.outputs.cache_key }}
-
-      - name: Collect all test cases
-        id: collect
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-          echo "=== NPU-SMI Info ==="
-          npu-smi info || echo "npu-smi not available"
-
-          python${{ env.PYTHON_VERSION }} .github/scripts/collect_all_cases.py \
-            --test-dir pytorch-src/test \
-            --distributed-shards ${{ inputs.distributed-shards }} \
-            --regular-shards ${{ inputs.regular-shards }} \
-            --output-dir cases_shards \
-            --parallel 16 \
-            --verbose
-
-          TOTAL=$(python${{ env.PYTHON_VERSION }} -c "import json; d=json.load(open('cases_shards/cases_collection_summary.json')); print(d['total_cases'])")
-          echo "total_cases=${TOTAL}" >> $GITHUB_OUTPUT
-
-          # Generate shard matrices
-          python${{ env.PYTHON_VERSION }} -c "
-          import json
-          d_shards = ${{ inputs.distributed-shards }}
-          r_shards = ${{ inputs.regular-shards }}
-          print(f'distributed_matrix={json.dumps(list(range(1, d_shards + 1)))}')
-          print(f'regular_matrix={json.dumps(list(range(1, r_shards + 1)))}')
-          " >> $GITHUB_OUTPUT
-
-      - name: Upload cases shard JSONs
-        uses: actions/upload-artifact@v4
-        with:
-          name: cases-shards
-          path: cases_shards/
\ No newline at end of file
diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
deleted file mode 100644
index 91f61dec00..0000000000
--- a/.github/workflows/_test.yml
+++ /dev/null
@@ -1,142 +0,0 @@
-name: Run NPU Tests
-
-on:
-  workflow_call:
-    inputs:
-      docker-image:
-        required: true
-        type: string
-      test-type:
-        required: true
-        type: string
-      shard-index:
-        required: true
-        type: number
-      max-workers:
-        required: false
-        type: number
-        default: 1
-
-env:
-  PYTHON_VERSION: '3.11'
-  CACHE_VERSION: 'v2'
-  # PyPI 缓存 URL（用于加速 pip 下载）
-  PYPI_CACHE_URL: 'http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple'
-
-jobs:
-  test:
-    runs-on: linux-aarch64-a3-16
-    timeout-minutes: 1200
-
-    container:
-      image: ${{ inputs.docker-image }}
-      options: --user root
-
-    steps:
-      - name: Checkout scripts
-        uses: actions/checkout@v4
-        with:
-          sparse-checkout: |
-            .github/scripts
-            .ci/docker
-
-      # ==================== pip 缓存配置 ====================
-      # pip 缓存加速测试依赖下载
-      - name: Get pip cache key
-        id: pip_key
-        run: |
-          REQUIREMENTS_HASH=$(sha256sum .ci/docker/requirements-ci.txt | cut -d' ' -f1)
-          echo "cache_key=${{ env.CACHE_VERSION }}-pip-test-${{ env.PYTHON_VERSION }}-${REQUIREMENTS_HASH}" >> $GITHUB_OUTPUT
-
-      - name: Restore pip cache
-        uses: actions/cache/restore@v4
-        with:
-          path: ~/.cache/pip
-          key: ${{ steps.pip_key.outputs.cache_key }}
-          restore-keys: |
-            ${{ env.CACHE_VERSION }}-pip-test-${{ env.PYTHON_VERSION }}-
-            ${{ env.CACHE_VERSION }}-pip-test-
-
-      - name: Setup pip cache directory
-        run: |
-          mkdir -p ~/.cache/pip
-
-      - name: Configure pip index URL
-        run: |
-          # 配置 pip 使用 PyPI 缓存加速下载
-          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
-            pip${{ env.PYTHON_VERSION }} config set global.index-url ${{ env.PYPI_CACHE_URL }}
-            pip${{ env.PYTHON_VERSION }} config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local"
-            echo "pip index-url configured: ${{ env.PYPI_CACHE_URL }}"
-          else
-            echo "No PyPI cache URL configured, using default"
-          fi
-
-      - name: Upgrade pip and setuptools
-        run: |
-          # 先升级 pip 和 setuptools，避免旧版包兼容性问题
-          pip${{ env.PYTHON_VERSION }} install --upgrade pip setuptools wheel
-
-      - name: Download torch wheel
-        uses: actions/download-artifact@v4
-        with:
-          name: torch-wheel-main
-          path: wheels
-
-      - name: Download torch_npu wheel
-        uses: actions/download-artifact@v4
-        with:
-          name: torch-npu-wheel-main
-          path: wheels
-
-      - name: Download PyTorch source and build artifacts
-        uses: actions/download-artifact@v4
-        with:
-          name: pytorch-src-main
-
-      - name: Download cases shard
-        uses: actions/download-artifact@v4
-        with:
-          name: cases-shards
-          path: cases-shards
-
-      - name: Extract PyTorch source
-        run: tar -xzf pytorch-src.tar.gz
-
-      - name: Install wheels
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          pip${{ env.PYTHON_VERSION }} install wheels/*.whl
-
-      - name: Install test dependencies
-        run: |
-          pip${{ env.PYTHON_VERSION }} install -r .ci/docker/requirements-ci.txt
-
-      - name: Run ${{ inputs.test-type }} shard ${{ inputs.shard-index }}
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-          python .github/scripts/run_npu_test_shard.py \
-            --cases-json cases-shards/${{ inputs.test-type }}_cases_shard_${{ inputs.shard-index }}.json \
-            --test-dir pytorch-src/test \
-            --report-dir test-reports \
-            --timeout 300 \
-            --max-workers ${{ inputs.max-workers }} \
-            --verbose
-
-      # ==================== 保存 pip 缓存 ====================
-      - name: Save pip cache
-        if: always()
-        uses: actions/cache/save@v4
-        with:
-          path: ~/.cache/pip
-          key: ${{ steps.pip_key.outputs.cache_key }}
-
-      - name: Upload test reports
-        if: always()
-        uses: actions/upload-artifact@v4
-        with:
-          name: test-reports-${{ inputs.test-type }}-${{ inputs.shard-index }}
-          path: test-reports/
-          retention-days: 30
\ No newline at end of file

From e41da75c87dce0b196091d2178743b70013bda31 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Sat, 9 May 2026 19:42:29 +0800
Subject: [PATCH 109/132] Checkout torch_npu from current repository instead of
 hardcoded kerer-ai/pytorch

- Use github.repository to checkout the target repository (PR merge target)
- Use inputs.torch_npu_ref with fallback to github.ref
- Ensures submodules are downloaded for compilation

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .github/workflows/_torch-npu-upstream-build.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/_torch-npu-upstream-build.yml b/.github/workflows/_torch-npu-upstream-build.yml
index e34fcce8d5..be75ccb6e4 100644
--- a/.github/workflows/_torch-npu-upstream-build.yml
+++ b/.github/workflows/_torch-npu-upstream-build.yml
@@ -107,8 +107,8 @@ jobs:
       - name: Checkout torch_npu
         uses: actions/checkout@v4
         with:
-          repository: kerer-ai/pytorch
-          ref: ${{ inputs.torch_npu_ref }}
+          repository: ${{ github.repository }}
+          ref: ${{ inputs.torch_npu_ref || github.ref }}
           path: torch_npu-src
           submodules: recursive
 

From 0d3cd4a0632580fa45e5597e23195ca05f2ebdf9 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Sat, 9 May 2026 22:07:14 +0800
Subject: [PATCH 110/132] =?UTF-8?q?=E8=B0=83=E6=95=B4=E4=BE=9D=E8=B5=96?=
 =?UTF-8?q?=E4=B8=8B=E8=BD=BD=E9=A1=BA=E5=BA=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../workflows/_torch-npu-upstream-collect.yml | 26 +++++-----
 .../workflows/_torch-npu-upstream-prepare.yml | 49 -------------------
 .../_torch-npu-upstream-test-custom.yml       | 12 ++---
 .../_torch-npu-upstream-test-dist.yml         | 26 +++++-----
 .../_torch-npu-upstream-test-regular.yml      | 26 +++++-----
 5 files changed, 45 insertions(+), 94 deletions(-)
 delete mode 100644 .github/workflows/_torch-npu-upstream-prepare.yml

diff --git a/.github/workflows/_torch-npu-upstream-collect.yml b/.github/workflows/_torch-npu-upstream-collect.yml
index ede2996862..f5331f6abe 100644
--- a/.github/workflows/_torch-npu-upstream-collect.yml
+++ b/.github/workflows/_torch-npu-upstream-collect.yml
@@ -132,6 +132,19 @@ jobs:
           echo "torch: ${TORCH_WHL}"
           echo "torch_npu: ${TORCH_NPU_WHL}"
 
+      - name: Install test dependencies
+        run: |
+          PIP=pip${{ inputs.python_version }}
+          cd pytorch-src
+
+          $PIP install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
+          $PIP install pytest-rerunfailures pytest-flakefinder
+          $PIP install 'pytest-subtests==0.13.1' 'xdoctest==1.1.0' 'pulp>=2.9'
+
+          if [ -f requirements.txt ]; then
+            $PIP install -r requirements.txt || true
+          fi
+
       - name: Verify NPU availability
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
@@ -147,19 +160,6 @@ jobs:
           print(f'NPU count: {torch.npu.device_count()}')
           "
 
-      - name: Install test dependencies
-        run: |
-          PIP=pip${{ inputs.python_version }}
-          cd pytorch-src
-
-          $PIP install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
-          $PIP install pytest-rerunfailures pytest-flakefinder
-          $PIP install 'pytest-subtests==0.13.1' 'xdoctest==1.1.0' 'pulp>=2.9'
-
-          if [ -f requirements.txt ]; then
-            $PIP install -r requirements.txt || true
-          fi
-
       - name: Collect all test cases and shard
         id: collect_and_shard
         run: |
diff --git a/.github/workflows/_torch-npu-upstream-prepare.yml b/.github/workflows/_torch-npu-upstream-prepare.yml
deleted file mode 100644
index 69210a5570..0000000000
--- a/.github/workflows/_torch-npu-upstream-prepare.yml
+++ /dev/null
@@ -1,49 +0,0 @@
-name: Torch NPU Upstream Prepare
-
-on:
-  workflow_call:
-    inputs:
-      pytorch_version:
-        required: true
-        type: string
-        description: PyTorch version to clone for test source
-      prepared_test_src_artifact:
-        required: true
-        type: string
-        description: Name of the artifact for prepared test source
-    
-jobs:
-  prepare:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          repository: kerer-ai/pytorch
-          ref: v2.7.1_dev
-          fetch-depth: 1
-
-      - name: Clone PyTorch v${{ inputs.pytorch_version }} (for test source)
-        run: |
-          git clone --depth=1 --branch v${{ inputs.pytorch_version }} \
-            https://github.com/pytorch/pytorch.git pytorch-test-src
-
-      - name: Package prepared test source
-        run: |
-          tar -czf pytorch-test-src.tar.gz pytorch-test-src
-
-      - name: Upload prepared test source
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ inputs.prepared_test_src_artifact }}
-          path: pytorch-test-src.tar.gz
-          retention-days: 7
-
-      - name: Upload prepare logs
-        if: always()
-        uses: actions/upload-artifact@v4
-        with:
-          name: prepare-logs
-          path: /tmp/*.log
-          if-no-files-found: warn
-          retention-days: 30
\ No newline at end of file
diff --git a/.github/workflows/_torch-npu-upstream-test-custom.yml b/.github/workflows/_torch-npu-upstream-test-custom.yml
index f6bf949870..65a0fc347a 100644
--- a/.github/workflows/_torch-npu-upstream-test-custom.yml
+++ b/.github/workflows/_torch-npu-upstream-test-custom.yml
@@ -91,6 +91,12 @@ jobs:
 
           echo "Installed PyTorch and torch_npu from built wheels"
 
+      - name: Install test dependencies
+        run: |
+          pip${{ inputs.python_version }} install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
+          pip${{ inputs.python_version }} install pytest-rerunfailures pytest-flakefinder pytest-subtests==0.13.1 xdoctest==1.1.0 pulp>=2.9
+          pip${{ inputs.python_version }} install -r pytorch-src/requirements.txt || true
+
       - name: Verify NPU availability
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
@@ -105,12 +111,6 @@ jobs:
           print(f'NPU count: {torch.npu.device_count()}')
           PY
 
-      - name: Install test dependencies
-        run: |
-          pip${{ inputs.python_version }} install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
-          pip${{ inputs.python_version }} install pytest-rerunfailures pytest-flakefinder pytest-subtests==0.13.1 xdoctest==1.1.0 pulp>=2.9
-          pip${{ inputs.python_version }} install -r pytorch-src/requirements.txt || true
-
       - name: Run custom test files
         id: run_tests
         run: |
diff --git a/.github/workflows/_torch-npu-upstream-test-dist.yml b/.github/workflows/_torch-npu-upstream-test-dist.yml
index 1788dbe313..1d196b0410 100644
--- a/.github/workflows/_torch-npu-upstream-test-dist.yml
+++ b/.github/workflows/_torch-npu-upstream-test-dist.yml
@@ -112,6 +112,19 @@ jobs:
           echo "torch: ${TORCH_WHL}"
           echo "torch_npu: ${TORCH_NPU_WHL}"
 
+      - name: Install test dependencies
+        run: |
+          PIP=pip${{ inputs.python_version }}
+          cd pytorch-src
+
+          $PIP install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
+          $PIP install pytest-rerunfailures pytest-flakefinder
+          $PIP install 'pytest-subtests==0.13.1' 'xdoctest==1.1.0' 'pulp>=2.9'
+
+          if [ -f requirements.txt ]; then
+            $PIP install -r requirements.txt || true
+          fi
+
       - name: Verify NPU availability
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
@@ -129,19 +142,6 @@ jobs:
               print(f'NPU name: {torch.npu.get_device_name(0)}')
           "
 
-      - name: Install test dependencies
-        run: |
-          PIP=pip${{ inputs.python_version }}
-          cd pytorch-src
-
-          $PIP install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
-          $PIP install pytest-rerunfailures pytest-flakefinder
-          $PIP install 'pytest-subtests==0.13.1' 'xdoctest==1.1.0' 'pulp>=2.9'
-
-          if [ -f requirements.txt ]; then
-            $PIP install -r requirements.txt || true
-          fi
-
       - name: Download cases shard JSONs
         uses: actions/download-artifact@v4
         with:
diff --git a/.github/workflows/_torch-npu-upstream-test-regular.yml b/.github/workflows/_torch-npu-upstream-test-regular.yml
index dd1c054e35..d9613b2e17 100644
--- a/.github/workflows/_torch-npu-upstream-test-regular.yml
+++ b/.github/workflows/_torch-npu-upstream-test-regular.yml
@@ -112,6 +112,19 @@ jobs:
           echo "torch: ${TORCH_WHL}"
           echo "torch_npu: ${TORCH_NPU_WHL}"
 
+      - name: Install test dependencies
+        run: |
+          PIP=pip${{ inputs.python_version }}
+          cd pytorch-src
+
+          $PIP install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
+          $PIP install pytest-rerunfailures pytest-flakefinder
+          $PIP install 'pytest-subtests==0.13.1' 'xdoctest==1.1.0' 'pulp>=2.9'
+
+          if [ -f requirements.txt ]; then
+            $PIP install -r requirements.txt || true
+          fi
+
       - name: Verify NPU availability
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
@@ -129,19 +142,6 @@ jobs:
               print(f'NPU name: {torch.npu.get_device_name(0)}')
           "
 
-      - name: Install test dependencies
-        run: |
-          PIP=pip${{ inputs.python_version }}
-          cd pytorch-src
-
-          $PIP install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
-          $PIP install pytest-rerunfailures pytest-flakefinder
-          $PIP install 'pytest-subtests==0.13.1' 'xdoctest==1.1.0' 'pulp>=2.9'
-
-          if [ -f requirements.txt ]; then
-            $PIP install -r requirements.txt || true
-          fi
-
       - name: Download cases shard JSONs
         uses: actions/download-artifact@v4
         with:

From 346ac080e4abb255db07701d68335cbcc0667522 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Sat, 9 May 2026 22:56:57 +0800
Subject: [PATCH 111/132] Checkout from current repository in all test
 workflows

- Replace hardcoded kerer-ai/pytorch with github.repository
- Replace hardcoded ref with github.ref
- Ensures workflows work on any fork/PR target repository

Affected workflows:
- _torch-npu-upstream-collect.yml
- _torch-npu-upstream-test-dist.yml
- _torch-npu-upstream-test-regular.yml
- _torch-npu-upstream-test-custom.yml
- _torch-npu-upstream-report.yml

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .github/workflows/_torch-npu-upstream-collect.yml      | 4 ++--
 .github/workflows/_torch-npu-upstream-report.yml       | 4 ++--
 .github/workflows/_torch-npu-upstream-test-custom.yml  | 4 ++--
 .github/workflows/_torch-npu-upstream-test-dist.yml    | 4 ++--
 .github/workflows/_torch-npu-upstream-test-regular.yml | 4 ++--
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/_torch-npu-upstream-collect.yml b/.github/workflows/_torch-npu-upstream-collect.yml
index f5331f6abe..2895dd576b 100644
--- a/.github/workflows/_torch-npu-upstream-collect.yml
+++ b/.github/workflows/_torch-npu-upstream-collect.yml
@@ -68,8 +68,8 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v4
         with:
-          repository: kerer-ai/pytorch
-          ref: v2.7.1_dev
+          repository: ${{ github.repository }}
+          ref: ${{ github.ref }}
           fetch-depth: 1
           path: ascend_pytorch
 
diff --git a/.github/workflows/_torch-npu-upstream-report.yml b/.github/workflows/_torch-npu-upstream-report.yml
index db88c80775..d7675d3e15 100644
--- a/.github/workflows/_torch-npu-upstream-report.yml
+++ b/.github/workflows/_torch-npu-upstream-report.yml
@@ -34,8 +34,8 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v4
         with:
-          repository: kerer-ai/pytorch
-          ref: v2.7.1_dev
+          repository: ${{ github.repository }}
+          ref: ${{ github.ref }}
           fetch-depth: 1
 
       - name: Setup Python ${{ inputs.python_version }}
diff --git a/.github/workflows/_torch-npu-upstream-test-custom.yml b/.github/workflows/_torch-npu-upstream-test-custom.yml
index 65a0fc347a..55ab7e52fa 100644
--- a/.github/workflows/_torch-npu-upstream-test-custom.yml
+++ b/.github/workflows/_torch-npu-upstream-test-custom.yml
@@ -40,8 +40,8 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v4
         with:
-          repository: kerer-ai/pytorch
-          ref: v2.7.1_dev
+          repository: ${{ github.repository }}
+          ref: ${{ github.ref }}
           fetch-depth: 1
           path: ascend_pytorch
 
diff --git a/.github/workflows/_torch-npu-upstream-test-dist.yml b/.github/workflows/_torch-npu-upstream-test-dist.yml
index 1d196b0410..e14d7de9f4 100644
--- a/.github/workflows/_torch-npu-upstream-test-dist.yml
+++ b/.github/workflows/_torch-npu-upstream-test-dist.yml
@@ -49,8 +49,8 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v4
         with:
-          repository: kerer-ai/pytorch
-          ref: v2.7.1_dev
+          repository: ${{ github.repository }}
+          ref: ${{ github.ref }}
           fetch-depth: 1
           path: ascend_pytorch
 
diff --git a/.github/workflows/_torch-npu-upstream-test-regular.yml b/.github/workflows/_torch-npu-upstream-test-regular.yml
index d9613b2e17..040dcbea41 100644
--- a/.github/workflows/_torch-npu-upstream-test-regular.yml
+++ b/.github/workflows/_torch-npu-upstream-test-regular.yml
@@ -49,8 +49,8 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v4
         with:
-          repository: kerer-ai/pytorch
-          ref: v2.7.1_dev
+          repository: ${{ github.repository }}
+          ref: ${{ github.ref }}
           fetch-depth: 1
           path: ascend_pytorch
 

From 1783029ed12092cf096685580a70abcf12370950 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Sat, 9 May 2026 23:01:34 +0800
Subject: [PATCH 112/132] Checkout torch_npu from Ascend/pytorch upstream
 repository

- Use hardcoded Ascend/pytorch repository for torch_npu source
- Use inputs.torch_npu_ref to specify branch/tag/commit
- Ensures torch_npu is always built from upstream source
- Submodules downloaded recursively for compilation

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .github/workflows/_torch-npu-upstream-build.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/_torch-npu-upstream-build.yml b/.github/workflows/_torch-npu-upstream-build.yml
index be75ccb6e4..b13018151e 100644
--- a/.github/workflows/_torch-npu-upstream-build.yml
+++ b/.github/workflows/_torch-npu-upstream-build.yml
@@ -107,8 +107,8 @@ jobs:
       - name: Checkout torch_npu
         uses: actions/checkout@v4
         with:
-          repository: ${{ github.repository }}
-          ref: ${{ inputs.torch_npu_ref || github.ref }}
+          repository: Ascend/pytorch
+          ref: ${{ inputs.torch_npu_ref }}
           path: torch_npu-src
           submodules: recursive
 

From 018fa44571cc534ad48504bea444d829e0657ded Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Sat, 9 May 2026 23:11:18 +0800
Subject: [PATCH 113/132] =?UTF-8?q?=E8=B0=83=E6=95=B4=E4=BE=9D=E8=B5=96?=
 =?UTF-8?q?=E4=B8=8B=E8=BD=BD=E9=A1=BA=E5=BA=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../workflows/_torch-npu-upstream-build.yml   | 24 +------------------
 1 file changed, 1 insertion(+), 23 deletions(-)

diff --git a/.github/workflows/_torch-npu-upstream-build.yml b/.github/workflows/_torch-npu-upstream-build.yml
index b13018151e..fb266efb64 100644
--- a/.github/workflows/_torch-npu-upstream-build.yml
+++ b/.github/workflows/_torch-npu-upstream-build.yml
@@ -392,26 +392,4 @@ jobs:
         with:
           name: pytorch-src-main
           path: pytorch-src.tar.gz
-          retention-days: 7
-
-      # ==================== 长期制品保存 ====================
-      - name: Upload torch wheel (long-term)
-        uses: actions/upload-artifact@v4
-        with:
-          name: torch-wheel-${{ inputs.pytorch_ref }}
-          path: pytorch-src/dist/*.whl
-          retention-days: 30
-
-      - name: Upload torch_npu wheel (long-term)
-        uses: actions/upload-artifact@v4
-        with:
-          name: torch-npu-wheel-${{ inputs.pytorch_ref }}
-          path: torch_npu-src/dist/*.whl
-          retention-days: 30
-
-      - name: Upload PyTorch source (long-term)
-        uses: actions/upload-artifact@v4
-        with:
-          name: pytorch-src-${{ inputs.pytorch_ref }}
-          path: pytorch-src.tar.gz
-          retention-days: 30
\ No newline at end of file
+          retention-days: 7
\ No newline at end of file

From bf63cb43c741affab4dcff525801016a5db1bd75 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Sat, 9 May 2026 23:27:07 +0800
Subject: [PATCH 114/132] Add PyPI cache proxy configuration for faster pip
 downloads

- Add PYPI_CACHE_URL env variable in all test workflows
- Configure pip index-url before installing packages
- Use cache-service.nginx-pypi-cache for dependency downloads

Affected workflows:
- _torch-npu-upstream-collect.yml
- _torch-npu-upstream-test-dist.yml
- _torch-npu-upstream-test-regular.yml
- _torch-npu-upstream-test-custom.yml

(build.yml already has this configuration)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .github/workflows/_torch-npu-upstream-collect.yml     | 11 +++++++++++
 .github/workflows/_torch-npu-upstream-test-custom.yml | 11 +++++++++++
 .github/workflows/_torch-npu-upstream-test-dist.yml   | 11 +++++++++++
 .../workflows/_torch-npu-upstream-test-regular.yml    | 11 +++++++++++
 4 files changed, 44 insertions(+)

diff --git a/.github/workflows/_torch-npu-upstream-collect.yml b/.github/workflows/_torch-npu-upstream-collect.yml
index 2895dd576b..80ccd92be4 100644
--- a/.github/workflows/_torch-npu-upstream-collect.yml
+++ b/.github/workflows/_torch-npu-upstream-collect.yml
@@ -50,6 +50,10 @@ on:
         description: Total number of test cases
         value: ${{ jobs.collect.outputs.total_cases }}
 
+env:
+  # PyPI 缓存 URL（用于加速 pip 下载）
+  PYPI_CACHE_URL: 'http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple'
+
 jobs:
   collect:
     runs-on: linux-aarch64-a3-16
@@ -118,6 +122,13 @@ jobs:
           PYTHON=python${{ inputs.python_version }}
           export PIP_CACHE_DIR=/github/home/.cache/pip
 
+          # Configure pip to use PyPI cache for faster downloads
+          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
+            $PIP config set global.index-url ${{ env.PYPI_CACHE_URL }}
+            $PIP config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local"
+            echo "pip index-url configured: ${{ env.PYPI_CACHE_URL }}"
+          fi
+
           $PIP install --upgrade pip
 
           # Install built torch wheel
diff --git a/.github/workflows/_torch-npu-upstream-test-custom.yml b/.github/workflows/_torch-npu-upstream-test-custom.yml
index 55ab7e52fa..fb1c5bb77f 100644
--- a/.github/workflows/_torch-npu-upstream-test-custom.yml
+++ b/.github/workflows/_torch-npu-upstream-test-custom.yml
@@ -28,6 +28,10 @@ on:
         type: string
         description: Test files to run (comma-separated)
 
+env:
+  # PyPI 缓存 URL（用于加速 pip 下载）
+  PYPI_CACHE_URL: 'http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple'
+
 jobs:
   run_tests:
     runs-on: linux-aarch64-a3-16
@@ -79,6 +83,13 @@ jobs:
           PIP=pip${{ inputs.python_version }}
           export PIP_CACHE_DIR=/github/home/.cache/pip
 
+          # Configure pip to use PyPI cache for faster downloads
+          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
+            $PIP config set global.index-url ${{ env.PYPI_CACHE_URL }}
+            $PIP config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local"
+            echo "pip index-url configured: ${{ env.PYPI_CACHE_URL }}"
+          fi
+
           $PIP install --upgrade pip
 
           # Install built torch wheel
diff --git a/.github/workflows/_torch-npu-upstream-test-dist.yml b/.github/workflows/_torch-npu-upstream-test-dist.yml
index e14d7de9f4..aeb82fa39c 100644
--- a/.github/workflows/_torch-npu-upstream-test-dist.yml
+++ b/.github/workflows/_torch-npu-upstream-test-dist.yml
@@ -32,6 +32,10 @@ on:
         type: string
         description: Number of distributed shards
 
+env:
+  # PyPI 缓存 URL（用于加速 pip 下载）
+  PYPI_CACHE_URL: 'http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple'
+
 jobs:
   run_tests:
     runs-on: linux-aarch64-a3-16
@@ -98,6 +102,13 @@ jobs:
           PYTHON=python${{ inputs.python_version }}
           export PIP_CACHE_DIR=/github/home/.cache/pip
 
+          # Configure pip to use PyPI cache for faster downloads
+          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
+            $PIP config set global.index-url ${{ env.PYPI_CACHE_URL }}
+            $PIP config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local"
+            echo "pip index-url configured: ${{ env.PYPI_CACHE_URL }}"
+          fi
+
           $PIP install --upgrade pip
 
           # Install built torch wheel
diff --git a/.github/workflows/_torch-npu-upstream-test-regular.yml b/.github/workflows/_torch-npu-upstream-test-regular.yml
index 040dcbea41..282a1106a2 100644
--- a/.github/workflows/_torch-npu-upstream-test-regular.yml
+++ b/.github/workflows/_torch-npu-upstream-test-regular.yml
@@ -32,6 +32,10 @@ on:
         type: string
         description: Number of regular shards
 
+env:
+  # PyPI 缓存 URL（用于加速 pip 下载）
+  PYPI_CACHE_URL: 'http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple'
+
 jobs:
   run_tests:
     runs-on: linux-aarch64-a3-16
@@ -98,6 +102,13 @@ jobs:
           PYTHON=python${{ inputs.python_version }}
           export PIP_CACHE_DIR=/github/home/.cache/pip
 
+          # Configure pip to use PyPI cache for faster downloads
+          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
+            $PIP config set global.index-url ${{ env.PYPI_CACHE_URL }}
+            $PIP config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local"
+            echo "pip index-url configured: ${{ env.PYPI_CACHE_URL }}"
+          fi
+
           $PIP install --upgrade pip
 
           # Install built torch wheel

From 1225b7fca2db9082818677543086f767490c74f7 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Sun, 10 May 2026 08:55:40 +0800
Subject: [PATCH 115/132] =?UTF-8?q?=E5=88=A0=E9=99=A4=E9=BB=91=E7=99=BD?=
 =?UTF-8?q?=E5=90=8D=E5=8D=95=E9=99=90=E5=88=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/_torch-npu-upstream-collect.yml      | 1 -
 .github/workflows/_torch-npu-upstream-test-custom.yml  | 1 -
 .github/workflows/_torch-npu-upstream-test-dist.yml    | 1 -
 .github/workflows/_torch-npu-upstream-test-regular.yml | 1 -
 4 files changed, 4 deletions(-)

diff --git a/.github/workflows/_torch-npu-upstream-collect.yml b/.github/workflows/_torch-npu-upstream-collect.yml
index 80ccd92be4..1cb99730ac 100644
--- a/.github/workflows/_torch-npu-upstream-collect.yml
+++ b/.github/workflows/_torch-npu-upstream-collect.yml
@@ -190,7 +190,6 @@ jobs:
 
           $PYTHON ../ascend_pytorch/.github/scripts/collect_all_cases.py \
             --test-dir test \
-            --case-paths-config test_upstream/case_paths_ci.yml \
             --distributed-shards ${DISTRIBUTED_SHARDS} \
             --regular-shards ${REGULAR_SHARDS} \
             --output-dir cases_shards \
diff --git a/.github/workflows/_torch-npu-upstream-test-custom.yml b/.github/workflows/_torch-npu-upstream-test-custom.yml
index fb1c5bb77f..214aec75ae 100644
--- a/.github/workflows/_torch-npu-upstream-test-custom.yml
+++ b/.github/workflows/_torch-npu-upstream-test-custom.yml
@@ -135,7 +135,6 @@ jobs:
           python${{ inputs.python_version }} ascend_pytorch/.github/scripts/run_npu_test_shard.py \
             --test-files "${{ inputs.test_files }}" \
             --test-dir pytorch-src/test \
-            --disabled-testcases pytorch-src/test_upstream/disabled_testcases.json \
             --report-dir ${REPORT_DIR} \
             --timeout 1200 \
             --verbose \
diff --git a/.github/workflows/_torch-npu-upstream-test-dist.yml b/.github/workflows/_torch-npu-upstream-test-dist.yml
index aeb82fa39c..b21d82d6fd 100644
--- a/.github/workflows/_torch-npu-upstream-test-dist.yml
+++ b/.github/workflows/_torch-npu-upstream-test-dist.yml
@@ -184,7 +184,6 @@ jobs:
           $PYTHON ascend_pytorch/.github/scripts/run_npu_test_shard.py \
             --cases-json "${CASES_JSON}" \
             --test-dir pytorch-src/test \
-            --disabled-testcases pytorch-src/test_upstream/disabled_testcases.json \
             --report-dir ${REPORT_DIR} \
             --timeout 1200 \
             --verbose \
diff --git a/.github/workflows/_torch-npu-upstream-test-regular.yml b/.github/workflows/_torch-npu-upstream-test-regular.yml
index 282a1106a2..96056e6045 100644
--- a/.github/workflows/_torch-npu-upstream-test-regular.yml
+++ b/.github/workflows/_torch-npu-upstream-test-regular.yml
@@ -184,7 +184,6 @@ jobs:
           $PYTHON ascend_pytorch/.github/scripts/run_npu_test_shard.py \
             --cases-json "${CASES_JSON}" \
             --test-dir pytorch-src/test \
-            --disabled-testcases pytorch-src/test_upstream/disabled_testcases.json \
             --report-dir ${REPORT_DIR} \
             --timeout 1200 \
             --max-workers 64 \

From 5f11d065f81bf86ec9b8762104dccc6105f74a62 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Sun, 10 May 2026 10:24:08 +0800
Subject: [PATCH 116/132] =?UTF-8?q?=E4=BC=98=E5=8C=96=E6=B5=8B=E8=AF=95?=
 =?UTF-8?q?=E7=9F=A9=E9=98=B5=E4=BB=BB=E5=8A=A1=E5=90=8D=E7=A7=B0=E6=98=BE?=
 =?UTF-8?q?=E7=A4=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

为测试执行 workflow 的 job 添加 name 属性，使 GitHub Actions 界面显示更简洁直观：
- test_distributed (1/2) 替代 trigger_test/test_distributed/run_tests (1)
- test_regular (1/5) 替代 trigger_test/test_regular/run_tests (1)
- test_custom 替代 trigger_test/test_custom/run_tests

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/_torch-npu-upstream-test-custom.yml  | 1 +
 .github/workflows/_torch-npu-upstream-test-dist.yml    | 1 +
 .github/workflows/_torch-npu-upstream-test-regular.yml | 1 +
 3 files changed, 3 insertions(+)

diff --git a/.github/workflows/_torch-npu-upstream-test-custom.yml b/.github/workflows/_torch-npu-upstream-test-custom.yml
index 214aec75ae..b170932cfb 100644
--- a/.github/workflows/_torch-npu-upstream-test-custom.yml
+++ b/.github/workflows/_torch-npu-upstream-test-custom.yml
@@ -34,6 +34,7 @@ env:
 
 jobs:
   run_tests:
+    name: test_custom
     runs-on: linux-aarch64-a3-16
     timeout-minutes: 1200
     container:
diff --git a/.github/workflows/_torch-npu-upstream-test-dist.yml b/.github/workflows/_torch-npu-upstream-test-dist.yml
index b21d82d6fd..72dd9e1ca1 100644
--- a/.github/workflows/_torch-npu-upstream-test-dist.yml
+++ b/.github/workflows/_torch-npu-upstream-test-dist.yml
@@ -38,6 +38,7 @@ env:
 
 jobs:
   run_tests:
+    name: test_distributed (${{ matrix.shard }}/${{ inputs.distributed_shards }})
     runs-on: linux-aarch64-a3-16
     timeout-minutes: 1200
     container:
diff --git a/.github/workflows/_torch-npu-upstream-test-regular.yml b/.github/workflows/_torch-npu-upstream-test-regular.yml
index 96056e6045..d6b62801a3 100644
--- a/.github/workflows/_torch-npu-upstream-test-regular.yml
+++ b/.github/workflows/_torch-npu-upstream-test-regular.yml
@@ -38,6 +38,7 @@ env:
 
 jobs:
   run_tests:
+    name: test_regular (${{ matrix.shard }}/${{ inputs.regular_shards }})
     runs-on: linux-aarch64-a3-16
     timeout-minutes: 1200
     container:

From fe7eebb073702f5bd4173a936cc6a2133f71a2d1 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Sun, 10 May 2026 22:21:17 +0800
Subject: [PATCH 117/132] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=B6=85=E6=97=B6?=
 =?UTF-8?q?=E6=97=B6=E9=97=B4=E4=B8=BA60s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/_torch-npu-upstream-test-custom.yml  | 2 +-
 .github/workflows/_torch-npu-upstream-test-dist.yml    | 2 +-
 .github/workflows/_torch-npu-upstream-test-regular.yml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/_torch-npu-upstream-test-custom.yml b/.github/workflows/_torch-npu-upstream-test-custom.yml
index b170932cfb..27d1621936 100644
--- a/.github/workflows/_torch-npu-upstream-test-custom.yml
+++ b/.github/workflows/_torch-npu-upstream-test-custom.yml
@@ -137,7 +137,7 @@ jobs:
             --test-files "${{ inputs.test_files }}" \
             --test-dir pytorch-src/test \
             --report-dir ${REPORT_DIR} \
-            --timeout 1200 \
+            --timeout 60 \
             --verbose \
             2>&1 | tee /tmp/test_custom.log
 
diff --git a/.github/workflows/_torch-npu-upstream-test-dist.yml b/.github/workflows/_torch-npu-upstream-test-dist.yml
index 72dd9e1ca1..199128000c 100644
--- a/.github/workflows/_torch-npu-upstream-test-dist.yml
+++ b/.github/workflows/_torch-npu-upstream-test-dist.yml
@@ -186,7 +186,7 @@ jobs:
             --cases-json "${CASES_JSON}" \
             --test-dir pytorch-src/test \
             --report-dir ${REPORT_DIR} \
-            --timeout 1200 \
+            --timeout 60 \
             --verbose \
             2>&1 | tee /tmp/test_shard_dist_${{ matrix.shard }}.log
 
diff --git a/.github/workflows/_torch-npu-upstream-test-regular.yml b/.github/workflows/_torch-npu-upstream-test-regular.yml
index d6b62801a3..7e4dfe4ac6 100644
--- a/.github/workflows/_torch-npu-upstream-test-regular.yml
+++ b/.github/workflows/_torch-npu-upstream-test-regular.yml
@@ -186,7 +186,7 @@ jobs:
             --cases-json "${CASES_JSON}" \
             --test-dir pytorch-src/test \
             --report-dir ${REPORT_DIR} \
-            --timeout 1200 \
+            --timeout 60 \
             --max-workers 64 \
             --verbose \
             2>&1 | tee /tmp/test_shard_reg_${{ matrix.shard }}.log

From d7ed26ddb0683c92340ca42d923c87193df8e6ca Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Mon, 11 May 2026 11:00:19 +0800
Subject: [PATCH 118/132] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=8A=A5=E5=91=8Abug?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../scripts/generate_npu_full_test_report.py  | 256 ++++++++----------
 .../workflows/_torch-npu-upstream-build.yml   |  20 +-
 .../workflows/_torch-npu-upstream-report.yml  |   5 +
 .../workflows/_torch-npu-upstream-test.yml    |   1 +
 4 files changed, 142 insertions(+), 140 deletions(-)

diff --git a/.github/scripts/generate_npu_full_test_report.py b/.github/scripts/generate_npu_full_test_report.py
index 569edc481f..959f8adb3f 100644
--- a/.github/scripts/generate_npu_full_test_report.py
+++ b/.github/scripts/generate_npu_full_test_report.py
@@ -492,22 +492,6 @@ def get_int_value(payload: Dict, *keys: str) -> int:
     return 0
 
 
-def get_selected_test_entries(info: Dict) -> int:
-    return get_int_value(info, "selected_test_entries", "upstream_selected_tests")
-
-
-def get_selected_test_files(info: Dict) -> int:
-    return get_int_value(info, "selected_test_files", "upstream_selected_file_tests")
-
-
-def get_path_filtered_out_files(info: Dict) -> int:
-    return get_int_value(info, "path_filtered_out_files", "excluded_test_files")
-
-
-def get_unhandled_special_tests(info: Dict) -> int:
-    return get_int_value(info, "unhandled_special_tests", "upstream_unhandled_tests")
-
-
 def discover_shard_files(
     reports_root: Path,
 ) -> Tuple[
@@ -614,6 +598,62 @@ def parse_shard_filename(path: Path, suffix_pattern: str) -> Tuple[str, int]:
     return stats_files, info_files, plan_files, excluded_files, unhandled_files, xml_files, missing_files, cases_files
 
 
+def build_file_to_shards_map(cases_shards_dir: Path) -> Dict[str, List[str]]:
+    """
+    Build a mapping from test file path to shard IDs.
+
+    Scans all shard JSON files in cases_shards_dir and extracts file->shard mapping.
+
+    Args:
+        cases_shards_dir: Directory containing shard JSON files like
+                          distributed_cases_shard_1.json, regular_cases_shard_2.json
+
+    Returns:
+        Dict mapping file path (e.g., "test/test_ops.py") to list of shard IDs
+        (e.g., ["dist-1", "reg-2", "reg-3"])
+    """
+    file_to_shards = {}
+
+    if not cases_shards_dir or not cases_shards_dir.exists():
+        return file_to_shards
+
+    # Pattern: {test_type}_cases_shard_{num}.json
+    for shard_file in cases_shards_dir.glob("*_cases_shard_*.json"):
+        try:
+            data = load_json_file(shard_file)
+            test_type = data.get("test_type", "regular")
+            shard_num = data.get("shard", 0)
+
+            # Build shard ID: "dist-1" or "reg-2"
+            shard_prefix = "dist" if test_type == "distributed" else "reg"
+            shard_id = f"{shard_prefix}-{shard_num}"
+
+            # Extract file paths from cases
+            cases = data.get("cases", [])
+            for case in cases:
+                file_path = case.get("file", "")
+                if file_path:
+                    # Normalize file path (remove leading "test/" if present for consistency)
+                    normalized_file = file_path
+                    if normalized_file.startswith("test/"):
+                        normalized_file = normalized_file[5:]
+
+                    if normalized_file not in file_to_shards:
+                        file_to_shards[normalized_file] = []
+                    if shard_id not in file_to_shards[normalized_file]:
+                        file_to_shards[normalized_file].append(shard_id)
+        except Exception as e:
+            print(f"Warning: Failed to parse shard file {shard_file}: {e}")
+            continue
+
+    # Sort shard IDs for each file
+    for file_path in file_to_shards:
+        # Sort by type (dist first) then number
+        file_to_shards[file_path].sort(key=lambda x: (0 if x.startswith("dist") else 1, int(x.split("-")[1])))
+
+    return file_to_shards
+
+
 def get_shard_status(stats: Dict, present: bool) -> str:
     if not present:
         return "MISSING"
@@ -654,19 +694,6 @@ def format_duration(seconds: float) -> str:
     return f"{secs:.1f}s"
 
 
-def build_note(stats: Dict) -> str:
-    notes = []
-    if stats.get("crash_signal"):
-        notes.append(stats["crash_signal"])
-    if stats.get("timed_out"):
-        notes.append("overall timeout")
-    if stats.get("incomplete"):
-        notes.append("no junit xml")
-    if stats.get("error_message"):
-        notes.append(stats["error_message"])
-    return "; ".join(notes)
-
-
 def sanitize_markdown_cell(value: str) -> str:
     return value.replace("|", "\\|").replace("\n", "<br>")
 
@@ -813,32 +840,21 @@ def main():
     special_test_files = discover_special_test_files(special_reports_root)
     shard_ids = requested_shards or sorted(set(stats_files) | set(info_files) | set(cases_files))
 
+    # Build file to shards mapping from cases-shards directory
+    cases_shards_dir = Path(args.cases_summary).parent if args.cases_summary else None
+    file_to_shards_map = build_file_to_shards_map(cases_shards_dir)
+
     status_counts = Counter()
     totals = {
         "total": 0,
         "passed": 0,
         "failed": 0,
-        "skipped": 0,
         "errors": 0,
+        "skipped": 0,
+        "crashed": 0,
+        "timeout": 0,
         "duration": 0.0,
-        "discovered_test_files": 0,
-        "selected_test_entries": 0,
-        "selected_test_files": 0,
-        "path_filtered_out_files": 0,
-        "planned_files": 0,
-        "junit_generated_shards": 0,
-        "junit_xml_files": 0,
-        "zero_item_test_files": 0,
-        "startup_failures": 0,
-        "import_failures": 0,
-        "test_failures": 0,
         "missing_files": 0,
-        "total_cases": 0,
-        "case_passed": 0,
-        "case_failed": 0,
-        "case_errors": 0,
-        "case_crashed": 0,
-        "case_timeout": 0,
     }
     shard_rows = []
     unique_planned_files = set()
@@ -859,10 +875,6 @@ def main():
         cases_path = cases_files.get(shard_key)
         stats = load_json_file(stats_path) if stats_path else {}
         info = load_json_file(info_path) if info_path else {}
-        selected_test_entries = get_selected_test_entries(info)
-        selected_test_files = get_selected_test_files(info)
-        path_filtered_out_files = get_path_filtered_out_files(info)
-        unhandled_special_tests = get_unhandled_special_tests(info)
         planned_files = load_text_lines(plan_path) if plan_path else []
         excluded_test_files = load_text_lines(excluded_path) if excluded_path else []
         unhandled_tests = load_text_lines(unhandled_path) if unhandled_path else []
@@ -881,13 +893,15 @@ def main():
             stats["crashed"] = cases_data.get("crashed", 0)
             stats["timeout"] = cases_data.get("timeout", 0)
             stats["duration"] = cases_data.get("duration", 0.0)
-            # Update totals
-            totals["total_cases"] += cases_data.get("total_cases", 0)
-            totals["case_passed"] += cases_data.get("passed", 0)
-            totals["case_failed"] += cases_data.get("failed", 0)
-            totals["case_errors"] += cases_data.get("errors", 0)
-            totals["case_crashed"] += cases_data.get("crashed", 0)
-            totals["case_timeout"] += cases_data.get("timeout", 0)
+            # Update totals (正交累加: total = passed + failed + errors + skipped + crashed + timeout)
+            totals["total"] += cases_data.get("total_cases", 0)
+            totals["passed"] += cases_data.get("passed", 0)
+            totals["failed"] += cases_data.get("failed", 0)
+            totals["errors"] += cases_data.get("errors", 0)
+            totals["skipped"] += cases_data.get("skipped", 0)
+            totals["crashed"] += cases_data.get("crashed", 0)
+            totals["timeout"] += cases_data.get("timeout", 0)
+            totals["duration"] += cases_data.get("duration", 0.0)
 
         present = bool(stats_path or cases_path)
 
@@ -945,25 +959,6 @@ def main():
         status = get_shard_status(stats, present)
         status_counts[status] += 1
 
-        totals["total"] += int(stats.get("total", 0))
-        totals["passed"] += int(stats.get("passed", 0))
-        totals["failed"] += int(stats.get("failed", 0))
-        totals["skipped"] += int(stats.get("skipped", 0))
-        totals["errors"] += int(stats.get("errors", 0))
-        totals["duration"] += float(stats.get("duration", 0.0))
-        totals["discovered_test_files"] = max(
-            totals["discovered_test_files"], int(info.get("total_files", 0))
-        )
-        totals["selected_test_entries"] = max(totals["selected_test_entries"], selected_test_entries)
-        totals["selected_test_files"] = max(totals["selected_test_files"], selected_test_files)
-        totals["path_filtered_out_files"] = max(totals["path_filtered_out_files"], path_filtered_out_files)
-        totals["planned_files"] += int(info.get("shard_files", 0))
-        totals["junit_generated_shards"] += 1 if info.get("junit_generated") else 0
-        totals["junit_xml_files"] += int(info.get("junit_xml_files", 0) or stats.get("junit_xml_files", 0))
-        totals["zero_item_test_files"] += int(info.get("zero_item_test_files", 0) or stats.get("zero_item_test_files", 0))
-        totals["startup_failures"] += int(info.get("startup_failures", 0) or stats.get("startup_failures", 0))
-        totals["import_failures"] += int(info.get("import_failures", 0) or stats.get("import_failures", 0))
-        totals["test_failures"] += int(info.get("test_failures", 0) or stats.get("test_failures", 0))
         totals["missing_files"] += len(missing_files_list)
 
         # Convert shard_type to display prefix ("distributed" -> "dist", "regular" -> "reg")
@@ -982,23 +977,6 @@ def main():
                 "crashed": int(stats.get("crashed", 0)),
                 "timeout": int(stats.get("timeout", 0)),
                 "duration": float(stats.get("duration", 0.0)),
-                "planned_files": int(info.get("shard_files", 0)),
-                "discovered_test_files": int(info.get("total_files", 0)),
-                "selected_test_entries": selected_test_entries,
-                "selected_test_files": selected_test_files,
-                "unhandled_special_tests": unhandled_special_tests,
-                "planned_file_names": planned_files,
-                "path_filtered_out_files": path_filtered_out_files,
-                "disabled_matched": int(info.get("disabled_count_matched", 0)),
-                "disabled_deselected": int(info.get("disabled_count_deselected", 0)),
-                "junit_generated": bool(info.get("junit_generated", stats.get("junit_generated", False))),
-                "junit_xml_files": int(info.get("junit_xml_files", stats.get("junit_xml_files", 0))),
-                "zero_item_test_files": int(info.get("zero_item_test_files", stats.get("zero_item_test_files", 0))),
-                "startup_failures": int(info.get("startup_failures", stats.get("startup_failures", 0))),
-                "import_failures": int(info.get("import_failures", stats.get("import_failures", 0))),
-                "test_failures": int(info.get("test_failures", stats.get("test_failures", 0))),
-                "note": build_note(stats),
-                "testsuite_stats": testsuite_stats,  # Per-test-file statistics
             }
         )
 
@@ -1009,17 +987,11 @@ def main():
     unique_planned_count = len(unique_planned_files)
     excluded_test_files_list = sorted(unique_excluded_files)
     unhandled_tests_list = sorted(unique_unhandled_tests)
-    not_covered_by_requested_shards = max(
-        totals["selected_test_files"] - unique_planned_count,
-        0,
-    )
     selection_mode_display = ", ".join(sorted(selection_modes)) if selection_modes else "-"
-    include_selected_entries = totals["selected_test_entries"] > 0
     include_unhandled_tests = bool(unhandled_tests_list)
 
     # Show all shards in the detail table
     sorted_shards = sorted(shard_rows, key=lambda row: (row["shard_type"], row["shard_num"]))
-    slowest = sorted(shard_rows, key=lambda row: row["duration"], reverse=True)[:20]
     special_test_names = expected_special_tests or sorted(special_test_files)
     special_test_rows = []
     special_status_counts = Counter()
@@ -1060,11 +1032,16 @@ def main():
         )
     else:
         # Fallback to original selection mode display
-        selection_content = (
-            f"{selection_mode_display}; "
-            f"{totals['selected_test_files']} selected, "
-            f"{totals['path_filtered_out_files']} filtered out"
-        )
+        selection_content = selection_mode_display
+
+    # Extract planned cases count from cases_collection_summary.json
+    planned_total_cases = 0
+    planned_dist_cases = 0
+    planned_reg_cases = 0
+    if cases_summary_data:
+        planned_total_cases = cases_summary_data.get("total_cases", 0)
+        planned_dist_cases = cases_summary_data.get("distributed", {}).get("cases_summary", {}).get("total_cases", 0)
+        planned_reg_cases = cases_summary_data.get("regular", {}).get("cases_summary", {}).get("total_cases", 0)
 
     overview_rows = [
         ["Overall result", overall_status],
@@ -1076,33 +1053,26 @@ def main():
         ["Shards", f"{received_reports} / {expected_reports} reported"],
         ["Selection", selection_content],
         [
-            "Tests",
+            "实际执行用例",
             (
                 f"{totals['total']} total; {totals['passed']} passed; {totals['failed']} failed; "
-                f"{totals['errors']} errors; {totals['skipped']} skipped"
+                f"{totals['errors']} errors; {totals['skipped']} skipped; "
+                f"{totals['crashed']} crashed; {totals['timeout']} timeout"
             ),
         ],
-        ["Duration", format_duration(totals["duration"])],
     ]
+    # Add planned cases count row if available
+    if planned_total_cases > 0:
+        overview_rows.append([
+            "规划用例总数",
+            f"{planned_total_cases} (distributed: {planned_dist_cases}, regular: {planned_reg_cases})",
+        ])
+    overview_rows.append(["Duration", format_duration(totals["duration"])])
     if totals["missing_files"] > 0:
         overview_rows.append(["Missing files", f"{totals['missing_files']} crashed without report"])
     if include_special_tests:
         overview_rows.append(["Special tests expected", str(len(special_test_names))])
 
-    # Add case-level statistics if available (replace Tests row with case-level data)
-    if totals["total_cases"] > 0:
-        overview_rows.append([
-            "Case-level stats",
-            (
-                f"{totals['total_cases']} cases; "
-                f"{totals['case_passed']} passed; "
-                f"{totals['case_failed']} failed; "
-                f"{totals['case_errors']} errors; "
-                f"{totals['case_crashed']} crashed; "
-                f"{totals['case_timeout']} timeout"
-            ),
-        ])
-
     markdown_lines = [
         "# PyTorch NPU Full Test Summary",
         "",
@@ -1154,8 +1124,17 @@ def main():
             for fs in sorted_files:  # Show all files
                 failed_total = fs["failed"] + fs["errors"] + fs["crashed"] + fs["timeout"]
                 fail_rate = f"{(failed_total / fs['total'] * 100):.1f}%" if fs["total"] > 0 else "0%"
+                # Get shard info for this file
+                file_path = fs["file"]
+                # Normalize file path for lookup (remove leading "test/")
+                lookup_path = file_path
+                if lookup_path.startswith("test/"):
+                    lookup_path = lookup_path[5:]
+                shards_for_file = file_to_shards_map.get(lookup_path, [])
+                shard_info = ", ".join(shards_for_file) if shards_for_file else "-"
                 file_rows.append([
                     sanitize_markdown_cell(fs["file"]),
+                    shard_info,
                     str(fs["total"]),
                     str(fs["passed"]),
                     str(fs["failed"]),
@@ -1167,7 +1146,7 @@ def main():
 
             markdown_lines.extend(
                 render_table(
-                    ["测试文件", "总用例", "通过", "失败", "错误", "崩溃", "超时", "失败率"],
+                    ["测试文件", "分片", "总用例", "通过", "失败", "错误", "崩溃", "超时", "失败率"],
                     file_rows,
                 )
             )
@@ -1206,26 +1185,23 @@ def main():
         "status_counts": dict(status_counts),
         "totals": totals,
         "file_discovery_stats": file_discovery_stats,
+        "planned_cases": {
+            "total": planned_total_cases,
+            "distributed": planned_dist_cases,
+            "regular": planned_reg_cases,
+        },
         "execution_scope": {
             "selection_mode": sorted(selection_modes),
-            "selected_test_entries": totals["selected_test_entries"],
-            "selected_test_files": totals["selected_test_files"],
-            "path_filtered_out_files": totals["path_filtered_out_files"],
             "unique_planned_test_files": unique_planned_count,
-            "files_not_covered_by_requested_shards": not_covered_by_requested_shards,
             "excluded_test_files": excluded_test_files_list,
             "unhandled_special_tests": unhandled_tests_list,
             "missing_files": sorted(unique_missing_files),
         },
         "failure_breakdown": {
-            "startup_failures": totals["startup_failures"],
-            "import_failures": totals["import_failures"],
-            "test_failures": totals["test_failures"],
             "missing_files": totals["missing_files"],
         },
         "shards": shard_rows,
         "failed_shards": [row for row in shard_rows if row["status"] not in ("PASSED", "NO TESTS")],
-        "slowest_shards": slowest,
     }
 
     # Add full cases summary if available
@@ -1235,12 +1211,6 @@ def main():
     # Add case-level results if available
     if cases_results:
         report_json["cases_results"] = {
-            "total_cases": totals["total_cases"],
-            "passed": totals["case_passed"],
-            "failed": totals["case_failed"],
-            "errors": totals["case_errors"],
-            "crashed": totals["case_crashed"],
-            "timeout": totals["case_timeout"],
             "shards": {
                 f"{shard_type}-{shard_num}": data
                 for (shard_type, shard_num), data in cases_results.items()
@@ -1249,8 +1219,18 @@ def main():
 
         # Add file-level aggregation
         file_stats = parse_test_results.aggregate_all_cases_by_file(cases_results)
+        # Add shard info to file stats
+        file_stats_with_shards = {}
+        for file_path, stats in file_stats.items():
+            # Normalize file path for lookup
+            lookup_path = file_path
+            if lookup_path.startswith("test/"):
+                lookup_path = lookup_path[5:]
+            shards_for_file = file_to_shards_map.get(lookup_path, [])
+            stats["shards"] = shards_for_file
+            file_stats_with_shards[file_path] = stats
         report_json["file_level_stats"] = dict(sorted(
-            file_stats.items(),
+            file_stats_with_shards.items(),
             key=lambda x: (-x[1]["total"], x[0])
         ))
 
diff --git a/.github/workflows/_torch-npu-upstream-build.yml b/.github/workflows/_torch-npu-upstream-build.yml
index fb266efb64..efd28f93be 100644
--- a/.github/workflows/_torch-npu-upstream-build.yml
+++ b/.github/workflows/_torch-npu-upstream-build.yml
@@ -33,6 +33,9 @@ on:
       pytorch-src:
         description: 'PyTorch source and test code artifact name'
         value: 'pytorch-src-main'
+      pytorch-version:
+        description: 'PyTorch version string'
+        value: ${{ jobs.build.outputs.pytorch-version }}
 
 env:
   # 缓存版本号，当需要强制刷新缓存时修改此值
@@ -46,6 +49,8 @@ jobs:
   build:
     runs-on: linux-aarch64-a3-16
     timeout-minutes: 240
+    outputs:
+      pytorch-version: ${{ steps.get_version.outputs.pytorch_version }}
 
     container:
       image: ${{ inputs.docker_image }}
@@ -279,8 +284,19 @@ jobs:
           echo "=== Verifying PyTorch installation ==="
           python${{ inputs.python_version }} -c "import torch; print(f'torch version: {torch.__version__}')"
 
-          echo ""
-          echo "=== Installing torch_npu build dependencies ==="
+      - name: Get PyTorch version
+        id: get_version
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          PYTORCH_VERSION=$(python${{ inputs.python_version }} -c "import torch; print(torch.__version__)")
+          echo "pytorch_version=${PYTORCH_VERSION}" >> $GITHUB_OUTPUT
+          echo "PyTorch version: ${PYTORCH_VERSION}"
+
+      - name: Install torch_npu build dependencies
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
           pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
           pip${{ inputs.python_version }} install cmake ninja numpy packaging pyyaml requests six typing-extensions
 
diff --git a/.github/workflows/_torch-npu-upstream-report.yml b/.github/workflows/_torch-npu-upstream-report.yml
index d7675d3e15..d75248e5dc 100644
--- a/.github/workflows/_torch-npu-upstream-report.yml
+++ b/.github/workflows/_torch-npu-upstream-report.yml
@@ -7,6 +7,10 @@ on:
         required: true
         type: string
         description: Python version to use
+      pytorch_version:
+        required: true
+        type: string
+        description: PyTorch version string
       torch_npu_wheel_name:
         required: false
         type: string
@@ -87,6 +91,7 @@ jobs:
             --reports-root all-test-reports \
             --output-markdown ${REPORT_MD} \
             --output-json ${REPORT_JSON} \
+            --pytorch-version "${{ inputs.pytorch_version }}" \
             --torch-npu-whl "${{ inputs.torch_npu_wheel_name }}" \
             --shard-matrix-json "${COMBINED_MATRIX}" \
             --docker-image "${{ inputs.docker_image }}" \
diff --git a/.github/workflows/_torch-npu-upstream-test.yml b/.github/workflows/_torch-npu-upstream-test.yml
index 678e80f9ce..3b7ac687ce 100644
--- a/.github/workflows/_torch-npu-upstream-test.yml
+++ b/.github/workflows/_torch-npu-upstream-test.yml
@@ -137,6 +137,7 @@ jobs:
     uses: ./.github/workflows/_torch-npu-upstream-report.yml
     with:
       python_version: ${{ inputs.python_version }}
+      pytorch_version: ${{ needs.build.outputs.pytorch-version }}
       torch_npu_wheel_name: ${{ needs.build.outputs.torch-npu-wheel }}
       docker_image: ${{ inputs.docker_image }}
       distributed_matrix: ${{ needs.collect_cases.outputs.distributed_matrix || '[]' }}

From 0c902d627b6df61d153fd22cbfd4f22d5b6122ed Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Mon, 11 May 2026 11:01:59 +0800
Subject: [PATCH 119/132] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=8A=A5=E5=91=8Abug?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/torch-npu-upstream-test-trigger.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/torch-npu-upstream-test-trigger.yml b/.github/workflows/torch-npu-upstream-test-trigger.yml
index 4bd8d793a4..fa3d8f1ca5 100644
--- a/.github/workflows/torch-npu-upstream-test-trigger.yml
+++ b/.github/workflows/torch-npu-upstream-test-trigger.yml
@@ -44,7 +44,7 @@ on:
       distributed_shards:
         description: 'Number of shards for distributed tests (default 2)'
         required: false
-        default: '2'
+        default: '3'
         type: string
       regular_shards:
         description: 'Number of shards for regular tests (default 5)'

From f82e3bb61b452eb491ee3ebcdfe5e32e31892f2c Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Mon, 11 May 2026 12:29:30 +0800
Subject: [PATCH 120/132] =?UTF-8?q?=E7=B2=BE=E7=AE=80=E6=8A=A5=E5=91=8Ajso?=
 =?UTF-8?q?n?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/scripts/collect_all_cases.py          |  12 +-
 .../scripts/generate_npu_full_test_report.py  | 210 ++----------------
 2 files changed, 24 insertions(+), 198 deletions(-)

diff --git a/.github/scripts/collect_all_cases.py b/.github/scripts/collect_all_cases.py
index 9e15b047ee..33375ace9e 100644
--- a/.github/scripts/collect_all_cases.py
+++ b/.github/scripts/collect_all_cases.py
@@ -388,6 +388,14 @@ def main():
     # ========================================
     # Step 3: Save overall summary
     # ========================================
+    # Calculate file counts (distributed + regular = total_files, no overlap)
+    dist_total = dist_meta.get("total_files", 0)
+    dist_selected = dist_meta.get("type_selected", 0)
+    reg_total = reg_meta.get("total_files", 0)
+    reg_selected = reg_meta.get("type_selected", 0)
+    # total_files is same for both (all test_*.py files), use one value
+    total_files = dist_total
+
     overall_summary = {
         "distributed": {
             "cases_summary": dist_summary,
@@ -398,7 +406,9 @@ def main():
             "discovery_metadata": reg_meta,
         },
         "total_cases": len(dist_cases) + len(reg_cases),
-        "total_files_scanned": dist_meta.get("total_files", 0) + reg_meta.get("total_files", 0),
+        "total_files_scanned": total_files,
+        "distributed_files": dist_selected,
+        "regular_files": reg_selected,
     }
     summary_file = output_dir / "cases_collection_summary.json"
     summary_file.write_text(json.dumps(overall_summary, indent=2), encoding="utf-8")
diff --git a/.github/scripts/generate_npu_full_test_report.py b/.github/scripts/generate_npu_full_test_report.py
index 959f8adb3f..dcc46b8599 100644
--- a/.github/scripts/generate_npu_full_test_report.py
+++ b/.github/scripts/generate_npu_full_test_report.py
@@ -151,7 +151,6 @@ def aggregate_testsuite_stats_for_shard(
     shard_type: str,
     shard: int,
     planned_files: List[str],
-    missing_files_list: List[str] = None
 ) -> List[Dict]:
     """
     Aggregate all testsuite statistics for a specific shard.
@@ -164,15 +163,10 @@ def aggregate_testsuite_stats_for_shard(
         shard_type: Shard type ("distributed" or "regular")
         shard: Shard number to aggregate for
         planned_files: List of test file paths planned for this shard
-        missing_files_list: List of test file paths that crashed and didn't generate XML
 
     Returns:
         List of testsuite statistics for tests belonging to this shard.
-        Missing files are included with status="MISSING" and tests=0.
     """
-    if missing_files_list is None:
-        missing_files_list = []
-
     all_testsuites = {}
     # Map from test identifier -> aggregated stats
 
@@ -275,22 +269,6 @@ def aggregate_testsuite_stats_for_shard(
                     aggregated["skipped"] += ts.get("skipped", 0)
                     aggregated["time"] += ts.get("time", 0.0)
 
-    # Add missing files (crashed without generating XML) to the result
-    # These files show as "MISSING" in the test file details
-    for missing_file in missing_files_list:
-        missing_identifier = extract_test_identifier(missing_file)
-        if missing_identifier and missing_identifier not in all_testsuites:
-            all_testsuites[missing_identifier] = {
-                "name": missing_identifier,
-                "tests": 0,
-                "passed": 0,
-                "failures": 0,
-                "errors": 0,
-                "skipped": 0,
-                "time": 0.0,
-                "status": "MISSING",  # Special status for crashed files
-            }
-
     # Convert to list and sort by name
     result = list(all_testsuites.values())
     result.sort(key=lambda x: x["name"])
@@ -497,11 +475,7 @@ def discover_shard_files(
 ) -> Tuple[
     Dict[Tuple[str, int], Path],  # stats_files
     Dict[Tuple[str, int], Path],  # info_files
-    Dict[Tuple[str, int], Path],  # plan_files
-    Dict[Tuple[str, int], Path],  # excluded_files
-    Dict[Tuple[str, int], Path],  # unhandled_files
     Dict[Tuple[str, int], Path],  # xml_files
-    Dict[Tuple[str, int], Path],  # missing_files
     Dict[Tuple[str, int], Path],  # cases_files
 ]:
     """
@@ -517,11 +491,7 @@ def discover_shard_files(
     """
     stats_files = {}
     info_files = {}
-    plan_files = {}
-    excluded_files = {}
-    unhandled_files = {}
     xml_files = {}
-    missing_files = {}
     cases_files = {}
 
     def parse_shard_filename(path: Path, suffix_pattern: str) -> Tuple[str, int]:
@@ -530,7 +500,6 @@ def parse_shard_filename(path: Path, suffix_pattern: str) -> Tuple[str, int]:
 
         Filename format: shard_{type}-{number}_{suffix}
         e.g., shard_dist-1_stats.json -> ("distributed", 1)
-        e.g., shard_reg-2_planned_test_files.txt -> ("regular", 2)
         """
         stem = path.stem  # filename without extension
         # Match pattern: shard_{type}-{number}_{suffix}
@@ -554,21 +523,6 @@ def parse_shard_filename(path: Path, suffix_pattern: str) -> Tuple[str, int]:
         if key:
             info_files[key] = path
 
-    for path in reports_root.rglob("shard_*_planned_test_files.txt"):
-        key = parse_shard_filename(path, "planned_test_files")
-        if key:
-            plan_files[key] = path
-
-    for path in reports_root.rglob("shard_*_excluded_test_files.txt"):
-        key = parse_shard_filename(path, "excluded_test_files")
-        if key:
-            excluded_files[key] = path
-
-    for path in reports_root.rglob("shard_*_unhandled_upstream_tests.txt"):
-        key = parse_shard_filename(path, "unhandled_upstream_tests")
-        if key:
-            unhandled_files[key] = path
-
     # Discover XML files for per-test-file statistics
     for path in reports_root.rglob("shard_*_pytest*.xml"):
         # XML filename: shard_{type}-{number}_pytest{suffix}.xml
@@ -583,19 +537,13 @@ def parse_shard_filename(path: Path, suffix_pattern: str) -> Tuple[str, int]:
                 key = ("regular", shard_num)
             xml_files[key] = path
 
-    # Discover missing files list (files that crashed and didn't generate XML)
-    for path in reports_root.rglob("shard_*_missing_files.txt"):
-        key = parse_shard_filename(path, "missing_files")
-        if key:
-            missing_files[key] = path
-
     # Discover case-level results files
     for path in reports_root.rglob("shard_*_cases.json"):
         key = parse_shard_filename(path, "cases")
         if key:
             cases_files[key] = path
 
-    return stats_files, info_files, plan_files, excluded_files, unhandled_files, xml_files, missing_files, cases_files
+    return stats_files, info_files, xml_files, cases_files
 
 
 def build_file_to_shards_map(cases_shards_dir: Path) -> Dict[str, List[str]]:
@@ -698,86 +646,6 @@ def sanitize_markdown_cell(value: str) -> str:
     return value.replace("|", "\\|").replace("\n", "<br>")
 
 
-def format_planned_files_cell(planned_files: List[str]) -> str:
-    if not planned_files:
-        return "-"
-    return "<br>".join(sanitize_markdown_cell(path) for path in planned_files)
-
-
-def format_testsuite_detail(stats: Dict) -> str:
-    """
-    Format a single testsuite's stats for display.
-
-    Format: "test_file.py: 5 passed, 2 failed, 1 error, 0 skipped, 3.2s"
-    Or for missing files: "test_file.py: MISSING (crashed, no report)"
-    """
-    name = sanitize_markdown_cell(stats.get("name", "unknown"))
-
-    # Check for MISSING status (file crashed without generating report)
-    if stats.get("status") == "MISSING":
-        return f"{name}: MISSING (crashed, no report)"
-    passed = stats.get("passed", 0)
-    failures = stats.get("failures", 0)
-    errors = stats.get("errors", 0)
-    skipped = stats.get("skipped", 0)
-    time = stats.get("time", 0.0)
-
-    # Build stats parts (comma-separated)
-    stats_parts = []
-    if passed > 0:
-        stats_parts.append(f"{passed} passed")
-    if failures > 0:
-        stats_parts.append(f"{failures} failed")
-    if errors > 0:
-        stats_parts.append(f"{errors} error")
-    if skipped > 0:
-        stats_parts.append(f"{skipped} skipped")
-    stats_parts.append(format_duration_short(time))
-
-    # Format: "name: stats1, stats2, ..."
-    stats_str = ", ".join(stats_parts)
-    return f"{name}: {stats_str}"
-
-
-def format_duration_short(seconds: float) -> str:
-    """Format duration in a compact form for testsuite display."""
-    seconds = float(seconds)
-    if seconds >= 60:
-        minutes = int(seconds // 60)
-        secs = seconds % 60
-        return f"{minutes}m{secs:.0f}s"
-    return f"{seconds:.1f}s"
-
-
-def format_testsuite_details_cell(testsuites: List[Dict]) -> str:
-    """
-    Format all testsuite stats for a shard into a single cell.
-
-    Each testsuite is displayed on a separate line with its stats.
-    """
-    if not testsuites:
-        return "-"
-
-    lines = []
-    for ts in testsuites:
-        lines.append(format_testsuite_detail(ts))
-
-    return "<br>".join(lines)
-
-
-def format_summary_note(note: str) -> str:
-    cleaned = (note or "").strip()
-    if not cleaned or cleaned == "pytest exited with code 1":
-        return "-"
-    return sanitize_markdown_cell(cleaned)
-
-
-def format_scope_list(items: List[str]) -> List[str]:
-    if not items:
-        return ["- None"]
-    return [f"- {sanitize_markdown_cell(item)}" for item in items]
-
-
 def render_table(headers: List[str], rows: List[List[str]]) -> List[str]:
     lines = [
         "| " + " | ".join(headers) + " |",
@@ -817,26 +685,20 @@ def main():
     cases_summary_data = None
     file_discovery_stats = {
         "total_files_scanned": 0,
-        "distributed_files_before_filter": 0,
-        "distributed_files_after_filter": 0,
-        "regular_files_before_filter": 0,
-        "regular_files_after_filter": 0,
+        "distributed_files": 0,
+        "regular_files": 0,
     }
     if args.cases_summary:
         cases_summary_path = Path(args.cases_summary)
         if cases_summary_path.exists():
             cases_summary_data = load_json_file(cases_summary_path)
-            # Extract file discovery stats from metadata
+            # Extract file discovery stats (正交: total = distributed + regular)
             if cases_summary_data:
                 file_discovery_stats["total_files_scanned"] = cases_summary_data.get("total_files_scanned", 0)
-                dist_meta = cases_summary_data.get("distributed", {}).get("discovery_metadata", {})
-                reg_meta = cases_summary_data.get("regular", {}).get("discovery_metadata", {})
-                file_discovery_stats["distributed_files_before_filter"] = dist_meta.get("type_selected", 0)
-                file_discovery_stats["distributed_files_after_filter"] = dist_meta.get("rules_selected", 0)
-                file_discovery_stats["regular_files_before_filter"] = reg_meta.get("type_selected", 0)
-                file_discovery_stats["regular_files_after_filter"] = reg_meta.get("rules_selected", 0)
-
-    stats_files, info_files, plan_files, excluded_files, unhandled_files, xml_files, missing_files_paths, cases_files = discover_shard_files(reports_root)
+                file_discovery_stats["distributed_files"] = cases_summary_data.get("distributed_files", 0)
+                file_discovery_stats["regular_files"] = cases_summary_data.get("regular_files", 0)
+
+    stats_files, info_files, xml_files, cases_files = discover_shard_files(reports_root)
     special_test_files = discover_special_test_files(special_reports_root)
     shard_ids = requested_shards or sorted(set(stats_files) | set(info_files) | set(cases_files))
 
@@ -854,13 +716,8 @@ def main():
         "crashed": 0,
         "timeout": 0,
         "duration": 0.0,
-        "missing_files": 0,
     }
     shard_rows = []
-    unique_planned_files = set()
-    unique_excluded_files = set()
-    unique_unhandled_tests = set()
-    unique_missing_files = set()
     selection_modes = set()
     cases_results = {}  # Store case-level results for each shard
 
@@ -868,17 +725,9 @@ def main():
         shard_key = (shard_type, shard_num)
         stats_path = stats_files.get(shard_key)
         info_path = info_files.get(shard_key)
-        plan_path = plan_files.get(shard_key)
-        excluded_path = excluded_files.get(shard_key)
-        unhandled_path = unhandled_files.get(shard_key)
-        missing_path = missing_files_paths.get(shard_key)
         cases_path = cases_files.get(shard_key)
         stats = load_json_file(stats_path) if stats_path else {}
         info = load_json_file(info_path) if info_path else {}
-        planned_files = load_text_lines(plan_path) if plan_path else []
-        excluded_test_files = load_text_lines(excluded_path) if excluded_path else []
-        unhandled_tests = load_text_lines(unhandled_path) if unhandled_path else []
-        missing_files_list = load_text_lines(missing_path) if missing_path else []
 
         # Load case-level results if available
         cases_data = load_json_file(cases_path) if cases_path else {}
@@ -906,10 +755,7 @@ def main():
         present = bool(stats_path or cases_path)
 
         # Parse ALL XML files to get per-test-file statistics
-        # This includes Phase 1 (run_test.py) and Phase 2 (pytest fallback) results
-        # Filter by planned test files to ensure we only include tests for this shard
-        # Include missing files that crashed without generating reports
-        testsuite_stats = aggregate_testsuite_stats_for_shard(reports_root, shard_type, shard_num, planned_files, missing_files_list)
+        testsuite_stats = aggregate_testsuite_stats_for_shard(reports_root, shard_type, shard_num, [])
 
         # If testsuite_stats has entries, aggregate their totals and override incomplete status
         has_phase1_xmls = len(testsuite_stats) > 0
@@ -949,18 +795,12 @@ def main():
                     if not present:
                         present = True
 
-        unique_planned_files.update(planned_files)
-        unique_excluded_files.update(excluded_test_files)
-        unique_unhandled_tests.update(unhandled_tests)
-        unique_missing_files.update(missing_files_list)
         if info.get("selection_mode"):
             selection_modes.add(str(info.get("selection_mode")))
 
         status = get_shard_status(stats, present)
         status_counts[status] += 1
 
-        totals["missing_files"] += len(missing_files_list)
-
         # Convert shard_type to display prefix ("distributed" -> "dist", "regular" -> "reg")
         shard_prefix = "dist" if shard_type == "distributed" else "reg"
         shard_rows.append(
@@ -984,11 +824,7 @@ def main():
     whl_name = Path(args.torch_npu_whl).name
     received_reports = len(stats_files)
     expected_reports = len(shard_ids)
-    unique_planned_count = len(unique_planned_files)
-    excluded_test_files_list = sorted(unique_excluded_files)
-    unhandled_tests_list = sorted(unique_unhandled_tests)
     selection_mode_display = ", ".join(sorted(selection_modes)) if selection_modes else "-"
-    include_unhandled_tests = bool(unhandled_tests_list)
 
     # Show all shards in the detail table
     sorted_shards = sorted(shard_rows, key=lambda row: (row["shard_type"], row["shard_num"]))
@@ -1020,15 +856,11 @@ def main():
     if cases_summary_data:
         # Use file discovery stats from cases_collection_summary.json
         total_scanned = file_discovery_stats["total_files_scanned"]
-        dist_before = file_discovery_stats["distributed_files_before_filter"]
-        dist_after = file_discovery_stats["distributed_files_after_filter"]
-        reg_before = file_discovery_stats["regular_files_before_filter"]
-        reg_after = file_discovery_stats["regular_files_after_filter"]
-        total_after_filter = dist_after + reg_after
+        dist_files = file_discovery_stats["distributed_files"]
+        reg_files = file_discovery_stats["regular_files"]
         selection_content = (
-            f"扫描发现 {total_scanned} 个测试文件; "
-            f"黑白名单过滤后 {total_after_filter} 个文件 "
-            f"(distributed: {dist_before} -> {dist_after}, regular: {reg_before} -> {reg_after})"
+            f"扫描发现 {total_scanned} 个测试文件 "
+            f"(distributed: {dist_files}, regular: {reg_files})"
         )
     else:
         # Fallback to original selection mode display
@@ -1068,8 +900,6 @@ def main():
             f"{planned_total_cases} (distributed: {planned_dist_cases}, regular: {planned_reg_cases})",
         ])
     overview_rows.append(["Duration", format_duration(totals["duration"])])
-    if totals["missing_files"] > 0:
-        overview_rows.append(["Missing files", f"{totals['missing_files']} crashed without report"])
     if include_special_tests:
         overview_rows.append(["Special tests expected", str(len(special_test_names))])
 
@@ -1151,9 +981,6 @@ def main():
                 )
             )
 
-    if include_unhandled_tests:
-        markdown_lines.extend(["", "## Unhandled Special Tests"])
-        markdown_lines.extend(format_scope_list(unhandled_tests_list))
     if include_special_tests:
         markdown_lines.extend(["", "## Special Test Results"])
         markdown_lines.extend(
@@ -1190,18 +1017,7 @@ def main():
             "distributed": planned_dist_cases,
             "regular": planned_reg_cases,
         },
-        "execution_scope": {
-            "selection_mode": sorted(selection_modes),
-            "unique_planned_test_files": unique_planned_count,
-            "excluded_test_files": excluded_test_files_list,
-            "unhandled_special_tests": unhandled_tests_list,
-            "missing_files": sorted(unique_missing_files),
-        },
-        "failure_breakdown": {
-            "missing_files": totals["missing_files"],
-        },
         "shards": shard_rows,
-        "failed_shards": [row for row in shard_rows if row["status"] not in ("PASSED", "NO TESTS")],
     }
 
     # Add full cases summary if available

From 48bbe1855f6610d3301365a0745783440f20dd1f Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Mon, 11 May 2026 13:08:36 +0800
Subject: [PATCH 121/132] =?UTF-8?q?=E5=88=A0=E9=99=A4=E5=A4=84=E7=90=86?=
 =?UTF-8?q?=E6=B5=8B=E8=AF=95xml=E9=80=BB=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../scripts/generate_npu_full_test_report.py  | 396 +-----------------
 1 file changed, 2 insertions(+), 394 deletions(-)

diff --git a/.github/scripts/generate_npu_full_test_report.py b/.github/scripts/generate_npu_full_test_report.py
index dcc46b8599..d7cd31ed5e 100644
--- a/.github/scripts/generate_npu_full_test_report.py
+++ b/.github/scripts/generate_npu_full_test_report.py
@@ -6,7 +6,6 @@
 import argparse
 import json
 import re
-import xml.etree.ElementTree as ET
 from collections import Counter
 from pathlib import Path
 from typing import Dict, List, Tuple, Optional
@@ -57,340 +56,6 @@ def load_json_file(path: Path) -> Dict:
         return {}
 
 
-def parse_junit_xml_testsuites(xml_path: Path) -> List[Dict]:
-    """
-    Parse JUnit XML file and extract per-testsuite statistics.
-
-    Each testsuite represents a test file with its own stats:
-    - name: test file name
-    - tests: total test cases
-    - failures: failed test cases
-    - errors: error test cases
-    - skipped: skipped test cases
-    - time: execution time in seconds
-
-    Returns a list of testsuite statistics.
-    """
-    testsuites = []
-
-    if not xml_path.exists():
-        return testsuites
-
-    try:
-        tree = ET.parse(xml_path)
-        root = tree.getroot()
-
-        # Handle both <testsuites> and <testsuite> as root
-        if root.tag == "testsuites":
-            for testsuite in root.findall("testsuite"):
-                stats = parse_testsuite_element(testsuite)
-                if stats:
-                    testsuites.append(stats)
-        elif root.tag == "testsuite":
-            stats = parse_testsuite_element(root)
-            if stats:
-                testsuites.append(stats)
-
-    except ET.ParseError as e:
-        print(f"Warning: Failed to parse XML {xml_path}: {e}")
-    except Exception as e:
-        print(f"Warning: Error reading XML {xml_path}: {e}")
-
-    return testsuites
-
-
-def parse_testsuite_element(testsuite: ET.Element) -> Optional[Dict]:
-    """Parse a single testsuite element and return its statistics."""
-    try:
-        name = testsuite.get("name", "unknown")
-        tests = int(testsuite.get("tests", 0))
-        failures = int(testsuite.get("failures", 0))
-        errors = int(testsuite.get("errors", 0))
-        skipped = int(testsuite.get("skipped", 0))
-        time = float(testsuite.get("time", 0.0))
-        passed = tests - failures - errors - skipped
-
-        return {
-            "name": name,
-            "tests": tests,
-            "passed": passed,
-            "failures": failures,
-            "errors": errors,
-            "skipped": skipped,
-            "time": time,
-        }
-    except (ValueError, TypeError):
-        return None
-
-
-def extract_test_identifier(test_path: str) -> str:
-    """
-    Extract a test identifier from a test file path.
-
-    Converts paths like:
-    - "test/distributed/_composable/fsdp/test_fully_shard_autograd.py"
-    To:
-    - "distributed._composable.fsdp.test_fully_shard_autograd"
-
-    This matches the testsuite naming convention used by pytest/run_test.py.
-    """
-    # Remove 'test/' prefix if present
-    path = test_path
-    if path.startswith("test/"):
-        path = path[5:]
-    # Remove '.py' suffix
-    if path.endswith(".py"):
-        path = path[:-3]
-    # Convert path separators to dots
-    path = path.replace("/", ".").replace("\\", ".")
-    return path
-
-
-def aggregate_testsuite_stats_for_shard(
-    reports_root: Path,
-    shard_type: str,
-    shard: int,
-    planned_files: List[str],
-) -> List[Dict]:
-    """
-    Aggregate all testsuite statistics for a specific shard.
-
-    The test execution generates XML files named `shard_{type}-{shard}_pytest*.xml`.
-    Each XML file contains testcases with `file` attribute indicating the test file.
-
-    Args:
-        reports_root: Root directory containing all merged report files
-        shard_type: Shard type ("distributed" or "regular")
-        shard: Shard number to aggregate for
-        planned_files: List of test file paths planned for this shard
-
-    Returns:
-        List of testsuite statistics for tests belonging to this shard.
-    """
-    all_testsuites = {}
-    # Map from test identifier -> aggregated stats
-
-    # Build set of test identifiers from planned files
-    planned_identifiers = set()
-    for planned in planned_files:
-        identifier = extract_test_identifier(planned)
-        if identifier:
-            planned_identifiers.add(identifier)
-
-    # Also include just the test file names for simpler matching
-    planned_test_names = set()
-    for planned in planned_files:
-        name = Path(planned).name.replace(".py", "")
-        planned_test_names.add(name)
-
-    print(f"DEBUG: planned_files count={len(planned_files)}, planned_identifiers count={len(planned_identifiers)}")
-    if planned_identifiers:
-        print(f"DEBUG: First 3 planned_identifiers: {list(planned_identifiers)[:3]}")
-
-    # Convert shard_type to file prefix ("distributed" -> "dist", "regular" -> "reg")
-    type_prefix = "dist" if shard_type == "distributed" else "reg"
-
-    # Debug: List all files in reports_root
-    print(f"DEBUG aggregate_testsuite_stats_for_shard: shard_type={shard_type}, shard={shard}")
-    print(f"DEBUG: reports_root={reports_root}, exists={reports_root.exists()}")
-    if reports_root.exists():
-        all_xml_files = list(reports_root.rglob("*.xml"))
-        print(f"DEBUG: Total XML files in reports_root (rglob): {len(all_xml_files)}")
-        matching_xml_files = list(reports_root.rglob(f"shard_{type_prefix}-{shard}_pytest*.xml"))
-        print(f"DEBUG: Matching XML files for shard_{type_prefix}-{shard}_pytest*.xml (rglob): {len(matching_xml_files)}")
-        for xf in matching_xml_files[:5]:
-            print(f"DEBUG:   - {xf.relative_to(reports_root)}")
-
-    # Find all XML files for this shard: shard_{type}-{shard}_pytest*.xml
-    # Use rglob to search recursively (files may be in subdirectories due to artifact merge)
-    for xml_path in reports_root.rglob(f"shard_{type_prefix}-{shard}_pytest*.xml"):
-        # Parse testcase elements and aggregate by file attribute
-        test_file_stats = aggregate_testcases_by_file(xml_path, planned_identifiers, planned_test_names)
-        for test_id, stats in test_file_stats.items():
-            if test_id in all_testsuites:
-                # Merge with existing stats
-                existing = all_testsuites[test_id]
-                existing["tests"] += stats["tests"]
-                existing["passed"] += stats["passed"]
-                existing["failures"] += stats["failures"]
-                existing["errors"] += stats["errors"]
-                existing["skipped"] += stats["skipped"]
-                existing["time"] += stats["time"]
-            else:
-                all_testsuites[test_id] = stats
-
-    # Also check nested directories for Phase 1 style XMLs (run_test.py output)
-    phase1_patterns = [
-        "junit",
-        "pytorch-test-src/test/test-reports/python-pytest",
-    ]
-
-    for phase1_pattern in phase1_patterns:
-        phase1_base = reports_root / phase1_pattern
-        if not phase1_base.exists():
-            continue
-
-        for test_dir in phase1_base.iterdir():
-            if not test_dir.is_dir():
-                continue
-            test_identifier = test_dir.name
-            matched = False
-            for planned_id in planned_identifiers:
-                if test_identifier == planned_id or test_identifier.startswith(planned_id) or planned_id.startswith(test_identifier):
-                    matched = True
-                    break
-            if not matched:
-                for test_name in planned_test_names:
-                    if test_identifier.endswith(test_name) or test_name in test_identifier:
-                        matched = True
-                        break
-            if not matched:
-                continue
-
-            if test_identifier not in all_testsuites:
-                all_testsuites[test_identifier] = {
-                    "name": test_identifier,
-                    "tests": 0,
-                    "passed": 0,
-                    "failures": 0,
-                    "errors": 0,
-                    "skipped": 0,
-                    "time": 0.0,
-                }
-            aggregated = all_testsuites[test_identifier]
-
-            for xml_file in test_dir.glob("*.xml"):
-                testsuites = parse_junit_xml_testsuites(xml_file)
-                for ts in testsuites:
-                    aggregated["tests"] += ts.get("tests", 0)
-                    aggregated["passed"] += ts.get("passed", 0)
-                    aggregated["failures"] += ts.get("failures", 0)
-                    aggregated["errors"] += ts.get("errors", 0)
-                    aggregated["skipped"] += ts.get("skipped", 0)
-                    aggregated["time"] += ts.get("time", 0.0)
-
-    # Convert to list and sort by name
-    result = list(all_testsuites.values())
-    result.sort(key=lambda x: x["name"])
-
-    return result
-
-
-def aggregate_testcases_by_file(xml_path: Path, planned_identifiers: set, planned_test_names: set) -> Dict[str, Dict]:
-    """
-    Parse XML file and aggregate testcase statistics by file attribute.
-
-    Used for XMLs where testsuite name is generic "pytest".
-    If planned_identifiers is empty, accept all testcases.
-    """
-    result = {}
-    debug_count = 0
-
-    try:
-        tree = ET.parse(xml_path)
-        root = tree.getroot()
-
-        # Find all testcase elements
-        testcases = root.findall(".//testcase")
-        print(f"DEBUG aggregate_testcases_by_file: {xml_path.name}, testcases={len(testcases)}, planned_ids={len(planned_identifiers)}")
-
-        for testcase in testcases:
-            file_attr = testcase.get("file", "")
-            classname_attr = testcase.get("classname", "")
-
-            # Extract test identifier from file attribute or classname
-            test_identifier = None
-
-            if file_attr:
-                # e.g., "distributed/fsdp/test_fsdp_sharded_grad_scaler.py"
-                test_identifier = extract_test_identifier("test/" + file_attr) if not file_attr.startswith("test/") else extract_test_identifier(file_attr)
-                if debug_count < 3:
-                    print(f"DEBUG: file_attr='{file_attr}' -> test_identifier='{test_identifier}'")
-                    debug_count += 1
-            elif classname_attr:
-                # classname format: "test.distributed._composable.fsdp.test_fully_shard_comm.TestFullyShardCollectiveOps"
-                # The last part is the class name, need to extract the module path
-                # e.g., extract "test.distributed._composable.fsdp.test_fully_shard_comm" (module name)
-                parts = classname_attr.split(".")
-                if len(parts) > 1:
-                    # Remove the last part (class name like TestFullyShardCollectiveOps)
-                    # Keep everything before the class name
-                    module_parts = parts[:-1]
-                    classname_attr = ".".join(module_parts)
-                # Convert to match planned_identifiers format (dot-separated, no test/ prefix)
-                # planned_identifiers format: "distributed._composable.fsdp.test_fully_shard_comm"
-                test_identifier = classname_attr
-                # Remove 'test.' prefix if present to match planned_identifiers
-                if test_identifier.startswith("test."):
-                    test_identifier = test_identifier[5:]
-                if debug_count < 3:
-                    print(f"DEBUG: classname_attr='{classname_attr}' -> test_identifier='{test_identifier}'")
-                    debug_count += 1
-
-            if not test_identifier:
-                continue
-
-            # If planned_identifiers is empty, accept all testcases
-            # Otherwise, check if this test belongs to planned files
-            if planned_identifiers or planned_test_names:
-                matched = False
-                for planned_id in planned_identifiers:
-                    if test_identifier == planned_id or test_identifier.startswith(planned_id) or planned_id.startswith(test_identifier):
-                        matched = True
-                        break
-                if not matched:
-                    for test_name in planned_test_names:
-                        if test_identifier.endswith(test_name) or test_name in test_identifier:
-                            matched = True
-                            break
-                if not matched:
-                    continue
-
-            # Initialize stats for this test file
-            if test_identifier not in result:
-                result[test_identifier] = {
-                    "name": test_identifier,
-                    "tests": 0,
-                    "passed": 0,
-                    "failures": 0,
-                    "errors": 0,
-                    "skipped": 0,
-                    "time": 0.0,
-                }
-
-            # Count testcase
-            stats = result[test_identifier]
-            stats["tests"] += 1
-
-            # Determine outcome
-            failure = testcase.find("failure")
-            error = testcase.find("error")
-            skipped = testcase.find("skipped")
-
-            if failure is not None:
-                stats["failures"] += 1
-            elif error is not None:
-                stats["errors"] += 1
-            elif skipped is not None:
-                stats["skipped"] += 1
-            else:
-                stats["passed"] += 1
-
-            # Add time
-            time_str = testcase.get("time", "0")
-            try:
-                stats["time"] += float(time_str)
-            except ValueError:
-                pass
-
-    except ET.ParseError as e:
-        print(f"Warning: Failed to parse XML {xml_path}: {e}")
-    except Exception as e:
-        print(f"Warning: Error reading XML {xml_path}: {e}")
-
-    return result
-
-
 def parse_requested_shards(raw: str) -> List[Tuple[str, int]]:
     """
     Parse shard identifiers from JSON array.
@@ -475,7 +140,6 @@ def discover_shard_files(
 ) -> Tuple[
     Dict[Tuple[str, int], Path],  # stats_files
     Dict[Tuple[str, int], Path],  # info_files
-    Dict[Tuple[str, int], Path],  # xml_files
     Dict[Tuple[str, int], Path],  # cases_files
 ]:
     """
@@ -491,7 +155,6 @@ def discover_shard_files(
     """
     stats_files = {}
     info_files = {}
-    xml_files = {}
     cases_files = {}
 
     def parse_shard_filename(path: Path, suffix_pattern: str) -> Tuple[str, int]:
@@ -523,27 +186,13 @@ def parse_shard_filename(path: Path, suffix_pattern: str) -> Tuple[str, int]:
         if key:
             info_files[key] = path
 
-    # Discover XML files for per-test-file statistics
-    for path in reports_root.rglob("shard_*_pytest*.xml"):
-        # XML filename: shard_{type}-{number}_pytest{suffix}.xml
-        stem = path.stem
-        match = re.match(r"shard_(dist|reg)-(\d+)_pytest", stem)
-        if match:
-            type_prefix = match.group(1)
-            shard_num = int(match.group(2))
-            if type_prefix == "dist":
-                key = ("distributed", shard_num)
-            elif type_prefix == "reg":
-                key = ("regular", shard_num)
-            xml_files[key] = path
-
     # Discover case-level results files
     for path in reports_root.rglob("shard_*_cases.json"):
         key = parse_shard_filename(path, "cases")
         if key:
             cases_files[key] = path
 
-    return stats_files, info_files, xml_files, cases_files
+    return stats_files, info_files, cases_files
 
 
 def build_file_to_shards_map(cases_shards_dir: Path) -> Dict[str, List[str]]:
@@ -698,7 +347,7 @@ def main():
                 file_discovery_stats["distributed_files"] = cases_summary_data.get("distributed_files", 0)
                 file_discovery_stats["regular_files"] = cases_summary_data.get("regular_files", 0)
 
-    stats_files, info_files, xml_files, cases_files = discover_shard_files(reports_root)
+    stats_files, info_files, cases_files = discover_shard_files(reports_root)
     special_test_files = discover_special_test_files(special_reports_root)
     shard_ids = requested_shards or sorted(set(stats_files) | set(info_files) | set(cases_files))
 
@@ -754,47 +403,6 @@ def main():
 
         present = bool(stats_path or cases_path)
 
-        # Parse ALL XML files to get per-test-file statistics
-        testsuite_stats = aggregate_testsuite_stats_for_shard(reports_root, shard_type, shard_num, [])
-
-        # If testsuite_stats has entries, aggregate their totals and override incomplete status
-        has_phase1_xmls = len(testsuite_stats) > 0
-        if has_phase1_xmls:
-            # Aggregate stats from Phase 1 XMLs
-            xml_totals = {
-                "tests": 0,
-                "passed": 0,
-                "failures": 0,
-                "errors": 0,
-                "skipped": 0,
-                "time": 0.0,
-            }
-            for ts in testsuite_stats:
-                xml_totals["tests"] += ts.get("tests", 0)
-                xml_totals["passed"] += ts.get("passed", 0)
-                xml_totals["failures"] += ts.get("failures", 0)
-                xml_totals["errors"] += ts.get("errors", 0)
-                xml_totals["skipped"] += ts.get("skipped", 0)
-                xml_totals["time"] += ts.get("time", 0.0)
-
-            # Use XML data to fill stats if:
-            # 1. stats.json doesn't exist (stats is empty) but we have XML data
-            # 2. stats.json exists but is incomplete and we have XML data to override
-            # This ensures per-file isolation mode shards get correct totals even without stats.json
-            if xml_totals["tests"] > 0:
-                # Always fill stats from XML if stats is empty or incomplete
-                if not stats or stats.get("incomplete"):
-                    stats["incomplete"] = False
-                    stats["total"] = xml_totals["tests"]
-                    stats["passed"] = xml_totals["passed"]
-                    stats["failed"] = xml_totals["failures"]
-                    stats["errors"] = xml_totals["errors"]
-                    stats["skipped"] = xml_totals["skipped"]
-                    stats["duration"] = xml_totals["time"]
-                    # Mark as present if we have XML data (even without stats.json)
-                    if not present:
-                        present = True
-
         if info.get("selection_mode"):
             selection_modes.add(str(info.get("selection_mode")))
 

From 69518283d310e60f1ccb94ed421adf7a2a9f2fc6 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Mon, 11 May 2026 20:33:25 +0800
Subject: [PATCH 122/132] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E7=94=A8=E4=BE=8B?=
 =?UTF-8?q?=E6=89=A7=E8=A1=8C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/scripts/run_npu_test_shard.py | 296 +++++++++++++++-----------
 1 file changed, 172 insertions(+), 124 deletions(-)

diff --git a/.github/scripts/run_npu_test_shard.py b/.github/scripts/run_npu_test_shard.py
index 2bb5686216..f5310efc92 100644
--- a/.github/scripts/run_npu_test_shard.py
+++ b/.github/scripts/run_npu_test_shard.py
@@ -42,10 +42,12 @@
 import importlib.util
 import json
 import os
+import re
 import signal
 import subprocess
 import sys
 import threading
+import xml.etree.ElementTree as ET
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime
 from pathlib import Path
@@ -194,8 +196,9 @@ def save_failed_case_log(
     Returns:
         Path to the saved log file
     """
-    # Only save for failed/error/crashed/timeout cases
-    if status not in ("failed", "error", "crashed", "timeout"):
+    # Save log for error/timeout cases (no XML generated)
+    # Cases with valid XML don't need log saving (XML contains all info)
+    if status not in ("error", "timeout"):
         return None
 
     # Create failed cases log directory
@@ -248,7 +251,6 @@ def __init__(self):
         self._failed_count: int = 0
         self._error_count: int = 0
         self._skipped_count: int = 0
-        self._crashed_count: int = 0
         self._timeout_count: int = 0
         self._total_cases: int = 0
 
@@ -265,19 +267,15 @@ def add_case_result(self, case_result: Dict) -> None:
                 self._failed_count += 1
             elif status == "skipped":
                 self._skipped_count += 1
-            elif status == "crashed":
-                self._crashed_count += 1
             elif status == "timeout":
                 self._timeout_count += 1
-            elif status == "error":
-                self._error_count += 1
             else:
-                # Unknown status treated as error
+                # error
                 self._error_count += 1
 
-            # Track worst returncode (ignore skipped/no_tests)
+            # Track worst returncode (ignore skipped)
             rc = case_result.get("returncode", 1)
-            if rc != 0 and rc != 3 and rc != 5:
+            if rc != 0:
                 if self._worst_returncode == 0:
                     self._worst_returncode = rc
 
@@ -295,7 +293,6 @@ def get_summary(self) -> Dict:
                 "failed_count": self._failed_count,
                 "error_count": self._error_count,
                 "skipped_count": self._skipped_count,
-                "crashed_count": self._crashed_count,
                 "timeout_count": self._timeout_count,
                 "worst_returncode": self._worst_returncode,
             }
@@ -322,8 +319,7 @@ def mark_completed(self, nodeid: str, status: str, duration: float) -> None:
                 "passed": "[PASS]",
                 "failed": "[FAIL]",
                 "error": "[ERR]",
-                "crashed": "[CRASH]",
-                "timeout": "[TIMEOUT]",
+                "timeout": "[TIME]",
                 "skipped": "[SKIP]",
             }.get(status, "[?]")
 
@@ -340,6 +336,66 @@ def get_progress(self) -> Tuple[int, int]:
             return self._completed_tasks, self._total_tasks
 
 
+# ==============================================================================
+# JUnit XML Parsing for Accurate Status Detection
+# ==============================================================================
+
+
+def parse_junit_xml_status(xml_file: Path) -> Dict:
+    """
+    解析 JUnit XML 报告，获取测试状态。
+
+    Args:
+        xml_file: JUnit XML 文件路径
+
+    Returns:
+        Dict: {"status": "passed" | "skipped" | "failed" | "error" | "no_xml", "message": str}
+    """
+    if not xml_file.exists():
+        return {"status": "no_xml", "message": "XML file not generated"}
+
+    try:
+        tree = ET.parse(str(xml_file))
+        root = tree.getroot()
+
+        for testcase in root.iter("testcase"):
+            result = {"status": "passed", "message": ""}
+
+            # Check <skipped>
+            skipped_elem = testcase.find("skipped")
+            if skipped_elem is not None:
+                result["status"] = "skipped"
+                result["message"] = skipped_elem.get("message", "")
+                return result
+
+            # Check <failure>
+            failure_elem = testcase.find("failure")
+            if failure_elem is not None:
+                result["status"] = "failed"
+                result["message"] = failure_elem.get("message", "")
+                return result
+
+            # Check <error>
+            error_elem = testcase.find("error")
+            if error_elem is not None:
+                result["status"] = "error"
+                result["message"] = error_elem.get("message", "")
+                return result
+
+            # No failure/error/skipped = passed
+            return result
+
+        return {"status": "error", "message": "No testcase in XML"}
+
+    except Exception:
+        return {"status": "no_xml", "message": "XML parse failed"}
+
+
+# ==============================================================================
+# Utility Functions
+# ==============================================================================
+
+
 def get_signal_name(signal_num: int) -> str:
     """Convert signal number to human-readable name."""
     try:
@@ -621,17 +677,24 @@ def run_single_test_case(
     test_file: str = "",
 ) -> Dict:
     """
-    Run a single test case in isolated subprocess.
+    Run a single test case in isolated subprocess with JUnit XML output.
 
     Adds test file's parent directory to PYTHONPATH to enable
     imports of sibling modules (e.g., 'from model_registry import MLPModule').
 
+    Uses JUnit XML report for accurate status detection (passed/skipped/failed/error).
+    pytest returncode cannot distinguish passed from skipped accurately.
+
     Args:
         case_nodeid: Test case nodeid (e.g., "test_autograd.py::TestAutograd::test_grad")
         test_dir: Path to PyTorch test directory
         env: Environment dict for subprocess (will be modified for this call)
         timeout: Per-case timeout in seconds
         verbose: Verbose output
+        report_dir: Directory for XML reports and logs
+        shard: Shard number for XML filename
+        shard_type: "distributed" or "regular"
+        case_idx: Case index for XML filename
         test_file: Test file path for PYTHONPATH calculation
 
     Returns:
@@ -643,7 +706,6 @@ def run_single_test_case(
     original_nodeid = case_nodeid
 
     # Strip test/ prefix from nodeid if present (pytest --collect-only outputs with test/ prefix)
-    # When cwd is test_dir, the path should be relative to test_dir, not include test/
     if case_nodeid.startswith("test/"):
         case_nodeid = case_nodeid[5:]
 
@@ -661,6 +723,12 @@ def run_single_test_case(
         existing_pythonpath = case_env.get("PYTHONPATH", "")
         case_env["PYTHONPATH"] = str(test_file_dir) + (":" + existing_pythonpath if existing_pythonpath else "")
 
+    # Generate XML file path for this case
+    prefix = "dist" if shard_type == "distributed" else "reg"
+    xml_filename = f"case_{prefix}-{shard}_{case_idx}.xml"
+    xml_file = report_dir / "junit_xmls" / xml_filename
+    xml_file.parent.mkdir(parents=True, exist_ok=True)
+
     command = [
         sys.executable,
         "-m",
@@ -669,6 +737,8 @@ def run_single_test_case(
         "-ra",
         "--tb=short",
         case_nodeid,
+        f"--junitxml={xml_file}",
+        "--junit-prefix=",
     ]
 
     if timeout > 0:
@@ -687,7 +757,7 @@ def run_single_test_case(
         result = subprocess.run(
             command,
             cwd=str(test_dir),
-            env=case_env,  # Use per-case environment with test file directory in PYTHONPATH
+            env=case_env,
             capture_output=True,
             text=True,
             encoding="utf-8",
@@ -698,32 +768,34 @@ def run_single_test_case(
         duration = monotonic() - start_time
         returncode = result.returncode
 
-        # Determine status
-        if returncode == 0:
-            status = "passed"
-        elif returncode == 1:
-            status = "failed"
-        elif returncode == 2:
+        # Parse JUnit XML for status (simple logic)
+        # - Has XML: use XML status (passed/failed/skipped/error), don't save logs
+        # - No XML: status = error, save stdout/stderr
+        xml_result = parse_junit_xml_status(xml_file)
+        xml_status = xml_result.get("status")
+
+        if xml_status == "no_xml":
+            # No XML generated → error, save logs
             status = "error"
-        elif returncode == 3:
-            status = "skipped"
-        elif returncode == 4:
-            status = "error"  # usage error
-        elif returncode == 5:
-            status = "no_tests"
-        elif returncode < 0:
-            status = "crashed"
+            message = xml_result.get("message")
+            if report_dir:
+                save_failed_case_log(
+                    report_dir=report_dir,
+                    shard=shard,
+                    shard_type=shard_type,
+                    nodeid=original_nodeid,
+                    case_idx=case_idx,
+                    status=status,
+                    stdout=result.stdout,
+                    stderr=result.stderr,
+                    duration=duration,
+                    returncode=returncode,
+                    command=command_str,
+                )
         else:
-            status = "error"
-
-        # Extract error message from output
-        message = ""
-        if status in ("failed", "error", "crashed"):
-            # Extract last meaningful lines from stderr/stdout
-            output = result.stderr + result.stdout
-            lines = output.splitlines()
-            error_lines = [l for l in lines[-20:] if l.strip()]
-            message = "\n".join(error_lines[-5:])[:500]  # Limit message length
+            # Has valid XML → use XML status, don't save logs
+            status = xml_status
+            message = xml_result.get("message", "")
 
         case_result = {
             "nodeid": original_nodeid,
@@ -734,36 +806,22 @@ def run_single_test_case(
             "command": command_str,
         }
 
-        # Save failed case log to file
-        if status in ("failed", "error", "crashed") and report_dir:
-            save_failed_case_log(
-                report_dir=report_dir,
-                shard=shard,
-                shard_type=shard_type,
-                nodeid=original_nodeid,
-                case_idx=case_idx,
-                status=status,
-                stdout=result.stdout,
-                stderr=result.stderr,
-                duration=duration,
-                returncode=returncode,
-                command=command_str,
-            )
-
         return case_result
 
     except subprocess.TimeoutExpired:
         duration = monotonic() - start_time
+        # Timeout → no XML generated
+        status = "timeout"
         case_result = {
             "nodeid": original_nodeid,
-            "status": "timeout",
+            "status": status,
             "duration": duration,
             "returncode": -1,
             "message": f"Timeout after {timeout}s",
             "command": command_str,
         }
 
-        # Save timeout case log
+        # Save log for timeout (no XML)
         if report_dir:
             save_failed_case_log(
                 report_dir=report_dir,
@@ -771,7 +829,7 @@ def run_single_test_case(
                 shard_type=shard_type,
                 nodeid=original_nodeid,
                 case_idx=case_idx,
-                status="timeout",
+                status=status,
                 stdout="(process timed out, no output captured)",
                 stderr="(process timed out, no output captured)",
                 duration=duration,
@@ -859,6 +917,11 @@ def run_single_case_concurrent(
     if case_nodeid.startswith("test/"):
         case_nodeid = case_nodeid[5:]
 
+    # Generate XML file path for this case
+    prefix = "dist" if shard_type == "distributed" else "reg"
+    xml_filename = f"case_{prefix}-{shard}_{task.case_idx}.xml"
+    xml_file = report_dir / "junit_xmls" / xml_filename
+
     command = [
         sys.executable,
         "-m",
@@ -867,6 +930,8 @@ def run_single_case_concurrent(
         "-ra",
         "--tb=short",
         case_nodeid,
+        f"--junitxml={xml_file}",
+        "--junit-prefix=",
     ]
 
     if config.per_case_timeout > 0:
@@ -924,49 +989,16 @@ def run_single_case_concurrent(
         duration = monotonic() - start_time
         returncode = result.returncode
 
-        # Determine status (including crashed with negative returncode)
-        if returncode == 0:
-            status = "passed"
-        elif returncode == 1:
-            status = "failed"
-        elif returncode == 2:
-            status = "error"
-        elif returncode == 3:
-            status = "skipped"
-        elif returncode == 4:
-            status = "error"
-        elif returncode == 5:
-            status = "no_tests"
-        elif returncode < 0:
-            # Core dump or signal crash
-            status = "crashed"
-            signal_name = get_signal_name(abs(returncode))
-        else:
-            status = "error"
-
-        # Extract error message
-        message = ""
-        if status in ("failed", "error", "crashed"):
-            output = result.stderr + result.stdout
-            lines = output.splitlines()
-            error_lines = [l for l in lines[-20:] if l.strip()]
-            message = "\n".join(error_lines[-5:])[:500]
-            if status == "crashed":
-                message = f"Process crashed with {signal_name}\n{message}"
-
-        case_result = {
-            "nodeid": original_nodeid,
-            "status": status,
-            "duration": duration,
-            "returncode": returncode,
-            "message": message,
-            "command": command_str,
-            "file": task.test_file,
-            "case_idx": task.case_idx,
-        }
+        # Parse JUnit XML for status
+        # - Has XML: use XML status, don't save logs
+        # - No XML: error, save logs
+        xml_result = parse_junit_xml_status(xml_file)
+        xml_status = xml_result.get("status")
 
-        # Save failed case log to file
-        if status in ("failed", "error", "crashed"):
+        if xml_status == "no_xml":
+            # No XML → error, save logs
+            status = "error"
+            message = xml_result.get("message")
             save_failed_case_log(
                 report_dir=report_dir,
                 shard=shard,
@@ -980,13 +1012,29 @@ def run_single_case_concurrent(
                 returncode=returncode,
                 command=command_str,
             )
+        else:
+            # Has XML → use XML status, don't save logs
+            status = xml_status
+            message = xml_result.get("message", "")
+
+        case_result = {
+            "nodeid": original_nodeid,
+            "status": status,
+            "duration": duration,
+            "returncode": returncode,
+            "message": message,
+            "command": command_str,
+            "file": task.test_file,
+            "case_idx": task.case_idx,
+        }
 
     except subprocess.TimeoutExpired:
-        # Timeout - return result, don't raise
+        # Timeout → no XML, status = timeout
         duration = monotonic() - start_time
+        status = "timeout"
         case_result = {
             "nodeid": original_nodeid,
-            "status": "timeout",
+            "status": status,
             "duration": duration,
             "returncode": -1,
             "message": f"Timeout after {config.per_case_timeout}s",
@@ -995,14 +1043,14 @@ def run_single_case_concurrent(
             "case_idx": task.case_idx,
         }
 
-        # Save timeout case log
+        # Save log for timeout
         save_failed_case_log(
             report_dir=report_dir,
             shard=shard,
             shard_type=shard_type,
             nodeid=original_nodeid,
             case_idx=task.case_idx,
-            status="timeout",
+            status=status,
             stdout="(process timed out, no output captured)",
             stderr="(process timed out, no output captured)",
             duration=duration,
@@ -1132,6 +1180,10 @@ def run_tests_with_concurrent_isolation(
     start = monotonic()
     log_file = result_module.get_shard_log_file(report_dir, shard, shard_type)
 
+    # Create junit_xmls directory for XML reports
+    junit_xml_dir = report_dir / "junit_xmls"
+    junit_xml_dir.mkdir(parents=True, exist_ok=True)
+
     merged_env = os.environ.copy()
     merged_env.update(env_updates)
 
@@ -1261,7 +1313,6 @@ def run_tests_with_concurrent_isolation(
             f"  Passed: {summary['passed_count']}\n"
             f"  Failed: {summary['failed_count']}\n"
             f"  Errors: {summary['error_count']}\n"
-            f"  Crashed: {summary['crashed_count']}\n"
             f"  Timeout: {summary['timeout_count']}\n"
             f"  Skipped: {summary['skipped_count']}\n"
             f"  Duration: {elapsed:.2f}s\n"
@@ -1279,7 +1330,6 @@ def run_tests_with_concurrent_isolation(
     print(f"  Passed: {summary['passed_count']}", flush=True)
     print(f"  Failed: {summary['failed_count']}", flush=True)
     print(f"  Errors: {summary['error_count']}", flush=True)
-    print(f"  Crashed: {summary['crashed_count']}", flush=True)
     print(f"  Timeout: {summary['timeout_count']}", flush=True)
     print(f"  Skipped: {summary['skipped_count']}", flush=True)
     print(f"  Duration: {elapsed:.2f}s", flush=True)
@@ -1312,6 +1362,10 @@ def run_tests_with_case_isolation(
     prefix = result_module.get_shard_type_prefix(shard_type)
     log_file = result_module.get_shard_log_file(report_dir, shard, shard_type)
 
+    # Create junit_xmls directory for XML reports
+    junit_xml_dir = report_dir / "junit_xmls"
+    junit_xml_dir.mkdir(parents=True, exist_ok=True)
+
     merged_env = os.environ.copy()
     merged_env.update(env_updates)
 
@@ -1398,7 +1452,7 @@ def run_tests_with_case_isolation(
 
                 print(f"      {status_str} ({duration_str})")
                 # Print error message for failed/error cases to stdout
-                if status_str in ("failed", "error", "crashed", "timeout") and message:
+                if status_str in ("failed", "error") and message:
                     # Print first few lines of error message
                     msg_lines = message.splitlines()[:5]
                     for msg_line in msg_lines:
@@ -1419,7 +1473,6 @@ def run_tests_with_case_isolation(
         passed_count = sum(1 for c in cases_list if c["status"] == "passed")
         failed_count = sum(1 for c in cases_list if c["status"] == "failed")
         error_count = sum(1 for c in cases_list if c["status"] == "error")
-        crashed_count = sum(1 for c in cases_list if c["status"] == "crashed")
         timeout_count = sum(1 for c in cases_list if c["status"] == "timeout")
         skipped_count = sum(1 for c in cases_list if c["status"] == "skipped")
 
@@ -1428,7 +1481,6 @@ def run_tests_with_case_isolation(
         log_handle.write(f"  Passed: {passed_count}\n")
         log_handle.write(f"  Failed: {failed_count}\n")
         log_handle.write(f"  Errors: {error_count}\n")
-        log_handle.write(f"  Crashed: {crashed_count}\n")
         log_handle.write(f"  Timeout: {timeout_count}\n")
         log_handle.write(f"  Skipped: {skipped_count}\n")
         log_handle.write(f"  Duration: {elapsed:.2f}s\n")
@@ -1437,7 +1489,7 @@ def run_tests_with_case_isolation(
 
         print(f"\n{'=' * 80}")
         print(f"Summary: {total_cases} cases executed")
-        print(f"  Passed: {passed_count}, Failed: {failed_count}, Errors: {error_count}, Crashed: {crashed_count}, Timeout: {timeout_count}")
+        print(f"  Passed: {passed_count}, Failed: {failed_count}, Errors: {error_count}, Timeout: {timeout_count}, Skipped: {skipped_count}")
         print(f"  Duration: {elapsed:.2f}s")
         print(f"{'=' * 80}")
 
@@ -1480,6 +1532,10 @@ def run_tests_with_tasks_concurrent(
     start = monotonic()
     log_file = result_module.get_shard_log_file(report_dir, shard, shard_type)
 
+    # Create junit_xmls directory for XML reports
+    junit_xml_dir = report_dir / "junit_xmls"
+    junit_xml_dir.mkdir(parents=True, exist_ok=True)
+
     merged_env = os.environ.copy()
     merged_env.update(env_updates)
 
@@ -1580,7 +1636,6 @@ def run_tests_with_tasks_concurrent(
             f"  Passed: {summary['passed_count']}\n"
             f"  Failed: {summary['failed_count']}\n"
             f"  Errors: {summary['error_count']}\n"
-            f"  Crashed: {summary['crashed_count']}\n"
             f"  Timeout: {summary['timeout_count']}\n"
             f"  Skipped: {summary['skipped_count']}\n"
             f"  Duration: {elapsed:.2f}s\n"
@@ -1598,7 +1653,6 @@ def run_tests_with_tasks_concurrent(
     print(f"  Passed: {summary['passed_count']}", flush=True)
     print(f"  Failed: {summary['failed_count']}", flush=True)
     print(f"  Errors: {summary['error_count']}", flush=True)
-    print(f"  Crashed: {summary['crashed_count']}", flush=True)
     print(f"  Timeout: {summary['timeout_count']}", flush=True)
     print(f"  Skipped: {summary['skipped_count']}", flush=True)
     print(f"  Duration: {elapsed:.2f}s", flush=True)
@@ -1858,7 +1912,6 @@ def main():
         passed_count = sum(1 for c in cases_list if c["status"] == "passed")
         failed_count = sum(1 for c in cases_list if c["status"] == "failed")
         error_count = sum(1 for c in cases_list if c["status"] == "error")
-        crashed_count = sum(1 for c in cases_list if c["status"] == "crashed")
         timeout_count = sum(1 for c in cases_list if c["status"] == "timeout")
         skipped_count = sum(1 for c in cases_list if c["status"] == "skipped")
 
@@ -1871,9 +1924,8 @@ def main():
             "passed": passed_count,
             "failed": failed_count,
             "errors": error_count,
-            "skipped": skipped_count,
-            "crashed": crashed_count,
             "timeout": timeout_count,
+            "skipped": skipped_count,
             "duration": duration,
             "cases": cases_list,
         }
@@ -1890,6 +1942,7 @@ def main():
             "failed": failed_count,
             "skipped": skipped_count,
             "errors": error_count,
+            "timeout": timeout_count,
             "duration": duration,
             "returncode": returncode,
             "per_case_isolation": True,
@@ -2011,9 +2064,8 @@ def main():
         passed_count = sum(1 for c in cases_list if c["status"] == "passed")
         failed_count = sum(1 for c in cases_list if c["status"] == "failed")
         error_count = sum(1 for c in cases_list if c["status"] == "error")
-        skipped_count = sum(1 for c in cases_list if c["status"] == "skipped")
-        crashed_count = sum(1 for c in cases_list if c["status"] == "crashed")
         timeout_count = sum(1 for c in cases_list if c["status"] == "timeout")
+        skipped_count = sum(1 for c in cases_list if c["status"] == "skipped")
 
         output_cases_data = {
             "shard": shard,
@@ -2024,9 +2076,8 @@ def main():
             "passed": passed_count,
             "failed": failed_count,
             "errors": error_count,
-            "skipped": skipped_count,
-            "crashed": crashed_count,
             "timeout": timeout_count,
+            "skipped": skipped_count,
             "duration": duration,
             "cases": cases_list,
         }
@@ -2043,7 +2094,6 @@ def main():
             "failed": failed_count,
             "skipped": skipped_count,
             "errors": error_count,
-            "crashed": crashed_count,
             "timeout": timeout_count,
             "duration": duration,
             "returncode": returncode,
@@ -2180,7 +2230,6 @@ def main():
     passed_count = sum(1 for c in cases_list if c["status"] == "passed")
     failed_count = sum(1 for c in cases_list if c["status"] == "failed")
     error_count = sum(1 for c in cases_list if c["status"] == "error")
-    crashed_count = sum(1 for c in cases_list if c["status"] == "crashed")
     timeout_count = sum(1 for c in cases_list if c["status"] == "timeout")
     skipped_count = sum(1 for c in cases_list if c["status"] == "skipped")
 
@@ -2193,9 +2242,8 @@ def main():
         "passed": passed_count,
         "failed": failed_count,
         "errors": error_count,
-        "skipped": skipped_count,
-        "crashed": crashed_count,
         "timeout": timeout_count,
+        "skipped": skipped_count,
         "duration": duration,
         "cases": cases_list,
     }

From aed81ccf282bfd081dc3f4290111f9e9b27817f4 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Mon, 11 May 2026 21:32:53 +0800
Subject: [PATCH 123/132] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E7=94=A8=E4=BE=8Bxml?=
 =?UTF-8?q?=E6=96=87=E4=BB=B6=E5=91=BD=E5=90=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/scripts/run_npu_test_shard.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/scripts/run_npu_test_shard.py b/.github/scripts/run_npu_test_shard.py
index f5310efc92..8e5526378a 100644
--- a/.github/scripts/run_npu_test_shard.py
+++ b/.github/scripts/run_npu_test_shard.py
@@ -723,9 +723,10 @@ def run_single_test_case(
         existing_pythonpath = case_env.get("PYTHONPATH", "")
         case_env["PYTHONPATH"] = str(test_file_dir) + (":" + existing_pythonpath if existing_pythonpath else "")
 
-    # Generate XML file path for this case
+    # Generate XML file path for this case with descriptive name
     prefix = "dist" if shard_type == "distributed" else "reg"
-    xml_filename = f"case_{prefix}-{shard}_{case_idx}.xml"
+    safe_case_name = sanitize_nodeid_for_filename(original_nodeid)
+    xml_filename = f"{prefix}-{shard}_{case_idx}_{safe_case_name}.xml"
     xml_file = report_dir / "junit_xmls" / xml_filename
     xml_file.parent.mkdir(parents=True, exist_ok=True)
 
@@ -917,9 +918,10 @@ def run_single_case_concurrent(
     if case_nodeid.startswith("test/"):
         case_nodeid = case_nodeid[5:]
 
-    # Generate XML file path for this case
+    # Generate XML file path with descriptive name
     prefix = "dist" if shard_type == "distributed" else "reg"
-    xml_filename = f"case_{prefix}-{shard}_{task.case_idx}.xml"
+    safe_case_name = sanitize_nodeid_for_filename(original_nodeid)
+    xml_filename = f"{prefix}-{shard}_{task.case_idx}_{safe_case_name}.xml"
     xml_file = report_dir / "junit_xmls" / xml_filename
 
     command = [

From 0f9f66484c74d83f156c4baac4ee868e582b0d23 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Mon, 11 May 2026 21:49:18 +0800
Subject: [PATCH 124/132] =?UTF-8?q?=E4=BF=AE=E6=94=B9dis=E7=94=A8=E4=BE=8B?=
 =?UTF-8?q?=E5=88=86=E7=89=87=E6=95=B03?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/torch-npu-upstream-test-trigger.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/torch-npu-upstream-test-trigger.yml b/.github/workflows/torch-npu-upstream-test-trigger.yml
index fa3d8f1ca5..203864cb3e 100644
--- a/.github/workflows/torch-npu-upstream-test-trigger.yml
+++ b/.github/workflows/torch-npu-upstream-test-trigger.yml
@@ -42,7 +42,7 @@ on:
         default: 'quay.io/kerer/pytorch:manylinux-cann9.0.0-beta.2-20260428'
         type: string
       distributed_shards:
-        description: 'Number of shards for distributed tests (default 2)'
+        description: 'Number of shards for distributed tests (default 3)'
         required: false
         default: '3'
         type: string
@@ -65,6 +65,6 @@ jobs:
       pytorch_ref: ${{ github.event.inputs.pytorch_ref || 'fccc94ae83f61fe26559abc999797297196bac29' }}
       torch_npu_ref: ${{ github.event.inputs.torch_npu_ref || 'master' }}
       docker_image: ${{ github.event.inputs.docker_image || 'quay.io/kerer/pytorch:manylinux-cann9.0.0-beta.2-20260428' }}
-      distributed_shards: ${{ github.event.inputs.distributed_shards || '2' }}
+      distributed_shards: ${{ github.event.inputs.distributed_shards || '3' }}
       regular_shards: ${{ github.event.inputs.regular_shards || '5' }}
       test_files: ${{ github.event.inputs.test_files || '' }}
\ No newline at end of file

From 601e622cb257d7a0f1de7ddfec7f1affedbb39ac Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Tue, 12 May 2026 11:12:31 +0800
Subject: [PATCH 125/132] =?UTF-8?q?=E5=90=8C=E6=AD=A52.7.1=E5=88=86?=
 =?UTF-8?q?=E6=94=AF=E4=BF=AE=E6=94=B9=EF=BC=9Axml=E6=96=87=E4=BB=B6?=
 =?UTF-8?q?=E5=91=BD=E5=90=8D=E7=89=B9=E6=AE=8A=E5=AD=97=E7=AC=A6bug;xml?=
 =?UTF-8?q?=E6=96=87=E4=BB=B6=E9=A2=84=E5=8E=8B=E7=BC=A9=E6=88=90tar.gz;?=
 =?UTF-8?q?=E6=8A=A5=E5=91=8A=E5=88=A0=E9=99=A4crash=E8=A1=A5=E5=85=85skip?=
 =?UTF-8?q?=E7=8A=B6=E6=80=81;=E6=89=A7=E8=A1=8C=E8=84=9A=E6=9C=AC?=
 =?UTF-8?q?=E6=96=B0=E5=A2=9Equick=E2=80=94test=E5=8F=82=E6=95=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../scripts/generate_npu_full_test_report.py  | 22 +++----
 .github/scripts/parse_test_results.py         | 15 +----
 .github/scripts/run_npu_test_shard.py         | 57 ++++++++++++++++++-
 .../_torch-npu-upstream-test-custom.yml       | 10 ++++
 .../_torch-npu-upstream-test-dist.yml         | 10 ++++
 .../_torch-npu-upstream-test-regular.yml      | 10 ++++
 6 files changed, 95 insertions(+), 29 deletions(-)

diff --git a/.github/scripts/generate_npu_full_test_report.py b/.github/scripts/generate_npu_full_test_report.py
index d7cd31ed5e..55822ec30a 100644
--- a/.github/scripts/generate_npu_full_test_report.py
+++ b/.github/scripts/generate_npu_full_test_report.py
@@ -254,8 +254,6 @@ def build_file_to_shards_map(cases_shards_dir: Path) -> Dict[str, List[str]]:
 def get_shard_status(stats: Dict, present: bool) -> str:
     if not present:
         return "MISSING"
-    if stats.get("crashed"):
-        return "CRASHED"
     if stats.get("timed_out"):
         return "TIMEOUT"
     if stats.get("incomplete"):
@@ -272,7 +270,7 @@ def get_shard_status(stats: Dict, present: bool) -> str:
 def get_overall_status(status_counts: Counter) -> str:
     if status_counts["MISSING"] > 0:
         return "FAILED"
-    if any(status_counts[key] > 0 for key in ("CRASHED", "TIMEOUT", "INCOMPLETE", "ERROR", "FAILED")):
+    if any(status_counts[key] > 0 for key in ("TIMEOUT", "INCOMPLETE", "ERROR", "FAILED")):
         return "FAILED"
     if status_counts["PASSED"] > 0:
         return "PASSED"
@@ -362,7 +360,6 @@ def main():
         "failed": 0,
         "errors": 0,
         "skipped": 0,
-        "crashed": 0,
         "timeout": 0,
         "duration": 0.0,
     }
@@ -388,16 +385,14 @@ def main():
             stats["failed"] = cases_data.get("failed", 0)
             stats["errors"] = cases_data.get("errors", 0)
             stats["skipped"] = cases_data.get("skipped", 0)
-            stats["crashed"] = cases_data.get("crashed", 0)
             stats["timeout"] = cases_data.get("timeout", 0)
             stats["duration"] = cases_data.get("duration", 0.0)
-            # Update totals (正交累加: total = passed + failed + errors + skipped + crashed + timeout)
+            # Update totals (正交累加: total = passed + failed + errors + skipped + timeout)
             totals["total"] += cases_data.get("total_cases", 0)
             totals["passed"] += cases_data.get("passed", 0)
             totals["failed"] += cases_data.get("failed", 0)
             totals["errors"] += cases_data.get("errors", 0)
             totals["skipped"] += cases_data.get("skipped", 0)
-            totals["crashed"] += cases_data.get("crashed", 0)
             totals["timeout"] += cases_data.get("timeout", 0)
             totals["duration"] += cases_data.get("duration", 0.0)
 
@@ -422,7 +417,6 @@ def main():
                 "failed": int(stats.get("failed", 0)),
                 "skipped": int(stats.get("skipped", 0)),
                 "errors": int(stats.get("errors", 0)),
-                "crashed": int(stats.get("crashed", 0)),
                 "timeout": int(stats.get("timeout", 0)),
                 "duration": float(stats.get("duration", 0.0)),
             }
@@ -497,7 +491,7 @@ def main():
             (
                 f"{totals['total']} total; {totals['passed']} passed; {totals['failed']} failed; "
                 f"{totals['errors']} errors; {totals['skipped']} skipped; "
-                f"{totals['crashed']} crashed; {totals['timeout']} timeout"
+                f"{totals['timeout']} timeout"
             ),
         ],
     ]
@@ -528,7 +522,7 @@ def main():
         markdown_lines.extend(["", "## 用例级执行统计"])
         markdown_lines.extend(
             render_table(
-                ["Shard", "总用例", "通过", "失败", "错误", "崩溃", "超时", "Duration"],
+                ["Shard", "总用例", "通过", "失败", "错误", "跳过", "超时", "Duration"],
                 [
                     [
                         f"{row['shard']}",
@@ -536,7 +530,7 @@ def main():
                         str(row["passed"]),
                         str(row["failed"]),
                         str(row["errors"]),
-                        str(row.get("crashed", 0)),
+                        str(row.get("skipped", 0)),
                         str(row.get("timeout", 0)),
                         format_duration(row["duration"]),
                     ]
@@ -560,7 +554,7 @@ def main():
 
             file_rows = []
             for fs in sorted_files:  # Show all files
-                failed_total = fs["failed"] + fs["errors"] + fs["crashed"] + fs["timeout"]
+                failed_total = fs["failed"] + fs["errors"] + fs["timeout"]
                 fail_rate = f"{(failed_total / fs['total'] * 100):.1f}%" if fs["total"] > 0 else "0%"
                 # Get shard info for this file
                 file_path = fs["file"]
@@ -577,14 +571,14 @@ def main():
                     str(fs["passed"]),
                     str(fs["failed"]),
                     str(fs["errors"]),
-                    str(fs["crashed"]),
+                    str(fs["skipped"]),
                     str(fs["timeout"]),
                     fail_rate,
                 ])
 
             markdown_lines.extend(
                 render_table(
-                    ["测试文件", "分片", "总用例", "通过", "失败", "错误", "崩溃", "超时", "失败率"],
+                    ["测试文件", "分片", "总用例", "通过", "失败", "错误", "跳过", "超时", "失败率"],
                     file_rows,
                 )
             )
diff --git a/.github/scripts/parse_test_results.py b/.github/scripts/parse_test_results.py
index 0dfb34e8e1..46768b56d2 100644
--- a/.github/scripts/parse_test_results.py
+++ b/.github/scripts/parse_test_results.py
@@ -485,7 +485,6 @@ def aggregate_cases_by_file(cases_list: List[Dict]) -> Dict[str, Dict]:
                 "passed": 0,
                 "failed": 0,
                 "errors": 0,
-                "crashed": 0,
                 "timeout": 0,
                 "skipped": 0,
                 "failed_cases": [],
@@ -514,14 +513,6 @@ def aggregate_cases_by_file(cases_list: List[Dict]) -> Dict[str, Dict]:
                 "message": case.get("message", ""),
                 "duration": duration,
             })
-        elif status == "crashed":
-            stats["crashed"] += 1
-            stats["failed_cases"].append({
-                "nodeid": case.get("nodeid"),
-                "status": "crashed",
-                "message": case.get("message", ""),
-                "duration": duration,
-            })
         elif status == "timeout":
             stats["timeout"] += 1
             stats["failed_cases"].append({
@@ -560,7 +551,6 @@ def aggregate_all_cases_by_file(cases_results: Dict) -> Dict[str, Dict]:
                     "passed": 0,
                     "failed": 0,
                     "errors": 0,
-                    "crashed": 0,
                     "timeout": 0,
                     "skipped": 0,
                     "failed_cases": [],
@@ -572,7 +562,6 @@ def aggregate_all_cases_by_file(cases_results: Dict) -> Dict[str, Dict]:
             existing["passed"] += stats["passed"]
             existing["failed"] += stats["failed"]
             existing["errors"] += stats["errors"]
-            existing["crashed"] += stats["crashed"]
             existing["timeout"] += stats["timeout"]
             existing["skipped"] += stats["skipped"]
             existing["duration"] += stats["duration"]
@@ -589,7 +578,7 @@ def aggregate_all_cases_by_file(cases_results: Dict) -> Dict[str, Dict]:
 
 def get_files_with_failures(file_stats: Dict[str, Dict]) -> List[Dict]:
     """
-    Get list of test files that have failures/errors/crashes/timeout.
+    Get list of test files that have failures/errors/timeout.
 
     Args:
         file_stats: Dict from aggregate_all_cases_by_file()
@@ -599,7 +588,7 @@ def get_files_with_failures(file_stats: Dict[str, Dict]) -> List[Dict]:
     """
     failed_files = []
     for test_file, stats in file_stats.items():
-        if stats["failed"] > 0 or stats["errors"] > 0 or stats["crashed"] > 0 or stats["timeout"] > 0:
+        if stats["failed"] > 0 or stats["errors"] > 0 or stats["timeout"] > 0:
             failed_files.append(stats)
 
     failed_files.sort(key=lambda x: x["file"])
diff --git a/.github/scripts/run_npu_test_shard.py b/.github/scripts/run_npu_test_shard.py
index 8e5526378a..365493fc48 100644
--- a/.github/scripts/run_npu_test_shard.py
+++ b/.github/scripts/run_npu_test_shard.py
@@ -152,11 +152,17 @@ def sanitize_nodeid_for_filename(nodeid: str) -> str:
     Convert nodeid to a safe filename.
 
     Replaces special characters with underscores and truncates if too long.
+    Invalid characters for NTFS/filesystems: " : < > | * ? \r \n
     """
-    # Replace special characters
+    # Replace special characters (including NTFS-invalid chars)
     safe_name = nodeid.replace("::", "_").replace("/", "_").replace("\\", "_")
     safe_name = safe_name.replace("(", "_").replace(")", "_").replace("[", "_").replace("]", "_")
-    safe_name = safe_name.replace("*", "_").replace("?", "_").replace(" ", "_")
+    # NTFS-invalid characters that GitHub Actions artifact upload rejects
+    safe_name = safe_name.replace("<", "_lt_").replace(">", "_gt_")
+    safe_name = safe_name.replace('"', "_quot_").replace("|", "_pipe_")
+    safe_name = safe_name.replace("*", "_star_").replace("?", "_q_")
+    safe_name = safe_name.replace(":", "_colon_")
+    safe_name = safe_name.replace(" ", "_")
     safe_name = safe_name.replace(".", "_")
 
     # Remove leading underscores and collapse multiple underscores
@@ -1152,6 +1158,7 @@ def run_tests_with_concurrent_isolation(
     shard_type: str,
     max_workers: int,
     result_module,
+    quick_test: int = None,
 ) -> Tuple[int, float, List[Dict]]:
     """
     Execute tests with concurrent per-case isolation.
@@ -1175,6 +1182,7 @@ def run_tests_with_concurrent_isolation(
         shard_type: "distributed" or "regular"
         max_workers: Maximum concurrent subprocesses (default: 4)
         result_module: parse_test_results module
+        quick_test: Maximum number of cases to execute (None = all cases)
 
     Returns:
         Tuple of (worst_returncode, duration, cases_list_sorted)
@@ -1233,7 +1241,15 @@ def run_tests_with_concurrent_isolation(
     case_idx = 0
 
     print("Phase 1: Collecting test cases...")
+    if quick_test:
+        print(f"  Quick test mode: will collect up to {quick_test} cases")
+
     for file_idx, test_file in enumerate(planned_tests, 1):
+        # Quick test: stop collecting if already have enough cases
+        if quick_test and case_idx >= quick_test:
+            print(f"\n  Quick test limit reached ({quick_test} cases), stopping collection")
+            break
+
         test_name = strip_test_prefix_and_suffix(test_file)
         print(f"\n  [File {file_idx}/{len(planned_tests)}] Collecting: {test_name}")
 
@@ -1254,6 +1270,11 @@ def run_tests_with_concurrent_isolation(
                 file_idx=file_idx,
             ))
 
+            # Quick test: stop collecting if reached limit
+            if quick_test and case_idx >= quick_test:
+                print(f"    Quick test limit reached ({quick_test} cases)")
+                break
+
     total_cases = len(all_tasks)
     print(f"\n{'=' * 80}")
     print(f"Phase 2: Concurrent execution with {max_workers} workers")
@@ -1350,6 +1371,7 @@ def run_tests_with_case_isolation(
     verbose: bool,
     shard_type: str,
     result_module,
+    quick_test: int = None,
 ) -> Tuple[int, float, List[Dict]]:
     """
     Execute tests with per-case isolation (strict serial execution).
@@ -1357,6 +1379,9 @@ def run_tests_with_case_isolation(
     Each test case runs in its own pytest subprocess for crash isolation.
     No parallel execution - strict serial processing.
 
+    Args:
+        quick_test: Maximum number of cases to execute (None = all cases)
+
     Returns:
         Tuple of (worst_returncode, duration, cases_list)
     """
@@ -1386,12 +1411,19 @@ def run_tests_with_case_isolation(
         print(f"\n{'=' * 80}")
         print(f"Per-case isolation mode: {len(planned_tests)} files")
         print("Execution mode: strict serial, each case in own process")
+        if quick_test:
+            print(f"Quick test mode: will execute up to {quick_test} cases")
         print(f"{'=' * 80}\n")
 
         total_cases = 0
         case_idx = 0
 
         for file_idx, test_file in enumerate(planned_tests, 1):
+            # Quick test: stop if already have enough cases
+            if quick_test and case_idx >= quick_test:
+                print(f"\nQuick test limit reached ({quick_test} cases), stopping execution")
+                break
+
             test_name = strip_test_prefix_and_suffix(test_file)
 
             log_handle.write(f"\n{'=' * 80}\n")
@@ -1469,6 +1501,11 @@ def run_tests_with_case_isolation(
                     if worst_returncode == 0:
                         worst_returncode = rc
 
+                # Quick test: stop after executing enough cases
+                if quick_test and case_idx >= quick_test:
+                    print(f"    Quick test limit reached ({quick_test} cases), stopping")
+                    break
+
         # Summary
         elapsed = monotonic() - start
 
@@ -1509,6 +1546,7 @@ def run_tests_with_tasks_concurrent(
     shard_type: str,
     max_workers: int,
     result_module,
+    quick_test: int = None,
 ) -> Tuple[int, float, List[Dict]]:
     """
     Execute pre-collected test cases with concurrent per-case isolation.
@@ -1527,6 +1565,7 @@ def run_tests_with_tasks_concurrent(
         shard_type: "distributed" or "regular"
         max_workers: Maximum concurrent subprocesses
         result_module: parse_test_results module
+        quick_test: Maximum number of cases to execute (None = all cases)
 
     Returns:
         Tuple of (worst_returncode, duration, cases_list_sorted)
@@ -1575,6 +1614,11 @@ def run_tests_with_tasks_concurrent(
 
     log_thread.start()
 
+    # Quick test: limit number of cases to execute
+    if quick_test and len(tasks) > quick_test:
+        tasks = tasks[:quick_test]
+        print(f"\nQuick test mode: executing only {quick_test} cases", flush=True)
+
     print(f"\n{'=' * 80}", flush=True)
     print(f"Pre-collected cases: {len(tasks)} cases", flush=True)
     print(f"Execution mode: {max_workers} workers concurrent, each case in subprocess", flush=True)
@@ -1803,6 +1847,7 @@ def parse_args():
         help="Maximum concurrent workers for regular tests (default: 4). Each worker runs one pytest subprocess.",
     )
     parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+    parser.add_argument("--quick-test", type=int, default=None, help="Quick test mode: execute only N cases for fast verification (default: None, run all cases)")
     args = parser.parse_args()
 
     # Validate required arguments based on mode
@@ -1904,6 +1949,7 @@ def main():
                 shard_type,
                 args.max_workers,
                 result_module,
+                args.quick_test,
             )
             info["per_case_isolation"] = True
             info["concurrent_workers"] = args.max_workers
@@ -2051,6 +2097,7 @@ def main():
                 shard_type,
                 effective_workers,
                 result_module,
+                args.quick_test,
             )
             info["execution_mode"] = "serial" if effective_workers == 1 else "concurrent"
             info["concurrent_workers"] = effective_workers
@@ -2193,6 +2240,8 @@ def main():
         if shard_type == "distributed":
             # Distributed tests: serial execution for stability
             print("\nExecution mode: SERIAL (distributed tests require sequential execution)")
+            if args.quick_test:
+                print(f"Quick test mode: will execute up to {args.quick_test} cases")
             returncode, duration, cases_list = run_tests_with_case_isolation(
                 planned_tests,
                 args.shard,
@@ -2203,11 +2252,14 @@ def main():
                 args.verbose,
                 shard_type,
                 result_module,
+                args.quick_test,
             )
             info["execution_mode"] = "serial"
         else:
             # Regular tests: concurrent execution for efficiency
             print(f"\nExecution mode: CONCURRENT ({args.max_workers} workers)")
+            if args.quick_test:
+                print(f"Quick test mode: will execute up to {args.quick_test} cases")
             returncode, duration, cases_list = run_tests_with_concurrent_isolation(
                 planned_tests,
                 args.shard,
@@ -2219,6 +2271,7 @@ def main():
                 shard_type,
                 args.max_workers,
                 result_module,
+                args.quick_test,
             )
             info["execution_mode"] = "concurrent"
             info["concurrent_workers"] = args.max_workers
diff --git a/.github/workflows/_torch-npu-upstream-test-custom.yml b/.github/workflows/_torch-npu-upstream-test-custom.yml
index 27d1621936..8a6df97f8d 100644
--- a/.github/workflows/_torch-npu-upstream-test-custom.yml
+++ b/.github/workflows/_torch-npu-upstream-test-custom.yml
@@ -148,6 +148,16 @@ jobs:
       - name: Package and upload test reports
         if: always()
         run: |
+          # Package junit XMLs into compressed archive
+          if [ -d "test-reports/junit_xmls" ]; then
+            echo "=== Compressing junit XMLs ==="
+            XML_COUNT=$(find test-reports/junit_xmls -type f -name "*.xml" | wc -l)
+            echo "Found ${XML_COUNT} XML files"
+            tar -czf test-reports/junit_xmls.tar.gz -C test-reports junit_xmls
+            rm -rf test-reports/junit_xmls
+            echo "JUnit XMLs compressed"
+          fi
+
           # Package failed cases logs into compressed archive
           if [ -d "test-reports/failed_cases_logs" ]; then
             echo "=== Compressing failed cases logs ==="
diff --git a/.github/workflows/_torch-npu-upstream-test-dist.yml b/.github/workflows/_torch-npu-upstream-test-dist.yml
index 199128000c..74411b354c 100644
--- a/.github/workflows/_torch-npu-upstream-test-dist.yml
+++ b/.github/workflows/_torch-npu-upstream-test-dist.yml
@@ -198,6 +198,16 @@ jobs:
       - name: Package and upload test reports
         if: always()
         run: |
+          # Package junit XMLs into compressed archive
+          if [ -d "test-reports/junit_xmls" ]; then
+            echo "=== Compressing junit XMLs ==="
+            XML_COUNT=$(find test-reports/junit_xmls -type f -name "*.xml" | wc -l)
+            echo "Found ${XML_COUNT} XML files"
+            tar -czf test-reports/junit_xmls.tar.gz -C test-reports junit_xmls
+            rm -rf test-reports/junit_xmls
+            echo "JUnit XMLs compressed: $(ls -lh test-reports/junit_xmls.tar.gz)"
+          fi
+
           # Package failed cases logs into compressed archive
           if [ -d "test-reports/failed_cases_logs" ]; then
             echo "=== Compressing failed cases logs ==="
diff --git a/.github/workflows/_torch-npu-upstream-test-regular.yml b/.github/workflows/_torch-npu-upstream-test-regular.yml
index 7e4dfe4ac6..a88559965a 100644
--- a/.github/workflows/_torch-npu-upstream-test-regular.yml
+++ b/.github/workflows/_torch-npu-upstream-test-regular.yml
@@ -199,6 +199,16 @@ jobs:
       - name: Package and upload test reports
         if: always()
         run: |
+          # Package junit XMLs into compressed archive
+          if [ -d "test-reports/junit_xmls" ]; then
+            echo "=== Compressing junit XMLs ==="
+            XML_COUNT=$(find test-reports/junit_xmls -type f -name "*.xml" | wc -l)
+            echo "Found ${XML_COUNT} XML files"
+            tar -czf test-reports/junit_xmls.tar.gz -C test-reports junit_xmls
+            rm -rf test-reports/junit_xmls
+            echo "JUnit XMLs compressed: $(ls -lh test-reports/junit_xmls.tar.gz)"
+          fi
+
           # Package failed cases logs into compressed archive
           if [ -d "test-reports/failed_cases_logs" ]; then
             echo "=== Compressing failed cases logs ==="

From e7daf2e3b970c2ac68ab41e158c6ce2212aa431f Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 13 May 2026 00:53:56 +0800
Subject: [PATCH 126/132] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=B5=8B=E8=AF=95?=
 =?UTF-8?q?=E4=BE=9D=E8=B5=96=E7=BC=93=E5=AD=98=E5=92=8C=E9=BB=91=E5=90=8D?=
 =?UTF-8?q?=E5=8D=95=E9=85=8D=E7=BD=AE=E4=BC=98=E5=8C=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. 统一所有 workflow 的 pip 缓存配置
   - build workflow: 4 处 PIP_CACHE_DIR 设置
   - collect/regular/dist/custom workflow: 添加 PIP_CACHE_DIR 和 Cache pip action
   - 所有 pip install 步骤使用缓存加速下载

2. 添加 torchvision 安装（忽略版本检查）
   - 使用 --no-deps 绕过 torch 版本绑定问题
   - 解决 onnx/test_models 系列测试的依赖缺失

3. 更新黑名单配置 case_paths_ci.yml
   - 新增 torch_openreg 测试（需单独编译，上游默认排除）
   - 新增 dynamo/test_torchrec（fbgemm-gpu 无 ARM64 支持）
   - 新增 onnx/exporter/test_hf_models_e2e（transformers 依赖）
   - 详细注释说明每个黑名单的原因

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../workflows/_torch-npu-upstream-build.yml   |   5 +
 .../workflows/_torch-npu-upstream-collect.yml |  25 ++-
 .../_torch-npu-upstream-test-custom.yml       |  36 +++-
 .../_torch-npu-upstream-test-dist.yml         |  12 ++
 .../_torch-npu-upstream-test-regular.yml      |  12 ++
 test_upstream/case_paths_ci.yml               | 161 ++++++++++++++++++
 6 files changed, 247 insertions(+), 4 deletions(-)
 create mode 100644 test_upstream/case_paths_ci.yml

diff --git a/.github/workflows/_torch-npu-upstream-build.yml b/.github/workflows/_torch-npu-upstream-build.yml
index efd28f93be..b63ddc8f3a 100644
--- a/.github/workflows/_torch-npu-upstream-build.yml
+++ b/.github/workflows/_torch-npu-upstream-build.yml
@@ -152,6 +152,7 @@ jobs:
 
       - name: Upgrade pip and setuptools
         run: |
+          export PIP_CACHE_DIR=/root/.cache/pip
           # 先升级 pip 和 setuptools，避免旧版包兼容性问题
           pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
 
@@ -239,6 +240,7 @@ jobs:
         run: |
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
 
+          export PIP_CACHE_DIR=/root/.cache/pip
           cd pytorch-src
 
           # 安装构建依赖（pip 缓存已恢复，加速下载）
@@ -277,6 +279,8 @@ jobs:
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
           source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
 
+          export PIP_CACHE_DIR=/root/.cache/pip
+
           echo "=== Installing built PyTorch wheel ==="
           pip${{ inputs.python_version }} install pytorch-src/dist/*.whl
 
@@ -297,6 +301,7 @@ jobs:
           source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
           source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
 
+          export PIP_CACHE_DIR=/root/.cache/pip
           pip${{ inputs.python_version }} install --upgrade pip setuptools wheel
           pip${{ inputs.python_version }} install cmake ninja numpy packaging pyyaml requests six typing-extensions
 
diff --git a/.github/workflows/_torch-npu-upstream-collect.yml b/.github/workflows/_torch-npu-upstream-collect.yml
index 1cb99730ac..83e51ae70c 100644
--- a/.github/workflows/_torch-npu-upstream-collect.yml
+++ b/.github/workflows/_torch-npu-upstream-collect.yml
@@ -57,7 +57,7 @@ env:
 jobs:
   collect:
     runs-on: linux-aarch64-a3-16
-    timeout-minutes: 60
+    timeout-minutes: 120
     container:
       image: ${{ inputs.docker_image }}
       options: --user root
@@ -146,12 +146,29 @@ jobs:
       - name: Install test dependencies
         run: |
           PIP=pip${{ inputs.python_version }}
+          export PIP_CACHE_DIR=/github/home/.cache/pip
           cd pytorch-src
 
+          # Core test dependencies
           $PIP install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
           $PIP install pytest-rerunfailures pytest-flakefinder
           $PIP install 'pytest-subtests==0.13.1' 'xdoctest==1.1.0' 'pulp>=2.9'
 
+          # Optional dependencies for ONNX tests
+          # These are not in PyTorch requirements.txt but needed by specific tests
+          $PIP install onnxruntime onnxscript onnx-ir ml-dtypes || true
+
+          # torchvision for ONNX model tests (install without deps to bypass torch version check)
+          # PyPI torchvision requires exact torch version (torch==2.11.0), but we have dev build
+          # Use --no-deps to skip torch dependency, we already have our compiled torch installed
+          $PIP install numpy pillow || true
+          $PIP install torchvision --no-deps || true
+
+          # Other optional dependencies
+          $PIP install parameterized pandas || true
+          $PIP install opencv-python || true
+
+          # PyTorch requirements (if exists)
           if [ -f requirements.txt ]; then
             $PIP install -r requirements.txt || true
           fi
@@ -180,6 +197,11 @@ jobs:
           PYTHON=python${{ inputs.python_version }}
           cd pytorch-src
 
+          # 设置 BACKEND 环境变量，避免分布式测试收集阶段 KeyError
+          # 值设为 hccl（NPU 分布式后端），不是 gloo/nccl 时测试类不会被定义
+          # 结果：pytest 收集到 0 个用例，不会报错
+          export BACKEND="hccl"
+
           # Case-level sharding
           DISTRIBUTED_SHARDS='${{ inputs.distributed_shards }}'
           REGULAR_SHARDS='${{ inputs.regular_shards }}'
@@ -190,6 +212,7 @@ jobs:
 
           $PYTHON ../ascend_pytorch/.github/scripts/collect_all_cases.py \
             --test-dir test \
+            --case-paths-config ../ascend_pytorch/test_upstream/case_paths_ci.yml \
             --distributed-shards ${DISTRIBUTED_SHARDS} \
             --regular-shards ${REGULAR_SHARDS} \
             --output-dir cases_shards \
diff --git a/.github/workflows/_torch-npu-upstream-test-custom.yml b/.github/workflows/_torch-npu-upstream-test-custom.yml
index 8a6df97f8d..48fb6c9fe8 100644
--- a/.github/workflows/_torch-npu-upstream-test-custom.yml
+++ b/.github/workflows/_torch-npu-upstream-test-custom.yml
@@ -53,6 +53,15 @@ jobs:
       - name: Setup cache directories
         run: |
           mkdir -p /github/home/.cache/pip
+          chmod -R 777 /github/home/.cache
+
+      - name: Cache pip
+        uses: actions/cache@v4
+        with:
+          path: /github/home/.cache/pip
+          key: pip-arm-custom-py${{ inputs.python_version }}
+          restore-keys: |
+            pip-arm-custom-py${{ inputs.python_version }}-
 
       - name: Download built torch wheel
         uses: actions/download-artifact@v4
@@ -105,9 +114,30 @@ jobs:
 
       - name: Install test dependencies
         run: |
-          pip${{ inputs.python_version }} install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
-          pip${{ inputs.python_version }} install pytest-rerunfailures pytest-flakefinder pytest-subtests==0.13.1 xdoctest==1.1.0 pulp>=2.9
-          pip${{ inputs.python_version }} install -r pytorch-src/requirements.txt || true
+          PIP=pip${{ inputs.python_version }}
+          export PIP_CACHE_DIR=/github/home/.cache/pip
+
+          # Configure pip to use PyPI cache for faster downloads
+          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
+            $PIP config set global.index-url ${{ env.PYPI_CACHE_URL }} || true
+            $PIP config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local" || true
+          fi
+
+          $PIP install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
+          $PIP install pytest-rerunfailures pytest-flakefinder pytest-subtests==0.13.1 xdoctest==1.1.0 pulp>=2.9
+
+          # Optional dependencies for ONNX tests
+          $PIP install onnxruntime onnxscript onnx-ir ml-dtypes || true
+
+          # torchvision for ONNX model tests (install without deps to bypass torch version check)
+          $PIP install numpy pillow || true
+          $PIP install torchvision --no-deps || true
+
+          # Other optional dependencies
+          $PIP install parameterized pandas || true
+          $PIP install opencv-python || true
+
+          $PIP install -r pytorch-src/requirements.txt || true
 
       - name: Verify NPU availability
         run: |
diff --git a/.github/workflows/_torch-npu-upstream-test-dist.yml b/.github/workflows/_torch-npu-upstream-test-dist.yml
index 74411b354c..433a3f599b 100644
--- a/.github/workflows/_torch-npu-upstream-test-dist.yml
+++ b/.github/workflows/_torch-npu-upstream-test-dist.yml
@@ -127,12 +127,24 @@ jobs:
       - name: Install test dependencies
         run: |
           PIP=pip${{ inputs.python_version }}
+          export PIP_CACHE_DIR=/github/home/.cache/pip
           cd pytorch-src
 
           $PIP install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
           $PIP install pytest-rerunfailures pytest-flakefinder
           $PIP install 'pytest-subtests==0.13.1' 'xdoctest==1.1.0' 'pulp>=2.9'
 
+          # Optional dependencies for ONNX tests
+          $PIP install onnxruntime onnxscript onnx-ir ml-dtypes || true
+
+          # torchvision for ONNX model tests (install without deps to bypass torch version check)
+          $PIP install numpy pillow || true
+          $PIP install torchvision --no-deps || true
+
+          # Other optional dependencies
+          $PIP install parameterized pandas || true
+          $PIP install opencv-python || true
+
           if [ -f requirements.txt ]; then
             $PIP install -r requirements.txt || true
           fi
diff --git a/.github/workflows/_torch-npu-upstream-test-regular.yml b/.github/workflows/_torch-npu-upstream-test-regular.yml
index a88559965a..11b96bce87 100644
--- a/.github/workflows/_torch-npu-upstream-test-regular.yml
+++ b/.github/workflows/_torch-npu-upstream-test-regular.yml
@@ -127,12 +127,24 @@ jobs:
       - name: Install test dependencies
         run: |
           PIP=pip${{ inputs.python_version }}
+          export PIP_CACHE_DIR=/github/home/.cache/pip
           cd pytorch-src
 
           $PIP install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
           $PIP install pytest-rerunfailures pytest-flakefinder
           $PIP install 'pytest-subtests==0.13.1' 'xdoctest==1.1.0' 'pulp>=2.9'
 
+          # Optional dependencies for ONNX tests
+          $PIP install onnxruntime onnxscript onnx-ir ml-dtypes || true
+
+          # torchvision for ONNX model tests (install without deps to bypass torch version check)
+          $PIP install numpy pillow || true
+          $PIP install torchvision --no-deps || true
+
+          # Other optional dependencies
+          $PIP install parameterized pandas || true
+          $PIP install opencv-python || true
+
           if [ -f requirements.txt ]; then
             $PIP install -r requirements.txt || true
           fi
diff --git a/test_upstream/case_paths_ci.yml b/test_upstream/case_paths_ci.yml
new file mode 100644
index 0000000000..f7e50e31d6
--- /dev/null
+++ b/test_upstream/case_paths_ci.yml
@@ -0,0 +1,161 @@
+# Test file blacklist configuration for NPU CI
+#
+# This file defines test files/directories to exclude from collection.
+# Rules support:
+#   - Exact path match: "distributed/test_c10d_nccl"
+#   - Directory prefix match: "dynamo/cpython/3_13" (matches all files in that directory)
+#   - Glob patterns: "distributed/*nccl*"
+#
+# Rule paths are relative to the test directory (e.g., "test/distributed/...")
+# The "test/" prefix is automatically added if missing.
+
+blacklist:
+  # ==============================================================================
+  # Python version specific tests (Python 3.13 syntax, incompatible with 3.11)
+  # ==============================================================================
+  - dynamo/cpython/3_13
+
+  # ==============================================================================
+  # Platform-specific tests (CUDA/XPU/MPS - not supported on NPU)
+  # ==============================================================================
+  # CUDA specific tests
+  - test_cuda_multigpu
+  - test_cuda_nvml_based_avail
+  - test_cuda_primary_ctx
+  - test_cuda_sanitizer
+  - test_cuda_trace
+  - test_jiterator
+  - test_varlen_attention
+
+  # CUDA inductor tests
+  - inductor/test_cuda_repro
+  - inductor/test_cudagraph_trees
+  - inductor/test_cudagraph_trees_expandable_segments
+  - inductor/test_gpu_select_algorithm
+  - inductor/test_triton_cpu_backend
+  - inductor/test_triton_heuristics
+  - inductor/test_pallas
+  - inductor/test_layout_optim
+  - inductor/test_op_dtype_prop
+  - inductor/test_autoheuristic
+  - inductor/test_b2b_gemm
+  - inductor/test_best_config
+  - inductor/test_coordinate_descent_tuner
+
+  # JIT CUDA tests
+  - jit/test_cuda
+
+  # Flash Attention tests (CUDA only)
+  - nn/attention/test_fa3
+  - nn/attention/test_fa4
+
+  # XPU (Intel) tests
+  - test_xpu
+  - test_xpu_expandable_segments
+  - xpu/test_conv
+  - xpu/test_fusion
+  - xpu/test_gemm
+
+  # MPS (Apple Metal) tests
+  - test_mps
+
+  # ==============================================================================
+  # ONNX tests with heavy/optional dependencies
+  # ==============================================================================
+  # HuggingFace transformers tests
+  # Issue: Test requires transformers package (HuggingFace) with no graceful import handling
+  # Problem: Test file directly imports `import transformers` without try-import or skipif
+  # - transformers package: ~10 MB + dependencies (tokenizers, safetensors, huggingface-hub)
+  # - Test downloads model: hf-internal-testing/tiny-random-gptj (small, but needs network)
+  # - PyTorch CI excludes all ONNX tests by default (only runs with --onnx flag)
+  # Impact: ModuleNotFoundError: No module named 'transformers'
+  # Note: PyTorch run_test.py: options.exclude.extend(onnx_tests) in default behavior
+  - onnx/exporter/test_hf_models_e2e
+
+  # ==============================================================================
+  # Tests requiring specific compiled libraries not available in NPU build
+  # ==============================================================================
+  - custom_operator/test_custom_ops  # requires libcustom_ops.so
+
+  # ==============================================================================
+  # Custom device extension tests (require separate compilation)
+  # ==============================================================================
+  # torch_openreg tests
+  # Issue: torch_openreg is a separate extension that must be compiled AFTER PyTorch is installed
+  # Problem: PyTorch run_test.py calls install_cpp_extensions() before running these tests
+  # - Requires CMake build linking against installed PyTorch
+  # - setup.py shows: "-DPYTORCH_INSTALL_DIR=" + get_pytorch_dir()
+  # - By default, PyTorch CI EXCLUDES test_openreg (only runs with --openreg flag)
+  # Impact: ModuleNotFoundError: No module named 'torch_openreg'
+  # See: run_test.py - options.exclude.append("test_openreg") in default behavior
+  - cpp_extensions/open_registration_extension/torch_openreg/tests/test_memory
+
+  # ==============================================================================
+  # Architecture-specific tests (x86_64 only, no ARM64/aarch64 support)
+  # ==============================================================================
+  # TorchRec tests
+  # Issue: torchrec depends on fbgemm-gpu, which only has x86_64 wheels on PyPI
+  # Problem: fbgemm-gpu provides pre-built wheels for x86_64 only:
+  #   - fbgemm_gpu-1.6.0-cp311-cp311-manylinux_2_28_x86_64.whl
+  #   - No aarch64/ARM64 wheels available
+  # Impact: Cannot install fbgemm-gpu on ARM64 NPU runner (linux-aarch64-a3-16)
+  # Test uses graceful handling (NoTest fallback), but we blacklist for clarity
+  - dynamo/test_torchrec
+
+  # ==============================================================================
+  # PyTorch upstream design issues (missing __init__.py or internal API changes)
+  # ==============================================================================
+  # Quantization experimental tests
+  # Issue: torch/ao/quantization/experimental/ directory exists but has NO __init__.py
+  # Python cannot recognize it as a package, causing ModuleNotFoundError on import
+  # See: https://github.com/pytorch/pytorch/tree/main/torch/ao/quantization/experimental
+  # Files exist: adaround_optimization.py, linear.py, observer.py, etc.
+  # Missing: __init__.py (required for package import)
+  # Impact: 6 test files fail with "No module named 'torch.ao.quantization.experimental'"
+  - quantization/core/experimental/test_adaround_eager
+  - quantization/core/experimental/test_fake_quantize
+  - quantization/core/experimental/test_linear
+  - quantization/core/experimental/test_nonuniform_observer
+  - quantization/core/experimental/test_quantized_tensor
+  - quantization/core/experimental/test_quantizer
+
+  # numpy internal private module dependency
+  # Issue: Test imports private function from numpy's internal module structure
+  # Code: `from numpy.linalg.linalg import _multi_dot_matrix_chain_order`
+  # Problem: numpy 2.0+ restructured internal modules, `numpy.linalg.linalg` no longer exists
+  # - Private APIs (_ prefix) have no stability guarantee, can be moved/removed anytime
+  # - numpy 2.0 moved _multi_dot_matrix_chain_order to numpy.linalg._linalg or removed it
+  # - PyTorch test code uses private API instead of stable public API
+  # Impact: ModuleNotFoundError: No module named 'numpy.linalg.linalg'
+  - torch_np/numpy_tests/linalg/test_linalg
+
+  # PyTorch test directory structure issue (test/ has no __init__.py)
+  # Issue: Test imports using package path `from test.jit.fixtures_srcs.generate_models import ...`
+  # Problem: PyTorch test/ directory is NOT a Python package (missing __init__.py files)
+  # - test/__init__.py does not exist
+  # - test/jit/__init__.py does not exist
+  # - Without __init__.py, Python cannot recognize `test.jit` as a valid package path
+  # - PyTorch CI works via special PYTHONPATH setup that our collection script doesn't replicate
+  # Impact: ModuleNotFoundError: No module named 'test.jit'
+  - jit/fixtures_srcs/test_upgrader_models_generation
+
+  # ==============================================================================
+  # Tests with torch_npu compatibility issues
+  # ==============================================================================
+  # RPC tests fail due to torch_npu serialization.py 'object' type annotation
+  # (TorchScript incompatible) - these are handled by BACKEND=hccl skip
+  # Uncomment if needed:
+  # - distributed/rpc/test_tensorpipe_agent
+  # - distributed/rpc/test_faulty_agent
+  # - distributed/rpc/cuda/test_tensorpipe_agent
+
+  # ==============================================================================
+  # Tests with environment variable requirements (handled by BACKEND=hccl)
+  # ==============================================================================
+  # These tests require BACKEND=gloo/nccl environment variable
+  # Setting BACKEND=hccl causes them to define no test classes
+  # - distributed/algorithms/quantization/test_quantization
+  # - distributed/test_distributed_spawn
+
+# Whitelist is empty - collect all tests except blacklist
+whitelist: []
\ No newline at end of file

From a64c107f4145fc345f0a3eef7fde2754266551b2 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 13 May 2026 09:50:13 +0800
Subject: [PATCH 127/132] =?UTF-8?q?=E5=88=A0=E9=99=A4case-paths-config?=
 =?UTF-8?q?=E5=8F=82=E6=95=B0=E5=8F=8A=E7=9B=B8=E5=85=B3=E5=86=97=E4=BD=99?=
 =?UTF-8?q?=E6=AD=A5=E9=AA=A4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- run_npu_test_shard.py: 移除--case-paths-config参数，discover调用时传None
- _torch-npu-upstream-collect.yml: 移除collect_all_cases.py的--case-paths-config传参
- _torch-npu-upstream-test-dist.yml: 删除冗余的error logs上传步骤
- _torch-npu-upstream-test-regular.yml: 删除冗余的error logs上传步骤

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/scripts/run_npu_test_shard.py           | 14 ++------------
 .../workflows/_torch-npu-upstream-collect.yml   |  1 -
 .../workflows/_torch-npu-upstream-test-dist.yml | 17 -----------------
 .../_torch-npu-upstream-test-regular.yml        | 17 -----------------
 4 files changed, 2 insertions(+), 47 deletions(-)

diff --git a/.github/scripts/run_npu_test_shard.py b/.github/scripts/run_npu_test_shard.py
index 365493fc48..4f1f3c9b70 100644
--- a/.github/scripts/run_npu_test_shard.py
+++ b/.github/scripts/run_npu_test_shard.py
@@ -29,7 +29,6 @@
         --num-shards 50 \
         --test-type distributed \
         --test-dir /path/to/pytorch/test \
-        --case-paths-config /path/to/case_paths_ci.yml \
         --disabled-testcases /path/to/disabled_testcases.json \
         --report-dir test-reports \
         --timeout 1200 \
@@ -431,14 +430,13 @@ def load_discover_module(script_dir: Path):
 def run_discovery(
     test_dir: Path,
     test_type: str,
-    case_paths_config: Optional[str],
     discover_module,
 ) -> DiscoveryResult:
     """Run test discovery and return DiscoveryResult."""
     test_files, metadata = discover_module.discover_test_files(
         test_dir=test_dir,
         test_type=test_type,
-        case_paths_config=case_paths_config,
+        case_paths_config=None,
     )
 
     return DiscoveryResult(
@@ -505,11 +503,10 @@ def plan_shard_tests(
     shard: int,
     num_shards: int,
     test_type: str,
-    case_paths_config: Optional[str],
     discover_module,
 ) -> ShardPlanResult:
     """Complete test planning: discovery + shard assignment."""
-    discovery_result = run_discovery(test_dir, test_type, case_paths_config, discover_module)
+    discovery_result = run_discovery(test_dir, test_type, discover_module)
     shard_assignment_result = assign_shard(discovery_result, shard, num_shards)
 
     return ShardPlanResult(
@@ -1837,7 +1834,6 @@ def parse_args():
     )
     parser.add_argument("--test-dir", type=str, required=True, help="Path to PyTorch test directory")
     parser.add_argument("--disabled-testcases", type=str, help="Path to disabled_testcases.json")
-    parser.add_argument("--case-paths-config", type=str, help="Path to case_paths_ci.yml")
     parser.add_argument("--report-dir", type=str, default="test-reports", help="Directory for reports")
     parser.add_argument("--timeout", type=int, default=1200, help="Per-case timeout in seconds (default: 1200 = 20 minutes)")
     parser.add_argument(
@@ -2177,7 +2173,6 @@ def main():
         shard=args.shard,
         num_shards=args.num_shards,
         test_type=shard_type,
-        case_paths_config=args.case_paths_config,
         discover_module=discover_module,
     )
     planned_tests = plan_result.get_planned_tests()
@@ -2193,9 +2188,6 @@ def main():
     info["excluded_test_files"] = plan_result.discovery.metadata.get("rules_excluded", 0)
     info["shard_files"] = plan_result.shard_assignment.planned_count
 
-    if args.case_paths_config:
-        info["path_rules_file"] = args.case_paths_config
-
     # Save test plan
     result_module.save_test_plan_file(str(report_dir), args.shard, planned_tests, shard_type)
 
@@ -2212,8 +2204,6 @@ def main():
         print("Execution mode: SERIAL (per-case subprocess isolation)")
     else:
         print(f"Execution mode: CONCURRENT ({args.max_workers} workers, per-case subprocess isolation)")
-    if args.case_paths_config:
-        print(f"Case path rules: {args.case_paths_config}")
     print(f"Disabled testcase entries: {info['disabled_count']}")
     print(f"\n{'=' * 80}\n")
 
diff --git a/.github/workflows/_torch-npu-upstream-collect.yml b/.github/workflows/_torch-npu-upstream-collect.yml
index 83e51ae70c..5cd68f2627 100644
--- a/.github/workflows/_torch-npu-upstream-collect.yml
+++ b/.github/workflows/_torch-npu-upstream-collect.yml
@@ -212,7 +212,6 @@ jobs:
 
           $PYTHON ../ascend_pytorch/.github/scripts/collect_all_cases.py \
             --test-dir test \
-            --case-paths-config ../ascend_pytorch/test_upstream/case_paths_ci.yml \
             --distributed-shards ${DISTRIBUTED_SHARDS} \
             --regular-shards ${REGULAR_SHARDS} \
             --output-dir cases_shards \
diff --git a/.github/workflows/_torch-npu-upstream-test-dist.yml b/.github/workflows/_torch-npu-upstream-test-dist.yml
index 433a3f599b..e37eadb880 100644
--- a/.github/workflows/_torch-npu-upstream-test-dist.yml
+++ b/.github/workflows/_torch-npu-upstream-test-dist.yml
@@ -239,21 +239,4 @@ jobs:
         with:
           name: test-reports-dist-${{ matrix.shard }}
           path: test-reports/
-          retention-days: 30
-
-      - name: Compress and upload error logs
-        if: failure()
-        run: |
-          # Only upload logs when tests failed
-          mkdir -p error-logs
-          cp /tmp/test_shard_dist_${{ matrix.shard }}.log error-logs/ 2>/dev/null || true
-          tar -czf error-logs-dist-${{ matrix.shard }}.tar.gz error-logs/
-          echo "Error logs compressed: $(ls -lh error-logs-dist-${{ matrix.shard }}.tar.gz)"
-
-      - name: Upload error logs
-        if: failure()
-        uses: actions/upload-artifact@v4
-        with:
-          name: error-logs-dist-${{ matrix.shard }}
-          path: error-logs-dist-${{ matrix.shard }}.tar.gz
           retention-days: 30
\ No newline at end of file
diff --git a/.github/workflows/_torch-npu-upstream-test-regular.yml b/.github/workflows/_torch-npu-upstream-test-regular.yml
index 11b96bce87..75064a1126 100644
--- a/.github/workflows/_torch-npu-upstream-test-regular.yml
+++ b/.github/workflows/_torch-npu-upstream-test-regular.yml
@@ -242,21 +242,4 @@ jobs:
         with:
           name: test-reports-reg-${{ matrix.shard }}
           path: test-reports/
-          retention-days: 30
-
-      - name: Compress and upload error logs
-        if: failure()
-        run: |
-          # Only upload logs when tests failed
-          mkdir -p error-logs
-          cp /tmp/test_shard_reg_${{ matrix.shard }}.log error-logs/ 2>/dev/null || true
-          tar -czf error-logs-reg-${{ matrix.shard }}.tar.gz error-logs/
-          echo "Error logs compressed: $(ls -lh error-logs-reg-${{ matrix.shard }}.tar.gz)"
-
-      - name: Upload error logs
-        if: failure()
-        uses: actions/upload-artifact@v4
-        with:
-          name: error-logs-reg-${{ matrix.shard }}
-          path: error-logs-reg-${{ matrix.shard }}.tar.gz
           retention-days: 30
\ No newline at end of file

From f4f3e378e6f13c45e64db126efd53c52155d430f Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Wed, 13 May 2026 12:08:17 +0800
Subject: [PATCH 128/132] s

---
 CLAUDE.md | 857 ------------------------------------------------------
 1 file changed, 857 deletions(-)
 delete mode 100644 CLAUDE.md

diff --git a/CLAUDE.md b/CLAUDE.md
deleted file mode 100644
index 12c69a7edc..0000000000
--- a/CLAUDE.md
+++ /dev/null
@@ -1,857 +0,0 @@
-# Torch-NPU 编译和上游 PyTorch 测试用例拉起流程设计
-
-## Context
-
-设计一个完整的 CI/CD 流程，用于：
-1. 拉取上游 PyTorch main 分支代码并编译
-2. 安装上游 PyTorch wheel 包
-3. 编译 torch-npu
-4. 扫描 PyTorch test 目录下的所有测试用例
-5. 每个用例独立进程执行全量测试
-6. 记录测试结果并上传日志制品
-
-参考 PR #117 (https://github.com/Ascend/pytorch/pull/117) 的实现方案，简化 patch 和白名单逻辑。
-
-**关键改进**：采用上游 PyTorch 社区的最佳实践 —— **构建和测试使用同一个 Docker 镜像 URL**，通过 Job 依赖传递机制保证环境绝对一致。
-
----
-
-## 方案确认
-
-| 项目 | 选择 |
-|------|------|
-| 基础镜像 | `pytorch/manylinux-builder:aarch64` |
-| 镜像托管 | GitHub Container Registry (ghcr.io) |
-| CANN安装 | 单独镜像构建流程，定期构建推送 |
-| Runner规格 | 统一使用 `linux-aarch64-a3-16` |
-| 镜像标签 | 带时间戳版本号，如 `manylinux-cann9.0-20260428` |
-| Dockerfile位置 | 项目仓库 `.github/docker/` 目录 |
-
----
-
-## 整体架构
-
-```
-┌─────────────────────────────────────────────────────────────────────────────┐
-│                           GitHub Actions Workflow                            │
-├─────────────────────────────────────────────────────────────────────────────┤
-│                                                                              │
-│  ┌──────────────────────────────────────────────────────────────────────┐   │
-│  │           阶段1: 镜像构建 (单独 workflow，定期运行)                    │   │
-│  │  Workflow: build-docker-image.yml                                    │   │
-│  │  Runner: linux-aarch64-a3-16                                         │   │
-│  │  基础镜像: pytorch/manylinux-builder:aarch64                         │   │
-│  │  安装 CANN 9.0.0-beta.2                                               │   │
-│  │  推送: ghcr.io/${{ github.repository_owner }}/pytorch-npu-builder    │   │
-│  │  标签: manylinux-cann9.0-${{ timestamp }}                            │   │
-│  └──────────────────────────────────────────────────────────────────────┘   │
-│                              │                                              │
-│                              │ 镜像已预构建                                  │
-│                              ▼                                              │
-│  ┌──────────────────────────────────────────────────────────────────────┐   │
-│  │           阶段2: 统一构建 (PyTorch + torch_npu)                       │   │
-│  │  Workflow: _build.yml                                                │   │
-│  │  Runner: linux-aarch64-a3-16                                         │   │
-│  │  Container: ghcr.io/xxx/pytorch-npu-builder:manylinux-cann9.0-xxx    │   │
-│  │  - Clone 上游 PyTorch main                                           │   │
-│  │  - 编译 PyTorch wheel                                                │   │
-│  │  - Checkout torch_npu                                                │   │
-│  │  - 编译 torch_npu wheel                                              │   │
-│  │  - 打包测试源码                                                      │   │
-│  │  Outputs: docker-image, torch-wheel, torch-npu-wheel, test-src       │   │
-│  └──────────────────────────────────────────────────────────────────────┘   │
-│                              │                                              │
-│                              │ docker-image URL 传递                        │
-│                              ▼                                              │
-│  ┌──────────────────────────────────────────────────────────────────────┐   │
-│  │           阶段3: 用例收集                                             │   │
-│  │  Job: collect_cases                                                  │   │
-│  │  Runner: linux-aarch64-a3-16                                         │   │
-│  │  Container: SAME Docker 镜像                                         │   │
-│  │  - 安装 torch + torch_npu                                            │   │
-│  │  - pytest --collect-only 收集所有用例                                │   │
-│  │  - 按用例分片 (distributed/regular)                                  │   │
-│  │  Outputs: distributed_matrix, regular_matrix, total_cases            │   │
-│  └──────────────────────────────────────────────────────────────────────┘   │
-│                              │                                              │
-│              ┌───────────────┴───────────────┐                              │
-│              │ docker-image URL 传递         │                              │
-│              ▼                               ▼                              │
-│  ┌─────────────────────────┐   ┌─────────────────────────────────┐         │
-│  │  阶段4: Distributed测试 │   │      阶段4: Regular测试          │         │
-│  │  Runner: a3-16          │   │      Runner: a3-16               │         │
-│  │  Container: SAME镜像    │   │      Container: SAME镜像         │         │
-│  │  串行执行               │   │      并发执行 (32 workers)       │         │
-│  │  2 shards               │   │      5 shards                    │         │
-│  └─────────────────────────┘   └─────────────────────────────────┘         │
-│              │                               │                              │
-│              └───────────────┬───────────────┘                              │
-│                              ▼                                              │
-│  ┌──────────────────────────────────────────────────────────────────────┐   │
-│  │           阶段5: 报告生成                                             │   │
-│  │  - 汇总所有 shard 结果                                               │   │
-│  │  - 生成 Markdown + JSON 报告                                        │   │
-│  │  - 压缩上传制品                                                      │   │
-│  └──────────────────────────────────────────────────────────────────────┘   │
-│                                                                              │
-└─────────────────────────────────────────────────────────────────────────────┘
-```
-
----
-
-## 核心设计：Docker 镜像传递机制
-
-### 上游 PyTorch 社区的做法
-
-在 PyTorch 的 CI 中，Build Job 和 Test Job 使用**同一个 Docker 镜像**：
-
-```yaml
-# Build Job 计算并输出完整的镜像 URL
-linux-build:
-  outputs:
-    docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
-
-# Test Job 通过 needs 获取同一个镜像
-linux-test:
-  needs: linux-build
-  container:
-    image: ${{ needs.linux-build.outputs.docker-image }}
-```
-
-这种做法的优势：
-1. **环境绝对一致** - 构建和测试使用同一镜像
-2. **避免重复安装** - 镜像中预装依赖
-3. **减少 artifact 传递** - 直接挂载构建产物
-4. **可追溯性** - 镜像 URL 带 SHA256 哈希，确保版本锁定
-
----
-
-## 文件结构
-
-```
-.github/
-├── docker/
-│   └── pytorch-npu-builder.Dockerfile   # 基于 manylinux + CANN 的构建镜像
-│
-├── workflows/
-│   ├── build-docker-image.yml           # 镜像构建 workflow (定期运行)
-│   ├── npu-full-test.yml                # 主 workflow
-│   ├── _build.yml                       # 统一构建 workflow (可复用)
-│   └── _test.yml                        # 测试执行 workflow (可复用)
-│
-├── scripts/
-│   ├── collect_all_cases.py             # 收集所有测试用例
-│   ├── run_npu_test_shard.py            # 执行测试分片
-│   ├── parse_test_results.py            # 解析测试结果
-│   └── generate_report.py               # 生成汇总报告
-```
-
----
-
-## 阶段1: Docker 镜像构建
-
-### Dockerfile: pytorch-npu-builder.Dockerfile
-
-```dockerfile
-# 基于 PyTorch manylinux builder 镜像
-FROM ghcr.io/pytorch/manylinux-builder:aarch64
-
-# 设置工作目录
-WORKDIR /root
-
-# 安装 CANN 9.0.0-beta.2
-RUN mkdir -p cann && cd cann && \
-    curl -O https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-toolkit_9.0.0-beta.2_linux-aarch64.run && \
-    curl -O https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-A3-ops_9.0.0-beta.2_linux-aarch64.run && \
-    curl -O https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package/20260330/Ascend-cann-nnal_9.0.0-beta.2_linux-aarch64.run && \
-    chmod +x Ascend-cann*.run && \
-    ./Ascend-cann-toolkit*.run --full --quiet --install-path=/usr/local/Ascend && \
-    ./Ascend-cann-A3*.run --install --quiet --install-path=/usr/local/Ascend && \
-    ./Ascend-cann-nnal*.run --install --quiet --install-path=/usr/local/Ascend && \
-    rm -rf cann
-
-# 设置环境变量
-ENV CANN_PATH=/usr/local/Ascend/cann
-ENV NNAL_PATH=/usr/local/Ascend/nnal
-ENV ASCEND_HOME=/usr/local/Ascend
-
-# 添加 CANN 环境初始化脚本
-RUN echo '#!/bin/bash\n\
-source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true\n\
-source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true' > /etc/profile.d/cann_env.sh && \
-    chmod +x /etc/profile.d/cann_env.sh
-
-# 预安装 pytest 等测试依赖
-RUN pip3.11 install pytest pytest-timeout pytest-xdist hypothesis pyyaml zstandard
-```
-
-### Workflow: build-docker-image.yml
-
-```yaml
-name: Build Docker Image
-
-on:
-  schedule:
-    - cron: '0 2 * * 0'  # UTC 02:00, Beijing 10:00, every Sunday
-  workflow_dispatch:
-    inputs:
-      cann_version:
-        description: 'CANN version'
-        default: '9.0.0-beta.2'
-      force_build:
-        description: 'Force rebuild even if image exists'
-        default: false
-        type: boolean
-
-env:
-  REGISTRY: ghcr.io
-  IMAGE_NAME: pytorch-npu-builder
-
-jobs:
-  build:
-    runs-on: linux-aarch64-a3-16
-    permissions:
-      contents: read
-      packages: write
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Log in to Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ${{ env.REGISTRY }}
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Generate timestamp tag
-        id: tag
-        run: |
-          TIMESTAMP=$(date +%Y%m%d)
-          echo "tag=manylinux-cann${{ inputs.cann_version || '9.0.0-beta.2' }}-${TIMESTAMP}" >> $GITHUB_OUTPUT
-
-      - name: Build and push Docker image
-        uses: docker/build-push-action@v5
-        with:
-          context: .github/docker
-          file: .github/docker/pytorch-npu-builder.Dockerfile
-          push: true
-          tags: |
-            ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}
-            ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:latest
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-
-      - name: Output image URL
-        run: |
-          echo "Built image: ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.tag }}"
-```
-
----
-
-## 阶段2: 统一构建 (_build.yml)
-
-```yaml
-name: Build PyTorch and torch_npu
-
-on:
-  workflow_call:
-    inputs:
-      pytorch_branch:
-        required: true
-        type: string
-        default: 'main'
-      python_version:
-        required: true
-        type: string
-        default: '3.11'
-      docker_image_tag:
-        required: true
-        type: string
-        description: 'Docker image tag with timestamp'
-    outputs:
-      docker-image:
-        description: 'Full Docker image URL'
-        value: ${{ jobs.build.outputs.docker-image }}
-      torch-wheel:
-        description: 'PyTorch wheel artifact name'
-        value: 'torch-wheel-main'
-      torch-npu-wheel:
-        description: 'torch_npu wheel artifact name'
-        value: 'torch-npu-wheel-main'
-      test-src:
-        description: 'Test source artifact name'
-        value: 'test-src-main'
-
-env:
-  REGISTRY: ghcr.io
-
-jobs:
-  build:
-    runs-on: linux-aarch64-a3-16
-    outputs:
-      docker-image: ${{ steps.set_image.outputs.docker-image }}
-    
-    container:
-      image: ${{ env.REGISTRY }}/${{ github.repository_owner }}/pytorch-npu-builder:${{ inputs.docker_image_tag }}
-      options: --user root
-    
-    steps:
-      - name: Set Docker image URL
-        id: set_image
-        run: |
-          DOCKER_IMAGE="${{ env.REGISTRY }}/${{ github.repository_owner }}/pytorch-npu-builder:${{ inputs.docker_image_tag }}"
-          echo "docker-image=${DOCKER_IMAGE}" >> $GITHUB_OUTPUT
-          echo "Using Docker image: ${DOCKER_IMAGE}"
-
-      - name: Setup CANN environment
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-      - name: Clone upstream PyTorch main
-        run: |
-          git clone --depth=1 --branch ${{ inputs.pytorch_branch }} \
-            https://github.com/pytorch/pytorch.git pytorch-src
-          PYTORCH_SHA=$(cd pytorch-src && git rev-parse HEAD)
-          echo "pytorch_sha=${PYTORCH_SHA}"
-
-      - name: Checkout torch_npu
-        uses: actions/checkout@v4
-        with:
-          path: torch_npu-src
-          submodules: recursive
-
-      - name: Build PyTorch wheel
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          
-          cd pytorch-src
-          pip${{ inputs.python_version }} install pyyaml setuptools wheel cmake ninja
-          
-          export MAX_JOBS=40
-          export USE_CUDA=0
-          export USE_CUDNN=0
-          export CMAKE_BUILD_TYPE=Release
-          
-          python${{ inputs.python_version }} setup.py build bdist_wheel
-          
-          echo "PyTorch wheel built:"
-          ls -la dist/
-
-      - name: Build torch_npu wheel
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-          
-          # 安装刚编译的 PyTorch
-          pip${{ inputs.python_version }} install pytorch-src/dist/*.whl
-          
-          cd torch_npu-src
-          pip${{ inputs.python_version }} install pyyaml setuptools wheel cmake ninja
-          
-          export MAX_JOBS=40
-          bash ci/build.sh --python=${{ inputs.python_version }}
-          
-          echo "torch_npu wheel built:"
-          ls -la dist/
-
-      - name: Package test source
-        run: |
-          tar -czf test-src.tar.gz pytorch-src
-          ls -la test-src.tar.gz
-
-      - name: Upload PyTorch wheel
-        uses: actions/upload-artifact@v4
-        with:
-          name: torch-wheel-main
-          path: pytorch-src/dist/*.whl
-          retention-days: 7
-
-      - name: Upload torch_npu wheel
-        uses: actions/upload-artifact@v4
-        with:
-          name: torch-npu-wheel-main
-          path: torch_npu-src/dist/*.whl
-          retention-days: 7
-
-      - name: Upload test source
-        uses: actions/upload-artifact@v4
-        with:
-          name: test-src-main
-          path: test-src.tar.gz
-          retention-days: 7
-```
-
----
-
-## 阶段3: 用例收集 (collect_cases Job)
-
-```yaml
-collect_cases:
-  needs: build
-  runs-on: linux-aarch64-a3-16
-  timeout-minutes: 60
-  
-  # 使用 Build Job 输出的同一个 Docker 镜像
-  container:
-    image: ${{ needs.build.outputs.docker-image }}
-    options: --user root
-  
-  outputs:
-    distributed_matrix: ${{ steps.collect.outputs.distributed_matrix }}
-    regular_matrix: ${{ steps.collect.outputs.regular_matrix }}
-    total_cases: ${{ steps.collect.outputs.total_cases }}
-
-  steps:
-    - name: Checkout scripts
-      uses: actions/checkout@v4
-      with:
-        sparse-checkout: .github/scripts
-
-    - name: Download wheels
-      uses: actions/download-artifact@v4
-      with:
-        name: torch-wheel-main
-        path: wheels
-
-    - name: Download torch_npu wheel
-      uses: actions/download-artifact@v4
-      with:
-        name: torch-npu-wheel-main
-        path: wheels
-
-    - name: Download test source
-      uses: actions/download-artifact@v4
-      with:
-        name: test-src-main
-
-    - name: Extract test source
-      run: tar -xzf test-src.tar.gz
-
-    - name: Install wheels
-      run: |
-        source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-        pip3.11 install wheels/*.whl
-
-    - name: Collect all test cases
-      id: collect
-      run: |
-        source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-        cd pytorch-src
-        
-        python3.11 ../.github/scripts/collect_all_cases.py \
-          --test-dir test \
-          --distributed-shards 2 \
-          --regular-shards 5 \
-          --output-dir cases_shards \
-          --parallel 16
-        
-        TOTAL=$(python3 -c "import json; d=json.load(open('cases_shards/cases_collection_summary.json')); print(d['total_cases'])")
-        echo "total_cases=${TOTAL}" >> $GITHUB_OUTPUT
-        
-        echo "distributed_matrix=[1,2]" >> $GITHUB_OUTPUT
-        echo "regular_matrix=[1,2,3,4,5]" >> $GITHUB_OUTPUT
-
-    - name: Upload cases shard JSONs
-      uses: actions/upload-artifact@v4
-      with:
-        name: cases-shards
-        path: pytorch-src/cases_shards/
-```
-
----
-
-## 阶段4: 测试执行 (_test.yml)
-
-```yaml
-name: Run NPU Tests
-
-on:
-  workflow_call:
-    inputs:
-      docker-image:
-        required: true
-        type: string
-      test-type:
-        required: true
-        type: string
-      shard-index:
-        required: true
-        type: number
-      max-workers:
-        required: false
-        type: number
-        default: 1
-      timeout:
-        required: false
-        type: number
-        default: 1200
-
-jobs:
-  test:
-    runs-on: linux-aarch64-a3-16
-    timeout-minutes: 1200
-    
-    # 使用 Build Job 输出的同一个 Docker 镜像
-    container:
-      image: ${{ inputs.docker-image }}
-      options: --user root
-
-    steps:
-      - name: Checkout scripts
-        uses: actions/checkout@v4
-        with:
-          sparse-checkout: .github/scripts
-
-      - name: Download torch wheel
-        uses: actions/download-artifact@v4
-        with:
-          name: torch-wheel-main
-          path: wheels
-
-      - name: Download torch_npu wheel
-        uses: actions/download-artifact@v4
-        with:
-          name: torch-npu-wheel-main
-          path: wheels
-
-      - name: Download test source
-        uses: actions/download-artifact@v4
-        with:
-          name: test-src-main
-
-      - name: Download cases shard
-        uses: actions/download-artifact@v4
-        with:
-          name: cases-shards
-          path: cases-shards
-
-      - name: Extract test source
-        run: tar -xzf test-src.tar.gz
-
-      - name: Install wheels
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          pip3.11 install wheels/*.whl
-
-      - name: Run ${{ inputs.test-type }} shard ${{ inputs.shard-index }}
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-          
-          python .github/scripts/run_npu_test_shard.py \
-            --cases-json cases-shards/${{ inputs.test-type }}_cases_shard_${{ inputs.shard-index }}.json \
-            --test-dir pytorch-src/test \
-            --report-dir test-reports \
-            --timeout ${{ inputs.timeout }} \
-            --max-workers ${{ inputs.max-workers }} \
-            --verbose
-
-      - name: Upload test reports
-        if: always()
-        uses: actions/upload-artifact@v4
-        with:
-          name: test-reports-${{ inputs.test-type }}-${{ inputs.shard-index }}
-          path: test-reports/
-          retention-days: 30
-```
-
----
-
-## 主 Workflow: npu-full-test.yml
-
-```yaml
-name: PyTorch NPU Full Test
-
-on:
-  push:
-    branches: [main, master]
-    paths:
-      - '.github/workflows/**'
-      - '.github/scripts/**'
-      - '.github/docker/**'
-  schedule:
-    - cron: '0 22 * * 1'  # UTC 22:00, Beijing 06:00, every Monday
-  workflow_dispatch:
-    inputs:
-      docker_image_tag:
-        description: 'Docker image tag (e.g., manylinux-cann9.0-20260428)'
-        default: 'latest'
-      pytorch_branch:
-        description: 'PyTorch branch to build'
-        default: 'main'
-      distributed_shards:
-        description: 'Distributed test shards'
-        default: '2'
-      regular_shards:
-        description: 'Regular test shards'
-        default: '5'
-
-env:
-  REGISTRY: ghcr.io
-  PYTHON_VERSION: '3.11'
-
-jobs:
-  build:
-    uses: ./.github/workflows/_build.yml
-    with:
-      pytorch_branch: ${{ inputs.pytorch_branch || 'main' }}
-      python_version: '3.11'
-      docker_image_tag: ${{ inputs.docker_image_tag || 'latest' }}
-
-  collect_cases:
-    needs: build
-    uses: ./.github/workflows/_collect.yml
-    with:
-      docker-image: ${{ needs.build.outputs.docker-image }}
-
-  test_distributed:
-    needs: 
-      - build
-      - collect_cases
-    strategy:
-      matrix:
-        shard: ${{ fromJson(needs.collect_cases.outputs.distributed_matrix) }}
-      fail-fast: false
-      max-parallel: 2
-    uses: ./.github/workflows/_test.yml
-    with:
-      docker-image: ${{ needs.build.outputs.docker-image }}
-      test-type: distributed
-      shard-index: ${{ matrix.shard }}
-      max-workers: 1  # Distributed 串行执行
-      timeout: 1200
-
-  test_regular:
-    needs: 
-      - build
-      - collect_cases
-    strategy:
-      matrix:
-        shard: ${{ fromJson(needs.collect_cases.outputs.regular_matrix) }}
-      fail-fast: false
-      max-parallel: 5
-    uses: ./.github/workflows/_test.yml
-    with:
-      docker-image: ${{ needs.build.outputs.docker-image }}
-      test-type: regular
-      shard-index: ${{ matrix.shard }}
-      max-workers: 32  # Regular 并发执行
-      timeout: 1200
-
-  report:
-    needs: 
-      - test_distributed
-      - test_regular
-    runs-on: ubuntu-latest
-    if: always()
-    
-    steps:
-      - name: Checkout scripts
-        uses: actions/checkout@v4
-        with:
-          sparse-checkout: .github/scripts
-
-      - name: Download all test reports
-        uses: actions/download-artifact@v4
-        with:
-          pattern: test-reports-*
-          path: all-reports
-          merge-multiple: false
-
-      - name: Generate consolidated report
-        run: |
-          python .github/scripts/generate_report.py \
-            --reports-root all-reports \
-            --output-markdown report.md \
-            --output-json report.json
-
-      - name: Upload final report
-        uses: actions/upload-artifact@v4
-        with:
-          name: npu-full-test-report
-          path: |
-            report.md
-            report.json
-          retention-days: 30
-
-      - name: Package all logs
-        run: tar -czf all-test-logs.tar.gz all-reports/
-
-      - name: Upload logs artifact
-        uses: actions/upload-artifact@v4
-        with:
-          name: all-test-logs
-          path: all-test-logs.tar.gz
-          retention-days: 30
-```
-
----
-
-## 关键脚本设计
-
-### collect_all_cases.py (用例收集)
-
-核心功能：
-1. 扫描 test 目录下所有 test_*.py 文件
-2. 分类为 distributed/regular 类型
-3. 通过 pytest --collect-only 收集每个文件的用例
-4. 将用例均分到 shards
-5. 保存 shard JSON 文件
-
-关键实现参考 PR #117 的 collect_all_cases.py。
-
-### run_npu_test_shard.py (测试执行)
-
-核心功能：
-1. 加载 shard JSON 中的用例列表
-2. 每个用例启动独立 pytest subprocess (崩溃隔离)
-3. 并发执行 (ThreadPoolExecutor，max_workers 参数控制)
-4. 收集结果：passed/failed/error/crashed/timeout
-5. 保存每个用例的结果 JSON
-
-关键实现参考 PR #117 的 run_npu_test_shard.py。
-
-### generate_report.py (报告生成)
-
-核心功能：
-1. 读取所有 shard 的 cases JSON
-2. 统计汇总：总用例数、passed/failed/crashed/timeout
-3. 生成 Markdown 报告表格
-4. 生成 JSON 详细报告
-
----
-
-## 关键设计要点
-
-### 1. Docker 镜像传递机制
-
-与上游 PyTorch 一致的做法：
-
-```yaml
-# Build Job 输出镜像 URL
-outputs:
-  docker-image: ${{ steps.set_image.outputs.docker-image }}
-
-# Test Job 使用同一个镜像
-container:
-  image: ${{ needs.build.outputs.docker-image }}
-```
-
-优势：
-- 构建和测试环境完全一致
-- 避免 artifact 传递导致的环境差异
-- 镜像标签带时间戳，便于追溯
-
-### 2. CANN 环境初始化
-
-在 Dockerfile 中预装 CANN，并在镜像中添加环境初始化脚本：
-
-```bash
-source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-```
-
-每个 job 开始时调用此脚本确保环境正确。
-
-### 3. 用例级分片
-
-- pytest --collect-only 预收集所有用例
-- 按用例数量均分到 shards，实现负载均衡
-- 遏制某些 shard 因包含大文件而过载
-
-### 4. 每用例独立进程执行
-
-- 每个 pytest case 在独立 subprocess 中运行
-- 崩溃隔离：一个 case crash 不影响其他 case
-- 支持超时控制
-- 结果独立记录
-
-### 5. Distributed vs Regular 区分
-
-- Distributed: 需要 HCCL 分布式环境，串行执行 (max-workers=1)
-- Regular: 单机测试，并发执行 (max-workers=32)
-
-### 6. PYTHONPATH 处理
-
-某些测试文件导入同级模块：
-- 执行前动态添加测试文件父目录到 PYTHONPATH
-
-### 7. Runner 统一使用 a3-16
-
-- Build、Collect、Test 都使用 linux-aarch64-a3-16
-- 简化配置，统一环境
-- 16-card NPU 支持分布式测试
-
----
-
-## 验证方案
-
-### 步骤1: 验证 Docker 镜像构建
-
-```bash
-# 本地构建测试
-cd .github/docker
-docker build -t pytorch-npu-builder:test -f pytorch-npu-builder.Dockerfile .
-
-# 验证 CANN 安装
-docker run --rm pytorch-npu-builder:test \
-  bash -c "source /usr/local/Ascend/cann/set_env.sh && echo CANN installed"
-```
-
-### 步骤2: 验证构建流程
-
-触发 workflow_dispatch，使用少量配置：
-- docker_image_tag: latest (或手动构建的标签)
-- pytorch_branch: main
-- distributed_shards: 1
-- regular_shards: 1
-
-### 步骤3: 验证用例收集和执行
-
-检查 collect_cases job 输出：
-- cases_collection_summary.json 内容
-- shard JSON 文件数量和内容
-
-检查 test job 输出：
-- cases JSON 结果统计
-- 失败用例日志
-
-### 步骤4: 验证报告生成
-
-检查 report job 输出：
-- report.md Markdown 报告
-- report.json 详细数据
-- all-test-logs.tar.gz 日志制品
-
----
-
-## 预估工作量
-
-| 任务 | 预估时间 |
-|------|----------|
-| 创建 Dockerfile | 1 小时 |
-| 创建 build-docker-image.yml | 1 小时 |
-| 创建 _build.yml | 2 小时 |
-| 创建 _test.yml | 2 小时 |
-| 创建 npu-full-test.yml | 1 小时 |
-| 开发 collect_all_cases.py | 3 小时 |
-| 开发 run_npu_test_shard.py | 4 小时 |
-| 开发 generate_report.py | 2 小时 |
-| 本地镜像构建验证 | 2 小时 |
-| CI 调试和优化 | 4 小时 |
-| **总计** | **~20 小时** |
-
----
-
-## 待确认事项
-
-1. **镜像仓库权限**: ghcr.io 推送需要 `packages: write` 权限，确认仓库已启用
-2. **Runner 访问**: linux-aarch64-a3-16 runner 确认可用
-3. **CANN 版本**: 当前使用 9.0.0-beta.2，是否需要支持多版本
-4. **镜像更新频率**: 每周构建一次是否合适，或需要手动触发机制
\ No newline at end of file

From 84e198ab19d6eaf45ae8a0a5170f34922cc8518a Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 14 May 2026 15:29:33 +0800
Subject: [PATCH 129/132] =?UTF-8?q?=E9=87=8D=E6=9E=84workflow=EF=BC=9A?=
 =?UTF-8?q?=E6=8A=BD=E5=8F=96=E5=85=AC=E5=85=B1=E4=BE=9D=E8=B5=96=E5=AE=89?=
 =?UTF-8?q?=E8=A3=85=E4=B8=BAaction=E5=B9=B6=E4=BC=98=E5=8C=96=E6=97=A5?=
 =?UTF-8?q?=E5=BF=97?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

主要修改：
- 新增setup-npu-test-env action：封装checkout、cache、安装torch/torch_npu、测试依赖等公共步骤
- 简化4个子workflow：collect/custom/dist/regular统一调用action
- 优化collect_all_cases.py日志：display_name提前计算避免重复逻辑
- 简化run_npu_test_shard.py：移除废弃的shard discovery模式，保留cases-json和test-files模式
- distributed测试串行执行通过max_workers=1实现

参数清理：
- action只保留实际使用的参数：python_version、torch_wheel_artifact、torch_npu_wheel_artifact、pytorch_src_artifact
- 删除无意义的pytorch_version、cache_key_prefix、patch_log_suffix参数

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/actions/setup-npu-test-env/action.yml | 146 +++
 .github/scripts/collect_all_cases.py          |  51 +-
 .github/scripts/run_npu_test_shard.py         | 870 ++----------------
 .../workflows/_torch-npu-upstream-collect.yml | 127 +--
 .../_torch-npu-upstream-test-custom.yml       | 121 +--
 .../_torch-npu-upstream-test-dist.yml         | 126 +--
 .../_torch-npu-upstream-test-regular.yml      | 128 +--
 7 files changed, 266 insertions(+), 1303 deletions(-)
 create mode 100644 .github/actions/setup-npu-test-env/action.yml

diff --git a/.github/actions/setup-npu-test-env/action.yml b/.github/actions/setup-npu-test-env/action.yml
new file mode 100644
index 0000000000..1c45976642
--- /dev/null
+++ b/.github/actions/setup-npu-test-env/action.yml
@@ -0,0 +1,146 @@
+name: 'Setup NPU Test Environment'
+description: 'Common environment setup for NPU upstream tests - checkout, cache, install PyTorch/torch_npu/triton-ascend, test dependencies'
+
+inputs:
+  python_version:
+    required: true
+    type: string
+    description: Python version to use
+  torch_wheel_artifact:
+    required: true
+    type: string
+    description: Name of the torch wheel artifact
+  torch_npu_wheel_artifact:
+    required: true
+    type: string
+    description: Name of the torch_npu wheel artifact
+  pytorch_src_artifact:
+    required: true
+    type: string
+    description: Name of the PyTorch source artifact
+
+env:
+  # PyPI 缓存 URL（用于加速 pip 下载）
+  PYPI_CACHE_URL: 'http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple'
+
+runs:
+  using: 'composite'
+  steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          repository: ${{ github.repository }}
+          ref: ${{ github.ref }}
+          fetch-depth: 1
+          path: ascend_pytorch
+
+      - name: Setup cache directories
+        run: |
+          mkdir -p /github/home/.cache/pip
+          chmod -R 777 /github/home/.cache
+
+      - name: Cache pip
+        uses: actions/cache@v4
+        with:
+          path: /github/home/.cache/pip
+          key: pip-arm-collect-py${{ inputs.python_version }}
+          restore-keys: |
+            pip-arm-collect-py${{ inputs.python_version }}-
+            pip-arm-collect-
+
+      - name: Download built torch wheel
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ inputs.torch_wheel_artifact }}
+          path: torch-wheel-artifact
+
+      - name: Download built torch_npu wheel
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ inputs.torch_npu_wheel_artifact }}
+          path: torch-npu-wheel-artifact
+
+      - name: Download PyTorch source and test code
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ inputs.pytorch_src_artifact }}
+          path: pytorch-src-artifact
+
+      - name: Extract PyTorch source
+        run: |
+          tar -xzf pytorch-src-artifact/pytorch-src.tar.gz
+
+      - name: Install built PyTorch and torch_npu
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          PIP=pip${{ inputs.python_version }}
+          PYTHON=python${{ inputs.python_version }}
+          export PIP_CACHE_DIR=/github/home/.cache/pip
+
+          # Configure pip to use PyPI cache for faster downloads
+          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
+            $PIP config set global.index-url ${{ env.PYPI_CACHE_URL }}
+            $PIP config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local"
+            echo "pip index-url configured: ${{ env.PYPI_CACHE_URL }}"
+          fi
+
+          $PIP install --upgrade pip
+
+          # Install built torch wheel
+          TORCH_WHL=$(ls torch-wheel-artifact/*.whl | head -1)
+          $PIP install "${TORCH_WHL}"
+
+          # Install built torch_npu wheel
+          TORCH_NPU_WHL=$(ls torch-npu-wheel-artifact/*.whl | head -1)
+          $PIP install "${TORCH_NPU_WHL}"
+
+          echo "Installed PyTorch and torch_npu from built wheels"
+          echo "torch: ${TORCH_WHL}"
+          echo "torch_npu: ${TORCH_NPU_WHL}"
+
+      - name: Install test dependencies
+        run: |
+          PIP=pip${{ inputs.python_version }}
+          export PIP_CACHE_DIR=/github/home/.cache/pip
+          cd pytorch-src
+
+          # Core test dependencies
+          $PIP install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
+          $PIP install pytest-rerunfailures pytest-flakefinder
+          $PIP install 'pytest-subtests==0.13.1' 'xdoctest==1.1.0' 'pulp>=2.9'
+
+          # Optional dependencies for ONNX tests
+          # These are not in PyTorch requirements.txt but needed by specific tests
+          $PIP install onnxruntime onnxscript onnx-ir ml-dtypes || true
+
+          # torchvision for ONNX model tests (install without deps to bypass torch version check)
+          # PyPI torchvision requires exact torch version (torch==2.11.0), but we have dev build
+          # Use --no-deps to skip torch dependency, we already have our compiled torch installed
+          $PIP install numpy pillow || true
+          $PIP install torchvision --no-deps || true
+
+          # Other optional dependencies
+          $PIP install parameterized pandas || true
+          $PIP install opencv-python || true
+
+          # PyTorch requirements (if exists)
+          if [ -f requirements.txt ]; then
+            $PIP install -r requirements.txt || true
+          fi
+
+      - name: Verify NPU availability
+        run: |
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          PYTHON=python${{ inputs.python_version }}
+          $PYTHON -c "
+          import torch
+          print(f'torch: {torch.__version__}')
+          import torch_npu
+          print(f'torch_npu: {torch_npu.__version__}')
+          print(f'NPU available: {torch.npu.is_available()}')
+          print(f'NPU count: {torch.npu.device_count()}')
+          "
\ No newline at end of file
diff --git a/.github/scripts/collect_all_cases.py b/.github/scripts/collect_all_cases.py
index 33375ace9e..782fa3679f 100644
--- a/.github/scripts/collect_all_cases.py
+++ b/.github/scripts/collect_all_cases.py
@@ -33,6 +33,21 @@
 import discover_test_files
 
 
+def _normalize_test_file_path(test_file: str) -> str:
+    """
+    Remove 'test/' prefix from test file path if present.
+
+    Args:
+        test_file: Test file path (e.g., "test/distributed/pipelining/test_backward.py")
+
+    Returns:
+        Relative path without 'test/' prefix
+    """
+    if test_file.startswith("test/"):
+        return test_file[5:]
+    return test_file
+
+
 def get_test_file_parent_dir(test_file: str, test_dir: Path) -> Path:
     """
     Get the parent directory of a test file.
@@ -47,16 +62,12 @@ def get_test_file_parent_dir(test_file: str, test_dir: Path) -> Path:
     Returns:
         Path to the test file's parent directory
     """
-    if test_file.startswith("test/"):
-        test_file_rel = test_file[5:]
-    else:
-        test_file_rel = test_file
-
+    test_file_rel = _normalize_test_file_path(test_file)
     test_file_path = Path(test_file_rel)
     return test_dir / test_file_path.parent
 
 
-def collect_cases_for_file(test_file: str, test_dir: Path) -> Tuple[str, List[str], bool, str]:
+def collect_cases_for_file(test_file: str, test_dir: Path) -> Tuple[str, str, List[str], bool, str]:
     """
     Collect test cases from a single file.
 
@@ -64,18 +75,16 @@ def collect_cases_for_file(test_file: str, test_dir: Path) -> Tuple[str, List[st
     imports of sibling modules (e.g., 'from model_registry import MLPModule').
 
     Returns:
-        Tuple of (test_file, nodeids, success, error_message)
+        Tuple of (test_file, display_name, nodeids, success, error_message)
         - test_file: Original test file path
+        - display_name: Short name for logging (remove test/ prefix and .py suffix)
         - nodeids: List of collected test case nodeids
         - success: True if collection succeeded without errors
         - error_message: Error details if collection failed, empty string otherwise
     """
-    if test_file.startswith("test/"):
-        test_file_rel = test_file[5:]
-    else:
-        test_file_rel = test_file
+    test_file_rel = _normalize_test_file_path(test_file)
 
-    # Extract display name (remove test/ prefix and .py suffix)
+    # Extract display name (remove .py suffix)
     display_name = test_file_rel
     if display_name.endswith(".py"):
         display_name = display_name[:-3]
@@ -124,21 +133,21 @@ def collect_cases_for_file(test_file: str, test_dir: Path) -> Tuple[str, List[st
         # returncode 5 means 0 cases collected, which indicates a problem.
         if result.returncode in (0, 3):
             # Normal: passed or skipped
-            return (test_file, nodeids, True, "")
+            return (test_file, display_name, nodeids, True, "")
         else:
             # returncode 2, 4, 5: real collection error
             # returncode 5 specifically means no tests collected - a problem for selected files
             error_msg = result.stdout.strip()
             if result.stderr.strip():
                 error_msg += "\n--- stderr ---\n" + result.stderr.strip()
-            return (test_file, nodeids, False, error_msg)
+            return (test_file, display_name, nodeids, False, error_msg)
 
     except subprocess.TimeoutExpired:
         error_msg = f"TIMEOUT: Collection took >120s for {display_name}"
-        return (test_file, [], False, error_msg)
+        return (test_file, display_name, [], False, error_msg)
     except Exception as e:
         error_msg = f"ERROR: {e}"
-        return (test_file, [], False, error_msg)
+        return (test_file, display_name, [], False, error_msg)
 
 
 def collect_all_cases(
@@ -180,17 +189,9 @@ def collect_all_cases(
         total_cases = 0
 
         for future in as_completed(futures):
-            test_file, nodeids, success, error_msg = future.result()
+            test_file, display_name, nodeids, success, error_msg = future.result()
             completed += 1
 
-            # Extract display name for logging
-            if test_file.startswith("test/"):
-                display_name = test_file[5:]
-            else:
-                display_name = test_file
-            if display_name.endswith(".py"):
-                display_name = display_name[:-3]
-
             if success:
                 successful_count += 1
                 # Print concise log for successful files
diff --git a/.github/scripts/run_npu_test_shard.py b/.github/scripts/run_npu_test_shard.py
index 4f1f3c9b70..0d7d1ba4b4 100644
--- a/.github/scripts/run_npu_test_shard.py
+++ b/.github/scripts/run_npu_test_shard.py
@@ -1,39 +1,45 @@
 #!/usr/bin/env python3
 """
-Run a shard of patched upstream PyTorch tests via per-case isolation pytest execution.
+Run PyTorch NPU tests via per-case isolation pytest execution.
 
-This script focuses on:
-    - Test discovery (via discover_test_files.py)
-    - Shard assignment (Step 4)
-    - Per-case isolation execution (serial or concurrent subprocess)
+This script executes pre-collected test cases or specified test files
+with per-case subprocess isolation for crash safety.
 
-Result parsing is handled by parse_test_results.py.
-
-Test types:
-    - distributed: NPU distributed tests (test/distributed/*) - serial execution
-    - regular: All other tests - concurrent execution (max 4 workers by default)
+Execution modes:
+    - Pre-collected cases (--cases-json): Execute cases from JSON file
+    - Custom test files (--test-files): Execute specified test files
 
-Each shard executes tests in per-case isolation mode:
-    - First collect all test cases via pytest --collect-only
-    - Each case runs in its own pytest subprocess
-    - NPU kernel crashes won't cascade to other cases (each case isolated)
+Each case runs in its own pytest subprocess for isolation:
+    - NPU kernel crashes won't cascade to other cases
     - Results recorded in cases.json file
 
-Execution modes:
-    - Serial: One case at a time (for distributed tests)
-    - Concurrent: Up to max_workers subprocesses running simultaneously (for regular tests)
+Test types:
+    - distributed: Serial execution (one case at a time)
+    - regular: Concurrent execution (multiple workers)
 
 Usage:
+    # Pre-collected cases mode (primary usage):
+    python run_npu_test_shard.py \
+        --cases-json distributed_cases_shard_1.json \
+        --test-dir /path/to/pytorch/test \
+        --disabled-testcases /path/to/disabled_testcases.json \
+        --report-dir test-reports \
+        --timeout 1200 \
+        --max-workers 64 \
+        --verbose
+
+    # Custom test files mode:
     python run_npu_test_shard.py \
-        --shard 1 \
-        --num-shards 50 \
-        --test-type distributed \
+        --test-files test_meta.py,test_nn.py \
         --test-dir /path/to/pytorch/test \
         --disabled-testcases /path/to/disabled_testcases.json \
         --report-dir test-reports \
         --timeout 1200 \
         --max-workers 4 \
         --verbose
+
+Note: Shard discovery mode (--shard/--num-shards/--test-type) has been removed.
+      Use collect_all_cases.py for case discovery and sharding.
 """
 
 import argparse
@@ -41,8 +47,6 @@
 import importlib.util
 import json
 import os
-import re
-import signal
 import subprocess
 import sys
 import threading
@@ -52,7 +56,7 @@
 from pathlib import Path
 from queue import Queue
 from time import monotonic
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Tuple
 
 
 # ==============================================================================
@@ -77,53 +81,6 @@ def load_parse_test_results_module(script_dir: Path):
 # ==============================================================================
 
 
-@dataclasses.dataclass
-class DiscoveryResult:
-    """Result from discover_test_files.py."""
-    test_files: List[str]
-    metadata: Dict
-    total_files: int
-
-
-@dataclasses.dataclass
-class ShardAssignmentResult:
-    """Result of Step 4: Shard assignment."""
-    shard: int
-    num_shards: int
-    planned_tests: List[str]
-    planned_count: int
-
-
-@dataclasses.dataclass
-class ShardPlanResult:
-    """Complete result of discovery + shard assignment."""
-    discovery: DiscoveryResult
-    shard_assignment: ShardAssignmentResult
-
-    def get_planned_tests(self) -> List[str]:
-        return self.shard_assignment.planned_tests
-
-    def to_info_dict(self) -> Dict:
-        return {
-            "total_files": self.discovery.metadata.get("total_files", 0),
-            "test_type": self.discovery.metadata.get("test_type", "regular"),
-            "type_selected_files": self.discovery.metadata.get("type_selected", 0),
-            "type_excluded_files": self.discovery.metadata.get("type_excluded", 0),
-            "whitelist_entries": self.discovery.metadata.get("whitelist_entries", 0),
-            "blacklist_entries": self.discovery.metadata.get("blacklist_entries", 0),
-            "rules_selected": self.discovery.metadata.get("rules_selected", 0),
-            "rules_excluded": self.discovery.metadata.get("rules_excluded", 0),
-            "shard": self.shard_assignment.shard,
-            "num_shards": self.shard_assignment.num_shards,
-            "shard_files": self.shard_assignment.planned_count,
-        }
-
-
-# ==============================================================================
-# Concurrent Execution Data Classes
-# ==============================================================================
-
-
 @dataclasses.dataclass
 class CaseExecutionTask:
     """Task for concurrent case execution."""
@@ -142,7 +99,7 @@ class ConcurrentExecutionConfig:
 
 
 # ==============================================================================
-# Failed Case Log Saving Functions
+# Case Log Saving Functions
 # ==============================================================================
 
 
@@ -177,7 +134,7 @@ def sanitize_nodeid_for_filename(nodeid: str) -> str:
     return safe_name or "unknown_case"
 
 
-def save_failed_case_log(
+def save_case_log(
     report_dir: Path,
     shard: int,
     shard_type: str,
@@ -191,7 +148,7 @@ def save_failed_case_log(
     command: str,
 ) -> Path:
     """
-    Save complete execution log for a failed case.
+    Save complete execution log for all test cases.
 
     Creates a dedicated log file containing:
     - Case metadata (nodeid, status, duration, returncode)
@@ -201,25 +158,20 @@ def save_failed_case_log(
     Returns:
         Path to the saved log file
     """
-    # Save log for error/timeout cases (no XML generated)
-    # Cases with valid XML don't need log saving (XML contains all info)
-    if status not in ("error", "timeout"):
-        return None
-
-    # Create failed cases log directory
-    failed_logs_dir = report_dir / "failed_cases_logs"
-    failed_logs_dir.mkdir(parents=True, exist_ok=True)
+    # Create cases log directory
+    cases_logs_dir = report_dir / "cases_logs"
+    cases_logs_dir.mkdir(parents=True, exist_ok=True)
 
     # Generate safe filename
     safe_name = sanitize_nodeid_for_filename(nodeid)
     prefix = "dist" if shard_type == "distributed" else "reg"
     log_filename = f"{prefix}-{shard}_{case_idx}_{safe_name}.log"
-    log_path = failed_logs_dir / log_filename
+    log_path = cases_logs_dir / log_filename
 
     # Write log content
     content_lines = [
         "=" * 80,
-        f"FAILED CASE LOG",
+        f"CASE LOG",
         "=" * 80,
         f"Shard: {prefix}-{shard}",
         f"Case Index: {case_idx}",
@@ -335,11 +287,6 @@ def mark_completed(self, nodeid: str, status: str, duration: float) -> None:
                   f"{status_icon} {display_nodeid} ({duration:.1f}s) "
                   f"[elapsed: {elapsed:.0f}s]", flush=True)
 
-    def get_progress(self) -> Tuple[int, int]:
-        """Get current progress."""
-        with self._lock:
-            return self._completed_tasks, self._total_tasks
-
 
 # ==============================================================================
 # JUnit XML Parsing for Accurate Status Detection
@@ -401,141 +348,6 @@ def parse_junit_xml_status(xml_file: Path) -> Dict:
 # ==============================================================================
 
 
-def get_signal_name(signal_num: int) -> str:
-    """Convert signal number to human-readable name."""
-    try:
-        name = signal.Signals(signal_num).name
-        return f"{name}({signal_num})"
-    except ValueError:
-        return f"SIG{signal_num}"
-
-
-# ==============================================================================
-# Discovery Integration
-# ==============================================================================
-
-
-def load_discover_module(script_dir: Path):
-    """Load discover_test_files module dynamically."""
-    module_path = script_dir / "discover_test_files.py"
-    if not module_path.exists():
-        raise FileNotFoundError(f"discover_test_files.py not found at {module_path}")
-
-    spec = importlib.util.spec_from_file_location("discover_test_files", str(module_path))
-    module = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(module)
-    return module
-
-
-def run_discovery(
-    test_dir: Path,
-    test_type: str,
-    discover_module,
-) -> DiscoveryResult:
-    """Run test discovery and return DiscoveryResult."""
-    test_files, metadata = discover_module.discover_test_files(
-        test_dir=test_dir,
-        test_type=test_type,
-        case_paths_config=None,
-    )
-
-    return DiscoveryResult(
-        test_files=test_files,
-        metadata=metadata,
-        total_files=len(test_files),
-    )
-
-
-# ==============================================================================
-# Shard Assignment (Step 4)
-# ==============================================================================
-
-
-def select_shard_files(test_files: List[str], shard: int, num_shards: int) -> List[str]:
-    """
-    Select test files for a shard using contiguous range-based selection.
-
-    Args:
-        test_files: List of test file paths, already sorted alphabetically
-        shard: Shard number (1-indexed, 1 <= shard <= num_shards)
-        num_shards: Total number of shards
-
-    Returns:
-        List of test files assigned to this shard
-    """
-    if not test_files:
-        return []
-
-    shard_index = shard - 1
-    total_files = len(test_files)
-
-    base_size = total_files // num_shards
-    remainder = total_files % num_shards
-
-    if shard_index < remainder:
-        start = shard_index * (base_size + 1)
-        end = start + base_size + 1
-    else:
-        start = remainder * (base_size + 1) + (shard_index - remainder) * base_size
-        end = start + base_size
-
-    return test_files[start:end]
-
-
-def assign_shard(discovery_result: DiscoveryResult, shard: int, num_shards: int) -> ShardAssignmentResult:
-    """Assign test files to a specific shard."""
-    planned_tests = select_shard_files(discovery_result.test_files, shard, num_shards)
-    return ShardAssignmentResult(
-        shard=shard,
-        num_shards=num_shards,
-        planned_tests=planned_tests,
-        planned_count=len(planned_tests),
-    )
-
-
-# ==============================================================================
-# Complete Test Planning
-# ==============================================================================
-
-
-def plan_shard_tests(
-    test_dir: Path,
-    shard: int,
-    num_shards: int,
-    test_type: str,
-    discover_module,
-) -> ShardPlanResult:
-    """Complete test planning: discovery + shard assignment."""
-    discovery_result = run_discovery(test_dir, test_type, discover_module)
-    shard_assignment_result = assign_shard(discovery_result, shard, num_shards)
-
-    return ShardPlanResult(
-        discovery=discovery_result,
-        shard_assignment=shard_assignment_result,
-    )
-
-
-def create_test_plan_summary(result: ShardPlanResult) -> str:
-    """Create human-readable summary."""
-    lines = [
-        "=" * 60,
-        "Test Planning Summary",
-        "=" * 60,
-        f"Discovery (Steps 1-3): {result.discovery.metadata.get('total_files', 0)} files scanned",
-        f"  Test type: {result.discovery.metadata.get('test_type', 'regular')}",
-        f"  Type filter: {result.discovery.metadata.get('type_selected', 0)} selected",
-        f"  Rules filter: {result.discovery.metadata.get('rules_selected', 0)} after whitelist/blacklist",
-        f"Shard Assignment (Step 4): {result.shard_assignment.planned_count} files for shard {result.shard_assignment.shard}/{result.shard_assignment.num_shards}",
-        "=" * 60,
-    ]
-    return "\n".join(lines)
-
-
-# ==============================================================================
-# Utility Functions
-# ==============================================================================
-
-
 def strip_test_prefix_and_suffix(test_path: str) -> str:
     """Remove 'test/' prefix and '.py' suffix from path."""
     path = test_path
@@ -662,217 +474,6 @@ def collect_test_cases(test_file: str, test_dir: Path, env: Dict) -> List[str]:
         return []
 
 
-# ==============================================================================
-# Case Execution
-# ==============================================================================
-
-
-def run_single_test_case(
-    case_nodeid: str,
-    test_dir: Path,
-    env: Dict,
-    timeout: int,
-    verbose: bool,
-    report_dir: Path = None,
-    shard: int = 0,
-    shard_type: str = "regular",
-    case_idx: int = 0,
-    test_file: str = "",
-) -> Dict:
-    """
-    Run a single test case in isolated subprocess with JUnit XML output.
-
-    Adds test file's parent directory to PYTHONPATH to enable
-    imports of sibling modules (e.g., 'from model_registry import MLPModule').
-
-    Uses JUnit XML report for accurate status detection (passed/skipped/failed/error).
-    pytest returncode cannot distinguish passed from skipped accurately.
-
-    Args:
-        case_nodeid: Test case nodeid (e.g., "test_autograd.py::TestAutograd::test_grad")
-        test_dir: Path to PyTorch test directory
-        env: Environment dict for subprocess (will be modified for this call)
-        timeout: Per-case timeout in seconds
-        verbose: Verbose output
-        report_dir: Directory for XML reports and logs
-        shard: Shard number for XML filename
-        shard_type: "distributed" or "regular"
-        case_idx: Case index for XML filename
-        test_file: Test file path for PYTHONPATH calculation
-
-    Returns:
-        Dict with: nodeid, status, duration, returncode, message, command
-    """
-    start_time = monotonic()
-
-    # Preserve original nodeid for result reporting
-    original_nodeid = case_nodeid
-
-    # Strip test/ prefix from nodeid if present (pytest --collect-only outputs with test/ prefix)
-    if case_nodeid.startswith("test/"):
-        case_nodeid = case_nodeid[5:]
-
-    # Build per-case environment with test file directory in PYTHONPATH
-    case_env = env.copy()
-    if test_file:
-        if test_file.startswith("test/"):
-            test_file_rel = test_file[5:]
-        else:
-            test_file_rel = test_file
-
-        test_file_path = Path(test_file_rel)
-        test_file_dir = test_dir / test_file_path.parent
-
-        existing_pythonpath = case_env.get("PYTHONPATH", "")
-        case_env["PYTHONPATH"] = str(test_file_dir) + (":" + existing_pythonpath if existing_pythonpath else "")
-
-    # Generate XML file path for this case with descriptive name
-    prefix = "dist" if shard_type == "distributed" else "reg"
-    safe_case_name = sanitize_nodeid_for_filename(original_nodeid)
-    xml_filename = f"{prefix}-{shard}_{case_idx}_{safe_case_name}.xml"
-    xml_file = report_dir / "junit_xmls" / xml_filename
-    xml_file.parent.mkdir(parents=True, exist_ok=True)
-
-    command = [
-        sys.executable,
-        "-m",
-        "pytest",
-        "--color=no",
-        "-ra",
-        "--tb=short",
-        case_nodeid,
-        f"--junitxml={xml_file}",
-        "--junit-prefix=",
-    ]
-
-    if timeout > 0:
-        command.append(f"--timeout={timeout}")
-
-    if verbose:
-        command.append("-vv")
-    else:
-        command.append("-v")
-
-    # Print command to log
-    command_str = " ".join(command)
-    print(f"    Command: {command_str}")
-
-    try:
-        result = subprocess.run(
-            command,
-            cwd=str(test_dir),
-            env=case_env,
-            capture_output=True,
-            text=True,
-            encoding="utf-8",
-            errors="replace",
-            timeout=timeout + 10,  # Extra buffer for timeout handling
-        )
-
-        duration = monotonic() - start_time
-        returncode = result.returncode
-
-        # Parse JUnit XML for status (simple logic)
-        # - Has XML: use XML status (passed/failed/skipped/error), don't save logs
-        # - No XML: status = error, save stdout/stderr
-        xml_result = parse_junit_xml_status(xml_file)
-        xml_status = xml_result.get("status")
-
-        if xml_status == "no_xml":
-            # No XML generated → error, save logs
-            status = "error"
-            message = xml_result.get("message")
-            if report_dir:
-                save_failed_case_log(
-                    report_dir=report_dir,
-                    shard=shard,
-                    shard_type=shard_type,
-                    nodeid=original_nodeid,
-                    case_idx=case_idx,
-                    status=status,
-                    stdout=result.stdout,
-                    stderr=result.stderr,
-                    duration=duration,
-                    returncode=returncode,
-                    command=command_str,
-                )
-        else:
-            # Has valid XML → use XML status, don't save logs
-            status = xml_status
-            message = xml_result.get("message", "")
-
-        case_result = {
-            "nodeid": original_nodeid,
-            "status": status,
-            "duration": duration,
-            "returncode": returncode,
-            "message": message,
-            "command": command_str,
-        }
-
-        return case_result
-
-    except subprocess.TimeoutExpired:
-        duration = monotonic() - start_time
-        # Timeout → no XML generated
-        status = "timeout"
-        case_result = {
-            "nodeid": original_nodeid,
-            "status": status,
-            "duration": duration,
-            "returncode": -1,
-            "message": f"Timeout after {timeout}s",
-            "command": command_str,
-        }
-
-        # Save log for timeout (no XML)
-        if report_dir:
-            save_failed_case_log(
-                report_dir=report_dir,
-                shard=shard,
-                shard_type=shard_type,
-                nodeid=original_nodeid,
-                case_idx=case_idx,
-                status=status,
-                stdout="(process timed out, no output captured)",
-                stderr="(process timed out, no output captured)",
-                duration=duration,
-                returncode=-1,
-                command=command_str,
-            )
-
-        return case_result
-
-    except Exception as e:
-        duration = monotonic() - start_time
-        case_result = {
-            "nodeid": original_nodeid,
-            "status": "error",
-            "duration": duration,
-            "returncode": 1,
-            "message": str(e)[:500],
-            "command": command_str,
-        }
-
-        # Save error case log
-        if report_dir:
-            save_failed_case_log(
-                report_dir=report_dir,
-                shard=shard,
-                shard_type=shard_type,
-                nodeid=original_nodeid,
-                case_idx=case_idx,
-                status="error",
-                stdout="(exception occurred before execution)",
-                stderr=str(e),
-                duration=duration,
-                returncode=1,
-                command=command_str,
-            )
-
-        return case_result
-
-
 # ==============================================================================
 # Concurrent Case Execution
 # ==============================================================================
@@ -995,33 +596,35 @@ def run_single_case_concurrent(
         returncode = result.returncode
 
         # Parse JUnit XML for status
-        # - Has XML: use XML status, don't save logs
-        # - No XML: error, save logs
+        # - Has XML: use XML status
+        # - No XML: error
         xml_result = parse_junit_xml_status(xml_file)
         xml_status = xml_result.get("status")
 
         if xml_status == "no_xml":
-            # No XML → error, save logs
+            # No XML → error
             status = "error"
             message = xml_result.get("message")
-            save_failed_case_log(
-                report_dir=report_dir,
-                shard=shard,
-                shard_type=shard_type,
-                nodeid=original_nodeid,
-                case_idx=task.case_idx,
-                status=status,
-                stdout=result.stdout,
-                stderr=result.stderr,
-                duration=duration,
-                returncode=returncode,
-                command=command_str,
-            )
         else:
-            # Has XML → use XML status, don't save logs
+            # Has XML → use XML status
             status = xml_status
             message = xml_result.get("message", "")
 
+        # Save logs for all cases
+        save_case_log(
+            report_dir=report_dir,
+            shard=shard,
+            shard_type=shard_type,
+            nodeid=original_nodeid,
+            case_idx=task.case_idx,
+            status=status,
+            stdout=result.stdout,
+            stderr=result.stderr,
+            duration=duration,
+            returncode=returncode,
+            command=command_str,
+        )
+
         case_result = {
             "nodeid": original_nodeid,
             "status": status,
@@ -1049,7 +652,7 @@ def run_single_case_concurrent(
         }
 
         # Save log for timeout
-        save_failed_case_log(
+        save_case_log(
             report_dir=report_dir,
             shard=shard,
             shard_type=shard_type,
@@ -1078,7 +681,7 @@ def run_single_case_concurrent(
         }
 
         # Save error case log
-        save_failed_case_log(
+        save_case_log(
             report_dir=report_dir,
             shard=shard,
             shard_type=shard_type,
@@ -1358,180 +961,6 @@ def run_tests_with_concurrent_isolation(
     return summary["worst_returncode"], elapsed, result_aggregator.get_sorted_cases()
 
 
-def run_tests_with_case_isolation(
-    planned_tests: List[str],
-    shard: int,
-    test_dir: Path,
-    report_dir: Path,
-    env_updates: Dict[str, str],
-    timeout: int,
-    verbose: bool,
-    shard_type: str,
-    result_module,
-    quick_test: int = None,
-) -> Tuple[int, float, List[Dict]]:
-    """
-    Execute tests with per-case isolation (strict serial execution).
-
-    Each test case runs in its own pytest subprocess for crash isolation.
-    No parallel execution - strict serial processing.
-
-    Args:
-        quick_test: Maximum number of cases to execute (None = all cases)
-
-    Returns:
-        Tuple of (worst_returncode, duration, cases_list)
-    """
-    start = monotonic()
-    prefix = result_module.get_shard_type_prefix(shard_type)
-    log_file = result_module.get_shard_log_file(report_dir, shard, shard_type)
-
-    # Create junit_xmls directory for XML reports
-    junit_xml_dir = report_dir / "junit_xmls"
-    junit_xml_dir.mkdir(parents=True, exist_ok=True)
-
-    merged_env = os.environ.copy()
-    merged_env.update(env_updates)
-
-    cases_list = []
-    worst_returncode = 0
-
-    with log_file.open("w", encoding="utf-8") as log_handle:
-        log_handle.write("=" * 80 + "\n")
-        log_handle.write(f"Per-case isolation pytest execution ({shard_type} shard)\n")
-        log_handle.write("=" * 80 + "\n")
-        log_handle.write(f"Total test files: {len(planned_tests)}\n")
-        log_handle.write("Execution mode: strict serial, each case in own process\n")
-        log_handle.write("=" * 80 + "\n\n")
-        log_handle.flush()
-
-        print(f"\n{'=' * 80}")
-        print(f"Per-case isolation mode: {len(planned_tests)} files")
-        print("Execution mode: strict serial, each case in own process")
-        if quick_test:
-            print(f"Quick test mode: will execute up to {quick_test} cases")
-        print(f"{'=' * 80}\n")
-
-        total_cases = 0
-        case_idx = 0
-
-        for file_idx, test_file in enumerate(planned_tests, 1):
-            # Quick test: stop if already have enough cases
-            if quick_test and case_idx >= quick_test:
-                print(f"\nQuick test limit reached ({quick_test} cases), stopping execution")
-                break
-
-            test_name = strip_test_prefix_and_suffix(test_file)
-
-            log_handle.write(f"\n{'=' * 80}\n")
-            log_handle.write(f"[File {file_idx}/{len(planned_tests)}] {test_name}\n")
-            log_handle.write(f"{'=' * 80}\n")
-            log_handle.flush()
-
-            print(f"\n[File {file_idx}/{len(planned_tests)}] {test_name}")
-            print("  Collecting test cases...")
-
-            # Collect cases for this file
-            case_nodeids = collect_test_cases(test_file, test_dir, merged_env)
-
-            if not case_nodeids:
-                log_handle.write(f"  No cases collected\n")
-                print(f"    No cases collected")
-                continue
-
-            log_handle.write(f"  Collected {len(case_nodeids)} cases\n")
-            log_handle.flush()
-            print(f"    Collected {len(case_nodeids)} cases")
-
-            # Execute each case serially
-            for nodeid in case_nodeids:
-                case_idx += 1
-                total_cases += 1
-
-                log_handle.write(f"\n  [{case_idx}] {nodeid}\n")
-                log_handle.flush()
-
-                print(f"    [{case_idx}] {nodeid}")
-
-                # Run single case
-                case_result = run_single_test_case(
-                    nodeid,
-                    test_dir,
-                    merged_env,
-                    timeout,
-                    verbose,
-                    report_dir,
-                    shard,
-                    shard_type,
-                    case_idx,
-                    test_file,
-                )
-
-                # Add file info
-                case_result["file"] = test_file
-
-                # Log result
-                status_str = case_result["status"]
-                duration_str = f"{case_result['duration']:.2f}s"
-                command_str = case_result.get("command", "")
-                message = case_result.get("message", "")
-                log_handle.write(f"    Command: {command_str}\n")
-                log_handle.write(f"    Status: {status_str}, Duration: {duration_str}\n")
-                if message:
-                    log_handle.write(f"    Message: {message[:500]}\n")
-                log_handle.flush()
-
-                print(f"      {status_str} ({duration_str})")
-                # Print error message for failed/error cases to stdout
-                if status_str in ("failed", "error") and message:
-                    # Print first few lines of error message
-                    msg_lines = message.splitlines()[:5]
-                    for msg_line in msg_lines:
-                        if msg_line.strip():
-                            print(f"        {msg_line[:200]}")
-
-                cases_list.append(case_result)
-
-                # Track worst returncode
-                rc = case_result["returncode"]
-                if rc != 0 and rc != 3 and rc != 5:  # Ignore skipped/no_tests
-                    if worst_returncode == 0:
-                        worst_returncode = rc
-
-                # Quick test: stop after executing enough cases
-                if quick_test and case_idx >= quick_test:
-                    print(f"    Quick test limit reached ({quick_test} cases), stopping")
-                    break
-
-        # Summary
-        elapsed = monotonic() - start
-
-        passed_count = sum(1 for c in cases_list if c["status"] == "passed")
-        failed_count = sum(1 for c in cases_list if c["status"] == "failed")
-        error_count = sum(1 for c in cases_list if c["status"] == "error")
-        timeout_count = sum(1 for c in cases_list if c["status"] == "timeout")
-        skipped_count = sum(1 for c in cases_list if c["status"] == "skipped")
-
-        log_handle.write(f"\n{'=' * 80}\n")
-        log_handle.write(f"Summary: {total_cases} cases executed\n")
-        log_handle.write(f"  Passed: {passed_count}\n")
-        log_handle.write(f"  Failed: {failed_count}\n")
-        log_handle.write(f"  Errors: {error_count}\n")
-        log_handle.write(f"  Timeout: {timeout_count}\n")
-        log_handle.write(f"  Skipped: {skipped_count}\n")
-        log_handle.write(f"  Duration: {elapsed:.2f}s\n")
-        log_handle.write(f"{'=' * 80}\n")
-        log_handle.flush()
-
-        print(f"\n{'=' * 80}")
-        print(f"Summary: {total_cases} cases executed")
-        print(f"  Passed: {passed_count}, Failed: {failed_count}, Errors: {error_count}, Timeout: {timeout_count}, Skipped: {skipped_count}")
-        print(f"  Duration: {elapsed:.2f}s")
-        print(f"{'=' * 80}")
-
-    return worst_returncode, elapsed, cases_list
-
-
 def run_tests_with_tasks_concurrent(
     tasks: List[CaseExecutionTask],
     shard: int,
@@ -1819,19 +1248,10 @@ def parse_test_files_input(test_files_str: str, test_dir: Path) -> List[str]:
 def parse_args():
     """Parse command line arguments."""
     parser = argparse.ArgumentParser(
-        description="Run PyTorch NPU tests for a shard via per-case isolation"
-    )
-    parser.add_argument("--test-files", type=str, help="Comma-separated test file paths to run directly (skip shard assignment, e.g., 'test_meta.py,test_nn.py')")
-    parser.add_argument("--cases-json", type=str, help="Path to pre-collected cases JSON file (skip case collection, use test_type from JSON)")
-    parser.add_argument("--shard", type=int, help="Shard number (1-indexed, required if --test-files/--cases-json not set)")
-    parser.add_argument("--num-shards", type=int, help="Total number of shards (required if --test-files/--cases-json not set)")
-    parser.add_argument(
-        "--test-type",
-        type=str,
-        choices=["distributed", "regular"],
-        default="regular",
-        help="Test type (ignored if --test-files is set). distributed uses serial execution, regular uses concurrent.",
+        description="Run PyTorch NPU tests via per-case isolation pytest execution"
     )
+    parser.add_argument("--test-files", type=str, help="Comma-separated test file paths to run directly (e.g., 'test_meta.py,test_nn.py')")
+    parser.add_argument("--cases-json", type=str, help="Path to pre-collected cases JSON file")
     parser.add_argument("--test-dir", type=str, required=True, help="Path to PyTorch test directory")
     parser.add_argument("--disabled-testcases", type=str, help="Path to disabled_testcases.json")
     parser.add_argument("--report-dir", type=str, default="test-reports", help="Directory for reports")
@@ -1846,10 +1266,9 @@ def parse_args():
     parser.add_argument("--quick-test", type=int, default=None, help="Quick test mode: execute only N cases for fast verification (default: None, run all cases)")
     args = parser.parse_args()
 
-    # Validate required arguments based on mode
+    # Validate required arguments: must specify either --test-files or --cases-json
     if not args.test_files and not args.cases_json:
-        if not args.shard or not args.num_shards:
-            parser.error("--shard and --num-shards are required when --test-files/--cases-json is not set")
+        parser.error("Either --test-files or --cases-json must be specified")
 
     # Validate max_workers
     if args.max_workers < 1:
@@ -1875,7 +1294,6 @@ def main():
     report_dir.mkdir(parents=True, exist_ok=True)
 
     # Load modules
-    discover_module = load_discover_module(script_dir)
     result_module = load_parse_test_results_module(script_dir)
 
     timestamp = datetime.now().isoformat()
@@ -2154,169 +1572,9 @@ def main():
         # The actual test results are recorded in cases.json
         sys.exit(0)
 
-    # ==========================================================================
-    # Mode: Shard-based execution (original logic)
-    # ==========================================================================
-
-    # Validate shard number
-    if args.shard < 1 or args.shard > args.num_shards:
-        raise ValueError(f"Invalid shard {args.shard}; expected 1 <= shard <= {args.num_shards}")
-
-    shard_type = args.test_type
-    timestamp = datetime.now().isoformat()
-
-    # ==========================================================================
-    # Execute test planning
-    # ==========================================================================
-    plan_result = plan_shard_tests(
-        test_dir=test_dir,
-        shard=args.shard,
-        num_shards=args.num_shards,
-        test_type=shard_type,
-        discover_module=discover_module,
-    )
-    planned_tests = plan_result.get_planned_tests()
-
-    # ==========================================================================
-    # Create info dict
-    # ==========================================================================
-    info = result_module.create_shard_info(args.shard, args.num_shards, timestamp)
-    info.update(plan_result.to_info_dict())
-    info["shard_type"] = shard_type
-    info["disabled_count"] = result_module.load_disabled_testcases_count(args.disabled_testcases)
-    info["selected_test_files"] = plan_result.discovery.metadata.get("rules_selected", 0)
-    info["excluded_test_files"] = plan_result.discovery.metadata.get("rules_excluded", 0)
-    info["shard_files"] = plan_result.shard_assignment.planned_count
-
-    # Save test plan
-    result_module.save_test_plan_file(str(report_dir), args.shard, planned_tests, shard_type)
-
-    # Save excluded files (not assigned to this shard)
-    all_selected = plan_result.discovery.test_files
-    excluded_for_shard = [f for f in all_selected if f not in planned_tests]
-    result_module.save_excluded_test_files_file(str(report_dir), args.shard, excluded_for_shard, shard_type)
-
-    # Print summary
-    print(create_test_plan_summary(plan_result))
-    print(f"\nRepository root: {repo_root}")
-    print(f"Test directory: {test_dir}")
-    if shard_type == "distributed":
-        print("Execution mode: SERIAL (per-case subprocess isolation)")
-    else:
-        print(f"Execution mode: CONCURRENT ({args.max_workers} workers, per-case subprocess isolation)")
-    print(f"Disabled testcase entries: {info['disabled_count']}")
-    print(f"\n{'=' * 80}\n")
-
-    for index, target in enumerate(planned_tests, 1):
-        display_name = strip_test_prefix_and_suffix(target)
-        print(f"  [{index:03d}] {display_name}")
-
-    # Clean old files
-    clean_existing_junit_xml(report_dir)
-    remove_existing_file(result_module.get_shard_log_file(report_dir, args.shard, shard_type))
-
-    # Build execution env
-    env_updates = build_execution_env(
-        test_dir, script_dir, args.disabled_testcases, args.shard, shard_type
-    )
-
-    # ==========================================================================
-    # Execute tests - choose execution mode based on shard_type
-    # ==========================================================================
-    # distributed tests: serial execution (each case in own process, one at a time)
-    # regular tests: concurrent execution (max_workers subprocesses simultaneously)
-    cases_list = []
-    if planned_tests:
-        if shard_type == "distributed":
-            # Distributed tests: serial execution for stability
-            print("\nExecution mode: SERIAL (distributed tests require sequential execution)")
-            if args.quick_test:
-                print(f"Quick test mode: will execute up to {args.quick_test} cases")
-            returncode, duration, cases_list = run_tests_with_case_isolation(
-                planned_tests,
-                args.shard,
-                test_dir,
-                report_dir,
-                env_updates,
-                args.timeout,
-                args.verbose,
-                shard_type,
-                result_module,
-                args.quick_test,
-            )
-            info["execution_mode"] = "serial"
-        else:
-            # Regular tests: concurrent execution for efficiency
-            print(f"\nExecution mode: CONCURRENT ({args.max_workers} workers)")
-            if args.quick_test:
-                print(f"Quick test mode: will execute up to {args.quick_test} cases")
-            returncode, duration, cases_list = run_tests_with_concurrent_isolation(
-                planned_tests,
-                args.shard,
-                test_dir,
-                report_dir,
-                env_updates,
-                args.timeout,
-                args.verbose,
-                shard_type,
-                args.max_workers,
-                result_module,
-                args.quick_test,
-            )
-            info["execution_mode"] = "concurrent"
-            info["concurrent_workers"] = args.max_workers
-        info["per_case_isolation"] = True
-    else:
-        print("No test files assigned to this shard after file-level filtering.")
-        returncode = 0
-        duration = 0.0
-
-    # Build cases.json data
-    passed_count = sum(1 for c in cases_list if c["status"] == "passed")
-    failed_count = sum(1 for c in cases_list if c["status"] == "failed")
-    error_count = sum(1 for c in cases_list if c["status"] == "error")
-    timeout_count = sum(1 for c in cases_list if c["status"] == "timeout")
-    skipped_count = sum(1 for c in cases_list if c["status"] == "skipped")
-
-    cases_data = {
-        "shard": args.shard,
-        "shard_type": shard_type,
-        "execution_mode": info.get("execution_mode", "serial"),
-        "concurrent_workers": info.get("concurrent_workers", 1),
-        "total_cases": len(cases_list),
-        "passed": passed_count,
-        "failed": failed_count,
-        "errors": error_count,
-        "timeout": timeout_count,
-        "skipped": skipped_count,
-        "duration": duration,
-        "cases": cases_list,
-    }
-
-    # Save cases.json
-    result_module.save_cases_file(str(report_dir), args.shard, cases_data, shard_type)
-
-    # ==========================================================================
-    # Generate reports
-    # ==========================================================================
-    stats = {
-        "total": len(cases_list),
-        "passed": passed_count,
-        "failed": failed_count,
-        "skipped": skipped_count,
-        "errors": error_count,
-        "duration": duration,
-        "returncode": returncode,
-        "per_case_isolation": True,
-    }
-
-    result_module.save_info_file(str(report_dir), args.shard, info, shard_type)
-    result_module.save_stats_file(str(report_dir), args.shard, stats, shard_type)
-    result_module.print_stats_summary(args.shard, stats, shard_type)
-
-    # Exit with 0 to allow step to succeed and report generation to proceed
-    # The actual test results are recorded in cases.json
-    sys.exit(0)
+    # No valid mode specified (should not reach here due to argument validation)
+    print("ERROR: Either --test-files or --cases-json must be specified")
+    sys.exit(1)
 
 
 if __name__ == "__main__":
diff --git a/.github/workflows/_torch-npu-upstream-collect.yml b/.github/workflows/_torch-npu-upstream-collect.yml
index 5cd68f2627..bf57ab38c7 100644
--- a/.github/workflows/_torch-npu-upstream-collect.yml
+++ b/.github/workflows/_torch-npu-upstream-collect.yml
@@ -50,10 +50,6 @@ on:
         description: Total number of test cases
         value: ${{ jobs.collect.outputs.total_cases }}
 
-env:
-  # PyPI 缓存 URL（用于加速 pip 下载）
-  PYPI_CACHE_URL: 'http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple'
-
 jobs:
   collect:
     runs-on: linux-aarch64-a3-16
@@ -69,124 +65,13 @@ jobs:
       total_cases: ${{ steps.collect_and_shard.outputs.total_cases }}
 
     steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          repository: ${{ github.repository }}
-          ref: ${{ github.ref }}
-          fetch-depth: 1
-          path: ascend_pytorch
-
-      - name: Setup cache directories
-        run: |
-          mkdir -p /github/home/.cache/pip
-          chmod -R 777 /github/home/.cache
-
-      - name: Cache pip
-        uses: actions/cache@v4
-        with:
-          path: /github/home/.cache/pip
-          key: pip-arm-collect-py${{ inputs.python_version }}
-          restore-keys: |
-            pip-arm-collect-py${{ inputs.python_version }}-
-            pip-arm-collect-
-
-      - name: Download built torch wheel
-        uses: actions/download-artifact@v4
-        with:
-          name: ${{ inputs.torch_wheel_artifact }}
-          path: torch-wheel-artifact
-
-      - name: Download built torch_npu wheel
-        uses: actions/download-artifact@v4
-        with:
-          name: ${{ inputs.torch_npu_wheel_artifact }}
-          path: torch-npu-wheel-artifact
-
-      - name: Download PyTorch source and test code
-        uses: actions/download-artifact@v4
+      - name: Setup NPU test environment
+        uses: kerer-ai/pytorch/.github/actions/setup-npu-test-env@dev_master
         with:
-          name: ${{ inputs.pytorch_src_artifact }}
-          path: pytorch-src-artifact
-
-      - name: Extract PyTorch source
-        run: |
-          tar -xzf pytorch-src-artifact/pytorch-src.tar.gz
-
-      - name: Install built PyTorch and torch_npu
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-          PIP=pip${{ inputs.python_version }}
-          PYTHON=python${{ inputs.python_version }}
-          export PIP_CACHE_DIR=/github/home/.cache/pip
-
-          # Configure pip to use PyPI cache for faster downloads
-          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
-            $PIP config set global.index-url ${{ env.PYPI_CACHE_URL }}
-            $PIP config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local"
-            echo "pip index-url configured: ${{ env.PYPI_CACHE_URL }}"
-          fi
-
-          $PIP install --upgrade pip
-
-          # Install built torch wheel
-          TORCH_WHL=$(ls torch-wheel-artifact/*.whl | head -1)
-          $PIP install "${TORCH_WHL}"
-
-          # Install built torch_npu wheel
-          TORCH_NPU_WHL=$(ls torch-npu-wheel-artifact/*.whl | head -1)
-          $PIP install "${TORCH_NPU_WHL}"
-
-          echo "Installed PyTorch and torch_npu from built wheels"
-          echo "torch: ${TORCH_WHL}"
-          echo "torch_npu: ${TORCH_NPU_WHL}"
-
-      - name: Install test dependencies
-        run: |
-          PIP=pip${{ inputs.python_version }}
-          export PIP_CACHE_DIR=/github/home/.cache/pip
-          cd pytorch-src
-
-          # Core test dependencies
-          $PIP install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
-          $PIP install pytest-rerunfailures pytest-flakefinder
-          $PIP install 'pytest-subtests==0.13.1' 'xdoctest==1.1.0' 'pulp>=2.9'
-
-          # Optional dependencies for ONNX tests
-          # These are not in PyTorch requirements.txt but needed by specific tests
-          $PIP install onnxruntime onnxscript onnx-ir ml-dtypes || true
-
-          # torchvision for ONNX model tests (install without deps to bypass torch version check)
-          # PyPI torchvision requires exact torch version (torch==2.11.0), but we have dev build
-          # Use --no-deps to skip torch dependency, we already have our compiled torch installed
-          $PIP install numpy pillow || true
-          $PIP install torchvision --no-deps || true
-
-          # Other optional dependencies
-          $PIP install parameterized pandas || true
-          $PIP install opencv-python || true
-
-          # PyTorch requirements (if exists)
-          if [ -f requirements.txt ]; then
-            $PIP install -r requirements.txt || true
-          fi
-
-      - name: Verify NPU availability
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-          PYTHON=python${{ inputs.python_version }}
-          $PYTHON -c "
-          import torch
-          print(f'torch: {torch.__version__}')
-          import torch_npu
-          print(f'torch_npu: {torch_npu.__version__}')
-          print(f'NPU available: {torch.npu.is_available()}')
-          print(f'NPU count: {torch.npu.device_count()}')
-          "
+          python_version: ${{ inputs.python_version }}
+          torch_wheel_artifact: ${{ inputs.torch_wheel_artifact }}
+          torch_npu_wheel_artifact: ${{ inputs.torch_npu_wheel_artifact }}
+          pytorch_src_artifact: ${{ inputs.pytorch_src_artifact }}
 
       - name: Collect all test cases and shard
         id: collect_and_shard
diff --git a/.github/workflows/_torch-npu-upstream-test-custom.yml b/.github/workflows/_torch-npu-upstream-test-custom.yml
index 48fb6c9fe8..027eea0fb9 100644
--- a/.github/workflows/_torch-npu-upstream-test-custom.yml
+++ b/.github/workflows/_torch-npu-upstream-test-custom.yml
@@ -28,10 +28,6 @@ on:
         type: string
         description: Test files to run (comma-separated)
 
-env:
-  # PyPI 缓存 URL（用于加速 pip 下载）
-  PYPI_CACHE_URL: 'http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple'
-
 jobs:
   run_tests:
     name: test_custom
@@ -42,116 +38,13 @@ jobs:
       options: --user root
 
     steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          repository: ${{ github.repository }}
-          ref: ${{ github.ref }}
-          fetch-depth: 1
-          path: ascend_pytorch
-
-      - name: Setup cache directories
-        run: |
-          mkdir -p /github/home/.cache/pip
-          chmod -R 777 /github/home/.cache
-
-      - name: Cache pip
-        uses: actions/cache@v4
+      - name: Setup NPU test environment
+        uses: kerer-ai/pytorch/.github/actions/setup-npu-test-env@dev_master
         with:
-          path: /github/home/.cache/pip
-          key: pip-arm-custom-py${{ inputs.python_version }}
-          restore-keys: |
-            pip-arm-custom-py${{ inputs.python_version }}-
-
-      - name: Download built torch wheel
-        uses: actions/download-artifact@v4
-        with:
-          name: ${{ inputs.torch_wheel_artifact }}
-          path: torch-wheel-artifact
-
-      - name: Download built torch_npu wheel
-        uses: actions/download-artifact@v4
-        with:
-          name: ${{ inputs.torch_npu_wheel_artifact }}
-          path: torch-npu-wheel-artifact
-
-      - name: Download PyTorch source and test code
-        uses: actions/download-artifact@v4
-        with:
-          name: ${{ inputs.pytorch_src_artifact }}
-          path: pytorch-src-artifact
-
-      - name: Extract PyTorch source
-        run: |
-          tar -xzf pytorch-src-artifact/pytorch-src.tar.gz
-
-      - name: Install built PyTorch and torch_npu
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-          PIP=pip${{ inputs.python_version }}
-          export PIP_CACHE_DIR=/github/home/.cache/pip
-
-          # Configure pip to use PyPI cache for faster downloads
-          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
-            $PIP config set global.index-url ${{ env.PYPI_CACHE_URL }}
-            $PIP config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local"
-            echo "pip index-url configured: ${{ env.PYPI_CACHE_URL }}"
-          fi
-
-          $PIP install --upgrade pip
-
-          # Install built torch wheel
-          TORCH_WHL=$(ls torch-wheel-artifact/*.whl | head -1)
-          $PIP install "${TORCH_WHL}"
-
-          # Install built torch_npu wheel
-          TORCH_NPU_WHL=$(ls torch-npu-wheel-artifact/*.whl | head -1)
-          $PIP install "${TORCH_NPU_WHL}"
-
-          echo "Installed PyTorch and torch_npu from built wheels"
-
-      - name: Install test dependencies
-        run: |
-          PIP=pip${{ inputs.python_version }}
-          export PIP_CACHE_DIR=/github/home/.cache/pip
-
-          # Configure pip to use PyPI cache for faster downloads
-          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
-            $PIP config set global.index-url ${{ env.PYPI_CACHE_URL }} || true
-            $PIP config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local" || true
-          fi
-
-          $PIP install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
-          $PIP install pytest-rerunfailures pytest-flakefinder pytest-subtests==0.13.1 xdoctest==1.1.0 pulp>=2.9
-
-          # Optional dependencies for ONNX tests
-          $PIP install onnxruntime onnxscript onnx-ir ml-dtypes || true
-
-          # torchvision for ONNX model tests (install without deps to bypass torch version check)
-          $PIP install numpy pillow || true
-          $PIP install torchvision --no-deps || true
-
-          # Other optional dependencies
-          $PIP install parameterized pandas || true
-          $PIP install opencv-python || true
-
-          $PIP install -r pytorch-src/requirements.txt || true
-
-      - name: Verify NPU availability
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-          python${{ inputs.python_version }} - <<'PY'
-          import torch
-          import torch_npu
-          print(f'torch: {torch.__version__}')
-          print(f'torch_npu: {torch_npu.__version__}')
-          print(f'NPU available: {torch.npu.is_available()}')
-          print(f'NPU count: {torch.npu.device_count()}')
-          PY
+          python_version: ${{ inputs.python_version }}
+          torch_wheel_artifact: ${{ inputs.torch_wheel_artifact }}
+          torch_npu_wheel_artifact: ${{ inputs.torch_npu_wheel_artifact }}
+          pytorch_src_artifact: ${{ inputs.pytorch_src_artifact }}
 
       - name: Run custom test files
         id: run_tests
@@ -167,7 +60,7 @@ jobs:
             --test-files "${{ inputs.test_files }}" \
             --test-dir pytorch-src/test \
             --report-dir ${REPORT_DIR} \
-            --timeout 60 \
+            --timeout 600 \
             --verbose \
             2>&1 | tee /tmp/test_custom.log
 
diff --git a/.github/workflows/_torch-npu-upstream-test-dist.yml b/.github/workflows/_torch-npu-upstream-test-dist.yml
index e37eadb880..77eb07d0ec 100644
--- a/.github/workflows/_torch-npu-upstream-test-dist.yml
+++ b/.github/workflows/_torch-npu-upstream-test-dist.yml
@@ -32,10 +32,6 @@ on:
         type: string
         description: Number of distributed shards
 
-env:
-  # PyPI 缓存 URL（用于加速 pip 下载）
-  PYPI_CACHE_URL: 'http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple'
-
 jobs:
   run_tests:
     name: test_distributed (${{ matrix.shard }}/${{ inputs.distributed_shards }})
@@ -51,120 +47,13 @@ jobs:
       max-parallel: 2
 
     steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          repository: ${{ github.repository }}
-          ref: ${{ github.ref }}
-          fetch-depth: 1
-          path: ascend_pytorch
-
-      - name: Setup cache directories
-        run: |
-          mkdir -p /github/home/.cache/pip
-          chmod -R 777 /github/home/.cache
-
-      - name: Cache pip
-        uses: actions/cache@v4
-        with:
-          path: /github/home/.cache/pip
-          key: pip-arm-dist-py${{ inputs.python_version }}
-          restore-keys: |
-            pip-arm-dist-py${{ inputs.python_version }}-
-
-      - name: Download built torch wheel
-        uses: actions/download-artifact@v4
-        with:
-          name: ${{ inputs.torch_wheel_artifact }}
-          path: torch-wheel-artifact
-
-      - name: Download built torch_npu wheel
-        uses: actions/download-artifact@v4
+      - name: Setup NPU test environment
+        uses: kerer-ai/pytorch/.github/actions/setup-npu-test-env@dev_master
         with:
-          name: ${{ inputs.torch_npu_wheel_artifact }}
-          path: torch-npu-wheel-artifact
-
-      - name: Download PyTorch source and test code
-        uses: actions/download-artifact@v4
-        with:
-          name: ${{ inputs.pytorch_src_artifact }}
-          path: pytorch-src-artifact
-
-      - name: Extract PyTorch source
-        run: |
-          tar -xzf pytorch-src-artifact/pytorch-src.tar.gz
-
-      - name: Install built PyTorch and torch_npu
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-          PIP=pip${{ inputs.python_version }}
-          PYTHON=python${{ inputs.python_version }}
-          export PIP_CACHE_DIR=/github/home/.cache/pip
-
-          # Configure pip to use PyPI cache for faster downloads
-          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
-            $PIP config set global.index-url ${{ env.PYPI_CACHE_URL }}
-            $PIP config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local"
-            echo "pip index-url configured: ${{ env.PYPI_CACHE_URL }}"
-          fi
-
-          $PIP install --upgrade pip
-
-          # Install built torch wheel
-          TORCH_WHL=$(ls torch-wheel-artifact/*.whl | head -1)
-          $PIP install "${TORCH_WHL}"
-
-          # Install built torch_npu wheel
-          TORCH_NPU_WHL=$(ls torch-npu-wheel-artifact/*.whl | head -1)
-          $PIP install "${TORCH_NPU_WHL}"
-
-          echo "Installed PyTorch and torch_npu from built wheels"
-          echo "torch: ${TORCH_WHL}"
-          echo "torch_npu: ${TORCH_NPU_WHL}"
-
-      - name: Install test dependencies
-        run: |
-          PIP=pip${{ inputs.python_version }}
-          export PIP_CACHE_DIR=/github/home/.cache/pip
-          cd pytorch-src
-
-          $PIP install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
-          $PIP install pytest-rerunfailures pytest-flakefinder
-          $PIP install 'pytest-subtests==0.13.1' 'xdoctest==1.1.0' 'pulp>=2.9'
-
-          # Optional dependencies for ONNX tests
-          $PIP install onnxruntime onnxscript onnx-ir ml-dtypes || true
-
-          # torchvision for ONNX model tests (install without deps to bypass torch version check)
-          $PIP install numpy pillow || true
-          $PIP install torchvision --no-deps || true
-
-          # Other optional dependencies
-          $PIP install parameterized pandas || true
-          $PIP install opencv-python || true
-
-          if [ -f requirements.txt ]; then
-            $PIP install -r requirements.txt || true
-          fi
-
-      - name: Verify NPU availability
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-          PYTHON=python${{ inputs.python_version }}
-          $PYTHON -c "
-          import torch
-          print(f'torch: {torch.__version__}')
-          import torch_npu
-          print(f'torch_npu: {torch_npu.__version__}')
-          print(f'NPU available: {torch.npu.is_available()}')
-          print(f'NPU count: {torch.npu.device_count()}')
-          if torch.npu.is_available():
-              print(f'NPU name: {torch.npu.get_device_name(0)}')
-          "
+          python_version: ${{ inputs.python_version }}
+          torch_wheel_artifact: ${{ inputs.torch_wheel_artifact }}
+          torch_npu_wheel_artifact: ${{ inputs.torch_npu_wheel_artifact }}
+          pytorch_src_artifact: ${{ inputs.pytorch_src_artifact }}
 
       - name: Download cases shard JSONs
         uses: actions/download-artifact@v4
@@ -198,7 +87,8 @@ jobs:
             --cases-json "${CASES_JSON}" \
             --test-dir pytorch-src/test \
             --report-dir ${REPORT_DIR} \
-            --timeout 60 \
+            --quick-test 100 \
+            --timeout 600 \
             --verbose \
             2>&1 | tee /tmp/test_shard_dist_${{ matrix.shard }}.log
 
diff --git a/.github/workflows/_torch-npu-upstream-test-regular.yml b/.github/workflows/_torch-npu-upstream-test-regular.yml
index 75064a1126..c9c61128c4 100644
--- a/.github/workflows/_torch-npu-upstream-test-regular.yml
+++ b/.github/workflows/_torch-npu-upstream-test-regular.yml
@@ -32,10 +32,6 @@ on:
         type: string
         description: Number of regular shards
 
-env:
-  # PyPI 缓存 URL（用于加速 pip 下载）
-  PYPI_CACHE_URL: 'http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple'
-
 jobs:
   run_tests:
     name: test_regular (${{ matrix.shard }}/${{ inputs.regular_shards }})
@@ -51,120 +47,13 @@ jobs:
       max-parallel: 5
 
     steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          repository: ${{ github.repository }}
-          ref: ${{ github.ref }}
-          fetch-depth: 1
-          path: ascend_pytorch
-
-      - name: Setup cache directories
-        run: |
-          mkdir -p /github/home/.cache/pip
-          chmod -R 777 /github/home/.cache
-
-      - name: Cache pip
-        uses: actions/cache@v4
-        with:
-          path: /github/home/.cache/pip
-          key: pip-arm-regular-py${{ inputs.python_version }}
-          restore-keys: |
-            pip-arm-regular-py${{ inputs.python_version }}-
-
-      - name: Download built torch wheel
-        uses: actions/download-artifact@v4
-        with:
-          name: ${{ inputs.torch_wheel_artifact }}
-          path: torch-wheel-artifact
-
-      - name: Download built torch_npu wheel
-        uses: actions/download-artifact@v4
+      - name: Setup NPU test environment
+        uses: kerer-ai/pytorch/.github/actions/setup-npu-test-env@dev_master
         with:
-          name: ${{ inputs.torch_npu_wheel_artifact }}
-          path: torch-npu-wheel-artifact
-
-      - name: Download PyTorch source and test code
-        uses: actions/download-artifact@v4
-        with:
-          name: ${{ inputs.pytorch_src_artifact }}
-          path: pytorch-src-artifact
-
-      - name: Extract PyTorch source
-        run: |
-          tar -xzf pytorch-src-artifact/pytorch-src.tar.gz
-
-      - name: Install built PyTorch and torch_npu
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-          PIP=pip${{ inputs.python_version }}
-          PYTHON=python${{ inputs.python_version }}
-          export PIP_CACHE_DIR=/github/home/.cache/pip
-
-          # Configure pip to use PyPI cache for faster downloads
-          if [ -n "${{ env.PYPI_CACHE_URL }}" ]; then
-            $PIP config set global.index-url ${{ env.PYPI_CACHE_URL }}
-            $PIP config set global.trusted-host "cache-service.nginx-pypi-cache.svc.cluster.local"
-            echo "pip index-url configured: ${{ env.PYPI_CACHE_URL }}"
-          fi
-
-          $PIP install --upgrade pip
-
-          # Install built torch wheel
-          TORCH_WHL=$(ls torch-wheel-artifact/*.whl | head -1)
-          $PIP install "${TORCH_WHL}"
-
-          # Install built torch_npu wheel
-          TORCH_NPU_WHL=$(ls torch-npu-wheel-artifact/*.whl | head -1)
-          $PIP install "${TORCH_NPU_WHL}"
-
-          echo "Installed PyTorch and torch_npu from built wheels"
-          echo "torch: ${TORCH_WHL}"
-          echo "torch_npu: ${TORCH_NPU_WHL}"
-
-      - name: Install test dependencies
-        run: |
-          PIP=pip${{ inputs.python_version }}
-          export PIP_CACHE_DIR=/github/home/.cache/pip
-          cd pytorch-src
-
-          $PIP install pytest pytest-timeout pytest-xdist hypothesis zstandard pyyaml
-          $PIP install pytest-rerunfailures pytest-flakefinder
-          $PIP install 'pytest-subtests==0.13.1' 'xdoctest==1.1.0' 'pulp>=2.9'
-
-          # Optional dependencies for ONNX tests
-          $PIP install onnxruntime onnxscript onnx-ir ml-dtypes || true
-
-          # torchvision for ONNX model tests (install without deps to bypass torch version check)
-          $PIP install numpy pillow || true
-          $PIP install torchvision --no-deps || true
-
-          # Other optional dependencies
-          $PIP install parameterized pandas || true
-          $PIP install opencv-python || true
-
-          if [ -f requirements.txt ]; then
-            $PIP install -r requirements.txt || true
-          fi
-
-      - name: Verify NPU availability
-        run: |
-          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
-          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
-
-          PYTHON=python${{ inputs.python_version }}
-          $PYTHON -c "
-          import torch
-          print(f'torch: {torch.__version__}')
-          import torch_npu
-          print(f'torch_npu: {torch_npu.__version__}')
-          print(f'NPU available: {torch.npu.is_available()}')
-          print(f'NPU count: {torch.npu.device_count()}')
-          if torch.npu.is_available():
-              print(f'NPU name: {torch.npu.get_device_name(0)}')
-          "
+          python_version: ${{ inputs.python_version }}
+          torch_wheel_artifact: ${{ inputs.torch_wheel_artifact }}
+          torch_npu_wheel_artifact: ${{ inputs.torch_npu_wheel_artifact }}
+          pytorch_src_artifact: ${{ inputs.pytorch_src_artifact }}
 
       - name: Download cases shard JSONs
         uses: actions/download-artifact@v4
@@ -198,8 +87,9 @@ jobs:
             --cases-json "${CASES_JSON}" \
             --test-dir pytorch-src/test \
             --report-dir ${REPORT_DIR} \
-            --timeout 60 \
-            --max-workers 64 \
+            --timeout 600 \
+            --quick-test 100 \
+            --max-workers 16 \
             --verbose \
             2>&1 | tee /tmp/test_shard_reg_${{ matrix.shard }}.log
 

From d884663ddbd64223ad015ddc6542ff039f1c8945 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 14 May 2026 15:32:32 +0800
Subject: [PATCH 130/132] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dpytest=20nodeid?=
 =?UTF-8?q?=E6=94=B6=E9=9B=86=E9=80=BB=E8=BE=91=EF=BC=9A=E8=BF=87=E6=BB=A4?=
 =?UTF-8?q?=E9=9D=9E=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B=E7=AC=A6=E5=8F=B7?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

问题：收集逻辑只检查"::"存在，错误收集了非测试用例：
- @torch.library.register_fake("torchvision::nms") 包含::但不是测试用例

修复：添加严格过滤条件：
1. 必须包含 ".py::" (Python测试文件标识)
2. 不能以 "@" 开头 (装饰器/注册符号)
3. 不能以 "<" 开头 (pytest收集标记)
4. 不能包含 "(" (函数调用语法)

影响文件：
- collect_all_cases.py: collect_cases_for_file()
- run_npu_test_shard.py: collect_test_cases()

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/scripts/collect_all_cases.py  | 11 +++++++++--
 .github/scripts/run_npu_test_shard.py | 20 +++++++++-----------
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/.github/scripts/collect_all_cases.py b/.github/scripts/collect_all_cases.py
index 782fa3679f..1dd7ea9b3e 100644
--- a/.github/scripts/collect_all_cases.py
+++ b/.github/scripts/collect_all_cases.py
@@ -120,8 +120,15 @@ def collect_cases_for_file(test_file: str, test_dir: Path) -> Tuple[str, str, Li
 
         nodeids = []
         for line in result.stdout.splitlines():
-            if "::" in line and not line.strip().startswith("<"):
-                nodeids.append(line.strip())
+            stripped = line.strip()
+            # Filter valid pytest nodeids:
+            # 1. Must contain "::" (pytest nodeid separator)
+            # 2. Must contain ".py::" (indicates a Python test file)
+            # 3. Must not start with "@" (decorators/registrations, not test cases)
+            # 4. Must not start with "<" (pytest collection markers)
+            # 5. Must not contain function call syntax like "(" (non-nodeid symbols)
+            if "::" in stripped and ".py::" in stripped and not stripped.startswith("@") and not stripped.startswith("<") and "(" not in stripped:
+                nodeids.append(stripped)
 
         # Check for collection errors based on pytest exit codes:
         #   0: all passed (success)
diff --git a/.github/scripts/run_npu_test_shard.py b/.github/scripts/run_npu_test_shard.py
index 0d7d1ba4b4..03843b190d 100644
--- a/.github/scripts/run_npu_test_shard.py
+++ b/.github/scripts/run_npu_test_shard.py
@@ -452,17 +452,15 @@ def collect_test_cases(test_file: str, test_dir: Path, env: Dict) -> List[str]:
         # Parse nodeids from output
         nodeids = []
         for line in result.stdout.splitlines():
-            # pytest --collect-only outputs nodeids like:
-            # <Function test_grad>
-            # or with verbose:
-            # test_autograd.py::TestAutograd::test_grad
-            if "::" in line:
-                # Extract nodeid (remove leading spaces and markers)
-                nodeid = line.strip()
-                # Remove pytest markers like <Function ...>
-                if nodeid.startswith("<"):
-                    continue
-                nodeids.append(nodeid)
+            stripped = line.strip()
+            # Filter valid pytest nodeids:
+            # 1. Must contain "::" (pytest nodeid separator)
+            # 2. Must contain ".py::" (indicates a Python test file)
+            # 3. Must not start with "@" (decorators/registrations, not test cases)
+            # 4. Must not start with "<" (pytest collection markers)
+            # 5. Must not contain function call syntax like "(" (non-nodeid symbols)
+            if "::" in stripped and ".py::" in stripped and not stripped.startswith("@") and not stripped.startswith("<") and "(" not in stripped:
+                nodeids.append(stripped)
 
         return nodeids
 

From 36060558dce56102f3f57767a46808d10bf08bf0 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 14 May 2026 15:37:09 +0800
Subject: [PATCH 131/132] =?UTF-8?q?=E7=AE=80=E5=8C=96pytest=20nodeid?=
 =?UTF-8?q?=E8=A7=A3=E6=9E=90=E9=80=BB=E8=BE=91=EF=BC=9A=E5=88=A9=E7=94=A8?=
 =?UTF-8?q?-q=E6=A8=A1=E5=BC=8F=E7=9A=84=E6=A0=87=E5=87=86=E8=BE=93?=
 =?UTF-8?q?=E5=87=BA=E6=A0=BC=E5=BC=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pytest --collect-only -q 模式输出标准格式：
- 每行一个完整nodeid: test_file.py::TestClass::test_method
- 最后有统计信息: "X tests collected"

简化解析规则：
1. 跳过空行
2. 跳过包含 "collected"/"selected" 的统计行
3. 跳过以 "=" 开头的分隔线
4. 只检查 ".py::" 确保是Python测试文件

移除之前的复杂字符串匹配（检查@、<、(等），
因为-q模式输出已经很干净，不需要防御性过滤。

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/scripts/collect_all_cases.py  | 20 +++++++++++++-------
 .github/scripts/run_npu_test_shard.py | 20 +++++++++++++-------
 2 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/.github/scripts/collect_all_cases.py b/.github/scripts/collect_all_cases.py
index 1dd7ea9b3e..6773817dd9 100644
--- a/.github/scripts/collect_all_cases.py
+++ b/.github/scripts/collect_all_cases.py
@@ -121,13 +121,19 @@ def collect_cases_for_file(test_file: str, test_dir: Path) -> Tuple[str, str, Li
         nodeids = []
         for line in result.stdout.splitlines():
             stripped = line.strip()
-            # Filter valid pytest nodeids:
-            # 1. Must contain "::" (pytest nodeid separator)
-            # 2. Must contain ".py::" (indicates a Python test file)
-            # 3. Must not start with "@" (decorators/registrations, not test cases)
-            # 4. Must not start with "<" (pytest collection markers)
-            # 5. Must not contain function call syntax like "(" (non-nodeid symbols)
-            if "::" in stripped and ".py::" in stripped and not stripped.startswith("@") and not stripped.startswith("<") and "(" not in stripped:
+            # pytest --collect-only -q outputs clean nodeids, one per line
+            # Filter rules:
+            # 1. Skip empty lines
+            # 2. Skip summary lines (contain "collected" or "selected")
+            # 3. Skip separator lines (start with "=")
+            # 4. Must contain ".py::" to ensure it's a Python test file nodeid
+            if not stripped:
+                continue
+            if "collected" in stripped or "selected" in stripped:
+                continue
+            if stripped.startswith("="):
+                continue
+            if ".py::" in stripped:
                 nodeids.append(stripped)
 
         # Check for collection errors based on pytest exit codes:
diff --git a/.github/scripts/run_npu_test_shard.py b/.github/scripts/run_npu_test_shard.py
index 03843b190d..9d20171aaf 100644
--- a/.github/scripts/run_npu_test_shard.py
+++ b/.github/scripts/run_npu_test_shard.py
@@ -450,16 +450,22 @@ def collect_test_cases(test_file: str, test_dir: Path, env: Dict) -> List[str]:
                         print(f"      {line[:200]}")
 
         # Parse nodeids from output
+        # pytest --collect-only -q outputs clean nodeids, one per line
         nodeids = []
         for line in result.stdout.splitlines():
             stripped = line.strip()
-            # Filter valid pytest nodeids:
-            # 1. Must contain "::" (pytest nodeid separator)
-            # 2. Must contain ".py::" (indicates a Python test file)
-            # 3. Must not start with "@" (decorators/registrations, not test cases)
-            # 4. Must not start with "<" (pytest collection markers)
-            # 5. Must not contain function call syntax like "(" (non-nodeid symbols)
-            if "::" in stripped and ".py::" in stripped and not stripped.startswith("@") and not stripped.startswith("<") and "(" not in stripped:
+            # Filter rules:
+            # 1. Skip empty lines
+            # 2. Skip summary lines (contain "collected" or "selected")
+            # 3. Skip separator lines (start with "=")
+            # 4. Must contain ".py::" to ensure it's a Python test file nodeid
+            if not stripped:
+                continue
+            if "collected" in stripped or "selected" in stripped:
+                continue
+            if stripped.startswith("="):
+                continue
+            if ".py::" in stripped:
                 nodeids.append(stripped)
 
         return nodeids

From 936aa386eae9b2fa852917c5d4198f1d14da8560 Mon Sep 17 00:00:00 2001
From: wangsike <wangsike@huawei.com>
Date: Thu, 14 May 2026 15:44:40 +0800
Subject: [PATCH 132/132] =?UTF-8?q?=E9=87=8D=E6=9E=84run=5Fnpu=5Ftest=5Fsh?=
 =?UTF-8?q?ard.py=EF=BC=9A=E5=88=A0=E9=99=A4=E9=87=8D=E5=A4=8D=E7=9A=84?=
 =?UTF-8?q?=E7=94=A8=E4=BE=8B=E6=94=B6=E9=9B=86=E9=80=BB=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

问题：collect_test_cases()与collect_all_cases.py的collect_cases_for_file()功能重复

修改：
1. 删除collect_test_cases()函数（110行）
2. 删除run_tests_with_concurrent_isolation()函数（214行）
3. 添加import collect_all_cases
4. --test-files模式改为：
   - 调用collect_all_cases.collect_all_cases()收集用例
   - 构建CaseExecutionTask列表
   - 调用run_tests_with_tasks_concurrent()执行

统一执行流程：
- --cases-json模式：预收集用例 → run_tests_with_tasks_concurrent()
- --test-files模式：现场收集用例 → run_tests_with_tasks_concurrent()

代码减少291行（1584→1293）

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/scripts/run_npu_test_shard.py | 361 +++-----------------------
 1 file changed, 35 insertions(+), 326 deletions(-)

diff --git a/.github/scripts/run_npu_test_shard.py b/.github/scripts/run_npu_test_shard.py
index 9d20171aaf..d85bed23c7 100644
--- a/.github/scripts/run_npu_test_shard.py
+++ b/.github/scripts/run_npu_test_shard.py
@@ -58,6 +58,8 @@
 from time import monotonic
 from typing import Dict, List, Tuple
 
+import collect_all_cases
+
 
 # ==============================================================================
 # Import Result Parser Module
@@ -368,116 +370,6 @@ def load_installed_torch_root() -> str:
         return ""
 
 
-# ==============================================================================
-# Case Collection
-# ==============================================================================
-
-
-def collect_test_cases(test_file: str, test_dir: Path, env: Dict) -> List[str]:
-    """
-    Collect all test cases from a test file via pytest --collect-only.
-
-    Adds test file's parent directory to PYTHONPATH to enable
-    imports of sibling modules (e.g., 'from model_registry import MLPModule').
-
-    Args:
-        test_file: Test file path (e.g., "test/test_autograd.py")
-        test_dir: Path to PyTorch test directory
-        env: Environment dict for subprocess (will be modified for this call)
-
-    Returns:
-        List of case nodeids (e.g., ["test_autograd.py::TestAutograd::test_grad"])
-    """
-    # Strip test/ prefix if present
-    original_test_file = test_file
-    if test_file.startswith("test/"):
-        test_file = test_file[5:]
-
-    # Get test file's parent directory for PYTHONPATH
-    test_file_path = Path(test_file)
-    test_file_dir = test_dir / test_file_path.parent
-
-    # Build per-file environment with test file directory in PYTHONPATH
-    file_env = env.copy()
-    existing_pythonpath = file_env.get("PYTHONPATH", "")
-    file_env["PYTHONPATH"] = str(test_file_dir) + (":" + existing_pythonpath if existing_pythonpath else "")
-
-    command = [
-        sys.executable,
-        "-m",
-        "pytest",
-        "--collect-only",
-        "--quiet",
-        test_file,
-    ]
-
-    try:
-        result = subprocess.run(
-            command,
-            cwd=str(test_dir),
-            env=file_env,  # Use per-file environment with test file directory in PYTHONPATH
-            capture_output=True,
-            text=True,
-            encoding="utf-8",
-            errors="replace",
-            timeout=60,  # Collection timeout
-        )
-
-        # Check for collection errors based on pytest exit codes:
-        #   0: all passed (success)
-        #   2: pytest error (includes collection errors like ImportError)
-        #   3: all skipped (success)
-        #   4: command line error (error)
-        #   5: no tests collected (ERROR - test file should have cases)
-        # Key insight: if a test file is selected for execution, it should have cases.
-        # returncode 5 means 0 cases collected, which indicates a problem.
-        stdout_content = result.stdout.strip()
-
-        if result.returncode not in (0, 3):
-            # returncode 2, 4, 5: real collection error
-            # returncode 5 specifically means no tests collected - a problem for selected files
-            print(f"    WARNING: Collection errors for {test_file}:")
-            # Print relevant lines from stdout (pytest collection errors are in stdout)
-            stdout_lines = stdout_content.splitlines()
-            for line in stdout_lines[-20:]:
-                if line.strip():
-                    print(f"      {line[:200]}")
-            # Also print stderr if relevant
-            if result.stderr:
-                stderr_lines = result.stderr.strip().splitlines()
-                for line in stderr_lines[-10:]:
-                    if line.strip():
-                        print(f"      {line[:200]}")
-
-        # Parse nodeids from output
-        # pytest --collect-only -q outputs clean nodeids, one per line
-        nodeids = []
-        for line in result.stdout.splitlines():
-            stripped = line.strip()
-            # Filter rules:
-            # 1. Skip empty lines
-            # 2. Skip summary lines (contain "collected" or "selected")
-            # 3. Skip separator lines (start with "=")
-            # 4. Must contain ".py::" to ensure it's a Python test file nodeid
-            if not stripped:
-                continue
-            if "collected" in stripped or "selected" in stripped:
-                continue
-            if stripped.startswith("="):
-                continue
-            if ".py::" in stripped:
-                nodeids.append(stripped)
-
-        return nodeids
-
-    except subprocess.TimeoutExpired:
-        print(f"WARNING: Collection timeout for {test_file}")
-        return []
-    except Exception as e:
-        print(f"WARNING: Collection failed for {test_file}: {e}")
-        return []
-
-
 # ==============================================================================
 # Concurrent Case Execution
 # ==============================================================================
@@ -751,220 +643,6 @@ def log_writer_thread(log_queue: Queue, log_file: Path, stop_event: threading.Ev
                 log_handle.flush()
 
 
-def run_tests_with_concurrent_isolation(
-    planned_tests: List[str],
-    shard: int,
-    test_dir: Path,
-    report_dir: Path,
-    env_updates: Dict[str, str],
-    timeout: int,
-    verbose: bool,
-    shard_type: str,
-    max_workers: int,
-    result_module,
-    quick_test: int = None,
-) -> Tuple[int, float, List[Dict]]:
-    """
-    Execute tests with concurrent per-case isolation.
-
-    Each test case runs in its own pytest subprocess for crash isolation.
-    Up to max_workers subprocesses execute concurrently via ThreadPoolExecutor.
-
-    Core dumps in subprocess do NOT affect:
-    - The main Python process
-    - Other concurrent subprocesses
-    - Pending tasks in the queue
-
-    Args:
-        planned_tests: List of test file paths
-        shard: Shard number
-        test_dir: PyTorch test directory
-        report_dir: Report output directory
-        env_updates: Environment variable updates
-        timeout: Per-case timeout in seconds
-        verbose: Verbose output
-        shard_type: "distributed" or "regular"
-        max_workers: Maximum concurrent subprocesses (default: 4)
-        result_module: parse_test_results module
-        quick_test: Maximum number of cases to execute (None = all cases)
-
-    Returns:
-        Tuple of (worst_returncode, duration, cases_list_sorted)
-    """
-    start = monotonic()
-    log_file = result_module.get_shard_log_file(report_dir, shard, shard_type)
-
-    # Create junit_xmls directory for XML reports
-    junit_xml_dir = report_dir / "junit_xmls"
-    junit_xml_dir.mkdir(parents=True, exist_ok=True)
-
-    merged_env = os.environ.copy()
-    merged_env.update(env_updates)
-
-    config = ConcurrentExecutionConfig(
-        max_workers=max_workers,
-        per_case_timeout=timeout,
-        verbose=verbose,
-    )
-
-    # Thread-safe result aggregator
-    result_aggregator = ConcurrentResultAggregator()
-
-    # Log queue and writer thread
-    log_queue = Queue()
-    stop_event = threading.Event()
-    log_thread = threading.Thread(
-        target=log_writer_thread,
-        args=(log_queue, log_file, stop_event),
-        daemon=True,
-    )
-
-    # Write log header
-    log_queue.put({
-        "type": "header",
-        "content": (
-            "=" * 80 + "\n"
-            f"Concurrent per-case isolation pytest execution ({shard_type} shard)\n"
-            "=" * 80 + "\n"
-            f"Total test files: {len(planned_tests)}\n"
-            f"Max concurrent workers: {max_workers}\n"
-            "Execution mode: concurrent subprocess, each case isolated\n"
-            "=" * 80 + "\n\n"
-        ),
-    })
-
-    log_thread.start()
-
-    print(f"\n{'=' * 80}")
-    print(f"Concurrent per-case isolation mode: {len(planned_tests)} files")
-    print(f"Execution mode: {max_workers} workers concurrent, each case in subprocess")
-    print(f"{'=' * 80}\n")
-
-    # Phase 1: Collect all test cases (serial, as parsing test files)
-    all_tasks: List[CaseExecutionTask] = []
-    case_idx = 0
-
-    print("Phase 1: Collecting test cases...")
-    if quick_test:
-        print(f"  Quick test mode: will collect up to {quick_test} cases")
-
-    for file_idx, test_file in enumerate(planned_tests, 1):
-        # Quick test: stop collecting if already have enough cases
-        if quick_test and case_idx >= quick_test:
-            print(f"\n  Quick test limit reached ({quick_test} cases), stopping collection")
-            break
-
-        test_name = strip_test_prefix_and_suffix(test_file)
-        print(f"\n  [File {file_idx}/{len(planned_tests)}] Collecting: {test_name}")
-
-        case_nodeids = collect_test_cases(test_file, test_dir, merged_env)
-
-        if not case_nodeids:
-            print(f"    No cases collected")
-            continue
-
-        print(f"    Collected {len(case_nodeids)} cases")
-
-        for nodeid in case_nodeids:
-            case_idx += 1
-            all_tasks.append(CaseExecutionTask(
-                case_idx=case_idx,
-                nodeid=nodeid,
-                test_file=test_file,
-                file_idx=file_idx,
-            ))
-
-            # Quick test: stop collecting if reached limit
-            if quick_test and case_idx >= quick_test:
-                print(f"    Quick test limit reached ({quick_test} cases)")
-                break
-
-    total_cases = len(all_tasks)
-    print(f"\n{'=' * 80}")
-    print(f"Phase 2: Concurrent execution with {max_workers} workers")
-    print(f"Total cases to execute: {total_cases}")
-    print(f"{'=' * 80}\n")
-
-    # Phase 2: Concurrent execution via ThreadPoolExecutor
-    progress_tracker = ProgressTracker(total_cases)
-
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        # Submit all tasks
-        future_to_task = {
-            executor.submit(
-                run_single_case_concurrent,
-                task,
-                test_dir,
-                merged_env,
-                config,
-                result_aggregator,
-                progress_tracker,
-                log_queue,
-                report_dir,
-                shard,
-                shard_type,
-            ): task
-            for task in all_tasks
-        }
-
-        # Wait for completion (as_completed gives results as they finish)
-        for future in as_completed(future_to_task):
-            task = future_to_task[future]
-            try:
-                # Result already collected in aggregator
-                _ = future.result()
-            except Exception as e:
-                # Should never happen (run_single_case_concurrent catches all)
-                # But as safety, create error result
-                case_result = {
-                    "nodeid": task.nodeid,
-                    "status": "error",
-                    "duration": 0.0,
-                    "returncode": 1,
-                    "message": f"Future error: {str(e)[:200]}",
-                    "file": task.test_file,
-                    "case_idx": task.case_idx,
-                }
-                result_aggregator.add_case_result(case_result)
-                progress_tracker.mark_completed(task.nodeid, "error", 0.0)
-
-    # Stop log thread
-    elapsed = monotonic() - start
-    summary = result_aggregator.get_summary()
-
-    log_queue.put({
-        "type": "summary",
-        "content": (
-            f"\n{'=' * 80}\n"
-            f"Summary: {summary['total_cases']} cases executed\n"
-            f"  Passed: {summary['passed_count']}\n"
-            f"  Failed: {summary['failed_count']}\n"
-            f"  Errors: {summary['error_count']}\n"
-            f"  Timeout: {summary['timeout_count']}\n"
-            f"  Skipped: {summary['skipped_count']}\n"
-            f"  Duration: {elapsed:.2f}s\n"
-            f"  Concurrent workers: {max_workers}\n"
-            f"{'=' * 80}\n"
-        ),
-    })
-
-    stop_event.set()
-    log_thread.join(timeout=5)
-
-    # Print final summary
-    print(f"\n{'=' * 80}", flush=True)
-    print(f"Summary: {summary['total_cases']} cases executed", flush=True)
-    print(f"  Passed: {summary['passed_count']}", flush=True)
-    print(f"  Failed: {summary['failed_count']}", flush=True)
-    print(f"  Errors: {summary['error_count']}", flush=True)
-    print(f"  Timeout: {summary['timeout_count']}", flush=True)
-    print(f"  Skipped: {summary['skipped_count']}", flush=True)
-    print(f"  Duration: {elapsed:.2f}s", flush=True)
-    print(f"{'=' * 80}", flush=True)
-
-    return summary["worst_returncode"], elapsed, result_aggregator.get_sorted_cases()
-
-
 def run_tests_with_tasks_concurrent(
     tasks: List[CaseExecutionTask],
     shard: int,
@@ -1355,9 +1033,37 @@ def main():
         # Execute tests (custom mode uses concurrent execution by default)
         cases_list = []
         if planned_tests:
-            # Custom mode: concurrent execution for efficiency
-            returncode, duration, cases_list = run_tests_with_concurrent_isolation(
+            # Phase 1: Collect all test cases using collect_all_cases module
+            print("\nPhase 1: Collecting test cases...")
+            error_log_dir = report_dir / "collection_errors"
+            collected_cases = collect_all_cases.collect_all_cases(
                 planned_tests,
+                test_dir,
+                error_log_dir,
+                parallel=16,
+            )
+
+            # Apply quick_test limit if specified
+            if args.quick_test and len(collected_cases) > args.quick_test:
+                collected_cases = collected_cases[:args.quick_test]
+                print(f"  Quick test mode: using only {args.quick_test} cases")
+
+            total_cases = len(collected_cases)
+            print(f"\nPhase 2: Executing {total_cases} cases with {args.max_workers} workers")
+
+            # Build CaseExecutionTask list
+            tasks = []
+            for i, case in enumerate(collected_cases, 1):
+                tasks.append(CaseExecutionTask(
+                    case_idx=i,
+                    nodeid=case["nodeid"],
+                    test_file=case["file"],
+                    file_idx=0,  # Not needed for pre-collected cases
+                ))
+
+            # Phase 2: Execute cases using run_tests_with_tasks_concurrent
+            returncode, duration, cases_list = run_tests_with_tasks_concurrent(
+                tasks,
                 shard,
                 test_dir,
                 report_dir,
@@ -1373,6 +1079,9 @@ def main():
             info["concurrent_workers"] = args.max_workers
             info["returncode"] = returncode
             info["duration"] = duration
+        else:
+            returncode = 0
+            duration = 0.0
 
         # Build cases.json data
         passed_count = sum(1 for c in cases_list if c["status"] == "passed")