From ca278d848efa390bc1e20f3a922e151e4dfebf08 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Thu, 26 Jun 2025 16:18:57 +0200 Subject: [PATCH 01/20] Use dict.items --- test/easyblocks/easyblock_specific.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/easyblocks/easyblock_specific.py b/test/easyblocks/easyblock_specific.py index e18c7b93ee1..2c2de72ec42 100644 --- a/test/easyblocks/easyblock_specific.py +++ b/test/easyblocks/easyblock_specific.py @@ -495,8 +495,8 @@ def test_translate_lammps_version(self): '29Aug2024_update2': '2024.08.29', '28Oct2024': '2024.10.28', } - for key in lammps_versions: - self.assertEqual(lammps.translate_lammps_version(key), lammps_versions[key]) + for key, expected_version in lammps_versions.items(): + self.assertEqual(lammps.translate_lammps_version(key), expected_version) version_file = os.path.join(self.tmpdir, 'src', 'version.h') version_txt = '\n'.join([ From 3ff46c0d4cb350768976b843d4909a3f014b3328 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Thu, 26 Jun 2025 15:21:40 +0200 Subject: [PATCH 02/20] Allow rerun and skipped tests --- easybuild/easyblocks/p/pytorch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index fdc787155c6..a53408ace36 100755 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -905,8 +905,8 @@ def parse_test_cases(test_suite_el: ET.Element) -> List[TestCase]: num_reruns = len(testcase.findall("rerun")) if skipped: - if num_reruns > 0 or failed or errored: - raise ValueError(f"Invalid state for testcase '{test_name}'") + if failed or errored: + raise ValueError(f"Invalid state for testcase '{test_name}': Both skipped and failed/errored") state = TestState.SKIPPED else: state = TestState.FAILURE if failed else TestState.ERROR if errored else TestState.SUCCESS From a609de210efbc817b37f6205ce1c94c44f12394a Mon Sep 17 00:00:00 2001 From: Alexander 
Grund Date: Thu, 26 Jun 2025 15:22:26 +0200 Subject: [PATCH 03/20] Fix trimming test case name The code parses class names if they start with the prefix 'test.' and then trims a prefix consisting of the common part. That common part specifically excludes the 'test.' part of the prefix which hence needs to be re-added to match with `startswith`. --- easybuild/easyblocks/p/pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index a53408ace36..93f09dc76a0 100755 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -964,7 +964,7 @@ def extract_path(classname: str) -> str: # We can remove possible class names by only using the common part suite_name = os.path.commonpath(possible_paths) # Strip of common prefix to all classes, but keep the last part for uniqueness - non_classname_prefix = os.path.dirname(suite_name).replace(os.path.sep, '.') + '.' + non_classname_prefix = 'test.' + os.path.dirname(suite_name).replace(os.path.sep, '.') + '.' for testcase in test_cases: classname = testcase.attrib["classname"] if classname.startswith(non_classname_prefix): From f3699b48608b2e9f435d68c1ce30bffcb6a607a6 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Thu, 26 Jun 2025 15:54:06 +0200 Subject: [PATCH 04/20] Add test for PyTorch log parsing Select some test reports from real runs, trim them down a bit and parse them in the test. Provide script for automatic cleanup of test reports. 
--- test/easyblocks/easyblock_specific.py | 155 ++++++++++++++++++ test/pytorch_test_logs/README.md | 5 + test/pytorch_test_logs/cleanup_files.py | 125 ++++++++++++++ .../consistency/test_name/test_name-1.xml | 15 ++ .../sync.skip.test_foo-1.xml | 7 + .../duplicate/test_name/test_name-1.xml | 7 + .../TEST-foo.test_name.TestName-1.xml | 7 + .../multi_file/test_name/TEST-Name-1.xml | 8 + .../no_tests/test_name/test_name-1.xml | 6 + .../root/test_name/test_name-1.xml | 3 + .../skip_and_failed/test_name/test_name-1.xml | 9 + ...s.quantization.test_quantization-1d671.xml | 6 + ...s.quantization.test_quantization-78879.xml | 6 + ...s.quantization.test_quantization-5f224.xml | 6 + ...s.quantization.test_quantization-d5cb5.xml | 6 + .../backends.xeon.test_launch-1.xml | 26 +++ .../backends.xeon.test_launch-2.xml | 8 + .../backends.xeon.test_launch-3.xml | 6 + ...tributed.tensor.test_dtensor_ops-2fe9b.xml | 17 ++ .../dynamo.test_dynamic_shapes-189f6.xml | 20 +++ .../dynamo.test_dynamic_shapes-266ee.xml | 16 ++ .../dynamo.test_dynamic_shapes-3f6e0.xml | 13 ++ .../dynamo.test_misc-18930.xml | 11 ++ .../dynamo.test_misc-86d5b.xml | 16 ++ .../dynamo.test_misc-d062d.xml | 20 +++ .../python-pytest/run_test/run_test.xml | 11 ++ .../test_nestedtensor-671fe.xml | 8 + .../test_nestedtensor-8e17a.xml | 26 +++ .../test_quantization-3146b.xml | 6 + .../test_quantization-97a67.xml | 36 ++++ .../TEST-jit.test_builtins.TestBuiltins-1.xml | 4 + .../test_autoload/TEST-TestBackend-1.xml | 4 + .../test_autoload/TEST-TestBackend-2.xml | 7 + 33 files changed, 626 insertions(+) create mode 100644 test/pytorch_test_logs/README.md create mode 100755 test/pytorch_test_logs/cleanup_files.py create mode 100644 test/pytorch_test_logs/faulty-reports/consistency/test_name/test_name-1.xml create mode 100644 test/pytorch_test_logs/faulty-reports/different_file_name/sync.skip.test_name/sync.skip.test_foo-1.xml create mode 100644 test/pytorch_test_logs/faulty-reports/duplicate/test_name/test_name-1.xml 
create mode 100644 test/pytorch_test_logs/faulty-reports/file_attribute/test_name/TEST-foo.test_name.TestName-1.xml create mode 100644 test/pytorch_test_logs/faulty-reports/multi_file/test_name/TEST-Name-1.xml create mode 100644 test/pytorch_test_logs/faulty-reports/no_tests/test_name/test_name-1.xml create mode 100644 test/pytorch_test_logs/faulty-reports/root/test_name/test_name-1.xml create mode 100644 test/pytorch_test_logs/faulty-reports/skip_and_failed/test_name/test_name-1.xml create mode 100644 test/pytorch_test_logs/test-reports/dist-gloo-init-env/distr.algorithms.quantization.test_quantization/distr.algorithms.quantization.test_quantization-1d671.xml create mode 100644 test/pytorch_test_logs/test-reports/dist-gloo-init-file/distr.algorithms.quantization.test_quantization/distr.algorithms.quantization.test_quantization-78879.xml create mode 100644 test/pytorch_test_logs/test-reports/dist-nccl-init-env/distr.algorithms.quantization.test_quantization/distr.algorithms.quantization.test_quantization-5f224.xml create mode 100644 test/pytorch_test_logs/test-reports/dist-nccl-init-file/distr.algorithms.quantization.test_quantization/distr.algorithms.quantization.test_quantization-d5cb5.xml create mode 100644 test/pytorch_test_logs/test-reports/python-pytest/backends.xeon.test_launch/backends.xeon.test_launch-1.xml create mode 100644 test/pytorch_test_logs/test-reports/python-pytest/backends.xeon.test_launch/backends.xeon.test_launch-2.xml create mode 100644 test/pytorch_test_logs/test-reports/python-pytest/backends.xeon.test_launch/backends.xeon.test_launch-3.xml create mode 100644 test/pytorch_test_logs/test-reports/python-pytest/distributed.tensor.test_dtensor_ops/distributed.tensor.test_dtensor_ops-2fe9b.xml create mode 100644 test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_dynamic_shapes/dynamo.test_dynamic_shapes-189f6.xml create mode 100644 
test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_dynamic_shapes/dynamo.test_dynamic_shapes-266ee.xml create mode 100644 test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_dynamic_shapes/dynamo.test_dynamic_shapes-3f6e0.xml create mode 100644 test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_misc/dynamo.test_misc-18930.xml create mode 100644 test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_misc/dynamo.test_misc-86d5b.xml create mode 100644 test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_misc/dynamo.test_misc-d062d.xml create mode 100644 test/pytorch_test_logs/test-reports/python-pytest/run_test/run_test.xml create mode 100644 test/pytorch_test_logs/test-reports/python-pytest/test_nestedtensor/test_nestedtensor-671fe.xml create mode 100644 test/pytorch_test_logs/test-reports/python-pytest/test_nestedtensor/test_nestedtensor-8e17a.xml create mode 100644 test/pytorch_test_logs/test-reports/python-pytest/test_quantization/test_quantization-3146b.xml create mode 100644 test/pytorch_test_logs/test-reports/python-pytest/test_quantization/test_quantization-97a67.xml create mode 100644 test/pytorch_test_logs/test-reports/python-unittest/jit.test_builtins/TEST-jit.test_builtins.TestBuiltins-1.xml create mode 100644 test/pytorch_test_logs/test-reports/python-unittest/test_autoload/TEST-TestBackend-1.xml create mode 100644 test/pytorch_test_logs/test-reports/python-unittest/test_autoload/TEST-TestBackend-2.xml diff --git a/test/easyblocks/easyblock_specific.py b/test/easyblocks/easyblock_specific.py index 2c2de72ec42..023087b8499 100644 --- a/test/easyblocks/easyblock_specific.py +++ b/test/easyblocks/easyblock_specific.py @@ -35,6 +35,7 @@ import tempfile import textwrap from io import StringIO +from pathlib import Path from unittest import TestLoader, TextTestRunner from test.easyblocks.module import cleanup @@ -42,6 +43,7 @@ import easybuild.easyblocks.generic.pythonpackage as pythonpackage import 
easybuild.easyblocks.l.lammps as lammps import easybuild.easyblocks.p.python as python +import easybuild.easyblocks.p.pytorch as pytorch from easybuild.base.testing import TestCase from easybuild.easyblocks.generic.cmakemake import det_cmake_version from easybuild.easyblocks.generic.toolchain import Toolchain @@ -508,6 +510,159 @@ def test_translate_lammps_version(self): self.assertEqual(lammps.translate_lammps_version('d3adb33f', path=self.tmpdir), '2025.04.02') self.assertEqual(lammps.translate_lammps_version('devel', path=self.tmpdir), '2025.04.02') + def test_pytorch_test_log_parsing(self): + """Verify parsing of XML files produced by PyTorch tests.""" + TestState = pytorch.TestState + + test_log_dir = Path(__file__).parent.parent / 'pytorch_test_logs' + + results = pytorch.get_test_results(test_log_dir / 'test-reports') + results2 = pytorch.get_test_results(test_log_dir) + self.assertEqual(results.keys(), results2.keys()) + for name, suite in results.items(): + self.assertEqual((name, suite.summary), (name, results2[name].summary)) + del results2 + + self.assertEqual(len(results), 13) + + # 2 small test suites used as a smoke test using most features + self.assertIn('backends/xeon/test_launch', results) + suite = results['backends/xeon/test_launch'] + self.assertEqual((suite.errors, suite.failures, suite.num_tests, suite.skipped), (1, 2, 8, 3)) + # Failure in one file, success in the other --> Success + self.assertEqual(suite['TestTorchrun.test_cpu_info'].state, TestState.SUCCESS) + # New in 2nd file + self.assertEqual(suite['TestTorchrun.test_multi_threads'].state, TestState.SUCCESS) + self.assertEqual(suite['TestTorchrun.test_reshape_cpu_float64'].state, TestState.FAILURE) + self.assertEqual(suite['TestTorchrun.test_foo'].state, TestState.SKIPPED) + self.assertEqual(suite['TestTorchrun.test_bar'].state, TestState.ERROR) + self.assertEqual(suite.get_errored_tests(), ['TestTorchrun.test_bar']) + self.assertEqual(suite.get_failed_tests(), 
['TestTorchrun.test_reshape_cpu_float64', 'TestTorchrun.test_baz']) + self.assertIn('test_autoload', results) + suite = results['test_autoload'] + self.assertEqual((suite.errors, suite.failures, suite.num_tests, suite.skipped), (0, 0, 2, 1)) + self.assertEqual(suite['TestBackendAutoload.test_autoload'].state, TestState.SUCCESS) + self.assertEqual(suite['TestBackendAutoload.test_unload'].state, TestState.SKIPPED) + + # Verify summaries which should be enough to catch most issues + report = '\n'.join(sorted(f'{suite.name}: {suite.summary}' for suite in results.values())) + self.assertEqual(report, textwrap.dedent(""" + backends/xeon/test_launch: 2 failed, 2 passed, 3 skipped, 1 errors + dist-gloo-init-env/distr/algorithms/quantization/test_quantization: 0 failed, 1 passed, 0 skipped, 0 errors + dist-gloo-init-file/distr/algorithms/quantization/test_quantization: 0 failed, 1 passed, 0 skipped, 0 errors + dist-nccl-init-env/distr/algorithms/quantization/test_quantization: 0 failed, 1 passed, 0 skipped, 0 errors + dist-nccl-init-file/distr/algorithms/quantization/test_quantization: 0 failed, 1 passed, 0 skipped, 0 errors + dist/foo/bar: 0 failed, 4 passed, 0 skipped, 0 errors + distributed/tensor/test_dtensor_ops: 0 failed, 2 passed, 2 skipped, 0 errors + dynamo/test_dynamic_shapes: 3 failed, 14 passed, 0 skipped, 0 errors + dynamo/test_misc: 1 failed, 9 passed, 0 skipped, 0 errors + jit/test_builtins: 0 failed, 1 passed, 0 skipped, 0 errors + test_autoload: 0 failed, 1 passed, 1 skipped, 0 errors + test_nestedtensor: 3 failed, 2 passed, 3 skipped, 1 errors + test_quantization: 0 failed, 12 passed, 5 skipped, 0 errors + """).strip()) + tests = '\n'.join(sorted(f'{test.name}: {test.state.value}' + for suite in results.values() + for test in suite.get_tests())) + self.assertEqual(tests, textwrap.dedent(""" + DistQuantizationTests.test_all_gather_fp16: success + DistQuantizationTests.test_all_gather_fp16: success + DistQuantizationTests.test_all_gather_fp16: success + 
DistQuantizationTests.test_all_gather_fp16: success + DynamicShapesCtxManagerTests.test_autograd_profiler_dynamic_shapes: success + DynamicShapesCtxManagerTests.test_generic_context_manager_with_graph_break_dynamic_shapes: success + DynamicShapesCtxManagerTests.test_generic_ctx_manager_with_graph_break_dynamic_shapes: success + DynamicShapesMiscTests.test_outside_linear_module_free_dynamic_shapes: failure + DynamicShapesMiscTests.test_packaging_version_parse_dynamic_shapes: success + DynamicShapesMiscTests.test_pair_dynamic_shapes: success + DynamicShapesMiscTests.test_param_shape_binops_dynamic_shapes: success + DynamicShapesMiscTests.test_parameter_free_dynamic_shapes: failure + DynamicShapesMiscTests.test_patched_builtin_functions_dynamic_shapes: success + DynamicShapesMiscTests.test_proxy_frozen_dataclass_dynamic_shapes: success + DynamicShapesMiscTests.test_pt2_compliant_ops_are_allowed_dynamic_shapes: success + DynamicShapesMiscTests.test_pt2_compliant_overload_dynamic_shapes: success + DynamicShapesMiscTests.test_pure_python_accumulate_dynamic_shapes: success + DynamicShapesMiscTests.test_py_guards_mark_dynamic_dynamic_shapes: success + DynamicShapesMiscTests.test_python_slice_dynamic_shapes: success + DynamicShapesMiscTests.test_pytree_tree_flatten_unflatten_dynamic_shapes: success + DynamicShapesMiscTests.test_pytree_tree_leaves_dynamic_shapes: failure + MiscTests.test_packaging_version_parse: success + MiscTests.test_pair: success + MiscTests.test_param_shape_binops: success + MiscTests.test_parameter_free: failure + MiscTests.test_pytree_tree_map: success + MiscTests.test_shape_env_no_recording: success + MiscTests.test_shape_env_recorded_function_fallback: success + MiscTests.test_yield_from_in_a_loop: success + TestBackendAutoload.test_autoload: success + TestBackendAutoload.test_unload: skipped + TestBuiltins.test_name: success + TestCustomFunction.test_autograd_function_with_matmul_folding_at_output: success + 
TestDTensorOpsCPU.test_dtensor_op_db_H_cpu_float16: success + TestDTensorOpsCPU.test_dtensor_op_db_H_cpu_float32: success + TestDTensorOpsCPU.test_dtensor_op_db_H_cpu_float64: skipped + TestDTensorOpsCPU.test_dtensor_op_db_H_cpu_int8: skipped + TestDynamicQuantizedOps.test_qrnncell: success + TestFakeQuantizeOps.test_backward_per_channel: skipped + TestFakeQuantizeOps.test_backward_per_channel_cachemask_cpu: success + TestFakeQuantizeOps.test_backward_per_channel_cachemask_cuda: success + TestName.test_bar: success + TestNestedTensor.test_bmm_cuda_gpu_float16: failure + TestNestedTensor.test_bmm_cuda_gpu_float32: failure + TestNestedTensor.test_bmm_cuda_gpu_float64: error + TestNestedTensor.test_cat: success + TestNestedTensor.test_copy_: success + TestNestedTensor.test_reshape_cpu_float16: skipped + TestNestedTensor.test_reshape_cpu_float32: skipped + TestNestedTensor.test_reshape_cpu_float64: failure + TestNestedTensorSubclassCPU.test_linear_backward_memory_usage_cpu_float32: skipped + TestNumericDebugger.test_quantize_pt2e_preserve_handle: success + TestNumericDebugger.test_re_export_preserve_handle: success + TestPadding.test_reflection_pad1d: success + TestQuantizedConv.test_conv_reorder_issue_onednn: success + TestQuantizedConv.test_conv_transpose_reorder_issue_onednn: success + TestQuantizedFunctionalOps.test_relu_api: success + TestQuantizedLinear.test_qlinear_cudnn: skipped + TestQuantizedLinear.test_qlinear_gelu_pt2e: success + TestQuantizedOps.test_adaptive_avg_pool2d_nhwc: success + TestQuantizedOps.test_adaptive_avg_pool: skipped + TestQuantizedOps.test_qadd_relu_cudnn: skipped + TestQuantizedOps.test_qadd_relu_cudnn_nhwc: skipped + TestQuantizedOps.test_qadd_relu_different_qparams: success + TestTorchrun.test_bar: error + TestTorchrun.test_baz: failure + TestTorchrun.test_cpu_info: success + TestTorchrun.test_foo2: skipped + TestTorchrun.test_foo3: skipped + TestTorchrun.test_foo: skipped + TestTorchrun.test_multi_threads: success + 
TestTorchrun.test_reshape_cpu_float64: failure + TestTracer.test_jit_save: success + bar.test_2.test_func3: success + bar.test_foo.TestBar.test_func2: success + bar.test_foo.TestName.test_func1: success + """).strip()) + + # Some error cases + error_log_dir = test_log_dir / 'faulty-reports' + + self.assertErrorRegex(ValueError, " or ", + pytorch.get_test_results, error_log_dir / 'root') + self.assertErrorRegex(ValueError, "multiple reported files", + pytorch.get_test_results, error_log_dir / 'multi_file') + self.assertErrorRegex(ValueError, "Path from folder and filename should be equal", + pytorch.get_test_results, error_log_dir / 'different_file_name') + self.assertErrorRegex(ValueError, "Unexpected file attribute", + pytorch.get_test_results, error_log_dir / 'file_attribute') + self.assertErrorRegex(ValueError, "Invalid state", + pytorch.get_test_results, error_log_dir / 'skip_and_failed') + self.assertErrorRegex(ValueError, "no test", + pytorch.get_test_results, error_log_dir / 'no_tests') + self.assertErrorRegex(ValueError, "Invalid test count", + pytorch.get_test_results, error_log_dir / 'consistency') + self.assertErrorRegex(ValueError, "Duplicate test", + pytorch.get_test_results, error_log_dir / 'duplicate') + def suite(loader): """Return all easyblock-specific tests.""" diff --git a/test/pytorch_test_logs/README.md b/test/pytorch_test_logs/README.md new file mode 100644 index 00000000000..7191e9e8815 --- /dev/null +++ b/test/pytorch_test_logs/README.md @@ -0,0 +1,5 @@ +# PyTorch test result files + +This folder contains files as written by the PyTorch test step (via `unittest-xml-reporting`) to be used in tests of the parsing in the PyTorch easyblock. + +Most files are simplified or constructed in a way to reproduce a specific corner case of the parser or format. 
diff --git a/test/pytorch_test_logs/cleanup_files.py b/test/pytorch_test_logs/cleanup_files.py new file mode 100755 index 00000000000..9578f6f2532 --- /dev/null +++ b/test/pytorch_test_logs/cleanup_files.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 + +"""This script strips content and filenames of PyTorch test result XML files in a deterministic way and formats them. +The intent is to keep the general structure of the files but still make them shorter and easier to read. + +Usage: Pass the target directory as the single argument or +run this script to format the XML files in the "full" directory next to the script. +""" + +import re +import subprocess +import sys +from hashlib import md5 +from pathlib import Path + + +def shorten_filename(path: Path) -> Path: + """Shorten the file name by truncating random part of e.g. test_quantization-d1303cbc2b57cf06.xml""" + match = re.search(r'-(?P<hash>[a-z0-9]{6,})\.xml$', path.name) + if match: + fixed_part: str = path.name[:match.start()] + short_hash = match['hash'][:5] + new_name: Path = path.with_name(f"{fixed_part}-{short_hash}.xml") + path.rename(new_name) + return new_name + return path + + +def shorten_content(path: Path): + """Shorten attribute values and tag content (stdout, stderr, etc.) 
in the XML file.""" + content: str = path.read_text(encoding='utf-8') + + # Shorten messages in tags: + content = re.sub(r'message="[^"]+"', 'message="..."', content) + # Shorten time + content = re.sub(r'time="[^"]+"', 'time="4.2"', content) + # Ignore timestamp & hostname + content = re.sub(r'timestamp="[^"]+"', '', content) + content = re.sub(r'hostname="[^"]+"', '', content) + # Remove type attribute from tags + content = re.sub(r'( or ) + pattern = re.compile( + rf'(<{tag}([^>/]*?)>)(.*?)', + re.DOTALL + ) + if remove_output and tag in ["system-out", "system-err"]: + content = pattern.sub('', content) + else: + content = pattern.sub(rf'\1[snip]', content) + + # Remove empty lines + content = re.sub(r'\n\s*\n', '\n', content) + # Combine empty tags + content = re.sub(r'(<(\w+) [^>]*)>\s*', r'\1/>', content) + + path.write_text(content, encoding='utf-8') + + +def format_xml(path: Path) -> bool: + try: + subprocess.check_output( + ["xmllint", "--format", str(path), "-o", str(path)], + encoding='utf-8', + stderr=subprocess.STDOUT, + ) + except subprocess.CalledProcessError as e: + print(f'\nError formatting {path}: {e.output}', file=sys.stderr) + return False + return True + + +def remove_if_empty(path: Path) -> bool: + content = path.read_text(encoding='utf-8') + if not re.search(r']*[^/]>', content) and ' + + + + + + [snip] + + + [snip] + + + diff --git a/test/pytorch_test_logs/faulty-reports/different_file_name/sync.skip.test_name/sync.skip.test_foo-1.xml b/test/pytorch_test_logs/faulty-reports/different_file_name/sync.skip.test_name/sync.skip.test_foo-1.xml new file mode 100644 index 00000000000..67893db4a7e --- /dev/null +++ b/test/pytorch_test_logs/faulty-reports/different_file_name/sync.skip.test_name/sync.skip.test_foo-1.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/test/pytorch_test_logs/faulty-reports/duplicate/test_name/test_name-1.xml b/test/pytorch_test_logs/faulty-reports/duplicate/test_name/test_name-1.xml new file mode 100644 index 
00000000000..35f6368ed12 --- /dev/null +++ b/test/pytorch_test_logs/faulty-reports/duplicate/test_name/test_name-1.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/test/pytorch_test_logs/faulty-reports/file_attribute/test_name/TEST-foo.test_name.TestName-1.xml b/test/pytorch_test_logs/faulty-reports/file_attribute/test_name/TEST-foo.test_name.TestName-1.xml new file mode 100644 index 00000000000..bdc796e33e3 --- /dev/null +++ b/test/pytorch_test_logs/faulty-reports/file_attribute/test_name/TEST-foo.test_name.TestName-1.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/test/pytorch_test_logs/faulty-reports/multi_file/test_name/TEST-Name-1.xml b/test/pytorch_test_logs/faulty-reports/multi_file/test_name/TEST-Name-1.xml new file mode 100644 index 00000000000..78dd9f5c7fb --- /dev/null +++ b/test/pytorch_test_logs/faulty-reports/multi_file/test_name/TEST-Name-1.xml @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/test/pytorch_test_logs/faulty-reports/no_tests/test_name/test_name-1.xml b/test/pytorch_test_logs/faulty-reports/no_tests/test_name/test_name-1.xml new file mode 100644 index 00000000000..568b67bb677 --- /dev/null +++ b/test/pytorch_test_logs/faulty-reports/no_tests/test_name/test_name-1.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/test/pytorch_test_logs/faulty-reports/root/test_name/test_name-1.xml b/test/pytorch_test_logs/faulty-reports/root/test_name/test_name-1.xml new file mode 100644 index 00000000000..a2ecb902aa7 --- /dev/null +++ b/test/pytorch_test_logs/faulty-reports/root/test_name/test_name-1.xml @@ -0,0 +1,3 @@ + + + diff --git a/test/pytorch_test_logs/faulty-reports/skip_and_failed/test_name/test_name-1.xml b/test/pytorch_test_logs/faulty-reports/skip_and_failed/test_name/test_name-1.xml new file mode 100644 index 00000000000..fdae2191838 --- /dev/null +++ b/test/pytorch_test_logs/faulty-reports/skip_and_failed/test_name/test_name-1.xml @@ -0,0 +1,9 @@ + + + + + + [snip] + + + diff --git 
a/test/pytorch_test_logs/test-reports/dist-gloo-init-env/distr.algorithms.quantization.test_quantization/distr.algorithms.quantization.test_quantization-1d671.xml b/test/pytorch_test_logs/test-reports/dist-gloo-init-env/distr.algorithms.quantization.test_quantization/distr.algorithms.quantization.test_quantization-1d671.xml new file mode 100644 index 00000000000..bdf01393666 --- /dev/null +++ b/test/pytorch_test_logs/test-reports/dist-gloo-init-env/distr.algorithms.quantization.test_quantization/distr.algorithms.quantization.test_quantization-1d671.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/test/pytorch_test_logs/test-reports/dist-gloo-init-file/distr.algorithms.quantization.test_quantization/distr.algorithms.quantization.test_quantization-78879.xml b/test/pytorch_test_logs/test-reports/dist-gloo-init-file/distr.algorithms.quantization.test_quantization/distr.algorithms.quantization.test_quantization-78879.xml new file mode 100644 index 00000000000..bdf01393666 --- /dev/null +++ b/test/pytorch_test_logs/test-reports/dist-gloo-init-file/distr.algorithms.quantization.test_quantization/distr.algorithms.quantization.test_quantization-78879.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/test/pytorch_test_logs/test-reports/dist-nccl-init-env/distr.algorithms.quantization.test_quantization/distr.algorithms.quantization.test_quantization-5f224.xml b/test/pytorch_test_logs/test-reports/dist-nccl-init-env/distr.algorithms.quantization.test_quantization/distr.algorithms.quantization.test_quantization-5f224.xml new file mode 100644 index 00000000000..bdf01393666 --- /dev/null +++ b/test/pytorch_test_logs/test-reports/dist-nccl-init-env/distr.algorithms.quantization.test_quantization/distr.algorithms.quantization.test_quantization-5f224.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/test/pytorch_test_logs/test-reports/dist-nccl-init-file/distr.algorithms.quantization.test_quantization/distr.algorithms.quantization.test_quantization-d5cb5.xml 
b/test/pytorch_test_logs/test-reports/dist-nccl-init-file/distr.algorithms.quantization.test_quantization/distr.algorithms.quantization.test_quantization-d5cb5.xml new file mode 100644 index 00000000000..bdf01393666 --- /dev/null +++ b/test/pytorch_test_logs/test-reports/dist-nccl-init-file/distr.algorithms.quantization.test_quantization/distr.algorithms.quantization.test_quantization-d5cb5.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/test/pytorch_test_logs/test-reports/python-pytest/backends.xeon.test_launch/backends.xeon.test_launch-1.xml b/test/pytorch_test_logs/test-reports/python-pytest/backends.xeon.test_launch/backends.xeon.test_launch-1.xml new file mode 100644 index 00000000000..dbc55f47eec --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-pytest/backends.xeon.test_launch/backends.xeon.test_launch-1.xml @@ -0,0 +1,26 @@ + + + + + [snip] + + + [snip] + + + [snip] + + + [snip] + + + [snip] + + + [snip] + + + [snip] + + + diff --git a/test/pytorch_test_logs/test-reports/python-pytest/backends.xeon.test_launch/backends.xeon.test_launch-2.xml b/test/pytorch_test_logs/test-reports/python-pytest/backends.xeon.test_launch/backends.xeon.test_launch-2.xml new file mode 100644 index 00000000000..c8372ce9101 --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-pytest/backends.xeon.test_launch/backends.xeon.test_launch-2.xml @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/test/pytorch_test_logs/test-reports/python-pytest/backends.xeon.test_launch/backends.xeon.test_launch-3.xml b/test/pytorch_test_logs/test-reports/python-pytest/backends.xeon.test_launch/backends.xeon.test_launch-3.xml new file mode 100644 index 00000000000..1c86924bf8f --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-pytest/backends.xeon.test_launch/backends.xeon.test_launch-3.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/test/pytorch_test_logs/test-reports/python-pytest/distributed.tensor.test_dtensor_ops/distributed.tensor.test_dtensor_ops-2fe9b.xml 
b/test/pytorch_test_logs/test-reports/python-pytest/distributed.tensor.test_dtensor_ops/distributed.tensor.test_dtensor_ops-2fe9b.xml new file mode 100644 index 00000000000..b1010ead5c7 --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-pytest/distributed.tensor.test_dtensor_ops/distributed.tensor.test_dtensor_ops-2fe9b.xml @@ -0,0 +1,17 @@ + + + + + + + + + [snip] + + + + [snip] + + + + diff --git a/test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_dynamic_shapes/dynamo.test_dynamic_shapes-189f6.xml b/test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_dynamic_shapes/dynamo.test_dynamic_shapes-189f6.xml new file mode 100644 index 00000000000..9b5af2a0487 --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_dynamic_shapes/dynamo.test_dynamic_shapes-189f6.xml @@ -0,0 +1,20 @@ + + + + + [snip] + + + [snip] + + + [snip] + + + [snip] + [snip] + [snip] + [snip] + + + diff --git a/test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_dynamic_shapes/dynamo.test_dynamic_shapes-266ee.xml b/test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_dynamic_shapes/dynamo.test_dynamic_shapes-266ee.xml new file mode 100644 index 00000000000..94500d25c6a --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_dynamic_shapes/dynamo.test_dynamic_shapes-266ee.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + [snip] + + + diff --git a/test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_dynamic_shapes/dynamo.test_dynamic_shapes-3f6e0.xml b/test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_dynamic_shapes/dynamo.test_dynamic_shapes-3f6e0.xml new file mode 100644 index 00000000000..c0cbe2faa59 --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_dynamic_shapes/dynamo.test_dynamic_shapes-3f6e0.xml @@ -0,0 +1,13 @@ + + + + + + + + [snip] + [snip] + [snip] + + + diff --git 
a/test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_misc/dynamo.test_misc-18930.xml b/test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_misc/dynamo.test_misc-18930.xml new file mode 100644 index 00000000000..dd64b286cbb --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_misc/dynamo.test_misc-18930.xml @@ -0,0 +1,11 @@ + + + + + [snip] + [snip] + [snip] + [snip] + + + diff --git a/test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_misc/dynamo.test_misc-86d5b.xml b/test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_misc/dynamo.test_misc-86d5b.xml new file mode 100644 index 00000000000..7969c7d1799 --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_misc/dynamo.test_misc-86d5b.xml @@ -0,0 +1,16 @@ + + + + + + + + + + [snip] + [snip] + + + + + diff --git a/test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_misc/dynamo.test_misc-d062d.xml b/test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_misc/dynamo.test_misc-d062d.xml new file mode 100644 index 00000000000..74e37835363 --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-pytest/dynamo.test_misc/dynamo.test_misc-d062d.xml @@ -0,0 +1,20 @@ + + + + + [snip] + + + [snip] + + + [snip] + + + [snip] + [snip] + [snip] + [snip] + + + diff --git a/test/pytorch_test_logs/test-reports/python-pytest/run_test/run_test.xml b/test/pytorch_test_logs/test-reports/python-pytest/run_test/run_test.xml new file mode 100644 index 00000000000..2964c5a3e6f --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-pytest/run_test/run_test.xml @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/test/pytorch_test_logs/test-reports/python-pytest/test_nestedtensor/test_nestedtensor-671fe.xml b/test/pytorch_test_logs/test-reports/python-pytest/test_nestedtensor/test_nestedtensor-671fe.xml new file mode 100644 index 00000000000..802b93b6665 --- /dev/null +++ 
b/test/pytorch_test_logs/test-reports/python-pytest/test_nestedtensor/test_nestedtensor-671fe.xml @@ -0,0 +1,8 @@ + + + + + [snip] + + + diff --git a/test/pytorch_test_logs/test-reports/python-pytest/test_nestedtensor/test_nestedtensor-8e17a.xml b/test/pytorch_test_logs/test-reports/python-pytest/test_nestedtensor/test_nestedtensor-8e17a.xml new file mode 100644 index 00000000000..c2049f46e24 --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-pytest/test_nestedtensor/test_nestedtensor-8e17a.xml @@ -0,0 +1,26 @@ + + + + + + + + [snip] + + + [snip] + + + [snip] + + + [snip] + + + [snip] + + + [snip] + + + diff --git a/test/pytorch_test_logs/test-reports/python-pytest/test_quantization/test_quantization-3146b.xml b/test/pytorch_test_logs/test-reports/python-pytest/test_quantization/test_quantization-3146b.xml new file mode 100644 index 00000000000..981e103162d --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-pytest/test_quantization/test_quantization-3146b.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/test/pytorch_test_logs/test-reports/python-pytest/test_quantization/test_quantization-97a67.xml b/test/pytorch_test_logs/test-reports/python-pytest/test_quantization/test_quantization-97a67.xml new file mode 100644 index 00000000000..18afe8bc184 --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-pytest/test_quantization/test_quantization-97a67.xml @@ -0,0 +1,36 @@ + + + + + [snip] + + + + [snip] + + + [snip] + + + + [snip] + + + + + + + + + [snip] + + + + + + [snip] + [snip] + [snip] + + + diff --git a/test/pytorch_test_logs/test-reports/python-unittest/jit.test_builtins/TEST-jit.test_builtins.TestBuiltins-1.xml b/test/pytorch_test_logs/test-reports/python-unittest/jit.test_builtins/TEST-jit.test_builtins.TestBuiltins-1.xml new file mode 100644 index 00000000000..f161a814732 --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-unittest/jit.test_builtins/TEST-jit.test_builtins.TestBuiltins-1.xml @@ -0,0 +1,4 @@ + + + + diff 
--git a/test/pytorch_test_logs/test-reports/python-unittest/test_autoload/TEST-TestBackend-1.xml b/test/pytorch_test_logs/test-reports/python-unittest/test_autoload/TEST-TestBackend-1.xml new file mode 100644 index 00000000000..c2686aee667 --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-unittest/test_autoload/TEST-TestBackend-1.xml @@ -0,0 +1,4 @@ + + + + diff --git a/test/pytorch_test_logs/test-reports/python-unittest/test_autoload/TEST-TestBackend-2.xml b/test/pytorch_test_logs/test-reports/python-unittest/test_autoload/TEST-TestBackend-2.xml new file mode 100644 index 00000000000..3477dc744ca --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-unittest/test_autoload/TEST-TestBackend-2.xml @@ -0,0 +1,7 @@ + + + + + [snip] + + From 420d8502c58a8e49ea94732fcd131ef321293b41 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Thu, 26 Jun 2025 17:21:49 +0200 Subject: [PATCH 05/20] Gracefully handle empty test result files --- easybuild/easyblocks/p/pytorch.py | 7 ++++++- test/easyblocks/easyblock_specific.py | 2 ++ .../faulty-reports/invalid_xml/test_name/test_name-1.xml | 5 +++++ .../backends.xeon.test_launch-4.xml | 1 + 4 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 test/pytorch_test_logs/faulty-reports/invalid_xml/test_name/test_name-1.xml create mode 100644 test/pytorch_test_logs/test-reports/python-pytest/backends.xeon.test_launch/backends.xeon.test_launch-4.xml diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index 93f09dc76a0..304b72b3198 100755 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -992,7 +992,12 @@ def parse_test_result_file(xml_file: Path) -> List[TestSuite]: :return: A list of TestSuite objects representing the parsed structure. 
""" try: - root = ET.parse(xml_file).getroot() + try: + root = ET.parse(xml_file).getroot() + except ET.ParseError: + if ' or ", pytorch.get_test_results, error_log_dir / 'root') + self.assertErrorRegex(ValueError, "Failed to parse", + pytorch.get_test_results, error_log_dir / 'invalid_xml') self.assertErrorRegex(ValueError, "multiple reported files", pytorch.get_test_results, error_log_dir / 'multi_file') self.assertErrorRegex(ValueError, "Path from folder and filename should be equal", diff --git a/test/pytorch_test_logs/faulty-reports/invalid_xml/test_name/test_name-1.xml b/test/pytorch_test_logs/faulty-reports/invalid_xml/test_name/test_name-1.xml new file mode 100644 index 00000000000..d32f954bbb8 --- /dev/null +++ b/test/pytorch_test_logs/faulty-reports/invalid_xml/test_name/test_name-1.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/test/pytorch_test_logs/test-reports/python-pytest/backends.xeon.test_launch/backends.xeon.test_launch-4.xml b/test/pytorch_test_logs/test-reports/python-pytest/backends.xeon.test_launch/backends.xeon.test_launch-4.xml new file mode 100644 index 00000000000..9a3e05b534f --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-pytest/backends.xeon.test_launch/backends.xeon.test_launch-4.xml @@ -0,0 +1 @@ + From b472ced887078880e2e436882e6167688edf15fe Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Tue, 22 Jul 2025 17:03:48 +0200 Subject: [PATCH 06/20] Also clean error-tags --- test/pytorch_test_logs/cleanup_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pytorch_test_logs/cleanup_files.py b/test/pytorch_test_logs/cleanup_files.py index 9578f6f2532..22538be3851 100755 --- a/test/pytorch_test_logs/cleanup_files.py +++ b/test/pytorch_test_logs/cleanup_files.py @@ -45,7 +45,7 @@ def shorten_content(path: Path): remove_output: bool = int(md5(str(path.name).encode('utf-8')).hexdigest(), 16) % 2 == 0 # Shorten output shown between various tags - for tag in ["failure", "skipped", "system-out", 
"system-err", "rerun"]: + for tag in ["error", "failure", "skipped", "system-out", "system-err", "rerun"]: # Beware of multiline content in tags and empty tags ( or ) pattern = re.compile( rf'(<{tag}([^>/]*?)>)(.*?)', From 22108906dea0883458465135123e08d7c5e5adb0 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Tue, 22 Jul 2025 17:04:05 +0200 Subject: [PATCH 07/20] Ignore error on formatting empty XML --- test/pytorch_test_logs/cleanup_files.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/pytorch_test_logs/cleanup_files.py b/test/pytorch_test_logs/cleanup_files.py index 22538be3851..59ce1de3016 100755 --- a/test/pytorch_test_logs/cleanup_files.py +++ b/test/pytorch_test_logs/cleanup_files.py @@ -72,8 +72,10 @@ def format_xml(path: Path) -> bool: stderr=subprocess.STDOUT, ) except subprocess.CalledProcessError as e: - print(f'\nError formatting {path}: {e.output}', file=sys.stderr) - return False + # Ignore error "Start tag expected" for empty files + if ' - + @@ -11,5 +11,8 @@ [snip] + + [snip] + diff --git a/test/pytorch_test_logs/test-reports/python-pytest/inductor.test_cudagraph_trees/inductor.test_cudagraph_trees-17dac.xml b/test/pytorch_test_logs/test-reports/python-pytest/inductor.test_cudagraph_trees/inductor.test_cudagraph_trees-17dac.xml new file mode 100644 index 00000000000..6fb969802b8 --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-pytest/inductor.test_cudagraph_trees/inductor.test_cudagraph_trees-17dac.xml @@ -0,0 +1,14 @@ + + + + + + [snip] + [snip] + [snip] + + + [snip] + + + From 8e2033c4caa4062288ee6cfc58bb891b302801f0 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 28 Jul 2025 10:31:47 +0200 Subject: [PATCH 09/20] Find PyTorch test suite variants Current failures include > Parsing the test result files missed the following failed suites: distributed/algorithms/quantization/test_quantization The suite name as contained in the XML results is: > 
dist-nccl/distributed/algorithms/quantization/test_quantization So if the suite name isn't found as-is (fast due to dict hashing) also check for the name without the variant (rare). To avoid false-positives limit to variants starting with `dist-`. --- easybuild/easyblocks/p/pytorch.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index d3bae5e6dfa..abc788ce0dc 100755 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -645,8 +645,17 @@ def test_step(self): else: msg = f'Failed to find any test report files at {test_reports_path}' raise EasyBuildError(msg) + + def suite_is_in_xml_results(suite_name): + """Check if the suite is in the XML results""" + if suite_name in xml_results: + return True + # Handle variants like dist-nccl/test_c10d_nccl + return any(xml_suite_name.split(os.path.sep, maxsplit=1)[-1] == suite_name + for xml_suite_name in xml_results if xml_suite_name.startswith('dist-')) + missing_suites = [suite.name for suite in parsed_test_result.failed_suites - if suite.name not in xml_results] + if not suite_is_in_xml_results(suite.name)] if missing_suites: raise EasyBuildError('Parsing the test result files missed the following failed suites: %s', ', '.join(sorted(missing_suites))) From 804a0486f090dcfad11de7ed59c686797c7ec8c4 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Fri, 19 Sep 2025 10:11:02 +0200 Subject: [PATCH 10/20] Isolate against more user env variables --- easybuild/easyblocks/p/pytorch.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index abc788ce0dc..f206b1af7e9 100755 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -538,15 +538,18 @@ def add_enable_option(name, enabled): self.cfg.update('prebuildopts', ' '.join(unique_options) + ' ') 
self.cfg.update('preinstallopts', ' '.join(unique_options) + ' ') - def _set_cache_dir(self): - """Set $XDG_CACHE_HOME and $TRITON_HOME to avoid PyTorch defaulting to $HOME""" + def _set_cache_dirs(self): + """Set $XDG_CACHE_HOME and $TRITON_HOME to avoid PyTorch defaulting to $HOME + and similar variables to ensure clean build/test environment + """ cache_dir = os.path.join(self.tmpdir, '.cache') # The path must exist! mkdir(cache_dir, parents=True) env.setvar('XDG_CACHE_HOME', cache_dir) # Triton also uses a path defaulting to $HOME - # Isolate against user-set variables - env.unset_env_vars(('TRITON_DUMP_DIR', 'TRITON_OVERRIDE_DIR', 'TRITON_CACHE_DIR')) + # Isolate against user-set variables which could lead to reusing caches that may fail test + env.unset_env_vars(('TRITON_DUMP_DIR', 'TRITON_OVERRIDE_DIR', 'TRITON_CACHE_DIR', + 'TORCH_HOME', 'TORCHINDUCTOR_CACHE_DIR', 'PYTORCH_KERNEL_CACHE_PATH')) triton_home = os.path.join(self.tmpdir, '.triton_home') env.setvar('TRITON_HOME', triton_home) @@ -599,7 +602,7 @@ def get_test_name_diff(lst_should, lst_is): def test_step(self): """Run unit tests""" - self._set_cache_dir() + self._set_cache_dirs() # Pretend to be on FB CI which disables some tests, especially those which download stuff env.setvar('SANDCASTLE', '1') # Skip this test(s) which is very flaky @@ -780,7 +783,7 @@ def suite_is_in_xml_results(suite_name): raise EasyBuildError("Test command had non-zero exit code (%s), but no failed tests found?!", tests_ec) def test_cases_step(self): - self._set_cache_dir() + self._set_cache_dirs() super().test_cases_step() def sanity_check_step(self, *args, **kwargs): From 9949ca89c7abe14901a8f9fb35f212baa834f57a Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Fri, 19 Sep 2025 10:11:38 +0200 Subject: [PATCH 11/20] Avoid PyTorch configure warnings/issues by explicitly setting (more) dependency options Most of the options have a True/False value which we should set to False/0 when we don't have/use that dependency. 
This ensures that a) no system lib will be found and b) no warning will be shown. Also update the list with options added or removed until PyTorch 2.7 --- easybuild/easyblocks/p/pytorch.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index f206b1af7e9..e7b03046688 100755 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -332,12 +332,17 @@ def is_version_ok(version_range): available_libs = ( # Format: (PyTorch flag to enable, EB name, ':') # Use `None` for the EB name if no known EC exists - ('USE_FFMPEG=1', 'FFmpeg', '1.0.0:'), + # Check the comment on top of setup.y + ('USE_FFMPEG=1', 'FFmpeg', '1.0.0:2.4.0'), ('USE_GFLAGS=1', 'gflags', '1.0.0:'), ('USE_GLOG=1', 'glog', '1.0.0:'), + ('USE_CUDSS=1', 'cuDSS', '1.0.0:'), + ('USE_CUSPARSELT=1', 'cuSPARSELt', '2.7:'), + ('USE_UCC=1', 'UCC-CUDA', '1.13.0:'), + ('USE_SYSTEM_UCC=1', 'UCC-CUDA', '1.13.0:'), # For system libs check CMakeLists.txt, below `if(USE_SYSTEM_LIBS)`, order kept here - # NCCL handled specially as other env variables are requires for it + # NCCL handled specially as other env variables are required for it ('USE_SYSTEM_CPUINFO=1', None, '1.6.0:'), ('USE_SYSTEM_SLEEF=1', None, '1.6.0:'), ('USE_SYSTEM_GLOO=1', None, '1.6.0:'), @@ -448,7 +453,7 @@ def add_enable_option(name, enabled): raise EasyBuildError("Did not find a supported BLAS in dependencies. 
Don't know which BLAS lib to use") available_dependency_options = EB_PyTorch.get_dependency_options_for_version(self.version) - dependency_names = {dep['name'] for dep in self.cfg.dependencies()} + dependency_names = self.cfg.dependency_names() not_used_dep_names = [] for enable_opt, dep_name in available_dependency_options: if dep_name is None: @@ -457,6 +462,9 @@ def add_enable_option(name, enabled): options.append(enable_opt) else: not_used_dep_names.append(dep_name) + # Explicitely toggle to avoid picking up system libs, restricted to 2.7+ to avoid retesting older ECs + if pytorch_version >= '2.7' and enable_opt[-1] in ('0', '1'): + options.append(enable_opt[:-1] + ('0' if enable_opt[-1] == '1' else '1')) self.log.info('Did not enable options for the following dependencies as they are not used in the EC: %s', not_used_dep_names) @@ -510,7 +518,8 @@ def add_enable_option(name, enabled): options.append('USE_FBGEMM=0') # Metal only supported on IOS which likely doesn't work with EB, so disabled - options.append('USE_METAL=0') + if pytorch_version < '2.4': # Removed in 2.4 + options.append('USE_METAL=0') build_type = self.cfg.get('build_type') if build_type is None: From 906d8cf00e9b7e1096e92086370159f703abd145 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Fri, 19 Sep 2025 10:17:59 +0200 Subject: [PATCH 12/20] Symlink NCCL library when added as a build dependency As PyTorch is sensitive to specific NCCL versions one approach is to use it as a build dependency only and add an rpath to it after copying it into a (non-standard) folder inside the PyTorch module. This is similar to the PyPI package that depends on various nvidia-packages and adds relative rpaths to ensure they are used when loading the torch package/libraries. 
--- easybuild/easyblocks/p/pytorch.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index e7b03046688..af204a3b04b 100755 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -47,8 +47,8 @@ from easybuild.tools import LooseVersion from easybuild.tools.build_log import EasyBuildError, print_warning from easybuild.tools.config import ERROR, build_option -from easybuild.tools.filetools import apply_regex_substitutions, mkdir, symlink -from easybuild.tools.modules import get_software_root, get_software_version +from easybuild.tools.filetools import apply_regex_substitutions, mkdir, symlink, copy +from easybuild.tools.modules import get_software_root, get_software_version, get_software_libdir from easybuild.tools.run import run_shell_cmd from easybuild.tools.systemtools import POWER, get_cpu_architecture @@ -795,6 +795,32 @@ def test_cases_step(self): self._set_cache_dirs() super().test_cases_step() + def install_step(self): + """Set rpath if required""" + super().install_step() + # If NCCL is used as a build dependency only, we need to make sure it is found at runtime + if 'NCCL' in self.cfg.dependency_names(build_only=True): + if 'patchelf' not in self.cfg.dependency_names(): + raise EasyBuildError("PyTorch requires patchelf to set the RPATH of the NCCL" + " as NCCL is only a build dependency") + nccl_libdir = get_software_libdir('NCCL', full_path=True) + if not nccl_libdir: + raise EasyBuildError("Did not find libdir of NCCL installation") + nccl_libs = list(Path(nccl_libdir).glob('libnccl.so*')) + if not nccl_libs: + raise EasyBuildError("Did not find any NCCL libraries in %s", nccl_libdir) + torch_libs = list(Path(self.installdir).glob('lib/**/torch/**/*.so')) + if not torch_libs: + raise EasyBuildError("Did not find any PyTorch libraries in %s", self.installdir) + nvidia_libs_dir = os.path.join(self.installdir, 
'nvidia_libs') + mkdir(nvidia_libs_dir, parents=True) + copy(nccl_libs, nvidia_libs_dir, symlinks=True) + + for lib in torch_libs: + rpath = os.path.relpath(nvidia_libs_dir, lib.parent) + run_shell_cmd(['patchelf', '--force-rpath', '--add-rpath', f'$ORIGIN/{rpath}', str(lib)], + use_bash=False) + def sanity_check_step(self, *args, **kwargs): """Custom sanity check for PyTorch""" From a986775e49c383d2bdf03f7fa5849b134708d100 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Fri, 19 Sep 2025 10:20:21 +0200 Subject: [PATCH 13/20] Revert "Symlink NCCL library when added as a build dependency" This reverts commit 906d8cf00e9b7e1096e92086370159f703abd145. --- easybuild/easyblocks/p/pytorch.py | 30 ++---------------------------- 1 file changed, 2 insertions(+), 28 deletions(-) diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index af204a3b04b..e7b03046688 100755 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -47,8 +47,8 @@ from easybuild.tools import LooseVersion from easybuild.tools.build_log import EasyBuildError, print_warning from easybuild.tools.config import ERROR, build_option -from easybuild.tools.filetools import apply_regex_substitutions, mkdir, symlink, copy -from easybuild.tools.modules import get_software_root, get_software_version, get_software_libdir +from easybuild.tools.filetools import apply_regex_substitutions, mkdir, symlink +from easybuild.tools.modules import get_software_root, get_software_version from easybuild.tools.run import run_shell_cmd from easybuild.tools.systemtools import POWER, get_cpu_architecture @@ -795,32 +795,6 @@ def test_cases_step(self): self._set_cache_dirs() super().test_cases_step() - def install_step(self): - """Set rpath if required""" - super().install_step() - # If NCCL is used as a build dependency only, we need to make sure it is found at runtime - if 'NCCL' in self.cfg.dependency_names(build_only=True): - if 'patchelf' not in 
self.cfg.dependency_names(): - raise EasyBuildError("PyTorch requires patchelf to set the RPATH of the NCCL" - " as NCCL is only a build dependency") - nccl_libdir = get_software_libdir('NCCL', full_path=True) - if not nccl_libdir: - raise EasyBuildError("Did not find libdir of NCCL installation") - nccl_libs = list(Path(nccl_libdir).glob('libnccl.so*')) - if not nccl_libs: - raise EasyBuildError("Did not find any NCCL libraries in %s", nccl_libdir) - torch_libs = list(Path(self.installdir).glob('lib/**/torch/**/*.so')) - if not torch_libs: - raise EasyBuildError("Did not find any PyTorch libraries in %s", self.installdir) - nvidia_libs_dir = os.path.join(self.installdir, 'nvidia_libs') - mkdir(nvidia_libs_dir, parents=True) - copy(nccl_libs, nvidia_libs_dir, symlinks=True) - - for lib in torch_libs: - rpath = os.path.relpath(nvidia_libs_dir, lib.parent) - run_shell_cmd(['patchelf', '--force-rpath', '--add-rpath', f'$ORIGIN/{rpath}', str(lib)], - use_bash=False) - def sanity_check_step(self, *args, **kwargs): """Custom sanity check for PyTorch""" From 4feb3a9d4864b41d1619720cc399f3963d152fb2 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 20 Oct 2025 11:13:58 +0200 Subject: [PATCH 14/20] Use raise-from for better error reporting --- easybuild/easyblocks/p/pytorch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index e7b03046688..ceaa33055ff 100755 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -649,7 +649,7 @@ def test_step(self): try: xml_results = get_test_results(test_reports_path) except ValueError as e: - raise EasyBuildError(f"Failed to parse test results at {test_reports_path}: {e}") + raise EasyBuildError(f"Failed to parse test results at {test_reports_path}: {e}") from e if not xml_results: files = [file for file in test_reports_path.rglob('*.*') if file.is_file()] if files: @@ -1073,7 +1073,7 @@ def 
parse_test_result_file(xml_file: Path) -> List[TestSuite]: ) ) except Exception as e: - raise ValueError(f"Failed to parse test result file '{xml_file}': {e}") + raise ValueError(f"Failed to parse test result file '{xml_file}': {e}") from e return test_suites From c98ab49df8bb680a67d2181a385e9e6ca5b0344e Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 20 Oct 2025 11:14:15 +0200 Subject: [PATCH 15/20] Don't fail for incomplete testcase tags Some are missing all tags except for 'time'. Just ignore those. --- easybuild/easyblocks/p/pytorch.py | 7 ++++++- .../inductor.test_cudagraph_trees-bbc64.xml | 7 +++++++ 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 test/pytorch_test_logs/test-reports/python-pytest/inductor.test_cudagraph_trees/inductor.test_cudagraph_trees-bbc64.xml diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index ceaa33055ff..19b660615b6 100755 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -918,7 +918,12 @@ def parse_test_cases(test_suite_el: ET.Element) -> List[TestCase]: """Extract all test cases from the testsuite XML element""" test_cases: List[TestCase] = [] for testcase in test_suite_el.iterfind("testcase"): - classname = testcase.attrib["classname"] + try: + classname = testcase.attrib["classname"] + except KeyError as e: + if any(tag in testcase.attrib for tag in ('name', 'file')): + raise ValueError(f"Missing 'classname' attribute in testcase (Attributes: '{testcase.attrib}')") from e + continue # Skip invalid testcase entries without classname test_name = f'{classname}.{testcase.attrib["name"]}' # Note: It is possible that a test has (the same?) element multiple times, likely when using variants. # Ignore that and only check if it has one of the failure tags at least once. 
diff --git a/test/pytorch_test_logs/test-reports/python-pytest/inductor.test_cudagraph_trees/inductor.test_cudagraph_trees-bbc64.xml b/test/pytorch_test_logs/test-reports/python-pytest/inductor.test_cudagraph_trees/inductor.test_cudagraph_trees-bbc64.xml new file mode 100644 index 00000000000..3ce296ae6a6 --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-pytest/inductor.test_cudagraph_trees/inductor.test_cudagraph_trees-bbc64.xml @@ -0,0 +1,7 @@ + + + + + + + From 6de55c465866e003c9b579d56dd0978212c38d9e Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 20 Oct 2025 12:49:36 +0200 Subject: [PATCH 16/20] Handle skip-and-fail mismatch PyTorch reruns single tests by skipping portions of the test before that. If those other tests don't succeed the parser will error out during merging as it will see a test that was skipped and failed. Handle that by ignoring the skipped test result during merge. --- easybuild/easyblocks/p/pytorch.py | 8 ++++---- test/easyblocks/easyblock_specific.py | 5 ++++- .../inductor.test_aot_inductor_arrayref-bfd31.xml | 13 +++++++++++++ .../inductor.test_aot_inductor_arrayref-bfd31_2.xml | 13 +++++++++++++ 4 files changed, 34 insertions(+), 5 deletions(-) create mode 100644 test/pytorch_test_logs/test-reports/python-pytest/inductor.test_aot_inductor_arrayref/inductor.test_aot_inductor_arrayref-bfd31.xml create mode 100644 test/pytorch_test_logs/test-reports/python-pytest/inductor.test_aot_inductor_arrayref/inductor.test_aot_inductor_arrayref-bfd31_2.xml diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index 19b660615b6..aed270d7aef 100755 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -1096,11 +1096,11 @@ def merge_test_suites(test_suites: Iterable[TestSuite]) -> TestSuite: except KeyError: result_suite.add_test(current_test) else: - if (existing_test.state == TestState.SKIPPED) != (current_test.state == TestState.SKIPPED): - raise ValueError(f"Mismatch in 
whether test was skipped or not in suite {result_suite.name}: " - f"{existing_test} vs. {current_test}") - # If test was rerun and succeeded use that if current_test.state == TestState.SUCCESS and existing_test.state != TestState.SUCCESS: + # If test was rerun and succeeded use that + result_suite.replace_test(current_test) + elif existing_test.state == TestState.SKIPPED and current_test.state != TestState.SKIPPED: + # If test was skipped but later run use that result_suite.replace_test(current_test) return result_suite diff --git a/test/easyblocks/easyblock_specific.py b/test/easyblocks/easyblock_specific.py index 766d27246c0..6d3ae2fdbca 100644 --- a/test/easyblocks/easyblock_specific.py +++ b/test/easyblocks/easyblock_specific.py @@ -523,7 +523,7 @@ def test_pytorch_test_log_parsing(self): self.assertEqual((name, suite.summary), (name, results2[name].summary)) del results2 - self.assertEqual(len(results), 14) + self.assertEqual(len(results), 15) # 2 small test suites used as a smoke test using a most features self.assertIn('backends/xeon/test_launch', results) @@ -556,6 +556,7 @@ def test_pytorch_test_log_parsing(self): distributed/tensor/test_dtensor_ops: 0 failed, 2 passed, 2 skipped, 0 errors dynamo/test_dynamic_shapes: 3 failed, 14 passed, 0 skipped, 0 errors dynamo/test_misc: 1 failed, 9 passed, 0 skipped, 0 errors + inductor/test_aot_inductor_arrayref: 2 failed, 0 passed, 0 skipped, 0 errors inductor/test_cudagraph_trees: 1 failed, 0 passed, 0 skipped, 0 errors jit/test_builtins: 0 failed, 1 passed, 0 skipped, 0 errors test_autoload: 0 failed, 1 passed, 1 skipped, 0 errors @@ -566,6 +567,8 @@ def test_pytorch_test_log_parsing(self): for suite in results.values() for test in suite.get_tests())) self.assertEqual(tests, textwrap.dedent(""" + AOTInductorTestABICompatibleCpuWithStackAllocation.test_fail_and_skip: failure + AOTInductorTestABICompatibleCpuWithStackAllocation.test_skip_and_fail: failure CudaGraphTreeTests.test_workspace_allocation_error: failure 
DistQuantizationTests.test_all_gather_fp16: success DistQuantizationTests.test_all_gather_fp16: success diff --git a/test/pytorch_test_logs/test-reports/python-pytest/inductor.test_aot_inductor_arrayref/inductor.test_aot_inductor_arrayref-bfd31.xml b/test/pytorch_test_logs/test-reports/python-pytest/inductor.test_aot_inductor_arrayref/inductor.test_aot_inductor_arrayref-bfd31.xml new file mode 100644 index 00000000000..b2b186b80bd --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-pytest/inductor.test_aot_inductor_arrayref/inductor.test_aot_inductor_arrayref-bfd31.xml @@ -0,0 +1,13 @@ + + + + + + [snip] + + + [snip] + + + diff --git a/test/pytorch_test_logs/test-reports/python-pytest/inductor.test_aot_inductor_arrayref/inductor.test_aot_inductor_arrayref-bfd31_2.xml b/test/pytorch_test_logs/test-reports/python-pytest/inductor.test_aot_inductor_arrayref/inductor.test_aot_inductor_arrayref-bfd31_2.xml new file mode 100644 index 00000000000..37ff24b617b --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-pytest/inductor.test_aot_inductor_arrayref/inductor.test_aot_inductor_arrayref-bfd31_2.xml @@ -0,0 +1,13 @@ + + + + + + [snip] + + + [snip] + + + From bacadb86126ac6d31e08fb89e4be31066911f65c Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Wed, 22 Oct 2025 13:31:05 +0200 Subject: [PATCH 17/20] Update for 2.8+ --- easybuild/easyblocks/p/pytorch.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index aed270d7aef..1c21b60deee 100755 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -400,18 +400,19 @@ def configure_step(self): [(r'(default=_get_test_report_path\(\) if) IS(_IN)?_CI else None', fr'\1 os.getenv("{self.GENERATE_TEST_REPORT_VAR_NAME}") else None')], backup=False, on_missing_match=ERROR) - if pytorch_version >= '2.1.0': - run_test_subs = [(r'if IS_CI:\n\s+# Add the option to generate 
XML test report.*', - 'if TEST_SAVE_XML:\n')] - else: - run_test_subs = [ - (r'from torch.testing._internal.common_utils import\s+\(\n\s+', - r'\g<0>get_report_path, '), - (r'# If using pytest.*\n\s+if options.pytest:\n\s+unittest_args = \[', - r'\g<0>"--junit-xml-reruns", get_report_path(pytest=True)] + ['), - ] - apply_regex_substitutions('test/run_test.py', run_test_subs, backup=False, on_missing_match=ERROR, - single_line=False) + if pytorch_version < '2.8.0': + if pytorch_version >= '2.1.0': + run_test_subs = [(r'if IS_CI:\n\s+# Add the option to generate XML test report.*', + 'if TEST_SAVE_XML:\n')] + else: + run_test_subs = [ + (r'from torch.testing._internal.common_utils import\s+\(\n\s+', + r'\g<0>get_report_path, '), + (r'# If using pytest.*\n\s+if options.pytest:\n\s+unittest_args = \[', + r'\g<0>"--junit-xml-reruns", get_report_path(pytest=True)] + ['), + ] + apply_regex_substitutions('test/run_test.py', run_test_subs, backup=False, on_missing_match=ERROR, + single_line=False) self.has_xml_test_reports = True # Gather default options. Will be checked against (and can be overwritten by) custom_opts From de6a86f215672f6a3322b0a21a48e64330300feb Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Fri, 7 Nov 2025 17:02:35 +0100 Subject: [PATCH 18/20] Add exception for "suitename" of "-c" In PyTorch `test_testing.py` it runs a subtest via Python code, i.e. as `python -c`. This shows up in the test report path and as not having a `file` attribute for the `testcase` tag. `determine_suite_name` fails in `reported_file = os.path.basename(file_attribute.pop())` with > KeyError: 'pop from an empty set' Simply ignore those. 
--- easybuild/easyblocks/p/pytorch.py | 22 ++++++++++++++----- .../-c/TEST-TestFooCPU-20251030150550.xml | 7 ++++++ ...test.suite._ErrorHolder-20251030150550.xml | 11 ++++++++++ 3 files changed, 34 insertions(+), 6 deletions(-) create mode 100644 test/pytorch_test_logs/test-reports/python-unittest/-c/TEST-TestFooCPU-20251030150550.xml create mode 100644 test/pytorch_test_logs/test-reports/python-unittest/-c/TEST-unittest.suite._ErrorHolder-20251030150550.xml diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index 1c21b60deee..b7a23029b50 100755 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -39,7 +39,7 @@ from itertools import chain, groupby from operator import attrgetter from pathlib import Path -from typing import Dict, Iterable, List +from typing import Dict, Iterable, List, Optional import easybuild.tools.environment as env from easybuild.easyblocks.generic.pythonpackage import PythonPackage @@ -942,20 +942,27 @@ def parse_test_cases(test_suite_el: ET.Element) -> List[TestCase]: return test_cases -def determine_suite_name(xml_file: Path, test_suite_xml: List[ET.Element]) -> str: +def determine_suite_name(xml_file: Path, test_suite_xml: List[ET.Element]) -> Optional[str]: """Determine main test suite name from path(s) to match against run_test.py output""" # Gather all file attributes from the test cases if set test_cases = [testcase for suite in test_suite_xml for testcase in suite.iterfind("testcase")] - file_attribute = {testcase.attrib.get("file") for testcase in test_cases} - file_attribute.discard(None) suite_name = xml_file.parent.name.replace('.', os.path.sep) # Usually the suite name is the folder name + if xml_file.name.startswith('TEST-'): + # A unittest test could be run directly (`python -c 'code...'`) in which case there is no name + if suite_name == '-c': + return None # Python unittest reports have 1 file per test class: # 
test-reports/python-unittest/test_package/TEST-test_repackage.TestRepackage-20250217120914.xml # -> test_repackage.py ran TestRepackage # test-reports/dist-gloo/distributed.algorithms.test_quantization/TEST-DistQuantizationTests-20250123170925.xml # -> distributed/algorithms/test_quantization ran DistQuantizationTests in dist-gloo variant # Just do a sanity check + file_attribute = {testcase.attrib.get("file") for testcase in test_cases} + file_attribute.discard(None) + if not file_attribute: # Fallback to checking the tags + file_attribute = {suite.attrib.get("file") for suite in test_suite_xml} + file_attribute.discard(None) if len(file_attribute) > 1: raise ValueError(f"Found multiple reported files in unittest report of '{xml_file}': {file_attribute}") reported_file = os.path.basename(file_attribute.pop()) @@ -963,11 +970,12 @@ def determine_suite_name(xml_file: Path, test_suite_xml: List[ET.Element]) -> st name_parts = xml_file.name[len('TEST-'):].rsplit('-', 1)[0].rsplit('.', 2) # If there is only one part it is the class -> filename is in the suite name if len(name_parts) == 1: - test_file_name = os.path.basename(suite_name) + '.py' + test_file_name = os.path.basename(suite_name) else: # Note that multiple parts are possible for sub-test files: # TEST-jit.test_builtins.TestBuiltins (jit/test_builtins.py) - test_file_name = name_parts[-2] + '.py' + test_file_name = name_parts[-2] + test_file_name += '.py' if test_file_name != reported_file: raise ValueError(f"Unexpected file attributes in test cases of '{xml_file}'. 
" f"Expected {test_file_name}, got {file_attribute}") @@ -1036,6 +1044,8 @@ def parse_test_result_file(xml_file: Path) -> List[TestSuite]: # Suite name to correctly deduplicate tests and match against run_test.py output suite_name = determine_suite_name(xml_file, test_suite_xml) + if suite_name is None: + return [] test_suites: List[TestSuite] = [] diff --git a/test/pytorch_test_logs/test-reports/python-unittest/-c/TEST-TestFooCPU-20251030150550.xml b/test/pytorch_test_logs/test-reports/python-unittest/-c/TEST-TestFooCPU-20251030150550.xml new file mode 100644 index 00000000000..4afd9909341 --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-unittest/-c/TEST-TestFooCPU-20251030150550.xml @@ -0,0 +1,7 @@ + + + + + + diff --git a/test/pytorch_test_logs/test-reports/python-unittest/-c/TEST-unittest.suite._ErrorHolder-20251030150550.xml b/test/pytorch_test_logs/test-reports/python-unittest/-c/TEST-unittest.suite._ErrorHolder-20251030150550.xml new file mode 100644 index 00000000000..2015b560986 --- /dev/null +++ b/test/pytorch_test_logs/test-reports/python-unittest/-c/TEST-unittest.suite._ErrorHolder-20251030150550.xml @@ -0,0 +1,11 @@ + + + + ", line 16, in tearDownClass +RuntimeError: called with TestFooCPU +]]> + + From 174b7bbcd2e69f2520de3943a411f51133b2af5d Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 10 Nov 2025 12:06:43 +0100 Subject: [PATCH 19/20] Add CLI arg to sort suites by custom attribute --- easybuild/easyblocks/p/pytorch.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/easybuild/easyblocks/p/pytorch.py b/easybuild/easyblocks/p/pytorch.py index b7a23029b50..80540204d5a 100755 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -1133,6 +1133,13 @@ def get_test_results(folder: Path) -> Dict[str, TestSuite]: def main(arg: Path): + # Get attribute on which to sort suites + try: + sort_key = sys.argv[sys.argv.index('--sort') + 1] + except ValueError: + sort_key = 
next((arg.split('=', 1)[1] for arg in sys.argv if arg.startswith('--sort=')), None) + if not sort_key: + sort_key = 'name' if arg.is_file(): content = arg.read_text() m = re.search(r'cmd .*python[^ ]* run_test\.py .* exited with exit code.*output', content) @@ -1152,15 +1159,16 @@ def main(arg: Path): raise RuntimeError(msg) else: results = get_test_results(Path(arg)) - print(f"Found {len(results)} test suites:") - for suite in results.values(): + print(f"Found {len(results)} test suites (sorted by {sort_key}):") + sorted_suites = sorted(results.values(), key=lambda suite: getattr(suite, sort_key)) + for suite in sorted_suites: print(f"Suite {suite.name} {suite.num_tests}:\t{suite.summary}") print("Total tests:", sum(suite.num_tests for suite in results.values())) print("Total failures:", sum(suite.failures for suite in results.values())) print("Total skipped:", sum(suite.skipped for suite in results.values())) print("Total errors:", sum(suite.errors for suite in results.values())) - failed_suites = [suite.name for suite in results.values() if suite.failures + suite.errors > 0] - print(f"Failed suites ({len(failed_suites)}):\n\t" + '\n\t'.join(sorted(failed_suites))) + failed_suites = [suite.name for suite in sorted_suites if suite.failures + suite.errors > 0] + print(f"Failed suites ({len(failed_suites)}):\n\t" + '\n\t'.join(failed_suites)) failed_tests = sum((suite.get_failed_tests() for suite in results.values()), []) print(f"Failed tests ({len(failed_tests)}):\n\t" + '\n\t'.join(sorted(failed_tests))) errored_tests = sum((suite.get_errored_tests() for suite in results.values()), []) From 79fa0c582fedc7e2d7fc26b78e08f260d0d3f064 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 10 Nov 2025 12:14:03 +0100 Subject: [PATCH 20/20] Show number of failed tests in list of failed suites --- easybuild/easyblocks/p/pytorch.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/easybuild/easyblocks/p/pytorch.py 
b/easybuild/easyblocks/p/pytorch.py index 80540204d5a..8a73b611c5e 100755 --- a/easybuild/easyblocks/p/pytorch.py +++ b/easybuild/easyblocks/p/pytorch.py @@ -1162,13 +1162,14 @@ def main(arg: Path): print(f"Found {len(results)} test suites (sorted by {sort_key}):") sorted_suites = sorted(results.values(), key=lambda suite: getattr(suite, sort_key)) for suite in sorted_suites: - print(f"Suite {suite.name} {suite.num_tests}:\t{suite.summary}") + print(f"Suite {suite.name}:\t{suite.num_tests} tests, {suite.summary}") print("Total tests:", sum(suite.num_tests for suite in results.values())) print("Total failures:", sum(suite.failures for suite in results.values())) print("Total skipped:", sum(suite.skipped for suite in results.values())) print("Total errors:", sum(suite.errors for suite in results.values())) - failed_suites = [suite.name for suite in sorted_suites if suite.failures + suite.errors > 0] - print(f"Failed suites ({len(failed_suites)}):\n\t" + '\n\t'.join(failed_suites)) + failed_suites = [suite for suite in sorted_suites if suite.failures + suite.errors > 0] + print(f"Failed suites ({len(failed_suites)}):\n\t" + '\n\t'.join( + f'{suite.name} ({suite.failures + suite.errors}/{suite.num_tests})' for suite in failed_suites)) failed_tests = sum((suite.get_failed_tests() for suite in results.values()), []) print(f"Failed tests ({len(failed_tests)}):\n\t" + '\n\t'.join(sorted(failed_tests))) errored_tests = sum((suite.get_errored_tests() for suite in results.values()), [])