From 377ede8f6eb6a19cb7ac1ae242fb8c46dad27d4e Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Tue, 3 Mar 2026 16:30:53 +0100 Subject: [PATCH 1/5] Fix race condition in checking for disabled tests in PyTorch-2.9.1-foss-2024a --- .../p/PyTorch/PyTorch-2.9.1-foss-2024a.eb | 2 +- .../PyTorch-2.9.1_disable-slow-tests.patch | 24 +++++++++++++++---- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb index b649048eb125..a58e11727256 100644 --- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb @@ -148,7 +148,7 @@ checksums = [ '86ce380e69b3b20e010d817889cb1b825b05b4054a045b00f2ac12161b77d7e4'}, {'PyTorch-2.9.1_check-device-avail-test_schedule.patch': '64c28d38ce69147565509add36d310473ce46f14a0a876d38b5049cb7fce9817'}, - {'PyTorch-2.9.1_disable-slow-tests.patch': '76e6d8f7366b91a0ddc65f73685f2b09988bb5537d10d294f9bb6a48c7fec3d0'}, + {'PyTorch-2.9.1_disable-slow-tests.patch': '6b365a3531b0ac5446b5f0e8ab924b5e5742cd0331e6d9ec979118a3ef0ffc09'}, {'PyTorch-2.9.1_fix-hypothesis-deadline.patch': 'f7a130669eee9924a303df9e2bd5743ff023a7d994b7a3e43c86dcccf0206c49'}, {'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch': 'ab408275ec66e836112a50054acc4e789ef38196efeb6137c6061d60d9ac9ead'}, diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_disable-slow-tests.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_disable-slow-tests.patch index 8f6d6e0c7677..9db987094fff 100644 --- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_disable-slow-tests.patch +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_disable-slow-tests.patch @@ -5,10 +5,14 @@ Enable the default for non-CI environments to cut down testing time. Don't check for SANDCASTLE when determining whether to skip disabled tests. 
However, the disabled-tests JSON file needs to be downloaded from S3 and placed at "tests/.pytorch-disabled-tests.json". +This file may be modified and/or redownloaded in import_test_stats.py +Disable this by just returning it's content as-if it is always up to date. +If it doesn't exist the failure will be handled by the calling function. +This modification removes the PR number field, so make it optional in the tuple expansion to allow either format. + Author: Alexander Grund (TU Dresden) diff --git a/test/run_test.py b/test/run_test.py -index 44a15d4ab2c..269d4206f3e 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -531,7 +531,7 @@ def run_test( @@ -20,8 +24,19 @@ index 44a15d4ab2c..269d4206f3e 100755 ci_args = ["--import-slow-tests", "--import-disabled-tests"] if RERUN_DISABLED_TESTS: ci_args.append("--rerun-disabled-tests") +diff --git a/tools/stats/import_test_stats.py b/tools/stats/import_test_stats.py +--- a/tools/stats/import_test_stats.py ++++ b/tools/stats/import_test_stats.py +@@ -47,6 +47,8 @@ def fetch_and_cache( + Path(dirpath).mkdir(exist_ok=True) + + path = os.path.join(dirpath, name) ++ with open(path) as f: ++ return cast(dict[str, Any], json.load(f)) + print(f"Downloading {url} to {path}") + + def is_cached_file_valid() -> bool: diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py -index bfc568bc146..7ef37cccccb 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -2722,11 +2722,11 @@ def check_if_enable(test: unittest.TestCase): @@ -32,9 +47,10 @@ index bfc568bc146..7ef37cccccb 100644 + if True: should_skip = False skip_msg = "" - +- - for disabled_test, (issue_url, platforms) in disabled_tests_dict.items(): -+ for disabled_test, (pr_num, issue_url, platforms) in disabled_tests_dict.items(): ++ # Allow for a potentially existing PR number ++ for disabled_test, (*pr_num, issue_url, platforms) in disabled_tests_dict.items(): if 
matches_test(disabled_test): platform_to_conditional: dict = { "mac": IS_MACOS, From dc58f64d0905380d9b485b05baaccd73dd8de9df Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Wed, 4 Mar 2026 15:26:31 +0100 Subject: [PATCH 2/5] Remove pytest-shard --- easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb | 1 - 1 file changed, 1 deletion(-) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb index a58e11727256..c3141f244cfd 100644 --- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb @@ -183,7 +183,6 @@ builddependencies = [ ('parameterized', '0.9.0'), ('pytest-flakefinder', '1.1.0'), ('pytest-rerunfailures', '15.0'), - ('pytest-shard', '0.1.2'), ('pytest-subtests', '0.13.1'), ('tlparse', '0.4.0'), ('optree', '0.14.1'), From 8713e4001c116223a17e2571039cee0a844e61a7 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Thu, 5 Mar 2026 16:33:45 +0100 Subject: [PATCH 3/5] Add more patches --- .../p/PyTorch/PyTorch-2.9.1-foss-2024a.eb | 8 +++++++ .../PyTorch-2.9.1_dont-print-test-items.patch | 24 +++++++++++++++++++ ...yTorch-2.9.1_skip-cutlass-addmm-test.patch | 16 +++++++++++++ 3 files changed, 48 insertions(+) create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_dont-print-test-items.patch create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-cutlass-addmm-test.patch diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb index c3141f244cfd..8f6fad5939f2 100644 --- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb @@ -66,6 +66,7 @@ patches = [ 'PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch', 'PyTorch-2.9.1_check-device-avail-test_schedule.patch', 'PyTorch-2.9.1_disable-slow-tests.patch', + 
'PyTorch-2.9.1_dont-print-test-items.patch', 'PyTorch-2.9.1_fix-hypothesis-deadline.patch', 'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch', 'PyTorch-2.9.1_fix-test_dist2-decorators.patch', @@ -74,6 +75,7 @@ patches = [ 'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch', 'PyTorch-2.9.1_normalize_tree_output.patch', 'PyTorch-2.9.1_set-test-timeout.patch', + 'PyTorch-2.9.1_skip-cutlass-addmm-test.patch', 'PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch', 'PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch', 'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch', @@ -149,6 +151,7 @@ checksums = [ {'PyTorch-2.9.1_check-device-avail-test_schedule.patch': '64c28d38ce69147565509add36d310473ce46f14a0a876d38b5049cb7fce9817'}, {'PyTorch-2.9.1_disable-slow-tests.patch': '6b365a3531b0ac5446b5f0e8ab924b5e5742cd0331e6d9ec979118a3ef0ffc09'}, + {'PyTorch-2.9.1_dont-print-test-items.patch': '2b524cf3d557c0672feefc3a7165e5555e549b0720647a84d546f769cea0be07'}, {'PyTorch-2.9.1_fix-hypothesis-deadline.patch': 'f7a130669eee9924a303df9e2bd5743ff023a7d994b7a3e43c86dcccf0206c49'}, {'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch': 'ab408275ec66e836112a50054acc4e789ef38196efeb6137c6061d60d9ac9ead'}, @@ -161,6 +164,8 @@ checksums = [ 'c4dad43a5d76e292bb0cada56ea05e8cbd522e3e83749cf3b2c15cd1e4ff6d7b'}, {'PyTorch-2.9.1_normalize_tree_output.patch': '7d5994580339b73c28de595d9e5a0448db97b7d284f17efd18909e4613d170df'}, {'PyTorch-2.9.1_set-test-timeout.patch': '15fa1149c250b1333b0bc491f659aaf89d5d6eaf6df5ebc81eea545478c1239c'}, + {'PyTorch-2.9.1_skip-cutlass-addmm-test.patch': + '1f81a8a9eea8eda51fc93dff84cd994772febf4fd05d77efbf21b8440dadfd4e'}, {'PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch': 'd8489c192da549083569e09e5f94d2a83c9e41e111b1322f86512a9c5a58c0d9'}, {'PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch': @@ -242,6 +247,9 @@ excluded_tests = { 'test_license', # No triton 
'distributed/test_nvshmem_triton', + # Occasional segfaults on CPU + 'inductor/test_flex_attention', + 'inductor/test_flex_decoding', ] } diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_dont-print-test-items.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_dont-print-test-items.patch new file mode 100644 index 000000000000..b029f0a8a5a8 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_dont-print-test-items.patch @@ -0,0 +1,24 @@ +Reduce verbosity of the test output by not showing all ~270k test names. + +Author: Alexander Grund (TU Dresden) +diff --git a/test/run_test.py b/test/run_test.py +--- a/test/run_test.py ++++ b/test/run_test.py +@@ -623,7 +623,7 @@ def run_test( + test_file, + ) + else: +- command.extend([f"--sc={stepcurrent_key}", "--print-items"]) ++ command.extend([f"--sc={stepcurrent_key}"]) + ret_code, was_rerun = retry_shell( + command, + test_directory, +@@ -725,7 +725,7 @@ def run_test_retries( + + num_failures = defaultdict(int) + +- print_items = ["--print-items"] ++ print_items = [] + sc_command = f"--sc={stepcurrent_key}" + while True: + ret_code, _ = retry_shell( diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-cutlass-addmm-test.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-cutlass-addmm-test.patch new file mode 100644 index 000000000000..aa7e88a859dc --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-cutlass-addmm-test.patch @@ -0,0 +1,16 @@ +The test fails with accuracy issues in at least H100, possibly on CUDA 12.8 in general.
+See https://github.com/pytorch/pytorch/pull/156626 + +Author: Alexander Grund (TU Dresden) +diff --git a/test/inductor/test_cutlass_backend.py b/test/inductor/test_cutlass_backend.py +--- a/test/inductor/test_cutlass_backend.py ++++ b/test/inductor/test_cutlass_backend.py +@@ -613,7 +613,7 @@ class TestCutlassBackend(TestCase): + + torch.testing.assert_close(actual, expected, rtol=1e-2, atol=0.05) + +- @unittest.skipIf(not SM90OrLater, "need sm_90") ++ @unittest.skip("Fails on CUDA 12.8+") + @parametrize("dynamic", (False, True)) + @parametrize("use_aoti", (False, True)) + @parametrize("dtype", (torch.float16, torch.bfloat16)) From f8993db3e2abdbe4fcabba0bae8dca8b37ab5966 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Wed, 11 Mar 2026 12:45:08 +0100 Subject: [PATCH 4/5] Fix using wrong OpenMP library --- .../p/PyTorch/PyTorch-2.9.1-foss-2024a.eb | 3 ++ ...Torch-2.9.1_avoid-using-wrong-libomp.patch | 34 +++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-using-wrong-libomp.patch diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb index 8f6fad5939f2..06ef2a690972 100644 --- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb @@ -64,6 +64,7 @@ patches = [ 'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch', 'PyTorch-2.9.0_update-exptected-output-for-z3-4.13.0.patch', 'PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch', + 'PyTorch-2.9.1_avoid-using-wrong-libomp.patch', 'PyTorch-2.9.1_check-device-avail-test_schedule.patch', 'PyTorch-2.9.1_disable-slow-tests.patch', 'PyTorch-2.9.1_dont-print-test-items.patch', @@ -148,6 +149,8 @@ checksums = [ '5c68e0de73212ed266879f4528a6041ef7ab2f1ac83c6cf7142c4baa78e7664c'}, {'PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch': 
'86ce380e69b3b20e010d817889cb1b825b05b4054a045b00f2ac12161b77d7e4'}, + {'PyTorch-2.9.1_avoid-using-wrong-libomp.patch': + '2fc2bb82cce87ba0ce73718b0502735ecdf360ca6bfac4482396f7f1c51c1866'}, {'PyTorch-2.9.1_check-device-avail-test_schedule.patch': '64c28d38ce69147565509add36d310473ce46f14a0a876d38b5049cb7fce9817'}, {'PyTorch-2.9.1_disable-slow-tests.patch': '6b365a3531b0ac5446b5f0e8ab924b5e5742cd0331e6d9ec979118a3ef0ffc09'}, diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-using-wrong-libomp.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-using-wrong-libomp.patch new file mode 100644 index 000000000000..a9b58a7d8e5c --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-using-wrong-libomp.patch @@ -0,0 +1,34 @@ +When using GCC `libgomp.so` should be used which will be automatically done with `-fopenmp`. +However the custom FindOpenMP searches for `libomp.so` first which ends up being used if found +e.g. on the system in /lib64 + +See https://github.com/pytorch/pytorch/pull/177126 + +Author: Alexander Grund (TU Dresden) + +diff --git a/cmake/Modules/FindOpenMP.cmake b/cmake/Modules/FindOpenMP.cmake +--- a/cmake/Modules/FindOpenMP.cmake ++++ b/cmake/Modules/FindOpenMP.cmake +@@ -289,21 +289,13 @@ function(_OPENMP_GET_FLAGS LANG FLAG_MODE OPENMP_FLAG_VAR OPENMP_LIB_NAMES_VAR) + mark_as_advanced(OpenMP_libomp_LIBRARY) + endif() + +- if (NOT OpenMP_libomp_LIBRARY) +- find_library(OpenMP_libomp_LIBRARY +- NAMES omp gomp iomp5 +- HINTS ${CMAKE_${LANG}_IMPLICIT_LINK_DIRECTORIES} +- DOC "libomp location for OpenMP" +- ) +- mark_as_advanced(OpenMP_libomp_LIBRARY) +- endif() +- + # Use OpenMP_PREFIX if defined + if (NOT OpenMP_libomp_LIBRARY AND NOT "${OpenMP_PREFIX}" STREQUAL "") + find_library(OpenMP_libomp_LIBRARY + NAMES omp gomp iomp5 + HINTS "${OpenMP_PREFIX}/lib" + DOC "libomp location for OpenMP" ++ NO_DEFAULT_PATH + ) + mark_as_advanced(OpenMP_libomp_LIBRARY) + endif() From 5c3c03b3a7d1bd769b7032e352595885c0a4b12d Mon Sep 
17 00:00:00 2001 From: Alexander Grund Date: Fri, 13 Mar 2026 17:17:33 +0100 Subject: [PATCH 5/5] Skip some tests failing on ARM --- .../p/PyTorch/PyTorch-2.9.1-foss-2024a.eb | 15 ++++ ...increase-tolerance-TestDecomp-matmul.patch | 32 +++++++++ ..._skip-cpu_repro-tests-failing-on-ARM.patch | 72 +++++++++++++++++++ ....1_skip-svd-pca-lowrank-tests-on-cpu.patch | 26 +++++++ ...skip-test_optree_graph_break_message.patch | 24 +++++++ ...ch-2.9.1_skip-tests-requiring-MKLDNN.patch | 38 ++++++++++ 6 files changed, 207 insertions(+) create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_increase-tolerance-TestDecomp-matmul.patch create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-cpu_repro-tests-failing-on-ARM.patch create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-svd-pca-lowrank-tests-on-cpu.patch create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_optree_graph_break_message.patch create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-tests-requiring-MKLDNN.patch diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb index 06ef2a690972..83383bd798aa 100644 --- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb @@ -82,6 +82,11 @@ patches = [ 'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch', 'PyTorch-2.9.1_skip-test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32.patch', 'PyTorch-2.9.1_skip-tests-requiring-SM90.patch', + 'PyTorch-2.9.1_increase-tolerance-TestDecomp-matmul.patch', + 'PyTorch-2.9.1_skip-cpu_repro-tests-failing-on-ARM.patch', + 'PyTorch-2.9.1_skip-svd-pca-lowrank-tests-on-cpu.patch', + 'PyTorch-2.9.1_skip-test_optree_graph_break_message.patch', + 'PyTorch-2.9.1_skip-tests-requiring-MKLDNN.patch', ] checksums = [ {'pytorch-v2.9.1.tar.gz': 
'e17504700ebc4c87f9b57059df1c4d790b769458c04db144c7a92aea90f2c92b'}, @@ -179,6 +184,16 @@ checksums = [ 'e57486cc42f3dbcae29753168febc251d070a283229e2d76ccbdf19fee53f06e'}, {'PyTorch-2.9.1_skip-tests-requiring-SM90.patch': '7db02152db2ae70c0fd4c4602fe381e26a74b8e4f7b16b1a3554b2353d761b10'}, + {'PyTorch-2.9.1_increase-tolerance-TestDecomp-matmul.patch': + 'dd82203ce3b6262255aba6b59fb3b547c4c17875d5711f6d3d489aa8f0f59f32'}, + {'PyTorch-2.9.1_skip-cpu_repro-tests-failing-on-ARM.patch': + '99055fde02ca17c1db1cd72f41821387a50901d6cd947161cafa12257b3a1c5a'}, + {'PyTorch-2.9.1_skip-svd-pca-lowrank-tests-on-cpu.patch': + '4fc772293047dc737b99e232b8a8db904aa8e88e3c8b2bcc3602fb723941fb89'}, + {'PyTorch-2.9.1_skip-test_optree_graph_break_message.patch': + '2ef1ad424d5f12a4d0ae06938da623819596cee7c0fb4616008f27583c29494d'}, + {'PyTorch-2.9.1_skip-tests-requiring-MKLDNN.patch': + '03756a8069bad01018f422f41aa24c7c543519fd857db88a0c6de661976c8859'}, ] osdependencies = [OS_PKG_IBVERBS_DEV] diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_increase-tolerance-TestDecomp-matmul.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_increase-tolerance-TestDecomp-matmul.patch new file mode 100644 index 000000000000..9bd54ea4d8da --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_increase-tolerance-TestDecomp-matmul.patch @@ -0,0 +1,32 @@ + +TestDecompCPU.test_comprehensive___rmatmul___cpu_float32, TestDecompCPU.test_comprehensive_matmul_cpu_float32 fail with small tolerance issues: +> Expected 12.534862518310547 but got 12.534895896911621. +> Absolute difference: 3.337860107421875e-05 (up to 1e-05 allowed) +> Relative difference: 2.6628613616990456e-06 (up to 1.3e-06 allowed) + +Increase the tolerances slightly to make them pass. 
+ +Author: Alexander Grund (TU Dresden) + +diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py +--- a/torch/testing/_internal/common_methods_invocations.py ++++ b/torch/testing/_internal/common_methods_invocations.py +@@ -14286,6 +14286,9 @@ op_db: list[OpInfo] = [ + DecorateInfo(toleranceOverride({torch.float32: tol(atol=0, rtol=1e-5)}), + 'TestCommon', 'test_noncontiguous_samples', + device_type='cpu'), ++ DecorateInfo(toleranceOverride({torch.float32: tol(atol=4e-5, rtol=3e-6)}), ++ "TestDecomp", "test_comprehensive", device_type="cpu", ++ ), + DecorateInfo( + toleranceOverride({ + torch.float32: tol(atol=1e-5, rtol=1e-5), +@@ -17690,6 +17693,8 @@ op_db: list[OpInfo] = [ + 'TestMathBits', 'test_conj_view'), + DecorateInfo(toleranceOverride({torch.float32: tol(atol=1e-05, rtol=1.2e-03)}), + 'TestCommon', 'test_noncontiguous_samples'), ++ DecorateInfo(toleranceOverride({torch.float32: tol(atol=4e-05, rtol=3e-06)}), ++ "TestDecomp", "test_comprehensive", device_type="cpu"), + DecorateInfo(toleranceOverride({torch.complex64: tol(atol=1e-05, rtol=1e-05)}), + "TestDecomp", "test_comprehensive", device_type="cuda", + active_if=TEST_WITH_ROCM), diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-cpu_repro-tests-failing-on-ARM.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-cpu_repro-tests-failing-on-ARM.patch new file mode 100644 index 000000000000..ca205deb257e --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-cpu_repro-tests-failing-on-ARM.patch @@ -0,0 +1,72 @@ +Those tests fail with precision issues on ARM which seems to be known: +https://github.com/pytorch/pytorch/pull/171095 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_cpu_repro.py b/test/inductor/test_cpu_repro.py +--- a/test/inductor/test_cpu_repro.py ++++ b/test/inductor/test_cpu_repro.py +@@ -31,6 +31,7 @@ from torch.fx.experimental.proxy_tensor import make_fx + from 
torch.nn import functional as F + from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, ++ IS_ARM64, + IS_FBCODE, + IS_MACOS, + parametrize, +@@ -3245,6 +3246,7 @@ class CPUReproTests(TestCase): + 3, + ) + ++ @unittest.skipIf(IS_ARM64, "Fails on ARM") + @config.patch({"fx_graph_cache": False, "fx_graph_remote_cache": False}) + def test_two_local_buffers_in_outer_loop_fusion(self): + def fn(x): +@@ -3568,6 +3570,7 @@ class CPUReproTests(TestCase): + self.common(m, (x,)) + check_metrics_vec_kernel_count(6) + ++ @unittest.skipIf(IS_ARM64, "Fails on ARM") + @requires_vectorization + @config.patch("cpp.enable_tiling_heuristics", False) + def test_transpose_copy(self): +@@ -3812,6 +3815,7 @@ class CPUReproTests(TestCase): + self.common(fn, (x, y)) + check_metrics_vec_kernel_count(2) + ++ @unittest.skipIf(IS_ARM64, "Fails on ARM") + def test_transpose_mxn_16_16_bf16_fp16(self): + def fn(a, b): + c = a * b +@@ -3885,6 +3889,7 @@ class CPUReproTests(TestCase): + x = torch.rand(4, 5) + self.common(f, (x,)) + ++ @unittest.skipIf(IS_ARM64, "Fails on ARM") + def test_broadcast_scalar_cpp_tile_2d_kernel(self): + # Based on detectron2_maskrcnn backbone (conv2d -> max_pool2d) + s0 = 12 +@@ -4384,6 +4389,7 @@ class CPUReproTests(TestCase): + y = torch.randint(0, 255, (3, 3), dtype=torch.uint8) + self.common(fn, (x, y)) + ++ @unittest.skipIf(IS_ARM64, "Fails on ARM") + def test_float32_to_uint8(self): + # https://github.com/pytorch/pytorch/issues/156788 + @torch.compile +@@ -4868,6 +4874,7 @@ class CPUReproTests(TestCase): + x = torch.randn(1, 4, 2, 2) + self.common(fn, (x,)) + ++ @unittest.skipIf(IS_ARM64, "Fails on ARM") + @parametrize("is_inference", (True, False)) + def test_disabled_amp(self, is_inference): + class M(torch.nn.Module): +@@ -5367,6 +5374,7 @@ class CPUReproTests(TestCase): + code + ) + ++ @unittest.skipIf(IS_ARM64, "Fails on ARM") + @config.patch(freezing=True) + def test_add_layernorm(self): + class Model(torch.nn.Module): diff 
--git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-svd-pca-lowrank-tests-on-cpu.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-svd-pca-lowrank-tests-on-cpu.patch new file mode 100644 index 000000000000..f0934960ac62 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-svd-pca-lowrank-tests-on-cpu.patch @@ -0,0 +1,26 @@ +On ARM those tests fail with +> torch._C._LinAlgError: linalg.svd: The algorithm failed to converge because the input matrix contained non-finite values. + +Traced to OpenBLAS with a fix in OpenBLAS 0.3.30, see https://github.com/pytorch/pytorch/issues/142131 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/test_linalg.py b/test/test_linalg.py +--- a/test/test_linalg.py ++++ b/test/test_linalg.py +@@ -2674,6 +2674,7 @@ class TestLinalg(TestCase): + self.assertRaisesRegex(RuntimeError, "must be different", torch.norm, x, "nuc", (0, 0)) + self.assertRaisesRegex(IndexError, "Dimension out of range", torch.norm, x, "nuc", (0, 2)) + ++ @onlyCUDA + @skipCUDAIfNoCusolver + @skipCPUIfNoLapack + @dtypes(torch.double, torch.cdouble) +@@ -9383,6 +9384,7 @@ scipy_lobpcg | {eq_err_scipy:10.2e} | {eq_err_general_scipy:10.2e} | {iters2: + + run_test((1, 1), (1, 1, 1025)) + ++ @onlyCUDA + @skipCUDAIfNoCusolver + @skipCPUIfNoLapack + def test_pca_lowrank(self, device): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_optree_graph_break_message.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_optree_graph_break_message.patch new file mode 100644 index 000000000000..5eec8929e5db --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_optree_graph_break_message.patch @@ -0,0 +1,24 @@ +Test fails with output mismatch: +> - Explanation: Dynamo cannot trace optree C/C++ function optree._C.PyCapsule.flatten. +> + Explanation: Dynamo cannot trace optree C/C++ function optree._C.pybind11_detail_function_record_v1_system_libstdcpp_gxx_abi_1xxx_use_cxx11_abi_1.flatten. 
+> Hint: Consider using torch.utils._pytree - https://github.com/pytorch/pytorch/blob/main/torch/utils/_pytree.py +> +> - Developer debug context: module: optree._C, qualname: PyCapsule.flatten, skip reason: +> + Developer debug context: module: optree._C, qualname: pybind11_detail_function_record_v1_system_libstdcpp_gxx_abi_1xxx_use_cxx11_abi_1.flatten, skip reason: + +Seems to be related to pybind11 version, GCC version, ... + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/dynamo/test_error_messages.py b/test/dynamo/test_error_messages.py +--- a/test/dynamo/test_error_messages.py ++++ b/test/dynamo/test_error_messages.py +@@ -461,7 +461,7 @@ from user code: + warnings.warn("test")""", + ) + +- @unittest.skipIf(not python_pytree._cxx_pytree_exists, "missing optree package") ++ @unittest.skip("Failes depending on Pybind11/GCC versions") + def test_optree_graph_break_message(self): + import optree + diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-tests-requiring-MKLDNN.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-tests-requiring-MKLDNN.patch new file mode 100644 index 000000000000..65cc3882ef63 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-tests-requiring-MKLDNN.patch @@ -0,0 +1,38 @@ +test_int8_woq_mm fail without MKLDDN at +> self.assertEqual(counters["inductor"]["cpp_templated_kernel_counter"], 1) + +See https://github.com/pytorch/pytorch/pull/177387 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/inductor/test_cpu_select_algorithm.py b/test/inductor/test_cpu_select_algorithm.py +--- a/test/inductor/test_cpu_select_algorithm.py ++++ b/test/inductor/test_cpu_select_algorithm.py +@@ -50,6 +50,11 @@ run_and_get_cpp_code = test_torchinductor.run_and_get_cpp_code + + aten = torch.ops.aten + ++skipIfNoMkldnn = unittest.skipIf( ++ not (torch.backends.mkldnn.enabled and torch.backends.mkldnn.is_available()), ++ "no MKLDNN", ++) ++ + + def patches(fn): + def skip_cache(self, choices, name, 
key, benchmark, hint_override=None): +@@ -1374,6 +1379,7 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm): + @inductor_config.patch({"freezing": True}) + @patches + @torch.no_grad ++ @skipIfNoMkldnn + @dtypes(torch.bfloat16) + @parametrize( + "batch_size", +@@ -1437,6 +1443,7 @@ class TestSelectAlgorithm(BaseTestSelectAlgorithm): + @inductor_config.patch({"freezing": True, "cpp.enable_concat_linear": True}) + @patches + @torch.no_grad ++ @skipIfNoMkldnn + @dtypes(torch.bfloat16) + @parametrize( + "batch_size",