easybuilders · boegel · Mar 19, 2026 · Mar 3, 2026 · Mar 4, 2026 · Mar 5, 2026
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1-foss-2024a.eb
@@ -64,8 +64,10 @@ patches = [
     'PyTorch-2.9.0_skip-unexpected-success-in-test_fake_export.patch',
     'PyTorch-2.9.0_update-exptected-output-for-z3-4.13.0.patch',
     'PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch',
+    'PyTorch-2.9.1_avoid-using-wrong-libomp.patch',
     'PyTorch-2.9.1_check-device-avail-test_schedule.patch',
     'PyTorch-2.9.1_disable-slow-tests.patch',
+    'PyTorch-2.9.1_dont-print-test-items.patch',
     'PyTorch-2.9.1_fix-hypothesis-deadline.patch',
     'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch',
     'PyTorch-2.9.1_fix-test_dist2-decorators.patch',
@@ -74,11 +76,17 @@ patches = [
     'PyTorch-2.9.1_ignore-warning-incompatible-pointer-types.patch',
     'PyTorch-2.9.1_normalize_tree_output.patch',
     'PyTorch-2.9.1_set-test-timeout.patch',
+    'PyTorch-2.9.1_skip-cutlass-addmm-test.patch',
     'PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch',
     'PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch',
     'PyTorch-2.9.1_skip-RingFlexAttentionTest.patch',
     'PyTorch-2.9.1_skip-test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32.patch',
     'PyTorch-2.9.1_skip-tests-requiring-SM90.patch',
+    'PyTorch-2.9.1_increase-tolerance-TestDecomp-matmul.patch',
+    'PyTorch-2.9.1_skip-cpu_repro-tests-failing-on-ARM.patch',
+    'PyTorch-2.9.1_skip-svd-pca-lowrank-tests-on-cpu.patch',
+    'PyTorch-2.9.1_skip-test_optree_graph_break_message.patch',
+    'PyTorch-2.9.1_skip-tests-requiring-MKLDNN.patch',
 ]
 checksums = [
     {'pytorch-v2.9.1.tar.gz': 'e17504700ebc4c87f9b57059df1c4d790b769458c04db144c7a92aea90f2c92b'},
@@ -146,9 +154,12 @@ checksums = [
      '5c68e0de73212ed266879f4528a6041ef7ab2f1ac83c6cf7142c4baa78e7664c'},
     {'PyTorch-2.9.1_avoid-multiprocess-tests-hanging-forever.patch':
      '86ce380e69b3b20e010d817889cb1b825b05b4054a045b00f2ac12161b77d7e4'},
+    {'PyTorch-2.9.1_avoid-using-wrong-libomp.patch':
+     '2fc2bb82cce87ba0ce73718b0502735ecdf360ca6bfac4482396f7f1c51c1866'},
     {'PyTorch-2.9.1_check-device-avail-test_schedule.patch':
      '64c28d38ce69147565509add36d310473ce46f14a0a876d38b5049cb7fce9817'},
-    {'PyTorch-2.9.1_disable-slow-tests.patch': '76e6d8f7366b91a0ddc65f73685f2b09988bb5537d10d294f9bb6a48c7fec3d0'},
+    {'PyTorch-2.9.1_disable-slow-tests.patch': '6b365a3531b0ac5446b5f0e8ab924b5e5742cd0331e6d9ec979118a3ef0ffc09'},
+    {'PyTorch-2.9.1_dont-print-test-items.patch': '2b524cf3d557c0672feefc3a7165e5555e549b0720647a84d546f769cea0be07'},
     {'PyTorch-2.9.1_fix-hypothesis-deadline.patch': 'f7a130669eee9924a303df9e2bd5743ff023a7d994b7a3e43c86dcccf0206c49'},
     {'PyTorch-2.9.1_fix-iteration-in-fligh-reporter.patch':
      'ab408275ec66e836112a50054acc4e789ef38196efeb6137c6061d60d9ac9ead'},
@@ -161,6 +172,8 @@ checksums = [
      'c4dad43a5d76e292bb0cada56ea05e8cbd522e3e83749cf3b2c15cd1e4ff6d7b'},
     {'PyTorch-2.9.1_normalize_tree_output.patch': '7d5994580339b73c28de595d9e5a0448db97b7d284f17efd18909e4613d170df'},
     {'PyTorch-2.9.1_set-test-timeout.patch': '15fa1149c250b1333b0bc491f659aaf89d5d6eaf6df5ebc81eea545478c1239c'},
+    {'PyTorch-2.9.1_skip-cutlass-addmm-test.patch':
+     '1f81a8a9eea8eda51fc93dff84cd994772febf4fd05d77efbf21b8440dadfd4e'},
     {'PyTorch-2.9.1_skip-flex-attention-test_block_mask_non_divisible.patch':
      'd8489c192da549083569e09e5f94d2a83c9e41e111b1322f86512a9c5a58c0d9'},
     {'PyTorch-2.9.1_skip-flex-attention-tests-on-unsupported-cpus.patch':
@@ -171,6 +184,16 @@ checksums = [
      'e57486cc42f3dbcae29753168febc251d070a283229e2d76ccbdf19fee53f06e'},
     {'PyTorch-2.9.1_skip-tests-requiring-SM90.patch':
      '7db02152db2ae70c0fd4c4602fe381e26a74b8e4f7b16b1a3554b2353d761b10'},
+    {'PyTorch-2.9.1_increase-tolerance-TestDecomp-matmul.patch':
+     'dd82203ce3b6262255aba6b59fb3b547c4c17875d5711f6d3d489aa8f0f59f32'},
+    {'PyTorch-2.9.1_skip-cpu_repro-tests-failing-on-ARM.patch':
+     '99055fde02ca17c1db1cd72f41821387a50901d6cd947161cafa12257b3a1c5a'},
+    {'PyTorch-2.9.1_skip-svd-pca-lowrank-tests-on-cpu.patch':
+     '4fc772293047dc737b99e232b8a8db904aa8e88e3c8b2bcc3602fb723941fb89'},
+    {'PyTorch-2.9.1_skip-test_optree_graph_break_message.patch':
+     '2ef1ad424d5f12a4d0ae06938da623819596cee7c0fb4616008f27583c29494d'},
+    {'PyTorch-2.9.1_skip-tests-requiring-MKLDNN.patch':
+     '03756a8069bad01018f422f41aa24c7c543519fd857db88a0c6de661976c8859'},
 ]
 
 osdependencies = [OS_PKG_IBVERBS_DEV]
@@ -183,7 +206,6 @@ builddependencies = [
     ('parameterized', '0.9.0'),
     ('pytest-flakefinder', '1.1.0'),
     ('pytest-rerunfailures', '15.0'),
-    ('pytest-shard', '0.1.2'),
     ('pytest-subtests', '0.13.1'),
     ('tlparse', '0.4.0'),
     ('optree', '0.14.1'),
@@ -243,6 +265,9 @@ excluded_tests = {
         'test_license',
         # No triton
         'distributed/test_nvshmem_triton',
+        # Occasional segfaults on CPU
+        'inductor/test_flex_attention',
+        'inductor/test_flex_decoding',
     ]
 }
 

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-using-wrong-libomp.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_avoid-using-wrong-libomp.patch
@@ -0,0 +1,34 @@
+When using GCC `libgomp.so` should be used which will be automatically done with `-fopenmp`.
+However the custom FindOpenMP searches for `libomp.so` first which ends up being used if found
+e.g. on the system in /lib64
+
+See https://github.com/pytorch/pytorch/pull/177126
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/cmake/Modules/FindOpenMP.cmake b/cmake/Modules/FindOpenMP.cmake
+--- a/cmake/Modules/FindOpenMP.cmake
++++ b/cmake/Modules/FindOpenMP.cmake
+@@ -289,21 +289,13 @@ function(_OPENMP_GET_FLAGS LANG FLAG_MODE OPENMP_FLAG_VAR OPENMP_LIB_NAMES_VAR)
+       mark_as_advanced(OpenMP_libomp_LIBRARY)
+     endif()
+
+-    if (NOT OpenMP_libomp_LIBRARY)
+-      find_library(OpenMP_libomp_LIBRARY
+-        NAMES omp gomp iomp5
+-        HINTS ${CMAKE_${LANG}_IMPLICIT_LINK_DIRECTORIES}
+-        DOC "libomp location for OpenMP"
+-      )
+-      mark_as_advanced(OpenMP_libomp_LIBRARY)
+-    endif()
+-
+     # Use OpenMP_PREFIX if defined
+     if (NOT OpenMP_libomp_LIBRARY AND NOT "${OpenMP_PREFIX}" STREQUAL "")
+       find_library(OpenMP_libomp_LIBRARY
+         NAMES omp gomp iomp5
+         HINTS "${OpenMP_PREFIX}/lib"
+         DOC "libomp location for OpenMP"
++        NO_DEFAULT_PATH
+       )
+       mark_as_advanced(OpenMP_libomp_LIBRARY)
+     endif()
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_disable-slow-tests.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_disable-slow-tests.patch
@@ -5,10 +5,14 @@ Enable the default for non-CI environments to cut down testing time.
 Don't check for SANDCASTLE when determining whether to skip disabled tests.
 However, the disabled-tests JSON file needs to be downloaded from S3 and placed at "tests/.pytorch-disabled-tests.json".
 
+This file may be modified and/or redownloaded in import_test_stats.py
+Disable this by just returning it's content as-if it is always up to date.
+If it doesn't exist the failure will be handled by the calling function.
+This modification removes the PR number field, so make it optional in the tuple expansion to allow either format.
+
 Author: Alexander Grund (TU Dresden)
 
 diff --git a/test/run_test.py b/test/run_test.py
-index 44a15d4ab2c..269d4206f3e 100755
 --- a/test/run_test.py
 +++ b/test/run_test.py
 @@ -531,7 +531,7 @@ def run_test(
@@ -20,8 +24,19 @@ index 44a15d4ab2c..269d4206f3e 100755
          ci_args = ["--import-slow-tests", "--import-disabled-tests"]
          if RERUN_DISABLED_TESTS:
              ci_args.append("--rerun-disabled-tests")
+diff --git a/tools/stats/import_test_stats.py b/tools/stats/import_test_stats.py
+--- a/tools/stats/import_test_stats.py
++++ b/tools/stats/import_test_stats.py
+@@ -47,6 +47,8 @@ def fetch_and_cache(
+     Path(dirpath).mkdir(exist_ok=True)
+
+     path = os.path.join(dirpath, name)
++    with open(path) as f:
++        return cast(dict[str, Any], json.load(f))
+     print(f"Downloading {url} to {path}")
+
+     def is_cached_file_valid() -> bool:
 diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
-index bfc568bc146..7ef37cccccb 100644
 --- a/torch/testing/_internal/common_utils.py
 +++ b/torch/testing/_internal/common_utils.py
 @@ -2722,11 +2722,11 @@ def check_if_enable(test: unittest.TestCase):
@@ -32,9 +47,10 @@ index bfc568bc146..7ef37cccccb 100644
 +    if True:
          should_skip = False
          skip_msg = ""
- 
+-
 -        for disabled_test, (issue_url, platforms) in disabled_tests_dict.items():
-+        for disabled_test, (pr_num, issue_url, platforms) in disabled_tests_dict.items():
++        # Allow for a potentially existing PR number
++        for disabled_test, (*pr_num, issue_url, platforms) in disabled_tests_dict.items():
              if matches_test(disabled_test):
                  platform_to_conditional: dict = {
                      "mac": IS_MACOS,
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_dont-print-test-items.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_dont-print-test-items.patch
@@ -0,0 +1,24 @@
+Reduce verbosity of the test output by not showing all ~270k test names.
+
+Author: Alexander Grund (TU Dresden)
+diff --git a/test/run_test.py b/test/run_test.py
+--- a/test/run_test.py
++++ b/test/run_test.py
+@@ -623,7 +623,7 @@ def run_test(
+                 test_file,
+             )
+         else:
+-            command.extend([f"--sc={stepcurrent_key}", "--print-items"])
++            command.extend([f"--sc={stepcurrent_key}"])
+             ret_code, was_rerun = retry_shell(
+                 command,
+                 test_directory,
+@@ -725,7 +725,7 @@ def run_test_retries(
+
+     num_failures = defaultdict(int)
+
+-    print_items = ["--print-items"]
++    print_items = []
+     sc_command = f"--sc={stepcurrent_key}"
+     while True:
+         ret_code, _ = retry_shell(
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_increase-tolerance-TestDecomp-matmul.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_increase-tolerance-TestDecomp-matmul.patch
@@ -0,0 +1,32 @@
+
+TestDecompCPU.test_comprehensive___rmatmul___cpu_float32, TestDecompCPU.test_comprehensive_matmul_cpu_float32 fail with small tolerance issues:
+>           Expected 12.534862518310547 but got 12.534895896911621.
+>           Absolute difference: 3.337860107421875e-05 (up to 1e-05 allowed)
+>           Relative difference: 2.6628613616990456e-06 (up to 1.3e-06 allowed)
+
+Increase the tolerances slightly to make them pass.
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
+--- a/torch/testing/_internal/common_methods_invocations.py
++++ b/torch/testing/_internal/common_methods_invocations.py
+@@ -14286,6 +14286,9 @@ op_db: list[OpInfo] = [
+                DecorateInfo(toleranceOverride({torch.float32: tol(atol=0, rtol=1e-5)}),
+                             'TestCommon', 'test_noncontiguous_samples',
+                             device_type='cpu'),
++               DecorateInfo(toleranceOverride({torch.float32: tol(atol=4e-5, rtol=3e-6)}),
++                   "TestDecomp", "test_comprehensive", device_type="cpu",
++               ),
+                DecorateInfo(
+                    toleranceOverride({
+                        torch.float32: tol(atol=1e-5, rtol=1e-5),
+@@ -17690,6 +17693,8 @@ op_db: list[OpInfo] = [
+                             'TestMathBits', 'test_conj_view'),
+                DecorateInfo(toleranceOverride({torch.float32: tol(atol=1e-05, rtol=1.2e-03)}),
+                             'TestCommon', 'test_noncontiguous_samples'),
++               DecorateInfo(toleranceOverride({torch.float32: tol(atol=4e-05, rtol=3e-06)}),
++                            "TestDecomp", "test_comprehensive", device_type="cpu"),
+                DecorateInfo(toleranceOverride({torch.complex64: tol(atol=1e-05, rtol=1e-05)}),
+                             "TestDecomp", "test_comprehensive", device_type="cuda",
+                             active_if=TEST_WITH_ROCM),
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-cpu_repro-tests-failing-on-ARM.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-cpu_repro-tests-failing-on-ARM.patch
@@ -0,0 +1,72 @@
+Those tests fail with precision issues on ARM which seems to be known:
+https://github.com/pytorch/pytorch/pull/171095
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/inductor/test_cpu_repro.py b/test/inductor/test_cpu_repro.py
+--- a/test/inductor/test_cpu_repro.py
++++ b/test/inductor/test_cpu_repro.py
+@@ -31,6 +31,7 @@ from torch.fx.experimental.proxy_tensor import make_fx
+ from torch.nn import functional as F
+ from torch.testing._internal.common_utils import (
+     instantiate_parametrized_tests,
++    IS_ARM64,
+     IS_FBCODE,
+     IS_MACOS,
+     parametrize,
+@@ -3245,6 +3246,7 @@ class CPUReproTests(TestCase):
+                 3,
+             )
+
++    @unittest.skipIf(IS_ARM64, "Fails on ARM")
+     @config.patch({"fx_graph_cache": False, "fx_graph_remote_cache": False})
+     def test_two_local_buffers_in_outer_loop_fusion(self):
+         def fn(x):
+@@ -3568,6 +3570,7 @@ class CPUReproTests(TestCase):
+                     self.common(m, (x,))
+                     check_metrics_vec_kernel_count(6)
+
++    @unittest.skipIf(IS_ARM64, "Fails on ARM")
+     @requires_vectorization
+     @config.patch("cpp.enable_tiling_heuristics", False)
+     def test_transpose_copy(self):
+@@ -3812,6 +3815,7 @@ class CPUReproTests(TestCase):
+         self.common(fn, (x, y))
+         check_metrics_vec_kernel_count(2)
+
++    @unittest.skipIf(IS_ARM64, "Fails on ARM")
+     def test_transpose_mxn_16_16_bf16_fp16(self):
+         def fn(a, b):
+             c = a * b
+@@ -3885,6 +3889,7 @@ class CPUReproTests(TestCase):
+         x = torch.rand(4, 5)
+         self.common(f, (x,))
+
++    @unittest.skipIf(IS_ARM64, "Fails on ARM")
+     def test_broadcast_scalar_cpp_tile_2d_kernel(self):
+         # Based on detectron2_maskrcnn backbone (conv2d -> max_pool2d)
+         s0 = 12
+@@ -4384,6 +4389,7 @@ class CPUReproTests(TestCase):
+         y = torch.randint(0, 255, (3, 3), dtype=torch.uint8)
+         self.common(fn, (x, y))
+
++    @unittest.skipIf(IS_ARM64, "Fails on ARM")
+     def test_float32_to_uint8(self):
+         # https://github.com/pytorch/pytorch/issues/156788
+         @torch.compile
+@@ -4868,6 +4874,7 @@ class CPUReproTests(TestCase):
+         x = torch.randn(1, 4, 2, 2)
+         self.common(fn, (x,))
+
++    @unittest.skipIf(IS_ARM64, "Fails on ARM")
+     @parametrize("is_inference", (True, False))
+     def test_disabled_amp(self, is_inference):
+         class M(torch.nn.Module):
+@@ -5367,6 +5374,7 @@ class CPUReproTests(TestCase):
+             code
+         )
+
++    @unittest.skipIf(IS_ARM64, "Fails on ARM")
+     @config.patch(freezing=True)
+     def test_add_layernorm(self):
+         class Model(torch.nn.Module):
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-cutlass-addmm-test.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-cutlass-addmm-test.patch
@@ -0,0 +1,16 @@
+The test fails with accuracy issues in at least H100, possibly on CUDA 12.8 in general.
+See https://github.com/pytorch/pytorch/pull/156626
+
+Author: Alexander Grund (TU Dresden)
+diff --git a/test/inductor/test_cutlass_backend.py b/test/inductor/test_cutlass_backend.py
+--- a/test/inductor/test_cutlass_backend.py
++++ b/test/inductor/test_cutlass_backend.py
+@@ -613,7 +613,7 @@ class TestCutlassBackend(TestCase):
+
+             torch.testing.assert_close(actual, expected, rtol=1e-2, atol=0.05)
+
+-    @unittest.skipIf(not SM90OrLater, "need sm_90")
++    @unittest.skip("Fails on CUDA 12.8+")
+     @parametrize("dynamic", (False, True))
+     @parametrize("use_aoti", (False, True))
+     @parametrize("dtype", (torch.float16, torch.bfloat16))
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-svd-pca-lowrank-tests-on-cpu.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-svd-pca-lowrank-tests-on-cpu.patch
@@ -0,0 +1,26 @@
+On ARM those tests fail with
+> torch._C._LinAlgError: linalg.svd: The algorithm failed to converge because the input matrix contained non-finite values.
+
+Traced to OpenBLAS with a fix in OpenBLAS 0.3.30, see https://github.com/pytorch/pytorch/issues/142131
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/test_linalg.py b/test/test_linalg.py
+--- a/test/test_linalg.py
++++ b/test/test_linalg.py
+@@ -2674,6 +2674,7 @@ class TestLinalg(TestCase):
+         self.assertRaisesRegex(RuntimeError, "must be different", torch.norm, x, "nuc", (0, 0))
+         self.assertRaisesRegex(IndexError, "Dimension out of range", torch.norm, x, "nuc", (0, 2))
+
++    @onlyCUDA
+     @skipCUDAIfNoCusolver
+     @skipCPUIfNoLapack
+     @dtypes(torch.double, torch.cdouble)
+@@ -9383,6 +9384,7 @@ scipy_lobpcg  | {eq_err_scipy:10.2e}  | {eq_err_general_scipy:10.2e}  | {iters2:
+
+         run_test((1, 1), (1, 1, 1025))
+
++    @onlyCUDA
+     @skipCUDAIfNoCusolver
+     @skipCPUIfNoLapack
+     def test_pca_lowrank(self, device):
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_optree_graph_break_message.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.9.1_skip-test_optree_graph_break_message.patch
@@ -0,0 +1,24 @@
+Test fails with output mismatch:
+> -   Explanation: Dynamo cannot trace optree C/C++ function optree._C.PyCapsule.flatten.
+> +   Explanation: Dynamo cannot trace optree C/C++ function optree._C.pybind11_detail_function_record_v1_system_libstdcpp_gxx_abi_1xxx_use_cxx11_abi_1.flatten.
+>     Hint: Consider using torch.utils._pytree - https://github.com/pytorch/pytorch/blob/main/torch/utils/_pytree.py
+>   
+> -   Developer debug context: module: optree._C, qualname: PyCapsule.flatten, skip reason: <missing reason>
+> +   Developer debug context: module: optree._C, qualname: pybind11_detail_function_record_v1_system_libstdcpp_gxx_abi_1xxx_use_cxx11_abi_1.flatten, skip reason: <missing reason>
+
+Seems to be related to pybind11 version, GCC version, ...
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/dynamo/test_error_messages.py b/test/dynamo/test_error_messages.py
+--- a/test/dynamo/test_error_messages.py
++++ b/test/dynamo/test_error_messages.py
+@@ -461,7 +461,7 @@ from user code:
+     warnings.warn("test")""",
+         )
+
+-    @unittest.skipIf(not python_pytree._cxx_pytree_exists, "missing optree package")
++    @unittest.skip("Failes depending on Pybind11/GCC versions")
+     def test_optree_graph_break_message(self):
+         import optree
+