diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2022a-CUDA-11.7.0.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2022a-CUDA-11.7.0.eb
new file mode 100644
index 000000000000..515a5b6c8beb
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2022a-CUDA-11.7.0.eb
@@ -0,0 +1,232 @@
+name = 'PyTorch'
+version = '2.1.2'
+versionsuffix = '-CUDA-%(cudaver)s'  # resolves to '-CUDA-11.7.0', i.e. the version of the CUDA dependency
+
+homepage = 'https://pytorch.org/'
+description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
+PyTorch is a deep learning framework that puts Python first."""
+
+toolchain = {'name': 'foss', 'version': '2022a'}
+
+source_urls = [GITHUB_RELEASE]
+sources = ['%(namelower)s-v%(version)s.tar.gz']  # i.e. pytorch-v2.1.2.tar.gz, the first checksums entry
+patches = [
+    'PyTorch-1.7.0_disable-dev-shm-test.patch',
+    'PyTorch-1.11.1_skip-test_init_from_local_shards.patch',
+    'PyTorch-1.12.1_add-hypothesis-suppression.patch',
+    'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch',
+    'PyTorch-1.12.1_fix-TestTorch.test_to.patch',
+    'PyTorch-1.12.1_skip-test_round_robin.patch',
+    'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch',
+    'PyTorch-1.13.1_fix-protobuf-dependency.patch',
+    'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch',
+    'PyTorch-1.13.1_skip-failing-singular-grad-test.patch',
+    'PyTorch-1.13.1_skip-tests-without-fbgemm.patch',
+    'PyTorch-2.0.1_avoid-test_quantization-failures.patch',
+    'PyTorch-2.0.1_fix-skip-decorators.patch',
+    'PyTorch-2.0.1_fix-ub-in-inductor-codegen.patch',
+    'PyTorch-2.0.1_fix-vsx-loadu.patch',
+    'PyTorch-2.0.1_no-cuda-stubs-rpath.patch',
+    'PyTorch-2.0.1_skip-failing-gradtest.patch',
+    'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch',
+    'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch',
+    'PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch',
+    'PyTorch-2.1.0_fix-bufferoverflow-in-oneDNN.patch',
+    'PyTorch-2.1.0_fix-validationError-output-test.patch',
+    'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch',
+    'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch',
+    'PyTorch-2.1.0_remove-sparse-csr-nnz-overflow-test.patch',
+    'PyTorch-2.1.0_remove-test-requiring-online-access.patch',
+    'PyTorch-2.1.0_skip-diff-test-on-ppc.patch',
+    'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch',
+    'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch',
+    'PyTorch-2.1.0_skip-test_linear_fp32-without-MKL.patch',
+    'PyTorch-2.1.0_skip-test_wrap_bad.patch',
+    'PyTorch-2.1.2_add-cuda-skip-markers.patch',
+    'PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch',
+    'PyTorch-2.1.2_fix-device-mesh-check.patch',
+    'PyTorch-2.1.2_fix-fsdp-tp-integration-test.patch',
+    ('PyTorch-2.1.2_fix-kineto-cuda-11.7-compat.patch', 'third_party/kineto'),  # applied inside that subdirectory
+    'PyTorch-2.1.2_fix-test_cuda-non-x86.patch',
+    'PyTorch-2.1.2_fix-test_extension_backend-without-vectorization.patch',
+    'PyTorch-2.1.2_fix-test_memory_profiler.patch',
+    'PyTorch-2.1.2_fix-test_parallelize_api.patch',
+    'PyTorch-2.1.2_fix-test-tp_random_state.patch',
+    'PyTorch-2.1.2_fix-test_torchinductor-rounding.patch',
+    'PyTorch-2.1.2_fix-vsx-vector-abs.patch',
+    'PyTorch-2.1.2_fix-vsx-vector-div.patch',
+    'PyTorch-2.1.2_fix-with_temp_dir-decorator.patch',
+    'PyTorch-2.1.2_fix-wrong-device-mesh-size-in-tests.patch',
+    'PyTorch-2.1.2_relax-cuda-tolerances.patch',
+    'PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch',
+    'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch',
+    'PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch',
+    'PyTorch-2.1.2_skip-test_fsdp_tp_checkpoint_integration.patch',
+    'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch',
+]
+checksums = [
+    {'pytorch-v2.1.2.tar.gz': '85effbcce037bffa290aea775c9a4bad5f769cb229583450c40055501ee1acd7'},
+    {'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'},
+    {'PyTorch-1.11.1_skip-test_init_from_local_shards.patch':
+     '4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7'},
+    {'PyTorch-1.12.1_add-hypothesis-suppression.patch':
+     'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'},
+    {'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch':
+     '1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83'},
+    {'PyTorch-1.12.1_fix-TestTorch.test_to.patch': '75f27987c3f25c501e719bd2b1c70a029ae0ee28514a97fe447516aee02b1535'},
+    {'PyTorch-1.12.1_skip-test_round_robin.patch': '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349'},
+    {'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch':
+     '5c7be91a6096083a0b1315efe0001537499c600f1f569953c6a2c7f4cc1d0910'},
+    {'PyTorch-1.13.1_fix-protobuf-dependency.patch':
+     '8bd755a0cab7233a243bc65ca57c9630dfccdc9bf8c9792f0de4e07a644fcb00'},
+    {'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch':
+     'bdde0f2105215c95a54de64ec4b1a4520528510663174fef6d5b900eb1db3937'},
+    {'PyTorch-1.13.1_skip-failing-singular-grad-test.patch':
+     '72688a57b2bb617665ad1a1d5e362c5111ae912c10936bb38a089c0204729f48'},
+    {'PyTorch-1.13.1_skip-tests-without-fbgemm.patch':
+     '481e595f673baf8ae58b41697a6792b83048b0264aa79b422f48cd8c22948bb7'},
+    {'PyTorch-2.0.1_avoid-test_quantization-failures.patch':
+     '02e3f47e4ed1d7d6077e26f1ae50073dc2b20426269930b505f4aefe5d2f33cd'},
+    {'PyTorch-2.0.1_fix-skip-decorators.patch': '2039012cef45446065e1a2097839fe20bb29fe3c1dcc926c3695ebf29832e920'},
+    {'PyTorch-2.0.1_fix-ub-in-inductor-codegen.patch':
+     '1b37194f55ae678f3657b8728dfb896c18ffe8babe90987ce468c4fa9274f357'},
+    {'PyTorch-2.0.1_fix-vsx-loadu.patch': 'a0ffa61da2d47c6acd09aaf6d4791e527d8919a6f4f1aa7ed38454cdcadb1f72'},
+    {'PyTorch-2.0.1_no-cuda-stubs-rpath.patch': '8902e58a762240f24cdbf0182e99ccdfc2a93492869352fcb4ca0ec7e407f83a'},
+    {'PyTorch-2.0.1_skip-failing-gradtest.patch': '8030bdec6ba49b057ab232d19a7f1a5e542e47e2ec340653a246ec9ed59f8bc1'},
+    {'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch':
+     '7047862abc1abaff62954da59700f36d4f39fcf83167a638183b1b7f8fec78ae'},
+    {'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch':
+     '166c134573a95230e39b9ea09ece3ad8072f39d370c9a88fb2a1e24f6aaac2b5'},
+    {'PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch':
+     'd895018ebdfd46e65d9f7645444a3b4c5bbfe3d533a08db559a04be34e01e478'},
+    {'PyTorch-2.1.0_fix-bufferoverflow-in-oneDNN.patch':
+     'b15b1291a3c37bf6a4982cfbb3483f693acb46a67bc0912b383fd98baf540ccf'},
+    {'PyTorch-2.1.0_fix-validationError-output-test.patch':
+     '7eba0942afb121ed92fac30d1529447d892a89eb3d53c565f8e9d480e95f692b'},
+    {'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch':
+     '3793b4b878be1abe7791efcbd534774b87862cfe7dc4774ca8729b6cabb39e7e'},
+    {'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch':
+     'aef38adf1210d0c5455e91d7c7a9d9e5caad3ae568301e0ba9fc204309438e7b'},
+    {'PyTorch-2.1.0_remove-sparse-csr-nnz-overflow-test.patch':
+     '0ac36411e76506b3354c85a8a1260987f66af947ee52ffc64230aee1fa02ea8b'},
+    {'PyTorch-2.1.0_remove-test-requiring-online-access.patch':
+     '35184b8c5a1b10f79e511cc25db3b8a5585a5d58b5d1aa25dd3d250200b14fd7'},
+    {'PyTorch-2.1.0_skip-diff-test-on-ppc.patch': '394157dbe565ffcbc1821cd63d05930957412156cc01e949ef3d3524176a1dda'},
+    {'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch':
+     '6298daf9ddaa8542850eee9ea005f28594ab65b1f87af43d8aeca1579a8c4354'},
+    {'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch':
+     '5229ca88a71db7667a90ddc0b809b2c817698bd6e9c5aaabd73d3173cf9b99fe'},
+    {'PyTorch-2.1.0_skip-test_linear_fp32-without-MKL.patch':
+     '5dcc79883b6e3ec0a281a8e110db5e0a5880de843bb05653589891f16473ead5'},
+    {'PyTorch-2.1.0_skip-test_wrap_bad.patch': 'b8583125ee94e553b6f77c4ab4bfa812b89416175dc7e9b7390919f3b485cb63'},
+    {'PyTorch-2.1.2_add-cuda-skip-markers.patch': 'd007d6d0cdb533e7d01f503e9055218760123a67c1841c57585385144be18c9a'},
+    {'PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch':
+     'c164357efa4ce88095376e590ba508fc1daa87161e1e59544eda56daac7f2847'},
+    {'PyTorch-2.1.2_fix-device-mesh-check.patch': 'c0efc288bf3d9a9a3c8bbd2691348a589a2677ea43880a8c987db91c8de4806b'},
+    {'PyTorch-2.1.2_fix-fsdp-tp-integration-test.patch':
+     'f583532c59f35f36998851957d501b3ac8c883884efd61bbaa308db55cb6bdcd'},
+    {'PyTorch-2.1.2_fix-kineto-cuda-11.7-compat.patch':
+     '742f8e9b911b6cde19cdc05804d60a043e7cb936e994e66a6fe6a9490c53e34f'},
+    {'PyTorch-2.1.2_fix-test_cuda-non-x86.patch': '1ed76fcc87e6c50606ac286487292a3d534707068c94af74c3a5de8153fa2c2c'},
+    {'PyTorch-2.1.2_fix-test_extension_backend-without-vectorization.patch':
+     'cd1455495886a7d6b2d30d48736eb0103fded21e2e36de6baac719b9c52a1c92'},
+    {'PyTorch-2.1.2_fix-test_memory_profiler.patch':
+     '30b0c9355636c0ab3dedae02399789053825dc3835b4d7dac6e696767772b1ce'},
+    {'PyTorch-2.1.2_fix-test_parallelize_api.patch':
+     'f8387a1693af344099c806981ca38df1306d7f4847d7d44713306338384b1cfd'},
+    {'PyTorch-2.1.2_fix-test-tp_random_state.patch':
+     'da352e32471ae9a68920c91a122c7194f3c53d947536116e33216d3ae6b15e61'},
+    {'PyTorch-2.1.2_fix-test_torchinductor-rounding.patch':
+     'a0ef99192ee2ad1509c78a8377023d5be2b5fddb16f84063b7c9a0b53d979090'},
+    {'PyTorch-2.1.2_fix-vsx-vector-abs.patch': 'd67d32407faed7dc1dbab4bba0e2f7de36c3db04560ced35c94caf8d84ade886'},
+    {'PyTorch-2.1.2_fix-vsx-vector-div.patch': '11f497a6892eb49b249a15320e4218e0d7ac8ae4ce67de39e4a018a064ca1acc'},
+    {'PyTorch-2.1.2_fix-with_temp_dir-decorator.patch':
+     '90bd001e034095329277d70c6facc4026b4ce6d7f8b8d6aa81c0176eeb462eb1'},
+    {'PyTorch-2.1.2_fix-wrong-device-mesh-size-in-tests.patch':
+     '07a5e4233d02fb6348872838f4d69573c777899c6f0ea4e39ae23c08660d41e5'},
+    {'PyTorch-2.1.2_relax-cuda-tolerances.patch': '554ad09787f61080fafdb84216e711e32327aa357e2a9c40bb428eb6503dee6e'},
+    {'PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch':
+     'e6a1efe3d127fcbf4723476a7a1c01cfcf2ccb16d1fb250f478192623e8b6a15'},
+    {'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch':
+     '7ace835af60c58d9e0754a34c19d4b9a0c3a531f19e5d0eba8e2e49206eaa7eb'},
+    {'PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch':
+     '6cf711bf26518550903b09ed4431de9319791e79d61aab065785d6608fd5cc88'},
+    {'PyTorch-2.1.2_skip-test_fsdp_tp_checkpoint_integration.patch':
+     '943ee92f5fd518f608a59e43fe426b9bb45d7e7ad0ba04639e516db2d61fa57d'},
+    {'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch':
+     'fb96eefabf394617bbb3fbd3a7a7c1aa5991b3836edc2e5d2a30e708bfe49ba1'},
+]
+
+osdependencies = [OS_PKG_IBVERBS_DEV]
+
+builddependencies = [
+    ('CMake', '3.23.1'),
+    ('hypothesis', '6.46.7'),
+    # For tests
+    ('pytest-flakefinder', '1.1.0'),
+    ('pytest-rerunfailures', '11.1'),
+    ('pytest-shard', '0.1.2'),
+    ('pytest-xdist', '2.5.0'),
+    ('unittest-xml-reporting', '3.1.0'),
+]
+
+dependencies = [
+    ('CUDA', '11.7.0', '', SYSTEM),  # 4th element: resolved with the SYSTEM toolchain instead of foss
+    ('cuDNN', '8.4.1.50', '-CUDA-%(cudaver)s', SYSTEM),
+    ('magma', '2.6.2', '-CUDA-%(cudaver)s'),
+    ('NCCL', '2.12.12', '-CUDA-%(cudaver)s'),
+    ('Ninja', '1.10.2'),  # Required for JIT compilation of C++ extensions
+    ('Python', '3.10.4'),
+    ('protobuf', '3.19.4'),
+    ('protobuf-python', '3.19.4'),
+    ('pybind11', '2.9.2'),
+    ('SciPy-bundle', '2022.05'),
+    ('typing-extensions', '4.3.0'),
+    ('PyYAML', '6.0'),
+    ('MPFR', '4.1.0'),
+    ('GMP', '6.2.1'),
+    ('numactl', '2.0.14'),
+    ('FFmpeg', '4.4.2'),
+    ('Pillow', '9.1.1'),
+    ('expecttest', '0.1.3'),
+    ('networkx', '2.8.4'),
+    ('sympy', '1.10.1'),
+    ('Z3', '4.10.2', '-Python-%(pyver)s'),
+]
+
+buildcmd = '%(python)s setup.py build'  # Run the (long) build in the build step
+
+excluded_tests = {
+    '': [  # NOTE(review): '' presumably means "all architectures" - confirm in the PyTorch easyblock
+        # This test seems to take too long on NVIDIA Ampere at least.
+        'distributed/test_distributed_spawn',
+        # Broken on CUDA 11.6/11.7: https://github.com/pytorch/pytorch/issues/75375
+        'distributions/test_constraints',
+        # Needs xdoctest, which is not among the dependencies of this easyconfig
+        'doctests',
+        # Fails on Intel Broadwell CPUs
+        # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
+        'test_native_mha',
+        # Fails intermittently on various systems
+        # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
+        'distributed/rpc/test_tensorpipe_agent',
+    ]
+}
+
+runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error  --verbose %(excluded_tests)s'
+
+# test_quantization in particular has a few corner cases that are triggered by random input values and
+# cannot easily be avoided, see https://github.com/pytorch/pytorch/issues/107030
+# So allow a small number of tests to fail, as the tests "usually" succeed
+max_failed_tests = 2
+
+# Drop the readelf check once TestRPATH (https://github.com/pytorch/pytorch/pull/109493) is in the PyTorch test suite.
+# '%%' survives the '% SHLIB_EXT' formatting as '%', leaving %(pyshortver)s for EasyBuild to resolve.
+local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT
+sanity_check_commands = [  # succeeds only if an RPATH/RUNPATH line without 'stubs' is present
+    "readelf -d %s | grep -E 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2,  # egrep is obsolescent
+]
+
+tests = ['PyTorch-check-cpp-extension.py']
+
+moduleclass = 'ai'
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-fsdp-tp-integration-test.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-fsdp-tp-integration-test.patch
new file mode 100644
index 000000000000..5a54cac4d0b7
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-fsdp-tp-integration-test.patch
@@ -0,0 +1,30 @@
+This test seems to expect at most 4 GPUs.
+Especially when the number of GPUs is not a power of 2 (e.g. 6) test_fsdp_tp_integration_tensor_parallel_size_2_cpu_offload_CPUOffload(offload_params=False) fails:
+
+torch.testing._internal.common_distributed: [ERROR]   File "/dev/shm//pytorch/test/distributed/fsdp/test_fsdp_tp_integration.py", line 157, in _sync_tp_grads
+torch.testing._internal.common_distributed: [ERROR]     per_param_masks = unsharded_zeros.split(splits)
+torch.testing._internal.common_distributed: [ERROR]   File "/tmp/easybuild-install/lib/python3.10/site-packages/torch/_tensor.py", line 864, in split
+torch.testing._internal.common_distributed: [ERROR]     return torch._VF.split_with_sizes(self, split_size, dim)
+torch.testing._internal.common_distributed: [ERROR] RuntimeError: split_with_sizes expects split_sizes to sum exactly to 105 (input tensor's size at dimension 0), but got split_sizes=[20, 4, 16, 4, 48, 12]
+
+See https://github.com/pytorch/pytorch/issues/141237
+
+Limitting to 4 GPUs seems to work.
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/distributed/fsdp/test_fsdp_tp_integration.py b/test/distributed/fsdp/test_fsdp_tp_integration.py
+index bc7a4aef4a3..61eb13162f2 100644
+--- a/test/distributed/fsdp/test_fsdp_tp_integration.py
++++ b/test/distributed/fsdp/test_fsdp_tp_integration.py
+@@ -71,6 +71,10 @@ class SimpleModel(torch.nn.Module):
+ 
+ 
+ class TestTPFSDPIntegration(FSDPTest):
++    @property
++    def world_size(self):
++        return min(4, super().world_size)
++
+     def _get_params_and_sharding_info(
+         self,
+         model: SimpleModel,
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-kineto-cuda-11.7-compat.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-kineto-cuda-11.7-compat.patch
new file mode 100644
index 000000000000..16a5af44ba38
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-kineto-cuda-11.7-compat.patch
@@ -0,0 +1,51 @@
+From 288e9553c6579964cd7b765a34f7f9c6f51c5c2b Mon Sep 17 00:00:00 2001
+From: Brian Coutinho <bcoutinho@meta.com>
+Date: Fri, 29 Sep 2023 14:29:34 -0700
+Subject: [PATCH] Fix support for CUDA 11.7 with new cudalaunchkernelexc
+ callbacks (#810)
+
+Summary:
+Pull Request resolved: https://github.com/pytorch/kineto/pull/810
+
+Fix issue https://github.com/pytorch/kineto/issues/809
+
+There was a change to guard the use of that callback based on CUPTI API VERSION.
+https://github.com/pytorch/kineto/pull/792 that enables this above CUPTI API version >=17
+
+Just checking the headers however.
+CUDA 11.7.1 (and 11.7.0) do not have the CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060 callback
+> https://gitlab.com/nvidia/headers/cuda-individual/cupti/-/blob/cuda-11.7.1/cupti_runtime_cbid.h?ref_type=tags
+
+CUPTI API Version 17
+>https://gitlab.com/nvidia/headers/cuda-individual/cupti/-/blob/cuda-11.7.1/cupti_version.h?ref_type=tags#L104
+
+And,  CUDA 11.8.0 does have CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060 callback
+> https://gitlab.com/nvidia/headers/cuda-individual/cupti/-/blob/cuda-11.8.0/cupti_runtime_cbid.h?ref_type=tags#L440
+
+CUPTI API Version 18
+> https://gitlab.com/nvidia/headers/cuda-individual/cupti/-/blob/cuda-11.8.0/cupti_version.h?ref_type=tags#L105
+
+Update the define to use CUPTI API 18 and above
+
+Reviewed By: davidberard98
+
+Differential Revision: D49779962
+
+fbshipit-source-id: 8c7371a27e117f7a1df6bb3017156728715ded94
+---
+ libkineto/src/CuptiActivity.cpp | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/libkineto/src/CuptiActivity.cpp b/libkineto/src/CuptiActivity.cpp
+index 5ecfa1ad..feddb288 100644
+--- a/libkineto/src/CuptiActivity.cpp
++++ b/libkineto/src/CuptiActivity.cpp
+@@ -248,7 +248,7 @@ inline bool RuntimeActivity::flowStart() const {
+       activity_.cbid == CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020 ||
+       activity_.cbid == CUPTI_RUNTIME_TRACE_CBID_cudaStreamWaitEvent_v3020;
+ 
+-#if defined(CUPTI_API_VERSION) && CUPTI_API_VERSION >= 17
++#if defined(CUPTI_API_VERSION) && CUPTI_API_VERSION >= 18
+   should_correlate |=
+       activity_.cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060;
+ #endif
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-test-tp_random_state.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-test-tp_random_state.patch
new file mode 100644
index 000000000000..d256ad51237d
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-test-tp_random_state.patch
@@ -0,0 +1,96 @@
+Fix failure of distributed/tensor/parallel/test_tp_random_state.py::TensorParallelRandomStateTests::test_model_init
+
+> IndexError: list index out of range
+
+See https://github.com/pytorch/pytorch/issues/122184
+Backported from https://github.com/pytorch/pytorch/pull/122248
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/distributed/tensor/parallel/test_tp_random_state.py b/test/distributed/tensor/parallel/test_tp_random_state.py
+index 1e8b705cdf0..c96641dbbda 100644
+--- a/test/distributed/tensor/parallel/test_tp_random_state.py
++++ b/test/distributed/tensor/parallel/test_tp_random_state.py
+@@ -3,7 +3,7 @@ import torch
+ import torch.distributed._functional_collectives as funcol
+ import torch.distributed._tensor.random as random
+ 
+-from torch.distributed._tensor import DeviceMesh
++from torch.distributed._tensor import init_device_mesh
+ from torch.distributed.tensor.parallel.api import parallelize_module
+ from torch.distributed.tensor.parallel.style import (
+     ColwiseParallel,
+@@ -42,10 +42,17 @@ class TensorParallelRandomStateTests(DTensorTestBase):
+     @with_comms
+     @skip_if_lt_x_gpu(4)
+     def test_model_init(self):
+-        mesh = torch.arange(self.world_size).reshape(2, 2)
+-        device_mesh = DeviceMesh(self.device_type, mesh)
+-        tp_rank = device_mesh.get_coordinate()[0]  # the tensor parallel dimension is 0
+-        dp_rank = device_mesh.get_coordinate()[1]  # the data parallel dimension is 1
++        dp_size = 2
++        tp_size = self.world_size // dp_size
++        mesh_2d = init_device_mesh(
++            self.device_type, (dp_size, tp_size), mesh_dim_names=("dp", "tp")
++        )
++        dp_mesh = mesh_2d["dp"]
++        tp_mesh = mesh_2d["tp"]
++        dp_rank = dp_mesh.get_coordinate()[0]
++        tp_rank = tp_mesh.get_coordinate()[0]
++        self.assertEqual(dp_rank, self.rank // tp_size)
++        self.assertEqual(tp_rank, self.rank % tp_size)
+ 
+         for enable_distribute_flag in [False, True]:
+             # a local model on meta device
+@@ -53,7 +60,7 @@ class TensorParallelRandomStateTests(DTensorTestBase):
+             # the col-wise parallel style shards the weight over tensor dim 0
+             model_tp = parallelize_module(
+                 model,
+-                device_mesh,
++                tp_mesh,
+                 {
+                     "net1": ColwiseParallel(
+                         make_input_replicate_1d, make_output_replicate_1d
+@@ -81,6 +88,7 @@ class TensorParallelRandomStateTests(DTensorTestBase):
+                 # the 1d mesh represents the TP group
+                 _1d_mesh = dtensor.device_mesh
+                 assert _1d_mesh.ndim == 1
++                self.assertEqual(_1d_mesh, tp_mesh)
+ 
+                 tensor_local = dtensor.to_local()
+ 
+@@ -88,7 +96,7 @@ class TensorParallelRandomStateTests(DTensorTestBase):
+                 tensor_gather = funcol.all_gather_tensor(
+                     tensor_local,
+                     gather_dim=0,
+-                    group=(_1d_mesh, 0)
++                    group=_1d_mesh,
+                 )
+                 self.assertEqual(_1d_mesh.get_coordinate()[0], tp_rank)
+ 
+@@ -102,14 +110,14 @@ class TensorParallelRandomStateTests(DTensorTestBase):
+                         # each rank within a TP group has the same initial weights
+                         self.assertEqual(tensor1, tensor2)
+ 
+-                self.check_gathered_tensors(tp_rank, 2, tensor_gather, tp_weights_assert)
++                self.check_gathered_tensors(tp_rank, tp_size, tensor_gather, tp_weights_assert)
+ 
+                 # check across TP groups
+                 # all-gather local shards
+                 tensor_gather = funcol.all_gather_tensor(
+                     tensor_local,
+                     gather_dim=0,
+-                    group=(_1d_mesh, 1)
++                    group=dp_mesh,
+                 )
+ 
+                 # compare local shards across TP groups
+@@ -123,7 +131,7 @@ class TensorParallelRandomStateTests(DTensorTestBase):
+                         # random seeds set in data loading.
+                         self.assertNotEqual(tensor1, tensor2)
+ 
+-                self.check_gathered_tensors(dp_rank, 2, tensor_gather, dp_weights_assert)
++                self.check_gathered_tensors(dp_rank, dp_size, tensor_gather, dp_weights_assert)
+ 
+ 
+ if __name__ == "__main__":
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-test_cuda-non-x86.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-test_cuda-non-x86.patch
new file mode 100644
index 000000000000..6ed59612f40f
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-test_cuda-non-x86.patch
@@ -0,0 +1,65 @@
+test_cuda fails on non-x86 machines because the tested feature is not available there.
+> RuntimeError: record_context_cpp is not support on non-linux non-x86_64 platforms
+
+Skip the tests on the non-supported platforms.
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/test_cuda.py b/test/test_cuda.py
+index e81c9365139..79b438060fe 100644
+--- a/test/test_cuda.py
++++ b/test/test_cuda.py
+@@ -28,7 +28,7 @@ from torch.utils.checkpoint import checkpoint_sequential
+ from torch.testing._internal.common_utils import TestCase, freeze_rng_state, run_tests, \
+     NO_MULTIPROCESSING_SPAWN, skipIfRocm, load_tests, IS_WINDOWS, \
+     slowTest, skipCUDANonDefaultStreamIf, skipCUDAMemoryLeakCheckIf, TEST_CUDA, TEST_CUDA_GRAPH, TEST_WITH_ROCM, TEST_NUMPY, \
+-    get_cycles_per_ms, parametrize, instantiate_parametrized_tests, subtest, IS_JETSON, gcIfJetson, NoTest, IS_LINUX
++    get_cycles_per_ms, parametrize, instantiate_parametrized_tests, subtest, IS_JETSON, gcIfJetson, NoTest, IS_LINUX, IS_X86
+ from torch.testing._internal.common_cuda import TEST_CUDNN, TEST_MULTIGPU, _create_scaling_case, _create_scaling_models_optimizers
+ from torch.testing._internal.autocast_test_lists import AutocastTestLists
+ from torch.utils.viz._cycles import observe_tensor_cycles
+@@ -3386,7 +3386,7 @@ class TestCudaMallocAsync(TestCase):
+         finally:
+             torch.cuda.memory._record_memory_history(None)
+ 
+-    @unittest.skipIf(not IS_LINUX, "linux only cpp unwinding")
++    @unittest.skipIf(not IS_LINUX or not IS_X86, "linux only cpp unwinding")
+     def test_direct_traceback(self):
+         from torch._C._profiler import gather_traceback, symbolize_tracebacks
+         c = gather_traceback(True, True, True)
+@@ -3396,7 +3396,7 @@ class TestCudaMallocAsync(TestCase):
+         self.assertTrue("unwind" in r)
+ 
+     @unittest.skipIf(TEST_CUDAMALLOCASYNC, "setContextRecorder not supported by CUDAMallocAsync")
+-    @unittest.skipIf(not IS_LINUX, "cpp contexts are linux only")
++    @unittest.skipIf(not IS_LINUX or not IS_X86, "cpp contexts are linux only")
+     def test_memory_snapshot_with_cpp(self):
+         try:
+             torch.cuda.memory.empty_cache()
+@@ -3432,7 +3432,7 @@ class TestCudaMallocAsync(TestCase):
+         self.assertTrue('category' in plot)
+ 
+     @unittest.skipIf(TEST_CUDAMALLOCASYNC, "setContextRecorder not supported by CUDAMallocAsync")
+-    @unittest.skipIf(not IS_LINUX, "cpp contexts are linux only")
++    @unittest.skipIf(not IS_LINUX or not IS_X86, "cpp contexts are linux only")
+     def test_cycles(self):
+         fired = False
+ 
+@@ -3469,7 +3469,7 @@ class TestCudaMallocAsync(TestCase):
+             disarm()
+ 
+     @unittest.skipIf(TEST_CUDAMALLOCASYNC, "setContextRecorder not supported by CUDAMallocAsync")
+-    @unittest.skipIf(not IS_LINUX, "cpp contexts are linux only")
++    @unittest.skipIf(not IS_LINUX or not IS_X86, "cpp contexts are linux only")
+     def test_memory_plots(self):
+         for context, stacks in (("all", "all" if IS_LINUX else "python"), ("all", "python"), (None, "python")):
+             try:
+@@ -3497,7 +3497,7 @@ class TestCudaMallocAsync(TestCase):
+                 torch.cuda.memory._record_memory_history(None)
+ 
+     @unittest.skipIf(TEST_CUDAMALLOCASYNC, "setContextRecorder not supported by CUDAMallocAsync")
+-    @unittest.skipIf(not IS_LINUX, "cpp contexts are linux only")
++    @unittest.skipIf(not IS_LINUX or not IS_X86, "cpp contexts are linux only")
+     def test_memory_plots_free_stack(self):
+         for context in ["alloc", "all", "state"]:
+             try:
diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-test_parallelize_api.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-test_parallelize_api.patch
new file mode 100644
index 000000000000..540885db01dc
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-test_parallelize_api.patch
@@ -0,0 +1,24 @@
+The test_linear_row_wise_parallel subtest fails when run on e.g. 6 GPUs with
+
+> RuntimeError: a and b must have same reduction dim, but got [9, 18] X [16, 10].
+> RuntimeError: a and b must have same reduction dim, but got [9, 6] X [16, 10].
+
+Reason is the test suite expects at most 4 GPUs.
+See https://github.com/pytorch/pytorch/issues/141335
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/test/distributed/tensor/parallel/test_parallelize_api.py b/test/distributed/tensor/parallel/test_parallelize_api.py
+index b5eaf6eaf78..901c7131cfe 100644
+--- a/test/distributed/tensor/parallel/test_parallelize_api.py
++++ b/test/distributed/tensor/parallel/test_parallelize_api.py
+@@ -28,8 +28,7 @@ from torch.testing._internal.distributed._tensor.common_dtensor import (
+ class TensorParallelAPITests(DTensorTestBase):
+     @property
+     def world_size(self):
+-        gpu_num = torch.cuda.device_count()
+-        return gpu_num if gpu_num % 2 == 0 and gpu_num > 4 else 4
++        return 4
+ 
+     @with_comms
+     def test_create_1d_device_mesh(self):