Skip to content

{ai}[foss/2023a] DeePMD-kit v3.0.1, Horovod v0.28.1 w/ CUDA 12.1.1 TensorFlow 2.15.1 #22219

Open
pavelToman wants to merge 5 commits into
easybuilders:develop from
pavelToman:20250127173956_new_pr_DeePDM-kit301
Open

{ai}[foss/2023a] DeePMD-kit v3.0.1, Horovod v0.28.1 w/ CUDA 12.1.1 TensorFlow 2.15.1 #22219
pavelToman wants to merge 5 commits into
easybuilders:develop from
pavelToman:20250127173956_new_pr_DeePDM-kit301

Conversation

@pavelToman
Copy link
Copy Markdown
Collaborator

@pavelToman pavelToman commented Jan 27, 2025

…M-kit-3.0.1-foss-2023a-CUDA-12.1.1-with-LAMMPS-plugin.eb, Horovod-0.28.1-foss-2023a-CUDA-12.1.1-TensorFlow-2.15.1.eb
@github-actions
Copy link
Copy Markdown

github-actions Bot commented Jan 27, 2025

Updated software Horovod-0.28.1-foss-2023a-CUDA-12.1.1-TensorFlow-2.15.1.eb

Diff against Horovod-0.28.1-foss-2022a-PyTorch-1.12.0.eb

easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2022a-PyTorch-1.12.0.eb

diff --git a/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2022a-PyTorch-1.12.0.eb b/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2023a-CUDA-12.1.1-TensorFlow-2.15.1.eb
index 90aa0d7222..82290c1cbf 100644
--- a/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2022a-PyTorch-1.12.0.eb
+++ b/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2023a-CUDA-12.1.1-TensorFlow-2.15.1.eb
@@ -2,33 +2,44 @@ easyblock = 'PythonBundle'
 
 name = 'Horovod'
 version = '0.28.1'
-local_pt_version = '1.12.0'
-versionsuffix = '-PyTorch-%s' % local_pt_version
+local_tf_version = '2.15.1'
+local_cuda_suffix = '-CUDA-%(cudaver)s'
+versionsuffix = local_cuda_suffix + '-TensorFlow-%s' % local_tf_version
 
 homepage = 'https://github.com/uber/horovod'
-description = """Horovod is a distributed training framework for TensorFlow, PyTorch and MXnet.
-This build only has PyTorch enabled."""
+description = "Horovod is a distributed training framework for TensorFlow."
 
-toolchain = {'name': 'foss', 'version': '2022a'}
+toolchain = {'name': 'foss', 'version': '2023a'}
 
 builddependencies = [
-    ('CMake', '3.23.1'),
+    ('CMake', '3.26.3'),
 ]
 dependencies = [
-    ('Python', '3.10.4'),
+    ('Python', '3.11.3'),
     ('PyYAML', '6.0'),
-    ('PyTorch', local_pt_version),
+    ('CUDA', '12.1.1', '', SYSTEM),
+    ('NCCL', '2.18.3', local_cuda_suffix),
+    ('TensorFlow', local_tf_version, local_cuda_suffix),
 ]
 
-preinstallopts = 'HOROVOD_WITH_MPI=1 '
-preinstallopts += 'HOROVOD_WITHOUT_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 '
+use_pip = True
+sanity_pip_check = True
+
+local_preinstallopts = 'module swap protobuf/3.21.9-GCCcore-12.3.0 && '
+local_preinstallopts += 'HOROVOD_WITH_MPI=1 HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL '
+local_preinstallopts += 'HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITHOUT_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 '
 
 exts_list = [
     ('cloudpickle', '2.2.1', {
         'checksums': ['d89684b8de9e34a2a43b3460fbca07d09d6e25ce858df4d5a44240403b6178f5'],
     }),
     ('horovod', version, {
-        'checksums': ['92a43f5a94c43907a56805bad15f19700c62ffc83b7ca483f9e104e229f67ef0'],
+        'patches': ['Horovod-0.28.1_support_flatbuffers_2.0.6.patch'],
+        'preinstallopts': local_preinstallopts,
+        'checksums': [
+            '92a43f5a94c43907a56805bad15f19700c62ffc83b7ca483f9e104e229f67ef0',
+            '9696ffb3b2bad1d6dd5a9f37bc58078ca7c585f933bcbec037036ad9fc0b297d',
+        ],
     }),
 ]
 
Diff against Horovod-0.28.1-foss-2022a-CUDA-11.7.0-PyTorch-1.13.1.eb

easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2022a-CUDA-11.7.0-PyTorch-1.13.1.eb

diff --git a/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2022a-CUDA-11.7.0-PyTorch-1.13.1.eb b/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2023a-CUDA-12.1.1-TensorFlow-2.15.1.eb
index 744c678169..82290c1cbf 100644
--- a/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2022a-CUDA-11.7.0-PyTorch-1.13.1.eb
+++ b/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2023a-CUDA-12.1.1-TensorFlow-2.15.1.eb
@@ -2,36 +2,44 @@ easyblock = 'PythonBundle'
 
 name = 'Horovod'
 version = '0.28.1'
-local_pt_version = '1.13.1'
+local_tf_version = '2.15.1'
 local_cuda_suffix = '-CUDA-%(cudaver)s'
-versionsuffix = local_cuda_suffix + '-PyTorch-%s' % local_pt_version
+versionsuffix = local_cuda_suffix + '-TensorFlow-%s' % local_tf_version
 
 homepage = 'https://github.com/uber/horovod'
-description = """Horovod is a distributed training framework for TensorFlow, PyTorch and MXnet.
-This build only has PyTorch enabled."""
+description = "Horovod is a distributed training framework for TensorFlow."
 
-toolchain = {'name': 'foss', 'version': '2022a'}
+toolchain = {'name': 'foss', 'version': '2023a'}
 
 builddependencies = [
-    ('CMake', '3.23.1'),
+    ('CMake', '3.26.3'),
 ]
 dependencies = [
-    ('Python', '3.10.4'),
+    ('Python', '3.11.3'),
     ('PyYAML', '6.0'),
-    ('CUDA', '11.7.0', '', SYSTEM),
-    ('NCCL', '2.12.12', local_cuda_suffix),
-    ('PyTorch', local_pt_version, local_cuda_suffix),
+    ('CUDA', '12.1.1', '', SYSTEM),
+    ('NCCL', '2.18.3', local_cuda_suffix),
+    ('TensorFlow', local_tf_version, local_cuda_suffix),
 ]
 
-preinstallopts = 'HOROVOD_WITH_MPI=1 HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL '
-preinstallopts += 'HOROVOD_WITHOUT_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 '
+use_pip = True
+sanity_pip_check = True
+
+local_preinstallopts = 'module swap protobuf/3.21.9-GCCcore-12.3.0 && '
+local_preinstallopts += 'HOROVOD_WITH_MPI=1 HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL '
+local_preinstallopts += 'HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITHOUT_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 '
 
 exts_list = [
     ('cloudpickle', '2.2.1', {
         'checksums': ['d89684b8de9e34a2a43b3460fbca07d09d6e25ce858df4d5a44240403b6178f5'],
     }),
     ('horovod', version, {
-        'checksums': ['92a43f5a94c43907a56805bad15f19700c62ffc83b7ca483f9e104e229f67ef0'],
+        'patches': ['Horovod-0.28.1_support_flatbuffers_2.0.6.patch'],
+        'preinstallopts': local_preinstallopts,
+        'checksums': [
+            '92a43f5a94c43907a56805bad15f19700c62ffc83b7ca483f9e104e229f67ef0',
+            '9696ffb3b2bad1d6dd5a9f37bc58078ca7c585f933bcbec037036ad9fc0b297d',
+        ],
     }),
 ]
 
Diff against Horovod-0.28.1-foss-2022a-CUDA-11.7.0-PyTorch-1.12.1.eb

easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2022a-CUDA-11.7.0-PyTorch-1.12.1.eb

diff --git a/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2022a-CUDA-11.7.0-PyTorch-1.12.1.eb b/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2023a-CUDA-12.1.1-TensorFlow-2.15.1.eb
index 8485defceb..82290c1cbf 100644
--- a/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2022a-CUDA-11.7.0-PyTorch-1.12.1.eb
+++ b/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2023a-CUDA-12.1.1-TensorFlow-2.15.1.eb
@@ -2,36 +2,44 @@ easyblock = 'PythonBundle'
 
 name = 'Horovod'
 version = '0.28.1'
-local_pt_version = '1.12.1'
+local_tf_version = '2.15.1'
 local_cuda_suffix = '-CUDA-%(cudaver)s'
-versionsuffix = local_cuda_suffix + '-PyTorch-%s' % local_pt_version
+versionsuffix = local_cuda_suffix + '-TensorFlow-%s' % local_tf_version
 
 homepage = 'https://github.com/uber/horovod'
-description = """Horovod is a distributed training framework for TensorFlow, PyTorch and MXnet.
-This build only has PyTorch enabled."""
+description = "Horovod is a distributed training framework for TensorFlow."
 
-toolchain = {'name': 'foss', 'version': '2022a'}
+toolchain = {'name': 'foss', 'version': '2023a'}
 
 builddependencies = [
-    ('CMake', '3.23.1'),
+    ('CMake', '3.26.3'),
 ]
 dependencies = [
-    ('Python', '3.10.4'),
+    ('Python', '3.11.3'),
     ('PyYAML', '6.0'),
-    ('CUDA', '11.7.0', '', SYSTEM),
-    ('NCCL', '2.12.12', local_cuda_suffix),
-    ('PyTorch', local_pt_version, local_cuda_suffix),
+    ('CUDA', '12.1.1', '', SYSTEM),
+    ('NCCL', '2.18.3', local_cuda_suffix),
+    ('TensorFlow', local_tf_version, local_cuda_suffix),
 ]
 
-preinstallopts = 'HOROVOD_WITH_MPI=1 HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL '
-preinstallopts += 'HOROVOD_WITHOUT_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 '
+use_pip = True
+sanity_pip_check = True
+
+local_preinstallopts = 'module swap protobuf/3.21.9-GCCcore-12.3.0 && '
+local_preinstallopts += 'HOROVOD_WITH_MPI=1 HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL '
+local_preinstallopts += 'HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITHOUT_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 '
 
 exts_list = [
     ('cloudpickle', '2.2.1', {
         'checksums': ['d89684b8de9e34a2a43b3460fbca07d09d6e25ce858df4d5a44240403b6178f5'],
     }),
     ('horovod', version, {
-        'checksums': ['92a43f5a94c43907a56805bad15f19700c62ffc83b7ca483f9e104e229f67ef0'],
+        'patches': ['Horovod-0.28.1_support_flatbuffers_2.0.6.patch'],
+        'preinstallopts': local_preinstallopts,
+        'checksums': [
+            '92a43f5a94c43907a56805bad15f19700c62ffc83b7ca483f9e104e229f67ef0',
+            '9696ffb3b2bad1d6dd5a9f37bc58078ca7c585f933bcbec037036ad9fc0b297d',
+        ],
     }),
 ]
 

@pavelToman pavelToman changed the title "{ai}[foss/2023a] DeePDM-kit v3.0.1, Horovod v0.28.1 w/ CUDA 12.1.1, CUDA 12.1.1 TensorFlow 2.15.1, ..." → "{ai}[foss/2023a] DeePDM-kit v3.0.1, Horovod v0.28.1 w/ CUDA 12.1.1 TensorFlow 2.15.1" Jan 27, 2025
@pavelToman pavelToman added new and removed update labels Jan 27, 2025
@pavelToman
Copy link
Copy Markdown
Collaborator Author

Test report by @pavelToman
SUCCESS
Build succeeded for 3 out of 3 (3 easyconfigs in total)
node4012.donphan.os - Linux RHEL 8.8, x86_64, Intel(R) Xeon(R) Gold 6240 CPU @ 2.60GHz, 1 x NVIDIA NVIDIA A2, 545.23.08, Python 3.6.8
See https://gist.github.com/pavelToman/17efd63f20d8113a8751102975af2595 for a full test report.

'easyblock': 'PythonPackage',
'source_urls': ['https://pypi.python.org/packages/source/d/deepmd-kit/'],
'sources': ['deepmd_kit-%(version)s.tar.gz'],
'preinstallopts': "export DP_VARIANT=cuda && module swap protobuf/3.21.9-GCCcore-12.3.0 && ",
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see also #22217, would be nice if we could avoid having to use this...

@boegel boegel self-assigned this May 20, 2025
Copy link
Copy Markdown
Contributor

@laraPPr laraPPr left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

small typo

@github-actions github-actions Bot added update and removed new labels Jun 19, 2025
@pavelToman pavelToman changed the title "{ai}[foss/2023a] DeePDM-kit v3.0.1, Horovod v0.28.1 w/ CUDA 12.1.1 TensorFlow 2.15.1" → "{ai}[foss/2023a] DeePMD-kit v3.0.1, Horovod v0.28.1 w/ CUDA 12.1.1 TensorFlow 2.15.1" Jun 19, 2025
…-plugin.eb to DeePMD-kit-3.0.1-foss-2023a-CUDA-12.1.1-with-LAMMPS-plugin.eb - fix name
@github-actions github-actions Bot added the new label Jun 19, 2025
…D-kit-3.0.1-foss-2023a-CUDA-12.1.1.eb - fix name
'easyblock': 'PythonPackage',
'source_urls': ['https://pypi.python.org/packages/source/d/deepmd-kit/'],
'sources': ['deepmd_kit-%(version)s.tar.gz'],
'preinstallopts': "export DP_VARIANT=cuda && module swap protobuf/3.21.9-GCCcore-12.3.0 && ",
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to be clear: we need to get rid of this nasty hack by implementing support for swap_dependencies in framework, see also #22217 (comment)

@github-actions github-actions Bot added the new label Jun 25, 2025
@Thyre Thyre added the 2023a label Aug 18, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

DeePMD-kit

4 participants