From 07ff5aa8810d1fb83d538c5ecda6ab5c428f6b44 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 28 Apr 2026 13:32:12 -0400 Subject: [PATCH 1/3] Migrate pynvml to cuda.core.system for GPU metrics polling --- python/utils/gpu_metric_poller.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/python/utils/gpu_metric_poller.py b/python/utils/gpu_metric_poller.py index 90135f78428..70e5989c706 100755 --- a/python/utils/gpu_metric_poller.py +++ b/python/utils/gpu_metric_poller.py @@ -5,6 +5,10 @@ # Utility class and helpers for retrieving GPU metrics for a specific section # of code. # +# Requires: +# cuda_core >= 1.0.0 +# cuda_bindings >= 12.9.6 or >= 13.2.0 + """ # Example: @@ -21,7 +25,9 @@ import os import sys import threading -import pynvml + + +from cuda.core import system class GPUMetricPoller(threading.Thread): @@ -81,18 +87,17 @@ def __runChildLoop(self, readFileNo, writeFileNo): childReadPipe = os.fdopen(readFileNo) childWritePipe = os.fdopen(writeFileNo, "w") - pynvml.nvmlInit() # hack - get actual device ID somehow - devObj = pynvml.nvmlDeviceGetHandleByIndex(0) - memObj = pynvml.nvmlDeviceGetMemoryInfo(devObj) - utilObj = pynvml.nvmlDeviceGetUtilizationRates(devObj) + devObj = system.Device(0) + memObj = devObj.memory_info + utilObj = devObj.utilization initialMemUsed = memObj.used initialGpuUtil = utilObj.gpu controlStr = self.__waitForInput(childReadPipe) while True: - memObj = pynvml.nvmlDeviceGetMemoryInfo(devObj) - utilObj = pynvml.nvmlDeviceGetUtilizationRates(devObj) + memObj = devObj.memory_info + utilObj = devObj.utilization memUsed = memObj.used - initialMemUsed gpuUtil = utilObj.gpu - initialGpuUtil @@ -103,7 +108,6 @@ def __runChildLoop(self, readFileNo, writeFileNo): break controlStr = self.__waitForInput(childReadPipe) - pynvml.nvmlShutdown() childReadPipe.close() childWritePipe.close() From d6d521a242e529d015b46cb3fffd8b1019219fe0 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 28 Apr 2026 13:34:35 -0400 Subject: [PATCH 2/3] Precommit fix --- python/utils/gpu_metric_poller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/utils/gpu_metric_poller.py b/python/utils/gpu_metric_poller.py index 70e5989c706..ef5e25193bc 100755 --- a/python/utils/gpu_metric_poller.py +++ b/python/utils/gpu_metric_poller.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2018-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # GPUMetricPoller From a49acdbe97a847c4c30c96bf9256635abb4e48c3 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 28 Apr 2026 13:39:30 -0400 Subject: [PATCH 3/3] Fix Device creation --- python/utils/gpu_metric_poller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/utils/gpu_metric_poller.py b/python/utils/gpu_metric_poller.py index ef5e25193bc..c4d0f17df93 100755 --- a/python/utils/gpu_metric_poller.py +++ b/python/utils/gpu_metric_poller.py @@ -88,7 +88,7 @@ def __runChildLoop(self, readFileNo, writeFileNo): childWritePipe = os.fdopen(writeFileNo, "w") # hack - get actual device ID somehow - devObj = system.Device(0) + devObj = system.Device(index=0) memObj = devObj.memory_info utilObj = devObj.utilization initialMemUsed = memObj.used