From 6e19f8c6ccba86cf9290978e417c60ce6a1c4eef Mon Sep 17 00:00:00 2001 From: Swapnil Date: Fri, 15 May 2026 23:32:11 +0530 Subject: [PATCH 1/6] CONSOLE-GPU: Display GPU metrics on the Node Details page Adds a new GPU metrics section to the Node Details page that surfaces DCGM exporter metrics (utilization, temperature, power usage, framebuffer memory) per GPU device, along with summary information (GPU count, model, capacity, allocatable) from the Kubernetes Node resource. The section is only rendered for nodes that report GPU capacity (nvidia.com/gpu or amd.com/gpu) or have active DCGM metrics. PromQL queries use both Hostname and node label selectors joined with `or` to support common DCGM exporter labeling conventions. Includes unit tests for query generation helpers and component rendering. Co-authored-by: Cursor --- .../console-app/locales/en/console-app.json | 11 + .../src/components/nodes/NodeDetails.tsx | 3 + .../nodes/NodeDetailsGpuMetrics.tsx | 272 ++++++++++++++++++ .../__tests__/NodeDetailsGpuMetrics.spec.tsx | 156 ++++++++++ .../__tests__/nodeGpuMetricsQueries.spec.ts | 86 ++++++ .../components/nodes/nodeGpuMetricsQueries.ts | 74 +++++ 6 files changed, 602 insertions(+) create mode 100644 frontend/packages/console-app/src/components/nodes/NodeDetailsGpuMetrics.tsx create mode 100644 frontend/packages/console-app/src/components/nodes/__tests__/NodeDetailsGpuMetrics.spec.tsx create mode 100644 frontend/packages/console-app/src/components/nodes/__tests__/nodeGpuMetricsQueries.spec.ts create mode 100644 frontend/packages/console-app/src/components/nodes/nodeGpuMetricsQueries.ts diff --git a/frontend/packages/console-app/locales/en/console-app.json b/frontend/packages/console-app/locales/en/console-app.json index ce594b3c6a8..e745a58ca75 100644 --- a/frontend/packages/console-app/locales/en/console-app.json +++ b/frontend/packages/console-app/locales/en/console-app.json @@ -437,6 +437,17 @@ "Container runtime": "Container runtime", "Kubelet version": "Kubelet version", "Kube-Proxy version": "Kube-Proxy version", + "GPU metrics": "GPU metrics", + "GPU count": "GPU count", + "GPU model": "GPU model", + "GPU capacity": "GPU capacity", + "GPU allocatable": "GPU allocatable", + "GPU device": "GPU device", + "Temperature": "Temperature", + "Power usage": "Power usage", + "FB memory used": "FB memory used", + "FB memory free": "FB memory free", + "GPU metrics are not available. Ensure DCGM exporter metrics are being scraped and labeled with the node name.": "GPU metrics are not available. Ensure DCGM exporter metrics are being scraped and labeled with the node name.", "Machine set": "Machine set", "This count is based on your access permissions and might not include all virtual machines.": "This count is based on your access permissions and might not include all virtual machines.", "MachineConfigPool": "MachineConfigPool", diff --git a/frontend/packages/console-app/src/components/nodes/NodeDetails.tsx b/frontend/packages/console-app/src/components/nodes/NodeDetails.tsx index 7d9f97596c2..a3edbb2223c 100644 --- a/frontend/packages/console-app/src/components/nodes/NodeDetails.tsx +++ b/frontend/packages/console-app/src/components/nodes/NodeDetails.tsx @@ -1,6 +1,8 @@ import type { FC } from 'react'; +import { PROMETHEUS_BASE_PATH } from '@console/internal/components/graphs/consts'; import type { NodeKind } from '@console/internal/module/k8s'; import NodeDetailsConditions from './NodeDetailsConditions'; +import NodeDetailsGpuMetrics from './NodeDetailsGpuMetrics'; import NodeDetailsImages from './NodeDetailsImages'; import NodeDetailsOverview from './NodeDetailsOverview'; @@ -11,6 +13,7 @@ type NodeDetailsProps = { const NodeDetails: FC = ({ obj: node }) => ( <> + {PROMETHEUS_BASE_PATH && } diff --git a/frontend/packages/console-app/src/components/nodes/NodeDetailsGpuMetrics.tsx b/frontend/packages/console-app/src/components/nodes/NodeDetailsGpuMetrics.tsx new file mode 100644 index 00000000000..945fae2d320 --- /dev/null +++ b/frontend/packages/console-app/src/components/nodes/NodeDetailsGpuMetrics.tsx @@ -0,0 +1,272 @@ +import type { FC } from 'react'; +import { useMemo } from 'react'; +import { + Bullseye, + DescriptionList, + DescriptionListDescription, + DescriptionListGroup, + DescriptionListTerm, + Spinner, +} from '@patternfly/react-core'; +import { useTranslation } from 'react-i18next'; +import type { PrometheusResponse, PrometheusResult } from '@console/internal/components/graphs'; +import { PrometheusEndpoint } from '@console/internal/components/graphs/helpers'; +import { usePrometheusPoll } from '@console/internal/components/graphs/prometheus-poll-hook'; +import { SectionHeading } from '@console/internal/components/utils/headings'; +import type { NodeKind } from '@console/internal/module/k8s'; +import PaneBody from '@console/shared/src/components/layout/PaneBody'; +import { + GpuMetricQuery, + getGpuMetricQueries, + nodeHasGpuCapacity, + GPU_RESOURCE_KEYS, +} from './nodeGpuMetricsQueries'; + +type GpuMetricResult = { + value: string; + modelName?: string; + device?: string; +}; + +type GpuDeviceRow = { + id: string; + label: string; + utilization: string; + temperature: string; + power: string; + fbUsed: string; + fbFree: string; +}; + +const resultsByGpu = ( + response: PrometheusResponse | undefined, +): Record => { + if (!response?.data?.result?.length) { + return {}; + } + return response.data.result.reduce>( + (acc, r: PrometheusResult) => { + const gpu = r.metric?.gpu ?? r.metric?.GPU_I_ID ?? r.metric?.UUID ?? r.metric?.device ?? ''; + acc[gpu] = { + value: r.value?.[1] ?? '', + modelName: r.metric?.modelName, + device: r.metric?.device, + }; + return acc; + }, + {}, + ); +}; + +const collectGpuIds = (...maps: Record[]): string[] => { + const ids = new Set(); + maps.forEach((m) => Object.keys(m).forEach((k) => ids.add(k))); + return [...ids].sort(); +}; + +const gpuDeviceLabel = (gpuId: string, meta: GpuMetricResult | undefined): string => { + const index = `GPU ${gpuId}`; + const model = meta?.modelName; + if (model) { + return `${index} \u2014 ${model}`; + } + const dev = meta?.device; + if (dev) { + return `${index} (${dev})`; + } + return index; +}; + +const findFirstMeta = (...maps: Record[]): GpuMetricResult | undefined => { + for (const m of maps) { + for (const entry of Object.values(m)) { + if (entry.modelName) return entry; + } + } + return Object.values(maps[0] ?? {})[0]; +}; + +const formatValue = (val: string | undefined, suffix: string): string => { + if (val === undefined || val === '') return '-'; + const num = parseFloat(val); + if (Number.isNaN(num)) return '-'; + return `${Math.round(num * 10) / 10} ${suffix}`; +}; + +const formatMemMiB = (val: string | undefined): string => { + if (val === undefined || val === '') return '-'; + const mib = parseFloat(val); + if (Number.isNaN(mib)) return '-'; + if (mib >= 1024) return `${(mib / 1024).toFixed(1)} GiB`; + return `${Math.round(mib)} MiB`; +}; + +type NodeDetailsGpuMetricsProps = { + node: NodeKind; +}; + +const NodeDetailsGpuMetrics: FC = ({ node }) => { + const { t } = useTranslation(); + const nodeName = node.metadata.name; + + const hasCapacity = nodeHasGpuCapacity(node.status?.capacity); + + const queries = useMemo(() => getGpuMetricQueries(nodeName), [nodeName]); + + const [countResponse, , countLoading] = usePrometheusPoll({ + endpoint: PrometheusEndpoint.QUERY, + query: queries[GpuMetricQuery.GPU_COUNT], + }); + const [utilResponse, , utilLoading] = usePrometheusPoll({ + endpoint: PrometheusEndpoint.QUERY, + query: queries[GpuMetricQuery.GPU_UTILIZATION], + }); + const [tempResponse, , tempLoading] = usePrometheusPoll({ + endpoint: PrometheusEndpoint.QUERY, + query: queries[GpuMetricQuery.GPU_TEMPERATURE], + }); + const [powerResponse, , powerLoading] = usePrometheusPoll({ + endpoint: PrometheusEndpoint.QUERY, + query: queries[GpuMetricQuery.GPU_POWER_USAGE], + }); + const [fbUsedResponse, , fbUsedLoading] = usePrometheusPoll({ + endpoint: PrometheusEndpoint.QUERY, + query: queries[GpuMetricQuery.GPU_FB_USED], + }); + const [fbFreeResponse, , fbFreeLoading] = usePrometheusPoll({ + endpoint: PrometheusEndpoint.QUERY, + query: queries[GpuMetricQuery.GPU_FB_FREE], + }); + + const isLoading = + countLoading || utilLoading || tempLoading || powerLoading || fbUsedLoading || fbFreeLoading; + + const utilMap = useMemo(() => resultsByGpu(utilResponse), [utilResponse]); + const tempMap = useMemo(() => resultsByGpu(tempResponse), [tempResponse]); + const powerMap = useMemo(() => resultsByGpu(powerResponse), [powerResponse]); + const fbUsedMap = useMemo(() => resultsByGpu(fbUsedResponse), [fbUsedResponse]); + const fbFreeMap = useMemo(() => resultsByGpu(fbFreeResponse), [fbFreeResponse]); + + const gpuIds = useMemo(() => collectGpuIds(utilMap, tempMap, powerMap, fbUsedMap, fbFreeMap), [ + utilMap, + tempMap, + powerMap, + fbUsedMap, + fbFreeMap, + ]); + + const hasMetrics = gpuIds.length > 0; + + if (!hasCapacity && !isLoading && !hasMetrics) { + return null; + } + + const gpuCountValue = countResponse?.data?.result?.[0]?.value?.[1]; + const gpuCountStr = + gpuCountValue !== undefined && gpuCountValue !== '' + ? String(Math.round(parseFloat(gpuCountValue))) + : undefined; + + const gpuCapacityStr = GPU_RESOURCE_KEYS.map((key) => node.status?.capacity?.[key]) + .filter(Boolean) + .join(', '); + const gpuAllocatableStr = GPU_RESOURCE_KEYS.map((key) => node.status?.allocatable?.[key]) + .filter(Boolean) + .join(', '); + + const firstMeta = findFirstMeta(utilMap, tempMap, powerMap, fbUsedMap, fbFreeMap); + const gpuModelStr = firstMeta?.modelName; + + const rows: GpuDeviceRow[] = gpuIds.map((id) => { + const meta = utilMap[id] ?? tempMap[id] ?? powerMap[id] ?? fbUsedMap[id] ?? fbFreeMap[id]; + return { + id, + label: gpuDeviceLabel(id, meta), + utilization: formatValue(utilMap[id]?.value, '%'), + temperature: formatValue(tempMap[id]?.value, '°C'), + power: formatValue(powerMap[id]?.value, 'W'), + fbUsed: formatMemMiB(fbUsedMap[id]?.value), + fbFree: formatMemMiB(fbFreeMap[id]?.value), + }; + }); + + return ( + + + + {(gpuCountStr || gpuCapacityStr || gpuAllocatableStr || gpuModelStr) && ( + + {gpuCountStr && ( + + {t('console-app~GPU count')} + {gpuCountStr} + + )} + {gpuModelStr && ( + + {t('console-app~GPU model')} + {gpuModelStr} + + )} + {gpuCapacityStr && ( + + {t('console-app~GPU capacity')} + {gpuCapacityStr} + + )} + {gpuAllocatableStr && ( + + {t('console-app~GPU allocatable')} + {gpuAllocatableStr} + + )} + + )} + + {isLoading && ( + + + + )} + + {!isLoading && hasMetrics && ( +
+ + + + + + + + + + + + + {rows.map((row) => ( + + + + + + + + + ))} + +
{t('console-app~GPU device')}{t('console-app~Utilization')}{t('console-app~Temperature')}{t('console-app~Power usage')}{t('console-app~FB memory used')}{t('console-app~FB memory free')}
{row.label}{row.utilization}{row.temperature}{row.power}{row.fbUsed}{row.fbFree}
+
+ )} + + {!isLoading && !hasMetrics && hasCapacity && ( +

+ {t( + 'console-app~GPU metrics are not available. Ensure DCGM exporter metrics are being scraped and labeled with the node name.', + )} +

+ )} +
+ ); +}; + +export default NodeDetailsGpuMetrics; diff --git a/frontend/packages/console-app/src/components/nodes/__tests__/NodeDetailsGpuMetrics.spec.tsx b/frontend/packages/console-app/src/components/nodes/__tests__/NodeDetailsGpuMetrics.spec.tsx new file mode 100644 index 00000000000..40b6aa3f570 --- /dev/null +++ b/frontend/packages/console-app/src/components/nodes/__tests__/NodeDetailsGpuMetrics.spec.tsx @@ -0,0 +1,156 @@ +import { render, screen } from '@testing-library/react'; +import { usePrometheusPoll } from '@console/internal/components/graphs/prometheus-poll-hook'; +import type { NodeKind } from '@console/internal/module/k8s'; +import NodeDetailsGpuMetrics from '../NodeDetailsGpuMetrics'; + +jest.mock('@console/internal/components/graphs/prometheus-poll-hook', () => ({ + usePrometheusPoll: jest.fn(), +})); + +const mockUsePrometheusPoll = usePrometheusPoll as jest.Mock; + +const baseNode: NodeKind = { + apiVersion: 'v1', + kind: 'Node', + metadata: { name: 'gpu-node-1', uid: 'uid-1' }, + spec: {}, + status: { + capacity: { 'nvidia.com/gpu': '2', cpu: '8', memory: '32Gi' }, + allocatable: { 'nvidia.com/gpu': '2', cpu: '7500m', memory: '30Gi' }, + conditions: [], + images: [], + }, +}; + +const nonGpuNode: NodeKind = { + apiVersion: 'v1', + kind: 'Node', + metadata: { name: 'cpu-node-1', uid: 'uid-2' }, + spec: {}, + status: { + capacity: { cpu: '8', memory: '32Gi' }, + allocatable: { cpu: '7500m', memory: '30Gi' }, + conditions: [], + images: [], + }, +}; + +const makeResponse = ( + results: { gpu: string; value: string; modelName?: string; device?: string }[], +) => ({ + status: 'success', + data: { + resultType: 'vector' as const, + result: results.map((r) => ({ + metric: { + gpu: r.gpu, + ...(r.modelName && { modelName: r.modelName }), + ...(r.device && { device: r.device }), + }, + value: [Date.now() / 1000, r.value], + })), + }, +}); + +const makeScalarResponse = (value: string) => ({ + status: 'success', + data: { + resultType: 'vector' as const, + result: [{ metric: {}, value: [Date.now() / 1000, value] }], + }, +}); + +const emptyResponse = { status: 'success', data: { resultType: 'vector' as const, result: [] } }; + +describe('NodeDetailsGpuMetrics', () => { + beforeEach(() => { + jest.clearAllMocks(); + }); + + it('renders nothing for a non-GPU node when no metrics are returned', () => { + mockUsePrometheusPoll.mockReturnValue([emptyResponse, null, false]); + const { container } = render(); + expect(container).toBeEmptyDOMElement(); + }); + + it('shows the GPU metrics heading when the node has GPU capacity', () => { + mockUsePrometheusPoll.mockReturnValue([emptyResponse, null, false]); + render(); + expect(screen.getByText('GPU metrics')).toBeInTheDocument(); + }); + + it('shows capacity and allocatable counts from node status', () => { + mockUsePrometheusPoll.mockReturnValue([emptyResponse, null, false]); + render(); + expect(screen.getByText('GPU capacity')).toBeInTheDocument(); + expect(screen.getByText('GPU allocatable')).toBeInTheDocument(); + expect(screen.getAllByText('2').length).toBeGreaterThanOrEqual(1); + }); + + it('shows a spinner while loading', () => { + mockUsePrometheusPoll.mockReturnValue([undefined, null, true]); + render(); + expect(screen.getByRole('progressbar')).toBeInTheDocument(); + }); + + it('renders GPU count, model, and a table with device labels when GPU metrics are returned', () => { + const countResp = makeScalarResponse('2'); + const utilResp = makeResponse([ + { gpu: '0', value: '45', modelName: 'Tesla T4', device: 'nvidia0' }, + { gpu: '1', value: '78', modelName: 'Tesla T4', device: 'nvidia1' }, + ]); + const tempResp = makeResponse([ + { gpu: '0', value: '62', modelName: 'Tesla T4' }, + { gpu: '1', value: '71', modelName: 'Tesla T4' }, + ]); + const powerResp = makeResponse([ + { gpu: '0', value: '120.5' }, + { gpu: '1', value: '185.3' }, + ]); + const fbUsedResp = makeResponse([ + { gpu: '0', value: '4096' }, + { gpu: '1', value: '8192' }, + ]); + const fbFreeResp = makeResponse([ + { gpu: '0', value: '12288' }, + { gpu: '1', value: '8192' }, + ]); + + mockUsePrometheusPoll + .mockReturnValueOnce([countResp, null, false]) + .mockReturnValueOnce([utilResp, null, false]) + .mockReturnValueOnce([tempResp, null, false]) + .mockReturnValueOnce([powerResp, null, false]) + .mockReturnValueOnce([fbUsedResp, null, false]) + .mockReturnValueOnce([fbFreeResp, null, false]); + + render(); + + expect(screen.getByText('GPU count')).toBeInTheDocument(); + expect(screen.getByText('GPU model')).toBeInTheDocument(); + expect(screen.getByText('Tesla T4')).toBeInTheDocument(); + expect(screen.getByText('GPU device')).toBeInTheDocument(); + + expect(screen.getByText('GPU 0 \u2014 Tesla T4')).toBeInTheDocument(); + expect(screen.getByText('GPU 1 \u2014 Tesla T4')).toBeInTheDocument(); + + expect(screen.getByText('Utilization')).toBeInTheDocument(); + expect(screen.getByText('Temperature')).toBeInTheDocument(); + expect(screen.getByText('Power usage')).toBeInTheDocument(); + + expect(screen.getByText('45 %')).toBeInTheDocument(); + expect(screen.getByText('78 %')).toBeInTheDocument(); + expect(screen.getByText('62 °C')).toBeInTheDocument(); + expect(screen.getByText('71 °C')).toBeInTheDocument(); + expect(screen.getByText('120.5 W')).toBeInTheDocument(); + expect(screen.getByText('185.3 W')).toBeInTheDocument(); + expect(screen.getByText('4.0 GiB')).toBeInTheDocument(); + expect(screen.getAllByText('8.0 GiB')).toHaveLength(2); + }); + + it('shows the not-available message when node has capacity but no metric data', () => { + mockUsePrometheusPoll.mockReturnValue([emptyResponse, null, false]); + render(); + expect(screen.getByText(/GPU metrics are not available/)).toBeInTheDocument(); + }); +}); diff --git a/frontend/packages/console-app/src/components/nodes/__tests__/nodeGpuMetricsQueries.spec.ts b/frontend/packages/console-app/src/components/nodes/__tests__/nodeGpuMetricsQueries.spec.ts new file mode 100644 index 00000000000..1cf377f9997 --- /dev/null +++ b/frontend/packages/console-app/src/components/nodes/__tests__/nodeGpuMetricsQueries.spec.ts @@ -0,0 +1,86 @@ +import { + escapePromQLLabel, + getGpuMetricQueries, + GpuMetricQuery, + nodeHasGpuCapacity, + GPU_RESOURCE_KEYS, +} from '../nodeGpuMetricsQueries'; + +describe('escapePromQLLabel', () => { + it('returns a plain name unchanged', () => { + expect(escapePromQLLabel('worker-gpu-01')).toBe('worker-gpu-01'); + }); + + it('escapes single quotes', () => { + expect(escapePromQLLabel("node's-name")).toBe("node\\'s-name"); + }); + + it('escapes backslashes', () => { + expect(escapePromQLLabel('path\\node')).toBe('path\\\\node'); + }); + + it('escapes both backslash and single quote together', () => { + expect(escapePromQLLabel("a\\'b")).toBe("a\\\\\\'b"); + }); +}); + +describe('getGpuMetricQueries', () => { + it('returns queries keyed by GpuMetricQuery', () => { + const queries = getGpuMetricQueries('gpu-node-1'); + expect(Object.keys(queries)).toHaveLength(Object.keys(GpuMetricQuery).length); + }); + + it('uses PromQL or between two instant vectors for each label convention', () => { + const queries = getGpuMetricQueries('worker-gpu-01'); + const utilQuery = queries[GpuMetricQuery.GPU_UTILIZATION]; + expect(utilQuery).toBe( + "DCGM_FI_DEV_GPU_UTIL{Hostname='worker-gpu-01'} or DCGM_FI_DEV_GPU_UTIL{node='worker-gpu-01'}", + ); + }); + + it('uses the correct DCGM metric name for each query', () => { + const queries = getGpuMetricQueries('n1'); + expect(queries[GpuMetricQuery.GPU_COUNT]).toBe( + "count(DCGM_FI_DEV_GPU_UTIL{Hostname='n1'} or DCGM_FI_DEV_GPU_UTIL{node='n1'})", + ); + expect(queries[GpuMetricQuery.GPU_UTILIZATION]).toContain('DCGM_FI_DEV_GPU_UTIL'); + expect(queries[GpuMetricQuery.GPU_TEMPERATURE]).toContain('DCGM_FI_DEV_GPU_TEMP'); + expect(queries[GpuMetricQuery.GPU_POWER_USAGE]).toContain('DCGM_FI_DEV_POWER_USAGE'); + expect(queries[GpuMetricQuery.GPU_FB_USED]).toContain('DCGM_FI_DEV_FB_USED'); + expect(queries[GpuMetricQuery.GPU_FB_FREE]).toContain('DCGM_FI_DEV_FB_FREE'); + }); + + it('escapes special characters in node names', () => { + const queries = getGpuMetricQueries("node'special"); + expect(queries[GpuMetricQuery.GPU_UTILIZATION]).toContain("Hostname='node\\'special'"); + }); +}); + +describe('nodeHasGpuCapacity', () => { + it('returns false for undefined capacity', () => { + expect(nodeHasGpuCapacity(undefined)).toBe(false); + }); + + it('returns false when no GPU keys are present', () => { + expect(nodeHasGpuCapacity({ cpu: '8', memory: '32Gi' })).toBe(false); + }); + + it('returns false when GPU capacity is 0', () => { + expect(nodeHasGpuCapacity({ 'nvidia.com/gpu': '0' })).toBe(false); + }); + + it('returns true when nvidia.com/gpu > 0', () => { + expect(nodeHasGpuCapacity({ 'nvidia.com/gpu': '2' })).toBe(true); + }); + + it('returns true when amd.com/gpu > 0', () => { + expect(nodeHasGpuCapacity({ 'amd.com/gpu': '1' })).toBe(true); + }); +}); + +describe('GPU_RESOURCE_KEYS', () => { + it('includes nvidia and amd', () => { + expect(GPU_RESOURCE_KEYS).toContain('nvidia.com/gpu'); + expect(GPU_RESOURCE_KEYS).toContain('amd.com/gpu'); + }); +}); diff --git a/frontend/packages/console-app/src/components/nodes/nodeGpuMetricsQueries.ts b/frontend/packages/console-app/src/components/nodes/nodeGpuMetricsQueries.ts new file mode 100644 index 00000000000..85e616db3b2 --- /dev/null +++ b/frontend/packages/console-app/src/components/nodes/nodeGpuMetricsQueries.ts @@ -0,0 +1,74 @@ +import * as _ from 'lodash'; + +export enum GpuMetricQuery { + GPU_COUNT = 'GPU_COUNT', + GPU_UTILIZATION = 'GPU_UTILIZATION', + GPU_TEMPERATURE = 'GPU_TEMPERATURE', + GPU_POWER_USAGE = 'GPU_POWER_USAGE', + GPU_FB_USED = 'GPU_FB_USED', + GPU_FB_FREE = 'GPU_FB_FREE', +} + +/** + * Escapes a node name for safe inclusion in PromQL label matchers. + * Backslash and single-quote are the only characters that need escaping + * inside a PromQL single-quoted string literal. + */ +export const escapePromQLLabel = (value: string): string => + value.replace(/\\/g, '\\\\').replace(/'/g, "\\'"); + +/** + * Builds two separate label selectors for matching the node across common DCGM + * label conventions. PromQL does not support `or` inside `{}` label matchers, + * so each query must join two full instant vectors with the `or` operator: + * metric{Hostname='name'} or metric{node='name'} + */ +const buildNodeSelectors = (nodeName: string): { hn: string; nd: string } => { + const escaped = escapePromQLLabel(nodeName); + return { + hn: `Hostname='${escaped}'`, + nd: `node='${escaped}'`, + }; +}; + +const gpuQueries = { + [GpuMetricQuery.GPU_COUNT]: _.template( + `count(DCGM_FI_DEV_GPU_UTIL{<%= hn %>} or DCGM_FI_DEV_GPU_UTIL{<%= nd %>})`, + ), + [GpuMetricQuery.GPU_UTILIZATION]: _.template( + `DCGM_FI_DEV_GPU_UTIL{<%= hn %>} or DCGM_FI_DEV_GPU_UTIL{<%= nd %>}`, + ), + [GpuMetricQuery.GPU_TEMPERATURE]: _.template( + `DCGM_FI_DEV_GPU_TEMP{<%= hn %>} or DCGM_FI_DEV_GPU_TEMP{<%= nd %>}`, + ), + [GpuMetricQuery.GPU_POWER_USAGE]: _.template( + `DCGM_FI_DEV_POWER_USAGE{<%= hn %>} or DCGM_FI_DEV_POWER_USAGE{<%= nd %>}`, + ), + [GpuMetricQuery.GPU_FB_USED]: _.template( + `DCGM_FI_DEV_FB_USED{<%= hn %>} or DCGM_FI_DEV_FB_USED{<%= nd %>}`, + ), + [GpuMetricQuery.GPU_FB_FREE]: _.template( + `DCGM_FI_DEV_FB_FREE{<%= hn %>} or DCGM_FI_DEV_FB_FREE{<%= nd %>}`, + ), +}; + +export const getGpuMetricQueries = (nodeName: string): Record => { + const selectors = buildNodeSelectors(nodeName); + return { + [GpuMetricQuery.GPU_COUNT]: gpuQueries[GpuMetricQuery.GPU_COUNT](selectors), + [GpuMetricQuery.GPU_UTILIZATION]: gpuQueries[GpuMetricQuery.GPU_UTILIZATION](selectors), + [GpuMetricQuery.GPU_TEMPERATURE]: gpuQueries[GpuMetricQuery.GPU_TEMPERATURE](selectors), + [GpuMetricQuery.GPU_POWER_USAGE]: gpuQueries[GpuMetricQuery.GPU_POWER_USAGE](selectors), + [GpuMetricQuery.GPU_FB_USED]: gpuQueries[GpuMetricQuery.GPU_FB_USED](selectors), + [GpuMetricQuery.GPU_FB_FREE]: gpuQueries[GpuMetricQuery.GPU_FB_FREE](selectors), + }; +}; + +/** Resource keys that indicate GPU presence in node.status.capacity / allocatable. */ +export const GPU_RESOURCE_KEYS = ['nvidia.com/gpu', 'amd.com/gpu'] as const; + +export const nodeHasGpuCapacity = (capacity?: { [key: string]: string }): boolean => + GPU_RESOURCE_KEYS.some((key) => { + const val = capacity?.[key]; + return val !== undefined && parseInt(val, 10) > 0; + }); From cf2c07d9960c74dd7dccbbe827aabc0defd926f0 Mon Sep 17 00:00:00 2001 From: Swapnil Date: Fri, 15 May 2026 23:50:24 +0530 Subject: [PATCH 2/6] fixup: Address CodeRabbit review feedback - Skip Prometheus results without a valid GPU identifier to prevent silent data loss when multiple results lack label keys. - Guard GPU count display against NaN from non-numeric Prometheus values. Co-authored-by: Cursor --- .../src/components/nodes/NodeDetailsGpuMetrics.tsx | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/frontend/packages/console-app/src/components/nodes/NodeDetailsGpuMetrics.tsx b/frontend/packages/console-app/src/components/nodes/NodeDetailsGpuMetrics.tsx index 945fae2d320..1443c7e44fa 100644 --- a/frontend/packages/console-app/src/components/nodes/NodeDetailsGpuMetrics.tsx +++ b/frontend/packages/console-app/src/components/nodes/NodeDetailsGpuMetrics.tsx @@ -47,6 +47,9 @@ const resultsByGpu = ( return response.data.result.reduce>( (acc, r: PrometheusResult) => { const gpu = r.metric?.gpu ?? r.metric?.GPU_I_ID ?? r.metric?.UUID ?? r.metric?.device ?? ''; + if (!gpu) { + return acc; + } acc[gpu] = { value: r.value?.[1] ?? '', modelName: r.metric?.modelName, @@ -162,10 +165,11 @@ const NodeDetailsGpuMetrics: FC = ({ node }) => { } const gpuCountValue = countResponse?.data?.result?.[0]?.value?.[1]; - const gpuCountStr = - gpuCountValue !== undefined && gpuCountValue !== '' - ? String(Math.round(parseFloat(gpuCountValue))) - : undefined; + const gpuCountStr = (() => { + if (gpuCountValue === undefined || gpuCountValue === '') return undefined; + const parsed = parseFloat(gpuCountValue); + return Number.isNaN(parsed) ? undefined : String(Math.round(parsed)); + })(); const gpuCapacityStr = GPU_RESOURCE_KEYS.map((key) => node.status?.capacity?.[key]) .filter(Boolean) From c794438c2839524a851ff90b22ef8292e0ca440a Mon Sep 17 00:00:00 2001 From: Swapnil Date: Sat, 16 May 2026 00:05:17 +0530 Subject: [PATCH 3/6] fixup: Replace lodash templates with native template literals Remove the lodash dependency from nodeGpuMetricsQueries.ts and use native template literals for PromQL query construction, reducing bundle weight with zero functional change. Co-authored-by: Cursor --- .../components/nodes/nodeGpuMetricsQueries.ts | 38 +++++-------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/frontend/packages/console-app/src/components/nodes/nodeGpuMetricsQueries.ts b/frontend/packages/console-app/src/components/nodes/nodeGpuMetricsQueries.ts index 85e616db3b2..401d7dad848 100644 --- a/frontend/packages/console-app/src/components/nodes/nodeGpuMetricsQueries.ts +++ b/frontend/packages/console-app/src/components/nodes/nodeGpuMetricsQueries.ts @@ -1,5 +1,3 @@ -import * as _ from 'lodash'; - export enum GpuMetricQuery { GPU_COUNT = 'GPU_COUNT', GPU_UTILIZATION = 'GPU_UTILIZATION', @@ -31,36 +29,18 @@ const buildNodeSelectors = (nodeName: string): { hn: string; nd: string } => { }; }; -const gpuQueries = { - [GpuMetricQuery.GPU_COUNT]: _.template( - `count(DCGM_FI_DEV_GPU_UTIL{<%= hn %>} or DCGM_FI_DEV_GPU_UTIL{<%= nd %>})`, - ), - [GpuMetricQuery.GPU_UTILIZATION]: _.template( - `DCGM_FI_DEV_GPU_UTIL{<%= hn %>} or DCGM_FI_DEV_GPU_UTIL{<%= nd %>}`, - ), - [GpuMetricQuery.GPU_TEMPERATURE]: _.template( - `DCGM_FI_DEV_GPU_TEMP{<%= hn %>} or DCGM_FI_DEV_GPU_TEMP{<%= nd %>}`, - ), - [GpuMetricQuery.GPU_POWER_USAGE]: _.template( - `DCGM_FI_DEV_POWER_USAGE{<%= hn %>} or DCGM_FI_DEV_POWER_USAGE{<%= nd %>}`, - ), - [GpuMetricQuery.GPU_FB_USED]: _.template( - `DCGM_FI_DEV_FB_USED{<%= hn %>} or DCGM_FI_DEV_FB_USED{<%= nd %>}`, - ), - [GpuMetricQuery.GPU_FB_FREE]: _.template( - `DCGM_FI_DEV_FB_FREE{<%= hn %>} or DCGM_FI_DEV_FB_FREE{<%= nd %>}`, - ), -}; +const buildQuery = (metric: string, hn: string, nd: string): string => + `${metric}{${hn}} or ${metric}{${nd}}`; export const getGpuMetricQueries = (nodeName: string): Record => { - const selectors = buildNodeSelectors(nodeName); + const { hn, nd } = buildNodeSelectors(nodeName); return { - [GpuMetricQuery.GPU_COUNT]: gpuQueries[GpuMetricQuery.GPU_COUNT](selectors), - [GpuMetricQuery.GPU_UTILIZATION]: gpuQueries[GpuMetricQuery.GPU_UTILIZATION](selectors), - [GpuMetricQuery.GPU_TEMPERATURE]: gpuQueries[GpuMetricQuery.GPU_TEMPERATURE](selectors), - [GpuMetricQuery.GPU_POWER_USAGE]: gpuQueries[GpuMetricQuery.GPU_POWER_USAGE](selectors), - [GpuMetricQuery.GPU_FB_USED]: gpuQueries[GpuMetricQuery.GPU_FB_USED](selectors), - [GpuMetricQuery.GPU_FB_FREE]: gpuQueries[GpuMetricQuery.GPU_FB_FREE](selectors), + [GpuMetricQuery.GPU_COUNT]: `count(${buildQuery('DCGM_FI_DEV_GPU_UTIL', hn, nd)})`, + [GpuMetricQuery.GPU_UTILIZATION]: buildQuery('DCGM_FI_DEV_GPU_UTIL', hn, nd), + [GpuMetricQuery.GPU_TEMPERATURE]: buildQuery('DCGM_FI_DEV_GPU_TEMP', hn, nd), + [GpuMetricQuery.GPU_POWER_USAGE]: buildQuery('DCGM_FI_DEV_POWER_USAGE', hn, nd), + [GpuMetricQuery.GPU_FB_USED]: buildQuery('DCGM_FI_DEV_FB_USED', hn, nd), + [GpuMetricQuery.GPU_FB_FREE]: buildQuery('DCGM_FI_DEV_FB_FREE', hn, nd), }; }; From 2bd33e11f2d87e96856f94700450c5718b4de472 Mon Sep 17 00:00:00 2001 From: Swapnil Date: Mon, 18 May 2026 20:44:33 +0530 Subject: [PATCH 4/6] fixup: Run yarn i18n to fix locale key ordering The i18n parser places GPU keys after "Changed" based on their usage order in NodeDetailsGpuMetrics.tsx, not after "Kube-Proxy version" where they were manually added. Co-authored-by: Cursor --- .../console-app/locales/en/console-app.json | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/frontend/packages/console-app/locales/en/console-app.json b/frontend/packages/console-app/locales/en/console-app.json index e745a58ca75..72cf9cf76fa 100644 --- a/frontend/packages/console-app/locales/en/console-app.json +++ b/frontend/packages/console-app/locales/en/console-app.json @@ -416,6 +416,17 @@ "Reason": "Reason", "Updated": "Updated", "Changed": "Changed", + "GPU metrics": "GPU metrics", + "GPU count": "GPU count", + "GPU model": "GPU model", + "GPU capacity": "GPU capacity", + "GPU allocatable": "GPU allocatable", + "GPU device": "GPU device", + "Temperature": "Temperature", + "Power usage": "Power usage", + "FB memory used": "FB memory used", + "FB memory free": "FB memory free", + "GPU metrics are not available. Ensure DCGM exporter metrics are being scraped and labeled with the node name.": "GPU metrics are not available. Ensure DCGM exporter metrics are being scraped and labeled with the node name.", "Node details": "Node details", "External ID": "External ID", "Labels": "Labels", @@ -437,17 +448,6 @@ "Container runtime": "Container runtime", "Kubelet version": "Kubelet version", "Kube-Proxy version": "Kube-Proxy version", - "GPU metrics": "GPU metrics", - "GPU count": "GPU count", - "GPU model": "GPU model", - "GPU capacity": "GPU capacity", - "GPU allocatable": "GPU allocatable", - "GPU device": "GPU device", - "Temperature": "Temperature", - "Power usage": "Power usage", - "FB memory used": "FB memory used", - "FB memory free": "FB memory free", - "GPU metrics are not available. Ensure DCGM exporter metrics are being scraped and labeled with the node name.": "GPU metrics are not available. Ensure DCGM exporter metrics are being scraped and labeled with the node name.", "Machine set": "Machine set", "This count is based on your access permissions and might not include all virtual machines.": "This count is based on your access permissions and might not include all virtual machines.", "MachineConfigPool": "MachineConfigPool", From 081f50375e44bc360948235a695f706f4079d97d Mon Sep 17 00:00:00 2001 From: Swapnil Date: Tue, 19 May 2026 19:10:45 +0530 Subject: [PATCH 5/6] fixup: Address cajieh review feedback - Add aria-label to GPU metrics table for accessibility (WCAG). - Add 7 unit tests covering: non-numeric GPU count, missing GPU identifiers, DCGM data without capacity keys, AMD GPU nodes, partial poll failures, and alternative GPU label fallbacks (GPU_I_ID, UUID, device). - Add i18n key for table aria-label. Co-authored-by: Cursor --- .../console-app/locales/en/console-app.json | 1 + .../nodes/NodeDetailsGpuMetrics.tsx | 5 +- .../__tests__/NodeDetailsGpuMetrics.spec.tsx | 126 ++++++++++++++++++ 3 files changed, 131 insertions(+), 1 deletion(-) diff --git a/frontend/packages/console-app/locales/en/console-app.json b/frontend/packages/console-app/locales/en/console-app.json index 72cf9cf76fa..948fefb98da 100644 --- a/frontend/packages/console-app/locales/en/console-app.json +++ b/frontend/packages/console-app/locales/en/console-app.json @@ -426,6 +426,7 @@ "Power usage": "Power usage", "FB memory used": "FB memory used", "FB memory free": "FB memory free", + "GPU metrics per device": "GPU metrics per device", "GPU metrics are not available. Ensure DCGM exporter metrics are being scraped and labeled with the node name.": "GPU metrics are not available. Ensure DCGM exporter metrics are being scraped and labeled with the node name.", "Node details": "Node details", "External ID": "External ID", diff --git a/frontend/packages/console-app/src/components/nodes/NodeDetailsGpuMetrics.tsx b/frontend/packages/console-app/src/components/nodes/NodeDetailsGpuMetrics.tsx index 1443c7e44fa..78907ed14fd 100644 --- a/frontend/packages/console-app/src/components/nodes/NodeDetailsGpuMetrics.tsx +++ b/frontend/packages/console-app/src/components/nodes/NodeDetailsGpuMetrics.tsx @@ -235,7 +235,10 @@ const NodeDetailsGpuMetrics: FC = ({ node }) => { {!isLoading && hasMetrics && (
- +
diff --git a/frontend/packages/console-app/src/components/nodes/__tests__/NodeDetailsGpuMetrics.spec.tsx b/frontend/packages/console-app/src/components/nodes/__tests__/NodeDetailsGpuMetrics.spec.tsx index 40b6aa3f570..87060556381 100644 --- a/frontend/packages/console-app/src/components/nodes/__tests__/NodeDetailsGpuMetrics.spec.tsx +++ b/frontend/packages/console-app/src/components/nodes/__tests__/NodeDetailsGpuMetrics.spec.tsx @@ -153,4 +153,130 @@ describe('NodeDetailsGpuMetrics', () => { render(); expect(screen.getByText(/GPU metrics are not available/)).toBeInTheDocument(); }); + + it('does not show GPU count when the Prometheus count value is non-numeric', () => { + const countResp = makeScalarResponse('not-a-number'); + const utilResp = makeResponse([{ gpu: '0', value: '50' }]); + + mockUsePrometheusPoll + .mockReturnValueOnce([countResp, null, false]) + .mockReturnValueOnce([utilResp, null, false]) + .mockReturnValueOnce([emptyResponse, null, false]) + .mockReturnValueOnce([emptyResponse, null, false]) + .mockReturnValueOnce([emptyResponse, null, false]) + .mockReturnValueOnce([emptyResponse, null, false]); + + render(); + expect(screen.queryByText('GPU count')).not.toBeInTheDocument(); + }); + + it('ignores Prometheus results without a GPU identifier', () => { + const respWithMissing = { + status: 'success', + data: { + resultType: 'vector' as const, + result: [ + { metric: { gpu: '0' }, value: [Date.now() / 1000, '30'] }, + { metric: {}, value: [Date.now() / 1000, '99'] }, + ], + }, + }; + + mockUsePrometheusPoll + .mockReturnValueOnce([emptyResponse, null, false]) + .mockReturnValueOnce([respWithMissing, null, false]) + .mockReturnValueOnce([emptyResponse, null, false]) + .mockReturnValueOnce([emptyResponse, null, false]) + .mockReturnValueOnce([emptyResponse, null, false]) + .mockReturnValueOnce([emptyResponse, null, false]); + + render(); + expect(screen.getByText('30 %')).toBeInTheDocument(); + expect(screen.queryByText('99 %')).not.toBeInTheDocument(); + }); + + it('renders metrics when the node has DCGM data but no GPU capacity keys', () => { + const utilResp = makeResponse([{ gpu: '0', value: '55' }]); + + mockUsePrometheusPoll + .mockReturnValueOnce([emptyResponse, null, false]) + .mockReturnValueOnce([utilResp, null, false]) + .mockReturnValueOnce([emptyResponse, null, false]) + .mockReturnValueOnce([emptyResponse, null, false]) + .mockReturnValueOnce([emptyResponse, null, false]) + .mockReturnValueOnce([emptyResponse, null, false]); + + render(); + expect(screen.getByText('GPU metrics')).toBeInTheDocument(); + expect(screen.getByText('55 %')).toBeInTheDocument(); + }); + + it('shows capacity and unavailable message for an AMD-only GPU node', () => { + const amdNode: NodeKind = { + apiVersion: 'v1', + kind: 'Node', + metadata: { name: 'amd-node-1', uid: 'uid-3' }, + spec: {}, + status: { + capacity: { 'amd.com/gpu': '1', cpu: '16', memory: '64Gi' }, + allocatable: { 'amd.com/gpu': '1', cpu: '15', memory: '62Gi' }, + conditions: [], + images: [], + }, + }; + + mockUsePrometheusPoll.mockReturnValue([emptyResponse, null, false]); + render(); + expect(screen.getByText('GPU metrics')).toBeInTheDocument(); + expect(screen.getByText('GPU capacity')).toBeInTheDocument(); + expect(screen.getAllByText('1').length).toBeGreaterThanOrEqual(1); + expect(screen.getByText(/GPU metrics are not available/)).toBeInTheDocument(); + }); + + it('shows the metrics table when some polls fail but utilization data is present', () => { + const utilResp = makeResponse([{ gpu: '0', value: '80' }]); + + mockUsePrometheusPoll + .mockReturnValueOnce([emptyResponse, null, false]) + .mockReturnValueOnce([utilResp, null, false]) + .mockReturnValueOnce([undefined, new Error('fetch failed'), false]) + .mockReturnValueOnce([undefined, new Error('fetch failed'), false]) + .mockReturnValueOnce([undefined, new Error('fetch failed'), false]) + .mockReturnValueOnce([undefined, new Error('fetch failed'), false]); + + render(); + expect(screen.getByText('GPU device')).toBeInTheDocument(); + expect(screen.getByText('80 %')).toBeInTheDocument(); + expect(screen.getAllByText('-').length).toBeGreaterThanOrEqual(4); + }); + + it('maps GPU metrics using GPU_I_ID, UUID, or device when gpu label is absent', () => { + const respWithAltLabels = { + status: 'success', + data: { + resultType: 'vector' as const, + result: [ + { metric: { GPU_I_ID: 'mig-0' }, value: [Date.now() / 1000, '40'] }, + { metric: { UUID: 'GPU-abc-123' }, value: [Date.now() / 1000, '60'] }, + { metric: { device: 'nvidia1' }, value: [Date.now() / 1000, '75'] }, + ], + }, + }; + + mockUsePrometheusPoll + .mockReturnValueOnce([emptyResponse, null, false]) + .mockReturnValueOnce([respWithAltLabels, null, false]) + .mockReturnValueOnce([emptyResponse, null, false]) + .mockReturnValueOnce([emptyResponse, null, false]) + .mockReturnValueOnce([emptyResponse, null, false]) + .mockReturnValueOnce([emptyResponse, null, false]); + + render(); + expect(screen.getByText('40 %')).toBeInTheDocument(); + expect(screen.getByText('60 %')).toBeInTheDocument(); + expect(screen.getByText('75 %')).toBeInTheDocument(); + expect(screen.getByText(/GPU mig-0/)).toBeInTheDocument(); + expect(screen.getByText(/GPU GPU-abc-123/)).toBeInTheDocument(); + expect(screen.getByText(/GPU nvidia1/)).toBeInTheDocument(); + }); }); From 1ec113858c57cbe1528c56dfbbb788fe3beea3f5 Mon Sep 17 00:00:00 2001 From: Swapnil Date: Tue, 19 May 2026 20:11:08 +0530 Subject: [PATCH 6/6] fixup: Fix i18n key ordering for GPU metrics per device Co-authored-by: Cursor --- frontend/packages/console-app/locales/en/console-app.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/packages/console-app/locales/en/console-app.json b/frontend/packages/console-app/locales/en/console-app.json index 948fefb98da..175dbd83f49 100644 --- a/frontend/packages/console-app/locales/en/console-app.json +++ b/frontend/packages/console-app/locales/en/console-app.json @@ -421,12 +421,12 @@ "GPU model": "GPU model", "GPU capacity": "GPU capacity", "GPU allocatable": "GPU allocatable", + "GPU metrics per device": "GPU metrics per device", "GPU device": "GPU device", "Temperature": "Temperature", "Power usage": "Power usage", "FB memory used": "FB memory used", "FB memory free": "FB memory free", - "GPU metrics per device": "GPU metrics per device", "GPU metrics are not available. Ensure DCGM exporter metrics are being scraped and labeled with the node name.": "GPU metrics are not available. Ensure DCGM exporter metrics are being scraped and labeled with the node name.", "Node details": "Node details", "External ID": "External ID",
{t('console-app~GPU device')}