Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion dashboard/src/lib/components/GpuRichBar.svelte
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
<script lang="ts">
import { nodeGpuProfile, topologyData } from "$lib/stores/app.svelte";
import {
nodeAneProfile,
nodeGpuProfile,
topologyData,
type RawNodeAneProfile,
} from "$lib/stores/app.svelte";

interface Props {
class?: string;
Expand All @@ -8,6 +13,7 @@
let { class: className = "" }: Props = $props();

const profiles = $derived(nodeGpuProfile());
const aneProfiles = $derived(nodeAneProfile());
const topology = $derived(topologyData());

const totalTflops = $derived(
Expand All @@ -19,6 +25,9 @@
0,
),
);
const totalAneTops = $derived(
Object.values(aneProfiles).reduce((sum, p) => sum + getPeakAneTops(p), 0),
);
const totalMemoryBytes = $derived(
Object.values(topology?.nodes ?? {}).reduce(
(sum, n) => sum + (n.system_info?.memory ?? 0),
Expand All @@ -31,6 +40,19 @@
return `${value.toFixed(1)} TFLOPS`;
}

/**
 * Format an ANE throughput figure for the stat bar.
 *
 * Values of 1000 or more are divided by 1000 and shown with two decimals and
 * a `POPS` suffix; everything else is shown with one decimal and a `TOPS`
 * suffix.
 */
function formatTops(value: number): string {
  const usePetaScale = value >= 1000;
  const scaled = usePetaScale ? value / 1000 : value;
  return usePetaScale
    ? `${scaled.toFixed(2)} POPS`
    : `${scaled.toFixed(1)} TOPS`;
}

/**
 * Highest finite `computeTops` across a node's ANE precision profiles.
 *
 * Returns 0 when the profile is missing, has no precision entries, or has no
 * usable measurement. Entries whose `computeTops` is `null`, `undefined`,
 * `NaN` or infinite are ignored, so one bad sample cannot turn the summed
 * "ANE peak" stat into `NaN` or `Infinity`.
 *
 * @param profile - ANE profile of a single node, if one has been reported.
 * @returns The peak throughput, never negative and always finite.
 */
function getPeakAneTops(profile: RawNodeAneProfile | undefined): number {
  if (!profile) return 0;

  let peak = 0;
  for (const { computeTops } of profile.precisionProfiles) {
    // Only real, finite measurements may raise the peak.
    if (
      typeof computeTops === "number" &&
      Number.isFinite(computeTops) &&
      computeTops > peak
    ) {
      peak = computeTops;
    }
  }
  return peak;
}

function formatBandwidth(value: number): string {
if (value >= 1000) return `${(value / 1000).toFixed(2)} TB/s`;
return `${value.toFixed(0)} GB/s`;
Expand All @@ -53,6 +75,10 @@
<span class="stat-value">{formatBandwidth(totalBandwidthGbps)}</span>
<span class="stat-label">Memory bandwidth</span>
</div>
<div class="stat-block">
<span class="stat-value">{formatTops(totalAneTops)}</span>
<span class="stat-label">ANE peak</span>
</div>
<div class="stat-block">
<span class="stat-value">{formatMemory(totalMemoryBytes)}</span>
<span class="stat-label">Memory</span>
Expand Down
58 changes: 55 additions & 3 deletions dashboard/src/lib/components/TopologyGraph.svelte
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@
nodeThunderboltBridge,
nodeRdmaCtl,
nodeIdentities,
nodeAneProfile,
nodeGpuProfile,
nodeLinkProfiles,
nodeNetworkRaw,
nodeThunderbolt,
type NodeInfo,
type RawNodeAnePrecisionProfile,
type RawNodeAneProfile,
} from "$lib/stores/app.svelte";
import {
inferRdmaConnectionType,
Expand Down Expand Up @@ -44,6 +47,7 @@
const tbBridgeData = $derived(nodeThunderboltBridge());
const rdmaCtlData = $derived(nodeRdmaCtl());
const identitiesData = $derived(nodeIdentities());
const aneProfileData = $derived(nodeAneProfile());
const gpuProfileData = $derived(nodeGpuProfile());
const linkProfilesData = $derived(nodeLinkProfiles());
const nodeNetworkData = $derived(nodeNetworkRaw());
Expand All @@ -70,6 +74,32 @@
return `${value.toFixed(2)} ms`;
}

/**
 * Compact throughput label for the topology graph.
 *
 * - `null`, `undefined` and non-finite values render as an em dash.
 * - Values of 1000 or more are divided by 1000 and shown with two decimals
 *   and a `P` suffix.
 * - Other values get a `T` suffix, with no decimals from 10 upwards and one
 *   decimal below 10.
 */
function formatAneTops(value: number | null | undefined): string {
  if (value === null || value === undefined || !isFinite(value)) {
    return "—";
  }

  if (value >= 1000) {
    const scaled = value / 1000;
    return `${scaled.toFixed(2)}P`;
  }

  const fractionDigits = value >= 10 ? 0 : 1;
  return `${value.toFixed(fractionDigits)}T`;
}

/**
 * Build the label for one ANE precision entry: a `W<weightBits>A<activationBits>`
 * tag, a space, then the entry's throughput as rendered by `formatAneTops`.
 */
function formatAnePrecisionLabel(
  profile: RawNodeAnePrecisionProfile,
): string {
  const { weightBits, activationBits, computeTops } = profile;
  const precisionTag = `W${weightBits}A${activationBits}`;
  const throughput = formatAneTops(computeTops);
  return [precisionTag, throughput].join(" ");
}

/**
 * Summarise a node's ANE profile as a single line of per-precision labels.
 *
 * Precisions are listed in the fixed order 16, 8, 4, 32. A precision is
 * included only when the profile has an entry for it and that entry is marked
 * `supported`. If several entries share the same `precisionBits`, the last
 * one wins. Labels are joined with " · "; the result is an empty string when
 * nothing qualifies.
 */
function formatAneProfileLine(profile: RawNodeAneProfile): string {
  type PrecisionBits = RawNodeAnePrecisionProfile["precisionBits"];
  const displayOrder: PrecisionBits[] = [16, 8, 4, 32];

  // Index entries by bit width; later entries overwrite earlier ones.
  const entryByBits = new Map<PrecisionBits, RawNodeAnePrecisionProfile>();
  for (const entry of profile.precisionProfiles) {
    entryByBits.set(entry.precisionBits, entry);
  }

  const labels: string[] = [];
  for (const bits of displayOrder) {
    const entry = entryByBits.get(bits);
    if (entry?.supported) {
      labels.push(formatAnePrecisionLabel(entry));
    }
  }
  return labels.join(" · ");
}

interface PairProfileEntry {
fromId: string;
toId: string;
Expand Down Expand Up @@ -1242,8 +1272,9 @@
.text(powerText);
}

// GPU profile (TFLOPS + memory bandwidth) — only shown when we have a
// measurement; otherwise the slot collapses.
// Hardware profiles are only shown when we have measurements; otherwise
// their slots collapse.
const aneProfile = aneProfileData[nodeInfo.id];
const gpuProfile = gpuProfileData[nodeInfo.id];

// Labels - adapt based on mode
Expand Down Expand Up @@ -1317,6 +1348,21 @@
.attr("fill", "rgba(255,215,0,0.8)")
.text(`${gpuProfile.memoryBandwidthGbps.toFixed(0)} GB/s`);
}
if (aneProfile) {
const aneProfileLine = formatAneProfileLine(aneProfile);
const aneY = infoY + fontSize * (gpuProfile ? 1.95 : 1.05);
const aneText = nodeG
.append("text")
.attr("x", nodeInfo.x)
.attr("y", aneY)
.attr("text-anchor", "middle")
.attr("font-size", fontSize * 0.72)
.attr("font-family", "SF Mono, Monaco, monospace");
aneText
.append("tspan")
.attr("fill", "rgba(96,165,250,0.95)")
.text(aneProfileLine ? `ANE ${aneProfileLine}` : "ANE profiling");
}
} else if (showCompactLabels) {
// COMPACT MODE: Just name and basic info (4+ nodes)
const fontSize = Math.max(7, nodeRadius * 0.11);
Expand Down Expand Up @@ -1426,10 +1472,15 @@

// Debug mode: Show TB bridge and RDMA status
if (debugEnabled) {
const profileLineCount =
(gpuProfile ? 1 : 0) + (aneProfile && showFullLabels ? 1 : 0);
const profileDebugOffset =
showFullLabels && profileLineCount > 1 ? 12 : 0;
let debugLabelY =
nodeInfo.y +
iconBaseHeight / 2 +
(showFullLabels ? 32 : showCompactLabels ? 26 : 22);
(showFullLabels ? 32 : showCompactLabels ? 26 : 22) +
profileDebugOffset;
const debugFontSize = showFullLabels ? 9 : 7;
const debugLineHeight = showFullLabels ? 11 : 9;

Expand Down Expand Up @@ -1493,6 +1544,7 @@
const _hoveredNodeId = hoveredNodeId;
const _filteredNodes = filteredNodes;
const _highlightedNodes = highlightedNodes;
const _ane = aneProfileData;
const _gpu = gpuProfileData;
const _links = linkProfilesData;
const _network = nodeNetworkData;
Expand Down
29 changes: 29 additions & 0 deletions dashboard/src/lib/stores/app.svelte.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ export interface NodeInfo {
};
gpu_usage?: [number, number];
sys_power?: number;
ane_power?: number;
};
last_macmon_update: number;
friendly_name?: string;
Expand Down Expand Up @@ -100,6 +101,7 @@ interface RawSystemPerformanceProfile {
gpuUsage?: number;
temp?: number;
sysPower?: number;
anePower?: number;
pcpuUsage?: number;
ecpuUsage?: number;
}
Expand Down Expand Up @@ -133,6 +135,27 @@ export interface RawNodeGpuProfile {
measuredAt: string;
}

/**
 * One per-precision entry of a node's raw ANE profile
 * (the element type of `RawNodeAneProfile.precisionProfiles`).
 *
 * The three bit-width fields are restricted to 32, 16, 8 or 4. Every
 * measurement field is optional and most may also be `null`, so consumers
 * must handle missing values.
 */
export interface RawNodeAnePrecisionProfile {
// Bit width identifying this entry.
precisionBits: 32 | 16 | 8 | 4;
// Weight bit width.
weightBits: 32 | 16 | 8 | 4;
// Activation bit width.
activationBits: 32 | 16 | 8 | 4;
// Whether this precision is flagged as supported.
supported: boolean;
// Throughput figure for this precision; may be absent or null.
computeTops?: number | null;
// NOTE(review): the exact meaning of the remaining measurement fields is not
// visible in this file — confirm against the backend profiler before relying
// on them.
weightOnlyComputeTops?: number | null;
singleInstanceComputeTops?: number | null;
computeInstances?: number;
memoryBandwidthGbps?: number | null;
activationQuantizationSpeedup?: number | null;
nativeQuantizedCompute?: boolean | null;
// Optional error text attached to this entry; may be absent or null.
error?: string | null;
}

/**
 * Raw ANE profile for a single node: a list of per-precision entries plus
 * the time the profile was measured.
 */
export interface RawNodeAneProfile {
// Literal tag; always the string "ane".
engine: "ane";
// One entry per profiled precision.
precisionProfiles: RawNodeAnePrecisionProfile[];
// Measurement time as a string — presumably ISO 8601; TODO confirm format.
measuredAt: string;
}

export interface RawNodeSocketLinkProfile {
transport: "socket";
sinkIp: string;
Expand Down Expand Up @@ -309,6 +332,8 @@ interface RawStateResponse {
>;
// Per-node GPU compute + memory bandwidth profile.
nodeGpuProfile?: Record<string, RawNodeGpuProfile>;
// Per-node ANE compute + streaming bandwidth profile.
nodeAneProfile?: Record<string, RawNodeAneProfile>;
// Per-edge link probe results, keyed source -> sink -> [profiles].
nodeLinkProfiles?: RawNodeLinkProfiles;
}
Expand Down Expand Up @@ -511,6 +536,7 @@ function transformTopology(
gpu_usage:
system?.gpuUsage !== undefined ? [0, system.gpuUsage] : undefined,
sys_power: system?.sysPower,
ane_power: system?.anePower,
},
last_macmon_update: Date.now() / 1000,
friendly_name: identity?.friendlyName,
Expand Down Expand Up @@ -634,6 +660,7 @@ class AppStore {
>
>({});
nodeGpuProfile = $state<Record<string, RawNodeGpuProfile>>({});
nodeAneProfile = $state<Record<string, RawNodeAneProfile>>({});
nodeLinkProfiles = $state<RawNodeLinkProfiles>({});
nodeNetworkRaw = $state<Record<string, RawNodeNetworkInfo>>({});

Expand Down Expand Up @@ -1406,6 +1433,7 @@ class AppStore {
this.nodeThunderboltBridge = data.nodeThunderboltBridge ?? {};
// Profiler outputs
this.nodeGpuProfile = data.nodeGpuProfile ?? {};
this.nodeAneProfile = data.nodeAneProfile ?? {};
this.nodeLinkProfiles = data.nodeLinkProfiles ?? {};
// Raw network info — kept so the connection-type inference can use
// interfaceType, which the topology-shaped `NodeInfo.network_interfaces`
Expand Down Expand Up @@ -3671,6 +3699,7 @@ export const nodeThunderboltBridge = () => appStore.nodeThunderboltBridge;

// Profiler outputs
export const nodeGpuProfile = () => appStore.nodeGpuProfile;
export const nodeAneProfile = () => appStore.nodeAneProfile;
export const nodeLinkProfiles = () => appStore.nodeLinkProfiles;
export const nodeNetworkRaw = () => appStore.nodeNetworkRaw;

Expand Down
3 changes: 3 additions & 0 deletions src/exo/api/types/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,13 +186,16 @@ class NodePowerStats(BaseModel, frozen=True):
node_id: NodeId
samples: int
avg_sys_power: float
avg_ane_power: float


# Frozen model bundling a list of per-node power stats with aggregate totals.
# ANE figures are carried next to the system-wide ones. How the totals are
# derived is not shown here; units follow each field-name suffix.
class PowerUsage(BaseModel, frozen=True):
# Elapsed time, in seconds per the field name.
elapsed_seconds: float
# Per-node statistics.
nodes: list[NodePowerStats]
# Average system power total, in watts per the field name.
total_avg_sys_power_watts: float
# Average ANE power total, in watts per the field name.
total_avg_ane_power_watts: float
# Energy total, in joules per the field name.
total_energy_joules: float
# ANE energy total, in joules per the field name.
total_ane_energy_joules: float


class BenchChatCompletionResponse(ChatCompletionResponse):
Expand Down
35 changes: 35 additions & 0 deletions src/exo/shared/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
)
from exo.shared.types.instance_link import InstanceLink, InstanceLinkId
from exo.shared.types.profiling import (
NodeAnePrecisionProfile,
NodeAneProfile,
NodeGpuProfile,
NodeIdentity,
NodeLinkProfile,
Expand Down Expand Up @@ -67,6 +69,7 @@
StaticNodeInformation,
ThunderboltBridgeInfo,
)
from exo.utils.profilers.ane_profiler import AneProfile
from exo.utils.profilers.gpu_profiler import GpuProfile
from exo.utils.profilers.link_profiler import RDMALinkProfile, SocketLinkProfile

Expand Down Expand Up @@ -315,6 +318,11 @@ def apply_node_timed_out(event: NodeTimedOut, state: State) -> State:
for key, value in state.node_gpu_profile.items()
if key != event.node_id
}
node_ane_profile = {
key: value
for key, value in state.node_ane_profile.items()
if key != event.node_id
}
# Drop the leaving node both as source (outer key) and as sink (inner key).
node_link_profiles: dict[NodeId, Mapping[NodeId, Sequence[NodeLinkProfile]]] = {}
for source_id, sinks in state.node_link_profiles.items():
Expand Down Expand Up @@ -348,6 +356,7 @@ def apply_node_timed_out(event: NodeTimedOut, state: State) -> State:
"node_thunderbolt": node_thunderbolt,
"node_thunderbolt_bridge": node_thunderbolt_bridge,
"node_rdma_ctl": node_rdma_ctl,
"node_ane_profile": node_ane_profile,
"node_gpu_profile": node_gpu_profile,
"node_link_profiles": node_link_profiles,
"thunderbolt_bridge_cycles": thunderbolt_bridge_cycles,
Expand Down Expand Up @@ -467,6 +476,32 @@ def apply_node_gathered_info(event: NodeGatheredInfo, state: State) -> State:
measured_at=measured_at,
),
}
case AneProfile():
measured_at = datetime.fromisoformat(event.when)
update["node_ane_profile"] = {
**state.node_ane_profile,
event.node_id: NodeAneProfile(
engine=info.engine,
precision_profiles=tuple(
NodeAnePrecisionProfile(
precision_bits=profile.precision_bits,
weight_bits=profile.weight_bits,
activation_bits=profile.activation_bits,
supported=profile.supported,
compute_tops=profile.compute_tops,
weight_only_compute_tops=profile.weight_only_compute_tops,
single_instance_compute_tops=profile.single_instance_compute_tops,
compute_instances=profile.compute_instances,
memory_bandwidth_gbps=profile.memory_bandwidth_gbps,
activation_quantization_speedup=profile.activation_quantization_speedup,
native_quantized_compute=profile.native_quantized_compute,
error=profile.error,
)
for profile in info.precision_profiles
),
measured_at=measured_at,
),
}
case SocketLinkProfile():
measured_at = datetime.fromisoformat(event.when)
new_entry = NodeSocketLinkProfile(
Expand Down
26 changes: 26 additions & 0 deletions src/exo/shared/types/profiling.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ class SystemPerformanceProfile(FrozenModel):
gpu_usage: float = 0.0
temp: float = 0.0
sys_power: float = 0.0
ane_power: float = 0.0
pcpu_usage: float = 0.0
ecpu_usage: float = 0.0

Expand Down Expand Up @@ -121,6 +122,31 @@ class NodeGpuProfile(FrozenModel):
measured_at: datetime


class NodeAnePrecisionProfile(FrozenModel):
"""Measured ANE profile for a single weight precision."""

# Bit widths are restricted to 32, 16, 8 or 4.
precision_bits: Literal[32, 16, 8, 4]
weight_bits: Literal[32, 16, 8, 4]
activation_bits: Literal[32, 16, 8, 4]
# Whether this precision is flagged as supported (required, no default).
supported: bool
# Measurement fields below are optional and default to None, except
# compute_instances, which defaults to 1.
compute_tops: float | None = None
weight_only_compute_tops: float | None = None
single_instance_compute_tops: float | None = None
compute_instances: int = 1
memory_bandwidth_gbps: float | None = None
activation_quantization_speedup: float | None = None
native_quantized_compute: bool | None = None
# Optional error text for this precision; None when not set.
error: str | None = None

class NodeAneProfile(FrozenModel):
"""Measured ANE compute throughput and streaming bandwidth for a node."""

# Literal tag; always the string "ane".
engine: Literal["ane"]
# One entry per profiled precision.
precision_profiles: Sequence[NodeAnePrecisionProfile]
# When the profile was measured.
measured_at: datetime


class NodeSocketLinkProfile(FrozenModel):
"""Per-direction TCP/IP bandwidth + round-trip latency (with jitter).

Expand Down
Loading
Loading