From f2f5031cf8c1b6723f37ed4a9e40b27c58d4e1f7 Mon Sep 17 00:00:00 2001 From: Moshe Malawach Date: Wed, 10 Jun 2026 09:39:07 +0200 Subject: [PATCH] fix(stats): degrade gracefully instead of rendering 0/0 on transient API failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit getOverviewStats was an all-or-nothing Promise.all over /stats plus every page of /vms (~52 requests) and /nodes; one transient failure rejected the whole query, and StatCard coerced the missing data to a literal "0 / 0". - client.ts: headline totals come from the single cheap /api/v1/stats call; the VM/node page fan-outs get catch fallbacks so derived breakdowns degrade to empty instead of rejecting. Total VMs keeps the 7d retention count (Decision #110) when the fan-out succeeds and falls back to stats.total_vms when it fails. - stats-bar.tsx: explicit error state — em-dash + "Data unavailable" instead of coercing undefined to 0. - use-overview-stats.ts: placeholderData: keepPreviousData so last-good values persist through transient refetch failures. - scripts/smoke-stats.mjs: smoke test exercising the client's exact data path against the live scheduler API (nonzero totals, fan-out within 5% of /stats; headless-browser DOM check deliberately out of scope). Refs marketing backlog P0-03. --- scripts/smoke-stats.mjs | 106 ++++++++++++++++++++++++++++++++ src/api/client.ts | 19 ++++-- src/components/stats-bar.tsx | 20 +++++- src/hooks/use-overview-stats.ts | 5 +- 4 files changed, 142 insertions(+), 8 deletions(-) create mode 100644 scripts/smoke-stats.mjs diff --git a/scripts/smoke-stats.mjs b/scripts/smoke-stats.mjs new file mode 100644 index 0000000..8611001 --- /dev/null +++ b/scripts/smoke-stats.mjs @@ -0,0 +1,106 @@ +#!/usr/bin/env node +// Smoke test for the Overview headline stats (marketing backlog P0-03: +// dashboard rendering "0 nodes / 0 VMs" on transient API failures). +// +// The deployed dashboard is a static export rendered client-side with React +// Query, so its HTML carries no stat values — verifying the *rendered* DOM +// would require headless Chrome, which we deliberately avoid here (no new +// deps). Instead this script exercises the exact data path the client uses +// (`getOverviewStats` in src/api/client.ts): the cheap /api/v1/stats call +// for the headline totals, plus the paginated /api/v1/vms and /api/v1/nodes +// fan-outs for the derived breakdowns. It asserts: +// +// 1. /api/v1/stats reports nonzero total_nodes / total_vms / healthy_nodes +// 2. The node fan-out count agrees with stats.total_nodes within 5% +// 3. The 7d-retention VM count (the Overview "Total VMs" headline, +// Decision #110) is nonzero +// +// Usage: node scripts/smoke-stats.mjs [api-base-url] +// default api-base-url: https://rust-scheduler.aleph.im + +const BASE_URL = process.argv[2] ?? "https://rust-scheduler.aleph.im"; +const MAX_PAGE_SIZE = 200; // mirrors MAX_PAGE_SIZE in src/api/client.ts +const RETENTION_MS = 7 * 86_400_000; // DEFAULT_RETENTION ("7d") in src/lib/filters.ts +const TOLERANCE = 0.05; + +let failures = 0; + +function check(ok, label, detail) { + const status = ok ? "ok " : "FAIL"; + console.log(`[${status}] ${label}${detail ? ` — ${detail}` : ""}`); + if (!ok) failures++; +} + +async function fetchJson(path) { + const res = await fetch(`${BASE_URL}${path}`); + if (!res.ok) { + throw new Error(`API error: ${res.status} ${res.statusText} for ${path}`); + } + return res.json(); +} + +// Mirrors fetchAllPages in src/api/client.ts. +async function fetchAllPages(path) { + const separator = path.includes("?") ? "&" : "?"; + const firstPage = await fetchJson( + `${path}${separator}page=1&page_size=${MAX_PAGE_SIZE}`, + ); + if (firstPage.pagination.total_pages <= 1) return firstPage.items; + const remaining = Array.from( + { length: firstPage.pagination.total_pages - 1 }, + (_, i) => + fetchJson(`${path}${separator}page=${i + 2}&page_size=${MAX_PAGE_SIZE}`), + ); + const pages = await Promise.all(remaining); + return [firstPage, ...pages].flatMap((p) => p.items); +} + +function withinTolerance(a, b) { + if (b === 0) return a === 0; + return Math.abs(a - b) / b <= TOLERANCE; +} + +// Mirrors lastActivityMs + applyRetentionWindow in src/lib/filters.ts +// (wire-format field names, since we skip the client's transform step). +function lastActivityMs(vm) { + const t = (s) => (s ? new Date(s).getTime() : Number.NEGATIVE_INFINITY); + return Math.max( + t(vm.last_observed_at), + t(vm.updated_at), + t(vm.allocated_at), + ); +} + +const stats = await fetchJson("/api/v1/stats"); +check(stats.total_nodes > 0, "stats.total_nodes nonzero", String(stats.total_nodes)); +check(stats.total_vms > 0, "stats.total_vms nonzero", String(stats.total_vms)); +check( + stats.healthy_nodes > 0, + "stats.healthy_nodes nonzero", + String(stats.healthy_nodes), +); + +const [nodes, vms] = await Promise.all([ + fetchAllPages("/api/v1/nodes"), + fetchAllPages("/api/v1/vms"), +]); + +check( + withinTolerance(nodes.length, stats.total_nodes), + "node fan-out agrees with stats.total_nodes (±5%)", + `${nodes.length} fetched vs ${stats.total_nodes} reported`, +); + +const cutoff = Date.now() - RETENTION_MS; +const recentVms = vms.filter((vm) => lastActivityMs(vm) >= cutoff).length; +check( + recentVms > 0, + 'Overview "Total VMs" headline (7d retention) nonzero', + `${recentVms} of ${vms.length} fetched VMs`, +); + +if (failures > 0) { + console.error(`\n${failures} check(s) failed against ${BASE_URL}`); + process.exit(1); +} +console.log(`\nAll checks passed against ${BASE_URL}`); diff --git a/src/api/client.ts b/src/api/client.ts index 4ff0465..b26a796 100644 --- a/src/api/client.ts +++ b/src/api/client.ts @@ -257,13 +257,17 @@ export async function getVM(hash: string): Promise { } export async function getOverviewStats(): Promise { + // Headline totals come from the single cheap /stats call. The VM/node + // page fan-outs (dozens of requests) only feed derived breakdowns, so a + // transient failure there degrades those to empty (null → []) instead + // of rejecting the whole query and rendering the headline as "0 / 0". const [stats, rawVms, rawNodes] = await Promise.all([ fetchApi("/api/v1/stats"), - fetchAllPages("/api/v1/vms"), - fetchAllPages("/api/v1/nodes"), + fetchAllPages("/api/v1/vms").catch(() => null), + fetchAllPages("/api/v1/nodes").catch(() => null), ]); - const nodes = rawNodes.map(transformNode); - const vms = rawVms.map(transformVm); + const nodes = (rawNodes ?? []).map(transformNode); + const vms = (rawVms ?? []).map(transformVm); return { totalNodes: stats.total_nodes, healthyNodes: stats.healthy_nodes, @@ -274,7 +278,12 @@ export async function getOverviewStats(): Promise { .length, removedNodes: nodes.filter((n) => n.status === "removed") .length, - totalVMs: applyRetentionWindow(vms, DEFAULT_RETENTION, Date.now()).length, + // Retention-window count when the VM list is available (Decision + // #110); all-time total from /stats when the fan-out failed, so the + // headline never collapses to 0 on a transient error. + totalVMs: rawVms + ? applyRetentionWindow(vms, DEFAULT_RETENTION, Date.now()).length + : stats.total_vms, dispatchedVMs: vms.filter((v) => v.status === "dispatched") .length, missingVMs: vms.filter((v) => v.status === "missing").length, diff --git a/src/components/stats-bar.tsx b/src/components/stats-bar.tsx index 4941816..9f56a96 100644 --- a/src/components/stats-bar.tsx +++ b/src/components/stats-bar.tsx @@ -17,6 +17,7 @@ type StatProps = { total: number | undefined; subtitle: string; isLoading: boolean; + isError: boolean; color?: string | undefined; tint?: string | undefined; icon?: React.ReactNode; @@ -89,11 +90,15 @@ function StatCard({ total, subtitle, isLoading, + isError, color, tint, icon, }: Omit) { const showRing = color && !isLoading && value !== undefined && total; + // No value to show (initial fetch failed and there is no cached data): + // render an explicit "unavailable" state instead of coercing to 0. + const unavailable = !isLoading && isError && value === undefined; return (
{isLoading ? ( + ) : unavailable ? ( +

+ — +

) : (

)}

- {subtitle} + {unavailable ? "Data unavailable" : subtitle}

); @@ -187,7 +199,7 @@ const iconCheck = ( ); export function StatsBar() { - const { data: stats, isLoading } = useOverviewStats(); + const { data: stats, isLoading, isError } = useOverviewStats(); const hasDispatched = (stats?.dispatchedVMs ?? 0) > 0; @@ -201,6 +213,7 @@ export function StatsBar() { total={undefined} subtitle="Compute nodes registered with the scheduler" isLoading={isLoading} + isError={isError} href="/nodes" index={0} /> @@ -210,6 +223,7 @@ export function StatsBar() { total={stats?.totalNodes} subtitle="Nodes that passed their last health check" isLoading={isLoading} + isError={isError} color="var(--color-success-500)" tint="var(--color-success-500)" icon={iconCheck} @@ -225,6 +239,7 @@ export function StatsBar() { total={undefined} subtitle="VMs active in the last 7 days" isLoading={isLoading} + isError={isError} href="/vms" index={2} /> @@ -234,6 +249,7 @@ export function StatsBar() { total={stats?.totalVMs} subtitle="VMs running on their correct assigned node" isLoading={isLoading} + isError={isError} icon={iconCheck} href="/vms?status=dispatched" index={3} diff --git a/src/hooks/use-overview-stats.ts b/src/hooks/use-overview-stats.ts index 2a4a972..54ed23c 100644 --- a/src/hooks/use-overview-stats.ts +++ b/src/hooks/use-overview-stats.ts @@ -1,4 +1,4 @@ -import { useQuery } from "@tanstack/react-query"; +import { keepPreviousData, useQuery } from "@tanstack/react-query"; import { getOverviewStats } from "@/api/client"; export function useOverviewStats() { @@ -6,5 +6,8 @@ export function useOverviewStats() { queryKey: ["overview-stats"], queryFn: getOverviewStats, refetchInterval: 30_000, + // Keep the last-good stats on screen through transient refetch + // failures instead of dropping back to `undefined` (rendered "0"). + placeholderData: keepPreviousData, }); }