Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 72 additions & 6 deletions v2/pkg/engine/resolve/cache_analytics.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,15 @@ type CacheWriteEvent struct {

// FetchTimingEvent records the duration of a subgraph fetch or cache lookup.
type FetchTimingEvent struct {
DataSource string // subgraph name
EntityType string // entity type (empty for root fetches)
DurationMs int64 // time spent on this operation in milliseconds
Source FieldSource // what handled this: Subgraph (fetch), L2 (cache GET)
ItemCount int // number of entities in this fetch/lookup
IsEntityFetch bool // true for _entities, false for root field
DataSource string // subgraph name
EntityType string // entity type (empty for root fetches)
DurationMs int64 // time spent on this operation in milliseconds
Source FieldSource // what handled this: Subgraph (fetch), L2 (cache GET)
ItemCount int // number of entities in this fetch/lookup
IsEntityFetch bool // true for _entities, false for root field
HTTPStatusCode int // HTTP status code from subgraph response (0 for cache hits)
ResponseBytes int // response body size in bytes (0 for cache hits)
TTFBMs int64 // time to first byte in milliseconds (0 when unavailable)
}

// SubgraphErrorEvent records a subgraph error for analytics.
Expand Down Expand Up @@ -824,6 +827,69 @@ func (s *CacheAnalyticsSnapshot) ShadowFreshnessRateByEntityType() map[string]fl
return result
}

// SubgraphRequestMetrics holds per-subgraph aggregate metrics for a single request.
// Designed for export to external SLO systems (e.g., schema registry).
type SubgraphRequestMetrics struct {
SubgraphName string
RequestCount int // number of fetches to this subgraph
ErrorCount int // number of errors from this subgraph
TotalDurationMs int64 // sum of fetch durations
MaxDurationMs int64 // max single-fetch duration
TotalResponseBytes int64 // sum of response body sizes
}

// SubgraphMetrics returns per-subgraph aggregate metrics for this request.
// Only considers actual subgraph fetches (not cache hits).
// Returns nil if there are no subgraph fetches or errors.
func (s *CacheAnalyticsSnapshot) SubgraphMetrics() []SubgraphRequestMetrics {
// Collect metrics by subgraph name, preserving insertion order
type entry struct {
metrics SubgraphRequestMetrics
index int
}
byName := make(map[string]*entry)
var order []string

for _, ft := range s.FetchTimings {
if ft.Source != FieldSourceSubgraph {
continue
}
e, ok := byName[ft.DataSource]
if !ok {
e = &entry{metrics: SubgraphRequestMetrics{SubgraphName: ft.DataSource}, index: len(order)}
byName[ft.DataSource] = e
order = append(order, ft.DataSource)
}
e.metrics.RequestCount++
e.metrics.TotalDurationMs += ft.DurationMs
if ft.DurationMs > e.metrics.MaxDurationMs {
e.metrics.MaxDurationMs = ft.DurationMs
}
e.metrics.TotalResponseBytes += int64(ft.ResponseBytes)
}

for _, ev := range s.ErrorEvents {
e, ok := byName[ev.DataSource]
if !ok {
e = &entry{metrics: SubgraphRequestMetrics{SubgraphName: ev.DataSource}, index: len(order)}
byName[ev.DataSource] = e
order = append(order, ev.DataSource)
}
e.metrics.ErrorCount++
}

if len(order) == 0 {
return nil
}

results := make([]SubgraphRequestMetrics, len(order))
for _, name := range order {
e := byName[name]
results[e.index] = e.metrics
}
return results
}

// computeCacheAgeMs computes cache age in milliseconds from remaining TTL and original TTL.
// Returns 0 if either value is zero or if the computed age would be negative.
func computeCacheAgeMs(remainingTTL, originalTTL time.Duration) int64 {
Expand Down
14 changes: 8 additions & 6 deletions v2/pkg/engine/resolve/loader.go
Original file line number Diff line number Diff line change
Expand Up @@ -2260,12 +2260,14 @@ func (l *Loader) executeSourceLoad(ctx context.Context, fetchItem *FetchItem, so
isEntityFetch = info.OperationType == ast.OperationTypeQuery && (entityType != "Query" && entityType != "Mutation" && entityType != "Subscription")
}
res.l2FetchTimings = append(res.l2FetchTimings, FetchTimingEvent{
DataSource: res.ds.Name,
EntityType: entityType,
DurationMs: time.Since(fetchStart).Milliseconds(),
Source: FieldSourceSubgraph,
ItemCount: 1,
IsEntityFetch: isEntityFetch,
DataSource: res.ds.Name,
EntityType: entityType,
DurationMs: time.Since(fetchStart).Milliseconds(),
Source: FieldSourceSubgraph,
ItemCount: 1,
IsEntityFetch: isEntityFetch,
HTTPStatusCode: res.statusCode,
ResponseBytes: len(res.out),
})
}

Expand Down
165 changes: 165 additions & 0 deletions v2/pkg/engine/resolve/subgraph_metrics_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
package resolve

import (
"testing"

"github.com/stretchr/testify/assert"
)

func TestCacheAnalyticsSnapshot_SubgraphMetrics(t *testing.T) {
t.Run("returns nil when no subgraph fetches or errors", func(t *testing.T) {
snap := CacheAnalyticsSnapshot{
FetchTimings: []FetchTimingEvent{
{DataSource: "accounts", DurationMs: 10, Source: FieldSourceL1}, // cache hit, not subgraph
{DataSource: "accounts", DurationMs: 5, Source: FieldSourceL2}, // cache hit, not subgraph
},
}
assert.Equal(t, []SubgraphRequestMetrics(nil), snap.SubgraphMetrics())
})

t.Run("single subgraph with one fetch", func(t *testing.T) {
snap := CacheAnalyticsSnapshot{
FetchTimings: []FetchTimingEvent{
{DataSource: "accounts", DurationMs: 42, Source: FieldSourceSubgraph, ResponseBytes: 256, HTTPStatusCode: 200},
},
}
result := snap.SubgraphMetrics()
assert.Equal(t, 1, len(result), "should have exactly 1 subgraph")
assert.Equal(t, SubgraphRequestMetrics{
SubgraphName: "accounts",
RequestCount: 1,
ErrorCount: 0,
TotalDurationMs: 42,
MaxDurationMs: 42,
TotalResponseBytes: 256,
}, result[0])
})

t.Run("single subgraph with multiple fetches picks max duration", func(t *testing.T) {
snap := CacheAnalyticsSnapshot{
FetchTimings: []FetchTimingEvent{
{DataSource: "accounts", DurationMs: 10, Source: FieldSourceSubgraph, ResponseBytes: 100, HTTPStatusCode: 200},
{DataSource: "accounts", DurationMs: 50, Source: FieldSourceSubgraph, ResponseBytes: 200, HTTPStatusCode: 200},
{DataSource: "accounts", DurationMs: 30, Source: FieldSourceSubgraph, ResponseBytes: 150, HTTPStatusCode: 200},
},
}
result := snap.SubgraphMetrics()
assert.Equal(t, 1, len(result), "should have exactly 1 subgraph")
assert.Equal(t, SubgraphRequestMetrics{
SubgraphName: "accounts",
RequestCount: 3,
ErrorCount: 0,
TotalDurationMs: 90,
MaxDurationMs: 50,
TotalResponseBytes: 450,
}, result[0])
})

t.Run("multiple subgraphs with mixed success and errors", func(t *testing.T) {
snap := CacheAnalyticsSnapshot{
FetchTimings: []FetchTimingEvent{
{DataSource: "accounts", DurationMs: 20, Source: FieldSourceSubgraph, ResponseBytes: 100},
{DataSource: "products", DurationMs: 80, Source: FieldSourceSubgraph, ResponseBytes: 500},
{DataSource: "accounts", DurationMs: 15, Source: FieldSourceSubgraph, ResponseBytes: 90},
{DataSource: "products", DurationMs: 120, Source: FieldSourceSubgraph, ResponseBytes: 600},
},
ErrorEvents: []SubgraphErrorEvent{
{DataSource: "products", EntityType: "Product", Message: "timeout", Code: "TIMEOUT"},
{DataSource: "reviews", EntityType: "Review", Message: "not found", Code: "NOT_FOUND"},
},
}
result := snap.SubgraphMetrics()
assert.Equal(t, 3, len(result), "should have exactly 3 subgraphs")

// accounts: 2 fetches, 0 errors
assert.Equal(t, SubgraphRequestMetrics{
SubgraphName: "accounts",
RequestCount: 2,
ErrorCount: 0,
TotalDurationMs: 35,
MaxDurationMs: 20,
TotalResponseBytes: 190,
}, result[0])

// products: 2 fetches, 1 error
assert.Equal(t, SubgraphRequestMetrics{
SubgraphName: "products",
RequestCount: 2,
ErrorCount: 1,
TotalDurationMs: 200,
MaxDurationMs: 120,
TotalResponseBytes: 1100,
}, result[1])

// reviews: 0 fetches, 1 error (error-only subgraph)
assert.Equal(t, SubgraphRequestMetrics{
SubgraphName: "reviews",
RequestCount: 0,
ErrorCount: 1,
TotalDurationMs: 0,
MaxDurationMs: 0,
}, result[2])
})

t.Run("cache hits are excluded from subgraph metrics", func(t *testing.T) {
snap := CacheAnalyticsSnapshot{
FetchTimings: []FetchTimingEvent{
{DataSource: "accounts", DurationMs: 0, Source: FieldSourceL1}, // L1 cache hit
{DataSource: "accounts", DurationMs: 5, Source: FieldSourceL2}, // L2 cache hit
{DataSource: "accounts", DurationMs: 30, Source: FieldSourceSubgraph}, // actual fetch
},
}
result := snap.SubgraphMetrics()
assert.Equal(t, 1, len(result), "should have exactly 1 subgraph")
assert.Equal(t, 1, result[0].RequestCount, "should count only the subgraph fetch")
assert.Equal(t, int64(30), result[0].TotalDurationMs, "should only sum subgraph fetch duration")
})

t.Run("empty snapshot returns nil", func(t *testing.T) {
snap := CacheAnalyticsSnapshot{}
assert.Equal(t, []SubgraphRequestMetrics(nil), snap.SubgraphMetrics())
})

t.Run("errors-only subgraph has zero request count", func(t *testing.T) {
snap := CacheAnalyticsSnapshot{
ErrorEvents: []SubgraphErrorEvent{
{DataSource: "accounts", Message: "connection refused"},
{DataSource: "accounts", Message: "connection refused"},
},
}
result := snap.SubgraphMetrics()
assert.Equal(t, 1, len(result), "should have exactly 1 subgraph")
assert.Equal(t, SubgraphRequestMetrics{
SubgraphName: "accounts",
RequestCount: 0,
ErrorCount: 2,
}, result[0])
})
}

func TestFetchTimingEvent_NewFields(t *testing.T) {
t.Run("subgraph fetch carries HTTP status and response size", func(t *testing.T) {
event := FetchTimingEvent{
DataSource: "accounts",
DurationMs: 42,
Source: FieldSourceSubgraph,
HTTPStatusCode: 200,
ResponseBytes: 1024,
TTFBMs: 0, // not yet instrumented
}
assert.Equal(t, 200, event.HTTPStatusCode)
assert.Equal(t, 1024, event.ResponseBytes)
assert.Equal(t, int64(0), event.TTFBMs)
})

t.Run("cache hit has zero values for HTTP fields", func(t *testing.T) {
event := FetchTimingEvent{
DataSource: "accounts",
DurationMs: 1,
Source: FieldSourceL1,
}
assert.Equal(t, 0, event.HTTPStatusCode, "cache hits should have zero status code")
assert.Equal(t, 0, event.ResponseBytes, "cache hits should have zero response bytes")
assert.Equal(t, int64(0), event.TTFBMs, "cache hits should have zero TTFB")
})
}
Loading