go-coldbrew · ankurs · Apr 20, 2026 · Apr 17, 2026 · Apr 17, 2026 · Apr 17, 2026
diff --git a/BENCHMARKS.md b/BENCHMARKS.md
@@ -0,0 +1,127 @@
+# Benchmarks: Function / Type Name Resolution Caching
+
+`data-builder` resolves two "locations" on every request:
+
+1. **PC → function name** via `runtime.FuncForPC(pc).Name()`
+2. **`reflect.Type` → qualified struct name** via `t.PkgPath() + "." + t.Name()`
+
+Both are now cached in process-global `sync.Map`s (see `cache.go`). Keys
+(reflect.Type identity, function PC) are stable for the life of the program,
+so the caches never need eviction.
+
+## Reproducing
+
+```sh
+go install golang.org/x/perf/cmd/benchstat@latest
+
+# "before": with cache.go rewritten to a pass-through (no caching)
+go test -run=^$ -bench=. -benchmem -count=6 ./... | tee before.txt
+
+# "after": with cache.go in its cached form
+go test -run=^$ -bench=. -benchmem -count=6 ./... | tee after.txt
+
+benchstat before.txt after.txt
+```
+
+The benchmark suite lives in `benchmarks_test.go`. `make bench` runs it with
+`-count=1`; use the commands above for statistically stable comparisons.
+
+## Environment
+
+- `go version go1.25.9 linux/amd64`
+- CPU: INTEL(R) XEON(R) PLATINUM 8581C @ 2.10GHz (16 logical cores)
+- Kernel: Linux 4.4.0
+- `go test -count=6` per configuration, compared with `benchstat`
+
+## Results (benchstat)
+
+### Time per op
+
+| Benchmark                    |   Before |    After |          Δ |
+| ---------------------------- | -------: | -------: | ---------: |
+| `GetStructName_Uncached`     |  81.28ns |  84.98ns |         ~  |
+| `CachedStructName_Hit`       |  83.79ns |  11.23ns | **-86.6%** |
+| `CachedStructName_MixedHit`  |  86.16ns |  11.46ns | **-86.7%** |
+| `FuncForPC_Uncached`         |  32.71ns |  33.38ns |         ~  |
+| `ResolveFuncName_Hit`        |  32.60ns |  12.07ns | **-63.0%** |
+| `ResolveFuncName_MixedHit`   |  30.90ns |  12.13ns | **-60.7%** |
+| `AddBuilders`                |  3.950µs |  2.300µs | **-41.8%** |
+| `AddBuilders_ColdCache`      |  8.089µs | 10.357µs |    +28.1%  |
+| `Compile`                    |  6.920µs |  7.006µs |         ~  |
+| `RunParallel_Workers1`       |  15.64µs |  15.44µs |         ~  |
+| `RunParallel_Workers4`       |  20.71µs |  20.71µs |         ~  |
+| `RunParallel_Workers8`       |  23.91µs |  23.59µs |         ~  |
+| `ResultGet`                  | 103.70ns |  25.54ns | **-75.4%** |
+| `ResultGet_Parallel`         |  16.80ns |   1.54ns | **-90.8%** |
+| **geomean**                  |   498ns  |   244ns  | **-51.0%** |
+
+### Allocations
+
+| Benchmark               | Before   | After   | Δ B/op     | Δ allocs/op |
+| ----------------------- | -------: | ------: | ---------: | ----------: |
+| `CachedStructName_Hit`  |      48B |      0B |   **-100%** |   **-100%** |
+| `CachedStructName_MixedHit` |  51B |      0B |   **-100%** |   **-100%** |
+| `AddBuilders`           |   1872B |    928B |    -50.4%  |    -59.4%   |
+| `Compile`               |   4328B |   4266B |     -1.4%  |     -2.2%   |
+| `RunParallel_Workers1`  |   4945B |   4695B |     -5.1%  |     -6.3%   |
+| `RunParallel_Workers4`  |   5036B |   4786B |     -5.0%  |     -6.1%   |
+| `RunParallel_Workers8`  |   5161B |   4911B |     -4.8%  |     -5.8%   |
+| `ResultGet`             |     48B |      0B |   **-100%** |   **-100%** |
+| `ResultGet_Parallel`    |     48B |      0B |   **-100%** |   **-100%** |
+
+Statistical significance: all reported deltas have `p=0.002` with n=6; entries
+marked `~` are not statistically distinguishable from the baseline.
+
+## Interpretation
+
+**Where caching helps most**
+
+- `Result.Get` and the hot path inside `doWorkAndGetResult` / `RunParallel`
+  init loops used to allocate a fresh `string` for every type lookup
+  (`t.PkgPath() + "." + t.Name()`). Interning the result via `sync.Map`
+  eliminates that allocation entirely: `ResultGet` drops 78ns and one
+  allocation; under parallel load (`ResultGet_Parallel`) it goes from
+  16.8ns to 1.5ns — an **11× speedup** because `sync.Map`'s read-only
+  fast-path is lock-free and scales linearly across cores.
+- `AddBuilders` gets a steady-state 42% latency win and 59% fewer
+  allocations because each builder registration re-resolves the same input
+  and output type names several times via `IsValidBuilder` and `getBuilder`.
+- `FuncForPC` caching is a smaller absolute win (20ns / call) than struct
+  name caching, but it's on the same hot path for `getBuilder` and
+  `plan.Replace`, so it still helps `AddBuilders` directly.
+
+**Where caching does not help (and that's fine)**
+
+- `Compile` and `RunParallel` end-to-end are dominated by
+  `resolveDependencies`, goroutine scheduling, and `reflect.Value.Call`.
+  Name resolution is <5% of those timings, so benchstat reports "no
+  significant change" — but the memory column still shows a real reduction
+  (~1–6% bytes/allocs per run) because the per-lookup name strings are now
+  served from the cache instead of being allocated on the hot path.
+- `_Uncached` baselines for both resolvers are statistically
+  indistinguishable before and after (the `~` rows) — as expected, since
+  they call the un-cached code directly.
+
+**The `AddBuilders_ColdCache` regression**
+
+This synthetic benchmark resets both `sync.Map`s to empty at the start of
+every iteration, so the first lookup of each key in an iteration is a miss.
+On a miss, `sync.Map` is slower than direct computation because it pays for
+a `Load` plus a `LoadOrStore` on top of the original work. In production
+the cache warms up once and then serves hits forever, so this scenario
+isn't observable in practice — it's included only to pin the worst-case
+cost.
+
+## Caveats
+
+- `sync.Map` has higher per-op overhead than a plain `map` when the working
+  set is tiny **and** purely single-threaded. The `_MixedHit` benchmarks
+  are intentionally small (5 types / 4 PCs) and pre-warm the cache before
+  timing, so they measure mixed-key lookup overhead on a tiny hot set
+  rather than true cold misses. They still show ~61–87% wins, because
+  caching avoids repeated `PkgPath()+Name()` and `FuncForPC` work even in
+  that small-set regime. True cold-miss behavior is captured end-to-end by
+  `AddBuilders_ColdCache`, which resets both caches every iteration.
+- Absolute numbers depend on CPU, OS scheduler, and the number of distinct
+  types/builders the program touches. Don't generalize — re-measure in
+  the target deployment if it matters.
+- Benchmarks should be run with the machine idle; pin `GOMAXPROCS` if you
+  want tighter variance across runs.
diff --git a/benchmarks_test.go b/benchmarks_test.go
@@ -0,0 +1,235 @@
+package databuilder
+
+import (
+	"context"
+	"reflect"
+	"runtime"
+	"strings"
+	"testing"
+)
+
+// Quiet benchmark-only builder variants. The production fixtures in
+// common_test.go call fmt.Println and dominate end-to-end timings, hiding
+// the effect we want to measure.
+
+// Payload types used as builder inputs and outputs in the benchmarks. Each
+// is a distinct named struct carrying a single string field.
+type (
+	benchStructIn struct{ Value string }
+	benchStructA  struct{ Value string }
+	benchStructB  struct{ Value string }
+	benchStructC  struct{ Value string }
+	benchStructD  struct{ Value string }
+)
+
+// benchFuncA builds a benchStructA from the input, replacing every "-" in
+// its Value with "_". It never returns an error.
+func benchFuncA(_ context.Context, s benchStructIn) (benchStructA, error) {
+	out := benchStructA{Value: strings.ReplaceAll(s.Value, "-", "_")}
+	return out, nil
+}
+
+func benchFuncB(_ context.Context, s benchStructA) (benchStructB, error) {
+	return benchStructB{Value: s.Value + "B"}, nil
+}
+
+func benchFuncC(_ context.Context, s benchStructA) (benchStructC, error) {
+	return benchStructC{Value: s.Value + "C"}, nil
+}
+
+// benchFuncD builds a benchStructD with the fixed Value "D". Its two
+// parameters (the output types of benchFuncB and benchFuncC) are ignored.
+// It never returns an error.
+func benchFuncD(_ context.Context, _ benchStructB, _ benchStructC) (benchStructD, error) {
+	out := benchStructD{Value: "D"}
+	return out, nil
+}
+
+// uncachedStructName rebuilds the qualified name "<PkgPath>.<Name>" on every
+// call. It mirrors the pre-caching implementation so the micro-benchmarks
+// have an apples-to-apples baseline.
+func uncachedStructName(t reflect.Type) string {
+	pkg, name := t.PkgPath(), t.Name()
+	return pkg + "." + name
+}
+
+// --- struct name resolution ---
+
+// BenchmarkGetStructName_Uncached measures the baseline cost of building a
+// qualified struct name from scratch on every iteration, with no caching.
+func BenchmarkGetStructName_Uncached(b *testing.B) {
+	typ := reflect.TypeOf(benchStructA{})
+
+	b.ReportAllocs()
+	b.ResetTimer()
+	for n := 0; n < b.N; n++ {
+		_ = uncachedStructName(typ)
+	}
+}
+
+// BenchmarkCachedStructName_Hit measures cachedStructName for a single type
+// that has already been looked up once before timing starts.
+func BenchmarkCachedStructName_Hit(b *testing.B) {
+	typ := reflect.TypeOf(benchStructA{})
+	cachedStructName(typ) // prime the cache outside the timed region
+
+	b.ReportAllocs()
+	b.ResetTimer()
+	for n := 0; n < b.N; n++ {
+		_ = cachedStructName(typ)
+	}
+}
+
+// BenchmarkCachedStructName_MixedHit measures cachedStructName while cycling
+// round-robin through five types, all of which are looked up once before
+// timing starts.
+func BenchmarkCachedStructName_MixedHit(b *testing.B) {
+	pool := []reflect.Type{
+		reflect.TypeOf(benchStructIn{}),
+		reflect.TypeOf(benchStructA{}),
+		reflect.TypeOf(benchStructB{}),
+		reflect.TypeOf(benchStructC{}),
+		reflect.TypeOf(benchStructD{}),
+	}
+	// Prime every key outside the timed region.
+	for idx := range pool {
+		cachedStructName(pool[idx])
+	}
+
+	b.ReportAllocs()
+	b.ResetTimer()
+	for n := 0; n < b.N; n++ {
+		_ = cachedStructName(pool[n%len(pool)])
+	}
+}
+
+// --- function PC resolution ---
+
+// BenchmarkFuncForPC_Uncached measures the baseline cost of resolving a
+// function name directly through runtime.FuncForPC on every iteration.
+func BenchmarkFuncForPC_Uncached(b *testing.B) {
+	entry := reflect.ValueOf(benchFuncA).Pointer()
+
+	b.ReportAllocs()
+	b.ResetTimer()
+	for n := 0; n < b.N; n++ {
+		_ = runtime.FuncForPC(entry).Name()
+	}
+}
+
+// BenchmarkResolveFuncName_Hit measures resolveFuncName for a single PC that
+// has already been resolved once before timing starts.
+func BenchmarkResolveFuncName_Hit(b *testing.B) {
+	entry := reflect.ValueOf(benchFuncA).Pointer()
+	resolveFuncName(entry) // prime the cache outside the timed region
+
+	b.ReportAllocs()
+	b.ResetTimer()
+	for n := 0; n < b.N; n++ {
+		_ = resolveFuncName(entry)
+	}
+}
+
+// BenchmarkResolveFuncName_MixedHit measures resolveFuncName while cycling
+// round-robin through the PCs of the four bench builders, all of which are
+// resolved once before timing starts.
+func BenchmarkResolveFuncName_MixedHit(b *testing.B) {
+	entries := []uintptr{
+		reflect.ValueOf(benchFuncA).Pointer(),
+		reflect.ValueOf(benchFuncB).Pointer(),
+		reflect.ValueOf(benchFuncC).Pointer(),
+		reflect.ValueOf(benchFuncD).Pointer(),
+	}
+	// Prime every key outside the timed region.
+	for idx := range entries {
+		resolveFuncName(entries[idx])
+	}
+
+	b.ReportAllocs()
+	b.ResetTimer()
+	for n := 0; n < b.N; n++ {
+		_ = resolveFuncName(entries[n%len(entries)])
+	}
+}
+
+// --- registration ---
+
+// BenchmarkAddBuilders measures registering the four bench builders on a
+// fresh instance per iteration, with the name caches already warm.
+func BenchmarkAddBuilders(b *testing.B) {
+	// Reset the caches and register once outside the timed region, so the
+	// result reflects warm-cache registration no matter which benchmarks
+	// ran earlier.
+	resetCachesForTest()
+	primer := New()
+	if err := primer.AddBuilders(benchFuncA, benchFuncB, benchFuncC, benchFuncD); err != nil {
+		b.Fatal(err)
+	}
+
+	b.ReportAllocs()
+	b.ResetTimer()
+	for n := 0; n < b.N; n++ {
+		builder := New()
+		err := builder.AddBuilders(benchFuncA, benchFuncB, benchFuncC, benchFuncD)
+		if err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+// BenchmarkAddBuilders_ColdCache measures registration when the caches are
+// purged before every iteration, so each iteration starts cold. Not
+// realistic, but it pins the worst-case cost of registering through the
+// caches.
+func BenchmarkAddBuilders_ColdCache(b *testing.B) {
+	b.ReportAllocs()
+	b.ResetTimer()
+	for n := 0; n < b.N; n++ {
+		// Purge with the timer paused so only registration is measured.
+		b.StopTimer()
+		resetCachesForTest()
+		b.StartTimer()
+
+		builder := New()
+		err := builder.AddBuilders(benchFuncA, benchFuncB, benchFuncC, benchFuncD)
+		if err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+// --- compile ---
+
+// BenchmarkCompile measures compiling a plan for a benchStructIn input
+// against a builder set that is registered once, outside the timed region.
+func BenchmarkCompile(b *testing.B) {
+	builder := New()
+	err := builder.AddBuilders(benchFuncA, benchFuncB, benchFuncC, benchFuncD)
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	b.ReportAllocs()
+	b.ResetTimer()
+	for n := 0; n < b.N; n++ {
+		_, compileErr := builder.Compile(benchStructIn{})
+		if compileErr != nil {
+			b.Fatal(compileErr)
+		}
+	}
+}
+
+// --- end-to-end execution ---
+
+// newBenchPlan registers the four bench builders on a fresh instance and
+// compiles them for a benchStructIn input. Any error fails the benchmark.
+func newBenchPlan(b *testing.B) Plan {
+	b.Helper()
+
+	builder := New()
+	err := builder.AddBuilders(benchFuncA, benchFuncB, benchFuncC, benchFuncD)
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	compiled, err := builder.Compile(benchStructIn{})
+	if err != nil {
+		b.Fatal(err)
+	}
+	return compiled
+}
+
+func benchRunParallel(b *testing.B, workers uint) {
+	plan := newBenchPlan(b)
+	ctx := context.Background()
+	in := benchStructIn{Value: "hello-world"}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		if _, err := plan.RunParallel(ctx, workers, in); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+// BenchmarkRunParallel_Workers1 runs the end-to-end benchmark with 1 worker.
+func BenchmarkRunParallel_Workers1(b *testing.B) {
+	benchRunParallel(b, 1)
+}
+
+// BenchmarkRunParallel_Workers4 runs the end-to-end benchmark with 4 workers.
+func BenchmarkRunParallel_Workers4(b *testing.B) {
+	benchRunParallel(b, 4)
+}
+
+// BenchmarkRunParallel_Workers8 runs the end-to-end benchmark with 8 workers.
+func BenchmarkRunParallel_Workers8(b *testing.B) {
+	benchRunParallel(b, 8)
+}
+
+// --- Result.Get ---
+
+// BenchmarkResultGet measures Get lookups for a benchStructC key on the
+// result of a single RunParallel execution performed before timing starts.
+func BenchmarkResultGet(b *testing.B) {
+	plan := newBenchPlan(b)
+	ctx := context.Background()
+	res, err := plan.RunParallel(ctx, 4, benchStructIn{Value: "x"})
+	if err != nil {
+		b.Fatal(err)
+	}
+	lookup := benchStructC{}
+
+	b.ReportAllocs()
+	b.ResetTimer()
+	for n := 0; n < b.N; n++ {
+		_ = res.Get(lookup)
+	}
+}
+
+// BenchmarkResultGet_Parallel measures concurrent Get lookups for a
+// benchStructC key, driven by b.RunParallel, on the result of a single
+// RunParallel execution performed before timing starts.
+func BenchmarkResultGet_Parallel(b *testing.B) {
+	plan := newBenchPlan(b)
+	ctx := context.Background()
+	res, err := plan.RunParallel(ctx, 4, benchStructIn{Value: "x"})
+	if err != nil {
+		b.Fatal(err)
+	}
+	lookup := benchStructC{}
+
+	// Each benchmark goroutine repeats the same lookup until pb is drained.
+	worker := func(pb *testing.PB) {
+		for pb.Next() {
+			_ = res.Get(lookup)
+		}
+	}
+
+	b.ReportAllocs()
+	b.ResetTimer()
+	b.RunParallel(worker)
+}
diff --git a/cache.go b/cache.go
@@ -0,0 +1,33 @@
+package databuilder
+
+import (
+	"reflect"
+	"runtime"
+	"sync"
+)
+
+// Process-wide name caches. Their keys (reflect.Type identity and function
+// PC) stay stable for the lifetime of the process, so entries are never
+// evicted; each cache is bounded by the number of distinct types or builder
+// functions ever observed.
+
+// structNameCache maps reflect.Type -> qualified struct name (string).
+var structNameCache sync.Map
+
+// funcNameCache maps function PC (uintptr) -> function name (string).
+var funcNameCache sync.Map
+
+// cachedStructName returns t.PkgPath() + "." + t.Name(), serving the string
+// from structNameCache when t has been seen before. On a miss it builds the
+// name and publishes it with LoadOrStore, returning whichever string ended
+// up stored for t.
+func cachedStructName(t reflect.Type) string {
+	cached, hit := structNameCache.Load(t)
+	if !hit {
+		qualified := t.PkgPath() + "." + t.Name()
+		cached, _ = structNameCache.LoadOrStore(t, qualified)
+	}
+	return cached.(string)
+}
+
+// resolveFuncName returns runtime.FuncForPC(pc).Name(), serving the string
+// from funcNameCache when pc has been seen before. On a miss it resolves the
+// name and publishes it with LoadOrStore, returning whichever string ended
+// up stored for pc.
+func resolveFuncName(pc uintptr) string {
+	cached, hit := funcNameCache.Load(pc)
+	if !hit {
+		resolved := runtime.FuncForPC(pc).Name()
+		cached, _ = funcNameCache.LoadOrStore(pc, resolved)
+	}
+	return cached.(string)
+}