From 495cc266e0ef81dba89b66fae47aba471aa6eb0b Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 14 Apr 2026 15:44:46 +0100 Subject: [PATCH 1/4] fix(metrics): negative increment to a counter should not cause panics --- CHANGELOG.md | 3 +++ server/metrics.go | 4 ++++ server/metrics_test.go | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+) create mode 100644 server/metrics_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 21835a5c3..956765c6a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ The format is based on [keep a changelog](http://keepachangelog.com) and this pr ## [Unreleased] +### Fixed +- Ignore negative custom runtime counter deltas to avoid panics during metrics collection. + ## [3.38.0] - 2026-03-20 ### Added - Add runtime Satori client feature to delete identities. diff --git a/server/metrics.go b/server/metrics.go index 9fbfee516..c142e88fd 100644 --- a/server/metrics.go +++ b/server/metrics.go @@ -523,6 +523,10 @@ func (m *LocalMetrics) StorageWriteRejectCount(tags map[string]string, delta int // CustomCounter adds the given delta to a counter with the specified name and tags. func (m *LocalMetrics) CustomCounter(name string, tags map[string]string, delta int64) { + if delta < 0 { + return + } + scope := m.prometheusCustomScope if len(tags) != 0 { scope = scope.Tagged(tags) diff --git a/server/metrics_test.go b/server/metrics_test.go new file mode 100644 index 000000000..bfda19a70 --- /dev/null +++ b/server/metrics_test.go @@ -0,0 +1,39 @@ +// Copyright 2026 The Nakama Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package server + +import ( + "testing" + "time" + + "go.uber.org/zap" +) + +func TestMetricsCounterAddNegativeDoesNotPanics(t *testing.T) { + logger := zap.NewNop() + cfg := NewConfig(logger) + cfg.Metrics.ReportingFreqSec = 1 + + metrics := NewLocalMetrics(logger, logger, nil, cfg) + defer metrics.Stop(logger) + + module := &RuntimeGoNakamaModule{metrics: metrics} + module.MetricsCounterAdd("panic_counter", nil, 1) + + time.Sleep(1500 * time.Millisecond) + module.MetricsCounterAdd("panic_counter", nil, -1) + + time.Sleep(1500 * time.Millisecond) +} From d0f6720edf5292d0b35e61df103c2f1c47c3275b Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 14 Apr 2026 17:05:41 +0100 Subject: [PATCH 2/4] feat(metrics): derive wait time from metrics flush interval during tests --- server/metrics_test.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/server/metrics_test.go b/server/metrics_test.go index bfda19a70..e89a19038 100644 --- a/server/metrics_test.go +++ b/server/metrics_test.go @@ -25,6 +25,8 @@ func TestMetricsCounterAddNegativeDoesNotPanics(t *testing.T) { logger := zap.NewNop() cfg := NewConfig(logger) cfg.Metrics.ReportingFreqSec = 1 + reportingInterval := time.Duration(cfg.Metrics.ReportingFreqSec) * time.Second + flushWait := reportingInterval + 200*time.Millisecond metrics := NewLocalMetrics(logger, logger, nil, cfg) defer metrics.Stop(logger) @@ -32,8 +34,8 @@ func TestMetricsCounterAddNegativeDoesNotPanics(t *testing.T) { module := &RuntimeGoNakamaModule{metrics: metrics} module.MetricsCounterAdd("panic_counter", nil, 1) - time.Sleep(1500 * time.Millisecond) + time.Sleep(flushWait) module.MetricsCounterAdd("panic_counter", nil, -1) - time.Sleep(1500 * time.Millisecond) + time.Sleep(flushWait) } From 5e5511d70145ea6d5507eb178f143c23d530a3fa Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 14 Apr 2026 17:38:20 +0100 Subject: [PATCH 3/4] feat(metrics): track custom counter increments errors in the dedicated metric metric name is `metrics_collection_errors_count` --- server/metrics.go | 1 + server/metrics_test.go | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/server/metrics.go b/server/metrics.go index c142e88fd..b66472215 100644 --- a/server/metrics.go +++ b/server/metrics.go @@ -524,6 +524,7 @@ func (m *LocalMetrics) StorageWriteRejectCount(tags map[string]string, delta int // CustomCounter adds the given delta to a counter with the specified name and tags. func (m *LocalMetrics) CustomCounter(name string, tags map[string]string, delta int64) { if delta < 0 { + m.PrometheusScope.Tagged(map[string]string{"name": name, "err": "negative_increment"}).Counter("metrics_collection_errors_count").Inc(1) return } diff --git a/server/metrics_test.go b/server/metrics_test.go index e89a19038..215c77840 100644 --- a/server/metrics_test.go +++ b/server/metrics_test.go @@ -21,7 +21,7 @@ import ( "go.uber.org/zap" ) -func TestMetricsCounterAddNegativeDoesNotPanics(t *testing.T) { +func TestMetricsCounterAddNegativeDoesNotPanic(t *testing.T) { logger := zap.NewNop() cfg := NewConfig(logger) cfg.Metrics.ReportingFreqSec = 1 From 28f2d06d94b5357fa59e2b4a135d5dae9914c178 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 14 Apr 2026 18:13:38 +0100 Subject: [PATCH 4/4] feat(metrics): apply same cardinality restrictions as for custom metrics when tracking custom metrics collection errors --- server/metrics.go | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/server/metrics.go b/server/metrics.go index b66472215..d2c0bbe1f 100644 --- a/server/metrics.go +++ b/server/metrics.go @@ -92,10 +92,11 @@ type LocalMetrics struct { currentRecvBytes *atomic.Int64 currentSentBytes *atomic.Int64 - PrometheusScope tally.Scope - prometheusCustomScope tally.Scope - prometheusCloser io.Closer - prometheusHTTPServer *http.Server + PrometheusScope tally.Scope + prometheusCustomScope tally.Scope + metricsCollectionScope tally.Scope + prometheusCloser io.Closer + prometheusHTTPServer *http.Server } func NewLocalMetrics(logger, startupLogger *zap.Logger, db *sql.DB, config Config) *LocalMetrics { @@ -162,8 +163,10 @@ func NewLocalMetrics(logger, startupLogger *zap.Logger, db *sql.DB, config Confi SanitizeOptions: &prometheus.DefaultSanitizerOpts, }, time.Duration(config.GetMetrics().ReportingFreqSec)*time.Second) m.prometheusCustomScope = m.PrometheusScope.SubScope(config.GetMetrics().CustomPrefix) + m.metricsCollectionScope = m.PrometheusScope if config.GetMetrics().CustomScopeLimit > 0 { m.prometheusCustomScope = newMetricsLimitedScope(m.prometheusCustomScope, int64(config.GetMetrics().CustomScopeLimit)) + m.metricsCollectionScope = newMetricsLimitedScope(m.metricsCollectionScope, int64(config.GetMetrics().CustomScopeLimit)) } // Check if exposing Prometheus metrics directly is enabled. @@ -524,7 +527,7 @@ func (m *LocalMetrics) StorageWriteRejectCount(tags map[string]string, delta int // CustomCounter adds the given delta to a counter with the specified name and tags. func (m *LocalMetrics) CustomCounter(name string, tags map[string]string, delta int64) { if delta < 0 { - m.PrometheusScope.Tagged(map[string]string{"name": name, "err": "negative_increment"}).Counter("metrics_collection_errors_count").Inc(1) + m.metricsCollectionScope.Tagged(map[string]string{"name": name, "err": "negative_increment"}).Counter("metrics_collection_errors_count").Inc(1) return }