Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions logfire/_internal/integrations/system_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
'system.cpu.simple_utilization',
'system.cpu.time',
'system.cpu.utilization',
'system.cpu.load_average.1m',
'system.memory.usage',
'system.memory.utilization',
'system.swap.usage',
Expand All @@ -43,6 +44,7 @@
'system.network.errors',
'system.network.io',
'system.network.connections',
'system.processes.count',
'system.thread_count',
'process.open_file_descriptor.count',
'process.context_switches',
Expand All @@ -62,6 +64,7 @@
'system.cpu.simple_utilization',
'system.cpu.time',
'system.cpu.utilization',
'system.cpu.load_average.1m',
'system.memory.usage',
'system.memory.utilization',
'system.swap.usage',
Expand All @@ -74,6 +77,7 @@
'system.network.errors',
'system.network.io',
'system.network.connections',
'system.processes.count',
'system.thread_count',
'process.open_file_descriptor.count',
'process.context_switches',
Expand Down Expand Up @@ -109,6 +113,11 @@
'process.cpu.core_utilization': None,
'system.cpu.time': CPU_FIELDS,
'system.cpu.utilization': CPU_FIELDS,
# The Hosts page reads these — upstream `_DEFAULT_CONFIG` doesn't include
# them and they aren't expensive (one number per scrape each), so add them
# so `base='full'` covers the page without a follow-up `config={…}` knob.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If these are so cheap, and they're used in a default UI view, then they should be in the basic config.

'system.cpu.load_average.1m': None,
'system.processes.count': None,
# For usage, knowing the total amount of bytes available might be handy.
'system.memory.usage': MEMORY_FIELDS + ['total'],
# For utilization, the total is always just 1 (100%), so it's not included.
Expand Down Expand Up @@ -173,6 +182,14 @@ def instrument_system_metrics(logfire_instance: Logfire, config: Config | None =
measure_process_cpu_utilization(logfire_instance)
del config['process.cpu.utilization']

if 'system.cpu.load_average.1m' in config:
measure_system_cpu_load_average_1m(logfire_instance)
del config['system.cpu.load_average.1m']

if 'system.processes.count' in config:
measure_system_processes_count(logfire_instance)
del config['system.processes.count']

instrumentor = SystemMetricsInstrumentor(config=config) # pyright: ignore[reportArgumentType]
instrumentor.instrument(meter_provider=logfire_instance.config.get_meter_provider())

Expand Down Expand Up @@ -248,3 +265,41 @@ def callback(_options: CallbackOptions) -> Iterable[Observation]:
description='Runtime CPU utilization, not divided by the number of available cores.',
unit='core',
)


def measure_system_cpu_load_average_1m(logfire_instance: Logfire):
"""1-minute system load average (run-queue length, smoothed).

Upstream `SystemMetricsInstrumentor` doesn't include this metric; we add it
so `base='full'` covers the Hosts page's `Load 1m` column. On Windows,
`psutil.getloadavg()` is an emulated polling shim — first call returns 0
and subsequent values update every ~5s.
"""

def callback(_options: CallbackOptions) -> Iterable[Observation]:
yield Observation(psutil.getloadavg()[0])

logfire_instance.metric_gauge_callback(
'system.cpu.load_average.1m',

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://opentelemetry.io/docs/specs/semconv/system/system-metrics/ mentions a hypothetical system.linux.cpu.load_1m. Not a real semconv, but worth mimicking. If we don't want to include the OS prefix, then system.cpu.load_1m.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@strawgate Can we just change the name to this? Is there a reason you used the existing name?

[callback],
description='Average system load over the last minute.',
unit='{run_queue_item}',
)


def measure_system_processes_count(logfire_instance: Logfire):
"""Total number of processes on the system.

Upstream `SystemMetricsInstrumentor` doesn't include this metric; we add
it so `base='full'` covers the Hosts page's `Procs` column.
"""

def callback(_options: CallbackOptions) -> Iterable[Observation]:
yield Observation(len(psutil.pids()))

logfire_instance.metric_gauge_callback(
'system.processes.count',

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should probably be system.process.count as in https://opentelemetry.io/docs/specs/semconv/system/system-metrics/#metric-systemprocesscount, which is an updowncounter. If including the process.state attribute isn't easy it's probably fine to leave out.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So the point is — rename the attribute to system.process.count, and make it an updowncounter not a gauge. If convenient, we can add the process.state attribute, but note that could increase the number of series significantly, which we probably wouldn't want to do in default config if it costs a decent amount. @strawgate do you have thoughts about whether we should add that attribute?

Alternatively we could change the name more significantly, I don't think it should be close to the semantic convention one though if we do that (like it is now). But I'm okay using this name and not having the attribute if we think this is good.

[callback],
description='Total number of processes on the system.',
unit='{process}',
)
16 changes: 16 additions & 0 deletions tests/otel_integrations/test_system_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def test_all_system_metrics_collection(metrics_reader: InMemoryMetricReader) ->
'process.open_file_descriptor.count',
'process.runtime.cpython.gc_count',
'process.thread.count',
'system.cpu.load_average.1m',
'system.cpu.simple_utilization',
'system.cpu.time',
'system.cpu.utilization',
Expand All @@ -79,6 +80,7 @@ def test_all_system_metrics_collection(metrics_reader: InMemoryMetricReader) ->
'system.network.errors',
'system.network.io',
'system.network.packets',
'system.processes.count',
'system.swap.usage',
'system.swap.utilization',
'system.thread_count',
Expand All @@ -92,6 +94,18 @@ def test_measure_process_runtime_cpu_utilization(metrics_reader: InMemoryMetricR
assert get_collected_metric_names(metrics_reader) == ['process.runtime.cpython.cpu.utilization']


def test_system_cpu_load_average_1m(metrics_reader: InMemoryMetricReader) -> None:
"""Load average isn't in upstream `SystemMetricsInstrumentor` — Logfire emits it."""
logfire.instrument_system_metrics({'system.cpu.load_average.1m': None}, base=None)
assert get_collected_metric_names(metrics_reader) == ['system.cpu.load_average.1m']


def test_system_processes_count(metrics_reader: InMemoryMetricReader) -> None:
"""Process count isn't in upstream `SystemMetricsInstrumentor` — Logfire emits it."""
logfire.instrument_system_metrics({'system.processes.count': None}, base=None)
assert get_collected_metric_names(metrics_reader) == ['system.processes.count']


def test_custom_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None:
logfire.instrument_system_metrics(
{
Expand Down Expand Up @@ -186,6 +200,8 @@ def test_full_base():
'cpython.gc.collected_objects': None,
'cpython.gc.collections': None,
'cpython.gc.uncollectable_objects': None,
'system.cpu.load_average.1m': None,
'system.processes.count': None,
}, 'Docs and the MetricName type need to be updated if this test fails'

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

docs weren't updated

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(the test explains that if it fails the docs need to be updated, we're guessing the AI fixed the test pre-emptively but didn't realize it also needed to update the docs.)



Expand Down
Loading