-
Notifications
You must be signed in to change notification settings - Fork 253
Add system.cpu.load_average.1m and system.processes.count
#2014
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,6 +31,7 @@ | |
| 'system.cpu.simple_utilization', | ||
| 'system.cpu.time', | ||
| 'system.cpu.utilization', | ||
| 'system.cpu.load_average.1m', | ||
| 'system.memory.usage', | ||
| 'system.memory.utilization', | ||
| 'system.swap.usage', | ||
|
|
@@ -43,6 +44,7 @@ | |
| 'system.network.errors', | ||
| 'system.network.io', | ||
| 'system.network.connections', | ||
| 'system.processes.count', | ||
| 'system.thread_count', | ||
| 'process.open_file_descriptor.count', | ||
| 'process.context_switches', | ||
|
|
@@ -62,6 +64,7 @@ | |
| 'system.cpu.simple_utilization', | ||
| 'system.cpu.time', | ||
| 'system.cpu.utilization', | ||
| 'system.cpu.load_average.1m', | ||
| 'system.memory.usage', | ||
| 'system.memory.utilization', | ||
| 'system.swap.usage', | ||
|
|
@@ -74,6 +77,7 @@ | |
| 'system.network.errors', | ||
| 'system.network.io', | ||
| 'system.network.connections', | ||
| 'system.processes.count', | ||
| 'system.thread_count', | ||
| 'process.open_file_descriptor.count', | ||
| 'process.context_switches', | ||
|
|
@@ -109,6 +113,11 @@ | |
| 'process.cpu.core_utilization': None, | ||
| 'system.cpu.time': CPU_FIELDS, | ||
| 'system.cpu.utilization': CPU_FIELDS, | ||
| # The Hosts page reads these — upstream `_DEFAULT_CONFIG` doesn't include | ||
| # them and they aren't expensive (one number per scrape each), so add them | ||
| # so `base='full'` covers the page without a follow-up `config={…}` knob. | ||
| 'system.cpu.load_average.1m': None, | ||
| 'system.processes.count': None, | ||
| # For usage, knowing the total amount of bytes available might be handy. | ||
| 'system.memory.usage': MEMORY_FIELDS + ['total'], | ||
| # For utilization, the total is always just 1 (100%), so it's not included. | ||
|
|
@@ -173,6 +182,14 @@ def instrument_system_metrics(logfire_instance: Logfire, config: Config | None = | |
| measure_process_cpu_utilization(logfire_instance) | ||
| del config['process.cpu.utilization'] | ||
|
|
||
| if 'system.cpu.load_average.1m' in config: | ||
| measure_system_cpu_load_average_1m(logfire_instance) | ||
| del config['system.cpu.load_average.1m'] | ||
|
|
||
| if 'system.processes.count' in config: | ||
| measure_system_processes_count(logfire_instance) | ||
| del config['system.processes.count'] | ||
|
|
||
| instrumentor = SystemMetricsInstrumentor(config=config) # pyright: ignore[reportArgumentType] | ||
| instrumentor.instrument(meter_provider=logfire_instance.config.get_meter_provider()) | ||
|
|
||
|
|
@@ -248,3 +265,41 @@ def callback(_options: CallbackOptions) -> Iterable[Observation]: | |
| description='Runtime CPU utilization, not divided by the number of available cores.', | ||
| unit='core', | ||
| ) | ||
|
|
||
|
|
||
| def measure_system_cpu_load_average_1m(logfire_instance: Logfire): | ||
| """1-minute system load average (run-queue length, smoothed). | ||
|
|
||
| Upstream `SystemMetricsInstrumentor` doesn't include this metric; we add it | ||
| so `base='full'` covers the Hosts page's `Load 1m` column. On Windows, | ||
| `psutil.getloadavg()` is an emulated polling shim — first call returns 0 | ||
| and subsequent values update every ~5s. | ||
| """ | ||
|
|
||
| def callback(_options: CallbackOptions) -> Iterable[Observation]: | ||
| yield Observation(psutil.getloadavg()[0]) | ||
|
|
||
| logfire_instance.metric_gauge_callback( | ||
| 'system.cpu.load_average.1m', | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. https://opentelemetry.io/docs/specs/semconv/system/system-metrics/ mentions a hypothetical
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @strawgate Can we just change the name to this? Is there a reason you used the existing name? |
||
| [callback], | ||
| description='Average system load over the last minute.', | ||
| unit='{run_queue_item}', | ||
| ) | ||
|
|
||
|
|
||
| def measure_system_processes_count(logfire_instance: Logfire): | ||
| """Total number of processes on the system. | ||
|
|
||
| Upstream `SystemMetricsInstrumentor` doesn't include this metric; we add | ||
| it so `base='full'` covers the Hosts page's `Procs` column. | ||
| """ | ||
|
|
||
| def callback(_options: CallbackOptions) -> Iterable[Observation]: | ||
| yield Observation(len(psutil.pids())) | ||
|
|
||
| logfire_instance.metric_gauge_callback( | ||
| 'system.processes.count', | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this should probably be
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So the point is — rename the attribute to system.process.count, and make it an updowncounter not a gauge. If convenient, we can add the process.state attribute, but note that could increase the number of series significantly, which we probably wouldn't want to do in default config if it costs a decent amount. @strawgate do you have thoughts about whether we should add that attribute? Alternatively we could change the name more significantly, I don't think it should be close to the semantic convention one though if we do that (like it is now). But I'm okay using this name and not having the attribute if we think this is good. |
||
| [callback], | ||
| description='Total number of processes on the system.', | ||
| unit='{process}', | ||
| ) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -67,6 +67,7 @@ def test_all_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> | |
| 'process.open_file_descriptor.count', | ||
| 'process.runtime.cpython.gc_count', | ||
| 'process.thread.count', | ||
| 'system.cpu.load_average.1m', | ||
| 'system.cpu.simple_utilization', | ||
| 'system.cpu.time', | ||
| 'system.cpu.utilization', | ||
|
|
@@ -79,6 +80,7 @@ def test_all_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> | |
| 'system.network.errors', | ||
| 'system.network.io', | ||
| 'system.network.packets', | ||
| 'system.processes.count', | ||
| 'system.swap.usage', | ||
| 'system.swap.utilization', | ||
| 'system.thread_count', | ||
|
|
@@ -92,6 +94,18 @@ def test_measure_process_runtime_cpu_utilization(metrics_reader: InMemoryMetricR | |
| assert get_collected_metric_names(metrics_reader) == ['process.runtime.cpython.cpu.utilization'] | ||
|
|
||
|
|
||
| def test_system_cpu_load_average_1m(metrics_reader: InMemoryMetricReader) -> None: | ||
| """Load average isn't in upstream `SystemMetricsInstrumentor` — Logfire emits it.""" | ||
| logfire.instrument_system_metrics({'system.cpu.load_average.1m': None}, base=None) | ||
| assert get_collected_metric_names(metrics_reader) == ['system.cpu.load_average.1m'] | ||
|
|
||
|
|
||
| def test_system_processes_count(metrics_reader: InMemoryMetricReader) -> None: | ||
| """Process count isn't in upstream `SystemMetricsInstrumentor` — Logfire emits it.""" | ||
| logfire.instrument_system_metrics({'system.processes.count': None}, base=None) | ||
| assert get_collected_metric_names(metrics_reader) == ['system.processes.count'] | ||
|
|
||
|
|
||
| def test_custom_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None: | ||
| logfire.instrument_system_metrics( | ||
| { | ||
|
|
@@ -186,6 +200,8 @@ def test_full_base(): | |
| 'cpython.gc.collected_objects': None, | ||
| 'cpython.gc.collections': None, | ||
| 'cpython.gc.uncollectable_objects': None, | ||
| 'system.cpu.load_average.1m': None, | ||
| 'system.processes.count': None, | ||
| }, 'Docs and the MetricName type need to be updated if this test fails' | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. docs weren't updated
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (the test explains that if it fails the docs need to be updated, we're guessing the AI fixed the test pre-emptively but didn't realize it also needed to update the docs.) |
||
|
|
||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If these are so cheap, and they're used in a default UI view, then they should be in the basic config.