diff --git a/diagnostic_common_diagnostics/README.md b/diagnostic_common_diagnostics/README.md index 959adc85f..f1ae46ead 100644 --- a/diagnostic_common_diagnostics/README.md +++ b/diagnostic_common_diagnostics/README.md @@ -13,7 +13,9 @@ It publishes the usage percentage in a diagnostic message. * Name of the node is "cpu_monitor_" + hostname. * Uses the following args: + * use_average: If true, the average CPU usage over all cores will be used to determine the status. If false, the maximum CPU usage among all cores will be used. - * warning_percentage: If the CPU usage is > warning_percentage, a WARN status will be publised. + * warning_percentage: If the CPU usage is > warning_percentage, a WARN status will be published. + * error_percentage: If the CPU usage is > error_percentage, an ERROR status will be published. * window: the maximum length of the used collections.deque for queuing CPU readings. ### Published Topics @@ -97,6 +99,7 @@ It publishes the usage percentage in a diagnostic message. * Name of the node is "ram_monitor_" + hostname. * Uses the following args: * warning_percentage: If the RAM usage is > warning_percentage, a WARN status will be published. + * error_percentage: If the RAM usage is > error_percentage, an ERROR status will be published. * window: the maximum length of the used collections.deque for queuing RAM readings. 
### Published Topics diff --git a/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py index 32dd60eb3..03faf84c4 100755 --- a/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py +++ b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py @@ -51,11 +51,13 @@ class CpuTask(DiagnosticTask): - def __init__(self, warning_percentage=90, window=1): + def __init__(self, warning_percentage, error_percentage, window, use_average): DiagnosticTask.__init__(self, 'CPU Information') self._warning_percentage = int(warning_percentage) + self._error_percentage = int(error_percentage) self._readings = deque(maxlen=window) + self._use_average = use_average def _get_average_reading(self): def avg(lst): @@ -71,15 +73,20 @@ def run(self, stat): stat.add('CPU Load Average', f'{cpu_average:.2f}') - warn = False for idx, cpu_percentage in enumerate(cpu_percentages): stat.add(f'CPU {idx} Load', f'{cpu_percentage:.2f}') - if cpu_percentage > self._warning_percentage: - warn = True - if warn: + if self._use_average: + cpu_usage = cpu_average + else: + cpu_usage = max(cpu_percentages) + + if cpu_usage > self._error_percentage: + stat.summary(DiagnosticStatus.ERROR, + f'CPU usage exceeds {self._error_percentage} percent') + elif cpu_usage > self._warning_percentage: stat.summary(DiagnosticStatus.WARN, - f'At least one CPU exceeds {self._warning_percentage} percent') + f'CPU usage exceeds {self._warning_percentage} percent') else: stat.summary(DiagnosticStatus.OK, f'CPU Average {cpu_average:.2f} percent') @@ -100,16 +107,22 @@ def main(args=None): # Declare and get parameters node.declare_parameter('warning_percentage', 90) + node.declare_parameter('error_percentage', 95) + node.declare_parameter('use_average', False) node.declare_parameter('window', 1) warning_percentage = node.get_parameter( 'warning_percentage').get_parameter_value().integer_value + 
error_percentage = node.get_parameter( + 'error_percentage').get_parameter_value().integer_value + use_average = node.get_parameter('use_average').get_parameter_value().bool_value window = node.get_parameter('window').get_parameter_value().integer_value # Create diagnostic updater with default updater rate of 1 hz updater = Updater(node) updater.setHardwareID(hostname) - updater.add(CpuTask(warning_percentage=warning_percentage, window=window)) + updater.add(CpuTask(warning_percentage=warning_percentage, error_percentage=error_percentage, + window=window, use_average=use_average)) rclpy.spin(node) diff --git a/diagnostic_common_diagnostics/diagnostic_common_diagnostics/param_decl.yaml b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/param_decl.yaml new file mode 100644 index 000000000..e69de29bb diff --git a/diagnostic_common_diagnostics/diagnostic_common_diagnostics/ram_monitor.py b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/ram_monitor.py index da59a6d25..e50d78b89 100755 --- a/diagnostic_common_diagnostics/diagnostic_common_diagnostics/ram_monitor.py +++ b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/ram_monitor.py @@ -48,9 +48,10 @@ class RamTask(DiagnosticTask): - def __init__(self, warning_percentage, window): + def __init__(self, warning_percentage, error_percentage, window): DiagnosticTask.__init__(self, 'RAM Information') self._warning_percentage = int(warning_percentage) + self._error_percentage = int(error_percentage) self._readings = collections.deque(maxlen=window) def run(self, stat): @@ -59,7 +60,12 @@ def run(self, stat): stat.add('RAM Load Average', f'{ram_average:.2f}') - if ram_average > self._warning_percentage: + if ram_average > self._error_percentage: + stat.summary( + DiagnosticStatus.ERROR, + f'RAM Average exceeds {self._error_percentage:d} percent', + ) + elif ram_average > self._warning_percentage: stat.summary( DiagnosticStatus.WARN, f'RAM Average exceeds {self._warning_percentage:d} percent', 
@@ -84,6 +90,7 @@ def main(): updater.add( RamTask( node.declare_parameter('warning_percentage', 90).value, + node.declare_parameter('error_percentage', 95).value, node.declare_parameter('window', 1).value, ) ) diff --git a/diagnostic_common_diagnostics/test/systemtest/test_cpu_monitor.py b/diagnostic_common_diagnostics/test/systemtest/test_cpu_monitor.py index 28430c482..33e66f1d6 100644 --- a/diagnostic_common_diagnostics/test/systemtest/test_cpu_monitor.py +++ b/diagnostic_common_diagnostics/test/systemtest/test_cpu_monitor.py @@ -67,7 +67,7 @@ def diagnostics_callback(self, msg): def test_ok(self): warning_percentage = 100 - task = CpuTask(warning_percentage) + task = CpuTask(warning_percentage=warning_percentage, error_percentage=100, window=1, use_average=False) stat = DiagnosticStatusWrapper() task.run(stat) self.assertEqual(task.name, 'CPU Information') @@ -79,13 +79,13 @@ def test_ok(self): def test_warn(self): warning_percentage = -1 - task = CpuTask(warning_percentage) + task = CpuTask(warning_percentage=warning_percentage, error_percentage=100, window=1, use_average=False) stat = DiagnosticStatusWrapper() task.run(stat) print(f'Raw readings: {task._readings}') self.assertEqual(task.name, 'CPU Information') self.assertEqual(stat.level, DiagnosticStatus.WARN) - self.assertIn(str('At least one CPU exceeds'), stat.message) + self.assertIn(str('CPU usage exceeds'), stat.message) # Check for at least 1 CPU Load Average and 1 CPU Load self.assertGreaterEqual(len(stat.values), 2) @@ -96,7 +96,7 @@ def test_updater(self): node = Node('cpu_monitor_test') updater = Updater(node) updater.setHardwareID('test_id') - updater.add(CpuTask()) + updater.add(CpuTask(warning_percentage=95, error_percentage=100, window=1, use_average=False)) node.create_subscription( DiagnosticArray, '/diagnostics', self.diagnostics_callback, 10)