From 4fa7069c705ef30264a94b0d192e95dbe1d4a341 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 1 Apr 2026 14:32:17 -0500 Subject: [PATCH 01/25] min API for external invocation --- nodescraper/cli/__init__.py | 18 +++++- nodescraper/cli/cli.py | 30 +++++---- nodescraper/cli/embed.py | 53 +++++++++++++++ nodescraper/cli/invocation.py | 118 ++++++++++++++++++++++++++++++++++ 4 files changed, 206 insertions(+), 13 deletions(-) create mode 100644 nodescraper/cli/embed.py create mode 100644 nodescraper/cli/invocation.py diff --git a/nodescraper/cli/__init__.py b/nodescraper/cli/__init__.py index 12ed1099..f5e396b2 100644 --- a/nodescraper/cli/__init__.py +++ b/nodescraper/cli/__init__.py @@ -2,7 +2,7 @@ # # MIT License # -# Copyright (c) 2025 Advanced Micro Devices, Inc. +# Copyright (C) 2026 Advanced Micro Devices, Inc. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -25,5 +25,19 @@ ############################################################################### from .cli import main as cli_entry +from .embed import run_main_return_code +from .invocation import ( + PluginRunInvocation, + get_plugin_run_invocation, + plugin_run_invocation_scope, + run_plugin_queue_with_invocation, +) -__all__ = ["cli_entry"] +__all__ = [ + "cli_entry", + "run_main_return_code", + "PluginRunInvocation", + "get_plugin_run_invocation", + "plugin_run_invocation_scope", + "run_plugin_queue_with_invocation", +] diff --git a/nodescraper/cli/cli.py b/nodescraper/cli/cli.py index f4e2fe86..d9044cbe 100644 --- a/nodescraper/cli/cli.py +++ b/nodescraper/cli/cli.py @@ -49,6 +49,7 @@ process_args, ) from nodescraper.cli.inputargtypes import ModelArgHandler, json_arg, log_path_arg +from nodescraper.cli.invocation import run_plugin_queue_with_invocation from nodescraper.configregistry import ConfigRegistry from nodescraper.connection.redfish import ( RedfishConnection, @@ -359,11 
+360,17 @@ def setup_logger(log_level: str = "INFO", log_path: Optional[str] = None) -> log return logger -def main(arg_input: Optional[list[str]] = None): +def main( + arg_input: Optional[list[str]] = None, + *, + host_cli_args: Optional[argparse.Namespace] = None, +): """Main entry point for the CLI Args: arg_input (Optional[list[str]], optional): list of args to parse. Defaults to None. + host_cli_args: Optional namespace from an embedding host (e.g. detect-errors) for code that + calls get_plugin_run_invocation during the plugin queue. """ if arg_input is None: arg_input = sys.argv[1:] @@ -524,17 +531,18 @@ def main(arg_input: Optional[list[str]] = None): except Exception as e: parser.error(str(e)) - plugin_executor = PluginExecutor( - logger=logger, - plugin_configs=plugin_config_inst_list, - connections=parsed_args.connection_config, - system_info=system_info, - log_path=log_path, - plugin_registry=plugin_reg, - ) - try: - results = plugin_executor.run_queue() + results = run_plugin_queue_with_invocation( + plugin_reg=plugin_reg, + parsed_args=parsed_args, + plugin_config_inst_list=plugin_config_inst_list, + system_info=system_info, + log_path=log_path, + logger=logger, + timestamp=timestamp, + sname=sname, + host_cli_args=host_cli_args, + ) dump_results_to_csv(results, sname, log_path, timestamp, logger) diff --git a/nodescraper/cli/embed.py b/nodescraper/cli/embed.py new file mode 100644 index 00000000..aa5ad082 --- /dev/null +++ b/nodescraper/cli/embed.py @@ -0,0 +1,53 @@ +############################################################################### +# +# MIT License +# +# Copyright (C) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +"""In-process CLI entry without adding new argparse flags.""" + +from __future__ import annotations + +import argparse +from typing import Optional + +__all__ = ["run_main_return_code"] + + +def run_main_return_code( + arg_input: list[str], + *, + host_cli_args: Optional[argparse.Namespace] = None, +) -> int: + """Runs the nodescraper main entrypoint and maps SystemExit to an integer return code.""" + from nodescraper.cli.cli import main + + try: + main(arg_input, host_cli_args=host_cli_args) + except SystemExit as exc: + code = exc.code + if code is None: + return 0 + if isinstance(code, int): + return code + return 1 + return 0 diff --git a/nodescraper/cli/invocation.py b/nodescraper/cli/invocation.py new file mode 100644 index 00000000..024f1882 --- /dev/null +++ b/nodescraper/cli/invocation.py @@ -0,0 +1,118 @@ +############################################################################### +# +# MIT License +# +# Copyright (C) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +"""Plugin run invocation context for embedded hosts (e.g. error-scraper OOB).""" + +from __future__ import annotations + +import argparse +import logging +from contextlib import contextmanager +from contextvars import ContextVar +from dataclasses import dataclass +from typing import Iterator, Optional + +from nodescraper.models import PluginConfig, SystemInfo +from nodescraper.models.pluginresult import PluginResult +from nodescraper.pluginexecutor import PluginExecutor +from nodescraper.pluginregistry import PluginRegistry + +_plugin_run_invocation_ctx: ContextVar[Optional["PluginRunInvocation"]] = ContextVar( + "nodescraper_plugin_run_invocation", default=None +) + + +def get_plugin_run_invocation() -> Optional[PluginRunInvocation]: + """Return the active invocation while run_plugin_queue_with_invocation is running, if any.""" + return _plugin_run_invocation_ctx.get() + + +@contextmanager +def plugin_run_invocation_scope(inv: PluginRunInvocation) -> Iterator[None]: + """Bind *inv* for nested code (connection managers, plugins) for the scope of the context.""" + token = _plugin_run_invocation_ctx.set(inv) + try: + yield + finally: + _plugin_run_invocation_ctx.reset(token) + + +@dataclass +class PluginRunInvocation: + """Recorded inputs for one plugin run; optional host_cli_args for embedded hosts.""" + + plugin_reg: PluginRegistry + parsed_args: argparse.Namespace + plugin_config_inst_list: list[PluginConfig] + system_info: SystemInfo + log_path: Optional[str] + logger: logging.Logger + timestamp: str + sname: str + host_cli_args: Optional[argparse.Namespace] = None + + +def run_plugin_queue_with_invocation( + *, + 
plugin_reg: PluginRegistry, + parsed_args: argparse.Namespace, + plugin_config_inst_list: list[PluginConfig], + system_info: SystemInfo, + log_path: Optional[str], + logger: logging.Logger, + timestamp: str, + sname: str, + host_cli_args: Optional[argparse.Namespace] = None, +) -> list[PluginResult]: + """Constructs the plugin executor, binds invocation context, and runs the plugin queue.""" + inv = PluginRunInvocation( + plugin_reg=plugin_reg, + parsed_args=parsed_args, + plugin_config_inst_list=plugin_config_inst_list, + system_info=system_info, + log_path=log_path, + logger=logger, + timestamp=timestamp, + sname=sname, + host_cli_args=host_cli_args, + ) + plugin_executor = PluginExecutor( + logger=logger, + plugin_configs=plugin_config_inst_list, + connections=parsed_args.connection_config, + system_info=system_info, + log_path=log_path, + plugin_registry=plugin_reg, + ) + with plugin_run_invocation_scope(inv): + return plugin_executor.run_queue() + + +__all__ = [ + "PluginRunInvocation", + "get_plugin_run_invocation", + "plugin_run_invocation_scope", + "run_plugin_queue_with_invocation", +] From 0731d597753acabd262f4d0d800a00e8042740fe Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 1 Apr 2026 15:40:24 -0500 Subject: [PATCH 02/25] entry point for connection --- nodescraper/pluginregistry.py | 55 +++++++++++++++ .../test_connection_manager_entrypoints.py | 67 +++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 test/unit/framework/test_connection_manager_entrypoints.py diff --git a/nodescraper/pluginregistry.py b/nodescraper/pluginregistry.py index 997fd67b..559d96f6 100644 --- a/nodescraper/pluginregistry.py +++ b/nodescraper/pluginregistry.py @@ -47,6 +47,7 @@ def __init__( plugin_pkg: Optional[list[types.ModuleType]] = None, load_internal_plugins: bool = True, load_entry_point_plugins: bool = True, + load_entry_point_connection_managers: bool = True, ) -> None: """Initialize the PluginRegistry with optional plugin packages. 
@@ -54,6 +55,8 @@ def __init__( plugin_pkg (Optional[list[types.ModuleType]], optional): The module to search for plugins in. Defaults to None. load_internal_plugins (bool, optional): Whether internal plugin should be loaded. Defaults to True. load_entry_point_plugins (bool, optional): Whether to load plugins from entry points. Defaults to True. + load_entry_point_connection_managers (bool, optional): Whether to load connection managers from the + ``nodescraper.connection_managers`` entry-point group. Defaults to True. """ if load_internal_plugins: self.plugin_pkg = [internal_plugins, internal_connections, internal_collators] @@ -73,6 +76,13 @@ def __init__( PluginResultCollator, self.plugin_pkg ) + if load_entry_point_connection_managers: + for ( + name, + mgr_cls, + ) in PluginRegistry.load_connection_managers_from_entry_points().items(): + self.connection_managers[name] = mgr_cls + if load_entry_point_plugins: entry_point_plugins = self.load_plugins_from_entry_points() self.plugins.update(entry_point_plugins) @@ -112,6 +122,51 @@ def _recurse_pkg(pkg: types.ModuleType, base_class: type) -> None: _recurse_pkg(pkg, base_class) return registry + @staticmethod + def load_connection_managers_from_entry_points() -> dict[str, type]: + """Load ConnectionManager subclasses from ``nodescraper.connection_managers`` entry points. + + The class ``__name__`` is always a lookup key. If the distribution entry-point name + differs, it is registered as an alias (for ``--connection-config`` JSON keys). + + Returns: + dict[str, type]: Map of lookup key to connection manager class. 
+ """ + managers: dict[str, type] = {} + + try: + try: + eps = importlib.metadata.entry_points( # type: ignore[call-arg] + group="nodescraper.connection_managers" + ) + except TypeError: + all_eps = importlib.metadata.entry_points() # type: ignore[assignment] + eps = all_eps.get("nodescraper.connection_managers", []) # type: ignore[assignment, attr-defined, arg-type] + + for entry_point in eps: + try: + loaded = entry_point.load() # type: ignore[attr-defined] + if not ( + inspect.isclass(loaded) + and issubclass(loaded, ConnectionManager) + and not inspect.isabstract(loaded) + ): + continue + if hasattr(loaded, "is_valid") and not loaded.is_valid(): + continue + cls = loaded + managers[cls.__name__] = cls + ep_name = getattr(entry_point, "name", None) + if ep_name and ep_name != cls.__name__: + managers[ep_name] = cls + except Exception: + pass + + except Exception: + pass + + return managers + @staticmethod def load_plugins_from_entry_points() -> dict[str, type]: """Load plugins registered via entry points. diff --git a/test/unit/framework/test_connection_manager_entrypoints.py b/test/unit/framework/test_connection_manager_entrypoints.py new file mode 100644 index 00000000..16721196 --- /dev/null +++ b/test/unit/framework/test_connection_manager_entrypoints.py @@ -0,0 +1,67 @@ +############################################################################### +# +# MIT License +# +# Copyright (C) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from unittest.mock import MagicMock, patch + +from nodescraper.connection.inband.inbandmanager import InBandConnectionManager +from nodescraper.pluginregistry import PluginRegistry + + +def _entry_points_side_effect_cm_only(mock_ep, *args, **kwargs): + group = kwargs.get("group") + if group == "nodescraper.connection_managers": + return [mock_ep] + return [] + + +def test_load_connection_managers_from_entry_points_registers_class_and_alias(): + mock_ep = MagicMock() + mock_ep.name = "AliasInBand" + mock_ep.load.return_value = InBandConnectionManager + + with patch("nodescraper.pluginregistry.importlib.metadata.entry_points") as mock_eps: + mock_eps.side_effect = lambda *a, **k: _entry_points_side_effect_cm_only(mock_ep, *a, **k) + found = PluginRegistry.load_connection_managers_from_entry_points() + + assert found["InBandConnectionManager"] is InBandConnectionManager + assert found["AliasInBand"] is InBandConnectionManager + + +def test_plugin_registry_merges_entry_point_connection_managers(): + mock_ep = MagicMock() + mock_ep.name = "AliasInBand" + mock_ep.load.return_value = InBandConnectionManager + + with patch("nodescraper.pluginregistry.importlib.metadata.entry_points") as mock_eps: + mock_eps.side_effect = lambda *a, **k: _entry_points_side_effect_cm_only(mock_ep, *a, **k) + reg = PluginRegistry(load_entry_point_connection_managers=True) + + assert reg.connection_managers["InBandConnectionManager"] is InBandConnectionManager + assert reg.connection_managers["AliasInBand"] is InBandConnectionManager + + +def test_plugin_registry_can_disable_entry_point_connection_managers(): + reg = PluginRegistry(load_entry_point_connection_managers=False) + assert "InBandConnectionManager" in reg.connection_managers From b964f546e97a68389d7404a18febeaa285620aa1 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 1 Apr 2026 16:04:52 -0500 Subject: [PATCH 03/25] dynamically load 
connection --- nodescraper/cli/invocation.py | 1 - nodescraper/pluginexecutor.py | 33 +++++++++++++-------- test/unit/framework/test_plugin_executor.py | 17 +++++++++++ 3 files changed, 38 insertions(+), 13 deletions(-) diff --git a/nodescraper/cli/invocation.py b/nodescraper/cli/invocation.py index 024f1882..12cd3a94 100644 --- a/nodescraper/cli/invocation.py +++ b/nodescraper/cli/invocation.py @@ -23,7 +23,6 @@ # SOFTWARE. # ############################################################################### -"""Plugin run invocation context for embedded hosts (e.g. error-scraper OOB).""" from __future__ import annotations diff --git a/nodescraper/pluginexecutor.py b/nodescraper/pluginexecutor.py index a8da102b..1782bb50 100644 --- a/nodescraper/pluginexecutor.py +++ b/nodescraper/pluginexecutor.py @@ -26,6 +26,7 @@ from __future__ import annotations import copy +import inspect import logging from collections import deque from typing import Optional, Type, Union @@ -160,30 +161,38 @@ def run_queue(self) -> list[PluginResult]: connection_manager_class: Type[ConnectionManager] = plugin_class.CONNECTION_TYPE if ( connection_manager_class.__name__ - not in self.plugin_registry.connection_managers + in self.plugin_registry.connection_managers ): + mgr_impl = self.plugin_registry.connection_managers[ + connection_manager_class.__name__ + ] + elif ( + inspect.isclass(connection_manager_class) + and issubclass(connection_manager_class, ConnectionManager) + and not inspect.isabstract(connection_manager_class) + ): + # External packages set CONNECTION_TYPE on the plugin; + # use it when not listed under nodescraper.connection_managers entry points. 
+ mgr_impl = connection_manager_class + else: self.logger.error( "Unable to find registered connection manager class for %s that is required by", connection_manager_class.__name__, ) continue - if connection_manager_class not in self.connection_library: + if mgr_impl not in self.connection_library: self.logger.info( "Initializing connection manager for %s with default args", - connection_manager_class.__name__, + mgr_impl.__name__, ) - self.connection_library[connection_manager_class] = ( - connection_manager_class( - system_info=self.system_info, - logger=self.logger, - task_result_hooks=self.connection_result_hooks, - ) + self.connection_library[mgr_impl] = mgr_impl( + system_info=self.system_info, + logger=self.logger, + task_result_hooks=self.connection_result_hooks, ) - init_payload["connection_manager"] = self.connection_library[ - connection_manager_class - ] + init_payload["connection_manager"] = self.connection_library[mgr_impl] try: plugin_inst = plugin_class(**init_payload) diff --git a/test/unit/framework/test_plugin_executor.py b/test/unit/framework/test_plugin_executor.py index a5121398..7ed75b93 100644 --- a/test/unit/framework/test_plugin_executor.py +++ b/test/unit/framework/test_plugin_executor.py @@ -164,3 +164,20 @@ def test_apply_global_args_to_plugin(): "foo": "analyzed", "regex_match": False, } + + +def test_connection_manager_from_plugin_when_not_in_registry(): + """CONNECTION_TYPE may come from an external package without a registry entry.""" + registry = PluginRegistry() + registry.plugins = {"TestPluginB": TestPluginB} + registry.connection_managers = {} + + executor = PluginExecutor( + plugin_configs=[PluginConfig(plugins={"TestPluginB": {}})], + plugin_registry=registry, + ) + results = executor.run_queue() + + assert len(results) == 1 + assert results[0].source == "testB" + assert results[0].status == ExecutionStatus.OK From 380e98ac8dc1ef00175f94bee31ffd3b2ec76f49 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 10 Apr 2026 
09:56:42 -0500 Subject: [PATCH 04/25] updates --- nodescraper/cli/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nodescraper/cli/cli.py b/nodescraper/cli/cli.py index d9044cbe..80c791e4 100644 --- a/nodescraper/cli/cli.py +++ b/nodescraper/cli/cli.py @@ -527,7 +527,6 @@ def main( "skip_sudo" ] = True - log_system_info(log_path, system_info, logger) except Exception as e: parser.error(str(e)) @@ -544,6 +543,8 @@ def main( host_cli_args=host_cli_args, ) + log_system_info(log_path, system_info, logger) + dump_results_to_csv(results, sname, log_path, timestamp, logger) if parsed_args.reference_config: From faea3443d5e458fb6ef7d175c724fd467922c6cf Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 10 Apr 2026 10:14:09 -0500 Subject: [PATCH 05/25] added cpu/gpu_count to system_info --- nodescraper/models/systeminfo.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nodescraper/models/systeminfo.py b/nodescraper/models/systeminfo.py index e82d6212..d91a68cf 100644 --- a/nodescraper/models/systeminfo.py +++ b/nodescraper/models/systeminfo.py @@ -39,6 +39,8 @@ class SystemInfo(BaseModel): os_family: OSFamily = OSFamily.UNKNOWN sku: Optional[str] = None platform: Optional[str] = None + gpu_count: Optional[int] = None + cpu_count: Optional[int] = None metadata: Optional[dict] = Field(default_factory=dict) location: Optional[SystemLocation] = SystemLocation.LOCAL vendorid_ep: int = 0x1002 From baa472433acccc6690ab679b51fb776793712e65 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 10 Apr 2026 15:24:46 -0500 Subject: [PATCH 06/25] supported sku updates --- nodescraper/interfaces/datacollectortask.py | 27 ++++++++++++++++++--- test/unit/framework/test_datacollector.py | 27 +++++++++++++++++++++ 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/nodescraper/interfaces/datacollectortask.py b/nodescraper/interfaces/datacollectortask.py index 737a297c..020bf053 100644 --- 
b/nodescraper/interfaces/datacollectortask.py @@ -26,6 +26,7 @@ import abc import inspect import logging +from enum import Enum from functools import wraps from typing import Callable, ClassVar, Generic, Optional, Type, Union @@ -47,6 +48,21 @@ from .taskresulthook import TaskResultHook +def _supported_sku_name_set(supported: Optional[set[Union[str, Enum]]]) -> Optional[set[str]]: + """Map ``SUPPORTED_SKUS`` to string names for comparison with ``SystemInfo.sku``.""" + if not supported: + return None + names: set[str] = set() + for item in supported: + if isinstance(item, Enum): + names.add(item.name) + elif isinstance(item, str): + names.add(item) + else: + names.add(str(item)) + return names + + def collect_decorator( func: Callable[..., tuple[TaskResult, Optional[TDataModel]]], ) -> Callable[..., tuple[TaskResult, Optional[TDataModel]]]: @@ -111,8 +127,8 @@ class DataCollector(Task, abc.ABC, Generic[TConnection, TDataModel, TCollectArg] DATA_MODEL: Type[TDataModel] - # A set of supported SKUs for this data collector - SUPPORTED_SKUS: ClassVar[Optional[set[str]]] = None + # A set of supported SKUs for this data collector (strings or enum members; enum uses .name) + SUPPORTED_SKUS: ClassVar[Optional[set[Union[str, Enum]]]] = None # A set of supported Platforms for this data collector, SUPPORTED_PLATFORMS: ClassVar[Optional[set[str]]] = None @@ -153,7 +169,12 @@ def __init__( self.system_interaction_level = system_interaction_level self.connection = connection - if self.SUPPORTED_SKUS and self.system_info.sku not in self.SUPPORTED_SKUS: + allowed_skus = _supported_sku_name_set(self.SUPPORTED_SKUS) + if ( + allowed_skus is not None + and self.system_info.sku is not None + and self.system_info.sku not in allowed_skus + ): raise SystemCompatibilityError( f"{self.system_info.sku} SKU is not supported for this collector" ) diff --git a/test/unit/framework/test_datacollector.py b/test/unit/framework/test_datacollector.py index 1315d797..30fde48f 100644 --- 
a/test/unit/framework/test_datacollector.py +++ b/test/unit/framework/test_datacollector.py @@ -24,6 +24,7 @@ # ############################################################################### import logging +from enum import Enum from typing import Optional, Tuple import pytest @@ -127,6 +128,32 @@ def test_good_sku_and_platform(conn_mock): assert res.status == ExecutionStatus.OK +class _SampleSku(Enum): + GOOD = 1 + + +def test_supported_skus_may_use_enum_members(conn_mock): + class EnumSkuCollector(DummyCollector): + SUPPORTED_SKUS = {_SampleSku.GOOD} + + args = {"name": "h", "sku": "GOOD", "platform": "X", "os_family": 1} + info = SystemInfo(**args) + col = EnumSkuCollector(info, conn_mock) + res, data = col.collect_data() + assert res.status == ExecutionStatus.OK + + +def test_supported_skus_not_enforced_when_system_sku_is_none(conn_mock): + class RestrictedSkuCollector(DummyCollector): + SUPPORTED_SKUS = {_SampleSku.GOOD} + + args = {"name": "h", "sku": None, "platform": "X", "os_family": 1} + info = SystemInfo(**args) + col = RestrictedSkuCollector(info, conn_mock) + res, data = col.collect_data() + assert res.status == ExecutionStatus.OK + + def test_missing_data_model(): with pytest.raises(TypeError, match="No data model set for DummyCollector1"): From 41164240b9ba811e1c201f226e841ec941d9d32f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 14 Apr 2026 00:18:29 +0000 Subject: [PATCH 07/25] docs: Update plugin documentation [automated] --- docs/PLUGIN_DOC.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/PLUGIN_DOC.md b/docs/PLUGIN_DOC.md index 31b7ff74..baa51d5a 100644 --- a/docs/PLUGIN_DOC.md +++ b/docs/PLUGIN_DOC.md @@ -4,7 +4,7 @@ | Plugin | Collection | Analyzer Args | Collection Args | DataModel | Collector | Analyzer | | --- | --- | --- | --- | --- | --- | --- | -| AmdSmiPlugin | bad-pages
firmware --json
list --json
metric -g all
partition --json
process --json
ras --cper --folder={folder}
ras --afid --cper-file {cper_file}
static -g all --json
static -g {gpu_id} --json
topology
version --json
xgmi -l
xgmi -m | **Analyzer Args:**
- `check_static_data`: bool — If True, run static data checks (e.g. driver version, partition mode).
- `expected_gpu_processes`: Optional[int] — Expected number of GPU processes.
- `expected_max_power`: Optional[int] — Expected maximum power value (e.g. watts).
- `expected_driver_version`: Optional[str] — Expected AMD driver version string.
- `expected_memory_partition_mode`: Optional[str] — Expected memory partition mode (e.g. sp3, dp).
- `expected_compute_partition_mode`: Optional[str] — Expected compute partition mode.
- `expected_firmware_versions`: Optional[dict[str, str]] — Expected firmware versions keyed by amd-smi fw_id (e.g. PLDM_BUNDLE).
- `l0_to_recovery_count_error_threshold`: Optional[int] — L0-to-recovery count above which an error is raised.
- `l0_to_recovery_count_warning_threshold`: Optional[int] — L0-to-recovery count above which a warning is raised.
- `vendorid_ep`: Optional[str] — Expected endpoint vendor ID (e.g. for PCIe).
- `vendorid_ep_vf`: Optional[str] — Expected endpoint VF vendor ID.
- `devid_ep`: Optional[str] — Expected endpoint device ID.
- `devid_ep_vf`: Optional[str] — Expected endpoint VF device ID.
- `sku_name`: Optional[str] — Expected SKU name string for GPU.
- `expected_xgmi_speed`: Optional[list[float]] — Expected xGMI speed value(s) (e.g. link rate).
- `analysis_range_start`: Optional[datetime.datetime] — Start of time range for time-windowed analysis.
- `analysis_range_end`: Optional[datetime.datetime] — End of time range for time-windowed analysis. | **Collection Args:**
- `cper_file_path`: Optional[str] — Path to CPER folder or file for RAS AFID collection (ras --afid --cper-file). | [AmdSmiDataModel](#AmdSmiDataModel-Model) | [AmdSmiCollector](#Collector-Class-AmdSmiCollector) | [AmdSmiAnalyzer](#Data-Analyzer-Class-AmdSmiAnalyzer) | +| AmdSmiPlugin | bad-pages
firmware --json
list --json
metric -g all
partition --json
process --json
ras --cper --folder={folder}
ras --afid --cper-file {cper_file}
static -g all --json
static -g {gpu_id} --json
topology
version --json
xgmi -l
xgmi -m | **Analyzer Args:**
- `check_static_data`: bool — If True, run static data checks (e.g. driver version, partition mode).
- `expected_gpu_processes`: Optional[int] — Expected number of GPU processes.
- `expected_max_power`: Optional[int] — Expected maximum power value (e.g. watts).
- `expected_driver_version`: Optional[str] — Expected AMD driver version string.
- `expected_memory_partition_mode`: Optional[str] — Expected memory partition mode (e.g. sp3, dp).
- `expected_compute_partition_mode`: Optional[str] — Expected compute partition mode.
- `expected_firmware_versions`: Optional[dict[str, str]] — Expected firmware versions keyed by amd-smi fw_id (e.g. PLDM_BUNDLE).
- `l0_to_recovery_count_error_threshold`: Optional[int] — L0-to-recovery count above which an error is raised.
- `l0_to_recovery_count_warning_threshold`: Optional[int] — L0-to-recovery count above which a warning is raised.
- `vendorid_ep`: Optional[str] — Expected endpoint vendor ID (e.g. for PCIe).
- `vendorid_ep_vf`: Optional[str] — Expected endpoint VF vendor ID.
- `devid_ep`: Optional[str] — Expected endpoint device ID.
- `devid_ep_vf`: Optional[str] — Expected endpoint VF device ID.
- `sku_name`: Optional[str] — Expected SKU name string for GPU.
- `expected_xgmi_speed`: Optional[list[float]] — Expected xGMI speed value(s) (e.g. link rate).
- `analysis_range_start`: Optional[datetime.datetime] — Start of time range for time-windowed analysis.
- `analysis_range_end`: Optional[datetime.datetime] — End of time range for time-windowed analysis. | **Collection Args:**
- `analysis_firmware_ids`: Optional[list[str]] — amd-smi fw_id values to record in analysis_ref.firmware_versions
- `cper_file_path`: Optional[str] — Path to CPER folder or file for RAS AFID collection (ras --afid --cper-file). | [AmdSmiDataModel](#AmdSmiDataModel-Model) | [AmdSmiCollector](#Collector-Class-AmdSmiCollector) | [AmdSmiAnalyzer](#Data-Analyzer-Class-AmdSmiAnalyzer) | | BiosPlugin | sh -c 'cat /sys/devices/virtual/dmi/id/bios_version'
wmic bios get SMBIOSBIOSVersion /Value | **Analyzer Args:**
- `exp_bios_version`: list[str] — Expected BIOS version(s) to match against collected value (str or list).
- `regex_match`: bool — If True, match exp_bios_version as regex; otherwise exact match. | - | [BiosDataModel](#BiosDataModel-Model) | [BiosCollector](#Collector-Class-BiosCollector) | [BiosAnalyzer](#Data-Analyzer-Class-BiosAnalyzer) | | CmdlinePlugin | cat /proc/cmdline | **Analyzer Args:**
- `required_cmdline`: Union[str, List] — Command-line parameters that must be present (e.g. 'pci=bfsort').
- `banned_cmdline`: Union[str, List] — Command-line parameters that must not be present.
- `os_overrides`: Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig] — Per-OS overrides for required_cmdline and banned_cmdline (keyed by OS identifier).
- `platform_overrides`: Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig] — Per-platform overrides for required_cmdline and banned_cmdline (keyed by platform). | - | [CmdlineDataModel](#CmdlineDataModel-Model) | [CmdlineCollector](#Collector-Class-CmdlineCollector) | [CmdlineAnalyzer](#Data-Analyzer-Class-CmdlineAnalyzer) | | DeviceEnumerationPlugin | powershell -Command "(Get-WmiObject -Class Win32_Processor | Measure-Object).Count"
lspci -d {vendorid_ep}: | grep -i 'VGA\|Display\|3D' | wc -l
powershell -Command "(wmic path win32_VideoController get name | findstr AMD | Measure-Object).Count"
lscpu
lshw
lspci -d {vendorid_ep}: | grep -i 'Virtual Function' | wc -l
powershell -Command "(Get-VMHostPartitionableGpu | Measure-Object).Count" | **Analyzer Args:**
- `cpu_count`: Optional[list[int]] — Expected CPU count(s); pass as int or list of ints. Analysis passes if actual is in list.
- `gpu_count`: Optional[list[int]] — Expected GPU count(s); pass as int or list of ints. Analysis passes if actual is in list.
- `vf_count`: Optional[list[int]] — Expected virtual function count(s); pass as int or list of ints. Analysis passes if actual is in list. | - | [DeviceEnumerationDataModel](#DeviceEnumerationDataModel-Model) | [DeviceEnumerationCollector](#Collector-Class-DeviceEnumerationCollector) | [DeviceEnumerationAnalyzer](#Data-Analyzer-Class-DeviceEnumerationAnalyzer) | @@ -970,6 +970,8 @@ Data model for amd-smi data. - **xgmi_link**: `Optional[list[nodescraper.plugins.inband.amdsmi.amdsmidata.XgmiLinks]]` - **cper_data**: `Optional[list[nodescraper.models.datamodel.FileModel]]` - **cper_afids**: `dict[str, int]` +- **analysis_firmware_ids**: `Optional[list[str]]` +- **analysis_ref**: `Optional[nodescraper.plugins.inband.amdsmi.amdsmidata.AmdSmiAnalysisRef]` ## BiosDataModel Model From 7d42526f9ae2bbf2999b73ca382ddae6a45be6c8 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 16 Apr 2026 09:30:04 -0500 Subject: [PATCH 08/25] added new API --- nodescraper/cli/__init__.py | 6 ++- nodescraper/cli/cli.py | 36 +++++++++++++++++ nodescraper/cli/embed.py | 39 ++++++++++++++++-- test/unit/cli/test_cli_embed_api.py | 63 +++++++++++++++++++++++++++++ 4 files changed, 140 insertions(+), 4 deletions(-) create mode 100644 test/unit/cli/test_cli_embed_api.py diff --git a/nodescraper/cli/__init__.py b/nodescraper/cli/__init__.py index f5e396b2..44e9b02b 100644 --- a/nodescraper/cli/__init__.py +++ b/nodescraper/cli/__init__.py @@ -24,8 +24,9 @@ # ############################################################################### +from .cli import get_cli_top_level_subcommands from .cli import main as cli_entry -from .embed import run_main_return_code +from .embed import CLI_TOP_LEVEL_SUBCOMMANDS, run_cli_return_code, run_main_return_code from .invocation import ( PluginRunInvocation, get_plugin_run_invocation, @@ -34,7 +35,10 @@ ) __all__ = [ + "CLI_TOP_LEVEL_SUBCOMMANDS", "cli_entry", + "get_cli_top_level_subcommands", + "run_cli_return_code", "run_main_return_code", 
"PluginRunInvocation", "get_plugin_run_invocation", diff --git a/nodescraper/cli/cli.py b/nodescraper/cli/cli.py index 7447e02a..f3f754d9 100644 --- a/nodescraper/cli/cli.py +++ b/nodescraper/cli/cli.py @@ -25,6 +25,7 @@ ############################################################################### import argparse import datetime +import functools import json import logging import os @@ -334,6 +335,41 @@ def build_parser( return parser, plugin_subparser_map +def _top_level_subcommand_names(root: argparse.ArgumentParser) -> tuple[str, ...]: + """Return ``dest=subcmd`` subparser names from the root CLI parser. + + Args: + root: Parser returned by :func:`build_parser`. + + Returns: + Tuple of top-level subcommand strings. + """ + for action in root._actions: + if isinstance(action, argparse._SubParsersAction) and action.dest == "subcmd": + return tuple(action.choices.keys()) + raise RuntimeError("nodescraper CLI root parser has no subcmd subparsers") + + +@functools.lru_cache(maxsize=1) +def get_cli_top_level_subcommands() -> tuple[str, ...]: + """Return top-level subcommand names from a parser built like :func:`main` (cached). + + Returns: + Tuple of ``subcmd`` subparser names; call ``cache_clear()`` if registries change in-process. + """ + plugin_reg = PluginRegistry() + config_reg = ConfigRegistry() + config_reg.configs["AllPlugins"] = PluginConfig( + name="AllPlugins", + desc="Run all registered plugins with default arguments", + global_args={}, + plugins={name: {} for name in plugin_reg.plugins}, + result_collators={}, + ) + parser, _plugin_subparser_map = build_parser(plugin_reg, config_reg) + return _top_level_subcommand_names(parser) + + def setup_logger( log_level: str = "INFO", log_path: Optional[str] = None, diff --git a/nodescraper/cli/embed.py b/nodescraper/cli/embed.py index aa5ad082..60d94515 100644 --- a/nodescraper/cli/embed.py +++ b/nodescraper/cli/embed.py @@ -23,14 +23,39 @@ # SOFTWARE. 
# ############################################################################### -"""In-process CLI entry without adding new argparse flags.""" from __future__ import annotations import argparse from typing import Optional -__all__ = ["run_main_return_code"] +from nodescraper.cli.cli import get_cli_top_level_subcommands + +CLI_TOP_LEVEL_SUBCOMMANDS = get_cli_top_level_subcommands() + +__all__ = [ + "CLI_TOP_LEVEL_SUBCOMMANDS", + "get_cli_top_level_subcommands", + "run_cli_return_code", + "run_main_return_code", +] + + +def run_cli_return_code( + argv: list[str], + *, + host_cli_args: Optional[argparse.Namespace] = None, +) -> int: + """Run nodescraper in-process; same behavior as :func:`run_main_return_code`. + + Args: + argv: Tokens after the program name. + host_cli_args: Optional host namespace forwarded to :func:`nodescraper.cli.cli.main`. + + Returns: + Integer exit code (``SystemExit`` is mapped, not raised). + """ + return run_main_return_code(argv, host_cli_args=host_cli_args) def run_main_return_code( @@ -38,7 +63,15 @@ def run_main_return_code( *, host_cli_args: Optional[argparse.Namespace] = None, ) -> int: - """Runs the nodescraper main entrypoint and maps SystemExit to an integer return code.""" + """Run :func:`nodescraper.cli.cli.main` and map ``SystemExit`` to an exit code. + + Args: + arg_input: Tokens after the program name. + host_cli_args: Optional host namespace for embedded runs. + + Returns: + Integer exit code. + """ from nodescraper.cli.cli import main try: diff --git a/test/unit/cli/test_cli_embed_api.py b/test/unit/cli/test_cli_embed_api.py new file mode 100644 index 00000000..db44f6cf --- /dev/null +++ b/test/unit/cli/test_cli_embed_api.py @@ -0,0 +1,63 @@ +############################################################################### +# +# MIT License +# +# Copyright (C) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### + +from __future__ import annotations + +import pytest + +from nodescraper.cli.cli import get_cli_top_level_subcommands +from nodescraper.cli.embed import ( + CLI_TOP_LEVEL_SUBCOMMANDS, + run_cli_return_code, + run_main_return_code, +) + + +def test_get_cli_top_level_subcommands_matches_argparse_subparsers() -> None: + subs = get_cli_top_level_subcommands() + assert isinstance(subs, tuple) + assert "run-plugins" in subs + assert "summary" in subs + assert all(isinstance(s, str) for s in subs) + + +def test_cli_top_level_subcommands_lazy_alias_matches_getter() -> None: + assert CLI_TOP_LEVEL_SUBCOMMANDS == get_cli_top_level_subcommands() + + +def test_run_cli_return_code_and_run_main_return_code_delegate( + monkeypatch: pytest.MonkeyPatch, +) -> None: + calls: list[list[str]] = [] + + def fake_main(arg_input: list[str], *, host_cli_args=None) -> None: + calls.append(list(arg_input)) + raise SystemExit(7) + + monkeypatch.setattr("nodescraper.cli.cli.main", fake_main) + assert run_cli_return_code(["describe", "plugin", "X"]) == 7 + assert run_main_return_code(["a", "b"]) == 7 + assert calls == [["describe", "plugin", "X"], ["a", "b"]] From 4117eadda5a3e0f3d14c0376a5cca003979bfb85 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 16 Apr 2026 11:39:06 -0500 Subject: [PATCH 09/25] adding --plugin-config= opt --- README.md | 22 ++++----- nodescraper/cli/cli.py | 16 +++++-- test/functional/test_fabrics_plugin.py | 8 ++-- test/functional/test_network_plugin.py | 11 ++--- test/functional/test_nic_plugin.py | 8 ++-- test/functional/test_pcie_plugin.py | 14 ++---- test/functional/test_plugin_configs.py | 48 ++++++++----------- test/functional/test_rdma_plugin.py | 8 ++-- .../test_reference_config_workflow.py | 4 +- test/functional/test_sys_settings_plugin.py | 6 +-- test/unit/cli/test_cli_no_console_stdout.py | 2 +- 11 files changed, 67 insertions(+), 80 deletions(-) diff --git 
a/README.md b/README.md index eda8dea4..cf4647a7 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ usage: cli.py [-h] [--version] [--sys-name STRING] [--sys-location {LOCAL,REMOTE}] [--sys-interaction-level {PASSIVE,INTERACTIVE,DISRUPTIVE}] [--sys-sku STRING] [--sys-platform STRING] - [--plugin-configs [STRING ...]] [--system-config STRING] + [--plugin-configs LIST] [--system-config STRING] [--connection-config STRING] [--log-path STRING] [--log-level {CRITICAL,FATAL,ERROR,WARN,WARNING,INFO,DEBUG,NOTSET}] [--no-console-log] [--gen-reference-config] [--skip-sudo] @@ -112,10 +112,9 @@ options: --sys-sku STRING Manually specify SKU of system (default: None) --sys-platform STRING Specify system platform (default: None) - --plugin-configs [STRING ...] - built-in config names or paths to plugin config JSONs. - Available built-in configs: NodeStatus, AllPlugins - (default: None) + --plugin-configs LIST + Comma-separated built-in names and/or plugin config JSON + paths. Built-in: NodeStatus, AllPlugins (default: None) --system-config STRING Path to system config json (default: None) --connection-config STRING @@ -348,7 +347,7 @@ You can extend the built-in error detection with custom regex patterns. Create a Save this to `dmesg_custom_config.json` and run: ```sh -node-scraper --plugin-configs dmesg_custom_config.json run-plugins DmesgPlugin +node-scraper --plugin-configs=dmesg_custom_config.json run-plugins DmesgPlugin ``` #### **'compare-runs' subcommand** @@ -539,8 +538,9 @@ Built-in configs include **NodeStatus** (a subset of plugins) and **AllPlugins** registered plugin with default arguments—useful for generating a reference config from the full system). **NodeStatus plus additional plugins** — built-in configs merge with plugins named after `run-plugins`. -Use **`--plugin-configs=`** (equals form): with a space -after `--plugin-configs`. 
See below for examples: +Values are comma-separated; pass as **`--plugin-configs=…`** or **`--plugin-configs` …** (same as other +optional flags), e.g. `--plugin-configs=NodeStatus,/path/extra.json`. +Examples: ```sh node-scraper --plugin-configs=NodeStatus run-plugins PciePlugin ``` @@ -551,7 +551,7 @@ node-scraper --log-path ./logs --plugin-configs=NodeStatus run-plugins PciePlugi Using a JSON file: ```sh -node-scraper --plugin-configs plugin_config.json +node-scraper --plugin-configs=plugin_config.json ``` Here is an example of a comprehensive plugin config that specifies analyzer args for each plugin: ```json @@ -613,7 +613,7 @@ data. **Run all registered plugins (AllPlugins config):** ```sh -node-scraper --plugin-config AllPlugins +node-scraper --plugin-configs=AllPlugins ``` @@ -647,7 +647,7 @@ This will generate the following config: ``` This config can later be used on a different platform for comparison, using the steps at #2: ```sh -node-scraper --plugin-configs reference_config.json +node-scraper --plugin-configs=reference_config.json ``` diff --git a/nodescraper/cli/cli.py b/nodescraper/cli/cli.py index f3f754d9..26a30aa3 100644 --- a/nodescraper/cli/cli.py +++ b/nodescraper/cli/cli.py @@ -64,6 +64,11 @@ from nodescraper.pluginregistry import PluginRegistry +def _parse_plugin_configs_csv(value: str) -> list[str]: + """Split a comma-separated ``--plugin-configs`` value into names/paths.""" + return [p.strip() for p in value.split(",") if p.strip()] + + def build_parser( plugin_reg: PluginRegistry, config_reg: ConfigRegistry, @@ -126,10 +131,13 @@ def build_parser( parser.add_argument( "--plugin-configs", - type=str, - nargs="*", - help=f"built-in config names or paths to plugin config JSONs.\nAvailable built-in configs: {', '.join(config_reg.configs.keys())}", - metavar=META_VAR_MAP[str], + type=_parse_plugin_configs_csv, + default=None, + help=( + "Comma-separated built-in names and/or plugin config JSON paths " + f"(e.g. 
--plugin-configs=NodeStatus,/path/c.json). Built-ins: {', '.join(config_reg.configs.keys())}" + ), + metavar="LIST", ) parser.add_argument( diff --git a/test/functional/test_fabrics_plugin.py b/test/functional/test_fabrics_plugin.py index a8f0cd62..a334d966 100644 --- a/test/functional/test_fabrics_plugin.py +++ b/test/functional/test_fabrics_plugin.py @@ -48,7 +48,7 @@ def test_fabrics_plugin_with_basic_config(run_cli_command, fabrics_config_file, log_path = str(tmp_path / "logs_fabrics_basic") result = run_cli_command( - ["--log-path", log_path, "--plugin-configs", str(fabrics_config_file)], check=False + ["--log-path", log_path, f"--plugin-configs={fabrics_config_file}"], check=False ) assert result.returncode in [0, 1, 2] @@ -76,8 +76,7 @@ def test_fabrics_plugin_with_passive_interaction(run_cli_command, fabrics_config log_path, "--sys-interaction-level", "PASSIVE", - "--plugin-configs", - str(fabrics_config_file), + f"--plugin-configs={fabrics_config_file}", ], check=False, ) @@ -95,8 +94,7 @@ def test_fabrics_plugin_skip_sudo(run_cli_command, fabrics_config_file, tmp_path "--log-path", log_path, "--skip-sudo", - "--plugin-configs", - str(fabrics_config_file), + f"--plugin-configs={fabrics_config_file}", ], check=False, ) diff --git a/test/functional/test_network_plugin.py b/test/functional/test_network_plugin.py index 5759ad3b..d4ff71fc 100644 --- a/test/functional/test_network_plugin.py +++ b/test/functional/test_network_plugin.py @@ -48,7 +48,7 @@ def test_network_plugin_with_basic_config(run_cli_command, network_config_file, log_path = str(tmp_path / "logs_network_basic") result = run_cli_command( - ["--log-path", log_path, "--plugin-configs", str(network_config_file)], check=False + ["--log-path", log_path, f"--plugin-configs={network_config_file}"], check=False ) assert result.returncode in [0, 1, 2] @@ -76,8 +76,7 @@ def test_network_plugin_with_passive_interaction(run_cli_command, network_config log_path, "--sys-interaction-level", "PASSIVE", - 
"--plugin-configs", - str(network_config_file), + f"--plugin-configs={network_config_file}", ], check=False, ) @@ -95,8 +94,7 @@ def test_network_plugin_skip_sudo(run_cli_command, network_config_file, tmp_path "--log-path", log_path, "--skip-sudo", - "--plugin-configs", - str(network_config_file), + f"--plugin-configs={network_config_file}", ], check=False, ) @@ -113,8 +111,7 @@ def test_network_plugin_with_url(run_cli_command, network_config_file, tmp_path) [ "--log-path", log_path, - "--plugin-configs", - str(network_config_file), + f"--plugin-configs={network_config_file}", ], check=False, ) diff --git a/test/functional/test_nic_plugin.py b/test/functional/test_nic_plugin.py index ed9d28f2..b862845e 100644 --- a/test/functional/test_nic_plugin.py +++ b/test/functional/test_nic_plugin.py @@ -60,8 +60,7 @@ def test_nic_plugin_with_full_analyzer_args_config( [ "--log-path", log_path, - "--plugin-configs", - str(nic_plugin_config_full_analyzer_args), + f"--plugin-configs={nic_plugin_config_full_analyzer_args}", ], check=False, ) @@ -82,7 +81,7 @@ def test_nic_plugin_with_minimal_config(run_cli_command, nic_plugin_config_minim log_path = str(tmp_path / "logs_nic_minimal") result = run_cli_command( - ["--log-path", log_path, "--plugin-configs", str(nic_plugin_config_minimal)], + ["--log-path", log_path, f"--plugin-configs={nic_plugin_config_minimal}"], check=False, ) @@ -122,8 +121,7 @@ def test_nic_plugin_full_config_validates_analysis_args( [ "--log-path", log_path, - "--plugin-configs", - str(nic_plugin_config_full_analyzer_args), + f"--plugin-configs={nic_plugin_config_full_analyzer_args}", ], check=False, ) diff --git a/test/functional/test_pcie_plugin.py b/test/functional/test_pcie_plugin.py index 9d6c70c9..63bc21cf 100644 --- a/test/functional/test_pcie_plugin.py +++ b/test/functional/test_pcie_plugin.py @@ -54,7 +54,7 @@ def test_pcie_plugin_with_basic_config(run_cli_command, pcie_config_file, tmp_pa log_path = str(tmp_path / "logs_pcie_basic") result = 
run_cli_command( - ["--log-path", log_path, "--plugin-configs", str(pcie_config_file)], check=False + ["--log-path", log_path, f"--plugin-configs={pcie_config_file}"], check=False ) assert result.returncode in [0, 1, 2] @@ -69,7 +69,7 @@ def test_pcie_plugin_with_advanced_config(run_cli_command, pcie_advanced_config_ log_path = str(tmp_path / "logs_pcie_advanced") result = run_cli_command( - ["--log-path", log_path, "--plugin-configs", str(pcie_advanced_config_file)], + ["--log-path", log_path, f"--plugin-configs={pcie_advanced_config_file}"], check=False, ) @@ -97,8 +97,7 @@ def test_pcie_plugin_with_passive_interaction(run_cli_command, pcie_config_file, log_path, "--sys-interaction-level", "PASSIVE", - "--plugin-configs", - str(pcie_config_file), + f"--plugin-configs={pcie_config_file}", ], check=False, ) @@ -116,8 +115,7 @@ def test_pcie_plugin_skip_sudo(run_cli_command, pcie_config_file, tmp_path): "--log-path", log_path, "--skip-sudo", - "--plugin-configs", - str(pcie_config_file), + f"--plugin-configs={pcie_config_file}", ], check=False, ) @@ -136,9 +134,7 @@ def test_pcie_plugin_combined_configs( [ "--log-path", log_path, - "--plugin-configs", - str(pcie_config_file), - str(pcie_advanced_config_file), + f"--plugin-configs={pcie_config_file},{pcie_advanced_config_file}", ], check=False, ) diff --git a/test/functional/test_plugin_configs.py b/test/functional/test_plugin_configs.py index 768e9e6e..90c68218 100644 --- a/test/functional/test_plugin_configs.py +++ b/test/functional/test_plugin_configs.py @@ -95,9 +95,7 @@ def invalid_plugin_config(tmp_path): def test_plugin_config_with_builtin_config(run_cli_command, tmp_path): """Test using a built-in config name.""" log_path = str(tmp_path / "logs_builtin") - result = run_cli_command( - ["--log-path", log_path, "--plugin-configs", "NodeStatus"], check=False - ) + result = run_cli_command(["--log-path", log_path, "--plugin-configs=NodeStatus"], check=False) assert result.returncode in [0, 1, 2] output = 
result.stdout + result.stderr @@ -139,7 +137,7 @@ def test_individual_plugin_with_config_file( log_path = str(tmp_path / f"logs_{plugin_name.lower()}") result = run_cli_command( - ["--log-path", log_path, "--plugin-configs", str(config_file)], check=False + ["--log-path", log_path, f"--plugin-configs={config_file}"], check=False ) assert result.returncode in [0, 1, 2] @@ -153,7 +151,7 @@ def test_plugin_config_with_custom_json_file(run_cli_command, sample_plugin_conf """Test using a custom JSON config file path.""" log_path = str(tmp_path / "logs_custom") result = run_cli_command( - ["--log-path", log_path, "--plugin-configs", sample_plugin_config], check=False + ["--log-path", log_path, f"--plugin-configs={sample_plugin_config}"], check=False ) assert result.returncode in [0, 1, 2] @@ -168,13 +166,7 @@ def test_plugin_config_with_multiple_configs(run_cli_command, plugin_config_file os_config = str(plugin_config_files["OsPlugin"]) result = run_cli_command( - [ - "--log-path", - log_path, - "--plugin-configs", - bios_config, - os_config, - ], + ["--log-path", log_path, f"--plugin-configs={bios_config},{os_config}"], check=False, ) @@ -186,7 +178,7 @@ def test_plugin_config_with_multiple_configs(run_cli_command, plugin_config_file def test_plugin_config_with_nonexistent_file(run_cli_command, tmp_path): """Test that a nonexistent config file path fails gracefully.""" nonexistent_path = str(tmp_path / "nonexistent_config.json") - result = run_cli_command(["--plugin-configs", nonexistent_path], check=False) + result = run_cli_command([f"--plugin-configs={nonexistent_path}"], check=False) assert result.returncode != 0 output = (result.stdout + result.stderr).lower() @@ -195,7 +187,7 @@ def test_plugin_config_with_nonexistent_file(run_cli_command, tmp_path): def test_plugin_config_with_invalid_builtin_name(run_cli_command): """Test that an invalid built-in config name fails gracefully.""" - result = run_cli_command(["--plugin-configs", "NonExistentConfig"], check=False) + 
result = run_cli_command(["--plugin-configs=NonExistentConfig"], check=False) assert result.returncode != 0 output = (result.stdout + result.stderr).lower() @@ -204,7 +196,7 @@ def test_plugin_config_with_invalid_builtin_name(run_cli_command): def test_plugin_config_with_invalid_json(run_cli_command, invalid_plugin_config): """Test that an invalid JSON file fails gracefully.""" - result = run_cli_command(["--plugin-configs", invalid_plugin_config], check=False) + result = run_cli_command([f"--plugin-configs={invalid_plugin_config}"], check=False) assert result.returncode != 0 output = (result.stdout + result.stderr).lower() @@ -212,9 +204,9 @@ def test_plugin_config_with_invalid_json(run_cli_command, invalid_plugin_config) def test_plugin_config_empty_list(run_cli_command, tmp_path): - """Test --plugin-configs with no arguments (uses default config).""" + """Test omitting --plugin-configs (uses default config).""" log_path = str(tmp_path / "logs_empty") - result = run_cli_command(["--log-path", log_path, "--plugin-configs"], check=False) + result = run_cli_command(["--log-path", log_path], check=False) assert result.returncode in [0, 1, 2] output = result.stdout + result.stderr @@ -234,8 +226,7 @@ def test_plugin_config_with_system_interaction_level( log_path, "--sys-interaction-level", "PASSIVE", - "--plugin-configs", - config_file, + f"--plugin-configs={config_file}", ], check=False, ) @@ -254,8 +245,7 @@ def test_plugin_config_combined_with_run_plugins(run_cli_command, plugin_config_ [ "--log-path", log_path, - "--plugin-configs", - config_file, + f"--plugin-configs={config_file}", "run-plugins", "UptimePlugin", ], @@ -272,7 +262,9 @@ def test_plugin_config_verify_log_output(run_cli_command, plugin_config_files, t log_path = str(tmp_path / "logs_verify") config_file = str(plugin_config_files["OsPlugin"]) - result = run_cli_command(["--log-path", log_path, "--plugin-configs", config_file], check=False) + result = run_cli_command( + ["--log-path", log_path, 
f"--plugin-configs={config_file}"], check=False + ) log_dirs = [d for d in os.listdir(tmp_path) if d.startswith("logs_verify")] if result.returncode in [0, 1]: @@ -304,7 +296,7 @@ def test_dmesg_plugin_log_dmesg_data_false(run_cli_command, tmp_path): log_path = str(tmp_path / "logs_dmesg_no_log") result = run_cli_command( - ["--log-path", log_path, "--plugin-configs", str(config_file)], check=False + ["--log-path", log_path, f"--plugin-configs={config_file}"], check=False ) assert result.returncode in [0, 1, 2] @@ -331,7 +323,7 @@ def test_dmesg_plugin_log_dmesg_data_true(run_cli_command, tmp_path): log_path = str(tmp_path / "logs_dmesg_with_log") result = run_cli_command( - ["--log-path", log_path, "--plugin-configs", str(config_file)], check=False + ["--log-path", log_path, f"--plugin-configs={config_file}"], check=False ) if result.returncode in [0, 1]: @@ -388,7 +380,7 @@ def test_dmesg_plugin_with_custom_regex_in_config(run_cli_command, tmp_path): log_path = str(tmp_path / "logs_custom_regex") result = run_cli_command( - ["--log-path", log_path, "--plugin-configs", str(config_file)], check=False + ["--log-path", log_path, f"--plugin-configs={config_file}"], check=False ) # Check that command ran successfully @@ -448,7 +440,7 @@ def test_dmesg_plugin_with_event_collapsing_config(run_cli_command, tmp_path): log_path = str(tmp_path / "logs_collapse") result = run_cli_command( - ["--log-path", log_path, "--plugin-configs", str(config_file)], check=False + ["--log-path", log_path, f"--plugin-configs={config_file}"], check=False ) assert result.returncode in [0, 1, 2] @@ -521,7 +513,7 @@ def test_dmesg_plugin_with_custom_regex_and_collapsing(run_cli_command, tmp_path log_path = str(tmp_path / "logs_custom_collapse") result = run_cli_command( - ["--log-path", log_path, "--plugin-configs", str(config_file)], check=False + ["--log-path", log_path, f"--plugin-configs={config_file}"], check=False ) assert result.returncode in [0, 1, 2] @@ -579,7 +571,7 @@ def 
test_dmesg_plugin_different_collapse_intervals(run_cli_command, tmp_path): log_path_small = str(tmp_path / "logs_small_interval") result = run_cli_command( - ["--log-path", log_path_small, "--plugin-configs", str(config_file_small)], check=False + ["--log-path", log_path_small, f"--plugin-configs={config_file_small}"], check=False ) assert result.returncode in [0, 1, 2] diff --git a/test/functional/test_rdma_plugin.py b/test/functional/test_rdma_plugin.py index 862de3b8..31de828f 100644 --- a/test/functional/test_rdma_plugin.py +++ b/test/functional/test_rdma_plugin.py @@ -48,7 +48,7 @@ def test_rdma_plugin_with_basic_config(run_cli_command, rdma_config_file, tmp_pa log_path = str(tmp_path / "logs_rdma_basic") result = run_cli_command( - ["--log-path", log_path, "--plugin-configs", str(rdma_config_file)], check=False + ["--log-path", log_path, f"--plugin-configs={rdma_config_file}"], check=False ) assert result.returncode in [0, 1, 2] @@ -76,8 +76,7 @@ def test_rdma_plugin_with_passive_interaction(run_cli_command, rdma_config_file, log_path, "--sys-interaction-level", "PASSIVE", - "--plugin-configs", - str(rdma_config_file), + f"--plugin-configs={rdma_config_file}", ], check=False, ) @@ -95,8 +94,7 @@ def test_rdma_plugin_skip_sudo(run_cli_command, rdma_config_file, tmp_path): "--log-path", log_path, "--skip-sudo", - "--plugin-configs", - str(rdma_config_file), + f"--plugin-configs={rdma_config_file}", ], check=False, ) diff --git a/test/functional/test_reference_config_workflow.py b/test/functional/test_reference_config_workflow.py index 7b929d38..b9f1648d 100644 --- a/test/functional/test_reference_config_workflow.py +++ b/test/functional/test_reference_config_workflow.py @@ -155,7 +155,7 @@ def test_use_generated_reference_config(run_cli_command, tmp_path): assert reference_config_path.exists() use_result = run_cli_command( - ["--log-path", use_log_path, "--plugin-configs", str(reference_config_path)], + ["--log-path", use_log_path, 
f"--plugin-configs={reference_config_path}"], check=False, ) @@ -209,7 +209,7 @@ def test_full_workflow_all_plugins(run_cli_command, tmp_path, all_plugin_names): assert isinstance(plugin_config["analysis_args"], dict) use_result = run_cli_command( - ["--log-path", use_log_path, "--plugin-configs", str(reference_config_path)], + ["--log-path", use_log_path, f"--plugin-configs={reference_config_path}"], check=False, ) diff --git a/test/functional/test_sys_settings_plugin.py b/test/functional/test_sys_settings_plugin.py index 85b1bc03..03869184 100644 --- a/test/functional/test_sys_settings_plugin.py +++ b/test/functional/test_sys_settings_plugin.py @@ -54,7 +54,7 @@ def test_sys_settings_plugin_with_config_file(run_cli_command, sys_settings_conf log_path = str(tmp_path / "logs_sys_settings") result = run_cli_command( - ["--log-path", log_path, "--plugin-configs", str(sys_settings_config_file)], check=False + ["--log-path", log_path, f"--plugin-configs={sys_settings_config_file}"], check=False ) assert result.returncode in [0, 1, 2] @@ -85,7 +85,7 @@ def test_sys_settings_plugin_output_contains_plugin_result( log_path = str(tmp_path / "logs_sys_settings_result") result = run_cli_command( - ["--log-path", log_path, "--plugin-configs", str(sys_settings_config_file)], check=False + ["--log-path", log_path, f"--plugin-configs={sys_settings_config_file}"], check=False ) output = result.stdout + result.stderr @@ -99,7 +99,7 @@ def test_sys_settings_plugin_with_plugin_config_json(run_cli_command, plugin_con log_path = str(tmp_path / "logs_plugin_config") result = run_cli_command( - ["--log-path", log_path, "--plugin-configs", str(plugin_config_json)], check=False + ["--log-path", log_path, f"--plugin-configs={plugin_config_json}"], check=False ) assert result.returncode in [0, 1, 2] diff --git a/test/unit/cli/test_cli_no_console_stdout.py b/test/unit/cli/test_cli_no_console_stdout.py index 775bccdf..afa53d13 100644 --- a/test/unit/cli/test_cli_no_console_stdout.py +++ 
b/test/unit/cli/test_cli_no_console_stdout.py @@ -109,7 +109,7 @@ def test_run_plugins_empty_config_no_stdout(no_console_base, tmp_path): encoding="utf-8", ) _assert_main_leaves_stdout_empty( - no_console_base + ["run-plugins", "--plugin-configs", str(cfg)], + no_console_base + ["run-plugins", f"--plugin-configs={cfg}"], ) From 476ec2ee7ca05fc5ac135246582da79387528c65 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 20 Apr 2026 09:17:47 -0500 Subject: [PATCH 10/25] updates --- README.md | 23 +++--- nodescraper/cli/cli.py | 88 +++++++++++++++++++++- nodescraper/cli/embed.py | 21 +++++- nodescraper/cli/helper.py | 31 ++++++++ nodescraper/cli_connection_profile.py | 86 +++++++++++++++++++++ nodescraper/connection_profile/__init__.py | 6 ++ nodescraper/connection_profile/loader.py | 22 ++++++ nodescraper/connection_profile/registry.py | 43 +++++++++++ nodescraper/pluginexecutor.py | 20 ++++- test/unit/framework/test_cli_helper.py | 40 +++++++++- 10 files changed, 359 insertions(+), 21 deletions(-) create mode 100644 nodescraper/cli_connection_profile.py create mode 100644 nodescraper/connection_profile/__init__.py create mode 100644 nodescraper/connection_profile/loader.py create mode 100644 nodescraper/connection_profile/registry.py diff --git a/README.md b/README.md index eda8dea4..73852043 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ usage: cli.py [-h] [--version] [--sys-name STRING] [--sys-location {LOCAL,REMOTE}] [--sys-interaction-level {PASSIVE,INTERACTIVE,DISRUPTIVE}] [--sys-sku STRING] [--sys-platform STRING] - [--plugin-configs [STRING ...]] [--system-config STRING] + [--plugin-configs=LIST] [--system-config STRING] [--connection-config STRING] [--log-path STRING] [--log-level {CRITICAL,FATAL,ERROR,WARN,WARNING,INFO,DEBUG,NOTSET}] [--no-console-log] [--gen-reference-config] [--skip-sudo] @@ -112,9 +112,9 @@ options: --sys-sku STRING Manually specify SKU of system (default: None) --sys-platform STRING Specify system platform (default: None) 
- --plugin-configs [STRING ...] - built-in config names or paths to plugin config JSONs. - Available built-in configs: NodeStatus, AllPlugins + --plugin-configs=LIST + Comma-separated built-in names and/or plugin config JSON + paths (equals form only). Built-in: NodeStatus, AllPlugins (default: None) --system-config STRING Path to system config json (default: None) @@ -152,6 +152,8 @@ To use remote execution, specify `--sys-location REMOTE` and provide a connectio node-scraper --sys-name --sys-location REMOTE --connection-config ./connection_config.json run-plugins DmesgPlugin ``` +The file path given to `--connection-config` is JSON. The built-in CLI loads it as a mapping from registered connection manager names to their argument objects, so it may contain **only** the connection blocks below. This repository also defines `nodescraper.connection_profile.load_connection_profile` and the setuptools entry-point group `nodescraper.connection_profile_loaders`; a subclass of `nodescraper.connection_profile.loader.ConnectionProfileLoader` registered there can load a richer document that still includes those blocks plus optional host fields (for example `sys_*`, SSH, or OOB-related entries), depending on the loader you use. + ##### Example: connection_config.json In-band (SSH) connection: @@ -348,7 +350,7 @@ You can extend the built-in error detection with custom regex patterns. Create a Save this to `dmesg_custom_config.json` and run: ```sh -node-scraper --plugin-configs dmesg_custom_config.json run-plugins DmesgPlugin +node-scraper --plugin-configs=dmesg_custom_config.json run-plugins DmesgPlugin ``` #### **'compare-runs' subcommand** @@ -539,8 +541,9 @@ Built-in configs include **NodeStatus** (a subset of plugins) and **AllPlugins** registered plugin with default arguments—useful for generating a reference config from the full system). **NodeStatus plus additional plugins** — built-in configs merge with plugins named after `run-plugins`. 
-Use **`--plugin-configs=`** (equals form): with a space -after `--plugin-configs`. See below for examples: +Use **`--plugin-configs=`** only (never `--plugin-configs` followed by a separate token). Values are +comma-separated, e.g. `--plugin-configs=NodeStatus,/path/extra.json`. +Examples: ```sh node-scraper --plugin-configs=NodeStatus run-plugins PciePlugin ``` @@ -551,7 +554,7 @@ node-scraper --log-path ./logs --plugin-configs=NodeStatus run-plugins PciePlugi Using a JSON file: ```sh -node-scraper --plugin-configs plugin_config.json +node-scraper --plugin-configs=plugin_config.json ``` Here is an example of a comprehensive plugin config that specifies analyzer args for each plugin: ```json @@ -613,7 +616,7 @@ data. **Run all registered plugins (AllPlugins config):** ```sh -node-scraper --plugin-config AllPlugins +node-scraper --plugin-configs=AllPlugins ``` @@ -647,7 +650,7 @@ This will generate the following config: ``` This config can later be used on a different platform for comparison, using the steps at #2: ```sh -node-scraper --plugin-configs reference_config.json +node-scraper --plugin-configs=reference_config.json ``` diff --git a/nodescraper/cli/cli.py b/nodescraper/cli/cli.py index 7447e02a..6073bf79 100644 --- a/nodescraper/cli/cli.py +++ b/nodescraper/cli/cli.py @@ -25,6 +25,7 @@ ############################################################################### import argparse import datetime +import functools import json import logging import os @@ -37,6 +38,7 @@ from nodescraper.cli.constants import DEFAULT_CONFIG, META_VAR_MAP from nodescraper.cli.dynamicparserbuilder import DynamicParserBuilder from nodescraper.cli.helper import ( + drop_skipped_plugins_from_configs, dump_results_to_csv, generate_reference_config, generate_reference_config_from_logs, @@ -63,6 +65,23 @@ from nodescraper.pluginregistry import PluginRegistry +def _parse_plugin_configs_csv(value: str) -> list[str]: + """Split a comma-separated ``--plugin-configs`` value into 
names/paths.""" + return [p.strip() for p in value.split(",") if p.strip()] + + +class _PluginConfigsEqualsAction(argparse.Action): + """Store plugin config list; require ``--plugin-configs=…`` (not ``--plugin-configs …``).""" + + def __call__(self, parser, namespace, values, option_string=None): + if option_string is not None and "=" not in option_string: + parser.error( + "argument --plugin-configs: must use equals form " + "(e.g. --plugin-configs=NodeStatus or --plugin-configs=a.json,b.json)" + ) + setattr(namespace, self.dest, values) + + def build_parser( plugin_reg: PluginRegistry, config_reg: ConfigRegistry, @@ -125,10 +144,14 @@ def build_parser( parser.add_argument( "--plugin-configs", - type=str, - nargs="*", - help=f"built-in config names or paths to plugin config JSONs.\nAvailable built-in configs: {', '.join(config_reg.configs.keys())}", - metavar=META_VAR_MAP[str], + action=_PluginConfigsEqualsAction, + type=_parse_plugin_configs_csv, + default=None, + help=( + "After '=' only: comma-separated built-in names and/or plugin config JSON paths " + f"(e.g. --plugin-configs=NodeStatus,/path/c.json). Built-ins: {', '.join(config_reg.configs.keys())}" + ), + metavar="LIST", ) parser.add_argument( @@ -184,6 +207,19 @@ def build_parser( help="Skip plugins that require sudo permissions", ) + parser.add_argument( + "--skip-plugin", + dest="skip_plugin", + action="extend", + nargs=1, + metavar="PLUGIN", + choices=sorted(plugin_reg.plugins.keys()), + help=( + "Registered plugin class name(s) to exclude from this run (repeat the flag for " + "multiple). Same idea as error-scraper ``--skip-task``." + ), + ) + subparsers = parser.add_subparsers(dest="subcmd", help="Subcommands") subparsers.default = "run-plugins" @@ -334,6 +370,41 @@ def build_parser( return parser, plugin_subparser_map +def _top_level_subcommand_names(root: argparse.ArgumentParser) -> tuple[str, ...]: + """Return ``dest=subcmd`` subparser names from the root CLI parser. 
+ + Args: + root: Parser returned by :func:`build_parser`. + + Returns: + Tuple of top-level subcommand strings. + """ + for action in root._actions: + if isinstance(action, argparse._SubParsersAction) and action.dest == "subcmd": + return tuple(action.choices.keys()) + raise RuntimeError("nodescraper CLI root parser has no subcmd subparsers") + + +@functools.lru_cache(maxsize=1) +def get_cli_top_level_subcommands() -> tuple[str, ...]: + """Return top-level subcommand names from a parser built like :func:`main` (cached). + + Returns: + Tuple of ``subcmd`` subparser names; call ``cache_clear()`` if registries change in-process. + """ + plugin_reg = PluginRegistry() + config_reg = ConfigRegistry() + config_reg.configs["AllPlugins"] = PluginConfig( + name="AllPlugins", + desc="Run all registered plugins with default arguments", + global_args={}, + plugins={name: {} for name in plugin_reg.plugins}, + result_collators={}, + ) + parser, _plugin_subparser_map = build_parser(plugin_reg, config_reg) + return _top_level_subcommand_names(parser) + + def setup_logger( log_level: str = "INFO", log_path: Optional[str] = None, @@ -554,6 +625,15 @@ def main( plugin_subparser_map=plugin_subparser_map, ) + skip_plugin_list = getattr(parsed_args, "skip_plugin", None) or [] + drop_skipped_plugins_from_configs(plugin_config_inst_list, skip_plugin_list) + merged_for_skip_check = PluginExecutor.merge_configs(plugin_config_inst_list) + if not merged_for_skip_check.plugins: + logger.error( + "No plugins remain to run after applying --skip-plugin; check your config and skip list." 
+ ) + sys.exit(2) + if parsed_args.skip_sudo: plugin_config_inst_list[-1].global_args.setdefault("collection_args", {})[ "skip_sudo" diff --git a/nodescraper/cli/embed.py b/nodescraper/cli/embed.py index aa5ad082..ac4d59a9 100644 --- a/nodescraper/cli/embed.py +++ b/nodescraper/cli/embed.py @@ -30,7 +30,24 @@ import argparse from typing import Optional -__all__ = ["run_main_return_code"] +__all__ = ["run_cli_return_code", "run_main_return_code"] + + +def run_cli_return_code( + argv: list[str], + *, + host_cli_args: Optional[argparse.Namespace] = None, +) -> int: + """Run nodescraper in-process; same behavior as :func:`run_main_return_code`. + + Args: + argv: Tokens after the program name. + host_cli_args: Optional host namespace forwarded to :func:`nodescraper.cli.cli.main`. + + Returns: + Integer exit code (``SystemExit`` is mapped, not raised). + """ + return run_main_return_code(argv, host_cli_args=host_cli_args) def run_main_return_code( @@ -38,7 +55,7 @@ def run_main_return_code( *, host_cli_args: Optional[argparse.Namespace] = None, ) -> int: - """Runs the nodescraper main entrypoint and maps SystemExit to an integer return code.""" + """Run :func:`nodescraper.cli.cli.main` and map ``SystemExit`` to an exit code.""" from nodescraper.cli.cli import main try: diff --git a/nodescraper/cli/helper.py b/nodescraper/cli/helper.py index 620c8f38..71e61d6b 100644 --- a/nodescraper/cli/helper.py +++ b/nodescraper/cli/helper.py @@ -30,6 +30,7 @@ import logging import os import sys +from datetime import datetime from pathlib import Path from typing import Optional, Sequence, Tuple @@ -89,6 +90,8 @@ def get_plugin_configs( built_in_configs: dict[str, PluginConfig], parsed_plugin_args: dict[str, argparse.Namespace], plugin_subparser_map: dict[str, tuple[argparse.ArgumentParser, dict]], + global_analysis_range_start: Optional[datetime] = None, + global_analysis_range_end: Optional[datetime] = None, ) -> list[PluginConfig]: """Build list of plugin configs based on input args 
@@ -98,6 +101,8 @@ def get_plugin_configs( built_in_configs (dict[str, PluginConfig]): built-in plugin configs, mapping from config name to PluginConfig instance parsed_plugin_args (dict[str, argparse.Namespace]): parsed plugin arguments, mapping from plugin name to parsed args plugin_subparser_map (dict[str, tuple[argparse.ArgumentParser, dict]]): plugin subparser map, mapping from plugin name to tuple of parser and model type map + global_analysis_range_start: optional global analysis window start (merged into ``global_args.analysis_args``) + global_analysis_range_end: optional global analysis window end (merged into ``global_args.analysis_args``) Raises: argparse.ArgumentTypeError: if system interaction level is invalid @@ -115,6 +120,14 @@ def get_plugin_configs( base_config.global_args["system_interaction_level"] = system_interaction_level + if global_analysis_range_start is not None or global_analysis_range_end is not None: + ar: dict = {} + if global_analysis_range_start is not None: + ar["analysis_range_start"] = global_analysis_range_start + if global_analysis_range_end is not None: + ar["analysis_range_end"] = global_analysis_range_end + base_config.global_args.setdefault("analysis_args", {}).update(ar) + plugin_configs = [base_config] if plugin_config_input: @@ -150,6 +163,24 @@ def get_plugin_configs( return plugin_configs +def drop_skipped_plugins_from_configs( + plugin_configs: list[PluginConfig], + plugins_to_drop: Optional[list[str]], +) -> None: + """Remove named plugins from every ``PluginConfig.plugins`` (in-place). + + Intended for ``--skip-plugin``: after built-in / JSON configs and ``run-plugins`` + selections are merged into a list of :class:`PluginConfig`, drop entries so they + never reach :meth:`~nodescraper.pluginexecutor.PluginExecutor.merge_configs`. 
+ """ + if not plugins_to_drop: + return + drop_set = set(plugins_to_drop) + for cfg in plugin_configs: + for name in drop_set: + cfg.plugins.pop(name, None) + + def build_config( config_reg: ConfigRegistry, plugin_reg: PluginRegistry, diff --git a/nodescraper/cli_connection_profile.py b/nodescraper/cli_connection_profile.py new file mode 100644 index 00000000..84e33db5 --- /dev/null +++ b/nodescraper/cli_connection_profile.py @@ -0,0 +1,86 @@ +# Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. +# +# Drop-in helpers for node-scraper: register neutral CLI flags and resolve host_cli_args. +# Wire into your top-level ArgumentParser and embed entrypoints as needed. + +from __future__ import annotations + +import argparse +from pathlib import Path +from typing import Any + +from nodescraper.pluginregistry import PluginRegistry + + +def apply_host_cli_args_to_parsed_args( + parsed_args: argparse.Namespace, + host_ns: argparse.Namespace | None, +) -> None: + """Copy ``sys_*`` fields from embed/profile namespace onto parsed top-level args.""" + if host_ns is None: + return + for attr in ("sys_name", "sys_location", "sys_sku", "sys_platform"): + if hasattr(host_ns, attr): + val = getattr(host_ns, attr) + if val is not None: + setattr(parsed_args, attr, val) + + +def materialize_connection_config_dict( + parsed_args: argparse.Namespace, + host_ns: argparse.Namespace | None, + plugin_reg: PluginRegistry, +) -> None: + """Set ``parsed_args.connection_config`` (plugin manager name → args) from loaded namespace. + + The entry-point loader returns a namespace that may include: + + * A ``connection_config`` dict (same shape as legacy JSON), and/or + * Top-level keys matching registered :attr:`PluginRegistry.connection_managers` names. + + Values in ``connection_config`` win over same keys from top-level manager attributes. 
+ """ + if host_ns is None: + return + names = set(plugin_reg.connection_managers.keys()) + by_registry = {k: v for k, v in vars(host_ns).items() if k in names and v is not None} + nested = getattr(host_ns, "connection_config", None) + if nested is not None and isinstance(nested, dict): + merged = {**by_registry, **nested} + else: + merged = by_registry or {} + if merged: + parsed_args.connection_config = merged + + +def register_connection_config_loader_arguments(parser: argparse.ArgumentParser) -> None: + """Add ``--connection-config`` (JSON file path) and optional loader entry-point name.""" + parser.add_argument( + "--connection-config", + type=Path, + metavar="PATH", + dest="connection_config_path", + help="JSON file loaded via ``nodescraper.connection_profile_loaders`` (host fields + optional plugin connection dict).", + ) + parser.add_argument( + "--connection-config-loader", + dest="connection_config_loader", + default="amd_error_scraper", + help=argparse.SUPPRESS, + ) + + +def load_connection_config_namespace(args: Any) -> argparse.Namespace | None: + """If ``args.connection_config_path`` is set, run the named loader and return its namespace.""" + path = getattr(args, "connection_config_path", None) + if path is None: + return None + from nodescraper.connection_profile import load_connection_profile + + name = getattr(args, "connection_config_loader", "amd_error_scraper") + loaded = load_connection_profile(Path(path), str(name)) + if not isinstance(loaded, argparse.Namespace): + raise TypeError( + f"Connection profile loader {name!r} must return argparse.Namespace; got {type(loaded)!r}" + ) + return loaded diff --git a/nodescraper/connection_profile/__init__.py b/nodescraper/connection_profile/__init__.py new file mode 100644 index 00000000..62cf3784 --- /dev/null +++ b/nodescraper/connection_profile/__init__.py @@ -0,0 +1,6 @@ +# Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. 
+ +from nodescraper.connection_profile.loader import ConnectionProfileLoader +from nodescraper.connection_profile.registry import load_connection_profile + +__all__ = ["ConnectionProfileLoader", "load_connection_profile"] diff --git a/nodescraper/connection_profile/loader.py b/nodescraper/connection_profile/loader.py new file mode 100644 index 00000000..bace494d --- /dev/null +++ b/nodescraper/connection_profile/loader.py @@ -0,0 +1,22 @@ +# Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. + +from __future__ import annotations + +from abc import ABC, abstractmethod +from pathlib import Path + + +class ConnectionProfileLoader(ABC): + """Load connection-related settings from a JSON file into an object for plugin runs. + + Node-scraper does not interpret the file contents; implementations live in other + distributions and are registered via importlib entry points. + """ + + @abstractmethod + def load(self, path: Path) -> object: + """Read ``path`` and return an object suitable for :attr:`PluginRunInvocation.host_cli_args`. + + Implementations may also attach a ``connection_config`` dict (same shape as + ``--connection-config``) for :class:`~nodescraper.pluginexecutor.PluginExecutor`. + """ diff --git a/nodescraper/connection_profile/registry.py b/nodescraper/connection_profile/registry.py new file mode 100644 index 00000000..a3e2a47f --- /dev/null +++ b/nodescraper/connection_profile/registry.py @@ -0,0 +1,43 @@ +# Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. + +from __future__ import annotations + +import importlib.metadata +from pathlib import Path + +from nodescraper.connection_profile.loader import ConnectionProfileLoader + + +def load_connection_profile(path: Path, loader_name: str) -> object: + """Instantiate the named loader entry point and call :meth:`ConnectionProfileLoader.load`. + + Args: + path: JSON file path. + loader_name: Entry point name under ``nodescraper.connection_profile_loaders``. 
+ + Returns: + Loader return value (often :class:`argparse.Namespace` for use as ``host_cli_args``). + """ + try: + eps = importlib.metadata.entry_points( # type: ignore[call-arg] + group="nodescraper.connection_profile_loaders" + ) + except TypeError: + all_eps = importlib.metadata.entry_points() # type: ignore[assignment] + eps = all_eps.get("nodescraper.connection_profile_loaders", []) # type: ignore[assignment, attr-defined, arg-type] + + matches = [ep for ep in eps if ep.name == loader_name] # type: ignore[attr-defined] + if not matches: + available = [ep.name for ep in eps] # type: ignore[attr-defined] + raise KeyError( + f"No nodescraper.connection_profile_loaders entry named {loader_name!r}; " + f"available: {available}" + ) + loader_cls = matches[0].load() # type: ignore[attr-defined] + if not isinstance(loader_cls, type) or not issubclass(loader_cls, ConnectionProfileLoader): + raise TypeError( + f"Entry point {loader_name!r} must resolve to a subclass of ConnectionProfileLoader; " + f"got {loader_cls!r}" + ) + loader = loader_cls() + return loader.load(path) diff --git a/nodescraper/pluginexecutor.py b/nodescraper/pluginexecutor.py index 1782bb50..973d9d24 100644 --- a/nodescraper/pluginexecutor.py +++ b/nodescraper/pluginexecutor.py @@ -53,6 +53,7 @@ def __init__( logger: Optional[logging.Logger] = None, plugin_registry: Optional[PluginRegistry] = None, log_path: Optional[str] = None, + disable_result_colour: bool = False, ): if logger is None: @@ -70,6 +71,7 @@ def __init__( self.connection_library: dict[type[ConnectionManager], ConnectionManager] = {} self.log_path = log_path + self.disable_result_colour = disable_result_colour self.connection_result_hooks = [] if log_path: @@ -211,8 +213,16 @@ def run_queue(self) -> list[PluginResult]: # Merge analysis_args and collection_args for args_key in ["analysis_args", "collection_args"]: if args_key in global_run_args and args_key in run_payload: - # Merge: global args override plugin-specific args keys 
specified in both global and plugin-specific args - run_payload[args_key].update(global_run_args[args_key]) + if args_key == "analysis_args": + # Global defaults first; per-plugin analysis_args override on key collision. + g = global_run_args[args_key] + g_dict = g if isinstance(g, dict) else {} + p = run_payload.get(args_key) + p_dict = p if isinstance(p, dict) else {} + run_payload[args_key] = {**g_dict, **p_dict} + else: + # collection_args: global keys override plugin (historical behavior). + run_payload[args_key].update(global_run_args[args_key]) del global_run_args[args_key] run_payload.update(global_run_args) except ValueError as ve: @@ -246,13 +256,17 @@ def run_queue(self) -> list[PluginResult]: self.logger.info("Running %s result collator", collator) collator_inst = collator_class(logger=self.logger, log_path=self.log_path) + merged_collator_args = { + **collator_args, + "disable_result_colour": self.disable_result_colour, + } collator_inst.collate_results( plugin_results, [ connection_manager.result for connection_manager in self.connection_library.values() ], - **collator_args, + **merged_collator_args, ) for connection_manager in self.connection_library.values(): connection_manager.disconnect() diff --git a/test/unit/framework/test_cli_helper.py b/test/unit/framework/test_cli_helper.py index 5b88bf7e..3d0cefe7 100644 --- a/test/unit/framework/test_cli_helper.py +++ b/test/unit/framework/test_cli_helper.py @@ -25,6 +25,7 @@ ############################################################################### import argparse import csv +import datetime import json import logging import os @@ -39,6 +40,7 @@ from nodescraper.cli.helper import ( build_config, + drop_skipped_plugins_from_configs, dump_results_to_csv, dump_to_csv, find_datamodel_and_result, @@ -52,9 +54,27 @@ from nodescraper.models import PluginConfig, TaskResult from nodescraper.models.datapluginresult import DataPluginResult from nodescraper.models.pluginresult import PluginResult +from 
nodescraper.pluginexecutor import PluginExecutor from nodescraper.pluginregistry import PluginRegistry +def test_drop_skipped_plugins_from_configs(): + cfg_a = PluginConfig(plugins={"P1": {}, "P2": {}}) + cfg_b = PluginConfig(plugins={"P2": {"collection_args": {"a": 1}}, "P3": {}}) + lst = [ + PluginConfig( + global_args={"system_interaction_level": SystemInteractionLevel.PASSIVE}, + plugins={}, + result_collators={"TableSummary": {}}, + ), + cfg_a, + cfg_b, + ] + drop_skipped_plugins_from_configs(lst, ["P1", "P2"]) + merged = PluginExecutor.merge_configs(lst) + assert merged.plugins == {"P3": {}} + + def test_generate_reference_config(plugin_registry): results = [ PluginResult( @@ -124,6 +144,22 @@ def test_get_plugin_configs(): ] +def test_get_plugin_configs_global_analysis_range(): + dt_s = datetime.datetime(2025, 6, 1, 12, 0, 0, tzinfo=datetime.timezone.utc) + dt_e = datetime.datetime(2025, 6, 2, 12, 0, 0, tzinfo=datetime.timezone.utc) + plugin_configs = get_plugin_configs( + system_interaction_level="PASSIVE", + plugin_config_input=[], + built_in_configs={}, + parsed_plugin_args={}, + plugin_subparser_map={}, + global_analysis_range_start=dt_s, + global_analysis_range_end=dt_e, + ) + assert plugin_configs[0].global_args["analysis_args"]["analysis_range_start"] == dt_s + assert plugin_configs[0].global_args["analysis_args"]["analysis_range_end"] == dt_e + + def test_config_builder(plugin_registry): config = build_config( @@ -261,8 +297,8 @@ def test_generate_summary(tmp_path): subdir = tmp_path / "sub" subdir.mkdir() - errorscraper_path = subdir / "nodescraper.csv" - with open(errorscraper_path, "w", newline="") as f: + nodescraper_csv_path = subdir / "nodescraper.csv" + with open(nodescraper_csv_path, "w", newline="") as f: writer = csv.DictWriter( f, fieldnames=["nodename", "plugin", "status", "timestamp", "message"] ) From e042dd4cb90f29505534a5ae3be38e30fb012b76 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 20 Apr 2026 09:33:28 -0500 Subject: 
[PATCH 11/25] undo commit --- README.md | 23 +++--- nodescraper/cli/cli.py | 88 +--------------------- nodescraper/cli/embed.py | 21 +----- nodescraper/cli/helper.py | 31 -------- nodescraper/cli_connection_profile.py | 86 --------------------- nodescraper/connection_profile/__init__.py | 6 -- nodescraper/connection_profile/loader.py | 22 ------ nodescraper/connection_profile/registry.py | 43 ----------- nodescraper/pluginexecutor.py | 20 +---- test/unit/framework/test_cli_helper.py | 40 +--------- 10 files changed, 21 insertions(+), 359 deletions(-) delete mode 100644 nodescraper/cli_connection_profile.py delete mode 100644 nodescraper/connection_profile/__init__.py delete mode 100644 nodescraper/connection_profile/loader.py delete mode 100644 nodescraper/connection_profile/registry.py diff --git a/README.md b/README.md index 73852043..eda8dea4 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ usage: cli.py [-h] [--version] [--sys-name STRING] [--sys-location {LOCAL,REMOTE}] [--sys-interaction-level {PASSIVE,INTERACTIVE,DISRUPTIVE}] [--sys-sku STRING] [--sys-platform STRING] - [--plugin-configs=LIST] [--system-config STRING] + [--plugin-configs [STRING ...]] [--system-config STRING] [--connection-config STRING] [--log-path STRING] [--log-level {CRITICAL,FATAL,ERROR,WARN,WARNING,INFO,DEBUG,NOTSET}] [--no-console-log] [--gen-reference-config] [--skip-sudo] @@ -112,9 +112,9 @@ options: --sys-sku STRING Manually specify SKU of system (default: None) --sys-platform STRING Specify system platform (default: None) - --plugin-configs=LIST - Comma-separated built-in names and/or plugin config JSON - paths (equals form only). Built-in: NodeStatus, AllPlugins + --plugin-configs [STRING ...] + built-in config names or paths to plugin config JSONs. 
+ Available built-in configs: NodeStatus, AllPlugins (default: None) --system-config STRING Path to system config json (default: None) @@ -152,8 +152,6 @@ To use remote execution, specify `--sys-location REMOTE` and provide a connectio node-scraper --sys-name --sys-location REMOTE --connection-config ./connection_config.json run-plugins DmesgPlugin ``` -The file path given to `--connection-config` is JSON. The built-in CLI loads it as a mapping from registered connection manager names to their argument objects, so it may contain **only** the connection blocks below. This repository also defines `nodescraper.connection_profile.load_connection_profile` and the setuptools entry-point group `nodescraper.connection_profile_loaders`; a subclass of `nodescraper.connection_profile.loader.ConnectionProfileLoader` registered there can load a richer document that still includes those blocks plus optional host fields (for example `sys_*`, SSH, or OOB-related entries), depending on the loader you use. - ##### Example: connection_config.json In-band (SSH) connection: @@ -350,7 +348,7 @@ You can extend the built-in error detection with custom regex patterns. Create a Save this to `dmesg_custom_config.json` and run: ```sh -node-scraper --plugin-configs=dmesg_custom_config.json run-plugins DmesgPlugin +node-scraper --plugin-configs dmesg_custom_config.json run-plugins DmesgPlugin ``` #### **'compare-runs' subcommand** @@ -541,9 +539,8 @@ Built-in configs include **NodeStatus** (a subset of plugins) and **AllPlugins** registered plugin with default arguments—useful for generating a reference config from the full system). **NodeStatus plus additional plugins** — built-in configs merge with plugins named after `run-plugins`. -Use **`--plugin-configs=`** only (never `--plugin-configs` followed by a separate token). Values are -comma-separated, e.g. `--plugin-configs=NodeStatus,/path/extra.json`. 
-Examples: +Use **`--plugin-configs=`** (equals form): with a space +after `--plugin-configs`. See below for examples: ```sh node-scraper --plugin-configs=NodeStatus run-plugins PciePlugin ``` @@ -554,7 +551,7 @@ node-scraper --log-path ./logs --plugin-configs=NodeStatus run-plugins PciePlugi Using a JSON file: ```sh -node-scraper --plugin-configs=plugin_config.json +node-scraper --plugin-configs plugin_config.json ``` Here is an example of a comprehensive plugin config that specifies analyzer args for each plugin: ```json @@ -616,7 +613,7 @@ data. **Run all registered plugins (AllPlugins config):** ```sh -node-scraper --plugin-configs=AllPlugins +node-scraper --plugin-config AllPlugins ``` @@ -650,7 +647,7 @@ This will generate the following config: ``` This config can later be used on a different platform for comparison, using the steps at #2: ```sh -node-scraper --plugin-configs=reference_config.json +node-scraper --plugin-configs reference_config.json ``` diff --git a/nodescraper/cli/cli.py b/nodescraper/cli/cli.py index 6073bf79..7447e02a 100644 --- a/nodescraper/cli/cli.py +++ b/nodescraper/cli/cli.py @@ -25,7 +25,6 @@ ############################################################################### import argparse import datetime -import functools import json import logging import os @@ -38,7 +37,6 @@ from nodescraper.cli.constants import DEFAULT_CONFIG, META_VAR_MAP from nodescraper.cli.dynamicparserbuilder import DynamicParserBuilder from nodescraper.cli.helper import ( - drop_skipped_plugins_from_configs, dump_results_to_csv, generate_reference_config, generate_reference_config_from_logs, @@ -65,23 +63,6 @@ from nodescraper.pluginregistry import PluginRegistry -def _parse_plugin_configs_csv(value: str) -> list[str]: - """Split a comma-separated ``--plugin-configs`` value into names/paths.""" - return [p.strip() for p in value.split(",") if p.strip()] - - -class _PluginConfigsEqualsAction(argparse.Action): - """Store plugin config list; require 
``--plugin-configs=…`` (not ``--plugin-configs …``).""" - - def __call__(self, parser, namespace, values, option_string=None): - if option_string is not None and "=" not in option_string: - parser.error( - "argument --plugin-configs: must use equals form " - "(e.g. --plugin-configs=NodeStatus or --plugin-configs=a.json,b.json)" - ) - setattr(namespace, self.dest, values) - - def build_parser( plugin_reg: PluginRegistry, config_reg: ConfigRegistry, @@ -144,14 +125,10 @@ def build_parser( parser.add_argument( "--plugin-configs", - action=_PluginConfigsEqualsAction, - type=_parse_plugin_configs_csv, - default=None, - help=( - "After '=' only: comma-separated built-in names and/or plugin config JSON paths " - f"(e.g. --plugin-configs=NodeStatus,/path/c.json). Built-ins: {', '.join(config_reg.configs.keys())}" - ), - metavar="LIST", + type=str, + nargs="*", + help=f"built-in config names or paths to plugin config JSONs.\nAvailable built-in configs: {', '.join(config_reg.configs.keys())}", + metavar=META_VAR_MAP[str], ) parser.add_argument( @@ -207,19 +184,6 @@ def build_parser( help="Skip plugins that require sudo permissions", ) - parser.add_argument( - "--skip-plugin", - dest="skip_plugin", - action="extend", - nargs=1, - metavar="PLUGIN", - choices=sorted(plugin_reg.plugins.keys()), - help=( - "Registered plugin class name(s) to exclude from this run (repeat the flag for " - "multiple). Same idea as error-scraper ``--skip-task``." - ), - ) - subparsers = parser.add_subparsers(dest="subcmd", help="Subcommands") subparsers.default = "run-plugins" @@ -370,41 +334,6 @@ def build_parser( return parser, plugin_subparser_map -def _top_level_subcommand_names(root: argparse.ArgumentParser) -> tuple[str, ...]: - """Return ``dest=subcmd`` subparser names from the root CLI parser. - - Args: - root: Parser returned by :func:`build_parser`. - - Returns: - Tuple of top-level subcommand strings. 
- """ - for action in root._actions: - if isinstance(action, argparse._SubParsersAction) and action.dest == "subcmd": - return tuple(action.choices.keys()) - raise RuntimeError("nodescraper CLI root parser has no subcmd subparsers") - - -@functools.lru_cache(maxsize=1) -def get_cli_top_level_subcommands() -> tuple[str, ...]: - """Return top-level subcommand names from a parser built like :func:`main` (cached). - - Returns: - Tuple of ``subcmd`` subparser names; call ``cache_clear()`` if registries change in-process. - """ - plugin_reg = PluginRegistry() - config_reg = ConfigRegistry() - config_reg.configs["AllPlugins"] = PluginConfig( - name="AllPlugins", - desc="Run all registered plugins with default arguments", - global_args={}, - plugins={name: {} for name in plugin_reg.plugins}, - result_collators={}, - ) - parser, _plugin_subparser_map = build_parser(plugin_reg, config_reg) - return _top_level_subcommand_names(parser) - - def setup_logger( log_level: str = "INFO", log_path: Optional[str] = None, @@ -625,15 +554,6 @@ def main( plugin_subparser_map=plugin_subparser_map, ) - skip_plugin_list = getattr(parsed_args, "skip_plugin", None) or [] - drop_skipped_plugins_from_configs(plugin_config_inst_list, skip_plugin_list) - merged_for_skip_check = PluginExecutor.merge_configs(plugin_config_inst_list) - if not merged_for_skip_check.plugins: - logger.error( - "No plugins remain to run after applying --skip-plugin; check your config and skip list." 
- ) - sys.exit(2) - if parsed_args.skip_sudo: plugin_config_inst_list[-1].global_args.setdefault("collection_args", {})[ "skip_sudo" diff --git a/nodescraper/cli/embed.py b/nodescraper/cli/embed.py index ac4d59a9..aa5ad082 100644 --- a/nodescraper/cli/embed.py +++ b/nodescraper/cli/embed.py @@ -30,24 +30,7 @@ import argparse from typing import Optional -__all__ = ["run_cli_return_code", "run_main_return_code"] - - -def run_cli_return_code( - argv: list[str], - *, - host_cli_args: Optional[argparse.Namespace] = None, -) -> int: - """Run nodescraper in-process; same behavior as :func:`run_main_return_code`. - - Args: - argv: Tokens after the program name. - host_cli_args: Optional host namespace forwarded to :func:`nodescraper.cli.cli.main`. - - Returns: - Integer exit code (``SystemExit`` is mapped, not raised). - """ - return run_main_return_code(argv, host_cli_args=host_cli_args) +__all__ = ["run_main_return_code"] def run_main_return_code( @@ -55,7 +38,7 @@ def run_main_return_code( *, host_cli_args: Optional[argparse.Namespace] = None, ) -> int: - """Run :func:`nodescraper.cli.cli.main` and map ``SystemExit`` to an exit code.""" + """Runs the nodescraper main entrypoint and maps SystemExit to an integer return code.""" from nodescraper.cli.cli import main try: diff --git a/nodescraper/cli/helper.py b/nodescraper/cli/helper.py index 71e61d6b..620c8f38 100644 --- a/nodescraper/cli/helper.py +++ b/nodescraper/cli/helper.py @@ -30,7 +30,6 @@ import logging import os import sys -from datetime import datetime from pathlib import Path from typing import Optional, Sequence, Tuple @@ -90,8 +89,6 @@ def get_plugin_configs( built_in_configs: dict[str, PluginConfig], parsed_plugin_args: dict[str, argparse.Namespace], plugin_subparser_map: dict[str, tuple[argparse.ArgumentParser, dict]], - global_analysis_range_start: Optional[datetime] = None, - global_analysis_range_end: Optional[datetime] = None, ) -> list[PluginConfig]: """Build list of plugin configs based on input args 
@@ -101,8 +98,6 @@ def get_plugin_configs( built_in_configs (dict[str, PluginConfig]): built-in plugin configs, mapping from config name to PluginConfig instance parsed_plugin_args (dict[str, argparse.Namespace]): parsed plugin arguments, mapping from plugin name to parsed args plugin_subparser_map (dict[str, tuple[argparse.ArgumentParser, dict]]): plugin subparser map, mapping from plugin name to tuple of parser and model type map - global_analysis_range_start: optional global analysis window start (merged into ``global_args.analysis_args``) - global_analysis_range_end: optional global analysis window end (merged into ``global_args.analysis_args``) Raises: argparse.ArgumentTypeError: if system interaction level is invalid @@ -120,14 +115,6 @@ def get_plugin_configs( base_config.global_args["system_interaction_level"] = system_interaction_level - if global_analysis_range_start is not None or global_analysis_range_end is not None: - ar: dict = {} - if global_analysis_range_start is not None: - ar["analysis_range_start"] = global_analysis_range_start - if global_analysis_range_end is not None: - ar["analysis_range_end"] = global_analysis_range_end - base_config.global_args.setdefault("analysis_args", {}).update(ar) - plugin_configs = [base_config] if plugin_config_input: @@ -163,24 +150,6 @@ def get_plugin_configs( return plugin_configs -def drop_skipped_plugins_from_configs( - plugin_configs: list[PluginConfig], - plugins_to_drop: Optional[list[str]], -) -> None: - """Remove named plugins from every ``PluginConfig.plugins`` (in-place). - - Intended for ``--skip-plugin``: after built-in / JSON configs and ``run-plugins`` - selections are merged into a list of :class:`PluginConfig`, drop entries so they - never reach :meth:`~nodescraper.pluginexecutor.PluginExecutor.merge_configs`. 
- """ - if not plugins_to_drop: - return - drop_set = set(plugins_to_drop) - for cfg in plugin_configs: - for name in drop_set: - cfg.plugins.pop(name, None) - - def build_config( config_reg: ConfigRegistry, plugin_reg: PluginRegistry, diff --git a/nodescraper/cli_connection_profile.py b/nodescraper/cli_connection_profile.py deleted file mode 100644 index 84e33db5..00000000 --- a/nodescraper/cli_connection_profile.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. -# -# Drop-in helpers for node-scraper: register neutral CLI flags and resolve host_cli_args. -# Wire into your top-level ArgumentParser and embed entrypoints as needed. - -from __future__ import annotations - -import argparse -from pathlib import Path -from typing import Any - -from nodescraper.pluginregistry import PluginRegistry - - -def apply_host_cli_args_to_parsed_args( - parsed_args: argparse.Namespace, - host_ns: argparse.Namespace | None, -) -> None: - """Copy ``sys_*`` fields from embed/profile namespace onto parsed top-level args.""" - if host_ns is None: - return - for attr in ("sys_name", "sys_location", "sys_sku", "sys_platform"): - if hasattr(host_ns, attr): - val = getattr(host_ns, attr) - if val is not None: - setattr(parsed_args, attr, val) - - -def materialize_connection_config_dict( - parsed_args: argparse.Namespace, - host_ns: argparse.Namespace | None, - plugin_reg: PluginRegistry, -) -> None: - """Set ``parsed_args.connection_config`` (plugin manager name → args) from loaded namespace. - - The entry-point loader returns a namespace that may include: - - * A ``connection_config`` dict (same shape as legacy JSON), and/or - * Top-level keys matching registered :attr:`PluginRegistry.connection_managers` names. - - Values in ``connection_config`` win over same keys from top-level manager attributes. 
- """ - if host_ns is None: - return - names = set(plugin_reg.connection_managers.keys()) - by_registry = {k: v for k, v in vars(host_ns).items() if k in names and v is not None} - nested = getattr(host_ns, "connection_config", None) - if nested is not None and isinstance(nested, dict): - merged = {**by_registry, **nested} - else: - merged = by_registry or {} - if merged: - parsed_args.connection_config = merged - - -def register_connection_config_loader_arguments(parser: argparse.ArgumentParser) -> None: - """Add ``--connection-config`` (JSON file path) and optional loader entry-point name.""" - parser.add_argument( - "--connection-config", - type=Path, - metavar="PATH", - dest="connection_config_path", - help="JSON file loaded via ``nodescraper.connection_profile_loaders`` (host fields + optional plugin connection dict).", - ) - parser.add_argument( - "--connection-config-loader", - dest="connection_config_loader", - default="amd_error_scraper", - help=argparse.SUPPRESS, - ) - - -def load_connection_config_namespace(args: Any) -> argparse.Namespace | None: - """If ``args.connection_config_path`` is set, run the named loader and return its namespace.""" - path = getattr(args, "connection_config_path", None) - if path is None: - return None - from nodescraper.connection_profile import load_connection_profile - - name = getattr(args, "connection_config_loader", "amd_error_scraper") - loaded = load_connection_profile(Path(path), str(name)) - if not isinstance(loaded, argparse.Namespace): - raise TypeError( - f"Connection profile loader {name!r} must return argparse.Namespace; got {type(loaded)!r}" - ) - return loaded diff --git a/nodescraper/connection_profile/__init__.py b/nodescraper/connection_profile/__init__.py deleted file mode 100644 index 62cf3784..00000000 --- a/nodescraper/connection_profile/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. 
- -from nodescraper.connection_profile.loader import ConnectionProfileLoader -from nodescraper.connection_profile.registry import load_connection_profile - -__all__ = ["ConnectionProfileLoader", "load_connection_profile"] diff --git a/nodescraper/connection_profile/loader.py b/nodescraper/connection_profile/loader.py deleted file mode 100644 index bace494d..00000000 --- a/nodescraper/connection_profile/loader.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. - -from __future__ import annotations - -from abc import ABC, abstractmethod -from pathlib import Path - - -class ConnectionProfileLoader(ABC): - """Load connection-related settings from a JSON file into an object for plugin runs. - - Node-scraper does not interpret the file contents; implementations live in other - distributions and are registered via importlib entry points. - """ - - @abstractmethod - def load(self, path: Path) -> object: - """Read ``path`` and return an object suitable for :attr:`PluginRunInvocation.host_cli_args`. - - Implementations may also attach a ``connection_config`` dict (same shape as - ``--connection-config``) for :class:`~nodescraper.pluginexecutor.PluginExecutor`. - """ diff --git a/nodescraper/connection_profile/registry.py b/nodescraper/connection_profile/registry.py deleted file mode 100644 index a3e2a47f..00000000 --- a/nodescraper/connection_profile/registry.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. - -from __future__ import annotations - -import importlib.metadata -from pathlib import Path - -from nodescraper.connection_profile.loader import ConnectionProfileLoader - - -def load_connection_profile(path: Path, loader_name: str) -> object: - """Instantiate the named loader entry point and call :meth:`ConnectionProfileLoader.load`. - - Args: - path: JSON file path. - loader_name: Entry point name under ``nodescraper.connection_profile_loaders``. 
- - Returns: - Loader return value (often :class:`argparse.Namespace` for use as ``host_cli_args``). - """ - try: - eps = importlib.metadata.entry_points( # type: ignore[call-arg] - group="nodescraper.connection_profile_loaders" - ) - except TypeError: - all_eps = importlib.metadata.entry_points() # type: ignore[assignment] - eps = all_eps.get("nodescraper.connection_profile_loaders", []) # type: ignore[assignment, attr-defined, arg-type] - - matches = [ep for ep in eps if ep.name == loader_name] # type: ignore[attr-defined] - if not matches: - available = [ep.name for ep in eps] # type: ignore[attr-defined] - raise KeyError( - f"No nodescraper.connection_profile_loaders entry named {loader_name!r}; " - f"available: {available}" - ) - loader_cls = matches[0].load() # type: ignore[attr-defined] - if not isinstance(loader_cls, type) or not issubclass(loader_cls, ConnectionProfileLoader): - raise TypeError( - f"Entry point {loader_name!r} must resolve to a subclass of ConnectionProfileLoader; " - f"got {loader_cls!r}" - ) - loader = loader_cls() - return loader.load(path) diff --git a/nodescraper/pluginexecutor.py b/nodescraper/pluginexecutor.py index 973d9d24..1782bb50 100644 --- a/nodescraper/pluginexecutor.py +++ b/nodescraper/pluginexecutor.py @@ -53,7 +53,6 @@ def __init__( logger: Optional[logging.Logger] = None, plugin_registry: Optional[PluginRegistry] = None, log_path: Optional[str] = None, - disable_result_colour: bool = False, ): if logger is None: @@ -71,7 +70,6 @@ def __init__( self.connection_library: dict[type[ConnectionManager], ConnectionManager] = {} self.log_path = log_path - self.disable_result_colour = disable_result_colour self.connection_result_hooks = [] if log_path: @@ -213,16 +211,8 @@ def run_queue(self) -> list[PluginResult]: # Merge analysis_args and collection_args for args_key in ["analysis_args", "collection_args"]: if args_key in global_run_args and args_key in run_payload: - if args_key == "analysis_args": - # Global defaults first; 
per-plugin analysis_args override on key collision. - g = global_run_args[args_key] - g_dict = g if isinstance(g, dict) else {} - p = run_payload.get(args_key) - p_dict = p if isinstance(p, dict) else {} - run_payload[args_key] = {**g_dict, **p_dict} - else: - # collection_args: global keys override plugin (historical behavior). - run_payload[args_key].update(global_run_args[args_key]) + # Merge: global args override plugin-specific args keys specified in both global and plugin-specific args + run_payload[args_key].update(global_run_args[args_key]) del global_run_args[args_key] run_payload.update(global_run_args) except ValueError as ve: @@ -256,17 +246,13 @@ def run_queue(self) -> list[PluginResult]: self.logger.info("Running %s result collator", collator) collator_inst = collator_class(logger=self.logger, log_path=self.log_path) - merged_collator_args = { - **collator_args, - "disable_result_colour": self.disable_result_colour, - } collator_inst.collate_results( plugin_results, [ connection_manager.result for connection_manager in self.connection_library.values() ], - **merged_collator_args, + **collator_args, ) for connection_manager in self.connection_library.values(): connection_manager.disconnect() diff --git a/test/unit/framework/test_cli_helper.py b/test/unit/framework/test_cli_helper.py index 3d0cefe7..5b88bf7e 100644 --- a/test/unit/framework/test_cli_helper.py +++ b/test/unit/framework/test_cli_helper.py @@ -25,7 +25,6 @@ ############################################################################### import argparse import csv -import datetime import json import logging import os @@ -40,7 +39,6 @@ from nodescraper.cli.helper import ( build_config, - drop_skipped_plugins_from_configs, dump_results_to_csv, dump_to_csv, find_datamodel_and_result, @@ -54,27 +52,9 @@ from nodescraper.models import PluginConfig, TaskResult from nodescraper.models.datapluginresult import DataPluginResult from nodescraper.models.pluginresult import PluginResult -from 
nodescraper.pluginexecutor import PluginExecutor from nodescraper.pluginregistry import PluginRegistry -def test_drop_skipped_plugins_from_configs(): - cfg_a = PluginConfig(plugins={"P1": {}, "P2": {}}) - cfg_b = PluginConfig(plugins={"P2": {"collection_args": {"a": 1}}, "P3": {}}) - lst = [ - PluginConfig( - global_args={"system_interaction_level": SystemInteractionLevel.PASSIVE}, - plugins={}, - result_collators={"TableSummary": {}}, - ), - cfg_a, - cfg_b, - ] - drop_skipped_plugins_from_configs(lst, ["P1", "P2"]) - merged = PluginExecutor.merge_configs(lst) - assert merged.plugins == {"P3": {}} - - def test_generate_reference_config(plugin_registry): results = [ PluginResult( @@ -144,22 +124,6 @@ def test_get_plugin_configs(): ] -def test_get_plugin_configs_global_analysis_range(): - dt_s = datetime.datetime(2025, 6, 1, 12, 0, 0, tzinfo=datetime.timezone.utc) - dt_e = datetime.datetime(2025, 6, 2, 12, 0, 0, tzinfo=datetime.timezone.utc) - plugin_configs = get_plugin_configs( - system_interaction_level="PASSIVE", - plugin_config_input=[], - built_in_configs={}, - parsed_plugin_args={}, - plugin_subparser_map={}, - global_analysis_range_start=dt_s, - global_analysis_range_end=dt_e, - ) - assert plugin_configs[0].global_args["analysis_args"]["analysis_range_start"] == dt_s - assert plugin_configs[0].global_args["analysis_args"]["analysis_range_end"] == dt_e - - def test_config_builder(plugin_registry): config = build_config( @@ -297,8 +261,8 @@ def test_generate_summary(tmp_path): subdir = tmp_path / "sub" subdir.mkdir() - nodescraper_csv_path = subdir / "nodescraper.csv" - with open(nodescraper_csv_path, "w", newline="") as f: + errorscraper_path = subdir / "nodescraper.csv" + with open(errorscraper_path, "w", newline="") as f: writer = csv.DictWriter( f, fieldnames=["nodename", "plugin", "status", "timestamp", "message"] ) From a3b791ed07ee2f408beb1b606f05ce9b7e7c3f8a Mon Sep 17 00:00:00 2001 From: niratner Date: Mon, 20 Apr 2026 16:15:05 -0400 Subject: [PATCH 
12/25] Created a new input to the dmesg analyzer allowing for a list of rules which can change the priority of regex events which the given rules. --- .../plugins/inband/dmesg/analyzer_args.py | 10 ++ .../plugins/inband/dmesg/dmesg_analyzer.py | 65 ++++++++ test/unit/plugin/test_dmesg_analyzer.py | 154 ++++++++++++++++++ 3 files changed, 229 insertions(+) diff --git a/nodescraper/plugins/inband/dmesg/analyzer_args.py b/nodescraper/plugins/inband/dmesg/analyzer_args.py index cd9ba765..b68aec27 100644 --- a/nodescraper/plugins/inband/dmesg/analyzer_args.py +++ b/nodescraper/plugins/inband/dmesg/analyzer_args.py @@ -52,3 +52,13 @@ class DmesgAnalyzerArgs(TimeRangeAnalysisArgs): default=None, description="Custom error regex patterns; each item can be ErrorRegex or dict with category/pattern.", ) + priority_override_rules: Optional[list[dict]] = Field( + default=None, + description=( + "Rules to override the priority of matched ErrorRegex objects. " + "Each rule is a dict where all keys except 'new_priority' and 'match_all' " + "are filter fields matched against ErrorRegex attributes. " + "'new_priority' must be an EventPriority name (e.g. 'WARNING', 'ERROR') " + "or 'NO_CHANGE' to leave the priority unchanged." + ), + ) diff --git a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py index ccfe9ce0..f63d56bb 100644 --- a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +++ b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py @@ -535,6 +535,62 @@ def _norm(s: str) -> str: return True return False + def resolve_priority( + self, + regex_obj: ErrorRegex, + priority_override_rules: list[dict], + ) -> EventPriority | None: + """ + Walk the priority_override_rules in order (first-match-wins). + All keys in each rule except 'new_priority' and 'match_all' are treated + as filter fields compared against ErrorRegex attributes. Filter values + may be a single value or a list of values (match if any value matches). 
+ Enum fields are compared by their name. Returns the overriding + EventPriority, or None to keep the original. + + Example rule format: + { + "message": ["mode 1 reset failed", "mode 2 reset failed"], + "new_priority": "NO_CHANGE" + } + { + "event_category": "RAS", + "new_priority": "WARNING" + } + """ + + _NO_CHANGE = "NO_CHANGE" + _EXCLUDED_KEYS = {"new_priority", "match_all"} + + for rule in priority_override_rules: + filter_fields = {key: value for key, value in rule.items() if key not in _EXCLUDED_KEYS} + + matched = True + # check for matches in all fields of the current rule + for field, filter_value in filter_fields.items(): + obj_value = getattr(regex_obj, field, None) + + # Normalize enum values to their name for string comparison + if hasattr(obj_value, "name"): + obj_value = obj_value.name + + if isinstance(filter_value, list): + if obj_value not in filter_value: + matched = False + break + else: + if obj_value != filter_value: + matched = False + break + + if matched: # return on encountering first fully matched rule + new_priority = rule.get("new_priority", _NO_CHANGE) + if new_priority == _NO_CHANGE: + return None + return EventPriority[new_priority] + + return None # if no rules are matched, return None + def analyze_data( self, data: DmesgData, @@ -555,6 +611,15 @@ def analyze_data( final_error_regex = self._convert_and_extend_error_regex(args.error_regex, self.ERROR_REGEX) + if args.priority_override_rules: + updated_regex = [] + for regex_obj in final_error_regex: + new_priority = self.resolve_priority(regex_obj, args.priority_override_rules) + if new_priority is not None: + regex_obj = regex_obj.model_copy(update={"event_priority": new_priority}) + updated_regex.append(regex_obj) + final_error_regex = updated_regex + if args.analysis_range_start or args.analysis_range_end: self.logger.info( "Filtering dmesg using range %s - %s", diff --git a/test/unit/plugin/test_dmesg_analyzer.py b/test/unit/plugin/test_dmesg_analyzer.py index 
c14b090c..27ff231d 100644 --- a/test/unit/plugin/test_dmesg_analyzer.py +++ b/test/unit/plugin/test_dmesg_analyzer.py @@ -25,7 +25,10 @@ ############################################################################### import datetime import pathlib +import re +from nodescraper.base.regexanalyzer import ErrorRegex +from nodescraper.enums.eventcategory import EventCategory from nodescraper.enums.eventpriority import EventPriority from nodescraper.enums.executionstatus import ExecutionStatus from nodescraper.plugins.inband.dmesg.analyzer_args import DmesgAnalyzerArgs @@ -708,6 +711,157 @@ def test_custom_regex_empty_list(system_info): assert res.events[0].description == "Out of memory error" +def test_resolve_priority_no_match(system_info): + """No rule matches → returns None (keep original priority).""" + analyzer = DmesgAnalyzer(system_info=system_info) + regex_obj = ErrorRegex( + regex=re.compile(r"GPU reset failed"), + message="GPU reset failed", + event_category=EventCategory.RAS, + ) + rules = [{"event_category": "SW_DRIVER", "new_priority": "WARNING"}] + assert analyzer.resolve_priority(regex_obj, rules) is None + + +def test_resolve_priority_match_by_category(system_info): + """Rule with event_category filter matches and returns the new priority.""" + analyzer = DmesgAnalyzer(system_info=system_info) + regex_obj = ErrorRegex( + regex=re.compile(r"GPU reset failed"), + message="GPU reset failed", + event_category=EventCategory.RAS, + ) + rules = [{"event_category": "RAS", "new_priority": "WARNING"}] + result = analyzer.resolve_priority(regex_obj, rules) + assert result == EventPriority.WARNING + + +def test_resolve_priority_match_by_message_list(system_info): + """Rule with a list for message matches when the object's message is in the list.""" + analyzer = DmesgAnalyzer(system_info=system_info) + regex_obj = ErrorRegex( + regex=re.compile(r"Mode2 reset failed"), + message="Mode 2 Reset Failed", + event_category=EventCategory.RAS, + ) + rules = [ + { + 
"message": ["Mode 2 Reset Failed", "GPU reset failed"], + "new_priority": "WARNING", + } + ] + result = analyzer.resolve_priority(regex_obj, rules) + assert result == EventPriority.WARNING + + +def test_resolve_priority_no_change(system_info): + """new_priority=NO_CHANGE → returns None (keep original priority).""" + analyzer = DmesgAnalyzer(system_info=system_info) + regex_obj = ErrorRegex( + regex=re.compile(r"GPU reset failed"), + message="GPU reset failed", + event_category=EventCategory.RAS, + ) + rules = [{"event_category": "RAS", "new_priority": "NO_CHANGE"}] + assert analyzer.resolve_priority(regex_obj, rules) is None + + +def test_resolve_priority_first_match_wins(system_info): + """First matching rule wins; subsequent matching rules are ignored.""" + analyzer = DmesgAnalyzer(system_info=system_info) + regex_obj = ErrorRegex( + regex=re.compile(r"GPU reset failed"), + message="GPU reset failed", + event_category=EventCategory.RAS, + ) + rules = [ + {"event_category": "RAS", "new_priority": "WARNING"}, + {"event_category": "RAS", "new_priority": "ERROR"}, + ] + result = analyzer.resolve_priority(regex_obj, rules) + assert result == EventPriority.WARNING + + +def test_resolve_priority_multiple_filter_fields(system_info): + """All filter fields must match (AND logic).""" + analyzer = DmesgAnalyzer(system_info=system_info) + # Matches both category AND message + regex_obj = ErrorRegex( + regex=re.compile(r"GPU reset failed"), + message="GPU reset failed", + event_category=EventCategory.RAS, + ) + rules = [ + {"event_category": "RAS", "message": "GPU reset failed", "new_priority": "WARNING"}, + ] + assert analyzer.resolve_priority(regex_obj, rules) == EventPriority.WARNING + + # Does NOT match because message differs + rules_mismatch = [ + {"event_category": "RAS", "message": "ACA Error", "new_priority": "WARNING"}, + ] + assert analyzer.resolve_priority(regex_obj, rules_mismatch) is None + + +def test_priority_override_rules_in_analyze_data(system_info): + 
"""priority_override_rules passed via DmesgAnalyzerArgs overrides matched regex priorities.""" + dmesg_data = DmesgData( + dmesg_content=( + # RAS event — default ERROR, should become WARNING + "kern :err : 2024-10-07T10:17:15,145363-04:00 " + "amdgpu 0000:0c:00.0: amdgpu: socket: 4 1 correctable hardware errors detected in total in gfx block\n" + # SW_DRIVER event — default ERROR, should stay ERROR (no matching rule) + "kern :err : 2024-10-07T10:17:15,145363-04:00 IO_PAGE_FAULT\n" + ) + ) + + analyzer = DmesgAnalyzer(system_info=system_info) + res = analyzer.analyze_data( + dmesg_data, + args=DmesgAnalyzerArgs( + check_unknown_dmesg_errors=False, + priority_override_rules=[ + {"event_category": "RAS", "new_priority": "WARNING"}, + ], + ), + ) + + assert res.status == ExecutionStatus.ERROR + ras_events = [e for e in res.events if e.category == "RAS"] + sw_events = [e for e in res.events if e.category == "SW_DRIVER"] + + assert all( + e.priority == EventPriority.WARNING for e in ras_events + ), f"Expected all RAS events to be WARNING, got {[e.priority for e in ras_events]}" + assert all( + e.priority == EventPriority.ERROR for e in sw_events + ), f"Expected SW_DRIVER events to remain ERROR, got {[e.priority for e in sw_events]}" + + +def test_priority_override_no_change_keeps_original(system_info): + """NO_CHANGE rule leaves the original event priority intact.""" + dmesg_data = DmesgData( + dmesg_content=( + "kern :err : 2024-10-07T10:17:15,145363-04:00 " + "amdgpu 0000:0c:00.0: amdgpu: socket: 4 1 correctable hardware errors detected in total in gfx block\n" + ) + ) + + analyzer = DmesgAnalyzer(system_info=system_info) + res = analyzer.analyze_data( + dmesg_data, + args=DmesgAnalyzerArgs( + check_unknown_dmesg_errors=False, + priority_override_rules=[ + {"event_category": "RAS", "new_priority": "NO_CHANGE"}, + ], + ), + ) + + assert len(res.events) == 1 + assert res.events[0].priority == EventPriority.ERROR + + def 
test_custom_regex_with_multiline_pattern(system_info): """Test custom regex that should NOT match across multiple dmesg lines (each line processed separately)""" dmesg_data = DmesgData( From f91b146b927a495e6fb8f15c5d51faaca7d18422 Mon Sep 17 00:00:00 2001 From: niratner Date: Mon, 20 Apr 2026 16:50:43 -0400 Subject: [PATCH 13/25] Added 'match_all' flag to dmesg analyzer priority_override_rules to allow a rule to match everything. Can be used as a default rule. --- .../plugins/inband/dmesg/dmesg_analyzer.py | 34 +++++----- test/unit/plugin/test_dmesg_analyzer.py | 63 +++++++++++++++++++ 2 files changed, 81 insertions(+), 16 deletions(-) diff --git a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py index f63d56bb..7fae9c07 100644 --- a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +++ b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py @@ -566,22 +566,24 @@ def resolve_priority( filter_fields = {key: value for key, value in rule.items() if key not in _EXCLUDED_KEYS} matched = True - # check for matches in all fields of the current rule - for field, filter_value in filter_fields.items(): - obj_value = getattr(regex_obj, field, None) - - # Normalize enum values to their name for string comparison - if hasattr(obj_value, "name"): - obj_value = obj_value.name - - if isinstance(filter_value, list): - if obj_value not in filter_value: - matched = False - break - else: - if obj_value != filter_value: - matched = False - break + # if match_all is True, don't check attributes, simply move to priority update + if rule.get("match_all", False) is False: + # check for matches in all fields of the current rule + for field, filter_value in filter_fields.items(): + obj_value = getattr(regex_obj, field, None) + + # Normalize enum values to their name for string comparison + if hasattr(obj_value, "name"): + obj_value = obj_value.name + + if isinstance(filter_value, list): + if obj_value not in filter_value: + matched = 
False + break + else: + if obj_value != filter_value: + matched = False + break if matched: # return on encountering first fully matched rule new_priority = rule.get("new_priority", _NO_CHANGE) diff --git a/test/unit/plugin/test_dmesg_analyzer.py b/test/unit/plugin/test_dmesg_analyzer.py index 27ff231d..968c0b04 100644 --- a/test/unit/plugin/test_dmesg_analyzer.py +++ b/test/unit/plugin/test_dmesg_analyzer.py @@ -803,6 +803,69 @@ def test_resolve_priority_multiple_filter_fields(system_info): assert analyzer.resolve_priority(regex_obj, rules_mismatch) is None +def test_resolve_priority_match_all_matches_any_regex(system_info): + """match_all=True with no other filter fields always matches any ErrorRegex.""" + analyzer = DmesgAnalyzer(system_info=system_info) + for regex_obj in [ + ErrorRegex( + regex=re.compile(r"GPU reset failed"), + message="GPU reset failed", + event_category=EventCategory.RAS, + ), + ErrorRegex( + regex=re.compile(r"IO_PAGE_FAULT"), + message="I/O Page Fault", + event_category=EventCategory.SW_DRIVER, + ), + ]: + result = analyzer.resolve_priority( + regex_obj, [{"match_all": True, "new_priority": "WARNING"}] + ) + assert ( + result == EventPriority.WARNING + ), f"Expected WARNING for {regex_obj.message}, got {result}" + + +def test_resolve_priority_match_all_ignores_non_matching_filters(system_info): + """match_all=True ignores filter fields that would otherwise not match.""" + analyzer = DmesgAnalyzer(system_info=system_info) + regex_obj = ErrorRegex( + regex=re.compile(r"GPU reset failed"), + message="GPU reset failed", + event_category=EventCategory.RAS, + ) + # event_category is RAS, but filter says SW_DRIVER — would normally NOT match. + # match_all=True should bypass this check and still apply the rule. 
+ result = analyzer.resolve_priority( + regex_obj, + [{"match_all": True, "event_category": "SW_DRIVER", "new_priority": "WARNING"}], + ) + assert result == EventPriority.WARNING + + +def test_resolve_priority_match_all_false_still_filters(system_info): + """match_all=False (explicit) falls through to normal filter logic.""" + analyzer = DmesgAnalyzer(system_info=system_info) + regex_obj = ErrorRegex( + regex=re.compile(r"GPU reset failed"), + message="GPU reset failed", + event_category=EventCategory.RAS, + ) + # match_all=False with a non-matching filter → should NOT match + result = analyzer.resolve_priority( + regex_obj, + [{"match_all": False, "event_category": "SW_DRIVER", "new_priority": "WARNING"}], + ) + assert result is None + + # match_all=False with a matching filter → should match + result = analyzer.resolve_priority( + regex_obj, + [{"match_all": False, "event_category": "RAS", "new_priority": "WARNING"}], + ) + assert result == EventPriority.WARNING + + def test_priority_override_rules_in_analyze_data(system_info): """priority_override_rules passed via DmesgAnalyzerArgs overrides matched regex priorities.""" dmesg_data = DmesgData( From 89727a4f3009481d8afb4d2e822f0bcda121c202 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 21 Apr 2026 09:55:26 -0500 Subject: [PATCH 14/25] updates --- nodescraper/cli/cli.py | 93 ++++++++++++------- nodescraper/cli/host_cli_embed.py | 84 +++++++++++++++++ .../cli/test_build_global_argument_parser.py | 18 ++++ test/unit/cli/test_host_cli_embed.py | 71 ++++++++++++++ test/unit/cli/test_plugin_configs_cli.py | 37 ++++++++ 5 files changed, 269 insertions(+), 34 deletions(-) create mode 100644 nodescraper/cli/host_cli_embed.py create mode 100644 test/unit/cli/test_build_global_argument_parser.py create mode 100644 test/unit/cli/test_host_cli_embed.py create mode 100644 test/unit/cli/test_plugin_configs_cli.py diff --git a/nodescraper/cli/cli.py b/nodescraper/cli/cli.py index 26a30aa3..054c2c5b 100644 --- 
a/nodescraper/cli/cli.py +++ b/nodescraper/cli/cli.py @@ -49,6 +49,10 @@ parse_gen_plugin_config, process_args, ) +from nodescraper.cli.host_cli_embed import ( + apply_host_cli_args_to_parsed_args, + merge_plugin_connection_config_from_host_ns, +) from nodescraper.cli.inputargtypes import ModelArgHandler, json_arg, log_path_arg from nodescraper.cli.invocation import run_plugin_queue_with_invocation from nodescraper.configregistry import ConfigRegistry @@ -69,24 +73,25 @@ def _parse_plugin_configs_csv(value: str) -> list[str]: return [p.strip() for p in value.split(",") if p.strip()] -def build_parser( - plugin_reg: PluginRegistry, - config_reg: ConfigRegistry, -) -> tuple[argparse.ArgumentParser, dict[str, tuple[argparse.ArgumentParser, dict]]]: - """Build an argument parser - - Args: - plugin_reg (PluginRegistry): registry of plugins - - Returns: - tuple[argparse.ArgumentParser, dict[str, tuple[argparse.ArgumentParser, dict]]]: tuple containing main - parser and subparsers for each plugin module - """ - parser = argparse.ArgumentParser( - description="node scraper CLI", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, +def _config_registry_with_all_plugins(plugin_reg: PluginRegistry) -> ConfigRegistry: + """Synthetic ``AllPlugins`` config used for CLI help and :func:`build_global_argument_parser`.""" + config_reg = ConfigRegistry() + config_reg.configs["AllPlugins"] = PluginConfig( + name="AllPlugins", + desc="Run all registered plugins with default arguments", + global_args={}, + plugins={name: {} for name in plugin_reg.plugins}, + result_collators={}, ) + return config_reg + +def _add_cli_root_globals( + parser: argparse.ArgumentParser, + plugin_reg: PluginRegistry, + config_reg: ConfigRegistry, +) -> None: + """Register top-level flags before ``subcmd`` subparsers (shared with :func:`build_global_argument_parser`).""" parser.add_argument( "--version", action="version", @@ -193,6 +198,40 @@ def build_parser( help="Skip plugins that require sudo 
permissions", ) + +def build_global_argument_parser(*, add_help: bool = True) -> argparse.ArgumentParser: + """Globals only (no subcommands), for host CLIs such as amd-error-scraper ``error-scraper``.""" + plugin_reg = PluginRegistry() + config_reg = _config_registry_with_all_plugins(plugin_reg) + parser = argparse.ArgumentParser( + description="node scraper CLI (global options only)", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + add_help=add_help, + ) + _add_cli_root_globals(parser, plugin_reg, config_reg) + return parser + + +def build_parser( + plugin_reg: PluginRegistry, + config_reg: ConfigRegistry, +) -> tuple[argparse.ArgumentParser, dict[str, tuple[argparse.ArgumentParser, dict]]]: + """Build an argument parser + + Args: + plugin_reg (PluginRegistry): registry of plugins + + Returns: + tuple[argparse.ArgumentParser, dict[str, tuple[argparse.ArgumentParser, dict]]]: tuple containing main + parser and subparsers for each plugin module + """ + parser = argparse.ArgumentParser( + description="node scraper CLI", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + _add_cli_root_globals(parser, plugin_reg, config_reg) + subparsers = parser.add_subparsers(dest="subcmd", help="Subcommands") subparsers.default = "run-plugins" @@ -366,14 +405,7 @@ def get_cli_top_level_subcommands() -> tuple[str, ...]: Tuple of ``subcmd`` subparser names; call ``cache_clear()`` if registries change in-process. 
""" plugin_reg = PluginRegistry() - config_reg = ConfigRegistry() - config_reg.configs["AllPlugins"] = PluginConfig( - name="AllPlugins", - desc="Run all registered plugins with default arguments", - global_args={}, - plugins={name: {} for name in plugin_reg.plugins}, - result_collators={}, - ) + config_reg = _config_registry_with_all_plugins(plugin_reg) parser, _plugin_subparser_map = build_parser(plugin_reg, config_reg) return _top_level_subcommand_names(parser) @@ -441,16 +473,7 @@ def main( arg_input = sys.argv[1:] plugin_reg = PluginRegistry() - - config_reg = ConfigRegistry() - # Add synthetic "AllPlugins" config that includes every registered plugin - config_reg.configs["AllPlugins"] = PluginConfig( - name="AllPlugins", - desc="Run all registered plugins with default arguments", - global_args={}, - plugins={name: {} for name in plugin_reg.plugins}, - result_collators={}, - ) + config_reg = _config_registry_with_all_plugins(plugin_reg) parser, plugin_subparser_map = build_parser(plugin_reg, config_reg) try: @@ -459,6 +482,8 @@ def main( ) parsed_args = parser.parse_args(top_level_args) + apply_host_cli_args_to_parsed_args(parsed_args, host_cli_args) + merge_plugin_connection_config_from_host_ns(parsed_args, host_cli_args) system_info = get_system_info(parsed_args) sname = system_info.name.lower().replace("-", "_").replace(".", "_") timestamp = datetime.datetime.now().strftime("%Y_%m_%d-%I_%M_%S_%p") diff --git a/nodescraper/cli/host_cli_embed.py b/nodescraper/cli/host_cli_embed.py new file mode 100644 index 00000000..bffeb378 --- /dev/null +++ b/nodescraper/cli/host_cli_embed.py @@ -0,0 +1,84 @@ +############################################################################### +# +# MIT License +# +# Copyright (C) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### + +from __future__ import annotations + +import argparse +from typing import Optional + +__all__ = [ + "apply_host_cli_args_to_parsed_args", + "merge_plugin_connection_config_from_host_ns", +] + + +def apply_host_cli_args_to_parsed_args( + parsed_args: argparse.Namespace, + host_ns: Optional[argparse.Namespace], +) -> None: + """Copy host profile fields from an embedding host onto parsed top-level args. + + Used when ``main(..., host_cli_args=...)`` is invoked (e.g. from the + error-scraper wrapper) so ``--connection-config`` profile data loaded by the + host is visible to :func:`get_system_info` and the rest of the CLI. 
+ """ + if host_ns is None: + return + for attr in ( + "sys_name", + "sys_location", + "sys_sku", + "sys_platform", + "sys_interaction_level", + ): + if hasattr(host_ns, attr): + val = getattr(host_ns, attr) + if val is not None: + setattr(parsed_args, attr, val) + + +def merge_plugin_connection_config_from_host_ns( + parsed_args: argparse.Namespace, + host_ns: Optional[argparse.Namespace], +) -> None: + """Merge plugin ``connection_config`` from the host namespace into ``parsed_args``. + + The same key shape as ``--connection-config`` on the node-scraper argv: + connection manager class name (string) → args dict. When both the host + profile and the CLI supply ``connection_config``, **CLI** values win for + duplicate manager keys. + """ + if host_ns is None: + return + from_host = getattr(host_ns, "connection_config", None) + if not from_host: + return + if not isinstance(from_host, dict): + return + from_cli = getattr(parsed_args, "connection_config", None) or {} + if not isinstance(from_cli, dict): + from_cli = {} + parsed_args.connection_config = {**from_host, **from_cli} diff --git a/test/unit/cli/test_build_global_argument_parser.py b/test/unit/cli/test_build_global_argument_parser.py new file mode 100644 index 00000000..33489e9c --- /dev/null +++ b/test/unit/cli/test_build_global_argument_parser.py @@ -0,0 +1,18 @@ +# Copyright (C) 2025 Advanced Micro Devices, Inc. 
+ +from __future__ import annotations + +from nodescraper.cli.cli import build_global_argument_parser + + +def test_build_global_argument_parser_leaves_subcommand_in_unknown() -> None: + p = build_global_argument_parser(add_help=False) + ns, rest = p.parse_known_args(["--sys-name", "sut.example", "run-plugins", "DmesgPlugin"]) + assert ns.sys_name == "sut.example" + assert rest == ["run-plugins", "DmesgPlugin"] + + +def test_build_global_argument_parser_has_no_subcmd_default() -> None: + p = build_global_argument_parser(add_help=False) + ns, _ = p.parse_known_args([]) + assert getattr(ns, "subcmd", None) is None diff --git a/test/unit/cli/test_host_cli_embed.py b/test/unit/cli/test_host_cli_embed.py new file mode 100644 index 00000000..232fc2ee --- /dev/null +++ b/test/unit/cli/test_host_cli_embed.py @@ -0,0 +1,71 @@ +############################################################################### +# +# MIT License +# +# Copyright (C) 2026 Advanced Micro Devices, Inc. +# +############################################################################### + +from __future__ import annotations + +import argparse + +from nodescraper.cli.host_cli_embed import ( + apply_host_cli_args_to_parsed_args, + merge_plugin_connection_config_from_host_ns, +) + + +def test_apply_host_cli_args_to_parsed_args_copies_sys_fields() -> None: + parsed = argparse.Namespace( + sys_name="client", + sys_location="LOCAL", + sys_sku=None, + sys_platform=None, + sys_interaction_level="INTERACTIVE", + ) + host = argparse.Namespace( + sys_name="sut.example.com", + sys_location="REMOTE", + sys_sku="MI450", + sys_platform="whitman", + sys_interaction_level="STANDARD", + ) + apply_host_cli_args_to_parsed_args(parsed, host) + assert parsed.sys_name == "sut.example.com" + assert parsed.sys_location == "REMOTE" + assert parsed.sys_sku == "MI450" + assert parsed.sys_platform == "whitman" + assert parsed.sys_interaction_level == "STANDARD" + + +def test_apply_host_cli_args_to_parsed_args_noop_without_host() 
-> None: + parsed = argparse.Namespace(sys_name="x") + apply_host_cli_args_to_parsed_args(parsed, None) + assert parsed.sys_name == "x" + + +def test_merge_plugin_connection_config_from_host_ns_host_first_cli_wins() -> None: + parsed = argparse.Namespace( + connection_config={"InBandConnectionManager": {"hostname": "cli-host"}} + ) + host = argparse.Namespace( + connection_config={ + "InBandConnectionManager": {"hostname": "host-host"}, + "RedfishConnectionManager": {"host": "10.0.0.1"}, + } + ) + merge_plugin_connection_config_from_host_ns(parsed, host) + assert parsed.connection_config["InBandConnectionManager"]["hostname"] == "cli-host" + assert parsed.connection_config["RedfishConnectionManager"]["host"] == "10.0.0.1" + + +def test_merge_plugin_connection_config_from_host_ns_host_only() -> None: + parsed = argparse.Namespace(connection_config=None) + host = argparse.Namespace( + connection_config={"InBandConnectionManager": {"hostname": "h", "username": "u"}} + ) + merge_plugin_connection_config_from_host_ns(parsed, host) + assert parsed.connection_config == { + "InBandConnectionManager": {"hostname": "h", "username": "u"} + } diff --git a/test/unit/cli/test_plugin_configs_cli.py b/test/unit/cli/test_plugin_configs_cli.py new file mode 100644 index 00000000..5c4af37b --- /dev/null +++ b/test/unit/cli/test_plugin_configs_cli.py @@ -0,0 +1,37 @@ +############################################################################### +# +# MIT License +# +# Copyright (C) 2026 Advanced Micro Devices, Inc. 
+# +############################################################################### + +from __future__ import annotations + +from nodescraper.cli.cli import build_parser +from nodescraper.configregistry import ConfigRegistry +from nodescraper.models import PluginConfig +from nodescraper.pluginregistry import PluginRegistry + + +def _parser(): + plugin_reg = PluginRegistry() + config_reg = ConfigRegistry() + config_reg.configs["AllPlugins"] = PluginConfig( + name="AllPlugins", + desc="Run all registered plugins with default arguments", + global_args={}, + plugins={name: {} for name in plugin_reg.plugins}, + result_collators={}, + ) + return build_parser(plugin_reg, config_reg)[0] + + +def test_plugin_configs_equals_form_parses_csv() -> None: + ns = _parser().parse_args(["--plugin-configs=NodeStatus,AllPlugins"]) + assert ns.plugin_configs == ["NodeStatus", "AllPlugins"] + + +def test_plugin_configs_space_separated_parses() -> None: + ns = _parser().parse_args(["--plugin-configs", "NodeStatus,AllPlugins"]) + assert ns.plugin_configs == ["NodeStatus", "AllPlugins"] From 3787fe0d0bc3b0f9a1f4cc72f56db102669175b3 Mon Sep 17 00:00:00 2001 From: niratner Date: Tue, 21 Apr 2026 11:42:10 -0400 Subject: [PATCH 15/25] updated syntax of optional return value in dmesg plugin analyzer function --- nodescraper/plugins/inband/dmesg/dmesg_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py index 7fae9c07..65dc668d 100644 --- a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +++ b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py @@ -539,7 +539,7 @@ def resolve_priority( self, regex_obj: ErrorRegex, priority_override_rules: list[dict], - ) -> EventPriority | None: + ) -> Optional[EventPriority]: """ Walk the priority_override_rules in order (first-match-wins). 
All keys in each rule except 'new_priority' and 'match_all' are treated From 66faa69ee6d2f449a1de985553e0bc588da67999 Mon Sep 17 00:00:00 2001 From: niratner Date: Tue, 21 Apr 2026 13:54:08 -0400 Subject: [PATCH 16/25] Updated README with usage example, updated resolve_priority docstring and changed return from None to the original priority if it doesn't change --- README.md | 10 +++++++ .../plugins/inband/dmesg/dmesg_analyzer.py | 30 ++++++++++++------- test/unit/plugin/test_dmesg_analyzer.py | 16 +++++----- 3 files changed, 37 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index eda8dea4..ebc95b8f 100644 --- a/README.md +++ b/README.md @@ -337,6 +337,16 @@ You can extend the built-in error detection with custom regex patterns. Create a "event_category": "SW_DRIVER", "event_priority": 4 } + ], + "priority_override_rules": [ + { + "message": "Application Crash", + "new_priority": "ERROR" + }, + { + "event_category": "SW_DRIVER", + "new_priority": "WARNING" + } ] } } diff --git a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py index 65dc668d..a790cfd0 100644 --- a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +++ b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py @@ -539,14 +539,14 @@ def resolve_priority( self, regex_obj: ErrorRegex, priority_override_rules: list[dict], - ) -> Optional[EventPriority]: - """ + ) -> EventPriority: + """Determine the new priority of an ErrorRegex based on provided rules + Walk the priority_override_rules in order (first-match-wins). - All keys in each rule except 'new_priority' and 'match_all' are treated - as filter fields compared against ErrorRegex attributes. Filter values - may be a single value or a list of values (match if any value matches). - Enum fields are compared by their name. Returns the overriding - EventPriority, or None to keep the original. + Each rule should be a dict with only these keys allowed: + 1. 
Any attribute of an ErrorRegex object by which to filter. Currently these include "regex", "message", "event_category", "event_priority". This key should match to a string or a list (match if any value in the list matches). + 2. "new_priority": str. The string value of any EventPriority enum, or "NO_CHANGE", to determine the updated priority of the regex_obj if it matches the given rule. + 3. "match_all": bool. Determines if the rule will automatically match for any regex_obj. Will ignore any provided filters if given. Example rule format: { @@ -557,11 +557,20 @@ def resolve_priority( "event_category": "RAS", "new_priority": "WARNING" } + + Args: + regex_obj (ErrorRegex): The ErrorRegex object to have its priority updated + priority_override_rules (list[dict]): The list of rules which determine what the updated priority should be + + Returns: + EventPriority: The new priority of the event. Returns the original priority if no rule matches or the matched rule specifies NO_CHANGE """ _NO_CHANGE = "NO_CHANGE" _EXCLUDED_KEYS = {"new_priority", "match_all"} + current_priority = regex_obj.event_priority + for rule in priority_override_rules: filter_fields = {key: value for key, value in rule.items() if key not in _EXCLUDED_KEYS} @@ -588,10 +597,10 @@ def resolve_priority( if matched: # return on encountering first fully matched rule new_priority = rule.get("new_priority", _NO_CHANGE) if new_priority == _NO_CHANGE: - return None + return current_priority return EventPriority[new_priority] - return None # if no rules are matched, return None + return current_priority # if no rules are matched, keep the current priority def analyze_data( self, @@ -617,8 +626,7 @@ def analyze_data( updated_regex = [] for regex_obj in final_error_regex: new_priority = self.resolve_priority(regex_obj, args.priority_override_rules) - if new_priority is not None: - regex_obj = regex_obj.model_copy(update={"event_priority": new_priority}) + regex_obj = regex_obj.model_copy(update={"event_priority": 
new_priority}) updated_regex.append(regex_obj) final_error_regex = updated_regex diff --git a/test/unit/plugin/test_dmesg_analyzer.py b/test/unit/plugin/test_dmesg_analyzer.py index 968c0b04..7aaeb850 100644 --- a/test/unit/plugin/test_dmesg_analyzer.py +++ b/test/unit/plugin/test_dmesg_analyzer.py @@ -712,7 +712,7 @@ def test_custom_regex_empty_list(system_info): def test_resolve_priority_no_match(system_info): - """No rule matches → returns None (keep original priority).""" + """No rule matches → returns the original priority unchanged.""" analyzer = DmesgAnalyzer(system_info=system_info) regex_obj = ErrorRegex( regex=re.compile(r"GPU reset failed"), @@ -720,7 +720,7 @@ def test_resolve_priority_no_match(system_info): event_category=EventCategory.RAS, ) rules = [{"event_category": "SW_DRIVER", "new_priority": "WARNING"}] - assert analyzer.resolve_priority(regex_obj, rules) is None + assert analyzer.resolve_priority(regex_obj, rules) == EventPriority.ERROR def test_resolve_priority_match_by_category(system_info): @@ -755,7 +755,7 @@ def test_resolve_priority_match_by_message_list(system_info): def test_resolve_priority_no_change(system_info): - """new_priority=NO_CHANGE → returns None (keep original priority).""" + """new_priority=NO_CHANGE → returns the original priority unchanged.""" analyzer = DmesgAnalyzer(system_info=system_info) regex_obj = ErrorRegex( regex=re.compile(r"GPU reset failed"), @@ -763,7 +763,7 @@ def test_resolve_priority_no_change(system_info): event_category=EventCategory.RAS, ) rules = [{"event_category": "RAS", "new_priority": "NO_CHANGE"}] - assert analyzer.resolve_priority(regex_obj, rules) is None + assert analyzer.resolve_priority(regex_obj, rules) == EventPriority.ERROR def test_resolve_priority_first_match_wins(system_info): @@ -796,11 +796,11 @@ def test_resolve_priority_multiple_filter_fields(system_info): ] assert analyzer.resolve_priority(regex_obj, rules) == EventPriority.WARNING - # Does NOT match because message differs + # 
Does NOT match because message differs → returns original priority rules_mismatch = [ {"event_category": "RAS", "message": "ACA Error", "new_priority": "WARNING"}, ] - assert analyzer.resolve_priority(regex_obj, rules_mismatch) is None + assert analyzer.resolve_priority(regex_obj, rules_mismatch) == EventPriority.ERROR def test_resolve_priority_match_all_matches_any_regex(system_info): @@ -851,12 +851,12 @@ def test_resolve_priority_match_all_false_still_filters(system_info): message="GPU reset failed", event_category=EventCategory.RAS, ) - # match_all=False with a non-matching filter → should NOT match + # match_all=False with a non-matching filter → returns original priority result = analyzer.resolve_priority( regex_obj, [{"match_all": False, "event_category": "SW_DRIVER", "new_priority": "WARNING"}], ) - assert result is None + assert result == EventPriority.ERROR # match_all=False with a matching filter → should match result = analyzer.resolve_priority( From de3df446812cf1f0f21031e38156441b7e8dcfcf Mon Sep 17 00:00:00 2001 From: Jaspal Singh Date: Tue, 21 Apr 2026 21:11:36 +0000 Subject: [PATCH 17/25] nodescraper/plugins/inband/rdma/rdma_collector.py --- nodescraper/plugins/inband/rdma/rdmadata.py | 379 ++++++++++++++++++-- test/unit/plugin/test_rdma_analyzer.py | 187 ++++++---- 2 files changed, 467 insertions(+), 99 deletions(-) diff --git a/nodescraper/plugins/inband/rdma/rdmadata.py b/nodescraper/plugins/inband/rdma/rdmadata.py index cb26b5b1..965cb5b5 100644 --- a/nodescraper/plugins/inband/rdma/rdmadata.py +++ b/nodescraper/plugins/inband/rdma/rdmadata.py @@ -23,44 +23,365 @@ # SOFTWARE. 
# ############################################################################### -from typing import Optional +from typing import ClassVar, Optional, Union -from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic import BaseModel, Field, model_validator from typing_extensions import Self from nodescraper.models import DataModel -class RdmaDevice(BaseModel): - """RDMA device from 'rdma dev' (text output).""" +class PollaraRdmaStatistics(BaseModel): + """ifname ionic""" - device: str - node_type: Optional[str] = None - transport: Optional[str] = None - node_guid: Optional[str] = None - sys_image_guid: Optional[str] = None - state: Optional[str] = None - attributes: dict[str, str] = Field(default_factory=dict) + tx_rdma_ucast_bytes: Optional[int] = None + tx_rdma_ucast_pkts: Optional[int] = None + tx_rdma_mcast_bytes: Optional[int] = None + tx_rdma_mcast_pkts: Optional[int] = None + tx_rdma_cnp_pkts: Optional[int] = None + rx_rdma_ucast_bytes: Optional[int] = None + rx_rdma_ucast_pkts: Optional[int] = None + rx_rdma_mcast_bytes: Optional[int] = None + rx_rdma_mcast_pkts: Optional[int] = None + rx_rdma_cnp_pkts: Optional[int] = None + rx_rdma_ecn_pkts: Optional[int] = None + req_rx_pkt_seq_err: Optional[int] = None + req_rx_rnr_retry_err: Optional[int] = None + req_rx_rmt_acc_err: Optional[int] = None + req_rx_rmt_req_err: Optional[int] = None + req_rx_oper_err: Optional[int] = None + req_rx_impl_nak_seq_err: Optional[int] = None + req_rx_cqe_err: Optional[int] = None + req_rx_cqe_flush: Optional[int] = None + req_rx_dup_response: Optional[int] = None + req_rx_inval_pkts: Optional[int] = None + req_tx_loc_acc_err: Optional[int] = None + req_tx_loc_oper_err: Optional[int] = None + req_tx_mem_mgmt_err: Optional[int] = None + req_tx_retry_excd_err: Optional[int] = None + req_tx_loc_sgl_inv_err: Optional[int] = None + resp_rx_dup_request: Optional[int] = None + resp_rx_outof_buf: Optional[int] = None + resp_rx_outouf_seq: Optional[int] = None + 
resp_rx_cqe_err: Optional[int] = None + resp_rx_cqe_flush: Optional[int] = None + resp_rx_loc_len_err: Optional[int] = None + resp_rx_inval_request: Optional[int] = None + resp_rx_loc_oper_err: Optional[int] = None + resp_rx_outof_atomic: Optional[int] = None + resp_tx_pkt_seq_err: Optional[int] = None + resp_tx_rmt_inval_req_err: Optional[int] = None + resp_tx_rmt_acc_err: Optional[int] = None + resp_tx_rmt_oper_err: Optional[int] = None + resp_tx_rnr_retry_err: Optional[int] = None + resp_tx_loc_sgl_inv_err: Optional[int] = None + resp_rx_s0_table_err: Optional[int] = None + tx_rdma_ccl_cts_bytes: Optional[int] = None + tx_rdma_ccl_cts_pkts: Optional[int] = None + rx_rdma_ccl_cts_bytes: Optional[int] = None + rx_rdma_ccl_cts_pkts: Optional[int] = None + resp_rx_ccl_cts_outouf_seq: Optional[int] = None + tx_rdma_ack_timeout: Optional[int] = None + tx_rdma_ccl_cts_ack_timeout: Optional[int] = None + tx_rdma_retx_bytes: Optional[int] = None + tx_rdma_retx_pkts: Optional[int] = None + tx_rdma_ccl_cts_retx_bytes: Optional[int] = None + tx_rdma_ccl_cts_retx_pkts: Optional[int] = None + rx_rdma_mtu_discard_pkts: Optional[int] = None + error_fields: ClassVar[list[str]] = [ + "req_rx_pkt_seq_err", + "req_rx_rnr_retry_err", + "req_rx_rmt_acc_err", + "req_rx_rmt_req_err", + "req_rx_oper_err", + "req_rx_impl_nak_seq_err", + "req_rx_cqe_err", + "req_rx_cqe_flush", + "req_rx_dup_response", + "req_rx_inval_pkts", + "req_tx_loc_acc_err", + "req_tx_loc_oper_err", + "req_tx_mem_mgmt_err", + "req_tx_retry_excd_err", + "req_tx_loc_sgl_inv_err", + "resp_rx_dup_request", + "resp_rx_outof_buf", + "resp_rx_outouf_seq", + "resp_rx_cqe_err", + "resp_rx_cqe_flush", + "resp_rx_loc_len_err", + "resp_rx_inval_request", + "resp_rx_loc_oper_err", + "resp_rx_outof_atomic", + "resp_tx_pkt_seq_err", + "resp_tx_rmt_inval_req_err", + "resp_tx_rmt_acc_err", + "resp_tx_rmt_oper_err", + "resp_tx_rnr_retry_err", + "resp_tx_loc_sgl_inv_err", + "resp_rx_s0_table_err", + "resp_rx_ccl_cts_outouf_seq", + 
"tx_rdma_ack_timeout", + "tx_rdma_ccl_cts_ack_timeout", + "tx_rdma_retx_bytes", + "tx_rdma_retx_pkts", + "tx_rdma_ccl_cts_retx_bytes", + "tx_rdma_ccl_cts_retx_pkts", + "rx_rdma_mtu_discard_pkts", + ] -class RdmaStatistics(BaseModel): - """RDMA statistic entry from 'rdma statistic -j'.""" + critial_error_fields: ClassVar[list[str]] = [] + + +class Thor2RdmaStatistics(BaseModel): + """ifname bnxt""" + + active_pds: Optional[int] = None + active_ahs: Optional[int] = None + active_qps: Optional[int] = None + active_rc_qps: Optional[int] = None + active_ud_qps: Optional[int] = None + active_srqs: Optional[int] = None + active_cqs: Optional[int] = None + active_mrs: Optional[int] = None + active_mws: Optional[int] = None + watermark_pds: Optional[int] = None + watermark_ahs: Optional[int] = None + watermark_qps: Optional[int] = None + watermark_rc_qps: Optional[int] = None + watermark_ud_qps: Optional[int] = None + watermark_srqs: Optional[int] = None + watermark_cqs: Optional[int] = None + watermark_mrs: Optional[int] = None + watermark_mws: Optional[int] = None + rx_pkts: Optional[int] = None + rx_bytes: Optional[int] = None + tx_pkts: Optional[int] = None + tx_bytes: Optional[int] = None + recoverable_errors: Optional[int] = None + tx_roce_errors: Optional[int] = None + tx_roce_discards: Optional[int] = None + rx_roce_errors: Optional[int] = None + rx_roce_discards: Optional[int] = None + local_ack_timeout_err: Optional[int] = None + packet_seq_err: Optional[int] = None + max_retry_exceeded: Optional[int] = None + rnr_nak_retry_err: Optional[int] = None + implied_nak_seq_err: Optional[int] = None + unrecoverable_err: Optional[int] = None + bad_resp_err: Optional[int] = None + local_qp_op_err: Optional[int] = None + local_protection_err: Optional[int] = None + mem_mgmt_op_err: Optional[int] = None + req_remote_invalid_request: Optional[int] = None + req_remote_access_errors: Optional[int] = None + remote_op_err: Optional[int] = None + duplicate_request: Optional[int] = 
None + res_exceed_max: Optional[int] = None + resp_local_length_error: Optional[int] = None + res_exceeds_wqe: Optional[int] = None + res_opcode_err: Optional[int] = None + res_rx_invalid_rkey: Optional[int] = None + res_rx_domain_err: Optional[int] = None + res_rx_no_perm: Optional[int] = None + res_rx_range_err: Optional[int] = None + res_tx_invalid_rkey: Optional[int] = None + res_tx_domain_err: Optional[int] = None + res_tx_no_perm: Optional[int] = None + res_tx_range_err: Optional[int] = None + res_irrq_oflow: Optional[int] = None + res_unsup_opcode: Optional[int] = None + res_unaligned_atomic: Optional[int] = None + res_rem_inv_err: Optional[int] = None + res_mem_err: Optional[int] = None + res_srq_err: Optional[int] = None + res_cmp_err: Optional[int] = None + res_invalid_dup_rkey: Optional[int] = None + res_wqe_format_err: Optional[int] = None + res_cq_load_err: Optional[int] = None + res_srq_load_err: Optional[int] = None + res_tx_pci_err: Optional[int] = None + res_rx_pci_err: Optional[int] = None + tx_atomic_req: Optional[int] = None + tx_read_req: Optional[int] = None + tx_read_resp: Optional[int] = None + tx_write_req: Optional[int] = None + tx_send_req: Optional[int] = None + rx_atomic_requests: Optional[int] = None + rx_read_requests: Optional[int] = None + rx_read_resp: Optional[int] = None + rx_write_requests: Optional[int] = None + rx_send_req: Optional[int] = None + rx_good_pkts: Optional[int] = None + rx_good_bytes: Optional[int] = None + out_of_buffer: Optional[int] = None + np_cnp_sent: Optional[int] = None + rp_cnp_handled: Optional[int] = None + np_ecn_marked_roce_packets: Optional[int] = None + out_of_sequence: Optional[int] = None + pacing_reschedule: Optional[int] = None + pacing_complete: Optional[int] = None + pacing_alerts: Optional[int] = None + db_fifo_register: Optional[int] = None + req_cqe_error: Optional[int] = None + req_cqe_flush_error: Optional[int] = None + resp_cqe_error: Optional[int] = None + resp_cqe_flush_error: 
Optional[int] = None + resp_remote_access_errors: Optional[int] = None + roce_adp_retrans: Optional[int] = None + roce_adp_retrans_to: Optional[int] = None + roce_slow_restart: Optional[int] = None + roce_slow_restart_cnps: Optional[int] = None + roce_slow_restart_trans: Optional[int] = None + rp_cnp_ignored: Optional[int] = None + rx_icrc_encapsulated: Optional[int] = None + + error_fields: ClassVar[list[str]] = [ + "recoverable_errors", + "tx_roce_errors", + "tx_roce_discards", + "rx_roce_errors", + "rx_roce_discards", + "local_ack_timeout_err", + "packet_seq_err", + "max_retry_exceeded", + "rnr_nak_retry_err", + "implied_nak_seq_err", + "bad_resp_err", + "local_qp_op_err", + "local_protection_err", + "mem_mgmt_op_err", + "req_remote_invalid_request", + "req_remote_access_errors", + "remote_op_err", + "duplicate_request", + "res_exceed_max", + "resp_local_length_error", + "res_exceeds_wqe", + "res_opcode_err", + "res_rx_invalid_rkey", + "res_rx_domain_err", + "res_rx_no_perm", + "res_rx_range_err", + "res_tx_invalid_rkey", + "res_tx_domain_err", + "res_tx_no_perm", + "res_tx_range_err", + "res_irrq_oflow", + "res_unsup_opcode", + "res_unaligned_atomic", + "res_rem_inv_err", + "res_srq_err", + "res_cmp_err", + "res_invalid_dup_rkey", + "res_wqe_format_err", + "res_cq_load_err", + "res_srq_load_err", + "out_of_buffer", + "out_of_sequence", + "req_cqe_error", + "req_cqe_flush_error", + "resp_cqe_error", + "resp_cqe_flush_error", + "resp_remote_access_errors", + "roce_adp_retrans", + "roce_adp_retrans_to", + "rp_cnp_ignored", + "rx_icrc_encapsulated", + ] + + critial_error_fields: ClassVar[list[str]] = [ + "unrecoverable_err", + "res_tx_pci_err", + "res_rx_pci_err", + "res_mem_err", + ] + + +class Cx7RdmaStatistics(BaseModel): + """ifname mlx""" + + rx_write_requests: Optional[int] = None + rx_read_requests: Optional[int] = None + rx_atomic_requests: Optional[int] = None + rx_dct_connect: Optional[int] = None + out_of_buffer: Optional[int] = None + out_of_sequence: 
Optional[int] = None + duplicate_request: Optional[int] = None + rnr_nak_retry_err: Optional[int] = None + packet_seq_err: Optional[int] = None + implied_nak_seq_err: Optional[int] = None + local_ack_timeout_err: Optional[int] = None + resp_local_length_error: Optional[int] = None + resp_cqe_error: Optional[int] = None + req_cqe_error: Optional[int] = None + req_remote_invalid_request: Optional[int] = None + req_remote_access_errors: Optional[int] = None + resp_remote_access_errors: Optional[int] = None + resp_cqe_flush_error: Optional[int] = None + req_cqe_flush_error: Optional[int] = None + roce_adp_retrans: Optional[int] = None + roce_adp_retrans_to: Optional[int] = None + roce_slow_restart: Optional[int] = None + roce_slow_restart_cnps: Optional[int] = None + roce_slow_restart_trans: Optional[int] = None + rp_cnp_ignored: Optional[int] = None + rp_cnp_handled: Optional[int] = None + np_ecn_marked_roce_packets: Optional[int] = None + np_cnp_sent: Optional[int] = None + rx_icrc_encapsulated: Optional[int] = None + + error_fields: ClassVar[list[str]] = [ + "out_of_buffer", + "out_of_sequence", + "duplicate_request", + "rnr_nak_retry_err", + "packet_seq_err", + "implied_nak_seq_err", + "local_ack_timeout_err", + "resp_local_length_error", + "resp_cqe_error", + "req_cqe_error", + "req_remote_invalid_request", + "req_remote_access_errors", + "resp_remote_access_errors", + "resp_cqe_flush_error", + "req_cqe_flush_error", + "roce_adp_retrans", + "roce_adp_retrans_to", + "rp_cnp_ignored", + "rx_icrc_encapsulated", + ] - model_config = ConfigDict(extra="allow") + critial_error_fields: ClassVar[list[str]] = [] + +RdmaVendorStatistics = Union[PollaraRdmaStatistics, Thor2RdmaStatistics, Cx7RdmaStatistics] + +# Map ifname prefixes to vendor-specific statistic models +VENDOR_PREFIX_MAP: dict[str, type[RdmaVendorStatistics]] = { + "ionic": PollaraRdmaStatistics, + "bnxt": Thor2RdmaStatistics, + "mlx": Cx7RdmaStatistics, +} + + +class RdmaStatistics(BaseModel): + # Interface 
information ifname: Optional[str] = None port: Optional[int] = None + vendor_statistics: Optional[RdmaVendorStatistics] = None @model_validator(mode="after") - def validate_at_least_one_field(self) -> Self: + def validate_atleast_one_field(self) -> Self: if not self.model_fields_set: raise ValueError("At least one field must be set in RdmaStatistics") return self class RdmaLink(BaseModel): - """RDMA link entry from 'rdma link -j' (JSON).""" - + # Interface and port information ifindex: Optional[int] = None ifname: Optional[str] = None port: Optional[int] = None @@ -70,12 +391,24 @@ class RdmaLink(BaseModel): netdev_index: Optional[int] = None @model_validator(mode="after") - def validate_at_least_one_field(self) -> Self: + def validate_atleast_one_field(self) -> Self: if not self.model_fields_set: raise ValueError("At least one field must be set in RdmaLink") return self +class RdmaDevice(BaseModel): + """RDMA device from 'rdma dev' (text output).""" + + device: str + node_type: Optional[str] = None + transport: Optional[str] = None + node_guid: Optional[str] = None + sys_image_guid: Optional[str] = None + state: Optional[str] = None + attributes: dict[str, str] = Field(default_factory=dict) + + class RdmaLinkText(BaseModel): """RDMA link from 'rdma link' (text output).""" @@ -92,10 +425,12 @@ class RdmaDataModel(DataModel): Data model for RDMA (Remote Direct Memory Access) statistics and link information. Attributes: - statistic_list: List of RDMA statistics from 'rdma statistic -j'. - link_list: List of RDMA links from 'rdma link -j' (JSON). - dev_list: List of RDMA devices from 'rdma dev' (text). - link_list_text: List of RDMA links from 'rdma link' (text). + statistic_list: RDMA statistics from 'rdma statistic -j'. Each entry has + ifname, port, and vendor_statistics (ionic/bnxt/mlx counters) when the + interface prefix matches a known vendor. + link_list: RDMA links from 'rdma link -j' (JSON). + dev_list: RDMA devices from 'rdma dev' (text). 
+ link_list_text: RDMA links from 'rdma link' (text). """ link_list: list[RdmaLink] = Field(default_factory=list) diff --git a/test/unit/plugin/test_rdma_analyzer.py b/test/unit/plugin/test_rdma_analyzer.py index 2f477b11..ada67c04 100644 --- a/test/unit/plugin/test_rdma_analyzer.py +++ b/test/unit/plugin/test_rdma_analyzer.py @@ -25,15 +25,21 @@ ############################################################################### import json from pathlib import Path +from typing import Optional import pytest from nodescraper.enums import EventPriority, ExecutionStatus from nodescraper.plugins.inband.rdma.rdma_analyzer import RdmaAnalyzer from nodescraper.plugins.inband.rdma.rdmadata import ( + VENDOR_PREFIX_MAP, + Cx7RdmaStatistics, + PollaraRdmaStatistics, RdmaDataModel, RdmaLink, RdmaStatistics, + RdmaVendorStatistics, + Thor2RdmaStatistics, ) @@ -48,91 +54,108 @@ def plugin_fixtures_path(): @pytest.fixture -def clean_rdma_model(plugin_fixtures_path): - """RDMA data with no errors (all counters zero).""" +def example_stat_dicts(plugin_fixtures_path): path = plugin_fixtures_path / "rdma_statistic_example_data.json" - data = json.loads(path.read_text()) - stats = [RdmaStatistics(**s) for s in data] - return RdmaDataModel(statistic_list=stats) + return json.loads(path.read_text()) + + +def _build_stats(data: list[dict]) -> list[RdmaStatistics]: + """Build RdmaStatistics list from raw dicts using vendor prefix map.""" + stats = [] + for entry in data: + ifname = entry.get("ifname", "") + vendor_stats: Optional[RdmaVendorStatistics] = None + for prefix, vendor_cls in VENDOR_PREFIX_MAP.items(): + if ifname.startswith(prefix): + vendor_stats = vendor_cls(**entry) + break + stats.append( + RdmaStatistics( + ifname=entry.get("ifname"), + port=entry.get("port"), + vendor_statistics=vendor_stats, + ) + ) + return stats @pytest.fixture -def clean_stats(plugin_fixtures_path): - """List of clean RdmaStatistics (no errors) for building models with links.""" - path = 
plugin_fixtures_path / "rdma_statistic_example_data.json" - data = json.loads(path.read_text()) - return [RdmaStatistics(**s) for s in data] +def clean_rdma_model(example_stat_dicts): + return RdmaDataModel(statistic_list=_build_stats(example_stat_dicts)) + + +@pytest.fixture +def clean_stats(example_stat_dicts): + return _build_stats(example_stat_dicts) def test_no_errors_detected(rdma_analyzer, clean_rdma_model): - """Test with nominal data that has no errors.""" result = rdma_analyzer.analyze_data(clean_rdma_model) assert result.status == ExecutionStatus.OK assert len(result.events) == 0 -def test_single_error_detected(rdma_analyzer, clean_rdma_model): - """Test with data containing a single error.""" - stats = list(clean_rdma_model.statistic_list) - stats[0].tx_roce_errors = 5 - model = RdmaDataModel(statistic_list=stats) +def test_single_error_detected(rdma_analyzer, example_stat_dicts): + stats_with_error = _build_stats(example_stat_dicts) + stats_with_error[0].vendor_statistics.req_rx_pkt_seq_err = 5 + model = RdmaDataModel(statistic_list=stats_with_error) result = rdma_analyzer.analyze_data(model) assert result.status == ExecutionStatus.ERROR assert "RDMA errors detected in statistics" in result.message assert len(result.events) == 1 - assert result.events[0].description == "RDMA error detected on bnxt_re0: [tx_roce_errors]" + assert result.events[0].description == "RDMA error detected: req_rx_pkt_seq_err" assert result.events[0].priority == EventPriority.ERROR - assert result.events[0].data["errors"] == {"tx_roce_errors": 5} - assert result.events[0].data["interface"] == "bnxt_re0" + assert result.events[0].data["error_count"] == 5 + assert result.events[0].data["interface"] == "ionic_0" -def test_multiple_errors_detected(rdma_analyzer, clean_rdma_model): - """Test with data containing multiple errors (grouped per interface).""" - stats = list(clean_rdma_model.statistic_list) - stats[0].tx_roce_errors = 10 - stats[0].rx_roce_errors = 3 - 
stats[1].packet_seq_err = 7 - model = RdmaDataModel(statistic_list=stats) +def test_multiple_errors_detected(rdma_analyzer, example_stat_dicts): + stats_with_errors = _build_stats(example_stat_dicts) + stats_with_errors[0].vendor_statistics.req_rx_rmt_acc_err = 10 + stats_with_errors[0].vendor_statistics.req_tx_loc_oper_err = 3 + stats_with_errors[8].vendor_statistics.packet_seq_err = 7 + model = RdmaDataModel(statistic_list=stats_with_errors) result = rdma_analyzer.analyze_data(model) assert result.status == ExecutionStatus.ERROR assert "RDMA errors detected in statistics" in result.message - assert len(result.events) == 2 # one per interface + assert len(result.events) == 3 for event in result.events: assert event.priority == EventPriority.ERROR - # Total 3 errors across 2 interfaces - assert sum(len(e.data["errors"]) for e in result.events) == 3 -def test_critical_error_detected(rdma_analyzer, clean_rdma_model): - """Test with data containing a critical error (grouped per interface).""" - stats = list(clean_rdma_model.statistic_list) - stats[0].unrecoverable_err = 1 - stats[0].res_tx_pci_err = 2 +def test_critical_error_detected(rdma_analyzer): + stats = [ + RdmaStatistics( + ifname="bnxt_re_test", + port=1, + vendor_statistics=Thor2RdmaStatistics( + unrecoverable_err=1, + res_tx_pci_err=2, + ), + ) + ] model = RdmaDataModel(statistic_list=stats) result = rdma_analyzer.analyze_data(model) assert result.status == ExecutionStatus.ERROR assert "RDMA errors detected in statistics" in result.message - assert len(result.events) == 1 # one event per interface - assert result.events[0].priority == EventPriority.CRITICAL - assert "unrecoverable_err" in result.events[0].data["errors"] - assert "res_tx_pci_err" in result.events[0].data["errors"] + assert len(result.events) == 2 + critical_events = [e for e in result.events if e.priority == EventPriority.CRITICAL] + assert len(critical_events) == 2 def test_empty_statistics(rdma_analyzer): - """Test with empty statistics 
list: WARNING and message logged.""" model = RdmaDataModel(statistic_list=[], link_list=[]) result = rdma_analyzer.analyze_data(model) assert result.status == ExecutionStatus.WARNING assert result.message == "No RDMA devices found" -def test_multiple_interfaces_with_errors(rdma_analyzer, clean_rdma_model): - """Test with errors across multiple interfaces.""" - stats = list(clean_rdma_model.statistic_list) - stats[0].max_retry_exceeded = 15 - stats[2].local_ack_timeout_err = 8 - stats[4].out_of_buffer = 100 - model = RdmaDataModel(statistic_list=stats) +def test_multiple_interfaces_with_errors(rdma_analyzer, example_stat_dicts): + stats_multi_errors = _build_stats(example_stat_dicts) + stats_multi_errors[0].vendor_statistics.req_rx_pkt_seq_err = 15 + stats_multi_errors[2].vendor_statistics.tx_rdma_ack_timeout = 8 + stats_multi_errors[8].vendor_statistics.out_of_buffer = 100 + model = RdmaDataModel(statistic_list=stats_multi_errors) result = rdma_analyzer.analyze_data(model) assert result.status == ExecutionStatus.ERROR assert len(result.events) == 3 @@ -141,44 +164,58 @@ def test_multiple_interfaces_with_errors(rdma_analyzer, clean_rdma_model): def test_all_error_types(rdma_analyzer): - """Test that all error fields are properly detected (grouped in one event).""" - stats = RdmaStatistics( - ifname="bnxt_re_test", - port=1, - recoverable_errors=1, - tx_roce_errors=1, - unrecoverable_err=1, - ) - model = RdmaDataModel(statistic_list=[stats]) + stats = [ + RdmaStatistics( + ifname="ionic_test", + port=1, + vendor_statistics=PollaraRdmaStatistics( + req_rx_pkt_seq_err=1, + req_tx_loc_oper_err=1, + ), + ), + RdmaStatistics( + ifname="mlx5_test", + port=1, + vendor_statistics=Cx7RdmaStatistics( + packet_seq_err=1, + ), + ), + ] + model = RdmaDataModel(statistic_list=stats) result = rdma_analyzer.analyze_data(model) assert result.status == ExecutionStatus.ERROR - assert len(result.events) == 1 # one event per interface - assert "unrecoverable_err" in 
result.events[0].data["errors"] - assert result.events[0].priority == EventPriority.CRITICAL - assert set(result.events[0].data["errors"].keys()) == { - "recoverable_errors", - "tx_roce_errors", - "unrecoverable_err", - } + assert len(result.events) == 3 + interfaces = {event.data["interface"] for event in result.events} + assert interfaces == {"ionic_test", "mlx5_test"} def test_zero_errors_are_ignored(rdma_analyzer): - """Test that zero-value errors are not reported.""" - stats = RdmaStatistics( - ifname="bnxt_re_test", - port=1, - tx_roce_errors=0, - rx_roce_errors=0, - unrecoverable_err=0, - ) - model = RdmaDataModel(statistic_list=[stats]) + stats = [ + RdmaStatistics( + ifname="ionic_test", + port=1, + vendor_statistics=PollaraRdmaStatistics( + req_rx_pkt_seq_err=0, + req_rx_rnr_retry_err=0, + tx_rdma_ack_timeout=0, + ), + ), + RdmaStatistics( + ifname="mlx5_test", + port=1, + vendor_statistics=Cx7RdmaStatistics( + packet_seq_err=0, + out_of_buffer=0, + ), + ), + ] + model = RdmaDataModel(statistic_list=stats) result = rdma_analyzer.analyze_data(model) assert result.status == ExecutionStatus.OK assert len(result.events) == 0 def test_rdma_link_all_active(rdma_analyzer, clean_stats): - """Test with RDMA links that are all active and up.""" links = [ RdmaLink( ifindex=0, @@ -207,7 +244,6 @@ def test_rdma_link_all_active(rdma_analyzer, clean_stats): def test_rdma_link_down_detected(rdma_analyzer, clean_stats): - """Test with RDMA links that are down""" links = [ RdmaLink( ifindex=0, @@ -230,12 +266,10 @@ def test_rdma_link_down_detected(rdma_analyzer, clean_stats): ] model = RdmaDataModel(statistic_list=clean_stats, link_list=links) result = rdma_analyzer.analyze_data(model) - # Current implementation only checks statistics, not link state assert result.status == ExecutionStatus.OK def test_rdma_link_empty_list(rdma_analyzer, clean_stats): - """Test with empty RDMA link list.""" model = RdmaDataModel(statistic_list=clean_stats, link_list=[]) result = 
rdma_analyzer.analyze_data(model) assert result.status == ExecutionStatus.OK @@ -243,7 +277,6 @@ def test_rdma_link_empty_list(rdma_analyzer, clean_stats): def test_rdma_link_multiple_interfaces(rdma_analyzer, clean_stats): - """Test with multiple RDMA interfaces with different link states.""" links = [ RdmaLink( ifindex=0, From 193980843583847ed21e00637307bc06de631045 Mon Sep 17 00:00:00 2001 From: Jaspal Singh Date: Tue, 21 Apr 2026 21:13:29 +0000 Subject: [PATCH 18/25] rdma fix --- .../plugins/inband/rdma/rdma_analyzer.py | 156 +- .../plugins/inband/rdma/rdma_collector.py | 81 +- .../fixtures/rdma_statistic_example_data.json | 1276 +++++++---------- test/unit/plugin/test_rdma_collector.py | 130 +- 4 files changed, 754 insertions(+), 889 deletions(-) diff --git a/nodescraper/plugins/inband/rdma/rdma_analyzer.py b/nodescraper/plugins/inband/rdma/rdma_analyzer.py index 163602b0..00f6977f 100644 --- a/nodescraper/plugins/inband/rdma/rdma_analyzer.py +++ b/nodescraper/plugins/inband/rdma/rdma_analyzer.py @@ -37,106 +37,12 @@ class RdmaAnalyzer(DataAnalyzer[RdmaDataModel, None]): DATA_MODEL = RdmaDataModel - # Error fields checked from rdma statistic output (bnxt_re, mlx5, ionic, etc.) 
- ERROR_FIELDS = [ - "recoverable_errors", - "tx_roce_errors", - "tx_roce_discards", - "rx_roce_errors", - "rx_roce_discards", - "local_ack_timeout_err", - "packet_seq_err", - "max_retry_exceeded", - "rnr_nak_retry_err", - "implied_nak_seq_err", - "unrecoverable_err", - "bad_resp_err", - "local_qp_op_err", - "local_protection_err", - "mem_mgmt_op_err", - "req_remote_invalid_request", - "req_remote_access_errors", - "remote_op_err", - "duplicate_request", - "res_exceed_max", - "resp_local_length_error", - "res_exceeds_wqe", - "res_opcode_err", - "res_rx_invalid_rkey", - "res_rx_domain_err", - "res_rx_no_perm", - "res_rx_range_err", - "res_tx_invalid_rkey", - "res_tx_domain_err", - "res_tx_no_perm", - "res_tx_range_err", - "res_irrq_oflow", - "res_unsup_opcode", - "res_unaligned_atomic", - "res_rem_inv_err", - "res_mem_err", - "res_srq_err", - "res_cmp_err", - "res_invalid_dup_rkey", - "res_wqe_format_err", - "res_cq_load_err", - "res_srq_load_err", - "res_tx_pci_err", - "res_rx_pci_err", - "out_of_buffer", - "out_of_sequence", - "req_cqe_error", - "req_cqe_flush_error", - "resp_cqe_error", - "resp_cqe_flush_error", - "resp_remote_access_errors", - "req_rx_pkt_seq_err", - "req_rx_rnr_retry_err", - "req_rx_rmt_acc_err", - "req_rx_rmt_req_err", - "req_rx_oper_err", - "req_rx_impl_nak_seq_err", - "req_rx_cqe_err", - "req_rx_cqe_flush", - "req_rx_dup_response", - "req_rx_inval_pkts", - "req_tx_loc_acc_err", - "req_tx_loc_oper_err", - "req_tx_mem_mgmt_err", - "req_tx_retry_excd_err", - "req_tx_loc_sgl_inv_err", - "resp_rx_dup_request", - "resp_rx_outof_buf", - "resp_rx_outouf_seq", - "resp_rx_cqe_err", - "resp_rx_cqe_flush", - "resp_rx_loc_len_err", - "resp_rx_inval_request", - "resp_rx_loc_oper_err", - "resp_rx_outof_atomic", - "resp_tx_pkt_seq_err", - "resp_tx_rmt_inval_req_err", - "resp_tx_rmt_acc_err", - "resp_tx_rmt_oper_err", - "resp_tx_rnr_retry_err", - "resp_tx_loc_sgl_inv_err", - "resp_rx_s0_table_err", - "resp_rx_ccl_cts_outouf_seq", - "tx_rdma_ack_timeout", - 
"tx_rdma_ccl_cts_ack_timeout", - "rx_rdma_mtu_discard_pkts", - ] - - CRITICAL_ERROR_FIELDS = [ - "unrecoverable_err", - "res_tx_pci_err", - "res_rx_pci_err", - "res_mem_err", - ] - def analyze_data(self, data: RdmaDataModel, args: Optional[None] = None) -> TaskResult: """Analyze RDMA statistics for non-zero error counters. + Error and critical counter names come from each vendor's statistics model + (ionic / bnxt / mlx prefixes). + Args: data: RDMA data model with statistic_list (and optionally link_list). args: Unused (analyzer has no configurable args). @@ -150,32 +56,36 @@ def analyze_data(self, data: RdmaDataModel, args: Optional[None] = None) -> Task return self.result error_state = False - for idx, stat in enumerate(data.statistic_list): - errors_on_interface = [] # (error_field, value, is_critical) - for error_field in self.ERROR_FIELDS: - value = getattr(stat, error_field, None) - if value is not None and value > 0: - is_critical = error_field in self.CRITICAL_ERROR_FIELDS - errors_on_interface.append((error_field, value, is_critical)) - if errors_on_interface: - error_state = True - interface_label = stat.ifname or "unknown" - error_names = [e[0] for e in errors_on_interface] - any_critical = any(e[2] for e in errors_on_interface) - priority = EventPriority.CRITICAL if any_critical else EventPriority.ERROR - errors_data = {field: value for field, value, _ in errors_on_interface} - self._log_event( - category=EventCategory.IO, - description=f"RDMA error detected on {interface_label}: [{', '.join(error_names)}]", - data={ - "interface": stat.ifname, - "port": stat.port, - "errors": errors_data, - "statistic_index": idx, - }, - priority=priority, - console_log=True, - ) + + for stat in data.statistic_list: + if stat.vendor_statistics is None: + continue + + error_fields = stat.vendor_statistics.error_fields + critical_fields = stat.vendor_statistics.critial_error_fields + + for error_field in error_fields + critical_fields: + error_value = 
getattr(stat.vendor_statistics, error_field, None) + + if error_value is not None and error_value > 0: + priority = ( + EventPriority.CRITICAL + if error_field in critical_fields + else EventPriority.ERROR + ) + self._log_event( + category=EventCategory.NETWORK, + description=f"RDMA error detected: {error_field}", + data={ + "interface": stat.ifname, + "port": stat.port, + "error_field": error_field, + "error_count": error_value, + }, + priority=priority, + console_log=True, + ) + error_state = True if error_state: self.result.message = "RDMA errors detected in statistics" diff --git a/nodescraper/plugins/inband/rdma/rdma_collector.py b/nodescraper/plugins/inband/rdma/rdma_collector.py index b5e01b5c..a719a334 100644 --- a/nodescraper/plugins/inband/rdma/rdma_collector.py +++ b/nodescraper/plugins/inband/rdma/rdma_collector.py @@ -34,7 +34,15 @@ from nodescraper.models import TaskResult from nodescraper.utils import get_exception_traceback -from .rdmadata import RdmaDataModel, RdmaDevice, RdmaLink, RdmaLinkText, RdmaStatistics +from .rdmadata import ( + VENDOR_PREFIX_MAP, + RdmaDataModel, + RdmaDevice, + RdmaLink, + RdmaLinkText, + RdmaStatistics, + RdmaVendorStatistics, +) class RdmaCollector(InBandDataCollector[RdmaDataModel, None]): @@ -61,7 +69,7 @@ def _run_rdma_command(self, cmd: str) -> Optional[list[dict]]: if res.exit_code != 0: self._log_event( - category=EventCategory.NETWORK, + category=EventCategory.APPLICATION, description=f"Error running rdma command: {cmd}", data={ "command": cmd, @@ -80,7 +88,7 @@ def _run_rdma_command(self, cmd: str) -> Optional[list[dict]]: return json.loads(res.stdout) except json.JSONDecodeError as e: self._log_event( - category=EventCategory.NETWORK, + category=EventCategory.APPLICATION, description=f"Error parsing command: {cmd} json data", data={ "cmd": cmd, @@ -172,7 +180,11 @@ def _parse_rdma_link_text(self, output: str) -> list[RdmaLinkText]: return links def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]: - 
"""Get RDMA statistics from 'rdma statistic -j'.""" + """Get RDMA statistics from 'rdma statistic -j'. + + Warns on unexpected or missing fields relative to the vendor-specific model + for the interface prefix (ionic / bnxt / mlx). + """ stat_data = self._run_rdma_command(self.CMD_STATISTIC) if stat_data is None: return None @@ -184,21 +196,70 @@ def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]: for stat in stat_data: if not isinstance(stat, dict): self._log_event( - category=EventCategory.NETWORK, + category=EventCategory.APPLICATION, description="Invalid data type for RDMA statistic", data={"data_type": type(stat).__name__}, priority=EventPriority.WARNING, ) continue - statistics.append(RdmaStatistics(**stat)) + + ifname = stat.get("ifname", "") + vendor_stats: Optional[RdmaVendorStatistics] = None + for prefix, vendor_cls in VENDOR_PREFIX_MAP.items(): + if ifname.startswith(prefix): + vendor_fields = set(vendor_cls.model_fields.keys()) + stat_fields = set(stat.keys()) - {"ifname", "port"} + + extra_fields = stat_fields - vendor_fields + if extra_fields: + self._log_event( + category=EventCategory.APPLICATION, + description=f"Unexpected fields in RDMA statistic for {ifname}", + data={ + "interface": ifname, + "extra_fields": sorted(extra_fields), + }, + priority=EventPriority.WARNING, + ) + + missing_fields = vendor_fields - stat_fields + if missing_fields: + self._log_event( + category=EventCategory.APPLICATION, + description=f"Missing fields in RDMA statistic for {ifname}", + data={ + "interface": ifname, + "missing_fields": sorted(missing_fields), + }, + priority=EventPriority.WARNING, + ) + + try: + vendor_stats = vendor_cls(**stat) + except ValidationError as ve: + self._log_event( + category=EventCategory.APPLICATION, + description=f"Failed to build vendor model for {ifname}", + data={"exception": get_exception_traceback(ve)}, + priority=EventPriority.WARNING, + ) + break + + rdma_stat = RdmaStatistics( + ifname=stat.get("ifname"), + 
port=stat.get("port"), + vendor_statistics=vendor_stats, + ) + statistics.append(rdma_stat) + return statistics except ValidationError as e: self._log_event( - category=EventCategory.NETWORK, + category=EventCategory.APPLICATION, description="Failed to build RdmaStatistics model", data={"exception": get_exception_traceback(e)}, priority=EventPriority.WARNING, ) - return statistics + return None def _get_rdma_link(self) -> Optional[list[RdmaLink]]: """Get RDMA link data from 'rdma link -j'.""" @@ -213,7 +274,7 @@ def _get_rdma_link(self) -> Optional[list[RdmaLink]]: for link in link_data: if not isinstance(link, dict): self._log_event( - category=EventCategory.NETWORK, + category=EventCategory.APPLICATION, description="Invalid data type for RDMA link", data={"data_type": type(link).__name__}, priority=EventPriority.WARNING, @@ -223,7 +284,7 @@ def _get_rdma_link(self) -> Optional[list[RdmaLink]]: return links except ValidationError as e: self._log_event( - category=EventCategory.NETWORK, + category=EventCategory.APPLICATION, description="Failed to build RdmaLink model", data={"exception": get_exception_traceback(e)}, priority=EventPriority.WARNING, diff --git a/test/unit/plugin/fixtures/rdma_statistic_example_data.json b/test/unit/plugin/fixtures/rdma_statistic_example_data.json index e338e41a..6f0a33ed 100644 --- a/test/unit/plugin/fixtures/rdma_statistic_example_data.json +++ b/test/unit/plugin/fixtures/rdma_statistic_example_data.json @@ -1,826 +1,598 @@ [ { - "ifname": "bnxt_re0", + "ifname": "ionic_0", "port": 1, - "active_pds": 1, - "active_ahs": 0, - "active_qps": 1, - "active_rc_qps": 0, - "active_ud_qps": 0, - "active_srqs": 0, - "active_cqs": 1, - "active_mrs": 0, - "active_mws": 0, - "watermark_pds": 12, - "watermark_ahs": 8, - "watermark_qps": 229, - "watermark_rc_qps": 220, - "watermark_ud_qps": 8, - "watermark_srqs": 8, - "watermark_cqs": 94, - "watermark_mrs": 305, - "watermark_mws": 0, - "rx_pkts": 3504998440, - "rx_bytes": 2966950848, - "tx_pkts": 
2747190987, - "tx_bytes": 912073550, - "recoverable_errors": 0, - "tx_roce_errors": 0, - "tx_roce_discards": 0, - "rx_roce_errors": 0, - "rx_roce_discards": 0, - "local_ack_timeout_err": 0, - "packet_seq_err": 0, - "max_retry_exceeded": 0, - "rnr_nak_retry_err": 0, - "implied_nak_seq_err": 0, - "unrecoverable_err": 0, - "bad_resp_err": 0, - "local_qp_op_err": 0, - "local_protection_err": 0, - "mem_mgmt_op_err": 0, - "req_remote_invalid_request": 0, - "req_remote_access_errors": 0, - "remote_op_err": 0, - "duplicate_request": 0, - "res_exceed_max": 0, - "resp_local_length_error": 0, - "res_exceeds_wqe": 0, - "res_opcode_err": 0, - "res_rx_invalid_rkey": 0, - "res_rx_domain_err": 0, - "res_rx_no_perm": 0, - "res_rx_range_err": 0, - "res_tx_invalid_rkey": 0, - "res_tx_domain_err": 0, - "res_tx_no_perm": 0, - "res_tx_range_err": 0, - "res_irrq_oflow": 0, - "res_unsup_opcode": 0, - "res_unaligned_atomic": 0, - "res_rem_inv_err": 0, - "res_mem_err": 0, - "res_srq_err": 0, - "res_cmp_err": 0, - "res_invalid_dup_rkey": 0, - "res_wqe_format_err": 0, - "res_cq_load_err": 0, - "res_srq_load_err": 0, - "res_tx_pci_err": 0, - "res_rx_pci_err": 0, - "tx_atomic_req": 0, - "tx_read_req": 3324056122, - "tx_read_resp": 3324056122, - "tx_write_req": 622240024, - "tx_send_req": 97500, - "rx_atomic_requests": 0, - "rx_read_requests": 3324056122, - "rx_read_resp": 3324056122, - "rx_write_requests": 626374468, - "rx_send_req": 97500, - "rx_good_pkts": 1401322762, - "rx_good_bytes": 2966950848, - "out_of_buffer": 0, - "np_cnp_sent": 2873487760, - "rp_cnp_handled": 2103675678, - "np_ecn_marked_roce_packets": 2873487760, - "out_of_sequence": 0, - "pacing_reschedule": 0, - "pacing_complete": 0, - "pacing_alerts": 0, - "db_fifo_register": 2147450881, - "req_cqe_error": 0, - "req_cqe_flush_error": 0, - "resp_cqe_error": 0, - "resp_cqe_flush_error": 0, - "resp_remote_access_errors": 0, - "roce_adp_retrans": 0, - "roce_adp_retrans_to": 0, - "roce_slow_restart": 0, - "roce_slow_restart_cnps": 0, 
- "roce_slow_restart_trans": 0, - "rp_cnp_ignored": 0, - "rx_icrc_encapsulated": 0 + "tx_rdma_ucast_bytes": 0, + "tx_rdma_ucast_pkts": 0, + "tx_rdma_mcast_bytes": 0, + "tx_rdma_mcast_pkts": 0, + "tx_rdma_cnp_pkts": 0, + "rx_rdma_ucast_bytes": 0, + "rx_rdma_ucast_pkts": 0, + "rx_rdma_mcast_bytes": 0, + "rx_rdma_mcast_pkts": 0, + "rx_rdma_cnp_pkts": 0, + "rx_rdma_ecn_pkts": 0, + "req_rx_pkt_seq_err": 0, + "req_rx_rnr_retry_err": 0, + "req_rx_rmt_acc_err": 0, + "req_rx_rmt_req_err": 0, + "req_rx_oper_err": 0, + "req_rx_impl_nak_seq_err": 0, + "req_rx_cqe_err": 0, + "req_rx_cqe_flush": 0, + "req_rx_dup_response": 0, + "req_rx_inval_pkts": 0, + "req_tx_loc_acc_err": 0, + "req_tx_loc_oper_err": 0, + "req_tx_mem_mgmt_err": 0, + "req_tx_retry_excd_err": 0, + "req_tx_loc_sgl_inv_err": 0, + "resp_rx_dup_request": 0, + "resp_rx_outof_buf": 0, + "resp_rx_outouf_seq": 0, + "resp_rx_cqe_err": 0, + "resp_rx_cqe_flush": 0, + "resp_rx_loc_len_err": 0, + "resp_rx_inval_request": 0, + "resp_rx_loc_oper_err": 0, + "resp_rx_outof_atomic": 0, + "resp_tx_pkt_seq_err": 0, + "resp_tx_rmt_inval_req_err": 0, + "resp_tx_rmt_acc_err": 0, + "resp_tx_rmt_oper_err": 0, + "resp_tx_rnr_retry_err": 0, + "resp_tx_loc_sgl_inv_err": 0, + "resp_rx_s0_table_err": 0, + "tx_rdma_ccl_cts_bytes": 0, + "tx_rdma_ccl_cts_pkts": 0, + "rx_rdma_ccl_cts_bytes": 0, + "rx_rdma_ccl_cts_pkts": 0, + "resp_rx_ccl_cts_outouf_seq": 0, + "tx_rdma_ack_timeout": 0, + "tx_rdma_ccl_cts_ack_timeout": 0, + "tx_rdma_retx_bytes": 0, + "tx_rdma_retx_pkts": 0, + "tx_rdma_ccl_cts_retx_bytes": 0, + "tx_rdma_ccl_cts_retx_pkts": 0, + "rx_rdma_mtu_discard_pkts": 0 }, { - "ifname": "bnxt_re1", + "ifname": "ionic_1", "port": 1, - "active_pds": 1, - "active_ahs": 0, - "active_qps": 1, - "active_rc_qps": 0, - "active_ud_qps": 0, - "active_srqs": 0, - "active_cqs": 1, - "active_mrs": 0, - "active_mws": 0, - "watermark_pds": 14, - "watermark_ahs": 3, - "watermark_qps": 228, - "watermark_rc_qps": 219, - "watermark_ud_qps": 8, - "watermark_srqs": 
8, - "watermark_cqs": 94, - "watermark_mrs": 287, - "watermark_mws": 0, - "rx_pkts": 1509751895, - "rx_bytes": 3099873130, - "tx_pkts": 692925073, - "tx_bytes": 2068663286, - "recoverable_errors": 0, - "tx_roce_errors": 0, - "tx_roce_discards": 0, - "rx_roce_errors": 0, - "rx_roce_discards": 0, - "local_ack_timeout_err": 0, - "packet_seq_err": 0, - "max_retry_exceeded": 0, - "rnr_nak_retry_err": 0, - "implied_nak_seq_err": 0, - "unrecoverable_err": 0, - "bad_resp_err": 0, - "local_qp_op_err": 0, - "local_protection_err": 0, - "mem_mgmt_op_err": 0, - "req_remote_invalid_request": 0, - "req_remote_access_errors": 0, - "remote_op_err": 0, - "duplicate_request": 0, - "res_exceed_max": 0, - "resp_local_length_error": 0, - "res_exceeds_wqe": 0, - "res_opcode_err": 0, - "res_rx_invalid_rkey": 0, - "res_rx_domain_err": 0, - "res_rx_no_perm": 0, - "res_rx_range_err": 0, - "res_tx_invalid_rkey": 0, - "res_tx_domain_err": 0, - "res_tx_no_perm": 0, - "res_tx_range_err": 0, - "res_irrq_oflow": 0, - "res_unsup_opcode": 0, - "res_unaligned_atomic": 0, - "res_rem_inv_err": 0, - "res_mem_err": 0, - "res_srq_err": 0, - "res_cmp_err": 0, - "res_invalid_dup_rkey": 0, - "res_wqe_format_err": 0, - "res_cq_load_err": 0, - "res_srq_load_err": 0, - "res_tx_pci_err": 0, - "res_rx_pci_err": 0, - "tx_atomic_req": 0, - "tx_read_req": 3322387232, - "tx_read_resp": 3322387232, - "tx_write_req": 620621144, - "tx_send_req": 0, - "rx_atomic_requests": 0, - "rx_read_requests": 3322387232, - "rx_read_resp": 3322387232, - "rx_write_requests": 621181433, - "rx_send_req": 0, - "rx_good_pkts": 3507768689, - "rx_good_bytes": 3099873130, - "out_of_buffer": 0, - "np_cnp_sent": 1097578610, - "rp_cnp_handled": 2296950502, - "np_ecn_marked_roce_packets": 1097578610, - "out_of_sequence": 0, - "pacing_reschedule": 0, - "pacing_complete": 0, - "pacing_alerts": 0, - "db_fifo_register": 2147450881, - "req_cqe_error": 0, - "req_cqe_flush_error": 0, - "resp_cqe_error": 0, - "resp_cqe_flush_error": 0, - 
"resp_remote_access_errors": 0, - "roce_adp_retrans": 0, - "roce_adp_retrans_to": 0, - "roce_slow_restart": 0, - "roce_slow_restart_cnps": 0, - "roce_slow_restart_trans": 0, - "rp_cnp_ignored": 0, - "rx_icrc_encapsulated": 0 + "tx_rdma_ucast_bytes": 0, + "tx_rdma_ucast_pkts": 0, + "tx_rdma_mcast_bytes": 0, + "tx_rdma_mcast_pkts": 0, + "tx_rdma_cnp_pkts": 0, + "rx_rdma_ucast_bytes": 0, + "rx_rdma_ucast_pkts": 0, + "rx_rdma_mcast_bytes": 0, + "rx_rdma_mcast_pkts": 0, + "rx_rdma_cnp_pkts": 0, + "rx_rdma_ecn_pkts": 0, + "req_rx_pkt_seq_err": 0, + "req_rx_rnr_retry_err": 0, + "req_rx_rmt_acc_err": 0, + "req_rx_rmt_req_err": 0, + "req_rx_oper_err": 0, + "req_rx_impl_nak_seq_err": 0, + "req_rx_cqe_err": 0, + "req_rx_cqe_flush": 0, + "req_rx_dup_response": 0, + "req_rx_inval_pkts": 0, + "req_tx_loc_acc_err": 0, + "req_tx_loc_oper_err": 0, + "req_tx_mem_mgmt_err": 0, + "req_tx_retry_excd_err": 0, + "req_tx_loc_sgl_inv_err": 0, + "resp_rx_dup_request": 0, + "resp_rx_outof_buf": 0, + "resp_rx_outouf_seq": 0, + "resp_rx_cqe_err": 0, + "resp_rx_cqe_flush": 0, + "resp_rx_loc_len_err": 0, + "resp_rx_inval_request": 0, + "resp_rx_loc_oper_err": 0, + "resp_rx_outof_atomic": 0, + "resp_tx_pkt_seq_err": 0, + "resp_tx_rmt_inval_req_err": 0, + "resp_tx_rmt_acc_err": 0, + "resp_tx_rmt_oper_err": 0, + "resp_tx_rnr_retry_err": 0, + "resp_tx_loc_sgl_inv_err": 0, + "resp_rx_s0_table_err": 0, + "tx_rdma_ccl_cts_bytes": 0, + "tx_rdma_ccl_cts_pkts": 0, + "rx_rdma_ccl_cts_bytes": 0, + "rx_rdma_ccl_cts_pkts": 0, + "resp_rx_ccl_cts_outouf_seq": 0, + "tx_rdma_ack_timeout": 0, + "tx_rdma_ccl_cts_ack_timeout": 0, + "tx_rdma_retx_bytes": 0, + "tx_rdma_retx_pkts": 0, + "tx_rdma_ccl_cts_retx_bytes": 0, + "tx_rdma_ccl_cts_retx_pkts": 0, + "rx_rdma_mtu_discard_pkts": 0 }, { - "ifname": "bnxt_re2", + "ifname": "ionic_2", "port": 1, - "active_pds": 1, - "active_ahs": 0, - "active_qps": 1, - "active_rc_qps": 0, - "active_ud_qps": 0, - "active_srqs": 0, - "active_cqs": 1, - "active_mrs": 0, - "active_mws": 
0, - "watermark_pds": 13, - "watermark_ahs": 4, - "watermark_qps": 230, - "watermark_rc_qps": 221, - "watermark_ud_qps": 8, - "watermark_srqs": 8, - "watermark_cqs": 95, - "watermark_mrs": 294, - "watermark_mws": 0, - "rx_pkts": 2328181128, - "rx_bytes": 79750872, - "tx_pkts": 1404869338, - "tx_bytes": 644434628, - "recoverable_errors": 0, - "tx_roce_errors": 0, - "tx_roce_discards": 0, - "rx_roce_errors": 0, - "rx_roce_discards": 0, - "local_ack_timeout_err": 0, - "packet_seq_err": 0, - "max_retry_exceeded": 0, - "rnr_nak_retry_err": 0, - "implied_nak_seq_err": 0, - "unrecoverable_err": 0, - "bad_resp_err": 0, - "local_qp_op_err": 0, - "local_protection_err": 0, - "mem_mgmt_op_err": 0, - "req_remote_invalid_request": 0, - "req_remote_access_errors": 0, - "remote_op_err": 0, - "duplicate_request": 0, - "res_exceed_max": 0, - "resp_local_length_error": 0, - "res_exceeds_wqe": 0, - "res_opcode_err": 0, - "res_rx_invalid_rkey": 0, - "res_rx_domain_err": 0, - "res_rx_no_perm": 0, - "res_rx_range_err": 0, - "res_tx_invalid_rkey": 0, - "res_tx_domain_err": 0, - "res_tx_no_perm": 0, - "res_tx_range_err": 0, - "res_irrq_oflow": 0, - "res_unsup_opcode": 0, - "res_unaligned_atomic": 0, - "res_rem_inv_err": 0, - "res_mem_err": 0, - "res_srq_err": 0, - "res_cmp_err": 0, - "res_invalid_dup_rkey": 0, - "res_wqe_format_err": 0, - "res_cq_load_err": 0, - "res_srq_load_err": 0, - "res_tx_pci_err": 0, - "res_rx_pci_err": 0, - "tx_atomic_req": 0, - "tx_read_req": 3212760135, - "tx_read_resp": 3212760135, - "tx_write_req": 1995861174, - "tx_send_req": 0, - "rx_atomic_requests": 0, - "rx_read_requests": 3212760135, - "rx_read_resp": 3212760135, - "rx_write_requests": 1995579948, - "rx_send_req": 0, - "rx_good_pkts": 4025638368, - "rx_good_bytes": 79750872, - "out_of_buffer": 0, - "np_cnp_sent": 4174752904, - "rp_cnp_handled": 2597510056, - "np_ecn_marked_roce_packets": 4174752904, - "out_of_sequence": 0, - "pacing_reschedule": 0, - "pacing_complete": 0, - "pacing_alerts": 0, - 
"db_fifo_register": 2147450881, - "req_cqe_error": 0, - "req_cqe_flush_error": 0, - "resp_cqe_error": 0, - "resp_cqe_flush_error": 0, - "resp_remote_access_errors": 0, - "roce_adp_retrans": 0, - "roce_adp_retrans_to": 0, - "roce_slow_restart": 0, - "roce_slow_restart_cnps": 0, - "roce_slow_restart_trans": 0, - "rp_cnp_ignored": 0, - "rx_icrc_encapsulated": 0 + "tx_rdma_ucast_bytes": 0, + "tx_rdma_ucast_pkts": 0, + "tx_rdma_mcast_bytes": 0, + "tx_rdma_mcast_pkts": 0, + "tx_rdma_cnp_pkts": 0, + "rx_rdma_ucast_bytes": 0, + "rx_rdma_ucast_pkts": 0, + "rx_rdma_mcast_bytes": 0, + "rx_rdma_mcast_pkts": 0, + "rx_rdma_cnp_pkts": 0, + "rx_rdma_ecn_pkts": 0, + "req_rx_pkt_seq_err": 0, + "req_rx_rnr_retry_err": 0, + "req_rx_rmt_acc_err": 0, + "req_rx_rmt_req_err": 0, + "req_rx_oper_err": 0, + "req_rx_impl_nak_seq_err": 0, + "req_rx_cqe_err": 0, + "req_rx_cqe_flush": 0, + "req_rx_dup_response": 0, + "req_rx_inval_pkts": 0, + "req_tx_loc_acc_err": 0, + "req_tx_loc_oper_err": 0, + "req_tx_mem_mgmt_err": 0, + "req_tx_retry_excd_err": 0, + "req_tx_loc_sgl_inv_err": 0, + "resp_rx_dup_request": 0, + "resp_rx_outof_buf": 0, + "resp_rx_outouf_seq": 0, + "resp_rx_cqe_err": 0, + "resp_rx_cqe_flush": 0, + "resp_rx_loc_len_err": 0, + "resp_rx_inval_request": 0, + "resp_rx_loc_oper_err": 0, + "resp_rx_outof_atomic": 0, + "resp_tx_pkt_seq_err": 0, + "resp_tx_rmt_inval_req_err": 0, + "resp_tx_rmt_acc_err": 0, + "resp_tx_rmt_oper_err": 0, + "resp_tx_rnr_retry_err": 0, + "resp_tx_loc_sgl_inv_err": 0, + "resp_rx_s0_table_err": 0, + "tx_rdma_ccl_cts_bytes": 0, + "tx_rdma_ccl_cts_pkts": 0, + "rx_rdma_ccl_cts_bytes": 0, + "rx_rdma_ccl_cts_pkts": 0, + "resp_rx_ccl_cts_outouf_seq": 0, + "tx_rdma_ack_timeout": 0, + "tx_rdma_ccl_cts_ack_timeout": 0, + "tx_rdma_retx_bytes": 0, + "tx_rdma_retx_pkts": 0, + "tx_rdma_ccl_cts_retx_bytes": 0, + "tx_rdma_ccl_cts_retx_pkts": 0, + "rx_rdma_mtu_discard_pkts": 0 }, { - "ifname": "bnxt_re3", + "ifname": "ionic_3", "port": 1, - "active_pds": 1, - "active_ahs": 0, - 
"active_qps": 1, - "active_rc_qps": 0, - "active_ud_qps": 0, - "active_srqs": 0, - "active_cqs": 1, - "active_mrs": 0, - "active_mws": 0, - "watermark_pds": 12, - "watermark_ahs": 7, - "watermark_qps": 229, - "watermark_rc_qps": 220, - "watermark_ud_qps": 8, - "watermark_srqs": 8, - "watermark_cqs": 95, - "watermark_mrs": 292, - "watermark_mws": 0, - "rx_pkts": 3888070733, - "rx_bytes": 3748987850, - "tx_pkts": 2265082996, - "tx_bytes": 3715380316, - "recoverable_errors": 0, - "tx_roce_errors": 0, - "tx_roce_discards": 0, - "rx_roce_errors": 0, - "rx_roce_discards": 0, - "local_ack_timeout_err": 0, - "packet_seq_err": 0, - "max_retry_exceeded": 0, - "rnr_nak_retry_err": 0, - "implied_nak_seq_err": 0, - "unrecoverable_err": 0, - "bad_resp_err": 0, - "local_qp_op_err": 0, - "local_protection_err": 0, - "mem_mgmt_op_err": 0, - "req_remote_invalid_request": 0, - "req_remote_access_errors": 0, - "remote_op_err": 0, - "duplicate_request": 0, - "res_exceed_max": 0, - "resp_local_length_error": 0, - "res_exceeds_wqe": 0, - "res_opcode_err": 0, - "res_rx_invalid_rkey": 0, - "res_rx_domain_err": 0, - "res_rx_no_perm": 0, - "res_rx_range_err": 0, - "res_tx_invalid_rkey": 0, - "res_tx_domain_err": 0, - "res_tx_no_perm": 0, - "res_tx_range_err": 0, - "res_irrq_oflow": 0, - "res_unsup_opcode": 0, - "res_unaligned_atomic": 0, - "res_rem_inv_err": 0, - "res_mem_err": 0, - "res_srq_err": 0, - "res_cmp_err": 0, - "res_invalid_dup_rkey": 0, - "res_wqe_format_err": 0, - "res_cq_load_err": 0, - "res_srq_load_err": 0, - "res_tx_pci_err": 0, - "res_rx_pci_err": 0, - "tx_atomic_req": 0, - "tx_read_req": 3103369202, - "tx_read_resp": 3103369202, - "tx_write_req": 3370635080, - "tx_send_req": 0, - "rx_atomic_requests": 0, - "rx_read_requests": 3103369202, - "rx_read_resp": 3103369202, - "rx_write_requests": 3368547249, - "rx_send_req": 0, - "rx_good_pkts": 2688805201, - "rx_good_bytes": 3748987850, - "out_of_buffer": 0, - "np_cnp_sent": 134598312, - "rp_cnp_handled": 1199265532, - 
"np_ecn_marked_roce_packets": 134598312, - "out_of_sequence": 0, - "pacing_reschedule": 0, - "pacing_complete": 0, - "pacing_alerts": 0, - "db_fifo_register": 2147450881, - "req_cqe_error": 0, - "req_cqe_flush_error": 0, - "resp_cqe_error": 0, - "resp_cqe_flush_error": 0, - "resp_remote_access_errors": 0, - "roce_adp_retrans": 0, - "roce_adp_retrans_to": 0, - "roce_slow_restart": 0, - "roce_slow_restart_cnps": 0, - "roce_slow_restart_trans": 0, - "rp_cnp_ignored": 0, - "rx_icrc_encapsulated": 0 + "tx_rdma_ucast_bytes": 0, + "tx_rdma_ucast_pkts": 0, + "tx_rdma_mcast_bytes": 0, + "tx_rdma_mcast_pkts": 0, + "tx_rdma_cnp_pkts": 0, + "rx_rdma_ucast_bytes": 0, + "rx_rdma_ucast_pkts": 0, + "rx_rdma_mcast_bytes": 0, + "rx_rdma_mcast_pkts": 0, + "rx_rdma_cnp_pkts": 0, + "rx_rdma_ecn_pkts": 0, + "req_rx_pkt_seq_err": 0, + "req_rx_rnr_retry_err": 0, + "req_rx_rmt_acc_err": 0, + "req_rx_rmt_req_err": 0, + "req_rx_oper_err": 0, + "req_rx_impl_nak_seq_err": 0, + "req_rx_cqe_err": 0, + "req_rx_cqe_flush": 0, + "req_rx_dup_response": 0, + "req_rx_inval_pkts": 0, + "req_tx_loc_acc_err": 0, + "req_tx_loc_oper_err": 0, + "req_tx_mem_mgmt_err": 0, + "req_tx_retry_excd_err": 0, + "req_tx_loc_sgl_inv_err": 0, + "resp_rx_dup_request": 0, + "resp_rx_outof_buf": 0, + "resp_rx_outouf_seq": 0, + "resp_rx_cqe_err": 0, + "resp_rx_cqe_flush": 0, + "resp_rx_loc_len_err": 0, + "resp_rx_inval_request": 0, + "resp_rx_loc_oper_err": 0, + "resp_rx_outof_atomic": 0, + "resp_tx_pkt_seq_err": 0, + "resp_tx_rmt_inval_req_err": 0, + "resp_tx_rmt_acc_err": 0, + "resp_tx_rmt_oper_err": 0, + "resp_tx_rnr_retry_err": 0, + "resp_tx_loc_sgl_inv_err": 0, + "resp_rx_s0_table_err": 0, + "tx_rdma_ccl_cts_bytes": 0, + "tx_rdma_ccl_cts_pkts": 0, + "rx_rdma_ccl_cts_bytes": 0, + "rx_rdma_ccl_cts_pkts": 0, + "resp_rx_ccl_cts_outouf_seq": 0, + "tx_rdma_ack_timeout": 0, + "tx_rdma_ccl_cts_ack_timeout": 0, + "tx_rdma_retx_bytes": 0, + "tx_rdma_retx_pkts": 0, + "tx_rdma_ccl_cts_retx_bytes": 0, + "tx_rdma_ccl_cts_retx_pkts": 
0, + "rx_rdma_mtu_discard_pkts": 0 }, { - "ifname": "bnxt_re4", + "ifname": "ionic_4", "port": 1, - "active_pds": 1, - "active_ahs": 0, - "active_qps": 1, - "active_rc_qps": 0, - "active_ud_qps": 0, - "active_srqs": 0, - "active_cqs": 1, - "active_mrs": 0, - "active_mws": 0, - "watermark_pds": 12, - "watermark_ahs": 6, - "watermark_qps": 230, - "watermark_rc_qps": 221, - "watermark_ud_qps": 8, - "watermark_srqs": 8, - "watermark_cqs": 95, - "watermark_mrs": 302, - "watermark_mws": 0, - "rx_pkts": 986831570, - "rx_bytes": 1185181414, - "tx_pkts": 1975828812, - "tx_bytes": 2763928250, - "recoverable_errors": 0, - "tx_roce_errors": 0, - "tx_roce_discards": 0, - "rx_roce_errors": 0, - "rx_roce_discards": 0, - "local_ack_timeout_err": 0, - "packet_seq_err": 0, - "max_retry_exceeded": 0, - "rnr_nak_retry_err": 0, - "implied_nak_seq_err": 0, - "unrecoverable_err": 0, - "bad_resp_err": 0, - "local_qp_op_err": 0, - "local_protection_err": 0, - "mem_mgmt_op_err": 0, - "req_remote_invalid_request": 0, - "req_remote_access_errors": 0, - "remote_op_err": 0, - "duplicate_request": 0, - "res_exceed_max": 0, - "resp_local_length_error": 0, - "res_exceeds_wqe": 0, - "res_opcode_err": 0, - "res_rx_invalid_rkey": 0, - "res_rx_domain_err": 0, - "res_rx_no_perm": 0, - "res_rx_range_err": 0, - "res_tx_invalid_rkey": 0, - "res_tx_domain_err": 0, - "res_tx_no_perm": 0, - "res_tx_range_err": 0, - "res_irrq_oflow": 0, - "res_unsup_opcode": 0, - "res_unaligned_atomic": 0, - "res_rem_inv_err": 0, - "res_mem_err": 0, - "res_srq_err": 0, - "res_cmp_err": 0, - "res_invalid_dup_rkey": 0, - "res_wqe_format_err": 0, - "res_cq_load_err": 0, - "res_srq_load_err": 0, - "res_tx_pci_err": 0, - "res_rx_pci_err": 0, - "tx_atomic_req": 0, - "tx_read_req": 2993618119, - "tx_read_resp": 2993618119, - "tx_write_req": 449606302, - "tx_send_req": 37687, + "tx_rdma_ucast_bytes": 0, + "tx_rdma_ucast_pkts": 0, + "tx_rdma_mcast_bytes": 0, + "tx_rdma_mcast_pkts": 0, + "tx_rdma_cnp_pkts": 0, + "rx_rdma_ucast_bytes": 
0, + "rx_rdma_ucast_pkts": 0, + "rx_rdma_mcast_bytes": 0, + "rx_rdma_mcast_pkts": 0, + "rx_rdma_cnp_pkts": 0, + "rx_rdma_ecn_pkts": 0, + "req_rx_pkt_seq_err": 0, + "req_rx_rnr_retry_err": 0, + "req_rx_rmt_acc_err": 0, + "req_rx_rmt_req_err": 0, + "req_rx_oper_err": 0, + "req_rx_impl_nak_seq_err": 0, + "req_rx_cqe_err": 0, + "req_rx_cqe_flush": 0, + "req_rx_dup_response": 0, + "req_rx_inval_pkts": 0, + "req_tx_loc_acc_err": 0, + "req_tx_loc_oper_err": 0, + "req_tx_mem_mgmt_err": 0, + "req_tx_retry_excd_err": 0, + "req_tx_loc_sgl_inv_err": 0, + "resp_rx_dup_request": 0, + "resp_rx_outof_buf": 0, + "resp_rx_outouf_seq": 0, + "resp_rx_cqe_err": 0, + "resp_rx_cqe_flush": 0, + "resp_rx_loc_len_err": 0, + "resp_rx_inval_request": 0, + "resp_rx_loc_oper_err": 0, + "resp_rx_outof_atomic": 0, + "resp_tx_pkt_seq_err": 0, + "resp_tx_rmt_inval_req_err": 0, + "resp_tx_rmt_acc_err": 0, + "resp_tx_rmt_oper_err": 0, + "resp_tx_rnr_retry_err": 0, + "resp_tx_loc_sgl_inv_err": 0, + "resp_rx_s0_table_err": 0, + "tx_rdma_ccl_cts_bytes": 0, + "tx_rdma_ccl_cts_pkts": 0, + "rx_rdma_ccl_cts_bytes": 0, + "rx_rdma_ccl_cts_pkts": 0, + "resp_rx_ccl_cts_outouf_seq": 0, + "tx_rdma_ack_timeout": 0, + "tx_rdma_ccl_cts_ack_timeout": 0, + "tx_rdma_retx_bytes": 0, + "tx_rdma_retx_pkts": 0, + "tx_rdma_ccl_cts_retx_bytes": 0, + "tx_rdma_ccl_cts_retx_pkts": 0, + "rx_rdma_mtu_discard_pkts": 0 + }, + { + "ifname": "ionic_5", + "port": 1, + "tx_rdma_ucast_bytes": 0, + "tx_rdma_ucast_pkts": 0, + "tx_rdma_mcast_bytes": 0, + "tx_rdma_mcast_pkts": 0, + "tx_rdma_cnp_pkts": 0, + "rx_rdma_ucast_bytes": 0, + "rx_rdma_ucast_pkts": 0, + "rx_rdma_mcast_bytes": 0, + "rx_rdma_mcast_pkts": 0, + "rx_rdma_cnp_pkts": 0, + "rx_rdma_ecn_pkts": 0, + "req_rx_pkt_seq_err": 0, + "req_rx_rnr_retry_err": 0, + "req_rx_rmt_acc_err": 0, + "req_rx_rmt_req_err": 0, + "req_rx_oper_err": 0, + "req_rx_impl_nak_seq_err": 0, + "req_rx_cqe_err": 0, + "req_rx_cqe_flush": 0, + "req_rx_dup_response": 0, + "req_rx_inval_pkts": 0, + 
"req_tx_loc_acc_err": 0, + "req_tx_loc_oper_err": 0, + "req_tx_mem_mgmt_err": 0, + "req_tx_retry_excd_err": 0, + "req_tx_loc_sgl_inv_err": 0, + "resp_rx_dup_request": 0, + "resp_rx_outof_buf": 0, + "resp_rx_outouf_seq": 0, + "resp_rx_cqe_err": 0, + "resp_rx_cqe_flush": 0, + "resp_rx_loc_len_err": 0, + "resp_rx_inval_request": 0, + "resp_rx_loc_oper_err": 0, + "resp_rx_outof_atomic": 0, + "resp_tx_pkt_seq_err": 0, + "resp_tx_rmt_inval_req_err": 0, + "resp_tx_rmt_acc_err": 0, + "resp_tx_rmt_oper_err": 0, + "resp_tx_rnr_retry_err": 0, + "resp_tx_loc_sgl_inv_err": 0, + "resp_rx_s0_table_err": 0, + "tx_rdma_ccl_cts_bytes": 0, + "tx_rdma_ccl_cts_pkts": 0, + "rx_rdma_ccl_cts_bytes": 0, + "rx_rdma_ccl_cts_pkts": 0, + "resp_rx_ccl_cts_outouf_seq": 0, + "tx_rdma_ack_timeout": 0, + "tx_rdma_ccl_cts_ack_timeout": 0, + "tx_rdma_retx_bytes": 0, + "tx_rdma_retx_pkts": 0, + "tx_rdma_ccl_cts_retx_bytes": 0, + "tx_rdma_ccl_cts_retx_pkts": 0, + "rx_rdma_mtu_discard_pkts": 0 + }, + { + "ifname": "ionic_6", + "port": 1, + "tx_rdma_ucast_bytes": 0, + "tx_rdma_ucast_pkts": 0, + "tx_rdma_mcast_bytes": 0, + "tx_rdma_mcast_pkts": 0, + "tx_rdma_cnp_pkts": 0, + "rx_rdma_ucast_bytes": 0, + "rx_rdma_ucast_pkts": 0, + "rx_rdma_mcast_bytes": 0, + "rx_rdma_mcast_pkts": 0, + "rx_rdma_cnp_pkts": 0, + "rx_rdma_ecn_pkts": 0, + "req_rx_pkt_seq_err": 0, + "req_rx_rnr_retry_err": 0, + "req_rx_rmt_acc_err": 0, + "req_rx_rmt_req_err": 0, + "req_rx_oper_err": 0, + "req_rx_impl_nak_seq_err": 0, + "req_rx_cqe_err": 0, + "req_rx_cqe_flush": 0, + "req_rx_dup_response": 0, + "req_rx_inval_pkts": 0, + "req_tx_loc_acc_err": 0, + "req_tx_loc_oper_err": 0, + "req_tx_mem_mgmt_err": 0, + "req_tx_retry_excd_err": 0, + "req_tx_loc_sgl_inv_err": 0, + "resp_rx_dup_request": 0, + "resp_rx_outof_buf": 0, + "resp_rx_outouf_seq": 0, + "resp_rx_cqe_err": 0, + "resp_rx_cqe_flush": 0, + "resp_rx_loc_len_err": 0, + "resp_rx_inval_request": 0, + "resp_rx_loc_oper_err": 0, + "resp_rx_outof_atomic": 0, + "resp_tx_pkt_seq_err": 0, + 
"resp_tx_rmt_inval_req_err": 0, + "resp_tx_rmt_acc_err": 0, + "resp_tx_rmt_oper_err": 0, + "resp_tx_rnr_retry_err": 0, + "resp_tx_loc_sgl_inv_err": 0, + "resp_rx_s0_table_err": 0, + "tx_rdma_ccl_cts_bytes": 0, + "tx_rdma_ccl_cts_pkts": 0, + "rx_rdma_ccl_cts_bytes": 0, + "rx_rdma_ccl_cts_pkts": 0, + "resp_rx_ccl_cts_outouf_seq": 0, + "tx_rdma_ack_timeout": 0, + "tx_rdma_ccl_cts_ack_timeout": 0, + "tx_rdma_retx_bytes": 0, + "tx_rdma_retx_pkts": 0, + "tx_rdma_ccl_cts_retx_bytes": 0, + "tx_rdma_ccl_cts_retx_pkts": 0, + "rx_rdma_mtu_discard_pkts": 0 + }, + { + "ifname": "ionic_7", + "port": 1, + "tx_rdma_ucast_bytes": 0, + "tx_rdma_ucast_pkts": 0, + "tx_rdma_mcast_bytes": 0, + "tx_rdma_mcast_pkts": 0, + "tx_rdma_cnp_pkts": 0, + "rx_rdma_ucast_bytes": 0, + "rx_rdma_ucast_pkts": 0, + "rx_rdma_mcast_bytes": 0, + "rx_rdma_mcast_pkts": 0, + "rx_rdma_cnp_pkts": 0, + "rx_rdma_ecn_pkts": 0, + "req_rx_pkt_seq_err": 0, + "req_rx_rnr_retry_err": 0, + "req_rx_rmt_acc_err": 0, + "req_rx_rmt_req_err": 0, + "req_rx_oper_err": 0, + "req_rx_impl_nak_seq_err": 0, + "req_rx_cqe_err": 0, + "req_rx_cqe_flush": 0, + "req_rx_dup_response": 0, + "req_rx_inval_pkts": 0, + "req_tx_loc_acc_err": 0, + "req_tx_loc_oper_err": 0, + "req_tx_mem_mgmt_err": 0, + "req_tx_retry_excd_err": 0, + "req_tx_loc_sgl_inv_err": 0, + "resp_rx_dup_request": 0, + "resp_rx_outof_buf": 0, + "resp_rx_outouf_seq": 0, + "resp_rx_cqe_err": 0, + "resp_rx_cqe_flush": 0, + "resp_rx_loc_len_err": 0, + "resp_rx_inval_request": 0, + "resp_rx_loc_oper_err": 0, + "resp_rx_outof_atomic": 0, + "resp_tx_pkt_seq_err": 0, + "resp_tx_rmt_inval_req_err": 0, + "resp_tx_rmt_acc_err": 0, + "resp_tx_rmt_oper_err": 0, + "resp_tx_rnr_retry_err": 0, + "resp_tx_loc_sgl_inv_err": 0, + "resp_rx_s0_table_err": 0, + "tx_rdma_ccl_cts_bytes": 0, + "tx_rdma_ccl_cts_pkts": 0, + "rx_rdma_ccl_cts_bytes": 0, + "rx_rdma_ccl_cts_pkts": 0, + "resp_rx_ccl_cts_outouf_seq": 0, + "tx_rdma_ack_timeout": 0, + "tx_rdma_ccl_cts_ack_timeout": 0, + 
"tx_rdma_retx_bytes": 0, + "tx_rdma_retx_pkts": 0, + "tx_rdma_ccl_cts_retx_bytes": 0, + "tx_rdma_ccl_cts_retx_pkts": 0, + "rx_rdma_mtu_discard_pkts": 0 + }, + { + "ifname": "mlx5_0", + "port": 1, + "rx_write_requests": 0, + "rx_read_requests": 0, "rx_atomic_requests": 0, - "rx_read_requests": 2993618119, - "rx_read_resp": 2993618119, - "rx_write_requests": 448485514, - "rx_send_req": 37687, - "rx_good_pkts": 2876478595, - "rx_good_bytes": 1185181414, + "rx_dct_connect": 0, "out_of_buffer": 0, - "np_cnp_sent": 3525492995, - "rp_cnp_handled": 2405320271, - "np_ecn_marked_roce_packets": 3525492995, "out_of_sequence": 0, - "pacing_reschedule": 0, - "pacing_complete": 0, - "pacing_alerts": 0, - "db_fifo_register": 2147450881, - "req_cqe_error": 0, - "req_cqe_flush_error": 0, + "duplicate_request": 0, + "rnr_nak_retry_err": 0, + "packet_seq_err": 0, + "implied_nak_seq_err": 0, + "local_ack_timeout_err": 0, + "resp_local_length_error": 0, "resp_cqe_error": 0, - "resp_cqe_flush_error": 0, + "req_cqe_error": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, "resp_remote_access_errors": 0, + "resp_cqe_flush_error": 0, + "req_cqe_flush_error": 0, "roce_adp_retrans": 0, "roce_adp_retrans_to": 0, "roce_slow_restart": 0, "roce_slow_restart_cnps": 0, "roce_slow_restart_trans": 0, "rp_cnp_ignored": 0, + "rp_cnp_handled": 0, + "np_ecn_marked_roce_packets": 0, + "np_cnp_sent": 0, "rx_icrc_encapsulated": 0 }, { - "ifname": "bnxt_re5", + "ifname": "mlx5_1", "port": 1, - "active_pds": 1, - "active_ahs": 0, - "active_qps": 1, - "active_rc_qps": 0, - "active_ud_qps": 0, - "active_srqs": 0, - "active_cqs": 1, - "active_mrs": 0, - "active_mws": 0, - "watermark_pds": 13, - "watermark_ahs": 7, - "watermark_qps": 228, - "watermark_rc_qps": 219, - "watermark_ud_qps": 8, - "watermark_srqs": 8, - "watermark_cqs": 94, - "watermark_mrs": 287, - "watermark_mws": 0, - "rx_pkts": 3602164391, - "rx_bytes": 515322372, - "tx_pkts": 3498885620, - "tx_bytes": 3601952844, - 
"recoverable_errors": 0, - "tx_roce_errors": 0, - "tx_roce_discards": 0, - "rx_roce_errors": 0, - "rx_roce_discards": 0, - "local_ack_timeout_err": 0, - "packet_seq_err": 0, - "max_retry_exceeded": 0, - "rnr_nak_retry_err": 0, - "implied_nak_seq_err": 0, - "unrecoverable_err": 0, - "bad_resp_err": 0, - "local_qp_op_err": 0, - "local_protection_err": 0, - "mem_mgmt_op_err": 0, - "req_remote_invalid_request": 0, - "req_remote_access_errors": 0, - "remote_op_err": 0, - "duplicate_request": 0, - "res_exceed_max": 0, - "resp_local_length_error": 0, - "res_exceeds_wqe": 0, - "res_opcode_err": 0, - "res_rx_invalid_rkey": 0, - "res_rx_domain_err": 0, - "res_rx_no_perm": 0, - "res_rx_range_err": 0, - "res_tx_invalid_rkey": 0, - "res_tx_domain_err": 0, - "res_tx_no_perm": 0, - "res_tx_range_err": 0, - "res_irrq_oflow": 0, - "res_unsup_opcode": 0, - "res_unaligned_atomic": 0, - "res_rem_inv_err": 0, - "res_mem_err": 0, - "res_srq_err": 0, - "res_cmp_err": 0, - "res_invalid_dup_rkey": 0, - "res_wqe_format_err": 0, - "res_cq_load_err": 0, - "res_srq_load_err": 0, - "res_tx_pci_err": 0, - "res_rx_pci_err": 0, - "tx_atomic_req": 0, - "tx_read_req": 2883798845, - "tx_read_resp": 2883798845, - "tx_write_req": 1822414941, - "tx_send_req": 0, + "rx_write_requests": 0, + "rx_read_requests": 0, "rx_atomic_requests": 0, - "rx_read_requests": 2883798845, - "rx_read_resp": 2883798845, - "rx_write_requests": 1819507161, - "rx_send_req": 0, - "rx_good_pkts": 1576292710, - "rx_good_bytes": 515322372, + "rx_dct_connect": 0, "out_of_buffer": 0, - "np_cnp_sent": 4093842522, - "rp_cnp_handled": 2025871681, - "np_ecn_marked_roce_packets": 4093842522, "out_of_sequence": 0, - "pacing_reschedule": 0, - "pacing_complete": 0, - "pacing_alerts": 0, - "db_fifo_register": 2147450881, - "req_cqe_error": 0, - "req_cqe_flush_error": 0, + "duplicate_request": 0, + "rnr_nak_retry_err": 0, + "packet_seq_err": 0, + "implied_nak_seq_err": 0, + "local_ack_timeout_err": 0, + "resp_local_length_error": 0, 
"resp_cqe_error": 0, - "resp_cqe_flush_error": 0, + "req_cqe_error": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, "resp_remote_access_errors": 0, + "resp_cqe_flush_error": 0, + "req_cqe_flush_error": 0, "roce_adp_retrans": 0, "roce_adp_retrans_to": 0, "roce_slow_restart": 0, "roce_slow_restart_cnps": 0, "roce_slow_restart_trans": 0, "rp_cnp_ignored": 0, + "rp_cnp_handled": 0, + "np_ecn_marked_roce_packets": 0, + "np_cnp_sent": 0, "rx_icrc_encapsulated": 0 }, { - "ifname": "bnxt_re6", + "ifname": "mlx5_2", "port": 1, - "active_pds": 1, - "active_ahs": 0, - "active_qps": 1, - "active_rc_qps": 0, - "active_ud_qps": 0, - "active_srqs": 0, - "active_cqs": 1, - "active_mrs": 0, - "active_mws": 0, - "watermark_pds": 13, - "watermark_ahs": 7, - "watermark_qps": 230, - "watermark_rc_qps": 221, - "watermark_ud_qps": 8, - "watermark_srqs": 8, - "watermark_cqs": 95, - "watermark_mrs": 294, - "watermark_mws": 0, - "rx_pkts": 2577272275, - "rx_bytes": 2249875450, - "tx_pkts": 2452138468, - "tx_bytes": 700557582, - "recoverable_errors": 0, - "tx_roce_errors": 0, - "tx_roce_discards": 0, - "rx_roce_errors": 0, - "rx_roce_discards": 0, - "local_ack_timeout_err": 0, - "packet_seq_err": 0, - "max_retry_exceeded": 0, - "rnr_nak_retry_err": 0, - "implied_nak_seq_err": 0, - "unrecoverable_err": 0, - "bad_resp_err": 0, - "local_qp_op_err": 0, - "local_protection_err": 0, - "mem_mgmt_op_err": 0, - "req_remote_invalid_request": 0, - "req_remote_access_errors": 0, - "remote_op_err": 0, - "duplicate_request": 0, - "res_exceed_max": 0, - "resp_local_length_error": 0, - "res_exceeds_wqe": 0, - "res_opcode_err": 0, - "res_rx_invalid_rkey": 0, - "res_rx_domain_err": 0, - "res_rx_no_perm": 0, - "res_rx_range_err": 0, - "res_tx_invalid_rkey": 0, - "res_tx_domain_err": 0, - "res_tx_no_perm": 0, - "res_tx_range_err": 0, - "res_irrq_oflow": 0, - "res_unsup_opcode": 0, - "res_unaligned_atomic": 0, - "res_rem_inv_err": 0, - "res_mem_err": 0, - "res_srq_err": 0, - 
"res_cmp_err": 0, - "res_invalid_dup_rkey": 0, - "res_wqe_format_err": 0, - "res_cq_load_err": 0, - "res_srq_load_err": 0, - "res_tx_pci_err": 0, - "res_rx_pci_err": 0, - "tx_atomic_req": 0, - "tx_read_req": 2775090592, - "tx_read_resp": 2775090592, - "tx_write_req": 3201764210, - "tx_send_req": 0, + "rx_write_requests": 0, + "rx_read_requests": 0, "rx_atomic_requests": 0, - "rx_read_requests": 2775090592, - "rx_read_resp": 2775090592, - "rx_write_requests": 3201655162, - "rx_send_req": 0, - "rx_good_pkts": 1197866395, - "rx_good_bytes": 2249875450, + "rx_dct_connect": 0, "out_of_buffer": 0, - "np_cnp_sent": 2401103251, - "rp_cnp_handled": 1379405880, - "np_ecn_marked_roce_packets": 2401103251, "out_of_sequence": 0, - "pacing_reschedule": 0, - "pacing_complete": 0, - "pacing_alerts": 0, - "db_fifo_register": 2147450881, - "req_cqe_error": 0, - "req_cqe_flush_error": 0, + "duplicate_request": 0, + "rnr_nak_retry_err": 0, + "packet_seq_err": 0, + "implied_nak_seq_err": 0, + "local_ack_timeout_err": 0, + "resp_local_length_error": 0, "resp_cqe_error": 0, - "resp_cqe_flush_error": 0, + "req_cqe_error": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, "resp_remote_access_errors": 0, + "resp_cqe_flush_error": 0, + "req_cqe_flush_error": 0, "roce_adp_retrans": 0, "roce_adp_retrans_to": 0, "roce_slow_restart": 0, "roce_slow_restart_cnps": 0, "roce_slow_restart_trans": 0, "rp_cnp_ignored": 0, + "rp_cnp_handled": 0, + "np_ecn_marked_roce_packets": 0, + "np_cnp_sent": 0, "rx_icrc_encapsulated": 0 }, { - "ifname": "bnxt_re7", + "ifname": "mlx5_3", "port": 1, - "active_pds": 1, - "active_ahs": 0, - "active_qps": 1, - "active_rc_qps": 0, - "active_ud_qps": 0, - "active_srqs": 0, - "active_cqs": 1, - "active_mrs": 0, - "active_mws": 0, - "watermark_pds": 13, - "watermark_ahs": 6, - "watermark_qps": 228, - "watermark_rc_qps": 219, - "watermark_ud_qps": 8, - "watermark_srqs": 8, - "watermark_cqs": 94, - "watermark_mrs": 287, - "watermark_mws": 0, - "rx_pkts": 
1606921676, - "rx_bytes": 4007942950, - "tx_pkts": 1249198409, - "tx_bytes": 25134278, - "recoverable_errors": 0, - "tx_roce_errors": 0, - "tx_roce_discards": 0, - "rx_roce_errors": 0, - "rx_roce_discards": 0, - "local_ack_timeout_err": 0, - "packet_seq_err": 0, - "max_retry_exceeded": 0, - "rnr_nak_retry_err": 0, - "implied_nak_seq_err": 0, - "unrecoverable_err": 0, - "bad_resp_err": 0, - "local_qp_op_err": 0, - "local_protection_err": 0, - "mem_mgmt_op_err": 0, - "req_remote_invalid_request": 0, - "req_remote_access_errors": 0, - "remote_op_err": 0, - "duplicate_request": 0, - "res_exceed_max": 0, - "resp_local_length_error": 0, - "res_exceeds_wqe": 0, - "res_opcode_err": 0, - "res_rx_invalid_rkey": 0, - "res_rx_domain_err": 0, - "res_rx_no_perm": 0, - "res_rx_range_err": 0, - "res_tx_invalid_rkey": 0, - "res_tx_domain_err": 0, - "res_tx_no_perm": 0, - "res_tx_range_err": 0, - "res_irrq_oflow": 0, - "res_unsup_opcode": 0, - "res_unaligned_atomic": 0, - "res_rem_inv_err": 0, - "res_mem_err": 0, - "res_srq_err": 0, - "res_cmp_err": 0, - "res_invalid_dup_rkey": 0, - "res_wqe_format_err": 0, - "res_cq_load_err": 0, - "res_srq_load_err": 0, - "res_tx_pci_err": 0, - "res_rx_pci_err": 0, - "tx_atomic_req": 0, - "tx_read_req": 2665758274, - "tx_read_resp": 2665758274, - "tx_write_req": 284646587, - "tx_send_req": 0, + "rx_write_requests": 0, + "rx_read_requests": 0, "rx_atomic_requests": 0, - "rx_read_requests": 2665758274, - "rx_read_resp": 2665758274, - "rx_write_requests": 284542358, - "rx_send_req": 0, - "rx_good_pkts": 253070639, - "rx_good_bytes": 4007942950, + "rx_dct_connect": 0, "out_of_buffer": 0, - "np_cnp_sent": 2670842510, - "rp_cnp_handled": 1353851037, - "np_ecn_marked_roce_packets": 2670842510, "out_of_sequence": 0, - "pacing_reschedule": 0, - "pacing_complete": 0, - "pacing_alerts": 0, - "db_fifo_register": 2147450881, - "req_cqe_error": 0, - "req_cqe_flush_error": 0, + "duplicate_request": 0, + "rnr_nak_retry_err": 0, + "packet_seq_err": 0, + 
"implied_nak_seq_err": 0, + "local_ack_timeout_err": 0, + "resp_local_length_error": 0, "resp_cqe_error": 0, - "resp_cqe_flush_error": 0, + "req_cqe_error": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, "resp_remote_access_errors": 0, + "resp_cqe_flush_error": 0, + "req_cqe_flush_error": 0, "roce_adp_retrans": 0, "roce_adp_retrans_to": 0, "roce_slow_restart": 0, "roce_slow_restart_cnps": 0, "roce_slow_restart_trans": 0, "rp_cnp_ignored": 0, + "rp_cnp_handled": 0, + "np_ecn_marked_roce_packets": 0, + "np_cnp_sent": 0, "rx_icrc_encapsulated": 0 } ] diff --git a/test/unit/plugin/test_rdma_collector.py b/test/unit/plugin/test_rdma_collector.py index 595e7e33..eb687c54 100644 --- a/test/unit/plugin/test_rdma_collector.py +++ b/test/unit/plugin/test_rdma_collector.py @@ -23,12 +23,13 @@ # SOFTWARE. # ############################################################################### +import json from pathlib import Path import pytest from nodescraper.connection.inband.inband import CommandArtifact -from nodescraper.enums import ExecutionStatus, OSFamily +from nodescraper.enums import EventPriority, ExecutionStatus, OSFamily from nodescraper.enums.systeminteraction import SystemInteractionLevel from nodescraper.plugins.inband.rdma.rdma_collector import RdmaCollector from nodescraper.plugins.inband.rdma.rdmadata import RdmaDataModel @@ -70,9 +71,10 @@ def test_collect_success(collector, conn_mock, rdma_link_output, rdma_statistic_ assert res.status == ExecutionStatus.OK assert data is not None assert isinstance(data, RdmaDataModel) - # Full statistic fixture has 8 devices (bnxt_re0..bnxt_re7) with full stats - assert len(data.statistic_list) == 8 - assert data.statistic_list[0].ifname == "bnxt_re0" + assert len(data.statistic_list) == 12 + assert data.statistic_list[0].ifname == "ionic_0" + assert data.statistic_list[0].vendor_statistics is not None + assert data.statistic_list[0].vendor_statistics.tx_rdma_ucast_bytes == 0 # Full link fixture has 4 
ionic links assert len(data.link_list) == 4 assert data.link_list[0].ifname == "ionic_0" @@ -154,3 +156,123 @@ def test_parse_rdma_link_text_empty(collector): """Test parsing empty rdma link (text) output.""" links = collector._parse_rdma_link_text("") assert len(links) == 0 + + +def test_collect_extra_fields_warning(collector, conn_mock): + """Extra keys in a statistic row produce a warning event but collection succeeds.""" + collector.system_info.os_family = OSFamily.LINUX + stat_data = [ + { + "ifname": "ionic_0", + "port": 1, + "tx_rdma_ucast_bytes": 0, + "unknown_field_1": 42, + "unknown_field_2": 99, + } + ] + conn_mock.run_command.side_effect = [ + CommandArtifact(exit_code=0, stdout="[]", stderr="", command="rdma link -j"), + CommandArtifact( + exit_code=0, + stdout=json.dumps(stat_data), + stderr="", + command="rdma statistic -j", + ), + CommandArtifact(exit_code=0, stdout="", stderr="", command="rdma dev"), + CommandArtifact(exit_code=0, stdout="", stderr="", command="rdma link"), + ] + res, data = collector.collect_data() + assert res.status == ExecutionStatus.OK + assert data is not None + assert len(data.statistic_list) == 1 + assert data.statistic_list[0].vendor_statistics is not None + extra_events = [e for e in res.events if "Unexpected fields" in e.description] + assert len(extra_events) == 1 + assert extra_events[0].priority == EventPriority.WARNING + assert "unknown_field_1" in extra_events[0].data["extra_fields"] + assert "unknown_field_2" in extra_events[0].data["extra_fields"] + + +def test_collect_missing_fields_warning(collector, conn_mock): + """Missing vendor fields produce a warning event.""" + collector.system_info.os_family = OSFamily.LINUX + stat_data = [{"ifname": "ionic_0", "port": 1, "tx_rdma_ucast_bytes": 0}] + conn_mock.run_command.side_effect = [ + CommandArtifact(exit_code=0, stdout="[]", stderr="", command="rdma link -j"), + CommandArtifact( + exit_code=0, + stdout=json.dumps(stat_data), + stderr="", + command="rdma statistic 
-j", + ), + CommandArtifact(exit_code=0, stdout="", stderr="", command="rdma dev"), + CommandArtifact(exit_code=0, stdout="", stderr="", command="rdma link"), + ] + res, data = collector.collect_data() + assert res.status == ExecutionStatus.OK + assert data is not None + assert len(data.statistic_list) == 1 + missing_events = [e for e in res.events if "Missing fields" in e.description] + assert len(missing_events) == 1 + assert missing_events[0].priority == EventPriority.WARNING + assert "tx_rdma_ucast_pkts" in missing_events[0].data["missing_fields"] + + +def test_collect_extra_and_missing_fields_warning(collector, conn_mock): + """Both extra and unknown vendor keys produce separate warnings (mlx).""" + collector.system_info.os_family = OSFamily.LINUX + stat_data = [ + { + "ifname": "mlx5_0", + "port": 1, + "rx_write_requests": 0, + "brand_new_counter": 7, + } + ] + conn_mock.run_command.side_effect = [ + CommandArtifact(exit_code=0, stdout="[]", stderr="", command="rdma link -j"), + CommandArtifact( + exit_code=0, + stdout=json.dumps(stat_data), + stderr="", + command="rdma statistic -j", + ), + CommandArtifact(exit_code=0, stdout="", stderr="", command="rdma dev"), + CommandArtifact(exit_code=0, stdout="", stderr="", command="rdma link"), + ] + res, data = collector.collect_data() + assert res.status == ExecutionStatus.OK + assert data is not None + extra_events = [e for e in res.events if "Unexpected fields" in e.description] + missing_events = [e for e in res.events if "Missing fields" in e.description] + assert len(extra_events) == 1 + assert len(missing_events) == 1 + assert "brand_new_counter" in extra_events[0].data["extra_fields"] + assert "rx_read_requests" in missing_events[0].data["missing_fields"] + + +def test_collect_no_field_warnings_when_fixture_matches( + collector, conn_mock, rdma_statistic_output +): + """Full fixture rows match vendor models: no missing/extra field warnings.""" + collector.system_info.os_family = OSFamily.LINUX + 
conn_mock.run_command.side_effect = [ + CommandArtifact(exit_code=0, stdout="[]", stderr="", command="rdma link -j"), + CommandArtifact( + exit_code=0, + stdout=rdma_statistic_output, + stderr="", + command="rdma statistic -j", + ), + CommandArtifact(exit_code=0, stdout="", stderr="", command="rdma dev"), + CommandArtifact(exit_code=0, stdout="", stderr="", command="rdma link"), + ] + res, data = collector.collect_data() + assert res.status == ExecutionStatus.OK + assert data is not None + drift = [ + e + for e in res.events + if "Unexpected fields" in e.description or "Missing fields" in e.description + ] + assert drift == [] From 5b73934a9e15d2d2d0a4eb192081144e04ce6fe3 Mon Sep 17 00:00:00 2001 From: niratner Date: Thu, 23 Apr 2026 12:08:52 -0400 Subject: [PATCH 19/25] Moved the functionality to update ErrorRegex priorities based on rules into a function which can be used whenever making new ErrorRegex object. This function is now being used after creating the 'Unknown dmesg errors' --- .../plugins/inband/dmesg/dmesg_analyzer.py | 64 +++++++++++++------ 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py index a790cfd0..5e49da60 100644 --- a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +++ b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py @@ -535,7 +535,32 @@ def _norm(s: str) -> str: return True return False - def resolve_priority( + def update_error_regex_priorities( + self, + error_regexes: list[ErrorRegex], + priority_override_rules: list[dict], + ) -> list[EventPriority]: + """Updates the priorities of a list of ErrorRegex options based on given priority rules + + Args: + error_regexes (list[ErrorRegex]): A list of ErrorRegex objects to have their priorities updated + priority_override_rules (list[dict]): The list of rules which determine what the updated priority should be + + Returns: + list[ErrorRegex]: A list of the same ErrorRegex 
objects but with their priorities updated + """ + + if priority_override_rules is None: + return error_regexes + + updated_error_regexes = [] + for regex_obj in error_regexes: + new_priority = self._resolve_priority(regex_obj, priority_override_rules) + regex_obj = regex_obj.model_copy(update={"event_priority": new_priority}) + updated_error_regexes.append(regex_obj) + return updated_error_regexes + + def _resolve_priority( self, regex_obj: ErrorRegex, priority_override_rules: list[dict], @@ -621,14 +646,9 @@ def analyze_data( args = DmesgAnalyzerArgs() final_error_regex = self._convert_and_extend_error_regex(args.error_regex, self.ERROR_REGEX) - - if args.priority_override_rules: - updated_regex = [] - for regex_obj in final_error_regex: - new_priority = self.resolve_priority(regex_obj, args.priority_override_rules) - regex_obj = regex_obj.model_copy(update={"event_priority": new_priority}) - updated_regex.append(regex_obj) - final_error_regex = updated_regex + final_error_regex = self.update_error_regex_priorities( + final_error_regex, args.priority_override_rules + ) # makes no changes if no rules are provided if args.analysis_range_start or args.analysis_range_end: self.logger.info( @@ -662,19 +682,25 @@ def analyze_data( self.result.events += known_err_events if args.check_unknown_dmesg_errors: + + unknown_dmesg_error_regexes = [ + ErrorRegex( + regex=re.compile( + r"kern :(?:err|crit|alert|emerg)\s+: \d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+ (.*)" + ), + message="Unknown dmesg error", + event_category=EventCategory.UNKNOWN, + event_priority=EventPriority.WARNING, + ) + ] + unknown_dmesg_error_regexes = self.update_error_regex_priorities( + unknown_dmesg_error_regexes, args.priority_override_rules + ) # makes no changes if no rules are provided + err_events = self.check_all_regexes( content=dmesg_content, source="dmesg", - error_regex=[ - ErrorRegex( - regex=re.compile( - r"kern :(?:err|crit|alert|emerg)\s+: \d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+ (.*)" - ), - 
message="Unknown dmesg error", - event_category=EventCategory.UNKNOWN, - event_priority=EventPriority.WARNING, - ) - ], + error_regex=unknown_dmesg_error_regexes, num_timestamps=args.num_timestamps, interval_to_collapse_event=args.interval_to_collapse_event, ) From 29aece1099fb3266abc9f31919465f1c47218f28 Mon Sep 17 00:00:00 2001 From: niratner Date: Thu, 23 Apr 2026 12:49:23 -0400 Subject: [PATCH 20/25] Updated comments --- nodescraper/plugins/inband/dmesg/dmesg_analyzer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py index 5e49da60..cbc14a81 100644 --- a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +++ b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py @@ -540,7 +540,7 @@ def update_error_regex_priorities( error_regexes: list[ErrorRegex], priority_override_rules: list[dict], ) -> list[EventPriority]: - """Updates the priorities of a list of ErrorRegex options based on given priority rules + """Updates the priorities of a list of ErrorRegex objects based on given priority rules Args: error_regexes (list[ErrorRegex]): A list of ErrorRegex objects to have their priorities updated @@ -648,7 +648,7 @@ def analyze_data( final_error_regex = self._convert_and_extend_error_regex(args.error_regex, self.ERROR_REGEX) final_error_regex = self.update_error_regex_priorities( final_error_regex, args.priority_override_rules - ) # makes no changes if no rules are provided + ) # updates the priorities of the ErrorRegex objects using the given rules. makes no changes if no rules are provided. 
if args.analysis_range_start or args.analysis_range_end: self.logger.info( @@ -682,7 +682,6 @@ def analyze_data( self.result.events += known_err_events if args.check_unknown_dmesg_errors: - unknown_dmesg_error_regexes = [ ErrorRegex( regex=re.compile( @@ -695,7 +694,7 @@ def analyze_data( ] unknown_dmesg_error_regexes = self.update_error_regex_priorities( unknown_dmesg_error_regexes, args.priority_override_rules - ) # makes no changes if no rules are provided + ) # updates the priorities of the ErrorRegex objects using the given rules. makes no changes if no rules are provided. err_events = self.check_all_regexes( content=dmesg_content, From 69ba66aac4068a785756e4efb4a5f386909cb0e9 Mon Sep 17 00:00:00 2001 From: niratner Date: Thu, 23 Apr 2026 13:02:53 -0400 Subject: [PATCH 21/25] updated unit tests with new priority override logic --- test/unit/plugin/test_dmesg_analyzer.py | 43 ++++++++++++++++++------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/test/unit/plugin/test_dmesg_analyzer.py b/test/unit/plugin/test_dmesg_analyzer.py index 7aaeb850..8509c340 100644 --- a/test/unit/plugin/test_dmesg_analyzer.py +++ b/test/unit/plugin/test_dmesg_analyzer.py @@ -720,7 +720,7 @@ def test_resolve_priority_no_match(system_info): event_category=EventCategory.RAS, ) rules = [{"event_category": "SW_DRIVER", "new_priority": "WARNING"}] - assert analyzer.resolve_priority(regex_obj, rules) == EventPriority.ERROR + assert analyzer._resolve_priority(regex_obj, rules) == EventPriority.ERROR def test_resolve_priority_match_by_category(system_info): @@ -732,7 +732,7 @@ def test_resolve_priority_match_by_category(system_info): event_category=EventCategory.RAS, ) rules = [{"event_category": "RAS", "new_priority": "WARNING"}] - result = analyzer.resolve_priority(regex_obj, rules) + result = analyzer._resolve_priority(regex_obj, rules) assert result == EventPriority.WARNING @@ -750,7 +750,7 @@ def test_resolve_priority_match_by_message_list(system_info): "new_priority": 
"WARNING", } ] - result = analyzer.resolve_priority(regex_obj, rules) + result = analyzer._resolve_priority(regex_obj, rules) assert result == EventPriority.WARNING @@ -763,7 +763,7 @@ def test_resolve_priority_no_change(system_info): event_category=EventCategory.RAS, ) rules = [{"event_category": "RAS", "new_priority": "NO_CHANGE"}] - assert analyzer.resolve_priority(regex_obj, rules) == EventPriority.ERROR + assert analyzer._resolve_priority(regex_obj, rules) == EventPriority.ERROR def test_resolve_priority_first_match_wins(system_info): @@ -778,7 +778,7 @@ def test_resolve_priority_first_match_wins(system_info): {"event_category": "RAS", "new_priority": "WARNING"}, {"event_category": "RAS", "new_priority": "ERROR"}, ] - result = analyzer.resolve_priority(regex_obj, rules) + result = analyzer._resolve_priority(regex_obj, rules) assert result == EventPriority.WARNING @@ -794,13 +794,13 @@ def test_resolve_priority_multiple_filter_fields(system_info): rules = [ {"event_category": "RAS", "message": "GPU reset failed", "new_priority": "WARNING"}, ] - assert analyzer.resolve_priority(regex_obj, rules) == EventPriority.WARNING + assert analyzer._resolve_priority(regex_obj, rules) == EventPriority.WARNING # Does NOT match because message differs → returns original priority rules_mismatch = [ {"event_category": "RAS", "message": "ACA Error", "new_priority": "WARNING"}, ] - assert analyzer.resolve_priority(regex_obj, rules_mismatch) == EventPriority.ERROR + assert analyzer._resolve_priority(regex_obj, rules_mismatch) == EventPriority.ERROR def test_resolve_priority_match_all_matches_any_regex(system_info): @@ -818,7 +818,7 @@ def test_resolve_priority_match_all_matches_any_regex(system_info): event_category=EventCategory.SW_DRIVER, ), ]: - result = analyzer.resolve_priority( + result = analyzer._resolve_priority( regex_obj, [{"match_all": True, "new_priority": "WARNING"}] ) assert ( @@ -836,7 +836,7 @@ def 
test_resolve_priority_match_all_ignores_non_matching_filters(system_info): ) # event_category is RAS, but filter says SW_DRIVER — would normally NOT match. # match_all=True should bypass this check and still apply the rule. - result = analyzer.resolve_priority( + result = analyzer._resolve_priority( regex_obj, [{"match_all": True, "event_category": "SW_DRIVER", "new_priority": "WARNING"}], ) @@ -852,14 +852,14 @@ def test_resolve_priority_match_all_false_still_filters(system_info): event_category=EventCategory.RAS, ) # match_all=False with a non-matching filter → returns original priority - result = analyzer.resolve_priority( + result = analyzer._resolve_priority( regex_obj, [{"match_all": False, "event_category": "SW_DRIVER", "new_priority": "WARNING"}], ) assert result == EventPriority.ERROR # match_all=False with a matching filter → should match - result = analyzer.resolve_priority( + result = analyzer._resolve_priority( regex_obj, [{"match_all": False, "event_category": "RAS", "new_priority": "WARNING"}], ) @@ -953,3 +953,24 @@ def test_custom_regex_with_multiline_pattern(system_info): assert len(res.events) >= 1 start_events = [e for e in res.events if e.description == "Start Error Block"] assert len(start_events) == 1 + + +def test_priority_override_updates_unkown_dmesg_error(system_info): + """NO_CHANGE rule leaves the original event priority intact.""" + dmesg_data = DmesgData( + dmesg_content=("kern :err : 2024-10-07T10:17:15,145363-04:00 UNKOWN DMESG ERROR") + ) + + analyzer = DmesgAnalyzer(system_info=system_info) + res = analyzer.analyze_data( + dmesg_data, + args=DmesgAnalyzerArgs( + check_unknown_dmesg_errors=True, + priority_override_rules=[ + {"message": "Unknown dmesg error", "new_priority": "ERROR"}, + ], + ), + ) + + assert len(res.events) == 1 + assert res.events[0].priority == EventPriority.ERROR From 18ae2c7ba6a3f4329fdc49f67e55185fc7a2ea01 Mon Sep 17 00:00:00 2001 From: niratner Date: Thu, 23 Apr 2026 13:05:08 -0400 Subject: [PATCH 22/25] 
updated test comment --- test/unit/plugin/test_dmesg_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/plugin/test_dmesg_analyzer.py b/test/unit/plugin/test_dmesg_analyzer.py index 8509c340..67faaf05 100644 --- a/test/unit/plugin/test_dmesg_analyzer.py +++ b/test/unit/plugin/test_dmesg_analyzer.py @@ -956,7 +956,7 @@ def test_custom_regex_with_multiline_pattern(system_info): def test_priority_override_updates_unkown_dmesg_error(system_info): - """NO_CHANGE rule leaves the original event priority intact.""" + """Updating an 'Unknown dmesg error', which is added after the base ErrorRegex list, successfully changes its priority""" dmesg_data = DmesgData( dmesg_content=("kern :err : 2024-10-07T10:17:15,145363-04:00 UNKOWN DMESG ERROR") ) From a0a5449034c6e14d20bf9a2cd26a36f37b662afc Mon Sep 17 00:00:00 2001 From: jaspals Date: Thu, 23 Apr 2026 16:00:30 -0500 Subject: [PATCH 23/25] event category fix --- .../plugins/inband/rdma/rdma_collector.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/nodescraper/plugins/inband/rdma/rdma_collector.py b/nodescraper/plugins/inband/rdma/rdma_collector.py index a719a334..67f4073b 100644 --- a/nodescraper/plugins/inband/rdma/rdma_collector.py +++ b/nodescraper/plugins/inband/rdma/rdma_collector.py @@ -69,7 +69,7 @@ def _run_rdma_command(self, cmd: str) -> Optional[list[dict]]: if res.exit_code != 0: self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description=f"Error running rdma command: {cmd}", data={ "command": cmd, @@ -88,7 +88,7 @@ def _run_rdma_command(self, cmd: str) -> Optional[list[dict]]: return json.loads(res.stdout) except json.JSONDecodeError as e: self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description=f"Error parsing command: {cmd} json data", data={ "cmd": cmd, @@ -196,7 +196,7 @@ def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]: for stat in 
stat_data: if not isinstance(stat, dict): self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description="Invalid data type for RDMA statistic", data={"data_type": type(stat).__name__}, priority=EventPriority.WARNING, @@ -213,7 +213,7 @@ def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]: extra_fields = stat_fields - vendor_fields if extra_fields: self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description=f"Unexpected fields in RDMA statistic for {ifname}", data={ "interface": ifname, @@ -225,7 +225,7 @@ def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]: missing_fields = vendor_fields - stat_fields if missing_fields: self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description=f"Missing fields in RDMA statistic for {ifname}", data={ "interface": ifname, @@ -238,7 +238,7 @@ def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]: vendor_stats = vendor_cls(**stat) except ValidationError as ve: self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description=f"Failed to build vendor model for {ifname}", data={"exception": get_exception_traceback(ve)}, priority=EventPriority.WARNING, @@ -254,7 +254,7 @@ def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]: return statistics except ValidationError as e: self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description="Failed to build RdmaStatistics model", data={"exception": get_exception_traceback(e)}, priority=EventPriority.WARNING, @@ -274,7 +274,7 @@ def _get_rdma_link(self) -> Optional[list[RdmaLink]]: for link in link_data: if not isinstance(link, dict): self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description="Invalid data type for RDMA link", data={"data_type": type(link).__name__}, priority=EventPriority.WARNING, @@ -284,7 +284,7 
@@ def _get_rdma_link(self) -> Optional[list[RdmaLink]]: return links except ValidationError as e: self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description="Failed to build RdmaLink model", data={"exception": get_exception_traceback(e)}, priority=EventPriority.WARNING, From 68d6894c14a933c1ed5dde7736ea036732291ab3 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 24 Apr 2026 08:17:21 -0500 Subject: [PATCH 24/25] sys info print when not none --- nodescraper/pluginexecutor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nodescraper/pluginexecutor.py b/nodescraper/pluginexecutor.py index 1782bb50..8a6998f2 100644 --- a/nodescraper/pluginexecutor.py +++ b/nodescraper/pluginexecutor.py @@ -93,8 +93,10 @@ def __init__( ) self.logger.info("System Name: %s", self.system_info.name) - self.logger.info("System SKU: %s", self.system_info.sku) - self.logger.info("System Platform: %s", self.system_info.platform) + if self.system_info.sku: + self.logger.info("System SKU: %s", self.system_info.sku) + if self.system_info.platform: + self.logger.info("System Platform: %s", self.system_info.platform) self.logger.info("System location: %s", self.system_info.location) @staticmethod From a1b1032abb634e483fc913aa2c0fed0829ec40e6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 25 Apr 2026 00:16:08 +0000 Subject: [PATCH 25/25] docs: Update plugin documentation [automated] --- README.md | 6 ++- docs/PLUGIN_DOC.md | 92 ---------------------------------------------- 2 files changed, 4 insertions(+), 94 deletions(-) diff --git a/README.md b/README.md index dd739f25..27c14577 100644 --- a/README.md +++ b/README.md @@ -113,8 +113,10 @@ options: --sys-platform STRING Specify system platform (default: None) --plugin-configs LIST - Comma-separated built-in names and/or plugin config JSON - paths. 
Built-in: NodeStatus, AllPlugins (default: None) + Comma-separated built-in names and/or plugin config + JSON paths (e.g. --plugin- + configs=NodeStatus,/path/c.json). Built-ins: + NodeStatus, AllPlugins (default: None) --system-config STRING Path to system config json (default: None) --connection-config STRING diff --git a/docs/PLUGIN_DOC.md b/docs/PLUGIN_DOC.md index baa51d5a..5e84641c 100644 --- a/docs/PLUGIN_DOC.md +++ b/docs/PLUGIN_DOC.md @@ -1693,98 +1693,6 @@ Check RDMA statistics for errors (RoCE and other RDMA error counters). **Link to code**: [rdma_analyzer.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/rdma/rdma_analyzer.py) -### Class Variables - -- **ERROR_FIELDS**: `[ - recoverable_errors, - tx_roce_errors, - tx_roce_discards, - rx_roce_errors, - rx_roce_discards, - local_ack_timeout_err, - packet_seq_err, - max_retry_exceeded, - rnr_nak_retry_err, - implied_nak_seq_err, - unrecoverable_err, - bad_resp_err, - local_qp_op_err, - local_protection_err, - mem_mgmt_op_err, - req_remote_invalid_request, - req_remote_access_errors, - remote_op_err, - duplicate_request, - res_exceed_max, - resp_local_length_error, - res_exceeds_wqe, - res_opcode_err, - res_rx_invalid_rkey, - res_rx_domain_err, - res_rx_no_perm, - res_rx_range_err, - res_tx_invalid_rkey, - res_tx_domain_err, - res_tx_no_perm, - res_tx_range_err, - res_irrq_oflow, - res_unsup_opcode, - res_unaligned_atomic, - res_rem_inv_err, - res_mem_err, - res_srq_err, - res_cmp_err, - res_invalid_dup_rkey, - res_wqe_format_err, - res_cq_load_err, - res_srq_load_err, - res_tx_pci_err, - res_rx_pci_err, - out_of_buffer, - out_of_sequence, - req_cqe_error, - req_cqe_flush_error, - resp_cqe_error, - resp_cqe_flush_error, - resp_remote_access_errors, - req_rx_pkt_seq_err, - req_rx_rnr_retry_err, - req_rx_rmt_acc_err, - req_rx_rmt_req_err, - req_rx_oper_err, - req_rx_impl_nak_seq_err, - req_rx_cqe_err, - req_rx_cqe_flush, - req_rx_dup_response, - req_rx_inval_pkts, - 
req_tx_loc_acc_err, - req_tx_loc_oper_err, - req_tx_mem_mgmt_err, - req_tx_retry_excd_err, - req_tx_loc_sgl_inv_err, - resp_rx_dup_request, - resp_rx_outof_buf, - resp_rx_outouf_seq, - resp_rx_cqe_err, - resp_rx_cqe_flush, - resp_rx_loc_len_err, - resp_rx_inval_request, - resp_rx_loc_oper_err, - resp_rx_outof_atomic, - resp_tx_pkt_seq_err, - resp_tx_rmt_inval_req_err, - resp_tx_rmt_acc_err, - resp_tx_rmt_oper_err, - resp_tx_rnr_retry_err, - resp_tx_loc_sgl_inv_err, - resp_rx_s0_table_err, - resp_rx_ccl_cts_outouf_seq, - tx_rdma_ack_timeout, - tx_rdma_ccl_cts_ack_timeout, - rx_rdma_mtu_discard_pkts -]` -- **CRITICAL_ERROR_FIELDS**: `['unrecoverable_err', 'res_tx_pci_err', 'res_rx_pci_err', 'res_mem_err']` - ## Data Analyzer Class RocmAnalyzer ### Description