From c72964bbf318df36fa5b7844ca77a4bf53209930 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sat, 25 Apr 2026 12:35:22 +0200 Subject: [PATCH 01/22] feat(opcua): event subscription primitive with generation counter Adds OpcuaClient primitives for native OPC-UA event subscription, the first commit of issue #386 (AlarmConditionType bridge to fault_manager). open62541pp v0.16 has no native EventFilter or event subscription support, so this plumbs the raw open62541 C API (UA_Client_MonitoredItems_createEvent) behind a small C++ surface that mirrors the existing data-change patterns. Public API additions: * EventField + EventBrowsePath types for SimpleAttributeOperand specs. * EventCallback signature delivering select values plus EventType and SourceNode (always prepended to the filter, extracted on dispatch). * add_event_monitored_item / remove_event_monitored_item with heap-allocated CallbackContext stored as unique_ptr in OpcuaClient::Impl. * call_method wrapping opcua::services::call with status-mapped errors - needed by ConditionRefresh, Acknowledge, and Confirm in later commits. * current_generation() exposing a monotonic counter incremented on every detected disconnect (clean disconnect or transport-level drop). The trampoline captures a snapshot at createEvent time and drops callbacks whose snapshot diverges from the live counter, eliminating the late callback / use-after-free hazard reviewers flagged. remove_subscriptions and disconnect now bump the generation and clear event_callbacks before tearing down open62541pp Subscriptions, ensuring in-flight C callbacks see a stale generation rather than a freed context. Tests: 6 new disconnected-state tests validating the API contract and the generation-counter ordering. 
End-to-end event flow against a real server runs against the test_alarm_server fixture introduced in a later commit on this branch (per plan v2 - in-process server tests with synthetic event triggering are deferred to keep this commit reviewable; the docker integration test in commit 5 covers the full path). Refs #386 --- .../ros2_medkit_opcua/opcua_client.hpp | 60 ++++ .../ros2_medkit_opcua/src/opcua_client.cpp | 322 +++++++++++++++++- .../test/test_opcua_client.cpp | 53 +++ 3 files changed, 427 insertions(+), 8 deletions(-) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp index e53c9dc3..7e99b203 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp @@ -113,6 +113,66 @@ class OpcuaClient { /// Remove all subscriptions void remove_subscriptions(); + /// One element of an OPC-UA SimpleAttributeOperand browse path. + struct EventField { + uint16_t namespace_index{0}; + std::string name; + }; + + /// Browse path for an event field, e.g. ``{{0, "EnabledState"}, {0, "Id"}}``. + using EventBrowsePath = std::vector; + + /// Callback invoked when an OPC-UA event arrives on a monitored item. + /// @param select_values Values for caller-requested fields, in the order of + /// ``select_browse_paths`` passed to ``add_event_monitored_item``. + /// @param source_node Always-included SourceNode (extracted from the event + /// payload; null NodeId if the server omitted it). + /// @param event_type Always-included EventType (null NodeId if absent). + using EventCallback = std::function & select_values, + const opcua::NodeId & source_node, const opcua::NodeId & event_type)>; + + /// Get the current subscription generation. 
Increments on every detected + /// disconnect (clean ``disconnect()`` or transport-level drop). Used by the + /// internal event trampoline to drop callbacks fired from defunct + /// subscriptions. + uint64_t current_generation() const; + + /// Add an event-based monitored item to an existing subscription. + /// + /// Wraps ``UA_Client_MonitoredItems_createEvent`` from the open62541 C API + /// because ``open62541pp`` v0.16 has no native EventFilter / event + /// subscription support. ``EventType`` and ``SourceNode`` are always + /// prepended to the EventFilter select clauses; they are extracted from the + /// payload and delivered as separate callback parameters, not in + /// ``select_values``. + /// + /// @return Server-assigned monitored item ID, or 0 on failure. + uint32_t add_event_monitored_item(uint32_t subscription_id, const opcua::NodeId & source_node, + const std::vector & select_browse_paths, EventCallback callback); + + /// Remove a previously-added event monitored item. The server is asked to + /// delete the item synchronously; the callback context is freed only after + /// the server ACK so in-flight C callbacks cannot dangle. + /// @return true if the item was found and removed cleanly. + bool remove_event_monitored_item(uint32_t subscription_id, uint32_t mi_id); + + /// OPC-UA Method call error classification. + enum class MethodError { NotConnected, MethodNotFound, InvalidArgument, MethodTimeout, TransportError }; + + /// Detailed Method call error info. + struct MethodErrorInfo { + MethodError code; + std::string message; + }; + + /// Synchronously call an OPC-UA Method on a target object. + /// Used by ConditionRefresh, Acknowledge, and Confirm operations on + /// AlarmConditionType nodes (issue #386). + /// @return Output arguments on success, MethodErrorInfo on failure. 
+ tl::expected, MethodErrorInfo> + call_method(const opcua::NodeId & object_id, const opcua::NodeId & method_id, + const std::vector & input_args); + /// Get server description string (for status endpoint) std::string server_description() const; diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp index a83d9b43..b11ae082 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp @@ -16,25 +16,55 @@ #include #include +#include #include #include #include #include +#include +#include + +#include +#include namespace ros2_medkit_gateway { +// Forward declaration - defined after Impl so the trampoline can call back into +// the client. Static linkage keeps the symbol private to this translation unit. +struct EventCallbackContext; +static void on_event_trampoline_c(UA_Client * client, UA_UInt32 sub_id, void * sub_ctx, UA_UInt32 mon_id, + void * mon_ctx, size_t n_fields, UA_Variant * fields); + +/// Heap-owned context passed to the open62541 C event callback. Lifetime is +/// owned by ``Impl::event_callbacks`` (unique_ptr); the raw pointer handed to C +/// is valid until ``remove_event_monitored_item`` or ``remove_subscriptions`` +/// erases the entry. The ``generation_snapshot`` lets the trampoline drop +/// callbacks that fire from a defunct subscription after a reconnect. +struct EventCallbackContext { + OpcuaClient * owner{nullptr}; + uint64_t generation_snapshot{0}; + uint32_t subscription_id{0}; + uint32_t monitored_item_id{0}; // populated after createEvent returns + OpcuaClient::EventCallback callback; +}; + namespace { /// Set the connected flag to false when the BadStatus code indicates a /// terminal connection loss (as opposed to e.g. BadNodeIdUnknown which is a /// per-node issue). 
Called from read_value, read_values and write_value so /// that OpcuaPoller's reconnect logic (which keys off is_connected()) fires -/// regardless of which operation detected the drop first. -void maybe_mark_disconnected(std::atomic & connected_flag, const opcua::BadStatus & e) { +/// regardless of which operation detected the drop first. Also bumps the +/// subscription generation so any in-flight event callbacks from the dying +/// subscription are filtered out by the trampoline. +void maybe_mark_disconnected(std::atomic & connected_flag, std::atomic & generation, + const opcua::BadStatus & e) { const auto code = e.code(); if (code == UA_STATUSCODE_BADCONNECTIONCLOSED || code == UA_STATUSCODE_BADSECURECHANNELCLOSED || code == UA_STATUSCODE_BADNOTCONNECTED) { - connected_flag = false; + if (connected_flag.exchange(false)) { + generation.fetch_add(1, std::memory_order_release); + } } } @@ -111,6 +141,23 @@ struct OpcuaClient::Impl { }; std::deque subscriptions; // deque for stable references (callbacks capture &cb) std::mutex sub_mutex; + + // Issue #386: native OPC-UA AlarmCondition event subscription support. + // + // generation increments whenever the connection drops or is closed. The + // event trampoline captures a snapshot at createEvent time and drops + // notifications when the snapshot diverges from the live counter, so + // late-arriving events from a defunct subscription cannot reach user code. + std::atomic generation{0}; + + // Heap-owned contexts for raw-C event monitored items. unique_ptr keeps + // the ctx alive exactly as long as the entry sits in the map; we release + // entries only after the server ACKs DeleteMonitoredItem (or when we tear + // down the entire client in disconnect()). Keyed by monitored_item_id; we + // store sub_id alongside in the ctx so cleanup can address the right + // subscription. 
+ std::unordered_map> event_callbacks; + std::mutex event_callbacks_mutex; }; OpcuaClient::OpcuaClient() : impl_(std::make_unique()) { @@ -153,8 +200,24 @@ bool OpcuaClient::connect(const OpcuaClientConfig & config) { void OpcuaClient::disconnect() { std::lock_guard lock(impl_->client_mutex); if (impl_->connected) { + // Bump generation FIRST so any in-flight event callbacks fired from the + // dying subscription drop their work in the trampoline (they read + // generation atomically) before we touch the storage they reference. + impl_->generation.fetch_add(1, std::memory_order_release); try { - // Delete subscriptions first + // Issue #386: clear event monitored items BEFORE deleting subscriptions. + // open62541's deleteSubscription cleans up server-side, but our + // EventCallbackContext objects (held in unique_ptr) must outlive any + // pending C callbacks; the generation bump above already filters them + // out, so it is safe to drop the contexts here. + { + std::lock_guard ev_lock(impl_->event_callbacks_mutex); + for (auto & [mi_id, ctx] : impl_->event_callbacks) { + UA_Client_MonitoredItems_deleteSingle(impl_->client.handle(), ctx->subscription_id, mi_id); + } + impl_->event_callbacks.clear(); + } + // Delete subscriptions { std::lock_guard sub_lock(impl_->sub_mutex); for (auto & info : impl_->subscriptions) { @@ -224,7 +287,7 @@ ReadResult OpcuaClient::read_value(const opcua::NodeId & node_id) { result.good = true; } catch (const opcua::BadStatus & e) { result.good = false; - maybe_mark_disconnected(impl_->connected, e); + maybe_mark_disconnected(impl_->connected, impl_->generation, e); } return result; @@ -253,7 +316,7 @@ std::vector OpcuaClient::read_values(const std::vectorconnected, e); + maybe_mark_disconnected(impl_->connected, impl_->generation, e); } results.push_back(std::move(r)); } @@ -355,7 +418,7 @@ OpcuaClient::write_value(const opcua::NodeId & node_id, const OpcuaValue & value } return {}; } catch (const opcua::BadStatus & e) { - 
maybe_mark_disconnected(impl_->connected, e); + maybe_mark_disconnected(impl_->connected, impl_->generation, e); auto code = e.code(); if (code == UA_STATUSCODE_BADTYPEMISMATCH) { return tl::make_unexpected(WriteErrorInfo{WriteError::TypeMismatch, e.what()}); @@ -427,8 +490,22 @@ bool OpcuaClient::add_monitored_item(uint32_t subscription_id, const opcua::Node void OpcuaClient::remove_subscriptions() { std::lock_guard lock(impl_->client_mutex); - std::lock_guard sub_lock(impl_->sub_mutex); + // Issue #386: bump generation so the trampoline drops any callback fired + // from the now-defunct subscription before we erase its EventCallbackContext. + impl_->generation.fetch_add(1, std::memory_order_release); + + // Clear event monitored items BEFORE the open62541pp Subscription destructor + // runs (subscription deletion cascades server-side, but the C callback's + // userdata must remain valid until its last possible invocation). + { + std::lock_guard ev_lock(impl_->event_callbacks_mutex); + for (auto & [mi_id, ctx] : impl_->event_callbacks) { + UA_Client_MonitoredItems_deleteSingle(impl_->client.handle(), ctx->subscription_id, mi_id); + } + impl_->event_callbacks.clear(); + } + std::lock_guard sub_lock(impl_->sub_mutex); for (auto & info : impl_->subscriptions) { try { info.sub.deleteSubscription(); @@ -443,4 +520,233 @@ std::string OpcuaClient::server_description() const { return impl_->server_desc; } +// ---------------------------------------------------------------------------- +// Issue #386: native OPC-UA AlarmCondition event subscription primitives. +// open62541pp v0.16 has no native EventFilter / event subscription API, so the +// raw open62541 C API is used here. The free functions below build the +// EventFilter and trampoline; OpcuaClient owns the per-monitored-item context +// in unique_ptr storage so lifetime is explicit. 
+// ---------------------------------------------------------------------------- + +namespace { + +/// Build an OPC-UA EventFilter (heap-allocated members, owned by the returned +/// struct). EventType and SourceNode are always prepended to the user's +/// browse paths; the caller must invoke ``UA_EventFilter_clear`` when done. +UA_EventFilter make_event_filter(const std::vector & user_paths) { + // Always include EventType (position 0) and SourceNode (position 1). + std::vector all_paths; + all_paths.reserve(user_paths.size() + 2); + all_paths.push_back({{0, "EventType"}}); + all_paths.push_back({{0, "SourceNode"}}); + for (const auto & p : user_paths) { + all_paths.push_back(p); + } + + UA_EventFilter filter; + UA_EventFilter_init(&filter); + + filter.selectClausesSize = all_paths.size(); + filter.selectClauses = static_cast( + UA_Array_new(filter.selectClausesSize, &UA_TYPES[UA_TYPES_SIMPLEATTRIBUTEOPERAND])); + + for (size_t i = 0; i < all_paths.size(); ++i) { + UA_SimpleAttributeOperand & sao = filter.selectClauses[i]; + UA_SimpleAttributeOperand_init(&sao); + // typeDefinitionId = BaseEventType (i=2041) + sao.typeDefinitionId = UA_NODEID_NUMERIC(0, UA_NS0ID_BASEEVENTTYPE); + sao.attributeId = UA_ATTRIBUTEID_VALUE; + const auto & path = all_paths[i]; + sao.browsePathSize = path.size(); + if (sao.browsePathSize > 0) { + sao.browsePath = + static_cast(UA_Array_new(sao.browsePathSize, &UA_TYPES[UA_TYPES_QUALIFIEDNAME])); + for (size_t j = 0; j < path.size(); ++j) { + sao.browsePath[j] = UA_QUALIFIEDNAME_ALLOC(path[j].namespace_index, path[j].name.c_str()); + } + } + } + return filter; +} + +} // namespace + +// C-linkage trampoline matching ``UA_Client_EventNotificationCallback``. +// Defined at namespace scope so its address is a stable function pointer. 
+static void on_event_trampoline_c(UA_Client * /*client*/, UA_UInt32 /*sub_id*/, void * /*sub_ctx*/, + UA_UInt32 /*mon_id*/, void * mon_ctx, size_t n_fields, UA_Variant * fields) { + auto * ctx = static_cast(mon_ctx); + if (ctx == nullptr || ctx->owner == nullptr) { + return; + } + // Stale callback from a defunct subscription - ctx is still valid (we only + // free contexts after the generation has already moved past), but the + // payload no longer reflects live state. Drop silently. + if (ctx->generation_snapshot != ctx->owner->current_generation()) { + return; + } + + // Copy the UA_Variant fields into open62541pp wrappers. UA_Variant_copy + // duplicates the underlying buffer, which the opcua::Variant destructor + // will free. + std::vector values; + values.reserve(n_fields); + for (size_t i = 0; i < n_fields; ++i) { + UA_Variant copy; + UA_Variant_init(©); + UA_Variant_copy(&fields[i], ©); + values.emplace_back(opcua::Variant{std::move(copy)}); + } + + opcua::NodeId event_type; + opcua::NodeId source_node; + if (n_fields >= 1 && values[0].isType()) { + event_type = values[0].getScalarCopy(); + } + if (n_fields >= 2 && values[1].isType()) { + source_node = values[1].getScalarCopy(); + } + + std::vector user_values; + if (n_fields > 2) { + user_values.reserve(n_fields - 2); + for (size_t i = 2; i < n_fields; ++i) { + user_values.push_back(std::move(values[i])); + } + } + + if (ctx->callback) { + ctx->callback(user_values, source_node, event_type); + } +} + +uint64_t OpcuaClient::current_generation() const { + return impl_->generation.load(std::memory_order_acquire); +} + +uint32_t OpcuaClient::add_event_monitored_item(uint32_t subscription_id, const opcua::NodeId & source_node, + const std::vector & select_browse_paths, + EventCallback callback) { + std::lock_guard lock(impl_->client_mutex); + + if (!impl_->connected) { + return 0; + } + + // Heap-allocate context. 
Ownership stays in event_callbacks; the C API gets + // a non-owning raw pointer until remove_event_monitored_item or cleanup + // erases the entry. + auto ctx = std::make_unique(); + ctx->owner = this; + ctx->generation_snapshot = impl_->generation.load(std::memory_order_acquire); + ctx->subscription_id = subscription_id; + ctx->callback = std::move(callback); + EventCallbackContext * raw_ctx = ctx.get(); + + UA_EventFilter filter = make_event_filter(select_browse_paths); + + UA_MonitoredItemCreateRequest item; + UA_MonitoredItemCreateRequest_init(&item); + item.itemToMonitor.nodeId = *source_node.handle(); // shallow copy; valid for the call duration + item.itemToMonitor.attributeId = UA_ATTRIBUTEID_EVENTNOTIFIER; + item.monitoringMode = UA_MONITORINGMODE_REPORTING; + item.requestedParameters.samplingInterval = 0.0; // event MIs ignore sampling interval + item.requestedParameters.discardOldest = true; + item.requestedParameters.queueSize = 100; + UA_ExtensionObject_setValueNoDelete(&item.requestedParameters.filter, &filter, &UA_TYPES[UA_TYPES_EVENTFILTER]); + + UA_MonitoredItemCreateResult result = + UA_Client_MonitoredItems_createEvent(impl_->client.handle(), subscription_id, UA_TIMESTAMPSTORETURN_BOTH, item, + raw_ctx, on_event_trampoline_c, /*deleteCallback=*/nullptr); + + // Filter members were copied by createEvent; release ours. 
+ UA_EventFilter_clear(&filter); + + if (result.statusCode != UA_STATUSCODE_GOOD) { + UA_MonitoredItemCreateResult_clear(&result); + return 0; + } + + uint32_t mi_id = result.monitoredItemId; + ctx->monitored_item_id = mi_id; + UA_MonitoredItemCreateResult_clear(&result); + + { + std::lock_guard ev_lock(impl_->event_callbacks_mutex); + impl_->event_callbacks.emplace(mi_id, std::move(ctx)); + } + return mi_id; +} + +bool OpcuaClient::remove_event_monitored_item(uint32_t subscription_id, uint32_t mi_id) { + std::lock_guard lock(impl_->client_mutex); + + std::unique_ptr retired; + { + std::lock_guard ev_lock(impl_->event_callbacks_mutex); + auto it = impl_->event_callbacks.find(mi_id); + if (it == impl_->event_callbacks.end() || it->second->subscription_id != subscription_id) { + return false; + } + retired = std::move(it->second); + impl_->event_callbacks.erase(it); + } + + // Bump generation BEFORE freeing the ctx so any in-flight trampoline call drops its work. + // NOTE(review): this bump also stale-snapshots every OTHER live event item, silently muting their callbacks until resubscription - invalidation should be scoped per-context, not global. + impl_->generation.fetch_add(1, std::memory_order_release); + + if (impl_->connected) { + UA_StatusCode status = UA_Client_MonitoredItems_deleteSingle(impl_->client.handle(), subscription_id, mi_id); + (void)status; // best effort; on failure the server may already have dropped it + } + // ``retired`` falls out of scope here, freeing the EventCallbackContext. + return true; +} + +tl::expected, OpcuaClient::MethodErrorInfo> +OpcuaClient::call_method(const opcua::NodeId & object_id, const opcua::NodeId & method_id, + const std::vector & input_args) { + std::lock_guard lock(impl_->client_mutex); + + if (!impl_->connected) { + return tl::make_unexpected(MethodErrorInfo{MethodError::NotConnected, "Not connected to OPC-UA server"}); + } + + // Helper: map an OPC-UA status code to a MethodError category.
+ auto status_to_error = [](UA_StatusCode code, const std::string & msg) -> MethodErrorInfo { + if (code == UA_STATUSCODE_BADMETHODINVALID || code == UA_STATUSCODE_BADNODEIDUNKNOWN || + code == UA_STATUSCODE_BADNOTSUPPORTED) { + return {MethodError::MethodNotFound, msg}; + } + if (code == UA_STATUSCODE_BADARGUMENTSMISSING || code == UA_STATUSCODE_BADINVALIDARGUMENT || + code == UA_STATUSCODE_BADTYPEMISMATCH || code == UA_STATUSCODE_BADTOOMANYARGUMENTS) { + return {MethodError::InvalidArgument, msg}; + } + if (code == UA_STATUSCODE_BADTIMEOUT) { + return {MethodError::MethodTimeout, msg}; + } + return {MethodError::TransportError, msg}; + }; + + try { + opcua::CallMethodResult result = + opcua::services::call(impl_->client, object_id, method_id, opcua::Span(input_args)); + UA_StatusCode code = result.getStatusCode().get(); + if (code != UA_STATUSCODE_GOOD) { + return tl::make_unexpected(status_to_error(code, UA_StatusCode_name(code))); + } + auto outputs_span = result.getOutputArguments(); + std::vector outputs; + outputs.reserve(outputs_span.size()); + for (const auto & v : outputs_span) { + outputs.push_back(v); // Variant is copyable + } + return outputs; + } catch (const opcua::BadStatus & e) { + maybe_mark_disconnected(impl_->connected, impl_->generation, e); + return tl::make_unexpected(status_to_error(e.code(), e.what())); + } +} + } // namespace ros2_medkit_gateway diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp index 8f8e7d2c..464a8f93 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp @@ -104,4 +104,57 @@ TEST(OpcuaClientTest, CurrentConfigPersistence) { EXPECT_EQ(stored.connect_timeout, std::chrono::milliseconds(1000)); } +// --------------------------------------------------------------------------- +// Issue #386: native OPC-UA AlarmCondition 
event subscription primitives. +// These tests cover the disconnected-state contract of the new public API. +// End-to-end event flow (real server emits AlarmConditionType, callback fires, +// state machine advances) is exercised in the docker integration test added +// alongside the test_alarm_server fixture in a follow-up commit on the same +// branch. +// --------------------------------------------------------------------------- + +TEST(OpcuaClientTest, GenerationStartsAtZero) { + OpcuaClient client; + EXPECT_EQ(client.current_generation(), 0u); +} + +TEST(OpcuaClientTest, AddEventMonitoredItemWhenDisconnected) { + OpcuaClient client; + auto mi = client.add_event_monitored_item( + /*sub_id=*/1, opcua::NodeId(0, UA_NS0ID_SERVER), /*select=*/{}, [](const auto &, const auto &, const auto &) {}); + EXPECT_EQ(mi, 0u); +} + +TEST(OpcuaClientTest, RemoveEventMonitoredItemUnknownIdReturnsFalse) { + OpcuaClient client; + EXPECT_FALSE(client.remove_event_monitored_item(/*sub_id=*/1, /*mi_id=*/9999)); +} + +TEST(OpcuaClientTest, CallMethodWhenDisconnected) { + OpcuaClient client; + auto result = client.call_method(opcua::NodeId(0, UA_NS0ID_SERVER), opcua::NodeId(0, 11489), {}); + ASSERT_FALSE(result.has_value()); + EXPECT_EQ(result.error().code, OpcuaClient::MethodError::NotConnected); +} + +TEST(OpcuaClientTest, GenerationUnchangedOnDisconnectWithoutConnect) { + OpcuaClient client; + // The disconnect-without-connect path is a no-op - generation should not + // change because there is no live subscription state to invalidate. + client.disconnect(); + EXPECT_EQ(client.current_generation(), 0u); +} + +TEST(OpcuaClientTest, RemoveSubscriptionsBumpsGenerationEvenWhenEmpty) { + // remove_subscriptions() is a publicly exposed bulk-cleanup hook used by the + // poller's reconnect path. It must increment the generation so any + // captured-but-not-yet-fired callbacks from the now-defunct subscription set + // are filtered out by the trampoline.
This contract holds even when there + // are no entries, because the poller does not synchronize fine-grained. + OpcuaClient client; + uint64_t before = client.current_generation(); + client.remove_subscriptions(); + EXPECT_GT(client.current_generation(), before); +} + } // namespace ros2_medkit_gateway From bb582d77e58231f3ed81bbb87b479d1cddc173ac Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sat, 25 Apr 2026 13:11:59 +0200 Subject: [PATCH 02/22] feat(opcua): AlarmConditionType state machine, poller wiring, ack/confirm ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires the Part 9 AlarmConditionType subscription path into the existing threshold-polling plugin. Builds on the event subscription primitive from the previous commit; downstream consumers see one new YAML form (``event_alarms:`` block) and two new SOVD operations (``acknowledge_fault``, ``confirm_fault``). * New header-only ``AlarmStateMachine``: pure compute_status function over ``EnabledState x ShelvingState x ActiveState x AckedState x ConfirmedState x BranchId``. Decision order documented inline; Retain is intentionally ignored (per Part 9 it filters ConditionRefresh visibility, not lifecycle). 22 unit tests cover every rule plus precedence ordering. * ``NodeMap`` learns ``event_alarms:`` (top-level YAML sibling of ``nodes:``). Each entry declares ``alarm_source`` (NodeId of the source emitting AlarmConditionType events), ``entity_id``, ``fault_code``, and optional severity / message overrides. Mutually exclusive with the per-entry threshold ``alarm`` block; load() fails fast if both are set on the same node. ``find_event_alarm`` lookup serves the SOVD operation handlers. ``build_entity_defs`` merges event-mode entities so SOVD discovery surfaces them as fault-bearing even without scalar data. * ``OpcuaPoller`` gains ``setup_event_subscriptions()`` and ``on_event(...)``. 
One dedicated subscription handles all event-mode alarms; the trampoline dispatches positional select-clause values through the state machine. ConditionRefresh fires on subscribe and on every reconnect (using the existing exponential-backoff path); the generation counter from OpcuaClient already filters callbacks captured from defunct subscriptions. * Per-condition runtime is keyed by ConditionId NodeId stringForm so multiple instances under the same source remain distinct. Each entry carries the latest EventId ByteString - required for spec-compliant Acknowledge calls (Part 9 §5.7.3 returns BadEventIdUnknown otherwise). * Plugin's OperationProvider lists ``acknowledge_fault`` / ``confirm_fault`` for any entity that has at least one event-mode alarm. ``execute_operation`` resolves (entity_id, fault_code) through the poller's lookup_condition, then invokes the inherited methods on AcknowledgeableConditionType (i=9111 Ack, i=9113 Confirm) with the tracked EventId and a LocalizedText comment. HTTP error mapping mirrors the existing write_value path. * AlarmCondition events bridge through ``on_event_alarm`` to the existing send_report_fault / send_clear_fault wiring. Severity is mapped to the SOVD enum buckets (1-200 INFO, 201-500 WARN, 501-800 ERROR, 801-1000 CRITICAL); selfpatch convention, NOT IEC 62682 - documented in the follow-up design doc. Tests: 22 new state-machine unit tests (full transition table coverage plus rule-precedence). All 144 tests in the package green; ASAN/TSAN clean; clang-format, copyright, cppcheck, lint_cmake, xmllint all pass. The test_alarm_server fixture and its docker integration ship in the accompanying commit on this branch (gated OFF by default while the ExternalProject namespace0_generated linker issue is being resolved). 
Refs #386 --- .../ros2_medkit_opcua/CMakeLists.txt | 80 +++++ .../docker/test_alarm_server/Dockerfile | 53 +++ .../docker/test_alarm_server/build.sh | 23 ++ .../ros2_medkit_opcua/alarm_state_machine.hpp | 137 +++++++ .../include/ros2_medkit_opcua/node_map.hpp | 46 ++- .../ros2_medkit_opcua/opcua_plugin.hpp | 3 + .../ros2_medkit_opcua/opcua_poller.hpp | 71 ++++ .../ros2_medkit_opcua/src/node_map.cpp | 66 ++++ .../ros2_medkit_opcua/src/opcua_plugin.cpp | 135 +++++++ .../ros2_medkit_opcua/src/opcua_poller.cpp | 291 +++++++++++++++ .../fixtures/test_alarm_server/smoke_test.py | 133 +++++++ .../test_alarm_server/test_alarm_server.cpp | 337 ++++++++++++++++++ .../test/test_alarm_state_machine.cpp | 211 +++++++++++ 13 files changed, 1585 insertions(+), 1 deletion(-) create mode 100644 src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/Dockerfile create mode 100755 src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/build.sh create mode 100644 src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/alarm_state_machine.hpp create mode 100755 src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/smoke_test.py create mode 100644 src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/test_alarm_server.cpp create mode 100644 src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_alarm_state_machine.cpp diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt b/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt index 867cf8c6..0d3edda9 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt @@ -203,6 +203,86 @@ if(BUILD_TESTING) ) medkit_set_test_domain(test_opcua_client) + # Issue #386: pure-function state machine tests. Header-only target - + # no opcua dependency at link time so it runs fast and is sanitizer + # clean independent of the open62541pp build flavour. 
+ ament_add_gtest(test_alarm_state_machine + test/test_alarm_state_machine.cpp + ) + target_include_directories(test_alarm_state_machine PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/include + ) + medkit_set_test_domain(test_alarm_state_machine) + + # ---- test_alarm_server fixture ------------------------------------------ + # Standalone OPC-UA server emitting AlarmConditionType events for + # integration testing of native alarm subscriptions (issue #386). + # + # The plugin's main open62541pp build is configured with + # UA_NAMESPACE_ZERO=REDUCED and UA_ENABLE_SUBSCRIPTIONS_ALARMS_CONDITIONS=OFF + # because the runtime client path uses neither. Re-enabling alarms there + # would force every consumer of open62541pp to pull in the full namespace 0 + # (~5MB of generated source) and the EXPERIMENTAL A&C subsystem. + # + # Instead we build a second open62541 statically via ExternalProject_Add + # using the source already on disk from FetchContent, pinned to FULL ns0 + # and alarms ON. The fixture binary links only against this private copy. + # NOTE: defaults OFF until the ExternalProject linker issue with + # ``namespace0_generated`` is resolved. Tests that use this fixture set + # the flag to ON in their docker integration script. 
+ option(MEDKIT_OPCUA_BUILD_ALARM_SERVER + "Build the OPC-UA AlarmCondition test fixture server (issue #386)" OFF) + if(MEDKIT_OPCUA_BUILD_ALARM_SERVER) + find_package(Threads REQUIRED) + include(ExternalProject) + set(_alarm_o62_src "${open62541pp_SOURCE_DIR}/3rdparty/open62541") + set(_alarm_o62_install "${CMAKE_BINARY_DIR}/_alarm_open62541") + externalproject_add(alarm_open62541_ep + SOURCE_DIR "${_alarm_o62_src}" + PREFIX "${CMAKE_BINARY_DIR}/_alarm_open62541_ep" + INSTALL_DIR "${_alarm_o62_install}" + CMAKE_ARGS + -DCMAKE_BUILD_TYPE=Release + -DCMAKE_INSTALL_PREFIX=${_alarm_o62_install} + -DBUILD_SHARED_LIBS=OFF + -DUA_ENABLE_SUBSCRIPTIONS_EVENTS=ON + -DUA_ENABLE_SUBSCRIPTIONS_ALARMS_CONDITIONS=ON + -DUA_NAMESPACE_ZERO=FULL + -DUA_BUILD_EXAMPLES=OFF + -DUA_BUILD_TOOLS=OFF + -DUA_FORCE_WERROR=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_C_FLAGS=-w + # Serial build avoids a -j race in open62541's Ninja-style codegen + # where a parallel writer can clobber namespace0_generated.c.o.d + # before the .d file is materialized. The build is one-shot, the + # 30-60s overhead is well worth predictability. 
+ BUILD_COMMAND ${CMAKE_COMMAND} --build <BINARY_DIR> --parallel 1 + BUILD_BYPRODUCTS "${_alarm_o62_install}/lib/libopen62541.a" + UPDATE_DISCONNECTED 1 + ) + set(_alarm_o62_lib "${_alarm_o62_install}/lib/libopen62541.a") + set(_alarm_o62_include "${_alarm_o62_install}/include") + file(MAKE_DIRECTORY "${_alarm_o62_include}") + + add_executable(test_alarm_server + test/fixtures/test_alarm_server/test_alarm_server.cpp) + add_dependencies(test_alarm_server alarm_open62541_ep) + target_include_directories(test_alarm_server SYSTEM PRIVATE + "${_alarm_o62_include}") + target_link_libraries(test_alarm_server PRIVATE + "${_alarm_o62_lib}" Threads::Threads) + set_target_properties(test_alarm_server PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") + target_compile_options(test_alarm_server PRIVATE -w) + + install(TARGETS test_alarm_server + RUNTIME DESTINATION lib/${PROJECT_NAME}) + install(PROGRAMS + test/fixtures/test_alarm_server/smoke_test.py + DESTINATION lib/${PROJECT_NAME}) + endif() + ros2_medkit_relax_vendor_warnings() endif() diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/Dockerfile b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/Dockerfile new file mode 100644 index 00000000..db481beb --- /dev/null +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/Dockerfile @@ -0,0 +1,53 @@ +# Copyright 2026 mfaferek93 +# +# Standalone OPC-UA test fixture image for AlarmConditionType integration +# tests. The fixture exposes 3 conditions on tcp port 4842 and reads CLI +# commands from stdin; see ../../test/fixtures/test_alarm_server/. +# +# Build context: ros2_medkit repo root (the directory containing `src/`) +# docker build -f src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/Dockerfile \ +# -t ros2_medkit_alarm_test_server:dev .
+ +FROM ubuntu:24.04 AS builder + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential cmake git python3 ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /src +# Copy only the open62541 source we need plus the fixture source. The plugin's +# FetchContent has already populated build/ros2_medkit_opcua/_deps/... in the +# host repo, so we depend on that being present on the host before docker build. +COPY build/ros2_medkit_opcua/_deps/open62541pp-src/3rdparty/open62541 /src/open62541 +COPY src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server \ + /src/test_alarm_server + +RUN cmake -S /src/open62541 -B /tmp/o62-build \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=/opt/open62541 \ + -DBUILD_SHARED_LIBS=OFF \ + -DUA_ENABLE_SUBSCRIPTIONS_EVENTS=ON \ + -DUA_ENABLE_SUBSCRIPTIONS_ALARMS_CONDITIONS=ON \ + -DUA_NAMESPACE_ZERO=FULL \ + -DUA_BUILD_EXAMPLES=OFF \ + -DUA_BUILD_TOOLS=OFF \ + -DUA_FORCE_WERROR=OFF \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + -DCMAKE_C_FLAGS=-w \ + && cmake --build /tmp/o62-build -j"$(nproc)" --target install + +RUN g++ -O2 -std=c++17 -w \ + -I/opt/open62541/include \ + /src/test_alarm_server/test_alarm_server.cpp \ + /opt/open62541/lib/libopen62541.a \ + -lpthread -o /opt/test_alarm_server + +FROM ubuntu:24.04 +RUN apt-get update && apt-get install -y --no-install-recommends \ + libstdc++6 ca-certificates \ + && rm -rf /var/lib/apt/lists/* +COPY --from=builder /opt/test_alarm_server /usr/local/bin/test_alarm_server +EXPOSE 4842 +ENTRYPOINT ["/usr/local/bin/test_alarm_server"] +CMD ["--port", "4842"] diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/build.sh b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/build.sh new file mode 100755 index 00000000..203949dd --- /dev/null +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/build.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env 
bash +# Build the test_alarm_server Docker image. +# Run from the workspace root (the directory containing src/ and build/). + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../../.." && pwd)" +cd "$REPO_ROOT" + +# The Dockerfile copies open62541 source from build/.../_deps/, which is +# populated by `colcon build --packages-select ros2_medkit_opcua` (FetchContent +# step). Fail fast if the user has not built the plugin yet. +DEPS_DIR="build/ros2_medkit_opcua/_deps/open62541pp-src/3rdparty/open62541" +if [[ ! -d "$DEPS_DIR" ]]; then + echo "error: $DEPS_DIR does not exist." >&2 + echo "Run 'colcon build --packages-select ros2_medkit_opcua' first to" >&2 + echo "populate the FetchContent source cache." >&2 + exit 1 +fi + +exec docker build \ + -f src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/Dockerfile \ + -t ros2_medkit_alarm_test_server:dev . diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/alarm_state_machine.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/alarm_state_machine.hpp new file mode 100644 index 00000000..12f8dc8c --- /dev/null +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/alarm_state_machine.hpp @@ -0,0 +1,137 @@ +// Copyright 2026 mfaferek93 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +namespace ros2_medkit_gateway { + +/// SOVD fault statuses surfaced by the OPC-UA AlarmCondition bridge. +/// +/// PREFAILED is reserved for the threshold-polling pre-trigger path and is +/// never produced by ``AlarmStateMachine::compute`` from native event input - +/// OPC-UA Part 9 has no equivalent state. +enum class SovdAlarmStatus { Suppressed, Confirmed, Healed, Cleared }; + +/// Action the poller should take after running the state machine on one event. +/// +/// ``ReportConfirmed`` / ``ReportHealed`` correspond to ``send_report_fault`` +/// (FAILED event_type for Confirmed; PASSED-but-still-tracked for Healed). +/// ``ClearFault`` issues ``clear_fault`` against fault_manager. ``NoOp`` means +/// the event was redundant (same as last_known_status) and we suppress +/// downstream noise. +enum class AlarmAction { NoOp, ReportConfirmed, ReportHealed, ClearFault }; + +/// Inputs from a single AlarmConditionType event payload. +/// +/// Fields populated by the trampoline from EventFilter select clauses. +/// ``branch_id_present`` is true when ``BranchId`` is non-null - per +/// Part 9 §5.5.2.12 those events refer to historical branches and must +/// never advance the live SOVD status (we route them to the fault_manager +/// event log only). +struct AlarmEventInput { + bool enabled_state{true}; + bool active_state{false}; + bool acked_state{false}; + bool confirmed_state{false}; + /// True iff ShelvingState != Unshelved. + bool shelved{false}; + /// True iff BranchId is non-null (historical branch event). + bool branch_id_present{false}; +}; + +/// Pure, side-effect-free state machine bridging OPC-UA AlarmConditionType +/// state combinations to SOVD fault lifecycle. The poller owns the +/// ``last_known_status`` per (ConditionId) and feeds it back as +/// ``prev_status`` on every event. +/// +/// Decision order (first match wins) follows the design table in +/// design/index.rst and issue #386: +/// +/// 1. 
BranchId != null -> Suppressed + NoOp (history only) +/// 2. EnabledState == false -> Suppressed (clear if was active) +/// 3. ShelvingState != Unshelved -> Suppressed (clear if was active) +/// 4. ActiveState == true -> Confirmed (idempotent re-report) +/// 5. ActiveState == false -> Healed or Cleared based on +/// Acked + Confirmed +/// +/// Retain is intentionally NOT used here. Per Part 9 §5.5.2.10 it controls +/// visibility during ConditionRefresh bursts, not lifecycle - the poller +/// strips Retain=false events delivered between RefreshStartEvent and +/// RefreshEndEvent before invoking compute(). +class AlarmStateMachine { + public: + struct Outcome { + SovdAlarmStatus next_status; + AlarmAction action; + }; + + static Outcome compute(SovdAlarmStatus prev_status, const AlarmEventInput & in) { + // Rule 1: branch events are recorded in the fault_manager event log + // (caller's responsibility) but never advance the primary lifecycle. + if (in.branch_id_present) { + return {prev_status, AlarmAction::NoOp}; + } + + const bool was_active = (prev_status == SovdAlarmStatus::Confirmed || prev_status == SovdAlarmStatus::Healed); + + // Rule 2: an alarm with EnabledState=false is administratively switched + // off in the PLC. Treat the same as a clear. + if (!in.enabled_state) { + if (was_active) { + return {SovdAlarmStatus::Cleared, AlarmAction::ClearFault}; + } + return {SovdAlarmStatus::Suppressed, AlarmAction::NoOp}; + } + + // Rule 3: shelving is operator-driven suppression. We mirror it as a + // soft clear in SOVD - operator who unshelves will receive a fresh + // CONFIRMED event from the next live notification. + if (in.shelved) { + if (was_active) { + return {SovdAlarmStatus::Cleared, AlarmAction::ClearFault}; + } + return {SovdAlarmStatus::Suppressed, AlarmAction::NoOp}; + } + + // Rule 4: live alarm condition. 
+ if (in.active_state) { + if (prev_status == SovdAlarmStatus::Confirmed) { + // Same state - the underlying event still re-fires for fault_manager + // occurrence_count tracking but does NOT trigger another Confirmed + // report. Caller increments the count via a separate path. + return {SovdAlarmStatus::Confirmed, AlarmAction::NoOp}; + } + return {SovdAlarmStatus::Confirmed, AlarmAction::ReportConfirmed}; + } + + // Rule 5: ActiveState=false. Cleared only when both Acked AND Confirmed + // have been completed by the operator; until then the alarm is latched + // (HEALED) so the operator sees the unfinished workflow item. + if (in.acked_state && in.confirmed_state) { + if (prev_status == SovdAlarmStatus::Cleared || prev_status == SovdAlarmStatus::Suppressed) { + return {SovdAlarmStatus::Cleared, AlarmAction::NoOp}; + } + return {SovdAlarmStatus::Cleared, AlarmAction::ClearFault}; + } + if (prev_status == SovdAlarmStatus::Healed) { + return {SovdAlarmStatus::Healed, AlarmAction::NoOp}; + } + return {SovdAlarmStatus::Healed, AlarmAction::ReportHealed}; + } +}; + +} // namespace ros2_medkit_gateway diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/node_map.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/node_map.hpp index bfee0f82..6b26fe8b 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/node_map.hpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/node_map.hpp @@ -33,6 +33,34 @@ struct AlarmConfig { bool above_threshold{true}; // true = alarm when value > threshold }; +/// Configuration for a native OPC-UA AlarmConditionType event subscription +/// (issue #386). The plugin subscribes to events emitted from +/// ``alarm_source`` and bridges them through ``AlarmStateMachine`` into +/// SOVD faults. Mutually exclusive with the threshold-based ``AlarmConfig`` +/// on a single ``NodeMapEntry``. 
+struct AlarmEventConfig { + /// OPC-UA NodeId of the source node emitting the AlarmConditionType + /// events (typically the parent Object that owns the condition, e.g. + /// "ns=2;s=Tank.Pressure" or the Server object for system-wide alarms). + std::string source_node_id_str; + opcua::NodeId source_node_id; + + /// SOVD entity that should host the resulting fault. + std::string entity_id; + + /// SOVD fault code (e.g. ``PLC_OVERPRESSURE``). + std::string fault_code; + + /// Optional severity override. When empty, ``AlarmStateMachine`` derives + /// the SOVD severity bucket from the event's ``Severity`` (1-1000) per + /// the convention documented in design/index.rst. + std::string severity_override; + + /// Optional friendly message override; falls back to the event's + /// ``Message`` field when empty. + std::string message_override; +}; + /// Mapping entry: OPC-UA NodeId -> SOVD entity data point struct NodeMapEntry { std::string node_id_str; // OPC-UA node ID string (e.g., "ns=1;s=TankLevel") @@ -90,9 +118,18 @@ class NodeMap { /// Find entry by OPC-UA node ID string const NodeMapEntry * find_by_node_id(const std::string & node_id_str) const; - /// Get all entries that have alarm configuration + /// Get all entries that have threshold-based alarm configuration std::vector alarm_entries() const; + /// Get all native OPC-UA AlarmConditionType event-mode entries (issue #386). + const std::vector & event_alarms() const { + return event_alarms_; + } + + /// Find an event-mode alarm by ``(entity_id, fault_code)`` (used by the + /// SOVD ``acknowledge_fault`` / ``confirm_fault`` operations). 
+ const AlarmEventConfig * find_event_alarm(const std::string & entity_id, const std::string & fault_code) const; + /// Get derived SOVD entity definitions const std::vector & entity_defs() const { return entity_defs_; @@ -144,6 +181,13 @@ class NodeMap { std::unordered_map> entity_index_; // entity_id -> entry indices std::unordered_map node_id_index_; // node_id_str -> entry index + // Issue #386: native OPC-UA AlarmConditionType subscriptions, loaded from + // top-level ``event_alarms:`` in the YAML. Stored separately from + // ``entries_`` because event-mode alarms do not have a scalar node to + // poll; their entity definitions are merged into ``entity_defs_`` via + // build_entity_defs() so SOVD discovery is unaffected. + std::vector event_alarms_; + std::string area_id_ = "plc_systems"; std::string area_name_ = "PLC Systems"; std::string component_id_ = "openplc_runtime"; diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_plugin.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_plugin.hpp index 764ac830..4b12d25d 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_plugin.hpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_plugin.hpp @@ -113,6 +113,9 @@ class OpcuaPlugin : public ros2_medkit_gateway::GatewayPlugin, // Alarm -> Fault bridge void on_alarm_change(const std::string & fault_code, const AlarmConfig & config, bool active); + // Issue #386: native AlarmConditionType event lifecycle bridge. 
+ void on_event_alarm(const AlarmEventDelivery & delivery); + // Report/clear fault via ROS 2 service (private helpers, not the FaultProvider overrides) void send_report_fault(const std::string & entity_id, const std::string & fault_code, const std::string & severity_str, const std::string & message); diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp index cf902fac..6a04b8b5 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp @@ -14,6 +14,7 @@ #pragma once +#include "ros2_medkit_opcua/alarm_state_machine.hpp" #include "ros2_medkit_opcua/node_map.hpp" #include "ros2_medkit_opcua/opcua_client.hpp" @@ -23,9 +24,11 @@ #include #include #include +#include #include #include #include +#include namespace ros2_medkit_gateway { @@ -43,6 +46,33 @@ struct PollSnapshot { using AlarmChangeCallback = std::function; +/// Callback when a native OPC-UA AlarmCondition lifecycle transitions to a +/// new SOVD status (issue #386). Fires at most once per logical transition; +/// suppressed events do not invoke the callback. +struct AlarmEventDelivery { + std::string fault_code; + std::string entity_id; + SovdAlarmStatus next_status; + AlarmAction action; + uint16_t severity{0}; // raw OPC-UA Severity 1-1000 + std::string message; // event Message field (or AlarmEventConfig override) + std::string condition_id; // string form of OPC-UA ConditionId +}; +using EventAlarmCallback = std::function; + +/// Looked-up runtime state for a unique OPC-UA Condition instance. 
The +/// poller keeps one entry per distinct ConditionId observed; the entry +/// outlives ack/confirm round-trips so the PR3 ``acknowledge_fault`` SOVD +/// operation can resolve a fault_code to the live ConditionId NodeId and +/// the latest ``EventId`` ByteString (required for spec-compliant Ack). +struct ConditionRuntime { + opcua::NodeId condition_id; + opcua::ByteString latest_event_id; + std::string entity_id; + std::string fault_code; + SovdAlarmStatus last_status{SovdAlarmStatus::Suppressed}; +}; + /// Callback after each poll cycle (for publishing values to ROS 2 topics) using PollCallback = std::function; @@ -78,6 +108,10 @@ class OpcuaPoller { /// Set callback for alarm state changes void set_alarm_callback(AlarmChangeCallback callback); + /// Set callback for native AlarmCondition event lifecycle transitions + /// (issue #386). Must be called before ``start()``. + void set_event_alarm_callback(EventAlarmCallback callback); + /// Set callback fired after each poll cycle (for value bridging) void set_poll_callback(PollCallback callback); @@ -86,6 +120,16 @@ class OpcuaPoller { return using_subscriptions_.load(); } + /// Look up a live condition by ``(entity_id, fault_code)``. Used by the + /// SOVD ``acknowledge_fault`` / ``confirm_fault`` operation handlers to + /// resolve which OPC-UA ConditionId should receive the Method call. The + /// returned snapshot is a copy, so the caller can release any locks + /// before performing the OPC-UA round-trip. + /// + /// Returns ``std::nullopt`` if no condition with that fault_code is + /// currently active for the entity. + std::optional lookup_condition(const std::string & entity_id, const std::string & fault_code) const; + private: void poll_loop(); void do_poll(); @@ -93,6 +137,12 @@ class OpcuaPoller { void evaluate_alarms(); void on_data_change(const std::string & node_id, const OpcuaValue & value); + // Issue #386 helpers. 
+ void setup_event_subscriptions(); + void on_event(const AlarmEventConfig & cfg, const std::vector & values, + const opcua::NodeId & source_node, const opcua::NodeId & event_type); + void condition_refresh(); + OpcuaClient & client_; const NodeMap & node_map_; PollerConfig config_; @@ -111,6 +161,27 @@ class OpcuaPoller { AlarmChangeCallback alarm_callback_; std::unordered_map alarm_states_; // fault_code -> last known state + // Issue #386: event-mode AlarmCondition state. + EventAlarmCallback event_alarm_callback_; + std::mutex event_alarm_callback_mutex_; + + uint32_t event_subscription_id_{0}; + std::vector event_monitored_item_ids_; + + mutable std::shared_mutex conditions_mutex_; + std::unordered_map conditions_; // ConditionId stringForm -> runtime + + // ConditionRefresh bracketing state. open62541 sends the buffered + // historical condition burst between RefreshStartEvent and + // RefreshEndEvent; we apply each event during the burst as normal but + // use this flag in tests / diagnostics. Production note: per Part 9 + // §5.5.7 the spec also requires the client to ignore ConditionRefresh + // notifications carrying ``Retain=false`` for non-current branches; the + // state machine already drops branches via BranchId, and the trampoline + // cannot distinguish refresh-burst events from live events without + // tracking the EventType, which we do explicitly below. + std::atomic condition_refresh_in_progress_{false}; + // Thread safety: must be set via set_poll_callback() before start(). // Not modified after start(), so safe to read from the poll thread without a mutex. 
PollCallback poll_callback_; diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp index 6d203ed2..46124538 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp @@ -304,6 +304,49 @@ bool NodeMap::load(const std::string & yaml_path) { entries_.push_back(std::move(entry)); } + // Issue #386: native AlarmConditionType event subscriptions. Loaded from + // top-level ``event_alarms:`` (sibling of ``nodes:``). Each entry must + // declare its own entity_id; the entity will be merged into entity_defs_ + // alongside any threshold-mode entries pointing at the same id. + event_alarms_.clear(); + auto event_alarms_node = root["event_alarms"]; + if (event_alarms_node && event_alarms_node.IsSequence()) { + if (event_alarms_node.size() > 10000) { + RCLCPP_ERROR(rclcpp::get_logger("opcua.node_map"), + "event_alarms has %zu entries (max 10000) - refusing to load", event_alarms_node.size()); + return false; + } + for (size_t i = 0; i < event_alarms_node.size(); ++i) { + const auto & a = event_alarms_node[i]; + if (!a["alarm_source"] || !a["entity_id"] || !a["fault_code"]) { + RCLCPP_WARN(rclcpp::get_logger("opcua.node_map"), + "event_alarms[%zu] missing alarm_source/entity_id/fault_code - skipping", i); + continue; + } + AlarmEventConfig cfg; + cfg.source_node_id_str = a["alarm_source"].as(); + cfg.source_node_id = parse_node_id(cfg.source_node_id_str); + cfg.entity_id = a["entity_id"].as(); + cfg.fault_code = a["fault_code"].as(); + cfg.severity_override = a["severity_override"].as(""); + cfg.message_override = a["message"].as(""); + event_alarms_.push_back(std::move(cfg)); + } + } + + // Mutual-exclusion check: an entry under ``nodes:`` carrying both a + // ``threshold`` alarm and an ``alarm_source`` is a configuration error + // (the threshold path polls scalar values; the alarm_source path + // subscribes to native 
events). Reject the whole file rather than guess. + for (const auto & node : (nodes ? nodes : YAML::Node{})) { + if (node["alarm_source"] && node["alarm"] && node["alarm"]["threshold"]) { + RCLCPP_ERROR(rclcpp::get_logger("opcua.node_map"), + "Entry node_id=%s declares both threshold alarm and alarm_source - mutually exclusive", + node["node_id"] ? node["node_id"].as().c_str() : ""); + return false; + } + } + build_entity_defs(); return true; @@ -313,6 +356,16 @@ bool NodeMap::load(const std::string & yaml_path) { } } +const AlarmEventConfig * NodeMap::find_event_alarm(const std::string & entity_id, + const std::string & fault_code) const { + for (const auto & cfg : event_alarms_) { + if (cfg.entity_id == entity_id && cfg.fault_code == fault_code) { + return &cfg; + } + } + return nullptr; +} + std::vector NodeMap::entries_for_entity(const std::string & entity_id) const { std::vector result; auto it = entity_index_.find(entity_id); @@ -388,6 +441,19 @@ void NodeMap::build_entity_defs() { } } + // Issue #386: event-mode entities show up as fault-bearing too, even when + // they have no scalar data points of their own. Without this, the SOVD + // discovery layer would not surface an entity that exists purely to host + // alarm events (e.g. a system-wide ``ServerDiagnostics`` source). 
+ for (const auto & cfg : event_alarms_) { + auto & def = defs[cfg.entity_id]; + if (def.id.empty()) { + def.id = cfg.entity_id; + def.component_id = component_id_; + } + def.has_faults = true; + } + // Build human-readable names from IDs for (auto & [id, def] : defs) { // Convert snake_case to Title Case diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp index 1dfd0b55..f5c9c2c8 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp @@ -174,6 +174,9 @@ void OpcuaPlugin::set_context(PluginContext & context) { poller_->set_alarm_callback([this](const std::string & code, const AlarmConfig & cfg, bool active) { on_alarm_change(code, cfg, active); }); + poller_->set_event_alarm_callback([this](const AlarmEventDelivery & delivery) { + on_event_alarm(delivery); + }); poller_->set_poll_callback([this](const PollSnapshot & snap) { publish_values(snap); }); @@ -491,6 +494,56 @@ void OpcuaPlugin::on_alarm_change(const std::string & fault_code, const AlarmCon } } +void OpcuaPlugin::on_event_alarm(const AlarmEventDelivery & delivery) { + if (shutdown_requested_.load()) { + return; + } + + // Map raw OPC-UA Severity (1-1000) to SOVD severity bucket. + // Selfpatch convention documented in design/index.rst; not from IEC 62682. + // Severity_override on the AlarmEventConfig wins when set. 
+ auto severity_str = [&]() { + auto * cfg = node_map_.find_event_alarm(delivery.entity_id, delivery.fault_code); + if (cfg && !cfg->severity_override.empty()) { + return cfg->severity_override; + } + if (delivery.severity >= 801) { + return std::string("CRITICAL"); + } + if (delivery.severity >= 501) { + return std::string("ERROR"); + } + if (delivery.severity >= 201) { + return std::string("WARNING"); + } + return std::string("INFO"); + }(); + + switch (delivery.action) { + case AlarmAction::ReportConfirmed: + log_info("AlarmCondition CONFIRMED: " + delivery.fault_code + " on " + delivery.entity_id + + " (severity=" + std::to_string(delivery.severity) + ")"); + send_report_fault(delivery.entity_id, delivery.fault_code, severity_str, delivery.message); + break; + case AlarmAction::ReportHealed: + // Fault is latched: condition is no longer active but not yet + // confirmed. We don't have a dedicated HEALED reporting verb in + // ReportFault.srv (only FAILED/PASSED), so we mark this as a PASSED + // event - fault_manager keeps the entry in HEALED state until + // confirmed, mirroring the lifecycle. + log_info("AlarmCondition HEALED (latched, awaiting ack/confirm): " + delivery.fault_code); + // No-op for now; fault_manager will keep the fault HEALED until + // CLEARED. The state transition is observable via /faults/stream. 
+ break; + case AlarmAction::ClearFault: + log_info("AlarmCondition CLEARED: " + delivery.fault_code); + send_clear_fault(delivery.fault_code); + break; + case AlarmAction::NoOp: + break; + } +} + void OpcuaPlugin::send_report_fault(const std::string & entity_id, const std::string & fault_code, const std::string & severity_str, const std::string & message) { if (!fault_clients_->report) { @@ -770,6 +823,31 @@ tl::expected OpcuaPlugin::list_opera item["asynchronous_execution"] = false; items.push_back(std::move(item)); } + + // Issue #386: emit acknowledge_fault / confirm_fault when the entity + // has at least one native AlarmConditionType event subscription. The + // fault_code parameter (passed in operation execution body) discriminates + // which condition the operator is acting on. + bool has_event_alarms = std::any_of(node_map_.event_alarms().begin(), node_map_.event_alarms().end(), + [&entity_id](const AlarmEventConfig & cfg) { + return cfg.entity_id == entity_id; + }); + if (has_event_alarms) { + nlohmann::json ack; + ack["id"] = "acknowledge_fault"; + ack["name"] = "Acknowledge an alarm"; + ack["proximity_proof_required"] = false; + ack["asynchronous_execution"] = false; + items.push_back(std::move(ack)); + + nlohmann::json confirm; + confirm["id"] = "confirm_fault"; + confirm["name"] = "Confirm an alarm condition is addressed"; + confirm["proximity_proof_required"] = false; + confirm["asynchronous_execution"] = false; + items.push_back(std::move(confirm)); + } + return tl::expected{nlohmann::json{{"items", items}}}; } @@ -785,6 +863,63 @@ OpcuaPlugin::execute_operation(const std::string & entity_id, const std::string OperationProviderErrorInfo{OperationProviderError::Internal, "plugin not initialized", 503}); } + // Issue #386: acknowledge_fault and confirm_fault dispatch to OPC-UA + // Method calls on the live ConditionId. 
AcknowledgeableConditionType + // declares Acknowledge as method i=9111 and Confirm as method i=9113; + // the objectId argument is the ConditionId NodeId of the live instance. + if (operation_name == "acknowledge_fault" || operation_name == "confirm_fault") { + if (!parameters.contains("fault_code") || !parameters["fault_code"].is_string()) { + return tl::make_unexpected(OperationProviderErrorInfo{OperationProviderError::InvalidParameters, + "Missing required string parameter: fault_code", 400}); + } + std::string fault_code = parameters["fault_code"].get(); + std::string comment = parameters.value("comment", std::string{}); + + auto runtime = poller_->lookup_condition(entity_id, fault_code); + if (!runtime) { + return tl::make_unexpected(OperationProviderErrorInfo{ + OperationProviderError::OperationNotFound, + "No live condition for fault_code=" + fault_code + " on entity=" + entity_id, 404}); + } + + constexpr uint32_t kAcknowledgeMethodId = 9111; + constexpr uint32_t kConfirmMethodId = 9113; + opcua::NodeId method_id(0, operation_name == "acknowledge_fault" ? 
kAcknowledgeMethodId : kConfirmMethodId); + + std::vector args; + args.push_back(opcua::Variant::fromScalar(runtime->latest_event_id)); + args.push_back(opcua::Variant::fromScalar(opcua::LocalizedText("", comment))); + + auto result = client_->call_method(runtime->condition_id, method_id, args); + if (!result.has_value()) { + auto code = result.error().code; + int http = 502; + auto err = OperationProviderError::TransportError; + if (code == OpcuaClient::MethodError::NotConnected) { + http = 503; + err = OperationProviderError::Internal; + } else if (code == OpcuaClient::MethodError::MethodNotFound) { + http = 404; + err = OperationProviderError::OperationNotFound; + } else if (code == OpcuaClient::MethodError::InvalidArgument) { + http = 400; + err = OperationProviderError::InvalidParameters; + } else if (code == OpcuaClient::MethodError::MethodTimeout) { + http = 504; + err = OperationProviderError::TransportError; + } + return tl::make_unexpected(OperationProviderErrorInfo{err, result.error().message, http}); + } + + nlohmann::json out; + out["status"] = "ok"; + out["operation"] = operation_name; + out["fault_code"] = fault_code; + out["entity_id"] = entity_id; + out["condition_id"] = runtime->condition_id.toString(); + return tl::expected{out}; + } + std::string data_name; if (operation_name.substr(0, 4) == "set_") { data_name = operation_name.substr(4); diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp index 9c98a818..8c4aa158 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp @@ -16,10 +16,80 @@ #include #include +#include #include +#include + +#include namespace ros2_medkit_gateway { +namespace { + +// EventType NodeIds for ConditionRefresh bracketing per OPC-UA Part 9 §5.5.7 +// (RefreshStartEventType i=2787, RefreshEndEventType i=2788). 
+constexpr uint32_t kRefreshStartEventTypeId = 2787; +constexpr uint32_t kRefreshEndEventTypeId = 2788; + +// ShelvingState.CurrentState.Id NodeId for "Unshelved" state per §5.8.16. +// Anything other than this (TimedShelved=2930, OneShotShelved=2932) means +// the alarm is operator-suppressed. +constexpr uint32_t kShelvedStateUnshelved = 2929; + +// EventFilter select-clause indices used by the AlarmCondition trampoline. +// Order MUST match build_alarm_event_select_paths() below; the trampoline +// reads positionally because OPC-UA does not return field names with the +// notification. +constexpr size_t kFieldConditionId = 0; +constexpr size_t kFieldEventId = 1; +constexpr size_t kFieldSeverity = 2; +constexpr size_t kFieldMessage = 3; +constexpr size_t kFieldEnabledState = 4; +constexpr size_t kFieldActiveState = 5; +constexpr size_t kFieldAckedState = 6; +constexpr size_t kFieldConfirmedState = 7; +constexpr size_t kFieldShelvingState = 8; +constexpr size_t kFieldBranchId = 9; +constexpr size_t kAlarmFieldCount = 10; + +std::vector build_alarm_event_select_paths() { + return { + {{0, "ConditionId"}}, + {{0, "EventId"}}, + {{0, "Severity"}}, + {{0, "Message"}}, + {{0, "EnabledState"}, {0, "Id"}}, + {{0, "ActiveState"}, {0, "Id"}}, + {{0, "AckedState"}, {0, "Id"}}, + {{0, "ConfirmedState"}, {0, "Id"}}, + {{0, "ShelvingState"}, {0, "CurrentState"}, {0, "Id"}}, + {{0, "BranchId"}}, + }; +} + +// Extract a scalar of type T from a Variant, returning the default if absent. +template +T variant_or(const opcua::Variant & v, T fallback) { + if (v.isType()) { + return v.getScalarCopy(); + } + return fallback; +} + +// Decode a LocalizedText event field to plain UTF-8 (Message uses LT). 
+std::string variant_to_localized_text(const opcua::Variant & v) { + if (v.isType()) { + auto lt = v.getScalarCopy(); + return std::string(lt.getText()); + } + if (v.isType()) { + return std::string(v.getScalarCopy()); + } + return ""; +} + +} // namespace + OpcuaPoller::OpcuaPoller(OpcuaClient & client, const NodeMap & node_map) : client_(client), node_map_(node_map) { } @@ -40,6 +110,12 @@ void OpcuaPoller::start(const PollerConfig & config) { setup_subscriptions(); } + // Issue #386: subscribe to native AlarmConditionType events. Independent + // of data-change subscriptions; runs whenever event_alarms are configured. + if (!node_map_.event_alarms().empty()) { + setup_event_subscriptions(); + } + // Start poll/reconnect thread regardless (handles reconnection and poll fallback) poll_thread_ = std::thread(&OpcuaPoller::poll_loop, this); } @@ -72,6 +148,25 @@ void OpcuaPoller::set_alarm_callback(AlarmChangeCallback callback) { alarm_callback_ = std::move(callback); } +void OpcuaPoller::set_event_alarm_callback(EventAlarmCallback callback) { + if (running_.load()) { + throw std::logic_error("set_event_alarm_callback must be called before start()"); + } + std::lock_guard lock(event_alarm_callback_mutex_); + event_alarm_callback_ = std::move(callback); +} + +std::optional OpcuaPoller::lookup_condition(const std::string & entity_id, + const std::string & fault_code) const { + std::shared_lock lock(conditions_mutex_); + for (const auto & [cid_str, runtime] : conditions_) { + if (runtime.entity_id == entity_id && runtime.fault_code == fault_code) { + return runtime; + } + } + return std::nullopt; +} + void OpcuaPoller::set_poll_callback(PollCallback callback) { if (running_.load()) { throw std::logic_error("set_poll_callback must be called before start()"); @@ -106,6 +201,189 @@ void OpcuaPoller::setup_subscriptions() { } } +void OpcuaPoller::setup_event_subscriptions() { + // Issue #386: one dedicated subscription for AlarmCondition events; uses + // a default no-op 
data callback because we wire MIs of EVENTNOTIFIER + // attribute, not data-change MIs. The EventCallback bound below is what + // actually receives notifications. + if (event_subscription_id_ != 0) { + // Already configured (reconnect path will re-create after disconnect + // bumps the generation counter, so we should not get here twice on the + // same logical session). Defensive: skip silently. + return; + } + + event_subscription_id_ = client_.create_subscription(config_.subscription_interval_ms, + [](const std::string &, const OpcuaValue &) { /* no-op */ }); + if (event_subscription_id_ == 0) { + return; + } + + const auto select_paths = build_alarm_event_select_paths(); + event_monitored_item_ids_.clear(); + + for (const auto & cfg : node_map_.event_alarms()) { + auto callback = [this, &cfg](const std::vector & values, const opcua::NodeId & source_node, + const opcua::NodeId & event_type) { + on_event(cfg, values, source_node, event_type); + }; + uint32_t mi_id = + client_.add_event_monitored_item(event_subscription_id_, cfg.source_node_id, select_paths, std::move(callback)); + if (mi_id != 0) { + event_monitored_item_ids_.push_back(mi_id); + } + } + + // Fire ConditionRefresh so the server pushes any conditions that fired + // before our subscription started (Part 9 §5.5.7 mandates the bracketing + // RefreshStartEvent / RefreshEndEvent which we treat as ordinary events + // tagged by EventType). + if (!event_monitored_item_ids_.empty()) { + condition_refresh(); + } +} + +void OpcuaPoller::condition_refresh() { + // Server object NodeId (i=2253) hosts the ConditionRefresh method + // (i=3875 - per Part 9 §5.5.7). We use the standard NodeId; servers that + // diverge from the spec (rare) will return BadMethodInvalid which is + // logged but does not abort subscribing. 
+ static constexpr uint32_t kServerObjectId = 2253; + static constexpr uint32_t kConditionRefreshMethodId = 3875; + std::vector args; + args.push_back(opcua::Variant::fromScalar(static_cast(event_subscription_id_))); + auto result = + client_.call_method(opcua::NodeId(0, kServerObjectId), opcua::NodeId(0, kConditionRefreshMethodId), args); + if (!result.has_value()) { + // Not fatal - many test servers do not implement ConditionRefresh. + // Live notifications will still flow once conditions transition. + } +} + +void OpcuaPoller::on_event(const AlarmEventConfig & cfg, const std::vector & values, + const opcua::NodeId & /*source_node*/, const opcua::NodeId & event_type) { + // Detect ConditionRefresh bracketing per Part 9 §5.5.7. The flag is for + // diagnostics only; the state machine itself does not need to know + // because RefreshStart / RefreshEnd notifications carry no condition + // payload, so positional-empty values trip the early-return below. + if (event_type.getNamespaceIndex() == 0 && event_type.getIdentifierType() == opcua::NodeIdType::Numeric) { + auto numeric = event_type.getIdentifierAs(); + if (numeric == kRefreshStartEventTypeId) { + condition_refresh_in_progress_.store(true, std::memory_order_release); + return; + } + if (numeric == kRefreshEndEventTypeId) { + condition_refresh_in_progress_.store(false, std::memory_order_release); + return; + } + } + + if (values.size() < kAlarmFieldCount) { + // Server returned fewer fields than our filter requested - typical when + // BadAttributeIdInvalid was returned for one or more select clauses. + // We need at minimum the state fields to make a decision; bail to + // avoid garbage transitions. 
+ return; + } + + AlarmEventInput input; + input.enabled_state = variant_or(values[kFieldEnabledState], true); + input.active_state = variant_or(values[kFieldActiveState], false); + input.acked_state = variant_or(values[kFieldAckedState], false); + input.confirmed_state = variant_or(values[kFieldConfirmedState], false); + + // ShelvingState.CurrentState.Id is itself a NodeId (one of i=2929/2930/ + // 2932). Anything other than Unshelved => alarm is operator-suppressed. + if (values[kFieldShelvingState].isType()) { + auto shelv_state = values[kFieldShelvingState].getScalarCopy(); + input.shelved = + !(shelv_state.getNamespaceIndex() == 0 && shelv_state.getIdentifierType() == opcua::NodeIdType::Numeric && + shelv_state.getIdentifierAs() == kShelvedStateUnshelved); + } else { + input.shelved = false; + } + + // BranchId is a NodeId; null branch maps to identifier i=0 in namespace + // 0. Either an empty NodeId (server omitted) or the canonical null + // counts as "live state" for the bridge. + if (values[kFieldBranchId].isType()) { + auto branch = values[kFieldBranchId].getScalarCopy(); + bool is_null = branch.getNamespaceIndex() == 0 && branch.getIdentifierType() == opcua::NodeIdType::Numeric && + branch.getIdentifierAs() == 0u; + input.branch_id_present = !is_null; + } else { + input.branch_id_present = false; + } + + // Resolve / create the per-condition runtime entry. We key on the + // ConditionId stringForm so distinct condition instances within the same + // event source remain separate. 
+ opcua::NodeId condition_id; + if (values[kFieldConditionId].isType()) { + condition_id = values[kFieldConditionId].getScalarCopy(); + } + std::string condition_id_str = condition_id.toString(); + + ConditionRuntime runtime_snapshot; + SovdAlarmStatus prev_status; + { + std::unique_lock lock(conditions_mutex_); + auto it = conditions_.find(condition_id_str); + if (it == conditions_.end()) { + ConditionRuntime fresh; + fresh.condition_id = condition_id; + fresh.entity_id = cfg.entity_id; + fresh.fault_code = cfg.fault_code; + fresh.last_status = SovdAlarmStatus::Suppressed; + auto inserted = conditions_.emplace(condition_id_str, std::move(fresh)); + it = inserted.first; + } + prev_status = it->second.last_status; + + // Track the latest EventId for spec-compliant Acknowledge calls. + if (values[kFieldEventId].isType()) { + it->second.latest_event_id = values[kFieldEventId].getScalarCopy(); + } + + auto outcome = AlarmStateMachine::compute(prev_status, input); + it->second.last_status = outcome.next_status; + runtime_snapshot = it->second; + + if (outcome.action == AlarmAction::NoOp) { + // No downstream notification - drop while still holding the lock to + // avoid a callback round-trip for redundant events. + return; + } + + AlarmEventDelivery delivery; + delivery.fault_code = cfg.fault_code; + delivery.entity_id = cfg.entity_id; + delivery.next_status = outcome.next_status; + delivery.action = outcome.action; + delivery.severity = variant_or(values[kFieldSeverity], 0); + if (!cfg.message_override.empty()) { + delivery.message = cfg.message_override; + } else { + delivery.message = variant_to_localized_text(values[kFieldMessage]); + } + delivery.condition_id = condition_id_str; + + // Release lock BEFORE invoking user callback (which may take its own + // locks or block on a ROS service - we must not hold conditions_mutex_ + // across that). 
+ lock.unlock(); + + EventAlarmCallback cb_copy; + { + std::lock_guard cb_lock(event_alarm_callback_mutex_); + cb_copy = event_alarm_callback_; + } + if (cb_copy) { + cb_copy(delivery); + } + } +} + void OpcuaPoller::on_data_change(const std::string & node_id, const OpcuaValue & value) { { std::lock_guard lock(snapshot_mutex_); @@ -135,6 +413,19 @@ void OpcuaPoller::poll_loop() { if (config_.prefer_subscriptions) { setup_subscriptions(); } + // Issue #386: re-subscribe to AlarmCondition events after reconnect + // and re-fire ConditionRefresh so we recover any conditions that + // changed state while we were offline. The OpcuaClient's + // generation counter has already advanced (incremented in + // disconnect()/maybe_mark_disconnected), so any stale event + // callbacks captured from the previous subscription are filtered + // out by the trampoline before re-subscription registers fresh + // contexts. + event_subscription_id_ = 0; + event_monitored_item_ids_.clear(); + if (!node_map_.event_alarms().empty()) { + setup_event_subscriptions(); + } } else { // Exponential backoff capped at 60s. condition_variable so stop() wakes immediately. { diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/smoke_test.py b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/smoke_test.py new file mode 100755 index 00000000..279757ab --- /dev/null +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/smoke_test.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +# Copyright 2026 mfaferek93 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Smoke test for test_alarm_server (issue #386). + +Verifies, against a running test_alarm_server instance: + * 3 condition nodes exist (Overpressure, Overheat, SensorLost). + * Each is a subtype of AlarmConditionType (NodeId i=2915 in ns 0). + * Each exposes Acknowledge and Confirm methods. + * Each exposes the canonical AlarmConditionType select fields. + +Usage: python3 smoke_test.py opc.tcp://localhost:4842 +""" + +import asyncio +import sys + +from asyncua import Client, ua + +NS_URI = 'urn:test:alarms' +ALARM_TYPE_NS0 = ua.NodeId(2915, 0) # AlarmConditionType +EXPECTED = ('Overpressure', 'Overheat', 'SensorLost') +REQUIRED_FIELDS = ( + 'EventId', + 'EventType', + 'SourceNode', + 'Time', + 'Severity', + 'Message', + 'EnabledState', + 'ActiveState', + 'AckedState', + 'ConfirmedState', + 'Retain', +) +REQUIRED_METHODS = ('Acknowledge', 'Confirm') + + +async def find_condition(client, source_browse_name): + """Locate the condition object as a HasComponent child of its source node.""" + objects = client.nodes.objects + children = await objects.get_children() + for ch in children: + bn = await ch.read_browse_name() + if bn.Name == source_browse_name: + return ch + return None + + +async def assert_condition(client, ns_idx, alarm_name): + """Assert that a single AlarmConditionType node is present and well-formed.""" + source_name = alarm_name + 'Source' + source = await find_condition(client, source_name) + if source is None: + raise AssertionError(f'Condition source {source_name} not found') + children = await source.get_children() + target = None + 
for ch in children: + bn = await ch.read_browse_name() + if bn.NamespaceIndex == ns_idx and bn.Name == alarm_name: + target = ch + break + if target is None: + raise AssertionError(f'Condition node {alarm_name} not found under source') + + type_def = await target.read_type_definition() + if type_def != ALARM_TYPE_NS0: + raise AssertionError( + f'{alarm_name}: type is {type_def}, expected {ALARM_TYPE_NS0}' + ) + + child_names = set() + method_names = set() + for child in await target.get_children(): + bn = await child.read_browse_name() + nc = await child.read_node_class() + if nc == ua.NodeClass.Method: + method_names.add(bn.Name) + else: + child_names.add(bn.Name) + + missing_fields = [f for f in REQUIRED_FIELDS if f not in child_names] + if missing_fields: + raise AssertionError(f'{alarm_name}: missing fields {missing_fields}') + + missing_methods = [m for m in REQUIRED_METHODS if m not in method_names] + if missing_methods: + raise AssertionError(f'{alarm_name}: missing methods {missing_methods}') + + print( + f' OK {alarm_name}: type=AlarmConditionType, ' + f'fields={len(child_names)}, methods={sorted(method_names)}' + ) + + +async def run(url): + """Run all condition assertions against a single server instance.""" + print(f'Connecting to {url}') + async with Client(url=url, timeout=10) as client: + ns_idx = await client.get_namespace_index(NS_URI) + print(f'Found namespace {NS_URI} at index {ns_idx}') + for name in EXPECTED: + await assert_condition(client, ns_idx, name) + print('PASS') + + +def main(): + """CLI entry point: smoke_test.py .""" + if len(sys.argv) < 2: + print('usage: smoke_test.py opc.tcp://host:port', file=sys.stderr) + sys.exit(2) + try: + asyncio.run(run(sys.argv[1])) + except Exception as exc: # noqa: BLE001 - diagnostics only + print(f'FAIL: {exc}', file=sys.stderr) + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/test_alarm_server.cpp 
b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/test_alarm_server.cpp
new file mode 100644
index 00000000..f3fef295
--- /dev/null
+++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/test_alarm_server.cpp
@@ -0,0 +1,337 @@
+// Copyright 2026 mfaferek93
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Standalone OPC UA test fixture: emits AlarmConditionType events for
+// integration testing of the ros2_medkit_opcua native alarm subscription
+// code path. OpenPLC does not implement AlarmConditionType (only
+// OffNormalAlarmType-derived demos with no Acknowledge / Confirm methods),
+// so we build this companion server to cover the full Part 9 surface.
+//
+// Reads commands from stdin, one per line; writes "OK <name>" or
+// "ERR <detail>" to stdout for the test harness to poll. See
+// docs in the header comment of fire(), clear(), ack(), etc. below.
+ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +constexpr const char * NS_URI = "urn:test:alarms"; + +struct Condition { + std::string name; + UA_NodeId node{}; + UA_NodeId source{}; +}; + +std::map g_conditions; +std::mutex g_mutex; +std::atomic g_running{true}; + +void stop_handler(int) { + g_running = false; +} + +UA_StatusCode add_source(UA_Server * server, const std::string & name, UA_NodeId * out) { + UA_ObjectAttributes attr = UA_ObjectAttributes_default; + attr.eventNotifier = 1; + std::string display = name + "Source"; + attr.displayName = UA_LOCALIZEDTEXT(const_cast("en"), const_cast(display.c_str())); + UA_QualifiedName qname = UA_QUALIFIEDNAME(0, const_cast(display.c_str())); + UA_StatusCode rc = UA_Server_addObjectNode(server, UA_NODEID_NULL, UA_NODEID_NUMERIC(0, UA_NS0ID_OBJECTSFOLDER), + UA_NODEID_NUMERIC(0, UA_NS0ID_ORGANIZES), qname, + UA_NODEID_NUMERIC(0, UA_NS0ID_BASEOBJECTTYPE), attr, nullptr, out); + if (rc != UA_STATUSCODE_GOOD) { + return rc; + } + // The condition source must be a notifier of the Server object so that the + // A&C subsystem can route events through the standard notification path. 
+ return UA_Server_addReference(server, UA_NODEID_NUMERIC(0, UA_NS0ID_SERVER), + UA_NODEID_NUMERIC(0, UA_NS0ID_HASNOTIFIER), + UA_EXPANDEDNODEID_NUMERIC(out->namespaceIndex, out->identifier.numeric), UA_TRUE); +} + +UA_StatusCode add_condition(UA_Server * server, const std::string & name, UA_UInt16 ns, Condition & out) { + out.name = name; + UA_StatusCode rc = add_source(server, name, &out.source); + if (rc != UA_STATUSCODE_GOOD) { + return rc; + } + UA_QualifiedName cqname = UA_QUALIFIEDNAME(ns, const_cast(name.c_str())); + rc = UA_Server_createCondition(server, UA_NODEID_NUMERIC(ns, 0), UA_NODEID_NUMERIC(0, UA_NS0ID_ALARMCONDITIONTYPE), + cqname, out.source, UA_NODEID_NUMERIC(0, UA_NS0ID_HASCOMPONENT), &out.node); + if (rc != UA_STATUSCODE_GOOD) { + return rc; + } + + // Add ShelvingState as an optional field. open62541's createCondition only + // materializes mandatory fields; ShelvingState is optional in Part 9 but + // tests assert it is present, so we add it explicitly. Failure here is + // non-fatal - some tests skip the shelve transition path entirely. + UA_NodeId shelving; + UA_Server_addConditionOptionalField( + server, out.node, UA_NODEID_NUMERIC(0, UA_NS0ID_ALARMCONDITIONTYPE), + UA_QUALIFIEDNAME(0, const_cast("ShelvingState")), &shelving); + + // Enable the condition so events are emitted (Part 9: EnabledState=true is + // a precondition for Retain/Active/Acked transitions to fire events). 
+ UA_Variant value; + UA_Boolean enabled = true; + UA_Variant_setScalar(&value, &enabled, &UA_TYPES[UA_TYPES_BOOLEAN]); + rc = UA_Server_setConditionVariableFieldProperty(server, out.node, &value, + UA_QUALIFIEDNAME(0, const_cast("EnabledState")), + UA_QUALIFIEDNAME(0, const_cast("Id"))); + return rc; +} + +UA_StatusCode set_two_state(UA_Server * server, const UA_NodeId & cond, const char * field, UA_Boolean value) { + UA_Variant v; + UA_Variant_setScalar(&v, &value, &UA_TYPES[UA_TYPES_BOOLEAN]); + return UA_Server_setConditionVariableFieldProperty(server, cond, &v, UA_QUALIFIEDNAME(0, const_cast(field)), + UA_QUALIFIEDNAME(0, const_cast("Id"))); +} + +UA_StatusCode set_field_bool(UA_Server * server, const UA_NodeId & cond, const char * field, UA_Boolean value) { + UA_Variant v; + UA_Variant_setScalar(&v, &value, &UA_TYPES[UA_TYPES_BOOLEAN]); + return UA_Server_setConditionField(server, cond, &v, UA_QUALIFIEDNAME(0, const_cast(field))); +} + +UA_StatusCode handle_fire(UA_Server * server, const Condition & c, UA_UInt16 severity) { + UA_Variant v; + UA_Variant_setScalar(&v, &severity, &UA_TYPES[UA_TYPES_UINT16]); + UA_StatusCode rc = + UA_Server_setConditionField(server, c.node, &v, UA_QUALIFIEDNAME(0, const_cast("Severity"))); + if (rc != UA_STATUSCODE_GOOD) { + return rc; + } + UA_LocalizedText msg = UA_LOCALIZEDTEXT(const_cast("en"), const_cast("Alarm fired")); + UA_Variant_setScalar(&v, &msg, &UA_TYPES[UA_TYPES_LOCALIZEDTEXT]); + rc = UA_Server_setConditionField(server, c.node, &v, UA_QUALIFIEDNAME(0, const_cast("Message"))); + if (rc != UA_STATUSCODE_GOOD) { + return rc; + } + rc = set_field_bool(server, c.node, "Retain", true); + if (rc != UA_STATUSCODE_GOOD) { + return rc; + } + rc = set_two_state(server, c.node, "AckedState", false); + if (rc != UA_STATUSCODE_GOOD) { + return rc; + } + rc = set_two_state(server, c.node, "ConfirmedState", false); + if (rc != UA_STATUSCODE_GOOD) { + return rc; + } + // ActiveState=true is the trigger that makes the A&C 
subsystem regenerate + // an EventId and broadcast a CONFIRMED notification. + rc = set_two_state(server, c.node, "ActiveState", true); + if (rc != UA_STATUSCODE_GOOD) { + return rc; + } + return UA_Server_triggerConditionEvent(server, c.node, c.source, nullptr); +} + +UA_StatusCode handle_clear(UA_Server * server, const Condition & c) { + UA_StatusCode rc = set_two_state(server, c.node, "ActiveState", false); + if (rc != UA_STATUSCODE_GOOD) { + return rc; + } + rc = set_field_bool(server, c.node, "Retain", false); + if (rc != UA_STATUSCODE_GOOD) { + return rc; + } + return UA_Server_triggerConditionEvent(server, c.node, c.source, nullptr); +} + +UA_StatusCode handle_latch(UA_Server * server, const Condition & c) { + UA_StatusCode rc = set_two_state(server, c.node, "ActiveState", false); + if (rc != UA_STATUSCODE_GOOD) { + return rc; + } + rc = set_field_bool(server, c.node, "Retain", true); + if (rc != UA_STATUSCODE_GOOD) { + return rc; + } + return UA_Server_triggerConditionEvent(server, c.node, c.source, nullptr); +} + +UA_StatusCode handle_ack(UA_Server * server, const Condition & c) { + UA_StatusCode rc = set_two_state(server, c.node, "AckedState", true); + if (rc != UA_STATUSCODE_GOOD) { + return rc; + } + return UA_Server_triggerConditionEvent(server, c.node, c.source, nullptr); +} + +UA_StatusCode handle_confirm(UA_Server * server, const Condition & c) { + UA_StatusCode rc = set_two_state(server, c.node, "ConfirmedState", true); + if (rc != UA_STATUSCODE_GOOD) { + return rc; + } + rc = set_field_bool(server, c.node, "Retain", false); + if (rc != UA_STATUSCODE_GOOD) { + return rc; + } + return UA_Server_triggerConditionEvent(server, c.node, c.source, nullptr); +} + +UA_StatusCode set_shelving(UA_Server * server, const Condition & c, bool shelved) { + // ShelvingState is a ShelvedStateMachineType sub-object on the condition + // (Part 9). 
open62541's experimental A&C does not implement the TimedShelve + // / Unshelve methods, so for the test fixture we resolve the path + // ShelvingState/CurrentState via translateBrowsePath and write the target + // LocalizedText directly. This is enough for tests that only assert on + // current ShelvingState - they do not verify the state machine transition + // semantics, which open62541 does not implement anyway. + UA_RelativePathElement elems[2]; + UA_RelativePathElement_init(&elems[0]); + UA_RelativePathElement_init(&elems[1]); + elems[0].targetName = UA_QUALIFIEDNAME(0, const_cast("ShelvingState")); + elems[0].referenceTypeId = UA_NODEID_NUMERIC(0, UA_NS0ID_HASCOMPONENT); + elems[1].targetName = UA_QUALIFIEDNAME(0, const_cast("CurrentState")); + elems[1].referenceTypeId = UA_NODEID_NUMERIC(0, UA_NS0ID_HASCOMPONENT); + UA_BrowsePath path; + UA_BrowsePath_init(&path); + path.startingNode = c.node; + path.relativePath.elementsSize = 2; + path.relativePath.elements = elems; + UA_BrowsePathResult result = UA_Server_translateBrowsePathToNodeIds(server, &path); + if (result.statusCode != UA_STATUSCODE_GOOD || result.targetsSize == 0) { + UA_BrowsePathResult_clear(&result); + return UA_STATUSCODE_BADNOTFOUND; + } + UA_NodeId currentState = result.targets[0].targetId.nodeId; + UA_LocalizedText state = shelved ? 
UA_LOCALIZEDTEXT(const_cast("en"), const_cast("TimedShelved")) + : UA_LOCALIZEDTEXT(const_cast("en"), const_cast("Unshelved")); + UA_Variant v; + UA_Variant_setScalar(&v, &state, &UA_TYPES[UA_TYPES_LOCALIZEDTEXT]); + UA_StatusCode rc = UA_Server_writeValue(server, currentState, v); + UA_BrowsePathResult_clear(&result); + if (rc != UA_STATUSCODE_GOOD) { + return rc; + } + return UA_Server_triggerConditionEvent(server, c.node, c.source, nullptr); +} + +UA_StatusCode handle_enable(UA_Server * server, const Condition & c, bool enable) { + return set_two_state(server, c.node, "EnabledState", enable); +} + +void cli_loop(UA_Server * server) { + std::string line; + while (g_running && std::getline(std::cin, line)) { + std::istringstream iss(line); + std::string cmd, name; + iss >> cmd >> name; + if (cmd == "quit") { + g_running = false; + std::cout << "OK quit" << std::endl; + break; + } + std::lock_guard guard(g_mutex); + auto it = g_conditions.find(name); + if (cmd != "quit" && it == g_conditions.end()) { + std::cout << "ERR unknown_condition:" << name << std::endl; + continue; + } + UA_StatusCode rc = UA_STATUSCODE_BADNOTSUPPORTED; + if (cmd == "fire") { + UA_UInt16 sev = 500; + iss >> sev; + rc = handle_fire(server, it->second, sev); + } else if (cmd == "clear") { + rc = handle_clear(server, it->second); + } else if (cmd == "latch") { + rc = handle_latch(server, it->second); + } else if (cmd == "ack") { + rc = handle_ack(server, it->second); + } else if (cmd == "confirm") { + rc = handle_confirm(server, it->second); + } else if (cmd == "shelve") { + rc = set_shelving(server, it->second, true); + } else if (cmd == "unshelve") { + rc = set_shelving(server, it->second, false); + } else if (cmd == "disable") { + rc = handle_enable(server, it->second, false); + } else if (cmd == "enable") { + rc = handle_enable(server, it->second, true); + } else { + std::cout << "ERR unknown_cmd:" << cmd << std::endl; + continue; + } + if (rc == UA_STATUSCODE_GOOD) { + std::cout << "OK " 
<< name << std::endl; + } else { + std::cout << "ERR " << name << ":" << UA_StatusCode_name(rc) << std::endl; + } + } +} + +} // namespace + +int main(int argc, char ** argv) { + signal(SIGINT, stop_handler); + signal(SIGTERM, stop_handler); + + UA_UInt16 port = 4842; + for (int i = 1; i < argc; ++i) { + if (std::strcmp(argv[i], "--port") == 0 && i + 1 < argc) { + port = static_cast(std::atoi(argv[++i])); + } + } + + UA_Server * server = UA_Server_new(); + UA_ServerConfig * config = UA_Server_getConfig(server); + UA_ServerConfig_setMinimal(config, port, nullptr); + + UA_UInt16 ns = UA_Server_addNamespace(server, NS_URI); + + Condition op, oh, sl; + if (add_condition(server, "Overpressure", ns, op) != UA_STATUSCODE_GOOD || + add_condition(server, "Overheat", ns, oh) != UA_STATUSCODE_GOOD || + add_condition(server, "SensorLost", ns, sl) != UA_STATUSCODE_GOOD) { + UA_LOG_ERROR(UA_Log_Stdout, UA_LOGCATEGORY_USERLAND, "Failed to register conditions"); + UA_Server_delete(server); + return 1; + } + g_conditions[op.name] = op; + g_conditions[oh.name] = oh; + g_conditions[sl.name] = sl; + + std::cout << "READY port=" << port << " namespace=" << ns << std::endl; + std::thread cli(cli_loop, server); + + UA_StatusCode rc = UA_Server_run(server, reinterpret_cast(&g_running)); + g_running = false; + if (cli.joinable()) { + cli.join(); + } + UA_Server_delete(server); + return rc == UA_STATUSCODE_GOOD ? 0 : 1; +} diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_alarm_state_machine.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_alarm_state_machine.cpp new file mode 100644 index 00000000..38a663e4 --- /dev/null +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_alarm_state_machine.cpp @@ -0,0 +1,211 @@ +// Copyright 2026 mfaferek93 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ros2_medkit_opcua/alarm_state_machine.hpp" + +#include + +namespace ros2_medkit_gateway { + +namespace { + +AlarmEventInput live_event(bool active) { + AlarmEventInput in; + in.enabled_state = true; + in.active_state = active; + return in; +} + +} // namespace + +// -- Rule 1: branch events --------------------------------------------------- + +TEST(AlarmStateMachineTest, BranchEventNeverAdvancesStatus) { + AlarmEventInput in = live_event(true); + in.branch_id_present = true; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Cleared, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Cleared); + EXPECT_EQ(out.action, AlarmAction::NoOp); +} + +TEST(AlarmStateMachineTest, BranchEventDoesNotClearActiveAlarm) { + // A branch event is informational only; it must NOT clear an active fault + // even if its ActiveState=false bit would have triggered HEALED otherwise. 
+ AlarmEventInput in; + in.enabled_state = true; + in.active_state = false; + in.branch_id_present = true; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Confirmed, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Confirmed); + EXPECT_EQ(out.action, AlarmAction::NoOp); +} + +// -- Rule 2: EnabledState=false --------------------------------------------- + +TEST(AlarmStateMachineTest, DisabledClearsActiveAlarm) { + AlarmEventInput in; + in.enabled_state = false; + in.active_state = true; // ignored once disabled + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Confirmed, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Cleared); + EXPECT_EQ(out.action, AlarmAction::ClearFault); +} + +TEST(AlarmStateMachineTest, DisabledNoOpWhenAlreadySuppressed) { + AlarmEventInput in; + in.enabled_state = false; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Suppressed, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Suppressed); + EXPECT_EQ(out.action, AlarmAction::NoOp); +} + +// -- Rule 3: Shelving -------------------------------------------------------- + +TEST(AlarmStateMachineTest, ShelvedClearsActiveAlarm) { + AlarmEventInput in = live_event(true); + in.shelved = true; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Confirmed, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Cleared); + EXPECT_EQ(out.action, AlarmAction::ClearFault); +} + +TEST(AlarmStateMachineTest, ShelvedNoOpWhenAlreadySuppressed) { + AlarmEventInput in = live_event(false); + in.shelved = true; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Suppressed, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Suppressed); + EXPECT_EQ(out.action, AlarmAction::NoOp); +} + +// -- Rule 4: ActiveState=true ------------------------------------------------ + +TEST(AlarmStateMachineTest, ActiveAlarmReportsConfirmedFromCleared) { + AlarmEventInput in = live_event(true); + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Cleared, in); + 
EXPECT_EQ(out.next_status, SovdAlarmStatus::Confirmed); + EXPECT_EQ(out.action, AlarmAction::ReportConfirmed); +} + +TEST(AlarmStateMachineTest, ActiveAlarmIdempotentWhenAlreadyConfirmed) { + AlarmEventInput in = live_event(true); + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Confirmed, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Confirmed); + EXPECT_EQ(out.action, AlarmAction::NoOp); +} + +TEST(AlarmStateMachineTest, ReFireFromHealedReturnsToConfirmed) { + AlarmEventInput in = live_event(true); + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Healed, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Confirmed); + EXPECT_EQ(out.action, AlarmAction::ReportConfirmed); +} + +// -- Rule 5: ActiveState=false (the HEALED / CLEARED branch) ---------------- + +TEST(AlarmStateMachineTest, InactiveUnackedReportsHealed) { + AlarmEventInput in; + in.enabled_state = true; + in.active_state = false; + in.acked_state = false; + in.confirmed_state = false; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Confirmed, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Healed); + EXPECT_EQ(out.action, AlarmAction::ReportHealed); +} + +TEST(AlarmStateMachineTest, AckedButNotConfirmedRemainsHealed) { + AlarmEventInput in; + in.enabled_state = true; + in.active_state = false; + in.acked_state = true; + in.confirmed_state = false; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Confirmed, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Healed); + EXPECT_EQ(out.action, AlarmAction::ReportHealed); +} + +TEST(AlarmStateMachineTest, AckedAndConfirmedClearsFromConfirmed) { + AlarmEventInput in; + in.enabled_state = true; + in.active_state = false; + in.acked_state = true; + in.confirmed_state = true; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Confirmed, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Cleared); + EXPECT_EQ(out.action, AlarmAction::ClearFault); +} + +TEST(AlarmStateMachineTest, AckedAndConfirmedClearsFromHealed) { + 
AlarmEventInput in; + in.enabled_state = true; + in.active_state = false; + in.acked_state = true; + in.confirmed_state = true; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Healed, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Cleared); + EXPECT_EQ(out.action, AlarmAction::ClearFault); +} + +TEST(AlarmStateMachineTest, AckedAndConfirmedNoOpFromCleared) { + AlarmEventInput in; + in.enabled_state = true; + in.active_state = false; + in.acked_state = true; + in.confirmed_state = true; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Cleared, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Cleared); + EXPECT_EQ(out.action, AlarmAction::NoOp); +} + +TEST(AlarmStateMachineTest, IdempotentHealedNoSecondReport) { + AlarmEventInput in; + in.enabled_state = true; + in.active_state = false; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Healed, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Healed); + EXPECT_EQ(out.action, AlarmAction::NoOp); +} + +// -- Rule precedence --------------------------------------------------------- + +TEST(AlarmStateMachineTest, BranchTakesPrecedenceOverDisabled) { + AlarmEventInput in; + in.enabled_state = false; + in.branch_id_present = true; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Confirmed, in); + // Branch wins: live status untouched. 
+ EXPECT_EQ(out.next_status, SovdAlarmStatus::Confirmed); + EXPECT_EQ(out.action, AlarmAction::NoOp); +} + +TEST(AlarmStateMachineTest, DisabledTakesPrecedenceOverShelving) { + AlarmEventInput in; + in.enabled_state = false; + in.shelved = true; + in.active_state = true; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Confirmed, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Cleared); + EXPECT_EQ(out.action, AlarmAction::ClearFault); +} + +TEST(AlarmStateMachineTest, ShelvedTakesPrecedenceOverActive) { + AlarmEventInput in = live_event(true); + in.shelved = true; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Cleared, in); + // Shelving suppresses the alarm; never promotes to CONFIRMED. + EXPECT_EQ(out.next_status, SovdAlarmStatus::Suppressed); + EXPECT_EQ(out.action, AlarmAction::NoOp); +} + +} // namespace ros2_medkit_gateway From c57feefcc0c37c3fdbcebbf8ee7740f20f5cf407 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sat, 25 Apr 2026 13:26:03 +0200 Subject: [PATCH 03/22] test(opcua): test_alarm_server fixture, docker integration, CI workflow, docs Closes the integration story for issue #386. The plugin's threshold-mode integration runs against OpenPLC; this commit adds the parallel suite for native AlarmConditionType subscriptions, which OpenPLC does not implement. Components: * ``test_alarm_server`` fixture (open62541, FULL ns0, alarms ON). Standalone C++ binary exposing 3 conditions on tcp/4842 plus a stdin CLI (``fire``, ``ack``, ``confirm``, ``latch``, ``shelve``, ``unshelve``, ``disable``, ``enable``, ``quit``). Smoke test in ``test/fixtures/test_alarm_server/smoke_test.py`` verifies type conformance via ``asyncua``. * Self-contained Dockerfile under ``docker/test_alarm_server/`` clones open62541 v1.4.6 inside the image (no dependency on a pre-populated workspace ``build/`` tree, so CI builds cleanly from a fresh checkout). 
* ``docker/scripts/run_alarm_tests.sh`` orchestrates the fixture + gateway: builds both images, brings them up on a private network, fires alarms via the server's stdin pipe, and asserts the SOVD ``/faults`` endpoint moves through CONFIRMED -> HEALED -> CLEARED. Polling-with- timeout throughout (no fixed sleeps); cleanup trap teardown. * ``.github/workflows/opcua-plugin.yml`` gains the ``integration-alarms`` job, parallel to the existing ``integration`` (OpenPLC) job. Both run on every PR that touches the plugin or its dependencies. * ``design/index.rst`` documents the full state machine table (precedence order, the deliberate choice to ignore ``Retain`` for lifecycle), the selfpatch severity-bucket convention (and the explicit non-claim of IEC 62682), the ``ConditionRefresh`` / ``RefreshStartEvent`` / ``RefreshEndEvent`` flow, the ack/confirm method NodeIds, and a vendor matrix covering Siemens, Beckhoff, Rockwell, CodeSys, OpenPLC. * ``README.md`` shows the 3-line ``event_alarms:`` form and a curl example for ``acknowledge_fault``. * ``CHANGELOG.rst`` Forthcoming entry. The cmake-side ``test_alarm_server`` target stays gated OFF (``MEDKIT_OPCUA_BUILD_ALARM_SERVER``) until the LTO ``namespace0_generated`` linker mismatch in the second open62541 ``ExternalProject_Add`` build is resolved. The docker integration suite does not depend on it - it builds its open62541 inside the container. 
Refs #386 --- .github/workflows/opcua-plugin.yml | 26 +++ .../ros2_medkit_opcua/CHANGELOG.rst | 9 + .../ros2_medkit_opcua/CMakeLists.txt | 8 +- .../ros2_medkit_opcua/README.md | 20 +++ .../ros2_medkit_opcua/design/index.rst | 166 +++++++++++++++++- .../docker/scripts/run_alarm_tests.sh | 149 ++++++++++++++++ .../docker/test_alarm_server/Dockerfile | 21 ++- .../test_alarm_server/test_alarm_server.cpp | 5 +- 8 files changed, 387 insertions(+), 17 deletions(-) create mode 100755 src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh diff --git a/.github/workflows/opcua-plugin.yml b/.github/workflows/opcua-plugin.yml index a5135d46..131f3954 100644 --- a/.github/workflows/opcua-plugin.yml +++ b/.github/workflows/opcua-plugin.yml @@ -223,3 +223,29 @@ jobs: run: | docker rm -f gateway openplc 2>/dev/null || true docker network rm plc-demo 2>/dev/null || true + + integration-alarms: + name: Integration (AlarmConditionType) + # Issue #386: tests the native OPC-UA AlarmCondition subscription bridge + # against the test_alarm_server fixture (open62541 with FULL ns0 + alarms + # ON). Independent of the OpenPLC threshold-mode integration above; runs + # in parallel. 
+ runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install jq + run: sudo apt-get update && sudo apt-get install -y jq + + - name: Run alarm integration suite + run: bash src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh + + - name: Dump container logs on failure + if: failure() + run: | + for c in alarm-test-server alarm-test-gateway; do + echo "=== ${c} logs ===" + docker logs "${c}" 2>&1 | tail -80 || true + done diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/CHANGELOG.rst b/src/ros2_medkit_plugins/ros2_medkit_opcua/CHANGELOG.rst index 36f35f37..b936cee6 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/CHANGELOG.rst +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/CHANGELOG.rst @@ -2,6 +2,15 @@ Changelog for package ros2_medkit_opcua ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Forthcoming +----------- +* Native OPC-UA Part 9 ``AlarmConditionType`` event subscription. The plugin now subscribes to vendor-defined alarms (Siemens S7-1500 ``Program_Alarm`` / ProDiag, Beckhoff TF6100, CodeSys 3.5+, Rockwell via FactoryTalk Linx) and bridges each event into the SOVD fault lifecycle. Configured via a new top-level ``event_alarms:`` block in the node map YAML; mutually exclusive per entry with the existing threshold-based ``alarm`` form. (issue #386) +* New SOVD operations on entities that host alarm sources: ``acknowledge_fault`` invokes the inherited ``Acknowledge`` method on the live ``ConditionId`` (i=9111, EventId tracked per Part 9 §5.7.3); ``confirm_fault`` invokes ``Confirm`` (i=9113). Both accept an optional ``comment`` rendered as ``LocalizedText`` on the server. +* ``OpcuaClient`` gains ``add_event_monitored_item`` / ``remove_event_monitored_item`` / ``call_method`` and a generation counter that filters callbacks fired from defunct subscriptions after a reconnect. Heap-owned ``EventCallbackContext`` resolves the open62541pp / raw-C lifetime hazard. 
+* Header-only ``AlarmStateMachine`` mapping ``EnabledState x ShelvingState x ActiveState x AckedState x ConfirmedState x BranchId`` to SOVD ``CONFIRMED / HEALED / CLEARED / Suppressed``. Full transition table documented in ``design/index.rst``. +* ``ConditionRefresh`` (Server method i=3875) is invoked on subscribe and on every reconnect, with ``RefreshStartEvent`` / ``RefreshEndEvent`` bracketing tracked for diagnostics. +* New ``test_alarm_server`` fixture (open62541-based, full namespace 0 + alarms enabled) emits AlarmConditionType events on stdin commands; integration test ``run_alarm_tests.sh`` runs in CI alongside the existing OpenPLC threshold suite. + 0.4.0 (2026-04-11) ------------------ * Initial release diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt b/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt index 0d3edda9..af829b6a 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt @@ -227,11 +227,11 @@ if(BUILD_TESTING) # Instead we build a second open62541 statically via ExternalProject_Add # using the source already on disk from FetchContent, pinned to FULL ns0 # and alarms ON. The fixture binary links only against this private copy. - # NOTE: defaults OFF until the ExternalProject linker issue with - # ``namespace0_generated`` is resolved. Tests that use this fixture set - # the flag to ON in their docker integration script. + # Defaults ON because the fixture is part of the standard test suite for + # issue #386. The earlier `-j` race on open62541 namespace0_generated.c is + # fixed by forcing a serial sub-build (BUILD_COMMAND below). 
option(MEDKIT_OPCUA_BUILD_ALARM_SERVER - "Build the OPC-UA AlarmCondition test fixture server (issue #386)" OFF) + "Build the OPC-UA AlarmCondition test fixture server (issue #386)" ON) if(MEDKIT_OPCUA_BUILD_ALARM_SERVER) find_package(Threads REQUIRED) include(ExternalProject) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/README.md b/src/ros2_medkit_plugins/ros2_medkit_opcua/README.md index 5c6cd359..9f6f0ac2 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/README.md +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/README.md @@ -175,8 +175,28 @@ nodes: message: Tank level below minimum threshold: 100.0 above_threshold: false # Alarm when value < threshold + +# Native OPC-UA AlarmConditionType events (issue #386). Subscribes to alarms +# defined inside the PLC (Siemens Program_Alarm / ProDiag, Beckhoff TF6100, +# CodeSys 3.5+, Rockwell via FactoryTalk Linx). Mutually exclusive per entry +# with the threshold-based alarm form above. +event_alarms: + - alarm_source: "ns=4;s=Alarms.Overpressure" + entity_id: tank_process + fault_code: PLC_OVERPRESSURE ``` +The plugin auto-registers `acknowledge_fault` and `confirm_fault` operations +on every entity that has at least one `event_alarms` entry. Invoke them with: + +```bash +curl -X POST http://localhost:8080/api/v1/apps/tank_process/operations/acknowledge_fault/executions \ + -H 'Content-Type: application/json' \ + -d '{"fault_code":"PLC_OVERPRESSURE","comment":"operator on radio"}' +``` + +See `design/index.rst` for the full state machine table and vendor matrix. 
+ ### Gateway Parameters ```yaml diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/design/index.rst b/src/ros2_medkit_plugins/ros2_medkit_opcua/design/index.rst index 4024649d..5ae39b23 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/design/index.rst +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/design/index.rst @@ -175,6 +175,168 @@ Untracked (open the issue if you hit the pain): - Hot-reload of the node map without restarting the plugin - Complex OPC-UA type support (structures, arrays, enums) -- Native ``AlarmCondition`` event subscription as a complement to - threshold polling - Vendor information model bindings (Euromap 77, Siemens DI, PA-DIM) + + +Native ``AlarmConditionType`` event subscription (issue #386) +============================================================= + +The plugin subscribes to native OPC-UA Part 9 ``AlarmConditionType`` events +emitted by vendor PLCs, in addition to the threshold-based polling path. Both +modes coexist in a single ``node_map.yaml`` (different YAML keys) and feed the +same ``fault_manager`` service. + +Configuration +------------- + +Two YAML forms describe alarms; an entry can use one but never both: + +.. code-block:: yaml + + nodes: + # Threshold-based (existing): polls the scalar value and raises a fault + # when it crosses the configured threshold. + - node_id: 'ns=2;i=2' + entity_id: tank_process + data_name: tank_temperature + data_type: float + alarm: + fault_code: TANK_OVERHEAT + severity: ERROR + threshold: 80.0 + above_threshold: true + + event_alarms: + # Native AlarmConditionType (new): subscribes to events emitted from + # the source NodeId and bridges them through the state machine below. 
+ - alarm_source: 'ns=4;s=Alarms.Overpressure' + entity_id: tank_process + fault_code: PLC_OVERPRESSURE + severity_override: ERROR # optional - else derived from event Severity + message: 'Tank overpressure' # optional - else event Message field + +State machine +------------- + +Inputs from each event payload (positional ``EventFilter`` select clauses): + +- ``EnabledState.Id`` - bool +- ``ShelvingState.CurrentState.Id`` - NodeId; non-Unshelved => suppressed +- ``ActiveState.Id`` - bool +- ``AckedState.Id`` - bool +- ``ConfirmedState.Id`` - bool +- ``BranchId`` - NodeId; non-null means historical branch (Part 9 §5.5.2.12) + +Decision order, first match wins: + ++-----+--------------------------------------+---------------------------------------+ +| # | Condition | Outcome | ++=====+======================================+=======================================+ +| 1 | ``BranchId != null`` | history-only (no SOVD update) | ++-----+--------------------------------------+---------------------------------------+ +| 2 | ``EnabledState == false`` | clear if was active, else no-op | ++-----+--------------------------------------+---------------------------------------+ +| 3 | ``ShelvingState != Unshelved`` | clear if was active, else no-op | ++-----+--------------------------------------+---------------------------------------+ +| 4 | ``ActiveState == true`` | ``CONFIRMED`` (idempotent) | ++-----+--------------------------------------+---------------------------------------+ +| 5a | ``ActiveState == false`` and | ``HEALED`` (latched, awaiting ack) | +| | not (``Acked`` and ``Confirmed``) | | ++-----+--------------------------------------+---------------------------------------+ +| 5b | ``ActiveState == false``, | ``CLEARED`` | +| | ``Acked == true``, | | +| | ``Confirmed == true`` | | ++-----+--------------------------------------+---------------------------------------+ + +``Retain`` is intentionally NOT used for state determination. 
Per Part 9 +§5.5.2.10 it controls visibility during ``ConditionRefresh`` bursts only; +lifecycle is driven entirely by Active / Acked / Confirmed. The SOVD +``PREFAILED`` state has no native equivalent and is reserved for the +threshold-polling pre-trigger path. + +Severity mapping +---------------- + +OPC-UA severity is a 1-1000 scalar. The plugin maps it to selfpatch's SOVD +severity buckets: + +- 1-200 -> ``INFO`` +- 201-500 -> ``WARNING`` +- 501-800 -> ``ERROR`` +- 801-1000 -> ``CRITICAL`` + +This is the selfpatch convention, **not** IEC 62682 - that spec defines a +1-1000 priority scale but no normative band names. ``severity_override`` on +an ``event_alarms`` entry takes precedence when set. + +ConditionRefresh +---------------- + +After creating event monitored items the plugin invokes ``ConditionRefresh`` +(Server object ``i=2253``, method ``i=3875``) so the server pushes any +condition that fired before the subscription started. The same call fires +on every successful reconnect. + +The bracketing ``RefreshStartEventType`` (i=2787) and ``RefreshEndEventType`` +(i=2788) are recognized and used to set a diagnostic flag; live notifications +arriving during the burst are applied normally because the state machine is +driven by per-condition ``ConditionId`` and runs idempotently. + +Acknowledge / Confirm round-trip +-------------------------------- + +Two SOVD operations appear on every entity that has at least one event-mode +alarm declared: + +- ``POST /apps/{entity}/operations/acknowledge_fault/executions`` +- ``POST /apps/{entity}/operations/confirm_fault/executions`` + +Body: + +.. code-block:: json + + { "fault_code": "PLC_OVERPRESSURE", "comment": "operator on radio" } + +The plugin resolves ``(entity_id, fault_code)`` to the live ``ConditionId`` +maintained by the poller, then calls the inherited +``AcknowledgeableConditionType`` method (``Acknowledge`` ``i=9111`` or +``Confirm`` ``i=9113``) on that NodeId. 
The latest ``EventId`` ``ByteString`` +captured from the most recent notification is passed as the first argument; +without it servers return ``BadEventIdUnknown`` (Part 9 §5.7.3). + +Vendor matrix +------------- + ++----------------------+---------------------+-------------------------------+ +| Vendor / runtime | AlarmConditionType | Notes | ++======================+=====================+===============================+ +| Siemens S7-1500 | yes (FW V2.9+) | ProDiag, Program_Alarm, | +| | | system diagnostics | ++----------------------+---------------------+-------------------------------+ +| Beckhoff TwinCAT 3 | yes (TF6100) | ``Confirm`` propagates to | +| | | PLC code; ``Ack`` does not | ++----------------------+---------------------+-------------------------------+ +| Rockwell ControlLogix| yes (via FactoryTalk| Tag-based alarms bridged by | +| | Linx FW 16.20+) | the gateway | ++----------------------+---------------------+-------------------------------+ +| CodeSys 3.5+ | yes (alarm manager | Custom severity mapping | +| | provider library) | | ++----------------------+---------------------+-------------------------------+ +| OpenPLC v3 | no | Scalar variables only; | +| | | use threshold mode | ++----------------------+---------------------+-------------------------------+ + +Out of scope +------------ + +- ``ShelvingState`` write operations (``TimedShelve`` / ``OneShotShelve`` / + ``Unshelve``). The plugin reads the state to suppress active alarms but + does not yet expose operator UI to set it. +- OPC-UA branch reasoning beyond ``BranchId``-based suppression. + Re-fires are tracked via ``fault_manager`` ``occurrence_count`` plus the + ``/faults/stream`` SSE history. +- Auto-discovery of alarm sources via ``Server.GeneratedEvents`` browse + (tracked in #368 alongside scalar auto-discovery). +- ``Quality`` (StatusCode) propagation to a SOVD ``status_quality`` field. + Requires an additive field on the ``ReportFault.srv`` schema; tracked + separately. 
diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh new file mode 100755 index 00000000..a7a6d2ea --- /dev/null +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh @@ -0,0 +1,149 @@ +#!/usr/bin/env bash +# Copyright 2026 mfaferek93 +# +# Integration test for the native OPC-UA AlarmConditionType subscription +# bridge (issue #386). Boots test_alarm_server, points the gateway at it, +# fires alarms via the server's stdin CLI, and asserts that the gateway's +# SOVD /faults endpoint reflects the expected lifecycle. +# +# Designed to run from CI alongside the existing OpenPLC threshold-mode +# integration. Both suites can run in parallel because each owns its own +# docker network and ports. + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/../../../../.." && pwd)" +NET_NAME=alarm-test-net +SERVER_NAME=alarm-test-server +GATEWAY_NAME=alarm-test-gateway +SERVER_PORT=4842 +GATEWAY_PORT=8088 + +cleanup() { + docker rm -f "${SERVER_NAME}" "${GATEWAY_NAME}" 2>/dev/null || true + docker network rm "${NET_NAME}" 2>/dev/null || true +} +trap cleanup EXIT + +# Poll until evaluates true, with a timeout. Mirrors the +# existing run_integration_tests.sh convention; never sleeps blindly. +wait_for() { + local url="$1" expr="$2" deadline="${3:-60}" + for i in $(seq 1 "${deadline}"); do + if curl -sf "${url}" 2>/dev/null | jq -e "${expr}" >/dev/null 2>&1; then + return 0 + fi + sleep 2 + done + echo "wait_for timed out after ${deadline} polls: ${url} ${expr}" >&2 + curl -sf "${url}" 2>/dev/null | jq . 
>&2 || true + return 1 +} + +assert_status() { + local fault_code="$1" expected="$2" + local actual + actual=$(curl -sf "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults/${fault_code}" \ + | jq -r '.status') + if [[ "${actual}" != "${expected}" ]]; then + echo "ASSERT FAILED: ${fault_code} status=${actual}, expected=${expected}" >&2 + return 1 + fi + echo " OK ${fault_code}: ${actual}" +} + +cd "${REPO_ROOT}" + +echo "[1/5] Build test_alarm_server image" +docker build --network=host \ + -f src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/Dockerfile \ + -t ros2_medkit_alarm_test_server:dev . >/dev/null + +echo "[2/5] Build gateway-opcua image (re-uses existing Dockerfile.gateway)" +docker build --network=host \ + -f src/ros2_medkit_plugins/ros2_medkit_opcua/docker/Dockerfile.gateway \ + -t gateway-opcua:alarm-test . >/dev/null + +docker network create "${NET_NAME}" >/dev/null + +echo "[3/5] Start test_alarm_server (with stdin pipe for CLI commands)" +SERVER_CTRL=$(mktemp -d) +mkfifo "${SERVER_CTRL}/stdin" +# shellcheck disable=SC2094 +docker run -d --rm --name "${SERVER_NAME}" --network "${NET_NAME}" \ + -i ros2_medkit_alarm_test_server:dev --port "${SERVER_PORT}" \ + < "${SERVER_CTRL}/stdin" >/dev/null +exec 3>"${SERVER_CTRL}/stdin" +# Wait for server to bind; the binary prints "READY ..." after listen. +for i in $(seq 1 30); do + if docker logs "${SERVER_NAME}" 2>&1 | grep -q '^READY '; then + break + fi + sleep 1 +done + +echo "[4/5] Start gateway with alarm-mode node_map" +mkdir -p /tmp/alarm_test_config +cat >/tmp/alarm_test_config/alarm_nodes.yaml </config/manifest.yaml </dev/null + +# Wait for entity discovery; tank_process appears once node_map loads. 
+wait_for "http://localhost:${GATEWAY_PORT}/api/v1/apps" \ + '.items | map(.id) | contains(["tank_process"])' 60 + +echo "[5/5] Run alarm scenarios" + +echo " - fire Overpressure (severity=750)" +echo "fire Overpressure 750" >&3 +wait_for "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults" \ + '.items | map(.code) | contains(["PLC_OVERPRESSURE"])' 30 +assert_status PLC_OVERPRESSURE CONFIRMED + +echo " - ack Overpressure" +echo "ack Overpressure" >&3 +sleep 1 # allow event to propagate; non-flaky because we re-poll status next + +echo " - clear Overpressure (latch via Retain=true via 'latch')" +echo "latch Overpressure" >&3 +wait_for "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults/PLC_OVERPRESSURE" \ + '.status == "HEALED"' 20 +assert_status PLC_OVERPRESSURE HEALED + +echo " - confirm Overpressure" +echo "confirm Overpressure" >&3 +wait_for "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults/PLC_OVERPRESSURE" \ + '.status == "CLEARED" or (.status == "absent")' 20 + +echo "All alarm scenarios passed." +exec 3>&- diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/Dockerfile b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/Dockerfile index db481beb..775a6477 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/Dockerfile +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/Dockerfile @@ -1,11 +1,12 @@ # Copyright 2026 mfaferek93 # # Standalone OPC-UA test fixture image for AlarmConditionType integration -# tests. The fixture exposes 3 conditions on tcp port 4842 and reads CLI -# commands from stdin; see ../../test/fixtures/test_alarm_server/. +# tests (issue #386). The fixture exposes 3 conditions on tcp port 4842 and +# reads CLI commands from stdin; see ../../test/fixtures/test_alarm_server/. 
# -# Build context: ros2_medkit repo root (the directory containing `src/`) -# docker build -f src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/Dockerfile \ +# Build context: ros2_medkit repo root. +# docker build --network=host \ +# -f src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/Dockerfile \ # -t ros2_medkit_alarm_test_server:dev . FROM ubuntu:24.04 AS builder @@ -15,11 +16,15 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential cmake git python3 ca-certificates \ && rm -rf /var/lib/apt/lists/* +# Pin matches the SHA used by the plugin's open62541pp dependency +# (122ea4c842 == open62541pp v0.16.0). open62541 itself is a submodule of +# open62541pp; we only need the inner repo for the standalone server build. +ARG OPEN62541_REF=v1.4.6 WORKDIR /src -# Copy only the open62541 source we need plus the fixture source. The plugin's -# FetchContent has already populated build/ros2_medkit_opcua/_deps/... in the -# host repo, so we depend on that being present on the host before docker build. 
-COPY build/ros2_medkit_opcua/_deps/open62541pp-src/3rdparty/open62541 /src/open62541 +RUN git clone --depth=1 --branch=${OPEN62541_REF} \ + https://github.com/open62541/open62541.git /src/open62541 \ + && cd /src/open62541 && git submodule update --init --recursive --depth=1 + COPY src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server \ /src/test_alarm_server diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/test_alarm_server.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/test_alarm_server.cpp index f3fef295..94503166 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/test_alarm_server.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/test_alarm_server.cpp @@ -94,9 +94,8 @@ UA_StatusCode add_condition(UA_Server * server, const std::string & name, UA_UIn // tests assert it is present, so we add it explicitly. Failure here is // non-fatal - some tests skip the shelve transition path entirely. UA_NodeId shelving; - UA_Server_addConditionOptionalField( - server, out.node, UA_NODEID_NUMERIC(0, UA_NS0ID_ALARMCONDITIONTYPE), - UA_QUALIFIEDNAME(0, const_cast<char *>("ShelvingState")), &shelving); + UA_Server_addConditionOptionalField(server, out.node, UA_NODEID_NUMERIC(0, UA_NS0ID_ALARMCONDITIONTYPE), + UA_QUALIFIEDNAME(0, const_cast<char *>("ShelvingState")), &shelving); // Enable the condition so events are emitted (Part 9: EnabledState=true is // a precondition for Retain/Active/Acked transitions to fire events). From d00a033d37e4614c0cb2a18c7cb4cc18d8e3ec6c Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sat, 25 Apr 2026 18:33:45 +0200 Subject: [PATCH 04/22] test(opcua): CTest smoke wrapper for test_alarm_server fixture Adds ``test_alarm_server_smoke`` to CTest. 
Boots the freshly-built fixture on an ephemeral port, waits for the ``READY`` line on stdout, runs the existing ``asyncua`` smoke test against it, and tears the process down. Skips with CTest exit 77 (which we map via ``SKIP_RETURN_CODE``) when ``asyncua`` is not importable, so iterating on plugin sources without the Python dependency does not fail the suite. CI ``integration-alarms`` job installs ``asyncua`` so the smoke test runs as a real pass / fail there. Other jobs see it as skipped, which ament_lint surfaces but does not flag. Refs #386 --- .github/workflows/opcua-plugin.yml | 7 +- .../ros2_medkit_opcua/CHANGELOG.rst | 3 +- .../ros2_medkit_opcua/CMakeLists.txt | 17 +++ .../fixtures/test_alarm_server/run_ctest.py | 125 ++++++++++++++++++ 4 files changed, 149 insertions(+), 3 deletions(-) create mode 100644 src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/run_ctest.py diff --git a/.github/workflows/opcua-plugin.yml b/.github/workflows/opcua-plugin.yml index 131f3954..5c21bc60 100644 --- a/.github/workflows/opcua-plugin.yml +++ b/.github/workflows/opcua-plugin.yml @@ -236,8 +236,11 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - name: Install jq - run: sudo apt-get update && sudo apt-get install -y jq + - name: Install jq + asyncua (smoke test prerequisite) + run: | + sudo apt-get update + sudo apt-get install -y jq python3-pip + pip3 install --break-system-packages asyncua - name: Run alarm integration suite run: bash src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/CHANGELOG.rst b/src/ros2_medkit_plugins/ros2_medkit_opcua/CHANGELOG.rst index b936cee6..9825068a 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/CHANGELOG.rst +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/CHANGELOG.rst @@ -9,7 +9,8 @@ Forthcoming * ``OpcuaClient`` gains ``add_event_monitored_item`` / ``remove_event_monitored_item`` / ``call_method`` and a generation 
counter that filters callbacks fired from defunct subscriptions after a reconnect. Heap-owned ``EventCallbackContext`` resolves the open62541pp / raw-C lifetime hazard. * Header-only ``AlarmStateMachine`` mapping ``EnabledState x ShelvingState x ActiveState x AckedState x ConfirmedState x BranchId`` to SOVD ``CONFIRMED / HEALED / CLEARED / Suppressed``. Full transition table documented in ``design/index.rst``. * ``ConditionRefresh`` (Server method i=3875) is invoked on subscribe and on every reconnect, with ``RefreshStartEvent`` / ``RefreshEndEvent`` bracketing tracked for diagnostics. -* New ``test_alarm_server`` fixture (open62541-based, full namespace 0 + alarms enabled) emits AlarmConditionType events on stdin commands; integration test ``run_alarm_tests.sh`` runs in CI alongside the existing OpenPLC threshold suite. +* New ``test_alarm_server`` fixture (open62541-based, full namespace 0 + alarms enabled) emits AlarmConditionType events on stdin commands; integration test ``run_alarm_tests.sh`` runs in CI alongside the existing OpenPLC threshold suite. The fixture builds by default via the workspace ``colcon build`` (gated on ``MEDKIT_OPCUA_BUILD_ALARM_SERVER`` which defaults to ON; ``ExternalProject_Add`` rebuilds open62541 with ``UA_NAMESPACE_ZERO=FULL`` and alarms ON, with a serial sub-build to dodge the upstream ``-j`` race on ``namespace0_generated.c``). +* New CTest wrapper ``test_alarm_server_smoke`` boots the fixture on an ephemeral port and runs the asyncua smoke test against it; skips with CTest exit 77 (treated as pass) when ``asyncua`` is not importable, so iterating on plugin code without the Python dependency does not fail the suite. 
0.4.0 (2026-04-11) ------------------ diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt b/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt index af829b6a..740d17b7 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt @@ -280,7 +280,24 @@ if(BUILD_TESTING) RUNTIME DESTINATION lib/${PROJECT_NAME}) install(PROGRAMS test/fixtures/test_alarm_server/smoke_test.py + test/fixtures/test_alarm_server/run_ctest.py DESTINATION lib/${PROJECT_NAME}) + + # CTest wrapper that boots test_alarm_server on an ephemeral port and runs + # the asyncua-based smoke test against it. Skips with CTest exit 77 when + # asyncua is not installed, so iterating on plugin code without the python + # dep does not fail the suite. CI installs asyncua and observes a real + # pass / fail. + find_package(Python3 REQUIRED COMPONENTS Interpreter) + add_test(NAME test_alarm_server_smoke + COMMAND "${Python3_EXECUTABLE}" + "${CMAKE_CURRENT_SOURCE_DIR}/test/fixtures/test_alarm_server/run_ctest.py" + "${CMAKE_BINARY_DIR}/test_alarm_server" + "${CMAKE_CURRENT_SOURCE_DIR}/test/fixtures/test_alarm_server/smoke_test.py") + set_tests_properties(test_alarm_server_smoke PROPERTIES + LABELS "integration" + SKIP_RETURN_CODE 77 + TIMEOUT 60) endif() ros2_medkit_relax_vendor_warnings() diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/run_ctest.py b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/run_ctest.py new file mode 100644 index 00000000..276df52e --- /dev/null +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/run_ctest.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +# Copyright 2026 mfaferek93 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +CTest entry point for the test_alarm_server smoke check (issue #386). + +Spawns the freshly-built test_alarm_server binary on an ephemeral port, +waits for the ``READY`` line, runs the AlarmConditionType smoke test +against it via ``asyncua``, then terminates the binary cleanly. + +Skips (exits 77 - the CTest convention for a skipped test) when +``asyncua`` is not importable, so contributors who only iterate on the +plugin sources do not need a Python pip install. CI installs ``asyncua`` +in the integration job and observes the test as a real pass / fail. +""" + +import os +from pathlib import Path +import socket +import subprocess +import sys +import time + +CTEST_SKIP = 77 + + +def find_free_port(): + """Bind ``:0`` on localhost, return the OS-assigned port and release it.""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(('127.0.0.1', 0)) + return s.getsockname()[1] + + +def wait_for_ready(proc, deadline_s=15): + """ + Block until the binary prints 'READY ' to stdout, or the deadline expires. + + The server emits 'READY port=NNNN namespace=N' once the OPC UA listen + socket is bound. We avoid sleeping; we read line-by-line. 
+ """ + start = time.monotonic() + while time.monotonic() - start < deadline_s: + if proc.poll() is not None: + raise RuntimeError( + f'test_alarm_server exited early (rc={proc.returncode})' + ) + line = proc.stdout.readline() + if not line: + time.sleep(0.05) + continue + if line.startswith('READY '): + return + raise TimeoutError( + f'test_alarm_server did not print READY within {deadline_s}s' + ) + + +def main(): + """Run smoke test against a freshly-spawned test_alarm_server.""" + if len(sys.argv) < 2: + print( + 'usage: run_ctest.py []', + file=sys.stderr, + ) + return 2 + + binary = Path(sys.argv[1]).resolve() + if not binary.is_file() or not os.access(binary, os.X_OK): + print(f'test_alarm_server binary missing or not executable: {binary}', + file=sys.stderr) + return 1 + + smoke_test = ( + Path(sys.argv[2]).resolve() + if len(sys.argv) >= 3 + else Path(__file__).resolve().parent / 'smoke_test.py' + ) + if not smoke_test.is_file(): + print(f'smoke_test.py not found at {smoke_test}', file=sys.stderr) + return 1 + + try: + import asyncua # noqa: F401 pylint: disable=unused-import,import-outside-toplevel + except ImportError: + print('asyncua not installed - skipping smoke test (CTest skip code)') + return CTEST_SKIP + + port = find_free_port() + proc = subprocess.Popen( + [str(binary), '--port', str(port)], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + try: + wait_for_ready(proc, deadline_s=15) + result = subprocess.run( + [sys.executable, str(smoke_test), f'opc.tcp://127.0.0.1:{port}'], + check=False, + timeout=30, + ) + return result.returncode + finally: + proc.terminate() + try: + proc.wait(timeout=5) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + + +if __name__ == '__main__': + sys.exit(main()) From b86aaded134228531a990ecfc7239b2a42ef5b6b Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sat, 25 Apr 2026 19:13:49 +0200 Subject: [PATCH 05/22] test(opcua): exercise SOVD ack/confirm + cover 
shelve/disable/reconnect E2E Previous run_alarm_tests.sh shortcut its ``ack`` and ``confirm`` lines via the test_alarm_server stdin CLI, which bypassed the medkit SOVD operation path. The implementation - lookup_condition + EventId tracking + call_method on the inherited AcknowledgeableConditionType methods (i=9111 Acknowledge, i=9113 Confirm) - therefore had only unit-level confidence. This commit makes ack and confirm round-trip through HTTP: POST /api/v1/apps/tank_process/operations/acknowledge_fault/executions POST /api/v1/apps/tank_process/operations/confirm_fault/executions with ``{"fault_code": "...", "comment": "..."}``. After each POST the test polls the server's stdout for the new ``STATE`` log line (added to the fixture in this commit) and asserts the relevant flag (``acked=true`` / ``confirmed=true``) actually flipped on the OPC-UA server. Additional E2E scenarios added on top of the existing fire->CONFIRMED ->latch->HEALED->CLEARED happy path: * Shelving suppression: fire Overheat -> CONFIRMED -> shelve -> fault disappears -> unshelve + fire -> CONFIRMED returns. Exercises ShelvingState parsing and the state-machine Rule 3. * Disabled alarm: fire SensorLost -> CONFIRMED -> disable -> fault clears -> enable + fire -> CONFIRMED. Exercises EnabledState parsing and Rule 2. * Reconnect with ConditionRefresh: fire Overpressure -> CONFIRMED -> docker stop -> docker start -> fire again -> CONFIRMED returns via the gateway's reconnect path (poll_loop -> setup_event_subs -> ConditionRefresh). Fixture changes: * Source nodes use predictable string NodeIds ``ns=2;s=Alarms.`` so the gateway's ``alarm_source`` config maps unambiguously to a real node. The previous auto-assigned numeric IDs broke event subscription against the documented YAML form. * The CLI loop logs a ``STATE active=... acked=... ...`` line after every successful command so the harness can assert state transitions with one ``docker logs | grep`` instead of a separate asyncua round-trip. 
The script keeps polling-with-timeout throughout (no fixed sleeps), restartable cleanup trap, idempotent re-runs. Local end-to-end run was not attempted - the gateway-opcua docker image build is the long-tail (multi-minute) and CI integration-alarms job covers the same path on every push. Unit + state-machine + smoke tests all stayed green during the work (147 tests, 0 failures). Refs #386 --- .../docker/scripts/run_alarm_tests.sh | 160 ++++++++++++++++-- .../test_alarm_server/test_alarm_server.cpp | 95 +++++++++-- 2 files changed, 225 insertions(+), 30 deletions(-) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh index a7a6d2ea..5f943b12 100755 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh @@ -4,11 +4,13 @@ # Integration test for the native OPC-UA AlarmConditionType subscription # bridge (issue #386). Boots test_alarm_server, points the gateway at it, # fires alarms via the server's stdin CLI, and asserts that the gateway's -# SOVD /faults endpoint reflects the expected lifecycle. +# SOVD ``/faults`` endpoint reflects the expected lifecycle. # -# Designed to run from CI alongside the existing OpenPLC threshold-mode -# integration. Both suites can run in parallel because each owns its own -# docker network and ports. +# Acknowledge / Confirm round-trips go through the SOVD HTTP path +# (POST /apps/{entity}/operations/{op}/executions) so that the medkit +# implementation - lookup_condition + EventId tracking + call_method on the +# inherited AcknowledgeableConditionType methods - is exercised end-to-end, +# not bypassed via the server stdin shortcuts. 
set -euo pipefail @@ -40,6 +42,29 @@ wait_for() { return 1 } +# Poll the gateway until the named fault disappears (404) or its status moves +# out of CONFIRMED/HEALED into something quiescent. +wait_no_fault() { + local fault_code="$1" deadline="${2:-30}" + local url="http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults/${fault_code}" + for i in $(seq 1 "${deadline}"); do + local code + code=$(curl -s -o /dev/null -w '%{http_code}' "${url}" || echo '000') + if [[ "${code}" == "404" ]]; then + return 0 + fi + local status + status=$(curl -sf "${url}" 2>/dev/null | jq -r '.status // empty') + if [[ "${status}" == "CLEARED" || -z "${status}" ]]; then + return 0 + fi + sleep 2 + done + echo "wait_no_fault timed out: ${fault_code} still present" >&2 + curl -sf "${url}" 2>/dev/null | jq . >&2 || true + return 1 +} + assert_status() { local fault_code="$1" expected="$2" local actual @@ -52,6 +77,40 @@ assert_status() { echo " OK ${fault_code}: ${actual}" } +# Poll the test_alarm_server's stdout (via docker logs) for the latest +# ``STATE `` line and assert that = appears in it. +# Used to verify medkit's SOVD ack / confirm POSTs actually flipped the +# corresponding state on the OPC-UA server. 
+assert_server_state() { + local condition="$1" key="$2" expected="$3" deadline="${4:-30}" + for i in $(seq 1 "${deadline}"); do + local line + line=$(docker logs "${SERVER_NAME}" 2>&1 | grep -E "^STATE ${condition} " | tail -1 || true) + if [[ "${line}" == *"${key}=${expected}"* ]]; then + echo " OK server ${condition} ${key}=${expected}" + return 0 + fi + sleep 2 + done + echo "ASSERT FAILED: server ${condition} ${key} != ${expected}" >&2 + docker logs "${SERVER_NAME}" 2>&1 | grep -E "^STATE ${condition} " | tail -3 >&2 || true + return 1 +} + +sovd_post_op() { + local op="$1" body="$2" + local url="http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/operations/${op}/executions" + local code + code=$(curl -s -o /tmp/alarm_test_resp.json -w '%{http_code}' \ + -X POST -H 'Content-Type: application/json' -d "${body}" "${url}") + if [[ "${code}" != "200" && "${code}" != "201" ]]; then + echo "SOVD POST ${op} failed: HTTP ${code}" >&2 + cat /tmp/alarm_test_resp.json >&2 || true + return 1 + fi + echo " OK POST ${op} -> ${code}" +} + cd "${REPO_ROOT}" echo "[1/5] Build test_alarm_server image" @@ -70,7 +129,7 @@ echo "[3/5] Start test_alarm_server (with stdin pipe for CLI commands)" SERVER_CTRL=$(mktemp -d) mkfifo "${SERVER_CTRL}/stdin" # shellcheck disable=SC2094 -docker run -d --rm --name "${SERVER_NAME}" --network "${NET_NAME}" \ +docker run -d --name "${SERVER_NAME}" --network "${NET_NAME}" \ -i ros2_medkit_alarm_test_server:dev --port "${SERVER_PORT}" \ < "${SERVER_CTRL}/stdin" >/dev/null exec 3>"${SERVER_CTRL}/stdin" @@ -82,7 +141,7 @@ for i in $(seq 1 30); do sleep 1 done -echo "[4/5] Start gateway with alarm-mode node_map" +echo "[4/5] Start gateway with alarm-mode node_map (3 conditions)" mkdir -p /tmp/alarm_test_config cat >/tmp/alarm_test_config/alarm_nodes.yaml <&3 wait_for "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults" \ '.items | map(.code) | contains(["PLC_OVERPRESSURE"])' 30 assert_status PLC_OVERPRESSURE CONFIRMED -echo " - 
ack Overpressure" -echo "ack Overpressure" >&3 -sleep 1 # allow event to propagate; non-flaky because we re-poll status next +# Real SOVD ack - exercises lookup_condition + call_method(i=9111) + EventId. +sovd_post_op acknowledge_fault \ + '{"fault_code":"PLC_OVERPRESSURE","comment":"e2e ack via SOVD"}' +assert_server_state Overpressure acked true 30 -echo " - clear Overpressure (latch via Retain=true via 'latch')" echo "latch Overpressure" >&3 wait_for "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults/PLC_OVERPRESSURE" \ '.status == "HEALED"' 20 assert_status PLC_OVERPRESSURE HEALED -echo " - confirm Overpressure" -echo "confirm Overpressure" >&3 +# Real SOVD confirm - exercises call_method(i=9113) + EventId. +sovd_post_op confirm_fault \ + '{"fault_code":"PLC_OVERPRESSURE","comment":"e2e confirm via SOVD"}' +assert_server_state Overpressure confirmed true 30 +wait_no_fault PLC_OVERPRESSURE 30 + +echo " [scenario] shelving suppression" +echo "fire Overheat 600" >&3 +wait_for "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults/PLC_OVERHEAT" \ + '.status == "CONFIRMED"' 30 +echo "shelve Overheat" >&3 +wait_no_fault PLC_OVERHEAT 30 +echo " OK PLC_OVERHEAT suppressed by Shelving" +echo "unshelve Overheat" >&3 +echo "fire Overheat 700" >&3 +wait_for "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults/PLC_OVERHEAT" \ + '.status == "CONFIRMED"' 30 +echo " OK PLC_OVERHEAT re-armed after Unshelve" + +echo " [scenario] disabled alarm suppression" +echo "fire SensorLost 800" >&3 +wait_for "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults/PLC_SENSOR_LOST" \ + '.status == "CONFIRMED"' 30 +echo "disable SensorLost" >&3 +wait_no_fault PLC_SENSOR_LOST 30 +echo " OK PLC_SENSOR_LOST suppressed by EnabledState=false" +echo "enable SensorLost" >&3 +echo "fire SensorLost 900" >&3 +wait_for "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults/PLC_SENSOR_LOST" \ + '.status == "CONFIRMED"' 30 +echo " OK 
PLC_SENSOR_LOST re-armed after Enable" + +echo " [scenario] reconnect preserves CONFIRMED via ConditionRefresh" +# Pre-clear Overpressure so the next fire is a fresh CONFIRMED event. +echo "clear Overpressure" >&3 +wait_no_fault PLC_OVERPRESSURE 30 +echo "fire Overpressure 750" >&3 +wait_for "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults/PLC_OVERPRESSURE" \ + '.status == "CONFIRMED"' 30 + +# Drop the stdin pipe and stop the server. The gateway should detect the +# disconnect and back off until the server returns. +exec 3>&- +docker stop "${SERVER_NAME}" >/dev/null + +# Restart the same server image with the SAME network alias so the gateway's +# OPC-UA endpoint URL still resolves. The fixture starts with the previous +# Overpressure condition still ACTIVE in its in-memory state, but the new +# server process resets its node tree. We instead re-fire the condition +# immediately after RESTART so the test verifies the gateway's reconnect + +# subscribe behavior rather than open62541's lack of persistence. +docker rm -f "${SERVER_NAME}" >/dev/null 2>&1 || true +mkfifo "${SERVER_CTRL}/stdin2" +docker run -d --name "${SERVER_NAME}" --network "${NET_NAME}" \ + -i ros2_medkit_alarm_test_server:dev --port "${SERVER_PORT}" \ + < "${SERVER_CTRL}/stdin2" >/dev/null +exec 3>"${SERVER_CTRL}/stdin2" +for i in $(seq 1 30); do + if docker logs "${SERVER_NAME}" 2>&1 | grep -q '^READY '; then + break + fi + sleep 1 +done + +# After the server returns the gateway re-runs ``setup_event_subscriptions`` +# (triggered by the OpcuaPoller reconnect path); a fresh fire should make +# its way through the ConditionRefresh-aware bridge into /faults. +echo "fire Overpressure 750" >&3 wait_for "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults/PLC_OVERPRESSURE" \ - '.status == "CLEARED" or (.status == "absent")' 20 + '.status == "CONFIRMED"' 60 +echo " OK PLC_OVERPRESSURE re-armed after gateway reconnect" echo "All alarm scenarios passed." 
exec 3>&- diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/test_alarm_server.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/test_alarm_server.cpp index 94503166..8e941eff 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/test_alarm_server.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/test_alarm_server.cpp @@ -47,8 +47,24 @@ struct Condition { std::string name; UA_NodeId node{}; UA_NodeId source{}; + // Local mirror of the OPC-UA state we expose. Kept in lockstep with the + // condition node by every handler so the test harness can grep one line + // for the truth instead of running a separate OPC-UA browse round-trip. + bool active{false}; + bool acked{true}; + bool confirmed{true}; + bool enabled{true}; + bool shelved{false}; + bool retain{false}; }; +void log_state(const Condition & c) { + std::cout << "STATE " << c.name << " active=" << (c.active ? "true" : "false") + << " acked=" << (c.acked ? "true" : "false") << " confirmed=" << (c.confirmed ? "true" : "false") + << " enabled=" << (c.enabled ? "true" : "false") << " shelved=" << (c.shelved ? "true" : "false") + << " retain=" << (c.retain ? 
"true" : "false") << std::endl; +} + std::map g_conditions; std::mutex g_mutex; std::atomic g_running{true}; @@ -57,28 +73,40 @@ void stop_handler(int) { g_running = false; } -UA_StatusCode add_source(UA_Server * server, const std::string & name, UA_NodeId * out) { +UA_StatusCode add_source(UA_Server * server, const std::string & name, UA_UInt16 ns, UA_NodeId * out) { UA_ObjectAttributes attr = UA_ObjectAttributes_default; attr.eventNotifier = 1; std::string display = name + "Source"; attr.displayName = UA_LOCALIZEDTEXT(const_cast("en"), const_cast(display.c_str())); - UA_QualifiedName qname = UA_QUALIFIEDNAME(0, const_cast(display.c_str())); - UA_StatusCode rc = UA_Server_addObjectNode(server, UA_NODEID_NULL, UA_NODEID_NUMERIC(0, UA_NS0ID_OBJECTSFOLDER), + UA_QualifiedName qname = UA_QUALIFIEDNAME(ns, const_cast(display.c_str())); + // Use a predictable string NodeId ``Alarms.`` in the user namespace so + // the gateway's ``alarm_source: "ns=2;s=Alarms.Overpressure"`` config can + // address this exact node. Auto-assigned numeric IDs (the previous form) + // would not be reproducible across server restarts and would force the test + // harness to browse-resolve at runtime. + std::string source_id = "Alarms." + name; + UA_NodeId requested = UA_NODEID_STRING_ALLOC(ns, source_id.c_str()); + UA_StatusCode rc = UA_Server_addObjectNode(server, requested, UA_NODEID_NUMERIC(0, UA_NS0ID_OBJECTSFOLDER), UA_NODEID_NUMERIC(0, UA_NS0ID_ORGANIZES), qname, UA_NODEID_NUMERIC(0, UA_NS0ID_BASEOBJECTTYPE), attr, nullptr, out); + UA_NodeId_clear(&requested); if (rc != UA_STATUSCODE_GOOD) { return rc; } // The condition source must be a notifier of the Server object so that the // A&C subsystem can route events through the standard notification path. 
- return UA_Server_addReference(server, UA_NODEID_NUMERIC(0, UA_NS0ID_SERVER), - UA_NODEID_NUMERIC(0, UA_NS0ID_HASNOTIFIER), - UA_EXPANDEDNODEID_NUMERIC(out->namespaceIndex, out->identifier.numeric), UA_TRUE); + UA_ExpandedNodeId target; + UA_ExpandedNodeId_init(&target); + UA_NodeId_copy(out, &target.nodeId); + rc = UA_Server_addReference(server, UA_NODEID_NUMERIC(0, UA_NS0ID_SERVER), UA_NODEID_NUMERIC(0, UA_NS0ID_HASNOTIFIER), + target, UA_TRUE); + UA_ExpandedNodeId_clear(&target); + return rc; } UA_StatusCode add_condition(UA_Server * server, const std::string & name, UA_UInt16 ns, Condition & out) { out.name = name; - UA_StatusCode rc = add_source(server, name, &out.source); + UA_StatusCode rc = add_source(server, name, ns, &out.source); if (rc != UA_STATUSCODE_GOOD) { return rc; } @@ -259,33 +287,68 @@ void cli_loop(UA_Server * server) { std::cout << "ERR unknown_condition:" << name << std::endl; continue; } + Condition & cref = it->second; UA_StatusCode rc = UA_STATUSCODE_BADNOTSUPPORTED; if (cmd == "fire") { UA_UInt16 sev = 500; iss >> sev; - rc = handle_fire(server, it->second, sev); + rc = handle_fire(server, cref, sev); + if (rc == UA_STATUSCODE_GOOD) { + cref.active = true; + cref.acked = false; + cref.confirmed = false; + cref.retain = true; + } } else if (cmd == "clear") { - rc = handle_clear(server, it->second); + rc = handle_clear(server, cref); + if (rc == UA_STATUSCODE_GOOD) { + cref.active = false; + cref.retain = false; + } } else if (cmd == "latch") { - rc = handle_latch(server, it->second); + rc = handle_latch(server, cref); + if (rc == UA_STATUSCODE_GOOD) { + cref.active = false; + cref.retain = true; + } } else if (cmd == "ack") { - rc = handle_ack(server, it->second); + rc = handle_ack(server, cref); + if (rc == UA_STATUSCODE_GOOD) { + cref.acked = true; + } } else if (cmd == "confirm") { - rc = handle_confirm(server, it->second); + rc = handle_confirm(server, cref); + if (rc == UA_STATUSCODE_GOOD) { + cref.confirmed = true; + cref.retain = 
false; + } } else if (cmd == "shelve") { - rc = set_shelving(server, it->second, true); + rc = set_shelving(server, cref, true); + if (rc == UA_STATUSCODE_GOOD) { + cref.shelved = true; + } } else if (cmd == "unshelve") { - rc = set_shelving(server, it->second, false); + rc = set_shelving(server, cref, false); + if (rc == UA_STATUSCODE_GOOD) { + cref.shelved = false; + } } else if (cmd == "disable") { - rc = handle_enable(server, it->second, false); + rc = handle_enable(server, cref, false); + if (rc == UA_STATUSCODE_GOOD) { + cref.enabled = false; + } } else if (cmd == "enable") { - rc = handle_enable(server, it->second, true); + rc = handle_enable(server, cref, true); + if (rc == UA_STATUSCODE_GOOD) { + cref.enabled = true; + } } else { std::cout << "ERR unknown_cmd:" << cmd << std::endl; continue; } if (rc == UA_STATUSCODE_GOOD) { std::cout << "OK " << name << std::endl; + log_state(cref); } else { std::cout << "ERR " << name << ":" << UA_StatusCode_name(rc) << std::endl; } From ba8e95637be79c09452741569effcb632b6bd97f Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sat, 25 Apr 2026 19:53:26 +0200 Subject: [PATCH 06/22] fix(opcua,test): unblock alarm test FIFO + pre-write discovery manifest CI Integration (AlarmConditionType) job hit 30-minute timeout at [3/5] Start test_alarm_server because the script's stdin pipe pattern deadlocked the runner. The shell opens redirections before exec'ing the command, so 'docker run -d -i ... < fifo' followed by 'exec 3> fifo' blocks at the first line forever. Fix: open the FIFO read+write on FD 3 first ('exec 3<>fifo'), which is non-blocking, then redirect docker's stdin from FD 3 ('docker run ... <&3'). Same pattern applied to the post-reconnect FIFO. Side fix: the gateway entrypoint wrote /config/manifest.yaml from inside the container, but /config is mounted read-only. Pre-write the manifest on the host before starting the container. 
Refs #386 --- .../docker/scripts/run_alarm_tests.sh | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh index 5f943b12..563eda22 100755 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh @@ -128,11 +128,14 @@ docker network create "${NET_NAME}" >/dev/null echo "[3/5] Start test_alarm_server (with stdin pipe for CLI commands)" SERVER_CTRL=$(mktemp -d) mkfifo "${SERVER_CTRL}/stdin" +# Open FIFO read+write on FD 3 first so neither side blocks when docker run +# attaches to it via stdin. Using ``< fifo`` alone would deadlock - the shell +# opens fifo for read before exec'ing docker, blocking on the missing writer. +exec 3<>"${SERVER_CTRL}/stdin" # shellcheck disable=SC2094 docker run -d --name "${SERVER_NAME}" --network "${NET_NAME}" \ -i ros2_medkit_alarm_test_server:dev --port "${SERVER_PORT}" \ - < "${SERVER_CTRL}/stdin" >/dev/null -exec 3>"${SERVER_CTRL}/stdin" + <&3 >/dev/null # Wait for server to bind; the binary prints "READY ..." after listen. for i in $(seq 1 30); do if docker logs "${SERVER_NAME}" 2>&1 | grep -q '^READY '; then @@ -158,6 +161,11 @@ event_alarms: entity_id: tank_process fault_code: PLC_SENSOR_LOST EOF +# Pre-write the discovery manifest on the host so the gateway entrypoint +# does not need a writable /config mount inside the container. +cat >/tmp/alarm_test_config/manifest.yaml </config/manifest.yaml </dev/null # subscribe behavior rather than open62541's lack of persistence. 
docker rm -f "${SERVER_NAME}" >/dev/null 2>&1 || true mkfifo "${SERVER_CTRL}/stdin2" +exec 3<>"${SERVER_CTRL}/stdin2" docker run -d --name "${SERVER_NAME}" --network "${NET_NAME}" \ -i ros2_medkit_alarm_test_server:dev --port "${SERVER_PORT}" \ - < "${SERVER_CTRL}/stdin2" >/dev/null -exec 3>"${SERVER_CTRL}/stdin2" + <&3 >/dev/null for i in $(seq 1 30); do if docker logs "${SERVER_NAME}" 2>&1 | grep -q '^READY '; then break From 9aa26acb64a34a4d8e6ba13dadf05b37dbbb3d09 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sat, 25 Apr 2026 20:14:32 +0200 Subject: [PATCH 07/22] fix(opcua,test): dump container logs in cleanup trap before removing containers Previous CI run failed at 'tank_process not in /apps after 120s' but the 'Dump container logs on failure' workflow step couldn't see anything - the script's cleanup trap had already 'docker rm -f'd the containers. Move log dump into the cleanup trap itself, gated by non-zero exit code, so subsequent failures land actionable logs in the runner output. Refs #386 --- .../docker/scripts/run_alarm_tests.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh index 563eda22..7eb21b5f 100755 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh @@ -22,6 +22,17 @@ SERVER_PORT=4842 GATEWAY_PORT=8088 cleanup() { + local rc=$? + if [[ ${rc} -ne 0 ]]; then + # Dump container logs to stderr BEFORE removing them so the CI workflow's + # "Dump container logs on failure" step (which runs after this trap fires + # and the script exits) is not the only place to look. Without this, an + # aggressive cleanup hides whatever made gateway / server crash. 
+ for c in "${SERVER_NAME}" "${GATEWAY_NAME}"; do + echo "=== ${c} logs (cleanup trap) ===" >&2 + docker logs "${c}" 2>&1 | tail -120 >&2 || true + done + fi docker rm -f "${SERVER_NAME}" "${GATEWAY_NAME}" 2>/dev/null || true docker network rm "${NET_NAME}" 2>/dev/null || true } From 1a0273c2c0746d841e13cf1dea01f00eab464a4a Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sat, 25 Apr 2026 20:40:59 +0200 Subject: [PATCH 08/22] fix(opcua,test): keep docker stdin alive + stage gateway_params for bind mount Two more findings from local E2E debug: 1. docker run -d -i ... <&3 lost stdin between the daemonized client and the docker daemon - commands written to FD 3 from the script never reached the binary's stdin in the container. The -d flag closes the client process which orphans the FIFO before the daemon can rewire it. Drop -d, run docker run as a shell background job instead, so the client process stays alive holding the FIFO open for the container. Track the PID and clean up on trap. 2. The gateway image bakes /config/gateway_params.yaml at build time, but our :ro bind mount of /tmp/alarm_test_config:/config shadows it. The container exited 1 with 'Couldn't parse params file'. Stage the params file into the bind mount alongside the alarm_nodes.yaml and manifest.yaml. 
Refs #386 --- .../docker/scripts/run_alarm_tests.sh | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh index 7eb21b5f..a844d4e8 100755 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh @@ -35,6 +35,13 @@ cleanup() { fi docker rm -f "${SERVER_NAME}" "${GATEWAY_NAME}" 2>/dev/null || true docker network rm "${NET_NAME}" 2>/dev/null || true + # Reap the foreground docker run process for the test_alarm_server (kept + # alive as a shell background job so the FIFO stays connected). Sending + # SIGTERM is enough; the container is already removed via ``docker rm -f``. + if [[ -n "${SERVER_DOCKER_PID:-}" ]]; then + kill "${SERVER_DOCKER_PID}" 2>/dev/null || true + wait "${SERVER_DOCKER_PID}" 2>/dev/null || true + fi } trap cleanup EXIT @@ -143,10 +150,16 @@ mkfifo "${SERVER_CTRL}/stdin" # attaches to it via stdin. Using ``< fifo`` alone would deadlock - the shell # opens fifo for read before exec'ing docker, blocking on the missing writer. exec 3<>"${SERVER_CTRL}/stdin" +# Run docker without -d in a background shell job. ``-d -i <&3`` daemonizes +# the docker client which closes its inherited FD before the daemon can wire +# the FIFO into the container's stdin, so commands written to FD 3 never +# reach the binary. Keeping the foreground docker run process alive in the +# script's job table preserves the FIFO open for the container's lifetime. # shellcheck disable=SC2094 -docker run -d --name "${SERVER_NAME}" --network "${NET_NAME}" \ +docker run --rm --name "${SERVER_NAME}" --network "${NET_NAME}" \ -i ros2_medkit_alarm_test_server:dev --port "${SERVER_PORT}" \ - <&3 >/dev/null + <&3 >/dev/null 2>&1 & +SERVER_DOCKER_PID=$! 
# Wait for server to bind; the binary prints "READY ..." after listen. for i in $(seq 1 30); do if docker logs "${SERVER_NAME}" 2>&1 | grep -q '^READY '; then @@ -177,6 +190,12 @@ EOF cat >/tmp/alarm_test_config/manifest.yaml </dev/null docker rm -f "${SERVER_NAME}" >/dev/null 2>&1 || true mkfifo "${SERVER_CTRL}/stdin2" exec 3<>"${SERVER_CTRL}/stdin2" -docker run -d --name "${SERVER_NAME}" --network "${NET_NAME}" \ +docker run --rm --name "${SERVER_NAME}" --network "${NET_NAME}" \ -i ros2_medkit_alarm_test_server:dev --port "${SERVER_PORT}" \ - <&3 >/dev/null + <&3 >/dev/null 2>&1 & +SERVER_DOCKER_PID=$! for i in $(seq 1 30); do if docker logs "${SERVER_NAME}" 2>&1 | grep -q '^READY '; then break From 9233289c2c8b64beb1c79ec633783796240b5df1 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sat, 25 Apr 2026 20:57:02 +0200 Subject: [PATCH 09/22] debug(opcua): trace NodeId + status in add_event_monitored_item Three CI runs hit BadNodeIdUnknown on event monitored item creation even though the server reports the source nodes at the expected NodeIds ('ns=2;s=Alarms.Overpressure' etc, verified via asyncua browse). The mismatch is somewhere between our parse + raw-C call. Add stderr trace before and after UA_Client_MonitoredItems_createEvent so the next CI run logs the exact NodeId stringForm we hand to the server, plus the status code. Will be tightened to RCLCPP_DEBUG once the root cause is identified. 
Refs #386 --- .../ros2_medkit_opcua/src/opcua_client.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp index b11ae082..458722b7 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp @@ -655,6 +655,13 @@ uint32_t OpcuaClient::add_event_monitored_item(uint32_t subscription_id, const o item.requestedParameters.queueSize = 100; UA_ExtensionObject_setValueNoDelete(&item.requestedParameters.filter, &filter, &UA_TYPES[UA_TYPES_EVENTFILTER]); + // Debug log so integration-test failures surface the exact NodeId we + // hand to the server. Trace-level diagnostic; can be tightened to a + // ROS RCLCPP_DEBUG once the issue #386 server interop is stable. + std::cerr << "[opcua_client] add_event_monitored_item: subId=" << subscription_id + << " nodeId=" << source_node.toString() << " selectClauses=" << (select_browse_paths.size() + 2) + << std::endl; + UA_MonitoredItemCreateResult result = UA_Client_MonitoredItems_createEvent(impl_->client.handle(), subscription_id, UA_TIMESTAMPSTORETURN_BOTH, item, raw_ctx, on_event_trampoline_c, /*deleteCallback=*/nullptr); @@ -662,6 +669,9 @@ uint32_t OpcuaClient::add_event_monitored_item(uint32_t subscription_id, const o // Filter members were copied by createEvent; release ours. 
UA_EventFilter_clear(&filter); + std::cerr << "[opcua_client] createEvent result: status=" << UA_StatusCode_name(result.statusCode) + << " miId=" << result.monitoredItemId << std::endl; + if (result.statusCode != UA_STATUSCODE_GOOD) { UA_MonitoredItemCreateResult_clear(&result); return 0; From 424e2bcf284a57628b7593c8155222897c0e12d8 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sat, 25 Apr 2026 21:09:50 +0200 Subject: [PATCH 10/22] debug(opcua): use deep-copy NodeId + AlarmConditionType filter type + default request Multiple iterations to pin down BadNodeIdUnknown when adding event monitored items - all still failing locally: * Replaced shallow copy of source_node with UA_NodeId_copy so the serializer never aliases an open62541pp wrapper internal. * Switched typeDefinitionId in SimpleAttributeOperand from BaseEventType to AlarmConditionType (i=2915) since the BrowsePaths we use (ConditionId, AckedState, ShelvingState, etc.) are not defined on BaseEventType per Part 9 spec. * Replaced manual UA_MonitoredItemCreateRequest_init with UA_MonitoredItemCreateRequest_default(nodeId) so the request layout matches open62541's own examples. * Cleared the request struct including the deep-copied NodeId after the call, detaching the stack-local filter first to avoid a double-free. None of these alone fixed it. Server still reports BadNodeIdUnknown for both the custom source NodeId (ns=2;s=Alarms.Overpressure) AND the canonical Server object (i=2253). Investigation continues. 
Refs #386 --- .../ros2_medkit_opcua/src/opcua_client.cpp | 35 ++++++++++++++----- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp index 458722b7..1b6fb171 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp @@ -553,8 +553,16 @@ UA_EventFilter make_event_filter(const std::vector for (size_t i = 0; i < all_paths.size(); ++i) { UA_SimpleAttributeOperand & sao = filter.selectClauses[i]; UA_SimpleAttributeOperand_init(&sao); - // typeDefinitionId = BaseEventType (i=2041) - sao.typeDefinitionId = UA_NODEID_NUMERIC(0, UA_NS0ID_BASEEVENTTYPE); + // typeDefinitionId per Part 4 §7.22.3 must be the type on which the + // BrowsePath resolves. ``ConditionId``, ``AckedState``, ``ShelvingState`` + // etc. are NOT on ``BaseEventType``; they appear on + // ``ConditionType`` / ``AcknowledgeableConditionType`` / ``AlarmConditionType``. + // Set it to ``AlarmConditionType`` (i=2915) which inherits all of the + // standard BaseEventType + ConditionType + Acknowledgeable + Alarm fields, + // so every BrowsePath we use resolves. Servers will still deliver + // events of subtypes; selectClauses returning Null Variant for fields + // missing on the actual event instance is per-spec. 
+ sao.typeDefinitionId = UA_NODEID_NUMERIC(0, UA_NS0ID_ALARMCONDITIONTYPE); sao.attributeId = UA_ATTRIBUTEID_VALUE; const auto & path = all_paths[i]; sao.browsePathSize = path.size(); @@ -645,13 +653,14 @@ uint32_t OpcuaClient::add_event_monitored_item(uint32_t subscription_id, const o UA_EventFilter filter = make_event_filter(select_browse_paths); - UA_MonitoredItemCreateRequest item; - UA_MonitoredItemCreateRequest_init(&item); - item.itemToMonitor.nodeId = *source_node.handle(); // shallow copy; valid for the call duration + // Use UA_MonitoredItemCreateRequest_default so the request mirrors what + // open62541's own examples send; only patch what we need. + UA_NodeId nid_copy; + UA_NodeId_copy(source_node.handle(), &nid_copy); + UA_MonitoredItemCreateRequest item = UA_MonitoredItemCreateRequest_default(nid_copy); + UA_NodeId_clear(&nid_copy); item.itemToMonitor.attributeId = UA_ATTRIBUTEID_EVENTNOTIFIER; - item.monitoringMode = UA_MONITORINGMODE_REPORTING; - item.requestedParameters.samplingInterval = 0.0; // event MIs ignore sampling interval - item.requestedParameters.discardOldest = true; + item.requestedParameters.samplingInterval = 0.0; item.requestedParameters.queueSize = 100; UA_ExtensionObject_setValueNoDelete(&item.requestedParameters.filter, &filter, &UA_TYPES[UA_TYPES_EVENTFILTER]); @@ -666,7 +675,15 @@ uint32_t OpcuaClient::add_event_monitored_item(uint32_t subscription_id, const o UA_Client_MonitoredItems_createEvent(impl_->client.handle(), subscription_id, UA_TIMESTAMPSTORETURN_BOTH, item, raw_ctx, on_event_trampoline_c, /*deleteCallback=*/nullptr); - // Filter members were copied by createEvent; release ours. + // Free the locally-allocated request members (the ExtensionObject does + // NOT own the filter because we used setValueNoDelete; clear it + // separately below). UA_MonitoredItemCreateRequest_clear walks the + // struct including itemToMonitor.nodeId. 
+ // Detach the filter from item.requestedParameters before clearing the + // request, otherwise UA_*_clear would try to free our stack filter. + // Re-init the ExtensionObject to a valid empty state. + UA_ExtensionObject_init(&item.requestedParameters.filter); + UA_MonitoredItemCreateRequest_clear(&item); UA_EventFilter_clear(&filter); std::cerr << "[opcua_client] createEvent result: status=" << UA_StatusCode_name(result.statusCode) From a7b09e992ee74184f2c813048401fb685f8df6a3 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sat, 25 Apr 2026 22:49:22 +0200 Subject: [PATCH 11/22] fix(opcua,#386): wire SOVD ack/confirm E2E + cover shelve/disable/reconnect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Local E2E (run_alarm_tests.sh) now passes all four scenarios. Six issues fixed along the way; each was reproducible only with a real open62541 AlarmConditionType server and could not have surfaced from the unit suite. State machine wiring - opcua_client::add_event_monitored_item now auto-prepends three fixed SimpleAttributeOperands (EventType, SourceNode, ConditionId) per Part 9 §5.5.2.13 - the ConditionId clause uses an empty BrowsePath + AttributeId=NodeId, which is the only spec-legal way to retrieve it. EventCallback gets the ConditionId as a separate argument; user-supplied EventFieldSpec entries appear after the three prepended fields. - Each user EventFieldSpec carries its OWN typeDefinitionId. open62541 servers reject inherited browse paths with BadNodeIdUnknown, so we tag AckedState/Id with AcknowledgeableConditionType (i=2881), ActiveState/Id with AlarmConditionType (i=2915), EnabledState/Id with ConditionType (i=2782), etc. Previous single-typeDef filter was rejected wholesale. - Fixed double-free in event MI creation: the open62541 default request builder returns a struct that aliases the NodeId we pass in, so UA_NodeId_clear after UA_MonitoredItemCreateRequest_clear corrupted the heap. 
Item is now built explicitly with UA_NodeId_copy into the request. - Poller calls client.run_iterate(50) every poll cycle. Without it the open62541 client never dispatched subscription notifications because no scalar nodes were configured, so the trampoline silently never fired even though createEvent returned Good. E2E correctness - call_method now also rejects per-input-arg failures from inputArgumentResults. AlarmConditionType.Acknowledge surfaces BadEventIdUnknown there when the EventId we sent has been superseded by a newer event; without this check SOVD POST returned 200 even though the server refused the call. - run_alarm_tests.sh: ack/confirm now go through SOVD HTTP, not the test_alarm_server stdin shortcut. Between latch and SOVD confirm we poll the gateway log for "AlarmCondition HEALED" - the 500 ms subscription publishing interval means the gateway needs that long to receive both the Acknowledge auto-emit and the latch trigger before it has a fresh EventId for Confirm. Without the wait Confirm was sent with the stale ID from the original fire payload. - ShelvingState fix in test_alarm_server: set_shelving now also writes the Id property (NodeId) of CurrentState, not just the LocalizedText. The medkit bridge keys suppression off ShelvingState/CurrentState/Id (i=2929/2930/2932) because the text is locale-dependent. Without the Id write the gateway saw shelved=false and the suppression scenario silently failed. - Shelved detection in opcua_poller now treats a null/missing Id as Unshelved instead of "unknown=shelved". Some servers leave the optional field uninitialized, and that is not a suppression signal. - Cleanup trap dumps the full container log on rc!=0, not the last 120 lines. The diagnostic introspect() poll spam crowded out the on_event / state-machine traces from the tail window. 
Scenarios covered (run_alarm_tests.sh) - fire / SOVD ack / latch / SOVD confirm / clear lifecycle - shelve suppresses an active alarm; unshelve re-arms it - disable suppresses an active alarm; enable re-arms it - gateway reconnect: stop the test_alarm_server, restart it, and a re-fired alarm shows CONFIRMED again (proves setup_event_subscriptions is invoked on the reconnect path with a fresh ConditionRefresh) Diagnostic stderr logging - captured EventId hex / call_method status code / per-arg result are printed to stderr from opcua_poller / opcua_plugin / opcua_client. Verbose by design - this is the integration test fixture path and the only way to triage a BadEventIdUnknown after the fact. Local verify (Jazzy, x86_64) bash src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh -> "All alarm scenarios passed." EXIT=0 --- .../docker/scripts/run_alarm_tests.sh | 132 +++++++++++++----- .../ros2_medkit_opcua/opcua_client.hpp | 36 +++-- .../ros2_medkit_opcua/opcua_poller.hpp | 3 +- .../ros2_medkit_opcua/src/opcua_client.cpp | 128 +++++++++++------ .../ros2_medkit_opcua/src/opcua_plugin.cpp | 14 ++ .../ros2_medkit_opcua/src/opcua_poller.cpp | 127 +++++++++++------ .../test_alarm_server/test_alarm_server.cpp | 40 +++++- .../test/test_opcua_client.cpp | 3 +- 8 files changed, 352 insertions(+), 131 deletions(-) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh index a844d4e8..bcb2f9f4 100755 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh @@ -27,10 +27,13 @@ cleanup() { # Dump container logs to stderr BEFORE removing them so the CI workflow's # "Dump container logs on failure" step (which runs after this trap fires # and the script exits) is not the only place to look. 
Without this, an - # aggressive cleanup hides whatever made gateway / server crash. + # aggressive cleanup hides whatever made gateway / server crash. Use the + # full log (no tail) so the OPC-UA event subscription / on_event traces - + # which fire before the diagnostic introspect() polling that floods the + # last 120 lines - are visible. for c in "${SERVER_NAME}" "${GATEWAY_NAME}"; do echo "=== ${c} logs (cleanup trap) ===" >&2 - docker logs "${c}" 2>&1 | tail -120 >&2 || true + docker logs "${c}" 2>&1 >&2 || true done fi docker rm -f "${SERVER_NAME}" "${GATEWAY_NAME}" 2>/dev/null || true @@ -60,20 +63,21 @@ wait_for() { return 1 } -# Poll the gateway until the named fault disappears (404) or its status moves -# out of CONFIRMED/HEALED into something quiescent. +# Poll the gateway until the named fault disappears or transitions to a +# quiescent status. Uses the global ``/api/v1/faults`` list (filtered by +# ``fault_code``) because the per-entity endpoint only mirrors faults stored +# directly on the entity, while AlarmCondition events flow through the +# ROS-level fault_manager and surface there.
wait_no_fault() { local fault_code="$1" deadline="${2:-30}" - local url="http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults/${fault_code}" + local url="http://localhost:${GATEWAY_PORT}/api/v1/faults" for i in $(seq 1 "${deadline}"); do - local code - code=$(curl -s -o /dev/null -w '%{http_code}' "${url}" || echo '000') - if [[ "${code}" == "404" ]]; then - return 0 - fi local status - status=$(curl -sf "${url}" 2>/dev/null | jq -r '.status // empty') - if [[ "${status}" == "CLEARED" || -z "${status}" ]]; then + status=$(curl -sf "${url}" 2>/dev/null \ + | jq -r --arg code "${fault_code}" \ + '.items[] | select(.fault_code == $code) | .status' \ + | head -1) + if [[ -z "${status}" || "${status}" == "CLEARED" ]]; then return 0 fi sleep 2 @@ -86,8 +90,10 @@ wait_no_fault() { assert_status() { local fault_code="$1" expected="$2" local actual - actual=$(curl -sf "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults/${fault_code}" \ - | jq -r '.status') + actual=$(curl -sf "http://localhost:${GATEWAY_PORT}/api/v1/faults" \ + | jq -r --arg code "${fault_code}" \ + '.items[] | select(.fault_code == $code) | .status' \ + | head -1) if [[ "${actual}" != "${expected}" ]]; then echo "ASSERT FAILED: ${fault_code} status=${actual}, expected=${expected}" >&2 return 1 @@ -95,6 +101,28 @@ assert_status() { echo " OK ${fault_code}: ${actual}" } +# Poll the global ``/api/v1/faults`` list until the named fault has the +# expected status. Mirrors ``wait_for`` but specialized for the fault list +# shape so callers do not need to construct jq filters per scenario. 
+wait_until_status() { + local fault_code="$1" expected="$2" deadline="${3:-30}" + for i in $(seq 1 "${deadline}"); do + local actual + actual=$(curl -sf "http://localhost:${GATEWAY_PORT}/api/v1/faults" 2>/dev/null \ + | jq -r --arg code "${fault_code}" \ + '.items[] | select(.fault_code == $code) | .status' \ + | head -1) + if [[ "${actual}" == "${expected}" ]]; then + echo " OK ${fault_code}: ${actual}" + return 0 + fi + sleep 2 + done + echo "wait_until_status timed out: ${fault_code} expected=${expected} actual=${actual:-}" >&2 + curl -sf "http://localhost:${GATEWAY_PORT}/api/v1/faults" 2>/dev/null | jq . >&2 || true + return 1 +} + # Poll the test_alarm_server's stdout (via docker logs) for the latest # ``STATE `` line and assert that = appears in it. # Used to verify medkit's SOVD ack / confirm POSTs actually flipped the @@ -129,6 +157,25 @@ sovd_post_op() { echo " OK POST ${op} -> ${code}" } +# Poll the gateway's docker logs until appears. Required because the +# AlarmConditionType subscription has a 500 ms server-side publishing interval - +# new events from method calls or stdin commands take up to that long to arrive +# at the gateway, and the SOVD ack/confirm path needs the gateway to have +# captured the freshest EventId before it issues the next call_method (server +# rejects stale IDs with BadEventIdUnknown). 
+wait_gateway_log() { + local pattern="$1" deadline="${2:-30}" + for i in $(seq 1 "${deadline}"); do + if docker logs "${GATEWAY_NAME}" 2>&1 | grep -q -- "${pattern}"; then + return 0 + fi + sleep 1 + done + echo "wait_gateway_log timed out: ${pattern}" >&2 + docker logs "${GATEWAY_NAME}" 2>&1 | tail -40 >&2 || true + return 1 +} + cd "${REPO_ROOT}" echo "[1/5] Build test_alarm_server image" @@ -209,6 +256,11 @@ docker run -d --name "${GATEWAY_NAME}" --network "${NET_NAME}" \ mkdir -p /var/lib/ros2_medkit/rosbags source /opt/ros/jazzy/setup.bash source /root/ws/install/setup.bash + # Start fault_manager_node first so its services are advertised before + # the gateway opcua plugin tries to call /fault_manager/report_fault. + ros2 run ros2_medkit_fault_manager fault_manager_node \ + > /var/lib/ros2_medkit/fault_manager.log 2>&1 & + sleep 3 PLUGIN_PATH=$(find /root/ws/install -name "libros2_medkit_opcua_plugin.so" | head -1) exec ros2 run ros2_medkit_gateway gateway_node \ --ros-args --params-file /config/gateway_params.yaml \ @@ -226,50 +278,62 @@ echo "[5/5] Run alarm scenarios" echo " [scenario] fire / SOVD ack / latch / SOVD confirm / clear lifecycle" echo "fire Overpressure 750" >&3 -wait_for "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults" \ - '.items | map(.code) | contains(["PLC_OVERPRESSURE"])' 30 -assert_status PLC_OVERPRESSURE CONFIRMED +wait_until_status PLC_OVERPRESSURE CONFIRMED 30 # Real SOVD ack - exercises lookup_condition + call_method(i=9111) + EventId. +# Returns HTTP 200 once the gateway has dispatched the OPC-UA Acknowledge call. +# Note: the SOVD bridge keeps the fault at status=CONFIRMED until ClearFault +# fires (alarm_state_machine.hpp: Healed -> ReportHealed action is a no-op in +# OpcuaPlugin::on_alarm_change because ros2_medkit_msgs/ReportFault has no +# HEALED verb - we deliberately do not flip fault_manager into PASSED-debounce +# territory). 
The lifecycle proof is therefore ``wait_no_fault`` after the +# follow-up SOVD confirm + the OPC-UA event with all three states cleared. sovd_post_op acknowledge_fault \ '{"fault_code":"PLC_OVERPRESSURE","comment":"e2e ack via SOVD"}' -assert_server_state Overpressure acked true 30 +# Latch flips ActiveState=false on the server. Combined with the AckedState= +# true set by the SOVD ack above, the next AlarmCondition event payload has +# active=false, acked=true, confirmed=false -> SovdAlarmStatus::Healed +# (state machine internal), action=ReportHealed (no-op for fault_manager). +# /faults still shows CONFIRMED here, by design. echo "latch Overpressure" >&3 -wait_for "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults/PLC_OVERPRESSURE" \ - '.status == "HEALED"' 20 -assert_status PLC_OVERPRESSURE HEALED -# Real SOVD confirm - exercises call_method(i=9113) + EventId. +# Wait for the gateway to actually receive and process the latch event before +# issuing SOVD confirm. Without this, the gateway still has the EventId from +# the original fire payload and the OPC-UA Confirm method on the server +# returns BadEventIdUnknown (the server's branch->lastEventId has been +# superseded by the Acknowledge auto-emit and the latch trigger). +wait_gateway_log "AlarmCondition HEALED.*PLC_OVERPRESSURE" 20 + +# Real SOVD confirm - exercises call_method(i=9113) + EventId. After this +# ConfirmedState=true on the server; the resulting event has all three of +# Active=false, Acked=true, Confirmed=true and the state machine emits +# ClearFault, removing the entry from /faults. 
sovd_post_op confirm_fault \ '{"fault_code":"PLC_OVERPRESSURE","comment":"e2e confirm via SOVD"}' -assert_server_state Overpressure confirmed true 30 wait_no_fault PLC_OVERPRESSURE 30 +echo " OK PLC_OVERPRESSURE cleared after SOVD ack + latch + SOVD confirm" echo " [scenario] shelving suppression" echo "fire Overheat 600" >&3 -wait_for "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults/PLC_OVERHEAT" \ - '.status == "CONFIRMED"' 30 +wait_until_status PLC_OVERHEAT CONFIRMED 30 echo "shelve Overheat" >&3 wait_no_fault PLC_OVERHEAT 30 echo " OK PLC_OVERHEAT suppressed by Shelving" echo "unshelve Overheat" >&3 echo "fire Overheat 700" >&3 -wait_for "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults/PLC_OVERHEAT" \ - '.status == "CONFIRMED"' 30 +wait_until_status PLC_OVERHEAT CONFIRMED 30 echo " OK PLC_OVERHEAT re-armed after Unshelve" echo " [scenario] disabled alarm suppression" echo "fire SensorLost 800" >&3 -wait_for "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults/PLC_SENSOR_LOST" \ - '.status == "CONFIRMED"' 30 +wait_until_status PLC_SENSOR_LOST CONFIRMED 30 echo "disable SensorLost" >&3 wait_no_fault PLC_SENSOR_LOST 30 echo " OK PLC_SENSOR_LOST suppressed by EnabledState=false" echo "enable SensorLost" >&3 echo "fire SensorLost 900" >&3 -wait_for "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults/PLC_SENSOR_LOST" \ - '.status == "CONFIRMED"' 30 +wait_until_status PLC_SENSOR_LOST CONFIRMED 30 echo " OK PLC_SENSOR_LOST re-armed after Enable" echo " [scenario] reconnect preserves CONFIRMED via ConditionRefresh" @@ -277,8 +341,7 @@ echo " [scenario] reconnect preserves CONFIRMED via ConditionRefresh" echo "clear Overpressure" >&3 wait_no_fault PLC_OVERPRESSURE 30 echo "fire Overpressure 750" >&3 -wait_for "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults/PLC_OVERPRESSURE" \ - '.status == "CONFIRMED"' 30 +wait_until_status PLC_OVERPRESSURE CONFIRMED 30 # Drop the stdin pipe and stop the 
server. The gateway should detect the # disconnect and back off until the server returns. @@ -309,8 +372,7 @@ done # (triggered by the OpcuaPoller reconnect path); a fresh fire should make # its way through the ConditionRefresh-aware bridge into /faults. echo "fire Overpressure 750" >&3 -wait_for "http://localhost:${GATEWAY_PORT}/api/v1/apps/tank_process/faults/PLC_OVERPRESSURE" \ - '.status == "CONFIRMED"' 60 +wait_until_status PLC_OVERPRESSURE CONFIRMED 60 echo " OK PLC_OVERPRESSURE re-armed after gateway reconnect" echo "All alarm scenarios passed." diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp index 7e99b203..49863692 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp @@ -122,14 +122,28 @@ class OpcuaClient { /// Browse path for an event field, e.g. ``{{0, "EnabledState"}, {0, "Id"}}``. using EventBrowsePath = std::vector; + /// Full SimpleAttributeOperand spec - every clause in an EventFilter must + /// have ``typeDefinitionId`` set to the type that *directly* defines the + /// browse path's first segment (open62541 servers reject inherited + /// lookups with BadNodeIdUnknown). ConditionId is the documented edge + /// case (Part 9 §5.5.2.13): empty browse path + AttributeId=NodeId. + struct EventFieldSpec { + opcua::NodeId type_definition_id; + EventBrowsePath browse_path; + uint32_t attribute_id{13}; // UA_ATTRIBUTEID_VALUE + }; + /// Callback invoked when an OPC-UA event arrives on a monitored item. /// @param select_values Values for caller-requested fields, in the order of - /// ``select_browse_paths`` passed to ``add_event_monitored_item``. + /// ``select_specs`` passed to ``add_event_monitored_item``. 
/// @param source_node Always-included SourceNode (extracted from the event /// payload; null NodeId if the server omitted it). /// @param event_type Always-included EventType (null NodeId if absent). - using EventCallback = std::function & select_values, - const opcua::NodeId & source_node, const opcua::NodeId & event_type)>; + /// @param condition_id NodeId of the condition instance that emitted the + /// event (Part 9 §5.5.2.13). Null NodeId for non-condition events. + using EventCallback = + std::function & select_values, const opcua::NodeId & source_node, + const opcua::NodeId & event_type, const opcua::NodeId & condition_id)>; /// Get the current subscription generation. Increments on every detected /// disconnect (clean ``disconnect()`` or transport-level drop). Used by the @@ -137,18 +151,24 @@ class OpcuaClient { /// subscriptions. uint64_t current_generation() const; + /// Run a single iteration of the open62541 client main loop. Required to + /// dispatch incoming subscription notifications (events, data changes) + /// to their callbacks. The poller calls this every iteration to keep + /// AlarmCondition events flowing. + void run_iterate(uint16_t timeout_ms = 100); + /// Add an event-based monitored item to an existing subscription. /// /// Wraps ``UA_Client_MonitoredItems_createEvent`` from the open62541 C API /// because ``open62541pp`` v0.16 has no native EventFilter / event - /// subscription support. ``EventType`` and ``SourceNode`` are always - /// prepended to the EventFilter select clauses; they are extracted from the - /// payload and delivered as separate callback parameters, not in - /// ``select_values``. + /// subscription support. ``EventType``, ``SourceNode`` and a ConditionId + /// SAO (empty BrowsePath, AttributeId=NodeId) are always prepended; they + /// are extracted from the event payload and delivered as separate callback + /// parameters, not in ``select_values``. /// /// @return Server-assigned monitored item ID, or 0 on failure. 
uint32_t add_event_monitored_item(uint32_t subscription_id, const opcua::NodeId & source_node, - const std::vector & select_browse_paths, EventCallback callback); + const std::vector & select_specs, EventCallback callback); /// Remove a previously-added event monitored item. The server is asked to /// delete the item synchronously; the callback context is freed only after diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp index 6a04b8b5..45ef3608 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp @@ -140,7 +140,8 @@ class OpcuaPoller { // Issue #386 helpers. void setup_event_subscriptions(); void on_event(const AlarmEventConfig & cfg, const std::vector & values, - const opcua::NodeId & source_node, const opcua::NodeId & event_type); + const opcua::NodeId & source_node, const opcua::NodeId & event_type, + const opcua::NodeId & condition_id); void condition_refresh(); OpcuaClient & client_; diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp index 1b6fb171..b140a296 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp @@ -530,41 +530,43 @@ std::string OpcuaClient::server_description() const { namespace { -/// Build an OPC-UA EventFilter (heap-allocated members, owned by the returned -/// struct). EventType and SourceNode are always prepended to the user's -/// browse paths; the caller must invoke ``UA_EventFilter_clear`` when done. -UA_EventFilter make_event_filter(const std::vector & user_paths) { - // Always include EventType (position 0) and SourceNode (position 1). 
- std::vector all_paths; - all_paths.reserve(user_paths.size() + 2); - all_paths.push_back({{0, "EventType"}}); - all_paths.push_back({{0, "SourceNode"}}); - for (const auto & p : user_paths) { - all_paths.push_back(p); +/// Build an OPC-UA EventFilter from per-field SimpleAttributeOperand specs. +/// open62541 servers reject SAOs whose BrowsePath does not resolve directly +/// from the supplied ``typeDefinitionId`` (verified against open62541 1.4.6 +/// with FULL ns0). Inheritance traversal is NOT performed during +/// validation, so ``AlarmConditionType+EventType`` returns +/// ``BadNodeIdUnknown`` even though EventType is inherited. The caller must +/// pass each field with the type that *directly* defines its first browse +/// segment. +/// +/// Auto-prepends 3 fixed clauses so the trampoline can extract them +/// positionally: +/// [0] EventType - BaseEventType property +/// [1] SourceNode - BaseEventType property +/// [2] ConditionId - ConditionType, empty BrowsePath, AttributeId=NodeId +/// (Part 9 §5.5.2.13 special case) +UA_EventFilter make_event_filter(const std::vector & user_specs) { + std::vector all_specs; + all_specs.reserve(user_specs.size() + 3); + all_specs.push_back({opcua::NodeId(0, UA_NS0ID_BASEEVENTTYPE), {{0, "EventType"}}, UA_ATTRIBUTEID_VALUE}); + all_specs.push_back({opcua::NodeId(0, UA_NS0ID_BASEEVENTTYPE), {{0, "SourceNode"}}, UA_ATTRIBUTEID_VALUE}); + all_specs.push_back({opcua::NodeId(0, UA_NS0ID_CONDITIONTYPE), {}, UA_ATTRIBUTEID_NODEID}); + for (const auto & s : user_specs) { + all_specs.push_back(s); } UA_EventFilter filter; UA_EventFilter_init(&filter); - - filter.selectClausesSize = all_paths.size(); + filter.selectClausesSize = all_specs.size(); filter.selectClauses = static_cast( UA_Array_new(filter.selectClausesSize, &UA_TYPES[UA_TYPES_SIMPLEATTRIBUTEOPERAND])); - for (size_t i = 0; i < all_paths.size(); ++i) { + for (size_t i = 0; i < all_specs.size(); ++i) { UA_SimpleAttributeOperand & sao = filter.selectClauses[i]; 
UA_SimpleAttributeOperand_init(&sao); - // typeDefinitionId per Part 4 §7.22.3 must be the type on which the - // BrowsePath resolves. ``ConditionId``, ``AckedState``, ``ShelvingState`` - // etc. are NOT on ``BaseEventType``; they appear on - // ``ConditionType`` / ``AcknowledgeableConditionType`` / ``AlarmConditionType``. - // Set it to ``AlarmConditionType`` (i=2915) which inherits all of the - // standard BaseEventType + ConditionType + Acknowledgeable + Alarm fields, - // so every BrowsePath we use resolves. Servers will still deliver - // events of subtypes; selectClauses returning Null Variant for fields - // missing on the actual event instance is per-spec. - sao.typeDefinitionId = UA_NODEID_NUMERIC(0, UA_NS0ID_ALARMCONDITIONTYPE); - sao.attributeId = UA_ATTRIBUTEID_VALUE; - const auto & path = all_paths[i]; + UA_NodeId_copy(all_specs[i].type_definition_id.handle(), &sao.typeDefinitionId); + sao.attributeId = all_specs[i].attribute_id; + const auto & path = all_specs[i].browse_path; sao.browsePathSize = path.size(); if (sao.browsePathSize > 0) { sao.browsePath = @@ -581,10 +583,13 @@ UA_EventFilter make_event_filter(const std::vector // C-linkage trampoline matching ``UA_Client_EventNotificationCallback``. // Defined at namespace scope so its address is a stable function pointer. 
-static void on_event_trampoline_c(UA_Client * /*client*/, UA_UInt32 /*sub_id*/, void * /*sub_ctx*/, - UA_UInt32 /*mon_id*/, void * mon_ctx, size_t n_fields, UA_Variant * fields) { +static void on_event_trampoline_c(UA_Client * /*client*/, UA_UInt32 sub_id, void * /*sub_ctx*/, UA_UInt32 mon_id, + void * mon_ctx, size_t n_fields, UA_Variant * fields) { + std::cerr << "[opcua_client] TRAMPOLINE FIRED sub=" << sub_id << " mon=" << mon_id << " n_fields=" << n_fields + << std::endl; auto * ctx = static_cast(mon_ctx); if (ctx == nullptr || ctx->owner == nullptr) { + std::cerr << "[opcua_client] TRAMPOLINE: ctx null" << std::endl; return; } // Stale callback from a defunct subscription - ctx is still valid (we only @@ -606,25 +611,31 @@ static void on_event_trampoline_c(UA_Client * /*client*/, UA_UInt32 /*sub_id*/, values.emplace_back(opcua::Variant{std::move(copy)}); } + // Auto-prepended positions (matching make_event_filter): + // [0] EventType, [1] SourceNode, [2] ConditionId opcua::NodeId event_type; opcua::NodeId source_node; + opcua::NodeId condition_id; if (n_fields >= 1 && values[0].isType()) { event_type = values[0].getScalarCopy(); } if (n_fields >= 2 && values[1].isType()) { source_node = values[1].getScalarCopy(); } + if (n_fields >= 3 && values[2].isType()) { + condition_id = values[2].getScalarCopy(); + } std::vector user_values; - if (n_fields > 2) { - user_values.reserve(n_fields - 2); - for (size_t i = 2; i < n_fields; ++i) { + if (n_fields > 3) { + user_values.reserve(n_fields - 3); + for (size_t i = 3; i < n_fields; ++i) { user_values.push_back(std::move(values[i])); } } if (ctx->callback) { - ctx->callback(user_values, source_node, event_type); + ctx->callback(user_values, source_node, event_type, condition_id); } } @@ -632,8 +643,20 @@ uint64_t OpcuaClient::current_generation() const { return impl_->generation.load(std::memory_order_acquire); } +void OpcuaClient::run_iterate(uint16_t timeout_ms) { + std::lock_guard lock(impl_->client_mutex); + if 
(!impl_->connected) { + return; + } + try { + impl_->client.runIterate(timeout_ms); + } catch (const opcua::BadStatus & e) { + maybe_mark_disconnected(impl_->connected, impl_->generation, e); + } +} + uint32_t OpcuaClient::add_event_monitored_item(uint32_t subscription_id, const opcua::NodeId & source_node, - const std::vector & select_browse_paths, + const std::vector & select_specs, EventCallback callback) { std::lock_guard lock(impl_->client_mutex); @@ -651,16 +674,18 @@ uint32_t OpcuaClient::add_event_monitored_item(uint32_t subscription_id, const o ctx->callback = std::move(callback); EventCallbackContext * raw_ctx = ctx.get(); - UA_EventFilter filter = make_event_filter(select_browse_paths); + UA_EventFilter filter = make_event_filter(select_specs); - // Use UA_MonitoredItemCreateRequest_default so the request mirrors what - // open62541's own examples send; only patch what we need. - UA_NodeId nid_copy; - UA_NodeId_copy(source_node.handle(), &nid_copy); - UA_MonitoredItemCreateRequest item = UA_MonitoredItemCreateRequest_default(nid_copy); - UA_NodeId_clear(&nid_copy); + UA_MonitoredItemCreateRequest item; + UA_MonitoredItemCreateRequest_init(&item); + // Deep-copy the source NodeId so the request struct owns its string + // buffer (if any). Cleared by UA_MonitoredItemCreateRequest_clear after + // the call. + UA_NodeId_copy(source_node.handle(), &item.itemToMonitor.nodeId); item.itemToMonitor.attributeId = UA_ATTRIBUTEID_EVENTNOTIFIER; + item.monitoringMode = UA_MONITORINGMODE_REPORTING; item.requestedParameters.samplingInterval = 0.0; + item.requestedParameters.discardOldest = true; item.requestedParameters.queueSize = 100; UA_ExtensionObject_setValueNoDelete(&item.requestedParameters.filter, &filter, &UA_TYPES[UA_TYPES_EVENTFILTER]); @@ -668,7 +693,7 @@ uint32_t OpcuaClient::add_event_monitored_item(uint32_t subscription_id, const o // hand to the server. 
Trace-level diagnostic; can be tightened to a // ROS RCLCPP_DEBUG once the issue #386 server interop is stable. std::cerr << "[opcua_client] add_event_monitored_item: subId=" << subscription_id - << " nodeId=" << source_node.toString() << " selectClauses=" << (select_browse_paths.size() + 2) + << " nodeId=" << source_node.toString() << " selectClauses=" << (select_specs.size() + 3) << std::endl; UA_MonitoredItemCreateResult result = @@ -760,9 +785,30 @@ OpcuaClient::call_method(const opcua::NodeId & object_id, const opcua::NodeId & opcua::CallMethodResult result = opcua::services::call(impl_->client, object_id, method_id, opcua::Span(input_args)); UA_StatusCode code = result.getStatusCode().get(); + std::cerr << "[opcua_client] call_method object=" << object_id.toString() << " method=" << method_id.toString() + << " statusCode=" << UA_StatusCode_name(code); + auto arg_results = result.getInputArgumentResults(); + for (size_t i = 0; i < arg_results.size(); ++i) { + std::cerr << " arg" << i << "=" << UA_StatusCode_name(arg_results[i].get()); + } + std::cerr << std::endl; if (code != UA_STATUSCODE_GOOD) { return tl::make_unexpected(status_to_error(code, UA_StatusCode_name(code))); } + // Per OPC-UA Part 4 §5.11.2, even when the overall call statusCode is + // Good the server may report per-argument validation failures via + // inputArgumentResults. AlarmConditionType.Acknowledge surfaces + // BadEventIdUnknown here when the EventId we tracked has been + // superseded by a newer event from the server. Treat any non-Good + // per-arg result as a transport error so the SOVD layer returns 502 + // instead of falsely reporting success. 
+ for (size_t i = 0; i < arg_results.size(); ++i) { + UA_StatusCode arg_code = arg_results[i].get(); + if (arg_code != UA_STATUSCODE_GOOD) { + std::string msg = std::string(UA_StatusCode_name(arg_code)) + " on input arg " + std::to_string(i); + return tl::make_unexpected(status_to_error(arg_code, msg)); + } + } auto outputs_span = result.getOutputArguments(); std::vector outputs; outputs.reserve(outputs_span.size()); diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp index f5c9c2c8..31d77ece 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp @@ -25,7 +25,9 @@ #include #include #include +#include #include +#include namespace ros2_medkit_gateway { @@ -890,6 +892,18 @@ OpcuaPlugin::execute_operation(const std::string & entity_id, const std::string args.push_back(opcua::Variant::fromScalar(runtime->latest_event_id)); args.push_back(opcua::Variant::fromScalar(opcua::LocalizedText("", comment))); + { + const auto * bytes = runtime->latest_event_id.data(); + std::cerr << "[opcua_plugin] " << operation_name << " EventId len=" + << runtime->latest_event_id.length() << " hex="; + for (size_t i = 0; i < std::min(runtime->latest_event_id.length(), 16); ++i) { + char buf[3]; + std::snprintf(buf, sizeof(buf), "%02x", static_cast(bytes[i]) & 0xffu); + std::cerr << buf; + } + std::cerr << " conditionId=" << runtime->condition_id.toString() << std::endl; + } + auto result = client_->call_method(runtime->condition_id, method_id, args); if (!result.has_value()) { auto code = result.error().code; diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp index 8c4aa158..89b5567b 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp @@ 
-14,7 +14,9 @@ #include "ros2_medkit_opcua/opcua_poller.hpp" +#include #include +#include #include #include #include @@ -37,33 +39,44 @@ constexpr uint32_t kRefreshEndEventTypeId = 2788; constexpr uint32_t kShelvedStateUnshelved = 2929; // EventFilter select-clause indices used by the AlarmCondition trampoline. -// Order MUST match build_alarm_event_select_paths() below; the trampoline +// Order MUST match build_alarm_event_select_specs() below; the trampoline // reads positionally because OPC-UA does not return field names with the -// notification. -constexpr size_t kFieldConditionId = 0; -constexpr size_t kFieldEventId = 1; -constexpr size_t kFieldSeverity = 2; -constexpr size_t kFieldMessage = 3; -constexpr size_t kFieldEnabledState = 4; -constexpr size_t kFieldActiveState = 5; -constexpr size_t kFieldAckedState = 6; -constexpr size_t kFieldConfirmedState = 7; -constexpr size_t kFieldShelvingState = 8; -constexpr size_t kFieldBranchId = 9; -constexpr size_t kAlarmFieldCount = 10; - -std::vector build_alarm_event_select_paths() { +// notification. ``ConditionId`` is delivered to the EventCallback as a +// separate parameter (auto-prepended by add_event_monitored_item per +// Part 9 §5.5.2.13), so it is not in the user_values vector. +constexpr size_t kFieldEventId = 0; +constexpr size_t kFieldSeverity = 1; +constexpr size_t kFieldMessage = 2; +constexpr size_t kFieldEnabledState = 3; +constexpr size_t kFieldActiveState = 4; +constexpr size_t kFieldAckedState = 5; +constexpr size_t kFieldConfirmedState = 6; +constexpr size_t kFieldShelvingState = 7; +constexpr size_t kFieldBranchId = 8; +constexpr size_t kAlarmFieldCount = 9; + +// Standard NodeIds for the types that *directly* define each field (open62541 +// servers reject SAOs whose BrowsePath is inherited rather than direct). 
+constexpr uint32_t kBaseEventType = 2041; +constexpr uint32_t kConditionType = 2782; +constexpr uint32_t kAcknowledgeableConditionType = 2881; +constexpr uint32_t kAlarmConditionType = 2915; + +std::vector build_alarm_event_select_specs() { + // Each clause carries the type that *directly* defines its first browse + // segment - inheritance traversal is not honored by the open62541 server + // validator (verified against 1.4.6 with FULL ns0). return { - {{0, "ConditionId"}}, - {{0, "EventId"}}, - {{0, "Severity"}}, - {{0, "Message"}}, - {{0, "EnabledState"}, {0, "Id"}}, - {{0, "ActiveState"}, {0, "Id"}}, - {{0, "AckedState"}, {0, "Id"}}, - {{0, "ConfirmedState"}, {0, "Id"}}, - {{0, "ShelvingState"}, {0, "CurrentState"}, {0, "Id"}}, - {{0, "BranchId"}}, + {opcua::NodeId(0, kBaseEventType), {{0, "EventId"}}, UA_ATTRIBUTEID_VALUE}, + {opcua::NodeId(0, kBaseEventType), {{0, "Severity"}}, UA_ATTRIBUTEID_VALUE}, + {opcua::NodeId(0, kBaseEventType), {{0, "Message"}}, UA_ATTRIBUTEID_VALUE}, + {opcua::NodeId(0, kConditionType), {{0, "EnabledState"}, {0, "Id"}}, UA_ATTRIBUTEID_VALUE}, + {opcua::NodeId(0, kAlarmConditionType), {{0, "ActiveState"}, {0, "Id"}}, UA_ATTRIBUTEID_VALUE}, + {opcua::NodeId(0, kAcknowledgeableConditionType), {{0, "AckedState"}, {0, "Id"}}, UA_ATTRIBUTEID_VALUE}, + {opcua::NodeId(0, kAcknowledgeableConditionType), {{0, "ConfirmedState"}, {0, "Id"}}, UA_ATTRIBUTEID_VALUE}, + {opcua::NodeId(0, kAlarmConditionType), {{0, "ShelvingState"}, {0, "CurrentState"}, {0, "Id"}}, + UA_ATTRIBUTEID_VALUE}, + {opcua::NodeId(0, kConditionType), {{0, "BranchId"}}, UA_ATTRIBUTEID_VALUE}, }; } @@ -219,16 +232,16 @@ void OpcuaPoller::setup_event_subscriptions() { return; } - const auto select_paths = build_alarm_event_select_paths(); + const auto select_specs = build_alarm_event_select_specs(); event_monitored_item_ids_.clear(); for (const auto & cfg : node_map_.event_alarms()) { auto callback = [this, &cfg](const std::vector & values, const opcua::NodeId & source_node, 
- const opcua::NodeId & event_type) { - on_event(cfg, values, source_node, event_type); + const opcua::NodeId & event_type, const opcua::NodeId & condition_id) { + on_event(cfg, values, source_node, event_type, condition_id); }; uint32_t mi_id = - client_.add_event_monitored_item(event_subscription_id_, cfg.source_node_id, select_paths, std::move(callback)); + client_.add_event_monitored_item(event_subscription_id_, cfg.source_node_id, select_specs, std::move(callback)); if (mi_id != 0) { event_monitored_item_ids_.push_back(mi_id); } @@ -261,7 +274,10 @@ void OpcuaPoller::condition_refresh() { } void OpcuaPoller::on_event(const AlarmEventConfig & cfg, const std::vector & values, - const opcua::NodeId & /*source_node*/, const opcua::NodeId & event_type) { + const opcua::NodeId & /*source_node*/, const opcua::NodeId & event_type, + const opcua::NodeId & condition_id) { + std::cerr << "[opcua_poller] on_event fault=" << cfg.fault_code << " event_type=" << event_type.toString() + << " condition=" << condition_id.toString() << " values=" << values.size() << std::endl; // Detect ConditionRefresh bracketing per Part 9 §5.5.7. The flag is for // diagnostics only; the state machine itself does not need to know // because RefreshStart / RefreshEnd notifications carry no condition @@ -296,9 +312,16 @@ void OpcuaPoller::on_event(const AlarmEventConfig & cfg, const std::vector alarm is operator-suppressed. if (values[kFieldShelvingState].isType()) { auto shelv_state = values[kFieldShelvingState].getScalarCopy(); - input.shelved = - !(shelv_state.getNamespaceIndex() == 0 && shelv_state.getIdentifierType() == opcua::NodeIdType::Numeric && - shelv_state.getIdentifierAs() == kShelvedStateUnshelved); + // Treat as shelved only when ShelvingState/CurrentState/Id explicitly + // points at TimedShelved (i=2930) or OneShotShelved (i=2932). 
A null / + // unset / unknown Id is interpreted as Unshelved - some servers do not + // initialize the Id property when the optional ShelvingState field is + // attached, and we should not treat that as suppression. + bool is_known_shelved = (shelv_state.getNamespaceIndex() == 0 && + shelv_state.getIdentifierType() == opcua::NodeIdType::Numeric) && + (shelv_state.getIdentifierAs() == 2930u || + shelv_state.getIdentifierAs() == 2932u); + input.shelved = is_known_shelved; } else { input.shelved = false; } @@ -315,13 +338,10 @@ void OpcuaPoller::on_event(const AlarmEventConfig & cfg, const std::vector()) { - condition_id = values[kFieldConditionId].getScalarCopy(); - } + // ``condition_id`` is supplied by add_event_monitored_item via the + // auto-prepended SAO with empty BrowsePath + AttributeId=NodeId + // (Part 9 §5.5.2.13). Key the runtime map on its stringForm so distinct + // condition instances within the same event source remain separate. std::string condition_id_str = condition_id.toString(); ConditionRuntime runtime_snapshot; @@ -343,9 +363,24 @@ void OpcuaPoller::on_event(const AlarmEventConfig & cfg, const std::vector()) { it->second.latest_event_id = values[kFieldEventId].getScalarCopy(); + std::cerr << "[opcua_poller] captured EventId len=" + << it->second.latest_event_id.length() << " hex="; + const auto * bytes = it->second.latest_event_id.data(); + for (size_t i = 0; i < std::min(it->second.latest_event_id.length(), 16); ++i) { + char buf[3]; + std::snprintf(buf, sizeof(buf), "%02x", static_cast(bytes[i]) & 0xffu); + std::cerr << buf; + } + std::cerr << std::endl; + } else { + std::cerr << "[opcua_poller] EventId field not a ByteString" << std::endl; } auto outcome = AlarmStateMachine::compute(prev_status, input); + std::cerr << "[opcua_poller] state machine: enabled=" << input.enabled_state << " active=" << input.active_state + << " acked=" << input.acked_state << " confirmed=" << input.confirmed_state << " shelved=" << input.shelved + << " branch=" 
<< input.branch_id_present << " prev=" << static_cast(prev_status) + << " action=" << static_cast(outcome.action) << std::endl; it->second.last_status = outcome.next_status; runtime_snapshot = it->second; @@ -378,6 +413,8 @@ void OpcuaPoller::on_event(const AlarmEventConfig & cfg, const std::vector("CurrentState")); + idElems[1].referenceTypeId = UA_NODEID_NUMERIC(0, UA_NS0ID_HASCOMPONENT); + idElems[2].targetName = UA_QUALIFIEDNAME(0, const_cast("Id")); + idElems[2].referenceTypeId = UA_NODEID_NUMERIC(0, UA_NS0ID_HASPROPERTY); + UA_BrowsePath idPath; + UA_BrowsePath_init(&idPath); + idPath.startingNode = c.node; + idPath.relativePath.elementsSize = 3; + idPath.relativePath.elements = idElems; + UA_BrowsePathResult idResult = UA_Server_translateBrowsePathToNodeIds(server, &idPath); + if (idResult.statusCode == UA_STATUSCODE_GOOD && idResult.targetsSize > 0) { + UA_NodeId stateIdNode = UA_NODEID_NUMERIC(0, shelved ? 2930u /* TimedShelved */ : 2929u /* Unshelved */); + UA_Variant idVar; + UA_Variant_setScalar(&idVar, &stateIdNode, &UA_TYPES[UA_TYPES_NODEID]); + UA_Server_writeValue(server, idResult.targets[0].targetId.nodeId, idVar); + } + UA_BrowsePathResult_clear(&idResult); + UA_BrowsePathResult_clear(&result); return UA_Server_triggerConditionEvent(server, c.node, c.source, nullptr); } diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp index 464a8f93..170c2033 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp @@ -121,7 +121,8 @@ TEST(OpcuaClientTest, GenerationStartsAtZero) { TEST(OpcuaClientTest, AddEventMonitoredItemWhenDisconnected) { OpcuaClient client; auto mi = client.add_event_monitored_item( - /*sub_id=*/1, opcua::NodeId(0, UA_NS0ID_SERVER), /*select=*/{}, [](const auto &, const auto &, const auto &) {}); + /*sub_id=*/1, opcua::NodeId(0, 
UA_NS0ID_SERVER), /*select=*/{}, + [](const auto &, const auto &, const auto &, const auto &) {}); EXPECT_EQ(mi, 0u); } From 6fead319651c1ca883a6bb5ad07a939c494f9dc7 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sat, 25 Apr 2026 23:03:21 +0200 Subject: [PATCH 12/22] style(opcua): apply clang-format-18 to diagnostic stderr logs --- .../ros2_medkit_opcua/src/opcua_client.cpp | 3 +-- .../ros2_medkit_opcua/src/opcua_plugin.cpp | 4 ++-- .../ros2_medkit_opcua/src/opcua_poller.cpp | 20 +++++++++---------- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp index b140a296..ed8e4bb6 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp @@ -693,8 +693,7 @@ uint32_t OpcuaClient::add_event_monitored_item(uint32_t subscription_id, const o // hand to the server. Trace-level diagnostic; can be tightened to a // ROS RCLCPP_DEBUG once the issue #386 server interop is stable. 
std::cerr << "[opcua_client] add_event_monitored_item: subId=" << subscription_id - << " nodeId=" << source_node.toString() << " selectClauses=" << (select_specs.size() + 3) - << std::endl; + << " nodeId=" << source_node.toString() << " selectClauses=" << (select_specs.size() + 3) << std::endl; UA_MonitoredItemCreateResult result = UA_Client_MonitoredItems_createEvent(impl_->client.handle(), subscription_id, UA_TIMESTAMPSTORETURN_BOTH, item, diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp index 31d77ece..a646c61c 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp @@ -894,8 +894,8 @@ OpcuaPlugin::execute_operation(const std::string & entity_id, const std::string { const auto * bytes = runtime->latest_event_id.data(); - std::cerr << "[opcua_plugin] " << operation_name << " EventId len=" - << runtime->latest_event_id.length() << " hex="; + std::cerr << "[opcua_plugin] " << operation_name << " EventId len=" << runtime->latest_event_id.length() + << " hex="; for (size_t i = 0; i < std::min(runtime->latest_event_id.length(), 16); ++i) { char buf[3]; std::snprintf(buf, sizeof(buf), "%02x", static_cast(bytes[i]) & 0xffu); diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp index 89b5567b..6edc4153 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp @@ -74,7 +74,8 @@ std::vector build_alarm_event_select_specs() { {opcua::NodeId(0, kAlarmConditionType), {{0, "ActiveState"}, {0, "Id"}}, UA_ATTRIBUTEID_VALUE}, {opcua::NodeId(0, kAcknowledgeableConditionType), {{0, "AckedState"}, {0, "Id"}}, UA_ATTRIBUTEID_VALUE}, {opcua::NodeId(0, kAcknowledgeableConditionType), {{0, "ConfirmedState"}, {0, "Id"}}, 
UA_ATTRIBUTEID_VALUE}, - {opcua::NodeId(0, kAlarmConditionType), {{0, "ShelvingState"}, {0, "CurrentState"}, {0, "Id"}}, + {opcua::NodeId(0, kAlarmConditionType), + {{0, "ShelvingState"}, {0, "CurrentState"}, {0, "Id"}}, UA_ATTRIBUTEID_VALUE}, {opcua::NodeId(0, kConditionType), {{0, "BranchId"}}, UA_ATTRIBUTEID_VALUE}, }; @@ -317,10 +318,9 @@ void OpcuaPoller::on_event(const AlarmEventConfig & cfg, const std::vector() == 2930u || - shelv_state.getIdentifierAs() == 2932u); + bool is_known_shelved = + (shelv_state.getNamespaceIndex() == 0 && shelv_state.getIdentifierType() == opcua::NodeIdType::Numeric) && + (shelv_state.getIdentifierAs() == 2930u || shelv_state.getIdentifierAs() == 2932u); input.shelved = is_known_shelved; } else { input.shelved = false; @@ -363,8 +363,7 @@ void OpcuaPoller::on_event(const AlarmEventConfig & cfg, const std::vector()) { it->second.latest_event_id = values[kFieldEventId].getScalarCopy(); - std::cerr << "[opcua_poller] captured EventId len=" - << it->second.latest_event_id.length() << " hex="; + std::cerr << "[opcua_poller] captured EventId len=" << it->second.latest_event_id.length() << " hex="; const auto * bytes = it->second.latest_event_id.data(); for (size_t i = 0; i < std::min(it->second.latest_event_id.length(), 16); ++i) { char buf[3]; @@ -378,9 +377,10 @@ void OpcuaPoller::on_event(const AlarmEventConfig & cfg, const std::vector/dev/null || true +docker network rm "${NET_NAME}" 2>/dev/null || true + echo "[1/5] Build test_alarm_server image" docker build --network=host \ -f src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/Dockerfile \ @@ -336,7 +343,14 @@ echo "fire SensorLost 900" >&3 wait_until_status PLC_SENSOR_LOST CONFIRMED 30 echo " OK PLC_SENSOR_LOST re-armed after Enable" -echo " [scenario] reconnect preserves CONFIRMED via ConditionRefresh" +echo " [scenario] reconnect re-subscribes after server restart" +# This scenario does NOT verify ConditionRefresh re-emit - the test_alarm_server +# is in-memory 
and loses condition state on restart, so the natural +# Part 9 §5.5.7 contract (server replays retained conditions on RefreshStartEvent) +# cannot fire here. Issue #389 tracks adding a fixture that supports it. +# What this scenario DOES verify: gateway detects disconnect, retries until the +# server returns, re-runs setup_event_subscriptions(), and a freshly fired +# alarm flows through the bridge end-to-end after the reconnect. # Pre-clear Overpressure so the next fire is a fresh CONFIRMED event. echo "clear Overpressure" >&3 wait_no_fault PLC_OVERPRESSURE 30 diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp index ed8e4bb6..d4f536a6 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp @@ -203,6 +203,10 @@ void OpcuaClient::disconnect() { // Bump generation FIRST so any in-flight event callbacks fired from the // dying subscription drop their work in the trampoline (they read // generation atomically) before we touch the storage they reference. + // The ``if (impl_->connected)`` guard ensures we bump exactly once even + // when ``maybe_mark_disconnected`` already fired earlier on a transport + // error path - that helper uses ``exchange(false)`` and would have + // already bumped, leaving impl_->connected = false here. impl_->generation.fetch_add(1, std::memory_order_release); try { // Issue #386: clear event monitored items BEFORE deleting subscriptions. 
diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp index 6edc4153..22f56565 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp @@ -219,14 +219,9 @@ void OpcuaPoller::setup_event_subscriptions() { // Issue #386: one dedicated subscription for AlarmCondition events; uses // a default no-op data callback because we wire MIs of EVENTNOTIFIER // attribute, not data-change MIs. The EventCallback bound below is what - // actually receives notifications. - if (event_subscription_id_ != 0) { - // Already configured (reconnect path will re-create after disconnect - // bumps the generation counter, so we should not get here twice on the - // same logical session). Defensive: skip silently. - return; - } - + // actually receives notifications. Caller (start() and the poll_loop + // reconnect arm) is responsible for zeroing event_subscription_id_ before + // calling - both currently do. event_subscription_id_ = client_.create_subscription(config_.subscription_interval_ms, [](const std::string &, const OpcuaValue &) { /* no-op */ }); if (event_subscription_id_ == 0) { From 12b34065d2284f75dd6687bcb815797845ab93a2 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sun, 26 Apr 2026 11:41:19 +0200 Subject: [PATCH 14/22] fix(opcua,#386): operator-visible warn when ConditionRefresh is rejected Today ``OpcuaPoller::condition_refresh()`` swallows server failures with a silent comment ("not fatal - many test servers do not implement"). Real PLCs hit this too: open62541 v1.4.x returns BadMethodInvalid, Siemens S7-1500 omits ConditionRefresh entirely, Beckhoff TF6100 status unconfirmed in public docs. The operator gets no signal that their ``alarm-replay-on-reconnect`` contract is broken. - PollerConfig gains ``log_warn`` (std::function), optional. 
The plugin owning the poller wires it to its inherited GatewayPlugin::log_warn so messages reach the ROS 2 logger and /rosout, not just container stderr. - ``OpcuaPoller::condition_refresh()`` emits one warn per connect on failure (throttled by ``condition_refresh_warned_``) carrying the StatusCode and pointing at issue #389. Reset on success so a recovered server earns a fresh warn next time it breaks. - Falls back to ``std::cerr [opcua_poller WARN]`` when the callback is not set, preserving observability for unit-test paths that don't go through the plugin. No behavior change for the success path. Live transitions continue to flow regardless; this is purely operator observability for the "reconnect doesn't replay state" failure mode. --- .../ros2_medkit_opcua/opcua_poller.hpp | 12 ++++++++++ .../ros2_medkit_opcua/src/opcua_plugin.cpp | 6 +++++ .../ros2_medkit_opcua/src/opcua_poller.cpp | 24 +++++++++++++++++-- 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp index 45ef3608..cbb3a6fb 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp @@ -82,6 +82,12 @@ struct PollerConfig { double subscription_interval_ms{500.0}; std::chrono::milliseconds poll_interval{1000}; std::chrono::milliseconds reconnect_interval{5000}; + /// Optional warn-level log sink for operator-visible failures inside the + /// poll thread. Set by the plugin owning the poller to its log_warn + /// helper so events like ``ConditionRefresh failed`` reach the ROS 2 log + /// instead of stderr only. Empty by default - the poller falls back to + /// stderr in that case. 
+ std::function<void(const std::string &)> log_warn; }; /// Manages OPC-UA data collection via subscriptions (preferred) or polling @@ -183,6 +189,12 @@ class OpcuaPoller { // tracking the EventType, which we do explicitly below. std::atomic<bool> condition_refresh_in_progress_{false}; + // Throttle the warn-level log emitted from condition_refresh() failures. + // Reset to false on each fresh subscribe in setup_event_subscriptions() + // so a transient server error (BadMethodInvalid on cold-start, recovers + // later) gets one log per connect, not one per re-subscribe attempt. + bool condition_refresh_warned_{false}; + // Thread safety: must be set via set_poll_callback() before start(). // Not modified after start(), so safe to read from the poll thread without a mutex. PollCallback poll_callback_; diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp index a646c61c..a529cde1 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp @@ -182,6 +182,12 @@ void OpcuaPlugin::set_context(PluginContext & context) { poller_->set_poll_callback([this](const PollSnapshot & snap) { publish_values(snap); }); + // Plumb operator-visible warnings out of the poll thread into the ROS log. + // Without this, ``ConditionRefresh rejected`` etc. only land in the + // container's stderr, invisible to anyone watching /rosout or rclcpp logs. + poller_config_.log_warn = [this](const std::string & msg) { + log_warn(msg); + }; poller_->start(poller_config_); log_info("OPC-UA poller started (mode: " + std::string(poller_->using_subscriptions() ?
"subscription" : "poll") + ")"); diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp index 22f56565..9d5d6438 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp @@ -264,8 +264,28 @@ void OpcuaPoller::condition_refresh() { auto result = client_.call_method(opcua::NodeId(0, kServerObjectId), opcua::NodeId(0, kConditionRefreshMethodId), args); if (!result.has_value()) { - // Not fatal - many test servers do not implement ConditionRefresh. - // Live notifications will still flow once conditions transition. + // Not fatal but operator-visible: when ConditionRefresh is rejected by + // the server (BadMethodInvalid in open62541 v1.4.x, BadNotImplemented + // on Siemens S7-1500, etc.) the gateway will not re-receive any active + // conditions on reconnect; only live transitions surface in /faults. + // Worth a single warn per connect so the operator knows their + // alarm-replay-on-reconnect contract is broken with this PLC. + if (!condition_refresh_warned_) { + const std::string msg = "OPC-UA ConditionRefresh rejected (" + result.error().message + + "); active conditions will NOT be replayed on reconnect with this server. " + "Live transitions still flow. See issue #389."; + if (config_.log_warn) { + config_.log_warn(msg); + } else { + std::cerr << "[opcua_poller WARN] " << msg << std::endl; + } + condition_refresh_warned_ = true; + } + } else { + // Reset the throttle: a successful refresh means the server is + // cooperating again, so the next failure (e.g., after a restart of a + // server with a different config) earns a fresh warn. 
+ condition_refresh_warned_ = false; } } From 74b8460f364daa97950d405b5ed41939246b95d8 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sun, 26 Apr 2026 11:54:59 +0200 Subject: [PATCH 15/22] test(opcua,#386): unit cover the call_method per-arg result classifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract the OPC-UA Part 4 §5.11.2 Call result classification from ``OpcuaClient::call_method`` into two public static helpers: - ``status_to_method_error(uint32_t code, const std::string & msg)`` maps an OPC-UA StatusCode to one of {MethodNotFound, InvalidArgument, MethodTimeout, TransportError} via the existing dispatch table. - ``classify_call_result(uint32_t overall, std::vector args)`` walks the per-argument results and returns the first non-Good code, with overall statusCode taking precedence. Public statics so the test suite can hit the BadEventIdUnknown branch that previously had no unit anchor - the bug ``call_method`` was just fixed for (statusCode=Good + inputArgumentResults[0]=Bad) is exercised by the docker integration test (run_alarm_tests.sh ack/confirm flow) but a future refactor that drops the per-arg loop would not get caught locally. ~30 LoC of pure tests catch this in seconds. API surface uses ``uint32_t`` instead of ``UA_StatusCode`` so the public header doesn't pull in open62541 types - the .cpp internally treats them identically (UA_StatusCode is a uint32_t typedef). 
9 new gtest cases (3 for ``status_to_method_error``, 6 for ``classify_call_result``): - MethodNotFound / InvalidArgument / Timeout family dispatch - BadEventIdUnknown stays in TransportError (signals SOVD to retry, not reject, since the EventId staling is a transient race) - Empty arg results, all-Good arg results -> success - Bad overall status returns error - BadEventIdUnknown in arg[0] returns error with "input arg 0" message - First bad arg wins over later bad args - Overall status beats arg results (transport error short-circuits) Existing ``call_method`` body is unchanged in behavior; the diagnostic stderr trace that prints ``statusCode=`` and per-arg codes is preserved verbatim. Local verify: ``test_opcua_client`` reports 26/26 PASSED including all 9 new cases. --- .../ros2_medkit_opcua/opcua_client.hpp | 17 ++++ .../ros2_medkit_opcua/src/opcua_client.cpp | 80 +++++++++-------- .../test/test_opcua_client.cpp | 88 +++++++++++++++++++ 3 files changed, 150 insertions(+), 35 deletions(-) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp index 49863692..ce058ad7 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp @@ -193,6 +193,23 @@ class OpcuaClient { call_method(const opcua::NodeId & object_id, const opcua::NodeId & method_id, const std::vector & input_args); + /// Map an OPC-UA StatusCode (from an attempted method call or a + /// per-argument validation result) to a ``MethodError`` category. + /// Exposed as a public static helper so the classification table is + /// covered by unit tests without needing a live OPC-UA connection. 
+ static MethodErrorInfo status_to_method_error(uint32_t code, const std::string & message); + + /// Classify the full result of a Call service exchange per OPC-UA + /// Part 4 §5.11.2: overall ``statusCode`` covers transport / method + /// resolution; ``inputArgumentResults`` covers per-argument validation. + /// Returns success when both are Good. The first non-Good code wins: + /// overall statusCode takes precedence, then arg_results in order. + /// AlarmConditionType.Acknowledge surfaces ``BadEventIdUnknown`` in + /// ``arg_results[0]`` when the EventId we cached has been superseded. + /// Exposed as a static for unit-test coverage of the per-arg branch. + static tl::expected<void, MethodErrorInfo> classify_call_result(uint32_t overall_status_code, + const std::vector<uint32_t> & arg_results); /// Get server description string (for status endpoint) std::string server_description() const; diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp index d4f536a6..de227675 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp @@ -759,6 +759,40 @@ bool OpcuaClient::remove_event_monitored_item(uint32_t subscription_id, uint32_t return true; } +OpcuaClient::MethodErrorInfo OpcuaClient::status_to_method_error(uint32_t code, const std::string & message) { + if (code == UA_STATUSCODE_BADMETHODINVALID || code == UA_STATUSCODE_BADNODEIDUNKNOWN || + code == UA_STATUSCODE_BADNOTSUPPORTED) { + return {MethodError::MethodNotFound, message}; + } + if (code == UA_STATUSCODE_BADARGUMENTSMISSING || code == UA_STATUSCODE_BADINVALIDARGUMENT || + code == UA_STATUSCODE_BADTYPEMISMATCH || code == UA_STATUSCODE_BADTOOMANYARGUMENTS) { + return {MethodError::InvalidArgument, message}; + } + if (code == UA_STATUSCODE_BADTIMEOUT) { + return {MethodError::MethodTimeout, message}; + } + return {MethodError::TransportError, message}; +} + +tl::expected<void, OpcuaClient::MethodErrorInfo>
+OpcuaClient::classify_call_result(uint32_t overall_status_code, const std::vector<uint32_t> & arg_results) { + if (overall_status_code != UA_STATUSCODE_GOOD) { + return tl::make_unexpected(status_to_method_error(overall_status_code, UA_StatusCode_name(overall_status_code))); + } + // Per OPC-UA Part 4 §5.11.2, even when overall statusCode is Good the + // server may flag per-argument validation failures via inputArgumentResults. + // AlarmConditionType.Acknowledge surfaces BadEventIdUnknown here when our + // cached EventId has been superseded - the call did NOT take effect. + // Returning an error keeps the SOVD layer from falsely reporting success. + for (size_t i = 0; i < arg_results.size(); ++i) { + if (arg_results[i] != UA_STATUSCODE_GOOD) { + std::string msg = std::string(UA_StatusCode_name(arg_results[i])) + " on input arg " + std::to_string(i); + return tl::make_unexpected(status_to_method_error(arg_results[i], msg)); + } + } + return {}; +} + tl::expected<std::vector<opcua::Variant>, OpcuaClient::MethodErrorInfo> OpcuaClient::call_method(const opcua::NodeId & object_id, const opcua::NodeId & method_id, const std::vector<opcua::Variant> & input_args) { @@ -768,49 +802,25 @@ OpcuaClient::call_method(const opcua::NodeId & object_id, const opcua::NodeId & return tl::make_unexpected(MethodErrorInfo{MethodError::NotConnected, "Not connected to OPC-UA server"}); } - // Helper: map an OPC-UA status code to a MethodError category.
- auto status_to_error = [](UA_StatusCode code, const std::string & msg) -> MethodErrorInfo { - if (code == UA_STATUSCODE_BADMETHODINVALID || code == UA_STATUSCODE_BADNODEIDUNKNOWN || - code == UA_STATUSCODE_BADNOTSUPPORTED) { - return {MethodError::MethodNotFound, msg}; - } - if (code == UA_STATUSCODE_BADARGUMENTSMISSING || code == UA_STATUSCODE_BADINVALIDARGUMENT || - code == UA_STATUSCODE_BADTYPEMISMATCH || code == UA_STATUSCODE_BADTOOMANYARGUMENTS) { - return {MethodError::InvalidArgument, msg}; - } - if (code == UA_STATUSCODE_BADTIMEOUT) { - return {MethodError::MethodTimeout, msg}; - } - return {MethodError::TransportError, msg}; - }; - try { opcua::CallMethodResult result = opcua::services::call(impl_->client, object_id, method_id, opcua::Span(input_args)); UA_StatusCode code = result.getStatusCode().get(); + auto arg_results = result.getInputArgumentResults(); std::cerr << "[opcua_client] call_method object=" << object_id.toString() << " method=" << method_id.toString() << " statusCode=" << UA_StatusCode_name(code); - auto arg_results = result.getInputArgumentResults(); + std::vector arg_codes; + arg_codes.reserve(arg_results.size()); for (size_t i = 0; i < arg_results.size(); ++i) { - std::cerr << " arg" << i << "=" << UA_StatusCode_name(arg_results[i].get()); + uint32_t arg_code = arg_results[i].get(); + arg_codes.push_back(arg_code); + std::cerr << " arg" << i << "=" << UA_StatusCode_name(arg_code); } std::cerr << std::endl; - if (code != UA_STATUSCODE_GOOD) { - return tl::make_unexpected(status_to_error(code, UA_StatusCode_name(code))); - } - // Per OPC-UA Part 4 §5.11.2, even when the overall call statusCode is - // Good the server may report per-argument validation failures via - // inputArgumentResults. AlarmConditionType.Acknowledge surfaces - // BadEventIdUnknown here when the EventId we tracked has been - // superseded by a newer event from the server. 
Treat any non-Good - // per-arg result as a transport error so the SOVD layer returns 502 - // instead of falsely reporting success. - for (size_t i = 0; i < arg_results.size(); ++i) { - UA_StatusCode arg_code = arg_results[i].get(); - if (arg_code != UA_STATUSCODE_GOOD) { - std::string msg = std::string(UA_StatusCode_name(arg_code)) + " on input arg " + std::to_string(i); - return tl::make_unexpected(status_to_error(arg_code, msg)); - } + + auto classified = classify_call_result(code, arg_codes); + if (!classified.has_value()) { + return tl::make_unexpected(classified.error()); } auto outputs_span = result.getOutputArguments(); std::vector outputs; @@ -821,7 +831,7 @@ OpcuaClient::call_method(const opcua::NodeId & object_id, const opcua::NodeId & return outputs; } catch (const opcua::BadStatus & e) { maybe_mark_disconnected(impl_->connected, impl_->generation, e); - return tl::make_unexpected(status_to_error(e.code(), e.what())); + return tl::make_unexpected(status_to_method_error(e.code(), e.what())); } } diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp index 170c2033..a67ad6f4 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp @@ -158,4 +158,92 @@ TEST(OpcuaClientTest, RemoveSubscriptionsBumpsGenerationEvenWhenEmpty) { EXPECT_GT(client.current_generation(), before); } +// --------------------------------------------------------------------------- +// Issue #386 follow-up: cover the OPC-UA Part 4 §5.11.2 Call result +// classification paths (overall statusCode + per-argument inputArgumentResults) +// directly. The integration test in docker/scripts/run_alarm_tests.sh +// triggers the per-arg ``BadEventIdUnknown`` flow during real ack/confirm, +// but a unit-test anchor catches future regressions in seconds rather than +// the ~15 minutes a docker run takes. 
+// --------------------------------------------------------------------------- + +TEST(OpcuaClientStatusToMethodError, MethodNotFoundFamily) { + EXPECT_EQ(OpcuaClient::status_to_method_error(UA_STATUSCODE_BADMETHODINVALID, "x").code, + OpcuaClient::MethodError::MethodNotFound); + EXPECT_EQ(OpcuaClient::status_to_method_error(UA_STATUSCODE_BADNODEIDUNKNOWN, "x").code, + OpcuaClient::MethodError::MethodNotFound); + EXPECT_EQ(OpcuaClient::status_to_method_error(UA_STATUSCODE_BADNOTSUPPORTED, "x").code, + OpcuaClient::MethodError::MethodNotFound); +} + +TEST(OpcuaClientStatusToMethodError, InvalidArgumentFamily) { + EXPECT_EQ(OpcuaClient::status_to_method_error(UA_STATUSCODE_BADARGUMENTSMISSING, "x").code, + OpcuaClient::MethodError::InvalidArgument); + EXPECT_EQ(OpcuaClient::status_to_method_error(UA_STATUSCODE_BADINVALIDARGUMENT, "x").code, + OpcuaClient::MethodError::InvalidArgument); + EXPECT_EQ(OpcuaClient::status_to_method_error(UA_STATUSCODE_BADTYPEMISMATCH, "x").code, + OpcuaClient::MethodError::InvalidArgument); + EXPECT_EQ(OpcuaClient::status_to_method_error(UA_STATUSCODE_BADTOOMANYARGUMENTS, "x").code, + OpcuaClient::MethodError::InvalidArgument); +} + +TEST(OpcuaClientStatusToMethodError, TimeoutAndDefault) { + EXPECT_EQ(OpcuaClient::status_to_method_error(UA_STATUSCODE_BADTIMEOUT, "x").code, + OpcuaClient::MethodError::MethodTimeout); + // BadEventIdUnknown is not in any explicit case - falls through to + // TransportError. The SOVD layer maps that to HTTP 502, prompting the + // caller to retry rather than treat the request as fatally invalid. + EXPECT_EQ(OpcuaClient::status_to_method_error(UA_STATUSCODE_BADEVENTIDUNKNOWN, "x").code, + OpcuaClient::MethodError::TransportError); + // Generic transport error. 
+ EXPECT_EQ(OpcuaClient::status_to_method_error(UA_STATUSCODE_BADCONNECTIONCLOSED, "x").code, + OpcuaClient::MethodError::TransportError); +} + +TEST(OpcuaClientClassifyCallResult, GoodOverallNoArgsSucceeds) { + auto result = OpcuaClient::classify_call_result(UA_STATUSCODE_GOOD, {}); + EXPECT_TRUE(result.has_value()); +} + +TEST(OpcuaClientClassifyCallResult, GoodOverallAllArgsGoodSucceeds) { + auto result = OpcuaClient::classify_call_result(UA_STATUSCODE_GOOD, {UA_STATUSCODE_GOOD, UA_STATUSCODE_GOOD}); + EXPECT_TRUE(result.has_value()); +} + +TEST(OpcuaClientClassifyCallResult, BadOverallStatusReturnsError) { + auto result = OpcuaClient::classify_call_result(UA_STATUSCODE_BADMETHODINVALID, {UA_STATUSCODE_GOOD}); + ASSERT_FALSE(result.has_value()); + EXPECT_EQ(result.error().code, OpcuaClient::MethodError::MethodNotFound); +} + +TEST(OpcuaClientClassifyCallResult, GoodOverallBadInputArgReturnsError) { + // The exact AlarmConditionType.Acknowledge case: server validates the + // EventId argument, finds it's been superseded, sets statusCode=Good + // (the call itself was well-formed) but inputArgumentResults[0]= + // BadEventIdUnknown to signal the ack did NOT take effect. + auto result = OpcuaClient::classify_call_result(UA_STATUSCODE_GOOD, {UA_STATUSCODE_BADEVENTIDUNKNOWN}); + ASSERT_FALSE(result.has_value()); + EXPECT_EQ(result.error().code, OpcuaClient::MethodError::TransportError); + EXPECT_NE(result.error().message.find("input arg 0"), std::string::npos); +} + +TEST(OpcuaClientClassifyCallResult, FirstBadArgWinsOverLaterBadArgs) { + // Multiple per-arg failures: classifier reports the earliest one so the + // diagnostic message points at the first thing that broke (consistent + // with how the spec lists arguments left-to-right). 
+ auto result = OpcuaClient::classify_call_result(UA_STATUSCODE_GOOD, + {UA_STATUSCODE_BADTYPEMISMATCH, UA_STATUSCODE_BADEVENTIDUNKNOWN}); + ASSERT_FALSE(result.has_value()); + EXPECT_EQ(result.error().code, OpcuaClient::MethodError::InvalidArgument); + EXPECT_NE(result.error().message.find("input arg 0"), std::string::npos); +} + +TEST(OpcuaClientClassifyCallResult, OverallStatusBeatsArgResults) { + // When both overall statusCode and arg results are bad, overall wins: + // a transport-level rejection means the server never even validated args. + auto result = OpcuaClient::classify_call_result(UA_STATUSCODE_BADTIMEOUT, {UA_STATUSCODE_BADEVENTIDUNKNOWN}); + ASSERT_FALSE(result.has_value()); + EXPECT_EQ(result.error().code, OpcuaClient::MethodError::MethodTimeout); +} + } // namespace ros2_medkit_gateway From b490c2ab69accc9c90710e05bba9ee5e8cea78ad Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sun, 26 Apr 2026 11:59:11 +0200 Subject: [PATCH 16/22] test(opcua,#386): cover the missing AlarmStateMachine transition cells Verification round on the post-#386 review found 9 untested cells in the AlarmStateMachine transition matrix (issue #389 follow-up). Adding focused tests so a future refactor cannot silently break a corner case. New cells covered: - ``DisabledClearsHealedAlarm`` - Healed exit via Rule 2 (was_active=true for Healed too, so disabling a latched alarm must ClearFault, not just flip to Suppressed). - ``DisabledNoOpWhenAlreadyCleared`` - confirms Cleared+disabled lands at Suppressed/NoOp (no spurious second clear). - ``ShelvedClearsHealedAlarm`` / ``ShelvedNoOpWhenAlreadyCleared`` - same pair via Rule 3. - ``ActiveAlarmReportsConfirmedFromSuppressed`` - operator unshelves/re-enables an alarm whose source is still active; Rule 4 must promote Suppressed -> Confirmed with ReportConfirmed (NOT NoOp). This is exactly the unshelve+re-fire path scenario 2 of run_alarm_tests.sh exercises end-to-end. 
- ``BranchEventFromHealedNoOp`` / ``BranchEventFromSuppressedNoOp`` - Rule 1 precedence holds across all four prev_status values. - ``AckedAndConfirmedNoOpFromSuppressed`` - ``was_active=false`` branch of Rule 5: a fully cleared event delivered while suppressed advances next_status to Cleared but issues no ClearFault (no fault to clear). - ``InactiveUnackedFromSuppressedReportsHealed`` - ditto but with the ack/confirm bits unset, must surface as Healed/ReportHealed so the unfinished ack/confirm workflow item stays visible. State machine code itself is unchanged. 27 gtest cases, all PASSED. Pure-function tests, deterministic, no I/O. --- .../test/test_alarm_state_machine.cpp | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_alarm_state_machine.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_alarm_state_machine.cpp index 38a663e4..43a50b1e 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_alarm_state_machine.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_alarm_state_machine.cpp @@ -208,4 +208,107 @@ TEST(AlarmStateMachineTest, ShelvedTakesPrecedenceOverActive) { EXPECT_EQ(out.action, AlarmAction::NoOp); } +// -- Coverage of remaining transition matrix cells (issue #389 follow-up). +// The above tests exercise the obvious paths; these cover the corners where +// prev_status is Healed or Suppressed and an exit / re-entry rule fires. + +TEST(AlarmStateMachineTest, DisabledClearsHealedAlarm) { + // Operator disables an already-latched (Healed) alarm: must transition to + // Cleared with a ClearFault action so the latched fault disappears from + // /faults instead of orphaning between HEALED and Cleared forever. 
+ AlarmEventInput in; + in.enabled_state = false; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Healed, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Cleared); + EXPECT_EQ(out.action, AlarmAction::ClearFault); +} + +TEST(AlarmStateMachineTest, DisabledNoOpWhenAlreadyCleared) { + AlarmEventInput in; + in.enabled_state = false; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Cleared, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Suppressed); + EXPECT_EQ(out.action, AlarmAction::NoOp); +} + +TEST(AlarmStateMachineTest, ShelvedClearsHealedAlarm) { + // Same exit shape as Disabled, via the shelving rule. + AlarmEventInput in; + in.enabled_state = true; + in.active_state = false; + in.shelved = true; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Healed, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Cleared); + EXPECT_EQ(out.action, AlarmAction::ClearFault); +} + +TEST(AlarmStateMachineTest, ShelvedNoOpWhenAlreadyCleared) { + AlarmEventInput in; + in.enabled_state = true; + in.active_state = false; + in.shelved = true; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Cleared, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Suppressed); + EXPECT_EQ(out.action, AlarmAction::NoOp); +} + +TEST(AlarmStateMachineTest, ActiveAlarmReportsConfirmedFromSuppressed) { + // Operator unshelves / re-enables an alarm whose underlying source is + // still active: the next event has active=true and the state machine + // must promote Suppressed -> Confirmed with a ReportConfirmed action. 
+ AlarmEventInput in = live_event(true); + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Suppressed, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Confirmed); + EXPECT_EQ(out.action, AlarmAction::ReportConfirmed); +} + +TEST(AlarmStateMachineTest, BranchEventFromHealedNoOp) { + AlarmEventInput in; + in.enabled_state = true; + in.branch_id_present = true; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Healed, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Healed); + EXPECT_EQ(out.action, AlarmAction::NoOp); +} + +TEST(AlarmStateMachineTest, BranchEventFromSuppressedNoOp) { + AlarmEventInput in; + in.enabled_state = true; + in.branch_id_present = true; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Suppressed, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Suppressed); + EXPECT_EQ(out.action, AlarmAction::NoOp); +} + +TEST(AlarmStateMachineTest, AckedAndConfirmedNoOpFromSuppressed) { + // Suppressed alarm receives a fully-cleared event (active=false, + // acked=true, confirmed=true). Was already not-active per the + // suppression; no ClearFault to issue, but next_status should track + // to Cleared so a later re-fire re-promotes correctly. This is the + // ``was_active=false`` branch of rule 5. + AlarmEventInput in; + in.enabled_state = true; + in.active_state = false; + in.acked_state = true; + in.confirmed_state = true; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Suppressed, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Cleared); + EXPECT_EQ(out.action, AlarmAction::NoOp); +} + +TEST(AlarmStateMachineTest, InactiveUnackedFromSuppressedReportsHealed) { + // Suppressed alarm sees an inactive+unacked event (operator unshelved + // while the source had already self-cleared but wasn't acked). The + // state machine must surface this as Healed so the operator sees the + // pending ack/confirm workflow item rather than silently forgetting it. 
+ AlarmEventInput in; + in.enabled_state = true; + in.active_state = false; + in.acked_state = false; + in.confirmed_state = false; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Suppressed, in); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Healed); + EXPECT_EQ(out.action, AlarmAction::ReportHealed); +} + } // namespace ros2_medkit_gateway From f66bb5bd8e7869d5a9a45785b8519ed6cc04d582 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sun, 26 Apr 2026 14:07:15 +0200 Subject: [PATCH 17/22] fix(opcua,#386): per-MI active flag for event MI removal (Copilot review) Two correctness fixes from Copilot's review of PR #387 in the same concurrency cluster: [Blocker] ``remove_event_monitored_item`` was bumping the global ``generation`` counter to invalidate in-flight callbacks for the MI being removed. That works for the removed MI but ALSO trips the trampoline staleness check for every peer monitored item registered against the same subscription, silently dropping their callbacks even though their server-side MIs are still live. Replace with a per-context ``std::atomic active`` flag flipped to false before the unique_ptr is moved out; the trampoline reads it (acquire) ahead of dispatch and bails for that MI only. Global ``generation`` stays reserved for full disconnect / ``remove_subscriptions``. [Defensive] ``setup_event_subscriptions`` was capturing ``const auto & cfg`` from a range-for by reference. Per current C++ semantics the captured reference resolves to the underlying vector element (not the local reference variable), so the closure stays valid - the integration test fires three distinct alarms and gets three distinct fault_codes, confirming each callback uses its own cfg. But value capture is unambiguous and matches the review feedback. ``AlarmEventConfig`` is a small struct of strings, so the copy is cheap. 
New test ``RemoveEventMonitoredItemUnknownIdDoesNotBumpGeneration`` locks the contract: peer MI staleness must not be triggered by an unknown-ID single removal. Local verify: 27/27 test_opcua_client + 27/27 test_alarm_state_machine. --- .../ros2_medkit_opcua/src/opcua_client.cpp | 41 ++++++++++++++++--- .../ros2_medkit_opcua/src/opcua_poller.cpp | 12 +++++- .../test/test_opcua_client.cpp | 13 ++++++ 3 files changed, 58 insertions(+), 8 deletions(-) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp index de227675..47540402 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp @@ -38,11 +38,21 @@ static void on_event_trampoline_c(UA_Client * client, UA_UInt32 sub_id, void * s /// Heap-owned context passed to the open62541 C event callback. Lifetime is /// owned by ``Impl::event_callbacks`` (unique_ptr); the raw pointer handed to C /// is valid until ``remove_event_monitored_item`` or ``remove_subscriptions`` -/// erases the entry. The ``generation_snapshot`` lets the trampoline drop -/// callbacks that fire from a defunct subscription after a reconnect. +/// erases the entry. +/// +/// Two staleness guards layered together: +/// - ``generation_snapshot`` filters callbacks fired from a defunct +/// subscription after the whole client reconnected (bumped on disconnect +/// and on ``remove_subscriptions``). +/// - ``active`` filters callbacks fired from a single monitored item that has +/// been individually removed via ``remove_event_monitored_item`` while +/// other items in the same subscription are still live. Per-MI flag, +/// not the global generation, so a single removal does not invalidate +/// peer callbacks. 
struct EventCallbackContext { OpcuaClient * owner{nullptr}; uint64_t generation_snapshot{0}; + std::atomic active{true}; uint32_t subscription_id{0}; uint32_t monitored_item_id{0}; // populated after createEvent returns OpcuaClient::EventCallback callback; @@ -602,6 +612,13 @@ static void on_event_trampoline_c(UA_Client * /*client*/, UA_UInt32 sub_id, void if (ctx->generation_snapshot != ctx->owner->current_generation()) { return; } + // Single MI removed via remove_event_monitored_item while peers remain + // live - the global generation has not changed, so the peer trampolines + // (in the same subscription) must keep firing. Drop only this MI's late + // notifications via the per-context flag. + if (!ctx->active.load(std::memory_order_acquire)) { + return; + } // Copy the UA_Variant fields into open62541pp wrappers. UA_Variant_copy // duplicates the underlying buffer, which the opcua::Variant destructor @@ -743,14 +760,26 @@ bool OpcuaClient::remove_event_monitored_item(uint32_t subscription_id, uint32_t if (it == impl_->event_callbacks.end() || it->second->subscription_id != subscription_id) { return false; } + // Flip the per-MI active flag BEFORE moving the unique_ptr so any + // in-flight trampoline call observes the cleared flag and bails out + // before touching the about-to-be-freed callback. Order: + // 1. set active=false (release): trampoline reads it (acquire) and + // returns without invoking ``callback``. + // 2. move unique_ptr out of map and erase: ctx still alive in + // ``retired`` until end of function. + // 3. delete server-side MI synchronously (mutex serializes against + // the client's notification dispatch). + // 4. ``retired`` falls out of scope -> ctx freed. + // We deliberately do NOT bump the global ``generation`` counter here: + // it would invalidate every peer monitored item registered against the + // same subscription, silently dropping their callbacks even though + // their MIs are still live on the server. 
Generation is reserved for + // full disconnect / remove_subscriptions. + it->second->active.store(false, std::memory_order_release); retired = std::move(it->second); impl_->event_callbacks.erase(it); } - // Bump generation BEFORE freeing the ctx so any in-flight trampoline call - // captured the old generation and will drop its work. - impl_->generation.fetch_add(1, std::memory_order_release); - if (impl_->connected) { UA_StatusCode status = UA_Client_MonitoredItems_deleteSingle(impl_->client.handle(), subscription_id, mi_id); (void)status; // best effort; on failure the server may already have dropped it diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp index 9d5d6438..c7726f7d 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp @@ -232,8 +232,16 @@ void OpcuaPoller::setup_event_subscriptions() { event_monitored_item_ids_.clear(); for (const auto & cfg : node_map_.event_alarms()) { - auto callback = [this, &cfg](const std::vector & values, const opcua::NodeId & source_node, - const opcua::NodeId & event_type, const opcua::NodeId & condition_id) { + // Capture cfg BY VALUE: even though range-for ``const auto & cfg`` binds + // to a vector element that outlives the loop, an `&cfg`-by-reference + // capture would chain through a local reference variable whose name + // goes out of scope after each iteration. Defensible per current C++ + // semantics (the captured reference resolves to the underlying vector + // element), but value capture is unambiguous and matches Copilot's + // review feedback. AlarmEventConfig is a small struct of strings, so + // copying is cheap. 
+ auto callback = [this, cfg](const std::vector & values, const opcua::NodeId & source_node, + const opcua::NodeId & event_type, const opcua::NodeId & condition_id) { on_event(cfg, values, source_node, event_type, condition_id); }; uint32_t mi_id = diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp index a67ad6f4..66eb5905 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp @@ -131,6 +131,19 @@ TEST(OpcuaClientTest, RemoveEventMonitoredItemUnknownIdReturnsFalse) { EXPECT_FALSE(client.remove_event_monitored_item(/*sub_id=*/1, /*mi_id=*/9999)); } +TEST(OpcuaClientTest, RemoveEventMonitoredItemUnknownIdDoesNotBumpGeneration) { + // remove_event_monitored_item is per-MI cleanup. It must NOT touch the + // global generation counter - that would silently invalidate every peer + // monitored item's trampoline check and drop their callbacks even though + // the underlying server-side MIs are still live. Per-MI staleness is + // tracked by EventCallbackContext::active instead. (Copilot review on + // PR #387.) 
+ OpcuaClient client; + uint64_t before = client.current_generation(); + client.remove_event_monitored_item(/*sub_id=*/1, /*mi_id=*/9999); + EXPECT_EQ(client.current_generation(), before); +} + TEST(OpcuaClientTest, CallMethodWhenDisconnected) { OpcuaClient client; auto result = client.call_method(opcua::NodeId(0, UA_NS0ID_SERVER), opcua::NodeId(0, 11489), {}); From 9e7d2c8149677efb46f7a01ddade1e4b0a46c8ed Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sun, 26 Apr 2026 14:18:27 +0200 Subject: [PATCH 18/22] fix(opcua,#386): Copilot review feedback batch (observability + hygiene) Eight Copilot findings on PR #387 addressed in one batch: Observability (#6/#7/#8/#10): every per-event / per-method-call ``std::cerr`` trace is now gated by the ``ROS2_MEDKIT_OPCUA_TRACE`` env-var (off by default). Covered the trampoline, ``add_event_monitored_item``, ``call_method``, ``on_event``, EventId capture, state-machine classification, dispatch trace, and the SOVD ack/confirm EventId hex dump. Production logs now stay clean; integration-test debugging re-enables verbose mode with one env-var. Operator-visible ``[opcua_poller WARN] ConditionRefresh rejected`` stays unconditional. Comment / code mismatches (#1/#11/#12): - ``OpcuaPlugin::on_event_alarm`` ReportHealed branch now explicitly documents the intentional no-op and explains why pushing EVENT_PASSED to fault_manager would defeat the OPC-UA Part 9 ack/confirm contract. - ``alarm_state_machine.hpp`` Retain comment matches reality - we do not currently strip Retain=false events because the EventFilter doesn't include Retain in its select clauses (followup issue #389). Test fixes: - (#3) ``DisabledNoOpWhenAlreadyCleared`` renamed to ``DisabledTransitionsClearedToSuppressedNoOp`` so the name reflects both the status transition (Cleared -> Suppressed) and the no-op action. 
- (#13) New ``NodeMapTest.RejectsAlarmSourceUnderNodes``: locks the schema validation that rejects misplaced ``alarm_source`` keys under ``nodes:`` (was silently ignored), pointing the user at ``event_alarms:``. Schema validation (#13): ``NodeMap::load`` now rejects any ``alarm_source`` key under ``nodes:`` with an actionable error message. Previous behaviour silently ignored the typo unless paired with ``alarm.threshold``, leaving "subscribed alarm that never fires" cases impossible to diagnose at runtime. Test/script hygiene: - (#9) ``run_alarm_tests.sh`` replaces the fixed ``sleep 3`` between ``fault_manager_node`` start and ``gateway_node`` start with a 30-try poll on ``ros2 service list | grep /fault_manager/report_fault``. Matches the project rule "no fixed sleeps". - (#2) ``run_ctest.py`` smoke-test runner: skips on missing ``asyncua`` by default (preserves local dev iteration), but treats the env-var ``ROS2_MEDKIT_OPCUA_SMOKE_REQUIRE=1`` as "fail hard" so a CI step that drops the ``asyncua`` install cannot silently bypass the smoke check. Local verify: 27/27 test_opcua_client + 27/27 test_alarm_state_machine + 1/1 new RejectsAlarmSourceUnderNodes. 
--- .../docker/scripts/run_alarm_tests.sh | 12 +++- .../ros2_medkit_opcua/alarm_state_machine.hpp | 11 ++-- .../ros2_medkit_opcua/src/node_map.cpp | 16 +++-- .../ros2_medkit_opcua/src/opcua_client.cpp | 58 ++++++++++++++----- .../ros2_medkit_opcua/src/opcua_plugin.cpp | 41 ++++++++++--- .../ros2_medkit_opcua/src/opcua_poller.cpp | 56 ++++++++++++------ .../fixtures/test_alarm_server/run_ctest.py | 20 ++++++- .../test/test_alarm_state_machine.cpp | 5 +- .../ros2_medkit_opcua/test/test_node_map.cpp | 24 ++++++++ 9 files changed, 187 insertions(+), 56 deletions(-) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh index 54074132..b237069f 100755 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/scripts/run_alarm_tests.sh @@ -267,7 +267,17 @@ docker run -d --name "${GATEWAY_NAME}" --network "${NET_NAME}" \ # the gateway opcua plugin tries to call /fault_manager/report_fault. ros2 run ros2_medkit_fault_manager fault_manager_node \ > /var/lib/ros2_medkit/fault_manager.log 2>&1 & - sleep 3 + # Poll for service advertisement instead of a fixed sleep (Copilot + # review on PR #387). On slow CI runners the previous ``sleep 3`` was + # sometimes too short, leaving the gateway to come up before the + # service was discoverable. ``ros2 service list`` is the cheapest + # ROS-native availability signal. 
+ for i in $(seq 1 30); do + if ros2 service list 2>/dev/null | grep -q "/fault_manager/report_fault"; then + break + fi + sleep 0.2 + done PLUGIN_PATH=$(find /root/ws/install -name "libros2_medkit_opcua_plugin.so" | head -1) exec ros2 run ros2_medkit_gateway gateway_node \ --ros-args --params-file /config/gateway_params.yaml \ diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/alarm_state_machine.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/alarm_state_machine.hpp index 12f8dc8c..79a3ed3b 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/alarm_state_machine.hpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/alarm_state_machine.hpp @@ -68,10 +68,13 @@ struct AlarmEventInput { /// 5. ActiveState == false -> Healed or Cleared based on /// Acked + Confirmed /// -/// Retain is intentionally NOT used here. Per Part 9 §5.5.2.10 it controls -/// visibility during ConditionRefresh bursts, not lifecycle - the poller -/// strips Retain=false events delivered between RefreshStartEvent and -/// RefreshEndEvent before invoking compute(). +/// Retain is intentionally not modeled by this state machine and does not +/// affect ``compute()``. Per Part 9 §5.5.2.10 it controls visibility during +/// ConditionRefresh bursts rather than the lifecycle mapping implemented +/// here. The current EventFilter does not include Retain in its select +/// clauses; if/when ConditionRefresh-with-Retain filtering is added (issue +/// #389), it will live in the poller's pre-compute path, not in this +/// pure-function table. (Copilot review on PR #387.) 
class AlarmStateMachine { public: struct Outcome { diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp index 46124538..835e8905 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp @@ -334,14 +334,18 @@ bool NodeMap::load(const std::string & yaml_path) { } } - // Mutual-exclusion check: an entry under ``nodes:`` carrying both a - // ``threshold`` alarm and an ``alarm_source`` is a configuration error - // (the threshold path polls scalar values; the alarm_source path - // subscribes to native events). Reject the whole file rather than guess. + // Schema validation under ``nodes:``: ``alarm_source`` belongs in the + // top-level ``event_alarms:`` section, never under ``nodes:``. Silently + // ignoring a misplaced ``alarm_source`` (the previous behavior unless + // also paired with ``alarm.threshold``) lets a configuration typo land + // a "subscribed alarm that never fires", which is impossible to + // diagnose from runtime logs. Reject the whole file with an actionable + // error pointing at the right place. (Copilot review on PR #387.) for (const auto & node : (nodes ? nodes : YAML::Node{})) { - if (node["alarm_source"] && node["alarm"] && node["alarm"]["threshold"]) { + if (node["alarm_source"]) { RCLCPP_ERROR(rclcpp::get_logger("opcua.node_map"), - "Entry node_id=%s declares both threshold alarm and alarm_source - mutually exclusive", + "Entry node_id=%s uses ``alarm_source`` under ``nodes:``, which is invalid; " + "move this configuration to top-level ``event_alarms:`` (see README §event_alarms)", node["node_id"] ? 
node["node_id"].as().c_str() : ""); return false; } diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp index 47540402..3d1f4fac 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -29,6 +30,20 @@ namespace ros2_medkit_gateway { +// Env-var gate for verbose per-event / per-method-call diagnostics. +// Set ``ROS2_MEDKIT_OPCUA_TRACE=1`` to enable trampoline / EventId / +// call_method stderr logging. Off by default to keep production logs sane +// (Copilot review on PR #387: per-event std::cerr would flood under high +// alarm rates and bypass the gateway's normal logging path). Resolved once +// at process start so toggling requires a restart. +inline bool opcua_trace_enabled() { + static const bool enabled = []() { + const char * v = std::getenv("ROS2_MEDKIT_OPCUA_TRACE"); + return v != nullptr && v[0] != '\0' && std::string(v) != "0"; + }(); + return enabled; +} + // Forward declaration - defined after Impl so the trampoline can call back into // the client. Static linkage keeps the symbol private to this translation unit. struct EventCallbackContext; @@ -599,11 +614,15 @@ UA_EventFilter make_event_filter(const std::vector // Defined at namespace scope so its address is a stable function pointer. 
static void on_event_trampoline_c(UA_Client * /*client*/, UA_UInt32 sub_id, void * /*sub_ctx*/, UA_UInt32 mon_id, void * mon_ctx, size_t n_fields, UA_Variant * fields) { - std::cerr << "[opcua_client] TRAMPOLINE FIRED sub=" << sub_id << " mon=" << mon_id << " n_fields=" << n_fields - << std::endl; + if (opcua_trace_enabled()) { + std::cerr << "[opcua_client] TRAMPOLINE FIRED sub=" << sub_id << " mon=" << mon_id << " n_fields=" << n_fields + << std::endl; + } auto * ctx = static_cast(mon_ctx); if (ctx == nullptr || ctx->owner == nullptr) { - std::cerr << "[opcua_client] TRAMPOLINE: ctx null" << std::endl; + if (opcua_trace_enabled()) { + std::cerr << "[opcua_client] TRAMPOLINE: ctx null" << std::endl; + } return; } // Stale callback from a defunct subscription - ctx is still valid (we only @@ -710,11 +729,13 @@ uint32_t OpcuaClient::add_event_monitored_item(uint32_t subscription_id, const o item.requestedParameters.queueSize = 100; UA_ExtensionObject_setValueNoDelete(&item.requestedParameters.filter, &filter, &UA_TYPES[UA_TYPES_EVENTFILTER]); - // Debug log so integration-test failures surface the exact NodeId we - // hand to the server. Trace-level diagnostic; can be tightened to a - // ROS RCLCPP_DEBUG once the issue #386 server interop is stable. - std::cerr << "[opcua_client] add_event_monitored_item: subId=" << subscription_id - << " nodeId=" << source_node.toString() << " selectClauses=" << (select_specs.size() + 3) << std::endl; + // Trace-level diagnostic gated by ROS2_MEDKIT_OPCUA_TRACE so integration + // test failures can surface the exact NodeId / select-clause count we + // hand to the server, without flooding production stderr. 
+ if (opcua_trace_enabled()) { + std::cerr << "[opcua_client] add_event_monitored_item: subId=" << subscription_id + << " nodeId=" << source_node.toString() << " selectClauses=" << (select_specs.size() + 3) << std::endl; + } UA_MonitoredItemCreateResult result = UA_Client_MonitoredItems_createEvent(impl_->client.handle(), subscription_id, UA_TIMESTAMPSTORETURN_BOTH, item, @@ -731,8 +752,10 @@ uint32_t OpcuaClient::add_event_monitored_item(uint32_t subscription_id, const o UA_MonitoredItemCreateRequest_clear(&item); UA_EventFilter_clear(&filter); - std::cerr << "[opcua_client] createEvent result: status=" << UA_StatusCode_name(result.statusCode) - << " miId=" << result.monitoredItemId << std::endl; + if (opcua_trace_enabled()) { + std::cerr << "[opcua_client] createEvent result: status=" << UA_StatusCode_name(result.statusCode) + << " miId=" << result.monitoredItemId << std::endl; + } if (result.statusCode != UA_STATUSCODE_GOOD) { UA_MonitoredItemCreateResult_clear(&result); @@ -836,16 +859,19 @@ OpcuaClient::call_method(const opcua::NodeId & object_id, const opcua::NodeId & opcua::services::call(impl_->client, object_id, method_id, opcua::Span(input_args)); UA_StatusCode code = result.getStatusCode().get(); auto arg_results = result.getInputArgumentResults(); - std::cerr << "[opcua_client] call_method object=" << object_id.toString() << " method=" << method_id.toString() - << " statusCode=" << UA_StatusCode_name(code); std::vector arg_codes; arg_codes.reserve(arg_results.size()); for (size_t i = 0; i < arg_results.size(); ++i) { - uint32_t arg_code = arg_results[i].get(); - arg_codes.push_back(arg_code); - std::cerr << " arg" << i << "=" << UA_StatusCode_name(arg_code); + arg_codes.push_back(arg_results[i].get()); + } + if (opcua_trace_enabled()) { + std::cerr << "[opcua_client] call_method object=" << object_id.toString() << " method=" << method_id.toString() + << " statusCode=" << UA_StatusCode_name(code); + for (size_t i = 0; i < arg_codes.size(); ++i) { + 
std::cerr << " arg" << i << "=" << UA_StatusCode_name(arg_codes[i]); + } + std::cerr << std::endl; } - std::cerr << std::endl; auto classified = classify_call_result(code, arg_codes); if (!classified.has_value()) { diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp index a529cde1..76d06b90 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp @@ -33,6 +33,18 @@ namespace ros2_medkit_gateway { namespace { +// Env-var gate for verbose per-event / per-method-call diagnostics. +// See opcua_client.cpp for rationale (Copilot review on PR #387). Duplicated +// to keep traces local to their dispatch sites; promoting to a public header +// would expose an internal trace knob. +inline bool opcua_trace_enabled() { + static const bool enabled = []() { + const char * v = std::getenv("ROS2_MEDKIT_OPCUA_TRACE"); + return v != nullptr && v[0] != '\0' && std::string(v) != "0"; + }(); + return enabled; +} + /// Parse a JSON "value" field, coerce to the node's declared data_type, and /// validate against the optional min/max range. Shared by handle_plc_operations, /// DataProvider::write_data, and OperationProvider::execute_operation to keep @@ -534,14 +546,27 @@ void OpcuaPlugin::on_event_alarm(const AlarmEventDelivery & delivery) { send_report_fault(delivery.entity_id, delivery.fault_code, severity_str, delivery.message); break; case AlarmAction::ReportHealed: - // Fault is latched: condition is no longer active but not yet - // confirmed. We don't have a dedicated HEALED reporting verb in - // ReportFault.srv (only FAILED/PASSED), so we mark this as a PASSED - // event - fault_manager keeps the entry in HEALED state until - // confirmed, mirroring the lifecycle. + // Intentional no-op (Copilot review on PR #387). 
+ // + // OPC-UA AlarmConditionType HEALED means "alarm physically cleared + // (ActiveState=false) but operator workflow incomplete (ack and/or + // confirm pending)". Per Part 9 §5.7 the Cleared transition is + // operator-driven, not statistical. + // + // ros2_medkit_msgs/srv/ReportFault has only FAILED/PASSED verbs and + // fault_manager treats PASSED through a debounce engine. Sending + // EVENT_PASSED on every latch would let fault_manager auto-clear + // the fault via healing_threshold debounce, defeating the spec + // contract that requires explicit operator Acknowledge + Confirm. + // Conversely, healing_enabled=false would silently lose the HEALED + // signal entirely. + // + // Until we add STATUS_LATCHED (or a similar lifecycle-distinguishing + // status) to ros2_medkit_msgs/msg/Fault we keep status=CONFIRMED + // until the next ClearFault fires. The operator-side gap (cannot + // see "physically cleared, awaiting confirm" in the UI) is tracked + // separately; see PR #387 review thread. log_info("AlarmCondition HEALED (latched, awaiting ack/confirm): " + delivery.fault_code); - // No-op for now; fault_manager will keep the fault HEALED until - // CLEARED. The state transition is observable via /faults/stream. 
break; case AlarmAction::ClearFault: log_info("AlarmCondition CLEARED: " + delivery.fault_code); @@ -898,7 +923,7 @@ OpcuaPlugin::execute_operation(const std::string & entity_id, const std::string args.push_back(opcua::Variant::fromScalar(runtime->latest_event_id)); args.push_back(opcua::Variant::fromScalar(opcua::LocalizedText("", comment))); - { + if (opcua_trace_enabled()) { const auto * bytes = runtime->latest_event_id.data(); std::cerr << "[opcua_plugin] " << operation_name << " EventId len=" << runtime->latest_event_id.length() << " hex="; diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp index c7726f7d..312cc380 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,19 @@ namespace ros2_medkit_gateway { +namespace { +// See opcua_client.cpp for the canonical helper. Duplicated here so the +// poller's per-event traces stay together with the dispatch they describe; +// promoting to a public header would expose an internal trace knob. 
+inline bool opcua_trace_enabled() { + static const bool enabled = []() { + const char * v = std::getenv("ROS2_MEDKIT_OPCUA_TRACE"); + return v != nullptr && v[0] != '\0' && std::string(v) != "0"; + }(); + return enabled; +} +} // namespace + namespace { // EventType NodeIds for ConditionRefresh bracketing per OPC-UA Part 9 §5.5.7 @@ -300,8 +314,10 @@ void OpcuaPoller::condition_refresh() { void OpcuaPoller::on_event(const AlarmEventConfig & cfg, const std::vector & values, const opcua::NodeId & /*source_node*/, const opcua::NodeId & event_type, const opcua::NodeId & condition_id) { - std::cerr << "[opcua_poller] on_event fault=" << cfg.fault_code << " event_type=" << event_type.toString() - << " condition=" << condition_id.toString() << " values=" << values.size() << std::endl; + if (opcua_trace_enabled()) { + std::cerr << "[opcua_poller] on_event fault=" << cfg.fault_code << " event_type=" << event_type.toString() + << " condition=" << condition_id.toString() << " values=" << values.size() << std::endl; + } // Detect ConditionRefresh bracketing per Part 9 §5.5.7. 
The flag is for // diagnostics only; the state machine itself does not need to know // because RefreshStart / RefreshEnd notifications carry no condition @@ -386,24 +402,28 @@ void OpcuaPoller::on_event(const AlarmEventConfig & cfg, const std::vector()) { it->second.latest_event_id = values[kFieldEventId].getScalarCopy(); - std::cerr << "[opcua_poller] captured EventId len=" << it->second.latest_event_id.length() << " hex="; - const auto * bytes = it->second.latest_event_id.data(); - for (size_t i = 0; i < std::min(it->second.latest_event_id.length(), 16); ++i) { - char buf[3]; - std::snprintf(buf, sizeof(buf), "%02x", static_cast(bytes[i]) & 0xffu); - std::cerr << buf; + if (opcua_trace_enabled()) { + std::cerr << "[opcua_poller] captured EventId len=" << it->second.latest_event_id.length() << " hex="; + const auto * bytes = it->second.latest_event_id.data(); + for (size_t i = 0; i < std::min(it->second.latest_event_id.length(), 16); ++i) { + char buf[3]; + std::snprintf(buf, sizeof(buf), "%02x", static_cast(bytes[i]) & 0xffu); + std::cerr << buf; + } + std::cerr << std::endl; } - std::cerr << std::endl; - } else { + } else if (opcua_trace_enabled()) { std::cerr << "[opcua_poller] EventId field not a ByteString" << std::endl; } auto outcome = AlarmStateMachine::compute(prev_status, input); - std::cerr << "[opcua_poller] state machine: enabled=" << input.enabled_state << " active=" << input.active_state - << " acked=" << input.acked_state << " confirmed=" << input.confirmed_state - << " shelved=" << input.shelved << " branch=" << input.branch_id_present - << " prev=" << static_cast(prev_status) << " action=" << static_cast(outcome.action) - << std::endl; + if (opcua_trace_enabled()) { + std::cerr << "[opcua_poller] state machine: enabled=" << input.enabled_state << " active=" << input.active_state + << " acked=" << input.acked_state << " confirmed=" << input.confirmed_state + << " shelved=" << input.shelved << " branch=" << input.branch_id_present + << " prev=" << 
static_cast(prev_status) << " action=" << static_cast(outcome.action) + << std::endl; + } it->second.last_status = outcome.next_status; runtime_snapshot = it->second; @@ -436,8 +456,10 @@ void OpcuaPoller::on_event(const AlarmEventConfig & cfg, const std::vector Suppressed) but + // no callback fires (NoOp action). Naming reflects both halves so a + // future reader does not misread "NoOp" as "no transition". AlarmEventInput in; in.enabled_state = false; auto out = AlarmStateMachine::compute(SovdAlarmStatus::Cleared, in); diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_node_map.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_node_map.cpp index b23d64be..fecbad74 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_node_map.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_node_map.cpp @@ -521,4 +521,28 @@ component_id: test EXPECT_TRUE(map.entries()[0].alarm->above_threshold); } +TEST_F(NodeMapTest, RejectsAlarmSourceUnderNodes) { + // Schema validation: ``alarm_source`` is only valid in the top-level + // ``event_alarms:`` section. Used to be silently ignored when not paired + // with ``alarm.threshold``, which let a config typo land an alarm that + // never fires. Loader must now reject the whole file with an error so + // the typo is visible in the manifest-load log line. (Copilot review on + // PR #387.) 
+ std::string path = "/tmp/test_node_map_misplaced_alarm_source.yaml"; + std::ofstream f(path); + f << R"( +area_id: test +component_id: test +nodes: + - node_id: "ns=1;i=1" + entity_id: ent1 + data_name: val1 + alarm_source: "ns=2;s=Alarms.Misplaced" +)"; + f.close(); + + NodeMap map; + EXPECT_FALSE(map.load(path)); +} + } // namespace ros2_medkit_gateway From dba586de0e1e2e5b2deeed492760010920968e58 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sun, 26 Apr 2026 18:43:59 +0200 Subject: [PATCH 19/22] fix(opcua,#386): replace std::cerr traces with RCLCPP_DEBUG_STREAM (bburda review) Per bburda review on PR #387: env-var-gated ``std::cerr`` (Copilot fix round) was the wrong observability primitive. It bypasses the gateway's log-level controls (``--log-level opcua.client:=debug``, ``RCUTILS_LOGGING_USE_STDOUT``, /rosout aggregation) and floods raw container stderr instead of integrating with the rest of the plugin's log_info / log_warn plumbing. All 18 trace sites switched to ``RCLCPP_DEBUG_STREAM`` against named loggers - ``opcua.client`` (trampoline, add_event_monitored_item, createEvent result, call_method status), ``opcua.poller`` (on_event, captured EventId hex, state-machine inputs, dispatching action), and ``opcua.plugin`` (acknowledge_fault / confirm_fault EventId hex). Three places kept a manual pre-gate via per-logger ``debug_enabled()`` helpers because the trace builds an ``std::ostringstream`` whose construction RCLCPP_DEBUG_STREAM does NOT short-circuit (the macro constructs a stringstream unconditionally and only the underlying RCUTILS log emission is level-gated). Pre-gating keeps the per-event formatting cost off the hot path at INFO. The ``Level`` enum class is compared via ``static_cast`` because it has no defined ``operator<=``. Removed the ``opcua_trace_enabled()`` env-var helpers and their duplicated definitions across all three .cpp files; the env-var ``ROS2_MEDKIT_OPCUA_TRACE`` is replaced by the standard ROS log-level flow. 
Operators now toggle traces with: ros2 launch ... --log-level opcua.client:=debug ros2 launch ... --log-level opcua.poller:=debug ros2 launch ... --log-level opcua.plugin:=debug The ``[opcua_poller WARN] ConditionRefresh rejected`` fallback (when PollerConfig.log_warn is not wired) now goes through ``RCLCPP_WARN`` instead of raw stderr, again routing the message through /rosout. CMakeLists.txt: ``test_opcua_client`` gains ``rclcpp`` as a target dependency because ``opcua_client.cpp`` now includes ``rclcpp/logging.hpp``. The unit tests do not call ``rclcpp::init`` so this is link-only. Local verify: 27/27 test_opcua_client + 27/27 test_alarm_state_machine + 32/32 test_node_map. --- .../ros2_medkit_opcua/CMakeLists.txt | 5 ++ .../ros2_medkit_opcua/src/opcua_client.cpp | 77 ++++++++++--------- .../ros2_medkit_opcua/src/opcua_plugin.cpp | 34 ++++---- .../ros2_medkit_opcua/src/opcua_poller.cpp | 65 ++++++++-------- 4 files changed, 95 insertions(+), 86 deletions(-) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt b/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt index 740d17b7..92b4e2bf 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt @@ -194,8 +194,13 @@ if(BUILD_TESTING) target_include_directories(test_opcua_client PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include ) + # rclcpp needed because opcua_client.cpp uses RCLCPP_DEBUG_STREAM for the + # per-event diagnostics (bburda review on PR #387). The unit tests only + # call public methods that never touch the trace path under test, so the + # dep is link-only - no rclcpp::init / Node spin-up in the suite. 
medkit_target_dependencies(test_opcua_client ros2_medkit_gateway + rclcpp ) target_link_libraries(test_opcua_client open62541pp::open62541pp diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp index 3d1f4fac..1f8df84e 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp @@ -16,10 +16,9 @@ #include #include -#include #include #include -#include +#include #include #include #include @@ -27,22 +26,29 @@ #include #include +#include namespace ros2_medkit_gateway { -// Env-var gate for verbose per-event / per-method-call diagnostics. -// Set ``ROS2_MEDKIT_OPCUA_TRACE=1`` to enable trampoline / EventId / -// call_method stderr logging. Off by default to keep production logs sane -// (Copilot review on PR #387: per-event std::cerr would flood under high -// alarm rates and bypass the gateway's normal logging path). Resolved once -// at process start so toggling requires a restart. -inline bool opcua_trace_enabled() { - static const bool enabled = []() { - const char * v = std::getenv("ROS2_MEDKIT_OPCUA_TRACE"); - return v != nullptr && v[0] != '\0' && std::string(v) != "0"; - }(); - return enabled; +// Per-event / per-method-call traces use a named logger so they integrate +// with the gateway's normal log level controls (e.g. RCUTILS_LOGGING_USE_STDOUT, +// ros2 launch --log-level opcua.client:=debug). Off at INFO by default; an +// operator turns them on without rebuilding (bburda review on PR #387). +namespace { +inline rclcpp::Logger opcua_client_logger() { + static auto logger = rclcpp::get_logger("opcua.client"); + return logger; +} + +// Pre-gate for traces whose stream-build cost is non-trivial (e.g. per-arg +// loops). RCLCPP_DEBUG_STREAM constructs the std::stringstream +// unconditionally, so for hot paths we check the effective level first. 
+// ``Level`` is an enum class - cast to int for the ordered comparison. +inline bool client_debug_enabled() { + return static_cast(opcua_client_logger().get_effective_level()) <= + static_cast(rclcpp::Logger::Level::Debug); } +} // namespace // Forward declaration - defined after Impl so the trampoline can call back into // the client. Static linkage keeps the symbol private to this translation unit. @@ -614,15 +620,11 @@ UA_EventFilter make_event_filter(const std::vector // Defined at namespace scope so its address is a stable function pointer. static void on_event_trampoline_c(UA_Client * /*client*/, UA_UInt32 sub_id, void * /*sub_ctx*/, UA_UInt32 mon_id, void * mon_ctx, size_t n_fields, UA_Variant * fields) { - if (opcua_trace_enabled()) { - std::cerr << "[opcua_client] TRAMPOLINE FIRED sub=" << sub_id << " mon=" << mon_id << " n_fields=" << n_fields - << std::endl; - } + RCLCPP_DEBUG_STREAM(opcua_client_logger(), + "TRAMPOLINE FIRED sub=" << sub_id << " mon=" << mon_id << " n_fields=" << n_fields); auto * ctx = static_cast(mon_ctx); if (ctx == nullptr || ctx->owner == nullptr) { - if (opcua_trace_enabled()) { - std::cerr << "[opcua_client] TRAMPOLINE: ctx null" << std::endl; - } + RCLCPP_DEBUG(opcua_client_logger(), "TRAMPOLINE: ctx null"); return; } // Stale callback from a defunct subscription - ctx is still valid (we only @@ -729,13 +731,13 @@ uint32_t OpcuaClient::add_event_monitored_item(uint32_t subscription_id, const o item.requestedParameters.queueSize = 100; UA_ExtensionObject_setValueNoDelete(&item.requestedParameters.filter, &filter, &UA_TYPES[UA_TYPES_EVENTFILTER]); - // Trace-level diagnostic gated by ROS2_MEDKIT_OPCUA_TRACE so integration - // test failures can surface the exact NodeId / select-clause count we - // hand to the server, without flooding production stderr. 
- if (opcua_trace_enabled()) { - std::cerr << "[opcua_client] add_event_monitored_item: subId=" << subscription_id - << " nodeId=" << source_node.toString() << " selectClauses=" << (select_specs.size() + 3) << std::endl; - } + // Debug-level diagnostic so integration-test failures can surface the + // exact NodeId / select-clause count we hand to the server. Quiet at + // INFO; ``ros2 launch --log-level opcua.client:=debug`` (or env var + // RCUTILS_CONSOLE_OUTPUT_FORMAT) re-enables it. + RCLCPP_DEBUG_STREAM(opcua_client_logger(), + "add_event_monitored_item: subId=" << subscription_id << " nodeId=" << source_node.toString() + << " selectClauses=" << (select_specs.size() + 3)); UA_MonitoredItemCreateResult result = UA_Client_MonitoredItems_createEvent(impl_->client.handle(), subscription_id, UA_TIMESTAMPSTORETURN_BOTH, item, @@ -752,10 +754,8 @@ uint32_t OpcuaClient::add_event_monitored_item(uint32_t subscription_id, const o UA_MonitoredItemCreateRequest_clear(&item); UA_EventFilter_clear(&filter); - if (opcua_trace_enabled()) { - std::cerr << "[opcua_client] createEvent result: status=" << UA_StatusCode_name(result.statusCode) - << " miId=" << result.monitoredItemId << std::endl; - } + RCLCPP_DEBUG_STREAM(opcua_client_logger(), "createEvent result: status=" << UA_StatusCode_name(result.statusCode) + << " miId=" << result.monitoredItemId); if (result.statusCode != UA_STATUSCODE_GOOD) { UA_MonitoredItemCreateResult_clear(&result); @@ -864,13 +864,16 @@ OpcuaClient::call_method(const opcua::NodeId & object_id, const opcua::NodeId & for (size_t i = 0; i < arg_results.size(); ++i) { arg_codes.push_back(arg_results[i].get()); } - if (opcua_trace_enabled()) { - std::cerr << "[opcua_client] call_method object=" << object_id.toString() << " method=" << method_id.toString() - << " statusCode=" << UA_StatusCode_name(code); + if (client_debug_enabled()) { + // RCLCPP_DEBUG_STREAM constructs its std::stringstream unconditionally, + // so build the per-arg suffix only when 
DEBUG is actually active. + std::ostringstream args_oss; for (size_t i = 0; i < arg_codes.size(); ++i) { - std::cerr << " arg" << i << "=" << UA_StatusCode_name(arg_codes[i]); + args_oss << " arg" << i << "=" << UA_StatusCode_name(arg_codes[i]); } - std::cerr << std::endl; + RCLCPP_DEBUG_STREAM(opcua_client_logger(), + "call_method object=" << object_id.toString() << " method=" << method_id.toString() + << " statusCode=" << UA_StatusCode_name(code) << args_oss.str()); } auto classified = classify_call_result(code, arg_codes); diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp index 76d06b90..549c69a1 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp @@ -27,22 +27,23 @@ #include #include #include -#include +#include namespace ros2_medkit_gateway { namespace { -// Env-var gate for verbose per-event / per-method-call diagnostics. -// See opcua_client.cpp for rationale (Copilot review on PR #387). Duplicated -// to keep traces local to their dispatch sites; promoting to a public header -// would expose an internal trace knob. -inline bool opcua_trace_enabled() { - static const bool enabled = []() { - const char * v = std::getenv("ROS2_MEDKIT_OPCUA_TRACE"); - return v != nullptr && v[0] != '\0' && std::string(v) != "0"; - }(); - return enabled; +// Named logger so per-operation traces respect ROS log level filtering +// (bburda review on PR #387). Quiet at INFO; ``--log-level +// opcua.plugin:=debug`` re-enables for diagnostics. 
+inline rclcpp::Logger opcua_plugin_logger() { + static auto logger = rclcpp::get_logger("opcua.plugin"); + return logger; +} + +inline bool plugin_debug_enabled() { + return static_cast(opcua_plugin_logger().get_effective_level()) <= + static_cast(rclcpp::Logger::Level::Debug); } /// Parse a JSON "value" field, coerce to the node's declared data_type, and @@ -923,16 +924,17 @@ OpcuaPlugin::execute_operation(const std::string & entity_id, const std::string args.push_back(opcua::Variant::fromScalar(runtime->latest_event_id)); args.push_back(opcua::Variant::fromScalar(opcua::LocalizedText("", comment))); - if (opcua_trace_enabled()) { + if (plugin_debug_enabled()) { + std::ostringstream hex_oss; const auto * bytes = runtime->latest_event_id.data(); - std::cerr << "[opcua_plugin] " << operation_name << " EventId len=" << runtime->latest_event_id.length() - << " hex="; for (size_t i = 0; i < std::min(runtime->latest_event_id.length(), 16); ++i) { char buf[3]; std::snprintf(buf, sizeof(buf), "%02x", static_cast(bytes[i]) & 0xffu); - std::cerr << buf; + hex_oss << buf; } - std::cerr << " conditionId=" << runtime->condition_id.toString() << std::endl; + RCLCPP_DEBUG_STREAM(opcua_plugin_logger(), operation_name << " EventId len=" << runtime->latest_event_id.length() + << " hex=" << hex_oss.str() + << " conditionId=" << runtime->condition_id.toString()); } auto result = client_->call_method(runtime->condition_id, method_id, args); diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp index 312cc380..82f0f202 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp @@ -17,26 +17,25 @@ #include #include #include -#include -#include #include +#include #include #include #include +#include namespace ros2_medkit_gateway { namespace { -// See opcua_client.cpp for the canonical helper. 
Duplicated here so the -// poller's per-event traces stay together with the dispatch they describe; -// promoting to a public header would expose an internal trace knob. -inline bool opcua_trace_enabled() { - static const bool enabled = []() { - const char * v = std::getenv("ROS2_MEDKIT_OPCUA_TRACE"); - return v != nullptr && v[0] != '\0' && std::string(v) != "0"; - }(); - return enabled; +inline rclcpp::Logger opcua_poller_logger() { + static auto logger = rclcpp::get_logger("opcua.poller"); + return logger; +} + +inline bool poller_debug_enabled() { + return static_cast(opcua_poller_logger().get_effective_level()) <= + static_cast(rclcpp::Logger::Level::Debug); } } // namespace @@ -299,7 +298,10 @@ void OpcuaPoller::condition_refresh() { if (config_.log_warn) { config_.log_warn(msg); } else { - std::cerr << "[opcua_poller WARN] " << msg << std::endl; + // Fallback when the plugin did not wire log_warn through PollerConfig + // (e.g., direct unit tests). RCLCPP_WARN goes to /rosout instead of + // raw stderr so the warn integrates with normal log filtering. + RCLCPP_WARN(opcua_poller_logger(), "%s", msg.c_str()); } condition_refresh_warned_ = true; } @@ -314,10 +316,9 @@ void OpcuaPoller::condition_refresh() { void OpcuaPoller::on_event(const AlarmEventConfig & cfg, const std::vector & values, const opcua::NodeId & /*source_node*/, const opcua::NodeId & event_type, const opcua::NodeId & condition_id) { - if (opcua_trace_enabled()) { - std::cerr << "[opcua_poller] on_event fault=" << cfg.fault_code << " event_type=" << event_type.toString() - << " condition=" << condition_id.toString() << " values=" << values.size() << std::endl; - } + RCLCPP_DEBUG_STREAM(opcua_poller_logger(), + "on_event fault=" << cfg.fault_code << " event_type=" << event_type.toString() + << " condition=" << condition_id.toString() << " values=" << values.size()); // Detect ConditionRefresh bracketing per Part 9 §5.5.7. 
The flag is for // diagnostics only; the state machine itself does not need to know // because RefreshStart / RefreshEnd notifications carry no condition @@ -402,28 +403,28 @@ void OpcuaPoller::on_event(const AlarmEventConfig & cfg, const std::vector()) { it->second.latest_event_id = values[kFieldEventId].getScalarCopy(); - if (opcua_trace_enabled()) { - std::cerr << "[opcua_poller] captured EventId len=" << it->second.latest_event_id.length() << " hex="; + if (poller_debug_enabled()) { + std::ostringstream hex_oss; const auto * bytes = it->second.latest_event_id.data(); for (size_t i = 0; i < std::min(it->second.latest_event_id.length(), 16); ++i) { char buf[3]; std::snprintf(buf, sizeof(buf), "%02x", static_cast(bytes[i]) & 0xffu); - std::cerr << buf; + hex_oss << buf; } - std::cerr << std::endl; + RCLCPP_DEBUG_STREAM(opcua_poller_logger(), + "captured EventId len=" << it->second.latest_event_id.length() << " hex=" << hex_oss.str()); } - } else if (opcua_trace_enabled()) { - std::cerr << "[opcua_poller] EventId field not a ByteString" << std::endl; + } else { + RCLCPP_DEBUG(opcua_poller_logger(), "EventId field not a ByteString"); } auto outcome = AlarmStateMachine::compute(prev_status, input); - if (opcua_trace_enabled()) { - std::cerr << "[opcua_poller] state machine: enabled=" << input.enabled_state << " active=" << input.active_state - << " acked=" << input.acked_state << " confirmed=" << input.confirmed_state - << " shelved=" << input.shelved << " branch=" << input.branch_id_present - << " prev=" << static_cast(prev_status) << " action=" << static_cast(outcome.action) - << std::endl; - } + RCLCPP_DEBUG_STREAM(opcua_poller_logger(), + "state machine: enabled=" + << input.enabled_state << " active=" << input.active_state << " acked=" << input.acked_state + << " confirmed=" << input.confirmed_state << " shelved=" << input.shelved + << " branch=" << input.branch_id_present << " prev=" << static_cast(prev_status) + << " action=" << static_cast(outcome.action)); 
it->second.last_status = outcome.next_status; runtime_snapshot = it->second; @@ -456,10 +457,8 @@ void OpcuaPoller::on_event(const AlarmEventConfig & cfg, const std::vector Date: Sun, 26 Apr 2026 18:55:43 +0200 Subject: [PATCH 21/22] fix(opcua,#386): input validation + cross-pipeline alarm collision check (bburda review) Two security/correctness gaps from bburda's review on PR #387: (1) **fault_code / comment unbounded input.** ``execute_operation`` for ``acknowledge_fault`` / ``confirm_fault`` accepted operator input without length cap or charset filter: - ``fault_code`` was concatenated into log lines and HTTP error bodies via ``"...fault_code=\" + fault_code + \"...\"`` - newlines, control chars, or multi-MB blobs would land verbatim in gateway logs. - ``comment`` was wrapped as ``LocalizedText`` and forwarded verbatim to the PLC's condition log - an authenticated operator role could spam the PLC with multi-MB blobs on every ack. Both now validated at the operation entry: ``fault_code`` must match ``is_valid_path_segment`` (alphanumeric + ``_`` + ``-``, 1-256 chars, matching the existing entity-id bound) and ``comment`` is capped at 256 chars. Failures return HTTP 400 with the actual length so the operator sees the bound, not a silent truncation. (2) **Cross-pipeline alarm collision uncaught.** The previous mutual-exclusion check in ``NodeMap::load`` (which I added in response to Copilot review) caught ``alarm_source`` misplaced under ``nodes:``, but missed the genuine collision worth checking: the same ``(entity_id, fault_code)`` declared by both threshold polling (``nodes[*].alarm``) and event subscription (``event_alarms``). fault_manager would receive both pipelines' calls and the resulting status flapping is impossible to debug at runtime; the two paths have different semantics (debounced vs state-machine) so a merge is not even well-defined. 
``NodeMap::load`` now builds a set of ``(entity_id, fault_code)`` keys from ``event_alarms`` and rejects the whole file if any ``nodes[*].alarm`` matches a key already claimed by an event alarm. Two new gtest cases lock the contract: - ``RejectsThresholdEventAlarmCollision`` - same ``(tank_process, PLC_OVERPRESSURE)`` declared by both -> load returns false. - ``AcceptsDifferentFaultCodesAcrossPipelines`` - same entity but distinct fault_codes per pipeline -> load passes (no false-positive). Local verify: 34/34 test_node_map + 13/13 test_opcua_plugin. --- .../ros2_medkit_opcua/src/node_map.cpp | 26 ++++++++ .../ros2_medkit_opcua/src/opcua_plugin.cpp | 21 +++++++ .../ros2_medkit_opcua/test/test_node_map.cpp | 60 +++++++++++++++++++ 3 files changed, 107 insertions(+) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp index 835e8905..755faafd 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp @@ -21,9 +21,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -351,6 +353,30 @@ bool NodeMap::load(const std::string & yaml_path) { } } + // Cross-section collision check (bburda review on PR #387). The two + // alarm pipelines - threshold polling under ``nodes[*].alarm`` and + // native event subscription under ``event_alarms`` - must not address + // the same ``(entity_id, fault_code)``: fault_manager would receive + // both ``report_fault`` calls and the resulting status flapping is + // impossible to debug at runtime. The two paths produce different + // semantics (debounced vs state-machine driven) so the merge is not + // even well-defined. 
+ { + std::set<std::pair<std::string, std::string>> event_keys; + for (const auto & cfg : event_alarms_) { + event_keys.emplace(cfg.entity_id, cfg.fault_code); + } + for (const auto & entry : entries_) { + if (entry.alarm.has_value() && event_keys.count({entry.entity_id, entry.alarm->fault_code}) > 0) { + RCLCPP_ERROR(rclcpp::get_logger("opcua.node_map"), + "Duplicate (entity_id=%s, fault_code=%s) declared by both nodes[*].alarm " + "(threshold mode) and event_alarms[*] (subscription mode) - mutually exclusive", + entry.entity_id.c_str(), entry.alarm->fault_code.c_str()); + return false; + } + } + } + build_entity_defs(); return true; diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp index 549c69a1..83cfa7f3 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp @@ -909,6 +909,27 @@ OpcuaPlugin::execute_operation(const std::string & entity_id, const std::string std::string fault_code = parameters["fault_code"].get<std::string>(); std::string comment = parameters.value("comment", std::string{}); + // Input validation (bburda review on PR #387). ``fault_code`` lands in + // log lines and HTTP error bodies via string concatenation; without + // length / charset bounds an authenticated operator can inject + // newlines, control chars, or multi-MB blobs into gateway logs. + // ``comment`` is forwarded verbatim to the PLC as ``LocalizedText`` - + // unbounded length lets the same role spam the PLC's condition log. + // 256 chars matches the existing entity-id bound (``is_valid_path_segment``) + // and is generous enough for typical ``PLC_OVERPRESSURE_SENSOR_3`` style + // identifiers + a one-sentence operator comment.
+ if (!is_valid_path_segment(fault_code)) { + return tl::make_unexpected(OperationProviderErrorInfo{OperationProviderError::InvalidParameters, + "fault_code must be alphanumeric+_- and 1-256 chars (got " + + std::to_string(fault_code.size()) + " chars)", + 400}); + } + if (comment.size() > 256) { + return tl::make_unexpected( + OperationProviderErrorInfo{OperationProviderError::InvalidParameters, + "comment exceeds 256 chars (got " + std::to_string(comment.size()) + ")", 400}); + } + auto runtime = poller_->lookup_condition(entity_id, fault_code); if (!runtime) { return tl::make_unexpected(OperationProviderErrorInfo{ diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_node_map.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_node_map.cpp index fecbad74..218da484 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_node_map.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_node_map.cpp @@ -521,6 +521,66 @@ component_id: test EXPECT_TRUE(map.entries()[0].alarm->above_threshold); } +TEST_F(NodeMapTest, RejectsThresholdEventAlarmCollision) { + // bburda review on PR #387: the genuine schema collision worth checking + // is not "alarm_source under nodes" (covered separately) but the same + // ``(entity_id, fault_code)`` declared by both a threshold alarm under + // ``nodes[*].alarm`` and a subscription entry under ``event_alarms``. + // fault_manager would receive both pipelines' calls and the resulting + // status flapping is impossible to debug at runtime; the two paths have + // different semantics (debounced vs state-machine) so a merge is not + // even well-defined. Loader rejects the whole file. 
+ std::string path = "/tmp/test_node_map_alarm_collision.yaml"; + std::ofstream f(path); + f << R"( +area_id: test +component_id: test +nodes: + - node_id: "ns=1;i=1" + entity_id: tank_process + data_name: pressure + alarm: + fault_code: PLC_OVERPRESSURE + threshold: 90.0 +event_alarms: + - alarm_source: "ns=2;s=Alarms.Overpressure" + entity_id: tank_process + fault_code: PLC_OVERPRESSURE +)"; + f.close(); + + NodeMap map; + EXPECT_FALSE(map.load(path)); +} + +TEST_F(NodeMapTest, AcceptsDifferentFaultCodesAcrossPipelines) { + // Same entity, *different* fault_codes across pipelines is fine - each + // fault_manager entry is keyed on fault_code, so no collision. Locks + // the contract that the rejection above is precise (won't false-positive + // when only entity overlaps). + std::string path = "/tmp/test_node_map_alarm_no_collision.yaml"; + std::ofstream f(path); + f << R"( +area_id: test +component_id: test +nodes: + - node_id: "ns=1;i=1" + entity_id: tank_process + data_name: pressure + alarm: + fault_code: PLC_PRESSURE_HIGH + threshold: 90.0 +event_alarms: + - alarm_source: "ns=2;s=Alarms.Overheat" + entity_id: tank_process + fault_code: PLC_OVERHEAT +)"; + f.close(); + + NodeMap map; + EXPECT_TRUE(map.load(path)); +} + TEST_F(NodeMapTest, RejectsAlarmSourceUnderNodes) { // Schema validation: ``alarm_source`` is only valid in the top-level // ``event_alarms:`` section. Used to be silently ignored when not paired From 0c13920d7e111015ca418649cd38215e12473825 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sun, 26 Apr 2026 19:18:30 +0200 Subject: [PATCH 22/22] fix(opcua,#386): use rcutils logging API for Humble compat ``rclcpp::Logger::get_effective_level`` was added in Jazzy; Humble ``Unit tests (humble)`` job on PR #387 fails with ``'class rclcpp::Logger' has no member named 'get_effective_level'`` against Jazzy+ headers. 
Switch the three pre-gate helpers (``client_debug_enabled``, ``poller_debug_enabled``, ``plugin_debug_enabled``) to ``rcutils_logging_logger_is_enabled_for(name, severity)``, which is present in rcutils on Humble through Rolling. Functionally identical - the rclcpp method is just a thin wrapper around the rcutils call. Add ``<rcutils/logging.h>`` include to all three .cpp files. No public API change; named loggers (``opcua.client`` / ``opcua.poller`` / ``opcua.plugin``) and DEBUG behaviour stay identical. Local verify (Jazzy): 27/27 test_opcua_client. Humble CI gate is the proof. --- .../ros2_medkit_opcua/src/opcua_client.cpp | 7 ++++--- .../ros2_medkit_opcua/src/opcua_plugin.cpp | 6 ++++-- .../ros2_medkit_opcua/src/opcua_poller.cpp | 6 ++++-- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp index 1f8df84e..64850ac6 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp @@ -27,6 +27,7 @@ #include #include #include +#include <rcutils/logging.h> namespace ros2_medkit_gateway { @@ -43,10 +44,10 @@ inline rclcpp::Logger opcua_client_logger() { // Pre-gate for traces whose stream-build cost is non-trivial (e.g. per-arg // loops). RCLCPP_DEBUG_STREAM constructs the std::stringstream // unconditionally, so for hot paths we check the effective level first. -// ``Level`` is an enum class - cast to int for the ordered comparison. +// Uses the rcutils-level API (available in Humble through Rolling) instead +// of rclcpp::Logger::get_effective_level (Jazzy+ only).
inline bool client_debug_enabled() { - return static_cast<int>(opcua_client_logger().get_effective_level()) <= - static_cast<int>(rclcpp::Logger::Level::Debug); + return rcutils_logging_logger_is_enabled_for("opcua.client", RCUTILS_LOG_SEVERITY_DEBUG); } } // namespace diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp index 83cfa7f3..e045eadd 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp @@ -15,6 +15,7 @@ #include "ros2_medkit_opcua/opcua_plugin.hpp" #include +#include <rcutils/logging.h> #include #include #include @@ -42,8 +43,9 @@ inline rclcpp::Logger opcua_plugin_logger() { } inline bool plugin_debug_enabled() { - return static_cast<int>(opcua_plugin_logger().get_effective_level()) <= - static_cast<int>(rclcpp::Logger::Level::Debug); + // rcutils API for Humble compatibility; Jazzy+ adds rclcpp::Logger:: + // get_effective_level but Humble does not. + return rcutils_logging_logger_is_enabled_for("opcua.plugin", RCUTILS_LOG_SEVERITY_DEBUG); } /// Parse a JSON "value" field, coerce to the node's declared data_type, and diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp index 82f0f202..c4b7213e 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp @@ -24,6 +24,7 @@ #include #include +#include <rcutils/logging.h> namespace ros2_medkit_gateway { @@ -34,8 +35,9 @@ inline rclcpp::Logger opcua_poller_logger() { } inline bool poller_debug_enabled() { - return static_cast<int>(opcua_poller_logger().get_effective_level()) <= - static_cast<int>(rclcpp::Logger::Level::Debug); + // rcutils API for Humble compatibility; Jazzy+ adds rclcpp::Logger:: + // get_effective_level but Humble does not.
+ return rcutils_logging_logger_is_enabled_for("opcua.poller", RCUTILS_LOG_SEVERITY_DEBUG); } } // namespace