From 20bac298bee52a18a45581bcd7d561f8ac27c1f2 Mon Sep 17 00:00:00 2001 From: Julia Jia Date: Tue, 2 Dec 2025 10:39:57 -0800 Subject: [PATCH 1/3] Initial commit for rai_semap --- examples/check-detection-pipeline.py | 160 ++++ examples/rai-semap.launch.py | 133 ++++ examples/rosbot-xl-navigate-collect.py | 218 +++++ examples/validate-semantic-map.py | 166 ++++ src/rai_semap/README.md | 180 +++++ src/rai_semap/design.md | 577 ++++++++++++++ src/rai_semap/pyproject.toml | 25 + src/rai_semap/rai_semap/__init__.py | 13 + src/rai_semap/rai_semap/core/__init__.py | 13 + .../rai_semap/core/backend/__init__.py | 13 + .../core/backend/spatial_db_backend.py | 65 ++ .../rai_semap/core/backend/sqlite_backend.py | 342 ++++++++ src/rai_semap/rai_semap/core/base_memory.py | 45 ++ .../rai_semap/core/semantic_map_memory.py | 402 ++++++++++ src/rai_semap/rai_semap/ros2/__init__.py | 13 + .../rai_semap/ros2/detection_publisher.py | 474 +++++++++++ .../rai_semap/ros2/detection_publisher.yaml | 14 + src/rai_semap/rai_semap/ros2/node.py | 743 ++++++++++++++++++ .../rai_semap/ros2/perception_utils.py | 248 ++++++ .../rai_semap/ros2/perception_utils.yaml | 5 + src/rai_semap/rai_semap/tool/__init__.py | 13 + .../rai_semap/tool/query_semantic_map_tool.py | 44 ++ src/rai_semap/rai_semap/utils/__init__.py | 15 + .../rai_semap/utils/clear_annotations.py | 102 +++ tests/rai_semap/__init__.py | 13 + tests/rai_semap/conftest.py | 81 ++ tests/rai_semap/test_backend.py | 145 ++++ tests/rai_semap/test_memory.py | 242 ++++++ tests/rai_semap/test_node.py | 246 ++++++ 29 files changed, 4750 insertions(+) create mode 100644 examples/check-detection-pipeline.py create mode 100644 examples/rai-semap.launch.py create mode 100644 examples/rosbot-xl-navigate-collect.py create mode 100644 examples/validate-semantic-map.py create mode 100644 src/rai_semap/README.md create mode 100644 src/rai_semap/design.md create mode 100644 src/rai_semap/pyproject.toml create mode 100644 src/rai_semap/rai_semap/__init__.py create mode 100644 src/rai_semap/rai_semap/core/__init__.py create mode 100644 src/rai_semap/rai_semap/core/backend/__init__.py create mode 100644 src/rai_semap/rai_semap/core/backend/spatial_db_backend.py create mode 100644 src/rai_semap/rai_semap/core/backend/sqlite_backend.py create mode 100644 src/rai_semap/rai_semap/core/base_memory.py create mode 100644 src/rai_semap/rai_semap/core/semantic_map_memory.py create mode 100644 src/rai_semap/rai_semap/ros2/__init__.py create mode 100644 src/rai_semap/rai_semap/ros2/detection_publisher.py create mode 100644 src/rai_semap/rai_semap/ros2/detection_publisher.yaml create mode 100644 src/rai_semap/rai_semap/ros2/node.py create mode 100644 src/rai_semap/rai_semap/ros2/perception_utils.py create mode 100644 src/rai_semap/rai_semap/ros2/perception_utils.yaml create mode 100644 src/rai_semap/rai_semap/tool/__init__.py create mode 100644 src/rai_semap/rai_semap/tool/query_semantic_map_tool.py create mode 100644 src/rai_semap/rai_semap/utils/__init__.py create mode 100644 src/rai_semap/rai_semap/utils/clear_annotations.py create mode 100644 tests/rai_semap/__init__.py create mode 100644 tests/rai_semap/conftest.py create mode 100644 tests/rai_semap/test_backend.py create mode 100644 tests/rai_semap/test_memory.py create mode 100644 tests/rai_semap/test_node.py diff --git a/examples/check-detection-pipeline.py b/examples/check-detection-pipeline.py new file mode 100644 index 000000000..cd28f5d39 --- /dev/null +++ b/examples/check-detection-pipeline.py @@ -0,0 +1,160 @@ +# Copyright (C) 2025 
Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Diagnostic script to check if the detection pipeline is working.""" + +import rclpy +from rclpy.node import Node +from sensor_msgs.msg import Image + +from rai_interfaces.msg import RAIDetectionArray + + +class DetectionPipelineChecker(Node): + """Check if detection pipeline components are working.""" + + def __init__(self): + super().__init__("detection_pipeline_checker") + self.camera_received = False + self.detections_received = False + self.detection_count = 0 + + def check_camera_topic( + self, topic: str = "/camera/image_raw", timeout: float = 5.0 + ): + """Check if camera topic is publishing.""" + self.get_logger().info(f"Checking camera topic: {topic}") + _ = self.create_subscription( + Image, topic, lambda msg: setattr(self, "camera_received", True), 10 + ) + + import time + + start_time = time.time() + while time.time() - start_time < timeout: + rclpy.spin_once(self, timeout_sec=0.5) + if self.camera_received: + self.get_logger().info(f"✓ Camera topic {topic} is publishing") + return True + + self.get_logger().warn( + f"✗ Camera topic {topic} not publishing (timeout: {timeout}s)" + ) + return False + + def check_detection_topic( + self, topic: str = "/detection_array", timeout: float = 10.0 + ): + """Check if detection topic is publishing.""" + self.get_logger().info(f"Checking detection topic: {topic}") + + def detection_callback(msg: RAIDetectionArray): + self.detections_received = True + self.detection_count += len(msg.detections) + self.get_logger().info( + f"Received detection array with {len(msg.detections)} detections: " + f"{msg.detection_classes}" + ) + + _ = self.create_subscription(RAIDetectionArray, topic, detection_callback, 10) + + import time + + start_time = time.time() + while time.time() - start_time < timeout: + rclpy.spin_once(self, timeout_sec=0.5) + if self.detections_received: + self.get_logger().info( + f"✓ Detection topic {topic} is publishing ({self.detection_count} total detections)" + ) + return True + + self.get_logger().warn( + f"✗ Detection topic {topic} not publishing (timeout: {timeout}s)" + ) + return False + + def check_dino_service( + self, service_name: str = "/grounding_dino/grounding_dino_classify" + ): + """Check if DINO service is available.""" + from rai_interfaces.srv import RAIGroundingDino + + client = self.create_client(RAIGroundingDino, service_name) + + self.get_logger().info(f"Checking DINO service: {service_name}") + if client.wait_for_service(timeout_sec=2.0): + self.get_logger().info(f"✓ DINO service {service_name} is available") + return True + else: + self.get_logger().warn(f"✗ DINO service {service_name} not available") + return False + + +def main(): + """Run diagnostic checks.""" + rclpy.init() + checker = DetectionPipelineChecker() + + print("\n" + "=" * 60) + print("Detection Pipeline Diagnostic") + print("=" * 60 + "\n") + + # Check DINO service + dino_ok = checker.check_dino_service() + print() + + # Check camera topic + camera_ok = checker.check_camera_topic() 
+ print() + + # Check detection topic (wait longer since it depends on camera) + if camera_ok: + detection_ok = checker.check_detection_topic(timeout=15.0) + else: + print("Skipping detection topic check (camera not available)") + detection_ok = False + print() + + # Summary + print("=" * 60) + print("Summary:") + print(f" DINO Service: {'✓' if dino_ok else '✗'}") + print(f" Camera Topic: {'✓' if camera_ok else '✗'}") + print(f" Detection Topic: {'✓' if detection_ok else '✗'}") + print("=" * 60) + + if not dino_ok: + print( + "\n⚠️ DINO service not available. Make sure perception agents are running:" + ) + print(" python -m rai_perception.scripts.run_perception_agents") + + if not camera_ok: + print("\n⚠️ Camera topic not publishing. Check:") + print(" ros2 topic list | grep camera") + print(" ros2 topic echo /camera/image_raw --once") + + if not detection_ok and camera_ok: + print("\n⚠️ Detection topic not publishing. Check:") + print(" - Is detection_publisher node running?") + print(" - Check detection_publisher logs for errors") + print(" - Verify camera topic name matches detection_publisher config") + + checker.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/examples/rai-semap.launch.py b/examples/rai-semap.launch.py new file mode 100644 index 000000000..703e00cc6 --- /dev/null +++ b/examples/rai-semap.launch.py @@ -0,0 +1,133 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
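# Example invocation (a sketch): this launch file can be run directly with
# `ros2 launch`, overriding the arguments declared below as needed. The
# argument values shown here are placeholders for your own setup.
#
#   ros2 launch examples/rai-semap.launch.py \
#       database_path:=semantic_map.db \
#       location_id:=my_lab \
#       camera_topic:=/camera/camera/color/image_raw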
+ +from launch import LaunchDescription +from launch.actions import DeclareLaunchArgument, ExecuteProcess +from launch.substitutions import LaunchConfiguration + + +def generate_launch_description(): + # Declare launch arguments + database_path_arg = DeclareLaunchArgument( + "database_path", + default_value="semantic_map.db", + description="Path to SQLite database file for semantic map", + ) + + location_id_arg = DeclareLaunchArgument( + "location_id", + default_value="default_location", + description="Identifier for the physical location", + ) + + camera_topic_arg = DeclareLaunchArgument( + "camera_topic", + default_value="/camera/camera/color/image_raw", + description="Camera image topic", + ) + + depth_topic_arg = DeclareLaunchArgument( + "depth_topic", + default_value="/camera/camera/depth/image_rect_raw", + description="Depth image topic", + ) + + camera_info_topic_arg = DeclareLaunchArgument( + "camera_info_topic", + default_value="/camera/camera/color/camera_info", + description="Camera info topic", + ) + + # Launch detection publisher (bridges DINO service to /detection_array topic) + camera_topic_param = LaunchConfiguration("camera_topic") + depth_topic_param = LaunchConfiguration("depth_topic") + camera_info_topic_param = LaunchConfiguration("camera_info_topic") + + launch_detection_publisher = ExecuteProcess( + cmd=[ + "python", + "-m", + "rai_semap.ros2.detection_publisher", + "--ros-args", + "-p", + ["camera_topic:=", camera_topic_param], + "-p", + ["depth_topic:=", depth_topic_param], + "-p", + ["camera_info_topic:=", camera_info_topic_param], + "-p", + "detection_topic:=/detection_array", + "-p", + "dino_service:=/grounding_dino_classify", + "-p", + "detection_interval:=2.0", + "-p", + "box_threshold:=0.4", + "-p", + "text_threshold:=0.3", + ], + output="screen", + ) + + # Launch semantic map node + db_path_param = LaunchConfiguration("database_path") + loc_id_param = LaunchConfiguration("location_id") + + launch_semap = ExecuteProcess( + cmd=[ + "python", + "-m", + "rai_semap.ros2.node", + "--ros-args", + "-p", + ["database_path:=", db_path_param], + "-p", + ["location_id:=", loc_id_param], + "-p", + "confidence_threshold:=0.5", + "-p", + "class_confidence_thresholds:=person:0.7,window:0.6,door:0.5", + "-p", + "class_merge_thresholds:=couch:2.5,table:1.5,shelf:1.5,chair:0.8", + "-p", + "min_bbox_area:=500.0", + "-p", + "use_pointcloud_dedup:=true", + "-p", + ["depth_topic:=", depth_topic_param], + "-p", + ["camera_info_topic:=", camera_info_topic_param], + "-p", + "detection_topic:=/detection_array", + "-p", + "map_topic:=/map", + "-p", + "map_frame_id:=map", + "-p", + "map_resolution:=0.05", + ], + output="screen", + ) + + return LaunchDescription( + [ + database_path_arg, + location_id_arg, + camera_topic_arg, + depth_topic_arg, + camera_info_topic_arg, + launch_detection_publisher, + launch_semap, + ] + ) diff --git a/examples/rosbot-xl-navigate-collect.py b/examples/rosbot-xl-navigate-collect.py new file mode 100644 index 000000000..7627e2a0c --- /dev/null +++ b/examples/rosbot-xl-navigate-collect.py @@ -0,0 +1,218 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time +from typing import List + +import rclpy +from nav2_msgs.action import NavigateToPose +from rai.communication.ros2 import ROS2Connector +from rai.tools.ros2 import Nav2Toolkit +from rai.tools.time import WaitForSecondsTool +from rclpy.action import ActionClient + + +class NavigationCollector: + """Navigate robot and collect detections for semantic map validation.""" + + def __init__(self, connector: ROS2Connector): + self.connector = connector + self.nav_toolkit = Nav2Toolkit(connector=connector) + self.wait_tool = WaitForSecondsTool() + self._nav_action_ready = False + + def wait_for_nav_action_server(self, timeout_sec: float = 60.0) -> bool: + """Wait for Nav2 action server to be available. + + Args: + timeout_sec: Maximum time to wait for server. + + Returns: + True if server is available, False otherwise. + """ + if self._nav_action_ready: + return True + + node = self.connector.node + node.get_logger().info("Waiting for Nav2 action server to be available...") + node.get_logger().info("Checking action server at: navigate_to_pose") + + # Try different possible action names + action_names = ["navigate_to_pose", "/navigate_to_pose"] + + for action_name in action_names: + node.get_logger().info(f"Trying action name: {action_name}") + action_client = ActionClient(node, NavigateToPose, action_name) + start_time = time.time() + + while time.time() - start_time < timeout_sec: + if action_client.wait_for_server(timeout_sec=2.0): + node.get_logger().info( + f"Nav2 action server is ready at: {action_name}" + ) + self._nav_action_ready = True + return True + elapsed = time.time() - start_time + if int(elapsed) % 5 == 0 and elapsed > 0: + node.get_logger().info( + f"Still waiting for Nav2 action server... ({elapsed:.1f}s)" + ) + + node.get_logger().error( + f"Nav2 action server not available after {timeout_sec} seconds" + ) + node.get_logger().error("Make sure Nav2 is launched and running. Check:") + node.get_logger().error( + " 1. Is the launch file running? (ros2 launch examples/rosbot-xl-semap.launch.py ...)" + ) + node.get_logger().error(" 2. Check: ros2 action list | grep navigate") + node.get_logger().error(" 3. Check Nav2 logs for errors") + return False + + def navigate_to_waypoints(self, waypoints: List[tuple]) -> None: + """Navigate to a series of waypoints. + + Args: + waypoints: List of (x, y) or (x, y, yaw) tuples representing waypoints in map frame. 
+ """ + node = self.connector.node + + # Wait for Nav2 action server to be ready + if not self.wait_for_nav_action_server(): + node.get_logger().error("Cannot navigate: Nav2 action server not available") + return + + node.get_logger().info(f"Starting navigation to {len(waypoints)} waypoints") + + for i, waypoint in enumerate(waypoints): + if len(waypoint) == 2: + x, y = waypoint + yaw = 0.0 + elif len(waypoint) == 3: + x, y, yaw = waypoint + else: + node.get_logger().warn(f"Invalid waypoint format: {waypoint}, skipping") + continue + + node.get_logger().info( + f"Navigating to waypoint {i + 1}/{len(waypoints)}: ({x}, {y}, yaw={yaw})" + ) + + # Use Nav2Toolkit to navigate + nav_tools = self.nav_toolkit.get_tools() + navigate_tool = None + for tool in nav_tools: + if "navigate" in tool.name.lower(): + navigate_tool = tool + break + + if navigate_tool: + try: + result = navigate_tool.invoke( + {"x": x, "y": y, "z": 0.0, "yaw": yaw} + ) + node.get_logger().info(f"Navigation result: {result}") + except Exception as e: + node.get_logger().warn(f"Navigation failed: {e}") + else: + node.get_logger().warn("Navigate tool not found, skipping waypoint") + + # Wait at waypoint to allow detections to be collected + node.get_logger().info("Waiting at waypoint for detections...") + self.wait_tool.invoke({"seconds": 5.0}) + + node.get_logger().info("Navigation complete") + + def collect_detections(self, duration_seconds: float = 30.0) -> None: + """Stay in place and collect detections. + + Args: + duration_seconds: How long to collect detections. + """ + node = self.connector.node + node.get_logger().info( + f"Collecting detections for {duration_seconds} seconds..." + ) + + start_time = time.time() + while time.time() - start_time < duration_seconds: + self.wait_tool.invoke({"seconds": 2.0}) + elapsed = time.time() - start_time + node.get_logger().info( + f"Collecting... {elapsed:.1f}/{duration_seconds} seconds" + ) + + node.get_logger().info("Detection collection complete") + + +def main(): + parser = argparse.ArgumentParser( + description="Navigate robot and collect detections for semantic map validation" + ) + parser.add_argument( + "--waypoints", + nargs="+", + type=float, + help="Waypoints as x1 y1 x2 y2 ... 
(in map frame)", + default=[2.0, 0.0, 4.0, 0.0, 2.0, 2.0], + ) + parser.add_argument( + "--collect-duration", + type=float, + default=10.0, + help="Duration to collect detections at each waypoint (seconds)", + ) + parser.add_argument( + "--use-sim-time", + action="store_true", + help="Use simulation time", + ) + + args = parser.parse_args() + + if len(args.waypoints) % 2 != 0: + parser.error("Waypoints must be pairs of (x, y) coordinates") + + waypoints = [ + (args.waypoints[i], args.waypoints[i + 1]) + for i in range(0, len(args.waypoints), 2) + ] + + rclpy.init() + + try: + connector = ROS2Connector( + executor_type="multi_threaded", + use_sim_time=args.use_sim_time, + ) + + collector = NavigationCollector(connector) + + # Navigate to waypoints + collector.navigate_to_waypoints(waypoints) + + # Final collection period + collector.collect_detections(duration_seconds=args.collect_duration) + + connector.node.get_logger().info("Navigation and collection complete") + + except KeyboardInterrupt: + pass + finally: + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/examples/validate-semantic-map.py b/examples/validate-semantic-map.py new file mode 100644 index 000000000..ed315d403 --- /dev/null +++ b/examples/validate-semantic-map.py @@ -0,0 +1,166 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import sys +from collections import defaultdict +from pathlib import Path + +from geometry_msgs.msg import Point + +from rai_semap.core.backend.sqlite_backend import SQLiteBackend +from rai_semap.core.semantic_map_memory import SemanticMapMemory + + +def validate_database(database_path: str, location_id: str = "rosbot_xl_demo") -> bool: + """Validate stored data in semantic map database. + + Args: + database_path: Path to SQLite database file. + location_id: Location identifier to query. + + Returns: + True if validation passes, False otherwise. 
+ """ + if not Path(database_path).exists(): + print(f"ERROR: Database file not found: {database_path}") + return False + + try: + backend = SQLiteBackend(database_path) + memory = SemanticMapMemory( + backend=backend, + location_id=location_id, + map_frame_id="map", + resolution=0.05, + ) + except Exception as e: + print(f"ERROR: Failed to initialize memory: {e}") + return False + + print(f"Validating semantic map database: {database_path}") + print(f"Location ID: {location_id}") + print("-" * 60) + + # Get map metadata + try: + metadata = memory.get_map_metadata() + print(f"Map Frame ID: {metadata.map_frame_id}") + print(f"Map Resolution: {metadata.resolution} m/pixel") + print(f"Last Updated: {metadata.last_updated}") + except Exception as e: + print(f"WARNING: Failed to get map metadata: {e}") + + print("-" * 60) + + # Query all annotations + center = Point(x=0.0, y=0.0, z=0.0) + all_annotations = memory.query_by_location( + center, radius=1e10, location_id=location_id + ) + + if not all_annotations: + print("WARNING: No annotations found in database") + return False + + print(f"Total annotations: {len(all_annotations)}") + print("-" * 60) + + # Group by object class + class_counts = defaultdict(int) + confidence_sum = defaultdict(float) + detection_sources = defaultdict(set) + + for ann in all_annotations: + class_counts[ann.object_class] += 1 + confidence_sum[ann.object_class] += ann.confidence + detection_sources[ann.object_class].add(ann.detection_source) + + print("Annotations by class:") + for obj_class in sorted(class_counts.keys()): + count = class_counts[obj_class] + avg_confidence = confidence_sum[obj_class] / count + sources = ", ".join(sorted(detection_sources[obj_class])) + print( + f" {obj_class}: {count} annotations, avg confidence: {avg_confidence:.3f}, sources: {sources}" + ) + + print("-" * 60) + + # Check for required fields + print("Validating annotation fields...") + all_valid = True + + for ann in all_annotations: + if not ann.object_class: + print(f"ERROR: Annotation {ann.id} has empty object_class") + all_valid = False + if ann.confidence < 0.0 or ann.confidence > 1.0: + print( + f"ERROR: Annotation {ann.id} has invalid confidence: {ann.confidence}" + ) + all_valid = False + if not ann.detection_source: + print(f"WARNING: Annotation {ann.id} has empty detection_source") + if not ann.source_frame: + print(f"WARNING: Annotation {ann.id} has empty source_frame") + + if all_valid: + print("All annotations have valid required fields") + + print("-" * 60) + + # Spatial distribution + if all_annotations: + x_coords = [ann.pose.position.x for ann in all_annotations] + y_coords = [ann.pose.position.y for ann in all_annotations] + + print("Spatial distribution:") + print(f" X range: [{min(x_coords):.2f}, {max(x_coords):.2f}]") + print(f" Y range: [{min(y_coords):.2f}, {max(y_coords):.2f}]") + print( + f" Mean position: ({sum(x_coords) / len(x_coords):.2f}, {sum(y_coords) / len(y_coords):.2f})" + ) + + print("-" * 60) + print("Validation complete") + + return True + + +def main(): + parser = argparse.ArgumentParser( + description="Validate stored data in semantic map database" + ) + parser.add_argument( + "--database-path", + type=str, + default="semantic_map.db", + help="Path to SQLite database file", + ) + parser.add_argument( + "--location-id", + type=str, + default="rosbot_xl_demo", + help="Location identifier to query", + ) + + args = parser.parse_args() + + success = validate_database(args.database_path, args.location_id) + sys.exit(0 if success else 1) + + +if 
__name__ == "__main__": + main() diff --git a/src/rai_semap/README.md b/src/rai_semap/README.md new file mode 100644 index 000000000..dabdcbf69 --- /dev/null +++ b/src/rai_semap/README.md @@ -0,0 +1,180 @@ +# RAI Semantic Map Memory + +⚠️ **Experimental Module**: This module is in active development. Features may change and some functionality is still in progress. + +## Moudle Overview + +When a robot explores a new environment, it builds a map of the space using SLAM (Simultaneous Localization and Mapping), which creates a geometric map showing walls, obstacles, and open areas. This module extends that map with semantic information, storing what objects the robot sees and where they are located. It provides persistent storage of semantic annotations (object class, 3D pose, confidence) indexed by both spatial coordinates and semantic labels. Using this semantic map memory, user can do queries like "where did I see a red cup?" and "what objects are within 2m of (x,y)?". + +For detailed design and architecture, see [design.md](../design.md). + +## End-to-End Validation + +Phase 4 validation tests the full pipeline: launching the demo, collecting detections during navigation, and verifying stored data. + +### Prerequisites + +- ROS2 environment set up +- rai_semap package installed, `poetry install --with semap` +- ROSBot XL demo setup, see general readme. + +### Steps + +1. Launch rosbot-xl demo with semantic map node: + +```bash +ros2 launch examples/rosbot-xl-semap.launch.py \ + game_launcher:=./examples/rosbot-xl.launch.py game_launcher:=demo_assets/rosbot/RAIROSBotXLDemo/RAIROSBotXLDemo.GameLauncher \ + database_path:=semantic_map.db \ + location_id:=rosbot_xl_demo +``` + +2. Navigate and collect detections: + +In a separate terminal, run the navigation script to move the robot through waypoints and collect detections: + +```bash +python examples/rosbot-xl-navigate-collect.py \ + --waypoints 2.0 0.0 4.0 0.0 2.0 2.0 \ + --collect-duration 10.0 \ + --use-sim-time +``` + +The script navigates to each waypoint and waits to allow detections to be collected and stored in the semantic map. + +3. Validate stored data: + +After navigation completes, run the validation script to verify the database contents: + +```bash +python examples/validate-semantic-map.py \ + --database-path semantic_map.db \ + --location-id rosbot_xl_demo +``` + +The validation script reports: + +- Total annotation count +- Annotations grouped by object class +- Average confidence scores per class +- Detection sources +- Spatial distribution of annotations +- Field validation checks + +### Query by Timestamp + +To query annotations by timestamp, you can filter results after querying. Timestamps are stored as string representations of ROS2 Time objects. 
Here's an example Python script: + +```python +from geometry_msgs.msg import Point +from rai_semap.core.backend.sqlite_backend import SQLiteBackend +from rai_semap.core.semantic_map_memory import SemanticMapMemory + +# Initialize memory +backend = SQLiteBackend("semantic_map.db") +memory = SemanticMapMemory( + backend=backend, + location_id="rosbot_xl_demo", + map_frame_id="map", + resolution=0.05, +) + +# Query all annotations +center = Point(x=0.0, y=0.0, z=0.0) +all_annotations = memory.query_by_location(center, radius=1e10) + +# Filter by timestamp using string comparison +# Timestamps are stored as strings in format: "Time(nanoseconds=...)" +# Extract nanoseconds from timestamp string for comparison +def parse_timestamp_ns(timestamp_str): + """Extract nanoseconds from timestamp string like 'Time(nanoseconds=123456789)'""" + if not timestamp_str or not timestamp_str.startswith("Time(nanoseconds="): + return None + try: + ns_str = timestamp_str.split("nanoseconds=")[1].rstrip(")") + return int(ns_str) + except (IndexError, ValueError): + return None + +# Example: Get annotations from last hour +import time +current_ns = int(time.time() * 1e9) +one_hour_ns = int(3600 * 1e9) +one_hour_ago_ns = current_ns - one_hour_ns + +recent_annotations = [ + ann for ann in all_annotations + if ann.timestamp and (ann_ns := parse_timestamp_ns(ann.timestamp)) and ann_ns >= one_hour_ago_ns +] + +print(f"Found {len(recent_annotations)} annotations from the last hour") + +# Filter by specific time range (using string comparison) +start_time_str = "Time(nanoseconds=1700000000000000000)" # Example timestamp +end_time_str = "Time(nanoseconds=1700003600000000000)" # Example timestamp + +time_range_annotations = [ + ann for ann in all_annotations + if ann.timestamp and start_time_str <= ann.timestamp <= end_time_str +] + +print(f"Found {len(time_range_annotations)} annotations in time range") +``` + +For more complex timestamp queries, you can also query the database directly using SQL: + +```python +import sqlite3 + +conn = sqlite3.connect("semantic_map.db") +cursor = conn.cursor() + +# Example: Query annotations after a specific timestamp +# Timestamps are stored as strings, so string comparison works for ordering +cursor.execute(""" + SELECT * FROM annotations + WHERE location_id = ? + AND timestamp >= ? + ORDER BY timestamp DESC +""", ("rosbot_xl_demo", "Time(nanoseconds=1700000000000000000)")) + +rows = cursor.fetchall() +print(f"Found {len(rows)} annotations after specified timestamp") +``` + +### Expected Output + +The validation script should show annotations stored from detections collected during navigation, with valid confidence scores, detection sources, and spatial coordinates in the map frame. 
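For a quick spot check beyond the validation script, the stored annotations can also be queried directly from Python. This is a minimal sketch using the same API as `examples/validate-semantic-map.py` (`SQLiteBackend`, `SemanticMapMemory`, `query_by_location`); the database path, location ID, and query point are example values:

```python
from geometry_msgs.msg import Point

from rai_semap.core.backend.sqlite_backend import SQLiteBackend
from rai_semap.core.semantic_map_memory import SemanticMapMemory

# Open the database produced by the demo run
backend = SQLiteBackend("semantic_map.db")
memory = SemanticMapMemory(
    backend=backend,
    location_id="rosbot_xl_demo",
    map_frame_id="map",
    resolution=0.05,
)

# "What did the robot see within 2 m of (2.5, 1.3)?"
center = Point(x=2.5, y=1.3, z=0.0)
nearby = memory.query_by_location(center, radius=2.0, location_id="rosbot_xl_demo")
for ann in nearby:
    pos = ann.pose.position
    print(
        f"{ann.object_class} at ({pos.x:.2f}, {pos.y:.2f}, {pos.z:.2f}), "
        f"confidence={ann.confidence:.2f}"
    )
```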
+ +## Utilities + +### Clear All Annotations + +To remove all annotations from the semantic map database: + +```bash +python -m rai_semap.utils.clear_annotations \ + --database-path semantic_map.db +``` + +To remove annotations for a specific location only: + +```bash +python -m rai_semap.utils.clear_annotations \ + --database-path semantic_map.db \ + --location-id rosbot_xl_demo +``` + +To skip the confirmation prompt: + +```bash +python -m rai_semap.utils.clear_annotations \ + --database-path semantic_map.db \ + --yes +``` + +The script will: + +- Show the number of annotations that will be deleted +- Prompt for confirmation (unless `--yes` is used) +- Delete the annotations and report the count deleted diff --git a/src/rai_semap/design.md b/src/rai_semap/design.md new file mode 100644 index 000000000..c7b24af65 --- /dev/null +++ b/src/rai_semap/design.md @@ -0,0 +1,577 @@ +# RAI's Agent Memory System + +## Table of Contents + +- [Problem Definition](#problem-definition) + - [Q & A](#q--a) +- [Concepts](#concepts) + - [High-Level Concepts](#high-level-concepts) + - [Data Models](#data-models) + - [Relationships](#relationships) + - [Non Goals](#non-goals) +- [Design Proposal](#design-proposal) + - [BaseMemory Interface](#basememory-interface) + - [SemanticMapMemory Interface](#semanticmapmemory-interface) + - [Database Backend Abstraction](#database-backend-abstraction) + - [New Component: rai_semap](#new-component-rai_semap) + - [Usage Patterns from Other Layers](#usage-patterns-from-other-layers) + - [Implementation Phases](#implementation-phases) +- [Reusability](#reusability) +- [External Memory Systems](#external-memory-systems) +- [Appendix](#appendix) + +## Problem Definition + +[Issue#225 Design SLAM RAI features](https://github.com/RobotecAI/rai/issues/225) presents an explorative SLAM/semantic mapping integration task: + +> "Robots often need to find out about their environment first, building a map and localizing themselves on it. During this process, RAI can be used to guide exploration for mapping or to build a semantic map during the SLAM which adds knowledge and memory, which can be used to reason about the area itself and tasks that are to be given in the area. A great start is to design solutions for RAI." + +Based on RAI's current capabilities (perception, navigation, multi-modal interaction), this roughly maps to three areas: + +1. Semantic Perception Layer which may be built on top of `rai_perception` (open-set detection), Grounded SAM 2 integration + with a new semantic annotation pipeline that tags SLAM features/points with object identities during mapping + +2. Agent Guided Exploration Strategy which can be built on existing `rai_nomad` (navigation) where agent decides _where_ to explore based on goals ("find the kitchen", "map storage areas") rather than frontier-based exploration. Frontier-based exploration navigates to boundaries between known and unknown map regions to expand coverage. + +3. Spatial Memory System which provides persistent semantic map storage that agents can query ("where did I see tools?") and reason over ("this room is suitable for assembly tasks"). The word _spatial_ refers to 3D location/position information in map coordinates. + + - The connection between spatial memory and other RAI memory systems (artifact_database.pkl, rai_whoami vector store, StateBasedAgent state) needs to be explored: spatial memory could be queried by these systems to provide spatial context, rather than serving as storage for them. 
+ - For example, could artifacts be annotated with spatial locations queried from spatial memory, could embodiment docs reference spatial locations that spatial memory could ground, or could recent spatial queries be included in StateBasedAgent state to provide spatial awareness during conversations? + +For explorative SLAM, these areas build on each other: perception feeds the semantic map, exploration uses it to guide decisions, and memory enables task reasoning. + +### Q & A + +Q: why "map storage areas" is not a frontier-based exploration? + +"Map storage areas" is goal-based, not frontier-based, because frontier-based chooses the nearest boundary between known and unknown regions, regardless of what might be there. It's geometry-driven. Goal-based ("map storage areas") uses semantic reasoning to prioritize exploration. The agent might: + +- Use partial detections ("I saw a shelf, explore that direction") +- Reason about room layouts ("storage areas are often in basements or corners") +- Query the semantic map for hints about where storage might be + +Both may explore unknown regions, but the decision differs: frontier-based picks the nearest unknown boundary whereas goal-based uses semantic cues to target likely locations. + +## Concepts + +### High-Level Concepts + +#### Semantic Annotation + +**A spatial-semantic record linking an object identity (class label, e.g., "red cup", "tool") to a 3D pose in the map frame, with metadata (timestamp, confidence, detection source).** + +Unlike pure geometric SLAM, semantic annotations enable querying "what" and "where" simultaneously. This allows agents to reason about object locations for task planning and spatial reasoning. The combination of semantic labels and 3D poses creates a rich representation that bridges perception and spatial memory. + +```python +# Example: Semantic annotation structure +{ + "object_class": "red cup", + "pose": {"x": 2.5, "y": 1.3, "z": 0.8, "orientation": {...}}, + "confidence": 0.92, + "timestamp": "2025-01-15T10:23:00", + "detection_source": "GroundingDINO", + "source_frame": "camera_frame" +} +``` + +#### Spatial Memory + +**A conceptual system that provides persistent storage of semantic annotations indexed by both spatial coordinates (3D: x, y, z) and semantic labels.** + +The storage can be implemented via database backends (SQLite/PostGIS) accessed through the `SemanticMapMemory` interface. This dual indexing enables efficient queries like "find objects near (x,y)" (2D projection when z is not needed) and "where did I see X?" by combining spatial indexing with semantic search. Without spatial memory, agents cannot recall where objects were seen, limiting task planning capabilities. + +```python +# Example: Spatial query +memory.query_nearby_objects(x=2.5, y=1.3, radius=2.0) +# Returns: [{"object": "red cup", "distance": 0.5}, +# {"object": "table", "distance": 1.2}] + +# Example: Semantic query +memory.query_by_class("red cup") +# Returns: [{"pose": (2.5, 1.3, 0.8), "confidence": 0.92, ...}] +``` + +#### Camera-to-map Transformation + +**Converting detections from camera frame to map frame using TF transforms.** + +The perception layer (GroundingDINO, GroundedSAM) already provides 3D poses in `ObjectHypothesisWithPose`, so the system transforms these poses from the camera frame to the map frame using TF transforms (camera → base_link → map). This is critical for building a consistent spatial-semantic map across robot movements. 
Without proper frame transformation, detections from different robot positions would be stored in inconsistent coordinate systems, making spatial queries unreliable. + +```python +# Example: Frame transformation flow +# Detection in camera frame +camera_pose = {"x": 0.3, "y": 0.1, "z": 1.2} # relative to camera + +# Transform to map frame via TF +map_pose = tf_buffer.transform( + camera_pose, + target_frame="map", + source_frame="camera_frame" +) +# Result: {"x": 2.5, "y": 1.3, "z": 0.8} # absolute map coordinates + +# Same object detected from different angle → same map coordinates +``` + +#### Temporal Consistency + +**Handling multiple detections of the same object instance over time by merging duplicates based on spatial proximity.** + +Tracks individual instances (by spatial location), not object classes. Without temporal consistency, repeated detections of the same object would create duplicate records, making queries like "where did I see the red cup?" return multiple locations for the same object, rendering the database inconsistent and queries unreliable. Temporal consistency merges repeated detections of the same physical object (same location within a threshold), not different objects even if they share the same class label. A key challenge is distinguishing a moved object (same instance, new location) from a new object instance (different instance, similar appearance). + +```python +# Example: Multiple detections of same object +# Detection 1 at t=0: "red cup" at (2.5, 1.3, 0.8), confidence=0.85 +# Detection 2 at t=5: "red cup" at (2.52, 1.31, 0.81), confidence=0.92 +# → Merged into single annotation with max confidence (0.92) and latest timestamp + +# Example: Different objects (same class, different locations) +# Detection 1: "box" at (1.0, 2.0, 0.5) # Box A +# Detection 2: "box" at (3.0, 4.0, 0.5) # Box B (different instance) +# → Two separate annotations stored +``` + +#### Deduplication Strategies + +**Multiple techniques work together to prevent duplicate annotations: spatial merging, point cloud-based matching, confidence filtering, and bounding box size filtering.** + +These strategies work together to ensure database consistency and query reliability. Confidence and size filtering happen first to remove low-quality detections, then spatial merging with point cloud validation occurs during storage. Without deduplication, the database would be polluted with duplicate entries, making spatial queries return incorrect results and wasting storage. + +**1. Spatial Merging** + +Detections of the same class within a merge threshold (distance in meters) are merged into a single annotation. The merge threshold is class-specific to handle objects of different sizes. When merging, the system keeps the maximum confidence score and updates the timestamp to the latest detection. + +```python +# Example: Class-specific merge thresholds +merge_thresholds = { + "couch": 2.5, # Large objects + "table": 1.5, + "shelf": 1.5, + "chair": 0.8, # Medium objects + "cup": 0.5 # Small objects (default) +} + +# Two "cup" detections within 0.5m → merged +# Two "couch" detections within 2.5m → merged +``` + +**2. Point Cloud-Based Matching** + +When depth images are available, the system extracts 3D point clouds from bounding box regions and uses them for more accurate deduplication. Point cloud centroid is more accurate than bounding box center for spatial matching. Size validation compares 3D point cloud sizes to avoid merging objects of very different sizes. 
If point cloud sizes differ by >50% and >0.5m, detections are treated as different objects even if spatially close. + +```python +# Example: Point cloud validation +detection1 = { + "centroid": (2.5, 1.3, 0.8), + "size_3d": 0.15, # meters + "point_count": 1250 +} +detection2 = { + "centroid": (2.52, 1.31, 0.81), # Close spatially + "size_3d": 0.25, # 67% larger → different object + "point_count": 2100 +} +# Result: Not merged (size difference >50% and >0.5m) +``` + +**3. Confidence Filtering** + +Only detections above a confidence threshold are stored. The threshold is class-specific to handle high false-positive classes. This prevents low-confidence false positives from polluting the database. + +```python +# Example: Class-specific confidence thresholds +confidence_thresholds = { + "person": 0.7, # High false-positive rate + "window": 0.6, + "door": 0.5, + "cup": 0.5 # Default threshold +} +``` + +**4. Bounding Box Size Filtering** + +Very small bounding boxes (below minimum area threshold, default: 500 pixels²) are filtered out as they are often false positives from partial occlusions or detection artifacts. + +```python +# Example: Size filtering +bbox_area = width * height # pixels² +if bbox_area < 500: + # Filtered out (likely false positive) + return None +``` + +#### Query Patterns + +**Primary query types: spatial queries (objects near a location), semantic queries (locations of object classes), and hybrid queries (combining both).** + +These query patterns enable agents to retrieve spatial-semantic information for task planning. Spatial queries support navigation and proximity-based reasoning. Semantic queries enable object retrieval tasks. Hybrid queries combine both for complex scenarios like "find tools in the workshop." + +```python +# Spatial query: "What objects are within 2m of (x,y)?" +results = memory.query_nearby_objects(x=2.5, y=1.3, radius=2.0) + +# Semantic query: "Where did I see a red cup?" +results = memory.query_by_class("red cup") + +# Hybrid query: "Find tools in the workshop" (semantic + spatial region) +results = memory.query_by_class_and_region( + object_class="tool", + region_polygon=[(x1,y1), (x2,y2), (x3,y3), (x4,y4)] +) +``` + +### Data Models + +#### SemanticAnnotation + +**A data structure representing a single semantic annotation with object identity, 3D pose, confidence, and metadata.** + +This is the core data model that stores all semantic-spatial information in the memory system. Each annotation links a detected object to its location in the map frame, enabling spatial queries and temporal consistency tracking. The metadata field allows extensibility for point cloud features and other attributes without changing the core schema. + +```python +@dataclass +class SemanticAnnotation: + id: UUID # Unique identifier + object_class: str # e.g., "red cup", "tool" + pose: Pose # 3D pose in map frame (x, y, z, orientation) + confidence: float # 0.0-1.0 + timestamp: Time # ROS timestamp of detection + detection_source: str # e.g., "GroundingDINO", "GroundedSAM" + source_frame: str # Original camera frame_id + vision_detection_id: str # ID from Detection2D for debugging + metadata: dict # Optional JSON with pointcloud info when available + # metadata.pointcloud: {centroid, size_3d, point_count} +``` + +#### SpatialIndex + +**Database-level spatial index (R-tree) for efficient spatial queries.** + +Spatial indexing is essential for performance when querying large numbers of annotations. 
Without it, spatial queries would require scanning all records, which becomes prohibitively slow as the map grows. SQLite uses SpatiaLite extension; PostGIS uses native GIST indexes. Both provide sub-linear query performance for spatial operations. + +```sql +-- Example: Spatial index creation (SpatiaLite) +CREATE VIRTUAL TABLE annotations_rtree USING rtree( + id, minx, maxx, miny, maxy +); + +-- Example: Efficient spatial query using index +SELECT * FROM annotations +WHERE id IN ( + SELECT id FROM annotations_rtree + WHERE minx <= x+radius AND maxx >= x-radius + AND miny <= y+radius AND maxy >= y-radius +); +``` + +#### MapMetadata + +**Metadata about the SLAM map including frame ID, resolution, origin, and last update timestamp.** + +This metadata enables the system to correctly interpret spatial coordinates and maintain consistency with the underlying SLAM map. The resolution and origin are needed to convert between map coordinates and pixel coordinates for visualization. The last_updated timestamp helps track map freshness and coordinate system changes. + +```python +@dataclass +class MapMetadata: + map_frame_id: str # Frame ID of the SLAM map + resolution: float # OccupancyGrid resolution (meters/pixel) + origin: Pose # Map origin pose + last_updated: Time # Timestamp of last annotation +``` + +### Relationships + +#### Perception Layer → Memory System + +**`RAIDetectionArray` messages flow from `rai_perception` services (GroundingDINO, GroundedSAM) into `rai_semap`, which projects detections to map frame and stores them.** + +The `detection_publisher` node bridges the service-based perception layer to topic-based messaging by subscribing to camera images, calling DINO service, and publishing `RAIDetectionArray` messages to `/detection_array` topic. This decoupling allows the memory system to work with any perception service that publishes `RAIDetectionArray` messages, not just GroundingDINO. The topic-based interface enables multiple consumers and easier debugging. + +```python +# Flow: Camera → detection_publisher → RAIDetectionArray → semantic_map_node +# detection_publisher subscribes to /camera/image_raw +# Calls /grounding_dino/grounding_dino_classify service +# Publishes RAIDetectionArray to /detection_array +# semantic_map_node subscribes to /detection_array and stores annotations +``` + +#### Exploration Layer → Memory System + +**Agent-guided exploration uses semantic map queries to find unexplored regions with specific semantic properties.** + +The memory system returns candidate locations for exploration goals like "find areas with storage furniture". This enables goal-based exploration rather than purely geometric frontier-based exploration. The exploration layer can query for semantic hints ("I saw a shelf, explore that direction") and use coverage tracking to prioritize unexplored regions. + +```python +# Example: Goal-based exploration query +candidates = memory.query_by_class("shelf") +unexplored_regions = exploration.find_unexplored_near(candidates) +# Agent navigates to unexplored regions near detected shelves +``` + +#### Memory System → Agent Tools + +**Agents query semantic map via `QuerySemanticMapTool` to retrieve object locations for task planning.** + +This integration enables multi-step task planning where agents can query object locations, navigate to them, verify presence, and manipulate objects. Without this connection, agents would have no persistent spatial memory and would need to re-detect objects every time, limiting task capabilities. 
+ +```python +# Example: Agent tool usage +tool = QuerySemanticMapTool(memory) +result = tool.invoke("red cup in kitchen") +# Returns: {"object": "red cup", "location": (2.5, 1.3, 0.8), ...} +# Agent uses NavigateToPoseTool to go to location +``` + +### Non Goals + +Future Integration Points: + +- `artifact_database.pkl`: Could store semantic annotations alongside multimodal artifacts +- `rai_whoami` vector store: Could index semantic annotations for LLM-based reasoning +- `StateBasedAgent` state: Could include recent semantic map queries in conversation context + +## Design Proposal + +### BaseMemory Interface + +**A minimal abstract interface for memory systems that allows future memory systems (conversational, vector-based, etc.) to share a common API.** + +Since no `BaseMemory` interface exists in RAI, we define this interface to enable consistent memory system integration across RAI components. This interface allows `SemanticMapMemory` and future memory systems to share a common API while each extends it with domain-specific methods. See `base_memory.py` for the interface definition. + +```python +# Example: BaseMemory interface structure +class BaseMemory(ABC): + @abstractmethod + def store(self, data: Any) -> str: + """Store data and return identifier.""" + pass + + @abstractmethod + def query(self, query: str) -> List[Any]: + """Query stored data.""" + pass +``` + +### SemanticMapMemory Interface + +**`SemanticMapMemory` extends `BaseMemory` with spatial query capabilities for semantic annotations.** + +This interface provides the contract for spatial-semantic memory operations, enabling agent tools and exploration layers to query object locations without depending on the specific database backend implementation. The interface abstracts away backend details (SQLite vs PostGIS) while providing spatial query methods. See `semantic_map_memory.py` for the interface definition. + +```python +# Example: SemanticMapMemory interface methods +class SemanticMapMemory(BaseMemory): + def query_nearby_objects(self, x: float, y: float, radius: float) -> List[SemanticAnnotation]: + """Query objects within radius of (x, y).""" + pass + + def query_by_class(self, object_class: str) -> List[SemanticAnnotation]: + """Query all annotations of a given class.""" + pass +``` + +### Database Backend Abstraction + +**A backend abstraction layer that supports both SQLite (Phase I) and PostGIS (future) implementations.** + +This abstraction enables switching between database backends without changing the `SemanticMapMemory` interface or agent tools. SQLite provides a lightweight, single-file solution for Phase I, while PostGIS enables advanced features for future multi-robot deployments. See `spatial_db_backend.py` for the interface definition. + +**SQLiteBackend (Phase I):** + +Uses SpatiaLite extension for spatial indexing. Single-file database with no external dependencies. Can be deployed on-board the robot (no network or separate server required). Sufficient for single-robot deployments. + +```python +# Example: SQLiteBackend usage +backend = SQLiteBackend("semantic_map.db") +memory = SemanticMapMemory(backend) +# Single file, no server needed +``` + +**PostGISBackend (future):** + +Full PostgreSQL + PostGIS for advanced spatial operations. Supports multi-robot coordination via shared database (cloud or local network server). Better performance for large-scale maps. 
+ +```python +# Example: PostGISBackend usage (future) +backend = PostGISBackend(connection_string="postgresql://...") +memory = SemanticMapMemory(backend) +# Shared database for multi-robot coordination +``` + +Configuration via ROS2 parameters (set via launch file or command line): + +Detection Publisher Node (`rai_semap.ros2.detection_publisher`): + +- `camera_topic`: Camera image topic to subscribe to (default: "/camera/image_raw") +- `detection_topic`: Topic to publish RAIDetectionArray messages (default: "/detection_array") +- `dino_service`: GroundingDINO service name (default: "/grounding_dino/grounding_dino_classify") +- `detection_classes`: Comma-separated list of object classes to detect (default: "person, cup, bottle, box, bag, chair, table, shelf, door, window") +- `detection_interval`: Minimum time between detections in seconds (default: 2.0) +- `box_threshold`: DINO box threshold (default: 0.3) +- `text_threshold`: DINO text threshold (default: 0.25) + +Semantic Map Node (`rai_semap.ros2.node`): + +- `database_path`: Path to SQLite database file (default: "semantic_map.db") +- `confidence_threshold`: Minimum confidence score (0.0-1.0) for storing detections (default: 0.5) +- `class_confidence_thresholds`: Class-specific confidence thresholds as 'class1:threshold1,class2:threshold2' (e.g., 'person:0.7,window:0.6,door:0.5') +- `class_merge_thresholds`: Class-specific merge radii (meters) for deduplication as 'class1:radius1,class2:radius2' (e.g., 'couch:2.5,table:1.5,shelf:1.5,chair:0.8') +- `min_bbox_area`: Minimum bounding box area (pixels²) to filter small false positives (default: 500.0) +- `use_pointcloud_dedup`: Enable point cloud-based deduplication matching (default: true) +- `depth_topic`: Depth image topic for point cloud extraction (optional, required if use_pointcloud_dedup=true) +- `camera_info_topic`: Camera info topic for point cloud extraction (optional, required if use_pointcloud_dedup=true) +- `detection_topic`: Topic for RAIDetectionArray messages (default: "/detection_array") +- `map_topic`: Topic for OccupancyGrid map messages (default: "/map") +- `map_frame_id`: Frame ID of the SLAM map (default: "map") +- `location_id`: Identifier for the physical location (default: "default_location") +- `map_resolution`: OccupancyGrid resolution in meters/pixel (default: 0.05) + +Future: PostGIS backend selection will be configurable (not yet implemented). + +### New Component: `rai_semap` + +Architecture: + +`rai_semap` consists of a core library, a ROS2 node wrapper and tools. + +1. Core Library (`rai_semap.core`): + + - Frame Projection: Transform 3D poses from source frame (camera frame) to map frame using TF transforms. Detections already contain 3D poses in `ObjectHypothesisWithPose`, so depth estimation is handled by the perception layer. + - Temporal Filtering: Merge duplicate detections of same object using multi-strategy deduplication: + - Spatial clustering within class-specific merge thresholds + - Point cloud-based matching when depth data is available (uses 3D centroid and size validation) + - Confidence filtering with class-specific thresholds + - Bounding box size filtering to remove small false positives + - Confidence Aggregation: Update confidence scores for repeated observations (keeps maximum confidence) + - Storage: Persist annotations to `SemanticMapMemory` backend with point cloud features in metadata + - No ROS2 dependencies; pure Python library + +2. 
ROS2 Node Wrapper (`rai_semap.ros2`): + + - `detection_publisher` node: + - Subscribes to camera images (e.g., `/camera/image_raw`) + - Calls GroundingDINO service with configurable object classes + - Publishes `RAIDetectionArray` messages to `/detection_array` topic + - Throttles detections via `detection_interval` parameter to control processing rate + - `node` (semantic map node): + - Subscriptions: + - `RAIDetectionArray` topic (detections from `detection_publisher` or other sources) + - `/map` (OccupancyGrid from SLAM) + - TF handling via `tf2_ros.Buffer` and `TransformListener` (automatically subscribes to `/tf` and `/tf_static`) + - Converts ROS2 messages to core library data structures + - Calls core library processing functions + - Handles ROS2-specific concerns (TF lookups, message conversion) + +3. Tools/Services: + - `QuerySemanticMapTool`: LangChain tool for agent queries + - ROS2 service for programmatic access (not in current scope) + +Dependency Flow: + +``` +Camera Images → detection_publisher → RAIDetectionArray → semantic_map_node → rai_semap.core → SemanticMapMemory → Agent Tools + ↓ ↓ ↓ ↓ ↓ ↓ + /camera/image_raw DINO Service Detection2D ROS2 Wrapper Frame Transform SQLite/PostGIS + (service call) (3D pose, class) (msg conversion) (TF transform) Spatial Queries +``` + +### Usage Patterns from Other Layers + +Perception Layer Requirements: + +- Service-to-topic bridge: The `detection_publisher` node bridges the service-based perception layer (GroundingDINO, GroundedSAM) to topic-based messaging by continuously processing camera images and publishing detections +- Topic-based processing: Handle `RAIDetectionArray` messages published to topics (from `detection_publisher` or other sources) +- Immediate processing: Each detection array is processed immediately upon receipt in the callback +- Confidence filtering: Only store high-confidence detections (configurable via ROS2 parameter `confidence_threshold`) +- Configurable detection rate: The `detection_interval` parameter controls how frequently detections are processed to balance accuracy and computational load + +Exploration Layer Requirements (prioritized): + +1. Coverage tracking: Query which map regions have been semantically annotated (M) + - Foundation for other exploration features + - Requires spatial indexing of annotated regions (grid-based or region-based) + - Grid-based example: Divide map into fixed-size cells (e.g., 0.5m x 0.5m aligned with OccupancyGrid resolution), track which cells contain annotations + - Region-based example: Use spatial clustering or bounding polygons around annotated areas, define regions dynamically based on annotation density +2. Goal-based queries: "Find unexplored regions with 'kitchen' objects" (M) + - Depends on: Coverage tracking, semantic queries + - Combines semantic search with coverage information +3. Frontier detection: Identify boundaries between mapped and unmapped semantic regions (L) + - Depends on: Coverage tracking + - Requires boundary detection algorithms and spatial analysis + +Agent Tool Integration: + +- Natural language queries: `QuerySemanticMapTool("red cup in kitchen")` → spatial query +- Multi-step planning: Query → Navigate → Verify → Manipulate +- Temporal reasoning: "Where did I see X yesterday?" 
(requires timestamp filtering) + +### Implementation Phases + +Phase I (SQLite): + +- Implement `SQLiteBackend` with SpatiaLite +- Basic `SemanticMapMemory` with spatial queries +- `rai_semap` node with frame projection +- `QuerySemanticMapTool` for agent integration +- Single-robot deployment +- Validation demo using rosbot-xl: Build semantic map during navigation, query object locations (e.g., "Where did I see the bed?"), verify detections are correctly stored and retrieved + +Future direction (PostGIS Migration): + +- Implement `PostGISBackend` with same interface +- Configuration-based backend switching +- Multi-robot coordination support +- Advanced spatial operations (polygon queries, distance calculations) + +## Reusability + +`rai_semap` may be reusable for object retrieval scenario which robot finds and retrieves objects it saw earlier. For example, after initial mapping, user asks: _"Bring me the red cup I saw in the kitchen"_ + +- Flow: + +1. Agent queries semantic map: `QuerySemanticMapTool("red cup", room="kitchen")` +2. Semantic map returns: `{object: "red cup", location: (x: 2.5, y: 1.3, z: 0.8), timestamp: "2025-01-15T10:23:00", confidence: 0.92}` +3. Agent uses `NavigateToPoseTool` to go to that location +4. Agent uses `GetDetectionTool` to confirm object presence +5. Agent uses manipulation tools to grab and return the cup + +- Benefits: + - Persistent memory: remembers objects across sessions + - Spatial reasoning: knows where things are, not just what they are + - Task planning: can plan multi-step retrieval tasks + +More scenarios. These are yet to be explored, listed here just for future revisit of the design. + +- Inventory tracking: "What tools are in the workshop?" +- Change detection: "Did anything move in the living room?" +- Multi-robot coordination: share semantic map between robots +- Long-term monitoring: track object locations over days/weeks + +## External Memory Systems + +### mem0 + +[mem0](https://github.com/mem0ai/mem0) is a mature implementation (43.8k stars, production-ready). It's not a good fit for RAI. mem0 targets conversational memory, while RAI needs spatial-semantic storage with pose queries. + +### ROS semantic_mapping + +C++/ROS1, no Python API or SQLite/PostGIS backend, [source reference](https://github.com/fdayoub/ros-semantic-mapper). + +### KnowRob + +Knowledge reasoning, not spatial-semantic storage, [source reference](https://github.com/knowrob/knowrob) + +### SEGO (Semantic Graph Ontology) + +Research framework, no production storage backend, [paper](https://arxiv.org/abs/2506.13149) + +### Semantic SLAM projects + +Mostly C++ (ORB-SLAM2 etc) not Python with database backends. [orb-slam2 source reference](https://github.com/appliedAI-Initiative/orb_slam_2_ros) +The frame viewer from this [post](https://records.sigmm.org/?open-source-item=openvslam-a-versatile-visual-slam-framework) is fantastic for visualization. + +## Appendix + +### PostGIS + +PostGIS is an extension for PostgreSQL, not a separate database. It adds spatial data types and functions (geometry, geography, spatial indexing, spatial queries). After installing PostgreSQL, then enable the PostGIS extension with `CREATE EXTENSION postgis;`. 
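For reference, the sketch below illustrates the kind of schema and spatial query a future `PostGISBackend` could build on. It assumes a reachable PostgreSQL server with the PostGIS extension and the `psycopg2` driver; the connection parameters, table layout, and column names are illustrative only and are not part of the current Phase I implementation.

```python
import psycopg2

# Connection parameters are placeholders for a local or shared PostGIS server
conn = psycopg2.connect(dbname="semantic_map", user="rai", host="localhost")
cur = conn.cursor()

# Enable PostGIS, then create a minimal annotations table with a GIST spatial index
cur.execute("CREATE EXTENSION IF NOT EXISTS postgis;")
cur.execute(
    """
    CREATE TABLE IF NOT EXISTS annotations (
        id UUID PRIMARY KEY,
        object_class TEXT NOT NULL,
        confidence DOUBLE PRECISION,
        position GEOMETRY(PointZ)  -- map-frame coordinates, no geographic SRID
    );
    """
)
cur.execute(
    "CREATE INDEX IF NOT EXISTS annotations_position_idx "
    "ON annotations USING GIST (position);"
)
conn.commit()

# Spatial query: objects within 2 m (planar distance) of map point (2.5, 1.3)
cur.execute(
    """
    SELECT object_class, confidence
    FROM annotations
    WHERE ST_DWithin(position, ST_MakePoint(2.5, 1.3, 0.8), 2.0);
    """
)
for object_class, confidence in cur.fetchall():
    print(object_class, confidence)
```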
diff --git a/src/rai_semap/pyproject.toml b/src/rai_semap/pyproject.toml new file mode 100644 index 000000000..ba08882d3 --- /dev/null +++ b/src/rai_semap/pyproject.toml @@ -0,0 +1,25 @@ +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.poetry] +name = "rai_semap" +version = "0.1.0" +description = "Semantic mapping and spatial memory for RAI" +authors = ["Julia Jia"] +readme = "rai_semap/README.md" +classifiers = [ + "Programming Language :: Python :: 3", + "Development Status :: 4 - Beta", + "License :: OSI Approved :: Apache Software License", +] +packages = [ + { include = "rai_semap", from = "." }, +] + +[tool.poetry.dependencies] +python = "^3.10, <3.13" +rai_core = {path = "../rai_core", develop = true} + +[tool.poetry.group.dev.dependencies] +pytest = "^8.2.0" diff --git a/src/rai_semap/rai_semap/__init__.py b/src/rai_semap/rai_semap/__init__.py new file mode 100644 index 000000000..c8b4abf7f --- /dev/null +++ b/src/rai_semap/rai_semap/__init__.py @@ -0,0 +1,13 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/rai_semap/rai_semap/core/__init__.py b/src/rai_semap/rai_semap/core/__init__.py new file mode 100644 index 000000000..c8b4abf7f --- /dev/null +++ b/src/rai_semap/rai_semap/core/__init__.py @@ -0,0 +1,13 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/rai_semap/rai_semap/core/backend/__init__.py b/src/rai_semap/rai_semap/core/backend/__init__.py new file mode 100644 index 000000000..c8b4abf7f --- /dev/null +++ b/src/rai_semap/rai_semap/core/backend/__init__.py @@ -0,0 +1,13 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/src/rai_semap/rai_semap/core/backend/spatial_db_backend.py b/src/rai_semap/rai_semap/core/backend/spatial_db_backend.py new file mode 100644 index 000000000..76074a4f7 --- /dev/null +++ b/src/rai_semap/rai_semap/core/backend/spatial_db_backend.py @@ -0,0 +1,65 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from geometry_msgs.msg import Point + +if TYPE_CHECKING: + from rai_semap.core.semantic_map_memory import SemanticAnnotation + + +class SpatialDBBackend(ABC): + """Abstract backend for spatial database operations.""" + + @abstractmethod + def init_schema(self) -> None: + """Initialize database schema with spatial extensions.""" + pass + + @abstractmethod + def insert_annotation(self, annotation: "SemanticAnnotation") -> str: + """Insert annotation with spatial indexing.""" + pass + + @abstractmethod + def spatial_query( + self, center: Point, radius: float, filters: Optional[Dict[str, Any]] = None + ) -> List["SemanticAnnotation"]: + """Execute spatial query with optional filters.""" + pass + + @abstractmethod + def delete_annotation(self, annotation_id: str) -> bool: + """Delete annotation by ID. Returns success status.""" + pass + + @abstractmethod + def delete_all_annotations(self, location_id: Optional[str] = None) -> int: + """Delete all annotations, optionally filtered by location_id. + + Args: + location_id: If provided, only delete annotations for this location. + If None, delete all annotations. + + Returns: + Number of annotations deleted. + """ + pass + + @abstractmethod + def update_annotation(self, annotation: "SemanticAnnotation") -> bool: + """Update existing annotation by ID. Returns success status.""" + pass diff --git a/src/rai_semap/rai_semap/core/backend/sqlite_backend.py b/src/rai_semap/rai_semap/core/backend/sqlite_backend.py new file mode 100644 index 000000000..779c047c0 --- /dev/null +++ b/src/rai_semap/rai_semap/core/backend/sqlite_backend.py @@ -0,0 +1,342 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import logging +import sqlite3 +from typing import Any, Dict, List, Optional + +from geometry_msgs.msg import Point, Pose + +from rai_semap.core.backend.spatial_db_backend import SpatialDBBackend +from rai_semap.core.semantic_map_memory import SemanticAnnotation + +logger = logging.getLogger(__name__) + + +class SQLiteBackend(SpatialDBBackend): + """SQLite backend with SpatiaLite extension for spatial indexing.""" + + def __init__(self, database_path: str): + self.database_path = database_path + self.conn: Optional[sqlite3.Connection] = None + + def _get_connection(self) -> sqlite3.Connection: + """Get or create database connection.""" + if self.conn is None: + logger.info(f"Creating SQLite connection to: {self.database_path}") + self.conn = sqlite3.connect(self.database_path, check_same_thread=False) + self.conn.row_factory = sqlite3.Row + # Enable WAL mode for better concurrency and ensure data is written + journal_mode = self.conn.execute("PRAGMA journal_mode=WAL").fetchone()[0] + logger.debug(f"SQLite journal mode: {journal_mode}") + # Use NORMAL synchronous mode (balance between safety and performance) + # FULL is safer but slower, OFF is faster but riskier + self.conn.execute("PRAGMA synchronous=NORMAL") + logger.info(f"SQLite connection established to {self.database_path}") + return self.conn + + def init_schema(self) -> None: + """Initialize database schema with spatial extensions.""" + conn = self._get_connection() + cursor = conn.cursor() + + logger.info("Initializing database schema") + cursor.execute(""" + CREATE TABLE IF NOT EXISTS annotations ( + id TEXT PRIMARY KEY, + object_class TEXT NOT NULL, + x REAL NOT NULL, + y REAL NOT NULL, + z REAL NOT NULL, + qx REAL, + qy REAL, + qz REAL, + qw REAL, + confidence REAL NOT NULL, + timestamp TEXT NOT NULL, + detection_source TEXT NOT NULL, + source_frame TEXT NOT NULL, + location_id TEXT NOT NULL, + vision_detection_id TEXT, + metadata TEXT + ) + """) + + cursor.execute(""" + CREATE INDEX IF NOT EXISTS idx_object_class ON annotations(object_class) + """) + + cursor.execute(""" + CREATE INDEX IF NOT EXISTS idx_confidence ON annotations(confidence) + """) + + cursor.execute(""" + CREATE INDEX IF NOT EXISTS idx_location_id ON annotations(location_id) + """) + + conn.commit() + logger.info("Database schema initialized successfully") + + def insert_annotation(self, annotation: SemanticAnnotation) -> str: + """Insert annotation with spatial indexing.""" + conn = self._get_connection() + cursor = conn.cursor() + + x = annotation.pose.position.x + y = annotation.pose.position.y + z = annotation.pose.position.z + qx = annotation.pose.orientation.x + qy = annotation.pose.orientation.y + qz = annotation.pose.orientation.z + qw = annotation.pose.orientation.w + + metadata_json = json.dumps(annotation.metadata) if annotation.metadata else None + + logger.info( + f"Inserting annotation: id={annotation.id}, class={annotation.object_class}, " + f"pos=({x:.2f}, {y:.2f}, {z:.2f}), confidence={annotation.confidence:.3f}, " + f"location_id={annotation.location_id}" + ) + + try: + cursor.execute( + """ + INSERT INTO annotations ( + id, object_class, x, y, z, qx, qy, qz, qw, + confidence, timestamp, detection_source, source_frame, + location_id, vision_detection_id, metadata + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ """, + ( + annotation.id, + annotation.object_class, + x, + y, + z, + qx, + qy, + qz, + qw, + annotation.confidence, + str(annotation.timestamp), + annotation.detection_source, + annotation.source_frame, + annotation.location_id, + annotation.vision_detection_id, + metadata_json, + ), + ) + + conn.commit() + logger.debug(f"Committed annotation {annotation.id} to database") + + # Verify the annotation was actually stored + cursor.execute( + "SELECT COUNT(*) FROM annotations WHERE id = ?", (annotation.id,) + ) + count = cursor.fetchone()[0] + if count == 1: + logger.info(f"✓ Verified annotation {annotation.id} stored in database") + else: + logger.error( + f"✗ FAILED to verify annotation {annotation.id} in database " + f"(found {count} rows, expected 1)" + ) + + return annotation.id + except sqlite3.Error as e: + logger.error(f"SQLite error inserting annotation {annotation.id}: {e}") + conn.rollback() + raise + + def spatial_query( + self, center: Point, radius: float, filters: Optional[Dict[str, Any]] = None + ) -> List[SemanticAnnotation]: + """Execute spatial query with optional filters.""" + conn = self._get_connection() + cursor = conn.cursor() + + query = """ + SELECT * FROM annotations + WHERE ((x - ?) * (x - ?) + (y - ?) * (y - ?) + (z - ?) * (z - ?)) <= (? * ?) + """ + params = [ + center.x, + center.x, + center.y, + center.y, + center.z, + center.z, + radius, + radius, + ] + + if filters: + if "object_class" in filters: + query += " AND object_class = ?" + params.append(filters["object_class"]) + if "confidence_threshold" in filters: + query += " AND confidence >= ?" + params.append(filters["confidence_threshold"]) + if "location_id" in filters: + query += " AND location_id = ?" + params.append(filters["location_id"]) + + cursor.execute(query, params) + rows = cursor.fetchall() + + annotations = [] + for row in rows: + pose = Pose() + pose.position.x = row["x"] + pose.position.y = row["y"] + pose.position.z = row["z"] + pose.orientation.x = row["qx"] or 0.0 + pose.orientation.y = row["qy"] or 0.0 + pose.orientation.z = row["qz"] or 0.0 + pose.orientation.w = row["qw"] or 1.0 + + metadata = json.loads(row["metadata"]) if row["metadata"] else {} + + annotation = SemanticAnnotation( + id=row["id"], + object_class=row["object_class"], + pose=pose, + confidence=row["confidence"], + timestamp=row["timestamp"], + detection_source=row["detection_source"], + source_frame=row["source_frame"], + location_id=row["location_id"], + vision_detection_id=row["vision_detection_id"], + metadata=metadata, + ) + annotations.append(annotation) + + return annotations + + def delete_annotation(self, annotation_id: str) -> bool: + """Delete annotation by ID. Returns success status.""" + conn = self._get_connection() + cursor = conn.cursor() + + logger.info(f"Deleting annotation: id={annotation_id}") + cursor.execute("DELETE FROM annotations WHERE id = ?", (annotation_id,)) + rows_deleted = cursor.rowcount + conn.commit() + + if rows_deleted > 0: + logger.info(f"✓ Deleted annotation {annotation_id} from database") + else: + logger.warn(f"✗ Annotation {annotation_id} not found for deletion") + + return rows_deleted > 0 + + def delete_all_annotations(self, location_id: Optional[str] = None) -> int: + """Delete all annotations, optionally filtered by location_id. + + Args: + location_id: If provided, only delete annotations for this location. + If None, delete all annotations. + + Returns: + Number of annotations deleted. 
+ """ + conn = self._get_connection() + cursor = conn.cursor() + + if location_id: + logger.info(f"Deleting all annotations for location_id={location_id}") + cursor.execute( + "DELETE FROM annotations WHERE location_id = ?", (location_id,) + ) + else: + logger.info("Deleting all annotations from database") + cursor.execute("DELETE FROM annotations") + + rows_deleted = cursor.rowcount + conn.commit() + + logger.info(f"✓ Deleted {rows_deleted} annotation(s) from database") + return rows_deleted + + def update_annotation(self, annotation: SemanticAnnotation) -> bool: + """Update existing annotation by ID. Returns success status.""" + conn = self._get_connection() + cursor = conn.cursor() + + x = annotation.pose.position.x + y = annotation.pose.position.y + z = annotation.pose.position.z + qx = annotation.pose.orientation.x + qy = annotation.pose.orientation.y + qz = annotation.pose.orientation.z + qw = annotation.pose.orientation.w + + metadata_json = json.dumps(annotation.metadata) if annotation.metadata else None + + logger.info( + f"Updating annotation: id={annotation.id}, class={annotation.object_class}, " + f"pos=({x:.2f}, {y:.2f}, {z:.2f}), confidence={annotation.confidence:.3f}" + ) + + try: + cursor.execute( + """ + UPDATE annotations SET + object_class = ?, + x = ?, y = ?, z = ?, + qx = ?, qy = ?, qz = ?, qw = ?, + confidence = ?, + timestamp = ?, + detection_source = ?, + source_frame = ?, + location_id = ?, + vision_detection_id = ?, + metadata = ? + WHERE id = ? + """, + ( + annotation.object_class, + x, + y, + z, + qx, + qy, + qz, + qw, + annotation.confidence, + str(annotation.timestamp), + annotation.detection_source, + annotation.source_frame, + annotation.location_id, + annotation.vision_detection_id, + metadata_json, + annotation.id, + ), + ) + + rows_updated = cursor.rowcount + conn.commit() + + if rows_updated > 0: + logger.info(f"✓ Updated annotation {annotation.id} in database") + else: + logger.warn(f"✗ Annotation {annotation.id} not found for update") + + return rows_updated > 0 + except sqlite3.Error as e: + logger.error(f"SQLite error updating annotation {annotation.id}: {e}") + conn.rollback() + raise diff --git a/src/rai_semap/rai_semap/core/base_memory.py b/src/rai_semap/rai_semap/core/base_memory.py new file mode 100644 index 000000000..44b11815f --- /dev/null +++ b/src/rai_semap/rai_semap/core/base_memory.py @@ -0,0 +1,45 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional + + +class BaseMemory(ABC): + """Abstract base class for agent memory systems.""" + + @abstractmethod + def store( + self, key: str, value: Any, metadata: Optional[Dict[str, Any]] = None + ) -> str: + """Store a value with optional metadata. Returns storage ID.""" + pass + + @abstractmethod + def retrieve( + self, query: str, filters: Optional[Dict[str, Any]] = None + ) -> List[Any]: + """Retrieve values matching query and filters. 
+ + Designed for vector database use cases where query is text to embed + for similarity search, and filters are metadata constraints. + Not suitable for spatial databases which require concrete query methods + (e.g., query_by_location, query_by_region). + """ + pass + + @abstractmethod + def delete(self, key: str) -> bool: + """Delete a stored value. Returns success status.""" + pass diff --git a/src/rai_semap/rai_semap/core/semantic_map_memory.py b/src/rai_semap/rai_semap/core/semantic_map_memory.py new file mode 100644 index 000000000..600c53c57 --- /dev/null +++ b/src/rai_semap/rai_semap/core/semantic_map_memory.py @@ -0,0 +1,402 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import uuid +from typing import Any, Dict, List, Optional, Tuple + +from geometry_msgs.msg import Point, Pose + +from rai_semap.core.backend.spatial_db_backend import SpatialDBBackend +from rai_semap.core.base_memory import BaseMemory + + +def _pose_to_dict(pose: Pose) -> Dict[str, Any]: + """Convert Pose to JSON-serializable dictionary.""" + return { + "position": {"x": pose.position.x, "y": pose.position.y, "z": pose.position.z}, + "orientation": { + "x": pose.orientation.x, + "y": pose.orientation.y, + "z": pose.orientation.z, + "w": pose.orientation.w, + }, + } + + +class SemanticAnnotation: + """Spatial-semantic annotation data model.""" + + def __init__( + self, + id: str, + object_class: str, + pose: Pose, + confidence: float, + timestamp: Any, + detection_source: str, + source_frame: str, + location_id: str, + vision_detection_id: Optional[ + str + ] = None, # id from vision pipeline, mostly for debugging + metadata: Optional[Dict[str, Any]] = None, + ): + self.id = id + self.object_class = object_class + self.pose = pose + self.confidence = confidence + self.timestamp = timestamp + self.detection_source = detection_source + self.source_frame = source_frame + self.location_id = location_id + self.vision_detection_id = vision_detection_id + self.metadata = metadata or {} + + def to_dict(self) -> Dict[str, Any]: + """Convert to JSON-serializable dictionary.""" + return { + "id": self.id, + "object_class": self.object_class, + "pose": _pose_to_dict(self.pose), + "confidence": self.confidence, + "timestamp": str(self.timestamp) if self.timestamp is not None else None, + "detection_source": self.detection_source, + "source_frame": self.source_frame, + "location_id": self.location_id, + "vision_detection_id": self.vision_detection_id, + "metadata": self.metadata, + } + + +class MapMetadata: + """Metadata structure for a SLAM map (one per location, not per annotation). + + Tracks properties of the underlying SLAM map and semantic annotation activity. 
+ """ + + def __init__( + self, + location_id: str, + map_frame_id: str, + resolution: float, + origin: Optional[Pose] = None, + last_updated: Optional[Any] = None, + ): + """ + Args: + location_id: Identifier for the physical location (e.g., "warehouse_a", "warehouse_b") + map_frame_id: Frame ID of the SLAM map + resolution: OccupancyGrid resolution (meters/pixel) from SLAM map configuration + origin: Optional map origin pose. Only needed for coordinate transformations + between map frame and world frame. Not required for semantic annotations + that are already stored in map frame. + last_updated: Optional timestamp of last semantic annotation update to this map + """ + self.location_id = location_id + self.map_frame_id = map_frame_id + self.resolution = resolution + self.origin = origin + self.last_updated = last_updated + + def to_dict(self) -> Dict[str, Any]: + """Convert to JSON-serializable dictionary.""" + result = { + "location_id": self.location_id, + "map_frame_id": self.map_frame_id, + "resolution": self.resolution, + "last_updated": str(self.last_updated) + if self.last_updated is not None + else None, + } + if self.origin is not None: + result["origin"] = _pose_to_dict(self.origin) + return result + + +class SemanticMapMemory(BaseMemory): + """Spatial-semantic memory for storing and querying object annotations.""" + + def __init__( + self, + backend: SpatialDBBackend, + location_id: str, + map_frame_id: str = "map", + resolution: float = 0.05, + ): + self.backend = backend + self.location_id = location_id + self.map_frame_id = map_frame_id + self.resolution = resolution + + def store( + self, key: str, value: Any, metadata: Optional[Dict[str, Any]] = None + ) -> str: + """Store a value with optional metadata. Returns storage ID. + + Uses temporal consistency: if nearby annotation of same class exists, + updates it; otherwise inserts new. The key parameter is ignored + (annotation ID is determined by temporal consistency). + + Args: + key: Ignored (kept for BaseMemory interface compatibility) + value: Dict containing required fields including 'object_class' + metadata: Optional additional metadata + """ + if not isinstance(value, dict): + raise TypeError(f"value must be a dict, got {type(value).__name__}") + + required_fields = [ + "object_class", + "pose", + "confidence", + "timestamp", + "detection_source", + "source_frame", + "location_id", + ] + missing = [field for field in required_fields if field not in value] + if missing: + raise ValueError(f"Missing required fields in value: {missing}") + + return self.store_or_update_annotation( + object_class=value["object_class"], + pose=value["pose"], + confidence=value["confidence"], + timestamp=value["timestamp"], + detection_source=value["detection_source"], + source_frame=value["source_frame"], + location_id=value["location_id"], + vision_detection_id=value.get("vision_detection_id"), + metadata=metadata, + ) + + def retrieve( + self, query: str, filters: Optional[Dict[str, Any]] = None + ) -> List[Any]: + """Retrieve values matching query and filters. + + Not implemented. Use concrete query methods instead: + - query_by_class: Query by object class + - query_by_location: Query by center point and radius + - query_by_region: Query by bounding box + """ + raise NotImplementedError( + "Use concrete query methods: query_by_class, query_by_location, or query_by_region" + ) + + def delete(self, key: str) -> bool: + """Delete a stored value by annotation ID. 
Returns success status.""" + return self.backend.delete_annotation(key) + + def delete_all_annotations(self, location_id: Optional[str] = None) -> int: + """Delete all annotations, optionally filtered by location_id. + + Args: + location_id: If provided, only delete annotations for this location. + If None, delete all annotations. If not provided and + self.location_id is set, defaults to self.location_id. + + Returns: + Number of annotations deleted. + """ + if location_id is None: + location_id = self.location_id + return self.backend.delete_all_annotations(location_id) + + def store_annotation( + self, + object_class: str, + pose: Pose, + confidence: float, + timestamp: Any, + detection_source: str, + source_frame: str, + location_id: str, + vision_detection_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + annotation_id: Optional[str] = None, + ) -> str: + """Store a semantic annotation. Returns annotation ID.""" + if annotation_id is None: + annotation_id = str(uuid.uuid4()) + annotation = SemanticAnnotation( + id=annotation_id, + object_class=object_class, + pose=pose, + confidence=confidence, + timestamp=timestamp, + detection_source=detection_source, + source_frame=source_frame, + location_id=location_id, + vision_detection_id=vision_detection_id, + metadata=metadata, + ) + return self.backend.insert_annotation(annotation) + + def store_or_update_annotation( + self, + object_class: str, + pose: Pose, + confidence: float, + timestamp: Any, + detection_source: str, + source_frame: str, + location_id: str, + vision_detection_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + merge_threshold: float = 0.5, + ) -> str: + """Store or update annotation with temporal consistency. + + Queries nearby annotations of the same class and location. If found within merge_threshold, + updates existing annotation. Otherwise, inserts new annotation. 
+ + Args: + location_id: Identifier for the physical location + merge_threshold: Distance threshold (meters) for merging duplicate detections + Other args: Same as store_annotation + + Returns: + Annotation ID (existing if updated, new if inserted) + """ + center = Point(x=pose.position.x, y=pose.position.y, z=pose.position.z) + nearby = self.query_by_location( + center, + radius=merge_threshold, + object_class=object_class, + location_id=location_id, + ) + + if nearby: + # Update first match (closest) + existing = nearby[0] + updated = SemanticAnnotation( + id=existing.id, + object_class=object_class, + pose=pose, + confidence=max( + existing.confidence, confidence + ), # Keep higher confidence + timestamp=timestamp, # Update to latest timestamp + detection_source=detection_source, + source_frame=source_frame, + location_id=location_id, + vision_detection_id=vision_detection_id, + metadata=metadata or existing.metadata, + ) + self.backend.update_annotation(updated) + return existing.id + else: + # Insert new + return self.store_annotation( + object_class=object_class, + pose=pose, + confidence=confidence, + timestamp=timestamp, + detection_source=detection_source, + source_frame=source_frame, + location_id=location_id, + vision_detection_id=vision_detection_id, + metadata=metadata, + ) + + def query_by_class( + self, + object_class: str, + confidence_threshold: float = 0.5, + limit: Optional[int] = None, + location_id: Optional[str] = None, + ) -> List[SemanticAnnotation]: + """Query annotations by object class.""" + filters = { + "object_class": object_class, + "confidence_threshold": confidence_threshold, + "location_id": location_id or self.location_id, + } + center = Point(x=0.0, y=0.0, z=0.0) + results = self.backend.spatial_query(center, radius=1e10, filters=filters) + if limit is not None: + results = results[:limit] + return results + + def query_by_location( + self, + center: Point, + radius: float, + object_class: Optional[str] = None, + location_id: Optional[str] = None, + ) -> List[SemanticAnnotation]: + """Query annotations within radius of center point.""" + filters = {"location_id": location_id or self.location_id} + if object_class: + filters["object_class"] = object_class + return self.backend.spatial_query(center, radius, filters=filters) + + def query_by_region( + self, + bbox: Tuple[float, float, float, float], # (min_x, min_y, max_x, max_y) + object_class: Optional[str] = None, + location_id: Optional[str] = None, + ) -> List[SemanticAnnotation]: + """Query annotations within bounding box region.""" + min_x, min_y, max_x, max_y = bbox + center_x = (min_x + max_x) / 2.0 + center_y = (min_y + max_y) / 2.0 + radius = max(max_x - min_x, max_y - min_y) / 2.0 + + center = Point(x=center_x, y=center_y, z=0.0) + filters = {"location_id": location_id or self.location_id} + if object_class: + filters["object_class"] = object_class + + results = self.backend.spatial_query(center, radius, filters=filters) + + filtered_results = [] + for annotation in results: + x = annotation.pose.position.x + y = annotation.pose.position.y + if min_x <= x <= max_x and min_y <= y <= max_y: + filtered_results.append(annotation) + + return filtered_results + + def get_map_metadata(self) -> MapMetadata: + """Get metadata for the current SLAM map. + + Returns one MapMetadata instance per location (not per annotation). + Computes last_updated from the most recent annotation timestamp for this location. + map_frame_id and resolution come from instance configuration. 
+ """ + # Get most recent annotation timestamp for this location + center = Point(x=0.0, y=0.0, z=0.0) + filters = {"location_id": self.location_id} + all_annotations = self.backend.spatial_query( + center, radius=1e10, filters=filters + ) + + last_updated = None + if all_annotations: + timestamps = [ + ann.timestamp for ann in all_annotations if ann.timestamp is not None + ] + if timestamps: + last_updated = max(timestamps) + + return MapMetadata( + location_id=self.location_id, + map_frame_id=self.map_frame_id, + resolution=self.resolution, + origin=None, + last_updated=last_updated, + ) diff --git a/src/rai_semap/rai_semap/ros2/__init__.py b/src/rai_semap/rai_semap/ros2/__init__.py new file mode 100644 index 000000000..c8b4abf7f --- /dev/null +++ b/src/rai_semap/rai_semap/ros2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/rai_semap/rai_semap/ros2/detection_publisher.py b/src/rai_semap/rai_semap/ros2/detection_publisher.py new file mode 100644 index 000000000..faab3729b --- /dev/null +++ b/src/rai_semap/rai_semap/ros2/detection_publisher.py @@ -0,0 +1,474 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import rclpy +import yaml +from cv_bridge import CvBridge +from rcl_interfaces.msg import ParameterDescriptor, ParameterType +from rclpy.node import Node +from rclpy.qos import qos_profile_sensor_data +from sensor_msgs.msg import CameraInfo, Image + +from rai_interfaces.msg import RAIDetectionArray +from rai_interfaces.srv import RAIGroundingDino +from rai_semap.ros2.perception_utils import enhance_detection_with_3d_pose + + +# NOTE: This module contains perception layer logic that may belong to rai_perception. +# It performs object detection and 3D pose computation, which are general perception +# tasks not specific to semantic mapping. Consider moving to rai_perception when +# that package has ROS2 node infrastructure in place. 
+class DetectionPublisher(Node): + """ROS2 node that subscribes to camera images, calls DINO service, and publishes detections.""" + + def __init__(self): + super().__init__("detection_publisher") + self._initialize_parameters() + self.bridge = CvBridge() + self._initialize_clients() + self._initialize_subscriptions() + self._initialize_publishers() + self.last_image: Optional[Image] = None + self.last_depth_image: Optional[Image] = None + self.last_camera_info: Optional[CameraInfo] = None + self.last_detection_time = 0.0 + + def _initialize_parameters(self): + """Initialize ROS2 parameters from YAML files.""" + # Get directory containing this file + current_dir = Path(__file__).parent + + # Load detection_publisher parameters + detection_pub_yaml = current_dir / "detection_publisher.yaml" + with open(detection_pub_yaml, "r") as f: + detection_pub_config = yaml.safe_load(f) + detection_pub_params = detection_pub_config.get("detection_publisher", {}).get( + "ros__parameters", {} + ) + + # Load perception_utils parameters + perception_utils_yaml = current_dir / "perception_utils.yaml" + with open(perception_utils_yaml, "r") as f: + perception_utils_config = yaml.safe_load(f) + perception_utils_params = perception_utils_config.get( + "perception_utils", {} + ).get("ros__parameters", {}) + + # Declare detection_publisher parameters + parameters = [ + ( + "camera_topic", + detection_pub_params.get("camera_topic", "/camera/image_raw"), + ParameterType.PARAMETER_STRING, + "Camera image topic to subscribe to", + ), + ( + "detection_topic", + detection_pub_params.get("detection_topic", "/detection_array"), + ParameterType.PARAMETER_STRING, + "Topic to publish RAIDetectionArray messages", + ), + ( + "dino_service", + detection_pub_params.get("dino_service", "/grounding_dino_classify"), + ParameterType.PARAMETER_STRING, + "GroundingDINO service name", + ), + ( + "detection_classes", + detection_pub_params.get( + "detection_classes", + "person, cup, bottle, box, bag, chair, table, shelf, door, window, couch, sofa, bed", + ), + ParameterType.PARAMETER_STRING, + "Comma-separated list of object classes to detect. 
Format: 'class1:threshold1, class2, class3:threshold3' where classes without thresholds use default_class_threshold", + ), + ( + "default_class_threshold", + detection_pub_params.get("default_class_threshold", 0.3), + ParameterType.PARAMETER_DOUBLE, + "Default box threshold for classes without explicit threshold in detection_classes", + ), + ( + "detection_interval", + detection_pub_params.get("detection_interval", 2.0), + ParameterType.PARAMETER_DOUBLE, + "Minimum time between detections (seconds)", + ), + ( + "box_threshold", + detection_pub_params.get("box_threshold", 0.3), + ParameterType.PARAMETER_DOUBLE, + "DINO box threshold (used as minimum for DINO call to get all detections)", + ), + ( + "text_threshold", + detection_pub_params.get("text_threshold", 0.25), + ParameterType.PARAMETER_DOUBLE, + "DINO text threshold", + ), + ] + + for name, default, param_type, description in parameters: + self.declare_parameter( + name, + default, + descriptor=ParameterDescriptor( + type=param_type, + description=description, + ), + ) + + # Declare perception_utils parameters + perception_params = [ + ( + "depth_topic", + perception_utils_params.get("depth_topic", ""), + ParameterType.PARAMETER_STRING, + "Depth image topic (optional, for 3D pose computation)", + ), + ( + "camera_info_topic", + perception_utils_params.get("camera_info_topic", ""), + ParameterType.PARAMETER_STRING, + "Camera info topic (optional, for 3D pose computation)", + ), + ( + "depth_fallback_region_size", + perception_utils_params.get("depth_fallback_region_size", 5), + ParameterType.PARAMETER_INTEGER, + "Region size for depth fallback when center pixel has no depth", + ), + ] + + for name, default, param_type, description in perception_params: + self.declare_parameter( + name, + default, + descriptor=ParameterDescriptor( + type=param_type, + description=description, + ), + ) + + def _get_string_parameter(self, name: str) -> str: + """Get string parameter value.""" + return self.get_parameter(name).get_parameter_value().string_value + + def _get_double_parameter(self, name: str) -> float: + return self.get_parameter(name).get_parameter_value().double_value + + def _get_integer_parameter(self, name: str) -> int: + return self.get_parameter(name).get_parameter_value().integer_value + + def _parse_detection_classes( + self, detection_classes_str: str + ) -> Tuple[List[str], Dict[str, float]]: + """Parse detection_classes string to extract class names and per-class thresholds. + + Format: "class1:threshold1, class2, class3:threshold3" + Classes without explicit thresholds use default_class_threshold. 
+ + Returns: + Tuple of (class_names_list, class_thresholds_dict) + """ + default_threshold = self._get_double_parameter("default_class_threshold") + class_names = [] + class_thresholds = {} + + for item in detection_classes_str.split(","): + item = item.strip() + if not item: + continue + + if ":" in item: + class_name, threshold_str = item.split(":", 1) + class_name = class_name.strip() + try: + threshold = float(threshold_str.strip()) + class_names.append(class_name) + class_thresholds[class_name] = threshold + except ValueError: + self.get_logger().warn( + f"Invalid threshold value in '{item}', using default" + ) + class_names.append(class_name) + class_thresholds[class_name] = default_threshold + else: + class_name = item.strip() + class_names.append(class_name) + class_thresholds[class_name] = default_threshold + + return class_names, class_thresholds + + def _initialize_clients(self): + """Initialize service clients.""" + dino_service = self._get_string_parameter("dino_service") + self.dino_client = self.create_client(RAIGroundingDino, dino_service) + self.get_logger().info(f"Waiting for DINO service: {dino_service}") + if not self.dino_client.wait_for_service(timeout_sec=10.0): + self.get_logger().warn(f"DINO service not available: {dino_service}") + else: + self.get_logger().info(f"DINO service ready: {dino_service}") + + def _initialize_subscriptions(self): + """Initialize ROS2 subscriptions.""" + camera_topic = self._get_string_parameter("camera_topic") + self.image_subscription = self.create_subscription( + Image, camera_topic, self.image_callback, qos_profile_sensor_data + ) + self.get_logger().info( + f"Subscribed to camera topic: {camera_topic} " + f"(QoS: {qos_profile_sensor_data.reliability.name})" + ) + + # Optional depth and camera info subscriptions for 3D pose computation + depth_topic = self._get_string_parameter("depth_topic") + camera_info_topic = self._get_string_parameter("camera_info_topic") + + if depth_topic: + self.depth_subscription = self.create_subscription( + Image, depth_topic, self.depth_callback, qos_profile_sensor_data + ) + self.get_logger().info(f"Subscribed to depth topic: {depth_topic}") + else: + self.depth_subscription = None + self.get_logger().info( + "No depth topic provided, 3D poses will not be computed" + ) + + if camera_info_topic: + self.camera_info_subscription = self.create_subscription( + CameraInfo, + camera_info_topic, + self.camera_info_callback, + qos_profile_sensor_data, + ) + self.get_logger().info( + f"Subscribed to camera info topic: {camera_info_topic}" + ) + else: + self.camera_info_subscription = None + self.get_logger().info( + "No camera info topic provided, 3D poses will not be computed" + ) + + def _initialize_publishers(self): + """Initialize ROS2 publishers.""" + detection_topic = self._get_string_parameter("detection_topic") + self.detection_publisher = self.create_publisher( + RAIDetectionArray, detection_topic, 10 + ) + self.get_logger().info(f"Publishing to detection topic: {detection_topic}") + + def depth_callback(self, msg: Image): + """Store latest depth image.""" + self.last_depth_image = msg + + def camera_info_callback(self, msg: CameraInfo): + """Store latest camera info.""" + self.last_camera_info = msg + + def image_callback(self, msg: Image): + """Process incoming camera image.""" + self.get_logger().info( + f"Received camera image (stamp: {msg.header.stamp.sec}.{msg.header.stamp.nanosec:09d}, " + f"frame_id: {msg.header.frame_id})" + ) + current_time = time.time() + detection_interval = 
self._get_double_parameter("detection_interval") + + # Throttle detections + if current_time - self.last_detection_time < detection_interval: + time_since_last = current_time - self.last_detection_time + self.get_logger().debug( + f"Throttling: {time_since_last:.2f}s since last detection (interval: {detection_interval}s)" + ) + return + + self.last_image = msg + self.get_logger().info("Processing camera image...") + self._process_image(msg) + + def _process_image(self, image_msg: Image): + """Call DINO service and publish detections.""" + if not self.dino_client.wait_for_service(timeout_sec=0.1): + self.get_logger().warn("DINO service not ready, skipping detection") + return + + detection_classes_str = self._get_string_parameter("detection_classes") + class_names, class_thresholds = self._parse_detection_classes( + detection_classes_str + ) + + # Use minimum threshold for DINO call to ensure we get all relevant detections + # Results will be filtered by per-class thresholds in _handle_dino_response + min_threshold = ( + min(class_thresholds.values()) + if class_thresholds + else self._get_double_parameter("default_class_threshold") + ) + box_threshold = min(self._get_double_parameter("box_threshold"), min_threshold) + text_threshold = self._get_double_parameter("text_threshold") + + # Store class_thresholds for filtering in response handler + self._current_class_thresholds = class_thresholds + + request = RAIGroundingDino.Request() + request.source_img = image_msg + request.classes = ", ".join(class_names) + request.box_threshold = box_threshold + request.text_threshold = text_threshold + + self.get_logger().info( + f"Calling DINO service with classes: {', '.join(class_names)} " + f"(box_threshold={box_threshold:.3f}, per-class thresholds: {class_thresholds})" + ) + + future = self.dino_client.call_async(request) + future.add_done_callback( + lambda f: self._handle_dino_response(f, image_msg.header) + ) + + def _handle_dino_response(self, future, image_header): + """Handle DINO service response.""" + try: + response = future.result() + if response is None: + self.get_logger().warn("DINO service returned None") + return + + # Get class thresholds for filtering (set in _process_image) + class_thresholds = getattr(self, "_current_class_thresholds", {}) + + # Filter detections by per-class thresholds + filtered_detections = [] + for det in response.detections.detections: + if det.results and len(det.results) > 0: + result = det.results[0] + class_id = result.hypothesis.class_id + score = result.hypothesis.score + + # Get threshold for this class (use default if not found) + threshold = class_thresholds.get( + class_id, self._get_double_parameter("default_class_threshold") + ) + + if score >= threshold: + filtered_detections.append(det) + else: + self.get_logger().debug( + f"Filtered out {class_id} detection with score {score:.3f} " + f"(threshold: {threshold:.3f})" + ) + else: + # Keep detections without results (shouldn't happen, but be safe) + filtered_detections.append(det) + + # Create RAIDetectionArray message + detection_array = RAIDetectionArray() + detection_array.header = image_header + detection_array.header.frame_id = image_header.frame_id + detection_array.detections = filtered_detections + detection_array.detection_classes = response.detections.detection_classes + + # Ensure each detection has the correct frame_id and enhance with 3D poses + for det in detection_array.detections: + if not det.header.frame_id: + det.header.frame_id = image_header.frame_id + det.header.stamp = 
image_header.stamp + + # Enhance detection with 3D pose if pose is empty + region_size = self._get_integer_parameter("depth_fallback_region_size") + if enhance_detection_with_3d_pose( + det, + self.last_depth_image, + self.last_camera_info, + self.bridge, + region_size, + ): + if det.results and len(det.results) > 0: + result = det.results[0] + computed_pose = result.pose.pose + self.get_logger().debug( + f"Computed 3D pose for {result.hypothesis.class_id}: " + f"({computed_pose.position.x:.3f}, {computed_pose.position.y:.3f}, " + f"{computed_pose.position.z:.3f})" + ) + elif det.results and len(det.results) > 0: + result = det.results[0] + pose = result.pose.pose + if ( + pose.position.x == 0.0 + and pose.position.y == 0.0 + and pose.position.z == 0.0 + ): + self.get_logger().debug( + f"Could not compute 3D pose for {result.hypothesis.class_id} " + f"(depth or camera info not available)" + ) + + # Log detection details for debugging + detection_count = len(detection_array.detections) + if detection_count > 0: + for i, det in enumerate(detection_array.detections): + results_count = len(det.results) if det.results else 0 + if results_count > 0: + result = det.results[0] + self.get_logger().info( + f"Detection {i}: class={result.hypothesis.class_id}, " + f"score={result.hypothesis.score:.3f}, " + f"results_count={results_count}, " + f"frame_id={det.header.frame_id}" + ) + else: + self.get_logger().warn( + f"Detection {i} has no results! frame_id={det.header.frame_id}" + ) + self.get_logger().info( + f"Published {detection_count} detections: {detection_array.detection_classes}" + ) + else: + self.get_logger().info("No detections found in image") + + # Publish detections + self.detection_publisher.publish(detection_array) + self.last_detection_time = time.time() + + except Exception as e: + self.get_logger().error(f"Error processing DINO response: {e}") + + +def main(args=None): + """Main entry point for the detection publisher node.""" + rclpy.init(args=args) + node = DetectionPublisher() + node.get_logger().info("=" * 60) + node.get_logger().info("Detection Publisher Node Started") + node.get_logger().info("=" * 60) + try: + rclpy.spin(node) + except KeyboardInterrupt: + node.get_logger().info("Shutting down detection publisher...") + finally: + node.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/src/rai_semap/rai_semap/ros2/detection_publisher.yaml b/src/rai_semap/rai_semap/ros2/detection_publisher.yaml new file mode 100644 index 000000000..161bd4386 --- /dev/null +++ b/src/rai_semap/rai_semap/ros2/detection_publisher.yaml @@ -0,0 +1,14 @@ +detection_publisher: + ros__parameters: + camera_topic: "/camera/image_raw" + detection_topic: "/detection_array" + dino_service: "/grounding_dino_classify" + # Format: "class1:threshold1, class2, class3:threshold3" + # Classes without explicit thresholds use default_class_threshold + # Example: "person:0.5, cup, bottle:0.4, box" (person uses 0.5, cup uses default, bottle uses 0.4, box uses default) + detection_classes: | + person, cup, bottle, box, bag, chair, table, shelf, door, window, couch, sofa, bed + default_class_threshold: 0.3 + detection_interval: 2.0 + box_threshold: 0.3 + text_threshold: 0.25 diff --git a/src/rai_semap/rai_semap/ros2/node.py b/src/rai_semap/rai_semap/ros2/node.py new file mode 100644 index 000000000..8f8d05899 --- /dev/null +++ b/src/rai_semap/rai_semap/ros2/node.py @@ -0,0 +1,743 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); 
+# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from typing import Optional, Tuple + +# ROS2 core +import rclpy +from cv_bridge import CvBridge + +# ROS2 geometry and transforms +from geometry_msgs.msg import Point, Pose, PoseStamped +from nav_msgs.msg import OccupancyGrid +from rcl_interfaces.msg import ParameterDescriptor, ParameterType +from rclpy.node import Node +from rclpy.qos import qos_profile_sensor_data +from sensor_msgs.msg import CameraInfo, Image +from tf2_geometry_msgs import do_transform_pose_stamped +from tf2_ros import Buffer, TransformListener + +# RAI interfaces +from rai_interfaces.msg import RAIDetectionArray + +# Local imports +from rai_semap.core.backend.sqlite_backend import SQLiteBackend +from rai_semap.core.semantic_map_memory import SemanticAnnotation, SemanticMapMemory +from rai_semap.ros2.perception_utils import extract_pointcloud_from_bbox + + +# Configure Python logging to use ROS2 logger +class ROS2LogHandler(logging.Handler): + """Log handler that forwards Python logging to ROS2 logger.""" + + def __init__(self, ros2_node: Node): + super().__init__() + self.ros2_node = ros2_node + + def emit(self, record): + log_msg = self.format(record) + if record.levelno >= logging.ERROR: + self.ros2_node.get_logger().error(log_msg) + elif record.levelno >= logging.WARNING: + self.ros2_node.get_logger().warn(log_msg) + elif record.levelno >= logging.INFO: + self.ros2_node.get_logger().info(log_msg) + else: + self.ros2_node.get_logger().debug(log_msg) + + +# Constants +DEFAULT_QUEUE_SIZE = 10 +TF_LOOKUP_TIMEOUT_SEC = 1.0 + + +class SemanticMapNode(Node): + """ROS2 node for semantic map processing.""" + + def __init__(self, database_path: Optional[str] = None): + super().__init__("rai_semap_node") + + # Configure Python logging to forward to ROS2 logger + # Configure all rai_semap loggers (including submodules) + handler = ROS2LogHandler(self) + handler.setLevel(logging.DEBUG) + + # Configure root rai_semap logger + python_logger = logging.getLogger("rai_semap") + python_logger.setLevel(logging.DEBUG) + python_logger.handlers.clear() # Remove any existing handlers + python_logger.addHandler(handler) + python_logger.propagate = False # Prevent propagation to root logger + + # Also explicitly configure SQLite backend logger + sqlite_logger = logging.getLogger("rai_semap.core.backend.sqlite_backend") + sqlite_logger.setLevel(logging.DEBUG) + sqlite_logger.handlers.clear() + sqlite_logger.addHandler(handler) + sqlite_logger.propagate = False + + self._initialize_parameters() + if database_path is not None: + self.set_parameters( + [ + rclpy.parameter.Parameter( + "database_path", + rclpy.parameter.Parameter.Type.STRING, + database_path, + ) + ] + ) + self._parse_class_thresholds() + self._parse_class_merge_thresholds() + self.bridge = CvBridge() + self.last_depth_image: Optional[Image] = None + self.last_camera_info: Optional[CameraInfo] = None + self._initialize_tf() + self._initialize_memory() + self._initialize_subscriptions() + + def _initialize_parameters(self): + """Initialize ROS2 parameters.""" + # Define all parameters 
with their defaults and descriptions + parameters = [ + ( + "database_path", + "semantic_map.db", + ParameterType.PARAMETER_STRING, + "Path to SQLite database file", + ), + ( + "confidence_threshold", + 0.5, + ParameterType.PARAMETER_DOUBLE, + "Minimum confidence score (0.0-1.0) for storing detections", + ), + ( + "class_confidence_thresholds", + "", + ParameterType.PARAMETER_STRING, + "Class-specific thresholds as 'class1:threshold1,class2:threshold2' (e.g., 'person:0.7,window:0.6')", + ), + ( + "class_merge_thresholds", + "", + ParameterType.PARAMETER_STRING, + "Class-specific merge radii (meters) for deduplication as 'class1:radius1,class2:radius2' (e.g., 'couch:2.5,table:1.5')", + ), + ( + "min_bbox_area", + 100.0, + ParameterType.PARAMETER_DOUBLE, + "Minimum bounding box area (pixels^2) to filter small false positives", + ), + ( + "use_pointcloud_dedup", + True, + ParameterType.PARAMETER_BOOL, + "Use point cloud features for improved deduplication matching", + ), + ( + "depth_topic", + "", + ParameterType.PARAMETER_STRING, + "Depth image topic (optional, for point cloud extraction)", + ), + ( + "camera_info_topic", + "", + ParameterType.PARAMETER_STRING, + "Camera info topic (optional, for point cloud extraction)", + ), + ( + "detection_topic", + "/detection_array", + ParameterType.PARAMETER_STRING, + "Topic for RAIDetectionArray messages", + ), + ( + "map_topic", + "/map", + ParameterType.PARAMETER_STRING, + "Topic for OccupancyGrid map messages", + ), + ( + "map_frame_id", + "map", + ParameterType.PARAMETER_STRING, + "Frame ID of the SLAM map", + ), + ( + "location_id", + "default_location", + ParameterType.PARAMETER_STRING, + "Identifier for the physical location", + ), + ( + "map_resolution", + 0.05, + ParameterType.PARAMETER_DOUBLE, + "OccupancyGrid resolution (meters/pixel)", + ), + ] + + for name, default, param_type, description in parameters: + self.declare_parameter( + name, + default, + descriptor=ParameterDescriptor( + type=param_type, + description=description, + ), + ) + + def _initialize_tf(self): + """Initialize TF buffer and listener.""" + self.tf_buffer = Buffer(node=self) + self.tf_listener = TransformListener(self.tf_buffer, self) + + def _get_string_parameter(self, name: str) -> str: + """Get string parameter value.""" + return self.get_parameter(name).get_parameter_value().string_value + + def _get_double_parameter(self, name: str) -> float: + """Get double parameter value.""" + return self.get_parameter(name).get_parameter_value().double_value + + def _parse_class_thresholds(self): + """Parse class-specific confidence thresholds from parameter.""" + thresholds_str = self._get_string_parameter("class_confidence_thresholds") + self.class_thresholds = {} + if thresholds_str: + for item in thresholds_str.split(","): + item = item.strip() + if ":" in item: + class_name, threshold_str = item.split(":", 1) + try: + threshold = float(threshold_str.strip()) + self.class_thresholds[class_name.strip()] = threshold + self.get_logger().info( + f"Class-specific threshold: {class_name.strip()}={threshold:.3f}" + ) + except ValueError: + self.get_logger().warn( + f"Invalid threshold value in '{item}', skipping" + ) + + def _parse_class_merge_thresholds(self): + """Parse class-specific merge thresholds from parameter.""" + merge_thresholds_str = self._get_string_parameter("class_merge_thresholds") + self.class_merge_thresholds = {} + if merge_thresholds_str: + for item in merge_thresholds_str.split(","): + item = item.strip() + if ":" in item: + class_name, radius_str = 
item.split(":", 1) + try: + radius = float(radius_str.strip()) + self.class_merge_thresholds[class_name.strip()] = radius + self.get_logger().info( + f"Class-specific merge radius: {class_name.strip()}={radius:.2f}m" + ) + except ValueError: + self.get_logger().warn( + f"Invalid merge radius value in '{item}', skipping" + ) + + def _initialize_memory(self): + """Initialize semantic map memory backend.""" + database_path = self._get_string_parameter("database_path") + location_id = self._get_string_parameter("location_id") + map_frame_id = self._get_string_parameter("map_frame_id") + map_resolution = self._get_double_parameter("map_resolution") + + backend = SQLiteBackend(database_path) + backend.init_schema() + self.memory = SemanticMapMemory( + backend=backend, + location_id=location_id, + map_frame_id=map_frame_id, + resolution=map_resolution, + ) + self.get_logger().info( + f"Initialized semantic map memory: location_id={location_id}, " + f"map_frame_id={map_frame_id}, database_path={database_path}" + ) + + def _initialize_subscriptions(self): + """Initialize ROS2 subscriptions.""" + detection_topic = self._get_string_parameter("detection_topic") + map_topic = self._get_string_parameter("map_topic") + + self.detection_subscription = self.create_subscription( + RAIDetectionArray, + detection_topic, + self.detection_callback, + DEFAULT_QUEUE_SIZE, + ) + self.map_subscription = self.create_subscription( + OccupancyGrid, map_topic, self.map_callback, DEFAULT_QUEUE_SIZE + ) + + # Optional depth and camera info for point cloud extraction + depth_topic = self._get_string_parameter("depth_topic") + camera_info_topic = self._get_string_parameter("camera_info_topic") + use_pointcloud = ( + self.get_parameter("use_pointcloud_dedup").get_parameter_value().bool_value + ) + + if use_pointcloud and depth_topic: + self.depth_subscription = self.create_subscription( + Image, depth_topic, self.depth_callback, qos_profile_sensor_data + ) + self.get_logger().info(f"Subscribed to depth topic: {depth_topic}") + else: + self.depth_subscription = None + + if use_pointcloud and camera_info_topic: + self.camera_info_subscription = self.create_subscription( + CameraInfo, + camera_info_topic, + self.camera_info_callback, + qos_profile_sensor_data, + ) + self.get_logger().info( + f"Subscribed to camera info topic: {camera_info_topic}" + ) + else: + self.camera_info_subscription = None + + self.get_logger().info( + f"Subscribed to detection_topic={detection_topic}, map_topic={map_topic}" + ) + + def depth_callback(self, msg: Image): + """Store latest depth image.""" + self.last_depth_image = msg + + def camera_info_callback(self, msg: CameraInfo): + """Store latest camera info.""" + self.last_camera_info = msg + + def _extract_pointcloud_from_bbox( + self, detection, source_frame: str + ) -> Optional[Tuple[Point, float, int]]: + """Extract point cloud from bounding box region and compute features. + + Args: + detection: Detection2D message with bounding box + source_frame: Frame ID of the detection (unused, kept for compatibility) + + Returns: + Tuple of (centroid_3d, pointcloud_size, point_count) or None if extraction fails. 
+ centroid_3d: 3D centroid of point cloud in source frame + pointcloud_size: Approximate 3D size (diagonal of bounding box in meters) + point_count: Number of valid 3D points + """ + if self.last_depth_image is None or self.last_camera_info is None: + return None + + result = extract_pointcloud_from_bbox( + detection, + self.last_depth_image, + self.last_camera_info, + self.bridge, + ) + + if result is None: + self.get_logger().warn("Failed to extract point cloud from bbox") + + return result + + def detection_callback(self, msg: RAIDetectionArray): + """Process detection array and store annotations.""" + confidence_threshold = self._get_double_parameter("confidence_threshold") + map_frame_id = self._get_string_parameter("map_frame_id") + + self.get_logger().info( + f"Received detection array with {len(msg.detections)} detections: {msg.detection_classes}, " + f"header.frame_id={msg.header.frame_id}, confidence_threshold={confidence_threshold}" + ) + + # Log details of each detection + for i, det in enumerate(msg.detections): + results_count = len(det.results) if det.results else 0 + if results_count > 0: + result = det.results[0] + self.get_logger().debug( + f" Detection {i}: class={result.hypothesis.class_id}, " + f"score={result.hypothesis.score:.3f}, " + f"frame_id={det.header.frame_id}" + ) + else: + self.get_logger().warn(f" Detection {i} has no results!") + + timestamp = rclpy.time.Time.from_msg(msg.header.stamp) + detection_source = msg.header.frame_id or "unknown" + + stored_count = 0 + skipped_count = 0 + default_frame_id = msg.header.frame_id + for detection in msg.detections: + if self._process_detection( + detection, + confidence_threshold, + map_frame_id, + timestamp, + detection_source, + default_frame_id, + ): + stored_count += 1 + else: + skipped_count += 1 + + if stored_count > 0: + self.get_logger().info( + f"Stored {stored_count} annotations, skipped {skipped_count} (low confidence or transform failed)" + ) + elif len(msg.detections) > 0: + self.get_logger().warn( + f"Received {len(msg.detections)} detections but none were stored " + f"(confidence threshold: {confidence_threshold})" + ) + + def _process_detection( + self, + detection, + confidence_threshold: float, + map_frame_id: str, + timestamp: rclpy.time.Time, + detection_source: str, + default_frame_id: str, + ) -> bool: + """Process a single detection and store annotation if valid. + + Returns: + True if annotation was stored, False otherwise. 
+ """ + if not detection.results: + self.get_logger().debug("Detection has no results, skipping") + return False + + result = detection.results[0] + confidence = result.hypothesis.score + object_class = result.hypothesis.class_id + + # Check bounding box size + min_bbox_area = self._get_double_parameter("min_bbox_area") + bbox_width = detection.bbox.size_x + bbox_height = detection.bbox.size_y + bbox_area = bbox_width * bbox_height + + if bbox_area < min_bbox_area: + self.get_logger().debug( + f"Bounding box too small: area={bbox_area:.1f} < {min_bbox_area:.1f} pixels^2, " + f"skipping {object_class}" + ) + return False + + # Use class-specific threshold if available, otherwise use default + effective_threshold = self.class_thresholds.get( + object_class, confidence_threshold + ) + + self.get_logger().info( + f"Processing detection: class={object_class}, confidence={confidence:.3f}, " + f"threshold={effective_threshold:.3f}, bbox_area={bbox_area:.1f}" + ) + + if confidence < effective_threshold: + self.get_logger().debug( + f"Confidence {confidence:.3f} below threshold {effective_threshold:.3f}, skipping" + ) + return False + + # Use detection frame_id, fallback to message header frame_id if empty + source_frame = detection.header.frame_id or default_frame_id + if not source_frame: + self.get_logger().warn( + f"Detection has no frame_id (detection.frame_id='{detection.header.frame_id}', " + f"default_frame_id='{default_frame_id}'), skipping" + ) + return False + + pose_in_source_frame = result.pose.pose + + # Check if pose is empty (all zeros) - this happens when DINO doesn't provide 3D pose + # In that case, we can't store a meaningful annotation without depth information + pose_is_empty = ( + pose_in_source_frame.position.x == 0.0 + and pose_in_source_frame.position.y == 0.0 + and pose_in_source_frame.position.z == 0.0 + ) + + if pose_is_empty: + self.get_logger().warn( + f"Detection for {object_class} has empty pose (0,0,0). " + f"GroundingDINO provides 2D bounding boxes but no 3D pose. " + f"Cannot store annotation without 3D position. 
" + f"Bounding box center: ({detection.bbox.center.position.x:.1f}, " + f"{detection.bbox.center.position.y:.1f})" + ) + return False + + self.get_logger().debug( + f"Pose in source frame ({source_frame}): " + f"x={pose_in_source_frame.position.x:.3f}, " + f"y={pose_in_source_frame.position.y:.3f}, " + f"z={pose_in_source_frame.position.z:.3f}" + ) + + try: + pose_in_map_frame = self._transform_pose_to_map( + pose_in_source_frame, source_frame, map_frame_id + ) + self.get_logger().info( + f"Transformed pose to map frame ({map_frame_id}): " + f"x={pose_in_map_frame.position.x:.3f}, " + f"y={pose_in_map_frame.position.y:.3f}, " + f"z={pose_in_map_frame.position.z:.3f}" + ) + except Exception as e: + self.get_logger().warn( + f"Failed to transform pose from {source_frame} to {map_frame_id}: {e}" + ) + return False + + vision_detection_id = detection.id if hasattr(detection, "id") else None + + # Extract point cloud features if available + use_pointcloud = ( + self.get_parameter("use_pointcloud_dedup").get_parameter_value().bool_value + ) + pointcloud_features = None + pointcloud_centroid_map = None + pc_size = None + + if use_pointcloud: + pc_result = self._extract_pointcloud_from_bbox(detection, source_frame) + if pc_result: + pc_centroid_source, pc_size, pc_count = pc_result + # Transform point cloud centroid to map frame + try: + pc_centroid_map = self._transform_pose_to_map( + Pose(position=pc_centroid_source), source_frame, map_frame_id + ) + pointcloud_features = { + "centroid": { + "x": pc_centroid_map.position.x, + "y": pc_centroid_map.position.y, + "z": pc_centroid_map.position.z, + }, + "size_3d": pc_size, + "point_count": pc_count, + } + pointcloud_centroid_map = Point( + x=pc_centroid_map.position.x, + y=pc_centroid_map.position.y, + z=pc_centroid_map.position.z, + ) + self.get_logger().debug( + f"Point cloud features: size={pc_size:.2f}m, points={pc_count}, " + f"centroid=({pc_centroid_map.position.x:.2f}, {pc_centroid_map.position.y:.2f}, " + f"{pc_centroid_map.position.z:.2f})" + ) + except Exception as e: + self.get_logger().warn( + f"Failed to transform point cloud centroid: {e}" + ) + + # Use class-specific merge threshold if available, otherwise default to 0.5m + merge_threshold = self.class_merge_thresholds.get(object_class, 0.5) + + # Use point cloud centroid for matching if available, otherwise use pose + match_center = ( + pointcloud_centroid_map + if pointcloud_centroid_map + else Point( + x=pose_in_map_frame.position.x, + y=pose_in_map_frame.position.y, + z=pose_in_map_frame.position.z, + ) + ) + + try: + self.get_logger().info( + f"Storing annotation: class={object_class}, confidence={confidence:.3f}, " + f"merge_radius={merge_threshold:.2f}m, location_id={self.memory.location_id}" + ) + + # Enhanced matching with point cloud features + nearby = self.memory.query_by_location( + match_center, + radius=merge_threshold, + object_class=object_class, + location_id=self.memory.location_id, + ) + + should_merge = False + existing_id = None + + if nearby: + existing = nearby[0] + + # If both have point cloud data, check size similarity + if pointcloud_features and use_pointcloud and pc_size is not None: + if existing.metadata and "pointcloud" in existing.metadata: + existing_pc = existing.metadata["pointcloud"] + existing_size = existing_pc.get("size_3d", 0) + + if existing_size > 0: + size_ratio = min(pc_size, existing_size) / max( + pc_size, existing_size + ) + size_diff = abs(existing_size - pc_size) + + # If sizes are very different (>50% ratio and >0.5m diff), likely 
different objects + if size_ratio < 0.5 and size_diff > 0.5: + self.get_logger().info( + f"Point cloud size mismatch: existing={existing_size:.2f}m, " + f"new={pc_size:.2f}m, ratio={size_ratio:.2f}. Treating as different object." + ) + else: + should_merge = True + existing_id = existing.id + self.get_logger().debug( + f"Point cloud size match: existing={existing_size:.2f}m, " + f"new={pc_size:.2f}m, ratio={size_ratio:.2f}" + ) + else: + should_merge = True + existing_id = existing.id + else: + # Existing doesn't have point cloud, use spatial matching + should_merge = True + existing_id = existing.id + else: + # No point cloud data, use spatial matching + should_merge = True + existing_id = existing.id + + # Prepare metadata with point cloud features + metadata = {} + if pointcloud_features: + metadata["pointcloud"] = pointcloud_features + + if should_merge and existing_id: + # Update existing annotation + existing_ann = nearby[0] + updated = SemanticAnnotation( + id=existing_id, + object_class=object_class, + pose=pose_in_map_frame, + confidence=max(existing_ann.confidence, confidence), + timestamp=timestamp, + detection_source=detection_source, + source_frame=source_frame, + location_id=self.memory.location_id, + vision_detection_id=vision_detection_id, + metadata=metadata if metadata else existing_ann.metadata, + ) + self.memory.backend.update_annotation(updated) + self.get_logger().info( + f"Updated existing annotation for {object_class}" + ) + else: + # Insert new annotation + self.memory.store_annotation( + object_class=object_class, + pose=pose_in_map_frame, + confidence=confidence, + timestamp=timestamp, + detection_source=detection_source, + source_frame=source_frame, + location_id=self.memory.location_id, + vision_detection_id=vision_detection_id, + metadata=metadata if metadata else None, + ) + self.get_logger().info(f"Created new annotation for {object_class}") + + return True + self.get_logger().info(f"Successfully stored annotation for {object_class}") + return True + except Exception as e: + self.get_logger().error(f"Failed to store annotation: {e}", exc_info=True) + return False + + def map_callback(self, msg: OccupancyGrid): + """Process map update and store metadata.""" + map_frame_id = self._get_string_parameter("map_frame_id") + + if msg.header.frame_id != map_frame_id: + self.get_logger().warn( + f"Map frame_id mismatch: expected {map_frame_id}, got {msg.header.frame_id}" + ) + + self.memory.map_frame_id = msg.header.frame_id + self.memory.resolution = msg.info.resolution + + self.get_logger().debug( + f"Updated map metadata: frame_id={msg.header.frame_id}, " + f"resolution={msg.info.resolution}" + ) + + def _transform_pose_to_map( + self, pose: Pose, source_frame: str, target_frame: str + ) -> Pose: + """Transform pose from source frame to map frame. + + If source and target frames are the same, returns the pose unchanged. + + Raises: + Exception: If transform lookup fails. 
+ """ + # No transform needed if frames are identical + if source_frame == target_frame: + return pose + + try: + transform = self.tf_buffer.lookup_transform( + target_frame, + source_frame, + rclpy.time.Time(), + timeout=rclpy.duration.Duration(seconds=TF_LOOKUP_TIMEOUT_SEC), + ) + pose_stamped = PoseStamped() + pose_stamped.pose = pose + pose_stamped.header.frame_id = source_frame + pose_stamped.header.stamp = transform.header.stamp + + transformed_pose_stamped = do_transform_pose_stamped( + pose_stamped, transform + ) + return transformed_pose_stamped.pose + except Exception as e: + raise Exception(f"Transform lookup failed: {e}") from e + + +def main(args=None): + """Main entry point for the semantic map node.""" + rclpy.init(args=args) + node = SemanticMapNode() + try: + rclpy.spin(node) + except KeyboardInterrupt: + pass + finally: + node.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/src/rai_semap/rai_semap/ros2/perception_utils.py b/src/rai_semap/rai_semap/ros2/perception_utils.py new file mode 100644 index 000000000..21c28c9f7 --- /dev/null +++ b/src/rai_semap/rai_semap/ros2/perception_utils.py @@ -0,0 +1,248 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Perception utilities for 3D pose computation and point cloud extraction. + +This module contains perception layer logic that may belong to rai_perception: +- 3D pose computation from 2D bounding boxes using depth images +- Point cloud extraction from bounding box regions +- Detection enhancement (filling empty poses from 2D detections) +""" + +from typing import Optional, Tuple + +import numpy as np +from cv_bridge import CvBridge +from geometry_msgs.msg import Point, Pose, Quaternion +from sensor_msgs.msg import CameraInfo, Image +from vision_msgs.msg import Detection2D + + +def compute_3d_pose_from_bbox( + bbox_center_x: float, + bbox_center_y: float, + depth_image: Image, + camera_info: CameraInfo, + bridge: CvBridge, + region_size: int = 5, +) -> Optional[Pose]: + """Compute 3D pose from 2D bounding box center using depth and camera intrinsics. + + This is perception layer logic that converts 2D detections to 3D poses. 
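+
+    Back-projection uses the pinhole camera model: x = (u - cx) * z / fx and
+    y = (v - cy) * z / fy, where z is the depth sampled at pixel (u, v) and
+    fx, fy, cx, cy are taken from camera_info.k.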
+
+    Args:
+        bbox_center_x: X coordinate of bounding box center in pixels
+        bbox_center_y: Y coordinate of bounding box center in pixels
+        depth_image: Depth image message
+        camera_info: Camera info message with intrinsics
+        bridge: CvBridge instance for image conversion
+        region_size: Side length (pixels) of the fallback window sampled around
+            the center when the center pixel has no valid depth
+
+    Returns:
+        Pose in camera frame, or None if computation fails
+    """
+    try:
+        # Convert depth image to numpy array
+        depth_array = bridge.imgmsg_to_cv2(depth_image, desired_encoding="passthrough")
+
+        # Get pixel coordinates (round to nearest integer)
+        u = int(round(bbox_center_x))
+        v = int(round(bbox_center_y))
+
+        # Check bounds
+        if u < 0 or u >= depth_array.shape[1] or v < 0 or v >= depth_array.shape[0]:
+            return None
+
+        # Get raw depth value at the pixel (units depend on encoding; converted below)
+        depth_value = float(depth_array[v, u])
+        if depth_value <= 0:
+            # Try a small region around the center
+            y_start = max(0, v - region_size // 2)
+            y_end = min(depth_array.shape[0], v + region_size // 2 + 1)
+            x_start = max(0, u - region_size // 2)
+            x_end = min(depth_array.shape[1], u + region_size // 2 + 1)
+            region = depth_array[y_start:y_end, x_start:x_end]
+            valid_depths = region[region > 0]
+            if len(valid_depths) == 0:
+                return None
+            depth_value = float(np.median(valid_depths))
+
+        # Convert depth to meters for integer encodings (16UC1/mono16 store mm);
+        # 32FC1 depth images are already in meters
+        if depth_image.encoding in ["16UC1", "mono16"]:
+            depth_value = depth_value / 1000.0  # mm to meters
+
+        # Get camera intrinsics
+        fx = camera_info.k[0]
+        fy = camera_info.k[4]
+        cx = camera_info.k[2]
+        cy = camera_info.k[5]
+
+        # Project pixel to 3D
+        z = depth_value
+        x = (u - cx) * z / fx
+        y = (v - cy) * z / fy
+
+        # Create pose
+        pose = Pose()
+        pose.position = Point(x=x, y=y, z=z)
+        pose.orientation = Quaternion(x=0.0, y=0.0, z=0.0, w=1.0)  # No rotation
+
+        return pose
+
+    except Exception:
+        return None
+
+
+def extract_pointcloud_from_bbox(
+    detection,
+    depth_image: Image,
+    camera_info: CameraInfo,
+    bridge: CvBridge,
+) -> Optional[Tuple[Point, float, int]]:
+    """Extract point cloud from bounding box region and compute features.
+
+    This is perception layer logic that extracts 3D point cloud data from images.
+
+    Args:
+        detection: Detection2D message with bounding box
+        depth_image: Depth image message
+        camera_info: Camera info message with intrinsics
+        bridge: CvBridge instance for image conversion
+
+    Returns:
+        Tuple of (centroid_3d, pointcloud_size, point_count) or None if extraction fails.
+ centroid_3d: 3D centroid of point cloud in camera frame + pointcloud_size: Approximate 3D size (diagonal of bounding box in meters) + point_count: Number of valid 3D points + """ + try: + # Convert depth image to numpy array + depth_array = bridge.imgmsg_to_cv2(depth_image, desired_encoding="passthrough") + + # Get bounding box bounds + bbox_center_x = detection.bbox.center.position.x + bbox_center_y = detection.bbox.center.position.y + bbox_size_x = detection.bbox.size_x + bbox_size_y = detection.bbox.size_y + + # Convert to pixel coordinates + x_min = int(max(0, bbox_center_x - bbox_size_x / 2)) + x_max = int(min(depth_array.shape[1], bbox_center_x + bbox_size_x / 2)) + y_min = int(max(0, bbox_center_y - bbox_size_y / 2)) + y_max = int(min(depth_array.shape[0], bbox_center_y + bbox_size_y / 2)) + + if x_max <= x_min or y_max <= y_min: + return None + + # Get camera intrinsics + fx = camera_info.k[0] + fy = camera_info.k[4] + cx = camera_info.k[2] + cy = camera_info.k[5] + + # Extract depth region + depth_region = depth_array[y_min:y_max, x_min:x_max] + + # Convert depth to meters if needed + if depth_image.encoding in ["16UC1", "mono16"]: + depth_region = depth_region.astype(np.float32) / 1000.0 + + # Extract valid points and convert to 3D + valid_mask = depth_region > 0 + if not np.any(valid_mask): + return None + + y_coords, x_coords = np.where(valid_mask) + depths = depth_region[valid_mask] + + # Convert to 3D points in camera frame + u_coords = x_coords + x_min + v_coords = y_coords + y_min + + z = depths + x = (u_coords - cx) * z / fx + y = (v_coords - cy) * z / fy + + # Compute centroid + centroid_x = np.mean(x) + centroid_y = np.mean(y) + centroid_z = np.mean(z) + + # Compute 3D bounding box size (diagonal) + if len(x) > 0: + x_range = np.max(x) - np.min(x) + y_range = np.max(y) - np.min(y) + z_range = np.max(z) - np.min(z) + size_3d = np.sqrt(x_range**2 + y_range**2 + z_range**2) + else: + size_3d = 0.0 + + point_count = len(x) + + centroid = Point(x=float(centroid_x), y=float(centroid_y), z=float(centroid_z)) + + return (centroid, float(size_3d), point_count) + + except Exception: + return None + + +def enhance_detection_with_3d_pose( + detection: Detection2D, + depth_image: Optional[Image], + camera_info: Optional[CameraInfo], + bridge: CvBridge, + region_size: int = 5, +) -> bool: + """Enhance detection with 3D pose if pose is empty and depth data is available. + + This is perception layer logic that handles incomplete detections (2D detections + without 3D poses) by computing 3D poses from depth images. 
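+
+    Only the position is estimated (via compute_3d_pose_from_bbox); the orientation
+    is left as the identity quaternion.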
+ + Args: + detection: Detection2D message to enhance + depth_image: Optional depth image for 3D pose computation + camera_info: Optional camera info for 3D pose computation + bridge: CvBridge instance for image conversion + + Returns: + True if pose was enhanced, False otherwise + """ + if not detection.results or len(detection.results) == 0: + return False + + result = detection.results[0] + pose = result.pose.pose + + # Check if pose is empty (0,0,0) + if not ( + pose.position.x == 0.0 and pose.position.y == 0.0 and pose.position.z == 0.0 + ): + return False # Pose already exists + + # Compute 3D pose from bounding box if depth and camera info are available + if depth_image is None or camera_info is None: + return False + + bbox_center_x = detection.bbox.center.position.x + bbox_center_y = detection.bbox.center.position.y + computed_pose = compute_3d_pose_from_bbox( + bbox_center_x, bbox_center_y, depth_image, camera_info, bridge, region_size + ) + + if computed_pose: + result.pose.pose = computed_pose + return True + + return False diff --git a/src/rai_semap/rai_semap/ros2/perception_utils.yaml b/src/rai_semap/rai_semap/ros2/perception_utils.yaml new file mode 100644 index 000000000..f43d2b656 --- /dev/null +++ b/src/rai_semap/rai_semap/ros2/perception_utils.yaml @@ -0,0 +1,5 @@ +perception_utils: + ros__parameters: + depth_topic: "" + camera_info_topic: "" + depth_fallback_region_size: 5 diff --git a/src/rai_semap/rai_semap/tool/__init__.py b/src/rai_semap/rai_semap/tool/__init__.py new file mode 100644 index 000000000..c8b4abf7f --- /dev/null +++ b/src/rai_semap/rai_semap/tool/__init__.py @@ -0,0 +1,13 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/rai_semap/rai_semap/tool/query_semantic_map_tool.py b/src/rai_semap/rai_semap/tool/query_semantic_map_tool.py new file mode 100644 index 000000000..4f87fec66 --- /dev/null +++ b/src/rai_semap/rai_semap/tool/query_semantic_map_tool.py @@ -0,0 +1,44 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
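+
+"""LangChain tool for querying the semantic map for object locations."""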
+ +from typing import Optional, Type + +from langchain_core.tools import BaseTool +from pydantic import BaseModel, Field + +from rai_semap.core.semantic_map_memory import SemanticMapMemory + + +class QuerySemanticMapToolInput(BaseModel): + """Input schema for QuerySemanticMapTool.""" + + query: str = Field(description="Natural language query about object locations") + room: Optional[str] = Field( + default=None, description="Optional room or region name" + ) + + +class QuerySemanticMapTool(BaseTool): + """Tool for querying semantic map for object locations.""" + + name: str = "query_semantic_map" + description: str = "Query the semantic map for object locations" + + args_schema: Type[QuerySemanticMapToolInput] = QuerySemanticMapToolInput + + memory: SemanticMapMemory + + def _run(self, query: str, room: Optional[str] = None) -> str: + """Execute semantic map query.""" + pass diff --git a/src/rai_semap/rai_semap/utils/__init__.py b/src/rai_semap/rai_semap/utils/__init__.py new file mode 100644 index 000000000..23cecced4 --- /dev/null +++ b/src/rai_semap/rai_semap/utils/__init__.py @@ -0,0 +1,15 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utility scripts for rai_semap.""" diff --git a/src/rai_semap/rai_semap/utils/clear_annotations.py b/src/rai_semap/rai_semap/utils/clear_annotations.py new file mode 100644 index 000000000..22824d2a8 --- /dev/null +++ b/src/rai_semap/rai_semap/utils/clear_annotations.py @@ -0,0 +1,102 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utility script to clear all annotations from semantic map database.""" + +import argparse +import sys + +from geometry_msgs.msg import Point + +from rai_semap.core.backend.sqlite_backend import SQLiteBackend +from rai_semap.core.semantic_map_memory import SemanticMapMemory + + +def main(): + parser = argparse.ArgumentParser( + description="Clear all annotations from semantic map database" + ) + parser.add_argument( + "--database-path", + type=str, + default="semantic_map.db", + help="Path to SQLite database file (default: semantic_map.db)", + ) + parser.add_argument( + "--location-id", + type=str, + default=None, + help="If provided, only delete annotations for this location. 
" + "If not provided, delete all annotations.", + ) + parser.add_argument( + "--yes", + action="store_true", + help="Skip confirmation prompt", + ) + + args = parser.parse_args() + + # Initialize backend and memory + backend = SQLiteBackend(args.database_path) + backend.init_schema() + + # Query to get count before deletion + if args.location_id: + center = Point(x=0.0, y=0.0, z=0.0) + filters = {"location_id": args.location_id} + existing = backend.spatial_query(center, radius=1e10, filters=filters) + count = len(existing) + location_msg = f" for location_id='{args.location_id}'" + else: + center = Point(x=0.0, y=0.0, z=0.0) + existing = backend.spatial_query(center, radius=1e10, filters={}) + count = len(existing) + location_msg = "" + + if count == 0: + print(f"No annotations found{location_msg} in database: {args.database_path}") + return 0 + + # Confirmation prompt + if not args.yes: + if args.location_id: + prompt = ( + f"Are you sure you want to delete {count} annotation(s) " + f"for location_id='{args.location_id}' from {args.database_path}? [y/N]: " + ) + else: + prompt = ( + f"Are you sure you want to delete ALL {count} annotation(s) " + f"from {args.database_path}? [y/N]: " + ) + response = input(prompt) + if response.lower() not in ["y", "yes"]: + print("Cancelled.") + return 0 + + # Delete annotations + memory = SemanticMapMemory( + backend=backend, + location_id=args.location_id or "default_location", + ) + + deleted_count = memory.delete_all_annotations(location_id=args.location_id) + + print(f"✓ Successfully deleted {deleted_count} annotation(s){location_msg}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/rai_semap/__init__.py b/tests/rai_semap/__init__.py new file mode 100644 index 000000000..c8b4abf7f --- /dev/null +++ b/tests/rai_semap/__init__.py @@ -0,0 +1,13 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/rai_semap/conftest.py b/tests/rai_semap/conftest.py new file mode 100644 index 000000000..c5cd53b71 --- /dev/null +++ b/tests/rai_semap/conftest.py @@ -0,0 +1,81 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import tempfile +from pathlib import Path +from typing import Optional + +import pytest +from geometry_msgs.msg import Pose + +# Add src/rai_semap to Python path +rai_semap_path = Path(__file__).parent.parent.parent / "src" / "rai_semap" +sys.path.insert(0, str(rai_semap_path)) + +from rai_semap.core.semantic_map_memory import SemanticAnnotation # noqa: E402 + +# Common test constants +TEST_LOCATION_ID = "test_location" +TEST_DETECTION_SOURCE = "GroundingDINO" +TEST_SOURCE_FRAME = "camera_frame" +TEST_BASE_TIMESTAMP = 1234567890 + + +@pytest.fixture +def temp_db_path(): + """Create a temporary database path for testing.""" + with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: + db_path = f.name + yield db_path + Path(db_path).unlink(missing_ok=True) + + +def make_pose(x: float, y: float, z: float = 0.0) -> Pose: + """Create a Pose with specified position.""" + pose = Pose() + pose.position.x = x + pose.position.y = y + pose.position.z = z + return pose + + +def make_annotation( + annotation_id: str, + object_class: str, + x: float, + y: float, + z: float = 0.0, + confidence: float = 0.9, + timestamp: int = TEST_BASE_TIMESTAMP, + detection_source: str = TEST_DETECTION_SOURCE, + source_frame: str = TEST_SOURCE_FRAME, + location_id: str = TEST_LOCATION_ID, + vision_detection_id: Optional[str] = None, + metadata: Optional[dict] = None, +) -> SemanticAnnotation: + """Create a SemanticAnnotation with common defaults.""" + pose = make_pose(x, y, z) + return SemanticAnnotation( + id=annotation_id, + object_class=object_class, + pose=pose, + confidence=confidence, + timestamp=timestamp, + detection_source=detection_source, + source_frame=source_frame, + location_id=location_id, + vision_detection_id=vision_detection_id, + metadata=metadata, + ) diff --git a/tests/rai_semap/test_backend.py b/tests/rai_semap/test_backend.py new file mode 100644 index 000000000..46e7c4574 --- /dev/null +++ b/tests/rai_semap/test_backend.py @@ -0,0 +1,145 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
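+
+"""Tests for the SQLiteBackend: insert, query, update, and delete of annotations."""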
+ +import pytest +from geometry_msgs.msg import Point + +from rai_semap.core.backend.sqlite_backend import SQLiteBackend + +from .conftest import make_annotation + + +@pytest.fixture +def backend(temp_db_path): + """Create a SQLiteBackend instance for testing.""" + backend = SQLiteBackend(temp_db_path) + backend.init_schema() + return backend + + +def test_backend_init_schema(backend): + """Test that schema initialization completes without error.""" + backend.init_schema() + + +def test_backend_insert_annotation(backend): + """Test inserting a single annotation.""" + annotation = make_annotation("test-id", "cup", 1.0, 2.0) + annotation_id = backend.insert_annotation(annotation) + assert annotation_id == "test-id" + assert isinstance(annotation_id, str) + + +def test_backend_insert_and_query(backend): + """Test inserting annotation and querying to verify persistence.""" + annotation = make_annotation("test-id-1", "cup", 1.0, 2.0) + backend.insert_annotation(annotation) + + center = Point(x=1.0, y=2.0, z=0.0) + results = backend.spatial_query(center, radius=0.5) + assert len(results) == 1 + assert results[0].id == "test-id-1" + assert results[0].object_class == "cup" + assert results[0].confidence == 0.9 + assert results[0].pose.position.x == 1.0 + assert results[0].pose.position.y == 2.0 + assert results[0].location_id == "test_location" + + +def test_backend_query_by_class_filter(backend): + """Test querying with object_class filter.""" + annotation1 = make_annotation("cup-1", "cup", 1.0, 1.0) + annotation2 = make_annotation( + "bottle-1", "bottle", 2.0, 2.0, confidence=0.8, timestamp=1234567891 + ) + + backend.insert_annotation(annotation1) + backend.insert_annotation(annotation2) + + center = Point(x=0.0, y=0.0, z=0.0) + results = backend.spatial_query( + center, radius=10.0, filters={"object_class": "cup"} + ) + assert len(results) == 1 + assert results[0].object_class == "cup" + assert results[0].id == "cup-1" + + +def test_backend_query_by_confidence_threshold(backend): + """Test querying with confidence threshold filter.""" + annotation1 = make_annotation("high-conf", "cup", 1.0, 1.0, confidence=0.9) + annotation2 = make_annotation( + "low-conf", "cup", 2.0, 2.0, confidence=0.3, timestamp=1234567891 + ) + + backend.insert_annotation(annotation1) + backend.insert_annotation(annotation2) + + center = Point(x=0.0, y=0.0, z=0.0) + results = backend.spatial_query( + center, radius=10.0, filters={"confidence_threshold": 0.5} + ) + assert len(results) == 1 + assert results[0].id == "high-conf" + assert results[0].confidence >= 0.5 + + +def test_backend_update_annotation(backend): + """Test updating an existing annotation.""" + annotation = make_annotation("test-id", "cup", 1.0, 2.0, confidence=0.7) + backend.insert_annotation(annotation) + + updated_annotation = make_annotation( + "test-id", + "cup", + 1.5, + 2.5, + confidence=0.95, + timestamp=1234567900, + detection_source="GroundedSAM", + ) + success = backend.update_annotation(updated_annotation) + assert success is True + + center = Point(x=1.0, y=2.0, z=0.0) + results = backend.spatial_query(center, radius=1.0) + assert len(results) == 1 + assert results[0].confidence == 0.95 + assert results[0].pose.position.x == 1.5 + assert results[0].detection_source == "GroundedSAM" + + +def test_backend_delete_annotation(backend): + """Test deleting an annotation.""" + annotation = make_annotation("test-id", "cup", 1.0, 2.0) + backend.insert_annotation(annotation) + + success = backend.delete_annotation("test-id") + assert success is True + + 
center = Point(x=1.0, y=2.0, z=0.0) + results = backend.spatial_query(center, radius=1.0) + assert len(results) == 0 + + success = backend.delete_annotation("non-existent") + assert success is False + + +def test_backend_spatial_query(backend): + """Test spatial query returns list of annotations.""" + center = Point(x=0.0, y=0.0, z=0.0) + radius = 1.0 + results = backend.spatial_query(center, radius) + assert isinstance(results, list) + assert len(results) == 0 diff --git a/tests/rai_semap/test_memory.py b/tests/rai_semap/test_memory.py new file mode 100644 index 000000000..983524b54 --- /dev/null +++ b/tests/rai_semap/test_memory.py @@ -0,0 +1,242 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest.mock import MagicMock + +import pytest +from geometry_msgs.msg import Point + +from rai_semap.core.backend.sqlite_backend import SQLiteBackend +from rai_semap.core.semantic_map_memory import SemanticAnnotation, SemanticMapMemory + +from .conftest import TEST_LOCATION_ID, make_annotation, make_pose + + +@pytest.fixture +def mock_backend(): + """Create a mock backend for testing.""" + backend = MagicMock(spec=SQLiteBackend) + return backend + + +@pytest.fixture +def memory(mock_backend): + """Create a SemanticMapMemory instance with mock backend.""" + return SemanticMapMemory(mock_backend, location_id=TEST_LOCATION_ID) + + +@pytest.fixture +def real_backend(temp_db_path): + """Create a real SQLiteBackend instance for integration testing.""" + backend = SQLiteBackend(temp_db_path) + backend.init_schema() + return backend + + +@pytest.fixture +def real_memory(real_backend): + """Create a SemanticMapMemory instance with real backend.""" + return SemanticMapMemory(real_backend, location_id=TEST_LOCATION_ID) + + +def test_memory_store_annotation(memory): + """Test storing an annotation returns an ID.""" + memory.backend.insert_annotation.return_value = "test-id" + annotation_id = memory.store_annotation( + object_class="cup", + pose=make_pose(1.0, 2.0), + confidence=0.9, + timestamp=1234567890, + detection_source="GroundingDINO", + source_frame="camera_frame", + location_id=TEST_LOCATION_ID, + ) + assert annotation_id == "test-id" + assert isinstance(annotation_id, str) + memory.backend.insert_annotation.assert_called_once() + call_args = memory.backend.insert_annotation.call_args[0][0] + assert isinstance(call_args, SemanticAnnotation) + assert call_args.object_class == "cup" + assert call_args.location_id == TEST_LOCATION_ID + + +def test_memory_query_by_class(memory): + """Test querying by object class returns list.""" + mock_annotation = make_annotation("test-id", "cup", 0.0, 0.0) + memory.backend.spatial_query.return_value = [mock_annotation] + results = memory.query_by_class("cup") + assert isinstance(results, list) + assert len(results) == 1 + assert results[0].object_class == "cup" + memory.backend.spatial_query.assert_called_once() + call_kwargs = memory.backend.spatial_query.call_args[1] + assert call_kwargs["filters"]["object_class"] == "cup" + 
assert call_kwargs["filters"]["location_id"] == TEST_LOCATION_ID + + +def test_memory_query_by_location(memory): + """Test querying by location returns list.""" + mock_annotation = make_annotation("test-id", "cup", 0.0, 0.0) + memory.backend.spatial_query.return_value = [mock_annotation] + center = Point(x=1.0, y=2.0, z=0.0) + results = memory.query_by_location(center, radius=1.0) + assert isinstance(results, list) + assert len(results) == 1 + memory.backend.spatial_query.assert_called_once() + call_args = memory.backend.spatial_query.call_args[0] + assert call_args[0] == center + assert call_args[1] == 1.0 + + +def test_memory_query_by_region(memory): + """Test querying by region returns list.""" + mock_annotation = make_annotation("test-id", "cup", 0.0, 0.0) + memory.backend.spatial_query.return_value = [mock_annotation] + bbox = (0.0, 0.0, 2.0, 2.0) + results = memory.query_by_region(bbox) + assert isinstance(results, list) + memory.backend.spatial_query.assert_called_once() + + +def test_memory_get_map_metadata(memory): + """Test getting map metadata returns MapMetadata.""" + memory.backend.spatial_query.return_value = [] + metadata = memory.get_map_metadata() + assert metadata is not None + assert metadata.location_id == TEST_LOCATION_ID + assert metadata.map_frame_id == "map" + + +def test_memory_store_or_update_new_annotation(real_memory): + """Test store_or_update_annotation creates new annotation when none nearby.""" + annotation_id = real_memory.store_or_update_annotation( + object_class="cup", + pose=make_pose(1.0, 2.0), + confidence=0.9, + timestamp=1234567890, + detection_source="GroundingDINO", + source_frame="camera_frame", + location_id=TEST_LOCATION_ID, + ) + assert isinstance(annotation_id, str) + assert len(annotation_id) > 0 + + center = Point(x=1.0, y=2.0, z=0.0) + results = real_memory.query_by_location(center, radius=0.5) + assert len(results) == 1 + assert results[0].object_class == "cup" + assert results[0].confidence == 0.9 + + +def test_memory_store_or_update_merges_nearby(real_memory): + """Test store_or_update_annotation merges nearby duplicate detections.""" + annotation_id1 = real_memory.store_or_update_annotation( + object_class="cup", + pose=make_pose(1.0, 2.0), + confidence=0.7, + timestamp=1234567890, + detection_source="GroundingDINO", + source_frame="camera_frame", + location_id=TEST_LOCATION_ID, + ) + + annotation_id2 = real_memory.store_or_update_annotation( + object_class="cup", + pose=make_pose(1.1, 2.1), + confidence=0.9, + timestamp=1234567900, + detection_source="GroundedSAM", + source_frame="camera_frame", + location_id=TEST_LOCATION_ID, + merge_threshold=0.5, + ) + + assert annotation_id1 == annotation_id2 + + center = Point(x=1.0, y=2.0, z=0.0) + results = real_memory.query_by_location(center, radius=0.5) + assert len(results) == 1 + assert results[0].confidence == 0.9 + assert results[0].detection_source == "GroundedSAM" + + +def test_memory_store_or_update_creates_separate_for_different_classes(real_memory): + """Test store_or_update_annotation creates separate annotations for different classes.""" + annotation_id1 = real_memory.store_or_update_annotation( + object_class="cup", + pose=make_pose(1.0, 2.0), + confidence=0.9, + timestamp=1234567890, + detection_source="GroundingDINO", + source_frame="camera_frame", + location_id=TEST_LOCATION_ID, + ) + + annotation_id2 = real_memory.store_or_update_annotation( + object_class="bottle", + pose=make_pose(1.1, 2.1), + confidence=0.8, + timestamp=1234567900, + detection_source="GroundingDINO", + 
source_frame="camera_frame", + location_id=TEST_LOCATION_ID, + merge_threshold=0.5, + ) + + assert annotation_id1 != annotation_id2 + + center = Point(x=1.0, y=2.0, z=0.0) + results = real_memory.query_by_location(center, radius=0.5) + assert len(results) == 2 + + +def test_memory_end_to_end_store_and_query(real_memory): + """Test end-to-end: store multiple annotations and query by class.""" + real_memory.store_annotation( + object_class="cup", + pose=make_pose(1.0, 1.0), + confidence=0.9, + timestamp=1234567890, + detection_source="GroundingDINO", + source_frame="camera_frame", + location_id=TEST_LOCATION_ID, + ) + + real_memory.store_annotation( + object_class="bottle", + pose=make_pose(2.0, 2.0), + confidence=0.8, + timestamp=1234567891, + detection_source="GroundingDINO", + source_frame="camera_frame", + location_id=TEST_LOCATION_ID, + ) + + real_memory.store_annotation( + object_class="cup", + pose=make_pose(3.0, 3.0), + confidence=0.85, + timestamp=1234567892, + detection_source="GroundedSAM", + source_frame="camera_frame", + location_id=TEST_LOCATION_ID, + ) + + cups = real_memory.query_by_class("cup") + assert len(cups) == 2 + assert all(c.object_class == "cup" for c in cups) + + bottles = real_memory.query_by_class("bottle") + assert len(bottles) == 1 + assert bottles[0].object_class == "bottle" diff --git a/tests/rai_semap/test_node.py b/tests/rai_semap/test_node.py new file mode 100644 index 000000000..e9fea50c2 --- /dev/null +++ b/tests/rai_semap/test_node.py @@ -0,0 +1,246 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import tempfile +from pathlib import Path + +# Testing framework +import pytest + +# ROS2 core +import rclpy + +# ROS2 message types +from geometry_msgs.msg import Pose, Quaternion +from nav_msgs.msg import MapMetaData, OccupancyGrid +from rclpy.node import Node +from std_msgs.msg import Header +from vision_msgs.msg import Detection2D, ObjectHypothesis, ObjectHypothesisWithPose + +# RAI interfaces +from rai_interfaces.msg import RAIDetectionArray + +# Local imports +from rai_semap.ros2.node import SemanticMapNode + + +@pytest.fixture(scope="module") +def ros2_context(): + """Initialize ROS2 context for testing.""" + rclpy.init() + yield + rclpy.shutdown() + + +@pytest.fixture +def temp_db_path(): + """Create a temporary database path for testing.""" + with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: + db_path = f.name + yield db_path + Path(db_path).unlink(missing_ok=True) + + +@pytest.fixture +def node(ros2_context, temp_db_path): + """Create a SemanticMapNode instance for testing.""" + node = SemanticMapNode(database_path=temp_db_path) + yield node + node.destroy_node() + + +# Test helper functions +def create_detection_message(frame_id: str, detections: list) -> RAIDetectionArray: + """Create a RAIDetectionArray message with given detections.""" + msg = RAIDetectionArray() + msg.header = Header() + msg.header.frame_id = frame_id + msg.detections = detections + return msg + + +def create_detection( + frame_id: str, + class_id: str, + score: float, + x: float = 1.0, + y: float = 2.0, + z: float = 0.0, +) -> Detection2D: + """Create a Detection2D with ObjectHypothesisWithPose.""" + detection = Detection2D() + detection.header = Header() + detection.header.frame_id = frame_id + + hypothesis = ObjectHypothesis() + hypothesis.class_id = class_id + hypothesis.score = score + + result = ObjectHypothesisWithPose() + result.hypothesis = hypothesis + result.pose.pose = Pose() + result.pose.pose.position.x = x + result.pose.pose.position.y = y + result.pose.pose.position.z = z + result.pose.pose.orientation = Quaternion(w=1.0) + + detection.results = [result] + return detection + + +def test_node_creation(node): + """Test that SemanticMapNode can be created with default parameters.""" + assert isinstance(node, Node) + assert node.get_name() == "rai_semap_node" + + expected_params = [ + "database_path", + "confidence_threshold", + "detection_topic", + "map_topic", + "map_frame_id", + "location_id", + "map_resolution", + ] + for param in expected_params: + assert node.has_parameter(param) + + +def test_node_parameter_defaults(node, temp_db_path): + """Test that node parameters have correct default values.""" + assert ( + node.get_parameter("database_path").get_parameter_value().string_value + == temp_db_path + ) + assert ( + node.get_parameter("confidence_threshold").get_parameter_value().double_value + == 0.5 + ) + assert ( + node.get_parameter("detection_topic").get_parameter_value().string_value + == "/detection_array" + ) + assert node.get_parameter("map_topic").get_parameter_value().string_value == "/map" + assert ( + node.get_parameter("map_frame_id").get_parameter_value().string_value == "map" + ) + assert ( + node.get_parameter("location_id").get_parameter_value().string_value + == "default_location" + ) + assert ( + node.get_parameter("map_resolution").get_parameter_value().double_value == 0.05 + ) + + +def test_node_memory_initialization(node): + """Test that semantic map memory is properly initialized.""" + assert node.memory is not None + assert 
node.memory.location_id == "default_location" + assert node.memory.map_frame_id == "map" + assert node.memory.resolution == 0.05 + + +def test_node_subscriptions_created(node): + """Test that subscriptions are created.""" + assert node.detection_subscription is not None + assert node.map_subscription is not None + + +def test_detection_callback_low_confidence(node): + """Test that detections below confidence threshold are filtered out.""" + node.set_parameters( + [ + rclpy.parameter.Parameter( + "confidence_threshold", rclpy.parameter.Parameter.Type.DOUBLE, 0.8 + ), + ] + ) + + detection = create_detection("camera_frame", "cup", score=0.5) + msg = create_detection_message("camera_frame", [detection]) + + initial_count = len(node.memory.query_by_class("cup")) + node.detection_callback(msg) + final_count = len(node.memory.query_by_class("cup")) + + assert final_count == initial_count + + +def test_detection_callback_high_confidence(node): + """Test that high-confidence detections are processed.""" + node.set_parameters( + [ + rclpy.parameter.Parameter( + "confidence_threshold", rclpy.parameter.Parameter.Type.DOUBLE, 0.5 + ), + rclpy.parameter.Parameter( + "map_frame_id", rclpy.parameter.Parameter.Type.STRING, "camera_frame" + ), + ] + ) + + detection = create_detection("camera_frame", "bottle", score=0.9) + msg = create_detection_message("GroundingDINO", [detection]) + + initial_count = len(node.memory.query_by_class("bottle")) + node.detection_callback(msg) + final_count = len(node.memory.query_by_class("bottle")) + + assert final_count >= initial_count + + +def test_map_callback_updates_metadata(node): + """Test that map callback updates metadata.""" + node.set_parameters( + [ + rclpy.parameter.Parameter( + "map_frame_id", rclpy.parameter.Parameter.Type.STRING, "map" + ), + ] + ) + + msg = OccupancyGrid() + msg.header = Header() + msg.header.frame_id = "map" + msg.info = MapMetaData() + msg.info.resolution = 0.1 + + initial_resolution = node.memory.resolution + node.map_callback(msg) + + assert node.memory.map_frame_id == "map" + assert node.memory.resolution == 0.1 + assert node.memory.resolution != initial_resolution + + +def test_detection_callback_empty_detections(node): + """Test that empty detection arrays are handled gracefully.""" + msg = create_detection_message("camera_frame", []) + node.detection_callback(msg) + + +def test_detection_callback_no_results(node): + """Test that detections without results are skipped.""" + detection = Detection2D() + detection.header = Header() + detection.results = [] + + msg = create_detection_message("camera_frame", [detection]) + + initial_count = len(node.memory.query_by_class("cup")) + node.detection_callback(msg) + final_count = len(node.memory.query_by_class("cup")) + + assert final_count == initial_count From 4618982bb374a401831c28044377791b3a5dae61 Mon Sep 17 00:00:00 2001 From: Julia Jia Date: Tue, 2 Dec 2025 18:59:37 -0800 Subject: [PATCH 2/3] refactored --- examples/check-detection-pipeline.py | 160 -------- examples/rai-semap.launch.py | 133 ------- poetry.lock | 24 +- src/rai_bringup/launch/openset.launch.py | 3 +- src/rai_semap/README.md | 201 ++++------ src/rai_semap/design.md | 245 +++++------- .../rai_semap/core/backend/sqlite_backend.py | 4 +- .../{ => config}/detection_publisher.yaml | 6 +- src/rai_semap/rai_semap/ros2/config/node.yaml | 33 ++ .../ros2/config/perception_utils.yaml | 7 + .../rai_semap/ros2/config/visualizer.yaml | 11 + .../rai_semap/ros2/detection_publisher.py | 110 ++++-- src/rai_semap/rai_semap/ros2/node.py | 
163 ++++---- .../rai_semap/ros2/perception_utils.yaml | 5 - src/rai_semap/rai_semap/ros2/visualizer.py | 350 ++++++++++++++++++ .../rai_semap/scripts/navigate_collect.py | 64 ++-- .../rai_semap/scripts/semap.launch.py | 90 +++++ .../rai_semap/scripts/validate_semap.py | 2 +- src/rai_semap/rai_semap/utils/ros2_log.py | 36 ++ tests/rai_semap/conftest.py | 9 + tests/rai_semap/test_detection_publisher.py | 104 ++++++ tests/rai_semap/test_perception_utils.py | 115 ++++++ tests/rai_semap/test_visualizer.py | 144 +++++++ 23 files changed, 1315 insertions(+), 704 deletions(-) delete mode 100644 examples/check-detection-pipeline.py delete mode 100644 examples/rai-semap.launch.py rename src/rai_semap/rai_semap/ros2/{ => config}/detection_publisher.yaml (82%) create mode 100644 src/rai_semap/rai_semap/ros2/config/node.yaml create mode 100644 src/rai_semap/rai_semap/ros2/config/perception_utils.yaml create mode 100644 src/rai_semap/rai_semap/ros2/config/visualizer.yaml delete mode 100644 src/rai_semap/rai_semap/ros2/perception_utils.yaml create mode 100644 src/rai_semap/rai_semap/ros2/visualizer.py rename examples/rosbot-xl-navigate-collect.py => src/rai_semap/rai_semap/scripts/navigate_collect.py (75%) create mode 100644 src/rai_semap/rai_semap/scripts/semap.launch.py rename examples/validate-semantic-map.py => src/rai_semap/rai_semap/scripts/validate_semap.py (99%) create mode 100644 src/rai_semap/rai_semap/utils/ros2_log.py create mode 100644 tests/rai_semap/test_detection_publisher.py create mode 100644 tests/rai_semap/test_perception_utils.py create mode 100644 tests/rai_semap/test_visualizer.py diff --git a/examples/check-detection-pipeline.py b/examples/check-detection-pipeline.py deleted file mode 100644 index cd28f5d39..000000000 --- a/examples/check-detection-pipeline.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright (C) 2025 Julia Jia -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Diagnostic script to check if the detection pipeline is working.""" - -import rclpy -from rclpy.node import Node -from sensor_msgs.msg import Image - -from rai_interfaces.msg import RAIDetectionArray - - -class DetectionPipelineChecker(Node): - """Check if detection pipeline components are working.""" - - def __init__(self): - super().__init__("detection_pipeline_checker") - self.camera_received = False - self.detections_received = False - self.detection_count = 0 - - def check_camera_topic( - self, topic: str = "/camera/image_raw", timeout: float = 5.0 - ): - """Check if camera topic is publishing.""" - self.get_logger().info(f"Checking camera topic: {topic}") - _ = self.create_subscription( - Image, topic, lambda msg: setattr(self, "camera_received", True), 10 - ) - - import time - - start_time = time.time() - while time.time() - start_time < timeout: - rclpy.spin_once(self, timeout_sec=0.5) - if self.camera_received: - self.get_logger().info(f"✓ Camera topic {topic} is publishing") - return True - - self.get_logger().warn( - f"✗ Camera topic {topic} not publishing (timeout: {timeout}s)" - ) - return False - - def check_detection_topic( - self, topic: str = "/detection_array", timeout: float = 10.0 - ): - """Check if detection topic is publishing.""" - self.get_logger().info(f"Checking detection topic: {topic}") - - def detection_callback(msg: RAIDetectionArray): - self.detections_received = True - self.detection_count += len(msg.detections) - self.get_logger().info( - f"Received detection array with {len(msg.detections)} detections: " - f"{msg.detection_classes}" - ) - - _ = self.create_subscription(RAIDetectionArray, topic, detection_callback, 10) - - import time - - start_time = time.time() - while time.time() - start_time < timeout: - rclpy.spin_once(self, timeout_sec=0.5) - if self.detections_received: - self.get_logger().info( - f"✓ Detection topic {topic} is publishing ({self.detection_count} total detections)" - ) - return True - - self.get_logger().warn( - f"✗ Detection topic {topic} not publishing (timeout: {timeout}s)" - ) - return False - - def check_dino_service( - self, service_name: str = "/grounding_dino/grounding_dino_classify" - ): - """Check if DINO service is available.""" - from rai_interfaces.srv import RAIGroundingDino - - client = self.create_client(RAIGroundingDino, service_name) - - self.get_logger().info(f"Checking DINO service: {service_name}") - if client.wait_for_service(timeout_sec=2.0): - self.get_logger().info(f"✓ DINO service {service_name} is available") - return True - else: - self.get_logger().warn(f"✗ DINO service {service_name} not available") - return False - - -def main(): - """Run diagnostic checks.""" - rclpy.init() - checker = DetectionPipelineChecker() - - print("\n" + "=" * 60) - print("Detection Pipeline Diagnostic") - print("=" * 60 + "\n") - - # Check DINO service - dino_ok = checker.check_dino_service() - print() - - # Check camera topic - camera_ok = checker.check_camera_topic() - print() - - # Check detection topic (wait longer since it depends on camera) - if camera_ok: - detection_ok = checker.check_detection_topic(timeout=15.0) - else: - print("Skipping detection topic check (camera not available)") - detection_ok = False - print() - - # Summary - print("=" * 60) - print("Summary:") - print(f" DINO Service: {'✓' if dino_ok else '✗'}") - print(f" Camera Topic: {'✓' if camera_ok else '✗'}") - print(f" Detection Topic: {'✓' if detection_ok else '✗'}") - print("=" * 60) - - if not dino_ok: - print( - "\n⚠️ DINO service not 
available. Make sure perception agents are running:" - ) - print(" python -m rai_perception.scripts.run_perception_agents") - - if not camera_ok: - print("\n⚠️ Camera topic not publishing. Check:") - print(" ros2 topic list | grep camera") - print(" ros2 topic echo /camera/image_raw --once") - - if not detection_ok and camera_ok: - print("\n⚠️ Detection topic not publishing. Check:") - print(" - Is detection_publisher node running?") - print(" - Check detection_publisher logs for errors") - print(" - Verify camera topic name matches detection_publisher config") - - checker.destroy_node() - rclpy.shutdown() - - -if __name__ == "__main__": - main() diff --git a/examples/rai-semap.launch.py b/examples/rai-semap.launch.py deleted file mode 100644 index 703e00cc6..000000000 --- a/examples/rai-semap.launch.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (C) 2025 Julia Jia -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from launch import LaunchDescription -from launch.actions import DeclareLaunchArgument, ExecuteProcess -from launch.substitutions import LaunchConfiguration - - -def generate_launch_description(): - # Declare launch arguments - database_path_arg = DeclareLaunchArgument( - "database_path", - default_value="semantic_map.db", - description="Path to SQLite database file for semantic map", - ) - - location_id_arg = DeclareLaunchArgument( - "location_id", - default_value="default_location", - description="Identifier for the physical location", - ) - - camera_topic_arg = DeclareLaunchArgument( - "camera_topic", - default_value="/camera/camera/color/image_raw", - description="Camera image topic", - ) - - depth_topic_arg = DeclareLaunchArgument( - "depth_topic", - default_value="/camera/camera/depth/image_rect_raw", - description="Depth image topic", - ) - - camera_info_topic_arg = DeclareLaunchArgument( - "camera_info_topic", - default_value="/camera/camera/color/camera_info", - description="Camera info topic", - ) - - # Launch detection publisher (bridges DINO service to /detection_array topic) - camera_topic_param = LaunchConfiguration("camera_topic") - depth_topic_param = LaunchConfiguration("depth_topic") - camera_info_topic_param = LaunchConfiguration("camera_info_topic") - - launch_detection_publisher = ExecuteProcess( - cmd=[ - "python", - "-m", - "rai_semap.ros2.detection_publisher", - "--ros-args", - "-p", - ["camera_topic:=", camera_topic_param], - "-p", - ["depth_topic:=", depth_topic_param], - "-p", - ["camera_info_topic:=", camera_info_topic_param], - "-p", - "detection_topic:=/detection_array", - "-p", - "dino_service:=/grounding_dino_classify", - "-p", - "detection_interval:=2.0", - "-p", - "box_threshold:=0.4", - "-p", - "text_threshold:=0.3", - ], - output="screen", - ) - - # Launch semantic map node - db_path_param = LaunchConfiguration("database_path") - loc_id_param = LaunchConfiguration("location_id") - - launch_semap = ExecuteProcess( - cmd=[ - "python", - "-m", - "rai_semap.ros2.node", - "--ros-args", - "-p", - ["database_path:=", db_path_param], - "-p", - 
["location_id:=", loc_id_param], - "-p", - "confidence_threshold:=0.5", - "-p", - "class_confidence_thresholds:=person:0.7,window:0.6,door:0.5", - "-p", - "class_merge_thresholds:=couch:2.5,table:1.5,shelf:1.5,chair:0.8", - "-p", - "min_bbox_area:=500.0", - "-p", - "use_pointcloud_dedup:=true", - "-p", - ["depth_topic:=", depth_topic_param], - "-p", - ["camera_info_topic:=", camera_info_topic_param], - "-p", - "detection_topic:=/detection_array", - "-p", - "map_topic:=/map", - "-p", - "map_frame_id:=map", - "-p", - "map_resolution:=0.05", - ], - output="screen", - ) - - return LaunchDescription( - [ - database_path_arg, - location_id_arg, - camera_topic_arg, - depth_topic_arg, - camera_info_topic_arg, - launch_detection_publisher, - launch_semap, - ] - ) diff --git a/poetry.lock b/poetry.lock index 350a71c1a..ecc51fcc1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -900,7 +900,7 @@ files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] -markers = {main = "platform_system == \"Windows\" or sys_platform == \"win32\"", nomad = "platform_system == \"Windows\" or sys_platform == \"win32\"", perception = "platform_system == \"Windows\"", s2s = "platform_system == \"Windows\" or sys_platform == \"win32\"", simbench = "platform_system == \"Windows\""} +markers = {main = "sys_platform == \"win32\" or platform_system == \"Windows\"", nomad = "sys_platform == \"win32\" or platform_system == \"Windows\"", perception = "platform_system == \"Windows\"", s2s = "sys_platform == \"win32\" or platform_system == \"Windows\"", simbench = "platform_system == \"Windows\""} [[package]] name = "coloredlogs" @@ -3613,8 +3613,8 @@ files = [ boto3 = ">=1.39.7" langchain-core = ">=0.3.76,<0.4.0" numpy = [ - {version = ">=1.26.0,<3", markers = "python_version >= \"3.12\""}, {version = ">=1,<2", markers = "python_version < \"3.12\""}, + {version = ">=1.26.0,<3", markers = "python_version >= \"3.12\""}, ] pydantic = ">=2.10.0,<3" @@ -5205,7 +5205,7 @@ description = "ONNX Runtime is a runtime accelerator for Machine Learning models optional = false python-versions = ">=3.10" groups = ["s2s"] -markers = "platform_machine == \"x86_64\" and sys_platform != \"darwin\"" +markers = "sys_platform != \"darwin\" and platform_machine == \"x86_64\"" files = [ {file = "onnxruntime_gpu-1.23.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18de50c6c8eea50acc405ea13d299aec593e46478d7a22cd32cdbbdf7c42899d"}, {file = "onnxruntime_gpu-1.23.2-cp310-cp310-win_amd64.whl", hash = "sha256:deba091e15357355aa836fd64c6c4ac97dd0c4609c38b08a69675073ea46b321"}, @@ -5329,10 +5329,10 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.23.5", markers = "python_version >= \"3.11\""}, - {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\""}, - {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\""}, + {version = ">=1.21.4", markers = "python_version == \"3.10\" and platform_system == \"Darwin\""}, + {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version == \"3.10\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] [[package]] @@ -5354,10 +5354,10 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", 
markers = "python_version >= \"3.12\""}, {version = ">=1.23.5", markers = "python_version >= \"3.11\""}, - {version = ">=1.21.4", markers = "python_version == \"3.10\" and platform_system == \"Darwin\""}, - {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version == \"3.10\""}, + {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\""}, + {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] [[package]] @@ -5718,9 +5718,9 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.23.2", markers = "python_version == \"3.11\""}, {version = ">=1.22.4", markers = "python_version < \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -7378,7 +7378,7 @@ reference = "test-pypi" [[package]] name = "rai-perception" -version = "0.1.2" +version = "0.1.5" description = "Package for object detection, segmentation and gripping point detection." optional = false python-versions = "^3.8" @@ -7520,8 +7520,8 @@ pandas = {version = "*", optional = true, markers = "extra == \"tune\""} prometheus_client = {version = ">=0.7.1", optional = true, markers = "extra == \"default\""} protobuf = ">=3.20.3" py-spy = [ - {version = ">=0.4.0", optional = true, markers = "python_version >= \"3.12\" and extra == \"default\""}, {version = ">=0.2.0", optional = true, markers = "python_version < \"3.12\" and extra == \"default\""}, + {version = ">=0.4.0", optional = true, markers = "python_version >= \"3.12\" and extra == \"default\""}, ] pyarrow = {version = ">=9.0.0", optional = true, markers = "extra == \"tune\""} pydantic = {version = "<2.0.dev0 || >=2.5.dev0,<3", optional = true, markers = "extra == \"default\""} diff --git a/src/rai_bringup/launch/openset.launch.py b/src/rai_bringup/launch/openset.launch.py index 47ea9b9f4..a70ab3bc2 100644 --- a/src/rai_bringup/launch/openset.launch.py +++ b/src/rai_bringup/launch/openset.launch.py @@ -22,7 +22,8 @@ def generate_launch_description(): [ ExecuteProcess( cmd=["python", "run_perception_agents.py"], - cwd="src/rai_extensions/rai_perception/scripts", + # https://github.com/RobotecAI/rai/pull/722 + cwd="src/rai_extensions/rai_perception/rai_perception/scripts", output="screen", ), ] diff --git a/src/rai_semap/README.md b/src/rai_semap/README.md index dabdcbf69..d51a060e7 100644 --- a/src/rai_semap/README.md +++ b/src/rai_semap/README.md @@ -2,39 +2,61 @@ ⚠️ **Experimental Module**: This module is in active development. Features may change and some functionality is still in progress. -## Moudle Overview +## Overview -When a robot explores a new environment, it builds a map of the space using SLAM (Simultaneous Localization and Mapping), which creates a geometric map showing walls, obstacles, and open areas. This module extends that map with semantic information, storing what objects the robot sees and where they are located. It provides persistent storage of semantic annotations (object class, 3D pose, confidence) indexed by both spatial coordinates and semantic labels. Using this semantic map memory, user can do queries like "where did I see a red cup?" and "what objects are within 2m of (x,y)?". +Imagine your robot exploring a new warehouse or office building. 
Using SLAM (Simultaneous Localization and Mapping), it builds a geometric map showing walls and open areas, but it doesn't remember what objects it saw—like that tool cart or equipment in the storage area. + +RAI Semantic Map Memory solves this by adding a memory layer. As the robot explores, it remembers not just where walls are, but also what objects it detected and where they were located. Later, you can ask questions like "where did I see a pallet?" or "what objects are near the loading dock?" and the robot can answer using its stored memory. + +This module provides persistent storage of semantic annotations—linking object identities (like "shelf", "cart", "pallet") to their 3D locations in the map. It enables spatial-semantic queries that combine "what" and "where" information. + +## Usage Examples + +- Store object detections with their locations as the robot explores +- Query objects by location: "what's near point (x, y)?" +- Visualize stored annotations overlaid on the SLAM map For detailed design and architecture, see [design.md](../design.md). -## End-to-End Validation +## Quick Start -Phase 4 validation tests the full pipeline: launching the demo, collecting detections during navigation, and verifying stored data. +The following examples use the ROSBot XL demo to illustrate how to use rai_semap. ### Prerequisites - ROS2 environment set up -- rai_semap package installed, `poetry install --with semap` -- ROSBot XL demo setup, see general readme. +- rai_semap package installed: `poetry install --with semap` +- ROSBot XL demo setup, see instructions at [rosbotxl demo](../demos/rosbot_xl.md) + +### Step 0: Launch the ROSBot XL Demo + +Follow the instructions from [rosbotxl demo](../demos/rosbot_xl.md). + +### Step 1: Launch the Semantic Map Node + +Start the semantic map node to begin collecting and storing detections: + +```bash +ros2 launch src/rai_semap/rai_semap/scripts/semap.launch.py +``` -### Steps +This uses default configuration files from `rai_semap/ros2/config/`. The default configs assume depth topic `/camera/depth/image_rect_raw` and camera info topic `/camera/color/camera_info`. If your topics use different names, create custom config files or override parameters. -1. Launch rosbot-xl demo with semantic map node: +To use custom configs: ```bash -ros2 launch examples/rosbot-xl-semap.launch.py \ - game_launcher:=./examples/rosbot-xl.launch.py game_launcher:=demo_assets/rosbot/RAIROSBotXLDemo/RAIROSBotXLDemo.GameLauncher \ - database_path:=semantic_map.db \ - location_id:=rosbot_xl_demo +ros2 launch src/rai_semap/rai_semap/scripts/semap.launch.py \ + node_config:=/path/to/node.yaml \ + detection_publisher_config:=/path/to/detection_publisher.yaml \ + perception_utils_config:=/path/to/perception_utils.yaml ``` -2. Navigate and collect detections: +### Step 2: Collect Detections In a separate terminal, run the navigation script to move the robot through waypoints and collect detections: ```bash -python examples/rosbot-xl-navigate-collect.py \ +python -m rai_semap.scripts.navigate_collect \ --waypoints 2.0 0.0 4.0 0.0 2.0 2.0 \ --collect-duration 10.0 \ --use-sim-time
@@ -42,28 +64,51 @@ python examples/rosbot-xl-navigate-collect.py \ The script navigates to each waypoint and waits to allow detections to be collected and stored in the semantic map. -3.
Validate stored data: +### Step 3: Validate Stored Data -After navigation completes, run the validation script to verify the database contents: +After navigation completes, verify what was stored: ```bash -python examples/validate-semantic-map.py \ +python -m rai_semap.scripts.validate_semap \ --database-path semantic_map.db \ - --location-id rosbot_xl_demo + --location-id default_location ``` -The validation script reports: +The validation script shows total annotation count, annotations grouped by object class, confidence scores, and spatial distribution. -- Total annotation count -- Annotations grouped by object class -- Average confidence scores per class -- Detection sources -- Spatial distribution of annotations -- Field validation checks +## Configuration -### Query by Timestamp +Configuration parameters (database_path, location_id, topics, etc.) are set in YAML config files. If config files are not provided, default configs in `rai_semap/ros2/config/` are used. -To query annotations by timestamp, you can filter results after querying. Timestamps are stored as string representations of ROS2 Time objects. Here's an example Python script: +## Visualization + +View your semantic map annotations overlaid on the SLAM map in RViz2. + +### Start the Visualizer + +```bash +python -m rai_semap.ros2.visualizer \ + --ros-args \ + -p database_path:=semantic_map.db \ + -p location_id:=default_location \ + -p update_rate:=1.0 \ + -p marker_scale:=0.3 \ + -p show_text_labels:=true +``` + +### Setup RViz2 + +1. Launch RViz2: `rviz2` +2. Add displays: + - Add "Map" display → subscribe to `/map` topic + - Add "MarkerArray" display → subscribe to `/semantic_map_markers` topic +3. Set Fixed Frame to `map` (or your map frame ID) + +The visualizer shows color-coded markers by object class (bed=blue, chair=green, door=orange, shelf=purple, table=violet). Marker transparency scales with confidence score, and optional text labels show object class names. 
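+
+As a concrete illustration of what the visualizer publishes, the sketch below shows how a stored annotation could be turned into a single RViz2 marker. This is a minimal example, not the visualizer's actual code: the `make_marker` helper and the trimmed-down color table are assumptions, but the behavior mirrors the description above (a color per object class, opacity scaled by confidence).
+
+```python
+from std_msgs.msg import ColorRGBA
+from visualization_msgs.msg import Marker
+
+# Illustrative subset of the class-to-color mapping from visualizer.yaml
+CLASS_COLORS = {"chair": (0.2, 0.8, 0.4), "table": (0.6, 0.2, 0.8)}
+
+
+def make_marker(annotation, marker_id: int, frame_id: str = "map", scale: float = 0.3) -> Marker:
+    """Build one sphere marker for a stored annotation (assumed helper, not the actual node code)."""
+    marker = Marker()
+    marker.header.frame_id = frame_id
+    marker.ns = "semantic_map"
+    marker.id = marker_id
+    marker.type = Marker.SPHERE
+    marker.action = Marker.ADD
+    marker.pose = annotation.pose  # annotation poses are already expressed in the map frame
+    marker.scale.x = marker.scale.y = marker.scale.z = scale
+    r, g, b = CLASS_COLORS.get(annotation.object_class, (0.5, 0.5, 0.5))
+    # Assumed mapping: higher-confidence detections are rendered more opaque
+    marker.color = ColorRGBA(r=r, g=g, b=b, a=max(0.2, float(annotation.confidence)))
+    return marker
+```
+
+The running visualizer node publishes markers like this for the stored annotations on `/semantic_map_markers` at the configured `update_rate`.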
+ +## Querying the Semantic Map + +Query stored annotations programmatically using the Python API: ```python from geometry_msgs.msg import Point @@ -79,102 +124,10 @@ memory = SemanticMapMemory( resolution=0.05, ) -# Query all annotations -center = Point(x=0.0, y=0.0, z=0.0) -all_annotations = memory.query_by_location(center, radius=1e10) - -# Filter by timestamp using string comparison -# Timestamps are stored as strings in format: "Time(nanoseconds=...)" -# Extract nanoseconds from timestamp string for comparison -def parse_timestamp_ns(timestamp_str): - """Extract nanoseconds from timestamp string like 'Time(nanoseconds=123456789)'""" - if not timestamp_str or not timestamp_str.startswith("Time(nanoseconds="): - return None - try: - ns_str = timestamp_str.split("nanoseconds=")[1].rstrip(")") - return int(ns_str) - except (IndexError, ValueError): - return None - -# Example: Get annotations from last hour -import time -current_ns = int(time.time() * 1e9) -one_hour_ns = int(3600 * 1e9) -one_hour_ago_ns = current_ns - one_hour_ns - -recent_annotations = [ - ann for ann in all_annotations - if ann.timestamp and (ann_ns := parse_timestamp_ns(ann.timestamp)) and ann_ns >= one_hour_ago_ns -] - -print(f"Found {len(recent_annotations)} annotations from the last hour") - -# Filter by specific time range (using string comparison) -start_time_str = "Time(nanoseconds=1700000000000000000)" # Example timestamp -end_time_str = "Time(nanoseconds=1700003600000000000)" # Example timestamp - -time_range_annotations = [ - ann for ann in all_annotations - if ann.timestamp and start_time_str <= ann.timestamp <= end_time_str -] - -print(f"Found {len(time_range_annotations)} annotations in time range") -``` - -For more complex timestamp queries, you can also query the database directly using SQL: - -```python -import sqlite3 - -conn = sqlite3.connect("semantic_map.db") -cursor = conn.cursor() - -# Example: Query annotations after a specific timestamp -# Timestamps are stored as strings, so string comparison works for ordering -cursor.execute(""" - SELECT * FROM annotations - WHERE location_id = ? - AND timestamp >= ? - ORDER BY timestamp DESC -""", ("rosbot_xl_demo", "Time(nanoseconds=1700000000000000000)")) - -rows = cursor.fetchall() -print(f"Found {len(rows)} annotations after specified timestamp") -``` - -### Expected Output - -The validation script should show annotations stored from detections collected during navigation, with valid confidence scores, detection sources, and spatial coordinates in the map frame. 
- -## Utilities - -### Clear All Annotations - -To remove all annotations from the semantic map database: - -```bash -python -m rai_semap.utils.clear_annotations \ - --database-path semantic_map.db -``` - -To remove annotations for a specific location only: +# Query annotations near a location +center = Point(x=2.0, y=0.0, z=0.0) +annotations = memory.query_by_location(center, radius=2.0) -```bash -python -m rai_semap.utils.clear_annotations \ - --database-path semantic_map.db \ - --location-id rosbot_xl_demo +for ann in annotations: + print(f"Found {ann.object_class} at ({ann.pose.position.x}, {ann.pose.position.y})") ``` - -To skip the confirmation prompt: - -```bash -python -m rai_semap.utils.clear_annotations \ - --database-path semantic_map.db \ - --yes -``` - -The script will: - -- Show the number of annotations that will be deleted -- Prompt for confirmation (unless `--yes` is used) -- Delete the annotations and report the count deleted diff --git a/src/rai_semap/design.md b/src/rai_semap/design.md index c7b24af65..f109b46e6 100644 --- a/src/rai_semap/design.md +++ b/src/rai_semap/design.md @@ -2,23 +2,23 @@ ## Table of Contents -- [Problem Definition](#problem-definition) - - [Q & A](#q--a) -- [Concepts](#concepts) - - [High-Level Concepts](#high-level-concepts) - - [Data Models](#data-models) - - [Relationships](#relationships) - - [Non Goals](#non-goals) -- [Design Proposal](#design-proposal) - - [BaseMemory Interface](#basememory-interface) - - [SemanticMapMemory Interface](#semanticmapmemory-interface) - - [Database Backend Abstraction](#database-backend-abstraction) - - [New Component: rai_semap](#new-component-rai_semap) - - [Usage Patterns from Other Layers](#usage-patterns-from-other-layers) - - [Implementation Phases](#implementation-phases) -- [Reusability](#reusability) -- [External Memory Systems](#external-memory-systems) -- [Appendix](#appendix) +- [Problem Definition](#problem-definition) + - [Q & A](#q--a) +- [Concepts](#concepts) + - [High-Level Concepts](#high-level-concepts) + - [Data Models](#data-models) + - [Relationships](#relationships) + - [Non Goals](#non-goals) +- [Design Proposal](#design-proposal) + - [BaseMemory Interface](#basememory-interface) + - [SemanticMapMemory Interface](#semanticmapmemory-interface) + - [Database Backend Abstraction](#database-backend-abstraction) + - [New Component: rai_semap](#new-component-rai_semap) + - [Usage Patterns from Other Layers](#usage-patterns-from-other-layers) + - [Implementation Phases](#implementation-phases) +- [Reusability](#reusability) +- [External Memory Systems](#external-memory-systems) +- [Appendix](#appendix) ## Problem Definition @@ -26,9 +26,9 @@ > "Robots often need to find out about their environment first, building a map and localizing themselves on it. During this process, RAI can be used to guide exploration for mapping or to build a semantic map during the SLAM which adds knowledge and memory, which can be used to reason about the area itself and tasks that are to be given in the area. A great start is to design solutions for RAI." -Based on RAI's current capabilities (perception, navigation, multi-modal interaction), this roughly maps to three areas: +Based on RAI's current capabilities (perception, navigation, multi-modal interaction), this roughly maps to three areas, listed as below. These areas build on each other: perception feeds the semantic map, exploration uses it to guide decisions, and memory enables task reasoning. -1. 
Semantic Perception Layer which may be built on top of `rai_perception` (open-set detection), Grounded SAM 2 integration +1. Semantic Perception Layer which may be built on top of `rai_perception` (open-set detection), GroundingDINO integration with a new semantic annotation pipeline that tags SLAM features/points with object identities during mapping 2. Agent Guided Exploration Strategy which can be built on existing `rai_nomad` (navigation) where agent decides _where_ to explore based on goals ("find the kitchen", "map storage areas") rather than frontier-based exploration. Frontier-based exploration navigates to boundaries between known and unknown map regions to expand coverage. @@ -38,8 +38,6 @@ Based on RAI's current capabilities (perception, navigation, multi-modal interac - The connection between spatial memory and other RAI memory systems (artifact_database.pkl, rai_whoami vector store, StateBasedAgent state) needs to be explored: spatial memory could be queried by these systems to provide spatial context, rather than serving as storage for them. - For example, could artifacts be annotated with spatial locations queried from spatial memory, could embodiment docs reference spatial locations that spatial memory could ground, or could recent spatial queries be included in StateBasedAgent state to provide spatial awareness during conversations? -For explorative SLAM, these areas build on each other: perception feeds the semantic map, exploration uses it to guide decisions, and memory enables task reasoning. - ### Q & A Q: why "map storage areas" is not a frontier-based exploration? @@ -58,20 +56,22 @@ Both may explore unknown regions, but the decision differs: frontier-based picks #### Semantic Annotation -**A spatial-semantic record linking an object identity (class label, e.g., "red cup", "tool") to a 3D pose in the map frame, with metadata (timestamp, confidence, detection source).** +**A spatial-semantic record linking an object identity (class label, e.g., "red cup", "tool") to a 3D position (centroid/center) in the map frame, with metadata (timestamp, confidence, detection source).** -Unlike pure geometric SLAM, semantic annotations enable querying "what" and "where" simultaneously. This allows agents to reason about object locations for task planning and spatial reasoning. The combination of semantic labels and 3D poses creates a rich representation that bridges perception and spatial memory. +Unlike pure geometric SLAM, semantic annotations enable querying "what" and "where" simultaneously. This allows agents to reason about object locations for task planning and spatial reasoning. The combination of semantic labels and 3D positions creates a different representation that bridges perception and spatial memory. The position is stored as a `Pose` object, where `pose.position` is the 3D point (computed from bounding box center or point cloud centroid when available) and `pose.orientation` is typically identity (not meaningful). 
```python -# Example: Semantic annotation structure +# Example: Semantic annotation structure (simplified) { "object_class": "red cup", - "pose": {"x": 2.5, "y": 1.3, "z": 0.8, "orientation": {...}}, + "pose": {"position": {"x": 2.5, "y": 1.3, "z": 0.8}, "orientation": {"x": 0.0, "y": 0.0, "z": 0.0, "w": 1.0}}, "confidence": 0.92, "timestamp": "2025-01-15T10:23:00", "detection_source": "GroundingDINO", "source_frame": "camera_frame" } +# Note: pose.position is a 3D point (centroid/center) in map frame, computed from bounding box center +# or point cloud centroid when available. pose.orientation is typically identity (not meaningful). ``` #### Spatial Memory @@ -82,33 +82,34 @@ The storage can be implemented via database backends (SQLite/PostGIS) accessed t ```python # Example: Spatial query -memory.query_nearby_objects(x=2.5, y=1.3, radius=2.0) -# Returns: [{"object": "red cup", "distance": 0.5}, -# {"object": "table", "distance": 1.2}] +from geometry_msgs.msg import Point +center = Point(x=2.5, y=1.3, z=0.0) +results = memory.query_by_location(center, radius=2.0) +# Returns: List[SemanticAnnotation] with objects within radius # Example: Semantic query -memory.query_by_class("red cup") -# Returns: [{"pose": (2.5, 1.3, 0.8), "confidence": 0.92, ...}] +results = memory.query_by_class("red cup") +# Returns: List[SemanticAnnotation] with all "red cup" annotations ``` #### Camera-to-map Transformation **Converting detections from camera frame to map frame using TF transforms.** The perception layer provides detections with 3D positions (in our implementation, GroundingDINO provides 2D bounding boxes, and we compute 3D poses from bounding box centers using depth images and camera intrinsics). These 3D positions are initially in the camera frame. The system transforms these positions from the camera frame to the map frame using TF transforms (camera → base_link → map). This is critical for building a consistent spatial-semantic map across robot movements. Without proper frame transformation, detections from different robot positions would be stored in inconsistent coordinate systems, making spatial queries unreliable. ```python -# Example: Frame transformation flow +# Example: Frame transformation flow (pseudo code) # Detection in camera frame -camera_pose = {"x": 0.3, "y": 0.1, "z": 1.2} # relative to camera +camera_pose = (x=0.3, y=0.1, z=1.2) # relative to camera # Transform to map frame via TF -map_pose = tf_buffer.transform( - camera_pose, - target_frame="map", - source_frame="camera_frame" +map_pose = transform_pose( + camera_pose, + source_frame="camera_frame", + target_frame="map" ) -# Result: {"x": 2.5, "y": 1.3, "z": 0.8} # absolute map coordinates +# Result: (x=2.5, y=1.3, z=0.8) # absolute map coordinates # Same object detected from different angle → same map coordinates ``` @@ -190,12 +191,12 @@ confidence_thresholds = { **4.
Bounding Box Size Filtering** -Very small bounding boxes (below minimum area threshold, default: 500 pixels²) are filtered out as they are often false positives from partial occlusions or detection artifacts. +Very small bounding boxes (below minimum area threshold, default: 100 pixels²) are filtered out as they are often false positives from partial occlusions or detection artifacts. ```python # Example: Size filtering bbox_area = width * height # pixels² -if bbox_area < 500: +if bbox_area < min_bbox_area: # default: 100 pixels² # Filtered out (likely false positive) return None ``` @@ -208,15 +209,17 @@ These query patterns enable agents to retrieve spatial-semantic information for ```python # Spatial query: "What objects are within 2m of (x,y)?" -results = memory.query_nearby_objects(x=2.5, y=1.3, radius=2.0) +from geometry_msgs.msg import Point +center = Point(x=2.5, y=1.3, z=0.0) +results = memory.query_by_location(center, radius=2.0) # Semantic query: "Where did I see a red cup?" results = memory.query_by_class("red cup") # Hybrid query: "Find tools in the workshop" (semantic + spatial region) -results = memory.query_by_class_and_region( - object_class="tool", - region_polygon=[(x1,y1), (x2,y2), (x3,y3), (x4,y4)] +results = memory.query_by_region( + bbox=(x1, y1, x2, y2), # (min_x, min_y, max_x, max_y) + object_class="tool" ) ``` @@ -229,17 +232,17 @@ results = memory.query_by_class_and_region( This is the core data model that stores all semantic-spatial information in the memory system. Each annotation links a detected object to its location in the map frame, enabling spatial queries and temporal consistency tracking. The metadata field allows extensibility for point cloud features and other attributes without changing the core schema. ```python -@dataclass class SemanticAnnotation: - id: UUID # Unique identifier + id: str # Unique identifier (UUID string) object_class: str # e.g., "red cup", "tool" pose: Pose # 3D pose in map frame (x, y, z, orientation) confidence: float # 0.0-1.0 - timestamp: Time # ROS timestamp of detection + timestamp: Any # ROS timestamp of detection detection_source: str # e.g., "GroundingDINO", "GroundedSAM" source_frame: str # Original camera frame_id - vision_detection_id: str # ID from Detection2D for debugging - metadata: dict # Optional JSON with pointcloud info when available + location_id: str # Identifier for the physical location + vision_detection_id: Optional[str] # ID from Detection2D for debugging + metadata: Dict[str, Any] # Optional JSON with pointcloud info when available # metadata.pointcloud: {centroid, size_3d, point_count} ``` @@ -256,10 +259,10 @@ CREATE VIRTUAL TABLE annotations_rtree USING rtree( ); -- Example: Efficient spatial query using index -SELECT * FROM annotations +SELECT * FROM annotations WHERE id IN ( - SELECT id FROM annotations_rtree - WHERE minx <= x+radius AND maxx >= x-radius + SELECT id FROM annotations_rtree + WHERE minx <= x+radius AND maxx >= x-radius AND miny <= y+radius AND maxy >= y-radius ); ``` @@ -271,12 +274,12 @@ WHERE id IN ( This metadata enables the system to correctly interpret spatial coordinates and maintain consistency with the underlying SLAM map. The resolution and origin are needed to convert between map coordinates and pixel coordinates for visualization. The last_updated timestamp helps track map freshness and coordinate system changes. 
```python -@dataclass class MapMetadata: + location_id: str # Identifier for the physical location map_frame_id: str # Frame ID of the SLAM map resolution: float # OccupancyGrid resolution (meters/pixel) - origin: Pose # Map origin pose - last_updated: Time # Timestamp of last annotation + origin: Optional[Pose] # Optional map origin pose + last_updated: Optional[Any] # Timestamp of last annotation ``` ### Relationships @@ -289,8 +292,8 @@ The `detection_publisher` node bridges the service-based perception layer to top ```python # Flow: Camera → detection_publisher → RAIDetectionArray → semantic_map_node -# detection_publisher subscribes to /camera/image_raw -# Calls /grounding_dino/grounding_dino_classify service +# detection_publisher subscribes to /camera/camera/color/image_raw (configurable) +# Calls /grounding_dino_classify service (configurable) # Publishes RAIDetectionArray to /detection_array # semantic_map_node subscribes to /detection_array and stores annotations ``` @@ -316,9 +319,9 @@ This integration enables multi-step task planning where agents can query object ```python # Example: Agent tool usage -tool = QuerySemanticMapTool(memory) -result = tool.invoke("red cup in kitchen") -# Returns: {"object": "red cup", "location": (2.5, 1.3, 0.8), ...} +tool = QuerySemanticMapTool(memory=memory) +result = tool.invoke({"query": "red cup in kitchen", "room": "kitchen"}) +# Returns: String with object locations # Agent uses NavigateToPoseTool to go to location ``` @@ -342,13 +345,24 @@ Since no `BaseMemory` interface exists in RAI, we define this interface to enabl # Example: BaseMemory interface structure class BaseMemory(ABC): @abstractmethod - def store(self, data: Any) -> str: - """Store data and return identifier.""" + def store(self, key: str, value: Any, metadata: Optional[Dict[str, Any]] = None) -> str: + """Store a value with optional metadata. Returns storage ID.""" + pass + + @abstractmethod + def retrieve(self, query: str, filters: Optional[Dict[str, Any]] = None) -> List[Any]: + """Retrieve values matching query and filters. + + Designed for vector database use cases where query is text to embed + for similarity search, and filters are metadata constraints. + Not suitable for spatial databases which require concrete query methods + (e.g., query_by_location, query_by_region). + """ pass - + @abstractmethod - def query(self, query: str) -> List[Any]: - """Query stored data.""" + def delete(self, key: str) -> bool: + """Delete a stored value. 
Returns success status.""" pass ``` @@ -361,12 +375,16 @@ This interface provides the contract for spatial-semantic memory operations, ena ```python # Example: SemanticMapMemory interface methods class SemanticMapMemory(BaseMemory): - def query_nearby_objects(self, x: float, y: float, radius: float) -> List[SemanticAnnotation]: - """Query objects within radius of (x, y).""" + def query_by_location(self, center: Point, radius: float, object_class: Optional[str] = None, location_id: Optional[str] = None) -> List[SemanticAnnotation]: + """Query annotations within radius of center point.""" pass - - def query_by_class(self, object_class: str) -> List[SemanticAnnotation]: - """Query all annotations of a given class.""" + + def query_by_class(self, object_class: str, confidence_threshold: float = 0.5, limit: Optional[int] = None, location_id: Optional[str] = None) -> List[SemanticAnnotation]: + """Query annotations by object class.""" + pass + + def query_by_region(self, bbox: Tuple[float, float, float, float], object_class: Optional[str] = None, location_id: Optional[str] = None) -> List[SemanticAnnotation]: + """Query annotations within bounding box region.""" pass ``` @@ -383,7 +401,8 @@ Uses SpatiaLite extension for spatial indexing. Single-file database with no ext ```python # Example: SQLiteBackend usage backend = SQLiteBackend("semantic_map.db") -memory = SemanticMapMemory(backend) +backend.init_schema() # Initialize database schema +memory = SemanticMapMemory(backend, location_id="default_location") # Single file, no server needed ``` @@ -394,38 +413,11 @@ Full PostgreSQL + PostGIS for advanced spatial operations. Supports multi-robot ```python # Example: PostGISBackend usage (future) backend = PostGISBackend(connection_string="postgresql://...") -memory = SemanticMapMemory(backend) +backend.init_schema() # Initialize database schema +memory = SemanticMapMemory(backend, location_id="warehouse_a") # Shared database for multi-robot coordination ``` -Configuration via ROS2 parameters (set via launch file or command line): - -Detection Publisher Node (`rai_semap.ros2.detection_publisher`): - -- `camera_topic`: Camera image topic to subscribe to (default: "/camera/image_raw") -- `detection_topic`: Topic to publish RAIDetectionArray messages (default: "/detection_array") -- `dino_service`: GroundingDINO service name (default: "/grounding_dino/grounding_dino_classify") -- `detection_classes`: Comma-separated list of object classes to detect (default: "person, cup, bottle, box, bag, chair, table, shelf, door, window") -- `detection_interval`: Minimum time between detections in seconds (default: 2.0) -- `box_threshold`: DINO box threshold (default: 0.3) -- `text_threshold`: DINO text threshold (default: 0.25) - -Semantic Map Node (`rai_semap.ros2.node`): - -- `database_path`: Path to SQLite database file (default: "semantic_map.db") -- `confidence_threshold`: Minimum confidence score (0.0-1.0) for storing detections (default: 0.5) -- `class_confidence_thresholds`: Class-specific confidence thresholds as 'class1:threshold1,class2:threshold2' (e.g., 'person:0.7,window:0.6,door:0.5') -- `class_merge_thresholds`: Class-specific merge radii (meters) for deduplication as 'class1:radius1,class2:radius2' (e.g., 'couch:2.5,table:1.5,shelf:1.5,chair:0.8') -- `min_bbox_area`: Minimum bounding box area (pixels²) to filter small false positives (default: 500.0) -- `use_pointcloud_dedup`: Enable point cloud-based deduplication matching (default: true) -- `depth_topic`: Depth image topic for point cloud 
extraction (optional, required if use_pointcloud_dedup=true) -- `camera_info_topic`: Camera info topic for point cloud extraction (optional, required if use_pointcloud_dedup=true) -- `detection_topic`: Topic for RAIDetectionArray messages (default: "/detection_array") -- `map_topic`: Topic for OccupancyGrid map messages (default: "/map") -- `map_frame_id`: Frame ID of the SLAM map (default: "map") -- `location_id`: Identifier for the physical location (default: "default_location") -- `map_resolution`: OccupancyGrid resolution in meters/pixel (default: 0.05) - Future: PostGIS backend selection will be configurable (not yet implemented). ### New Component: `rai_semap` @@ -436,31 +428,16 @@ Architecture: 1. Core Library (`rai_semap.core`): - - Frame Projection: Transform 3D poses from source frame (camera frame) to map frame using TF transforms. Detections already contain 3D poses in `ObjectHypothesisWithPose`, so depth estimation is handled by the perception layer. - - Temporal Filtering: Merge duplicate detections of same object using multi-strategy deduplication: - - Spatial clustering within class-specific merge thresholds - - Point cloud-based matching when depth data is available (uses 3D centroid and size validation) - - Confidence filtering with class-specific thresholds - - Bounding box size filtering to remove small false positives - - Confidence Aggregation: Update confidence scores for repeated observations (keeps maximum confidence) + - Frame Projection: Transform 3D poses from camera frame to map frame using TF transforms + - Temporal Filtering: Multi-strategy deduplication (spatial clustering, point cloud-based matching, confidence/size filtering) to merge duplicate detections - Storage: Persist annotations to `SemanticMapMemory` backend with point cloud features in metadata - - No ROS2 dependencies; pure Python library + - Pure Python library with no ROS2 dependencies 2. ROS2 Node Wrapper (`rai_semap.ros2`): - - `detection_publisher` node: - - Subscribes to camera images (e.g., `/camera/image_raw`) - - Calls GroundingDINO service with configurable object classes - - Publishes `RAIDetectionArray` messages to `/detection_array` topic - - Throttles detections via `detection_interval` parameter to control processing rate - - `node` (semantic map node): - - Subscriptions: - - `RAIDetectionArray` topic (detections from `detection_publisher` or other sources) - - `/map` (OccupancyGrid from SLAM) - - TF handling via `tf2_ros.Buffer` and `TransformListener` (automatically subscribes to `/tf` and `/tf_static`) - - Converts ROS2 messages to core library data structures - - Calls core library processing functions - - Handles ROS2-specific concerns (TF lookups, message conversion) + - `detection_publisher` node: Subscribes to camera images, calls GroundingDINO service, publishes `RAIDetectionArray` messages with configurable throttling + - `node` (semantic map node): Subscribes to `RAIDetectionArray` and `/map` topics, handles TF transforms, converts ROS2 messages to core library data structures, and calls core processing functions + - `visualizer` node: Publishes semantic map annotations as RViz2 markers for real-time visualization, querying the database at configurable intervals 3. 
Tools/Services: - `QuerySemanticMapTool`: LangChain tool for agent queries @@ -477,27 +454,9 @@ Camera Images → detection_publisher → RAIDetectionArray → semantic_map_nod ### Usage Patterns from Other Layers -Perception Layer Requirements: - -- Service-to-topic bridge: The `detection_publisher` node bridges the service-based perception layer (GroundingDINO, GroundedSAM) to topic-based messaging by continuously processing camera images and publishing detections -- Topic-based processing: Handle `RAIDetectionArray` messages published to topics (from `detection_publisher` or other sources) -- Immediate processing: Each detection array is processed immediately upon receipt in the callback -- Confidence filtering: Only store high-confidence detections (configurable via ROS2 parameter `confidence_threshold`) -- Configurable detection rate: The `detection_interval` parameter controls how frequently detections are processed to balance accuracy and computational load - -Exploration Layer Requirements (prioritized): +**Perception Layer**: The `detection_publisher` node bridges service-based perception (GroundingDINO) to topic-based messaging, processing camera images and publishing `RAIDetectionArray` messages for immediate processing with configurable confidence filtering and detection rate throttling. -1. Coverage tracking: Query which map regions have been semantically annotated (M) - - Foundation for other exploration features - - Requires spatial indexing of annotated regions (grid-based or region-based) - - Grid-based example: Divide map into fixed-size cells (e.g., 0.5m x 0.5m aligned with OccupancyGrid resolution), track which cells contain annotations - - Region-based example: Use spatial clustering or bounding polygons around annotated areas, define regions dynamically based on annotation density -2. Goal-based queries: "Find unexplored regions with 'kitchen' objects" (M) - - Depends on: Coverage tracking, semantic queries - - Combines semantic search with coverage information -3. Frontier detection: Identify boundaries between mapped and unmapped semantic regions (L) - - Depends on: Coverage tracking - - Requires boundary detection algorithms and spatial analysis +**Exploration Layer** (preliminary): Future integration could support coverage tracking (identifying which map regions have been annotated), goal-based queries (finding unexplored regions with specific semantic properties), and frontier detection (boundaries between mapped/unmapped regions). These features would enable agent-guided exploration beyond geometric frontier-based methods. Agent Tool Integration: @@ -529,8 +488,8 @@ Future direction (PostGIS Migration): - Flow: -1. Agent queries semantic map: `QuerySemanticMapTool("red cup", room="kitchen")` -2. Semantic map returns: `{object: "red cup", location: (x: 2.5, y: 1.3, z: 0.8), timestamp: "2025-01-15T10:23:00", confidence: 0.92}` +1. Agent queries semantic map: `QuerySemanticMapTool.invoke({"query": "red cup", "room": "kitchen"})` +2. Semantic map returns: String with object locations including pose information 3. Agent uses `NavigateToPoseTool` to go to that location 4. Agent uses `GetDetectionTool` to confirm object presence 5. 
Agent uses manipulation tools to grab and return the cup diff --git a/src/rai_semap/rai_semap/core/backend/sqlite_backend.py b/src/rai_semap/rai_semap/core/backend/sqlite_backend.py index 779c047c0..25c446d1e 100644 --- a/src/rai_semap/rai_semap/core/backend/sqlite_backend.py +++ b/src/rai_semap/rai_semap/core/backend/sqlite_backend.py @@ -239,7 +239,7 @@ def delete_annotation(self, annotation_id: str) -> bool: if rows_deleted > 0: logger.info(f"✓ Deleted annotation {annotation_id} from database") else: - logger.warn(f"✗ Annotation {annotation_id} not found for deletion") + logger.warning(f"✗ Annotation {annotation_id} not found for deletion") return rows_deleted > 0 @@ -333,7 +333,7 @@ def update_annotation(self, annotation: SemanticAnnotation) -> bool: if rows_updated > 0: logger.info(f"✓ Updated annotation {annotation.id} in database") else: - logger.warn(f"✗ Annotation {annotation.id} not found for update") + logger.warning(f"✗ Annotation {annotation.id} not found for update") return rows_updated > 0 except sqlite3.Error as e: diff --git a/src/rai_semap/rai_semap/ros2/detection_publisher.yaml b/src/rai_semap/rai_semap/ros2/config/detection_publisher.yaml similarity index 82% rename from src/rai_semap/rai_semap/ros2/detection_publisher.yaml rename to src/rai_semap/rai_semap/ros2/config/detection_publisher.yaml index 161bd4386..cb5f9d62d 100644 --- a/src/rai_semap/rai_semap/ros2/detection_publisher.yaml +++ b/src/rai_semap/rai_semap/ros2/config/detection_publisher.yaml @@ -1,14 +1,14 @@ detection_publisher: ros__parameters: - camera_topic: "/camera/image_raw" + camera_topic: "/camera/camera/color/image_raw" detection_topic: "/detection_array" dino_service: "/grounding_dino_classify" # Format: "class1:threshold1, class2, class3:threshold3" # Classes without explicit thresholds use default_class_threshold # Example: "person:0.5, cup, bottle:0.4, box" (person uses 0.5, cup uses default, bottle uses 0.4, box uses default) detection_classes: | - person, cup, bottle, box, bag, chair, table, shelf, door, window, couch, sofa, bed - default_class_threshold: 0.3 + person, cup, bottle, box, bag, chair, table, shelf, door, window, couch, sofa, bed, stove + default_class_threshold: 0.35 detection_interval: 2.0 box_threshold: 0.3 text_threshold: 0.25 diff --git a/src/rai_semap/rai_semap/ros2/config/node.yaml b/src/rai_semap/rai_semap/ros2/config/node.yaml new file mode 100644 index 000000000..c8b5b0965 --- /dev/null +++ b/src/rai_semap/rai_semap/ros2/config/node.yaml @@ -0,0 +1,33 @@ +rai_semap_node: + ros__parameters: + # Storage configuration + storage: + database_path: "semantic_map.db" + location_id: "default_location" + + # Detection filtering configuration + detection_filtering: + confidence_threshold: 0.5 + class_confidence_thresholds: "" + # Minimum bounding box area in pixels^2 to filter small false positives + min_bbox_area: 100.0 + + # Deduplication configuration + deduplication: + class_merge_thresholds: "" + use_pointcloud_dedup: true + + # Topic configuration + topics: + detection_topic: "/detection_array" + map_topic: "/map" + # Set these to your depth and camera info topics for point cloud extraction + # Leave empty if point cloud deduplication is not needed + depth_topic: "/camera/camera/depth/image_rect_raw" + camera_info_topic: "/camera/camera/color/camera_info" + + # Map/SLAM configuration + map: + map_frame_id: "map" + # OccupancyGrid resolution in meters/pixel (0.05 = 5 cm per pixel) + map_resolution: 0.05 diff --git a/src/rai_semap/rai_semap/ros2/config/perception_utils.yaml 
b/src/rai_semap/rai_semap/ros2/config/perception_utils.yaml new file mode 100644 index 000000000..648ae0526 --- /dev/null +++ b/src/rai_semap/rai_semap/ros2/config/perception_utils.yaml @@ -0,0 +1,7 @@ +perception_utils: + ros__parameters: + # Set these to your depth and camera info topics for 3D pose computation + # Leave empty if 3D poses are not needed + depth_topic: "/camera/camera/depth/image_rect_raw" + camera_info_topic: "/camera/camera/color/camera_info" + depth_fallback_region_size: 5 diff --git a/src/rai_semap/rai_semap/ros2/config/visualizer.yaml b/src/rai_semap/rai_semap/ros2/config/visualizer.yaml new file mode 100644 index 000000000..4dc01e992 --- /dev/null +++ b/src/rai_semap/rai_semap/ros2/config/visualizer.yaml @@ -0,0 +1,11 @@ +# Format: [r, g, b, a] where r, g, b are in range [0.0, 1.0] and a is alpha [0.0, 1.0] +# Default color +default_color: [0.5, 0.5, 0.5, 0.8] + +# Color mapping for specific object classes +class_colors: + bed: [0.2, 0.4, 0.8, 0.8] + chair: [0.2, 0.8, 0.4, 0.8] + door: [0.8, 0.6, 0.2, 0.8] + shelf: [0.8, 0.2, 0.6, 0.8] + table: [0.6, 0.2, 0.8, 0.8] diff --git a/src/rai_semap/rai_semap/ros2/detection_publisher.py b/src/rai_semap/rai_semap/ros2/detection_publisher.py index faab3729b..35563f4d4 100644 --- a/src/rai_semap/rai_semap/ros2/detection_publisher.py +++ b/src/rai_semap/rai_semap/ros2/detection_publisher.py @@ -47,14 +47,57 @@ def __init__(self): self.last_depth_image: Optional[Image] = None self.last_camera_info: Optional[CameraInfo] = None self.last_detection_time = 0.0 + self.last_log_time = 0.0 + self.log_interval = 5.0 # Log summary every 5 seconds def _initialize_parameters(self): """Initialize ROS2 parameters from YAML files.""" # Get directory containing this file current_dir = Path(__file__).parent + config_dir = current_dir / "config" + + # Declare config file path parameters first + config_params = [ + ( + "detection_publisher_config", + "", + ParameterType.PARAMETER_STRING, + "Path to detection_publisher YAML config file (empty = use default in config/)", + ), + ( + "perception_utils_config", + "", + ParameterType.PARAMETER_STRING, + "Path to perception_utils YAML config file (empty = use default in config/)", + ), + ] + for name, default, param_type, description in config_params: + self.declare_parameter( + name, + default, + descriptor=ParameterDescriptor( + type=param_type, description=description + ), + ) + + # Get config file paths + detection_pub_config_path = ( + self.get_parameter("detection_publisher_config") + .get_parameter_value() + .string_value + ) + perception_utils_config_path = ( + self.get_parameter("perception_utils_config") + .get_parameter_value() + .string_value + ) # Load detection_publisher parameters - detection_pub_yaml = current_dir / "detection_publisher.yaml" + if detection_pub_config_path: + detection_pub_yaml = Path(detection_pub_config_path) + else: + detection_pub_yaml = config_dir / "detection_publisher.yaml" + with open(detection_pub_yaml, "r") as f: detection_pub_config = yaml.safe_load(f) detection_pub_params = detection_pub_config.get("detection_publisher", {}).get( @@ -62,7 +105,11 @@ def _initialize_parameters(self): ) # Load perception_utils parameters - perception_utils_yaml = current_dir / "perception_utils.yaml" + if perception_utils_config_path: + perception_utils_yaml = Path(perception_utils_config_path) + else: + perception_utils_yaml = config_dir / "perception_utils.yaml" + with open(perception_utils_yaml, "r") as f: perception_utils_config = yaml.safe_load(f) perception_utils_params = 
perception_utils_config.get( @@ -73,7 +120,9 @@ def _initialize_parameters(self): parameters = [ ( "camera_topic", - detection_pub_params.get("camera_topic", "/camera/image_raw"), + detection_pub_params.get( + "camera_topic", "/camera/camera/color/image_raw" + ), ParameterType.PARAMETER_STRING, "Camera image topic to subscribe to", ), @@ -204,7 +253,7 @@ def _parse_detection_classes( class_names.append(class_name) class_thresholds[class_name] = threshold except ValueError: - self.get_logger().warn( + self.get_logger().warning( f"Invalid threshold value in '{item}', using default" ) class_names.append(class_name) @@ -222,7 +271,7 @@ def _initialize_clients(self): self.dino_client = self.create_client(RAIGroundingDino, dino_service) self.get_logger().info(f"Waiting for DINO service: {dino_service}") if not self.dino_client.wait_for_service(timeout_sec=10.0): - self.get_logger().warn(f"DINO service not available: {dino_service}") + self.get_logger().warning(f"DINO service not available: {dino_service}") else: self.get_logger().info(f"DINO service ready: {dino_service}") @@ -272,9 +321,12 @@ def _initialize_publishers(self): """Initialize ROS2 publishers.""" detection_topic = self._get_string_parameter("detection_topic") self.detection_publisher = self.create_publisher( - RAIDetectionArray, detection_topic, 10 + RAIDetectionArray, detection_topic, qos_profile_sensor_data + ) + self.get_logger().info( + f"Publishing to detection topic: {detection_topic} " + f"(QoS: reliability={qos_profile_sensor_data.reliability.name})" ) - self.get_logger().info(f"Publishing to detection topic: {detection_topic}") def depth_callback(self, msg: Image): """Store latest depth image.""" @@ -286,7 +338,7 @@ def camera_info_callback(self, msg: CameraInfo): def image_callback(self, msg: Image): """Process incoming camera image.""" - self.get_logger().info( + self.get_logger().debug( f"Received camera image (stamp: {msg.header.stamp.sec}.{msg.header.stamp.nanosec:09d}, " f"frame_id: {msg.header.frame_id})" ) @@ -302,13 +354,13 @@ def image_callback(self, msg: Image): return self.last_image = msg - self.get_logger().info("Processing camera image...") + self.get_logger().debug("Processing camera image...") self._process_image(msg) def _process_image(self, image_msg: Image): """Call DINO service and publish detections.""" if not self.dino_client.wait_for_service(timeout_sec=0.1): - self.get_logger().warn("DINO service not ready, skipping detection") + self.get_logger().warning("DINO service not ready, skipping detection") return detection_classes_str = self._get_string_parameter("detection_classes") @@ -335,9 +387,8 @@ def _process_image(self, image_msg: Image): request.box_threshold = box_threshold request.text_threshold = text_threshold - self.get_logger().info( - f"Calling DINO service with classes: {', '.join(class_names)} " - f"(box_threshold={box_threshold:.3f}, per-class thresholds: {class_thresholds})" + self.get_logger().debug( + f"Calling DINO service with {len(class_names)} classes (box_threshold={box_threshold:.3f})" ) future = self.dino_client.call_async(request) @@ -350,7 +401,7 @@ def _handle_dino_response(self, future, image_header): try: response = future.result() if response is None: - self.get_logger().warn("DINO service returned None") + self.get_logger().warning("DINO service returned None") return # Get class thresholds for filtering (set in _process_image) @@ -425,26 +476,39 @@ def _handle_dino_response(self, future, image_header): # Log detection details for debugging detection_count = 
len(detection_array.detections) + current_time = time.time() + should_log = current_time - self.last_log_time >= self.log_interval + if detection_count > 0: + # Log individual detections only at DEBUG level for i, det in enumerate(detection_array.detections): results_count = len(det.results) if det.results else 0 if results_count > 0: result = det.results[0] - self.get_logger().info( + self.get_logger().debug( f"Detection {i}: class={result.hypothesis.class_id}, " - f"score={result.hypothesis.score:.3f}, " - f"results_count={results_count}, " - f"frame_id={det.header.frame_id}" + f"score={result.hypothesis.score:.3f}" ) else: - self.get_logger().warn( + self.get_logger().warning( f"Detection {i} has no results! frame_id={det.header.frame_id}" ) - self.get_logger().info( - f"Published {detection_count} detections: {detection_array.detection_classes}" - ) + + # Throttled summary log + if should_log: + classes_found = [ + det.results[0].hypothesis.class_id + for det in detection_array.detections + if det.results and len(det.results) > 0 + ] + self.get_logger().info( + f"Published {detection_count} detections: {', '.join(set(classes_found))}" + ) + self.last_log_time = current_time else: - self.get_logger().info("No detections found in image") + if should_log: + self.get_logger().debug("No detections found in image") + self.last_log_time = current_time # Publish detections self.detection_publisher.publish(detection_array) diff --git a/src/rai_semap/rai_semap/ros2/node.py b/src/rai_semap/rai_semap/ros2/node.py index 8f8d05899..c09a2c123 100644 --- a/src/rai_semap/rai_semap/ros2/node.py +++ b/src/rai_semap/rai_semap/ros2/node.py @@ -13,10 +13,12 @@ # limitations under the License. import logging +from pathlib import Path from typing import Optional, Tuple # ROS2 core import rclpy +import yaml from cv_bridge import CvBridge # ROS2 geometry and transforms @@ -36,27 +38,7 @@ from rai_semap.core.backend.sqlite_backend import SQLiteBackend from rai_semap.core.semantic_map_memory import SemanticAnnotation, SemanticMapMemory from rai_semap.ros2.perception_utils import extract_pointcloud_from_bbox - - -# Configure Python logging to use ROS2 logger -class ROS2LogHandler(logging.Handler): - """Log handler that forwards Python logging to ROS2 logger.""" - - def __init__(self, ros2_node: Node): - super().__init__() - self.ros2_node = ros2_node - - def emit(self, record): - log_msg = self.format(record) - if record.levelno >= logging.ERROR: - self.ros2_node.get_logger().error(log_msg) - elif record.levelno >= logging.WARNING: - self.ros2_node.get_logger().warn(log_msg) - elif record.levelno >= logging.INFO: - self.ros2_node.get_logger().info(log_msg) - else: - self.ros2_node.get_logger().debug(log_msg) - +from rai_semap.utils.ros2_log import ROS2LogHandler # Constants DEFAULT_QUEUE_SIZE = 10 @@ -109,84 +91,123 @@ def __init__(self, database_path: Optional[str] = None): self._initialize_subscriptions() def _initialize_parameters(self): - """Initialize ROS2 parameters.""" - # Define all parameters with their defaults and descriptions + """Initialize ROS2 parameters from YAML config file.""" + current_dir = Path(__file__).parent + config_dir = current_dir / "config" + + # Declare config file path parameter + self.declare_parameter( + "node_config", + "", + descriptor=ParameterDescriptor( + type=ParameterType.PARAMETER_STRING, + description="Path to node YAML config file (empty = use default in config/)", + ), + ) + + # Get config file path + node_config_path = ( + 
self.get_parameter("node_config").get_parameter_value().string_value + ) + if node_config_path: + node_yaml = Path(node_config_path) + else: + node_yaml = config_dir / "node.yaml" + + # Load YAML config + with open(node_yaml, "r") as f: + node_config = yaml.safe_load(f) + node_params = node_config.get("rai_semap_node", {}).get("ros__parameters", {}) + + # Extract grouped parameters with defaults + storage = node_params.get("storage", {}) + detection_filtering = node_params.get("detection_filtering", {}) + deduplication = node_params.get("deduplication", {}) + topics = node_params.get("topics", {}) + map_config = node_params.get("map", {}) + + # Declare all parameters with descriptions parameters = [ + # Storage ( "database_path", - "semantic_map.db", + storage.get("database_path", "semantic_map.db"), ParameterType.PARAMETER_STRING, "Path to SQLite database file", ), + ( + "location_id", + storage.get("location_id", "default_location"), + ParameterType.PARAMETER_STRING, + "Identifier for the physical location", + ), + # Detection filtering ( "confidence_threshold", - 0.5, + detection_filtering.get("confidence_threshold", 0.5), ParameterType.PARAMETER_DOUBLE, "Minimum confidence score (0.0-1.0) for storing detections", ), ( "class_confidence_thresholds", - "", + detection_filtering.get("class_confidence_thresholds", ""), ParameterType.PARAMETER_STRING, "Class-specific thresholds as 'class1:threshold1,class2:threshold2' (e.g., 'person:0.7,window:0.6')", ), - ( - "class_merge_thresholds", - "", - ParameterType.PARAMETER_STRING, - "Class-specific merge radii (meters) for deduplication as 'class1:radius1,class2:radius2' (e.g., 'couch:2.5,table:1.5')", - ), ( "min_bbox_area", - 100.0, + detection_filtering.get("min_bbox_area", 100.0), ParameterType.PARAMETER_DOUBLE, "Minimum bounding box area (pixels^2) to filter small false positives", ), + # Deduplication + ( + "class_merge_thresholds", + deduplication.get("class_merge_thresholds", ""), + ParameterType.PARAMETER_STRING, + "Class-specific merge radii (meters) for deduplication as 'class1:radius1,class2:radius2' (e.g., 'couch:2.5,table:1.5')", + ), ( "use_pointcloud_dedup", - True, + deduplication.get("use_pointcloud_dedup", True), ParameterType.PARAMETER_BOOL, "Use point cloud features for improved deduplication matching", ), + # Topics ( - "depth_topic", - "", + "detection_topic", + topics.get("detection_topic", "/detection_array"), ParameterType.PARAMETER_STRING, - "Depth image topic (optional, for point cloud extraction)", + "Topic for RAIDetectionArray messages", ), ( - "camera_info_topic", - "", + "map_topic", + topics.get("map_topic", "/map"), ParameterType.PARAMETER_STRING, - "Camera info topic (optional, for point cloud extraction)", + "Topic for OccupancyGrid map messages", ), ( - "detection_topic", - "/detection_array", + "depth_topic", + topics.get("depth_topic", ""), ParameterType.PARAMETER_STRING, - "Topic for RAIDetectionArray messages", + "Depth image topic (optional, for point cloud extraction)", ), ( - "map_topic", - "/map", + "camera_info_topic", + topics.get("camera_info_topic", ""), ParameterType.PARAMETER_STRING, - "Topic for OccupancyGrid map messages", + "Camera info topic (optional, for point cloud extraction)", ), + # Map/SLAM ( "map_frame_id", - "map", + map_config.get("map_frame_id", "map"), ParameterType.PARAMETER_STRING, "Frame ID of the SLAM map", ), - ( - "location_id", - "default_location", - ParameterType.PARAMETER_STRING, - "Identifier for the physical location", - ), ( "map_resolution", - 0.05, + 
map_config.get("map_resolution", 0.05), ParameterType.PARAMETER_DOUBLE, "OccupancyGrid resolution (meters/pixel)", ), @@ -231,7 +252,7 @@ def _parse_class_thresholds(self): f"Class-specific threshold: {class_name.strip()}={threshold:.3f}" ) except ValueError: - self.get_logger().warn( + self.get_logger().warning( f"Invalid threshold value in '{item}', skipping" ) @@ -251,7 +272,7 @@ def _parse_class_merge_thresholds(self): f"Class-specific merge radius: {class_name.strip()}={radius:.2f}m" ) except ValueError: - self.get_logger().warn( + self.get_logger().warning( f"Invalid merge radius value in '{item}', skipping" ) @@ -284,7 +305,11 @@ def _initialize_subscriptions(self): RAIDetectionArray, detection_topic, self.detection_callback, - DEFAULT_QUEUE_SIZE, + qos_profile_sensor_data, + ) + self.get_logger().info( + f"Subscribed to detection topic: {detection_topic} " + f"(QoS: reliability={qos_profile_sensor_data.reliability.name})" ) self.map_subscription = self.create_subscription( OccupancyGrid, map_topic, self.map_callback, DEFAULT_QUEUE_SIZE @@ -356,12 +381,13 @@ def _extract_pointcloud_from_bbox( ) if result is None: - self.get_logger().warn("Failed to extract point cloud from bbox") + self.get_logger().warning("Failed to extract point cloud from bbox") return result def detection_callback(self, msg: RAIDetectionArray): """Process detection array and store annotations.""" + self.get_logger().debug("Entering detection_callback") confidence_threshold = self._get_double_parameter("confidence_threshold") map_frame_id = self._get_string_parameter("map_frame_id") @@ -381,7 +407,7 @@ def detection_callback(self, msg: RAIDetectionArray): f"frame_id={det.header.frame_id}" ) else: - self.get_logger().warn(f" Detection {i} has no results!") + self.get_logger().warning(f" Detection {i} has no results!") timestamp = rclpy.time.Time.from_msg(msg.header.stamp) detection_source = msg.header.frame_id or "unknown" @@ -389,6 +415,10 @@ def detection_callback(self, msg: RAIDetectionArray): stored_count = 0 skipped_count = 0 default_frame_id = msg.header.frame_id + self.get_logger().debug( + f"Processing {len(msg.detections)} detections from source={detection_source}, " + f"confidence_threshold={confidence_threshold}" + ) for detection in msg.detections: if self._process_detection( detection, @@ -407,7 +437,7 @@ def detection_callback(self, msg: RAIDetectionArray): f"Stored {stored_count} annotations, skipped {skipped_count} (low confidence or transform failed)" ) elif len(msg.detections) > 0: - self.get_logger().warn( + self.get_logger().warning( f"Received {len(msg.detections)} detections but none were stored " f"(confidence threshold: {confidence_threshold})" ) @@ -426,6 +456,9 @@ def _process_detection( Returns: True if annotation was stored, False otherwise. """ + self.get_logger().debug( + f"Entering _process_detection: source={detection_source}" + ) if not detection.results: self.get_logger().debug("Detection has no results, skipping") return False @@ -466,7 +499,7 @@ def _process_detection( # Use detection frame_id, fallback to message header frame_id if empty source_frame = detection.header.frame_id or default_frame_id if not source_frame: - self.get_logger().warn( + self.get_logger().warning( f"Detection has no frame_id (detection.frame_id='{detection.header.frame_id}', " f"default_frame_id='{default_frame_id}'), skipping" ) @@ -483,7 +516,7 @@ def _process_detection( ) if pose_is_empty: - self.get_logger().warn( + self.get_logger().warning( f"Detection for {object_class} has empty pose (0,0,0). 
" f"GroundingDINO provides 2D bounding boxes but no 3D pose. " f"Cannot store annotation without 3D position. " @@ -510,7 +543,7 @@ def _process_detection( f"z={pose_in_map_frame.position.z:.3f}" ) except Exception as e: - self.get_logger().warn( + self.get_logger().warning( f"Failed to transform pose from {source_frame} to {map_frame_id}: {e}" ) return False @@ -554,7 +587,7 @@ def _process_detection( f"{pc_centroid_map.position.z:.2f})" ) except Exception as e: - self.get_logger().warn( + self.get_logger().warning( f"Failed to transform point cloud centroid: {e}" ) @@ -680,7 +713,7 @@ def map_callback(self, msg: OccupancyGrid): map_frame_id = self._get_string_parameter("map_frame_id") if msg.header.frame_id != map_frame_id: - self.get_logger().warn( + self.get_logger().warning( f"Map frame_id mismatch: expected {map_frame_id}, got {msg.header.frame_id}" ) diff --git a/src/rai_semap/rai_semap/ros2/perception_utils.yaml b/src/rai_semap/rai_semap/ros2/perception_utils.yaml deleted file mode 100644 index f43d2b656..000000000 --- a/src/rai_semap/rai_semap/ros2/perception_utils.yaml +++ /dev/null @@ -1,5 +0,0 @@ -perception_utils: - ros__parameters: - depth_topic: "" - camera_info_topic: "" - depth_fallback_region_size: 5 diff --git a/src/rai_semap/rai_semap/ros2/visualizer.py b/src/rai_semap/rai_semap/ros2/visualizer.py new file mode 100644 index 000000000..3d21a0556 --- /dev/null +++ b/src/rai_semap/rai_semap/ros2/visualizer.py @@ -0,0 +1,350 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +from pathlib import Path +from typing import Dict + +import rclpy +import yaml +from builtin_interfaces.msg import Duration +from geometry_msgs.msg import Point +from rcl_interfaces.msg import ParameterDescriptor, ParameterType +from rclpy.node import Node +from std_msgs.msg import ColorRGBA +from visualization_msgs.msg import Marker, MarkerArray + +from rai_semap.core.backend.sqlite_backend import SQLiteBackend +from rai_semap.core.semantic_map_memory import SemanticMapMemory +from rai_semap.utils.ros2_log import ROS2LogHandler + + +class SemanticMapVisualizer(Node): + """ROS2 node for visualizing semantic map annotations in RViz2.""" + + def __init__(self): + super().__init__("semantic_map_visualizer") + + handler = ROS2LogHandler(self) + handler.setLevel(logging.DEBUG) + python_logger = logging.getLogger("rai_semap") + python_logger.setLevel(logging.DEBUG) + python_logger.handlers.clear() + python_logger.addHandler(handler) + python_logger.propagate = False + + self._initialize_parameters() + self._initialize_semap_memory() + self._initialize_publisher() + self._initialize_timer() + + self.class_colors, self.default_color = self._generate_class_colors() + + def _initialize_parameters(self): + """Initialize ROS2 parameters.""" + parameters = [ + ( + "database_path", + "semantic_map.db", + ParameterType.PARAMETER_STRING, + "Path to SQLite database file", + ), + ( + "location_id", + "rosbot_xl_demo", + ParameterType.PARAMETER_STRING, + "Location identifier to query", + ), + ( + "map_frame_id", + "map", + ParameterType.PARAMETER_STRING, + "Frame ID of the SLAM map", + ), + ( + "map_resolution", + 0.05, + ParameterType.PARAMETER_DOUBLE, + "OccupancyGrid resolution (meters/pixel)", + ), + ( + "marker_topic", + "/semantic_map_markers", + ParameterType.PARAMETER_STRING, + "Topic for publishing MarkerArray messages", + ), + ( + "update_rate", + 1.0, + ParameterType.PARAMETER_DOUBLE, + "Rate (Hz) for updating markers", + ), + ( + "marker_scale", + 0.3, + ParameterType.PARAMETER_DOUBLE, + "Scale factor for marker size", + ), + ( + "show_text_labels", + True, + ParameterType.PARAMETER_BOOL, + "Whether to show text labels with object class names", + ), + ( + "marker_lifetime", + 0.0, + ParameterType.PARAMETER_DOUBLE, + "Marker lifetime in seconds (0 = never expire)", + ), + ( + "class_colors_config", + "", + ParameterType.PARAMETER_STRING, + "Path to YAML file with class color definitions (empty = use default in config/)", + ), + ] + + for name, default, param_type, description in parameters: + self.declare_parameter( + name, + default, + descriptor=ParameterDescriptor( + type=param_type, description=description + ), + ) + + def _get_string_parameter(self, name: str) -> str: + """Get string parameter value.""" + return self.get_parameter(name).get_parameter_value().string_value + + def _get_double_parameter(self, name: str) -> float: + """Get double parameter value.""" + return self.get_parameter(name).get_parameter_value().double_value + + def _get_bool_parameter(self, name: str) -> bool: + """Get bool parameter value.""" + return self.get_parameter(name).get_parameter_value().bool_value + + def _initialize_semap_memory(self): + """Initialize semantic map memory backend.""" + database_path = self._get_string_parameter("database_path") + location_id = self._get_string_parameter("location_id") + map_frame_id = self._get_string_parameter("map_frame_id") + map_resolution = self._get_double_parameter("map_resolution") + + backend = SQLiteBackend(database_path) + self.memory = 
SemanticMapMemory( + backend=backend, + location_id=location_id, + map_frame_id=map_frame_id, + resolution=map_resolution, + ) + self.get_logger().info( + f"Initialized semantic map memory: location_id={location_id}, " + f"map_frame_id={map_frame_id}, database_path={database_path}" + ) + + def _initialize_publisher(self): + """Initialize marker publisher.""" + marker_topic = self._get_string_parameter("marker_topic") + self.marker_publisher = self.create_publisher(MarkerArray, marker_topic, 10) + self.get_logger().info(f"Publishing markers to: {marker_topic}") + + def _initialize_timer(self): + """Initialize update timer.""" + update_rate = self._get_double_parameter("update_rate") + timer_period = 1.0 / update_rate if update_rate > 0 else 1.0 + self.timer = self.create_timer(timer_period, self._update_markers) + self.get_logger().info(f"Update rate: {update_rate} Hz") + + def _generate_class_colors(self) -> tuple[Dict[str, ColorRGBA], ColorRGBA]: + """Load color map for object classes from YAML config.""" + config_path = self._get_string_parameter("class_colors_config") + + if config_path: + yaml_path = Path(config_path) + else: + current_dir = Path(__file__).parent + yaml_path = current_dir / "config" / "visualizer.yaml" + + default_color = ColorRGBA(r=0.5, g=0.5, b=0.5, a=0.8) + colors = {} + if yaml_path.exists(): + try: + with open(yaml_path, "r") as f: + config = yaml.safe_load(f) + + default_color_list = config.get("default_color", [0.5, 0.5, 0.5, 0.8]) + default_color = ColorRGBA( + r=default_color_list[0], + g=default_color_list[1], + b=default_color_list[2], + a=default_color_list[3] if len(default_color_list) > 3 else 0.8, + ) + + class_colors_config = config.get("class_colors", {}) + for class_name, color_value in class_colors_config.items(): + if isinstance(color_value, list): + colors[class_name] = ColorRGBA( + r=color_value[0], + g=color_value[1], + b=color_value[2], + a=color_value[3] if len(color_value) > 3 else 0.8, + ) + else: + colors[class_name] = ColorRGBA( + r=color_value.get("r", 0.5), + g=color_value.get("g", 0.5), + b=color_value.get("b", 0.5), + a=color_value.get("a", 0.8), + ) + self.get_logger().info( + f"Loaded {len(colors)} class colors from {yaml_path}" + ) + except Exception as e: + self.get_logger().warning( + f"Failed to load class colors from {yaml_path}: {e}" + ) + else: + self.get_logger().warning( + f"Class colors config file not found: {yaml_path}" + ) + + return colors, default_color + + def _get_class_color(self, object_class: str) -> ColorRGBA: + """Get color for object class, with fallback for unknown classes.""" + if object_class in self.class_colors: + return self.class_colors[object_class] + return self.default_color + + def _create_sphere_marker(self, annotation, marker_id: int, scale: float) -> Marker: + """Create a sphere marker for an annotation.""" + marker = Marker() + marker.header.frame_id = self._get_string_parameter("map_frame_id") + marker.header.stamp = self.get_clock().now().to_msg() + marker.ns = annotation.object_class + marker.id = marker_id + marker.type = Marker.SPHERE + marker.action = Marker.ADD + + marker.pose = annotation.pose + marker.scale.x = scale + marker.scale.y = scale + marker.scale.z = scale + + color = self._get_class_color(annotation.object_class) + color.a = 0.6 + 0.4 * annotation.confidence + marker.color = color + + marker_lifetime = self._get_double_parameter("marker_lifetime") + if marker_lifetime > 0: + lifetime_duration = Duration() + lifetime_duration.sec = int(marker_lifetime) + 
lifetime_duration.nanosec = int( + (marker_lifetime - int(marker_lifetime)) * 1e9 + ) + marker.lifetime = lifetime_duration + + return marker + + def _create_text_marker(self, annotation, marker_id: int, scale: float) -> Marker: + """Create a text marker for an annotation label.""" + marker = Marker() + marker.header.frame_id = self._get_string_parameter("map_frame_id") + marker.header.stamp = self.get_clock().now().to_msg() + marker.ns = f"{annotation.object_class}_text" + marker.id = marker_id + marker.type = Marker.TEXT_VIEW_FACING + marker.action = Marker.ADD + + marker.pose = annotation.pose + marker.pose.position.z += scale * 0.5 + marker.scale.z = scale * 0.3 + + marker.color = ColorRGBA(r=1.0, g=1.0, b=1.0, a=1.0) + marker.text = f"{annotation.object_class}\n{annotation.confidence:.2f}" + + marker_lifetime = self._get_double_parameter("marker_lifetime") + if marker_lifetime > 0: + lifetime_duration = Duration() + lifetime_duration.sec = int(marker_lifetime) + lifetime_duration.nanosec = int( + (marker_lifetime - int(marker_lifetime)) * 1e9 + ) + marker.lifetime = lifetime_duration + + return marker + + def _update_markers(self): + """Query database and publish markers.""" + location_id = self._get_string_parameter("location_id") + center = Point(x=0.0, y=0.0, z=0.0) + + try: + annotations = self.memory.query_by_location( + center, radius=1e10, location_id=location_id + ) + except Exception as e: + self.get_logger().error(f"Failed to query annotations: {e}") + return + + if not annotations: + self.get_logger().debug("No annotations found") + marker_array = MarkerArray() + marker_array.markers = [] + self.marker_publisher.publish(marker_array) + return + + marker_array = MarkerArray() + marker_scale = self._get_double_parameter("marker_scale") + show_text = self._get_bool_parameter("show_text_labels") + + marker_id = 0 + for annotation in annotations: + sphere_marker = self._create_sphere_marker( + annotation, marker_id, marker_scale + ) + marker_array.markers.append(sphere_marker) + marker_id += 1 + + if show_text: + text_marker = self._create_text_marker( + annotation, marker_id, marker_scale + ) + marker_array.markers.append(text_marker) + marker_id += 1 + + self.marker_publisher.publish(marker_array) + self.get_logger().debug( + f"Published {len(annotations)} annotations as {len(marker_array.markers)} markers" + ) + + +def main(args=None): + """Main entry point for the semantic map visualizer.""" + rclpy.init(args=args) + node = SemanticMapVisualizer() + try: + rclpy.spin(node) + except KeyboardInterrupt: + pass + finally: + node.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/examples/rosbot-xl-navigate-collect.py b/src/rai_semap/rai_semap/scripts/navigate_collect.py similarity index 75% rename from examples/rosbot-xl-navigate-collect.py rename to src/rai_semap/rai_semap/scripts/navigate_collect.py index 7627e2a0c..adfa15685 100644 --- a/examples/rosbot-xl-navigate-collect.py +++ b/src/rai_semap/rai_semap/scripts/navigate_collect.py @@ -13,6 +13,7 @@ # limitations under the License. 
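
The SemanticMapVisualizer added above is driven entirely by ROS2 parameters, so it can be run standalone against an existing database. A minimal invocation sketch, assuming the package is importable and the node is started as a module (the patch adds no dedicated entry point for it); the parameter names and the /semantic_map_markers default come from the declarations in the file:

    python -m rai_semap.ros2.visualizer --ros-args \
        -p database_path:=semantic_map.db \
        -p location_id:=rosbot_xl_demo
    # In RViz2, add a MarkerArray display subscribed to /semantic_map_markers.
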
import argparse +import logging import time from typing import List @@ -23,6 +24,8 @@ from rai.tools.time import WaitForSecondsTool from rclpy.action import ActionClient +from rai_semap.utils.ros2_log import ROS2LogHandler + class NavigationCollector: """Navigate robot and collect detections for semantic map validation.""" @@ -46,39 +49,30 @@ def wait_for_nav_action_server(self, timeout_sec: float = 60.0) -> bool: return True node = self.connector.node - node.get_logger().info("Waiting for Nav2 action server to be available...") - node.get_logger().info("Checking action server at: navigate_to_pose") + node.get_logger().info("Waiting for Nav2 action server...") # Try different possible action names action_names = ["navigate_to_pose", "/navigate_to_pose"] for action_name in action_names: - node.get_logger().info(f"Trying action name: {action_name}") action_client = ActionClient(node, NavigateToPose, action_name) start_time = time.time() while time.time() - start_time < timeout_sec: if action_client.wait_for_server(timeout_sec=2.0): node.get_logger().info( - f"Nav2 action server is ready at: {action_name}" + f"Nav2 action server ready at: {action_name}" ) self._nav_action_ready = True return True elapsed = time.time() - start_time - if int(elapsed) % 5 == 0 and elapsed > 0: - node.get_logger().info( - f"Still waiting for Nav2 action server... ({elapsed:.1f}s)" - ) + if int(elapsed) % 10 == 0 and elapsed > 0: + node.get_logger().info(f"Waiting for Nav2... ({elapsed:.1f}s)") node.get_logger().error( - f"Nav2 action server not available after {timeout_sec} seconds" - ) - node.get_logger().error("Make sure Nav2 is launched and running. Check:") - node.get_logger().error( - " 1. Is the launch file running? (ros2 launch examples/rosbot-xl-semap.launch.py ...)" + f"Nav2 action server not available after {timeout_sec}s. " + "Check: ros2 action list | grep navigate" ) - node.get_logger().error(" 2. Check: ros2 action list | grep navigate") - node.get_logger().error(" 3. Check Nav2 logs for errors") return False def navigate_to_waypoints(self, waypoints: List[tuple]) -> None: @@ -107,7 +101,7 @@ def navigate_to_waypoints(self, waypoints: List[tuple]) -> None: continue node.get_logger().info( - f"Navigating to waypoint {i + 1}/{len(waypoints)}: ({x}, {y}, yaw={yaw})" + f"Waypoint {i + 1}/{len(waypoints)}: ({x:.1f}, {y:.1f})" ) # Use Nav2Toolkit to navigate @@ -120,17 +114,13 @@ def navigate_to_waypoints(self, waypoints: List[tuple]) -> None: if navigate_tool: try: - result = navigate_tool.invoke( - {"x": x, "y": y, "z": 0.0, "yaw": yaw} - ) - node.get_logger().info(f"Navigation result: {result}") + navigate_tool.invoke({"x": x, "y": y, "z": 0.0, "yaw": yaw}) except Exception as e: node.get_logger().warn(f"Navigation failed: {e}") else: node.get_logger().warn("Navigate tool not found, skipping waypoint") # Wait at waypoint to allow detections to be collected - node.get_logger().info("Waiting at waypoint for detections...") self.wait_tool.invoke({"seconds": 5.0}) node.get_logger().info("Navigation complete") @@ -142,19 +132,18 @@ def collect_detections(self, duration_seconds: float = 30.0) -> None: duration_seconds: How long to collect detections. """ node = self.connector.node - node.get_logger().info( - f"Collecting detections for {duration_seconds} seconds..." 
- ) + node.get_logger().info(f"Collecting detections for {duration_seconds}s...") start_time = time.time() while time.time() - start_time < duration_seconds: self.wait_tool.invoke({"seconds": 2.0}) elapsed = time.time() - start_time - node.get_logger().info( - f"Collecting... {elapsed:.1f}/{duration_seconds} seconds" - ) + if int(elapsed) % 5 == 0: + node.get_logger().info( + f"Collecting... {elapsed:.0f}/{duration_seconds}s" + ) - node.get_logger().info("Detection collection complete") + node.get_logger().info("Collection complete") def main(): @@ -172,7 +161,7 @@ def main(): "--collect-duration", type=float, default=10.0, - help="Duration to collect detections at each waypoint (seconds)", + help="Duration to collect detections at final location (seconds). Detections are collected in real-time during navigation, so this is typically minimal (default: 2.0)", ) parser.add_argument( "--use-sim-time", @@ -198,15 +187,26 @@ def main(): use_sim_time=args.use_sim_time, ) + # Configure Python logging to forward to ROS2 logger + handler = ROS2LogHandler(connector.node) + handler.setLevel(logging.DEBUG) + python_logger = logging.getLogger("rai_semap") + python_logger.setLevel(logging.DEBUG) + python_logger.handlers.clear() + python_logger.addHandler(handler) + python_logger.propagate = False + collector = NavigationCollector(connector) # Navigate to waypoints collector.navigate_to_waypoints(waypoints) - # Final collection period - collector.collect_detections(duration_seconds=args.collect_duration) + # Brief wait to ensure final detections are processed + # (detections are collected in real-time during navigation, so this is minimal) + if args.collect_duration > 0: + collector.collect_detections(duration_seconds=args.collect_duration) - connector.node.get_logger().info("Navigation and collection complete") + connector.node.get_logger().info("Navigation completed") except KeyboardInterrupt: pass diff --git a/src/rai_semap/rai_semap/scripts/semap.launch.py b/src/rai_semap/rai_semap/scripts/semap.launch.py new file mode 100644 index 000000000..ef7f0a318 --- /dev/null +++ b/src/rai_semap/rai_semap/scripts/semap.launch.py @@ -0,0 +1,90 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
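
The launch description below only declares three optional config-path arguments and shells out to the two nodes via python -m. A usage sketch, assuming the launch file is invoked by path rather than through an installed package share directory:

    ros2 launch src/rai_semap/rai_semap/scripts/semap.launch.py \
        node_config:=/path/to/node.yaml \
        detection_publisher_config:=/path/to/detection_publisher.yaml \
        perception_utils_config:=/path/to/perception_utils.yaml
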
+ +from launch import LaunchDescription +from launch.actions import DeclareLaunchArgument, ExecuteProcess, OpaqueFunction + + +def generate_detection_publisher_cmd(context): + """Generate detection publisher command with conditional parameters.""" + detection_publisher_config = context.launch_configurations.get( + "detection_publisher_config", "" + ) + perception_utils_config = context.launch_configurations.get( + "perception_utils_config", "" + ) + + cmd = [ + "python", + "-m", + "rai_semap.ros2.detection_publisher", + "--ros-args", + ] + + # Add config file parameters only if provided + if detection_publisher_config: + cmd.extend(["-p", f"detection_publisher_config:={detection_publisher_config}"]) + if perception_utils_config: + cmd.extend(["-p", f"perception_utils_config:={perception_utils_config}"]) + + return [ExecuteProcess(cmd=cmd, output="screen")] + + +def generate_semap_cmd(context): + """Generate semantic map node command with conditional parameters.""" + node_config = context.launch_configurations.get("node_config", "") + + cmd = [ + "python", + "-m", + "rai_semap.ros2.node", + "--ros-args", + ] + + # Add config file parameter only if provided + if node_config: + cmd.extend(["-p", f"node_config:={node_config}"]) + + return [ExecuteProcess(cmd=cmd, output="screen")] + + +def generate_launch_description(): + # Declare launch arguments + node_config_arg = DeclareLaunchArgument( + "node_config", + default_value="", + description="Path to node YAML config file (empty = use default in config/)", + ) + + detection_publisher_config_arg = DeclareLaunchArgument( + "detection_publisher_config", + default_value="", + description="Path to detection_publisher YAML config file (empty = use default in config/)", + ) + + perception_utils_config_arg = DeclareLaunchArgument( + "perception_utils_config", + default_value="", + description="Path to perception_utils YAML config file (empty = use default in config/)", + ) + + return LaunchDescription( + [ + node_config_arg, + detection_publisher_config_arg, + perception_utils_config_arg, + OpaqueFunction(function=generate_detection_publisher_cmd), + OpaqueFunction(function=generate_semap_cmd), + ] + ) diff --git a/examples/validate-semantic-map.py b/src/rai_semap/rai_semap/scripts/validate_semap.py similarity index 99% rename from examples/validate-semantic-map.py rename to src/rai_semap/rai_semap/scripts/validate_semap.py index ed315d403..de4d0c88b 100644 --- a/examples/validate-semantic-map.py +++ b/src/rai_semap/rai_semap/scripts/validate_semap.py @@ -152,7 +152,7 @@ def main(): parser.add_argument( "--location-id", type=str, - default="rosbot_xl_demo", + default="default_location", help="Location identifier to query", ) diff --git a/src/rai_semap/rai_semap/utils/ros2_log.py b/src/rai_semap/rai_semap/utils/ros2_log.py new file mode 100644 index 000000000..f402c25ec --- /dev/null +++ b/src/rai_semap/rai_semap/utils/ros2_log.py @@ -0,0 +1,36 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
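
ROS2LogHandler below replaces the copy that previously lived in ros2/node.py, and both the visualizer and navigate_collect wire it up the same way. The typical setup, as used elsewhere in this patch:

    import logging
    from rai_semap.utils.ros2_log import ROS2LogHandler

    handler = ROS2LogHandler(node)  # node is any rclpy Node instance
    handler.setLevel(logging.DEBUG)
    logger = logging.getLogger("rai_semap")
    logger.setLevel(logging.DEBUG)
    logger.handlers.clear()
    logger.addHandler(handler)
    logger.propagate = False
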
+ +import logging + +from rclpy.node import Node + + +class ROS2LogHandler(logging.Handler): + """Log handler that forwards Python logging to ROS2 logger.""" + + def __init__(self, ros2_node: Node): + super().__init__() + self.ros2_node = ros2_node + + def emit(self, record): + log_msg = self.format(record) + if record.levelno >= logging.ERROR: + self.ros2_node.get_logger().error(log_msg) + elif record.levelno >= logging.WARNING: + self.ros2_node.get_logger().warning(log_msg) + elif record.levelno >= logging.INFO: + self.ros2_node.get_logger().info(log_msg) + else: + self.ros2_node.get_logger().debug(log_msg) diff --git a/tests/rai_semap/conftest.py b/tests/rai_semap/conftest.py index c5cd53b71..457e81be8 100644 --- a/tests/rai_semap/conftest.py +++ b/tests/rai_semap/conftest.py @@ -18,6 +18,7 @@ from typing import Optional import pytest +import rclpy from geometry_msgs.msg import Pose # Add src/rai_semap to Python path @@ -33,6 +34,14 @@ TEST_BASE_TIMESTAMP = 1234567890 +@pytest.fixture(scope="module") +def ros2_context(): + """Initialize ROS2 context for testing.""" + rclpy.init() + yield + rclpy.shutdown() + + @pytest.fixture def temp_db_path(): """Create a temporary database path for testing.""" diff --git a/tests/rai_semap/test_detection_publisher.py b/tests/rai_semap/test_detection_publisher.py new file mode 100644 index 000000000..361c49778 --- /dev/null +++ b/tests/rai_semap/test_detection_publisher.py @@ -0,0 +1,104 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
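
The tests below pin down the detection-class string format accepted by DetectionPublisher._parse_detection_classes: a comma-separated list where each entry may carry an optional per-class threshold, and unlisted classes fall back to default_class_threshold. A worked example matching the assertions below:

    classes_str = "person:0.7, cup, bottle:0.4"   # default_class_threshold = 0.3
    # class_names      -> {"person", "cup", "bottle"}
    # class_thresholds -> {"person": 0.7, "cup": 0.3, "bottle": 0.4}
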
+ +import pytest +import rclpy + +from rai_semap.ros2.detection_publisher import DetectionPublisher + + +def set_parameter(node, name: str, value, param_type): + """Helper to set a single parameter on a node.""" + node.set_parameters([rclpy.parameter.Parameter(name, param_type, value)]) + + +@pytest.fixture +def detection_publisher(ros2_context): + """Create a DetectionPublisher instance for testing.""" + node = DetectionPublisher() + yield node + node.destroy_node() + + +def test_detection_publisher_initialization(detection_publisher): + """Test that DetectionPublisher initializes correctly.""" + assert detection_publisher is not None + assert detection_publisher.get_name() == "detection_publisher" + assert detection_publisher.bridge is not None + assert detection_publisher.last_image is None + assert detection_publisher.last_depth_image is None + assert detection_publisher.last_camera_info is None + assert detection_publisher.last_detection_time == 0.0 + + +def test_parse_detection_classes_basic(detection_publisher): + """Test parsing detection classes with basic format.""" + set_parameter( + detection_publisher, + "default_class_threshold", + 0.3, + rclpy.parameter.Parameter.Type.DOUBLE, + ) + + classes_str = "person, cup, bottle" + class_names, class_thresholds = detection_publisher._parse_detection_classes( + classes_str + ) + + assert len(class_names) == 3 + assert set(class_names) == {"person", "cup", "bottle"} + assert all(class_thresholds[cls] == 0.3 for cls in class_names) + + +def test_parse_detection_classes_with_thresholds(detection_publisher): + """Test parsing detection classes with explicit thresholds.""" + set_parameter( + detection_publisher, + "default_class_threshold", + 0.3, + rclpy.parameter.Parameter.Type.DOUBLE, + ) + + classes_str = "person:0.7, cup, bottle:0.4" + class_names, class_thresholds = detection_publisher._parse_detection_classes( + classes_str + ) + + assert len(class_names) == 3 + assert set(class_names) == {"person", "cup", "bottle"} + assert class_thresholds["person"] == 0.7 + assert class_thresholds["cup"] == 0.3 + assert class_thresholds["bottle"] == 0.4 + + +def test_get_string_parameter(detection_publisher): + """Test getting string parameter.""" + set_parameter( + detection_publisher, + "camera_topic", + "/test/camera", + rclpy.parameter.Parameter.Type.STRING, + ) + assert detection_publisher._get_string_parameter("camera_topic") == "/test/camera" + + +def test_get_double_parameter(detection_publisher): + """Test getting double parameter.""" + set_parameter( + detection_publisher, + "default_class_threshold", + 0.5, + rclpy.parameter.Parameter.Type.DOUBLE, + ) + assert detection_publisher._get_double_parameter("default_class_threshold") == 0.5 diff --git a/tests/rai_semap/test_perception_utils.py b/tests/rai_semap/test_perception_utils.py new file mode 100644 index 000000000..b7fbc68f6 --- /dev/null +++ b/tests/rai_semap/test_perception_utils.py @@ -0,0 +1,115 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
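
The fixtures below build a synthetic pinhole camera (fx = fy = 500, cx = 320, cy = 240) and a constant 16UC1 depth image filled with 1000. Assuming the utility applies the standard pinhole back-projection and reads 16UC1 depths as millimetres, the bbox centre at pixel (320, 240) lands on the optical axis:

    X = (u - cx) * Z / fx = (320 - 320) * 1.0 / 500 = 0.0
    Y = (v - cy) * Z / fy = (240 - 240) * 1.0 / 500 = 0.0
    Z = 1000 mm = 1.0 m

which is why test_compute_3d_pose_from_bbox asserts only pose.position.z > 0 and an identity orientation rather than exact coordinates.
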
+ +import numpy as np +import pytest +from cv_bridge import CvBridge +from sensor_msgs.msg import CameraInfo +from std_msgs.msg import Header +from vision_msgs.msg import BoundingBox2D, Detection2D + +from rai_semap.ros2.perception_utils import ( + compute_3d_pose_from_bbox, + extract_pointcloud_from_bbox, +) + + +@pytest.fixture +def bridge(): + """Create a CvBridge instance.""" + return CvBridge() + + +@pytest.fixture +def camera_info(): + """Create a basic camera info message.""" + info = CameraInfo() + info.width = 640 + info.height = 480 + info.k = [500.0, 0.0, 320.0, 0.0, 500.0, 240.0, 0.0, 0.0, 1.0] + return info + + +@pytest.fixture +def depth_image(bridge): + """Create a depth image message.""" + depth_array = np.ones((480, 640), dtype=np.uint16) * 1000 + depth_msg = bridge.cv2_to_imgmsg(depth_array, encoding="16UC1") + depth_msg.header = Header() + depth_msg.header.frame_id = "camera_frame" + return depth_msg + + +@pytest.fixture +def detection2d(): + """Create a Detection2D message.""" + from geometry_msgs.msg import Point + + detection = Detection2D() + detection.bbox = BoundingBox2D() + detection.bbox.center.position = Point(x=320.0, y=240.0, z=0.0) + detection.bbox.size_x = 100.0 + detection.bbox.size_y = 80.0 + return detection + + +def test_compute_3d_pose_from_bbox(bridge, camera_info, depth_image): + """Test computing 3D pose from bounding box center.""" + bbox_center_x = 320.0 + bbox_center_y = 240.0 + + pose = compute_3d_pose_from_bbox( + bbox_center_x, bbox_center_y, depth_image, camera_info, bridge + ) + + assert pose is not None + assert pose.position.z > 0 + assert pose.orientation.w == 1.0 + + +def test_compute_3d_pose_from_bbox_out_of_bounds(bridge, camera_info, depth_image): + """Test computing 3D pose with out-of-bounds coordinates.""" + bbox_center_x = 1000.0 + bbox_center_y = 1000.0 + + pose = compute_3d_pose_from_bbox( + bbox_center_x, bbox_center_y, depth_image, camera_info, bridge + ) + + assert pose is None + + +def test_extract_pointcloud_from_bbox(bridge, camera_info, depth_image, detection2d): + """Test extracting point cloud from bounding box.""" + result = extract_pointcloud_from_bbox(detection2d, depth_image, camera_info, bridge) + + assert result is not None + centroid, size, point_count = result + assert point_count > 0 + assert centroid.x is not None + assert centroid.y is not None + assert centroid.z is not None + assert size >= 0 + + +def test_extract_pointcloud_from_bbox_empty_depth(bridge, camera_info, detection2d): + """Test extracting point cloud with empty depth image.""" + depth_array = np.zeros((480, 640), dtype=np.uint16) + depth_msg = bridge.cv2_to_imgmsg(depth_array, encoding="16UC1") + depth_msg.header = Header() + depth_msg.header.frame_id = "camera_frame" + + result = extract_pointcloud_from_bbox(detection2d, depth_msg, camera_info, bridge) + + assert result is None diff --git a/tests/rai_semap/test_visualizer.py b/tests/rai_semap/test_visualizer.py new file mode 100644 index 000000000..b6cea38b3 --- /dev/null +++ b/tests/rai_semap/test_visualizer.py @@ -0,0 +1,144 @@ +# Copyright (C) 2025 Julia Jia +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tempfile +from pathlib import Path + +import pytest +import rclpy +import yaml +from std_msgs.msg import ColorRGBA + +from rai_semap.ros2.visualizer import SemanticMapVisualizer + + +def set_parameter(node, name: str, value, param_type): + """Helper to set a single parameter on a node.""" + node.set_parameters([rclpy.parameter.Parameter(name, param_type, value)]) + + +@pytest.fixture +def temp_config_file(): + """Create a temporary config file for testing.""" + config_data = { + "default_color": [0.5, 0.5, 0.5, 0.8], + "class_colors": { + "chair": [0.2, 0.8, 0.4, 0.8], + "table": [0.6, 0.2, 0.8, 0.8], + }, + } + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + yaml.dump(config_data, f) + config_path = f.name + yield config_path + Path(config_path).unlink(missing_ok=True) + + +@pytest.fixture +def visualizer(ros2_context, temp_db_path): + """Create a SemanticMapVisualizer instance for testing.""" + node = SemanticMapVisualizer() + set_parameter( + node, "database_path", temp_db_path, rclpy.parameter.Parameter.Type.STRING + ) + yield node + node.destroy_node() + + +def test_visualizer_initialization(visualizer): + """Test that SemanticMapVisualizer initializes correctly.""" + assert visualizer is not None + assert visualizer.get_name() == "semantic_map_visualizer" + assert visualizer.class_colors is not None + assert visualizer.default_color is not None + assert visualizer.marker_publisher is not None + + +def test_generate_class_colors(visualizer): + """Test generating class colors from config.""" + colors, default_color = visualizer._generate_class_colors() + + assert isinstance(colors, dict) + assert isinstance(default_color, ColorRGBA) + assert default_color.r == 0.5 + assert default_color.g == 0.5 + assert default_color.b == 0.5 + assert default_color.a == 0.8 + + +def test_generate_class_colors_from_custom_config( + ros2_context, temp_db_path, temp_config_file +): + """Test generating class colors from custom config file.""" + node = SemanticMapVisualizer() + set_parameter( + node, "database_path", temp_db_path, rclpy.parameter.Parameter.Type.STRING + ) + set_parameter( + node, + "class_colors_config", + temp_config_file, + rclpy.parameter.Parameter.Type.STRING, + ) + + colors, default_color = node._generate_class_colors() + + assert "chair" in colors + assert "table" in colors + assert colors["chair"].r == 0.2 + assert colors["chair"].g == 0.8 + assert colors["chair"].b == 0.4 + assert default_color.r == 0.5 + + node.destroy_node() + + +def test_get_class_color(visualizer): + """Test getting color for object class.""" + visualizer.class_colors["test_class"] = ColorRGBA(r=1.0, g=0.0, b=0.0, a=1.0) + + color = visualizer._get_class_color("test_class") + assert color.r == 1.0 + assert color.g == 0.0 + assert color.b == 0.0 + + unknown_color = visualizer._get_class_color("unknown_class") + assert unknown_color == visualizer.default_color + + +def test_get_string_parameter(visualizer): + """Test getting string parameter.""" + set_parameter( + visualizer, + "database_path", + "/test/path.db", + rclpy.parameter.Parameter.Type.STRING, + ) + 
assert visualizer._get_string_parameter("database_path") == "/test/path.db" + + +def test_get_double_parameter(visualizer): + """Test getting double parameter.""" + set_parameter( + visualizer, "map_resolution", 0.1, rclpy.parameter.Parameter.Type.DOUBLE + ) + assert visualizer._get_double_parameter("map_resolution") == 0.1 + + +def test_get_bool_parameter(visualizer): + """Test getting bool parameter.""" + set_parameter( + visualizer, "show_text_labels", False, rclpy.parameter.Parameter.Type.BOOL + ) + assert visualizer._get_bool_parameter("show_text_labels") is False From 9e4e0ee2d297f0dbde1cf10abd2f5968c95ef7dd Mon Sep 17 00:00:00 2001 From: Julia Jia Date: Tue, 2 Dec 2025 19:29:39 -0800 Subject: [PATCH 3/3] Fix the failing tests --- tests/rai_semap/test_perception_utils.py | 6 ++---- tests/smoke/import_test.py | 3 +++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/rai_semap/test_perception_utils.py b/tests/rai_semap/test_perception_utils.py index b7fbc68f6..c3fe12fd3 100644 --- a/tests/rai_semap/test_perception_utils.py +++ b/tests/rai_semap/test_perception_utils.py @@ -17,7 +17,7 @@ from cv_bridge import CvBridge from sensor_msgs.msg import CameraInfo from std_msgs.msg import Header -from vision_msgs.msg import BoundingBox2D, Detection2D +from vision_msgs.msg import BoundingBox2D, Detection2D, Point2D from rai_semap.ros2.perception_utils import ( compute_3d_pose_from_bbox, @@ -54,11 +54,9 @@ def depth_image(bridge): @pytest.fixture def detection2d(): """Create a Detection2D message.""" - from geometry_msgs.msg import Point - detection = Detection2D() detection.bbox = BoundingBox2D() - detection.bbox.center.position = Point(x=320.0, y=240.0, z=0.0) + detection.bbox.center.position = Point2D(x=320.0, y=240.0) detection.bbox.size_x = 100.0 detection.bbox.size_y = 80.0 return detection diff --git a/tests/smoke/import_test.py b/tests/smoke/import_test.py index 901008e89..b651d008e 100644 --- a/tests/smoke/import_test.py +++ b/tests/smoke/import_test.py @@ -51,6 +51,9 @@ def import_submodules(package: ModuleType) -> None: continue if path.is_file() and path.suffix != ".py" or path.name == "__init__.py": continue + # Skip files with dots in name (e.g., semap.launch.py) as they can't be imported as modules + if path.is_file() and "." in path.stem: + continue relative_path = str(path.relative_to(package_path)) subpage_name = relative_path.replace(os.path.sep, ".").replace(".py", "")
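
The new skip in import_test.py guards against launch files that live inside the Python package tree. Because the smoke test derives module names from file paths, a dotted filename cannot survive the conversion; for example:

    from pathlib import Path
    Path("semap.launch.py").stem   # -> "semap.launch" (contains a dot, so it is skipped)
    # Otherwise "scripts/semap.launch.py" would map to the module name
    # "rai_semap.scripts.semap.launch", and importlib would fail looking for a
    # non-existent "semap" subpackage.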