From 0d6cf3288c3972160d98c163434c331b2e70b46f Mon Sep 17 00:00:00 2001 From: Nikola Stojic Date: Fri, 8 Aug 2025 18:36:13 +0200 Subject: [PATCH 01/13] Depth support --- alignit/infere.py | 92 +++++++---------- alignit/models/alignnet.py | 72 +++++++++---- alignit/record.py | 200 ++++++++++++++++++++++++++----------- alignit/train.py | 80 +++++++-------- alignit/visualize.py | 20 ++-- 5 files changed, 276 insertions(+), 188 deletions(-) diff --git a/alignit/infere.py b/alignit/infere.py index eb5810a..410893b 100644 --- a/alignit/infere.py +++ b/alignit/infere.py @@ -14,94 +14,74 @@ @draccus.wrap() -def main(cfg: InferConfig): - """Run inference/alignment using configuration parameters.""" +def main(): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print(f"Using device: {device}") - # load model from file net = AlignNet( - backbone_name=cfg.model.backbone, - backbone_weights=cfg.model.backbone_weights, - use_vector_input=cfg.model.use_vector_input, - fc_layers=cfg.model.fc_layers, - vector_hidden_dim=cfg.model.vector_hidden_dim, - output_dim=cfg.model.output_dim, - feature_agg=cfg.model.feature_agg, + output_dim=9, + use_vector_input=False, + use_depth_input=True, ) - net.load_state_dict(torch.load(cfg.model.path, map_location=device)) + net.load_state_dict(torch.load("alignnet_model.pth", map_location=device)) net.to(device) net.eval() - robot = XarmSim() + robot = Xarm() - # Set initial pose from config start_pose = t3d.affines.compose( [0.23, 0, 0.25], t3d.euler.euler2mat(np.pi, 0, 0), [1, 1, 1] ) robot.servo_to_pose(start_pose, lin_tol=1e-2) - - iteration = 0 - iterations_within_tolerance = 0 - ang_tol_rad = np.deg2rad(cfg.ang_tolerance) - + total = 0 + tick = 0 try: while True: + start_capture = time.time() observation = robot.get_observation() - images = [observation["camera.rgb"].astype(np.float32) / 255.0] + print(f"Observation time: {time.time() - start_capture:.3f}s") + + rgb_image = observation["rgb"].astype(np.float32) / 255.0 + depth_image = observation["depth"].astype(np.float32) / 1000.0 + rgb_image_tensor = ( + torch.from_numpy(np.array(rgb_image)) + .permute(2, 0, 1) # (H, W, C) -> (C, H, W) + .unsqueeze(0) + .to(device) + ) - # Convert images to tensor and reshape from HWC to CHW format - images_tensor = ( - torch.from_numpy(np.array(images)) - .permute(0, 3, 1, 2) - .unsqueeze(0) + depth_image_tensor = ( + torch.from_numpy(np.array(depth_image)) + .unsqueeze(0) # Add channel dimension: (1, H, W) + .unsqueeze(0) # Add batch dimension: (1, 1, H, W) .to(device) ) - if cfg.debug_output: - print(f"Max pixel value: {torch.max(images_tensor)}") + rgb_images_batch = rgb_image_tensor.unsqueeze(1) + depth_images_batch = depth_image_tensor.unsqueeze(1) + start = time.time() with torch.no_grad(): - relative_action = net(images_tensor) + relative_action = net(rgb_images_batch, depth_images=depth_images_batch) relative_action = relative_action.squeeze(0).cpu().numpy() relative_action = sixd_se3(relative_action) - - if cfg.debug_output: - print_pose(relative_action) - - # Check convergence - if are_tfs_close( - relative_action, lin_tol=cfg.lin_tolerance, ang_tol=ang_tol_rad - ): - iterations_within_tolerance += 1 - else: - iterations_within_tolerance = 0 - - if iterations_within_tolerance >= cfg.debouncing_count: - print("Alignment achieved - stopping.") - break - - target_pose = robot.pose() @ relative_action - iteration += 1 + relative_action [:3,:3] = relative_action[:3,:3] @ relative_action[:3,:3] @ relative_action[:3,:3] + print_pose(relative_action) + + 
action = robot.pose() @ relative_action + elapsed = time.time() - start + total = total + elapsed + tick += 1 + avg = total / tick action = { - "pose": target_pose, + "pose": action, "gripper.pos": 1.0, } robot.send_action(action) - - # Check max iterations - if cfg.max_iterations and iteration >= cfg.max_iterations: - print(f"Reached maximum iterations ({cfg.max_iterations}) - stopping.") - break - - time.sleep(10.0) - except KeyboardInterrupt: print("\nExiting...") - robot.disconnect() - if __name__ == "__main__": main() diff --git a/alignit/models/alignnet.py b/alignit/models/alignnet.py index 6da726f..8b1a225 100644 --- a/alignit/models/alignnet.py +++ b/alignit/models/alignnet.py @@ -7,43 +7,46 @@ class AlignNet(nn.Module): def __init__( self, - backbone_name="efficientnet_b0", + backbone_name="efficientnet_b0", backbone_weights="DEFAULT", use_vector_input=True, + use_depth_input=True, # NEW fc_layers=[256, 128], vector_hidden_dim=64, + depth_hidden_dim=128, # NEW output_dim=7, feature_agg="mean", ): - """ - :param backbone_name: 'efficientnet_b0' or 'resnet18' - :param backbone_weights: 'DEFAULT' or None - :param use_vector_input: whether to accept a vector input - :param fc_layers: list of hidden layer sizes for the fully connected head - :param vector_hidden_dim: output dim of the vector MLP - :param output_dim: final output vector size - :param feature_agg: 'mean' or 'max' across image views - """ super().__init__() self.use_vector_input = use_vector_input + self.use_depth_input = use_depth_input self.feature_agg = feature_agg - # CNN backbone self.backbone, self.image_feature_dim = self._build_backbone( backbone_name, backbone_weights ) - # Linear projection of image features self.image_fc = nn.Sequential( nn.Linear(self.image_feature_dim, fc_layers[0]), nn.ReLU() ) + if use_depth_input: + self.depth_cnn = nn.Sequential( + nn.Conv2d(1, 8, 3, padding=1), nn.ReLU(), + nn.Conv2d(8, 16, 3, padding=1), nn.ReLU(), + nn.AdaptiveAvgPool2d(1), + ) + self.depth_fc = nn.Sequential( + nn.Linear(16, depth_hidden_dim), nn.ReLU() + ) + input_dim = fc_layers[0] + depth_hidden_dim + else: + input_dim = fc_layers[0] + # Optional vector input processing if use_vector_input: self.vector_fc = nn.Sequential(nn.Linear(1, vector_hidden_dim), nn.ReLU()) - input_dim = fc_layers[0] + vector_hidden_dim - else: - input_dim = fc_layers[0] + input_dim += vector_hidden_dim # Fully connected layers layers = [] @@ -81,10 +84,11 @@ def aggregate_image_features(self, feats): else: raise ValueError("Invalid aggregation type") - def forward(self, rgb_images, vector_inputs=None): + def forward(self, rgb_images, vector_inputs=None, depth_images=None): """ :param rgb_images: Tensor of shape (B, N, 3, H, W) :param vector_inputs: List of tensors of shape (L_i,) or None + :param depth_images: Tensor of shape (B, N, 1, H, W) or None :return: Tensor of shape (B, output_dim) """ B, N, C, H, W = rgb_images.shape @@ -93,6 +97,15 @@ def forward(self, rgb_images, vector_inputs=None): image_feats = self.aggregate_image_features(feats) image_feats = self.image_fc(image_feats) + features = [image_feats] + + if self.use_depth_input and depth_images is not None: + depth = depth_images.view(B * N, 1, H, W) + depth_feats = self.depth_cnn(depth).view(B, N, -1) + depth_feats = self.aggregate_image_features(depth_feats) + depth_feats = self.depth_fc(depth_feats) + features.append(depth_feats) + if self.use_vector_input and vector_inputs is not None: vec_feats = [] for vec in vector_inputs: @@ -100,8 +113,29 @@ def forward(self, rgb_images, 
vector_inputs=None): pooled = self.vector_fc(vec).mean(dim=0) # (D,) vec_feats.append(pooled) vec_feats = torch.stack(vec_feats, dim=0) - fused = torch.cat([image_feats, vec_feats], dim=1) - else: - fused = image_feats + features.append(vec_feats) + fused = torch.cat(features, dim=1) return self.head(fused) # (B, output_dim) + + +if __name__ == "__main__": + import time + + batch_size = 1 + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = AlignNet(backbone_name="efficientnet_b0", use_vector_input=True, use_depth_input=True).to(device) + + rgb_images = torch.randn(batch_size, 4, 3, 224, 224).to(device) + depth_images = torch.randn(batch_size, 4, 1, 224, 224).to(device) + vector_inputs = [torch.randn(10).to(device) for _ in range(batch_size)] + + output = None + start_time = time.time() + for i in range(100): + output = model(rgb_images, vector_inputs, depth_images) + end_time = time.time() + duration_ms = ((end_time - start_time) / 100) * 1000 + print(f"Inference time: {duration_ms:.3f} ms") + print(f"Optimal for {1000 / duration_ms:.2f} fps") + print("Output shape:", output.shape) \ No newline at end of file diff --git a/alignit/record.py b/alignit/record.py index 93fc7d5..1e7168e 100644 --- a/alignit/record.py +++ b/alignit/record.py @@ -1,9 +1,11 @@ import os import shutil - import transforms3d as t3d import numpy as np from scipy.spatial.transform import Rotation as R +from alignit.robots.xarm import Xarm +from alignit.utils.zhou import se3_sixd +import argparse # Added for command line arguments from datasets import ( Dataset, Features, @@ -14,55 +16,89 @@ concatenate_datasets, ) -from alignit.robots.xarmsim import XarmSim -from alignit.robots.xarm import Xarm -from alignit.utils.zhou import se3_sixd -import draccus -from alignit.config import RecordConfig - +def generate_lift_trajectory_with_random_rpy( + start_pose, + lift_distance=0.1, + num_steps=100, + max_rotation_angle=15.0, + initial_lift=0.005, +): + + trajectory = [] + R_start = start_pose[:3, :3] + t_start = start_pose[:3, 3] + + object_z_axis = R_start[:, 2] + + t_current = t_start + object_z_axis * initial_lift + + for i in range(num_steps): + lift_progress = i / (num_steps - 1) # 0 to 1 + current_lift = t_start + object_z_axis * -(initial_lift + lift_distance * lift_progress) + + r_rand = np.random.uniform(-max_rotation_angle, max_rotation_angle) + p_rand = np.random.uniform(-max_rotation_angle, max_rotation_angle) + y_rand = np.random.uniform(-max_rotation_angle, max_rotation_angle) + + random_rotation = R.from_euler('xyz', + [np.radians(r_rand), + np.radians(p_rand), + np.radians(y_rand)]).as_matrix() + + R_current = R_start @ random_rotation + + T = np.eye(4) + T[:3, :3] = R_current + T[:3, 3] = t_start + + trajectory.append(T) + + return trajectory -def generate_spiral_trajectory(start_pose, cfg): - """Generate spiral trajectory using configuration parameters.""" +def generate_spiral_trajectory( + start_pose, + z_step=0.1, + radius_step=0.001, + num_steps=50, + cone_angle=45, + visible_sweep=180, + viewing_angle_offset=0, + angular_resolution=5, + include_cone_poses=False, + lift_height_before_spiral=0.0005, +): trajectory = [] R_start = start_pose[:3, :3] t_start_initial = start_pose[:3, 3] - cone_angle_rad = np.deg2rad(cfg.cone_angle) + cone_angle_rad = np.deg2rad(cone_angle) object_z_axis = R_start[:, 2] - lift_offset_world = object_z_axis * cfg.lift_height_before_spiral + lift_offset_world = object_z_axis * lift_height_before_spiral t_start_spiral = t_start_initial + 
lift_offset_world - start_angle = -cfg.visible_sweep / 2 + cfg.viewing_angle_offset - end_angle = cfg.visible_sweep / 2 + cfg.viewing_angle_offset + start_angle = -visible_sweep / 2 + viewing_angle_offset + end_angle = visible_sweep / 2 + viewing_angle_offset - for i in range(cfg.num_steps): - radius = cfg.radius_step * i + for i in range(num_steps): + radius = radius_step * i angle = 2 * np.pi * i / 10 local_offset = np.array( - [radius * np.cos(angle), radius * np.sin(angle), -cfg.z_step * i] + [radius * np.cos(angle), radius * np.sin(angle), -z_step * i] ) world_offset = R_start @ local_offset base_position = t_start_spiral + world_offset - x_rot = np.random.uniform(-10, 10) - y_rot = np.random.uniform(-10, 10) - z_rot = np.random.uniform(-10, 10) - - random_angles = np.radians([x_rot, y_rot, z_rot]) - random_rotation = R.from_euler("xyz", random_angles).as_matrix() - - randomized_rotation = R_start @ random_rotation T = np.eye(4) - T[:3, :3] = randomized_rotation + T[:3, :3] = R_start T[:3, 3] = base_position trajectory.append(T) - if cfg.include_cone_poses: - for deg in np.arange(start_angle, end_angle, cfg.angular_resolution): + if include_cone_poses: + for deg in np.arange(start_angle, end_angle, angular_resolution): theta = np.deg2rad(deg) tilt = t3d.euler.euler2mat(cone_angle_rad, 0, 0) @@ -76,64 +112,108 @@ def generate_spiral_trajectory(start_pose, cfg): return trajectory - -@draccus.wrap() -def main(cfg: RecordConfig): - """Record alignment dataset using configuration parameters.""" - robot = XarmSim() +def load_and_merge_datasets(dataset_paths, output_path, features): + """Load and merge multiple datasets from disk""" + if not dataset_paths: + return None + + datasets = [] + for path in dataset_paths: + if os.path.exists(path): + try: + ds = load_from_disk(path) + ds = ds.cast(features) # Ensure consistent features + datasets.append(ds) + print(f"Loaded dataset from {path}") + except Exception as e: + print(f"Error loading dataset from {path}: {e}") + else: + print(f"Dataset path not found: {path}") + + if not datasets: + return None + + merged_dataset = concatenate_datasets(datasets) + + # Save the merged dataset + temp_path = output_path + "_temp" + merged_dataset.save_to_disk(temp_path) + if os.path.exists(output_path): + shutil.rmtree(output_path) + shutil.move(temp_path, output_path) + + return merged_dataset + +def main(args): + robot = Xarm() features = Features( - {"images": Sequence(Image()), "action": Sequence(Value("float32"))} + {"images": Sequence(Image()), "action": Sequence(Value("float32")), "depth": Sequence(Image())} ) - for episode in range(cfg.episodes): - pose_start, pose_alignment_target = robot.reset() - - robot.servo_to_pose(pose_alignment_target, lin_tol=0.015, ang_tol=0.015) - - robot.servo_to_pose( - pose_alignment_target, - lin_tol=cfg.lin_tol_alignment, - ang_tol=cfg.ang_tol_alignment, + if args.merge_datasets: + print(f"Merging datasets: {args.merge_datasets}") + merged_dataset = load_and_merge_datasets( + args.merge_datasets, + args.output_path, + features ) + if merged_dataset is not None: + print(f"Merged dataset created with {len(merged_dataset)} samples at {args.output_path}") + else: + print("No valid datasets found to merge") + max_rot = 15 - trajectory = generate_spiral_trajectory(pose_start, cfg.trajectory) - + for episode in range(args.num_episodes): + pose_start, pose_alignment_target = robot.reset() + max_rot = max_rot + 2 + trajectory = generate_lift_trajectory_with_random_rpy( + pose_start, + lift_distance=0.1, + num_steps=100, + 
max_rotation_angle=max_rot, + initial_lift=0.005, + ) + print(f"Generated trajectory with {len(trajectory)} poses for episode {episode+1}") frames = [] for pose in trajectory: - robot.servo_to_pose( - pose, lin_tol=cfg.lin_tol_trajectory, ang_tol=cfg.ang_tol_trajectory - ) + robot.servo_to_pose(pose, lin_tol=5e-3, ang_tol=0.09) current_pose = robot.pose() - + action_pose = np.linalg.inv(current_pose) @ pose_alignment_target action_sixd = se3_sixd(action_pose) observation = robot.get_observation() - frame = { - "images": [observation["camera.rgb"].copy()], - "action": action_sixd, - } + print(observation.keys()) + frame = {"images": [observation["rgb"].copy()],"depth": [observation["depth"].copy()],"action": action_sixd} frames.append(frame) print(f"Episode {episode+1} completed with {len(frames)} frames.") episode_dataset = Dataset.from_list(frames, features=features) - if episode == 0: + if episode == 0 and not os.path.exists(args.output_path): combined_dataset = episode_dataset else: - previous_dataset = load_from_disk(cfg.dataset.path) + previous_dataset = load_from_disk(args.output_path) previous_dataset = previous_dataset.cast(features) combined_dataset = concatenate_datasets([previous_dataset, episode_dataset]) del previous_dataset - temp_path = f"{cfg.dataset.path}_temp" + temp_path = args.output_path + "_temp" combined_dataset.save_to_disk(temp_path) - if os.path.exists(cfg.dataset.path): - shutil.rmtree(cfg.dataset.path) - shutil.move(temp_path, cfg.dataset.path) + if os.path.exists(args.output_path): + shutil.rmtree(args.output_path) + shutil.move(temp_path, args.output_path) robot.disconnect() - if __name__ == "__main__": - main() + parser = argparse.ArgumentParser(description='Robot data collection and dataset merging') + parser.add_argument('--output-path', type=str, default="data/duck", + help='Path to save the output dataset') + parser.add_argument('--num-episodes', type=int, default=15, + help='Number of episodes to run') + parser.add_argument('--merge-datasets', type=str, nargs='+', default=None, + help='List of dataset paths to merge before collection') + + args = parser.parse_args() + main(args) diff --git a/alignit/train.py b/alignit/train.py index 6f05a33..51a23b2 100644 --- a/alignit/train.py +++ b/alignit/train.py @@ -18,74 +18,70 @@ def collate_fn(batch): @draccus.wrap() -def main(cfg: TrainConfig): - """Train AlignNet model using configuration parameters.""" +def main(): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - # Load the dataset from disk - dataset = load_from_disk(cfg.dataset.path) - - # Create model using config parameters + dataset = load_from_disk("data/duck") net = AlignNet( - backbone_name=cfg.model.backbone, - backbone_weights=cfg.model.backbone_weights, - use_vector_input=cfg.model.use_vector_input, - fc_layers=cfg.model.fc_layers, - vector_hidden_dim=cfg.model.vector_hidden_dim, - output_dim=cfg.model.output_dim, - feature_agg=cfg.model.feature_agg, + output_dim=9, + use_vector_input=False, ).to(device) - # Split dataset - train_dataset = dataset.train_test_split( - test_size=cfg.test_size, seed=cfg.random_seed - ) + train_dataset = dataset.train_test_split(test_size=0.2, seed=42) - # Create data loader train_loader = DataLoader( - train_dataset["train"], - batch_size=cfg.batch_size, - shuffle=True, - collate_fn=collate_fn, + train_dataset["train"], batch_size=4, shuffle=True, collate_fn=collate_fn ) - - optimizer = Adam(net.parameters(), lr=cfg.learning_rate) + optimizer = Adam(net.parameters(), lr=1e-4) criterion = 
MSELoss() net.train() - - for epoch in range(cfg.epochs): + for epoch in range(100): for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"): images = batch["images"] + depth_images_pil = batch["depth_images"] actions = batch["action"].to(device) - # Convert PIL Images to tensors and stack them properly - # images is a list of lists of PIL Images - batch_images = [] - transform = transforms.Compose([transforms.ToTensor()]) + batch_rgb_tensors = [] + batch_depth_tensors = [] + rgb_transform = transforms.Compose([transforms.ToTensor()]) + depth_transform = transforms.Compose([transforms.ToTensor()]) - for image_sequence in images: - tensor_sequence = [ - transform(img.convert("RGB")) for img in image_sequence + for i, image_sequence in enumerate(images): + # Process RGB + tensor_sequence_rgb = [ + rgb_transform(img.convert("RGB")) for img in image_sequence ] - stacked_tensors = torch.stack(tensor_sequence, dim=0) - batch_images.append(stacked_tensors) + stacked_tensors_rgb = torch.stack(tensor_sequence_rgb, dim=0) + batch_rgb_tensors.append(stacked_tensors_rgb) - # Stack all batches to get shape (B, N, 3, H, W) - batch_images = torch.stack(batch_images, dim=0).to(device) + # Process Depth + if depth_images_pil[i] is not None: # Check if depth is actually present for this item + depth_sequence = depth_images_pil[i] + tensor_sequence_depth = [ + depth_transform(d_img.convert("L")) for d_img in depth_sequence + ] + stacked_tensors_depth = torch.stack(tensor_sequence_depth, dim=0) + batch_depth_tensors.append(stacked_tensors_depth) + else: + print(f"Warning: Depth image missing for sample {i} in batch.") + raise ValueError("Depth images expected but not found for a sample when use_depth is True.") + + + batch_rgb_tensors = torch.stack(batch_rgb_tensors, dim=0).to(device) + batch_depth_tensors = torch.stack(batch_depth_tensors, dim=0).to(device) optimizer.zero_grad() - outputs = net(batch_images) + outputs = net(batch_rgb_tensors, depth_images=batch_depth_tensors) # <--- Pass depth_images loss = criterion(outputs, actions) loss.backward() optimizer.step() tqdm.write(f"Loss: {loss.item():.4f}") - - # Save the trained model - torch.save(net.state_dict(), cfg.model.path) - tqdm.write(f"Model saved as {cfg.model.path}") + torch.save(net.state_dict(), "alignnet_model.pth") + tqdm.write("Model saved as alignnet_model.pth") print("Training complete.") if __name__ == "__main__": main() + diff --git a/alignit/visualize.py b/alignit/visualize.py index 26ee81f..9b8adad 100644 --- a/alignit/visualize.py +++ b/alignit/visualize.py @@ -1,6 +1,6 @@ import gradio as gr import draccus - +from datasets import load_from_disk from alignit.utils.dataset import load_dataset from alignit.utils.zhou import sixd_se3 from alignit.utils.tfs import get_pose_str @@ -8,29 +8,27 @@ @draccus.wrap() -def visualize(cfg: VisualizeConfig): - dataset = load_dataset(cfg.dataset.path) +def main(): + dataset = load_from_disk("data/duck") def get_data(index): + print(index) item = dataset[index] image = item["images"][0] + depth = item["depth"][0] action_sixd = item["action"] action = sixd_se3(action_sixd) label = get_pose_str(action, degrees=True) - return image, label + return image,depth, label gr.Interface( fn=get_data, inputs=gr.Slider(0, len(dataset) - 1, step=1, label="Index", interactive=True), - outputs=[gr.Image(type="pil", label="Image"), gr.Text(label="Label")], + outputs=[gr.Image(type="pil", label="Image"),gr.Image(type="pil", label="Depth Image"), gr.Text(label="Label")], title="Dataset Image Viewer", live=True, - 
).launch( - share=cfg.share, - server_name=cfg.server_name, - server_port=cfg.server_port - ) + ).launch() if __name__ == "__main__": - visualize() + main() From 09a8c1ccc1d223a371dbe49ba6322e54fd4f2149 Mon Sep 17 00:00:00 2001 From: Nikola Stojic Date: Fri, 8 Aug 2025 18:45:52 +0200 Subject: [PATCH 02/13] Added depth support --- cameras/realsense.py | 552 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 552 insertions(+) create mode 100644 cameras/realsense.py diff --git a/cameras/realsense.py b/cameras/realsense.py new file mode 100644 index 0000000..d16d2fd --- /dev/null +++ b/cameras/realsense.py @@ -0,0 +1,552 @@ +#reference to https://github.com/huggingface/lerobot/tree/main +#async_read now returns both rgb and depth images + + +""" +Provides the RealSenseCamera class for capturing frames from Intel RealSense cameras. +""" + +import logging +import time +from threading import Event, Lock, Thread +from typing import Any + +import cv2 +import numpy as np + +try: + import pyrealsense2 as rs +except Exception as e: + logging.info(f"Could not import realsense: {e}") + +from lerobot.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError + +from lerobot.cameras.camera import Camera +from lerobot.configs import ColorMode +from lerobot.utils import get_cv2_rotation +from lerobot.cameras.realsense import RealSenseCameraConfig + +logger = logging.getLogger(__name__) + + +class RealSenseCamera(Camera): + """ + Manages interactions with Intel RealSense cameras for frame and depth recording. + + This class provides an interface similar to `OpenCVCamera` but tailored for + RealSense devices, leveraging the `pyrealsense2` library. It uses the camera's + unique serial number for identification, offering more stability than device + indices, especially on Linux. It also supports capturing depth maps alongside + color frames. + + Use the provided utility script to find available camera indices and default profiles: + ```bash + python -m lerobot.find_cameras realsense + ``` + + A `RealSenseCamera` instance requires a configuration object specifying the + camera's serial number or a unique device name. If using the name, ensure only + one camera with that name is connected. + + The camera's default settings (FPS, resolution, color mode) from the stream + profile are used unless overridden in the configuration. 
+ + Example: + ```python + from lerobot.cameras.realsense import RealSenseCamera, RealSenseCameraConfig + from lerobot.cameras import ColorMode, Cv2Rotation + + # Basic usage with serial number + config = RealSenseCameraConfig(serial_number_or_name="0123456789") # Replace with actual SN + camera = RealSenseCamera(config) + camera.connect() + + # Read 1 frame synchronously + color_image = camera.read() + print(color_image.shape) + + # Read 1 frame asynchronously + async_image = camera.async_read() + + # When done, properly disconnect the camera using + camera.disconnect() + + # Example with depth capture and custom settings + custom_config = RealSenseCameraConfig( + serial_number_or_name="0123456789", # Replace with actual SN + fps=30, + width=1280, + height=720, + color_mode=ColorMode.BGR, # Request BGR output + rotation=Cv2Rotation.NO_ROTATION, + use_depth=True + ) + depth_camera = RealSenseCamera(custom_config) + depth_camera.connect() + + # Read 1 depth frame + depth_map = depth_camera.read_depth() + + # Example using a unique camera name + name_config = RealSenseCameraConfig(serial_number_or_name="Intel RealSense D435") # If unique + name_camera = RealSenseCamera(name_config) + # ... connect, read, disconnect ... + ``` + """ + + def __init__(self, config: RealSenseCameraConfig): + """ + Initializes the RealSenseCamera instance. + + Args: + config: The configuration settings for the camera. + """ + + super().__init__(config) + + self.config = config + + if config.serial_number_or_name.isdigit(): + self.serial_number = config.serial_number_or_name + else: + self.serial_number = self._find_serial_number_from_name(config.serial_number_or_name) + + self.fps = config.fps + self.color_mode = config.color_mode + self.use_depth = config.use_depth + self.warmup_s = config.warmup_s + self.latest_frame_acquisition_time = None + self.rs_pipeline: rs.pipeline | None = None + self.rs_profile: rs.pipeline_profile | None = None + + self.thread: Thread | None = None + self.stop_event: Event | None = None + self.frame_lock: Lock = Lock() + self.latest_frame: np.ndarray | None = None + self.latest_frame_depth: np.ndarray | None = None + self.new_frame_event: Event = Event() + + self.rotation: int | None = get_cv2_rotation(config.rotation) + + if self.height and self.width: + self.capture_width, self.capture_height = self.width, self.height + if self.rotation in [cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE]: + self.capture_width, self.capture_height = self.height, self.width + + def __str__(self) -> str: + return f"{self.__class__.__name__}({self.serial_number})" + + @property + def is_connected(self) -> bool: + """Checks if the camera pipeline is started and streams are active.""" + return self.rs_pipeline is not None and self.rs_profile is not None + + def connect(self, warmup: bool = True): + """ + Connects to the RealSense camera specified in the configuration. + + Initializes the RealSense pipeline, configures the required streams (color + and optionally depth), starts the pipeline, and validates the actual stream settings. + + Raises: + DeviceAlreadyConnectedError: If the camera is already connected. + ValueError: If the configuration is invalid (e.g., missing serial/name, name not unique). + ConnectionError: If the camera is found but fails to start the pipeline or no RealSense devices are detected at all. + RuntimeError: If the pipeline starts but fails to apply requested settings. 
+ """ + if self.is_connected: + raise DeviceAlreadyConnectedError(f"{self} is already connected.") + + self.rs_pipeline = rs.pipeline() + rs_config = rs.config() + self._configure_rs_pipeline_config(rs_config) + + try: + self.rs_profile = self.rs_pipeline.start(rs_config) + except RuntimeError as e: + self.rs_profile = None + self.rs_pipeline = None + raise ConnectionError( + f"Failed to open {self}." + "Run `python -m lerobot.find_cameras realsense` to find available cameras." + ) from e + + self._configure_capture_settings() + + if warmup: + time.sleep( + 1 + ) # NOTE(Steven): RS cameras need a bit of time to warm up before the first read. If we don't wait, the first read from the warmup will raise. + start_time = time.time() + while time.time() - start_time < self.warmup_s: + self.read() + time.sleep(0.1) + + logger.info(f"{self} connected.") + + @staticmethod + def find_cameras() -> list[dict[str, Any]]: + """ + Detects available Intel RealSense cameras connected to the system. + + Returns: + List[Dict[str, Any]]: A list of dictionaries, + where each dictionary contains 'type', 'id' (serial number), 'name', + firmware version, USB type, and other available specs, and the default profile properties (width, height, fps, format). + + Raises: + OSError: If pyrealsense2 is not installed. + ImportError: If pyrealsense2 is not installed. + """ + found_cameras_info = [] + context = rs.context() + devices = context.query_devices() + + for device in devices: + camera_info = { + "name": device.get_info(rs.camera_info.name), + "type": "RealSense", + "id": device.get_info(rs.camera_info.serial_number), + "firmware_version": device.get_info(rs.camera_info.firmware_version), + "usb_type_descriptor": device.get_info(rs.camera_info.usb_type_descriptor), + "physical_port": device.get_info(rs.camera_info.physical_port), + "product_id": device.get_info(rs.camera_info.product_id), + "product_line": device.get_info(rs.camera_info.product_line), + } + + # Get stream profiles for each sensor + sensors = device.query_sensors() + for sensor in sensors: + profiles = sensor.get_stream_profiles() + + for profile in profiles: + if profile.is_video_stream_profile() and profile.is_default(): + vprofile = profile.as_video_stream_profile() + stream_info = { + "stream_type": vprofile.stream_name(), + "format": vprofile.format().name, + "width": vprofile.width(), + "height": vprofile.height(), + "fps": vprofile.fps(), + } + camera_info["default_stream_profile"] = stream_info + + found_cameras_info.append(camera_info) + + return found_cameras_info + + def _find_serial_number_from_name(self, name: str) -> str: + """Finds the serial number for a given unique camera name.""" + camera_infos = self.find_cameras() + found_devices = [cam for cam in camera_infos if str(cam["name"]) == name] + + if not found_devices: + available_names = [cam["name"] for cam in camera_infos] + raise ValueError( + f"No RealSense camera found with name '{name}'. Available camera names: {available_names}" + ) + + if len(found_devices) > 1: + serial_numbers = [dev["serial_number"] for dev in found_devices] + raise ValueError( + f"Multiple RealSense cameras found with name '{name}'. " + f"Please use a unique serial number instead. 
Found SNs: {serial_numbers}" + ) + + serial_number = str(found_devices[0]["serial_number"]) + return serial_number + + def _configure_rs_pipeline_config(self, rs_config): + """Creates and configures the RealSense pipeline configuration object.""" + rs.config.enable_device(rs_config, self.serial_number) + + if self.width and self.height and self.fps: + rs_config.enable_stream( + rs.stream.color, self.capture_width, self.capture_height, rs.format.rgb8, self.fps + ) + if self.use_depth: + rs_config.enable_stream( + rs.stream.depth, self.capture_width, self.capture_height, rs.format.z16, self.fps + ) + else: + rs_config.enable_stream(rs.stream.color) + if self.use_depth: + rs_config.enable_stream(rs.stream.depth) + + def _configure_capture_settings(self) -> None: + """Sets fps, width, and height from device stream if not already configured. + + Uses the color stream profile to update unset attributes. Handles rotation by + swapping width/height when needed. Original capture dimensions are always stored. + + Raises: + DeviceNotConnectedError: If device is not connected. + """ + if not self.is_connected: + raise DeviceNotConnectedError(f"Cannot validate settings for {self} as it is not connected.") + + stream = self.rs_profile.get_stream(rs.stream.color).as_video_stream_profile() + + if self.fps is None: + self.fps = stream.fps() + + if self.width is None or self.height is None: + actual_width = int(round(stream.width())) + actual_height = int(round(stream.height())) + if self.rotation in [cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE]: + self.width, self.height = actual_height, actual_width + self.capture_width, self.capture_height = actual_width, actual_height + else: + self.width, self.height = actual_width, actual_height + self.capture_width, self.capture_height = actual_width, actual_height + + def read_depth(self, timeout_ms: int = 200) -> np.ndarray: + """ + Reads a single frame (depth) synchronously from the camera. + + This is a blocking call. It waits for a coherent set of frames (depth) + from the camera hardware via the RealSense pipeline. + + Args: + timeout_ms (int): Maximum time in milliseconds to wait for a frame. Defaults to 200ms. + + Returns: + np.ndarray: The depth map as a NumPy array (height, width) + of type `np.uint16` (raw depth values in millimeters) and rotation. + + Raises: + DeviceNotConnectedError: If the camera is not connected. + RuntimeError: If reading frames from the pipeline fails or frames are invalid. + """ + + if not self.is_connected: + raise DeviceNotConnectedError(f"{self} is not connected.") + if not self.use_depth: + raise RuntimeError( + f"Failed to capture depth frame '.read_depth()'. Depth stream is not enabled for {self}." + ) + + start_time = time.perf_counter() + + ret, frame = self.rs_pipeline.try_wait_for_frames(timeout_ms=timeout_ms) + + if not ret or frame is None: + raise RuntimeError(f"{self} read_depth failed (status={ret}).") + + depth_frame = frame.get_depth_frame() + depth_map = np.asanyarray(depth_frame.get_data()) + + depth_map_processed = self._postprocess_image(depth_map, depth_frame=True) + + read_duration_ms = (time.perf_counter() - start_time) * 1e3 + logger.debug(f"{self} read took: {read_duration_ms:.1f}ms") + + return depth_map_processed + + def read(self, color_mode: ColorMode | None = None, timeout_ms: int = 200) -> np.ndarray: + """ + Reads a single frame (color) synchronously from the camera. + + This is a blocking call. 
It waits for a coherent set of frames (color) + from the camera hardware via the RealSense pipeline. + + Args: + timeout_ms (int): Maximum time in milliseconds to wait for a frame. Defaults to 200ms. + + Returns: + np.ndarray: The captured color frame as a NumPy array + (height, width, channels), processed according to `color_mode` and rotation. + + Raises: + DeviceNotConnectedError: If the camera is not connected. + RuntimeError: If reading frames from the pipeline fails or frames are invalid. + ValueError: If an invalid `color_mode` is requested. + """ + + if not self.is_connected: + raise DeviceNotConnectedError(f"{self} is not connected.") + + start_time = time.perf_counter() + + ret, frame = self.rs_pipeline.try_wait_for_frames(timeout_ms=timeout_ms) + + if not ret or frame is None: + raise RuntimeError(f"{self} read failed (status={ret}).") + + color_frame = frame.get_color_frame() + color_image_raw = np.asanyarray(color_frame.get_data()) + + color_image_processed = self._postprocess_image(color_image_raw, color_mode) + + read_duration_ms = (time.perf_counter() - start_time) * 1e3 + logger.debug(f"{self} read took: {read_duration_ms:.1f}ms") + + return color_image_processed + + def _postprocess_image( + self, image: np.ndarray, color_mode: ColorMode | None = None, depth_frame: bool = False + ) -> np.ndarray: + """ + Applies color conversion, dimension validation, and rotation to a raw color frame. + + Args: + image (np.ndarray): The raw image frame (expected RGB format from RealSense). + color_mode (Optional[ColorMode]): The target color mode (RGB or BGR). If None, + uses the instance's default `self.color_mode`. + + Returns: + np.ndarray: The processed image frame according to `self.color_mode` and `self.rotation`. + + Raises: + ValueError: If the requested `color_mode` is invalid. + RuntimeError: If the raw frame dimensions do not match the configured + `width` and `height`. + """ + + if color_mode and color_mode not in (ColorMode.RGB, ColorMode.BGR): + raise ValueError( + f"Invalid requested color mode '{color_mode}'. Expected {ColorMode.RGB} or {ColorMode.BGR}." + ) + + if depth_frame: + h, w = image.shape + else: + h, w, c = image.shape + + if c != 3: + raise RuntimeError(f"{self} frame channels={c} do not match expected 3 channels (RGB/BGR).") + + if h != self.capture_height or w != self.capture_width: + raise RuntimeError( + f"{self} frame width={w} or height={h} do not match configured width={self.capture_width} or height={self.capture_height}." + ) + + processed_image = image + if self.color_mode == ColorMode.BGR: + processed_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + + if self.rotation in [cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE, cv2.ROTATE_180]: + processed_image = cv2.rotate(processed_image, self.rotation) + + return processed_image + + def _read_loop(self): + """ + Internal loop run by the background thread for asynchronous reading. + + On each iteration: + 1. Reads a color frame with 500ms timeout + 2. Stores result in latest_frame (thread-safe) + 3. Sets new_frame_event to notify listeners + + Stops on DeviceNotConnectedError, logs other errors and continues. 
+ """ + while not self.stop_event.is_set(): + try: + color_image = self.read(timeout_ms=500) + depth_image = self.read_depth(timeout_ms=500) + + with self.frame_lock: + self.latest_frame = color_image + self.latest_frame_depth = depth_image + self.latest_frame_acquisition_time = time.time() + self.new_frame_event.set() + + except DeviceNotConnectedError: + break + except Exception as e: + logger.warning(f"Error reading frame in background thread for {self}: {e}") + + def _start_read_thread(self) -> None: + """Starts or restarts the background read thread if it's not running.""" + if self.thread is not None and self.thread.is_alive(): + self.thread.join(timeout=0.1) + if self.stop_event is not None: + self.stop_event.set() + + self.stop_event = Event() + self.thread = Thread(target=self._read_loop, args=(), name=f"{self}_read_loop") + self.thread.daemon = True + self.thread.start() + + def _stop_read_thread(self): + """Signals the background read thread to stop and waits for it to join.""" + if self.stop_event is not None: + self.stop_event.set() + + if self.thread is not None and self.thread.is_alive(): + self.thread.join(timeout=2.0) + + self.thread = None + self.stop_event = None + + # NOTE(Steven): Missing implementation for depth for now + def async_read(self, timeout_ms: float = 200) -> np.ndarray: + """ + Reads the latest available frame data (color) asynchronously. + + This method retrieves the most recent color frame captured by the background + read thread. It does not block waiting for the camera hardware directly, + but may wait up to timeout_ms for the background thread to provide a frame. + + Args: + timeout_ms (float): Maximum time in milliseconds to wait for a frame + to become available. Defaults to 200ms (0.2 seconds). + + Returns: + np.ndarray: + The latest captured frame data (color image), processed according to configuration. + + Raises: + DeviceNotConnectedError: If the camera is not connected. + TimeoutError: If no frame data becomes available within the specified timeout. + RuntimeError: If the background thread died unexpectedly or another error occurs. + """ + if not self.is_connected: + raise DeviceNotConnectedError(f"{self} is not connected.") + + if self.thread is None or not self.thread.is_alive(): + self._start_read_thread() + + if not self.new_frame_event.wait(timeout=timeout_ms / 1000.0): + thread_alive = self.thread is not None and self.thread.is_alive() + raise TimeoutError( + f"Timed out waiting for frame from camera {self} after {timeout_ms} ms. " + f"Read thread alive: {thread_alive}." + ) + + with self.frame_lock: + frame = self.latest_frame + depth = self.latest_frame_depth + acquisition_time = self.latest_frame_acquisition_time # Retrieve the timestamp + self.new_frame_event.clear() + + if frame is None: + raise RuntimeError(f"Internal error: Event set but no frame available for {self}.") + + return frame, depth,acquisition_time + + def disconnect(self): + """ + Disconnects from the camera, stops the pipeline, and cleans up resources. + + Stops the background read thread (if running) and stops the RealSense pipeline. + + Raises: + DeviceNotConnectedError: If the camera is already disconnected (pipeline not running). + """ + + if not self.is_connected and self.thread is None: + raise DeviceNotConnectedError( + f"Attempted to disconnect {self}, but it appears already disconnected." 
+ ) + + if self.thread is not None: + self._stop_read_thread() + + if self.rs_pipeline is not None: + self.rs_pipeline.stop() + self.rs_pipeline = None + self.rs_profile = None + + logger.info(f"{self} disconnected.") From 7e67607124124a581e06e52046a3ac5432fd2cd5 Mon Sep 17 00:00:00 2001 From: Nikola Stojic Date: Mon, 11 Aug 2025 19:15:24 +0200 Subject: [PATCH 03/13] Fixed depth w/ draccus config --- alignit/config.py | 7 + alignit/infere.py | 67 ++++++---- alignit/models/alignnet.py | 11 ++ alignit/record.py | 207 +++++++++-------------------- alignit/robots/xarm.py | 6 +- alignit/robots/xarmsim/__init__.py | 6 + alignit/train.py | 33 +++-- alignit/visualize.py | 7 +- 8 files changed, 159 insertions(+), 185 deletions(-) diff --git a/alignit/config.py b/alignit/config.py index df3b71c..935f69b 100644 --- a/alignit/config.py +++ b/alignit/config.py @@ -46,6 +46,13 @@ class ModelConfig: default="alignnet_model.pth", metadata={"help": "Path to save/load trained model"}, ) + use_depth_input: bool = field( + default=True, + metadata={"help": "Whether to use depth input for the model"} + ) + depth_hidden_dim: int = field( + default=128, metadata={"help": "Output dimension of depth CNN"} + ) @dataclass diff --git a/alignit/infere.py b/alignit/infere.py index 410893b..c336563 100644 --- a/alignit/infere.py +++ b/alignit/infere.py @@ -3,46 +3,47 @@ import time import draccus from alignit.config import InferConfig - import torch - from alignit.models.alignnet import AlignNet from alignit.utils.zhou import sixd_se3 from alignit.utils.tfs import print_pose, are_tfs_close from alignit.robots.xarmsim import XarmSim from alignit.robots.xarm import Xarm - @draccus.wrap() -def main(): +def main(cfg: InferConfig): + """Run inference/alignment using configuration parameters.""" device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print(f"Using device: {device}") net = AlignNet( - output_dim=9, - use_vector_input=False, - use_depth_input=True, + backbone_name=cfg.model.backbone, + backbone_weights=cfg.model.backbone_weights, + use_vector_input=cfg.model.use_vector_input, + fc_layers=cfg.model.fc_layers, + vector_hidden_dim=cfg.model.vector_hidden_dim, + output_dim=cfg.model.output_dim, + feature_agg=cfg.model.feature_agg, + use_depth_input=cfg.model.use_depth_input, ) - net.load_state_dict(torch.load("alignnet_model.pth", map_location=device)) + net.load_state_dict(torch.load(cfg.model.path, map_location=device)) net.to(device) net.eval() - robot = Xarm() + robot = XarmSim() start_pose = t3d.affines.compose( [0.23, 0, 0.25], t3d.euler.euler2mat(np.pi, 0, 0), [1, 1, 1] ) - robot.servo_to_pose(start_pose, lin_tol=1e-2) - total = 0 - tick = 0 + robot.servo_to_pose(start_pose, lin_tol=1e-2) + iteration = 0 + iterations_within_tolerance = 0 + ang_tol_rad = np.deg2rad(cfg.ang_tolerance) try: while True: - start_capture = time.time() observation = robot.get_observation() - print(f"Observation time: {time.time() - start_capture:.3f}s") - - rgb_image = observation["rgb"].astype(np.float32) / 255.0 - depth_image = observation["depth"].astype(np.float32) / 1000.0 + rgb_image = observation["camera.rgb"].astype(np.float32) / 255.0 + depth_image = observation["camera.rgb.depth"].astype(np.float32) / 1000.0 rgb_image_tensor = ( torch.from_numpy(np.array(rgb_image)) .permute(2, 0, 1) # (H, W, C) -> (C, H, W) @@ -61,27 +62,45 @@ def main(): depth_images_batch = depth_image_tensor.unsqueeze(1) - start = time.time() with torch.no_grad(): relative_action = net(rgb_images_batch, depth_images=depth_images_batch) 
relative_action = relative_action.squeeze(0).cpu().numpy() relative_action = sixd_se3(relative_action) + + if cfg.debug_output: + print_pose(relative_action) + relative_action [:3,:3] = relative_action[:3,:3] @ relative_action[:3,:3] @ relative_action[:3,:3] - print_pose(relative_action) + + # Check convergence + if are_tfs_close( + relative_action, lin_tol=cfg.lin_tolerance, ang_tol=ang_tol_rad + ): + iterations_within_tolerance += 1 + else: + iterations_within_tolerance = 0 + + if iterations_within_tolerance >= cfg.debouncing_count: + print("Alignment achieved - stopping.") + break action = robot.pose() @ relative_action - elapsed = time.time() - start - total = total + elapsed - tick += 1 - avg = total / tick + iteration += 1 action = { "pose": action, "gripper.pos": 1.0, } robot.send_action(action) + + # Check max iterations + if cfg.max_iterations and iteration >= cfg.max_iterations: + print(f"Reached maximum iterations ({cfg.max_iterations}) - stopping.") + break + + time.sleep(10.0) except KeyboardInterrupt: print("\nExiting...") - + robot.disconnect() if __name__ == "__main__": main() diff --git a/alignit/models/alignnet.py b/alignit/models/alignnet.py index 8b1a225..4fa3bd0 100644 --- a/alignit/models/alignnet.py +++ b/alignit/models/alignnet.py @@ -17,6 +17,17 @@ def __init__( output_dim=7, feature_agg="mean", ): + """ + :param backbone_name: 'efficientnet_b0' or 'resnet18' + :param backbone_weights: 'DEFAULT' or None + :param use_vector_input: whether to accept a vector input + :param fc_layers: list of hidden layer sizes for the fully connected head + :param vector_hidden_dim: output dim of the vector MLP + :param output_dim: final output vector size + :param feature_agg: 'mean' or 'max' across image views + :param use_depth_input: whether to accept depth input + :param depth_hidden_dim: output dim of the depth MLP + """ super().__init__() self.use_vector_input = use_vector_input self.use_depth_input = use_depth_input diff --git a/alignit/record.py b/alignit/record.py index 1e7168e..304c479 100644 --- a/alignit/record.py +++ b/alignit/record.py @@ -6,6 +6,9 @@ from alignit.robots.xarm import Xarm from alignit.utils.zhou import se3_sixd import argparse # Added for command line arguments +import draccus +from alignit.config import RecordConfig +from alignit.robots.xarmsim import XarmSim from datasets import ( Dataset, Features, @@ -16,89 +19,48 @@ concatenate_datasets, ) -def generate_lift_trajectory_with_random_rpy( - start_pose, - lift_distance=0.1, - num_steps=100, - max_rotation_angle=15.0, - initial_lift=0.005, -): - - trajectory = [] - R_start = start_pose[:3, :3] - t_start = start_pose[:3, 3] - - object_z_axis = R_start[:, 2] - - t_current = t_start + object_z_axis * initial_lift - - for i in range(num_steps): - lift_progress = i / (num_steps - 1) # 0 to 1 - current_lift = t_start + object_z_axis * -(initial_lift + lift_distance * lift_progress) - - r_rand = np.random.uniform(-max_rotation_angle, max_rotation_angle) - p_rand = np.random.uniform(-max_rotation_angle, max_rotation_angle) - y_rand = np.random.uniform(-max_rotation_angle, max_rotation_angle) - - random_rotation = R.from_euler('xyz', - [np.radians(r_rand), - np.radians(p_rand), - np.radians(y_rand)]).as_matrix() - - R_current = R_start @ random_rotation - - T = np.eye(4) - T[:3, :3] = R_current - T[:3, 3] = t_start - - trajectory.append(T) - - return trajectory - -def generate_spiral_trajectory( - start_pose, - z_step=0.1, - radius_step=0.001, - num_steps=50, - cone_angle=45, - visible_sweep=180, - 
viewing_angle_offset=0, - angular_resolution=5, - include_cone_poses=False, - lift_height_before_spiral=0.0005, -): +def generate_spiral_trajectory(start_pose, cfg): + """Generate spiral trajectory using configuration parameters.""" trajectory = [] R_start = start_pose[:3, :3] t_start_initial = start_pose[:3, 3] - cone_angle_rad = np.deg2rad(cone_angle) + cone_angle_rad = np.deg2rad(cfg.cone_angle) object_z_axis = R_start[:, 2] - lift_offset_world = object_z_axis * lift_height_before_spiral + lift_offset_world = object_z_axis * cfg.lift_height_before_spiral t_start_spiral = t_start_initial + lift_offset_world - start_angle = -visible_sweep / 2 + viewing_angle_offset - end_angle = visible_sweep / 2 + viewing_angle_offset + start_angle = -cfg.visible_sweep / 2 + cfg.viewing_angle_offset + end_angle = cfg.visible_sweep / 2 + cfg.viewing_angle_offset - for i in range(num_steps): - radius = radius_step * i + for i in range(cfg.num_steps): + radius = cfg.radius_step * i angle = 2 * np.pi * i / 10 local_offset = np.array( - [radius * np.cos(angle), radius * np.sin(angle), -z_step * i] + [radius * np.cos(angle), radius * np.sin(angle), -cfg.z_step * i] ) world_offset = R_start @ local_offset base_position = t_start_spiral + world_offset + x_rot = np.random.uniform(-10, 10) + y_rot = np.random.uniform(-10, 10) + z_rot = np.random.uniform(-10, 10) + + random_angles = np.radians([x_rot, y_rot, z_rot]) + random_rotation = R.from_euler("xyz", random_angles).as_matrix() + + randomized_rotation = R_start @ random_rotation T = np.eye(4) - T[:3, :3] = R_start + T[:3, :3] = randomized_rotation T[:3, 3] = base_position trajectory.append(T) - if include_cone_poses: - for deg in np.arange(start_angle, end_angle, angular_resolution): + if cfg.include_cone_poses: + for deg in np.arange(start_angle, end_angle, cfg.angular_resolution): theta = np.deg2rad(deg) tilt = t3d.euler.euler2mat(cone_angle_rad, 0, 0) @@ -112,108 +74,65 @@ def generate_spiral_trajectory( return trajectory -def load_and_merge_datasets(dataset_paths, output_path, features): - """Load and merge multiple datasets from disk""" - if not dataset_paths: - return None - - datasets = [] - for path in dataset_paths: - if os.path.exists(path): - try: - ds = load_from_disk(path) - ds = ds.cast(features) # Ensure consistent features - datasets.append(ds) - print(f"Loaded dataset from {path}") - except Exception as e: - print(f"Error loading dataset from {path}: {e}") - else: - print(f"Dataset path not found: {path}") - - if not datasets: - return None - - merged_dataset = concatenate_datasets(datasets) - - # Save the merged dataset - temp_path = output_path + "_temp" - merged_dataset.save_to_disk(temp_path) - if os.path.exists(output_path): - shutil.rmtree(output_path) - shutil.move(temp_path, output_path) - - return merged_dataset - -def main(args): - robot = Xarm() +@draccus.wrap() +def main(cfg: RecordConfig): + """Record alignment dataset using configuration parameters.""" + robot = XarmSim() features = Features( {"images": Sequence(Image()), "action": Sequence(Value("float32")), "depth": Sequence(Image())} ) - if args.merge_datasets: - print(f"Merging datasets: {args.merge_datasets}") - merged_dataset = load_and_merge_datasets( - args.merge_datasets, - args.output_path, - features - ) - if merged_dataset is not None: - print(f"Merged dataset created with {len(merged_dataset)} samples at {args.output_path}") - else: - print("No valid datasets found to merge") - max_rot = 15 - - for episode in range(args.num_episodes): + for episode in 
range(cfg.episodes): pose_start, pose_alignment_target = robot.reset() - max_rot = max_rot + 2 - trajectory = generate_lift_trajectory_with_random_rpy( - pose_start, - lift_distance=0.1, - num_steps=100, - max_rotation_angle=max_rot, - initial_lift=0.005, + + robot.servo_to_pose(pose_alignment_target, lin_tol=0.015, ang_tol=0.015) + + robot.servo_to_pose( + pose_alignment_target, + lin_tol=cfg.lin_tol_alignment, + ang_tol=cfg.ang_tol_alignment, ) - print(f"Generated trajectory with {len(trajectory)} poses for episode {episode+1}") + trajectory = generate_spiral_trajectory(pose_start, cfg.trajectory) + + print(f"Generated trajectory with {len(trajectory)} poses for episode {episode+1}",flush=True) frames = [] for pose in trajectory: - robot.servo_to_pose(pose, lin_tol=5e-3, ang_tol=0.09) - current_pose = robot.pose() - - action_pose = np.linalg.inv(current_pose) @ pose_alignment_target - action_sixd = se3_sixd(action_pose) - - observation = robot.get_observation() - print(observation.keys()) - frame = {"images": [observation["rgb"].copy()],"depth": [observation["depth"].copy()],"action": action_sixd} - frames.append(frame) - + robot.servo_to_pose( + pose, lin_tol=cfg.lin_tol_trajectory, ang_tol=cfg.ang_tol_trajectory + ) + current_pose = robot.pose() + + action_pose = np.linalg.inv(current_pose) @ pose_alignment_target + action_sixd = se3_sixd(action_pose) + + observation = robot.get_observation() + frame = { + "images": [observation["camera.rgb"].copy()], + "action": action_sixd, + "depth": [observation["camera.rgb.depth"].copy()], + } + frames.append(frame) print(f"Episode {episode+1} completed with {len(frames)} frames.") episode_dataset = Dataset.from_list(frames, features=features) - if episode == 0 and not os.path.exists(args.output_path): + if episode == 0: combined_dataset = episode_dataset + print("Writing do dataset to disk for the first time.") else: - previous_dataset = load_from_disk(args.output_path) + previous_dataset = load_from_disk(cfg.dataset.path) previous_dataset = previous_dataset.cast(features) combined_dataset = concatenate_datasets([previous_dataset, episode_dataset]) del previous_dataset - temp_path = args.output_path + "_temp" + temp_path = f"{cfg.dataset.path}_temp" combined_dataset.save_to_disk(temp_path) - if os.path.exists(args.output_path): - shutil.rmtree(args.output_path) - shutil.move(temp_path, args.output_path) + if os.path.exists(cfg.dataset.path): + shutil.rmtree(cfg.dataset.path) + shutil.move(temp_path, cfg.dataset.path) + print(f"Saved dataset to {cfg.dataset.path}") robot.disconnect() + if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Robot data collection and dataset merging') - parser.add_argument('--output-path', type=str, default="data/duck", - help='Path to save the output dataset') - parser.add_argument('--num-episodes', type=int, default=15, - help='Number of episodes to run') - parser.add_argument('--merge-datasets', type=str, nargs='+', default=None, - help='List of dataset paths to merge before collection') - - args = parser.parse_args() - main(args) + main() \ No newline at end of file diff --git a/alignit/robots/xarm.py b/alignit/robots/xarm.py index 075c2cc..207e718 100644 --- a/alignit/robots/xarm.py +++ b/alignit/robots/xarm.py @@ -33,10 +33,12 @@ def send_action(self, action): self.robot.send_action(action) def get_observation(self): - rgb_image = self.camera.read() + rgb_image,depth_image,acqusition_time = self.camera.async_read() return { - "camera.rgb": rgb_image, + "rgb": rgb_image, + "depth": 
depth_image, + "timestamp": acqusition_time } def disconnect(self): diff --git a/alignit/robots/xarmsim/__init__.py b/alignit/robots/xarmsim/__init__.py index c64e631..bf467b5 100644 --- a/alignit/robots/xarmsim/__init__.py +++ b/alignit/robots/xarmsim/__init__.py @@ -189,9 +189,15 @@ def get_observation(self): for i in range(self.model.ncam): name = mujoco.mj_id2name(self.model, mujoco.mjtObj.mjOBJ_CAMERA, i) + self.renderer.update_scene(self.data, camera=name) image = self.renderer.render() + self.renderer.enable_depth_rendering() + self.renderer.update_scene(self.data, camera=name) + image_depth = self.renderer.render() + self.renderer.disable_depth_rendering() obs["camera." + name] = image[:, :, ::-1] + obs["camera." + name + ".depth"] = image_depth return obs diff --git a/alignit/train.py b/alignit/train.py index 51a23b2..1ce0b2b 100644 --- a/alignit/train.py +++ b/alignit/train.py @@ -13,29 +13,40 @@ def collate_fn(batch): images = [item["images"] for item in batch] + depth_images = [item.get("depth", None) for item in batch] actions = [item["action"] for item in batch] - return {"images": images, "action": torch.tensor(actions, dtype=torch.float32)} + return {"images": images, "depth_images": depth_images, "action": torch.tensor(actions, dtype=torch.float32)} @draccus.wrap() -def main(): - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - # Load the dataset from disk +def main(cfg: TrainConfig): + """Train AlignNet model using configuration parameters.""" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Load the dataset from disk dataset = load_from_disk("data/duck") net = AlignNet( - output_dim=9, - use_vector_input=False, + backbone_name=cfg.model.backbone, + backbone_weights=cfg.model.backbone_weights, + use_vector_input=cfg.model.use_vector_input, + fc_layers=cfg.model.fc_layers, + vector_hidden_dim=cfg.model.vector_hidden_dim, + output_dim=cfg.model.output_dim, + feature_agg=cfg.model.feature_agg, ).to(device) - train_dataset = dataset.train_test_split(test_size=0.2, seed=42) - + train_dataset = dataset.train_test_split( + test_size=cfg.test_size, seed=cfg.random_seed + ) train_loader = DataLoader( - train_dataset["train"], batch_size=4, shuffle=True, collate_fn=collate_fn + train_dataset["train"], + batch_size=cfg.batch_size, + shuffle=True, + collate_fn=collate_fn, ) - optimizer = Adam(net.parameters(), lr=1e-4) + + optimizer = Adam(net.parameters(), lr=cfg.learning_rate) criterion = MSELoss() net.train() - for epoch in range(100): + for epoch in range(cfg.epochs): for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"): images = batch["images"] depth_images_pil = batch["depth_images"] diff --git a/alignit/visualize.py b/alignit/visualize.py index 9b8adad..09525b3 100644 --- a/alignit/visualize.py +++ b/alignit/visualize.py @@ -1,6 +1,5 @@ import gradio as gr import draccus -from datasets import load_from_disk from alignit.utils.dataset import load_dataset from alignit.utils.zhou import sixd_se3 from alignit.utils.tfs import get_pose_str @@ -8,8 +7,8 @@ @draccus.wrap() -def main(): - dataset = load_from_disk("data/duck") +def visualize(cfg: VisualizeConfig): + dataset = load_dataset(cfg.dataset.path) def get_data(index): print(index) @@ -31,4 +30,4 @@ def get_data(index): if __name__ == "__main__": - main() + visualize() From 0e67f686862c9490d271e443a1fa680679de94f1 Mon Sep 17 00:00:00 2001 From: Nikola Stojic Date: Tue, 12 Aug 2025 17:50:31 +0200 Subject: [PATCH 04/13] Added tests for depth image and all inputs available --- 
alignit/models/alignnet.py | 4 +++- tests/test_alignnet.py | 27 ++++++++++++++++++++++++--- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/alignit/models/alignnet.py b/alignit/models/alignnet.py index 4fa3bd0..d6a4a2d 100644 --- a/alignit/models/alignnet.py +++ b/alignit/models/alignnet.py @@ -40,7 +40,7 @@ def __init__( self.image_fc = nn.Sequential( nn.Linear(self.image_feature_dim, fc_layers[0]), nn.ReLU() ) - + if use_depth_input: self.depth_cnn = nn.Sequential( nn.Conv2d(1, 8, 3, padding=1), nn.ReLU(), @@ -127,6 +127,8 @@ def forward(self, rgb_images, vector_inputs=None, depth_images=None): features.append(vec_feats) fused = torch.cat(features, dim=1) + print("Fused shape:", fused.shape) + return self.head(fused) # (B, output_dim) diff --git a/tests/test_alignnet.py b/tests/test_alignnet.py index aa77cb6..4907b9e 100644 --- a/tests/test_alignnet.py +++ b/tests/test_alignnet.py @@ -4,7 +4,7 @@ def test_alignnet_forward_shapes_cpu(): - model = AlignNet(backbone_name="resnet18", backbone_weights=None, use_vector_input=False, output_dim=7) + model = AlignNet(backbone_name="resnet18", backbone_weights=None, use_vector_input=False, output_dim=7, use_depth_input=False) model.eval() x = torch.randn(2, 3, 3, 64, 64) # B=2, N=3 views with torch.no_grad(): @@ -13,7 +13,7 @@ def test_alignnet_forward_shapes_cpu(): def test_alignnet_with_vector_input(): - model = AlignNet(backbone_name="resnet18", backbone_weights=None, use_vector_input=True, output_dim=7) + model = AlignNet(backbone_name="resnet18", backbone_weights=None, use_vector_input=True, output_dim=7,use_depth_input=False) model.eval() x = torch.randn(1, 2, 3, 64, 64) vecs = [torch.randn(5)] @@ -23,7 +23,7 @@ def test_alignnet_with_vector_input(): def test_alignnet_performance(): - model = AlignNet(backbone_name="efficientnet_b0", backbone_weights=None, use_vector_input=True, output_dim=7) + model = AlignNet(backbone_name="efficientnet_b0", backbone_weights=None, use_vector_input=True, output_dim=7,use_depth_input=False) model.eval() x = torch.randn(1, 3, 3, 224, 224) # B=1, N=3 views vecs = [torch.randn(5)] @@ -37,3 +37,24 @@ def test_alignnet_performance(): print(f"Forward pass took {elapsed_time_ms:.2f} ms") assert elapsed_time < 0.5 + +def test_alignnet_with_depth_input(): + model = AlignNet( + backbone_name="resnet18",backbone_weights=None,use_vector_input=False,use_depth_input=True, output_dim=7) + model.eval() + x = torch.randn(2, 3, 3, 64, 64) # RGB images + depth = torch.randn(2, 3, 1, 64, 64) # Depth images + with torch.no_grad(): + y = model(x, depth_images=depth) + assert y.shape == (2, 7) + +def test_alignnet_with_all_inputs(): + # New test for combined inputs + model = AlignNet(backbone_name="efficientnet_b0",backbone_weights=None,use_vector_input=True,use_depth_input=True, output_dim=7) + model.eval() + x = torch.randn(1, 3, 3, 224, 224) # RGB images + depth = torch.randn(1, 3, 1, 224, 224) # Depth images + vecs = [torch.randn(5)] # Vector inputs + with torch.no_grad(): + y = model(x, vecs, depth) + assert y.shape == (1, 7) \ No newline at end of file From c7a0ba4ff72388722cf1d74415dcdfbe2d94b43e Mon Sep 17 00:00:00 2001 From: Nikola Stojic Date: Tue, 12 Aug 2025 17:56:44 +0200 Subject: [PATCH 05/13] Cameras folder moved to alignit --- {cameras => alignit/cameras}/realsense.py | 9 ++------- tests/test_alignnet.py | 2 ++ 2 files changed, 4 insertions(+), 7 deletions(-) rename {cameras => alignit/cameras}/realsense.py (99%) diff --git a/cameras/realsense.py b/alignit/cameras/realsense.py similarity index 99% 
rename from cameras/realsense.py rename to alignit/cameras/realsense.py index d16d2fd..f3037ea 100644 --- a/cameras/realsense.py +++ b/alignit/cameras/realsense.py @@ -1,10 +1,5 @@ -#reference to https://github.com/huggingface/lerobot/tree/main -#async_read now returns both rgb and depth images - - -""" -Provides the RealSenseCamera class for capturing frames from Intel RealSense cameras. -""" +# Reference to https://github.com/huggingface/lerobot/tree/main +# Changed async_read now returns both RGB and depth images import logging import time diff --git a/tests/test_alignnet.py b/tests/test_alignnet.py index 4907b9e..e88630c 100644 --- a/tests/test_alignnet.py +++ b/tests/test_alignnet.py @@ -38,6 +38,7 @@ def test_alignnet_performance(): assert elapsed_time < 0.5 + def test_alignnet_with_depth_input(): model = AlignNet( backbone_name="resnet18",backbone_weights=None,use_vector_input=False,use_depth_input=True, output_dim=7) @@ -48,6 +49,7 @@ def test_alignnet_with_depth_input(): y = model(x, depth_images=depth) assert y.shape == (2, 7) + def test_alignnet_with_all_inputs(): # New test for combined inputs model = AlignNet(backbone_name="efficientnet_b0",backbone_weights=None,use_vector_input=True,use_depth_input=True, output_dim=7) From 2404e642eeec766582bc9e493db90d921606861d Mon Sep 17 00:00:00 2001 From: Nikola Stojic Date: Tue, 12 Aug 2025 22:34:50 +0200 Subject: [PATCH 06/13] Formatting fix --- alignit/config.py | 5 +++++ alignit/infere.py | 6 +++--- alignit/models/alignnet.py | 30 ++++-------------------------- alignit/record.py | 7 +++---- alignit/robots/xarm.py | 5 +---- alignit/robots/xarmsim/__init__.py | 1 - alignit/train.py | 10 +++++----- alignit/visualize.py | 11 +++++++---- tests/test_alignnet.py | 6 +++--- 9 files changed, 31 insertions(+), 50 deletions(-) diff --git a/alignit/config.py b/alignit/config.py index 935f69b..33ef561 100644 --- a/alignit/config.py +++ b/alignit/config.py @@ -156,6 +156,11 @@ class InferConfig: default=5, metadata={"help": "Number of iterations within tolerance before stopping"}, ) + num_multiplications: int = field( + default=3, + metadata={ + "help": "Number of times to multiply the transformation matrix"}, + ) @dataclass diff --git a/alignit/infere.py b/alignit/infere.py index c336563..2fbb7ca 100644 --- a/alignit/infere.py +++ b/alignit/infere.py @@ -70,7 +70,7 @@ def main(cfg: InferConfig): if cfg.debug_output: print_pose(relative_action) - relative_action [:3,:3] = relative_action[:3,:3] @ relative_action[:3,:3] @ relative_action[:3,:3] + relative_action[:3, :3] = np.linalg.matrix_power(relative_action[:3, :3], cfg.num_multiplications) # Check convergence if are_tfs_close( @@ -84,10 +84,10 @@ def main(cfg: InferConfig): print("Alignment achieved - stopping.") break - action = robot.pose() @ relative_action + target_pose = robot.pose() @ relative_action iteration += 1 action = { - "pose": action, + "pose": target_pose, "gripper.pos": 1.0, } robot.send_action(action) diff --git a/alignit/models/alignnet.py b/alignit/models/alignnet.py index d6a4a2d..3540515 100644 --- a/alignit/models/alignnet.py +++ b/alignit/models/alignnet.py @@ -7,13 +7,13 @@ class AlignNet(nn.Module): def __init__( self, - backbone_name="efficientnet_b0", + backbone_name="efficientnet_b0", backbone_weights="DEFAULT", use_vector_input=True, - use_depth_input=True, # NEW + use_depth_input=True, fc_layers=[256, 128], vector_hidden_dim=64, - depth_hidden_dim=128, # NEW + depth_hidden_dim=128, output_dim=7, feature_agg="mean", ): @@ -129,26 +129,4 @@ def forward(self, 
rgb_images, vector_inputs=None, depth_images=None): fused = torch.cat(features, dim=1) print("Fused shape:", fused.shape) - return self.head(fused) # (B, output_dim) - - -if __name__ == "__main__": - import time - - batch_size = 1 - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - model = AlignNet(backbone_name="efficientnet_b0", use_vector_input=True, use_depth_input=True).to(device) - - rgb_images = torch.randn(batch_size, 4, 3, 224, 224).to(device) - depth_images = torch.randn(batch_size, 4, 1, 224, 224).to(device) - vector_inputs = [torch.randn(10).to(device) for _ in range(batch_size)] - - output = None - start_time = time.time() - for i in range(100): - output = model(rgb_images, vector_inputs, depth_images) - end_time = time.time() - duration_ms = ((end_time - start_time) / 100) * 1000 - print(f"Inference time: {duration_ms:.3f} ms") - print(f"Optimal for {1000 / duration_ms:.2f} fps") - print("Output shape:", output.shape) \ No newline at end of file + return self.head(fused) # (B, output_dim) \ No newline at end of file diff --git a/alignit/record.py b/alignit/record.py index 304c479..6a4e783 100644 --- a/alignit/record.py +++ b/alignit/record.py @@ -5,7 +5,6 @@ from scipy.spatial.transform import Rotation as R from alignit.robots.xarm import Xarm from alignit.utils.zhou import se3_sixd -import argparse # Added for command line arguments import draccus from alignit.config import RecordConfig from alignit.robots.xarmsim import XarmSim @@ -88,9 +87,9 @@ def main(cfg: RecordConfig): robot.servo_to_pose(pose_alignment_target, lin_tol=0.015, ang_tol=0.015) robot.servo_to_pose( - pose_alignment_target, - lin_tol=cfg.lin_tol_alignment, - ang_tol=cfg.ang_tol_alignment, + pose_alignment_target, + lin_tol=cfg.lin_tol_alignment, + ang_tol=cfg.ang_tol_alignment, ) trajectory = generate_spiral_trajectory(pose_start, cfg.trajectory) diff --git a/alignit/robots/xarm.py b/alignit/robots/xarm.py index 207e718..3cb722d 100644 --- a/alignit/robots/xarm.py +++ b/alignit/robots/xarm.py @@ -1,11 +1,9 @@ import time - from lerobot.cameras.realsense import RealSenseCamera, RealSenseCameraConfig from lerobot_xarm.xarm import Xarm as LeXarm from lerobot_xarm.config import XarmConfig import numpy as np import transforms3d as t3d - from alignit.robots.robot import Robot from alignit.utils.tfs import are_tfs_close @@ -33,12 +31,11 @@ def send_action(self, action): self.robot.send_action(action) def get_observation(self): - rgb_image,depth_image,acqusition_time = self.camera.async_read() + rgb_image, depth_image, acquisition_time = self.camera.async_read() return { "rgb": rgb_image, "depth": depth_image, - "timestamp": acqusition_time } def disconnect(self): diff --git a/alignit/robots/xarmsim/__init__.py b/alignit/robots/xarmsim/__init__.py index bf467b5..f3d6930 100644 --- a/alignit/robots/xarmsim/__init__.py +++ b/alignit/robots/xarmsim/__init__.py @@ -189,7 +189,6 @@ def get_observation(self): for i in range(self.model.ncam): name = mujoco.mj_id2name(self.model, mujoco.mjtObj.mjOBJ_CAMERA, i) - self.renderer.update_scene(self.data, camera=name) image = self.renderer.render() self.renderer.enable_depth_rendering() diff --git a/alignit/train.py b/alignit/train.py index 1ce0b2b..7a714a4 100644 --- a/alignit/train.py +++ b/alignit/train.py @@ -21,8 +21,8 @@ def collate_fn(batch): @draccus.wrap() def main(cfg: TrainConfig): """Train AlignNet model using configuration parameters.""" - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Load the dataset from disk - 
dataset = load_from_disk("data/duck") + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + dataset = load_from_disk(cfg.dataset.path) net = AlignNet( backbone_name=cfg.model.backbone, backbone_weights=cfg.model.backbone_weights, @@ -82,13 +82,13 @@ def main(cfg: TrainConfig): batch_depth_tensors = torch.stack(batch_depth_tensors, dim=0).to(device) optimizer.zero_grad() - outputs = net(batch_rgb_tensors, depth_images=batch_depth_tensors) # <--- Pass depth_images + outputs = net(batch_rgb_tensors, depth_images=batch_depth_tensors) loss = criterion(outputs, actions) loss.backward() optimizer.step() tqdm.write(f"Loss: {loss.item():.4f}") - torch.save(net.state_dict(), "alignnet_model.pth") - tqdm.write("Model saved as alignnet_model.pth") + torch.save(net.state_dict(), cfg.model.path) + tqdm.write(f"Model saved as {cfg.model.path}") print("Training complete.") diff --git a/alignit/visualize.py b/alignit/visualize.py index 09525b3..5d1203e 100644 --- a/alignit/visualize.py +++ b/alignit/visualize.py @@ -11,22 +11,25 @@ def visualize(cfg: VisualizeConfig): dataset = load_dataset(cfg.dataset.path) def get_data(index): - print(index) item = dataset[index] image = item["images"][0] depth = item["depth"][0] action_sixd = item["action"] action = sixd_se3(action_sixd) label = get_pose_str(action, degrees=True) - return image,depth, label + return image, depth, label gr.Interface( fn=get_data, inputs=gr.Slider(0, len(dataset) - 1, step=1, label="Index", interactive=True), - outputs=[gr.Image(type="pil", label="Image"),gr.Image(type="pil", label="Depth Image"), gr.Text(label="Label")], + outputs=[gr.Image(type="pil", label="Image"), gr.Image(type="pil", label="Depth Image"), gr.Text(label="Label")], title="Dataset Image Viewer", live=True, - ).launch() + ).launch( + share=cfg.share, + server_name=cfg.server_name, + server_port=cfg.server_port + ) if __name__ == "__main__": diff --git a/tests/test_alignnet.py b/tests/test_alignnet.py index e88630c..6415d01 100644 --- a/tests/test_alignnet.py +++ b/tests/test_alignnet.py @@ -13,7 +13,7 @@ def test_alignnet_forward_shapes_cpu(): def test_alignnet_with_vector_input(): - model = AlignNet(backbone_name="resnet18", backbone_weights=None, use_vector_input=True, output_dim=7,use_depth_input=False) + model = AlignNet(backbone_name="resnet18", backbone_weights=None, use_vector_input=True, output_dim=7, use_depth_input=False) model.eval() x = torch.randn(1, 2, 3, 64, 64) vecs = [torch.randn(5)] @@ -23,7 +23,7 @@ def test_alignnet_with_vector_input(): def test_alignnet_performance(): - model = AlignNet(backbone_name="efficientnet_b0", backbone_weights=None, use_vector_input=True, output_dim=7,use_depth_input=False) + model = AlignNet(backbone_name="efficientnet_b0", backbone_weights=None, use_vector_input=True, output_dim=7, use_depth_input=False) model.eval() x = torch.randn(1, 3, 3, 224, 224) # B=1, N=3 views vecs = [torch.randn(5)] @@ -41,7 +41,7 @@ def test_alignnet_performance(): def test_alignnet_with_depth_input(): model = AlignNet( - backbone_name="resnet18",backbone_weights=None,use_vector_input=False,use_depth_input=True, output_dim=7) + backbone_name="resnet18", backbone_weights=None, use_vector_input=False, use_depth_input=True, output_dim=7) model.eval() x = torch.randn(2, 3, 3, 64, 64) # RGB images depth = torch.randn(2, 3, 1, 64, 64) # Depth images From 7f8c720b8e646a00b7542063a65f791e0648752d Mon Sep 17 00:00:00 2001 From: Nikola Stojic Date: Wed, 13 Aug 2025 15:32:09 +0200 Subject: [PATCH 07/13] Before measuring precision --- 
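[annotation, not part of the patch] Several of the remaining patches adjust where one preprocessing step happens: the RealSense depth frame arrives in millimetres and, before it reaches the network, is clipped to a 1 m working range and scaled to metres (the np.clip(..., a_max=1000) followed by / 1000.0 that later patches move between infere.py, train.py and Xarm.get_observation()). A minimal sketch of that conversion, using an illustrative helper name that is not part of the repository:

import numpy as np
import torch

def depth_mm_to_tensor(depth_mm: np.ndarray, max_range_mm: float = 1000.0) -> torch.Tensor:
    """Clip a millimetre depth map to the working range, scale it to metres,
    and shape it as a single-view batch (1, 1, H, W) as the inference code does."""
    depth_m = np.clip(depth_mm.astype(np.float32), 0.0, max_range_mm) / 1000.0
    return torch.from_numpy(depth_m).unsqueeze(0).unsqueeze(0)

# Example with a dummy 480x640 frame in millimetres:
frame_mm = (np.random.rand(480, 640) * 2000).astype(np.uint16)
print(depth_mm_to_tensor(frame_mm).shape)  # torch.Size([1, 1, 480, 640])
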
alignit/cameras/realsense.py | 69 +++++++++++++++++++++++++++--------- alignit/config.py | 8 ++--- alignit/infere.py | 13 ++++--- alignit/models/alignnet.py | 14 ++++---- alignit/record.py | 49 ++++++++++++++----------- alignit/robots/xarm.py | 3 ++ alignit/train.py | 62 +++++++++++++++++++++----------- alignit/visualize.py | 6 +++- measuring/precision.py | 41 +++++++++++++++++++++ tests/test_alignnet.py | 20 +++++++++-- 10 files changed, 208 insertions(+), 77 deletions(-) create mode 100644 measuring/precision.py diff --git a/alignit/cameras/realsense.py b/alignit/cameras/realsense.py index f3037ea..ef457cf 100644 --- a/alignit/cameras/realsense.py +++ b/alignit/cameras/realsense.py @@ -104,7 +104,9 @@ def __init__(self, config: RealSenseCameraConfig): if config.serial_number_or_name.isdigit(): self.serial_number = config.serial_number_or_name else: - self.serial_number = self._find_serial_number_from_name(config.serial_number_or_name) + self.serial_number = self._find_serial_number_from_name( + config.serial_number_or_name + ) self.fps = config.fps self.color_mode = config.color_mode @@ -125,7 +127,10 @@ def __init__(self, config: RealSenseCameraConfig): if self.height and self.width: self.capture_width, self.capture_height = self.width, self.height - if self.rotation in [cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE]: + if self.rotation in [ + cv2.ROTATE_90_CLOCKWISE, + cv2.ROTATE_90_COUNTERCLOCKWISE, + ]: self.capture_width, self.capture_height = self.height, self.width def __str__(self) -> str: @@ -203,7 +208,9 @@ def find_cameras() -> list[dict[str, Any]]: "type": "RealSense", "id": device.get_info(rs.camera_info.serial_number), "firmware_version": device.get_info(rs.camera_info.firmware_version), - "usb_type_descriptor": device.get_info(rs.camera_info.usb_type_descriptor), + "usb_type_descriptor": device.get_info( + rs.camera_info.usb_type_descriptor + ), "physical_port": device.get_info(rs.camera_info.physical_port), "product_id": device.get_info(rs.camera_info.product_id), "product_line": device.get_info(rs.camera_info.product_line), @@ -257,11 +264,19 @@ def _configure_rs_pipeline_config(self, rs_config): if self.width and self.height and self.fps: rs_config.enable_stream( - rs.stream.color, self.capture_width, self.capture_height, rs.format.rgb8, self.fps + rs.stream.color, + self.capture_width, + self.capture_height, + rs.format.rgb8, + self.fps, ) if self.use_depth: rs_config.enable_stream( - rs.stream.depth, self.capture_width, self.capture_height, rs.format.z16, self.fps + rs.stream.depth, + self.capture_width, + self.capture_height, + rs.format.z16, + self.fps, ) else: rs_config.enable_stream(rs.stream.color) @@ -278,7 +293,9 @@ def _configure_capture_settings(self) -> None: DeviceNotConnectedError: If device is not connected. """ if not self.is_connected: - raise DeviceNotConnectedError(f"Cannot validate settings for {self} as it is not connected.") + raise DeviceNotConnectedError( + f"Cannot validate settings for {self} as it is not connected." 
+ ) stream = self.rs_profile.get_stream(rs.stream.color).as_video_stream_profile() @@ -288,7 +305,10 @@ def _configure_capture_settings(self) -> None: if self.width is None or self.height is None: actual_width = int(round(stream.width())) actual_height = int(round(stream.height())) - if self.rotation in [cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE]: + if self.rotation in [ + cv2.ROTATE_90_CLOCKWISE, + cv2.ROTATE_90_COUNTERCLOCKWISE, + ]: self.width, self.height = actual_height, actual_width self.capture_width, self.capture_height = actual_width, actual_height else: @@ -338,7 +358,9 @@ def read_depth(self, timeout_ms: int = 200) -> np.ndarray: return depth_map_processed - def read(self, color_mode: ColorMode | None = None, timeout_ms: int = 200) -> np.ndarray: + def read( + self, color_mode: ColorMode | None = None, timeout_ms: int = 200 + ) -> np.ndarray: """ Reads a single frame (color) synchronously from the camera. @@ -379,7 +401,10 @@ def read(self, color_mode: ColorMode | None = None, timeout_ms: int = 200) -> np return color_image_processed def _postprocess_image( - self, image: np.ndarray, color_mode: ColorMode | None = None, depth_frame: bool = False + self, + image: np.ndarray, + color_mode: ColorMode | None = None, + depth_frame: bool = False, ) -> np.ndarray: """ Applies color conversion, dimension validation, and rotation to a raw color frame. @@ -409,7 +434,9 @@ def _postprocess_image( h, w, c = image.shape if c != 3: - raise RuntimeError(f"{self} frame channels={c} do not match expected 3 channels (RGB/BGR).") + raise RuntimeError( + f"{self} frame channels={c} do not match expected 3 channels (RGB/BGR)." + ) if h != self.capture_height or w != self.capture_width: raise RuntimeError( @@ -420,7 +447,11 @@ def _postprocess_image( if self.color_mode == ColorMode.BGR: processed_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) - if self.rotation in [cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE, cv2.ROTATE_180]: + if self.rotation in [ + cv2.ROTATE_90_CLOCKWISE, + cv2.ROTATE_90_COUNTERCLOCKWISE, + cv2.ROTATE_180, + ]: processed_image = cv2.rotate(processed_image, self.rotation) return processed_image @@ -439,7 +470,7 @@ def _read_loop(self): while not self.stop_event.is_set(): try: color_image = self.read(timeout_ms=500) - depth_image = self.read_depth(timeout_ms=500) + depth_image = self.read_depth(timeout_ms=500) with self.frame_lock: self.latest_frame = color_image @@ -450,7 +481,9 @@ def _read_loop(self): except DeviceNotConnectedError: break except Exception as e: - logger.warning(f"Error reading frame in background thread for {self}: {e}") + logger.warning( + f"Error reading frame in background thread for {self}: {e}" + ) def _start_read_thread(self) -> None: """Starts or restarts the background read thread if it's not running.""" @@ -513,13 +546,17 @@ def async_read(self, timeout_ms: float = 200) -> np.ndarray: with self.frame_lock: frame = self.latest_frame depth = self.latest_frame_depth - acquisition_time = self.latest_frame_acquisition_time # Retrieve the timestamp + acquisition_time = ( + self.latest_frame_acquisition_time + ) # Retrieve the timestamp self.new_frame_event.clear() if frame is None: - raise RuntimeError(f"Internal error: Event set but no frame available for {self}.") + raise RuntimeError( + f"Internal error: Event set but no frame available for {self}." 
+ ) - return frame, depth,acquisition_time + return frame, depth, acquisition_time def disconnect(self): """ diff --git a/alignit/config.py b/alignit/config.py index 33ef561..19cb384 100644 --- a/alignit/config.py +++ b/alignit/config.py @@ -47,8 +47,7 @@ class ModelConfig: metadata={"help": "Path to save/load trained model"}, ) use_depth_input: bool = field( - default=True, - metadata={"help": "Whether to use depth input for the model"} + default=True, metadata={"help": "Whether to use depth input for the model"} ) depth_hidden_dim: int = field( default=128, metadata={"help": "Output dimension of depth CNN"} @@ -156,10 +155,11 @@ class InferConfig: default=5, metadata={"help": "Number of iterations within tolerance before stopping"}, ) - num_multiplications: int = field( + rotation_matrix_multiplier: int = field( default=3, metadata={ - "help": "Number of times to multiply the transformation matrix"}, + "help": "Number of times to multiply the rotation matrix of relative action in order to speed up convergence" + }, ) diff --git a/alignit/infere.py b/alignit/infere.py index 2fbb7ca..6afed07 100644 --- a/alignit/infere.py +++ b/alignit/infere.py @@ -8,7 +8,7 @@ from alignit.utils.zhou import sixd_se3 from alignit.utils.tfs import print_pose, are_tfs_close from alignit.robots.xarmsim import XarmSim -from alignit.robots.xarm import Xarm + @draccus.wrap() def main(cfg: InferConfig): @@ -35,7 +35,7 @@ def main(cfg: InferConfig): start_pose = t3d.affines.compose( [0.23, 0, 0.25], t3d.euler.euler2mat(np.pi, 0, 0), [1, 1, 1] ) - robot.servo_to_pose(start_pose, lin_tol=1e-2) + robot.servo_to_pose(start_pose, lin_tol=1e-2) iteration = 0 iterations_within_tolerance = 0 ang_tol_rad = np.deg2rad(cfg.ang_tolerance) @@ -47,7 +47,7 @@ def main(cfg: InferConfig): rgb_image_tensor = ( torch.from_numpy(np.array(rgb_image)) .permute(2, 0, 1) # (H, W, C) -> (C, H, W) - .unsqueeze(0) + .unsqueeze(0) .to(device) ) @@ -61,7 +61,6 @@ def main(cfg: InferConfig): rgb_images_batch = rgb_image_tensor.unsqueeze(1) depth_images_batch = depth_image_tensor.unsqueeze(1) - with torch.no_grad(): relative_action = net(rgb_images_batch, depth_images=depth_images_batch) relative_action = relative_action.squeeze(0).cpu().numpy() @@ -70,7 +69,9 @@ def main(cfg: InferConfig): if cfg.debug_output: print_pose(relative_action) - relative_action[:3, :3] = np.linalg.matrix_power(relative_action[:3, :3], cfg.num_multiplications) + relative_action[:3, :3] = np.linalg.matrix_power( + relative_action[:3, :3], cfg.rotation_matrix_multiplier + ) # Check convergence if are_tfs_close( @@ -102,5 +103,7 @@ def main(cfg: InferConfig): print("\nExiting...") robot.disconnect() + + if __name__ == "__main__": main() diff --git a/alignit/models/alignnet.py b/alignit/models/alignnet.py index 3540515..8f652bf 100644 --- a/alignit/models/alignnet.py +++ b/alignit/models/alignnet.py @@ -40,16 +40,16 @@ def __init__( self.image_fc = nn.Sequential( nn.Linear(self.image_feature_dim, fc_layers[0]), nn.ReLU() ) - + if use_depth_input: self.depth_cnn = nn.Sequential( - nn.Conv2d(1, 8, 3, padding=1), nn.ReLU(), - nn.Conv2d(8, 16, 3, padding=1), nn.ReLU(), + nn.Conv2d(1, 8, 3, padding=1), + nn.ReLU(), + nn.Conv2d(8, 16, 3, padding=1), + nn.ReLU(), nn.AdaptiveAvgPool2d(1), ) - self.depth_fc = nn.Sequential( - nn.Linear(16, depth_hidden_dim), nn.ReLU() - ) + self.depth_fc = nn.Sequential(nn.Linear(16, depth_hidden_dim), nn.ReLU()) input_dim = fc_layers[0] + depth_hidden_dim else: input_dim = fc_layers[0] @@ -129,4 +129,4 @@ def forward(self, rgb_images, 
vector_inputs=None, depth_images=None): fused = torch.cat(features, dim=1) print("Fused shape:", fused.shape) - return self.head(fused) # (B, output_dim) \ No newline at end of file + return self.head(fused) # (B, output_dim) diff --git a/alignit/record.py b/alignit/record.py index 6a4e783..3600d16 100644 --- a/alignit/record.py +++ b/alignit/record.py @@ -18,6 +18,7 @@ concatenate_datasets, ) + def generate_spiral_trajectory(start_pose, cfg): """Generate spiral trajectory using configuration parameters.""" trajectory = [] @@ -73,12 +74,17 @@ def generate_spiral_trajectory(start_pose, cfg): return trajectory + @draccus.wrap() def main(cfg: RecordConfig): """Record alignment dataset using configuration parameters.""" - robot = XarmSim() + robot = Xarm() features = Features( - {"images": Sequence(Image()), "action": Sequence(Value("float32")), "depth": Sequence(Image())} + { + "images": Sequence(Image()), + "action": Sequence(Value("float32")), + "depth": Sequence(Image()), + } ) for episode in range(cfg.episodes): @@ -92,25 +98,28 @@ def main(cfg: RecordConfig): ang_tol=cfg.ang_tol_alignment, ) trajectory = generate_spiral_trajectory(pose_start, cfg.trajectory) - - print(f"Generated trajectory with {len(trajectory)} poses for episode {episode+1}",flush=True) + + print( + f"Generated trajectory with {len(trajectory)} poses for episode {episode+1}", + flush=True, + ) frames = [] for pose in trajectory: - robot.servo_to_pose( - pose, lin_tol=cfg.lin_tol_trajectory, ang_tol=cfg.ang_tol_trajectory - ) - current_pose = robot.pose() - - action_pose = np.linalg.inv(current_pose) @ pose_alignment_target - action_sixd = se3_sixd(action_pose) - - observation = robot.get_observation() - frame = { - "images": [observation["camera.rgb"].copy()], - "action": action_sixd, - "depth": [observation["camera.rgb.depth"].copy()], - } - frames.append(frame) + robot.servo_to_pose( + pose, lin_tol=cfg.lin_tol_trajectory, ang_tol=cfg.ang_tol_trajectory + ) + current_pose = robot.pose() + + action_pose = np.linalg.inv(current_pose) @ pose_alignment_target + action_sixd = se3_sixd(action_pose) + + observation = robot.get_observation() + frame = { + "images": [observation["camera.rgb"].copy()], + "action": action_sixd, + "depth": [observation["camera.rgb.depth"].copy()], + } + frames.append(frame) print(f"Episode {episode+1} completed with {len(frames)} frames.") episode_dataset = Dataset.from_list(frames, features=features) @@ -134,4 +143,4 @@ def main(cfg: RecordConfig): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/alignit/robots/xarm.py b/alignit/robots/xarm.py index 3cb722d..860e496 100644 --- a/alignit/robots/xarm.py +++ b/alignit/robots/xarm.py @@ -30,6 +30,9 @@ def _connect(self): def send_action(self, action): self.robot.send_action(action) + def get_intrinsics(self): + return self.camera.get_intrinsics() + def get_observation(self): rgb_image, depth_image, acquisition_time = self.camera.async_read() diff --git a/alignit/train.py b/alignit/train.py index 7a714a4..022cce7 100644 --- a/alignit/train.py +++ b/alignit/train.py @@ -6,6 +6,7 @@ from datasets import load_from_disk from torchvision import transforms import draccus +import numpy as np from alignit.config import TrainConfig from alignit.models.alignnet import AlignNet @@ -15,7 +16,11 @@ def collate_fn(batch): images = [item["images"] for item in batch] depth_images = [item.get("depth", None) for item in batch] actions = [item["action"] for item in batch] - return {"images": images, "depth_images": depth_images, 
"action": torch.tensor(actions, dtype=torch.float32)} + return { + "images": images, + "depth_images": depth_images, + "action": torch.tensor(actions, dtype=torch.float32), + } @draccus.wrap() @@ -31,6 +36,7 @@ def main(cfg: TrainConfig): vector_hidden_dim=cfg.model.vector_hidden_dim, output_dim=cfg.model.output_dim, feature_agg=cfg.model.feature_agg, + use_depth_input=cfg.model.use_depth_input, ).to(device) train_dataset = dataset.train_test_split( @@ -46,6 +52,7 @@ def main(cfg: TrainConfig): optimizer = Adam(net.parameters(), lr=cfg.learning_rate) criterion = MSELoss() net.train() + for epoch in range(cfg.epochs): for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"): images = batch["images"] @@ -53,40 +60,54 @@ def main(cfg: TrainConfig): actions = batch["action"].to(device) batch_rgb_tensors = [] - batch_depth_tensors = [] - rgb_transform = transforms.Compose([transforms.ToTensor()]) - depth_transform = transforms.Compose([transforms.ToTensor()]) + rgb_transform = transforms.Compose( + [ + transforms.ToTensor(), + ] + ) - for i, image_sequence in enumerate(images): - # Process RGB + for image_sequence in images: tensor_sequence_rgb = [ rgb_transform(img.convert("RGB")) for img in image_sequence ] stacked_tensors_rgb = torch.stack(tensor_sequence_rgb, dim=0) batch_rgb_tensors.append(stacked_tensors_rgb) - # Process Depth - if depth_images_pil[i] is not None: # Check if depth is actually present for this item - depth_sequence = depth_images_pil[i] - tensor_sequence_depth = [ - depth_transform(d_img.convert("L")) for d_img in depth_sequence - ] - stacked_tensors_depth = torch.stack(tensor_sequence_depth, dim=0) - batch_depth_tensors.append(stacked_tensors_depth) - else: - print(f"Warning: Depth image missing for sample {i} in batch.") - raise ValueError("Depth images expected but not found for a sample when use_depth is True.") + batch_rgb_tensors = torch.stack(batch_rgb_tensors, dim=0).to(device) + batch_depth_tensors = None + if cfg.model.use_depth_input: + batch_depth_tensors = [] + for depth_sequence in depth_images_pil: + if depth_sequence is None: + raise ValueError( + "Depth images expected but not found when use_depth_input=True" + ) - batch_rgb_tensors = torch.stack(batch_rgb_tensors, dim=0).to(device) - batch_depth_tensors = torch.stack(batch_depth_tensors, dim=0).to(device) + depth_sequence_processed = [] + for d_img in depth_sequence: + depth_array = np.array(d_img) / 1000.0 + depth_tensor = torch.from_numpy(depth_array).float() + depth_tensor = depth_tensor.unsqueeze(0) + depth_sequence_processed.append(depth_tensor) + + stacked_depth = torch.stack(depth_sequence_processed, dim=0) + batch_depth_tensors.append(stacked_depth) + + batch_depth_tensors = torch.stack(batch_depth_tensors, dim=0).to(device) optimizer.zero_grad() - outputs = net(batch_rgb_tensors, depth_images=batch_depth_tensors) + if cfg.model.use_depth_input: + outputs = net(batch_rgb_tensors, depth_images=batch_depth_tensors) + else: + outputs = net(batch_rgb_tensors) + loss = criterion(outputs, actions) loss.backward() optimizer.step() + tqdm.write(f"Loss: {loss.item():.4f}") + torch.save(net.state_dict(), cfg.model.path) tqdm.write(f"Model saved as {cfg.model.path}") @@ -95,4 +116,3 @@ def main(cfg: TrainConfig): if __name__ == "__main__": main() - diff --git a/alignit/visualize.py b/alignit/visualize.py index 562f27d..60d83b8 100644 --- a/alignit/visualize.py +++ b/alignit/visualize.py @@ -22,7 +22,11 @@ def get_data(index): gr.Interface( fn=get_data, inputs=gr.Slider(0, len(dataset) - 1, step=1, 
label="Index", interactive=True), - outputs=[gr.Image(type="pil", label="Image"), gr.Image(type="pil", label="Depth Image"), gr.Text(label="Label")], + outputs=[ + gr.Image(type="pil", label="Image"), + gr.Image(type="pil", label="Depth Image"), + gr.Text(label="Label"), + ], title="Dataset Image Viewer", live=True, ).launch(share=cfg.share, server_name=cfg.server_name, server_port=cfg.server_port) diff --git a/measuring/precision.py b/measuring/precision.py new file mode 100644 index 0000000..8e025fc --- /dev/null +++ b/measuring/precision.py @@ -0,0 +1,41 @@ +import cv2 +import numpy as np +from pupil_apriltags import Detector +from alignit.robots.xarm import Xarm + + +class AprilTagDetector: + def __init__(self, tag_size=0.08): + camera_params = robot.get_intrinsics() + self.detector = Detector(families="tag36h11") + self.tag_size = tag_size + self.camera_params = camera_params + + def detect_pose(self, image): + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + tags = self.detector.detect( + gray, + estimate_tag_pose=True, + camera_params=self.camera_params, + tag_size=self.tag_size, + ) + + if len(tags) == 0: + return None + + tag = tags[0] + pose = np.eye(4) + pose[:3, :3] = tag.pose_R + pose[:3, 3] = tag.pose_t.flatten() + return pose + + +if __name__ == "__main__": + robot = Xarm() + detector = AprilTagDetector() + + while True: + observation = robot.get_observation() + rgb_image = observation["rgb"] + pose = detector.detect_pose(rgb_image) + print("Detected pose:", pose) diff --git a/tests/test_alignnet.py b/tests/test_alignnet.py index d061ff3..94c4274 100644 --- a/tests/test_alignnet.py +++ b/tests/test_alignnet.py @@ -9,6 +9,7 @@ def test_alignnet_forward_shapes_cpu(): backbone_weights=None, use_vector_input=False, output_dim=7, + use_depth_input=False, ) model.eval() x = torch.randn(2, 3, 3, 64, 64) # B=2, N=3 views @@ -23,6 +24,7 @@ def test_alignnet_with_vector_input(): backbone_weights=None, use_vector_input=True, output_dim=7, + use_depth_input=False, ) model.eval() x = torch.randn(1, 2, 3, 64, 64) @@ -38,6 +40,7 @@ def test_alignnet_performance(): backbone_weights=None, use_vector_input=True, output_dim=7, + use_depth_input=False, ) model.eval() x = torch.randn(1, 3, 3, 224, 224) # B=1, N=3 views @@ -57,7 +60,12 @@ def test_alignnet_performance(): def test_alignnet_with_depth_input(): model = AlignNet( - backbone_name="resnet18", backbone_weights=None, use_vector_input=False, use_depth_input=True, output_dim=7) + backbone_name="resnet18", + backbone_weights=None, + use_vector_input=False, + use_depth_input=True, + output_dim=7, + ) model.eval() x = torch.randn(2, 3, 3, 64, 64) # RGB images depth = torch.randn(2, 3, 1, 64, 64) # Depth images @@ -68,11 +76,17 @@ def test_alignnet_with_depth_input(): def test_alignnet_with_all_inputs(): # New test for combined inputs - model = AlignNet(backbone_name="efficientnet_b0",backbone_weights=None,use_vector_input=True,use_depth_input=True, output_dim=7) + model = AlignNet( + backbone_name="efficientnet_b0", + backbone_weights=None, + use_vector_input=True, + use_depth_input=True, + output_dim=7, + ) model.eval() x = torch.randn(1, 3, 3, 224, 224) # RGB images depth = torch.randn(1, 3, 1, 224, 224) # Depth images vecs = [torch.randn(5)] # Vector inputs with torch.no_grad(): y = model(x, vecs, depth) - assert y.shape == (1, 7) \ No newline at end of file + assert y.shape == (1, 7) From b9e82c0b5941e4e75a0cb804ebbe4bdb855484f0 Mon Sep 17 00:00:00 2001 From: Nikola Stojic Date: Thu, 14 Aug 2025 16:22:00 +0200 Subject: [PATCH 08/13] Added 
measuring/precision to compare ideal aligned pose to real pose after infere. XY values look fine but Z value is offset, probably due to depth image handling --- alignit/config.py | 6 +-- alignit/infere.py | 15 ++++---- alignit/record.py | 42 +++++++++------------ measuring/precision.py | 85 +++++++++++++++++++++++++++++++++++++++--- 4 files changed, 106 insertions(+), 42 deletions(-) diff --git a/alignit/config.py b/alignit/config.py index 19cb384..72052ac 100644 --- a/alignit/config.py +++ b/alignit/config.py @@ -112,7 +112,7 @@ class TrainConfig: dataset: DatasetConfig = field(default_factory=DatasetConfig) model: ModelConfig = field(default_factory=ModelConfig) - batch_size: int = field(default=8, metadata={"help": "Training batch size"}) + batch_size: int = field(default=4, metadata={"help": "Training batch size"}) learning_rate: float = field( default=1e-4, metadata={"help": "Learning rate for optimizer"} ) @@ -139,10 +139,10 @@ class InferConfig: metadata={"help": "Starting pose RPY angles"}, ) lin_tolerance: float = field( - default=2e-3, metadata={"help": "Linear tolerance for convergence (meters)"} + default=5e-3, metadata={"help": "Linear tolerance for convergence (meters)"} ) ang_tolerance: float = field( - default=2, metadata={"help": "Angular tolerance for convergence (degrees)"} + default=5, metadata={"help": "Angular tolerance for convergence (degrees)"} ) max_iterations: Optional[int] = field( default=None, diff --git a/alignit/infere.py b/alignit/infere.py index 6afed07..a501844 100644 --- a/alignit/infere.py +++ b/alignit/infere.py @@ -8,6 +8,7 @@ from alignit.utils.zhou import sixd_se3 from alignit.utils.tfs import print_pose, are_tfs_close from alignit.robots.xarmsim import XarmSim +from alignit.robots.xarm import Xarm @draccus.wrap() @@ -30,7 +31,7 @@ def main(cfg: InferConfig): net.to(device) net.eval() - robot = XarmSim() + robot = Xarm() start_pose = t3d.affines.compose( [0.23, 0, 0.25], t3d.euler.euler2mat(np.pi, 0, 0), [1, 1, 1] @@ -39,11 +40,12 @@ def main(cfg: InferConfig): iteration = 0 iterations_within_tolerance = 0 ang_tol_rad = np.deg2rad(cfg.ang_tolerance) + print("beforee") try: while True: observation = robot.get_observation() - rgb_image = observation["camera.rgb"].astype(np.float32) / 255.0 - depth_image = observation["camera.rgb.depth"].astype(np.float32) / 1000.0 + rgb_image = observation["rgb"].astype(np.float32) / 255.0 + depth_image = observation["depth"].astype(np.float32) / 1000.0 rgb_image_tensor = ( torch.from_numpy(np.array(rgb_image)) .permute(2, 0, 1) # (H, W, C) -> (C, H, W) @@ -57,7 +59,6 @@ def main(cfg: InferConfig): .unsqueeze(0) # Add batch dimension: (1, 1, H, W) .to(device) ) - rgb_images_batch = rgb_image_tensor.unsqueeze(1) depth_images_batch = depth_image_tensor.unsqueeze(1) @@ -72,8 +73,6 @@ def main(cfg: InferConfig): relative_action[:3, :3] = np.linalg.matrix_power( relative_action[:3, :3], cfg.rotation_matrix_multiplier ) - - # Check convergence if are_tfs_close( relative_action, lin_tol=cfg.lin_tolerance, ang_tol=ang_tol_rad ): @@ -84,7 +83,8 @@ def main(cfg: InferConfig): if iterations_within_tolerance >= cfg.debouncing_count: print("Alignment achieved - stopping.") break - + + print(relative_action) target_pose = robot.pose() @ relative_action iteration += 1 action = { @@ -92,7 +92,6 @@ def main(cfg: InferConfig): "gripper.pos": 1.0, } robot.send_action(action) - # Check max iterations if cfg.max_iterations and iteration >= cfg.max_iterations: print(f"Reached maximum iterations ({cfg.max_iterations}) - stopping.") diff 
--git a/alignit/record.py b/alignit/record.py index 3600d16..4faccef 100644 --- a/alignit/record.py +++ b/alignit/record.py @@ -17,6 +17,7 @@ load_from_disk, concatenate_datasets, ) +import time def generate_spiral_trajectory(start_pose, cfg): @@ -89,20 +90,8 @@ def main(cfg: RecordConfig): for episode in range(cfg.episodes): pose_start, pose_alignment_target = robot.reset() - - robot.servo_to_pose(pose_alignment_target, lin_tol=0.015, ang_tol=0.015) - - robot.servo_to_pose( - pose_alignment_target, - lin_tol=cfg.lin_tol_alignment, - ang_tol=cfg.ang_tol_alignment, - ) trajectory = generate_spiral_trajectory(pose_start, cfg.trajectory) - - print( - f"Generated trajectory with {len(trajectory)} poses for episode {episode+1}", - flush=True, - ) + pose = robot.pose() frames = [] for pose in trajectory: robot.servo_to_pose( @@ -114,30 +103,33 @@ def main(cfg: RecordConfig): action_sixd = se3_sixd(action_pose) observation = robot.get_observation() + print(observation.keys()) frame = { - "images": [observation["camera.rgb"].copy()], + "images": [observation["rgb"].copy()], "action": action_sixd, - "depth": [observation["camera.rgb.depth"].copy()], + "depth": [observation["depth"].copy()], } frames.append(frame) print(f"Episode {episode+1} completed with {len(frames)} frames.") episode_dataset = Dataset.from_list(frames, features=features) - if episode == 0: - combined_dataset = episode_dataset - print("Writing do dataset to disk for the first time.") + + # 2. Load existing dataset if available + if os.path.exists(cfg.dataset.path): + existing_dataset = load_from_disk(cfg.dataset.path) + existing_dataset = existing_dataset.cast(features) + combined_dataset = concatenate_datasets([existing_dataset, episode_dataset]) else: - previous_dataset = load_from_disk(cfg.dataset.path) - previous_dataset = previous_dataset.cast(features) - combined_dataset = concatenate_datasets([previous_dataset, episode_dataset]) - del previous_dataset + combined_dataset = episode_dataset + # 3. Save to TEMPORARY location first (avoid self-overwrite) temp_path = f"{cfg.dataset.path}_temp" combined_dataset.save_to_disk(temp_path) + + # 4. 
Atomic replacement (only after successful save) if os.path.exists(cfg.dataset.path): - shutil.rmtree(cfg.dataset.path) - shutil.move(temp_path, cfg.dataset.path) - print(f"Saved dataset to {cfg.dataset.path}") + shutil.rmtree(cfg.dataset.path) # Remove old version + shutil.move(temp_path, cfg.dataset.path) # Move new version into place robot.disconnect() diff --git a/measuring/precision.py b/measuring/precision.py index 8e025fc..6d3e7a2 100644 --- a/measuring/precision.py +++ b/measuring/precision.py @@ -2,10 +2,13 @@ import numpy as np from pupil_apriltags import Detector from alignit.robots.xarm import Xarm +import os +import time +from scipy.spatial.transform import Rotation as R -class AprilTagDetector: - def __init__(self, tag_size=0.08): +class Evaluation: + def __init__(self, tag_size=0.06): camera_params = robot.get_intrinsics() self.detector = Detector(families="tag36h11") self.tag_size = tag_size @@ -21,6 +24,7 @@ def detect_pose(self, image): ) if len(tags) == 0: + print("No tags detected") return None tag = tags[0] @@ -29,13 +33,82 @@ def detect_pose(self, image): pose[:3, 3] = tag.pose_t.flatten() return pose + def pose_difference(self, T_ideal, T_current, max_pos_distance=1.0): + + try: + # Convert and validate inputs + T1 = np.array(T_ideal, dtype=np.float32) + T2 = np.array(T_current, dtype=np.float32) + + if T1.shape != (4, 4) or T2.shape != (4, 4): + raise ValueError("Input matrices must be 4x4") + + # Extract components + p1 = T1[:3, 3] * 1000 # Ideal position + p2 = T2[:3, 3] * 1000 # Current position + R1 = T1[:3, :3] # Ideal rotation + R2 = T2[:3, :3] # Current rotation + + # Position difference (Euclidean distance) + pos_distance = np.linalg.norm(p2 - p1) # Convert to millimeters + pos_diff_pct = min(pos_distance / max_pos_distance, 1.0) * 100 + + # Rotation difference (angle in radians) + rel_rot = R1.T @ R2 + angle_rad = np.arccos(np.clip((np.trace(rel_rot) - 1) / 2, -1, 1)) + rot_diff_pct = (angle_rad / np.pi) * 100 + + # Euler angle differences + rpy1 = R.from_matrix(R1).as_euler("xyz") + rpy2 = R.from_matrix(R2).as_euler("xyz") + rpy_diff = np.abs(rpy2 - rpy1) + + return { + "position_diff_milimeters": pos_distance, + "position_diff%": pos_diff_pct, + "rotation_diff_rad": angle_rad, + "rotation_diff%": rot_diff_pct, + "rpy_diff_rad": rpy_diff, + "xyz_diff_milimeters": (p2 - p1).tolist(), + "combined_diff%": 0.5 * pos_diff_pct + 0.5 * rot_diff_pct, + } + + except Exception as e: + print(f"Pose comparison failed: {str(e)}") + return None + + +def pose_in_tag_frame(tag_pose_world, robot_pose_world): + """Convert robot pose from world frame to AprilTag's frame.""" + tag_pose_inv = np.linalg.inv(tag_pose_world) + return tag_pose_inv @ robot_pose_world + if __name__ == "__main__": robot = Xarm() - detector = AprilTagDetector() - + detector = Evaluation() while True: observation = robot.get_observation() + curr_pose = robot.pose() rgb_image = observation["rgb"] - pose = detector.detect_pose(rgb_image) - print("Detected pose:", pose) + apriltag_pose = detector.detect_pose(rgb_image) + T_ideal_tag = np.array( + [ + [0.92529235, 0.37684695, 0.04266663, 0.25978898], + [0.37511194, -0.89279238, -0.24942494, 0.06284318], + [-0.05590259, 0.24679575, -0.96745375, -0.25993736], + [0.0, 0.0, 0.0, 1.0], + ], + dtype=np.float32, + ) + + T_after_inference = pose_in_tag_frame(apriltag_pose, curr_pose) + result = detector.pose_difference(T_ideal_tag, T_after_inference) + if result: + print("xyz_diff_milimeters:", result["xyz_diff_milimeters"]) + print("position_diff_milimeters:", 
result["position_diff_milimeters"]) + print("rotation_diff_rad:", result["rotation_diff_rad"]) + print("rpy_diff_rad:", result["rpy_diff_rad"]) + print("combined_diff%:", result["combined_diff%"]) + else: + print("Error: Failed to get current pose estimate") From 735a190f9a27ab47cf7f77b4d3741c1992e09ab2 Mon Sep 17 00:00:00 2001 From: Nikola Stojic Date: Thu, 14 Aug 2025 18:48:25 +0200 Subject: [PATCH 09/13] Z offset still an issue, out of ideas --- alignit/cameras/realsense.py | 6 +++--- alignit/infere.py | 13 ++++++++----- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/alignit/cameras/realsense.py b/alignit/cameras/realsense.py index ef457cf..fffd7d8 100644 --- a/alignit/cameras/realsense.py +++ b/alignit/cameras/realsense.py @@ -115,6 +115,8 @@ def __init__(self, config: RealSenseCameraConfig): self.latest_frame_acquisition_time = None self.rs_pipeline: rs.pipeline | None = None self.rs_profile: rs.pipeline_profile | None = None + self.align = rs.align(rs.stream.color) + self.thread: Thread | None = None self.stop_event: Event | None = None @@ -473,9 +475,7 @@ def _read_loop(self): depth_image = self.read_depth(timeout_ms=500) with self.frame_lock: - self.latest_frame = color_image - self.latest_frame_depth = depth_image - self.latest_frame_acquisition_time = time.time() + self.latest_frame = color_image , depth_image self.new_frame_event.set() except DeviceNotConnectedError: diff --git a/alignit/infere.py b/alignit/infere.py index a501844..6d2ae75 100644 --- a/alignit/infere.py +++ b/alignit/infere.py @@ -45,7 +45,12 @@ def main(cfg: InferConfig): while True: observation = robot.get_observation() rgb_image = observation["rgb"].astype(np.float32) / 255.0 - depth_image = observation["depth"].astype(np.float32) / 1000.0 + depth_image_unscaled= observation["depth"].astype(np.float32) + depth_image_unscaled = np.clip(depth_image_unscaled, a_min=None, a_max=1000) + + depth_image = depth_image_unscaled / 1000.0 # Scale depth to meters + print("Min/Max depth (raw):", observation["depth"].min(), observation["depth"].max()) + print("Min/Max depth (scaled):", depth_image.min(), depth_image.max()) rgb_image_tensor = ( torch.from_numpy(np.array(rgb_image)) .permute(2, 0, 1) # (H, W, C) -> (C, H, W) @@ -80,10 +85,8 @@ def main(cfg: InferConfig): else: iterations_within_tolerance = 0 - if iterations_within_tolerance >= cfg.debouncing_count: - print("Alignment achieved - stopping.") - break + print(relative_action) target_pose = robot.pose() @ relative_action iteration += 1 @@ -105,4 +108,4 @@ def main(cfg: InferConfig): if __name__ == "__main__": - main() + main() \ No newline at end of file From c88b5f2a3852166bb015d2999975dcc90335c772 Mon Sep 17 00:00:00 2001 From: Nikola Stojic Date: Thu, 14 Aug 2025 18:48:44 +0200 Subject: [PATCH 10/13] Z offset still an issue, out of ideas --- alignit/cameras/realsense.py | 3 +-- alignit/infere.py | 12 +++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/alignit/cameras/realsense.py b/alignit/cameras/realsense.py index fffd7d8..8039384 100644 --- a/alignit/cameras/realsense.py +++ b/alignit/cameras/realsense.py @@ -117,7 +117,6 @@ def __init__(self, config: RealSenseCameraConfig): self.rs_profile: rs.pipeline_profile | None = None self.align = rs.align(rs.stream.color) - self.thread: Thread | None = None self.stop_event: Event | None = None self.frame_lock: Lock = Lock() @@ -475,7 +474,7 @@ def _read_loop(self): depth_image = self.read_depth(timeout_ms=500) with self.frame_lock: - self.latest_frame = color_image , depth_image 
+ self.latest_frame = color_image, depth_image self.new_frame_event.set() except DeviceNotConnectedError: diff --git a/alignit/infere.py b/alignit/infere.py index 6d2ae75..068acb1 100644 --- a/alignit/infere.py +++ b/alignit/infere.py @@ -45,11 +45,15 @@ def main(cfg: InferConfig): while True: observation = robot.get_observation() rgb_image = observation["rgb"].astype(np.float32) / 255.0 - depth_image_unscaled= observation["depth"].astype(np.float32) + depth_image_unscaled = observation["depth"].astype(np.float32) depth_image_unscaled = np.clip(depth_image_unscaled, a_min=None, a_max=1000) depth_image = depth_image_unscaled / 1000.0 # Scale depth to meters - print("Min/Max depth (raw):", observation["depth"].min(), observation["depth"].max()) + print( + "Min/Max depth (raw):", + observation["depth"].min(), + observation["depth"].max(), + ) print("Min/Max depth (scaled):", depth_image.min(), depth_image.max()) rgb_image_tensor = ( torch.from_numpy(np.array(rgb_image)) @@ -85,8 +89,6 @@ def main(cfg: InferConfig): else: iterations_within_tolerance = 0 - - print(relative_action) target_pose = robot.pose() @ relative_action iteration += 1 @@ -108,4 +110,4 @@ def main(cfg: InferConfig): if __name__ == "__main__": - main() \ No newline at end of file + main() From 9c3c6f4f9aaf4aba953fb4dcf3c762722c671e70 Mon Sep 17 00:00:00 2001 From: Nikola Stojic Date: Fri, 15 Aug 2025 18:44:27 +0200 Subject: [PATCH 11/13] Fixed z offset issue --- alignit/config.py | 15 ++++++++++++-- alignit/infere.py | 26 +++++++++++++++++++----- alignit/robots/xarm.py | 45 +++++++++++++++++++++++++++++++----------- alignit/train.py | 11 ++++++++++- 4 files changed, 77 insertions(+), 20 deletions(-) diff --git a/alignit/config.py b/alignit/config.py index 72052ac..a52779e 100644 --- a/alignit/config.py +++ b/alignit/config.py @@ -104,6 +104,13 @@ class RecordConfig: ang_tol_trajectory: float = field( default=0.05, metadata={"help": "Angular tolerance for trajectory servo"} ) + manual_height: float = field( + default=-0.05, metadata={"help": "Height above surface for manual movement"} + ) + world_z_offset: float = field( + default=-0.02, + metadata={"help": "World frame Z offset after manual positioning"}, + ) @dataclass @@ -145,14 +152,14 @@ class InferConfig: default=5, metadata={"help": "Angular tolerance for convergence (degrees)"} ) max_iterations: Optional[int] = field( - default=None, + default=20, metadata={"help": "Maximum iterations before stopping (None = infinite)"}, ) debug_output: bool = field( default=True, metadata={"help": "Print debug information during inference"} ) debouncing_count: int = field( - default=5, + default=20, metadata={"help": "Number of iterations within tolerance before stopping"}, ) rotation_matrix_multiplier: int = field( @@ -161,6 +168,10 @@ class InferConfig: "help": "Number of times to multiply the rotation matrix of relative action in order to speed up convergence" }, ) + manual_height: float = field( + default=0.08, metadata={"help": "Height above surface for manual movement"} + ) + @dataclass diff --git a/alignit/infere.py b/alignit/infere.py index 068acb1..013c406 100644 --- a/alignit/infere.py +++ b/alignit/infere.py @@ -3,6 +3,7 @@ import time import draccus from alignit.config import InferConfig +from alignit.config import RecordConfig import torch from alignit.models.alignnet import AlignNet from alignit.utils.zhou import sixd_se3 @@ -40,7 +41,6 @@ def main(cfg: InferConfig): iteration = 0 iterations_within_tolerance = 0 ang_tol_rad = np.deg2rad(cfg.ang_tolerance) - 
print("beforee") try: while True: observation = robot.get_observation() @@ -50,11 +50,17 @@ def main(cfg: InferConfig): depth_image = depth_image_unscaled / 1000.0 # Scale depth to meters print( - "Min/Max depth (raw):", + "Min/Max depth,mean (raw):", observation["depth"].min(), observation["depth"].max(), + observation["depth"].mean(), + ) + print( + "Min/Max depth,mean (scaled):", + depth_image.min(), + depth_image.max(), + depth_image.mean(), ) - print("Min/Max depth (scaled):", depth_image.min(), depth_image.max()) rgb_image_tensor = ( torch.from_numpy(np.array(rgb_image)) .permute(2, 0, 1) # (H, W, C) -> (C, H, W) @@ -97,9 +103,19 @@ def main(cfg: InferConfig): "gripper.pos": 1.0, } robot.send_action(action) - # Check max iterations - if cfg.max_iterations and iteration >= cfg.max_iterations: + if iterations_within_tolerance >= cfg.max_iterations: print(f"Reached maximum iterations ({cfg.max_iterations}) - stopping.") + print("Moving robot to final pose.") + current_pose = robot.pose() + gripper_z_offset = np.array( + [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, cfg.manual_height], [0, 0, 0, 1]] + ) + offset_pose = current_pose @ gripper_z_offset + robot.servo_to_pose(pose=offset_pose) + robot.close_gripper() + robot.gripper_off() + + break time.sleep(10.0) diff --git a/alignit/robots/xarm.py b/alignit/robots/xarm.py index 860e496..a65f159 100644 --- a/alignit/robots/xarm.py +++ b/alignit/robots/xarm.py @@ -6,6 +6,7 @@ import transforms3d as t3d from alignit.robots.robot import Robot from alignit.utils.tfs import are_tfs_close +from alignit.config import RecordConfig class Xarm(Robot): @@ -35,6 +36,10 @@ def get_intrinsics(self): def get_observation(self): rgb_image, depth_image, acquisition_time = self.camera.async_read() + depth_array_clipped = np.clip( + np.array(depth_image), a_min=0, a_max=1000 + ) + depth_image = np.array(depth_array_clipped) / 1000.0 return { "rgb": rgb_image, @@ -48,12 +53,32 @@ def servo_to_pose(self, pose, lin_tol=1e-3, ang_tol=1e-2): while not are_tfs_close(self.pose(), pose, lin_tol, ang_tol): action = { "pose": pose, - "gripper.pos": 1.0, # Optional: set gripper state (0.0=closed, 1.0=open) + "gripper.pos": 1.0, } self.send_action(action) - time.sleep(1.0 / 60.0) # Adjust frequency as needed + time.sleep(1.0 / 60.0) + def close_gripper(self): + action = { + "pose": self.pose(), + "gripper.pos": 0.0, + } + self.send_action(action) + + def open_gripper(self): + action = { + "pose": self.pose(), + "gripper.pos": 1.0, + } + self.send_action(action) + + def gripper_off(self): + action = { + "pose": self.pose(), + "gripper.pos": 0.5, + } + self.send_action(action) - def reset(self): + def reset(self, cfg: RecordConfig): """ Reset routine: 1. 
Allows manual movement of the arm @@ -66,30 +91,26 @@ def reset(self): manual_height: Height above surface to maintain during manual movement (meters) world_z_offset: Additional Z offset in world frame after manual positioning (meters) """ - manual_height = -0.05 - world_z_offset = -0.02 self.robot.disconnect() input("Press Enter after positioning the arm...") self.robot.connect() current_pose = self.pose() gripper_z_offset = np.array( - [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, manual_height], [0, 0, 0, 1]] + [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, cfg.manual_height], [0, 0, 0, 1]] ) offset_pose = current_pose @ gripper_z_offset self.servo_to_pose(pose=offset_pose) world_z_offset_mat = np.array( - [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, world_z_offset], [0, 0, 0, 1]] + [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, cfg.world_z_offset], [0, 0, 0, 1]] ) final_pose = offset_pose @ world_z_offset_mat self.servo_to_pose(pose=final_pose) - + current_pose = self.pose() pose_start = current_pose @ t3d.affines.compose( - [0, 0, -0.090], t3d.euler.euler2mat(0, 0, 0), [1, 1, 1] - ) - pose_alignment_target = current_pose @ t3d.affines.compose( - [0, 0, -0.1], t3d.euler.euler2mat(0, 0, 0), [1, 1, 1] + [0, 0, -0.01], t3d.euler.euler2mat(0, 0, 0), [1, 1, 1] ) + pose_alignment_target = current_pose _, (position, _, _) = self.robot._arm.get_joint_states() for i in range(6): diff --git a/alignit/train.py b/alignit/train.py index 022cce7..33653c7 100644 --- a/alignit/train.py +++ b/alignit/train.py @@ -86,7 +86,10 @@ def main(cfg: TrainConfig): depth_sequence_processed = [] for d_img in depth_sequence: - depth_array = np.array(d_img) / 1000.0 + depth_array_clipped = np.clip( + np.array(d_img), a_min=0, a_max=1000 + ) + depth_array = np.array(depth_array_clipped) / 1000.0 depth_tensor = torch.from_numpy(depth_array).float() depth_tensor = depth_tensor.unsqueeze(0) depth_sequence_processed.append(depth_tensor) @@ -95,6 +98,12 @@ def main(cfg: TrainConfig): batch_depth_tensors.append(stacked_depth) batch_depth_tensors = torch.stack(batch_depth_tensors, dim=0).to(device) + print( + "Min,max,mean depth:", + batch_depth_tensors.min().item(), + batch_depth_tensors.max().item(), + batch_depth_tensors.mean().item(), + ) optimizer.zero_grad() if cfg.model.use_depth_input: From 08eef48cfcb4d535fc546c480115ad31425437ea Mon Sep 17 00:00:00 2001 From: Nikola Stojic Date: Fri, 15 Aug 2025 18:59:45 +0200 Subject: [PATCH 12/13] Depth values clipping now done in get_observation, before train and infere --- alignit/config.py | 1 - alignit/infere.py | 15 ++++++++------- alignit/robots/xarm.py | 5 ++--- alignit/train.py | 5 +---- 4 files changed, 11 insertions(+), 15 deletions(-) diff --git a/alignit/config.py b/alignit/config.py index a52779e..ed93cc1 100644 --- a/alignit/config.py +++ b/alignit/config.py @@ -171,7 +171,6 @@ class InferConfig: manual_height: float = field( default=0.08, metadata={"help": "Height above surface for manual movement"} ) - @dataclass diff --git a/alignit/infere.py b/alignit/infere.py index 013c406..9f1210b 100644 --- a/alignit/infere.py +++ b/alignit/infere.py @@ -45,10 +45,7 @@ def main(cfg: InferConfig): while True: observation = robot.get_observation() rgb_image = observation["rgb"].astype(np.float32) / 255.0 - depth_image_unscaled = observation["depth"].astype(np.float32) - depth_image_unscaled = np.clip(depth_image_unscaled, a_min=None, a_max=1000) - - depth_image = depth_image_unscaled / 1000.0 # Scale depth to meters + depth_image = observation["depth"].astype(np.float32) print( "Min/Max depth,mean 
(raw):", observation["depth"].min(), @@ -108,13 +105,17 @@ def main(cfg: InferConfig): print("Moving robot to final pose.") current_pose = robot.pose() gripper_z_offset = np.array( - [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, cfg.manual_height], [0, 0, 0, 1]] - ) + [ + [1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 1, cfg.manual_height], + [0, 0, 0, 1], + ] + ) offset_pose = current_pose @ gripper_z_offset robot.servo_to_pose(pose=offset_pose) robot.close_gripper() robot.gripper_off() - break diff --git a/alignit/robots/xarm.py b/alignit/robots/xarm.py index a65f159..6b3889a 100644 --- a/alignit/robots/xarm.py +++ b/alignit/robots/xarm.py @@ -36,9 +36,7 @@ def get_intrinsics(self): def get_observation(self): rgb_image, depth_image, acquisition_time = self.camera.async_read() - depth_array_clipped = np.clip( - np.array(depth_image), a_min=0, a_max=1000 - ) + depth_array_clipped = np.clip(np.array(depth_image), a_min=0, a_max=1000) depth_image = np.array(depth_array_clipped) / 1000.0 return { @@ -57,6 +55,7 @@ def servo_to_pose(self, pose, lin_tol=1e-3, ang_tol=1e-2): } self.send_action(action) time.sleep(1.0 / 60.0) + def close_gripper(self): action = { "pose": self.pose(), diff --git a/alignit/train.py b/alignit/train.py index 33653c7..4a59f1c 100644 --- a/alignit/train.py +++ b/alignit/train.py @@ -86,10 +86,7 @@ def main(cfg: TrainConfig): depth_sequence_processed = [] for d_img in depth_sequence: - depth_array_clipped = np.clip( - np.array(d_img), a_min=0, a_max=1000 - ) - depth_array = np.array(depth_array_clipped) / 1000.0 + depth_array = np.array(d_img) / 1000.0 depth_tensor = torch.from_numpy(depth_array).float() depth_tensor = depth_tensor.unsqueeze(0) depth_sequence_processed.append(depth_tensor) From ce60a3192671f2f4b9839e91c54f0da274af08ac Mon Sep 17 00:00:00 2001 From: Darko Lukic Date: Sat, 16 Aug 2025 22:34:08 +0200 Subject: [PATCH 13/13] cleanup --- alignit/config.py | 3 +- alignit/infere.py | 9 +- alignit/record.py | 15 +- alignit/robots/xarm.py | 6 +- alignit/robots/xarmsim/__init__.py | 10 +- .../ufactory_lite6/lite6_gripper_narrow.xml | 2 +- .../ufactory_lite6/lite6_gripper_wide.xml | 152 ------------------ alignit/train.py | 8 +- .../apriltag_benchmark.py | 9 +- 9 files changed, 31 insertions(+), 183 deletions(-) delete mode 100644 alignit/robots/xarmsim/ufactory_lite6/lite6_gripper_wide.xml rename measuring/precision.py => benchmark/apriltag_benchmark.py (98%) diff --git a/alignit/config.py b/alignit/config.py index ed93cc1..15b8ff2 100644 --- a/alignit/config.py +++ b/alignit/config.py @@ -2,6 +2,7 @@ from dataclasses import dataclass, field from typing import Optional, List + import numpy as np @@ -10,7 +11,7 @@ class DatasetConfig: """Configuration for dataset paths and loading.""" path: str = field( - default="./data/duck", metadata={"help": "Path to the dataset directory"} + default="./data/default", metadata={"help": "Path to the dataset directory"} ) diff --git a/alignit/infere.py b/alignit/infere.py index 9f1210b..e533162 100644 --- a/alignit/infere.py +++ b/alignit/infere.py @@ -1,10 +1,11 @@ +import time + +import torch import transforms3d as t3d import numpy as np -import time import draccus + from alignit.config import InferConfig -from alignit.config import RecordConfig -import torch from alignit.models.alignnet import AlignNet from alignit.utils.zhou import sixd_se3 from alignit.utils.tfs import print_pose, are_tfs_close @@ -32,7 +33,7 @@ def main(cfg: InferConfig): net.to(device) net.eval() - robot = Xarm() + robot = XarmSim() start_pose = t3d.affines.compose( 
[0.23, 0, 0.25], t3d.euler.euler2mat(np.pi, 0, 0), [1, 1, 1] diff --git a/alignit/record.py b/alignit/record.py index 4faccef..00e897b 100644 --- a/alignit/record.py +++ b/alignit/record.py @@ -1,13 +1,9 @@ import os import shutil + import transforms3d as t3d import numpy as np from scipy.spatial.transform import Rotation as R -from alignit.robots.xarm import Xarm -from alignit.utils.zhou import se3_sixd -import draccus -from alignit.config import RecordConfig -from alignit.robots.xarmsim import XarmSim from datasets import ( Dataset, Features, @@ -17,7 +13,12 @@ load_from_disk, concatenate_datasets, ) -import time + +from alignit.robots.xarmsim import XarmSim +from alignit.robots.xarm import Xarm +from alignit.utils.zhou import se3_sixd +import draccus +from alignit.config import RecordConfig def generate_spiral_trajectory(start_pose, cfg): @@ -79,7 +80,7 @@ def generate_spiral_trajectory(start_pose, cfg): @draccus.wrap() def main(cfg: RecordConfig): """Record alignment dataset using configuration parameters.""" - robot = Xarm() + robot = XarmSim() features = Features( { "images": Sequence(Image()), diff --git a/alignit/robots/xarm.py b/alignit/robots/xarm.py index 6b3889a..39c69fd 100644 --- a/alignit/robots/xarm.py +++ b/alignit/robots/xarm.py @@ -1,9 +1,11 @@ import time + +import numpy as np +import transforms3d as t3d from lerobot.cameras.realsense import RealSenseCamera, RealSenseCameraConfig from lerobot_xarm.xarm import Xarm as LeXarm from lerobot_xarm.config import XarmConfig -import numpy as np -import transforms3d as t3d + from alignit.robots.robot import Robot from alignit.utils.tfs import are_tfs_close from alignit.config import RecordConfig diff --git a/alignit/robots/xarmsim/__init__.py b/alignit/robots/xarmsim/__init__.py index f3d6930..fbad295 100644 --- a/alignit/robots/xarmsim/__init__.py +++ b/alignit/robots/xarmsim/__init__.py @@ -125,10 +125,10 @@ def _set_object_pose(self, object_name, pose_matrix): self.data.qvel[qvel_adr : qvel_adr + 6] = 0 mj.mj_forward(self.model, self.data) - def _gripper_close(self): + def close_gripper(self): self._set_gripper_position(self.gripper_close_pos) - def _gripper_open(self): + def open_gripper(self): self._set_gripper_position(self.gripper_open_pos) def _set_gripper_position(self, pos, tolerance=1e-3, max_sim_steps=2000): @@ -195,8 +195,10 @@ def get_observation(self): self.renderer.update_scene(self.data, camera=name) image_depth = self.renderer.render() self.renderer.disable_depth_rendering() - obs["camera." + name] = image[:, :, ::-1] - obs["camera." 
+ name + ".depth"] = image_depth + + # TODO: Handle multiple cameras + obs["rgb"] = image[:, :, ::-1] + obs["depth"] = image_depth return obs diff --git a/alignit/robots/xarmsim/ufactory_lite6/lite6_gripper_narrow.xml b/alignit/robots/xarmsim/ufactory_lite6/lite6_gripper_narrow.xml index 2281230..0e37df0 100644 --- a/alignit/robots/xarmsim/ufactory_lite6/lite6_gripper_narrow.xml +++ b/alignit/robots/xarmsim/ufactory_lite6/lite6_gripper_narrow.xml @@ -104,7 +104,7 @@ diaginertia="0.00016117 0.000118 0.00014455" /> - + diff --git a/alignit/robots/xarmsim/ufactory_lite6/lite6_gripper_wide.xml b/alignit/robots/xarmsim/ufactory_lite6/lite6_gripper_wide.xml deleted file mode 100644 index d25e408..0000000 --- a/alignit/robots/xarmsim/ufactory_lite6/lite6_gripper_wide.xml +++ /dev/null @@ -1,152 +0,0 @@ - - - - diff --git a/alignit/train.py b/alignit/train.py index 4a59f1c..37b4204 100644 --- a/alignit/train.py +++ b/alignit/train.py @@ -86,7 +86,7 @@ def main(cfg: TrainConfig): depth_sequence_processed = [] for d_img in depth_sequence: - depth_array = np.array(d_img) / 1000.0 + depth_array = np.array(d_img) depth_tensor = torch.from_numpy(depth_array).float() depth_tensor = depth_tensor.unsqueeze(0) depth_sequence_processed.append(depth_tensor) @@ -95,12 +95,6 @@ def main(cfg: TrainConfig): batch_depth_tensors.append(stacked_depth) batch_depth_tensors = torch.stack(batch_depth_tensors, dim=0).to(device) - print( - "Min,max,mean depth:", - batch_depth_tensors.min().item(), - batch_depth_tensors.max().item(), - batch_depth_tensors.mean().item(), - ) optimizer.zero_grad() if cfg.model.use_depth_input: diff --git a/measuring/precision.py b/benchmark/apriltag_benchmark.py similarity index 98% rename from measuring/precision.py rename to benchmark/apriltag_benchmark.py index 6d3e7a2..9010310 100644 --- a/measuring/precision.py +++ b/benchmark/apriltag_benchmark.py @@ -1,13 +1,12 @@ import cv2 import numpy as np from pupil_apriltags import Detector -from alignit.robots.xarm import Xarm -import os -import time from scipy.spatial.transform import Rotation as R +from alignit.robots.xarm import Xarm + -class Evaluation: +class AprilTagBenchmark: def __init__(self, tag_size=0.06): camera_params = robot.get_intrinsics() self.detector = Detector(families="tag36h11") @@ -86,7 +85,7 @@ def pose_in_tag_frame(tag_pose_world, robot_pose_world): if __name__ == "__main__": robot = Xarm() - detector = Evaluation() + detector = AprilTagBenchmark() while True: observation = robot.get_observation() curr_pose = robot.pose()