@@ -5,152 +5,31 @@
 
 from __future__ import annotations
 
-import os
-import time
-import torch
-from collections import deque
 from tensordict import TensorDict
 
-import rsl_rl
 from rsl_rl.algorithms import Distillation
-from rsl_rl.env import VecEnv
 from rsl_rl.modules import StudentTeacher, StudentTeacherRecurrent
 from rsl_rl.runners import OnPolicyRunner
 from rsl_rl.storage import RolloutStorage
-from rsl_rl.utils import resolve_obs_groups, store_code_state
 
 
 class DistillationRunner(OnPolicyRunner):
-    """On-policy runner for training and evaluation of teacher-student training."""
-
-    def __init__(self, env: VecEnv, train_cfg: dict, log_dir: str | None = None, device: str = "cpu") -> None:
-        self.cfg = train_cfg
-        self.alg_cfg = train_cfg["algorithm"]
-        self.policy_cfg = train_cfg["policy"]
-        self.device = device
-        self.env = env
-
-        # Check if multi-GPU is enabled
-        self._configure_multi_gpu()
-
-        # Store training configuration
-        self.num_steps_per_env = self.cfg["num_steps_per_env"]
-        self.save_interval = self.cfg["save_interval"]
-
-        # Query observations from environment for algorithm construction
-        obs = self.env.get_observations()
-        self.cfg["obs_groups"] = resolve_obs_groups(obs, self.cfg["obs_groups"], default_sets=["teacher"])
-
-        # Create the algorithm
-        self.alg = self._construct_algorithm(obs)
-
-        # Decide whether to disable logging
-        # Note: We only log from the process with rank 0 (main process)
-        self.disable_logs = self.is_distributed and self.gpu_global_rank != 0
-
-        # Logging
-        self.log_dir = log_dir
-        self.writer = None
-        self.tot_timesteps = 0
-        self.tot_time = 0
-        self.current_learning_iteration = 0
-        self.git_status_repos = [rsl_rl.__file__]
+    """Distillation runner for training and evaluation of teacher-student methods."""
 
     def learn(self, num_learning_iterations: int, init_at_random_ep_len: bool = False) -> None:
-        # Initialize writer
-        self._prepare_logging_writer()
-
         # Check if teacher is loaded
         if not self.alg.policy.loaded_teacher:
             raise ValueError("Teacher model parameters not loaded. Please load a teacher model to distill.")
 
-        # Randomize initial episode lengths (for exploration)
-        if init_at_random_ep_len:
-            self.env.episode_length_buf = torch.randint_like(
-                self.env.episode_length_buf, high=int(self.env.max_episode_length)
-            )
-
-        # Start learning
-        obs = self.env.get_observations().to(self.device)
-        self.train_mode()  # switch to train mode (for dropout for example)
+        super().learn(num_learning_iterations, init_at_random_ep_len)
 
-        # Book keeping
-        ep_infos = []
-        rewbuffer = deque(maxlen=100)
-        lenbuffer = deque(maxlen=100)
-        cur_reward_sum = torch.zeros(self.env.num_envs, dtype=torch.float, device=self.device)
-        cur_episode_length = torch.zeros(self.env.num_envs, dtype=torch.float, device=self.device)
+    def _get_default_obs_sets(self) -> list[str]:
+        """Get the default observation sets required for the algorithm.
 
-        # Ensure all parameters are in-synced
-        if self.is_distributed:
-            print(f"Synchronizing parameters for rank {self.gpu_global_rank}...")
-            self.alg.broadcast_parameters()
-
-        # Start training
-        start_iter = self.current_learning_iteration
-        tot_iter = start_iter + num_learning_iterations
-        for it in range(start_iter, tot_iter):
-            start = time.time()
-            # Rollout
-            with torch.inference_mode():
-                for _ in range(self.num_steps_per_env):
-                    # Sample actions
-                    actions = self.alg.act(obs)
-                    # Step the environment
-                    obs, rewards, dones, extras = self.env.step(actions.to(self.env.device))
-                    # Move to device
-                    obs, rewards, dones = (obs.to(self.device), rewards.to(self.device), dones.to(self.device))
-                    # Process the step
-                    self.alg.process_env_step(obs, rewards, dones, extras)
-                    # Book keeping
-                    if self.log_dir is not None:
-                        if "episode" in extras:
-                            ep_infos.append(extras["episode"])
-                        elif "log" in extras:
-                            ep_infos.append(extras["log"])
-                        # Update rewards
-                        cur_reward_sum += rewards
-                        # Update episode length
-                        cur_episode_length += 1
-                        # Clear data for completed episodes
-                        new_ids = (dones > 0).nonzero(as_tuple=False)
-                        rewbuffer.extend(cur_reward_sum[new_ids][:, 0].cpu().numpy().tolist())
-                        lenbuffer.extend(cur_episode_length[new_ids][:, 0].cpu().numpy().tolist())
-                        cur_reward_sum[new_ids] = 0
-                        cur_episode_length[new_ids] = 0
-
-            stop = time.time()
-            collection_time = stop - start
-            start = stop
-
-            # Update policy
-            loss_dict = self.alg.update()
-
-            stop = time.time()
-            learn_time = stop - start
-            self.current_learning_iteration = it
-
-            if self.log_dir is not None and not self.disable_logs:
-                # Log information
-                self.log(locals())
-                # Save model
-                if it % self.save_interval == 0:
-                    self.save(os.path.join(self.log_dir, f"model_{it}.pt"))
-
-            # Clear episode infos
-            ep_infos.clear()
-            # Save code state
-            if it == start_iter and not self.disable_logs:
-                # Obtain all the diff files
-                git_file_paths = store_code_state(self.log_dir, self.git_status_repos)
-                # If possible store them to wandb or neptune
-                if self.logger_type in ["wandb", "neptune"] and git_file_paths:
-                    for path in git_file_paths:
-                        self.writer.save_file(path)
-
-        # Save the final model after training
-        if self.log_dir is not None and not self.disable_logs:
-            self.save(os.path.join(self.log_dir, f"model_{self.current_learning_iteration}.pt"))
+        .. note::
+            See :func:`resolve_obs_groups` for more details on the handling of observation sets.
+        """
+        return ["teacher"]
 
     def _construct_algorithm(self, obs: TensorDict) -> Distillation:
         """Construct the distillation algorithm."""
@@ -162,7 +41,7 @@ def _construct_algorithm(self, obs: TensorDict) -> Distillation:
 
         # Initialize the storage
         storage = RolloutStorage(
-            "distillation", self.env.num_envs, self.num_steps_per_env, obs, [self.env.num_actions], self.device
+            "distillation", self.env.num_envs, self.cfg["num_steps_per_env"], obs, [self.env.num_actions], self.device
         )
 
         # Initialize the algorithm
@@ -171,4 +50,7 @@ def _construct_algorithm(self, obs: TensorDict) -> Distillation:
             student_teacher, storage, device=self.device, **self.alg_cfg, multi_gpu_cfg=self.multi_gpu_cfg
         )
 
+        # Set RND configuration to None as it does not apply to distillation
+        self.cfg["algorithm"]["rnd_cfg"] = None
+
         return alg
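
The diff follows a template-method split: the base OnPolicyRunner keeps construction, the rollout/update loop, logging, and checkpointing, while the distillation subclass only overrides small hooks (_get_default_obs_sets, _construct_algorithm) and adds a teacher-loaded precondition before deferring to super().learn(). Below is a minimal, self-contained sketch of that pattern; the class names and bodies are illustrative placeholders, not rsl_rl code.

# Illustrative sketch of the hook/override pattern used in the refactor above.
# BaseRunner/DistillRunner and their bodies are placeholders, not rsl_rl classes.
class BaseRunner:
    def __init__(self, cfg: dict) -> None:
        self.cfg = cfg
        # The base class asks a hook for the default observation sets,
        # so subclasses customize behavior without redefining __init__.
        self.obs_sets = cfg.get("obs_groups") or self._get_default_obs_sets()
        self.alg = self._construct_algorithm()

    def learn(self, num_iterations: int) -> None:
        # The shared rollout/update/logging loop lives only in the base class.
        for _ in range(num_iterations):
            pass

    def _get_default_obs_sets(self) -> list[str]:
        return ["policy"]

    def _construct_algorithm(self) -> str:
        return "on-policy algorithm"


class DistillRunner(BaseRunner):
    def learn(self, num_iterations: int) -> None:
        # The subclass adds only its precondition, then defers to the shared loop.
        if not self.cfg.get("teacher_loaded", False):
            raise ValueError("Teacher model parameters not loaded.")
        super().learn(num_iterations)

    def _get_default_obs_sets(self) -> list[str]:
        return ["teacher"]

    def _construct_algorithm(self) -> str:
        return "distillation algorithm"


runner = DistillRunner({"teacher_loaded": True})
runner.learn(num_iterations=3)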