
Commit fae6e7b

HW5c fix: Problem 1 eval deterministically on all tasks
1 parent: d2dedd1

2 files changed (+18, -6 lines)

hw5/meta/point_mass_observed.py (9 additions, 3 deletions)

@@ -18,17 +18,23 @@ class ObservedPointEnv(Env):
     # YOUR CODE SOMEWHERE HERE
     def __init__(self, num_tasks=1):
         self.tasks = [0, 1, 2, 3][:num_tasks]
+        self.task_idx = -1
         self.reset_task()
         self.reset()

         self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(2,))
         self.action_space = spaces.Box(low=-0.1, high=0.1, shape=(2,))

     def reset_task(self, is_evaluation=False):
-        idx = np.random.choice(len(self.tasks))
-        self._task = self.tasks[idx]
+        # for evaluation, cycle deterministically through all tasks
+        if is_evaluation:
+            self.task_idx = (self.task_idx + 1) % len(self.tasks)
+        # during training, sample tasks randomly
+        else:
+            self.task_idx = np.random.randint(len(self.tasks))
+        self._task = self.tasks[self.task_idx]
         goals = [[-1, -1], [-1, 1], [1, -1], [1, 1]]
-        self._goal = np.array(goals[idx])*10
+        self._goal = np.array(goals[self.task_idx])*10

     def reset(self):
         self._state = np.array([0, 0], dtype=np.float32)
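For intuition, here is a minimal, self-contained sketch of the behavior this hunk introduces. TaskSelector is a hypothetical stand-in for ObservedPointEnv, stripped down to the task-selection logic (the real env also defines spaces, step, and reward): with is_evaluation=True, successive reset_task calls now cycle through every goal in a fixed order instead of sampling one at random.

import numpy as np

# Hypothetical reduction of ObservedPointEnv to its task-selection logic.
class TaskSelector:
    def __init__(self, num_tasks=4):
        self.tasks = [0, 1, 2, 3][:num_tasks]
        self.task_idx = -1  # so the first evaluation call lands on task 0

    def reset_task(self, is_evaluation=False):
        if is_evaluation:
            # evaluation: deterministic round-robin over all tasks
            self.task_idx = (self.task_idx + 1) % len(self.tasks)
        else:
            # training: sample a task uniformly at random
            self.task_idx = np.random.randint(len(self.tasks))
        return self.tasks[self.task_idx]

sel = TaskSelector()
print([sel.reset_task(is_evaluation=True) for _ in range(6)])
# -> [0, 1, 2, 3, 0, 1]: every task is visited before any repeats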

hw5/meta/train_policy.py (9 additions, 3 deletions)

@@ -689,8 +689,11 @@ def unpack_sample(data):

         # sample trajectories to fill agent's replay buffer
         print("********** Iteration %i ************"%itr)
-        stats, timesteps_this_batch = agent.sample_trajectories(itr, env, min_timesteps_per_batch)
-        total_timesteps += timesteps_this_batch
+        stats = []
+        for _ in range(num_tasks):
+            s, timesteps_this_batch = agent.sample_trajectories(itr, env, min_timesteps_per_batch)
+            total_timesteps += timesteps_this_batch
+            stats += s

         # compute the log probs, advantages, and returns for all data in agent's buffer
         # store in ppo buffer for use in multiple ppo updates

@@ -720,7 +723,10 @@ def unpack_sample(data):

         # compute validation statistics
         print('Validating...')
-        val_stats, timesteps_this_batch = agent.sample_trajectories(itr, env, min_timesteps_per_batch // 10, is_evaluation=True)
+        val_stats = []
+        for _ in range(num_tasks):
+            vs, timesteps_this_batch = agent.sample_trajectories(itr, env, min_timesteps_per_batch // 10, is_evaluation=True)
+            val_stats += vs

         # save trajectories for viz
         with open("output/{}-epoch{}.pkl".format(exp_name, itr), 'wb') as f:
