@@ -4,6 +4,8 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
+from src.utils.experience_buffer import unpack_batch
+
 Env = TypeVar("Env")
 Optimizer = TypeVar("Optimizer")
 LossFunction = TypeVar("LossFunction")
@@ -53,6 +55,8 @@ def __init__(self):
         self.n_iterations_per_episode: int = 100
         self.optimizer: Optimizer = None
         self.loss_function: LossFunction = None
+        self.batch_size: int = 0
+        self.device: str = 'cpu'
 
 
 class A2C(Generic[Optimizer]):
@@ -63,10 +67,13 @@ def __init__(self, config: A2CConfig, a2c_net: A2CNet):
         self.tau = config.tau
         self.n_workers = config.n_workers
         self.n_iterations_per_episode = config.n_iterations_per_episode
+        self.batch_size = config.batch_size
         self.optimizer = config.optimizer
+        self.device = config.device
         self.loss_function = config.loss_function
         self.a2c_net = a2c_net
         self.rewards = []
+        self.memory = []
         self.name = "A2C"
 
     def _optimize_model(self):
@@ -81,7 +88,17 @@ def select_action(self, env: Env, observation: State) -> Action:
         """
         return env.sample_action()
 
-    def update(self):
+    def update_policy_network(self):
+        """
+        Update the policy network
+        :return:
+        """
+        pass
+
+    def calculate_loss(self):
+        pass
+
+    def accummulate_batch(self):
         pass
 
     def train(self, env: Env) -> None:
@@ -92,6 +109,9 @@ def train(self, env: Env) -> None:
 
         observation = time_step.observation
 
+        # the batch to process
+        batch = []
+
         # learn over the episode
         for iteration in range(1, self.n_iterations_per_episode + 1):
 
@@ -102,11 +122,27 @@ def train(self, env: Env) -> None:
             # to the selected action
             next_time_step = env.step(action=action)
 
+            batch.append(next_time_step.observation)
+
+            if len(batch) < self.batch_size:
+                continue
+
+            # unpack the batch in order to process it
+            states_v, actions_t, vals_ref = unpack_batch(batch=batch, net=self.a2c_net, device=self.device)
+            batch.clear()
+
+            self.optimizer.zero_grad()
             # we reached the end of the episode
-            if next_time_step.last():
-                break
+            #if next_time_step.last():
+            #    break
+
+            #next_state = next_time_step.observation
+            policy_val, v_val = self.a2c_net.forward(x=states_v)
+
+            self.optimizer.zero_grad()
 
-            next_state = next_time_step.observation
-            policy_val, v_val = self.a2c_net.forward(x=next_state)
-            self._optimize_model()
+            # calculate loss
+            loss = self.calculate_loss()
+            loss.backward()
+            self.optimizer.step()
 
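The calculate_loss stub introduced above is still a pass, so the loss.backward() call at the end of the training hunk has nothing to differentiate yet. Below is a minimal sketch of a standard advantage actor-critic loss that such a method could compute; it assumes the network's forward pass returns (policy logits, state values), as the call to self.a2c_net.forward suggests, and that unpack_batch yields states_v, actions_t and bootstrapped value targets vals_ref. The helper name a2c_loss and the coefficient values are illustrative only, not part of this repository.

import torch
import torch.nn.functional as F

def a2c_loss(policy_logits: torch.Tensor, state_values: torch.Tensor,
             actions_t: torch.Tensor, vals_ref: torch.Tensor,
             value_loss_coeff: float = 0.5, entropy_coeff: float = 0.01) -> torch.Tensor:
    # critic term: mean squared error between predicted values and the bootstrapped targets
    value_loss = F.mse_loss(state_values.squeeze(-1), vals_ref)

    # actor term: log-probability of the actions taken, weighted by the advantage
    log_probs = F.log_softmax(policy_logits, dim=1)
    advantage = vals_ref - state_values.squeeze(-1).detach()
    log_prob_actions = log_probs[torch.arange(len(actions_t)), actions_t]
    policy_loss = -(advantage * log_prob_actions).mean()

    # entropy bonus to discourage premature collapse of the policy
    probs = F.softmax(policy_logits, dim=1)
    entropy = -(probs * log_probs).sum(dim=1).mean()

    return policy_loss + value_loss_coeff * value_loss - entropy_coeff * entropy

With a helper along these lines, calculate_loss could simply apply it to the tensors produced by unpack_batch before the loss.backward() and self.optimizer.step() calls shown in the hunk above.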