#53 Update API and finalize the example

pockerman · pockerman · commit 3f5a77fa9834 · 2022-02-22T14:50:33.000Z
diff --git a/src/algorithms/sarsa_semi_gradient.py b/src/algorithms/sarsa_semi_gradient.py
@@ -5,9 +5,11 @@
 """
 import numpy as np
 from typing import TypeVar
+from dataclasses import dataclass
 
 from src.utils.mixins import WithMaxActionMixin, WithQTableMixinBase
 from src.utils.episode_info import EpisodeInfo
+
 from src.algorithms.q_estimator import QEstimator
 from src.exceptions.exceptions import InvalidParamValue
 
@@ -16,28 +18,28 @@
 Policy = TypeVar('Policy')
 Estimator = TypeVar('Estimator')
 
-
+@dataclass(init=True, repr=True)
 class SARSAnConfig:
+    """Configuration class for the n-step semi-gradient SARSA algorithm
 
-    def __init__(self) -> None:
-        self.gamma: float = 1.0
-        self.alpha = 0.1
-        self.n = 10
-        self.n_itrs_per_episode: int = 100
-        self.max_size: int = 4096
-        self.use_trace: bool = False
-        self.policy: Policy = None
-        self.estimator: Estimator = None
-        self.reset_estimator_z_traces: bool = False
+    """
+    gamma: float = 1.0
+    alpha: float = 0.1
+    n: int = 10
+    n_itrs_per_episode: int = 100
+    max_size: int = 4096
+    use_trace: bool = False
+    policy: Policy = None
+    estimator: Estimator = None
+    reset_estimator_z_traces: bool = False
 
 
 class SARSAn(WithMaxActionMixin):
-    """
-    Implementation ofn-step  semi-gradient SARSA algorithm
+    """Implementation of the n-step semi-gradient SARSA algorithm
     """
 
     def __init__(self, sarsa_config: SARSAnConfig):
-
+        super(SARSAn, self).__init__()
         self.name = "SARSAn"
         self.config = sarsa_config
         self.q_table = {}
@@ -66,6 +68,9 @@ def actions_before_episode_begins(self, **options) -> None:
         # reset the estimator
         self.config.estimator.reset(self.config.reset_estimator_z_traces)
 
+    def actions_after_episode_ends(self, **options) -> None:
+        pass
+
     def on_episode(self, env: Env) -> EpisodeInfo:
         """
         Train the agent on the given algorithm
@@ -95,6 +100,7 @@ def on_episode(self, env: Env) -> EpisodeInfo:
                 # take action A, observe R, S'
                 next_time_step = env.step(action)
                 next_state = next_time_step.observation
+                states.append(next_state)
                 reward = next_time_step.reward
 
                 total_distortion += next_time_step.info["total_distortion"]
@@ -107,7 +113,7 @@ def on_episode(self, env: Env) -> EpisodeInfo:
 
                     next_action_idx = self.config.policy(self.q_table, next_state)
                     next_action = env.get_action(next_action_idx)
-                    actions.append(next_action)
+                    actions.append(next_action_idx)
 
             # should we update
             update_time = itr + 1 - self.config.n
@@ -122,7 +128,14 @@ def on_episode(self, env: Env) -> EpisodeInfo:
                     q_values_next = self.config.estimator.predict(states[update_time + self.config.n])
                     target += q_values_next[actions[update_time + self.config.n]]
 
-                # Update step
+                # Update step. Guard against an update_time that runs past
+                # the recorded states or actions before indexing them
+
+                if update_time >= len(states) or update_time >= len(actions):
+                    raise InvalidParamValue(param_name="update_time", param_value=str(update_time))
+
+                # update the estimator for the (state, action) pair at
+                # update_time using the computed target
                 self.config.estimator.update(states[update_time], actions[update_time], target)
 
             if update_time == T - 1:
@@ -135,7 +148,7 @@ def on_episode(self, env: Env) -> EpisodeInfo:
         episode_info = EpisodeInfo()
         episode_info.episode_score = episode_score
         episode_info.total_distortion = total_distortion
-        episode_info.info["m_iterations"] = counter
+        episode_info.info["n_iterations"] = counter
         return episode_info
 
     def _validate(self, env: Env) -> None:
diff --git a/src/algorithms/trainer.py b/src/algorithms/trainer.py
@@ -4,8 +4,10 @@
 
 import numpy as np
 from typing import TypeVar
+
 from src.utils import INFO
 from src.utils.function_wraps import time_func
+from src.utils.episode_info import EpisodeInfo
 
 Env = TypeVar("Env")
 Agent = TypeVar("Agent")
@@ -83,16 +85,18 @@ def train(self):
 
             self.actions_before_episode_begins(**{"env": self.env})
             # train for a number of iterations
-            episode_score, total_distortion, n_itrs = self.agent.on_episode(self.env)
+            # on_episode returns an EpisodeInfo instead of a (score, distortion, n_itrs) tuple
+            episode_info: EpisodeInfo = self.agent.on_episode(self.env)
 
-            print("{0} Episode score={1}, episode total distortion {2}".format(INFO, episode_score, total_distortion / n_itrs))
+            print("{0} Episode score={1}, episode total avg distortion {2}".format(INFO, episode_info.episode_score,
+                                                                               episode_info.total_distortion / episode_info.info["n_iterations"]))
 
             #if episode % self.configuration['output_msg_frequency'] == 0:
-            print("{0} Episode finished after {1} iterations".format(INFO, n_itrs))
+            print("{0} Episode finished after {1} iterations".format(INFO, episode_info.info["n_iterations"]))
 
-            self.iterations_per_episode.append(n_itrs)
-            self.total_rewards[episode] = episode_score
-            self.total_distortions.append(total_distortion)
+            self.iterations_per_episode.append(episode_info.info["n_iterations"])
+            self.total_rewards[episode] = episode_info.episode_score
+            self.total_distortions.append(episode_info.total_distortion)
             self.actions_after_episode_ends(**{"episode_idx": episode})
 
         print("{0} Training finished for agent {1}".format(INFO, self.agent.name))
diff --git a/src/examples/nstep_semi_grad_sarsa_three_columns.py b/src/examples/nstep_semi_grad_sarsa_three_columns.py
@@ -169,7 +169,9 @@ def load_dataset() -> MockSubjectsLoader:
     env = DiscreteStateEnvironment(env_config=env_config)
 
     tiled_env_config = TiledEnvConfig(env=env, num_tilings=NUM_TILINGS, max_size=MAX_SIZE, tiling_dim=TILING_DIM,
-                                      column_scales={"ethnicity": [0.0, 1.0], "salary": [0.0, 1.0]})
+                                      column_ranges={"ethnicity": [0.0, 1.0],
+                                                     "salary": [0.0, 1.0],
+                                                     "diagnosis": [0.0, 5.0]})
     # we will use a tiled environment in this example
     tiled_env = TiledEnv(config=tiled_env_config)
     tiled_env.reset()
diff --git a/src/extern/tile_coding.py b/src/extern/tile_coding.py
@@ -77,7 +77,8 @@ def hashcoords(coordinates, m, readonly=False):
 
 
 def tiles(ihtORsize, numtilings, floats, ints=[], readonly=False):
-    """returns num-tilings tile indices corresponding to the floats and ints"""
+    """Returns num-tilings tile indices corresponding to the floats and ints
+    """
     qfloats = [floor(f * numtilings) for f in floats]
     Tiles = []
     for tiling in range(numtilings):
diff --git a/src/spaces/discrete_state_environment.py b/src/spaces/discrete_state_environment.py
@@ -84,6 +84,10 @@ def n_states(self) -> int:
     def column_names(self) -> list:
         return self.config.data_set.get_columns_names()
 
+    @property
+    def column_distortions(self) -> dict:
+        return self.column_distances
+
     def get_action(self, aidx: int) -> ActionBase:
         return self.config.action_space[aidx]
 
diff --git a/src/spaces/state.py b/src/spaces/state.py
@@ -61,12 +61,27 @@ def __init__(self):
         self.idx = -1
         self.bin_idx = -1
         self.total_distortion: float = 0.0
-        self.column_names = []
+        self.column_distortions = {}
 
     def __contains__(self, item) -> bool:
-        return item in self.column_names
+        return item in self.column_distortions.keys()
 
     def __iter__(self):
-        return StateIterator(self.column_names)
+        return StateIterator(list(self.column_distortions.keys()))
+
+    def __getitem__(self, name: str) -> float:
+        """
+        Get the distortion of the column with the given name
+
+        Parameters
+        ----------
+        name: The name of the column
+
+        Returns
+        -------
+
+        The column distortion
+        """
+        return self.column_distortions[name]
 
 
diff --git a/src/spaces/tiled_environment.py b/src/spaces/tiled_environment.py
@@ -3,7 +3,7 @@
 """
 
 import copy
-from typing import TypeVar
+from typing import TypeVar, List
 from dataclasses import dataclass
 from src.extern.tile_coding import IHT, tiles
 from src.spaces.actions import ActionBase, ActionType
@@ -13,6 +13,8 @@
 from src.spaces.time_step import copy_time_step
 
 Env = TypeVar('Env')
+Tile = TypeVar('Tile')
+Config = TypeVar('Config')
 
 
 @dataclass(init=True, repr=True)
@@ -24,10 +26,14 @@ class TiledEnvConfig(object):
     num_tilings: int = 0
     max_size: int = 0
     tiling_dim: int = 0
-    column_scales: dict = None
+    column_ranges: dict = None
 
 
 class TiledEnv(object):
+    """The TiledEnv class. It models a tiled
+    environment
+    """
+
     IS_TILED_ENV_CONSTRAINT = True
 
     def __init__(self, config: TiledEnvConfig) -> None:
@@ -40,11 +46,13 @@ def __init__(self, config: TiledEnvConfig) -> None:
         # set up the columns scaling
         # only the columns that are to be altered participate in the
         # tiling
-        self.column_scales = config.column_scales
+        self.column_ranges = config.column_ranges
+        self.column_scales = {}
 
         # Initialize index hash table (IHT) for tile coding.
         # This assigns a unique index to each tile up to max_size tiles.
         self._validate()
+        self._create_column_scales()
         self.iht = IHT(self.max_size)
 
     @property
@@ -59,6 +67,10 @@ def n_actions(self) -> int:
     def n_states(self) -> int:
         return self.env.n_states
 
+    @property
+    def config(self) -> Config:
+        return self.env.config
+
     def step(self, action: ActionBase) -> TimeStep:
         """Execute the action in the environment and return
         a new state for observation
@@ -83,16 +95,11 @@ def step(self, action: ActionBase) -> TimeStep:
         # of the bin that the total distortion falls into
         state.bin_idx = raw_time_step.observation
         state.total_distortion = raw_time_step.info["total_distortion"]
-        state.column_names = self.env.column_names
+        state.column_distortions = self.env.column_distortions
 
         time_step = copy_time_step(time_step=raw_time_step, **{"observation": state})
-        #time_step = copy.deepcopy(raw_time_step)
-        #time_step.observation = state
-
         return time_step
 
-        return
-
     def reset(self, **options) -> TimeStep:
         """Reset the environment so that a new sequence
         of episodes can be generated
@@ -116,24 +123,29 @@ def reset(self, **options) -> TimeStep:
         # of the bin that the total distortion falls into
         state.bin_idx = raw_time_step.observation
         state.total_distortion = raw_time_step.info["total_distortion"]
-        state.column_names = self.env.column_names
+        state.column_distortions = self.env.column_distortions
 
         time_step = copy_time_step(time_step=raw_time_step, **{"observation": state})
 
-        #time_step = copy.deepcopy(raw_time_step)
-        #time_step.observation = state
-
         return time_step
 
     def get_action(self, aidx: int) -> ActionBase:
         return self.env.action_space[aidx]
 
     def save_current_dataset(self, episode_index: int, save_index: bool = False) -> None:
         """
-        Save the current distorted datase for the given episode index
-        :param episode_index:
-        :param save_index:
-        :return:
+        Save the current data set at the given episode index
+        Parameters
+        ----------
+
+        episode_index: Episode index corresponding to the training episode
+        save_index: if True the Pandas index is also saved
+
+        Returns
+        -------
+
+        None
+
         """
         self.env.save_current_dataset(episode_index, save_index)
 
@@ -200,22 +212,54 @@ def get_scaled_state(self, state: State) -> list:
         """
         scaled_state_vals = []
         for name in state:
-            scaled_state_vals.append(state[name] * self.columns_scales[name])
+            scaled_state_vals.append(state[name] * self.column_scales[name])
 
         return scaled_state_vals
 
-    def featurize_state_action(self, state, action: ActionBase) -> None:
-        """
-        Returns the featurized representation for a state-action pair
-        :param state:
-        :param action:
-        :return:
+    def featurize_state_action(self, state: State, action: ActionBase) -> List[Tile]:
+        """Get a list of Tiles for the given state and action
+
+        Parameters
+        ----------
+        state: The environment state observed
+        action: The action
+
+        Returns
+        -------
+
+        A list of tiles
+
         """
+
         scaled_state = self.get_scaled_state(state)
         featurized = tiles(self.iht, self.num_tilings, scaled_state, [action])
         return featurized
 
+    def _create_column_scales(self) -> None:
+        """
+        Compute each column's scale as tiling_dim divided by the width of its range
+
+        Returns
+        -------
+
+        None
+
+        """
+
+        for name in self.column_ranges:
+            range_ = self.column_ranges[name]
+            self.column_scales[name] = self.tiling_dim / (range_[1] - range_[0])
+
     def _validate(self) -> None:
+        """
+        Validate the internal data structures
+
+        Returns
+        -------
+
+        None
+
+        """
         if self.max_size <= 0:
             raise InvalidParamValue(param_name="max_size",
                                     param_value=str(self.max_size) + " should be > 0")
@@ -227,7 +271,10 @@ def _validate(self) -> None:
                                     param_value=str(self.max_size) +
                                     " should be >=num_tilings * tiling_dim * tiling_dim")
 
-        if len(self.column_scales) == 0:
+        if len(self.column_ranges) == 0:
             raise InvalidParamValue(param_name="column_scales",
                                     param_value=str(len(self.column_scales)) + " should not be empty")
 
+        if len(self.column_ranges) != len(self.env.column_names):
+            raise ValueError("Column ranges is not equal to number of columns")
+