Skip to content

Commit 81d3eeb

Browse files
committed
#53 API updates for n-step SARSA algorithm
1 parent 88706b7 commit 81d3eeb

File tree

6 files changed

+144
-60
lines changed

6 files changed

+144
-60
lines changed

build_sphinx_doc.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
#sphinx-quickstart docs
22

3-
sphinx-apidoc -f -o docs/source docs/projectdir
4-
#sphinx-build -b html docs/source/ docs/build/html
3+
#sphinx-apidoc -f -o docs/source docs/source/API
4+
sphinx-build -b html docs/source/ docs/build/html

docs/source/examples.rst

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,8 @@ Examples
33

44
Some examples can be found below
55

6-
- `Qlearning agent on a three columns dataset <src/examples/qlearning_three_columns.py>`_
7-
- `N-step semi-gradient SARSA on a three columns dataset <src/examples/nstep_semi_grad_sarsa_three_columns.py>`_
6+
.. toctree::
7+
:maxdepth: 4
8+
9+
Examples/qlearning_three_columns
10+
Examples/nstep_semi_grad_sarsa_three_columns

src/examples/nstep_semi_grad_sarsa_three_columns.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from src.utils.string_distance_calculator import StringDistanceType
1717
from src.utils.distortion_calculator import DistortionCalculationType, DistortionCalculator
1818
from src.spaces.discrete_state_environment import DiscreteStateEnvironment, DiscreteEnvConfig
19-
from src.spaces.tiled_environment import TiledEnv
19+
from src.spaces.tiled_environment import TiledEnv, TiledEnvConfig
2020
from src.utils.iteration_control import IterationControl
2121
from src.utils.plot_utils import plot_running_avg
2222
from src.utils import INFO
@@ -168,10 +168,10 @@ def load_dataset() -> MockSubjectsLoader:
168168
# create the environment
169169
env = DiscreteStateEnvironment(env_config=env_config)
170170

171+
tiled_env_config = TiledEnvConfig(env=env, num_tilings=NUM_TILINGS, max_size=MAX_SIZE, tiling_dim=TILING_DIM,
172+
column_scales={"ethnicity": [0.0, 1.0], "salary": [0.0, 1.0]})
171173
# we will use a tiled environment in this example
172-
tiled_env = TiledEnv(env=env, max_size=MAX_SIZE,
173-
num_tilings=NUM_TILINGS,
174-
tiling_dim=TILING_DIM)
174+
tiled_env = TiledEnv(config=tiled_env_config)
175175
tiled_env.reset()
176176

177177
# save the data before distortion so that we can

src/spaces/discrete_state_environment.py

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -7,41 +7,41 @@
77
import numpy as np
88
from pathlib import Path
99
from typing import TypeVar, List
10+
from dataclasses import dataclass
1011
import multiprocessing as mp
1112

1213
from src.spaces.actions import ActionBase, ActionType
1314
from src.spaces.time_step import TimeStep, StepType
1415

16+
1517
DataSet = TypeVar("DataSet")
1618
RewardManager = TypeVar("RewardManager")
1719
ActionSpace = TypeVar("ActionSpace")
1820
DistortionCalculator = TypeVar('DistortionCalculator')
1921

2022

23+
@dataclass(init=True, repr=True)
2124
class DiscreteEnvConfig(object):
22-
"""
23-
Configuration for discrete environment
25+
"""Configuration for discrete environment
2426
"""
2527

26-
def __init__(self) -> None:
27-
self.data_set: DataSet = None
28-
self.action_space: ActionSpace = None
29-
self.reward_manager: RewardManager = None
30-
self.average_distortion_constraint: float = 0.0
31-
self.gamma: float = 0.99
32-
self.n_states: int = 10
33-
self.min_distortion: float = 0.4
34-
self.max_distortion: float = 0.7
35-
self.punish_factor: float = 2.0
36-
self.reward_factor: float = 0.95
37-
self.n_rounds_below_min_distortion: int = 10
38-
self.distorted_set_path: Path = None
39-
self.distortion_calculator: DistortionCalculator = None
28+
data_set: DataSet = None
29+
action_space: ActionSpace = None
30+
reward_manager: RewardManager = None
31+
average_distortion_constraint: float = 0.0
32+
gamma: float = 0.99
33+
n_states: int = 10
34+
min_distortion: float = 0.4
35+
max_distortion: float = 0.7
36+
punish_factor: float = 2.0
37+
reward_factor: float = 0.95
38+
n_rounds_below_min_distortion: int = 10
39+
distorted_set_path: Path = None
40+
distortion_calculator: DistortionCalculator = None
4041

4142

4243
class DiscreteStateEnvironment(object):
43-
"""
44-
The DiscreteStateEnvironment class. Uses state aggregation in order
44+
"""The DiscreteStateEnvironment class. Uses state aggregation in order
4545
to create bins where the average total distortion of the dataset falls in
4646
"""
4747

@@ -80,6 +80,10 @@ def n_actions(self) -> int:
8080
def n_states(self) -> int:
8181
return self.config.n_states
8282

83+
@property
84+
def column_names(self) -> list:
85+
return self.config.data_set.get_columns_names()
86+
8387
def get_action(self, aidx: int) -> ActionBase:
8488
return self.config.action_space[aidx]
8589

@@ -268,7 +272,6 @@ def step(self, action: ActionBase) -> TimeStep:
268272

269273
# TODO: these modifications will cause the agent to always
270274
# move close to transition points
271-
# TODO: Remove the magic constants
272275
if next_state is not None and self.current_time_step.observation is not None:
273276
if next_state < min_dist_bin <= self.current_time_step.observation:
274277
# the agent chose to step into the chaos again

src/spaces/tiled_environment.py

Lines changed: 78 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2,26 +2,29 @@
22
Tile environment
33
"""
44

5+
import copy
56
from typing import TypeVar
7+
from dataclasses import dataclass
68
from src.extern.tile_coding import IHT, tiles
79
from src.spaces.actions import ActionBase, ActionType
810
from src.spaces.time_step import TimeStep
911
from src.exceptions.exceptions import InvalidParamValue
12+
from src.spaces.state import State
13+
from src.spaces.time_step import copy_time_step
1014

1115
Env = TypeVar('Env')
12-
State = TypeVar('State')
1316

1417

18+
@dataclass(init=True, repr=True)
1519
class TiledEnvConfig(object):
20+
"""Configuration for the TiledEnvironment
1621
"""
17-
Configuration for the TiledEnvironment
18-
"""
19-
def __init__(self):
20-
self.env: Env = None
21-
self.num_tilings: int = 0
22-
self.max_size = 0
23-
self.tiling_dim = 0
24-
self.column_scales = {}
22+
23+
env: Env = None
24+
num_tilings: int = 0
25+
max_size: int = 0
26+
tiling_dim: int = 0
27+
column_scales: dict = None
2528

2629

2730
class TiledEnv(object):
@@ -44,14 +47,6 @@ def __init__(self, config: TiledEnvConfig) -> None:
4447
self._validate()
4548
self.iht = IHT(self.max_size)
4649

47-
def step(self, action: ActionBase) -> TimeStep:
48-
"""
49-
Apply the action and return new state
50-
:param action: The action to apply
51-
:return:
52-
"""
53-
return self.env.step(action)
54-
5550
@property
5651
def action_space(self):
5752
return self.env.action_space
@@ -64,6 +59,72 @@ def n_actions(self) -> int:
6459
def n_states(self) -> int:
6560
return self.env.n_states
6661

62+
def step(self, action: ActionBase) -> TimeStep:
63+
"""Execute the action in the environment and return
64+
a new state for observation
65+
66+
Parameters
67+
----------
68+
action: The action to execute
69+
70+
Returns
71+
-------
72+
73+
An instance of TimeStep type
74+
75+
"""
76+
77+
raw_time_step = self.env.step(action)
78+
79+
# a state wrapper to communicate
80+
state = State()
81+
82+
# the raw environment returns an index
83+
# of the bin that the total distortion falls into
84+
state.bin_idx = raw_time_step.observation
85+
state.total_distortion = raw_time_step.info["total_distortion"]
86+
state.column_names = self.env.column_names
87+
88+
time_step = copy_time_step(time_step=raw_time_step, **{"observation": state})
89+
#time_step = copy.deepcopy(raw_time_step)
90+
#time_step.observation = state
91+
92+
return time_step
93+
94+
return
95+
96+
def reset(self, **options) -> TimeStep:
97+
"""Reset the environment so that a new sequence
98+
of episodes can be generated
99+
100+
Parameters
101+
----------
102+
options: Client provided named options
103+
104+
Returns
105+
-------
106+
107+
An instance of TimeStep type
108+
"""
109+
110+
raw_time_step = self.env.reset(**options)
111+
112+
# a state wrapper to communicate
113+
state = State()
114+
115+
# the raw environment returns an index
116+
# of the bin that the total distortion falls into
117+
state.bin_idx = raw_time_step.observation
118+
state.total_distortion = raw_time_step.info["total_distortion"]
119+
state.column_names = self.env.column_names
120+
121+
time_step = copy_time_step(time_step=raw_time_step, **{"observation": state})
122+
123+
#time_step = copy.deepcopy(raw_time_step)
124+
#time_step.observation = state
125+
126+
return time_step
127+
67128
def get_action(self, aidx: int) -> ActionBase:
68129
return self.env.action_space[aidx]
69130

@@ -130,21 +191,6 @@ def total_current_distortion(self) -> float:
130191
"""
131192
return self.env.total_current_distortion()
132193

133-
def reset(self, **options) -> TimeStep:
134-
"""
135-
Starts a new sequence and returns the first `TimeStep` of this sequence.
136-
Returns:
137-
A `TimeStep` namedtuple containing:
138-
step_type: A `StepType` of `FIRST`.
139-
reward: `None`, indicating the reward is undefined.
140-
discount: `None`, indicating the discount is undefined.
141-
observation: A NumPy array, or a nested dict, list or tuple of arrays.
142-
Scalar values that can be cast to NumPy arrays (e.g. Python floats)
143-
are also valid in place of a scalar array. Must conform to the
144-
specification returned by `observation_spec()`.
145-
"""
146-
return self.env.reset(**options)
147-
148194
def get_scaled_state(self, state: State) -> list:
149195
"""
150196
Scales the state components and returns the

src/spaces/time_step.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
33
"""
44

5+
import copy
56
import enum
67
from typing import NamedTuple, Generic, Optional, TypeVar
78

@@ -52,4 +53,35 @@ def last(self) -> bool:
5253

5354
@property
5455
def done(self) -> bool:
55-
return self.last()
56+
return self.last()
57+
58+
59+
def copy_time_step(time_step: TimeStep, **copy_options) -> TimeStep:
60+
"""Helper to copy partly or in whole a TimeStep namedtuple.
61+
If copy_options is None or empty it returns a deep copy
62+
of the given time step
63+
64+
Parameters
65+
----------
66+
time_step: The time step to copy
67+
copy_options: Members to be copied
68+
69+
Returns
70+
-------
71+
72+
An instance of the TimeStep namedtuple
73+
74+
"""
75+
if not copy_options or len(copy_options) == 0:
76+
return copy.deepcopy(time_step)
77+
78+
observation = copy_options["observation"] if "observation" in copy_options else time_step.observation
79+
step_type = copy_options["step_type"] if "step_type" in copy_options else time_step.step_type
80+
info = copy_options["info"] if "info" in copy_options else time_step.info
81+
reward = copy_options["reward"] if "reward" in copy_options else time_step.reward
82+
discount = copy_options["discount"] if "discount" in copy_options else time_step.discount
83+
return TimeStep(observation=observation, step_type=step_type, info=info,
84+
reward=reward, discount=discount)
85+
86+
87+

0 commit comments

Comments
 (0)