Skip to content

Commit abd2e28

Browse files
committed
Add semi-gradient SARSA algo
1 parent f0f14dd commit abd2e28

File tree

3 files changed

+267
-0
lines changed

3 files changed

+267
-0
lines changed
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
"""Module epsilon_greedy_q_estimator
2+
3+
"""
4+
5+
from src.utils.mixins import WithEstimatorMixin
6+
7+
class EpsilonGreedyQEstimator(WithEstimatorMixin):
    """Q-value estimator meant to be used with an epsilon-greedy policy.

    NOTE(review): placeholder implementation — it currently only
    initializes the WithEstimatorMixin base; no estimation logic yet.
    """

    def __init__(self) -> None:
        """Initialize the estimator by delegating to the mixin base."""
        # Python 3 zero-argument super() replaces the legacy
        # super(EpsilonGreedyQEstimator, self) form; same behavior,
        # less repetition and no risk of naming the wrong class.
        super().__init__()
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
"""Module semi_gradient_sarsa. Implements
2+
episodic semi-gradient SARSA for estimating the state-action
3+
value function. The implementation follows the algorithm
4+
at page 244 in the book by Sutton and Barto: Reinforcement Learning An Introduction
5+
second edition 2020
6+
7+
"""
8+
9+
from dataclasses import dataclass
10+
from typing import TypeVar
11+
12+
from src.utils.mixins import WithMaxActionMixin, WithQTableMixinBase, WithEstimatorMixin
13+
from src.utils.episode_info import EpisodeInfo
14+
from src.exceptions.exceptions import InvalidParamValue
15+
16+
Policy = TypeVar('Policy')
17+
Env = TypeVar('Env')
18+
State = TypeVar('State')
19+
Action = TypeVar('Action')
20+
21+
22+
@dataclass(init=True, repr=True)
class SemiGradSARSAConfig(object):
    """Configuration class for semi-gradient SARSA algorithm.

    The configuration is checked by SemiGradSARSA._validate before
    training begins.
    """
    gamma: float = 1.0  # discount factor for future rewards
    alpha: float = 0.1  # learning rate (step size) for the weight updates
    n_itrs_per_episode: int = 100  # max iterations per episode; must be > 0
    policy: Policy = None  # must be a WithEstimatorMixin instance (checked in _validate)
31+
32+
class SemiGradSARSA:
    """Implements episodic semi-gradient SARSA for estimating the
    state-action value function. The implementation follows the
    algorithm at page 244 in Sutton & Barto, "Reinforcement Learning:
    An Introduction", 2nd edition.

    NOTE(review): this is a skeleton — the environment-interaction
    steps in on_episode are placeholders and the weight updates are
    not implemented yet.
    """

    def __init__(self, config: SemiGradSARSAConfig) -> None:
        """Store the algorithm configuration.

        Parameters
        ----------
        config: The configuration of the algorithm
        """
        self.config: SemiGradSARSAConfig = config

    def actions_before_training(self, env: Env, **options) -> None:
        """Specify any actions necessary before training begins.

        Parameters
        ----------
        env: The environment to train on
        options: Any key-value options passed by the client

        Returns
        -------
        None

        Raises
        ------
        InvalidParamValue: If the configuration is None or the policy
            is not a WithEstimatorMixin
        ValueError: If n_itrs_per_episode is not positive
        """
        self._validate()
        self._init()

    def on_episode(self, env: Env, **options) -> EpisodeInfo:
        """Train on the given environment for a single episode.

        Parameters
        ----------
        env: The environment to train on
        options: Any key-value options passed by the client

        Returns
        -------
        An EpisodeInfo instance holding the accumulated episode
        reward and the number of iterations performed

        NOTE(review): environment interaction is not implemented yet;
        state, action, reward and next_state below are placeholders.
        """
        episode_reward = 0.0
        episode_n_itrs = 0

        # select a state
        state: State = None

        # choose an action using the policy
        action: Action = None

        for _ in range(self.config.n_itrs_per_episode):

            # take action and observe reward and next_state
            reward: float = 0.0
            episode_reward += reward
            next_state: State = None

            # if next_state is terminal i.e. the done flag
            # is set, then update the weights;
            # otherwise choose the next action as a function of q_hat
            next_action: Action = None

            # update the weights

            # update state and action for the next iteration
            state = next_state
            action = next_action

            episode_n_itrs += 1

        episode_info = EpisodeInfo()
        episode_info.episode_score = episode_reward
        episode_info.episode_itrs = episode_n_itrs
        return episode_info

    def _weights_update_episode_done(self, state: State, reward: float,
                                     action: Action, next_state: State) -> None:
        """Update the weights due to the fact that
        the episode is finished.

        Parameters
        ----------
        state: The current state
        reward: The reward to use
        action: The action we took at state
        next_state: The observed state

        Returns
        -------
        None
        """
        # NOTE(review): not implemented yet
        pass

    def _init(self) -> None:
        """Any initializations needed before starting the training.

        Returns
        -------
        None
        """
        # NOTE(review): not implemented yet
        pass

    def _validate(self) -> None:
        """Validate the state of the agent. Is called before
        any training begins to check that the starting state is sane.

        Returns
        -------
        None

        Raises
        ------
        InvalidParamValue: If the configuration is None or the policy
            is not a WithEstimatorMixin
        ValueError: If n_itrs_per_episode is not positive
        """
        if self.config is None:
            raise InvalidParamValue(param_name="self.config", param_value="None")

        if self.config.n_itrs_per_episode <= 0:
            raise ValueError("n_itrs_per_episode should be greater than zero")

        if not isinstance(self.config.policy, WithEstimatorMixin):
            raise InvalidParamValue(param_name="policy", param_value=str(self.config.policy))
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
import unittest
2+
import pytest
3+
4+
from src.algorithms.semi_gradient_sarsa import SemiGradSARSAConfig, SemiGradSARSA
5+
from src.algorithms.epsilon_greedy_q_estimator import EpsilonGreedyQEstimator
6+
from src.exceptions.exceptions import InvalidParamValue
7+
from src.spaces.tiled_environment import TiledEnv
8+
from src.spaces.discrete_state_environment import DiscreteStateEnvironment
9+
from src.datasets.datasets_loaders import MockSubjectsLoader, MockSubjectsData
10+
11+
class TestSemiGradSARSA(unittest.TestCase):
    """Unit tests for SemiGradSARSA and its configuration validation."""

    def test_constructor(self):
        """The agent stores the configuration it is given."""
        config = SemiGradSARSAConfig()
        semi_grad_sarsa = SemiGradSARSA(config)
        self.assertIsNotNone(semi_grad_sarsa.config)

    def test_actions_before_training_throws_1(self):
        """A None configuration is rejected before training."""
        semi_grad_sarsa = SemiGradSARSA(None)
        with pytest.raises(InvalidParamValue):
            semi_grad_sarsa.actions_before_training(env=None)

    def test_actions_before_training_throws_2(self):
        """A non-positive n_itrs_per_episode is rejected."""
        config = SemiGradSARSAConfig()
        config.n_itrs_per_episode = 0
        semi_grad_sarsa = SemiGradSARSA(config)

        # make sure this is valid
        self.assertIsNotNone(semi_grad_sarsa.config)

        with pytest.raises(ValueError):
            semi_grad_sarsa.actions_before_training(env=None)

    def test_actions_before_training_throws_3(self):
        """A policy that is not a WithEstimatorMixin is rejected."""
        config = SemiGradSARSAConfig()
        semi_grad_sarsa = SemiGradSARSA(config)

        # make sure this is valid
        self.assertIsNotNone(semi_grad_sarsa.config)

        with pytest.raises(InvalidParamValue):
            semi_grad_sarsa.actions_before_training(env=None)

    def test_on_episode_returns_info(self):
        """on_episode always returns an EpisodeInfo instance."""
        config = SemiGradSARSAConfig()
        semi_grad_sarsa = SemiGradSARSA(config)

        # make sure this is valid
        self.assertIsNotNone(semi_grad_sarsa.config)

        episode_info = semi_grad_sarsa.on_episode(env=None)
        self.assertIsNotNone(episode_info)

    def test_on_episode_trains(self):
        """End-to-end: validate, then run one episode on a tiled environment."""
        sarsa_config = SemiGradSARSAConfig(n_itrs_per_episode=1, policy=EpsilonGreedyQEstimator())
        semi_grad_sarsa = SemiGradSARSA(sarsa_config)

        # create a dataset using the default mock-subjects options
        ds_default_data = MockSubjectsData()
        ds = MockSubjectsLoader.from_options(filename=ds_default_data.FILENAME,
                                             names=ds_default_data.NAMES, drop_na=ds_default_data.DROP_NA,
                                             change_col_vals=ds_default_data.CHANGE_COLS_VALS,
                                             features_drop_names=ds_default_data.FEATURES_DROP_NAMES +
                                             ["preventative_treatment", "gender",
                                              "education", "mutation_status"],
                                             column_normalization=["salary"], column_types={"ethnicity": str,
                                                                                            "salary": float,
                                                                                            "diagnosis": int})

        # create the discrete environment and wrap it with tile coding
        discrete_env = DiscreteStateEnvironment.from_options(data_set=ds, action_space=None,
                                                             reward_manager=None, distortion_calculator=None)
        tiled_env = TiledEnv.from_options(env=discrete_env, max_size=4096, num_tilings=5, n_bins=10,
                                          column_ranges={"ethnicity": [0.0, 1.0],
                                                         "salary": [0.0, 1.0],
                                                         "diagnosis": [0.0, 1.0]}, tiling_dim=3)

        semi_grad_sarsa.actions_before_training(tiled_env)

        # make sure this is valid
        self.assertIsNotNone(semi_grad_sarsa.config)

        episode_info = semi_grad_sarsa.on_episode(env=tiled_env)
        self.assertIsNotNone(episode_info)
106+
# Allow running this test module directly with the unittest runner.
if __name__ == '__main__':
    unittest.main()

0 commit comments

Comments
 (0)