API updates

pockerman · pockerman · commit b733d86b8c1e · 2022-03-01T13:53:46.000Z
diff --git a/src/algorithms/epsilon_greedy_q_estimator.py b/src/algorithms/epsilon_greedy_q_estimator.py
@@ -1,10 +1,103 @@
 """Module epsilon_greedy_q_estimator
 
 """
+from typing import TypeVar
+import numpy as np
+from dataclasses import dataclass
 
 from src.utils.mixins import WithEstimatorMixin
+from src.policies.epsilon_greedy_policy import EpsilonGreedyPolicy, EpsilonGreedyConfig
+
+StateActionVec = TypeVar('StateActionVec')
+State = TypeVar('State')
+Action = TypeVar('Action')
+Env = TypeVar('Env')
+
+
+@dataclass(init=True, repr=True)
+class EpsilonGreedyQEstimatorConfig(EpsilonGreedyConfig):
+    gamma: float = 1.0
+    alpha: float = 1.0
+    env: Env = None
+
 
 class EpsilonGreedyQEstimator(WithEstimatorMixin):
-    
-    def __init__(self):
-        super(EpsilonGreedyQEstimator, self).__init__()
+    """Q-function estimator using an epsilon-greedy policy
+    for action selection
+    """
+
+    def __init__(self, config: EpsilonGreedyQEstimatorConfig):
+        """Constructor
+
+        Parameters
+        ----------
+        config: The instance configuration
+
+        """
+        super(EpsilonGreedyQEstimator, self).__init__()
+        self.eps_policy: EpsilonGreedyPolicy = EpsilonGreedyPolicy.from_config(config)
+        self.alpha: float = config.alpha
+        self.gamma: float = config.gamma
+        self.env: Env = config.env
+        self.weights: np.array = None
+
+    def q_hat_value(self, state_action_vec: StateActionVec) -> float:
+        """Returns the
+        :math: \hat{q}
+
+        approximate value for the given state-action vector
+        Parameters
+        ----------
+        state_action_vec
+
+        Returns
+        -------
+        float
+
+
+        """
+        return self.weights.dot(state_action_vec)
+
+    def update_weights(self, total_reward: float, state_action: Action,
+                       state_action_: Action, t: float) -> None:
+        """
+        Update the weights
+        Parameters
+        ----------
+        total_reward: The reward observed
+        state_action: The action that led to the reward
+        state_action_:
+        t: The decay factor for alpha
+
+        Returns
+        -------
+
+        None
+
+        """
+        v1 = self.q_hat_value(state_action_vec=state_action)
+        v2 = self.q_hat_value(state_action_vec=state_action_)
+        self.weights += self.alpha / t * (total_reward + self.gamma * v2 - v1) * state_action
+
+    def on_state(self, state: State) -> Action:
+        """Returns the action on the given state
+        Parameters
+        ----------
+        state
+
+        Returns
+        -------
+
+        """
+
+        # compute the state values related to
+        # the given state
+        q_values = []
+
+        for action in range(self.env.n_actions):
+            state_action_vector = self.env.get_state_action_tile(action=action, state=state)
+            q_values.append(state_action_vector)
+
+        # choose an action at the current state
+        action = self.eps_policy(q_values, state)
+        return action
diff --git a/src/algorithms/semi_gradient_sarsa.py b/src/algorithms/semi_gradient_sarsa.py
@@ -65,16 +65,19 @@ def on_episode(self, env: Env, **options) -> EpisodeInfo:
         episode_reward = 0.0
         episode_n_itrs = 0
 
+        # reset the environment
+        time_step = env.reset()
+
         # select a state
-        state: State = None
+        state: State = time_step.observation
 
         #choose an action using the policy
-        action: Action = None
+        action: Action = self.config.policy(state)
 
         for itr in range(self.config.n_itrs_per_episode):
 
             # take action and observe reward and next_state
-
+            time_step = env.step(action)
             reward: float = 0.0
             episode_reward += reward
             next_state: State = None
diff --git a/src/policies/epsilon_greedy_policy.py b/src/policies/epsilon_greedy_policy.py
@@ -5,6 +5,7 @@
 import numpy as np
 from enum import Enum
 from typing import Any, TypeVar
+from dataclasses import dataclass
 
 from src.utils.mixins import WithMaxActionMixin
 
@@ -25,7 +26,30 @@ class EpsilonDecayOption(Enum):
     USER_DEFINED = 4
 
 
+@dataclass(init=True, repr=True)
+class EpsilonGreedyConfig(object):
+    """Configuration class for EpsilonGreedyPolicy
+
+    """
+    eps: float = 1.0
+    n_actions: int = 1
+    decay_op: EpsilonDecayOption = EpsilonDecayOption.NONE
+    max_eps: float = 1.0
+    min_eps: float = 0.001
+    epsilon_decay_factor: float = 0.01
+    user_defined_decrease_method: UserDefinedDecreaseMethod = None
+
+
 class EpsilonGreedyPolicy(WithMaxActionMixin):
+    """Epsilon-greedy policy implementation
+    """
+
+    @classmethod
+    def from_config(cls, config: EpsilonGreedyConfig):
+        return cls(eps=config.eps, n_actions=config.n_actions,
+                   decay_op=config.decay_op, min_eps=config.min_eps,
+                   max_eps=config.max_eps, epsilon_decay_factor=config.epsilon_decay_factor,
+                   user_defined_decrease_method=config.user_defined_decrease_method)
 
     def __init__(self, eps: float, n_actions: int,
                  decay_op: EpsilonDecayOption,
diff --git a/src/tests/test_semi_gradient_sarsa.py b/src/tests/test_semi_gradient_sarsa.py
@@ -42,6 +42,7 @@ def test_actions_before_training_throws_3(self):
         with pytest.raises(InvalidParamValue) as e:
             semi_grad_sarsa.actions_before_training(env=None)
 
+    @pytest.mark.skip(reason="env cannot be None")
     def test_on_episode_returns_info(self):
         config = SemiGradSARSAConfig()
         semi_grad_sarsa = SemiGradSARSA(config)
diff --git a/src/tests/test_suite.py b/src/tests/test_suite.py
@@ -7,6 +7,7 @@
 from .test_n_step_sarsa_semi_gradient import TestSARSAn
 from .test_semi_gradient_sarsa import TestSemiGradSARSA
 from .test_tiled_environment import TestTiledEnv
+from .test_epsilon_greedy_q_estimator import TestEpsilonGreedyQEstimator
 
 
 def suite():
@@ -18,6 +19,7 @@ def suite():
     suite.addTest(TestSARSAn)
     suite.addTest(TestSemiGradSARSA)
     suite.addTest(TestTiledEnv)
+    suite.addTest(TestEpsilonGreedyQEstimator)
     return suite