#13 Add policies

pockerman · pockerman · commit 72e98cc68bc0 · 2022-01-11T16:21:44.000Z
diff --git a/src/policies/__init__.py b/src/policies/__init__.py
diff --git a/src/policies/deterministic_policy.py b/src/policies/deterministic_policy.py
@@ -0,0 +1,23 @@
+import numpy as np
+from typing import TypeVar
+
+from src.policies.policy_adaptor_base import PolicyAdaptorBase
+
+# Generic placeholder used to annotate the policy object that an adaptor
+# receives and returns; no bound or constraint is imposed on it.
+PolicyBase = TypeVar('PolicyBase')
+
+
+class DeterministicAdaptorPolicy(PolicyAdaptorBase):
+    """
+    Policy adaptor that makes a policy greedy for a single state:
+    the entry of the highest-valued action is set to 1.0.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def __call__(self, policy: PolicyBase, *args, **kwargs) -> PolicyBase:
+        """
+        Mark the best action of one state in the given policy
+
+        Required keyword arguments:
+            s: index of the state whose policy row is updated
+            state_actions: array of action values for that state
+
+        Only ``policy[s][best_action]`` is written; every other entry
+        of ``policy[s]`` is left as it was. The policy is modified in place.
+
+        :param policy: The policy to update, indexed as policy[state][action]
+        :return: The same policy object that was passed in
+        :raises KeyError: if ``s`` or ``state_actions`` is not supplied
+        """
+        state: int = kwargs["s"]
+        action_values: np.ndarray = kwargs["state_actions"]
+
+        # np.argmax returns the first index on ties
+        best_action = np.argmax(action_values)
+        policy[state][best_action] = 1.0
+        return policy
diff --git a/src/policies/epsilon_greedy_policy.py b/src/policies/epsilon_greedy_policy.py
@@ -0,0 +1,88 @@
+"""
+Epsilon greedy policy implementation
+"""
+import random
+import numpy as np
+from enum import Enum
+from typing import Any, TypeVar
+
+
+from src.utils.mixins import WithMaxActionMixin
+
+# Placeholder type for the caller-supplied callable that recomputes epsilon;
+# it is invoked as method(current_eps, episode_idx) and must return the new epsilon.
+UserDefinedDecreaseMethod = TypeVar('UserDefinedDecreaseMethod')
+# Placeholder type for the environment; in this module only
+# env.action_space.n is read from it.
+Env = TypeVar("Env")
+
+
+class EpsilonDecreaseOption(Enum):
+    """
+    Options for reducing epsilon.
+
+    The selected option decides how EpsilonGreedyPolicy.actions_after_episode
+    updates epsilon.
+    """
+
+    # epsilon is left unchanged
+    NONE = 0
+    # epsilon decays exponentially with the episode index
+    EXPONENTIAL = 1
+    # epsilon is set to 1 / episode index
+    INVERSE_STEP = 2
+    # a fixed amount is subtracted from epsilon
+    CONSTANT_RATE = 3
+    # epsilon is recomputed by a caller-supplied callable
+    USER_DEFINED = 4
+
+
+class EpsilonGreedyPolicy(WithMaxActionMixin):
+    """
+    Epsilon-greedy action selection with optional decay of epsilon.
+
+    With probability ``1 - eps`` the action returned by the inherited
+    ``max_action`` is chosen; otherwise an action index is drawn at random
+    from ``[0, n_actions)``.
+    """
+
+    def __init__(self, env: Env, eps: float,
+                 decay_op: EpsilonDecreaseOption,
+                 max_eps: float = 1.0, min_eps: float = 0.001,
+                 epsilon_decay_factor: float = 0.01,
+                 user_defined_decrease_method: UserDefinedDecreaseMethod = None) -> None:
+        """
+        Initialize the policy
+
+        :param env: The environment; only env.action_space.n is read
+        :param eps: The initial exploration probability
+        :param decay_op: How epsilon is updated in actions_after_episode
+        :param max_eps: Upper value used by the EXPONENTIAL schedule
+        :param min_eps: Lower bound applied to epsilon after every update
+        :param epsilon_decay_factor: Decay rate for EXPONENTIAL, step size
+            for CONSTANT_RATE
+        :param user_defined_decrease_method: Callable invoked as
+            method(eps, episode_idx) when decay_op is USER_DEFINED
+        """
+        # Zero-argument super() starts the MRO search right after this class,
+        # so the mixin's own __init__ runs. super(WithMaxActionMixin, self)
+        # would start the search after the mixin and skip it.
+        super().__init__()
+        self._eps = eps
+        self._n_actions = env.action_space.n
+        self._decay_op = decay_op
+        self._max_eps = max_eps
+        self._min_eps = min_eps
+        self._epsilon_decay_factor = epsilon_decay_factor
+        self.user_defined_decrease_method: UserDefinedDecreaseMethod = user_defined_decrease_method
+
+    def __call__(self, q_func: Any, state: Any) -> int:
+        """
+        Select an action for the given state
+
+        :param q_func: The action-value table; stored on self.q_table
+            before the greedy action is looked up
+        :param state: The state to select an action for
+        :return: The index of the selected action
+        """
+
+        # select the greedy action with probability 1 - epsilon
+        if random.random() > self._eps:
+            self.q_table = q_func
+            return self.max_action(state=state, n_actions=self._n_actions)
+
+        # otherwise, select an action randomly
+        # what happens if we select an action that
+        # has exhausted its transforms?
+        #
+        # randrange returns a native int, matching the annotation, and avoids
+        # handing a NumPy array to random.choice.
+        return random.randrange(self._n_actions)
+
+    def actions_after_episode(self, episode_idx: int, **options) -> None:
+        """
+        Apply actions on the policy after the end of the episode
+
+        Updates epsilon according to the configured decay option and then
+        clamps it from below at min_eps. With EpsilonDecreaseOption.NONE
+        epsilon is left untouched (no clamping either).
+
+        :param episode_idx: The episode index
+        :param options: Unused
+        :return: None
+        """
+
+        decay_op = self._decay_op
+
+        if decay_op == EpsilonDecreaseOption.NONE:
+            return
+
+        if decay_op == EpsilonDecreaseOption.USER_DEFINED:
+            self._eps = self.user_defined_decrease_method(self._eps, episode_idx)
+
+        elif decay_op == EpsilonDecreaseOption.INVERSE_STEP:
+
+            # guard against division by zero on the first episode
+            if episode_idx == 0:
+                episode_idx = 1
+
+            self._eps = 1.0 / episode_idx
+
+        elif decay_op == EpsilonDecreaseOption.EXPONENTIAL:
+            eps_range = self._max_eps - self._min_eps
+            self._eps = self._min_eps + eps_range * np.exp(-self._epsilon_decay_factor * episode_idx)
+
+        elif decay_op == EpsilonDecreaseOption.CONSTANT_RATE:
+            self._eps -= self._epsilon_decay_factor
+
+        # never let epsilon drop below the configured minimum
+        if self._eps < self._min_eps:
+            self._eps = self._min_eps
+
+