 from typing import TypeVar

 from src.exceptions.exceptions import InvalidParamValue
-from src.utils.mixins import WithMaxActionMixin
+from src.utils.mixins import WithMaxActionMixin, WithQTableMixinBase

 Env = TypeVar('Env')
 Policy = TypeVar('Policy')
+Criterion = TypeVar('Criterion')


 class QLearnConfig(object):
@@ -39,8 +40,8 @@ def name(self) -> str:

     def actions_before_training(self, env: Env, **options):

-        if self.config.policy is None:
-            raise InvalidParamValue(param_name="policy", param_value="None")
+        if not isinstance(self.config.policy, WithQTableMixinBase):
+            raise InvalidParamValue(param_name="policy", param_value=str(self.config.policy))

         for state in range(1, env.n_states):
             for action in range(env.n_actions):
@@ -56,10 +57,11 @@ def actions_after_episode_ends(self, **options):

         self.config.policy.actions_after_episode(options['episode_idx'])

-    def play(self, env: Env) -> None:
+    def play(self, env: Env, stop_criterion: Criterion) -> None:
         """
         Play the game on the environment. This should produce
         a distorted dataset.
+        :param stop_criterion: criterion deciding when the play loop should stop
         :param env:
         :return:
         """
@@ -69,7 +71,23 @@ def play(self, env: Env) -> None:
         # the max payout.
         # TODO: This will not work as the distortion is calculated
         # by summing over the columns.
-        raise NotImplementedError("Function not implemented")
+
+        # set the q_table for the policy
+        self.config.policy.q_table = self.q_table
+        total_dist = env.total_average_current_distortion()
+        while stop_criterion.continue_itr(total_dist):
+
+            if stop_criterion.iteration_counter == 12:
+                print("Break...")
+
+            # use the policy to select an action
+            state_idx = env.get_aggregated_state(total_dist)
+            action_idx = self.config.policy.on_state(state_idx)
+            action = env.get_action(action_idx)
+            print("{0} At state={1} with distortion={2} select action={3}".format("INFO: ", state_idx, total_dist,
+                                                                                  action.column_name + "-" + action.action_type.name))
+            env.step(action=action)
+            total_dist = env.total_average_current_distortion()

     def train(self, env: Env, **options) -> tuple:

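The loop added above only requires that stop_criterion exposes a continue_itr(total_distortion) method and an iteration_counter attribute. The commit does not show a criterion class, so the following is only a minimal sketch of something that would satisfy that interface; the class name, constructor parameters and threshold logic are illustrative assumptions, not part of this change.

class MaxIterationsOrDistortionCriterion:
    """Illustrative stop criterion: stop after max_itrs iterations
    or once the observed distortion reaches min_distortion."""

    def __init__(self, max_itrs: int, min_distortion: float) -> None:
        self.max_itrs = max_itrs
        self.min_distortion = min_distortion
        self.iteration_counter = 0

    def continue_itr(self, total_distortion: float) -> bool:
        # stop when either the iteration budget or the distortion target is reached
        if self.iteration_counter >= self.max_itrs or total_distortion >= self.min_distortion:
            return False
        self.iteration_counter += 1
        return True

With such a criterion, a call would look roughly like agent.play(env, MaxIterationsOrDistortionCriterion(max_itrs=100, min_distortion=0.8)); again, these argument values are only an example.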
@@ -84,15 +102,10 @@ def train(self, env: Env, **options) -> tuple:
         for itr in range(self.config.n_itrs_per_episode):

             # epsilon-greedy action selection
-            action_idx = self.config.policy(q_func=self.q_table, state=state)
+            action_idx = self.config.policy(q_table=self.q_table, state=state)

             action = env.get_action(action_idx)

-            #if action.action_type.name == "GENERALIZE" and action.column_name == "salary":
-            #    print("Attempt to generalize salary")
-            #else:
-            #    print(action.action_type.name, " on ", action.column_name)
-
             # take action A, observe R, S'
             next_time_step = env.step(action)
             next_state = next_time_step.observation
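Renaming the keyword argument from q_func to q_table assumes the policy object is callable with that signature during training and also offers the on_state() method used in play() above. Below is a rough, self-contained sketch of an epsilon-greedy policy compatible with both call sites; it is not the project's actual policy class, and the class name, the eps handling and the numpy-style q_table[state, action] indexing are assumptions.

import random

class EpsilonGreedyPolicySketch:
    """Illustrative epsilon-greedy policy; not the class defined in src.utils."""

    def __init__(self, n_actions: int, eps: float = 0.1) -> None:
        self.n_actions = n_actions
        self.eps = eps
        self.q_table = None  # set externally, e.g. by play()

    def __call__(self, q_table, state: int) -> int:
        # epsilon-greedy selection used during training
        if random.random() < self.eps:
            return random.randint(0, self.n_actions - 1)
        return max(range(self.n_actions), key=lambda a: q_table[state, a])

    def on_state(self, state: int) -> int:
        # greedy selection used during play, reading the table assigned to self.q_table
        return max(range(self.n_actions), key=lambda a: self.q_table[state, a])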
@@ -111,7 +124,8 @@ def train(self, env: Env, **options) -> tuple:

         return episode_score, total_distortion, counter

-    def _update_Q_table(self, state: int, action: int, n_actions: int, reward: float, next_state: int = None) -> None:
+    def _update_Q_table(self, state: int, action: int, n_actions: int,
+                        reward: float, next_state: int = None) -> None:
         """
         Update the Q-value for the state
         """
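The body of _update_Q_table is not shown in this diff. For reference only, the textbook tabular Q-learning update that a method with this signature would typically perform is sketched below as a free function; the alpha and gamma hyperparameters and the q_table[state, action] indexing are assumptions, not taken from the source.

def q_learning_update(q_table, state: int, action: int, n_actions: int,
                      reward: float, next_state: int = None,
                      alpha: float = 0.1, gamma: float = 0.99) -> None:
    # temporal-difference target: r + gamma * max_a' Q(s', a'), or just r on the terminal step
    q_next = max(q_table[next_state, a] for a in range(n_actions)) if next_state is not None else 0.0
    # Q(s, a) <- Q(s, a) + alpha * (target - Q(s, a))
    q_table[state, action] += alpha * (reward + gamma * q_next - q_table[state, action])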