"""Module epsilon_greedy_q_estimator. Implements
a Q-estimator that assumes linear function approximation.

"""
from typing import TypeVar
import numpy as np
from dataclasses import dataclass

from src.utils.mixins import WithEstimatorMixin
from src.policies.epsilon_greedy_policy import EpsilonGreedyPolicy, EpsilonGreedyConfig

StateActionVec = TypeVar('StateActionVec')
State = TypeVar('State')
Action = TypeVar('Action')
Env = TypeVar('Env')


@dataclass(init=True, repr=True)
class EpsilonGreedyQEstimatorConfig(EpsilonGreedyConfig):
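    """Configuration for EpsilonGreedyQEstimator.

    gamma is the discount factor, alpha the learning rate and
    env the environment instance the estimator interacts with.
    """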
    gamma: float = 1.0
    alpha: float = 1.0
    env: Env = None


class EpsilonGreedyQEstimator(WithEstimatorMixin):
    """Q-function estimator using an epsilon-greedy policy
    for action selection.
    """

    def __init__(self, config: EpsilonGreedyQEstimatorConfig):
        """Constructor

        Parameters
        ----------
        config: The instance configuration

        """
        super().__init__()
        self.eps_policy: EpsilonGreedyPolicy = EpsilonGreedyPolicy.from_config(config)
        self.alpha: float = config.alpha
        self.gamma: float = config.gamma
        self.env: Env = config.env
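        # weight vector of the linear approximator; it must be initialised
        # (e.g. to zeros with the length of the tiled state-action vector)
        # before q_hat_value or update_weights is called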
        self.weights: np.ndarray = None

    def q_hat_value(self, state_action_vec: StateActionVec) -> float:
        r"""Returns the :math:`\hat{q}` approximate value
        for the given state-action vector.

        Parameters
        ----------
        state_action_vec: The state-action tiled vector

        Returns
        -------
        float

        """
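        # linear function approximation: q_hat(s, a) = w . x(s, a),
        # where x(s, a) is the tiled state-action feature vector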
        return self.weights.dot(state_action_vec)

    def update_weights(self, total_reward: float, state_action: StateActionVec,
                       state_action_: StateActionVec, t: float) -> None:
        """Update the weights

        Parameters
        ----------
        total_reward: The reward observed
        state_action: The state-action vector that led to the reward
        state_action_: The state-action vector of the next step
        t: The decay factor for alpha

        Returns
        -------
        None

        """
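        # semi-gradient TD update of the linear approximator:
        # w <- w + (alpha / t) * [r + gamma * q_hat(s', a') - q_hat(s, a)] * x(s, a)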
        v1 = self.q_hat_value(state_action_vec=state_action)
        v2 = self.q_hat_value(state_action_vec=state_action_)
        self.weights += self.alpha / t * (total_reward + self.gamma * v2 - v1) * state_action

    def on_state(self, state: State) -> Action:
        """Returns the action on the given state

        Parameters
        ----------
        state: The state observed

        Returns
        -------
        An environment specific Action type

        """

        # compute the Q-value of every action
        # available at the given state
        q_values = []

        for action in range(self.env.n_actions):
            state_action_vector = self.env.get_state_action_tile(action=action, state=state)
            q_values.append(self.q_hat_value(state_action_vec=state_action_vector))

        # let the epsilon-greedy policy choose an action at the current state
        action = self.eps_policy(q_values, state)
        return action
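

# Minimal usage sketch (illustration only; `env`, `state` and `n_features` are
# hypothetical). The environment is assumed to expose `n_actions` and
# `get_state_action_tile(action, state)` returning a NumPy feature vector, as
# `on_state` above requires; the config would also need whatever fields
# EpsilonGreedyConfig defines:
#
#   config = EpsilonGreedyQEstimatorConfig(gamma=0.99, alpha=0.1, env=env)
#   estimator = EpsilonGreedyQEstimator(config=config)
#   estimator.weights = np.zeros(n_features)
#   action = estimator.on_state(state)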