
Commit f3257c9

Merge pull request #33 from pockerman/add_scala_utilities
Add scala utilities
2 parents 10cba59 + 019ee50 commit f3257c9

8 files changed: +175 additions, -28 deletions

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
/**
 * Investigate various output quality measures supplied by ARX
 *
 */
package examples.example_3


import base.DefaultConfiguration
import org.deidentifier.arx.Data
import postprocessor.ResultPrinter.printHandleTop

//import scala.collection.JavaConversions._
//import collection.convert.ImplicitConversionsToScala.map AsScala
import collection.JavaConverters.* // asScala
import collection.convert.ImplicitConversions.*
import java.io.File
import java.nio.charset.Charset

object MeasureDataQuality extends App{

  def loadData: Tuple2[Data, Data] = {

    val dataFileOrg: File = new File("/home/alex/qi3/drl_anonymity/src/examples/q_learn_distorted_sets/distorted_set_-1")
    val dataOrg: Data = Data.create(dataFileOrg, Charset.defaultCharset, ',')

    val dataFileDist: File = new File("/home/alex/qi3/drl_anonymity/src/examples/q_learn_distorted_sets/distorted_set_-2")
    val dataDist: Data = Data.create(dataFileDist, Charset.defaultCharset, ',')

    require(dataOrg.getHandle.getNumRows == dataDist.getHandle.getNumRows)
    require(dataOrg.getHandle.getNumColumns == dataDist.getHandle.getNumColumns)

    // define the attribute types
    System.out.println(s"Number of rows ${dataOrg.getHandle.getNumRows}")
    System.out.println(s"Number of cols ${dataOrg.getHandle.getNumColumns}")

    printHandleTop(handle = dataOrg.getHandle, n = 5)
    System.out.println("Done...")

    (dataOrg, dataDist)
  }

  def experiment1: Unit = {

    val data = loadData

    val dataHandleOrg = data._1.getHandle
    val dataHandleDist = data._2.getHandle

    val summaryStatsDist = dataHandleDist.getStatistics().getSummaryStatistics(true)
    val summaryStatsOrg = dataHandleOrg.getStatistics().getSummaryStatistics(true)
    // getEquivalenceClassStatistics(); //getEquivalenceClassStatistics();

    for((key, value) <- summaryStatsDist){
      println(s"Column: ${key}")
      println("-----------------------Distorted/Original")
      println(s"distinctNumberOfValues ${value.getNumberOfDistinctValuesAsString}/${summaryStatsOrg.get(key).getNumberOfDistinctValuesAsString}")
      println(s"Mode ${value.getModeAsString}/${summaryStatsOrg.get(key).getModeAsString}")
      if(value.isMaxAvailable) {
        println(s"Max ${value.getMaxAsString}/${summaryStatsOrg.get(key).getMaxAsString}")
        println(s"Min ${value.getMinAsString}/${summaryStatsOrg.get(key).getMinAsString}")
      }
    }
  }

  def runKAnonimity: Unit = {

    val data = loadData

    // create the hierarchies for the ethnicity and
    // salary

  }

  // execute Experiment 1
  experiment1

}
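The runKAnonimity stub above only reloads the data; the hierarchies and the privacy model are still missing. Below is a minimal sketch, not part of this commit, of how the body might continue with ARX. The hierarchy file path, the choice of k = 2 and the 2% suppression limit are illustrative assumptions (the salary column would need its own hierarchy as well), and on older ARX releases addPrivacyModel/setSuppressionLimit are named addCriterion/setMaxOutliers.

import org.deidentifier.arx.{ARXAnonymizer, ARXConfiguration, Data}
import org.deidentifier.arx.AttributeType.Hierarchy
import org.deidentifier.arx.criteria.KAnonymity
import java.io.File
import java.nio.charset.Charset

def runKAnonymitySketch(data: Data): Unit = {

  // attach a generalization hierarchy to the ethnicity quasi-identifier;
  // the CSV is produced by the hierarchy-generation example further down
  val ethnicityHierarchy = Hierarchy.create(
    new File("/home/alex/qi3/drl_anonymity/data/hierarchies/ethnicity_hierarchy.csv"),
    Charset.defaultCharset, ',')
  data.getDefinition.setAttributeType("ethnicity", ethnicityHierarchy)

  // require 2-anonymity and allow a small fraction of suppressed records
  val config = ARXConfiguration.create()
  config.addPrivacyModel(new KAnonymity(2))
  config.setSuppressionLimit(0.02d)

  // run the anonymizer and inspect the output handle
  val anonymizer = new ARXAnonymizer()
  val result = anonymizer.anonymize(data, config)
  val outHandle = result.getOutput
  println(s"Anonymized rows ${outHandle.getNumRows}")
}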

scala_helpers/build.sbt

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
name := "data_anonymizer_scala"

version := "0.1"

scalaVersion := "3.0.2"

libraryDependencies += "org.scalactic" %% "scalactic" % "3.2.10"
libraryDependencies += "org.scalatest" %% "scalatest" % "3.2.10" % "test"
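Note that this build file only wires in ScalaTest and Scalactic, while the example above imports org.deidentifier.arx, so the ARX library has to be supplied separately. One possible way, assuming the ARX jar has been downloaded manually (the file name below is an assumption), is to rely on sbt's unmanaged dependencies:

// sbt treats every jar under the project's lib/ directory as an unmanaged dependency,
// so placing e.g. scala_helpers/lib/libarx-3.9.0.jar there is enough. The jar can also
// be referenced explicitly from build.sbt:
Compile / unmanagedJars += Attributed.blank(file("lib/libarx-3.9.0.jar"))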

src/algorithms/q_learning.py

Lines changed: 2 additions & 2 deletions
@@ -74,7 +74,7 @@ def play(self, env: Env, stop_criterion: Criterion) -> None:
 
         # set the q_table for the policy
         self.config.policy.q_table = self.q_table
-        total_dist = env.total_average_current_distortion()
+        total_dist = env.total_current_distortion()
         while stop_criterion.continue_itr(total_dist):
 
             if stop_criterion.iteration_counter == 12:
@@ -87,7 +87,7 @@ def play(self, env: Env, stop_criterion: Criterion) -> None:
             print("{0} At state={1} with distortion={2} select action={3}".format("INFO: ", state_idx, total_dist,
                                                                                    action.column_name + "-" + action.action_type.name))
             env.step(action=action)
-            total_dist = env.total_average_current_distortion()
+            total_dist = env.total_current_distortion()
 
     def train(self, env: Env, **options) -> tuple:
 
src/datasets/dataset_wrapper.py

Lines changed: 7 additions & 13 deletions
@@ -29,7 +29,6 @@ def read(self, filename: Path, **options) -> None:
 
 
 class PandasDSWrapper(DSWrapper[pd.DataFrame]):
-
     """
     Simple wrapper to a pandas DataFrame object.
     Facilitates various actions on the original dataset
@@ -60,15 +59,15 @@ def n_columns(self) -> int:
     def schema(self) -> dict:
         return pd.io.json.build_table_schema(self.ds)
 
-    def save_to_csv(self, filename: Path) -> None:
+    def save_to_csv(self, filename: Path, save_index: bool) -> None:
         """
         Save the underlying dataset in a csv format
         :param filename:
         :return:
         """
-        self.ds.to_csv(filename)
+        self.ds.to_csv(filename, index=save_index)
 
-    def read(self, filename: Path, **options) -> None:
+    def read(self, filename: Path, **options) -> None:
         """
         Load a data set from a file
         :param filename:
@@ -145,14 +144,14 @@ def get_column(self, col_name: str):
         return self.ds.loc[:, col_name]
 
     def get_column_unique_values(self, col_name: str):
-        """
+        """
         Returns the unique values for the column
         :param col_name:
         :return:
         """
-        col = self.get_column(col_name=col_name)
-        vals = col.values.ravel()
-        return pd.unique(vals)
+        col = self.get_column(col_name=col_name)
+        vals = col.values.ravel()
+        return pd.unique(vals)
 
     def get_columns_types(self):
         return list(self.ds.dtypes)
@@ -181,8 +180,3 @@ def apply_column_transform(self, column_name: str, transform: Transform) -> None
         column = self.get_column(col_name=column_name)
         column = transform.act(**{"data": column.values})
         self.ds[transform.column_name] = column
-
-
-
-
-

src/examples/__init__.py

Whitespace-only changes.
Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
"""
This example shows how to create hierarchies suitable to
be loaded into the ARX tool
"""
import csv
from src.datasets.datasets_loaders import MockSubjectsLoader


def get_ethnicity_hierarchy():

    ethnicity_hierarchy = {}

    ethnicity_hierarchy["Mixed White/Asian"] = ["White/Asian", "Mixed"]
    ethnicity_hierarchy["Chinese"] = ["Asian", "Asian"]
    ethnicity_hierarchy["Indian"] = ["Asian", "Asian"]
    ethnicity_hierarchy["Mixed White/Black African"] = ["White/Black", "Mixed"]
    ethnicity_hierarchy["Black African"] = ["Black", "African"]
    ethnicity_hierarchy["Asian other"] = ["Asian", "Other"]
    ethnicity_hierarchy["Black other"] = ["Black", "Other"]
    ethnicity_hierarchy["Mixed White/Black Caribbean"] = ["White/Black", "Mixed"]
    ethnicity_hierarchy["Mixed other"] = ["Mixed", "Mixe"]
    ethnicity_hierarchy["Arab"] = ["Asian", "Asian"]
    ethnicity_hierarchy["White Irish"] = ["Irish", "European"]
    ethnicity_hierarchy["Not stated"] = ["Not stated", "Not stated"]
    ethnicity_hierarchy["White Gypsy/Traveller"] = ["White", "White"]
    ethnicity_hierarchy["White British"] = ["British", "European"]
    ethnicity_hierarchy["Bangladeshi"] = ["Asian", "Asian"]
    ethnicity_hierarchy["White other"] = ["White", "White"]
    ethnicity_hierarchy["Black Caribbean"] = ["Black", "Caribbean"]
    ethnicity_hierarchy["Pakistani"] = ["Asian", "Asian"]

    return ethnicity_hierarchy


if __name__ == '__main__':

    # specify the columns to drop
    drop_columns = MockSubjectsLoader.FEATURES_DROP_NAMES + ["preventative_treatment", "gender",
                                                             "education", "mutation_status"]
    MockSubjectsLoader.FEATURES_DROP_NAMES = drop_columns

    # do a salary normalization
    MockSubjectsLoader.NORMALIZED_COLUMNS = ["salary"]

    # specify the columns to use
    MockSubjectsLoader.COLUMNS_TYPES = {"ethnicity": str, "salary": float, "diagnosis": int}
    ds = MockSubjectsLoader()

    ehnicity_map = get_ethnicity_hierarchy()
    # get the ethnicity column, loop over
    # the values and create the hierarchy file
    filename = "/home/alex/qi3/drl_anonymity/data/hierarchies/ethnicity_hierarchy.csv"
    with open(filename, 'w') as fh:
        writer = csv.writer(fh, delimiter=",")

        ethnicity_column = ds.get_column(col_name="ethnicity").values

        for val in ethnicity_column:

            if val not in ehnicity_map:
                raise ValueError("Value {0} not in ethnicity map".format(val))

            row = [val]
            row.extend(ehnicity_map[val])
            writer.writerow(row)
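Each row written by the loop above has the form value,level_1,level_2, i.e. the original value followed by its generalizations, which is the layout ARX expects for a materialized generalization hierarchy. For completeness, a small sketch (not part of this commit) of reading the generated file back on the Scala side; the path mirrors the one hard-coded above:

import org.deidentifier.arx.AttributeType.Hierarchy
import java.io.File
import java.nio.charset.Charset

// load the CSV produced by this script as an ARX generalization hierarchy
val ethnicityHierarchy: Hierarchy = Hierarchy.create(
  new File("/home/alex/qi3/drl_anonymity/data/hierarchies/ethnicity_hierarchy.csv"),
  Charset.defaultCharset, ',')

// every inner array holds a value followed by its generalization levels
println(s"Generalization levels: ${ethnicityHierarchy.getHierarchy.head.length - 1}")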

src/examples/qlearning_three_columns.py

Lines changed: 6 additions & 6 deletions
@@ -151,11 +151,10 @@ def get_ethinicity_hierarchy():
 # create the environment
 env = DiscreteStateEnvironment(env_config=env_config)
 env.reset()
-env.save_current_dataset(episode_index=-1)
 
-# save the original dataset for comparison
-env.save_current_dataset(episode_index=-1)
-env.reset()
+# save the data before distortion so that we can
+# later load it on ARX
+env.save_current_dataset(episode_index=-1, save_index=False)
 
 # configuration for the Q-learner
 algo_config = QLearnConfig()
@@ -195,7 +194,8 @@ def get_ethinicity_hierarchy():
 
 stop_criterion = IterationControl(n_itrs=10, min_dist=MIN_DISTORTION, max_dist=MAX_DISTORTION)
 agent.play(env=env, stop_criterion=stop_criterion)
-env.save_current_dataset(episode_index=-2)
-
+env.save_current_dataset(episode_index=-2, save_index=False)
+print("{0} Done....".format(INFO))
+print("=============================================")
 
src/spaces/discrete_state_environment.py

Lines changed: 9 additions & 7 deletions
@@ -128,14 +128,16 @@ def n_states(self) -> int:
     def get_action(self, aidx: int) -> ActionBase:
         return self.config.action_space[aidx]
 
-    def save_current_dataset(self, episode_index: int) -> None:
+    def save_current_dataset(self, episode_index: int, save_index: bool = False) -> None:
         """
         Save the current distorted dataset for the given episode index
         :param episode_index:
+        :param save_index:
         :return:
         """
         self.distorted_data_set.save_to_csv(
-            filename=Path(str(self.config.distorted_set_path) + "_" + str(episode_index)))
+            filename=Path(str(self.config.distorted_set_path) + "_" + str(episode_index)),
+            save_index=save_index)
 
     def create_bins(self) -> None:
         """
@@ -216,15 +218,14 @@ def apply_action(self, action: ActionBase):
 
         self.column_distances[action.column_name] = distance
 
-    def total_average_current_distortion(self) -> float:
+    def total_current_distortion(self) -> float:
         """
-        Calculates the average total distortion of the dataset
-        by summing over the current computed distances for each column
+        Calculates the current total distortion of the dataset.
         :return:
         """
 
         return self.config.distortion_calculator.total_distortion(
-            list(self.column_distances.values()))  # float(np.mean(list(self.column_distances.values())))
+            list(self.column_distances.values()))
 
     def reset(self, **options) -> TimeStep:
         """
@@ -270,7 +271,7 @@ def step(self, action: ActionBase) -> TimeStep:
         self.apply_action(action=action)
 
         # calculate the distortion of the dataset
-        current_distortion = self.total_average_current_distortion()
+        current_distortion = self.total_current_distortion()
 
         # get the reward for the current distortion
         reward = self.config.reward_manager.get_reward_for_state(state=current_distortion, **{"action": action})
@@ -312,6 +313,7 @@ def step(self, action: ActionBase) -> TimeStep:
 
         # TODO: these modifications will cause the agent to always
         # move close to transition points
+        # TODO: Remove the magic constants
         if next_state is not None and self.current_time_step.observation is not None:
             if next_state < min_dist_bin <= self.current_time_step.observation:
                 # the agent chose to step into the chaos again
