Update scala script

pockerman · pockerman · commit e950a012c430 · 2022-02-02T13:29:34.000Z
diff --git a/scala_helpers/MeasureDataQuality.scala b/scala_helpers/MeasureDataQuality.scala
@@ -6,18 +6,42 @@ package examples.example_3
 
 
 import base.DefaultConfiguration
-import org.deidentifier.arx.Data
-import postprocessor.ResultPrinter.printHandleTop
-
-//import scala.collection.JavaConversions._
-//import collection.convert.ImplicitConversionsToScala.map AsScala
-import collection.JavaConverters.* // asScala
+import org.deidentifier.arx.AttributeType.{Hierarchy, listMicroAggregationFunctions}
+import org.deidentifier.arx.aggregates.AggregateFunction
+import org.deidentifier.arx.aggregates.AggregateFunction.AggregateFunctionBuilder
+//import org.deidentifier.arx.aggregates.AggregateFunction.AggregateFunctionBuilder.*
+import org.deidentifier.arx.aggregates.HierarchyBuilderIntervalBased
+import org.deidentifier.arx.criteria.KAnonymity
+import org.deidentifier.arx.{ARXAnonymizer, ARXConfiguration, AttributeType, Data, DataType}
+
+import java.lang
+import collection.JavaConverters.*
 import collection.convert.ImplicitConversions.*
 import java.io.File
-import java.nio.charset.Charset
+import java.nio.charset.{Charset, StandardCharsets}
+import postprocessor.ResultPrinter.{printHandle, printHandleTop, printResult}
+
 
 object MeasureDataQuality extends App{
 
+  def buildSalaryHierarchy: HierarchyBuilderIntervalBased[lang.Double] = {
+
+    val salaryHierarchy: HierarchyBuilderIntervalBased[lang.Double] = HierarchyBuilderIntervalBased.create(DataType.DECIMAL)
+    val aggregateFunctionBuilder = AggregateFunction.forType(DataType.DECIMAL)
+
+    salaryHierarchy.addInterval(lang.Double(0.0), lang.Double(0.2222222222222222), aggregateFunctionBuilder.createArithmeticMeanOfBoundsFunction())
+    salaryHierarchy.addInterval(lang.Double(0.2222222222222222), lang.Double(0.4444444444444444), aggregateFunctionBuilder.createArithmeticMeanOfBoundsFunction())
+    salaryHierarchy.addInterval(lang.Double(0.4444444444444444), lang.Double(0.6666666666666666), aggregateFunctionBuilder.createArithmeticMeanOfBoundsFunction())
+    salaryHierarchy.addInterval(lang.Double(0.6666666666666666), lang.Double(0.8888888888888888), aggregateFunctionBuilder.createArithmeticMeanOfBoundsFunction())
+    salaryHierarchy.addInterval(lang.Double(0.8888888888888888), lang.Double(1.1111111111111112), aggregateFunctionBuilder.createArithmeticMeanOfBoundsFunction())
+    salaryHierarchy.addInterval(lang.Double(1.1111111111111112), lang.Double(1.3333333333333333), aggregateFunctionBuilder.createArithmeticMeanOfBoundsFunction())
+    salaryHierarchy.addInterval(lang.Double(1.3333333333333333), lang.Double(1.5555555555555554), aggregateFunctionBuilder.createArithmeticMeanOfBoundsFunction())
+    salaryHierarchy.addInterval(lang.Double(1.5555555555555554), lang.Double(1.7777777777777777), aggregateFunctionBuilder.createArithmeticMeanOfBoundsFunction())
+    salaryHierarchy.addInterval(lang.Double(1.7777777777777777), lang.Double(2.0), aggregateFunctionBuilder.createArithmeticMeanOfBoundsFunction())
+
+    salaryHierarchy
+  }
+
   def loadData: Tuple2[Data, Data] = {
 
     val dataFileOrg: File = new File("/home/alex/qi3/drl_anonymity/src/examples/q_learn_distorted_sets/distorted_set_-1")
@@ -64,14 +88,70 @@ object MeasureDataQuality extends App{
 
   def runKAnonimity: Unit = {
 
-    val data = loadData
+    // load the data
+    //val dataFile: File = new File("/home/alex/qi3/drl_anonymity/data/mocksubjects.csv")
+    val dataFile: File = new File("/home/alex/qi3/drl_anonymity/data/hierarchies/normalized_salary_mocksubjects.csv")
+    val data: Data = Data.create(dataFile, Charset.defaultCharset, ',')
+
+    printHandleTop(handle = data.getHandle, n = 5)
+
+    // set the attribute types; an attribute marked as
+    // AttributeType.IDENTIFYING_ATTRIBUTE will be removed
+    data.getDefinition().setAttributeType("preventative_treatment", AttributeType.IDENTIFYING_ATTRIBUTE)
+    data.getDefinition().setAttributeType("gender", AttributeType.IDENTIFYING_ATTRIBUTE)
+    data.getDefinition().setAttributeType("education", AttributeType.IDENTIFYING_ATTRIBUTE)
+    data.getDefinition().setAttributeType("mutation_status", AttributeType.IDENTIFYING_ATTRIBUTE)
+    data.getDefinition().setAttributeType("NHSno", AttributeType.IDENTIFYING_ATTRIBUTE)
+    data.getDefinition().setAttributeType("given_name", AttributeType.IDENTIFYING_ATTRIBUTE)
+    data.getDefinition().setAttributeType("surname", AttributeType.IDENTIFYING_ATTRIBUTE)
+    data.getDefinition().setAttributeType("dob", AttributeType.IDENTIFYING_ATTRIBUTE)
+
+    // keep the diagnosis as an insensitive attribute
+    data.getDefinition().setAttributeType("diagnosis", AttributeType.INSENSITIVE_ATTRIBUTE)
+
+    // quasi-identifying attributes: for these we set the
+    // generalization hierarchies
+    // the ethnicity hierarchy file
+    val ethnicityHierarchyFile: File = new File("/home/alex/qi3/drl_anonymity/data/hierarchies/ethnicity_hierarchy.csv")
+    data.getDefinition().setAttributeType("ethnicity", Hierarchy.create(ethnicityHierarchyFile,
+      StandardCharsets.UTF_8, ';'))/*AttributeType.QUASI_IDENTIFYING_ATTRIBUTE)*/
+
+    // the salary hierarchy
+    //val salaryHierarchyFile: File = new File("/home/alex/qi3/drl_anonymity/data/hierarchies/salary_hierarchy.csv")
+    data.getDefinition().setAttributeType("salary", buildSalaryHierarchy) //AttributeType.QUASI_IDENTIFYING_ATTRIBUTE)
+
+
+    // create the ethnicity hierarchy
+    //val ethnicityHierarchy = Hierarchy.create(ethnicityHierarchyFile,
+    //  Charset.defaultCharset, ',')
 
     // create the hierarchies for the ethnicity and
     // salary
+    // Create an instance of the anonymizer
+    val anonymizer = new ARXAnonymizer
+    val config = ARXConfiguration.create
+    config.addPrivacyModel(new KAnonymity(5))
+    config.setSuppressionLimit(0.02d)
+
+
+    // anonymize the data using k-anonymity
+    val result = anonymizer.anonymize(data, config)
+
+    // Print info
+    printResult(result, data)
+
+    // Process results
+    System.out.println(" - Transformed data:")
+    printHandle(handle = result.getOutput(false))
+    System.out.println("Done!")
 
   }
 
   // execute Experiment 1
-  experiment1
+  //experiment1
+
+  //exploreHierarchy
+  // run k-anonymity
+  runKAnonimity
 
 }
diff --git a/src/examples/create_hierarchies_arx.py b/src/examples/create_hierarchies_arx.py
@@ -3,6 +3,7 @@
 be loaded in the ARX tool
 """
 import csv
+import numpy as np
 from src.datasets.datasets_loaders import MockSubjectsLoader
 
 
@@ -14,8 +15,8 @@ def get_ethnicity_hierarchy():
     ethnicity_hierarchy["Chinese"] = ["Asian", "Asian"]
     ethnicity_hierarchy["Indian"] = ["Asian", "Asian"]
     ethnicity_hierarchy["Mixed White/Black African"] = ["White/Black", "Mixed"]
-    ethnicity_hierarchy["Black African"] = ["Black", "African"]
-    ethnicity_hierarchy["Asian other"] = ["Asian", "Other"]
+    ethnicity_hierarchy["Black African"] = ["African", "Black"]
+    ethnicity_hierarchy["Asian other"] = ["Asian", "Asian"]
     ethnicity_hierarchy["Black other"] = ["Black", "Other"]
     ethnicity_hierarchy["Mixed White/Black Caribbean"] = ["White/Black", "Mixed"]
     ethnicity_hierarchy["Mixed other"] = ["Mixed", "Mixe"]
@@ -26,7 +27,7 @@ def get_ethnicity_hierarchy():
     ethnicity_hierarchy["White British"] = ["British", "European"]
     ethnicity_hierarchy["Bangladeshi"] = ["Asian", "Asian"]
     ethnicity_hierarchy["White other"] = ["White", "White"]
-    ethnicity_hierarchy["Black Caribbean"] = ["Black", "Caribbean"]
+    ethnicity_hierarchy["Black Caribbean"] = ["Caribbean", "Black"]
     ethnicity_hierarchy["Pakistani"] = ["Asian", "Asian"]
 
     return ethnicity_hierarchy
@@ -51,7 +52,7 @@ def get_ethnicity_hierarchy():
     # the values and create the hierarchy file
     filename = "/home/alex/qi3/drl_anonymity/data/hierarchies/ethnicity_hierarchy.csv"
     with open(filename, 'w') as fh:
-        writer = csv.writer(fh, delimiter=",")
+        writer = csv.writer(fh, delimiter=";")
 
         ethnicity_column = ds.get_column(col_name="ethnicity").values
 
@@ -64,3 +65,25 @@ def get_ethnicity_hierarchy():
             row.extend(ehnicity_map[val])
             writer.writerow(row)
 
+    # get the salary column
+    filename = "/home/alex/qi3/drl_anonymity/data/hierarchies/salary_hierarchy.csv"
+
+    # create bins for the salary generalization
+    unique_salary = ds.get_column_unique_values(col_name="salary")
+    unique_salary.sort()
+
+    # slightly increase the max value because
+    # otherwise we get out of bounds
+    bins = np.linspace(unique_salary[0], unique_salary[-1] + 1, 10)
+
+    with open(filename, 'w') as fh:
+        writer = csv.writer(fh, delimiter=";")
+
+        start = bins[0]
+        for i in range(1, bins.shape[0]): #ethnicity_column:
+            end = bins[i]
+
+            row = [start, end]
+            writer.writerow(row)
+            start = end
+