Skip to content

Commit e950a01

Browse files
committed
Update scala script
1 parent 019ee50 commit e950a01

File tree

2 files changed

+116
-13
lines changed

2 files changed

+116
-13
lines changed

scala_helpers/MeasureDataQuality.scala

Lines changed: 89 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,42 @@ package examples.example_3
66

77

88
import base.DefaultConfiguration
9-
import org.deidentifier.arx.Data
10-
import postprocessor.ResultPrinter.printHandleTop
11-
12-
//import scala.collection.JavaConversions._
13-
//import collection.convert.ImplicitConversionsToScala.map AsScala
14-
import collection.JavaConverters.* // asScala
9+
import org.deidentifier.arx.AttributeType.{Hierarchy, listMicroAggregationFunctions}
10+
import org.deidentifier.arx.aggregates.AggregateFunction
11+
import org.deidentifier.arx.aggregates.AggregateFunction.AggregateFunctionBuilder
12+
//import org.deidentifier.arx.aggregates.AggregateFunction.AggregateFunctionBuilder.*
13+
import org.deidentifier.arx.aggregates.HierarchyBuilderIntervalBased
14+
import org.deidentifier.arx.criteria.KAnonymity
15+
import org.deidentifier.arx.{ARXAnonymizer, ARXConfiguration, AttributeType, Data, DataType}
16+
17+
import java.lang
18+
import collection.JavaConverters.*
1519
import collection.convert.ImplicitConversions.*
1620
import java.io.File
17-
import java.nio.charset.Charset
21+
import java.nio.charset.{Charset, StandardCharsets}
22+
import postprocessor.ResultPrinter.{printHandle, printHandleTop, printResult}
23+
1824

1925
object MeasureDataQuality extends App{
2026

27+
def buildSalaryHierarchy: HierarchyBuilderIntervalBased[lang.Double] = {
28+
29+
val salaryHierarchy: HierarchyBuilderIntervalBased[lang.Double] = HierarchyBuilderIntervalBased.create(DataType.DECIMAL)
30+
val aggregateFunctionBuilder = AggregateFunction.forType(DataType.DECIMAL)
31+
32+
salaryHierarchy.addInterval(lang.Double(0.0), lang.Double(0.2222222222222222), aggregateFunctionBuilder.createArithmeticMeanOfBoundsFunction())
33+
salaryHierarchy.addInterval(lang.Double(0.2222222222222222), lang.Double(0.4444444444444444), aggregateFunctionBuilder.createArithmeticMeanOfBoundsFunction())
34+
salaryHierarchy.addInterval(lang.Double(0.4444444444444444), lang.Double(0.6666666666666666), aggregateFunctionBuilder.createArithmeticMeanOfBoundsFunction())
35+
salaryHierarchy.addInterval(lang.Double(0.6666666666666666), lang.Double(0.8888888888888888), aggregateFunctionBuilder.createArithmeticMeanOfBoundsFunction())
36+
salaryHierarchy.addInterval(lang.Double(0.8888888888888888), lang.Double(1.1111111111111112), aggregateFunctionBuilder.createArithmeticMeanOfBoundsFunction())
37+
salaryHierarchy.addInterval(lang.Double(1.1111111111111112), lang.Double(1.3333333333333333), aggregateFunctionBuilder.createArithmeticMeanOfBoundsFunction())
38+
salaryHierarchy.addInterval(lang.Double(1.3333333333333333), lang.Double(1.5555555555555554), aggregateFunctionBuilder.createArithmeticMeanOfBoundsFunction())
39+
salaryHierarchy.addInterval(lang.Double(1.5555555555555554), lang.Double(1.7777777777777777), aggregateFunctionBuilder.createArithmeticMeanOfBoundsFunction())
40+
salaryHierarchy.addInterval(lang.Double(1.7777777777777777), lang.Double(2.0), aggregateFunctionBuilder.createArithmeticMeanOfBoundsFunction())
41+
42+
salaryHierarchy
43+
}
44+
2145
def loadData: Tuple2[Data, Data] = {
2246

2347
val dataFileOrg: File = new File("/home/alex/qi3/drl_anonymity/src/examples/q_learn_distorted_sets/distorted_set_-1")
@@ -64,14 +88,70 @@ object MeasureDataQuality extends App{
6488

6589
def runKAnonimity: Unit = {
6690

67-
val data = loadData
91+
// load the data
92+
//val dataFile: File = new File("/home/alex/qi3/drl_anonymity/data/mocksubjects.csv")
93+
val dataFile: File = new File("/home/alex/qi3/drl_anonymity/data/hierarchies/normalized_salary_mocksubjects.csv")
94+
val data: Data = Data.create(dataFile, Charset.defaultCharset, ',')
95+
96+
printHandleTop(handle = data.getHandle, n = 5)
97+
98+
// set the attribute types; if an attribute is AttributeType.IDENTIFYING_ATTRIBUTE
99+
// then the attribute will be removed
100+
data.getDefinition().setAttributeType("preventative_treatment", AttributeType.IDENTIFYING_ATTRIBUTE)
101+
data.getDefinition().setAttributeType("gender", AttributeType.IDENTIFYING_ATTRIBUTE)
102+
data.getDefinition().setAttributeType("education", AttributeType.IDENTIFYING_ATTRIBUTE)
103+
data.getDefinition().setAttributeType("mutation_status", AttributeType.IDENTIFYING_ATTRIBUTE)
104+
data.getDefinition().setAttributeType("NHSno", AttributeType.IDENTIFYING_ATTRIBUTE)
105+
data.getDefinition().setAttributeType("given_name", AttributeType.IDENTIFYING_ATTRIBUTE)
106+
data.getDefinition().setAttributeType("surname", AttributeType.IDENTIFYING_ATTRIBUTE)
107+
data.getDefinition().setAttributeType("dob", AttributeType.IDENTIFYING_ATTRIBUTE)
108+
109+
// keep the diagnosis as an insensitive attribute
110+
data.getDefinition().setAttributeType("diagnosis", AttributeType.INSENSITIVE_ATTRIBUTE)
111+
112+
// for the quasi-identifying attributes we set the
113+
// hierarchies
114+
// the ethnicity hierarchy file
115+
val ethnicityHierarchyFile: File = new File("/home/alex/qi3/drl_anonymity/data/hierarchies/ethnicity_hierarchy.csv")
116+
data.getDefinition().setAttributeType("ethnicity", Hierarchy.create(ethnicityHierarchyFile,
117+
StandardCharsets.UTF_8, ';'))/*AttributeType.QUASI_IDENTIFYING_ATTRIBUTE)*/
118+
119+
// the salary hierarchy
120+
//val salaryHierarchyFile: File = new File("/home/alex/qi3/drl_anonymity/data/hierarchies/salary_hierarchy.csv")
121+
data.getDefinition().setAttributeType("salary", buildSalaryHierarchy) //AttributeType.QUASI_IDENTIFYING_ATTRIBUTE)
122+
123+
124+
// create the ethnicity hierarchy
125+
//val ethnicityHierarchy = Hierarchy.create(ethnicityHierarchyFile,
126+
// Charset.defaultCharset, ',')
68127

69128
// create the hierarchies for the ethnicity and
70129
// salary
130+
// Create an instance of the anonymizer
131+
val anonymizer = new ARXAnonymizer
132+
val config = ARXConfiguration.create
133+
config.addPrivacyModel(new KAnonymity(5))
134+
config.setSuppressionLimit(0.02d)
135+
136+
137+
// anonymize the data using k-anonymity
138+
val result = anonymizer.anonymize(data, config)
139+
140+
// Print info
141+
printResult(result, data)
142+
143+
// Process results
144+
System.out.println(" - Transformed data:")
145+
printHandle(handle = result.getOutput(false))
146+
System.out.println("Done!")
71147

72148
}
73149

74150
// execute Experiment 1
75-
experiment1
151+
//experiment1
152+
153+
//exploreHierarchy
154+
// run k-anonymity
155+
runKAnonimity
76156

77157
}

src/examples/create_hierarchies_arx.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
be loaded in the ARX tool
44
"""
55
import csv
6+
import numpy as np
67
from src.datasets.datasets_loaders import MockSubjectsLoader
78

89

@@ -14,8 +15,8 @@ def get_ethnicity_hierarchy():
1415
ethnicity_hierarchy["Chinese"] = ["Asian", "Asian"]
1516
ethnicity_hierarchy["Indian"] = ["Asian", "Asian"]
1617
ethnicity_hierarchy["Mixed White/Black African"] = ["White/Black", "Mixed"]
17-
ethnicity_hierarchy["Black African"] = ["Black", "African"]
18-
ethnicity_hierarchy["Asian other"] = ["Asian", "Other"]
18+
ethnicity_hierarchy["Black African"] = ["African", "Black"]
19+
ethnicity_hierarchy["Asian other"] = ["Asian", "Asian"]
1920
ethnicity_hierarchy["Black other"] = ["Black", "Other"]
2021
ethnicity_hierarchy["Mixed White/Black Caribbean"] = ["White/Black", "Mixed"]
2122
ethnicity_hierarchy["Mixed other"] = ["Mixed", "Mixe"]
@@ -26,7 +27,7 @@ def get_ethnicity_hierarchy():
2627
ethnicity_hierarchy["White British"] = ["British", "European"]
2728
ethnicity_hierarchy["Bangladeshi"] = ["Asian", "Asian"]
2829
ethnicity_hierarchy["White other"] = ["White", "White"]
29-
ethnicity_hierarchy["Black Caribbean"] = ["Black", "Caribbean"]
30+
ethnicity_hierarchy["Black Caribbean"] = ["Caribbean", "Black"]
3031
ethnicity_hierarchy["Pakistani"] = ["Asian", "Asian"]
3132

3233
return ethnicity_hierarchy
@@ -51,7 +52,7 @@ def get_ethnicity_hierarchy():
5152
# the values and create the hierarchy file
5253
filename = "/home/alex/qi3/drl_anonymity/data/hierarchies/ethnicity_hierarchy.csv"
5354
with open(filename, 'w') as fh:
54-
writer = csv.writer(fh, delimiter=",")
55+
writer = csv.writer(fh, delimiter=";")
5556

5657
ethnicity_column = ds.get_column(col_name="ethnicity").values
5758

@@ -64,3 +65,25 @@ def get_ethnicity_hierarchy():
6465
row.extend(ehnicity_map[val])
6566
writer.writerow(row)
6667

68+
# build the salary hierarchy file
69+
filename = "/home/alex/qi3/drl_anonymity/data/hierarchies/salary_hierarchy.csv"
70+
71+
# create bins for the salary generalization
72+
unique_salary = ds.get_column_unique_values(col_name="salary")
73+
unique_salary.sort()
74+
75+
# slightly increase the max value because otherwise
76+
# we get out of bounds
77+
bins = np.linspace(unique_salary[0], unique_salary[-1] + 1, 10)
78+
79+
with open(filename, 'w') as fh:
80+
writer = csv.writer(fh, delimiter=";")
81+
82+
start = bins[0]
83+
for i in range(1, bins.shape[0]): #ethnicity_column:
84+
end = bins[i]
85+
86+
row = [start, end]
87+
writer.writerow(row)
88+
start = end
89+

0 commit comments

Comments
 (0)