Skip to content

Commit 516595a

Browse files
authored
Merge pull request #34 from pockerman/add_scala_utilities
Add scala utilities
2 parents f3257c9 + b220516 commit 516595a

File tree

6 files changed

+267
-13
lines changed

6 files changed

+267
-13
lines changed

scala_helpers/MeasureDataQuality.scala

Lines changed: 89 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,42 @@ package examples.example_3
66

77

88
import base.DefaultConfiguration
9-
import org.deidentifier.arx.Data
10-
import postprocessor.ResultPrinter.printHandleTop
11-
12-
//import scala.collection.JavaConversions._
13-
//import collection.convert.ImplicitConversionsToScala.map AsScala
14-
import collection.JavaConverters.* // asScala
9+
import org.deidentifier.arx.AttributeType.{Hierarchy, listMicroAggregationFunctions}
10+
import org.deidentifier.arx.aggregates.AggregateFunction
11+
import org.deidentifier.arx.aggregates.AggregateFunction.AggregateFunctionBuilder
12+
//import org.deidentifier.arx.aggregates.AggregateFunction.AggregateFunctionBuilder.*
13+
import org.deidentifier.arx.aggregates.HierarchyBuilderIntervalBased
14+
import org.deidentifier.arx.criteria.KAnonymity
15+
import org.deidentifier.arx.{ARXAnonymizer, ARXConfiguration, AttributeType, Data, DataType}
16+
17+
import java.lang
18+
import collection.JavaConverters.*
1519
import collection.convert.ImplicitConversions.*
1620
import java.io.File
17-
import java.nio.charset.Charset
21+
import java.nio.charset.{Charset, StandardCharsets}
22+
import postprocessor.ResultPrinter.{printHandle, printHandleTop, printResult}
23+
1824

1925
object MeasureDataQuality extends App{
2026

27+
/**
 * Builds the interval-based generalization hierarchy for the (normalized)
 * salary attribute.
 *
 * Nine adjacent intervals are registered between 0.0 and 2.0; each interval
 * uses the "arithmetic mean of bounds" aggregate function.
 *
 * @return the configured interval-based hierarchy builder
 */
def buildSalaryHierarchy: HierarchyBuilderIntervalBased[lang.Double] = {

  val salaryHierarchy: HierarchyBuilderIntervalBased[lang.Double] = HierarchyBuilderIntervalBased.create(DataType.DECIMAL)
  val aggregateFunctionBuilder = AggregateFunction.forType(DataType.DECIMAL)

  // Interval boundaries, kept as the exact literals used so far so that the
  // resulting intervals are bit-for-bit the same.
  val bounds: Seq[Double] = Seq(
    0.0,
    0.2222222222222222,
    0.4444444444444444,
    0.6666666666666666,
    0.8888888888888888,
    1.1111111111111112,
    1.3333333333333333,
    1.5555555555555554,
    1.7777777777777777,
    2.0
  )

  // Each consecutive pair of boundaries is one interval. Double.valueOf is used
  // for boxing because the java.lang.Double constructor is deprecated.
  bounds.zip(bounds.tail).foreach { case (lower, upper) =>
    salaryHierarchy.addInterval(
      lang.Double.valueOf(lower),
      lang.Double.valueOf(upper),
      aggregateFunctionBuilder.createArithmeticMeanOfBoundsFunction())
  }

  salaryHierarchy
}
44+
2145
def loadData: Tuple2[Data, Data] = {
2246

2347
val dataFileOrg: File = new File("/home/alex/qi3/drl_anonymity/src/examples/q_learn_distorted_sets/distorted_set_-1")
@@ -64,14 +88,70 @@ object MeasureDataQuality extends App{
6488

6589
/**
 * Runs a k-anonymity (k = 5) anonymization of the normalized-salary
 * mock-subjects dataset and prints the outcome on the console.
 */
def runKAnonimity: Unit = {

  // Input dataset with the salary column already normalized
  // (the raw alternative lives in data/mocksubjects.csv).
  val inputFile: File = new File("/home/alex/qi3/drl_anonymity/data/hierarchies/normalized_salary_mocksubjects.csv")
  val data: Data = Data.create(inputFile, Charset.defaultCharset, ',')

  printHandleTop(handle = data.getHandle, n = 5)

  val definition = data.getDefinition()

  // Attributes typed as IDENTIFYING_ATTRIBUTE are the ones to be removed
  // from the anonymized output.
  Seq(
    "preventative_treatment",
    "gender",
    "education",
    "mutation_status",
    "NHSno",
    "given_name",
    "surname",
    "dob"
  ).foreach { column =>
    definition.setAttributeType(column, AttributeType.IDENTIFYING_ATTRIBUTE)
  }

  // The diagnosis column is kept as an insensitive attribute.
  definition.setAttributeType("diagnosis", AttributeType.INSENSITIVE_ATTRIBUTE)

  // Quasi-identifying attributes are given generalization hierarchies:
  // ethnicity from a ';'-separated hierarchy file, salary from code.
  val ethnicityHierarchyFile: File = new File("/home/alex/qi3/drl_anonymity/data/hierarchies/ethnicity_hierarchy.csv")
  val ethnicityHierarchy = Hierarchy.create(ethnicityHierarchyFile, StandardCharsets.UTF_8, ';')
  definition.setAttributeType("ethnicity", ethnicityHierarchy)
  definition.setAttributeType("salary", buildSalaryHierarchy)

  // Anonymizer configuration: 5-anonymity with a 2% suppression limit.
  val anonymizer = new ARXAnonymizer
  val config = ARXConfiguration.create
  config.addPrivacyModel(new KAnonymity(5))
  config.setSuppressionLimit(0.02d)

  val result = anonymizer.anonymize(data, config)

  // Summary first, then the transformed rows.
  printResult(result, data)
  System.out.println(" - Transformed data:")
  printHandle(handle = result.getOutput(false))
  System.out.println("Done!")

}
73149

74150
// execute Experiment 1
75-
experiment1
151+
//experiment1
152+
153+
//exploreHierarchy
154+
// run k-anonymity
155+
runKAnonimity
76156

77157
}

scala_helpers/ResultPrinter.scala

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
package postprocessor
2+
3+
import scala.util.control.Breaks._
4+
import java.text.DecimalFormat
5+
import collection.JavaConverters.* // asScala
6+
import org.deidentifier.arx.{ARXResult, Data, DataHandle}
7+
8+
/**
 * Console helpers for inspecting ARX results and data handles.
 * Adapted from https://github.com/arx-deidentifier/arx/blob/master/src/example/org/deidentifier/arx/examples/Example.java
 */
object ResultPrinter {

  /**
   * Prints a summary of an anonymization result: the time needed, the
   * information loss, the generalization level chosen for every
   * quasi-identifying attribute and the equivalence-class statistics.
   *
   * When the result has no global optimum only the time and a
   * "No solution found!" line are printed.
   *
   * @param result the ARX result to summarize
   * @param data   the input data whose definition supplies the
   *               quasi-identifying attributes and their hierarchies
   */
  def printResult(result: ARXResult, data: Data): Unit = {

    val timeFormat = new DecimalFormat("#####0.00")
    System.out.println(" - Time needed: " + timeFormat.format(result.getTime / 1000d) + "s")

    val definition = data.getDefinition
    val qis: Vector[String] = definition.getQuasiIdentifyingAttributes.asScala.toSet[String].toVector

    // getGlobalOptimum is null when no solution exists; wrap it instead of
    // bailing out with an early return.
    Option(result.getGlobalOptimum) match {
      case None =>
        System.out.println(" - No solution found!")

      case Some(optimum) =>
        // "<level>" or "<level>/<max level>" when a hierarchy is available.
        val levels: Vector[String] = qis.map { qi =>
          val level = optimum.getGeneralization(qi).toString
          if (definition.isHierarchyAvailable(qi)) s"$level/${definition.getHierarchy(qi)(0).length - 1}"
          else level
        }

        // Column widths: names are left-aligned, levels right-aligned.
        val nameWidth = qis.map(_.length).foldLeft(0)(_ max _)
        val levelWidth = levels.map(_.length).foldLeft(0)(_ max _)

        System.out.println(" - Information loss: " + optimum.getLowestScore + " / " + optimum.getHighestScore)
        System.out.println(" - Optimal generalization")
        qis.zip(levels).foreach { case (name, level) =>
          val paddedName = name.padTo(nameWidth, ' ')
          val paddedLevel = (" " * (levelWidth - level.length)) + level
          System.out.println(" * " + paddedName + ": " + paddedLevel)
        }
        System.out.println(" - Statistics")
        System.out.println(result.getOutput(optimum, false).getStatistics.getEquivalenceClassStatistics)
    }
  }

  /**
   * Prints every row of the handle, one row per line, values separated by a
   * single space and prefixed with a space.
   *
   * @param handle the data handle to print
   */
  def printHandle(handle: DataHandle): Unit =
    handle.iterator.asScala.foreach(printRow)

  /**
   * Prints at most the first n rows of the handle, in the same format as
   * [[printHandle]]. Nothing is printed when n is zero or negative.
   *
   * @param handle the data handle to print
   * @param n      the maximum number of rows to print
   */
  def printHandleTop(handle: DataHandle, n: Int): Unit =
    handle.iterator.asScala.take(n).foreach(printRow)

  /** Prints one row as " v1 v2 ... vk". */
  private def printRow(row: Array[String]): Unit =
    System.out.println(" " + row.mkString(" "))

}

scala_helpers/ResultWriter.scala

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
package postprocessor
2+
3+
import java.io.File
4+
import org.deidentifier.arx.DataHandle
5+
6+
7+
/**
 * Base type for components that persist the content of a DataHandle.
 * Concrete writers decide the output format.
 */
abstract class ResultWriter {

  /**
   * Saves the handle to the file at the given path.
   *
   * @param handle   the data handle to persist
   * @param fileName path of the file to write
   */
  def save(handle: DataHandle, fileName: String): Unit

  /**
   * Saves the handle to the given file.
   *
   * @param handle the data handle to persist
   * @param file   the file to write
   */
  def save(handle: DataHandle, file: File): Unit

}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
package postprocessor
2+
3+
import java.io.File
4+
import org.deidentifier.arx.DataHandle
5+
6+
7+
/**
 * Writes the given DataHandle to a CSV file.
 *
 * @param delimiter the column separator used by both save overloads
 *                  (defaults to a comma)
 */
class ResultWriterCSV(val delimiter: Char=',') extends ResultWriter {

  /**
   * Saves the handle to the file at the given path using the configured delimiter.
   *
   * @param handle   the data handle to persist
   * @param fileName path of the CSV file to write
   */
  override def save(handle: DataHandle, fileName: String): Unit = {
    handle.save(fileName, delimiter)
  }

  /**
   * Saves the handle to the given file using the configured delimiter.
   *
   * @param handle the data handle to persist
   * @param file   the CSV file to write
   */
  override def save(handle: DataHandle, file: File): Unit = {
    // Pass the delimiter explicitly: the single-argument save(file) would
    // ignore the configured separator and make the two overloads disagree.
    handle.save(file, delimiter)
  }

}

src/examples/create_hierarchies_arx.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
be loaded in the ARX tool
44
"""
55
import csv
6+
import numpy as np
67
from src.datasets.datasets_loaders import MockSubjectsLoader
78

89

@@ -14,8 +15,8 @@ def get_ethnicity_hierarchy():
1415
ethnicity_hierarchy["Chinese"] = ["Asian", "Asian"]
1516
ethnicity_hierarchy["Indian"] = ["Asian", "Asian"]
1617
ethnicity_hierarchy["Mixed White/Black African"] = ["White/Black", "Mixed"]
17-
ethnicity_hierarchy["Black African"] = ["Black", "African"]
18-
ethnicity_hierarchy["Asian other"] = ["Asian", "Other"]
18+
ethnicity_hierarchy["Black African"] = ["African", "Black"]
19+
ethnicity_hierarchy["Asian other"] = ["Asian", "Asian"]
1920
ethnicity_hierarchy["Black other"] = ["Black", "Other"]
2021
ethnicity_hierarchy["Mixed White/Black Caribbean"] = ["White/Black", "Mixed"]
2122
ethnicity_hierarchy["Mixed other"] = ["Mixed", "Mixe"]
@@ -26,7 +27,7 @@ def get_ethnicity_hierarchy():
2627
ethnicity_hierarchy["White British"] = ["British", "European"]
2728
ethnicity_hierarchy["Bangladeshi"] = ["Asian", "Asian"]
2829
ethnicity_hierarchy["White other"] = ["White", "White"]
29-
ethnicity_hierarchy["Black Caribbean"] = ["Black", "Caribbean"]
30+
ethnicity_hierarchy["Black Caribbean"] = ["Caribbean", "Black"]
3031
ethnicity_hierarchy["Pakistani"] = ["Asian", "Asian"]
3132

3233
return ethnicity_hierarchy
@@ -51,7 +52,7 @@ def get_ethnicity_hierarchy():
5152
# the values and create the hierarchy file
5253
filename = "/home/alex/qi3/drl_anonymity/data/hierarchies/ethnicity_hierarchy.csv"
5354
with open(filename, 'w') as fh:
54-
writer = csv.writer(fh, delimiter=",")
55+
writer = csv.writer(fh, delimiter=";")
5556

5657
ethnicity_column = ds.get_column(col_name="ethnicity").values
5758

@@ -64,3 +65,25 @@ def get_ethnicity_hierarchy():
6465
row.extend(ehnicity_map[val])
6566
writer.writerow(row)
6667

68+
# get the salary column
69+
filename = "/home/alex/qi3/drl_anonymity/data/hierarchies/salary_hierarchy.csv"
70+
71+
# create bins for the salary generalization
72+
unique_salary = ds.get_column_unique_values(col_name="salary")
73+
unique_salary.sort()
74+
75+
# modify slightly the max value because
76+
# we get out of bounds
77+
bins = np.linspace(unique_salary[0], unique_salary[-1] + 1, 10)
78+
79+
with open(filename, 'w') as fh:
80+
writer = csv.writer(fh, delimiter=";")
81+
82+
start = bins[0]
83+
for i in range(1, bins.shape[0]): #ethnicity_column:
84+
end = bins[i]
85+
86+
row = [start, end]
87+
writer.writerow(row)
88+
start = end
89+
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import csv
2+
from pathlib import Path
3+
from src.datasets.datasets_loaders import MockSubjectsLoader
4+
5+
if __name__ == '__main__':
    # Candidate columns to drop. Computed for reference only: the loader is
    # configured below to drop nothing.
    extra_drop_names = ["preventative_treatment", "gender",
                        "education", "mutation_status"]
    drop_columns = MockSubjectsLoader.FEATURES_DROP_NAMES + extra_drop_names
    MockSubjectsLoader.FEATURES_DROP_NAMES = []  # drop_columns

    # Only the salary column is normalized.
    MockSubjectsLoader.NORMALIZED_COLUMNS = ["salary"]

    # Columns to load, together with the type each one is read as.
    column_types = {
        "gender": str,
        "ethnicity": str,
        "education": int,
        "salary": float,
        "diagnosis": int,
        "preventative_treatment": str,
        "mutation_status": int,
    }
    MockSubjectsLoader.COLUMNS_TYPES = column_types

    ds = MockSubjectsLoader()

    # Write the dataset (without the index) to the CSV file.
    output_path = Path("/home/alex/qi3/drl_anonymity/data/hierarchies/normalized_salary_mocksubjects.csv")
    ds.save_to_csv(filename=output_path, save_index=False)

0 commit comments

Comments
 (0)