Skip to content

Commit b02705d

Browse files
committed
Add dataset distances
1 parent 1b54a81 commit b02705d

File tree

2 files changed

+29
-0
lines changed

2 files changed

+29
-0
lines changed

src/datasets/dataset_distances.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
"""
2+
Various utilities to calculate the distance
3+
between two datasets. All distance metrics work
4+
cumulatively: per-column distances are summed into a total.
5+
"""
6+
7+
from typing import TypeVar
8+
import numpy as np
9+
10+
DataSet = TypeVar("DataSet")


def lp_distance(ds1: DataSet, ds2: DataSet, p=None):
    """Compute the column-wise L_p distance between two datasets.

    For every column name reported by ``ds1.get_columns_names()`` the
    column is fetched from both datasets and the norm of their difference
    is computed with ``np.linalg.norm(..., ord=p)``.

    Args:
        ds1: First dataset. Must expose ``schema``, ``get_columns_names()``
            and ``get_column(col_name=...)``.
        ds2: Second dataset, exposing the same members; its ``schema`` must
            compare equal to ``ds1.schema``.
        p: Order of the norm, forwarded unchanged as ``ord`` to
            ``np.linalg.norm``. ``None`` selects NumPy's default (the
            2-norm for 1-D input, the Frobenius norm for 2-D input).

    Returns:
        A tuple ``(distances, total)``: ``distances`` maps each column name
        to its distance, and ``total`` is the sum of those distances
        (``0`` when there are no columns).

    Raises:
        AssertionError: If the two schemas do not compare equal.
    """
    # Raise explicitly instead of using `assert`: assert statements are
    # removed under `python -O`, which would let mismatched datasets through.
    # AssertionError is kept so existing callers see the same exception type.
    if not ds1.schema == ds2.schema:
        raise AssertionError("Invalid schema for datasets")

    distances = {
        col: np.linalg.norm(
            ds1.get_column(col_name=col) - ds2.get_column(col_name=col),
            ord=p,
        )
        for col in ds1.get_columns_names()
    }
    return distances, sum(distances.values())
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
"""
2+
Utilities for calculating the information leakage
3+
for a dataset
4+
"""

0 commit comments

Comments
 (0)