Skip to content

Commit 5b461b3

Browse files
committed
#23 Add column normalization
1 parent f9fb8fa commit 5b461b3

File tree

2 files changed

+44
-13
lines changed

2 files changed

+44
-13
lines changed

src/datasets/dataset_wrapper.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,6 @@ def __init__(self, columns: dir) -> None:
4040

4141
self.columns: dir = columns
4242

43-
# map that holds the hierarchy to be applied
44-
# on each column in the dataset
45-
#self.column_hierarchy = {}
46-
4743
@property
4844
def n_rows(self) -> int:
4945
"""
@@ -91,6 +87,11 @@ def read(self, filename: Path, **options) -> None:
9187
# try to cast the columns to their declared data types
9288
self.ds = change_column_types(ds=self.ds, column_types=self.columns)
9389

90+
if "column_normalization" in options and \
91+
options["column_normalization"] is not None:
92+
for col in options["column_normalization"]:
93+
self.normalize_column(column_name=col)
94+
9495
def normalize_column(self, column_name) -> None:
9596
"""
9697
Normalizes the column with the given name using the following
@@ -108,7 +109,15 @@ def normalize_column(self, column_name) -> None:
108109
if data_type is not int or data_type is not float:
109110
raise InvalidDataTypeException(param_name=column_name, param_types="[int, float]")
110111

111-
raise NotImplementedError("Function is not implemented")
112+
col_vals = self.get_column(col_name=column_name).values
113+
114+
min_val = np.min(col_vals)
115+
max_val = np.max(col_vals)
116+
117+
for i in range(len(col_vals)):
118+
col_vals[i] = (col_vals[i] - min_val) / (max_val - min_val)
119+
120+
self.ds[column_name] = col_vals
112121

113122
def sample_column_name(self) -> str:
114123
"""

src/datasets/datasets_loaders.py

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,50 @@
1+
"""
2+
Utility class that loads the mocksubjects.csv
3+
dataset and performs various transformations and cleaning
4+
on it.
5+
"""
6+
17
from pathlib import Path
28
from src.datasets.dataset_wrapper import PandasDSWrapper
39

410

511
class MockSubjectsLoader(PandasDSWrapper):
12+
"""
13+
The MockSubjectsLoader class. Loads the mocksubjects.csv dataset.
14+
"""
15+
16+
# Path to the dataset file
17+
FILENAME = Path("../../data/mocksubjects.csv")
618

7-
DEFAULT_COLUMNS = {"gender": str, "ethnicity": str, "education": int,
19+
# the assumed column types. We use this map to cast
20+
# the types of the columns
21+
COLUMNS_TYPES = {"gender": str, "ethnicity": str, "education": int,
822
"salary": int, "diagnosis": int, "preventative_treatment": str,
923
"mutation_status": int, }
1024

11-
FILENAME = Path("../../data/mocksubjects.csv")
12-
25+
# features to drop
1326
FEATURES_DROP_NAMES = ["NHSno", "given_name", "surname", "dob"]
1427

28+
# Names of the columns in the dataset
1529
NAMES = ["NHSno", "given_name", "surname", "gender",
1630
"dob", "ethnicity", "education", "salary",
1731
"mutation_status", "preventative_treatment", "diagnosis"]
1832

33+
# option to drop NaN
1934
DROP_NA = True
2035

36+
# Map that holds for each column the transformations
37+
# we want to apply for each value
2138
CHANGE_COLS_VALS = {"diagnosis": [('N', 0)]}
2239

40+
# list of columns to be normalized
41+
NORMALIZED_COLUMNS = []
42+
2343
def __init__(self):
24-
super(MockSubjectsLoader, self).__init__(columns=MockSubjectsLoader.DEFAULT_COLUMNS)
25-
self.read(filename=MockSubjectsLoader.FILENAME, **{"features_drop_names": MockSubjectsLoader.FEATURES_DROP_NAMES,
26-
"names": MockSubjectsLoader.NAMES,
27-
"drop_na": MockSubjectsLoader.DROP_NA,
28-
"change_col_vals": MockSubjectsLoader.CHANGE_COLS_VALS})
44+
super(MockSubjectsLoader, self).__init__(columns=MockSubjectsLoader.COLUMNS_TYPES)
45+
self.read(filename=MockSubjectsLoader.FILENAME,
46+
**{"features_drop_names": MockSubjectsLoader.FEATURES_DROP_NAMES,
47+
"names": MockSubjectsLoader.NAMES,
48+
"drop_na": MockSubjectsLoader.DROP_NA,
49+
"change_col_vals": MockSubjectsLoader.CHANGE_COLS_VALS,
50+
"column_normalization": MockSubjectsLoader.NORMALIZED_COLUMNS})

0 commit comments

Comments
 (0)