"""
Utility class that loads the mocksubjects.csv dataset
and performs various transformations and cleaning
operations on it.
"""
6+
17from pathlib import Path
28from src .datasets .dataset_wrapper import PandasDSWrapper
39
410
class MockSubjectsLoader(PandasDSWrapper):
    """
    The class MockSubjectsLoader. Loads the mocksubjects.csv dataset.

    Every loading option is declared as a class-level constant below and
    is forwarded to ``read`` when an instance is constructed, so creating
    a ``MockSubjectsLoader()`` triggers the load immediately.
    """

    # Path to the dataset file.
    # NOTE(review): this is a relative path, so it presumably resolves
    # against the current working directory -- confirm against ``read``.
    FILENAME = Path("../../data/mocksubjects.csv")

    # The assumed column types. We use this map to cast
    # the types of the columns.
    COLUMNS_TYPES = {
        "gender": str,
        "ethnicity": str,
        "education": int,
        "salary": int,
        "diagnosis": int,
        "preventative_treatment": str,
        "mutation_status": int,
    }

    # Features to drop.
    FEATURES_DROP_NAMES = ["NHSno", "given_name", "surname", "dob"]

    # Names of the columns in the dataset.
    NAMES = [
        "NHSno", "given_name", "surname", "gender",
        "dob", "ethnicity", "education", "salary",
        "mutation_status", "preventative_treatment", "diagnosis",
    ]

    # Option to drop NaN.
    DROP_NA = True

    # Map that holds, for each column, the transformations
    # we want to apply to its values.
    # NOTE(review): ('N', 0) presumably means "replace 'N' with 0" --
    # confirm against the wrapper's ``read`` implementation.
    CHANGE_COLS_VALS = {"diagnosis": [('N', 0)]}

    # List of columns to be normalized (none at the moment).
    NORMALIZED_COLUMNS = []

    def __init__(self):
        """Initialise the wrapper with the column types and read the dataset."""
        super().__init__(columns=MockSubjectsLoader.COLUMNS_TYPES)

        # Plain keyword arguments are equivalent to unpacking a dict
        # literal whose keys are these same names.
        self.read(
            filename=MockSubjectsLoader.FILENAME,
            features_drop_names=MockSubjectsLoader.FEATURES_DROP_NAMES,
            names=MockSubjectsLoader.NAMES,
            drop_na=MockSubjectsLoader.DROP_NA,
            change_col_vals=MockSubjectsLoader.CHANGE_COLS_VALS,
            column_normalization=MockSubjectsLoader.NORMALIZED_COLUMNS,
        )
0 commit comments