|
5 | 5 | """ |
6 | 6 |
|
7 | 7 | from pathlib import Path |
| 8 | +from typing import List |
| 9 | +from dataclasses import dataclass, field |
| 10 | + |
8 | 11 | from src.datasets.dataset_wrapper import PandasDSWrapper |
9 | 12 |
|
10 | 13 |
|
11 | | -class MockSubjectsLoader(PandasDSWrapper): |
12 | | - """ |
13 | | - The class MockSubjectsLoader. Loads the mocksubjects.csv |
14 | | - """ |
| 14 | +@dataclass(init=True, repr=True) |
| 15 | +class MockSubjectsData(object): |
15 | 16 |
|
16 | 17 | # Path to the dataset file |
17 | | - FILENAME = Path("../../data/mocksubjects.csv") |
| 18 | + FILENAME: Path = Path("../../data/mocksubjects.csv") |
18 | 19 |
|
19 | 20 | # the assumed column types. We use this map to cast |
20 | 21 | # the types of the columns |
21 | | - COLUMNS_TYPES = {"gender": str, "ethnicity": str, "education": int, |
22 | | - "salary": int, "diagnosis": int, "preventative_treatment": str, |
23 | | - "mutation_status": int, } |
| 22 | + COLUMNS_TYPES: dict = field(default_factory=lambda: {"gender": str, "ethnicity": str, "education": int, |
| 23 | + "salary": int, "diagnosis": int, "preventative_treatment": str, |
| 24 | + "mutation_status": int,}) |
24 | 25 |
|
25 | 26 | # features to drop |
26 | | - FEATURES_DROP_NAMES = ["NHSno", "given_name", "surname", "dob"] |
| 27 | + FEATURES_DROP_NAMES: List[str] = field(default_factory=lambda: ["NHSno", "given_name", "surname", "dob"]) |
27 | 28 |
|
28 | 29 | # Names of the columns in the dataset |
29 | | - NAMES = ["NHSno", "given_name", "surname", "gender", |
30 | | - "dob", "ethnicity", "education", "salary", |
31 | | - "mutation_status", "preventative_treatment", "diagnosis"] |
| 30 | + NAMES: List[str] = field(default_factory=lambda: ["NHSno", "given_name", "surname", "gender", |
| 31 | + "dob", "ethnicity", "education", "salary", |
| 32 | + "mutation_status", "preventative_treatment", "diagnosis"]) |
32 | 33 |
|
33 | 34 | # option to drop NaN |
34 | | - DROP_NA = True |
| 35 | + DROP_NA: bool = True |
35 | 36 |
|
36 | 37 | # Map that holds for each column the transformations |
37 | 38 | # we want to apply for each value |
38 | | - CHANGE_COLS_VALS = {"diagnosis": [('N', 0)]} |
| 39 | + CHANGE_COLS_VALS: dict = field(default_factory=lambda: {"diagnosis": [('N', 0)]}) |
39 | 40 |
|
40 | 41 | # list of columns to be normalized |
41 | | - NORMALIZED_COLUMNS = [] |
42 | | - |
43 | | - def __init__(self): |
44 | | - super(MockSubjectsLoader, self).__init__(columns=MockSubjectsLoader.COLUMNS_TYPES) |
45 | | - self.read(filename=MockSubjectsLoader.FILENAME, |
46 | | - **{"features_drop_names": MockSubjectsLoader.FEATURES_DROP_NAMES, |
47 | | - "names": MockSubjectsLoader.NAMES, |
48 | | - "drop_na": MockSubjectsLoader.DROP_NA, |
49 | | - "change_col_vals": MockSubjectsLoader.CHANGE_COLS_VALS, |
50 | | - "column_normalization": MockSubjectsLoader.NORMALIZED_COLUMNS}) |
| 42 | + NORMALIZED_COLUMNS: List[str] = field(default_factory=list) |
| 43 | + |
| 44 | + |
| 45 | +class MockSubjectsLoader(PandasDSWrapper): |
| 46 | + """The class MockSubjectsLoader. Loads the mocksubjects.csv |
| 47 | + """ |
| 48 | + |
| 49 | + @classmethod |
| 50 | + def from_options(cls, *, filename: Path, |
| 51 | + column_types: dict, features_drop_names: List[str], |
| 52 | + names: List[str], drop_na: bool, change_col_vals: dict, column_normalization: List[str]): |
| 53 | + |
| 54 | + data = MockSubjectsData(FILENAME=filename, COLUMNS_TYPES=column_types, |
| 55 | + FEATURES_DROP_NAMES=features_drop_names, NAMES=names, |
| 56 | + DROP_NA=drop_na, CHANGE_COLS_VALS=change_col_vals, |
| 57 | + NORMALIZED_COLUMNS=column_normalization) |
| 58 | + return cls(data=data) |
| 59 | + |
| 60 | + def __init__(self, data: MockSubjectsData, do_read: bool=True): |
| 61 | + super(MockSubjectsLoader, self).__init__(columns=data.COLUMNS_TYPES) |
| 62 | + |
| 63 | + if do_read: |
| 64 | + self.read(filename=data.FILENAME, |
| 65 | + **{"features_drop_names": data.FEATURES_DROP_NAMES, |
| 66 | + "names": data.NAMES, |
| 67 | + "drop_na": data.DROP_NA, |
| 68 | + "change_col_vals": data.CHANGE_COLS_VALS, |
| 69 | + "column_normalization": data.NORMALIZED_COLUMNS}) |
0 commit comments