55import numpy as np
66
77from src .preprocessor .cleanup_utils import read_csv , replace , change_column_types
8+ from src .exceptions .exceptions import InvalidDataTypeException
89
910DS = TypeVar ("DS" )
1011HierarchyBase = TypeVar ('HierarchyBase' )
@@ -41,7 +42,7 @@ def __init__(self, columns: dir) -> None:
4142
4243 # map that holds the hierarchy to be applied
4344 # on each column in the dataset
44- self .column_hierarchy = {}
45+ # self.column_hierarchy = {}
4546
4647 @property
4748 def n_rows (self ) -> int :
@@ -63,6 +64,14 @@ def n_columns(self) -> int:
def schema(self) -> dict:
    """
    Build the table schema of the underlying dataset
    :return: the dictionary produced by pandas' build_table_schema
    for self.ds
    """
    table_schema = pd.io.json.build_table_schema(self.ds)
    return table_schema
6566
def save_to_csv(self, filename: Path) -> None:
    """
    Write the wrapped dataset (self.ds) to disk as a csv file
    :param filename: destination handed to DataFrame.to_csv
    :return: None
    """
    self.ds.to_csv(path_or_buf=filename)
74+
6675 def read (self , filename : Path , ** options ) -> None :
6776 """
6877 Load a data set from a file
@@ -82,6 +91,25 @@ def read(self, filename: Path, **options) -> None:
8291 # try to cast to the data types
8392 self .ds = change_column_types (ds = self .ds , column_types = self .columns )
8493
def normalize_column(self, column_name) -> None:
    r"""
    Normalizes the column with the given name using the following
    transformation:

    z_i = \frac{x_i - min(x)}{max(x) - min(x)}

    if the column is not of numeric type then this function
    throws an InvalidDataTypeException
    :param column_name: name of the column; looked up in self.columns
    to find its declared type
    :return: None
    :raises InvalidDataTypeException: if the declared type of the column
    is neither int nor float
    :raises NotImplementedError: always, for numeric columns, since the
    transformation itself is not implemented yet
    """

    data_type = self.columns[column_name]

    # Both identity checks must fail before the column is rejected.
    # Joining them with "or" is always true, because no type can be
    # int and float at the same time.
    if data_type is not int and data_type is not float:
        raise InvalidDataTypeException(param_name=column_name, param_types="[int, float]")

    raise NotImplementedError("Function is not implemented")
112+
85113 def sample_column_name (self ) -> str :
86114 """
87115 Samples a name from the columns
@@ -98,18 +126,23 @@ def set_columns_to_type(self, col_name_types) -> None:
98126 """
99127 self .ds .astype (dtype = col_name_types )
100128
101- def attach_column_hierarchy (self , col_name : str , hierarchy : HierarchyBase ):
102- self .column_hierarchy [col_name ] = hierarchy
103-
def get_column(self, col_name: str):
    """
    Select a single column of the underlying dataset by its label
    :param col_name: label of the column to select
    :return: all rows of self.ds for the given column label
    """
    selected = self.ds.loc[:, col_name]
    return selected
106136
def get_column_unique_values(self, col_name: str):
    """
    Collect the distinct values found in the given column
    :param col_name: label of the column to inspect
    :return: result of pd.unique applied to the flattened column values
    """
    flat_values = self.get_column(col_name=col_name).values.ravel()
    return pd.unique(flat_values)
113146
def get_columns_types(self):
    """
    List the data type of every column in the underlying dataset
    :return: list with the entries of self.ds.dtypes, in column order
    """
    column_dtypes = self.ds.dtypes
    return list(column_dtypes)
@@ -136,7 +169,7 @@ def apply_column_transform(self, column_name: str, transform: Transform) -> None
136169
137170 # get the column
138171 column = self .get_column (col_name = column_name )
139- column = transform .act (** {"data" : column })
172+ column = transform .act (** {"data" : column . values })
140173 self .ds [transform .column_name ] = column
141174
142175
0 commit comments