torch data utils added, torch is an optional dependency of the module

cosmic-cortex · cosmic-cortex · commit 87f64be9678e · 2021-11-29T11:05:27.000+01:00
diff --git a/modAL/utils/data.py b/modAL/utils/data.py
@@ -4,6 +4,12 @@
 import pandas as pd
 import scipy.sparse as sp
 
+try:
+    import torch
+except ImportError:  # torch is an optional dependency; without it the name stays unbound
+    pass
+
+
 modALinput = Union[sp.csr_matrix, pd.DataFrame, np.ndarray, list]
 
 
@@ -26,7 +32,13 @@ def data_vstack(blocks: Sequence[modALinput]) -> modALinput:
     elif isinstance(blocks[0], list):
         return np.concatenate(blocks).tolist()
 
-    raise TypeError('%s datatype is not supported' % type(blocks[0]))
+    try:
+        if torch.is_tensor(blocks[0]):
+            return torch.cat(blocks)
+    except NameError:  # torch is optional; the name is unbound when it is not installed
+        pass
+
+    raise TypeError("%s datatype is not supported" % type(blocks[0]))
 
 
 def data_hstack(blocks: Sequence[modALinput]) -> modALinput:
@@ -48,7 +60,13 @@ def data_hstack(blocks: Sequence[modALinput]) -> modALinput:
     elif isinstance(blocks[0], list):
         return np.hstack(blocks).tolist()
 
-    TypeError('%s datatype is not supported' % type(blocks[0]))
+    try:
+        if torch.is_tensor(blocks[0]):
+            return torch.cat(blocks, dim=1)
+    except NameError:  # torch is optional; the name is unbound when it is not installed
+        pass
+
+    raise TypeError("%s datatype is not supported" % type(blocks[0]))
 
 
 def add_row(X: modALinput, row: modALinput):
@@ -68,8 +86,9 @@ def add_row(X: modALinput, row: modALinput):
     return data_vstack([X, row])
 
 
-def retrieve_rows(X: modALinput,
-                  I: Union[int, List[int], np.ndarray]) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame]:
+def retrieve_rows(
+    X: modALinput, I: Union[int, List[int], np.ndarray]
+) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame]:
     """
     Returns the rows I from the data set X
 
@@ -78,34 +97,34 @@ def retrieve_rows(X: modALinput,
     * pandas series in case of a pandas data frame
     * row in case of list or numpy format
     """
-    if sp.issparse(X):
-        # Out of the sparse matrix formats (sp.csc_matrix, sp.csr_matrix, sp.bsr_matrix,
-        # sp.lil_matrix, sp.dok_matrix, sp.coo_matrix, sp.dia_matrix), only sp.bsr_matrix, sp.coo_matrix
-        # and sp.dia_matrix don't support indexing and need to be converted to a sparse format
-        # that does support indexing. It seems conversion to CSR is currently most efficient.
-
-        try:
-            return X[I]
-        except:
-            sp_format = X.getformat()
-            return X.tocsr()[I].asformat(sp_format)
-    elif isinstance(X, pd.DataFrame):
-        return X.iloc[I]
-    elif isinstance(X, list):
-        return np.array(X)[I].tolist()
-    elif isinstance(X, dict):
-        X_return = {}
-        for key, value in X.items():
-            X_return[key] = retrieve_rows(value, I)
-        return X_return
-    elif isinstance(X, np.ndarray):
-        return X[I]
-
-    raise TypeError('%s datatype is not supported' % type(X))
 
+    if isinstance(X, pd.DataFrame):
+        # X[I] on a data frame selects by column label, not by row position
+        return X.iloc[I]
+    elif isinstance(X, dict):
+        X_return = {}
+        for key, value in X.items():
+            X_return[key] = retrieve_rows(value, I)
+        return X_return
 
-def drop_rows(X: modALinput,
-              I: Union[int, List[int], np.ndarray]) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame]:
+    try:
+        # numpy arrays, torch tensors, indexable sparse formats, lists with a scalar index
+        return X[I]
+    except (TypeError, NotImplementedError):
+        if sp.issparse(X):
+            # sp.bsr_matrix, sp.coo_matrix and sp.dia_matrix don't support indexing:
+            # index through CSR (seems most efficient) and convert back afterwards.
+            sp_format = X.getformat()
+            return X.tocsr()[I].asformat(sp_format)
+        elif isinstance(X, list):
+            return np.array(X)[I].tolist()
+
+    raise TypeError("%s datatype is not supported" % type(X))
+
+
+def drop_rows(
+    X: modALinput, I: Union[int, List[int], np.ndarray]
+) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame]:
     """
     Returns X without the row(s) at index/indices I
     """
@@ -120,7 +139,13 @@ def drop_rows(X: modALinput,
     elif isinstance(X, list):
         return np.delete(X, I, axis=0).tolist()
 
-    raise TypeError('%s datatype is not supported' % type(X))
+    try:
+        if torch.is_tensor(X):  # keep every row position that np.delete leaves in place
+            return X[torch.as_tensor(np.delete(np.arange(X.size(0)), I))]
+    except NameError:  # torch is optional; the name is unbound when it is not installed
+        pass
+
+    raise TypeError("%s datatype is not supported" % type(X))
 
 
 def enumerate_data(X: modALinput):
@@ -141,17 +166,18 @@ def enumerate_data(X: modALinput):
         # numpy arrays and lists can readily be enumerated
         return enumerate(X)
 
-    raise TypeError('%s datatype is not supported' % type(X))
+    raise TypeError("%s datatype is not supported" % type(X))
 
 
 def data_shape(X: modALinput):
     """
     Returns the shape of the data set X
     """
-    if sp.issparse(X) or isinstance(X, pd.DataFrame) or isinstance(X, np.ndarray):
-        # scipy.sparse, pandas and numpy all support .shape
+    try:
+        # scipy.sparse, torch, pandas and numpy all support .shape
         return X.shape
-    elif isinstance(X, list):
-        return np.array(X).shape
+    except AttributeError:  # no .shape attribute: only plain lists are handled below
+        if isinstance(X, list):
+            return np.array(X).shape
 
-    raise TypeError('%s datatype is not supported' % type(X))
+    raise TypeError("%s datatype is not supported" % type(X))
diff --git a/tests/core_tests.py b/tests/core_tests.py
@@ -189,6 +189,13 @@ def test_data_vstack(self):
                 np.concatenate((a, b))
             )
 
+            # torch.Tensors: data_vstack is expected to match torch.cat of the same blocks
+            a, b = torch.ones(2, 2), torch.ones(2, 2)
+            torch.testing.assert_allclose(
+                modAL.utils.data.data_vstack((a, b)),
+                torch.cat((a, b))
+            )
+
         # not supported formats
         self.assertRaises(TypeError, modAL.utils.data.data_vstack, (1, 1))