[BUG] fix memory leak in `TimeSeriesDataset` by using `@cached_property`, and clean up index construction (#1905)

Vishnu-Rangiah · web-flow · commit e8c8b4614597 · 2025-07-02T11:24:23.000+02:00
#### Reference Issues/PRs

#648

#### What does this implement/fix? Explain your changes.

- Replaced the stacked `@property` / `@lru_cache` decorators with `@cached_property` to fix a self-reference leak: previously, the shared `lru_cache` kept a strong reference to every instance, preventing garbage collection and causing memory growth when many instances were created.
- Improved `_construct_index()` to return only the essential columns in a consistent format (cast to `int32`, with the index reset).
diff --git a/pytorch_forecasting/data/timeseries/_timeseries.py b/pytorch_forecasting/data/timeseries/_timeseries.py
@@ -7,7 +7,7 @@
 """
 
 from copy import copy as _copy, deepcopy
-from functools import lru_cache
+from functools import cached_property
 import inspect
 from typing import Any, Callable, Optional, TypeVar, Union
 import warnings
@@ -812,8 +812,7 @@ def _get_lagged_names(self, name: str) -> dict[str, int]:
         """
         return {f"{name}_lagged_by_{lag}": lag for lag in self._lags.get(name, [])}
 
-    @property
-    @lru_cache(None)
+    @cached_property
     def lagged_variables(self) -> dict[str, str]:
         """Lagged variables.
 
@@ -828,8 +827,7 @@ def lagged_variables(self) -> dict[str, str]:
             vars.update({lag_name: name for lag_name in self._get_lagged_names(name)})
         return vars
 
-    @property
-    @lru_cache(None)
+    @cached_property
     def lagged_targets(self) -> dict[str, str]:
         """Subset of lagged_variables to variables that are lagged targets.
 
@@ -850,8 +848,7 @@ def lagged_targets(self) -> dict[str, str]:
             )
         return vars
 
-    @property
-    @lru_cache(None)
+    @cached_property
     def min_lag(self) -> int:
         """
         Minimum number of time steps variables are lagged.
@@ -865,8 +862,7 @@ def min_lag(self) -> int:
         else:
             return min([min(lag) for lag in self._lags.values()])
 
-    @property
-    @lru_cache(None)
+    @cached_property
     def max_lag(self) -> int:
         """
         Maximum number of time steps variables are lagged.
@@ -983,8 +979,7 @@ def _get_auto_normalizer(self, data_properties: DataProperties) -> TorchNormaliz
             target_normalizer = normalizers[0]
         return target_normalizer
 
-    @property
-    @lru_cache(None)
+    @cached_property
     def _group_ids_mapping(self) -> dict[str, str]:
         """
         Mapping of group id names to group ids used to identify series in dataset -
@@ -995,8 +990,7 @@ def _group_ids_mapping(self) -> dict[str, str]:
         """
         return {name: f"__group_id__{name}" for name in self.group_ids}
 
-    @property
-    @lru_cache(None)
+    @cached_property
     def _group_ids(self) -> list[str]:
         """
         Group ids used to identify series in dataset.
@@ -1487,6 +1481,7 @@ def _to_tensor(cols, long=True) -> torch.Tensor:
             weight=weight,
             time=time,
         )
+
         return tensors
 
     def _check_tensors(self, tensors):
@@ -1568,8 +1563,7 @@ def reals(self) -> list[str]:
             + self._time_varying_unknown_reals
         )
 
-    @property
-    @lru_cache(None)
+    @cached_property
     def target_names(self) -> list[str]:
         """
         List of targets.
@@ -1860,7 +1854,18 @@ def _construct_index(self, data: pd.DataFrame, predict_mode: bool) -> pd.DataFra
         )
         assert len(df_index) > 0, msg
 
-        return df_index
+        minimal_columns = [
+            "index_start",
+            "index_end",
+            "sequence_length",
+            "time",
+            "sequence_id",
+        ]
+        if predict_mode and "sequence_id" in df_index.columns:
+            minimal_columns.append("sequence_id")
+
+        df_index = df_index[minimal_columns].astype("int32", copy=False)
+        return df_index.reset_index(drop=True)
 
     def filter(self, filter_func: Callable, copy: bool = True) -> TimeSeriesDataType:
         """Filter subsequences in dataset.