From 3902ddef978ffbbe71704da1b053b5a95bd874d5 Mon Sep 17 00:00:00 2001
From: whning <whning@zju.edu.cn>
Date: Sat, 6 Jun 2026 02:28:43 +0800
Subject: [PATCH] Fix mutable default arguments across qlib data layer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Merge 5 individual fixes into one module-level PR:

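1. data.py (10 occurrences) — list/dict default arguments in DatasetProvider,
   LocalCalendarProvider, LocalInstrumentProvider, LocalFeatureProvider and
   the dataset()/features() methods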
2. cache.py (11 occurrences) — `inst_processors=[]` list default in
   DatasetCache, DatasetURICache and the other dataset-cache methods
3. processor.py — mutable list defaults in DropCol, FilterCol
4. utils/index_data.py — mutable list defaults (`data`, `index`) in SingleData.__init__
5. backtest/high_performance_ds.py — mutable dict default in PandasSingleMetric

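All follow the standard Python pattern: replace the mutable default with None
and add an `if arg is None: arg = []` guard (`arg = {}` for dict defaults)
inside the function body. The base-class hooks DatasetCache._dataset and
DatasetCache._dataset_uri only change their signature; no guard is added there.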
---
 qlib/backtest/high_performance_ds.py |  4 ++-
 qlib/data/cache.py                   | 41 ++++++++++++++++++++--------
 qlib/data/data.py                    | 40 ++++++++++++++++++++-------
 qlib/data/dataset/processor.py       |  8 ++++--
 qlib/utils/index_data.py             | 12 ++++++--
 5 files changed, 78 insertions(+), 27 deletions(-)

diff --git a/qlib/backtest/high_performance_ds.py b/qlib/backtest/high_performance_ds.py
index f149f13dd5c..3f716be45ec 100644
--- a/qlib/backtest/high_performance_ds.py
+++ b/qlib/backtest/high_performance_ds.py
@@ -508,7 +508,9 @@ def __len__(self):
 class PandasSingleMetric(SingleMetric):
     """Each SingleMetric is based on pd.Series."""
 
-    def __init__(self, metric: Union[dict, pd.Series] = {}):
+    def __init__(self, metric: Union[dict, pd.Series] | None = None):
+        if metric is None:
+            metric = {}
         if isinstance(metric, dict):
             self.metric = pd.Series(metric)
         elif isinstance(metric, pd.Series):
diff --git a/qlib/data/cache.py b/qlib/data/cache.py
index fbf6e839db1..7c7707866b1 100644
--- a/qlib/data/cache.py
+++ b/qlib/data/cache.py
@@ -389,7 +389,7 @@ class DatasetCache(BaseProviderCache):
     HDF_KEY = "df"
 
     def dataset(
-        self, instruments, fields, start_time=None, end_time=None, freq="day", disk_cache=1, inst_processors=[]
+        self, instruments, fields, start_time=None, end_time=None, freq="day", disk_cache=1, inst_processors=None
     ):
         """Get feature dataset.
 
@@ -399,6 +399,8 @@ def dataset(
             read-write conflicts will not be triggered
             but client readers are not considered.
         """
+        if inst_processors is None:
+            inst_processors = []
         if disk_cache == 0:
             # skip cache
             return self.provider.dataset(
@@ -423,7 +425,7 @@ def _uri(self, instruments, fields, start_time, end_time, freq, **kwargs):
         raise NotImplementedError("Implement this function to match your own cache mechanism")
 
     def _dataset(
-        self, instruments, fields, start_time=None, end_time=None, freq="day", disk_cache=1, inst_processors=[]
+        self, instruments, fields, start_time=None, end_time=None, freq="day", disk_cache=1, inst_processors=None
     ):
         """Get feature dataset using cache.
 
@@ -432,7 +434,7 @@ def _dataset(
         raise NotImplementedError("Implement this method if you want to use dataset feature cache")
 
     def _dataset_uri(
-        self, instruments, fields, start_time=None, end_time=None, freq="day", disk_cache=1, inst_processors=[]
+        self, instruments, fields, start_time=None, end_time=None, freq="day", disk_cache=1, inst_processors=None
     ):
         """Get a uri of feature dataset using cache.
         specially:
@@ -653,7 +655,9 @@ def __init__(self, provider, **kwargs):
         self.remote = kwargs.get("remote", False)
 
     @staticmethod
-    def _uri(instruments, fields, start_time, end_time, freq, disk_cache=1, inst_processors=[], **kwargs):
+    def _uri(instruments, fields, start_time, end_time, freq, disk_cache=1, inst_processors=None, **kwargs):
+        if inst_processors is None:
+            inst_processors = []
         return hash_args(*DatasetCache.normalize_uri_args(instruments, fields, freq), disk_cache, inst_processors)
 
     def get_cache_dir(self, freq: str = None) -> Path:
@@ -694,8 +698,10 @@ def read_data_from_cache(cls, cache_path: Union[str, Path], start_time, end_time
         return df
 
     def _dataset(
-        self, instruments, fields, start_time=None, end_time=None, freq="day", disk_cache=0, inst_processors=[]
+        self, instruments, fields, start_time=None, end_time=None, freq="day", disk_cache=0, inst_processors=None
     ):
+        if inst_processors is None:
+            inst_processors = []
         if disk_cache == 0:
             # In this case, data_set cache is configured but will not be used.
             return self.provider.dataset(
@@ -748,8 +754,10 @@ def _dataset(
         return features
 
     def _dataset_uri(
-        self, instruments, fields, start_time=None, end_time=None, freq="day", disk_cache=0, inst_processors=[]
+        self, instruments, fields, start_time=None, end_time=None, freq="day", disk_cache=0, inst_processors=None
     ):
+        if inst_processors is None:
+            inst_processors = []
         if disk_cache == 0:
             # In this case, server only checks the expression cache.
             # The client will load the cache data by itself.
@@ -854,7 +862,7 @@ def build_index_from_data(data, start_index=0):
             index_data += start_index
             return index_data
 
-    def gen_dataset_cache(self, cache_path: Union[str, Path], instruments, fields, freq, inst_processors=[]):
+    def gen_dataset_cache(self, cache_path: Union[str, Path], instruments, fields, freq, inst_processors=None):
         """gen_dataset_cache
 
         .. note:: This function does not consider the cache read write lock. Please
@@ -872,6 +880,9 @@ def gen_dataset_cache(self, cache_path: Union[str, Path], instruments, fields, f
                     1999-11-10 00:00:00     0   1
                     1999-11-11 00:00:00     1   2
                     1999-11-12 00:00:00     2   3
+        """
+        if inst_processors is None:
+            inst_processors = []
                     ...
 
                 .. note:: The start is closed. The end is open!!!!!
@@ -1076,15 +1087,19 @@ def __init__(self, provider):
             f"modify the cache directory via the local_cache_path in the config"
         )
 
-    def _uri(self, instruments, fields, start_time, end_time, freq, disk_cache=1, inst_processors=[], **kwargs):
+    def _uri(self, instruments, fields, start_time, end_time, freq, disk_cache=1, inst_processors=None, **kwargs):
+        if inst_processors is None:
+            inst_processors = []
         instruments, fields, freq = self.normalize_uri_args(instruments, fields, freq)
         return hash_args(
             instruments, fields, start_time, end_time, freq, disk_cache, str(self.local_cache_path), inst_processors
         )
 
     def _dataset(
-        self, instruments, fields, start_time=None, end_time=None, freq="day", disk_cache=1, inst_processors=[]
+        self, instruments, fields, start_time=None, end_time=None, freq="day", disk_cache=1, inst_processors=None
     ):
+        if inst_processors is None:
+            inst_processors = []
         if disk_cache == 0:
             # In this case, data_set cache is configured but will not be used.
             return self.provider.dataset(instruments, fields, start_time, end_time, freq)
@@ -1118,12 +1133,16 @@ def _dataset(
 class DatasetURICache(DatasetCache):
     """Prepared cache mechanism for server."""
 
-    def _uri(self, instruments, fields, start_time, end_time, freq, disk_cache=1, inst_processors=[], **kwargs):
+    def _uri(self, instruments, fields, start_time, end_time, freq, disk_cache=1, inst_processors=None, **kwargs):
+        if inst_processors is None:
+            inst_processors = []
         return hash_args(*self.normalize_uri_args(instruments, fields, freq), disk_cache, inst_processors)
 
     def dataset(
-        self, instruments, fields, start_time=None, end_time=None, freq="day", disk_cache=0, inst_processors=[]
+        self, instruments, fields, start_time=None, end_time=None, freq="day", disk_cache=0, inst_processors=None
     ):
+        if inst_processors is None:
+            inst_processors = []
         if "local" in C.dataset_provider.lower():
             # use LocalDatasetProvider
             return self.provider.dataset(
diff --git a/qlib/data/data.py b/qlib/data/data.py
index aba75c0b1ab..3468977b936 100644
--- a/qlib/data/data.py
+++ b/qlib/data/data.py
@@ -450,7 +450,7 @@ class DatasetProvider(abc.ABC):
     """
 
     @abc.abstractmethod
-    def dataset(self, instruments, fields, start_time=None, end_time=None, freq="day", inst_processors=[]):
+    def dataset(self, instruments, fields, start_time=None, end_time=None, freq="day", inst_processors=None):
         """Get dataset data.
 
         Parameters
@@ -473,6 +473,8 @@ def dataset(self, instruments, fields, start_time=None, end_time=None, freq="day
         pd.DataFrame
             a pandas dataframe with <instrument, datetime> index.
         """
+        if inst_processors is None:
+            inst_processors = []
         raise NotImplementedError("Subclass of DatasetProvider must implement `Dataset` method")
 
     def _uri(
@@ -483,7 +485,7 @@ def _uri(
         end_time=None,
         freq="day",
         disk_cache=1,
-        inst_processors=[],
+        inst_processors=None,
         **kwargs,
     ):
         """Get task uri, used when generating rabbitmq task in qlib_server
@@ -504,6 +506,8 @@ def _uri(
             whether to skip(0)/use(1)/replace(2) disk_cache.
 
         """
+        if inst_processors is None:
+            inst_processors = []
         # TODO: qlib-server support inst_processors
         return DiskDatasetCache._uri(instruments, fields, start_time, end_time, freq, disk_cache, inst_processors)
 
@@ -545,12 +549,14 @@ def parse_fields(fields):
         return [ExpressionD.get_expression_instance(f) for f in fields]
 
     @staticmethod
-    def dataset_processor(instruments_d, column_names, start_time, end_time, freq, inst_processors=[]):
+    def dataset_processor(instruments_d, column_names, start_time, end_time, freq, inst_processors=None):
         """
         Load and process the data, return the data set.
         - default using multi-kernel method.
 
         """
+        if inst_processors is None:
+            inst_processors = []
         normalize_column_names = normalize_cache_fields(column_names)
         # One process for one task, so that the memory will be freed quicker.
         workers = max(min(C.get_kernels(freq), len(instruments_d)), 1)
@@ -597,7 +603,7 @@ def dataset_processor(instruments_d, column_names, start_time, end_time, freq, i
         return data
 
     @staticmethod
-    def inst_calculator(inst, start_time, end_time, freq, column_names, spans=None, g_config=None, inst_processors=[]):
+    def inst_calculator(inst, start_time, end_time, freq, column_names, spans=None, g_config=None, inst_processors=None):
         """
         Calculate the expressions for **one** instrument, return a df result.
         If the expression has been calculated before, load from cache.
@@ -605,6 +611,8 @@ def inst_calculator(inst, start_time, end_time, freq, column_names, spans=None,
         return value: A data frame with index 'datetime' and other data columns.
 
         """
+        if inst_processors is None:
+            inst_processors = []
         # FIXME: Windows OS or MacOS using spawn: https://docs.python.org/3.8/library/multiprocessing.html?highlight=spawn#contexts-and-start-methods
         # NOTE: This place is compatible with windows, windows multi-process is spawn
         C.register_from_C(g_config)
@@ -640,7 +648,9 @@ class LocalCalendarProvider(CalendarProvider, ProviderBackendMixin):
     Provide calendar data from local data source.
     """
 
-    def __init__(self, remote=False, backend={}):
+    def __init__(self, remote=False, backend=None):
+        if backend is None:
+            backend = {}
         super().__init__()
         self.remote = remote
         self.backend = backend
@@ -681,7 +691,9 @@ class LocalInstrumentProvider(InstrumentProvider, ProviderBackendMixin):
     Provide instrument data from local data source.
     """
 
-    def __init__(self, backend={}) -> None:
+    def __init__(self, backend=None) -> None:
+        if backend is None:
+            backend = {}
         super().__init__()
         self.backend = backend
 
@@ -729,7 +741,9 @@ class LocalFeatureProvider(FeatureProvider, ProviderBackendMixin):
     Provide feature data from local data source.
     """
 
-    def __init__(self, remote=False, backend={}):
+    def __init__(self, remote=False, backend=None):
+        if backend is None:
+            backend = {}
         super().__init__()
         self.remote = remote
         self.backend = backend
@@ -906,8 +920,10 @@ def dataset(
         start_time=None,
         end_time=None,
         freq="day",
-        inst_processors=[],
+        inst_processors=None,
     ):
+        if inst_processors is None:
+            inst_processors = []
         instruments_d = self.get_instruments_d(instruments, freq)
         column_names = self.get_column_names(fields)
         if self.align_time:
@@ -1046,8 +1062,10 @@ def dataset(
         freq="day",
         disk_cache=0,
         return_uri=False,
-        inst_processors=[],
+        inst_processors=None,
     ):
+        if inst_processors is None:
+            inst_processors = []
         if Inst.get_inst_type(instruments) == Inst.DICT:
             get_module_logger("data").warning(
                 "Getting features from a dict of instruments is not recommended because the features will not be "
@@ -1167,7 +1185,7 @@ def features(
         end_time=None,
         freq="day",
         disk_cache=None,
-        inst_processors=[],
+        inst_processors=None,
     ):
         """
         Parameters
@@ -1180,6 +1198,8 @@ def features(
         and will use provider method if a type error is raised because the DatasetD instance
         is a provider class.
         """
+        if inst_processors is None:
+            inst_processors = []
         disk_cache = C.default_disk_cache if disk_cache is None else disk_cache
         fields = list(fields)  # In case of tuple.
         try:
diff --git a/qlib/data/dataset/processor.py b/qlib/data/dataset/processor.py
index d05dbe381c5..73a1d81657c 100644
--- a/qlib/data/dataset/processor.py
+++ b/qlib/data/dataset/processor.py
@@ -112,7 +112,9 @@ def is_for_infer(self) -> bool:
 
 
 class DropCol(Processor):
-    def __init__(self, col_list=[]):
+    def __init__(self, col_list=None):
+        if col_list is None:
+            col_list = []
         self.col_list = col_list
 
     def __call__(self, df):
@@ -127,7 +129,9 @@ def readonly(self):
 
 
 class FilterCol(Processor):
-    def __init__(self, fields_group="feature", col_list=[]):
+    def __init__(self, fields_group="feature", col_list=None):
+        if col_list is None:
+            col_list = []
         self.fields_group = fields_group
         self.col_list = col_list
 
diff --git a/qlib/utils/index_data.py b/qlib/utils/index_data.py
index c707240d098..5100fbb5c83 100644
--- a/qlib/utils/index_data.py
+++ b/qlib/utils/index_data.py
@@ -528,19 +528,25 @@ def values(self):
 
 class SingleData(IndexData):
     def __init__(
-        self, data: Union[int, float, np.number, list, dict, pd.Series] = [], index: Union[List, pd.Index, Index] = []
+        self,
+        data: Union[int, float, np.number, list, dict, pd.Series] | None = None,
+        index: Union[List, pd.Index, Index] | None = None,
     ):
         """A data structure of index and numpy data.
         It's used to replace pd.Series due to high-speed.
 
         Parameters
         ----------
-        data : Union[int, float, np.number, list, dict, pd.Series]
+        data : Union[int, float, np.number, list, dict, pd.Series], optional
             the input data
-        index : Union[list, pd.Index]
+        index : Union[list, pd.Index], optional
             the index of data.
             empty list indicates that auto filling the index to the length of data
         """
+        if data is None:
+            data = []
+        if index is None:
+            index = []
         # for special data type
         if isinstance(data, dict):
             assert len(index) == 0