diff --git a/qlib/backtest/__init__.py b/qlib/backtest/__init__.py index 9daba911533..344ba7f4dc9 100644 --- a/qlib/backtest/__init__.py +++ b/qlib/backtest/__init__.py @@ -116,6 +116,7 @@ def create_account_instance( benchmark: Optional[str], account: Union[float, int, dict], pos_type: str = "Position", + freq: str = "day", ) -> Account: """ # TODO: is very strange pass benchmark_config in the account (maybe for report) @@ -148,6 +149,8 @@ def create_account_instance( ... pos_type: str Postion type. + freq: str + trading frequency, passed through to Account for report metrics. """ if isinstance(account, (int, float)): init_cash = account @@ -162,6 +165,7 @@ def create_account_instance( init_cash=init_cash, position_dict=position_dict, pos_type=pos_type, + freq=freq, benchmark_config=( {} if benchmark is None @@ -196,6 +200,7 @@ def get_strategy_executor( benchmark=benchmark, account=account, pos_type=pos_type, + freq=exchange_kwargs.get("freq", "day"), ) exchange_kwargs = copy.copy(exchange_kwargs) diff --git a/qlib/data/data.py b/qlib/data/data.py index aba75c0b1ab..9b7e44590e5 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -779,8 +779,8 @@ def period_feature(self, instrument, field, start_index, end_index, cur_time, pe if not field.endswith("_q") and not field.endswith("_a"): raise ValueError("period field must ends with '_q' or '_a'") quarterly = field.endswith("_q") - index_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.index" - data_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.data" + index_path = C.dpm.get_data_uri() / "financial" / instrument / f"{field}.index" + data_path = C.dpm.get_data_uri() / "financial" / instrument / f"{field}.data" if not (index_path.exists() and data_path.exists()): raise FileNotFoundError("No file is found.") # NOTE: The most significant performance loss is here. diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index e2bc5c3679a..10ce5741a05 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -286,7 +286,9 @@ class FileFeatureStorage(FileStorageMixin, FeatureStorage): def __init__(self, instrument: str, field: str, freq: str, provider_uri: dict = None, **kwargs): super(FileFeatureStorage, self).__init__(instrument, field, freq, **kwargs) self._provider_uri = None if provider_uri is None else C.DataPathManager.format_provider_uri(provider_uri) - self.file_name = f"{instrument.lower()}/{field.lower()}.{freq.lower()}.bin" + # NOTE: instrument case is normalized by code_to_fname() before reaching here. + # freq/field are also normalized to lowercase for path consistency. + self.file_name = f"{instrument}/{field.lower()}.{freq.lower()}.bin" def clear(self): with self.uri.open("wb") as _: diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 2a94ebd555b..775ac66794f 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -916,10 +916,16 @@ def code_to_fname(code: str): replace_names += [f"LPT{i}" for i in range(10)] prefix = "_qlib_" - if str(code).upper() in replace_names: - code = prefix + str(code) - - return code + code = str(code) + if code.upper() in replace_names: + code = prefix + code + + # Normalize to lowercase for case-insensitive file paths. + # All file-based storage (FileFeatureStorage, FileInstrumentStorage, etc.) + # assumes lowercase paths internally, but not all callers pre-normalize. + # Centralizing the normalization here prevents path mismatches on + # case-sensitive filesystems (e.g., Linux ext4). + return code.lower() def fname_to_code(fname: str): diff --git a/qlib/utils/paral.py b/qlib/utils/paral.py index a6177833413..0efa68ddfa0 100644 --- a/qlib/utils/paral.py +++ b/qlib/utils/paral.py @@ -6,7 +6,6 @@ from threading import Thread from typing import Callable, Text, Union -import joblib from joblib import Parallel, delayed from joblib._parallel_backends import MultiprocessingBackend import pandas as pd @@ -22,12 +21,15 @@ def __init__(self, *args, **kwargs): maxtasksperchild = kwargs.pop("maxtasksperchild", None) super(ParallelExt, self).__init__(*args, **kwargs) if isinstance(self._backend, MultiprocessingBackend): - # 2025-05-04 joblib released version 1.5.0, in which _backend_args was removed and replaced by _backend_kwargs. + # 2025-05-04 joblib released version 1.5.0, in which _backend_args was + # removed and replaced by _backend_kwargs. # Ref: https://github.com/joblib/joblib/pull/1525/files#diff-e4dff8042ce45b443faf49605b75a58df35b8c195978d4a57f4afa695b406bdc - if joblib.__version__ < "1.5.0": - self._backend_args["maxtasksperchild"] = maxtasksperchild # pylint: disable=E1101 - else: + # Use getattr/hasattr for robustness: in some joblib versions the + # attribute may not exist yet during __init__. + if hasattr(self, "_backend_kwargs"): self._backend_kwargs["maxtasksperchild"] = maxtasksperchild # pylint: disable=E1101 + elif hasattr(self, "_backend_args"): + self._backend_args["maxtasksperchild"] = maxtasksperchild # pylint: disable=E1101 def datetime_groupby_apply( diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py index 82660f1112b..1dc4f07ed89 100644 --- a/scripts/data_collector/yahoo/collector.py +++ b/scripts/data_collector/yahoo/collector.py @@ -240,7 +240,7 @@ def download_index_data(self): logger.warning(f"get {_index_name} error: {e}") continue df.columns = ["date", "open", "close", "high", "low", "volume", "money", "change"] - df["date"] = pd.to_datetime(df["date"]) + df["date"] = pd.to_datetime(df["date"], format="mixed") df = df.astype(float, errors="ignore") df["adjclose"] = df["close"] df["symbol"] = f"sh{_index_code}" @@ -392,7 +392,7 @@ def normalize_yahoo( columns = copy.deepcopy(YahooNormalize.COLUMNS) df = df.copy() df.set_index(date_field_name, inplace=True) - df.index = pd.to_datetime(df.index) + df.index = pd.to_datetime(df.index, format="mixed") df.index = df.index.tz_localize(None) df = df[~df.index.duplicated(keep="first")] if calendar_list is not None: