From 374160e5c4c0f7ea630e823a3a8496767ccf0d3d Mon Sep 17 00:00:00 2001 From: whning Date: Fri, 5 Jun 2026 01:41:08 +0800 Subject: [PATCH 1/3] Fix Yahoo date parsing with mixed timezone formats (#2014) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit YahooCollectorCN1d.download_index_data calls pd.to_datetime(df['date']) on timestamps from East Money API that may include timezone offsets (e.g. ' 09:30:00+08:00'). Similarly, YahooNormalize.normalize_yahoo calls pd.to_datetime(df.index) on date strings from Yahoo Finance. Without specifying format='mixed', pd.to_datetime raises when it encounters mixed formats in the same column — some entries are date-only strings while others carry timezone info. Adding format='mixed' lets pandas infer the format per-entry and parse them correctly. --- scripts/data_collector/yahoo/collector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py index 82660f1112b..1dc4f07ed89 100644 --- a/scripts/data_collector/yahoo/collector.py +++ b/scripts/data_collector/yahoo/collector.py @@ -240,7 +240,7 @@ def download_index_data(self): logger.warning(f"get {_index_name} error: {e}") continue df.columns = ["date", "open", "close", "high", "low", "volume", "money", "change"] - df["date"] = pd.to_datetime(df["date"]) + df["date"] = pd.to_datetime(df["date"], format="mixed") df = df.astype(float, errors="ignore") df["adjclose"] = df["close"] df["symbol"] = f"sh{_index_code}" @@ -392,7 +392,7 @@ def normalize_yahoo( columns = copy.deepcopy(YahooNormalize.COLUMNS) df = df.copy() df.set_index(date_field_name, inplace=True) - df.index = pd.to_datetime(df.index) + df.index = pd.to_datetime(df.index, format="mixed") df.index = df.index.tz_localize(None) df = df[~df.index.duplicated(keep="first")] if calendar_list is not None: From 216028fab1b8e3f4c1f94cadd6d6690bedc5cb49 Mon Sep 17 00:00:00 2001 From: whning 
Date: Fri, 5 Jun 2026 01:50:10 +0800 Subject: [PATCH 2/3] Centralize instrument name normalization in code_to_fname (#2053) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The root cause is that code_to_fname() — the central normalization function for instrument names — doesn't normalize case before returning. Individual consumers (FileFeatureStorage, LocalPITProvider) each add ad-hoc .lower() calls, but any code path that skips them uses the original case, causing path mismatches on case-sensitive filesystems (Linux ext4). Fix: 1. Normalize to lowercase in code_to_fname() — the single source of truth for instrument-to-filename conversion. 2. Remove the redundant instrument.lower() from FileFeatureStorage (instrument already flows through code_to_fname in LocalFeatureProvider). 3. Remove the redundant instrument.lower() from LocalPITProvider (same reason). This is a no-op on Windows/macOS (case-insensitive FS) and fixes FileNotFoundError on Linux when callers pass mixed-case instrument names. 
--- qlib/data/data.py | 4 ++-- qlib/data/storage/file_storage.py | 4 +++- qlib/utils/__init__.py | 14 ++++++++++---- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/qlib/data/data.py b/qlib/data/data.py index aba75c0b1ab..9b7e44590e5 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -779,8 +779,8 @@ def period_feature(self, instrument, field, start_index, end_index, cur_time, pe if not field.endswith("_q") and not field.endswith("_a"): raise ValueError("period field must ends with '_q' or '_a'") quarterly = field.endswith("_q") - index_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.index" - data_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.data" + index_path = C.dpm.get_data_uri() / "financial" / instrument / f"{field}.index" + data_path = C.dpm.get_data_uri() / "financial" / instrument / f"{field}.data" if not (index_path.exists() and data_path.exists()): raise FileNotFoundError("No file is found.") # NOTE: The most significant performance loss is here. diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index e2bc5c3679a..10ce5741a05 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -286,7 +286,9 @@ class FileFeatureStorage(FileStorageMixin, FeatureStorage): def __init__(self, instrument: str, field: str, freq: str, provider_uri: dict = None, **kwargs): super(FileFeatureStorage, self).__init__(instrument, field, freq, **kwargs) self._provider_uri = None if provider_uri is None else C.DataPathManager.format_provider_uri(provider_uri) - self.file_name = f"{instrument.lower()}/{field.lower()}.{freq.lower()}.bin" + # NOTE: instrument case is normalized by code_to_fname() before reaching here. + # freq/field are also normalized to lowercase for path consistency. 
+ self.file_name = f"{instrument}/{field.lower()}.{freq.lower()}.bin" def clear(self): with self.uri.open("wb") as _: diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 2a94ebd555b..775ac66794f 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -916,10 +916,16 @@ def code_to_fname(code: str): replace_names += [f"LPT{i}" for i in range(10)] prefix = "_qlib_" - if str(code).upper() in replace_names: - code = prefix + str(code) - - return code + code = str(code) + if code.upper() in replace_names: + code = prefix + code + + # Normalize to lowercase for case-insensitive file paths. + # All file-based storage (FileFeatureStorage, FileInstrumentStorage, etc.) + # assumes lowercase paths internally, but not all callers pre-normalize. + # Centralizing the normalization here prevents path mismatches on + # case-sensitive filesystems (e.g., Linux ext4). + return code.lower() def fname_to_code(fname: str): From b21627ee91633bcf00cdea8a842a27462bd3a607 Mon Sep 17 00:00:00 2001 From: whning Date: Fri, 5 Jun 2026 01:50:49 +0800 Subject: [PATCH 3/3] Fix ParallelExt AttributeError on _backend_args with joblib >=1.5.0 (#1927) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing code used joblib.__version__ to decide whether to write maxtasksperchild to _backend_args (joblib <1.5.0) or _backend_kwargs (joblib >=1.5.0). However: 1. The version check fails on local joblib forks or patched installs. 2. The attribute may not exist yet during Parallel.__init__() depending on which backend is selected and how far initialization has progressed. Fix: use hasattr() to probe for _backend_kwargs first (joblib >=1.5.0), then fall back to _backend_args (legacy joblib). This is both more robust and simpler — no version string parsing needed. Also removed the now-unused `import joblib` statement. 
--- qlib/utils/paral.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/qlib/utils/paral.py b/qlib/utils/paral.py index a6177833413..0efa68ddfa0 100644 --- a/qlib/utils/paral.py +++ b/qlib/utils/paral.py @@ -6,7 +6,6 @@ from threading import Thread from typing import Callable, Text, Union -import joblib from joblib import Parallel, delayed from joblib._parallel_backends import MultiprocessingBackend import pandas as pd @@ -22,12 +21,15 @@ def __init__(self, *args, **kwargs): maxtasksperchild = kwargs.pop("maxtasksperchild", None) super(ParallelExt, self).__init__(*args, **kwargs) if isinstance(self._backend, MultiprocessingBackend): - # 2025-05-04 joblib released version 1.5.0, in which _backend_args was removed and replaced by _backend_kwargs. + # 2025-05-04 joblib released version 1.5.0, in which _backend_args was + # removed and replaced by _backend_kwargs. # Ref: https://github.com/joblib/joblib/pull/1525/files#diff-e4dff8042ce45b443faf49605b75a58df35b8c195978d4a57f4afa695b406bdc - if joblib.__version__ < "1.5.0": - self._backend_args["maxtasksperchild"] = maxtasksperchild # pylint: disable=E1101 - else: + # Probe with hasattr for robustness: in some joblib versions the + # attribute may not exist yet during __init__. + if hasattr(self, "_backend_kwargs"): self._backend_kwargs["maxtasksperchild"] = maxtasksperchild # pylint: disable=E1101 + elif hasattr(self, "_backend_args"): + self._backend_args["maxtasksperchild"] = maxtasksperchild # pylint: disable=E1101 def datetime_groupby_apply(