From 374160e5c4c0f7ea630e823a3a8496767ccf0d3d Mon Sep 17 00:00:00 2001
From: whning <whning@zju.edu.cn>
Date: Fri, 5 Jun 2026 01:41:08 +0800
Subject: [PATCH 1/3] Fix Yahoo date parsing with mixed timezone formats
 (#2014)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

YahooCollectorCN1d.download_index_data calls pd.to_datetime(df['date'])
on timestamps from East Money API that may include timezone offsets
(e.g. ' 09:30:00+08:00'). Similarly, YahooNormalize.normalize_yahoo calls
pd.to_datetime(df.index) on date strings from Yahoo Finance.

Without specifying format='mixed', pd.to_datetime raises when it encounters
mixed formats in the same column — some entries are date-only strings while
others carry timezone info. Adding format='mixed' lets pandas infer the
format per-entry and parse them correctly.
---
 scripts/data_collector/yahoo/collector.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py
index 82660f1112b..1dc4f07ed89 100644
--- a/scripts/data_collector/yahoo/collector.py
+++ b/scripts/data_collector/yahoo/collector.py
@@ -240,7 +240,7 @@ def download_index_data(self):
                 logger.warning(f"get {_index_name} error: {e}")
                 continue
             df.columns = ["date", "open", "close", "high", "low", "volume", "money", "change"]
-            df["date"] = pd.to_datetime(df["date"])
+            df["date"] = pd.to_datetime(df["date"], format="mixed")
             df = df.astype(float, errors="ignore")
             df["adjclose"] = df["close"]
             df["symbol"] = f"sh{_index_code}"
@@ -392,7 +392,7 @@ def normalize_yahoo(
         columns = copy.deepcopy(YahooNormalize.COLUMNS)
         df = df.copy()
         df.set_index(date_field_name, inplace=True)
-        df.index = pd.to_datetime(df.index)
+        df.index = pd.to_datetime(df.index, format="mixed")
         df.index = df.index.tz_localize(None)
         df = df[~df.index.duplicated(keep="first")]
         if calendar_list is not None:

From 216028fab1b8e3f4c1f94cadd6d6690bedc5cb49 Mon Sep 17 00:00:00 2001
From: whning <whning@zju.edu.cn>
Date: Fri, 5 Jun 2026 01:50:10 +0800
Subject: [PATCH 2/3] Centralize instrument name normalization in code_to_fname
 (#2053)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The root cause is that code_to_fname() — the central normalization
function for instrument names — doesn't normalize case before
returning. Individual consumers (FileFeatureStorage, LocalPITProvider)
each add ad-hoc .lower() calls, but any code path that skips them uses
the original case, causing path mismatches on case-sensitive
filesystems (Linux ext4).

Fix:
1. Normalize to lowercase in code_to_fname() — the single source of
   truth for instrument-to-filename conversion.
2. Remove the redundant instrument.lower() from FileFeatureStorage
   (instrument already flows through code_to_fname in LocalFeatureProvider).
3. Remove the redundant instrument.lower() from LocalPITProvider
   (same reason).

This is a no-op on Windows/macOS (case-insensitive FS) and fixes
FileNotFoundError on Linux when callers pass mixed-case instrument
names.
---
 qlib/data/data.py                 |  4 ++--
 qlib/data/storage/file_storage.py |  4 +++-
 qlib/utils/__init__.py            | 14 ++++++++++----
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/qlib/data/data.py b/qlib/data/data.py
index aba75c0b1ab..9b7e44590e5 100644
--- a/qlib/data/data.py
+++ b/qlib/data/data.py
@@ -779,8 +779,8 @@ def period_feature(self, instrument, field, start_index, end_index, cur_time, pe
         if not field.endswith("_q") and not field.endswith("_a"):
             raise ValueError("period field must ends with '_q' or '_a'")
         quarterly = field.endswith("_q")
-        index_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.index"
-        data_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.data"
+        index_path = C.dpm.get_data_uri() / "financial" / instrument / f"{field}.index"
+        data_path = C.dpm.get_data_uri() / "financial" / instrument / f"{field}.data"
         if not (index_path.exists() and data_path.exists()):
             raise FileNotFoundError("No file is found.")
         # NOTE: The most significant performance loss is here.
diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py
index e2bc5c3679a..10ce5741a05 100644
--- a/qlib/data/storage/file_storage.py
+++ b/qlib/data/storage/file_storage.py
@@ -286,7 +286,9 @@ class FileFeatureStorage(FileStorageMixin, FeatureStorage):
     def __init__(self, instrument: str, field: str, freq: str, provider_uri: dict = None, **kwargs):
         super(FileFeatureStorage, self).__init__(instrument, field, freq, **kwargs)
         self._provider_uri = None if provider_uri is None else C.DataPathManager.format_provider_uri(provider_uri)
-        self.file_name = f"{instrument.lower()}/{field.lower()}.{freq.lower()}.bin"
+        # NOTE: instrument case is normalized by code_to_fname() before reaching here.
+        # freq/field are also normalized to lowercase for path consistency.
+        self.file_name = f"{instrument}/{field.lower()}.{freq.lower()}.bin"
 
     def clear(self):
         with self.uri.open("wb") as _:
diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py
index 2a94ebd555b..775ac66794f 100644
--- a/qlib/utils/__init__.py
+++ b/qlib/utils/__init__.py
@@ -916,10 +916,16 @@ def code_to_fname(code: str):
     replace_names += [f"LPT{i}" for i in range(10)]
 
     prefix = "_qlib_"
-    if str(code).upper() in replace_names:
-        code = prefix + str(code)
-
-    return code
+    code = str(code)
+    if code.upper() in replace_names:
+        code = prefix + code
+
+    # Normalize to lowercase for case-insensitive file paths.
+    # All file-based storage (FileFeatureStorage, FileInstrumentStorage, etc.)
+    # assumes lowercase paths internally, but not all callers pre-normalize.
+    # Centralizing the normalization here prevents path mismatches on
+    # case-sensitive filesystems (e.g., Linux ext4).
+    return code.lower()
 
 
 def fname_to_code(fname: str):

From b21627ee91633bcf00cdea8a842a27462bd3a607 Mon Sep 17 00:00:00 2001
From: whning <whning@zju.edu.cn>
Date: Fri, 5 Jun 2026 01:50:49 +0800
Subject: [PATCH 3/3] Fix ParallelExt AttributeError on _backend_args with
 joblib >=1.5.0 (#1927)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The existing code used joblib.__version__ to decide whether to write
maxtasksperchild to _backend_args (joblib <1.5.0) or _backend_kwargs
(joblib >=1.5.0). However:

1. The version check fails on local joblib forks or patched installs.
2. The attribute may not exist yet during Parallel.__init__() depending
   on which backend is selected and how far initialization has progressed.

Fix: use hasattr() to probe for _backend_kwargs first (joblib >=1.5.0),
then fall back to _backend_args (legacy joblib). This is both more
robust and simpler — no version string parsing needed.

Also removed the now-unused `import joblib` statement.
---
 qlib/utils/paral.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/qlib/utils/paral.py b/qlib/utils/paral.py
index a6177833413..0efa68ddfa0 100644
--- a/qlib/utils/paral.py
+++ b/qlib/utils/paral.py
@@ -6,7 +6,6 @@
 from threading import Thread
 from typing import Callable, Text, Union
 
-import joblib
 from joblib import Parallel, delayed
 from joblib._parallel_backends import MultiprocessingBackend
 import pandas as pd
@@ -22,12 +21,15 @@ def __init__(self, *args, **kwargs):
         maxtasksperchild = kwargs.pop("maxtasksperchild", None)
         super(ParallelExt, self).__init__(*args, **kwargs)
         if isinstance(self._backend, MultiprocessingBackend):
-            # 2025-05-04 joblib released version 1.5.0, in which _backend_args was removed and replaced by _backend_kwargs.
+            # 2025-05-04 joblib released version 1.5.0, in which _backend_args was
+            # removed and replaced by _backend_kwargs.
             # Ref: https://github.com/joblib/joblib/pull/1525/files#diff-e4dff8042ce45b443faf49605b75a58df35b8c195978d4a57f4afa695b406bdc
-            if joblib.__version__ < "1.5.0":
-                self._backend_args["maxtasksperchild"] = maxtasksperchild  # pylint: disable=E1101
-            else:
+            # Use getattr/hasattr for robustness: in some joblib versions the
+            # attribute may not exist yet during __init__.
+            if hasattr(self, "_backend_kwargs"):
                 self._backend_kwargs["maxtasksperchild"] = maxtasksperchild  # pylint: disable=E1101
+            elif hasattr(self, "_backend_args"):
+                self._backend_args["maxtasksperchild"] = maxtasksperchild  # pylint: disable=E1101
 
 
 def datetime_groupby_apply(