From cd3e1fbc51e38be3513749b7c1f5063656f1d081 Mon Sep 17 00:00:00 2001 From: Mohammed Kaish Ansari Date: Sat, 14 Mar 2026 16:12:07 +0000 Subject: [PATCH 1/8] feat(data_collector): add GB_ALL to calendar benchmark map and symbol cache --- scripts/data_collector/utils.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index 2b75916989b..ca61a21b9d2 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -38,6 +38,7 @@ "US_ALL": "^GSPC", "IN_ALL": "^NSEI", "BR_ALL": "^BVSP", + "GB_ALL": "^FTSE", } _BENCH_CALENDAR_LIST = None @@ -46,6 +47,7 @@ _US_SYMBOLS = None _IN_SYMBOLS = None _BR_SYMBOLS = None +_GB_SYMBOLS = None _EN_FUND_SYMBOLS = None _CALENDAR_MAP = {} @@ -74,7 +76,12 @@ def _get_calendar(url): calendar = _CALENDAR_MAP.get(bench_code, None) if calendar is None: - if bench_code.startswith("US_") or bench_code.startswith("IN_") or bench_code.startswith("BR_"): + if ( + bench_code.startswith("US_") + or bench_code.startswith("IN_") + or bench_code.startswith("BR_") + or bench_code.startswith("GB_") + ): print(Ticker(CALENDAR_BENCH_URL_MAP[bench_code])) print(Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max")) df = Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max") From 05d4fb19d8b75ab0a0dad474a66d7154c9ca72a7 Mon Sep 17 00:00:00 2001 From: Mohammed Kaish Ansari Date: Sat, 14 Mar 2026 16:13:12 +0000 Subject: [PATCH 2/8] feat(data_collector): add get_gb_stock_symbols via Yahoo Finance screener API --- scripts/data_collector/utils.py | 77 +++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index ca61a21b9d2..a5175b4f663 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -457,6 +457,83 @@ def _format(s_): return _BR_SYMBOLS +def 
get_gb_stock_symbols(qlib_data_path: [str, Path] = None) -> list: + """get GB (London Stock Exchange) stock symbols via Yahoo Finance screener API. + + Parameters + ---------- + qlib_data_path : str or Path, optional + Path to a local qlib data directory whose ``instruments/`` sub-directory + will be scanned for additional symbols (e.g. ``ftse100.txt``, ``ftse250.txt``), + by default None. + + Returns + ------- + list + Sorted, deduplicated list of Yahoo Finance ticker symbols with a ``.L`` + suffix, e.g. ``["AZN.L", "BP.L", "HSBA.L", ...]``. + + Notes + ----- + Symbols are fetched from the Yahoo Finance predefined ``most_actives_gb`` + screener endpoint, which covers the full GB market universe tracked by + Yahoo Finance. Pagination is handled automatically (250 results per page). + Results are cached in the module-level ``_GB_SYMBOLS`` variable after the + first call. + """ + global _GB_SYMBOLS # pylint: disable=W0603 + + _SCREENER_URL = ( + "https://query1.finance.yahoo.com/v1/finance/screener/predefined/saved" + "?scrIds=most_actives_gb&count=250&start={start}" + ) + _HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + ), + "Accept": "application/json", + } + + @deco_retry + def _fetch_page(start: int) -> list: + resp = requests.get(_SCREENER_URL.format(start=start), headers=_HEADERS, timeout=30) + resp.raise_for_status() + return resp.json().get("finance", {}).get("result", [{}])[0].get("quotes", []) + + if _GB_SYMBOLS is None: + _all_symbols = [] + start = 0 + page_size = 250 + + while True: + quotes = _fetch_page(start) + if not quotes: + break + for q in quotes: + symbol = q.get("symbol", "") + if symbol.endswith(".L"): + _all_symbols.append(symbol) + if len(quotes) < page_size: + break + start += page_size + + if qlib_data_path is not None: + for _index in ["ftse100", "ftse250"]: + _ins_path = Path(qlib_data_path).joinpath(f"instruments/{_index}.txt") + if 
_ins_path.exists(): + ins_df = pd.read_csv( + _ins_path, + sep="\t", + names=["symbol", "start_date", "end_date"], + ) + _all_symbols += ins_df["symbol"].unique().tolist() + + _GB_SYMBOLS = sorted(set(_all_symbols)) + + return _GB_SYMBOLS + + def get_en_fund_symbols(qlib_data_path: [str, Path] = None) -> list: """get en fund symbols From 69f53b9920d173e3689ed408ee40a1ad025f9107 Mon Sep 17 00:00:00 2001 From: Mohammed Kaish Ansari Date: Sat, 14 Mar 2026 16:14:21 +0000 Subject: [PATCH 3/8] feat(yahoo): add YahooCollector and YahooNormalize classes for GB (LSE) region --- scripts/data_collector/yahoo/collector.py | 78 +++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py index 82660f1112b..b75c4726cc9 100644 --- a/scripts/data_collector/yahoo/collector.py +++ b/scripts/data_collector/yahoo/collector.py @@ -38,6 +38,7 @@ get_us_stock_symbols, get_in_stock_symbols, get_br_stock_symbols, + get_gb_stock_symbols, generate_minutes_calendar_from_daily, calc_adjusted_price, ) @@ -364,6 +365,40 @@ class YahooCollectorBR1min(YahooCollectorBR): retry = 2 +class YahooCollectorGB(YahooCollector, ABC): + """Collector for GB (London Stock Exchange) equities via Yahoo Finance. + + Symbols carry a ``.L`` suffix as returned by the Yahoo Finance screener + (e.g. ``HSBA.L``, ``AZN.L``). The ``^FTSE`` index is used as the + trading-calendar benchmark. Trading hours are 08:00-16:30 Europe/London. + Prices are quoted in GBp (pence) by Yahoo Finance. 
+ """ + + def get_instrument_list(self): + logger.info("get GB (LSE) stock symbols......") + symbols = get_gb_stock_symbols() + logger.info(f"get {len(symbols)} symbols.") + return symbols + + def download_index_data(self): + pass + + def normalize_symbol(self, symbol): + return code_to_fname(symbol).upper() + + @property + def _timezone(self): + return "Europe/London" + + +class YahooCollectorGB1d(YahooCollectorGB): + pass + + +class YahooCollectorGB1min(YahooCollectorGB): + pass + + class YahooNormalize(BaseNormalize): COLUMNS = ["open", "close", "high", "low", "volume"] DAILY_FORMAT = "%Y-%m-%d" @@ -720,6 +755,49 @@ def symbol_to_yahoo(self, symbol): return fname_to_code(symbol) +class YahooNormalizeGB: + """Calendar mixin for GB (London Stock Exchange) normalisers. + + Uses ``^FTSE`` daily history via :func:`get_calendar_list` with key + ``"GB_ALL"`` as the trading-date sequence. + """ + + def _get_calendar_list(self) -> Iterable[pd.Timestamp]: + return get_calendar_list("GB_ALL") + + +class YahooNormalizeGB1d(YahooNormalizeGB, YahooNormalize1d): + pass + + +class YahooNormalizeGB1dExtend(YahooNormalizeGB, YahooNormalize1dExtend): + pass + + +class YahooNormalizeGB1min(YahooNormalizeGB, YahooNormalize1min): + """1-minute normaliser for GB (London Stock Exchange) equities. + + LSE trades continuously from 08:00 to 16:30 Europe/London with no midday + break. ``AM_RANGE`` covers the full session; ``PM_RANGE`` is a zero-width + sentinel so the parent generator loop is satisfied without adding extra + minutes. ``CALC_PAUSED_NUM = False`` mirrors US/IN/BR 1min normalisers. 
+ """ + + CALC_PAUSED_NUM = False + AM_RANGE = ("08:00:00", "16:29:00") + PM_RANGE = ("16:29:00", "16:29:00") + + def _get_calendar_list(self) -> Iterable[pd.Timestamp]: + # TODO: support 1min + raise ValueError("Does not support 1min") + + def _get_1d_calendar_list(self): + return get_calendar_list("GB_ALL") + + def symbol_to_yahoo(self, symbol): + return fname_to_code(symbol) + + class Run(BaseRun): def __init__(self, source_dir=None, normalize_dir=None, max_workers=1, interval="1d", region=REGION_CN): """ From fb3c516e5f7a75dff498e390abe8ffc62e812e00 Mon Sep 17 00:00:00 2001 From: Mohammed Kaish Ansari Date: Sat, 14 Mar 2026 16:17:58 +0000 Subject: [PATCH 4/8] docs(yahoo): add GB region usage examples and fix supported regions list --- scripts/data_collector/README.md | 2 +- scripts/data_collector/yahoo/README.md | 26 ++++++++++++++++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/scripts/data_collector/README.md b/scripts/data_collector/README.md index d0058b33e2c..32e1ffe89c0 100644 --- a/scripts/data_collector/README.md +++ b/scripts/data_collector/README.md @@ -4,7 +4,7 @@ Scripts for data collection -- yahoo: get *US/CN* stock data from *Yahoo Finance* +- yahoo: get *CN/US/IN/BR/GB* stock data from *Yahoo Finance* - fund: get fund data from *http://fund.eastmoney.com* - cn_index: get *CN index* from *http://www.csindex.com.cn*, *CSI300*/*CSI100* - us_index: get *US index* from *https://en.wikipedia.org/wiki*, *SP500*/*NASDAQ100*/*DJIA*/*SP400* diff --git a/scripts/data_collector/yahoo/README.md b/scripts/data_collector/yahoo/README.md index c12a2383a40..91a4d779dcf 100644 --- a/scripts/data_collector/yahoo/README.md +++ b/scripts/data_collector/yahoo/README.md @@ -63,7 +63,7 @@ pip install -r requirements.txt - `source_dir`: save the directory - `interval`: `1d` or `1min`, by default `1d` > **due to the limitation of the *YahooFinance API*, only the last month's data is available in `1min`** - - `region`: `CN` or `US` or `IN` or 
`BR`, by default `CN` + - `region`: `CN` or `US` or `IN` or `BR` or `GB`, by default `CN` - `delay`: `time.sleep(delay)`, by default *0.5* - `start`: start datetime, by default *"2000-01-01"*; *closed interval(including start)* - `end`: end datetime, by default `pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))`; *open interval(excluding end)* @@ -92,6 +92,11 @@ pip install -r requirements.txt python collector.py download_data --source_dir ~/.qlib/stock_data/source/br_data --start 2003-01-03 --end 2022-03-01 --delay 1 --interval 1d --region BR # br 1min data python collector.py download_data --source_dir ~/.qlib/stock_data/source/br_data_1min --delay 1 --interval 1min --region BR + + # gb 1d data + python collector.py download_data --source_dir ~/.qlib/stock_data/source/gb_data --start 2000-01-04 --end 2025-12-31 --delay 1 --interval 1d --region GB + # gb 1min data + python collector.py download_data --source_dir ~/.qlib/stock_data/source/gb_data_1min --delay 1 --interval 1min --region GB ``` 2. 
normalize data: `python scripts/data_collector/yahoo/collector.py normalize_data` @@ -105,7 +110,7 @@ pip install -r requirements.txt - `max_workers`: number of concurrent, by default *1* - `interval`: `1d` or `1min`, by default `1d` > if **`interval == 1min`**, `qlib_data_1d_dir` cannot be `None` - - `region`: `CN` or `US` or `IN`, by default `CN` + - `region`: `CN` or `US` or `IN` or `BR` or `GB`, by default `CN` - `date_field_name`: column *name* identifying time in csv files, by default `date` - `symbol_field_name`: column *name* identifying symbol in csv files, by default `symbol` - `end_date`: if not `None`, normalize the last date saved (*including end_date*); if `None`, it will ignore this parameter; by default `None` @@ -133,6 +138,12 @@ pip install -r requirements.txt # normalize 1min br python collector.py normalize_data --qlib_data_1d_dir ~/.qlib/qlib_data/br_data --source_dir ~/.qlib/stock_data/source/br_data_1min --normalize_dir ~/.qlib/stock_data/source/br_1min_nor --region BR --interval 1min + + # normalize 1d gb + python scripts/data_collector/yahoo/collector.py normalize_data --source_dir ~/.qlib/stock_data/source/gb_data --normalize_dir ~/.qlib/stock_data/source/gb_1d_nor --region GB --interval 1d + + # normalize 1min gb + python collector.py normalize_data --qlib_data_1d_dir ~/.qlib/qlib_data/gb_data --source_dir ~/.qlib/stock_data/source/gb_data_1min --normalize_dir ~/.qlib/stock_data/source/gb_1min_nor --region GB --interval 1min ``` 3. dump data: `python scripts/dump_bin.py dump_all` @@ -221,5 +232,16 @@ pip install -r requirements.txt df = D.features(inst[:100], ["$close"], freq="1min") # get all symbol data # df = D.features(D.instruments("all"), ["$close"], freq="1min") + + # 1d data gb + # NOTE: Yahoo Finance quotes GB (.L) equities in GBp (pence), not GBP pounds. + # e.g. HSBA.L price ~1200 means 1200p = £12. No scaling is applied by the normaliser. 
+ qlib.init(provider_uri="~/.qlib/qlib_data/gb_data", region="gb") + df = D.features(D.instruments("all"), ["$close"], freq="day") + + # 1min data gb + qlib.init(provider_uri="~/.qlib/qlib_data/gb_data_1min", region="gb") + inst = D.list_instruments(D.instruments("all"), freq="1min", as_list=True) + df = D.features(inst[:100], ["$close"], freq="1min") ``` From 0e253ee298be6b678739f2ba2c033ee3489fb9de Mon Sep 17 00:00:00 2001 From: Mohammed Kaish Ansari Date: Sat, 14 Mar 2026 16:19:32 +0000 Subject: [PATCH 5/8] test(yahoo): add unit tests for GB collector, symbol fetcher, and class resolution --- tests/test_yahoo_collector_gb.py | 253 +++++++++++++++++++++++++++++++ 1 file changed, 253 insertions(+) create mode 100644 tests/test_yahoo_collector_gb.py diff --git a/tests/test_yahoo_collector_gb.py b/tests/test_yahoo_collector_gb.py new file mode 100644 index 00000000000..23ec695e648 --- /dev/null +++ b/tests/test_yahoo_collector_gb.py @@ -0,0 +1,253 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Unit tests for the GB (London Stock Exchange) Yahoo Finance data collector. 
+ +Covers: +- get_gb_stock_symbols: pagination, .L filtering, caching behaviour +- get_calendar_list routing for "GB_ALL" -> ^FTSE +- YahooCollectorGB: timezone, normalize_symbol, instrument list +- YahooNormalizeGB1d / YahooNormalizeGB1min class instantiation +- Run class-name resolution for GB region +""" + +import sys +import unittest +from pathlib import Path +from unittest.mock import MagicMock, call, patch + +import pandas as pd + +# --------------------------------------------------------------------------- +# Ensure the data_collector package is on the path (mirrors collector.py setup) +# --------------------------------------------------------------------------- +_SCRIPTS_DIR = Path(__file__).resolve().parent.parent / "scripts" +_COLLECTOR_DIR = _SCRIPTS_DIR / "data_collector" +_YAHOO_DIR = _COLLECTOR_DIR / "yahoo" + +for _p in [str(_SCRIPTS_DIR), str(_COLLECTOR_DIR), str(_YAHOO_DIR)]: + if _p not in sys.path: + sys.path.insert(0, _p) + +import data_collector.utils as dc_utils # noqa: E402 + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_screener_page(symbols: list) -> MagicMock: + """Return a mock requests.Response for one screener page.""" + mock_resp = MagicMock() + mock_resp.raise_for_status = MagicMock() + mock_resp.json.return_value = { + "finance": { + "result": [{"quotes": [{"symbol": s} for s in symbols]}] + } + } + return mock_resp + + +# --------------------------------------------------------------------------- +# Tests: get_gb_stock_symbols +# --------------------------------------------------------------------------- + + +class TestGetGBStockSymbols(unittest.TestCase): + def setUp(self): + # Reset the module-level cache before every test + dc_utils._GB_SYMBOLS = None + + def test_filters_dot_l_symbols_only(self): + """Only symbols ending in '.L' should be retained.""" + mock_resp = 
_make_screener_page(["HSBA.L", "AZN.L", "AAPL", "7203.T", "BP.L"]) + with patch("data_collector.utils.requests.get", return_value=mock_resp): + symbols = dc_utils.get_gb_stock_symbols() + self.assertIn("HSBA.L", symbols) + self.assertIn("AZN.L", symbols) + self.assertIn("BP.L", symbols) + self.assertNotIn("AAPL", symbols) + self.assertNotIn("7203.T", symbols) + + def test_pagination_stops_on_empty_page(self): + """Pagination must stop when the API returns an empty quotes list.""" + mock_full = _make_screener_page(["HSBA.L", "AZN.L"]) + mock_empty = _make_screener_page([]) + with patch("data_collector.utils.requests.get", side_effect=[mock_full, mock_empty]): + symbols = dc_utils.get_gb_stock_symbols() + self.assertEqual(symbols, sorted({"HSBA.L", "AZN.L"})) + + def test_pagination_stops_when_page_smaller_than_page_size(self): + """Pagination must stop when len(quotes) < 250 without a second request.""" + mock_partial = _make_screener_page(["HSBA.L", "AZN.L", "SHEL.L"]) + with patch("data_collector.utils.requests.get", return_value=mock_partial) as mock_get: + dc_utils.get_gb_stock_symbols() + self.assertEqual(mock_get.call_count, 1) + + def test_result_is_sorted(self): + """Returned list must be in sorted order.""" + mock_resp = _make_screener_page(["SHEL.L", "AZN.L", "BP.L"]) + with patch("data_collector.utils.requests.get", return_value=mock_resp): + symbols = dc_utils.get_gb_stock_symbols() + self.assertEqual(symbols, sorted(symbols)) + + def test_result_is_deduplicated(self): + """Duplicate symbols across pages must appear only once.""" + page1 = _make_screener_page(["HSBA.L", "AZN.L"]) + # page2 has a duplicate from page1 — still < 250 so stops after page2 + page2 = _make_screener_page(["AZN.L", "BP.L"]) + with patch("data_collector.utils.requests.get", side_effect=[page1, page2]): + symbols = dc_utils.get_gb_stock_symbols() + self.assertEqual(symbols.count("AZN.L"), 1) + + def test_cache_is_populated_after_first_call(self): + """_GB_SYMBOLS must be set after 
the first call.""" + mock_resp = _make_screener_page(["BP.L"]) + with patch("data_collector.utils.requests.get", return_value=mock_resp): + dc_utils.get_gb_stock_symbols() + self.assertIsNotNone(dc_utils._GB_SYMBOLS) + + def test_cache_prevents_second_http_request(self): + """A second call must not make another HTTP request.""" + mock_resp = _make_screener_page(["BP.L"]) + with patch("data_collector.utils.requests.get", return_value=mock_resp) as mock_get: + dc_utils.get_gb_stock_symbols() + dc_utils.get_gb_stock_symbols() + self.assertEqual(mock_get.call_count, 1) + + +# --------------------------------------------------------------------------- +# Tests: get_calendar_list routing for GB_ALL +# --------------------------------------------------------------------------- + + +class TestCalendarListGBRouting(unittest.TestCase): + def setUp(self): + dc_utils._CALENDAR_MAP = {} + + def test_gb_all_in_bench_url_map(self): + """CALENDAR_BENCH_URL_MAP must contain 'GB_ALL' mapped to '^FTSE'.""" + self.assertIn("GB_ALL", dc_utils.CALENDAR_BENCH_URL_MAP) + self.assertEqual(dc_utils.CALENDAR_BENCH_URL_MAP["GB_ALL"], "^FTSE") + + def test_gb_startswith_guard(self): + """'GB_ALL'.startswith('GB_') must be True so the Ticker branch is taken.""" + self.assertTrue("GB_ALL".startswith("GB_")) + + @patch("data_collector.utils.Ticker") + def test_get_calendar_list_calls_ticker_with_ftse(self, mock_ticker_cls): + """get_calendar_list('GB_ALL') must call Ticker('^FTSE').history(...).""" + dates = pd.to_datetime(["2024-01-02", "2024-01-03"]) + idx = pd.MultiIndex.from_tuples( + [("^FTSE", d) for d in dates], names=["symbol", "date"] + ) + mock_df = pd.DataFrame({"close": [7700.0, 7750.0]}, index=idx) + mock_instance = MagicMock() + mock_instance.history.return_value = mock_df + mock_ticker_cls.return_value = mock_instance + + calendar = dc_utils.get_calendar_list("GB_ALL") + + mock_ticker_cls.assert_called_with("^FTSE") + mock_instance.history.assert_called_with(interval="1d", 
period="max") + self.assertEqual(len(calendar), 2) + self.assertIsInstance(calendar[0], pd.Timestamp) + + +# --------------------------------------------------------------------------- +# Tests: YahooCollectorGB classes +# --------------------------------------------------------------------------- + + +class TestYahooCollectorGBClasses(unittest.TestCase): + @classmethod + def setUpClass(cls): + dc_utils._GB_SYMBOLS = None + cls._sym_patch = patch( + "data_collector.utils.requests.get", + return_value=_make_screener_page(["AZN.L", "BP.L", "HSBA.L"]), + ) + cls._sym_patch.start() + import collector as col_mod + + cls.col = col_mod + + @classmethod + def tearDownClass(cls): + cls._sym_patch.stop() + + def setUp(self): + dc_utils._GB_SYMBOLS = None + + def test_timezone_is_europe_london(self): + import tempfile + + with tempfile.TemporaryDirectory() as tmpdir: + obj = self.col.YahooCollectorGB1d(save_dir=tmpdir, start="2024-01-01", end="2024-01-10") + self.assertEqual(obj._timezone, "Europe/London") + + def test_normalize_symbol_returns_uppercase(self): + import tempfile + from qlib.utils import code_to_fname + + with tempfile.TemporaryDirectory() as tmpdir: + obj = self.col.YahooCollectorGB1d(save_dir=tmpdir, start="2024-01-01", end="2024-01-10") + self.assertEqual(obj.normalize_symbol("AZN.L"), code_to_fname("AZN.L").upper()) + + def test_gb1min_class_exists(self): + self.assertTrue(hasattr(self.col, "YahooCollectorGB1min")) + + def test_normalize_gb1d_class_exists(self): + self.assertTrue(hasattr(self.col, "YahooNormalizeGB1d")) + + def test_normalize_gb1d_extend_class_exists(self): + self.assertTrue(hasattr(self.col, "YahooNormalizeGB1dExtend")) + + def test_normalize_gb1min_class_exists(self): + self.assertTrue(hasattr(self.col, "YahooNormalizeGB1min")) + + +# --------------------------------------------------------------------------- +# Tests: Run class-name resolution for GB +# --------------------------------------------------------------------------- + + 
+class TestRunClassResolutionGB(unittest.TestCase): + @classmethod + def setUpClass(cls): + import collector as col_mod + + cls.col = col_mod + + def _make_run(self, region, interval): + run = self.col.Run.__new__(self.col.Run) + run.region = region + run.interval = interval + return run + + def test_collector_class_name_1d(self): + self.assertEqual(self._make_run("GB", "1d").collector_class_name, "YahooCollectorGB1d") + + def test_collector_class_name_1min(self): + self.assertEqual(self._make_run("GB", "1min").collector_class_name, "YahooCollectorGB1min") + + def test_normalize_class_name_1d(self): + self.assertEqual(self._make_run("GB", "1d").normalize_class_name, "YahooNormalizeGB1d") + + def test_normalize_class_name_1min(self): + self.assertEqual(self._make_run("GB", "1min").normalize_class_name, "YahooNormalizeGB1min") + + def test_all_gb_classes_resolvable_from_module(self): + for name in [ + "YahooCollectorGB1d", + "YahooCollectorGB1min", + "YahooNormalizeGB1d", + "YahooNormalizeGB1dExtend", + "YahooNormalizeGB1min", + ]: + self.assertTrue(hasattr(self.col, name), f"Missing class: {name}") + + +if __name__ == "__main__": + unittest.main() From 499d168c0524a2fec1ae101aaee94b6e13fa456e Mon Sep 17 00:00:00 2001 From: Mohammed Kaish Ansari Date: Sat, 14 Mar 2026 16:22:22 +0000 Subject: [PATCH 6/8] style: apply black formatting (line length 120) --- scripts/data_collector/utils.py | 3 +-- tests/test_yahoo_collector_gb.py | 11 ++--------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index a5175b4f663..7a41a5e141f 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -489,8 +489,7 @@ def get_gb_stock_symbols(qlib_data_path: [str, Path] = None) -> list: ) _HEADERS = { "User-Agent": ( - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " 
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" ), "Accept": "application/json", } diff --git a/tests/test_yahoo_collector_gb.py b/tests/test_yahoo_collector_gb.py index 23ec695e648..472e5181392 100644 --- a/tests/test_yahoo_collector_gb.py +++ b/tests/test_yahoo_collector_gb.py @@ -31,7 +31,6 @@ import data_collector.utils as dc_utils # noqa: E402 - # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- @@ -41,11 +40,7 @@ def _make_screener_page(symbols: list) -> MagicMock: """Return a mock requests.Response for one screener page.""" mock_resp = MagicMock() mock_resp.raise_for_status = MagicMock() - mock_resp.json.return_value = { - "finance": { - "result": [{"quotes": [{"symbol": s} for s in symbols]}] - } - } + mock_resp.json.return_value = {"finance": {"result": [{"quotes": [{"symbol": s} for s in symbols]}]}} return mock_resp @@ -139,9 +134,7 @@ def test_gb_startswith_guard(self): def test_get_calendar_list_calls_ticker_with_ftse(self, mock_ticker_cls): """get_calendar_list('GB_ALL') must call Ticker('^FTSE').history(...).""" dates = pd.to_datetime(["2024-01-02", "2024-01-03"]) - idx = pd.MultiIndex.from_tuples( - [("^FTSE", d) for d in dates], names=["symbol", "date"] - ) + idx = pd.MultiIndex.from_tuples([("^FTSE", d) for d in dates], names=["symbol", "date"]) mock_df = pd.DataFrame({"close": [7700.0, 7750.0]}, index=idx) mock_instance = MagicMock() mock_instance.history.return_value = mock_df From 82bfe3ef4514e23091f28f028ec67b95eea3a6be Mon Sep 17 00:00:00 2001 From: Mohammed Kaish Ansari Date: Sat, 14 Mar 2026 17:20:56 +0000 Subject: [PATCH 7/8] fix(gb-collector): add inter-page sleep to avoid screener rate limiting The GB symbol fetch paginates Yahoo Finance screener in a tight loop (up to 6 requests for ~1400 symbols). Without a delay between pages, Yahoo returns 429 Too Many Requests on the second request. 
Add a 1-second sleep after each full page to stay within rate limits. --- scripts/data_collector/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index 7a41a5e141f..a9acd9e527a 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -516,6 +516,7 @@ def _fetch_page(start: int) -> list: if len(quotes) < page_size: break start += page_size + time.sleep(1) # avoid triggering Yahoo Finance screener rate limits between pages if qlib_data_path is not None: for _index in ["ftse100", "ftse250"]: From 0f5179f9b7064540afd2bada957a8080630a16ec Mon Sep 17 00:00:00 2001 From: Mohammed Kaish Ansari Date: Sat, 14 Mar 2026 20:16:02 +0000 Subject: [PATCH 8/8] fix(gb-collector): use simple User-Agent for screener requests Yahoo Finance blocks the Chrome UA string but accepts a plain Mozilla/5.0 agent, consistent with the existing BR collector. --- scripts/data_collector/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index a9acd9e527a..c94e4caed6c 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -488,9 +488,7 @@ def get_gb_stock_symbols(qlib_data_path: [str, Path] = None) -> list: "?scrIds=most_actives_gb&count=250&start={start}" ) _HEADERS = { - "User-Agent": ( - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" - ), + "User-Agent": "Mozilla/5.0", "Accept": "application/json", }