Skip to content
Open
2 changes: 1 addition & 1 deletion scripts/data_collector/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

Scripts for data collection

- yahoo: get *US/CN* stock data from *Yahoo Finance*
- yahoo: get *CN/US/IN/BR/GB* stock data from *Yahoo Finance*
- fund: get fund data from *http://fund.eastmoney.com*
- cn_index: get *CN index* from *http://www.csindex.com.cn*, *CSI300*/*CSI100*
- us_index: get *US index* from *https://en.wikipedia.org/wiki*, *SP500*/*NASDAQ100*/*DJIA*/*SP400*
Expand Down
84 changes: 83 additions & 1 deletion scripts/data_collector/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
"US_ALL": "^GSPC",
"IN_ALL": "^NSEI",
"BR_ALL": "^BVSP",
"GB_ALL": "^FTSE",
}

_BENCH_CALENDAR_LIST = None
Expand All @@ -46,6 +47,7 @@
_US_SYMBOLS = None
_IN_SYMBOLS = None
_BR_SYMBOLS = None
_GB_SYMBOLS = None
_EN_FUND_SYMBOLS = None
_CALENDAR_MAP = {}

Expand Down Expand Up @@ -74,7 +76,12 @@ def _get_calendar(url):

calendar = _CALENDAR_MAP.get(bench_code, None)
if calendar is None:
if bench_code.startswith("US_") or bench_code.startswith("IN_") or bench_code.startswith("BR_"):
if (
bench_code.startswith("US_")
or bench_code.startswith("IN_")
or bench_code.startswith("BR_")
or bench_code.startswith("GB_")
):
print(Ticker(CALENDAR_BENCH_URL_MAP[bench_code]))
print(Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max"))
df = Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max")
Expand Down Expand Up @@ -450,6 +457,81 @@ def _format(s_):
return _BR_SYMBOLS


def get_gb_stock_symbols(qlib_data_path: [str, Path] = None) -> list:
    """Get GB (London Stock Exchange) stock symbols via the Yahoo Finance screener API.

    Parameters
    ----------
    qlib_data_path : str or Path, optional
        Path to a local qlib data directory. If given, the files
        ``instruments/ftse100.txt`` and ``instruments/ftse250.txt`` under it
        are read (when they exist) and their symbols are added to the result,
        by default None.

    Returns
    -------
    list
        Sorted, deduplicated list of symbols. Symbols taken from the screener
        are restricted to tickers ending in ``.L`` (e.g. ``"AZN.L"``,
        ``"BP.L"``, ``"HSBA.L"``); symbols read from the instruments files are
        added exactly as they appear in those files.

    Raises
    ------
    requests.HTTPError
        Propagated from ``raise_for_status`` when a page request fails (after
        whatever retrying ``deco_retry`` performs).

    Notes
    -----
    Symbols are fetched page by page (250 quotes per request) from the Yahoo
    Finance predefined ``most_actives_gb`` screener endpoint.
    NOTE(review): how much of the LSE universe that screener actually covers
    is not verified here.

    A non-empty result is cached in the module-level ``_GB_SYMBOLS`` variable,
    so later calls return the cached list and ignore ``qlib_data_path``. An
    empty result is returned but not cached, so a later call can try again.
    """
    global _GB_SYMBOLS  # pylint: disable=W0603

    if _GB_SYMBOLS is not None:
        return _GB_SYMBOLS

    page_size = 250
    _SCREENER_URL = (
        "https://query1.finance.yahoo.com/v1/finance/screener/predefined/saved"
        "?scrIds=most_actives_gb&count={count}&start={start}"
    )
    _HEADERS = {
        "User-Agent": "Mozilla/5.0",
        "Accept": "application/json",
    }

    @deco_retry
    def _fetch_page(start: int) -> list:
        resp = requests.get(
            _SCREENER_URL.format(count=page_size, start=start),
            headers=_HEADERS,
            timeout=30,
        )
        resp.raise_for_status()
        payload = resp.json()
        # "result" may be missing, null or an empty list; treat all of these
        # as "no quotes" instead of failing on ``[0]``.
        results = (payload.get("finance") or {}).get("result") or [{}]
        return results[0].get("quotes") or []

    _all_symbols = []
    seen_tickers = set()
    start = 0

    while True:
        quotes = _fetch_page(start)
        if not quotes:
            break
        page_tickers = {q.get("symbol") or "" for q in quotes}
        if page_tickers <= seen_tickers:
            # The page contains nothing new: the endpoint is repeating itself
            # (e.g. ignoring ``start``), so stop instead of looping forever.
            break
        seen_tickers |= page_tickers
        _all_symbols.extend(t for t in page_tickers if t.endswith(".L"))
        if len(quotes) < page_size:
            break
        start += page_size
        time.sleep(1)  # avoid triggering Yahoo Finance screener rate limits between pages

    if qlib_data_path is not None:
        for _index in ["ftse100", "ftse250"]:
            _ins_path = Path(qlib_data_path).joinpath(f"instruments/{_index}.txt")
            if _ins_path.exists():
                ins_df = pd.read_csv(
                    _ins_path,
                    sep="\t",
                    names=["symbol", "start_date", "end_date"],
                )
                _all_symbols += ins_df["symbol"].unique().tolist()

    symbols = sorted(set(_all_symbols))
    if symbols:
        # Only cache a usable result; an empty list would otherwise be served
        # for the rest of the process.
        _GB_SYMBOLS = symbols
    return symbols


def get_en_fund_symbols(qlib_data_path: [str, Path] = None) -> list:
"""get en fund symbols

Expand Down
26 changes: 24 additions & 2 deletions scripts/data_collector/yahoo/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ pip install -r requirements.txt
- `source_dir`: directory in which the downloaded data is saved
- `interval`: `1d` or `1min`, by default `1d`
> **due to the limitation of the *YahooFinance API*, only the last month's data is available in `1min`**
- `region`: `CN` or `US` or `IN` or `BR`, by default `CN`
- `region`: `CN` or `US` or `IN` or `BR` or `GB`, by default `CN`
- `delay`: `time.sleep(delay)`, by default *0.5*
- `start`: start datetime, by default *"2000-01-01"*; *closed interval(including start)*
- `end`: end datetime, by default `pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))`; *open interval(excluding end)*
Expand Down Expand Up @@ -92,6 +92,11 @@ pip install -r requirements.txt
python collector.py download_data --source_dir ~/.qlib/stock_data/source/br_data --start 2003-01-03 --end 2022-03-01 --delay 1 --interval 1d --region BR
# br 1min data
python collector.py download_data --source_dir ~/.qlib/stock_data/source/br_data_1min --delay 1 --interval 1min --region BR

# gb 1d data
python collector.py download_data --source_dir ~/.qlib/stock_data/source/gb_data --start 2000-01-04 --end 2025-12-31 --delay 1 --interval 1d --region GB
# gb 1min data
python collector.py download_data --source_dir ~/.qlib/stock_data/source/gb_data_1min --delay 1 --interval 1min --region GB
```
2. normalize data: `python scripts/data_collector/yahoo/collector.py normalize_data`

Expand All @@ -105,7 +110,7 @@ pip install -r requirements.txt
- `max_workers`: number of concurrent workers, by default *1*
- `interval`: `1d` or `1min`, by default `1d`
> if **`interval == 1min`**, `qlib_data_1d_dir` cannot be `None`
- `region`: `CN` or `US` or `IN`, by default `CN`
- `region`: `CN` or `US` or `IN` or `BR` or `GB`, by default `CN`
- `date_field_name`: column *name* identifying time in csv files, by default `date`
- `symbol_field_name`: column *name* identifying symbol in csv files, by default `symbol`
- `end_date`: if not `None`, normalize the last date saved (*including end_date*); if `None`, it will ignore this parameter; by default `None`
Expand Down Expand Up @@ -133,6 +138,12 @@ pip install -r requirements.txt

# normalize 1min br
python collector.py normalize_data --qlib_data_1d_dir ~/.qlib/qlib_data/br_data --source_dir ~/.qlib/stock_data/source/br_data_1min --normalize_dir ~/.qlib/stock_data/source/br_1min_nor --region BR --interval 1min

# normalize 1d gb
python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/gb_data --normalize_dir ~/.qlib/stock_data/source/gb_1d_nor --region GB --interval 1d

# normalize 1min gb
python collector.py normalize_data --qlib_data_1d_dir ~/.qlib/qlib_data/gb_data --source_dir ~/.qlib/stock_data/source/gb_data_1min --normalize_dir ~/.qlib/stock_data/source/gb_1min_nor --region GB --interval 1min
```
3. dump data: `python scripts/dump_bin.py dump_all`

Expand Down Expand Up @@ -221,5 +232,16 @@ pip install -r requirements.txt
df = D.features(inst[:100], ["$close"], freq="1min")
# get all symbol data
# df = D.features(D.instruments("all"), ["$close"], freq="1min")

# 1d data gb
# NOTE: Yahoo Finance quotes GB (.L) equities in GBp (pence), not GBP pounds.
# e.g. HSBA.L price ~1200 means 1200p = £12. No scaling is applied by the normaliser.
qlib.init(provider_uri="~/.qlib/qlib_data/gb_data", region="gb")
df = D.features(D.instruments("all"), ["$close"], freq="day")

# 1min data gb
qlib.init(provider_uri="~/.qlib/qlib_data/gb_data_1min", region="gb")
inst = D.list_instruments(D.instruments("all"), freq="1min", as_list=True)
df = D.features(inst[:100], ["$close"], freq="1min")
```

78 changes: 78 additions & 0 deletions scripts/data_collector/yahoo/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
get_us_stock_symbols,
get_in_stock_symbols,
get_br_stock_symbols,
get_gb_stock_symbols,
generate_minutes_calendar_from_daily,
calc_adjusted_price,
)
Expand Down Expand Up @@ -364,6 +365,40 @@ class YahooCollectorBR1min(YahooCollectorBR):
retry = 2


class YahooCollectorGB(YahooCollector, ABC):
    """Yahoo Finance collector for GB (London Stock Exchange) equities.

    The instrument list is whatever ``get_gb_stock_symbols()`` returns.
    Symbols are converted to file names with ``code_to_fname`` and
    upper-cased. The collector reports ``Europe/London`` as its timezone and
    downloads no index data.

    NOTE(review): Yahoo Finance is reported to quote ``.L`` prices in GBp
    (pence); nothing in this class rescales them.
    """

    @property
    def _timezone(self):
        """Timezone name used for this region."""
        return "Europe/London"

    def normalize_symbol(self, symbol):
        """Return the upper-cased file-name form of ``symbol``."""
        fname = code_to_fname(symbol)
        return fname.upper()

    def download_index_data(self):
        """Do nothing: no index data is downloaded for the GB region."""

    def get_instrument_list(self):
        """Fetch the GB symbol list, logging before and after the lookup."""
        logger.info("get GB (LSE) stock symbols......")
        gb_symbols = get_gb_stock_symbols()
        logger.info(f"get {len(gb_symbols)} symbols.")
        return gb_symbols


class YahooCollectorGB1d(YahooCollectorGB):
    """GB collector named for the ``1d`` interval; adds nothing to ``YahooCollectorGB``."""


class YahooCollectorGB1min(YahooCollectorGB):
    """GB collector named for the ``1min`` interval; adds nothing to ``YahooCollectorGB``."""


class YahooNormalize(BaseNormalize):
COLUMNS = ["open", "close", "high", "low", "volume"]
DAILY_FORMAT = "%Y-%m-%d"
Expand Down Expand Up @@ -720,6 +755,49 @@ def symbol_to_yahoo(self, symbol):
return fname_to_code(symbol)


class YahooNormalizeGB:
    """Calendar mixin shared by the GB (London Stock Exchange) normalisers.

    The trading-date sequence is obtained from ``get_calendar_list`` with the
    ``"GB_ALL"`` key.
    """

    def _get_calendar_list(self) -> Iterable[pd.Timestamp]:
        """Return the calendar registered under ``"GB_ALL"``."""
        gb_calendar = get_calendar_list("GB_ALL")
        return gb_calendar


class YahooNormalizeGB1d(YahooNormalizeGB, YahooNormalize1d):
    """Combine the GB calendar mixin with ``YahooNormalize1d``; no overrides."""


class YahooNormalizeGB1dExtend(YahooNormalizeGB, YahooNormalize1dExtend):
    """Combine the GB calendar mixin with ``YahooNormalize1dExtend``; no overrides."""


class YahooNormalizeGB1min(YahooNormalizeGB, YahooNormalize1min):
    """1-minute normaliser for GB (London Stock Exchange) equities.

    ``AM_RANGE`` spans 08:00:00-16:29:00 and ``PM_RANGE`` starts and ends at
    16:29:00, so the two ranges together describe a single uninterrupted
    session. ``CALC_PAUSED_NUM`` is switched off.

    NOTE(review): the author's intent is that ``PM_RANGE`` contributes no
    extra minutes to the generated calendar; confirm this against the parent
    class's minute-calendar generation.
    """

    # Session ranges: the "PM" range is collapsed onto the last "AM" minute.
    AM_RANGE = ("08:00:00", "16:29:00")
    PM_RANGE = ("16:29:00", "16:29:00")
    CALC_PAUSED_NUM = False

    def symbol_to_yahoo(self, symbol):
        """Convert a file-name style symbol back with ``fname_to_code``."""
        yahoo_code = fname_to_code(symbol)
        return yahoo_code

    def _get_1d_calendar_list(self):
        """Return the daily calendar registered under ``"GB_ALL"``."""
        daily_calendar = get_calendar_list("GB_ALL")
        return daily_calendar

    def _get_calendar_list(self) -> Iterable[pd.Timestamp]:
        """Always raise: a direct 1min calendar is not supported."""
        # TODO: support 1min
        raise ValueError("Does not support 1min")


class Run(BaseRun):
def __init__(self, source_dir=None, normalize_dir=None, max_workers=1, interval="1d", region=REGION_CN):
"""
Expand Down
Loading