From 5b06b901530e2372859951ee0c25f19215914d1d Mon Sep 17 00:00:00 2001 From: Dmitrii Gurev Date: Thu, 26 Feb 2026 14:01:28 +0100 Subject: [PATCH 01/11] Add SuperMAG SME --- swvo/io/sme/__init__.py | 5 + swvo/io/sme/supermag.py | 278 ++++++++++++++++++++++++++++++ tests/io/sme/test_sme_supermag.py | 130 ++++++++++++++ 3 files changed, 413 insertions(+) create mode 100644 swvo/io/sme/__init__.py create mode 100644 swvo/io/sme/supermag.py create mode 100644 tests/io/sme/test_sme_supermag.py diff --git a/swvo/io/sme/__init__.py b/swvo/io/sme/__init__.py new file mode 100644 index 0000000..04137fb --- /dev/null +++ b/swvo/io/sme/__init__.py @@ -0,0 +1,5 @@ +# SPDX-FileCopyrightText: 2026 GFZ Helmholtz Centre for Geosciences +# +# SPDX-License-Identifier: Apache-2.0 + +from swvo.io.sme.supermag import SMESuperMAG as SMESuperMAG \ No newline at end of file diff --git a/swvo/io/sme/supermag.py b/swvo/io/sme/supermag.py new file mode 100644 index 0000000..ab412c9 --- /dev/null +++ b/swvo/io/sme/supermag.py @@ -0,0 +1,278 @@ +# SPDX-FileCopyrightText: 2026 GFZ Helmholtz Centre for Geosciences +# +# SPDX-License-Identifier: Apache-2.0 + +""" +Module for handling SuperMAG SME data. +""" + +import json +import logging +import os +import re +import warnings +from datetime import datetime, timedelta, timezone +from pathlib import Path +from shutil import rmtree +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +import requests + +from swvo.io.utils import enforce_utc_timezone + +logger = logging.getLogger(__name__) + +logging.captureWarnings(True) + +class SMESuperMAG: + """Class for SuperMAG SME data. + + Parameters + ---------- + username : str + SuperMAG username used for authenticated data access (register at the SuperMAG website to obtain one) + data_dir : Path | None + Data directory for the SuperMAG SME data. If not provided, it will be read from the environment variable + + Methods + ------- + download_and_process + read + + Raises + ------ + ValueError + Raised if the required environment variable is not set. + """ + + ENV_VAR_NAME = "SUPERMAG_STREAM_DIR" + + def __init__(self, username: str, data_dir: Optional[Path] = None) -> None: + self.username = username + + if data_dir is None: + if self.ENV_VAR_NAME not in os.environ: + msg = f"Necessary environment variable {self.ENV_VAR_NAME} not set!" + raise ValueError(msg) + data_dir = os.environ.get(self.ENV_VAR_NAME) # ty: ignore[invalid-assignment] + + self.data_dir: Path = Path(data_dir) # ty:ignore[invalid-argument-type] + self.data_dir.mkdir(parents=True, exist_ok=True) + + logger.info(f"SuperMAG SME data directory: {self.data_dir}") + + def download_and_process(self, start_time: datetime, end_time: datetime, reprocess_files: bool = False) -> None: + """Download and process SuperMAG SME data files. + + Parameters + ---------- + start_time : datetime + Start time of the data to download. Must be timezone-aware. + end_time : datetime + End time of the data to download. Must be timezone-aware. + reprocess_files : bool, optional + Download and process files again. Defaults to False. + + Returns + ------- + None + """ + + assert start_time < end_time, "Start time must be before end time" + + temporary_dir = Path("./temp_supermag") + temporary_dir.mkdir(exist_ok=True, parents=True) + + file_paths, time_intervals = self._get_processed_file_list(start_time, end_time) + + for file_path, time_interval in zip(file_paths, time_intervals): + if file_path.exists() and not reprocess_files: + continue + + tmp_path = file_path.with_suffix(file_path.suffix + ".tmp") + + try: + start_str = time_interval.strftime("%Y-%m-%dT%H:%M") + extent = int(timedelta(days=1).total_seconds()) + url = ( + "https://supermag.jhuapl.edu/services/indices.php" + f"?python&nohead&logon={self.username}" + f"&start={start_str}" + f"&extent={extent}" + "&indices=sme" + ) + + logger.debug(f"Downloading data from {url} ...") + + response = requests.get(url) + response.raise_for_status() + + data = response.text.splitlines() + if data[0].startswith("ERROR"): + logger.info(f"SuperMAG {data[0]}") + + filename = "index.html" + with open(temporary_dir / filename, "w") as file: + file.write("\n".join(data)) + + logger.debug("Processing file ...") + + processed_df = self._process_single_file(temporary_dir / filename) + processed_df.to_csv(tmp_path, index=True, header=True) + tmp_path.replace(file_path) + + except Exception as e: + logger.error(f"Failed to process {file_path}: {e}") + if tmp_path.exists(): + tmp_path.unlink() + continue + + rmtree(temporary_dir, ignore_errors=True) + + def _get_processed_file_list(self, start_time: datetime, end_time: datetime) -> Tuple[List, List]: + """Get a list of file paths and their corresponding time intervals. + + Returns + ------- + Tuple[List, List] + List of file paths and time intervals. + """ + + file_paths = [] + time = [] + + current_time = start_time.replace(hour=0, minute=0, second=0, microsecond=0) + end_time = (end_time.replace(hour=0, minute=0, second=0, microsecond=0) + timedelta(days=1)) + + while current_time < end_time: + file_path = self.data_dir / f"SuperMAG_SME_{current_time.strftime('%Y%m%d')}.csv" + file_paths.append(file_path) + + file_time = current_time + + time.append(file_time) + + # Increment the day + current_time = current_time + timedelta(days=1) + + return file_paths, time + + def _process_single_file(self, file_path: Path) -> pd.DataFrame: + """Process daily SuperMAG SME file into a DataFrame. + + Parameters + ---------- + file_path : Path + Path to the file. + + Returns + ------- + pd.DataFrame + Processed SuperMAG SME data. + + Raises + ------ + ValueError + If no JSON object/array is found in the downloaded file. + """ + + + with open(file_path, "r") as file: + text = file.read() + + match = re.search(r'(\{.*\}|\[.*\])', text, re.S) + if match is None: + raise ValueError("No JSON object/array found in file") + json_text = match.group(1) + data = json.loads(json_text) + + df = pd.DataFrame(data) + + df["timestamp"] = pd.to_datetime(df["tval"], unit="s", utc=True) + df.index = df["timestamp"] + df.drop(columns=["timestamp", "tval"], inplace=True) + df = df.rename(columns={'SME': 'sme'}) + + mask = df["sme"] >= 999998 + df.loc[mask, "sme"] = np.nan + + return df + + def read(self, start_time: datetime, end_time: datetime, download: bool = False) -> pd.DataFrame: + """ + Read SuperMAG SME data for a given time range. + + Parameters + ---------- + start_time : datetime + Start time of the data to read. Must be timezone-aware. + end_time : datetime + End time of the data to read. Must be timezone-aware. + download : bool, optional + Download missing data files on demand. Defaults to False. + + Returns + ------- + :class:`pandas.DataFrame` + SuperMAG SME data. + """ + + start_time = enforce_utc_timezone(start_time) + end_time = enforce_utc_timezone(end_time) + + assert start_time < end_time, "Start time must be before end time!" + + file_paths, _ = self._get_processed_file_list(start_time, end_time) + t = pd.date_range( + datetime(start_time.year, start_time.month, start_time.day), + datetime(end_time.year, end_time.month, end_time.day, 23, 59, 00), + freq=timedelta(minutes=1), + tz=timezone.utc, + ) + data_out = pd.DataFrame(index=t) + data_out["sme"] = np.array([np.nan] * len(t)) + data_out["file_name"] = np.array([None] * len(t)) + + for file_path in file_paths: + if not file_path.exists(): + if download: + self.download_and_process(start_time, end_time) + else: + warnings.warn(f"File {file_path} not found") + continue + + df_one_file = self._read_single_file(file_path) + data_out = df_one_file.combine_first(data_out) + + data_out = data_out.truncate( + before=start_time - timedelta(minutes=0.9999), + after=end_time + timedelta(minutes=0.9999), + ) + + return data_out + + def _read_single_file(self, file_path: Path) -> pd.DataFrame: + """Read a daily SuperMAG SME file into a DataFrame. + + Parameters + ---------- + file_path : Path + Path to the file. + + Returns + ------- + pd.DataFrame + Data from daily SuperMAG SME file. + """ + df = pd.read_csv(file_path) + + df.index = pd.to_datetime(df["timestamp"], utc=True) + df.drop(columns=["timestamp"], inplace=True) + df.index.name = None + + df["file_name"] = file_path + df.loc[df["sme"].isna(), "file_name"] = None + + return df diff --git a/tests/io/sme/test_sme_supermag.py b/tests/io/sme/test_sme_supermag.py new file mode 100644 index 0000000..7e3020e --- /dev/null +++ b/tests/io/sme/test_sme_supermag.py @@ -0,0 +1,130 @@ +# SPDX-FileCopyrightText: 2026 GFZ Helmholtz Centre for Geosciences +# +# SPDX-License-Identifier: Apache-2.0 + +import os +import shutil +import warnings +from datetime import datetime +from pathlib import Path +from unittest.mock import patch + +import pandas as pd +import pytest + +from swvo.io.sme import SMESuperMAG + +TEST_DATA_DIR = Path("test_data") +MOCK_DATA_PATH = TEST_DATA_DIR / "mock_sme" +TEST_USERNAME = "swvo_test" + +class TestSMESuperMAG: + @pytest.fixture(autouse=True) + def setup_and_cleanup(self): + TEST_DATA_DIR.mkdir(exist_ok=True) + MOCK_DATA_PATH.mkdir(exist_ok=True) + + yield + + if TEST_DATA_DIR.exists(): + shutil.rmtree(TEST_DATA_DIR, ignore_errors=True) + + @pytest.fixture + def sme_instance(self): + with patch.dict("os.environ", {SMESuperMAG.ENV_VAR_NAME: str(MOCK_DATA_PATH)}): + instance = SMESuperMAG(TEST_USERNAME) + return instance + + @pytest.fixture + def mock_sme_supermag_data(self): + return """ [{"tval": 1668816000.000000, "SME": 263.221710}, +{"tval": 1668816060.000000, "SME": 250.118866}, +{"tval": 1668816120.000000, "SME": 234.960663}, +{"tval": 1668816180.000000, "SME": 227.111343}, +{"tval": 1668816240.000000, "SME": 220.567047}, +{"tval": 1668816300.000000, "SME": 213.856384}] + """ + + def test_initialization_with_env_var(self): + with patch.dict("os.environ", {SMESuperMAG.ENV_VAR_NAME: str(MOCK_DATA_PATH)}): + sme = SMESuperMAG(TEST_USERNAME) + assert sme.data_dir == MOCK_DATA_PATH + + def test_initialization_without_env_var(self): + if SMESuperMAG.ENV_VAR_NAME in os.environ: + del os.environ[SMESuperMAG.ENV_VAR_NAME] + with pytest.raises(ValueError): + SMESuperMAG(TEST_USERNAME) + + def test_get_processed_file_list(self, sme_instance): + start_time = datetime(2020, 1, 1) + end_time = datetime(2020, 2, 1) + + file_paths, time_intervals = sme_instance._get_processed_file_list(start_time, end_time) + + assert len(file_paths) == 32 + assert all(str(path).startswith(str(MOCK_DATA_PATH)) for path in file_paths) + assert all(path.name.startswith("SuperMAG_SME_") for path in file_paths) + assert len(time_intervals) == 32 + + def test_download_and_process(self, sme_instance): + sme_instance.download_and_process(datetime(2020, 1, 1), datetime(2020, 1, 2)) + + expected_files = list(MOCK_DATA_PATH.glob("**/SuperMAG_SME_*.csv")) + print(expected_files) + + assert 1 <= len(expected_files) & len(expected_files) <= 2 + + data = pd.read_csv(expected_files[0]) + assert "sme" in data.columns + + def test_process_single_file(self, sme_instance, mock_sme_supermag_data): + test_file = MOCK_DATA_PATH / "test_sme.txt" + test_file.parent.mkdir(exist_ok=True) + + with open(test_file, "w") as f: + f.write(mock_sme_supermag_data) + + data = sme_instance._process_single_file(test_file) + + assert isinstance(data, pd.DataFrame) + assert "sme" in data.columns + assert len(data) == 6 + + def test_read_with_no_data(self, sme_instance): + start_time = datetime(2020, 1, 1) + end_time = datetime(2020, 1, 10) + + with warnings.catch_warnings(record=True) as w: + df = sme_instance.read(start_time, end_time, download=False) + + assert "SuperMAG_SME_20200110.csv not found" in str(w[-1].message) + assert isinstance(df, pd.DataFrame) + assert len(df) == 9 * 24 * 60 + 1 + assert all(df["sme"].isna()) + assert all(df["file_name"].isnull()) + + def test_read_invalid_time_range(self, sme_instance): + start_time = datetime(2020, 12, 31) + end_time = datetime(2020, 1, 1) + + with pytest.raises(AssertionError, match="Start time must be before end time!"): + sme_instance.read(start_time, end_time) + + def test_read_with_existing_data(self, sme_instance): + sample_data = pd.DataFrame( + {"sme": range(1441)}, index=pd.date_range(start="2020-01-01", end="2020-01-02", freq="min") + ) + sample_data.index.name = "timestamp" + + file_path = MOCK_DATA_PATH / "SuperMAG_SME_20200101.csv" + sample_data.to_csv(file_path, index=True) + + start_time = datetime(2020, 1, 1, 12) + end_time = datetime(2020, 1, 1, 18) + + data = sme_instance.read(start_time, end_time) + + assert isinstance(data, pd.DataFrame) + assert len(data) > 0 + assert all(col in data.columns for col in ["sme"]) \ No newline at end of file From b55675823cba5474450578ba8b6dccc6d41e3edb Mon Sep 17 00:00:00 2001 From: Dmitrii Gurev Date: Thu, 26 Feb 2026 14:06:10 +0100 Subject: [PATCH 02/11] Add SME to README --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 106f3e5..c27704e 100755 --- a/README.md +++ b/README.md @@ -71,6 +71,11 @@ This package provides tools to read, process, and analyze several key solar and - SWPC: `F107SWPC` - Combined: `read_f107_from_multiple_models` +- **SME Index**: + The SME (SuperMAG Electrojet) index measures auroral electrojet strength based on SuperMAG ground magnetometers. + - **Sources & Classes:** + - SuperMAG: `SMESuperMAG` + - **Solar Wind Parameters**: Access to solar wind data (speed, density, magnetic field components) from various spacecraft. Essential for solar-terrestrial interaction studies. - **Sources & Classes:** From 84ae6e4d5832e299e04dbc48eec566c6629e7d36 Mon Sep 17 00:00:00 2001 From: Dmitrii Gurev Date: Thu, 26 Feb 2026 14:07:59 +0100 Subject: [PATCH 03/11] Add new line to README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c27704e..e38111a 100755 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ This package provides tools to read, process, and analyze several key solar and - SWPC: `F107SWPC` - Combined: `read_f107_from_multiple_models` -- **SME Index**: +- **SME Index**: The SME (SuperMAG Electrojet) index measures auroral electrojet strength based on SuperMAG ground magnetometers. - **Sources & Classes:** - SuperMAG: `SMESuperMAG` From 90975a9c3ae841be7f31e404308c1cc978b68f07 Mon Sep 17 00:00:00 2001 From: Dmitrii Gurev Date: Thu, 26 Feb 2026 14:19:50 +0100 Subject: [PATCH 04/11] Add SME to init --- swvo/io/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/swvo/io/__init__.py b/swvo/io/__init__.py index c4f7d37..cfd7060 100755 --- a/swvo/io/__init__.py +++ b/swvo/io/__init__.py @@ -11,4 +11,5 @@ plasmasphere as plasmasphere, RBMDataSet as RBMDataSet, solar_wind as solar_wind, + sme as sme, ) From d7e7ccee8905917e289f7ea3dc5b5637bc25130b Mon Sep 17 00:00:00 2001 From: Dmitrii Gurev Date: Thu, 26 Feb 2026 14:20:18 +0100 Subject: [PATCH 05/11] Update a length assertion in SME tests --- tests/io/sme/test_sme_supermag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/io/sme/test_sme_supermag.py b/tests/io/sme/test_sme_supermag.py index 7e3020e..485b55c 100644 --- a/tests/io/sme/test_sme_supermag.py +++ b/tests/io/sme/test_sme_supermag.py @@ -73,7 +73,7 @@ def test_download_and_process(self, sme_instance): expected_files = list(MOCK_DATA_PATH.glob("**/SuperMAG_SME_*.csv")) print(expected_files) - assert 1 <= len(expected_files) & len(expected_files) <= 2 + assert len(expected_files) == 2 data = pd.read_csv(expected_files[0]) assert "sme" in data.columns From b4b8f76331b46d2c917b9c4fd064b7f3fc9f3f38 Mon Sep 17 00:00:00 2001 From: Dmitrii Gurev Date: Thu, 26 Feb 2026 14:20:38 +0100 Subject: [PATCH 06/11] Add SuperMAG account requirement to README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e38111a..7a2d30d 100755 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -