scverse · Darlokt · May 15, 2026 · May 16, 2026 · May 16, 2026 · May 16, 2026
diff --git a/.github/workflows/prepare_test_data.yaml b/.github/workflows/prepare_test_data.yaml
@@ -2,82 +2,67 @@ name: Prepare test data
 
 on:
     schedule:
-        - cron: "0 0 1 * *" # run once a month to prevent artifact expiration
+        - cron: "0 0 1 */2 *" # 00:00 UTC on the 1st of Jan, Mar, May, Jul, Sep and Nov; the longest gap (62 days) stays under the artifact's 64-day retention
     workflow_dispatch:
-    # Uncomment and adjust the branch name if you need to add new datasets to the artifact.
-    # It needs to be a branch in the spatialdata-io origin repository, not from a fork.
-#    push:
-#        branches:
-#            - main
+        inputs:
+            force_all:
+                description: "Download all registered datasets. Set to false to use dataset_keys."
+                required: true
+                type: boolean
+                default: true
+            dataset_keys:
+                description: "Dataset keys to download when force_all is false. Separate keys with spaces or commas."
+                required: false
+                type: string
+                default: ""
+            force_redownload:
+                description: "Redownload and replace existing selected datasets."
+                required: true
+                type: boolean
+                default: false
+    push: # rebuild the artifact whenever this workflow or the downloader (script or dataset registry) changes on main
+        branches:
+            - main
+        paths:
+            - ".github/workflows/prepare_test_data.yaml"
+            - "scripts/test_data_downloader/**"
 
 jobs:
     prepare-data:
         runs-on: ubuntu-latest
 
         steps:
-            - uses: actions/checkout@v4
+            - name: Checkout repository
+              uses: actions/checkout@v6
 
-            - name: Download test datasets
-              run: |
-                  mkdir -p ./data
-                  cd ./data
-
-                  # -------
-                  # the Xenium datasets are licensed as CC BY 4.0, as shown here
-                  # https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/resources/xenium-example-data
-
-                  # 10x Genomics Xenium 2.0.0
-                  curl -O https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_human_Breast_2fov/Xenium_V1_human_Breast_2fov_outs.zip
-                  curl -O https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_human_Lung_2fov/Xenium_V1_human_Lung_2fov_outs.zip
-
-                  # 10x Genomics Xenium 3.0.0 (5K) Mouse ileum, multimodal cell segmentation
-                  # this file seems to be corrupted; skipping it for now
-                  # curl -O https://cf.10xgenomics.com/samples/xenium/3.0.0/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny.zip
-
-                  # 10x Genomics Xenium 3.0.0 (5K) Mouse ileum, nuclear expansion
-                  curl -O https://cf.10xgenomics.com/samples/xenium/3.0.0/Xenium_Prime_Mouse_Ileum_tiny/Xenium_Prime_Mouse_Ileum_tiny_outs.zip
-
-                  # 10x Genomics Xenium 4.0.0 (v1) Human ovary, nuclear expansion
-                  curl -O https://cf.10xgenomics.com/samples/xenium/4.0.0/Xenium_V1_Human_Ovary_tiny/Xenium_V1_Human_Ovary_tiny_outs.zip
-
-                  # 10x Genomics Xenium 4.0.0 (v1) Human ovary, multimodal cell segmentation
-                  curl -O https://cf.10xgenomics.com/samples/xenium/4.0.0/Xenium_V1_MultiCellSeg_Human_Ovary_tiny/Xenium_V1_MultiCellSeg_Human_Ovary_tiny_outs.zip
-
-                  # 10x Genomics Xenium 4.0.0 (v1+Protein) Human kidney, multimodal cell segmentation
-                  curl -O https://cf.10xgenomics.com/samples/xenium/4.0.0/Xenium_V1_Protein_Human_Kidney_tiny/Xenium_V1_Protein_Human_Kidney_tiny_outs.zip
-
-                  # -------
-                  # the Visium HD dataset is licensed as CC BY 4.0, as shown here
-                  # https://www.10xgenomics.com/support/software/space-ranger/latest/resources/visium-hd-example-data
-
-                  # 10x Genomics Visium HD 4.0.1 3' Mouse Brain Chunk
-                  curl -O https://cf.10xgenomics.com/samples/spatial-exp/4.0.1/Visium_HD_Tiny_3prime_Dataset/Visium_HD_Tiny_3prime_Dataset_outs.zip
-
-                  # -------
-                  # we received written permission to make the following dataset public and integrate it in the CI system of spatialdata-io
-                  # Spatial Genomics seqFISH v2
-                  curl -O https://s3.embl.de/spatialdata/raw_data/seqfish-2-test-dataset.zip
-
-                  # -------
-                  # MACSima OMAP datasets are licensed as CC BY 4.0
-                  # OMAP23 for format v1.x.x
-                  curl -o OMAP23_small.zip "https://zenodo.org/api/records/18196452/files-archive"
-
-                  # OMAP10 for format v0.x.x
-                  curl -o OMAP10_small.zip "https://zenodo.org/api/records/18196366/files-archive"
+            - name: Set up Python
+              uses: actions/setup-python@v6
+              with:
+                  python-version: "3.13"
 
-            - name: Unzip files
+            - name: Download test datasets
+              env: # hand inputs to bash as data; inlining the expressions would let input text run as shell code
+                  FORCE_ALL: ${{ inputs.force_all }}
+                  DATASET_KEYS: ${{ inputs.dataset_keys }}
+                  FORCE_REDOWNLOAD: ${{ inputs.force_redownload }}
               run: |
-                  cd ./data
-                  for file in *.zip; do
-                      dir="${file%.zip}"
-                      mkdir -p "$dir"
-                      unzip "$file" -d "$dir"
-                      rm "$file"
-                  done
+                  args=(--output ./data)
+                  if [[ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" && "${FORCE_ALL}" == "false" ]]; then
+                      if [[ -z "${DATASET_KEYS}" ]]; then
+                          echo "::error::dataset_keys must be provided when force_all is false."
+                          exit 1
+                      fi
+                      for dataset_key in ${DATASET_KEYS//,/ }; do args+=(--dataset "${dataset_key}"); done
+                  fi
+                  if [[ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" && "${FORCE_REDOWNLOAD}" == "true" ]]; then
+                      args+=(--force)
+                  fi
+                  python scripts/test_data_downloader "${args[@]}"
 
             - name: Upload artifacts
-              uses: actions/upload-artifact@v4
+              uses: actions/upload-artifact@v7
               with:
                   name: data
                   path: ./data
+                  if-no-files-found: error # fail the job rather than succeed without uploading anything
+                  retention-days: 64 # must outlast the longest gap between scheduled runs (Jul 1 -> Sep 1, 62 days)
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
@@ -9,9 +9,9 @@ jobs:
         runs-on: ubuntu-latest
         if: startsWith(github.ref, 'refs/tags/v')
         steps:
-            - uses: actions/checkout@v4
+            - uses: actions/checkout@v6
-            - name: Set up Python 3.12
-              uses: actions/setup-python@v5
+            - name: Set up Python 3.13
+              uses: actions/setup-python@v6
               with:
                   python-version: "3.13"
                   cache: pip

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -26,9 +26,9 @@ jobs:
             PYTHON: ${{ matrix.python }}
 
         steps:
-            - uses: actions/checkout@v4
+            - uses: actions/checkout@v6
             - name: Set up Python ${{ matrix.python }}
-              uses: actions/setup-python@v5
+              uses: actions/setup-python@v6
               with:
                   python-version: ${{ matrix.python }}
 
@@ -37,7 +37,7 @@ jobs:
               run: |
-                  echo "::set-output name=dir::$(pip cache dir)"
+                  echo "dir=$(pip cache dir)" >> "$GITHUB_OUTPUT" # replaces the deprecated set-output workflow command
             - name: Restore pip cache
-              uses: actions/cache@v4
+              uses: actions/cache@v5
               with:
                   path: ${{ steps.pip-cache-dir.outputs.dir }}
                   key: pip-${{ runner.os }}-${{ env.pythonLocation }}-${{ hashFiles('**/pyproject.toml') }}

diff --git a/docs/contributing.md b/docs/contributing.md
@@ -123,6 +123,38 @@ If the `download.py` and `to_zarr.py` scripts require Python imports for package
 
 We encourage testing the reader function and any helper function.
 
+Tests are split by scope:
+
+- Unit tests live in `tests/unit/` and should not require downloaded test data.
+- Integration tests live in `tests/integration/` and cover reader workflows, CLI commands, file I/O, and zarr roundtrips.
+- Integration tests that require external datasets use dataset keys from `scripts/test_data_downloader/datasets.toml`.
+  They resolve data under `SPATIALDATA_IO_TEST_DATA_DIR` when set, otherwise `data/` in the repository root. If the required dataset is unavailable, the test should skip with a clear message.
+- Reader tests are marked by reader name. When modifying one reader, use `pytest -m <reader>` to run the tests
+  specific to that reader, including shared parametrized checks for that reader.
+
+Useful local commands:
+
+```bash
+pytest tests/unit
+pytest tests/integration
+pytest -m "integration and data"
+pytest -m xenium
+pytest -m "xenium and data"
+pytest -m "xenium and not slow"
+pytest -m "xenium and cli"
+python scripts/test_data_downloader --group xenium
+SPATIALDATA_IO_TEST_DATA_DIR=/path/to/data pytest -m data
+```
+
+To download the same optional datasets used by CI, run:
+
+```bash
+python scripts/test_data_downloader
+```
+
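+By default, the downloader skips datasets that already exist. Use `--force` to redownload the selected datasets, `--dataset <key>` to select a dataset by key (repeat the flag to select several), `--group <name>` to select every dataset in a group (e.g. `xenium`), and `--list` to show the available keys.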
+The dataset registry lives in `scripts/test_data_downloader/datasets.toml`; append new entries there when adding or updating test datasets.
+
 ### Testing multiple versions
 
 When multiple versions of the raw data format are present, we encourage testing the reader on all of them to ensure backward compatibility. This task is greatly simplified if small test datasets are used for the CI tests. If this is not available, we suggest running the tests locally on multiple versions of the data before the PR is ready for review.

diff --git a/pyproject.toml b/pyproject.toml
@@ -82,6 +82,29 @@ testpaths = ["tests"]
 xfail_strict = true
 addopts = [
     "--import-mode=importlib", # allow using test files with same name
+    "--strict-markers", # error out on any marker that is not registered in `markers` below
+]
+markers = [
+    "unit: fast isolated tests that do not require external datasets",
+    "integration: multi-component tests, file I/O tests, or reader workflow tests",
+    "data: tests that require optional downloaded test datasets",
+    "slow: tests with comparatively high runtime",
+    "cli: command-line interface tests",
+    "codex: tests for the codex reader",
+    "cosmx: tests for the cosmx reader",
+    "curio: tests for the curio reader",
+    "dbit: tests for the dbit reader",
+    "generic: tests for the generic reader module",
+    "iss: tests for the iss reader",
+    "macsima: tests for the macsima reader",
+    "mcmicro: tests for the mcmicro reader",
+    "merscope: tests for the merscope reader",
+    "seqfish: tests for the seqfish reader",
+    "steinbock: tests for the steinbock reader",
+    "stereoseq: tests for the stereoseq reader",
+    "visium: tests for the visium reader",
+    "visium_hd: tests for the visium_hd reader",
+    "xenium: tests for the xenium reader",
 ]
 
 [tool.ruff]

diff --git a/scripts/test_data_downloader/__main__.py b/scripts/test_data_downloader/__main__.py
@@ -0,0 +1,8 @@
+"""Command-line entrypoint for the optional test data downloader."""
+
+from __future__ import annotations
+
+from downloader import main  # NOTE(review): presumably the sibling downloader.py, importable because `python scripts/test_data_downloader` puts this directory on sys.path; confirm
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/test_data_downloader/datasets.toml b/scripts/test_data_downloader/datasets.toml
@@ -0,0 +1,106 @@
+# -------
+# the Xenium datasets are licensed as CC BY 4.0, as shown here
+# https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/resources/xenium-example-data
+
+# 10x Genomics Xenium 2.0.0 (v1) Human breast, 2fov
+[[datasets]]
+key = "xenium_breast"
+group = "xenium"
+url = "https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_human_Breast_2fov/Xenium_V1_human_Breast_2fov_outs.zip"
+archive_name = "Xenium_V1_human_Breast_2fov_outs.zip"
+extracted_dir = "Xenium_V1_human_Breast_2fov_outs"
+source = "10x Genomics Xenium 2.0.0, CC BY 4.0"
+
+# 10x Genomics Xenium 2.0.0 (v1) Human lung, 2fov
+[[datasets]]
+key = "xenium_lung"
+group = "xenium"
+url = "https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_human_Lung_2fov/Xenium_V1_human_Lung_2fov_outs.zip"
+archive_name = "Xenium_V1_human_Lung_2fov_outs.zip"
+extracted_dir = "Xenium_V1_human_Lung_2fov_outs"
+source = "10x Genomics Xenium 2.0.0, CC BY 4.0"
+
+# 10x Genomics Xenium 3.0.0 (5K) Mouse ileum, multimodal cell segmentation
+# this file seems to be corrupted; skipping it for now
+# https://cf.10xgenomics.com/samples/xenium/3.0.0/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny.zip
+
+# 10x Genomics Xenium 3.0.0 (5K) Mouse ileum, nuclear expansion
+[[datasets]]
+key = "xenium_prime_mouse_ileum"
+group = "xenium"
+url = "https://cf.10xgenomics.com/samples/xenium/3.0.0/Xenium_Prime_Mouse_Ileum_tiny/Xenium_Prime_Mouse_Ileum_tiny_outs.zip"
+archive_name = "Xenium_Prime_Mouse_Ileum_tiny_outs.zip"
+extracted_dir = "Xenium_Prime_Mouse_Ileum_tiny_outs"
+source = "10x Genomics Xenium 3.0.0, CC BY 4.0"
+
+# 10x Genomics Xenium 4.0.0 (v1) Human ovary, nuclear expansion
+[[datasets]]
+key = "xenium_ovary"
+group = "xenium"
+url = "https://cf.10xgenomics.com/samples/xenium/4.0.0/Xenium_V1_Human_Ovary_tiny/Xenium_V1_Human_Ovary_tiny_outs.zip"
+archive_name = "Xenium_V1_Human_Ovary_tiny_outs.zip"
+extracted_dir = "Xenium_V1_Human_Ovary_tiny_outs"
+source = "10x Genomics Xenium 4.0.0, CC BY 4.0"
+
+# 10x Genomics Xenium 4.0.0 (v1) Human ovary, multimodal cell segmentation
+[[datasets]]
+key = "xenium_multicell_ovary"
+group = "xenium"
+url = "https://cf.10xgenomics.com/samples/xenium/4.0.0/Xenium_V1_MultiCellSeg_Human_Ovary_tiny/Xenium_V1_MultiCellSeg_Human_Ovary_tiny_outs.zip"
+archive_name = "Xenium_V1_MultiCellSeg_Human_Ovary_tiny_outs.zip"
+extracted_dir = "Xenium_V1_MultiCellSeg_Human_Ovary_tiny_outs"
+source = "10x Genomics Xenium 4.0.0, CC BY 4.0"
+
+# 10x Genomics Xenium 4.0.0 (v1+Protein) Human kidney, multimodal cell segmentation
+[[datasets]]
+key = "xenium_protein_kidney"
+group = "xenium"
+url = "https://cf.10xgenomics.com/samples/xenium/4.0.0/Xenium_V1_Protein_Human_Kidney_tiny/Xenium_V1_Protein_Human_Kidney_tiny_outs.zip"
+archive_name = "Xenium_V1_Protein_Human_Kidney_tiny_outs.zip"
+extracted_dir = "Xenium_V1_Protein_Human_Kidney_tiny_outs"
+source = "10x Genomics Xenium 4.0.0, CC BY 4.0"
+
+# -------
+# the Visium HD dataset is licensed as CC BY 4.0, as shown here
+# https://www.10xgenomics.com/support/software/space-ranger/latest/resources/visium-hd-example-data
+
+# 10x Genomics Visium HD 4.0.1 3' Mouse Brain Chunk
+[[datasets]]
+key = "visium_hd_tiny"
+group = "visium_hd"
+url = "https://cf.10xgenomics.com/samples/spatial-exp/4.0.1/Visium_HD_Tiny_3prime_Dataset/Visium_HD_Tiny_3prime_Dataset_outs.zip"
+archive_name = "Visium_HD_Tiny_3prime_Dataset_outs.zip"
+extracted_dir = "Visium_HD_Tiny_3prime_Dataset_outs"
+source = "10x Genomics Visium HD 4.0.1, CC BY 4.0"
+
+# -------
+# we received written permission to make the following dataset public and integrate it in the CI system of spatialdata-io
+# Spatial Genomics seqFISH v2
+[[datasets]]
+key = "seqfish"
+group = "seqfish"
+url = "https://s3.embl.de/spatialdata/raw_data/seqfish-2-test-dataset.zip"
+archive_name = "seqfish-2-test-dataset.zip"
+extracted_dir = "seqfish-2-test-dataset"
+source = "Spatial Genomics seqFISH v2, public test data"
+test_path = "instrument 2 official"
+
+# -------
+# MACSima OMAP datasets are licensed as CC BY 4.0
+# OMAP23 for format v1.x.x
+[[datasets]]
+key = "macsima_omap23"
+group = "macsima"
+url = "https://zenodo.org/api/records/18196452/files-archive"
+archive_name = "OMAP23_small.zip"
+extracted_dir = "OMAP23_small"
+source = "MACSima OMAP23, CC BY 4.0"
+
+# OMAP10 for format v0.x.x
+[[datasets]]
+key = "macsima_omap10"
+group = "macsima"
+url = "https://zenodo.org/api/records/18196366/files-archive"
+archive_name = "OMAP10_small.zip"
+extracted_dir = "OMAP10_small"
+source = "MACSima OMAP10, CC BY 4.0"