Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 49 additions & 64 deletions .github/workflows/prepare_test_data.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,82 +2,67 @@ name: Prepare test data

on:
schedule:
- cron: "0 0 1 * *" # run once a month to prevent artifact expiration
- cron: "0 0 1 */2 *" # 00:00 UTC on the 1st of every other month; the longest gap between runs is 62 days, so a fresh data artifact is uploaded before the previous one (retention-days: 64) expires
workflow_dispatch:
# Uncomment and adjust the branch name if you need to add new datasets to the artifact.
# It needs to be a branch in the spatialdata-io origin repository, not from a fork.
# push:
# branches:
# - main
inputs:
force_all:
description: "Download all registered datasets. Set to false to use dataset_keys."
required: true
type: boolean
default: true
dataset_keys:
description: "Dataset keys to download when force_all is false. Separate keys with spaces or commas."
required: false
type: string
default: ""
force_redownload:
description: "Redownload and replace existing selected datasets."
required: true
type: boolean
default: false
push:
branches:
- main
paths:
- ".github/workflows/prepare_test_data.yaml"
- "scripts/test_data_downloader/**"

jobs:
prepare-data:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v4
- name: Checkout repository
uses: actions/checkout@v6

- name: Download test datasets
run: |
mkdir -p ./data
cd ./data

# -------
# the Xenium datasets are licensed as CC BY 4.0, as shown here
# https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/resources/xenium-example-data

# 10x Genomics Xenium 2.0.0
curl -O https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_human_Breast_2fov/Xenium_V1_human_Breast_2fov_outs.zip
curl -O https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_human_Lung_2fov/Xenium_V1_human_Lung_2fov_outs.zip

# 10x Genomics Xenium 3.0.0 (5K) Mouse ileum, multimodal cell segmentation
# this file seems to be corrupted; skipping it for now
# curl -O https://cf.10xgenomics.com/samples/xenium/3.0.0/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny.zip

# 10x Genomics Xenium 3.0.0 (5K) Mouse ileum, nuclear expansion
curl -O https://cf.10xgenomics.com/samples/xenium/3.0.0/Xenium_Prime_Mouse_Ileum_tiny/Xenium_Prime_Mouse_Ileum_tiny_outs.zip

# 10x Genomics Xenium 4.0.0 (v1) Human ovary, nuclear expansion
curl -O https://cf.10xgenomics.com/samples/xenium/4.0.0/Xenium_V1_Human_Ovary_tiny/Xenium_V1_Human_Ovary_tiny_outs.zip

# 10x Genomics Xenium 4.0.0 (v1) Human ovary, multimodal cell segmentation
curl -O https://cf.10xgenomics.com/samples/xenium/4.0.0/Xenium_V1_MultiCellSeg_Human_Ovary_tiny/Xenium_V1_MultiCellSeg_Human_Ovary_tiny_outs.zip

# 10x Genomics Xenium 4.0.0 (v1+Protein) Human kidney, multimodal cell segmentation
curl -O https://cf.10xgenomics.com/samples/xenium/4.0.0/Xenium_V1_Protein_Human_Kidney_tiny/Xenium_V1_Protein_Human_Kidney_tiny_outs.zip

# -------
# the Visium HD dataset is licensed as CC BY 4.0, as shown here
# https://www.10xgenomics.com/support/software/space-ranger/latest/resources/visium-hd-example-data

# 10x Genomics Visium HD 4.0.1 3' Mouse Brain Chunk
curl -O https://cf.10xgenomics.com/samples/spatial-exp/4.0.1/Visium_HD_Tiny_3prime_Dataset/Visium_HD_Tiny_3prime_Dataset_outs.zip

# -------
# we received written permission to make the following dataset public and integrate it in the CI system of spatialdata-io
# Spatial Genomics seqFISH v2
curl -O https://s3.embl.de/spatialdata/raw_data/seqfish-2-test-dataset.zip

# -------
# MACSima OMAP datasets are licensed as CC BY 4.0
# OMAP23 for format v1.x.x
curl -o OMAP23_small.zip "https://zenodo.org/api/records/18196452/files-archive"

# OMAP10 for format v0.x.x
curl -o OMAP10_small.zip "https://zenodo.org/api/records/18196366/files-archive"
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: "3.13"

- name: Unzip files
- name: Download test datasets
run: |
cd ./data
for file in *.zip; do
dir="${file%.zip}"
mkdir -p "$dir"
unzip "$file" -d "$dir"
rm "$file"
done
# Build the downloader arguments. Scheduled and push runs download every
# registered dataset; a manual run (workflow_dispatch) can narrow the selection.
#
# The dispatch inputs are read from the event payload with jq instead of being
# pasted in through workflow expressions: expression text is substituted into
# this script before bash parses it, so a quote, backtick or $(...) typed into
# dataset_keys would otherwise be executed as shell code.
args=(--output ./data)
if [[ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ]]; then
  force_all="$(jq -r '.inputs.force_all' "${GITHUB_EVENT_PATH}")"
  force_redownload="$(jq -r '.inputs.force_redownload' "${GITHUB_EVENT_PATH}")"
  dataset_keys="$(jq -r '.inputs.dataset_keys // empty' "${GITHUB_EVENT_PATH}")"

  if [[ "${force_all}" == "false" ]]; then
    # Keys may be separated by commas and/or whitespace. `read -a` splits the
    # words without pathname expansion, so a key such as `*` stays literal.
    selected_keys=()
    read -r -a selected_keys <<< "${dataset_keys//,/ }"
    # Checked after splitting so that an input made only of separators is
    # rejected instead of silently downloading every dataset.
    if (( ${#selected_keys[@]} == 0 )); then
      echo "::error::dataset_keys must be provided when force_all is false."
      exit 1
    fi
    for dataset_key in "${selected_keys[@]}"; do
      args+=(--dataset "${dataset_key}")
    done
  fi

  if [[ "${force_redownload}" == "true" ]]; then
    args+=(--force)
  fi
fi
python scripts/test_data_downloader "${args[@]}"

- name: Upload artifacts
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v7
with:
name: data
path: ./data
if-no-files-found: error
retention-days: 64
4 changes: 2 additions & 2 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ jobs:
runs-on: ubuntu-latest
if: startsWith(github.ref, 'refs/tags/v')
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v6
- name: Set up Python 3.13
uses: actions/setup-python@v5
uses: actions/setup-python@v6
with:
python-version: "3.13"
cache: pip
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ jobs:
PYTHON: ${{ matrix.python }}

steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v6
- name: Set up Python ${{ matrix.python }}
uses: actions/setup-python@v5
uses: actions/setup-python@v6
with:
python-version: ${{ matrix.python }}

Expand All @@ -37,7 +37,7 @@ jobs:
run: |
  # The `::set-output` workflow command is deprecated; step outputs are
  # written as `name=value` lines to the file named by $GITHUB_OUTPUT.
  # The output is still called `dir`, which the cache step reads.
  echo "dir=$(pip cache dir)" >> "$GITHUB_OUTPUT"
- name: Restore pip cache
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ${{ steps.pip-cache-dir.outputs.dir }}
key: pip-${{ runner.os }}-${{ env.pythonLocation }}-${{ hashFiles('**/pyproject.toml') }}
Expand Down
32 changes: 32 additions & 0 deletions docs/contributing.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,38 @@ If the `download.py` and `to_zarr.py` scripts require Python imports for package

We encourage testing the reader function and any helper function.

Tests are split by scope:

- Unit tests live in `tests/unit/` and should not require downloaded test data.
- Integration tests live in `tests/integration/` and cover reader workflows, CLI commands, file I/O, and zarr roundtrips.
- Integration tests that require external datasets use dataset keys from `scripts/test_data_downloader/datasets.toml`.
They resolve data under `SPATIALDATA_IO_TEST_DATA_DIR` when set, otherwise `data/` in the repository root. If the required dataset is unavailable, the test should skip with a clear message.
- Reader tests are marked by reader name. When modifying one reader, use `pytest -m <reader>` to run the tests
specific to that reader, including shared parametrized checks for that reader.

Useful local commands:

```bash
pytest tests/unit
pytest tests/integration
pytest -m "integration and data"
pytest -m xenium
pytest -m "xenium and data"
pytest -m "xenium and not slow"
pytest -m "xenium and cli"
python scripts/test_data_downloader --group xenium
SPATIALDATA_IO_TEST_DATA_DIR=/path/to/data pytest -m data
```

To download the same optional datasets used by CI, run:

```bash
python scripts/test_data_downloader
```

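By default, the downloader skips datasets that already exist. Use `--list` to show the available keys, `--dataset <key>` to select a dataset (repeat the flag to select several), `--group <name>` to select every dataset in a group (for example `--group xenium`), and `--force` to redownload the selected datasets.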
The dataset registry lives in `scripts/test_data_downloader/datasets.toml`; append new entries there when adding or updating test datasets.

### Testing multiple versions

When multiple versions of the raw data format are present, we encourage testing the reader on all of them to ensure backward compatibility. This task is greatly simplified if small test datasets are used for the CI tests. If this is not available, we suggest running the tests locally on multiple versions of the data before the PR is ready for review.
Expand Down
23 changes: 23 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,29 @@ testpaths = ["tests"]
xfail_strict = true
addopts = [
"--import-mode=importlib", # allow using test files with same name
"--strict-markers",
]
# Every marker used in the test suite must be registered here: `--strict-markers`
# in `addopts` turns an unregistered marker into an error.
markers = [
# Scope markers: what kind of test it is and what it needs in order to run.
"unit: fast isolated tests that do not require external datasets",
"integration: multi-component tests, file I/O tests, or reader workflow tests",
"data: tests that require optional downloaded test datasets",
"slow: tests with comparatively high runtime",
"cli: command-line interface tests",
# Reader markers: one per reader, so `pytest -m <reader>` selects the tests of a single reader.
"codex: tests for the codex reader",
"cosmx: tests for the cosmx reader",
"curio: tests for the curio reader",
"dbit: tests for the dbit reader",
"generic: tests for the generic reader module",
"iss: tests for the iss reader",
"macsima: tests for the macsima reader",
"mcmicro: tests for the mcmicro reader",
"merscope: tests for the merscope reader",
"seqfish: tests for the seqfish reader",
"steinbock: tests for the steinbock reader",
"stereoseq: tests for the stereoseq reader",
"visium: tests for the visium reader",
"visium_hd: tests for the visium_hd reader",
"xenium: tests for the xenium reader",
]

[tool.ruff]
Expand Down
8 changes: 8 additions & 0 deletions scripts/test_data_downloader/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
"""Command-line entrypoint for the optional test data downloader."""

from __future__ import annotations

# This file is run as `python scripts/test_data_downloader`. Python then puts
# the directory itself first on sys.path, which is why `downloader` is imported
# by its bare name (presumably the sibling downloader.py; not visible here)
# instead of through a relative or package-qualified import.
from downloader import main

# Only start the downloader when executed as a script, not when imported.
if __name__ == "__main__":
    main()
106 changes: 106 additions & 0 deletions scripts/test_data_downloader/datasets.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# -------
# the Xenium datasets are licensed as CC BY 4.0, as shown here
# https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/resources/xenium-example-data

# 10x Genomics Xenium 2.0.0, human breast
[[datasets]]
key = "xenium_breast"
group = "xenium"
url = "https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_human_Breast_2fov/Xenium_V1_human_Breast_2fov_outs.zip"
archive_name = "Xenium_V1_human_Breast_2fov_outs.zip"
extracted_dir = "Xenium_V1_human_Breast_2fov_outs"
source = "10x Genomics Xenium 2.0.0, CC BY 4.0"

# 10x Genomics Xenium 2.0.0, human lung
[[datasets]]
key = "xenium_lung"
group = "xenium"
url = "https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_human_Lung_2fov/Xenium_V1_human_Lung_2fov_outs.zip"
archive_name = "Xenium_V1_human_Lung_2fov_outs.zip"
extracted_dir = "Xenium_V1_human_Lung_2fov_outs"
source = "10x Genomics Xenium 2.0.0, CC BY 4.0"

# 10x Genomics Xenium 3.0.0 (5K) Mouse ileum, multimodal cell segmentation
# this file seems to be corrupted; skipping it for now
# https://cf.10xgenomics.com/samples/xenium/3.0.0/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny/Xenium_Prime_MultiCellSeg_Mouse_Ileum_tiny.zip

# 10x Genomics Xenium 3.0.0 (5K) Mouse ileum, nuclear expansion
[[datasets]]
key = "xenium_prime_mouse_ileum"
group = "xenium"
url = "https://cf.10xgenomics.com/samples/xenium/3.0.0/Xenium_Prime_Mouse_Ileum_tiny/Xenium_Prime_Mouse_Ileum_tiny_outs.zip"
archive_name = "Xenium_Prime_Mouse_Ileum_tiny_outs.zip"
extracted_dir = "Xenium_Prime_Mouse_Ileum_tiny_outs"
source = "10x Genomics Xenium 3.0.0, CC BY 4.0"

# 10x Genomics Xenium 4.0.0 (v1) Human ovary, nuclear expansion
[[datasets]]
key = "xenium_ovary"
group = "xenium"
url = "https://cf.10xgenomics.com/samples/xenium/4.0.0/Xenium_V1_Human_Ovary_tiny/Xenium_V1_Human_Ovary_tiny_outs.zip"
archive_name = "Xenium_V1_Human_Ovary_tiny_outs.zip"
extracted_dir = "Xenium_V1_Human_Ovary_tiny_outs"
source = "10x Genomics Xenium 4.0.0, CC BY 4.0"

# 10x Genomics Xenium 4.0.0 (v1) Human ovary, multimodal cell segmentation
[[datasets]]
key = "xenium_multicell_ovary"
group = "xenium"
url = "https://cf.10xgenomics.com/samples/xenium/4.0.0/Xenium_V1_MultiCellSeg_Human_Ovary_tiny/Xenium_V1_MultiCellSeg_Human_Ovary_tiny_outs.zip"
archive_name = "Xenium_V1_MultiCellSeg_Human_Ovary_tiny_outs.zip"
extracted_dir = "Xenium_V1_MultiCellSeg_Human_Ovary_tiny_outs"
source = "10x Genomics Xenium 4.0.0, CC BY 4.0"

# 10x Genomics Xenium 4.0.0 (v1+Protein) Human kidney, multimodal cell segmentation
[[datasets]]
key = "xenium_protein_kidney"
group = "xenium"
url = "https://cf.10xgenomics.com/samples/xenium/4.0.0/Xenium_V1_Protein_Human_Kidney_tiny/Xenium_V1_Protein_Human_Kidney_tiny_outs.zip"
archive_name = "Xenium_V1_Protein_Human_Kidney_tiny_outs.zip"
extracted_dir = "Xenium_V1_Protein_Human_Kidney_tiny_outs"
source = "10x Genomics Xenium 4.0.0, CC BY 4.0"

# -------
# the Visium HD dataset is licensed as CC BY 4.0, as shown here
# https://www.10xgenomics.com/support/software/space-ranger/latest/resources/visium-hd-example-data

# 10x Genomics Visium HD 4.0.1 3' Mouse Brain Chunk
[[datasets]]
key = "visium_hd_tiny"
group = "visium_hd"
url = "https://cf.10xgenomics.com/samples/spatial-exp/4.0.1/Visium_HD_Tiny_3prime_Dataset/Visium_HD_Tiny_3prime_Dataset_outs.zip"
archive_name = "Visium_HD_Tiny_3prime_Dataset_outs.zip"
extracted_dir = "Visium_HD_Tiny_3prime_Dataset_outs"
source = "10x Genomics Visium HD 4.0.1, CC BY 4.0"

# -------
# we received written permission to make the following dataset public and integrate it in the CI system of spatialdata-io
# Spatial Genomics seqFISH v2
[[datasets]]
key = "seqfish"
group = "seqfish"
url = "https://s3.embl.de/spatialdata/raw_data/seqfish-2-test-dataset.zip"
archive_name = "seqfish-2-test-dataset.zip"
extracted_dir = "seqfish-2-test-dataset"
source = "Spatial Genomics seqFISH v2, public test data"
test_path = "instrument 2 official"

# -------
# MACSima OMAP datasets are licensed as CC BY 4.0
# OMAP23 for format v1.x.x
[[datasets]]
key = "macsima_omap23"
group = "macsima"
url = "https://zenodo.org/api/records/18196452/files-archive"
archive_name = "OMAP23_small.zip"
extracted_dir = "OMAP23_small"
source = "MACSima OMAP23, CC BY 4.0"

# OMAP10 for format v0.x.x
[[datasets]]
key = "macsima_omap10"
group = "macsima"
url = "https://zenodo.org/api/records/18196366/files-archive"
archive_name = "OMAP10_small.zip"
extracted_dir = "OMAP10_small"
source = "MACSima OMAP10, CC BY 4.0"
Loading
Loading