From be3a928fe9cabd5dea6869895f5814571bc7dc0d Mon Sep 17 00:00:00 2001 From: Liudeng Zhang Date: Thu, 14 May 2026 17:44:50 -0500 Subject: [PATCH] fix: handle compound ontology ids in translate() translate() resolved each ontology id with a single `.filter(ontology_id=...).one()` call. CellxGene records multi-ethnic donors with comma-joined ids (e.g. "HANCESTRO:0005,HANCESTRO:0008"), which match no single term, so `.one()` raised ObjectDoesNotExist and aborted the whole run. Add a `_lookup_name` helper that splits comma-joined ids, resolves each part, and rejoins the names. It uses `.one_or_none()` so a genuinely unknown id degrades to None instead of raising. All four branches of translate() now go through the helper; return types are unchanged. Adds tests/test_translate.py covering single, compound, unknown, and mixed ids with a mock registry (no ontology DB needed). Refs jkobject/scPRINT#49 --- scdataloader/utils.py | 34 +++++++++++++++++--- tests/test_translate.py | 69 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 4 deletions(-) create mode 100644 tests/test_translate.py diff --git a/scdataloader/utils.py b/scdataloader/utils.py index 6a86339..52d5309 100644 --- a/scdataloader/utils.py +++ b/scdataloader/utils.py @@ -740,6 +740,32 @@ def length_normalize(adata: AnnData, gene_lengths: list) -> AnnData: return adata +def _lookup_name(obj, ontology_id): + """Resolve an ontology term's name from its ontology id. + + Handles comma-joined compound ids (e.g. CellxGene's + ``self_reported_ethnicity_ontology_term_id`` for multi-ethnic donors) by + resolving each part and rejoining the names. Returns None if any part is + absent from the database, so callers can skip unmappable terms instead of + raising. + + Args: + obj: the bionty registry to look the id up in (e.g. bt.Ethnicity). + ontology_id (str): a single ontology id, or several joined by commas. + + Returns: + str or None: the resolved name(s), comma-joined for compound ids, or + None if any part could not be found. + """ + names = [] + for part in str(ontology_id).split(","): + record = obj.filter(ontology_id=part.strip()).one_or_none() + if record is None: + return None + names.append(record.name) + return ",".join(names) + + def translate( val: Union[str, list, set, Counter, dict], t: str = "cell_type_ontology_term_id" ) -> dict: @@ -776,11 +802,11 @@ def translate( else: return None if type(val) is str: - return {val: obj.filter(ontology_id=val).one().name} + return {val: _lookup_name(obj, val)} elif type(val) is dict or type(val) is Counter: - return {obj.filter(ontology_id=k).one().name: v for k, v in val.items()} + return {_lookup_name(obj, k): v for k, v in val.items()} elif type(val) is set: - return {i: obj.filter(ontology_id=i).one().name for i in val} + return {i: _lookup_name(obj, i) for i in val} else: - rl = {i: obj.filter(ontology_id=i).one().name for i in set(val)} + rl = {i: _lookup_name(obj, i) for i in set(val)} return [rl.get(i, None) for i in val] diff --git a/tests/test_translate.py b/tests/test_translate.py new file mode 100644 index 0000000..a997d0e --- /dev/null +++ b/tests/test_translate.py @@ -0,0 +1,69 @@ +"""Unit tests for ``scdataloader.utils.translate`` and its ``_lookup_name`` helper. + +These use a fake bionty registry so they exercise the id-splitting / name-joining +logic without needing a populated ontology database. +""" + +import bionty as bt + +from scdataloader.utils import _lookup_name, translate + + +class _Record: + def __init__(self, name): + self.name = name + + +class _Filtered: + def __init__(self, record): + self._record = record + + def one_or_none(self): + return self._record + + +class _FakeRegistry: + """Mimics a bionty registry: ``.filter(ontology_id=...).one_or_none()``.""" + + _DB = {"HANCESTRO:0005": "European", "HANCESTRO:0008": "African"} + + def filter(self, ontology_id): + name = self._DB.get(ontology_id) + return _Filtered(_Record(name) if name is not None else None) + + +def test_lookup_name_single_id(): + assert _lookup_name(_FakeRegistry(), "HANCESTRO:0005") == "European" + + +def test_lookup_name_compound_id(): + # CellxGene records multi-ethnic donors as comma-joined ids; this used to + # raise because the whole string was looked up as a single term. + assert ( + _lookup_name(_FakeRegistry(), "HANCESTRO:0005,HANCESTRO:0008") + == "European,African" + ) + + +def test_lookup_name_unknown_id_returns_none(): + assert _lookup_name(_FakeRegistry(), "HANCESTRO:9999") is None + + +def test_lookup_name_compound_with_unknown_part_returns_none(): + assert _lookup_name(_FakeRegistry(), "HANCESTRO:0005,HANCESTRO:9999") is None + + +def test_lookup_name_tolerates_whitespace(): + assert ( + _lookup_name(_FakeRegistry(), "HANCESTRO:0005, HANCESTRO:0008") + == "European,African" + ) + + +def test_translate_list_with_compound_id(monkeypatch): + monkeypatch.setattr(bt, "Ethnicity", _FakeRegistry()) + out = translate( + ["HANCESTRO:0005", "HANCESTRO:0005,HANCESTRO:0008"], + "self_reported_ethnicity_ontology_term_id", + ) + assert out == ["European", "European,African"]