From be3a928fe9cabd5dea6869895f5814571bc7dc0d Mon Sep 17 00:00:00 2001
From: Liudeng Zhang <zhangliudeng@gmail.com>
Date: Thu, 14 May 2026 17:44:50 -0500
Subject: [PATCH] fix: handle compound ontology ids in translate()

translate() resolved each ontology id with a single
`.filter(ontology_id=...).one()` call. CellxGene records multi-ethnic
donors with comma-joined ids (e.g. "HANCESTRO:0005,HANCESTRO:0008"),
which match no single term, so `.one()` raised ObjectDoesNotExist and
aborted the whole run.

Add a `_lookup_name` helper that splits comma-joined ids, resolves each
part, and rejoins the names. It uses `.one_or_none()` so a genuinely
unknown id degrades to None instead of raising. All four branches of
translate() now go through the helper; return types are unchanged.

Adds tests/test_translate.py covering single, compound, unknown, and
mixed ids with a mock registry (no ontology DB needed).

Refs jkobject/scPRINT#49
---
 scdataloader/utils.py   | 34 +++++++++++++++++---
 tests/test_translate.py | 69 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_translate.py

diff --git a/scdataloader/utils.py b/scdataloader/utils.py
index 6a86339..52d5309 100644
--- a/scdataloader/utils.py
+++ b/scdataloader/utils.py
@@ -740,6 +740,32 @@ def length_normalize(adata: AnnData, gene_lengths: list) -> AnnData:
     return adata
 
 
+def _lookup_name(obj, ontology_id):
+    """Resolve an ontology term's name from its ontology id.
+
+    Handles comma-joined compound ids (e.g. CellxGene's
+    ``self_reported_ethnicity_ontology_term_id`` for multi-ethnic donors) by
+    resolving each part and rejoining the names. Returns None if any part is
+    absent from the database, so callers can skip unmappable terms instead of
+    raising.
+
+    Args:
+        obj: the bionty registry to look the id up in (e.g. bt.Ethnicity).
+        ontology_id (str): a single ontology id, or several joined by commas.
+
+    Returns:
+        str or None: the resolved name(s), comma-joined for compound ids, or
+            None if any part could not be found.
+    """
+    names = []
+    for part in str(ontology_id).split(","):
+        record = obj.filter(ontology_id=part.strip()).one_or_none()
+        if record is None:
+            return None
+        names.append(record.name)
+    return ",".join(names)
+
+
 def translate(
     val: Union[str, list, set, Counter, dict], t: str = "cell_type_ontology_term_id"
 ) -> dict:
@@ -776,11 +802,11 @@ def translate(
     else:
         return None
     if type(val) is str:
-        return {val: obj.filter(ontology_id=val).one().name}
+        return {val: _lookup_name(obj, val)}
     elif type(val) is dict or type(val) is Counter:
-        return {obj.filter(ontology_id=k).one().name: v for k, v in val.items()}
+        return {_lookup_name(obj, k): v for k, v in val.items()}
     elif type(val) is set:
-        return {i: obj.filter(ontology_id=i).one().name for i in val}
+        return {i: _lookup_name(obj, i) for i in val}
     else:
-        rl = {i: obj.filter(ontology_id=i).one().name for i in set(val)}
+        rl = {i: _lookup_name(obj, i) for i in set(val)}
         return [rl.get(i, None) for i in val]
diff --git a/tests/test_translate.py b/tests/test_translate.py
new file mode 100644
index 0000000..a997d0e
--- /dev/null
+++ b/tests/test_translate.py
@@ -0,0 +1,69 @@
+"""Unit tests for ``scdataloader.utils.translate`` and its ``_lookup_name`` helper.
+
+These use a fake bionty registry so they exercise the id-splitting / name-joining
+logic without needing a populated ontology database.
+"""
+
+import bionty as bt
+
+from scdataloader.utils import _lookup_name, translate
+
+
+class _Record:
+    def __init__(self, name):
+        self.name = name
+
+
+class _Filtered:
+    def __init__(self, record):
+        self._record = record
+
+    def one_or_none(self):
+        return self._record
+
+
+class _FakeRegistry:
+    """Mimics a bionty registry: ``.filter(ontology_id=...).one_or_none()``."""
+
+    _DB = {"HANCESTRO:0005": "European", "HANCESTRO:0008": "African"}
+
+    def filter(self, ontology_id):
+        name = self._DB.get(ontology_id)
+        return _Filtered(_Record(name) if name is not None else None)
+
+
+def test_lookup_name_single_id():
+    assert _lookup_name(_FakeRegistry(), "HANCESTRO:0005") == "European"
+
+
+def test_lookup_name_compound_id():
+    # CellxGene records multi-ethnic donors as comma-joined ids; this used to
+    # raise because the whole string was looked up as a single term.
+    assert (
+        _lookup_name(_FakeRegistry(), "HANCESTRO:0005,HANCESTRO:0008")
+        == "European,African"
+    )
+
+
+def test_lookup_name_unknown_id_returns_none():
+    assert _lookup_name(_FakeRegistry(), "HANCESTRO:9999") is None
+
+
+def test_lookup_name_compound_with_unknown_part_returns_none():
+    assert _lookup_name(_FakeRegistry(), "HANCESTRO:0005,HANCESTRO:9999") is None
+
+
+def test_lookup_name_tolerates_whitespace():
+    assert (
+        _lookup_name(_FakeRegistry(), "HANCESTRO:0005, HANCESTRO:0008")
+        == "European,African"
+    )
+
+
+def test_translate_list_with_compound_id(monkeypatch):
+    monkeypatch.setattr(bt, "Ethnicity", _FakeRegistry())
+    out = translate(
+        ["HANCESTRO:0005", "HANCESTRO:0005,HANCESTRO:0008"],
+        "self_reported_ethnicity_ontology_term_id",
+    )
+    assert out == ["European", "European,African"]