Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 30 additions & 4 deletions scdataloader/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,32 @@ def length_normalize(adata: AnnData, gene_lengths: list) -> AnnData:
return adata


def _lookup_name(obj, ontology_id):
"""Resolve an ontology term's name from its ontology id.

Handles comma-joined compound ids (e.g. CellxGene's
``self_reported_ethnicity_ontology_term_id`` for multi-ethnic donors) by
resolving each part and rejoining the names. Returns None if any part is
absent from the database, so callers can skip unmappable terms instead of
raising.

Args:
obj: the bionty registry to look the id up in (e.g. bt.Ethnicity).
ontology_id (str): a single ontology id, or several joined by commas.

Returns:
str or None: the resolved name(s), comma-joined for compound ids, or
None if any part could not be found.
"""
names = []
for part in str(ontology_id).split(","):
record = obj.filter(ontology_id=part.strip()).one_or_none()
if record is None:
return None
names.append(record.name)
return ",".join(names)


def translate(
val: Union[str, list, set, Counter, dict], t: str = "cell_type_ontology_term_id"
) -> dict:
Expand Down Expand Up @@ -776,11 +802,11 @@ def translate(
else:
return None
if type(val) is str:
return {val: obj.filter(ontology_id=val).one().name}
return {val: _lookup_name(obj, val)}
elif type(val) is dict or type(val) is Counter:
return {obj.filter(ontology_id=k).one().name: v for k, v in val.items()}
return {_lookup_name(obj, k): v for k, v in val.items()}
elif type(val) is set:
return {i: obj.filter(ontology_id=i).one().name for i in val}
return {i: _lookup_name(obj, i) for i in val}
else:
rl = {i: obj.filter(ontology_id=i).one().name for i in set(val)}
rl = {i: _lookup_name(obj, i) for i in set(val)}
return [rl.get(i, None) for i in val]
69 changes: 69 additions & 0 deletions tests/test_translate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""Unit tests for ``scdataloader.utils.translate`` and its ``_lookup_name`` helper.

These use a fake bionty registry so they exercise the id-splitting / name-joining
logic without needing a populated ontology database.
"""

import bionty as bt

from scdataloader.utils import _lookup_name, translate


class _Record:
def __init__(self, name):
self.name = name


class _Filtered:
def __init__(self, record):
self._record = record

def one_or_none(self):
return self._record


class _FakeRegistry:
"""Mimics a bionty registry: ``.filter(ontology_id=...).one_or_none()``."""

_DB = {"HANCESTRO:0005": "European", "HANCESTRO:0008": "African"}

def filter(self, ontology_id):
name = self._DB.get(ontology_id)
return _Filtered(_Record(name) if name is not None else None)


def test_lookup_name_single_id():
assert _lookup_name(_FakeRegistry(), "HANCESTRO:0005") == "European"


def test_lookup_name_compound_id():
# CellxGene records multi-ethnic donors as comma-joined ids; this used to
# raise because the whole string was looked up as a single term.
assert (
_lookup_name(_FakeRegistry(), "HANCESTRO:0005,HANCESTRO:0008")
== "European,African"
)


def test_lookup_name_unknown_id_returns_none():
assert _lookup_name(_FakeRegistry(), "HANCESTRO:9999") is None


def test_lookup_name_compound_with_unknown_part_returns_none():
assert _lookup_name(_FakeRegistry(), "HANCESTRO:0005,HANCESTRO:9999") is None


def test_lookup_name_tolerates_whitespace():
assert (
_lookup_name(_FakeRegistry(), "HANCESTRO:0005, HANCESTRO:0008")
== "European,African"
)


def test_translate_list_with_compound_id(monkeypatch):
monkeypatch.setattr(bt, "Ethnicity", _FakeRegistry())
out = translate(
["HANCESTRO:0005", "HANCESTRO:0005,HANCESTRO:0008"],
"self_reported_ethnicity_ontology_term_id",
)
assert out == ["European", "European,African"]