From 3f36a833de57a2e22ed51f8fc88ecbc6b39ab3b2 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Sat, 20 Jun 2026 21:24:28 -0700 Subject: [PATCH 01/10] saving them as proper objects --- lib/src/rdswrapper.cpp | 214 +++++++++++++++++++++++++- src/rds2py/PyRdsReader.py | 7 + src/rds2py/read_frame.py | 6 +- src/rds2py/save_compressed_list.py | 39 ++++- src/rds2py/save_delayed_matrix.py | 6 +- src/rds2py/save_factor.py | 9 +- src/rds2py/save_frame.py | 28 +++- src/rds2py/save_granges.py | 76 ++++++++-- src/rds2py/save_mae.py | 26 +++- src/rds2py/save_matrix.py | 40 ++++- src/rds2py/save_rle.py | 16 +- src/rds2py/save_sce.py | 157 +++++++++++++++++-- src/rds2py/save_se.py | 61 ++++++-- tests/test_clists.py | 14 +- tests/test_delayedmatrices.py | 12 +- tests/test_factors.py | 4 +- tests/test_mae.py | 13 +- tests/test_roundtrip_r.py | 234 +++++++++++++++++++++++++++++ tests/test_save_rds_complex.py | 53 ++++--- 19 files changed, 908 insertions(+), 107 deletions(-) create mode 100644 tests/test_roundtrip_r.py diff --git a/lib/src/rdswrapper.cpp b/lib/src/rdswrapper.cpp index 28131d9..74c93a0 100644 --- a/lib/src/rdswrapper.cpp +++ b/lib/src/rdswrapper.cpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace py = pybind11; @@ -28,6 +29,7 @@ class RdsReader { case rds2cpp::SEXPType::LGL: return "boolean"; case rds2cpp::SEXPType::VEC: return "vector"; case rds2cpp::SEXPType::NIL: return "null"; + case rds2cpp::SEXPType::SYM: return "symbol"; default: return "other"; } } @@ -139,6 +141,17 @@ class RdsReader { return {static_cast(dims[0]), static_cast(dims[1])}; } + std::string get_symbol_name() const { + if (!ptr || ptr->type() != rds2cpp::SEXPType::SYM) { + throw std::runtime_error("Not a symbol object"); + } + const auto* sym = static_cast(ptr); + if (sym->index >= symbols_ptr->size()) { + throw std::runtime_error("Symbol index out of range"); + } + return (*symbols_ptr)[sym->index].name; + } + private: std::string resolve_symbol(const rds2cpp::SymbolIndex& 
sym) const { if (sym.index >= symbols_ptr->size()) { @@ -321,18 +334,210 @@ std::unique_ptr py_to_robject(const py::object& obj, std::vect throw std::runtime_error("Unsupported numpy dtype for RDS writing"); } - // dict -> GenericVector with names attribute + // dict if (py::isinstance(obj)) { auto d = obj.cast(); - auto gvec = std::make_unique(); + // If it's a structured R object dictionary: + if (d.contains("type")) { + std::string rtype = d["type"].cast(); + + if (rtype == "S4") { + auto s4 = std::make_unique(); + s4->class_name = d["class_name"].cast(); + s4->package_name = d["package_name"].cast(); + if (d.contains("attributes") && !d["attributes"].is_none()) { + auto attrs = d["attributes"].cast(); + for (auto& item : attrs) { + auto name_str = item.first.cast(); + auto name_sym = rds2cpp::register_symbol(name_str, rds2cpp::StringEncoding::UTF8, symbols); + py::object val_py = py::reinterpret_borrow(item.second); + std::unique_ptr val_obj; + if (val_py.is_none()) { + val_obj = std::make_unique( + rds2cpp::register_symbol("\001NULL\001", rds2cpp::StringEncoding::UTF8, symbols) + ); + } else if (py::isinstance(val_py) && val_py.cast().contains("type") && py::isinstance(val_py.cast()["type"]) && val_py.cast()["type"].cast() == "null") { + val_obj = std::make_unique( + rds2cpp::register_symbol("\001NULL\001", rds2cpp::StringEncoding::UTF8, symbols) + ); + } else { + val_obj = py_to_robject(val_py, symbols); + } + s4->attributes.emplace_back(name_sym, std::move(val_obj)); + } + } + return s4; + } + + if (rtype == "integer") { + auto vec = std::make_unique(); + if (d.contains("data") && !d["data"].is_none()) { + auto data_obj = d["data"]; + if (py::isinstance(data_obj)) { + auto arr = data_obj.cast>(); + auto r = arr.unchecked<1>(); + vec->data.reserve(r.shape(0)); + for (ssize_t i = 0; i < r.shape(0); ++i) vec->data.push_back(r(i)); + } else { + auto seq = data_obj.cast(); + vec->data.reserve(py::len(seq)); + for (size_t i = 0; i < py::len(seq); ++i) { + if 
(seq[i].is_none()) { + vec->data.push_back(-2147483648); + } else { + vec->data.push_back(seq[i].cast()); + } + } + } + } + if (d.contains("attributes") && !d["attributes"].is_none()) { + auto attrs = d["attributes"].cast(); + for (auto& item : attrs) { + auto name_str = item.first.cast(); + auto name_sym = rds2cpp::register_symbol(name_str, rds2cpp::StringEncoding::UTF8, symbols); + auto val_obj = py_to_robject(py::reinterpret_borrow(item.second), symbols); + vec->attributes.emplace_back(name_sym, std::move(val_obj)); + } + } + return vec; + } + + if (rtype == "double" || rtype == "numeric") { + auto vec = std::make_unique(); + if (d.contains("data") && !d["data"].is_none()) { + auto data_obj = d["data"]; + if (py::isinstance(data_obj)) { + auto arr = data_obj.cast>(); + auto r = arr.unchecked<1>(); + vec->data.reserve(r.shape(0)); + for (ssize_t i = 0; i < r.shape(0); ++i) vec->data.push_back(r(i)); + } else { + auto seq = data_obj.cast(); + vec->data.reserve(py::len(seq)); + for (size_t i = 0; i < py::len(seq); ++i) { + if (seq[i].is_none()) { + vec->data.push_back(std::numeric_limits::quiet_NaN()); + } else { + vec->data.push_back(seq[i].cast()); + } + } + } + } + if (d.contains("attributes") && !d["attributes"].is_none()) { + auto attrs = d["attributes"].cast(); + for (auto& item : attrs) { + auto name_str = item.first.cast(); + auto name_sym = rds2cpp::register_symbol(name_str, rds2cpp::StringEncoding::UTF8, symbols); + auto val_obj = py_to_robject(py::reinterpret_borrow(item.second), symbols); + vec->attributes.emplace_back(name_sym, std::move(val_obj)); + } + } + return vec; + } + + if (rtype == "boolean" || rtype == "logical") { + auto vec = std::make_unique(); + if (d.contains("data") && !d["data"].is_none()) { + auto data_obj = d["data"]; + if (py::isinstance(data_obj)) { + auto arr = data_obj.cast>(); + auto r = arr.unchecked<1>(); + vec->data.reserve(r.shape(0)); + for (ssize_t i = 0; i < r.shape(0); ++i) vec->data.push_back(r(i) ? 
1 : 0); + } else { + auto seq = data_obj.cast(); + vec->data.reserve(py::len(seq)); + for (size_t i = 0; i < py::len(seq); ++i) { + if (seq[i].is_none()) { + vec->data.push_back(-2147483648); + } else { + vec->data.push_back(seq[i].cast() ? 1 : 0); + } + } + } + } + if (d.contains("attributes") && !d["attributes"].is_none()) { + auto attrs = d["attributes"].cast(); + for (auto& item : attrs) { + auto name_str = item.first.cast(); + auto name_sym = rds2cpp::register_symbol(name_str, rds2cpp::StringEncoding::UTF8, symbols); + auto val_obj = py_to_robject(py::reinterpret_borrow(item.second), symbols); + vec->attributes.emplace_back(name_sym, std::move(val_obj)); + } + } + return vec; + } + + if (rtype == "string" || rtype == "character") { + auto vec = std::make_unique(); + if (d.contains("data") && !d["data"].is_none()) { + auto lst = d["data"].cast(); + vec->data.reserve(py::len(lst)); + for (size_t i = 0; i < py::len(lst); ++i) { + auto item = lst[i]; + if (item.is_none()) { + vec->data.emplace_back(); + } else { + vec->data.emplace_back(item.cast(), rds2cpp::StringEncoding::UTF8); + } + } + } + if (d.contains("attributes") && !d["attributes"].is_none()) { + auto attrs = d["attributes"].cast(); + for (auto& item : attrs) { + auto name_str = item.first.cast(); + auto name_sym = rds2cpp::register_symbol(name_str, rds2cpp::StringEncoding::UTF8, symbols); + auto val_obj = py_to_robject(py::reinterpret_borrow(item.second), symbols); + vec->attributes.emplace_back(name_sym, std::move(val_obj)); + } + } + return vec; + } + + if (rtype == "vector" || rtype == "list") { + auto vec = std::make_unique(); + if (d.contains("data") && !d["data"].is_none()) { + auto lst = d["data"].cast(); + vec->data.reserve(py::len(lst)); + for (size_t i = 0; i < py::len(lst); ++i) { + vec->data.push_back(py_to_robject(lst[i].cast(), symbols)); + } + } + if (d.contains("attributes") && !d["attributes"].is_none()) { + auto attrs = d["attributes"].cast(); + for (auto& item : attrs) { + auto 
name_str = item.first.cast(); + auto name_sym = rds2cpp::register_symbol(name_str, rds2cpp::StringEncoding::UTF8, symbols); + auto val_obj = py_to_robject(py::reinterpret_borrow(item.second), symbols); + vec->attributes.emplace_back(name_sym, std::move(val_obj)); + } + } + return vec; + } + + if (rtype == "symbol") { + std::string name_str = d["name"].cast(); + return std::make_unique( + rds2cpp::register_symbol(name_str, rds2cpp::StringEncoding::UTF8, symbols) + ); + } + + if (rtype == "null") { + return std::make_unique(); + } + + throw std::runtime_error("Unsupported type for structured RDS writing: " + rtype); + } + + // Default dictionary -> GenericVector with names attribute + auto gvec = std::make_unique(); py::list keys; for (auto& item : d) { keys.append(item.first); gvec->data.push_back(py_to_robject(py::reinterpret_borrow(item.second), symbols)); } add_names_attribute(gvec->attributes, keys, symbols); - return gvec; } @@ -447,7 +652,8 @@ PYBIND11_MODULE(lib_rds_parser, m) { .def("load_vec_element", &RdsReader::load_vec_element) .def("get_package_name", &RdsReader::get_package_name) .def("get_class_name", &RdsReader::get_class_name) - .def("get_dimensions", &RdsReader::get_dimensions); + .def("get_dimensions", &RdsReader::get_dimensions) + .def("get_symbol_name", &RdsReader::get_symbol_name); m.def("write_rds", &write_rds_file, "Write a Python object to an RDS file", py::arg("obj"), py::arg("path")); diff --git a/src/rds2py/PyRdsReader.py b/src/rds2py/PyRdsReader.py index 166719b..238ecf5 100644 --- a/src/rds2py/PyRdsReader.py +++ b/src/rds2py/PyRdsReader.py @@ -98,6 +98,13 @@ def _process_object(self, obj: RdsReader) -> Dict[str, Any]: result["data"] = self._process_vector(obj) result["attributes"] = self._process_attributes(obj) result["class_name"] = "vector" + elif rtype == "symbol": + symbol_name = obj.get_symbol_name() + if symbol_name == "\001NULL\001": + result = {"type": "null"} + else: + result["name"] = symbol_name + result["class_name"] = 
"symbol" elif rtype == "null": pass else: diff --git a/src/rds2py/read_frame.py b/src/rds2py/read_frame.py index d802f9f..ebc5f04 100644 --- a/src/rds2py/read_frame.py +++ b/src/rds2py/read_frame.py @@ -75,12 +75,12 @@ def read_dframe(robject: dict, **kwargs): data[colname] = _dispatcher(robject["attributes"]["listData"]["data"][idx], **kwargs) index = None - if robject["attributes"]["rownames"]["data"]: + if "data" in robject["attributes"]["rownames"] and robject["attributes"]["rownames"]["data"]: index = _dispatcher(robject["attributes"]["rownames"], **kwargs) nrows = None - if robject["attributes"]["nrows"]["data"]: - nrows = list(_dispatcher(robject["attributes"]["nrows"]), **kwargs)[0] + if "data" in robject["attributes"]["nrows"] and robject["attributes"]["nrows"]["data"]: + nrows = list(_dispatcher(robject["attributes"]["nrows"], **kwargs))[0] df = BiocFrame( data, diff --git a/src/rds2py/save_compressed_list.py b/src/rds2py/save_compressed_list.py index fcc1666..095b5e6 100644 --- a/src/rds2py/save_compressed_list.py +++ b/src/rds2py/save_compressed_list.py @@ -22,10 +22,34 @@ def _get(obj, name): return getattr(obj, f"get_{name}")() return getattr(obj, name, None) + class_name = type(x).__name__ + r_class_name = class_name + element_type = "ANY" + if class_name == "CompressedIntegerList": + element_type = "integer" + elif class_name == "CompressedCharacterList": + element_type = "character" + elif class_name == "CompressedBooleanList": + r_class_name = "CompressedLogicalList" + element_type = "logical" + elif class_name == "CompressedFloatList": + r_class_name = "CompressedNumericList" + element_type = "numeric" + elif class_name == "CompressedSplitBiocFrameList": + r_class_name = "CompressedSplitDFrameList" + element_type = "DFrame" + converted = { - "unlist_data": save_rds(_get(x, "unlist_data")), - "partitioning": save_rds(_get(x, "partitioning")), - "metadata": save_rds(_get(x, "metadata")), + "type": "S4", + "class_name": r_class_name, + 
"package_name": "IRanges", + "attributes": { + "unlistData": save_rds(_get(x, "unlist_data")), + "partitioning": save_rds(_get(x, "partitioning")), + "elementType": {"type": "string", "data": [element_type]}, + "elementMetadata": save_rds(_get(x, "element_metadata")), + "metadata": save_rds(_get(x, "metadata")), + }, } if path is not None: @@ -43,8 +67,13 @@ def _get(obj, name): return getattr(obj, name, None) converted = { - "ends": save_rds(_get(x, "ends")), - "names": save_rds(_get(x, "names")), + "type": "S4", + "class_name": "PartitioningByEnd", + "package_name": "IRanges", + "attributes": { + "end": save_rds(_get(x, "ends")), + "NAMES": save_rds(_get(x, "names")), + }, } if path is not None: diff --git a/src/rds2py/save_delayed_matrix.py b/src/rds2py/save_delayed_matrix.py index 9aa109c..d5ff660 100644 --- a/src/rds2py/save_delayed_matrix.py +++ b/src/rds2py/save_delayed_matrix.py @@ -49,9 +49,9 @@ def _save_rds_h5sparse_seed(x: Hdf5CompressedSparseMatrixSeed, path: Optional[st "class_name": "CSC_H5SparseMatrixSeed" if x.by_column else "CSR_H5SparseMatrixSeed", "package_name": "HDF5Array", "attributes": { - "dim": save_rds(list(x.shape)), - "filepath": save_rds([x.path]), - "group": save_rds([x.group_name]), + "dim": {"type": "integer", "data": list(x.shape)}, + "filepath": {"type": "string", "data": [x.path]}, + "group": {"type": "string", "data": [x.group_name]}, }, } diff --git a/src/rds2py/save_factor.py b/src/rds2py/save_factor.py index 4cdf428..cd0eb29 100644 --- a/src/rds2py/save_factor.py +++ b/src/rds2py/save_factor.py @@ -14,9 +14,14 @@ def _save_rds_factor(x: Factor, path: Optional[str] = None): from .lib_rds_parser import write_rds as _write_rds_native converted = { - "levels": save_rds(x.get_levels()), - "data": save_rds(x.get_codes() + 1), + "type": "integer", + "data": list(x.get_codes() + 1), + "attributes": { + "class": {"type": "string", "data": ["factor"]}, + "levels": {"type": "string", "data": list(x.get_levels())}, + }, } + if path is 
not None: _write_rds_native(converted, path) diff --git a/src/rds2py/save_frame.py b/src/rds2py/save_frame.py index 68e8979..714f0e3 100644 --- a/src/rds2py/save_frame.py +++ b/src/rds2py/save_frame.py @@ -16,9 +16,31 @@ def _save_rds_biocframe(x: BiocFrame, path: Optional[str] = None): from .lib_rds_parser import write_rds as _write_rds_native - converted = {} - for col_name in x.column_names: - converted[col_name] = save_rds(x.column(col_name)) + list_data = { + "type": "vector", + "data": [save_rds(x.column(col_name)) for col_name in x.column_names], + "attributes": {"names": {"type": "string", "data": list(x.column_names)}}, + } + + rownames = x.row_names + if rownames is not None: + rownames_data = {"type": "string", "data": list(rownames)} + else: + rownames_data = {"type": "null"} + + converted = { + "type": "S4", + "class_name": "DFrame", + "package_name": "S4Vectors", + "attributes": { + "listData": list_data, + "rownames": rownames_data, + "nrows": {"type": "integer", "data": [x.shape[0]]}, + "elementType": {"type": "string", "data": ["ANY"]}, + "elementMetadata": {"type": "null"}, + "metadata": {"type": "vector", "data": []}, + }, + } if path is not None: _write_rds_native(converted, path) diff --git a/src/rds2py/save_granges.py b/src/rds2py/save_granges.py index 660674c..811cd7b 100644 --- a/src/rds2py/save_granges.py +++ b/src/rds2py/save_granges.py @@ -22,10 +22,15 @@ def _get(obj, name): return getattr(obj, name, None) converted = { - "seqnames": save_rds(_get(x, "seqnames")), - "seqlengths": save_rds(_get(x, "seqlengths")), - "is_circular": save_rds(_get(x, "is_circular")), - "genome": save_rds(_get(x, "genome")), + "type": "S4", + "class_name": "SeqInfo", + "package_name": "GenomeInfoDb", + "attributes": { + "seqnames": save_rds(_get(x, "seqnames")), + "seqlengths": save_rds(_get(x, "seqlengths")), + "is_circular": save_rds(_get(x, "is_circular")), + "genome": save_rds(_get(x, "genome")), + }, } if path is not None: @@ -35,6 +40,8 @@ def _get(obj, 
name): @save_rds.register(GenomicRanges) def _save_rds_genomicranges(x: GenomicRanges, path: Optional[str] = None): + import numpy as np + from .lib_rds_parser import write_rds as _write_rds_native def _get(obj, name): @@ -42,13 +49,49 @@ def _get(obj, name): return getattr(obj, f"get_{name}")() return getattr(obj, name, None) + # Map strand codes: 1 -> 1 (+), -1 -> 2 (-), 0 -> 3 (*) + strand_data = _get(x, "strand") + mapped_strand_codes = np.zeros_like(strand_data, dtype=np.int32) + mapped_strand_codes[strand_data == 1] = 1 + mapped_strand_codes[strand_data == -1] = 2 + mapped_strand_codes[strand_data == 0] = 3 + + converted_strand = { + "type": "integer", + "data": list(mapped_strand_codes), + "attributes": { + "class": {"type": "string", "data": ["factor"]}, + "levels": {"type": "string", "data": ["+", "-", "*"]}, + }, + } + + # R expects seqnames to be a factor vector + seq_names = _get(x, "seqnames") + seq_info_names = list(x.seqinfo.seqnames) + seq_codes = [seq_info_names.index(name) + 1 for name in seq_names] + converted_seqnames = { + "type": "integer", + "data": seq_codes, + "attributes": { + "class": {"type": "string", "data": ["factor"]}, + "levels": {"type": "string", "data": seq_info_names}, + }, + } + converted = { - "seqnames": save_rds(_get(x, "seqnames")), - "ranges": save_rds(_get(x, "ranges")), - "strand": save_rds(_get(x, "strand")), - "seqinfo": save_rds(_get(x, "seqinfo")), - "mcols": save_rds(_get(x, "mcols")), - "metadata": save_rds(_get(x, "metadata")), + "type": "S4", + "class_name": "GRanges", + "package_name": "GenomicRanges", + "attributes": { + "seqnames": converted_seqnames, + "ranges": save_rds(_get(x, "ranges")), + "strand": converted_strand, + "seqinfo": save_rds(_get(x, "seqinfo")), + "elementMetadata": save_rds(_get(x, "mcols")), + "elementType": {"type": "string", "data": ["ANY"]}, + "metadata": save_rds(_get(x, "metadata")), + "NAMES": save_rds(_get(x, "names")), + }, } if path is not None: @@ -66,9 +109,16 @@ def _get(obj, 
name): return getattr(obj, name, None) converted = { - "unlist_data": save_rds(_get(x, "unlist_data")), - "partitioning": save_rds(_get(x, "partitioning")), - "metadata": save_rds(_get(x, "metadata")), + "type": "S4", + "class_name": "CompressedGRangesList", + "package_name": "GenomicRanges", + "attributes": { + "unlistData": save_rds(_get(x, "unlist_data")), + "partitioning": save_rds(_get(x, "partitioning")), + "elementMetadata": save_rds(_get(x, "element_metadata")), + "elementType": {"type": "string", "data": ["GRanges"]}, + "metadata": save_rds(_get(x, "metadata")), + }, } if path is not None: diff --git a/src/rds2py/save_mae.py b/src/rds2py/save_mae.py index 14fd1ae..276761e 100644 --- a/src/rds2py/save_mae.py +++ b/src/rds2py/save_mae.py @@ -21,11 +21,29 @@ def _get(obj, name): return getattr(obj, f"get_{name}")() return getattr(obj, name, None) + expts = _get(x, "experiments") + expts_list_data = { + "type": "vector", + "data": [save_rds(v) for v in expts.values()], + "attributes": {"names": {"type": "string", "data": list(expts.keys())}}, + } + expt_list_s4 = { + "type": "S4", + "class_name": "ExperimentList", + "package_name": "MultiAssayExperiment", + "attributes": {"listData": expts_list_data}, + } + converted = { - "experiments": save_rds(_get(x, "experiments")), - "col_data": save_rds(_get(x, "col_data")), - "sample_map": save_rds(_get(x, "sample_map")), - "metadata": save_rds(_get(x, "metadata")), + "type": "S4", + "class_name": "MultiAssayExperiment", + "package_name": "MultiAssayExperiment", + "attributes": { + "ExperimentList": expt_list_s4, + "colData": save_rds(_get(x, "column_data")), + "sampleMap": save_rds(_get(x, "sample_map")), + "metadata": save_rds(_get(x, "metadata")), + }, } if path is not None: diff --git a/src/rds2py/save_matrix.py b/src/rds2py/save_matrix.py index 4a76bc1..8c30c4f 100644 --- a/src/rds2py/save_matrix.py +++ b/src/rds2py/save_matrix.py @@ -3,6 +3,7 @@ from numpy import ndarray from .generics import save_rds +from 
.read_matrix import MatrixWrapper __author__ = "jkanche" __copyright__ = "jkanche" @@ -14,7 +15,42 @@ def _save_rds_ndarray(x: ndarray, path: Optional[str] = None): from .lib_rds_parser import write_rds as _write_rds_native x_flat = x.flatten(order="F") if x.ndim > 1 else x + + if x.ndim > 1: + type_str = "double" + if x.dtype.kind == "b": + type_str = "logical" + elif x.dtype.kind in "iu": + type_str = "integer" + + converted = { + "type": type_str, + "data": x_flat, + "attributes": {"dim": {"type": "integer", "data": list(x.shape)}}, + } + else: + converted = x_flat + + if path is not None: + _write_rds_native(converted, path) + + return converted + + +@save_rds.register(MatrixWrapper) +def _save_rds_matrixwrapper(x: MatrixWrapper, path: Optional[str] = None): + from .lib_rds_parser import write_rds as _write_rds_native + + converted = save_rds(x.matrix) + + if isinstance(converted, dict) and "attributes" in converted: + if x.dimnames is not None: + converted["attributes"]["dimnames"] = { + "type": "vector", + "data": [save_rds(list(names)) if names is not None else {"type": "null"} for names in x.dimnames], + } + if path is not None: - _write_rds_native(x_flat, path) + _write_rds_native(converted, path) - return x_flat + return converted diff --git a/src/rds2py/save_rle.py b/src/rds2py/save_rle.py index 8195ee7..3a7b88d 100644 --- a/src/rds2py/save_rle.py +++ b/src/rds2py/save_rle.py @@ -22,11 +22,17 @@ def _get(obj, name): return getattr(obj, name, None) converted = { - "start": save_rds(_get(x, "start")), - "width": save_rds(_get(x, "width")), - "names": save_rds(_get(x, "names")), - "mcols": save_rds(_get(x, "mcols")), - "metadata": save_rds(_get(x, "metadata")), + "type": "S4", + "class_name": "IRanges", + "package_name": "IRanges", + "attributes": { + "start": save_rds(_get(x, "start")), + "width": save_rds(_get(x, "width")), + "NAMES": save_rds(_get(x, "names")), + "elementType": {"type": "string", "data": ["ANY"]}, + "elementMetadata": save_rds(_get(x, 
"mcols")), + "metadata": save_rds(_get(x, "metadata")), + }, } if path is not None: diff --git a/src/rds2py/save_sce.py b/src/rds2py/save_sce.py index 0e28122..cfbd975 100644 --- a/src/rds2py/save_sce.py +++ b/src/rds2py/save_sce.py @@ -12,6 +12,37 @@ if is_package_installed("singlecellexperiment", verbose=True): from singlecellexperiment import SingleCellExperiment + def _get_assay_dict(x): + assays = x.get_assays() if hasattr(x, "get_assays") else getattr(x, "assays", {}) + if not assays: + return None + + assay_names = list(assays.keys()) + assay_list_data = { + "type": "vector", + "data": [save_rds(v) for v in assays.values()], + "attributes": {"names": {"type": "string", "data": assay_names}}, + } + + return { + "type": "S4", + "class_name": "SimpleAssays", + "package_name": "SummarizedExperiment", + "attributes": { + "data": { + "type": "S4", + "class_name": "SimpleList", + "package_name": "S4Vectors", + "attributes": { + "listData": assay_list_data, + "elementType": {"type": "string", "data": ["ANY"]}, + "elementMetadata": None, + "metadata": {"type": "vector", "data": []}, + }, + } + }, + } + @save_rds.register(SingleCellExperiment) def _save_rds_sce(x: SingleCellExperiment, path: Optional[str] = None): from .lib_rds_parser import write_rds as _write_rds_native @@ -21,17 +52,123 @@ def _get(obj, name): return getattr(obj, f"get_{name}")() return getattr(obj, name, None) + def _make_internal_dframe(list_data, nrows): + return { + "type": "S4", + "class_name": "DFrame", + "package_name": "S4Vectors", + "attributes": { + "listData": list_data, + "rownames": {"type": "null"}, + "nrows": {"type": "integer", "data": [nrows]}, + "elementType": {"type": "string", "data": ["ANY"]}, + "elementMetadata": {"type": "null"}, + "metadata": {"type": "vector", "data": []}, + }, + } + + int_list_data = {"type": "vector", "data": [], "attributes": {"names": {"type": "string", "data": []}}} + + reduced_dims = _get(x, "reduced_dims") + if reduced_dims is not None and 
len(reduced_dims) > 0: + red_dims_list_data = { + "type": "vector", + "data": [save_rds(v) for v in reduced_dims.values()], + "attributes": {"names": {"type": "string", "data": list(reduced_dims.keys())}}, + } + red_dims_converted = _make_internal_dframe(red_dims_list_data, x.shape[1]) + else: + red_dims_converted = _make_internal_dframe( + {"type": "vector", "data": [], "attributes": {"names": {"type": "string", "data": []}}}, x.shape[1] + ) + + int_list_data["data"].append(red_dims_converted) + int_list_data["attributes"]["names"]["data"].append("reducedDims") + + alt_exps = _get(x, "alternative_experiments") + if alt_exps is not None and len(alt_exps) > 0: + alt_exps_list_data = { + "type": "vector", + "data": [], + "attributes": {"names": {"type": "string", "data": list(alt_exps.keys())}}, + } + for k, v in alt_exps.items(): + alt_exps_list_data["data"].append( + { + "type": "S4", + "class_name": "SummarizedExperimentByColumn", + "package_name": "SingleCellExperiment", + "attributes": {"se": save_rds(v)}, + } + ) + alt_exps_converted = _make_internal_dframe(alt_exps_list_data, x.shape[1]) + else: + alt_exps_converted = _make_internal_dframe( + {"type": "vector", "data": [], "attributes": {"names": {"type": "string", "data": []}}}, x.shape[1] + ) + + int_list_data["data"].append(alt_exps_converted) + int_list_data["attributes"]["names"]["data"].append("altExps") + + col_pairs_converted = _make_internal_dframe( + {"type": "vector", "data": [], "attributes": {"names": {"type": "string", "data": []}}}, x.shape[1] + ) + int_list_data["data"].append(col_pairs_converted) + int_list_data["attributes"]["names"]["data"].append("colPairs") + + int_coldata = { + "type": "S4", + "class_name": "DFrame", + "package_name": "S4Vectors", + "attributes": { + "listData": int_list_data, + "rownames": {"type": "null"}, + "nrows": {"type": "integer", "data": [x.shape[1]]}, + "elementType": {"type": "string", "data": ["ANY"]}, + "elementMetadata": {"type": "null"}, + "metadata": 
{"type": "vector", "data": []}, + }, + } + + row_pairs_converted = _make_internal_dframe( + {"type": "vector", "data": [], "attributes": {"names": {"type": "string", "data": []}}}, x.shape[0] + ) + + int_elementMetadata_list_data = { + "type": "vector", + "data": [row_pairs_converted], + "attributes": {"names": {"type": "string", "data": ["rowPairs"]}}, + } + + int_elementMetadata = _make_internal_dframe(int_elementMetadata_list_data, x.shape[0]) + + version_obj = { + "type": "vector", + "data": [{"type": "integer", "data": [99, 99, 99]}], + "attributes": {"class": {"type": "string", "data": ["package_version", "numeric_version"]}}, + } + + int_metadata = { + "type": "vector", + "data": [version_obj], + "attributes": {"names": {"type": "string", "data": ["version"]}}, + } + converted = { - "assays": save_rds(_get(x, "assays")), - "row_data": save_rds(_get(x, "row_data")), - "column_data": save_rds(_get(x, "column_data")), - "row_ranges": save_rds(_get(x, "row_ranges")), - "metadata": save_rds(_get(x, "metadata")), - "reduced_dims": save_rds(_get(x, "reduced_dims")), - "main_experiment_name": save_rds(_get(x, "main_experiment_name")), - "alternative_experiments": save_rds(_get(x, "alternative_experiments")), - "row_pairs": save_rds(_get(x, "row_pairs")), - "column_pairs": save_rds(_get(x, "column_pairs")), + "type": "S4", + "class_name": "SingleCellExperiment", + "package_name": "SingleCellExperiment", + "attributes": { + "assays": _get_assay_dict(x), + "colData": save_rds(_get(x, "column_data")), + "elementMetadata": save_rds(_get(x, "row_data")), + "metadata": save_rds(_get(x, "metadata")), + "rowRanges": save_rds(_get(x, "row_ranges")), + "int_colData": int_coldata, + "int_elementMetadata": int_elementMetadata, + "int_metadata": int_metadata, + "NAMES": None, + }, } if path is not None: diff --git a/src/rds2py/save_se.py b/src/rds2py/save_se.py index e743f27..41a4656 100644 --- a/src/rds2py/save_se.py +++ b/src/rds2py/save_se.py @@ -12,6 +12,37 @@ if 
is_package_installed("summarizedexperiment", verbose=True): from summarizedexperiment import RangedSummarizedExperiment, SummarizedExperiment + def _get_assay_dict(x): + assays = x.get_assays() if hasattr(x, "get_assays") else getattr(x, "assays", {}) + if not assays: + return None + + assay_names = list(assays.keys()) + assay_list_data = { + "type": "vector", + "data": [save_rds(v) for v in assays.values()], + "attributes": {"names": {"type": "string", "data": assay_names}}, + } + + return { + "type": "S4", + "class_name": "SimpleAssays", + "package_name": "SummarizedExperiment", + "attributes": { + "data": { + "type": "S4", + "class_name": "SimpleList", + "package_name": "S4Vectors", + "attributes": { + "listData": assay_list_data, + "elementType": {"type": "string", "data": ["ANY"]}, + "elementMetadata": None, + "metadata": {"type": "vector", "data": []}, + }, + } + }, + } + @save_rds.register(SummarizedExperiment) def _save_rds_se(x: SummarizedExperiment, path: Optional[str] = None): from .lib_rds_parser import write_rds as _write_rds_native @@ -22,10 +53,16 @@ def _get(obj, name): return getattr(obj, name, None) converted = { - "assays": save_rds(_get(x, "assays")), - "row_data": save_rds(_get(x, "row_data")), - "column_data": save_rds(_get(x, "column_data")), - "metadata": save_rds(_get(x, "metadata")), + "type": "S4", + "class_name": "SummarizedExperiment", + "package_name": "SummarizedExperiment", + "attributes": { + "assays": _get_assay_dict(x), + "colData": save_rds(_get(x, "column_data")), + "elementMetadata": save_rds(_get(x, "row_data")), + "metadata": save_rds(_get(x, "metadata")), + "NAMES": None, + }, } if path is not None: @@ -43,11 +80,17 @@ def _get(obj, name): return getattr(obj, name, None) converted = { - "assays": save_rds(_get(x, "assays")), - "row_data": save_rds(_get(x, "row_data")), - "column_data": save_rds(_get(x, "column_data")), - "row_ranges": save_rds(_get(x, "row_ranges")), - "metadata": save_rds(_get(x, "metadata")), + "type": 
"S4", + "class_name": "RangedSummarizedExperiment", + "package_name": "SummarizedExperiment", + "attributes": { + "assays": _get_assay_dict(x), + "colData": save_rds(_get(x, "column_data")), + "elementMetadata": save_rds(_get(x, "row_data")), + "metadata": save_rds(_get(x, "metadata")), + "rowRanges": save_rds(_get(x, "row_ranges")), + "NAMES": None, + }, } if path is not None: diff --git a/tests/test_clists.py b/tests/test_clists.py index 5e15b86..78e0051 100644 --- a/tests/test_clists.py +++ b/tests/test_clists.py @@ -67,9 +67,10 @@ def test_save_compressed_lists(): res = save_rds(obj) assert isinstance(res, dict) - assert "unlist_data" in res - assert "partitioning" in res - assert "ends" in res["partitioning"] + assert res["type"] == "S4" + assert res["class_name"] == "CompressedIntegerList" + assert "unlistData" in res["attributes"] + assert "partitioning" in res["attributes"] with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as tmp: rds_path = tmp.name @@ -78,7 +79,12 @@ def test_save_compressed_lists(): from rds2py.rdsutils import parse_rds parsed = parse_rds(rds_path) - assert parsed["type"] == "vector" + assert parsed["type"] == "S4" + assert parsed["class_name"] == "CompressedIntegerList" + + recreated = read_rds(rds_path) + assert isinstance(recreated, type(obj)) + assert recreated.to_list() == obj.to_list() finally: if os.path.exists(rds_path): os.unlink(rds_path) diff --git a/tests/test_delayedmatrices.py b/tests/test_delayedmatrices.py index 456b6bd..dc463bf 100644 --- a/tests/test_delayedmatrices.py +++ b/tests/test_delayedmatrices.py @@ -59,15 +59,9 @@ def test_roundtrip_h5sparse(): result = read_rds(rds_path) assert result is not None - assert isinstance(result, dict) - assert list(result["class_name"]) == ["H5SparseMatrix"] - assert list(result["package_name"]) == ["HDF5Array"] - assert "attributes" in result - - seed = result["attributes"]["seed"] - assert list(seed["class_name"]) == ["CSC_H5SparseMatrixSeed"] - assert 
list(seed["package_name"]) == ["HDF5Array"] - assert list(seed["attributes"]["group"]) == ["obsp/connectivities"] + assert isinstance(result, Hdf5CompressedSparseMatrix) + assert result.shape == (3, 3) + assert result.group_name == "obsp/connectivities" finally: if os.path.exists(h5_path): diff --git a/tests/test_factors.py b/tests/test_factors.py index 2bd775b..34b4c42 100644 --- a/tests/test_factors.py +++ b/tests/test_factors.py @@ -29,9 +29,7 @@ def test_roundtrip_factors(): try: write_rds(factor, rds_path) result = read_rds(rds_path) - assert isinstance(result, dict) - assert list(result["levels"]) == ["A", "B"] - assert list(result["data"]) == [1, 2, 1] + assert result == ["A", "B", "A"] finally: if os.path.exists(rds_path): os.unlink(rds_path) diff --git a/tests/test_mae.py b/tests/test_mae.py index 43bb55f..1bbda0a 100644 --- a/tests/test_mae.py +++ b/tests/test_mae.py @@ -25,8 +25,10 @@ def test_save_mae(): res = save_rds(data) assert isinstance(res, dict) - assert "experiments" in res - assert "col_data" in res + assert res["type"] == "S4" + assert res["class_name"] == "MultiAssayExperiment" + assert "ExperimentList" in res["attributes"] + assert "colData" in res["attributes"] with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as tmp: rds_path = tmp.name @@ -35,7 +37,12 @@ def test_save_mae(): from rds2py.rdsutils import parse_rds parsed = parse_rds(rds_path) - assert parsed["type"] == "vector" + assert parsed["type"] == "S4" + assert parsed["class_name"] == "MultiAssayExperiment" + + recreated = read_rds(rds_path) + assert isinstance(recreated, MultiAssayExperiment) + assert len(recreated.get_experiment_names()) == 2 finally: if os.path.exists(rds_path): os.unlink(rds_path) diff --git a/tests/test_roundtrip_r.py b/tests/test_roundtrip_r.py new file mode 100644 index 0000000..0f422c8 --- /dev/null +++ b/tests/test_roundtrip_r.py @@ -0,0 +1,234 @@ +import os +import shutil +import subprocess +import tempfile + +import numpy as np +import pytest 
+from biocframe import BiocFrame +from biocutils import Factor +from genomicranges import GenomicRanges +from iranges import IRanges +from multiassayexperiment import MultiAssayExperiment +from singlecellexperiment import SingleCellExperiment +from summarizedexperiment import RangedSummarizedExperiment, SummarizedExperiment + +from rds2py import read_rds, write_rds + +r_available = shutil.which("Rscript") is not None +pytestmark = pytest.mark.skipif(not r_available, reason="Rscript not found on PATH") + + +def run_r_script(script_code: str): + with tempfile.NamedTemporaryFile(suffix=".R", mode="w", delete=False) as f: + f.write(script_code) + script_path = f.name + try: + res = subprocess.run(["Rscript", script_path], capture_output=True, text=True) + if res.returncode != 0: + print("STDOUT:", res.stdout) + print("STDERR:", res.stderr) + raise RuntimeError(f"Rscript failed with exit code {res.returncode}") + return res.stdout + finally: + os.unlink(script_path) + + +def test_roundtrip_r_summarizedexperiment(): + se = SummarizedExperiment( + assays={"counts": np.array([[1, 2], [3, 4]], dtype=np.int32)}, + row_data=BiocFrame({"gene": ["g1", "g2"]}), + column_data=BiocFrame({"cell": ["c1", "c2"]}), + ) + + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as f: + path = f.name + + try: + write_rds(se, path) + script = f""" + library(SummarizedExperiment) + obj <- readRDS("{path}") + stopifnot(is(obj, "SummarizedExperiment")) + stopifnot(all(dim(obj) == c(2, 2))) + stopifnot(assayNames(obj) == "counts") + stopifnot(all(as.matrix(assay(obj)) == matrix(c(1L, 3L, 2L, 4L), nrow=2))) + stopifnot(all(colnames(obj) == c("c1", "c2"))) + stopifnot(all(rowData(obj)$gene == c("g1", "g2"))) + """ + run_r_script(script) + finally: + if os.path.exists(path): + os.unlink(path) + + +def test_roundtrip_r_rangedsummarizedexperiment(): + rse = RangedSummarizedExperiment( + assays={"counts": np.array([[10, 20], [30, 40]], dtype=np.int32)}, + row_ranges=GenomicRanges( + 
seqnames=["chr1", "chr2"], ranges=IRanges(start=[100, 200], width=[10, 20]), strand=["+", "-"] + ), + row_data=BiocFrame({"gene": ["g1", "g2"]}), + column_data=BiocFrame({"cell": ["c1", "c2"]}), + ) + + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as f: + path = f.name + + try: + write_rds(rse, path) + script = f""" + library(SummarizedExperiment) + obj <- readRDS("{path}") + stopifnot(is(obj, "RangedSummarizedExperiment")) + stopifnot(all(dim(obj) == c(2, 2))) + rr <- rowRanges(obj) + stopifnot(is(rr, "GRanges")) + stopifnot(all(seqnames(rr) == c("chr1", "chr2"))) + stopifnot(all(start(rr) == c(100, 200))) + stopifnot(all(width(rr) == c(10, 20))) + stopifnot(all(as.character(strand(rr)) == c("+", "-"))) + """ + run_r_script(script) + finally: + if os.path.exists(path): + os.unlink(path) + + +def test_roundtrip_r_singlecellexperiment(): + sce = SingleCellExperiment( + assays={"counts": np.array([[1, 2], [3, 4]], dtype=np.int32)}, + reduced_dims={"PCA": np.array([[0.1, 0.2], [0.3, 0.4]], dtype=np.float64)}, + ) + + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as f: + path = f.name + + try: + write_rds(sce, path) + script = f""" + library(SingleCellExperiment) + obj <- readRDS("{path}") + stopifnot(is(obj, "SingleCellExperiment")) + pca <- reducedDim(obj, "PCA") + stopifnot(is.matrix(pca)) + stopifnot(all(dim(pca) == c(2, 2))) + stopifnot(all(pca == matrix(c(0.1, 0.3, 0.2, 0.4), nrow=2))) + """ + run_r_script(script) + finally: + if os.path.exists(path): + os.unlink(path) + + +def test_roundtrip_r_genomicranges(): + gr = GenomicRanges( + seqnames=["chrA", "chrB"], + ranges=IRanges(start=[10, 20], width=[5, 15]), + strand=["+", "*"], + mcols=BiocFrame({"score": [1.5, 2.5]}), + ) + + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as f: + path = f.name + + try: + write_rds(gr, path) + script = f""" + library(GenomicRanges) + obj <- readRDS("{path}") + stopifnot(is(obj, "GRanges")) + stopifnot(all(seqnames(obj) == 
c("chrA", "chrB"))) + stopifnot(all(start(obj) == c(10, 20))) + stopifnot(all(width(obj) == c(5, 15))) + stopifnot(all(as.character(strand(obj)) == c("+", "*"))) + stopifnot(all(mcols(obj)$score == c(1.5, 2.5))) + """ + run_r_script(script) + finally: + if os.path.exists(path): + os.unlink(path) + + +def test_roundtrip_r_iranges(): + ir = IRanges(start=[1, 5, 10], width=[3, 4, 5]) + + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as f: + path = f.name + + try: + write_rds(ir, path) + script = f""" + library(IRanges) + obj <- readRDS("{path}") + stopifnot(is(obj, "IRanges")) + stopifnot(all(start(obj) == c(1, 5, 10))) + stopifnot(all(width(obj) == c(3, 4, 5))) + """ + run_r_script(script) + finally: + if os.path.exists(path): + os.unlink(path) + + +def test_roundtrip_r_multiassayexperiment(): + mae = read_rds("tests/data/simple_mae.rds") + + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as f: + path = f.name + + try: + write_rds(mae, path) + script = f""" + library(MultiAssayExperiment) + obj <- readRDS("{path}") + stopifnot(is(obj, "MultiAssayExperiment")) + stopifnot(length(experiments(obj)) == 2) + stopifnot(identical(names(experiments(obj)), c("methyl 2k", "methyl 3k"))) + """ + run_r_script(script) + finally: + if os.path.exists(path): + os.unlink(path) + + +def test_roundtrip_r_factor(): + factor = Factor([0, 1, 0], levels=["X", "Y"]) + + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as f: + path = f.name + + try: + write_rds(factor, path) + script = f""" + obj <- readRDS("{path}") + stopifnot(is.factor(obj)) + stopifnot(all(levels(obj) == c("X", "Y"))) + stopifnot(all(as.integer(obj) == c(1, 2, 1))) + """ + run_r_script(script) + finally: + if os.path.exists(path): + os.unlink(path) + + +def test_roundtrip_r_biocframe(): + bf = BiocFrame({"colA": [1, 2, 3], "colB": ["foo", "bar", "baz"]}, row_names=["r1", "r2", "r3"]) + + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as f: + path = f.name + + 
try: + write_rds(bf, path) + script = f""" + library(S4Vectors) + obj <- readRDS("{path}") + stopifnot(is(obj, "DFrame")) + stopifnot(all(rownames(obj) == c("r1", "r2", "r3"))) + stopifnot(all(obj$colA == c(1, 2, 3))) + stopifnot(all(obj$colB == c("foo", "bar", "baz"))) + """ + run_r_script(script) + finally: + if os.path.exists(path): + os.unlink(path) diff --git a/tests/test_save_rds_complex.py b/tests/test_save_rds_complex.py index 8159446..d9d06ef 100644 --- a/tests/test_save_rds_complex.py +++ b/tests/test_save_rds_complex.py @@ -25,10 +25,12 @@ def test_save_rds_genomicranges(): res = save_rds(gr) assert isinstance(res, dict) - assert "seqnames" in res - assert "ranges" in res - assert "strand" in res - assert "mcols" in res + assert res["type"] == "S4" + assert res["class_name"] == "GRanges" + assert "seqnames" in res["attributes"] + assert "ranges" in res["attributes"] + assert "strand" in res["attributes"] + assert "elementMetadata" in res["attributes"] def test_save_rds_summarizedexperiment(): @@ -40,9 +42,11 @@ def test_save_rds_summarizedexperiment(): res = save_rds(se) assert isinstance(res, dict) - assert "assays" in res - assert "row_data" in res - assert "column_data" in res + assert res["type"] == "S4" + assert res["class_name"] == "SummarizedExperiment" + assert "assays" in res["attributes"] + assert "elementMetadata" in res["attributes"] + assert "colData" in res["attributes"] def test_save_rds_singlecellexperiment(): @@ -52,8 +56,10 @@ def test_save_rds_singlecellexperiment(): res = save_rds(sce) assert isinstance(res, dict) - assert "reduced_dims" in res - assert "assays" in res + assert res["type"] == "S4" + assert res["class_name"] == "SingleCellExperiment" + assert "assays" in res["attributes"] + assert "int_colData" in res["attributes"] def test_roundtrip_genomicranges(): @@ -71,13 +77,9 @@ def test_roundtrip_genomicranges(): write_rds(gr, path) result = read_rds(path) - # Complex objects are saved as GenericVectors with names, - # so they 
should be read back as dictionaries. - assert isinstance(result, dict) - assert "seqnames" in result - assert "ranges" in result - assert "strand" in result - assert "mcols" in result + assert isinstance(result, GenomicRanges) + assert list(result.seqnames) == ["chr1", "chr2"] + assert list(result.get_strand()) == [1, -1] finally: os.unlink(path) @@ -96,10 +98,9 @@ def test_roundtrip_summarizedexperiment(): write_rds(se, path) result = read_rds(path) - assert isinstance(result, dict) - assert "assays" in result - assert "row_data" in result - assert "column_data" in result + assert isinstance(result, SummarizedExperiment) + assert "counts" in result.assays + assert result.shape == (2, 2) finally: os.unlink(path) @@ -117,9 +118,11 @@ def test_save_rds_rangedsummarizedexperiment(): res = save_rds(rse) assert isinstance(res, dict) - assert "assays" in res - assert "row_ranges" in res - assert "column_data" in res + assert res["type"] == "S4" + assert res["class_name"] == "RangedSummarizedExperiment" + assert "assays" in res["attributes"] + assert "rowRanges" in res["attributes"] + assert "colData" in res["attributes"] def test_write_rds_complex(): @@ -148,8 +151,8 @@ def test_write_rds_complex(): from rds2py.rdsutils import parse_rds - assert parse_rds(rse_path)["type"] == "vector" - assert parse_rds(sce_path)["type"] == "vector" + assert parse_rds(rse_path)["type"] == "S4" + assert parse_rds(sce_path)["type"] == "S4" finally: if os.path.exists(rse_path): os.unlink(rse_path) From 5ecdd8ff8343acdce6dd0137239b807799704e2d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 21 Jun 2026 04:25:43 +0000 Subject: [PATCH 02/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_roundtrip_r.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_roundtrip_r.py b/tests/test_roundtrip_r.py index 0f422c8..bb2438d 100644 --- 
a/tests/test_roundtrip_r.py +++ b/tests/test_roundtrip_r.py @@ -9,7 +9,6 @@ from biocutils import Factor from genomicranges import GenomicRanges from iranges import IRanges -from multiassayexperiment import MultiAssayExperiment from singlecellexperiment import SingleCellExperiment from summarizedexperiment import RangedSummarizedExperiment, SummarizedExperiment From 9b45da1468fd9e0df48727c056f7ccee2df1917d Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Sat, 20 Jun 2026 21:27:26 -0700 Subject: [PATCH 03/10] update changelog --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b94647c..3503d64 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,11 @@ # Changelog -## Version 0.10.0 +## Version 0.10.0 - 0.10.1 - Added methods to write to RDS/RData files. - Supports atomic types, generic dictionaries/lists, and **BiocPy objects**. +- Read `symbols` registered in RDS objects. +- Fixed an issue with S4 classes not properly saved as RDS files. ## Version 0.9.0 - 0.9.1 From 007d0bb3e879d9615b9e620d32863b8c5587a800 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 21 Jun 2026 04:28:38 +0000 Subject: [PATCH 04/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3503d64..7588b6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,8 +4,8 @@ - Added methods to write to RDS/RData files. - Supports atomic types, generic dictionaries/lists, and **BiocPy objects**. -- Read `symbols` registered in RDS objects. -- Fixed an issue with S4 classes not properly saved as RDS files. +- Read `symbols` registered in RDS objects. +- Fixed an issue with S4 classes not properly saved as RDS files. 
## Version 0.9.0 - 0.9.1 From 66e84a2e3a93cad72bdb1d5db63a5cf99ff4312a Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Sat, 20 Jun 2026 23:25:46 -0700 Subject: [PATCH 05/10] add more tests; replace null values in compressed lists --- src/rds2py/read_sce.py | 8 +- src/rds2py/read_se.py | 25 +++--- src/rds2py/save_atomic.py | 22 +++++- src/rds2py/save_compressed_list.py | 7 +- tests/test_clists.py | 117 +++++++++++++++++++++++++++++ tests/test_delayedmatrices.py | 65 ++++++++++++++++ tests/test_dict.py | 9 +++ tests/test_factors.py | 31 ++++++++ tests/test_frames.py | 17 ++++- tests/test_granges.py | 81 +++++++++++++++++++- tests/test_mae.py | 9 +++ tests/test_matrices.py | 94 ++++++++++++++++++++++- tests/test_rle.py | 18 ++++- tests/test_sce.py | 73 +++++++++++++++++- tests/test_se.py | 76 ++++++++++++++++++- tests/test_write.py | 89 ++++++++++++++++++++++ 16 files changed, 713 insertions(+), 28 deletions(-) diff --git a/src/rds2py/read_sce.py b/src/rds2py/read_sce.py index a5addae..dd2dcba 100644 --- a/src/rds2py/read_sce.py +++ b/src/rds2py/read_sce.py @@ -62,8 +62,12 @@ def read_single_cell_experiment(robject: dict, **kwargs): idx_col = col_attrs[idx] idx_value = robject["attributes"]["int_colData"]["attributes"]["listData"]["data"][idx] - if idx_col == "reducedDims" and idx_value.get("data", None) is not None: - robj_reduced_dims = _dispatcher(idx_value, **kwargs) + if idx_col == "reducedDims" and idx_value.get("type", None) != "null": + robj_reduced_dims_frame = _dispatcher(idx_value, **kwargs) + if hasattr(robj_reduced_dims_frame, "to_dict"): + robj_reduced_dims = robj_reduced_dims_frame.to_dict() + else: + robj_reduced_dims = robj_reduced_dims_frame if idx_col == "altExps": alt_names = list(_dispatcher(idx_value["attributes"]["listData"]["attributes"]["names"], **kwargs)) diff --git a/src/rds2py/read_se.py b/src/rds2py/read_se.py index c72ac7c..00b6a29 100644 --- a/src/rds2py/read_se.py +++ b/src/rds2py/read_se.py @@ -52,21 +52,24 @@ def 
read_summarized_experiment(robject: dict, **kwargs): if _cls not in ["SummarizedExperiment"]: raise RuntimeError(f"`robject` does not contain a 'SummarizedExperiment' object, contains `{_cls}`.") + # parse assays names robj_asys = {} - assay_dims = None - asy_names = list( - _dispatcher( - robject["attributes"]["assays"]["attributes"]["data"]["attributes"]["listData"]["attributes"]["names"], - **kwargs, + assay_dims = (0, 0) + assays_node = robject["attributes"].get("assays", None) + if assays_node is not None and assays_node.get("type", None) != "null": + asy_names = list( + _dispatcher( + assays_node["attributes"]["data"]["attributes"]["listData"]["attributes"]["names"], + **kwargs, + ) ) - ) - for idx, asyname in enumerate(asy_names): - idx_asy = robject["attributes"]["assays"]["attributes"]["data"]["attributes"]["listData"]["data"][idx] + for idx, asyname in enumerate(asy_names): + idx_asy = assays_node["attributes"]["data"]["attributes"]["listData"]["data"][idx] - robj_asys[asyname] = _dispatcher(idx_asy, **kwargs) - if assay_dims is None: - assay_dims = robj_asys[asyname].shape + robj_asys[asyname] = _dispatcher(idx_asy, **kwargs) + if assay_dims == (0, 0) and hasattr(robj_asys[asyname], "shape"): + assay_dims = robj_asys[asyname].shape # parse coldata robj_coldata = _sanitize_empty_frame(_dispatcher(robject["attributes"]["colData"], **kwargs), assay_dims[1]) diff --git a/src/rds2py/save_atomic.py b/src/rds2py/save_atomic.py index b170a5e..1ecd09b 100644 --- a/src/rds2py/save_atomic.py +++ b/src/rds2py/save_atomic.py @@ -3,7 +3,7 @@ from typing import Optional import numpy as np -from biocutils import BooleanList, FloatList, IntegerList, StringList +from biocutils import BooleanList, FloatList, IntegerList, Names, StringList from .generics import save_rds @@ -30,7 +30,8 @@ def _save_rds_primitives(x, path: Optional[str] = None): def _save_rds_booleanlist(x: BooleanList, path: Optional[str] = None): from .lib_rds_parser import write_rds as _write_rds_native - 
converted = np.array(list(x), dtype=bool) + cleaned = [-2147483648 if val is None else (1 if val else 0) for val in x] + converted = np.array(cleaned, dtype=np.int32) if path is not None: _write_rds_native(converted, path) @@ -41,7 +42,8 @@ def _save_rds_booleanlist(x: BooleanList, path: Optional[str] = None): def _save_rds_integerlist(x: IntegerList, path: Optional[str] = None): from .lib_rds_parser import write_rds as _write_rds_native - converted = np.array(list(x), dtype=np.int32) + cleaned = [-2147483648 if val is None else val for val in x] + converted = np.array(cleaned, dtype=np.int32) if path is not None: _write_rds_native(converted, path) @@ -52,7 +54,8 @@ def _save_rds_integerlist(x: IntegerList, path: Optional[str] = None): def _save_rds_floatlist(x: FloatList, path: Optional[str] = None): from .lib_rds_parser import write_rds as _write_rds_native - converted = np.array(list(x), dtype=np.float64) + cleaned = [np.nan if val is None else val for val in x] + converted = np.array(cleaned, dtype=np.float64) if path is not None: _write_rds_native(converted, path) @@ -68,3 +71,14 @@ def _save_rds_stringlist(x: StringList, path: Optional[str] = None): _write_rds_native(converted, path) return converted + + +@save_rds.register(Names) +def _save_rds_names(x: Names, path: Optional[str] = None): + from .lib_rds_parser import write_rds as _write_rds_native + + converted = list(x) + if path is not None: + _write_rds_native(converted, path) + + return converted diff --git a/src/rds2py/save_compressed_list.py b/src/rds2py/save_compressed_list.py index 095b5e6..0a81527 100644 --- a/src/rds2py/save_compressed_list.py +++ b/src/rds2py/save_compressed_list.py @@ -20,7 +20,12 @@ def _save_rds_compressedlist(x: CompressedList, path: Optional[str] = None): def _get(obj, name): if hasattr(obj, f"get_{name}"): return getattr(obj, f"get_{name}")() - return getattr(obj, name, None) + val = getattr(obj, name, None) + if val is None and name == "partitioning": + if hasattr(obj, 
"get_paritioning"): + return obj.get_paritioning() + return getattr(obj, "paritioning", None) + return val class_name = type(x).__name__ r_class_name = class_name diff --git a/tests/test_clists.py b/tests/test_clists.py index 78e0051..f0d7910 100644 --- a/tests/test_clists.py +++ b/tests/test_clists.py @@ -88,3 +88,120 @@ def test_save_compressed_lists(): finally: if os.path.exists(rds_path): os.unlink(rds_path) + + +def test_compressed_lists_errors(): + import pytest + + from rds2py.read_compressed_list import ( + _get_compressed_common_attrs, + read_compressed_boolean_list, + read_compressed_character_list, + read_compressed_float_list, + read_compressed_frame_list, + read_compressed_integer_list, + read_compressed_string_list, + read_partitioning_by_end, + ) + + bad_obj = {"type": "S4", "class_name": "BadClass", "attributes": {}} + + with pytest.raises(RuntimeError): + read_partitioning_by_end(bad_obj) + + with pytest.raises(RuntimeError): + read_compressed_integer_list(bad_obj) + + with pytest.raises(RuntimeError): + read_compressed_string_list(bad_obj) + + with pytest.raises(RuntimeError): + read_compressed_boolean_list(bad_obj) + + with pytest.raises(RuntimeError): + read_compressed_float_list(bad_obj) + + with pytest.raises(RuntimeError): + read_compressed_frame_list(bad_obj) + + with pytest.raises(ValueError): + _get_compressed_common_attrs({"attributes": {}}) + + with pytest.raises(RuntimeError): + read_compressed_character_list(bad_obj) + + res = _get_compressed_common_attrs( + {"attributes": {"unlistData": {"type": "integer", "data": [1], "class_name": "integer_vector"}}} + ) + assert res[0][0] == 1 + assert res[1] is None + assert res[2] is None + assert res[3] is None + + +def test_save_all_compressed_list_classes(): + import os + import tempfile + + from rds2py import read_rds, save_rds, write_rds + + for file_name, expected_class in [ + ("tests/data/compressedlist_char.rds", clist.CompressedCharacterList), + ("tests/data/compressedlist_numeric.rds", 
clist.CompressedFloatList), + ("tests/data/compressedlist_logical.rds", clist.CompressedBooleanList), + ("tests/data/compressedlist_splitdframe.rds", clist.CompressedSplitBiocFrameList), + ]: + obj = read_rds(file_name) + res = save_rds(obj) + assert isinstance(res, dict) + assert res["type"] == "S4" + + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as tmp: + rds_path = tmp.name + try: + write_rds(obj, rds_path) + recreated = read_rds(rds_path) + assert isinstance(recreated, expected_class) + finally: + if os.path.exists(rds_path): + os.unlink(rds_path) + + obj_int = read_rds("tests/data/compressedlist_int.rds") + part = obj_int.paritioning if hasattr(obj_int, "paritioning") else obj_int.partitioning + res_part = save_rds(part) + assert isinstance(res_part, dict) + assert res_part["type"] == "S4" + + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as tmp: + rds_path = tmp.name + try: + write_rds(part, rds_path) + recreated_part = read_rds(rds_path) + assert isinstance(recreated_part, clist.Partitioning) + finally: + if os.path.exists(rds_path): + os.unlink(rds_path) + + +def test_clist_fallback_get(): + import os + import tempfile + + from rds2py import read_rds, write_rds + + obj = read_rds("tests/data/compressedlist_int.rds") + + orig_get_names = clist.Partitioning.get_names + try: + del clist.Partitioning.get_names + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as tmp: + rds_path = tmp.name + try: + write_rds(obj, rds_path) + recreated = read_rds(rds_path) + assert isinstance(recreated, clist.CompressedIntegerList) + finally: + if os.path.exists(rds_path): + os.unlink(rds_path) + finally: + clist.Partitioning.get_names = orig_get_names diff --git a/tests/test_delayedmatrices.py b/tests/test_delayedmatrices.py index dc463bf..b5e003e 100644 --- a/tests/test_delayedmatrices.py +++ b/tests/test_delayedmatrices.py @@ -68,3 +68,68 @@ def test_roundtrip_h5sparse(): os.unlink(h5_path) if os.path.exists(rds_path): 
os.unlink(rds_path) + + +def test_delayedarray_extra_branches_and_errors(): + import pytest + from delayedarray import DelayedArray + from hdf5array import Hdf5CompressedSparseMatrix + + from rds2py import save_rds + from rds2py.read_delayed_matrix import read_hdf5_sparse + + with tempfile.NamedTemporaryFile(suffix=".h5", delete=False) as tmp_h5: + h5_path = tmp_h5.name + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as tmp_rds: + rds_path = tmp_rds.name + + try: + with h5py.File(h5_path, "w") as f: + g = f.create_group("obsp/connectivities") + g.create_dataset("data", data=np.array([1, 2, 3], dtype=np.float64)) + g.create_dataset("indices", data=np.array([0, 1, 2], dtype=np.int32)) + g.create_dataset("indptr", data=np.array([0, 1, 2, 3], dtype=np.int32)) + + mat = Hdf5CompressedSparseMatrix(h5_path, "obsp/connectivities", (3, 3), True) + + res = save_rds(mat) + assert isinstance(res, dict) + assert res["type"] == "S4" + + write_rds(mat.seed, rds_path) + recreated_seed = read_rds(rds_path) + assert recreated_seed is not None + + mat.get_seed = lambda: mat.seed + try: + res_hasattr = save_rds(mat) + assert isinstance(res_hasattr, dict) + finally: + del mat.get_seed + + with pytest.raises(RuntimeError): + read_hdf5_sparse({"type": "S4", "class_name": "BadClass"}) + + csr_seed_mock = { + "type": "S4", + "class_name": "H5SparseMatrix", + "attributes": { + "seed": { + "type": "S4", + "class_name": "CSR_H5SparseMatrixSeed", + "attributes": { + "dim": {"type": "integer", "data": np.array([3, 3]), "class_name": "integer_vector"}, + "filepath": {"type": "string", "data": [h5_path], "class_name": "string_vector"}, + "group": {"type": "string", "data": ["obsp/connectivities"], "class_name": "string_vector"}, + }, + } + }, + } + res_csr = read_hdf5_sparse(csr_seed_mock) + assert isinstance(res_csr, Hdf5CompressedSparseMatrix) + + finally: + if os.path.exists(h5_path): + os.unlink(h5_path) + if os.path.exists(rds_path): + os.unlink(rds_path) diff --git 
a/tests/test_dict.py b/tests/test_dict.py index dc14321..da9e4c1 100644 --- a/tests/test_dict.py +++ b/tests/test_dict.py @@ -48,3 +48,12 @@ def test_read_atomic_lists_nested_deep_rownames(): assert obj is not None assert len(obj) > 0 + + +def test_read_dict_errors(): + import pytest + + from rds2py.read_dict import read_dict + + with pytest.raises(RuntimeError): + read_dict({"type": "vector", "class_name": "not_vector"}) diff --git a/tests/test_factors.py b/tests/test_factors.py index 34b4c42..1977d04 100644 --- a/tests/test_factors.py +++ b/tests/test_factors.py @@ -33,3 +33,34 @@ def test_roundtrip_factors(): finally: if os.path.exists(rds_path): os.unlink(rds_path) + + +def test_read_factor_errors(): + import pytest + from rds2py.read_factor import read_factor + + bad_obj = {"type": "S4", "class_name": "BadClass", "attributes": {}} + + with pytest.raises(RuntimeError): + read_factor(bad_obj) + + +def test_read_factor_lengths_and_no_levels(): + import pytest + from rds2py.read_factor import read_factor + + mock_factor_lengths = { + "type": "integer", + "class_name": "factor", + "data": [1, 2], + "attributes": { + "levels": {"type": "string", "data": ["A", "B"], "class_name": "string_vector"}, + "lengths": {"type": "integer", "data": [2, 3], "class_name": "integer_vector"}, + }, + } + res = read_factor(mock_factor_lengths) + assert res == ["A", "A", "B", "B", "B"] + + mock_factor_no_levels = {"type": "integer", "class_name": "factor", "data": [1, 2], "attributes": {}} + with pytest.raises(TypeError): + read_factor(mock_factor_no_levels) diff --git a/tests/test_frames.py b/tests/test_frames.py index fca29ee..4bcfadc 100644 --- a/tests/test_frames.py +++ b/tests/test_frames.py @@ -1,6 +1,7 @@ -from rds2py import read_rds from biocframe import BiocFrame +from rds2py import read_rds + __author__ = "jkanche" __copyright__ = "jkanche" __license__ = "MIT" @@ -20,3 +21,17 @@ def test_read_atomic_lists_nested_deep_rownames(): assert frame is not None assert 
isinstance(frame, BiocFrame) assert len(frame) > 0 + + +def test_read_frame_errors(): + import pytest + + from rds2py.read_frame import read_data_frame, read_dframe + + bad_obj = {"type": "S4", "class_name": "BadClass", "attributes": {}} + + with pytest.raises(RuntimeError): + read_data_frame({"type": "vector", "attributes": {"class": {"data": ["bad"]}}}) + + with pytest.raises(RuntimeError): + read_dframe(bad_obj) diff --git a/tests/test_granges.py b/tests/test_granges.py index 292b3ed..4a5211d 100644 --- a/tests/test_granges.py +++ b/tests/test_granges.py @@ -1,7 +1,7 @@ -from rds2py import read_rds - -from genomicranges import GenomicRanges, CompressedGenomicRangesList import numpy as np +from genomicranges import CompressedGenomicRangesList, GenomicRanges + +from rds2py import read_rds __author__ = "jkanche" __copyright__ = "jkanche" @@ -34,3 +34,78 @@ def test_granges_list(): assert isinstance(gr, CompressedGenomicRangesList) assert len(gr) == 5 + + +def test_read_granges_errors(): + import pytest + + from rds2py.read_granges import read_genomic_ranges, read_granges_list + + bad_obj = {"type": "S4", "class_name": "BadClass", "attributes": {}} + + with pytest.raises(TypeError): + read_genomic_ranges(bad_obj) + + with pytest.raises(TypeError): + read_granges_list(bad_obj) + + +def test_save_seqinfo_directly(): + import os + import tempfile + + from genomicranges import SeqInfo + + from rds2py import write_rds + + si = SeqInfo(seqnames=["chrA"], seqlengths=[100], is_circular=[False], genome=["hg38"]) + + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as tmp: + path = tmp.name + try: + write_rds(si, path) + parsed = read_rds(path) + assert parsed is not None + finally: + if os.path.exists(path): + os.unlink(path) + + +def test_granges_list_roundtrip_and_fallbacks(): + import os + import tempfile + + from genomicranges import CompressedGenomicRangesList, GenomicRanges, SeqInfo + + from rds2py import read_rds, save_rds, write_rds + + gr_list = 
read_rds("tests/data/grangeslist.rds") + + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as tmp: + path = tmp.name + try: + write_rds(gr_list, path) + recreated = read_rds(path) + assert isinstance(recreated, CompressedGenomicRangesList) + assert len(recreated) == len(gr_list) + finally: + if os.path.exists(path): + os.unlink(path) + + from compressed_lists.base import CompressedList + + orig_get_names = CompressedList.get_names + try: + del CompressedList.get_names + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as tmp: + path = tmp.name + + try: + write_rds(gr_list, path) + recreated = read_rds(path) + assert isinstance(recreated, CompressedGenomicRangesList) + finally: + if os.path.exists(path): + os.unlink(path) + finally: + CompressedList.get_names = orig_get_names diff --git a/tests/test_mae.py b/tests/test_mae.py index 1bbda0a..07cda28 100644 --- a/tests/test_mae.py +++ b/tests/test_mae.py @@ -46,3 +46,12 @@ def test_save_mae(): finally: if os.path.exists(rds_path): os.unlink(rds_path) + + +def test_mae_errors(): + import pytest + + from rds2py.read_mae import read_multi_assay_experiment + + with pytest.raises(RuntimeError): + read_multi_assay_experiment({"type": "S4", "class_name": "BadClass"}) diff --git a/tests/test_matrices.py b/tests/test_matrices.py index d26f66a..626b027 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -1,7 +1,7 @@ -from rds2py import read_rds import numpy as np from scipy import sparse as sp +from rds2py import read_rds from rds2py.read_matrix import MatrixWrapper __author__ = "jkanche" @@ -49,3 +49,95 @@ def test_read_dense_numpy_dtype(): assert isinstance(array.matrix, np.ndarray) assert array.dimnames is not None assert len(array.dimnames) == len(array.matrix.shape) + + +def test_save_matrix_integer(): + from rds2py import save_rds + + mat = np.array([[1, 2], [3, 4]], dtype=np.int32) + res = save_rds(mat) + assert res["type"] == "integer" + assert res["attributes"]["dim"]["data"] 
== [2, 2] + + +def test_save_matrix_bool(): + from rds2py import save_rds + + mat = np.array([[True, False], [False, True]], dtype=bool) + res = save_rds(mat) + assert res["type"] == "logical" + assert res["attributes"]["dim"]["data"] == [2, 2] + + +def test_save_matrix_wrapper_with_dimnames(): + import os + import tempfile + + from rds2py import save_rds, write_rds + + mat = np.array([[1.0, 2.0], [3.0, 4.0]]) + wrapper = MatrixWrapper(mat, dimnames=[["r1", "r2"], ["c1", "c2"]]) + res = save_rds(wrapper) + assert "dimnames" in res["attributes"] + assert res["attributes"]["dimnames"]["data"][0] == ["r1", "r2"] + assert res["attributes"]["dimnames"]["data"][1] == ["c1", "c2"] + + wrapper_partial = MatrixWrapper(mat, dimnames=[None, ["c1", "c2"]]) + res_partial = save_rds(wrapper_partial) + assert res_partial["attributes"]["dimnames"]["data"][0]["type"] == "null" + assert res_partial["attributes"]["dimnames"]["data"][1] == ["c1", "c2"] + + assert wrapper.shape == (2, 2) + + mat_1d = np.array([1.0, 2.0]) + wrapper_1d = MatrixWrapper(mat_1d) + res_1d = save_rds(wrapper_1d) + assert isinstance(res_1d, np.ndarray) + + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as tmp: + path = tmp.name + try: + write_rds(wrapper, path) + recreated = read_rds(path) + assert isinstance(recreated, MatrixWrapper) + finally: + if os.path.exists(path): + os.unlink(path) + + +def test_matrix_read_errors_and_dgrmatrix(): + import pytest + + from rds2py.read_matrix import _as_dense_matrix, _as_sparse_matrix, read_dgrmatrix + + with pytest.raises(RuntimeError): + _as_sparse_matrix({"type": "S4", "class_name": "BadClass"}) + + dgr_mock = { + "type": "S4", + "class_name": "dgRMatrix", + "attributes": { + "x": {"type": "double", "data": np.array([1.0, 2.0])}, + "i": {"type": "integer", "data": np.array([0, 1])}, + "p": {"type": "integer", "data": np.array([0, 1, 2])}, + "Dim": {"type": "integer", "data": np.array([2, 2])}, + }, + } + res_dgr = read_dgrmatrix(dgr_mock) + assert 
res_dgr is not None + assert res_dgr.shape == (2, 2) + + with pytest.raises(ValueError): + _as_dense_matrix({"type": "ndarray"}, order="X") + + with pytest.raises(TypeError): + _as_dense_matrix({"type": "ndarray", "class_name": "not_ndarray"}) + + dense_no_names = { + "type": "ndarray", + "class_name": "ndarray", + "data": np.array([1, 2, 3, 4], dtype=np.int32), + "attributes": {"dim": {"type": "integer", "data": np.array([2, 2])}}, + } + res_dense = _as_dense_matrix(dense_no_names) + assert isinstance(res_dense, np.ndarray) diff --git a/tests/test_rle.py b/tests/test_rle.py index da9be5d..b2c37cd 100644 --- a/tests/test_rle.py +++ b/tests/test_rle.py @@ -1,6 +1,5 @@ from rds2py import read_rds - __author__ = "jkanche" __copyright__ = "jkanche" __license__ = "MIT" @@ -13,3 +12,20 @@ def test_read_simple_rle(): assert data is not None assert len(data) == 36 + + +def test_rle_errors_and_edge_cases(): + import pytest + + from rds2py.read_rle import read_rle + + with pytest.raises(RuntimeError): + read_rle({"type": "S4", "class_name": "BadClass"}) + + mock_rle = { + "type": "S4", + "class_name": "Rle", + "attributes": {"values": {"type": "integer", "data": [1, 2], "class_name": "integer_vector"}}, + } + res = read_rle(mock_rle) + assert res == [1, 2] diff --git a/tests/test_sce.py b/tests/test_sce.py index bc16825..c58ce5d 100644 --- a/tests/test_sce.py +++ b/tests/test_sce.py @@ -1,7 +1,7 @@ -from rds2py import read_rds - from singlecellexperiment import SingleCellExperiment +from rds2py import read_rds + __author__ = "jkanche" __copyright__ = "jkanche" __license__ = "MIT" @@ -13,3 +13,72 @@ def test_read_sce(): assert data is not None assert isinstance(data, SingleCellExperiment) assert data.shape == (100, 100) + + +def test_read_sce_errors(): + import pytest + + from rds2py.read_sce import read_alts_summarized_experiment_by_column, read_single_cell_experiment + + bad_obj = {"type": "S4", "class_name": "BadClass", "attributes": {}} + + with 
pytest.raises(RuntimeError): + read_single_cell_experiment(bad_obj) + + with pytest.raises(RuntimeError): + read_alts_summarized_experiment_by_column(bad_obj) + + +def test_roundtrip_sce_complex(): + import os + import tempfile + + import numpy as np + from singlecellexperiment import SingleCellExperiment + + from rds2py import write_rds + + alt_sce = SingleCellExperiment(assays={"counts": np.array([[10, 20]], dtype=np.int32)}) + sce = SingleCellExperiment( + assays={"counts": np.array([[1, 2], [3, 4]], dtype=np.int32)}, + reduced_dims={"PCA": np.array([[0.1, 0.2], [0.3, 0.4]], dtype=np.float64)}, + alternative_experiments={"alt": alt_sce}, + ) + + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as tmp: + path = tmp.name + + try: + write_rds(sce, path) + recreated = read_rds(path) + assert isinstance(recreated, SingleCellExperiment) + assert "counts" in recreated.assays + assert "PCA" in recreated.reduced_dims + assert np.allclose(recreated.reduced_dims["PCA"], sce.reduced_dims["PCA"]) + assert "alt" in recreated.alternative_experiments + assert isinstance(recreated.alternative_experiments["alt"], SingleCellExperiment) + finally: + if os.path.exists(path): + os.unlink(path) + + +def test_sce_empty_assays(): + import os + import tempfile + + from singlecellexperiment import SingleCellExperiment + + from rds2py import write_rds + + sce = SingleCellExperiment() + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as tmp: + path = tmp.name + + try: + write_rds(sce, path) + recreated = read_rds(path) + assert isinstance(recreated, SingleCellExperiment) + assert len(recreated.assays) == 0 + finally: + if os.path.exists(path): + os.unlink(path) diff --git a/tests/test_se.py b/tests/test_se.py index 321307f..2e6d8ec 100644 --- a/tests/test_se.py +++ b/tests/test_se.py @@ -1,6 +1,6 @@ -from rds2py import read_rds +from summarizedexperiment import RangedSummarizedExperiment, SummarizedExperiment -from summarizedexperiment import SummarizedExperiment, 
RangedSummarizedExperiment +from rds2py import read_rds __author__ = "jkanche" __copyright__ = "jkanche" @@ -21,3 +21,75 @@ def test_read_ranged_summ_expt(): assert data is not None assert isinstance(data, RangedSummarizedExperiment) assert data.shape == (200, 6) + + +def test_read_se_errors(): + import pytest + + from rds2py.read_se import read_summarized_experiment + + bad_obj = {"type": "S4", "class_name": "BadClass", "attributes": {}} + + with pytest.raises(RuntimeError): + read_summarized_experiment(bad_obj) + + +def test_se_empty_assays_and_fallback(): + import os + import tempfile + + import numpy as np + from genomicranges import GenomicRanges + from iranges import IRanges + from summarizedexperiment import RangedSummarizedExperiment, SummarizedExperiment + + from rds2py import write_rds + + se = SummarizedExperiment() + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as tmp: + path = tmp.name + + try: + write_rds(se, path) + recreated = read_rds(path) + assert isinstance(recreated, SummarizedExperiment) + assert len(recreated.assays) == 0 + finally: + if os.path.exists(path): + os.unlink(path) + + # Test _get fallback/hasattr checks in save_se.py by adding get_metadata dynamically + se_with_assays = SummarizedExperiment(assays={"counts": np.ones((2, 2))}) + se_with_assays.get_metadata = lambda: se_with_assays.metadata + + try: + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as tmp: + path = tmp.name + try: + write_rds(se_with_assays, path) + recreated = read_rds(path) + assert isinstance(recreated, SummarizedExperiment) + finally: + if os.path.exists(path): + os.unlink(path) + finally: + del se_with_assays.get_metadata + + rse = RangedSummarizedExperiment( + assays={"counts": np.ones((2, 2))}, + row_ranges=GenomicRanges(seqnames=["chr1", "chr2"], ranges=IRanges(start=[1, 2], width=[10, 20])), + ) + rse.get_metadata = lambda: rse.metadata + + try: + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as tmp: + path = 
tmp.name + try: + write_rds(rse, path) + recreated = read_rds(path) + assert isinstance(recreated, RangedSummarizedExperiment) + finally: + if os.path.exists(path): + os.unlink(path) + finally: + del rse.get_metadata diff --git a/tests/test_write.py b/tests/test_write.py index dfca4f4..2600ec7 100644 --- a/tests/test_write.py +++ b/tests/test_write.py @@ -335,3 +335,92 @@ def test_roundtrip_rda(self): assert list(result["words"]) == ["alpha", "beta"] finally: os.unlink(path) + + +def test_py_rds_parser_invalid_file(): + import pytest + + from rds2py.PyRdsReader import PyRdsParser, PyRdsParserError + + with pytest.raises(PyRdsParserError): + PyRdsParser("non_existent_file.rds") + + +def test_save_rds_not_implemented(): + import pytest + + from rds2py import save_rds + + with pytest.raises(NotImplementedError): + save_rds(object()) + + +def test_py_rds_parser_edge_cases_and_mocks(): + from unittest.mock import MagicMock, patch + + import pytest + + from rds2py.generics import _dispatcher + from rds2py.PyRdsReader import PyRdsParser, PyRdsParserError, RdsReader + from rds2py.rdsutils import get_class + + with patch("rds2py.PyRdsReader.RdsObject") as mock_rds_obj_cls: + mock_instance = MagicMock() + mock_instance.get_robject.return_value = "not_an_RdsReader" + mock_rds_obj_cls.return_value = mock_instance + with pytest.raises(PyRdsParserError, match="Expected 'RdsReader' object"): + PyRdsParser("dummy.rds") + + parser = object.__new__(PyRdsParser) + + with pytest.raises(PyRdsParserError, match="Error parsing RDS object"): + parser.root_object = MagicMock(spec=RdsReader) + parser.root_object.get_rtype.side_effect = Exception("test parse error") + parser.parse() + + mock_sym = MagicMock(spec=RdsReader) + mock_sym.get_rtype.return_value = "symbol" + mock_sym.get_symbol_name.return_value = "custom_symbol" + res_sym = parser._process_object(mock_sym) + assert res_sym["name"] == "custom_symbol" + assert res_sym["class_name"] == "symbol" + + mock_unsup = 
MagicMock(spec=RdsReader) + mock_unsup.get_rtype.return_value = "unsupported_type" + with pytest.warns(RuntimeWarning, match="Unsupported R object type: unsupported_type"): + res_unsup = parser._process_object(mock_unsup) + assert res_unsup["data"] is None + + with pytest.raises(PyRdsParserError, match="Error processing object"): + mock_err = MagicMock(spec=RdsReader) + mock_err.get_rtype.side_effect = Exception("process error") + parser._process_object(mock_err) + + with pytest.raises(PyRdsParserError, match="Error handling R special cases"): + parser._handle_r_special_cases(None, "integer", 0) + + with pytest.raises(PyRdsParserError, match="Error getting numeric data"): + mock_num_err = MagicMock(spec=RdsReader) + mock_num_err.get_numeric_data.side_effect = Exception("numeric error") + parser._get_numeric_data(mock_num_err, "integer") + + with pytest.raises(PyRdsParserError, match="Error processing attributes"): + mock_attr_err = MagicMock(spec=RdsReader) + mock_attr_err.get_attribute_names.side_effect = Exception("attributes error") + parser._process_attributes(mock_attr_err) + + mock_root = MagicMock(spec=RdsReader) + mock_root.get_dimensions.return_value = (10, 20) + parser.root_object = mock_root + assert parser.get_dimensions() == (10, 20) + + mock_root.get_dimensions.side_effect = Exception("dimensions error") + with pytest.raises(PyRdsParserError, match="Error getting dimensions"): + parser.get_dimensions() + + with pytest.warns(RuntimeWarning, match="Failed to coerce RDS object to class"): + res_coerce = _dispatcher({"type": "S4", "class_name": "dgCMatrix", "attributes": {}}) + assert isinstance(res_coerce, dict) + + assert get_class({"type": "integer_vector", "class_name": "integer"}) == "integer" + assert get_class({"type": "integer_vector", "class_name": "integer", "attributes": None}) == "integer" From c45ec20c0cec24850e0b78f97e8d42c4a19493e2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> 
Date: Sun, 21 Jun 2026 06:26:28 +0000 Subject: [PATCH 06/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/rds2py/read_se.py | 2 +- tests/test_delayedmatrices.py | 1 - tests/test_granges.py | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/rds2py/read_se.py b/src/rds2py/read_se.py index 00b6a29..dedd22a 100644 --- a/src/rds2py/read_se.py +++ b/src/rds2py/read_se.py @@ -52,7 +52,7 @@ def read_summarized_experiment(robject: dict, **kwargs): if _cls not in ["SummarizedExperiment"]: raise RuntimeError(f"`robject` does not contain a 'SummarizedExperiment' object, contains `{_cls}`.") - + # parse assays names robj_asys = {} assay_dims = (0, 0) diff --git a/tests/test_delayedmatrices.py b/tests/test_delayedmatrices.py index b5e003e..43fce5b 100644 --- a/tests/test_delayedmatrices.py +++ b/tests/test_delayedmatrices.py @@ -72,7 +72,6 @@ def test_roundtrip_h5sparse(): def test_delayedarray_extra_branches_and_errors(): import pytest - from delayedarray import DelayedArray from hdf5array import Hdf5CompressedSparseMatrix from rds2py import save_rds diff --git a/tests/test_granges.py b/tests/test_granges.py index 4a5211d..30926b5 100644 --- a/tests/test_granges.py +++ b/tests/test_granges.py @@ -75,9 +75,9 @@ def test_granges_list_roundtrip_and_fallbacks(): import os import tempfile - from genomicranges import CompressedGenomicRangesList, GenomicRanges, SeqInfo + from genomicranges import CompressedGenomicRangesList - from rds2py import read_rds, save_rds, write_rds + from rds2py import read_rds, write_rds gr_list = read_rds("tests/data/grangeslist.rds") From 3e1ed1ca70aff99bc0eab7a3652e5b8cb231afe8 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Sun, 21 Jun 2026 19:53:39 -0700 Subject: [PATCH 07/10] add the generic for iranges class --- src/rds2py/generics.py | 1 + src/rds2py/read_granges.py | 47 ++++++++++++++++++++++++++++++++++++++ tests/test_granges.py | 29 
+++++++++++++++++++++++ 3 files changed, 77 insertions(+) diff --git a/src/rds2py/generics.py b/src/rds2py/generics.py index 1d86081..b878056 100644 --- a/src/rds2py/generics.py +++ b/src/rds2py/generics.py @@ -47,6 +47,7 @@ "data.frame": "rds2py.read_frame.read_data_frame", "DFrame": "rds2py.read_frame.read_dframe", # genomic ranges + "IRanges": "rds2py.read_granges.read_iranges", "GRanges": "rds2py.read_granges.read_genomic_ranges", "GenomicRanges": "rds2py.read_granges.read_genomic_ranges", "CompressedGRangesList": "rds2py.read_granges.read_granges_list", diff --git a/src/rds2py/read_granges.py b/src/rds2py/read_granges.py index 40f3be0..7c4f91a 100644 --- a/src/rds2py/read_granges.py +++ b/src/rds2py/read_granges.py @@ -120,3 +120,50 @@ def read_granges_list(robject: dict, **kwargs): return CompressedGenomicRangesList( unlist_data=_gre, partitioning=_part_obj, element_metadata=element_metadata, metadata=metadata ) + + +def read_iranges(robject: dict, **kwargs): + """Convert an R `IRanges` object to a Python `IRanges` object. + + Args: + robject: + Dictionary containing parsed IRanges data. + + **kwargs: + Additional arguments. + + Returns: + A Python IRanges object. 
+ """ + from iranges import IRanges + + _cls = get_class(robject) + + if _cls not in ["IRanges"]: + raise TypeError(f"obj is not 'IRanges', but is `{_cls}`.") + + _start = _dispatcher(robject["attributes"]["start"], **kwargs) + _width = _dispatcher(robject["attributes"]["width"], **kwargs) + + _names = None + if "NAMES" in robject["attributes"]: + _tmp_names = robject["attributes"]["NAMES"] + _names = _dispatcher(_tmp_names, **kwargs) + if _names is not None: + _names = list(_names) + + _mcols = None + if "elementMetadata" in robject["attributes"]: + _mcols = _dispatcher(robject["attributes"]["elementMetadata"], **kwargs) + + _metadata = None + if "metadata" in robject["attributes"]: + _metadata = _metadata = _dispatcher(robject["attributes"]["metadata"], **kwargs) + + return IRanges( + start=_start, + width=_width, + names=_names, + mcols=_mcols, + metadata=_metadata, + ) diff --git a/tests/test_granges.py b/tests/test_granges.py index 4a5211d..df66efe 100644 --- a/tests/test_granges.py +++ b/tests/test_granges.py @@ -109,3 +109,32 @@ def test_granges_list_roundtrip_and_fallbacks(): os.unlink(path) finally: CompressedList.get_names = orig_get_names + + +def test_iranges_roundtrip(): + import os + import tempfile + + import pytest + from iranges import IRanges + + from rds2py import read_rds, write_rds + from rds2py.read_granges import read_iranges + + ir = IRanges(start=[1, 5, 10], width=[3, 4, 5], names=["a", "b", "c"]) + + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as tmp: + path = tmp.name + try: + write_rds(ir, path) + recreated = read_rds(path) + assert isinstance(recreated, IRanges) + assert list(recreated.get_start()) == [1, 5, 10] + assert list(recreated.get_width()) == [3, 4, 5] + assert list(recreated.get_names()) == ["a", "b", "c"] + finally: + if os.path.exists(path): + os.unlink(path) + + with pytest.raises(TypeError): + read_iranges({"type": "S4", "class_name": "BadClass"}) From e905c0cdbe6da66389c4c023205b0c54d029d034 Mon Sep 17 
00:00:00 2001 From: Jayaram Kancherla Date: Sun, 21 Jun 2026 20:40:03 -0700 Subject: [PATCH 08/10] add a few more tests to check for names and named lists --- src/rds2py/read_granges.py | 2 +- tests/test_atomics.py | 24 +++++++++++++++-- tests/test_dict.py | 20 ++++++++++++++ tests/test_factors.py | 8 +++++- tests/test_matrices.py | 54 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 104 insertions(+), 4 deletions(-) diff --git a/src/rds2py/read_granges.py b/src/rds2py/read_granges.py index 7c4f91a..ade4189 100644 --- a/src/rds2py/read_granges.py +++ b/src/rds2py/read_granges.py @@ -158,7 +158,7 @@ def read_iranges(robject: dict, **kwargs): _metadata = None if "metadata" in robject["attributes"]: - _metadata = _metadata = _dispatcher(robject["attributes"]["metadata"], **kwargs) + _metadata = _dispatcher(robject["attributes"]["metadata"], **kwargs) return IRanges( start=_start, diff --git a/tests/test_atomics.py b/tests/test_atomics.py index 4cf110d..a4a31c2 100644 --- a/tests/test_atomics.py +++ b/tests/test_atomics.py @@ -1,7 +1,7 @@ -from rds2py import read_rds - from biocutils import BooleanList, FloatList, IntegerList, StringList +from rds2py import read_rds + __author__ = "jkanche" __copyright__ = "jkanche" __license__ = "MIT" @@ -99,3 +99,23 @@ def test_read_scalar_float(): assert isinstance(obj, FloatList) assert len(obj) == 1 assert obj[0] == 10.0 + + +def test_save_names_directly(): + import os + import tempfile + + from biocutils import Names + + from rds2py import write_rds + + names_obj = Names(["a", "b", "c"]) + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as tmp: + path = tmp.name + + try: + write_rds(names_obj, path) + assert os.path.exists(path) + finally: + if os.path.exists(path): + os.unlink(path) diff --git a/tests/test_dict.py b/tests/test_dict.py index da9e4c1..29c48a3 100644 --- a/tests/test_dict.py +++ b/tests/test_dict.py @@ -57,3 +57,23 @@ def test_read_dict_errors(): with pytest.raises(RuntimeError): 
read_dict({"type": "vector", "class_name": "not_vector"}) + + +def test_save_namedlist_directly(): + import os + import tempfile + + from biocutils import NamedList + + from rds2py import write_rds + + nl = NamedList([1, 2], names=["a", "b"]) + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as tmp: + path = tmp.name + + try: + write_rds(nl, path) + assert os.path.exists(path) + finally: + if os.path.exists(path): + os.unlink(path) diff --git a/tests/test_factors.py b/tests/test_factors.py index 1977d04..217a496 100644 --- a/tests/test_factors.py +++ b/tests/test_factors.py @@ -20,9 +20,13 @@ def test_roundtrip_factors(): from biocutils import Factor - from rds2py import read_rds, write_rds + from rds2py import read_rds, save_rds, write_rds factor = Factor([0, 1, 0], levels=["A", "B"]) + res = save_rds(factor) + assert isinstance(res, dict) + assert res["type"] == "integer" + with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as tmp: rds_path = tmp.name @@ -37,6 +41,7 @@ def test_roundtrip_factors(): def test_read_factor_errors(): import pytest + from rds2py.read_factor import read_factor bad_obj = {"type": "S4", "class_name": "BadClass", "attributes": {}} @@ -47,6 +52,7 @@ def test_read_factor_errors(): def test_read_factor_lengths_and_no_levels(): import pytest + from rds2py.read_factor import read_factor mock_factor_lengths = { diff --git a/tests/test_matrices.py b/tests/test_matrices.py index 626b027..fdd898c 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -141,3 +141,57 @@ def test_matrix_read_errors_and_dgrmatrix(): } res_dense = _as_dense_matrix(dense_no_names) assert isinstance(res_dense, np.ndarray) + + dense_null_names = { + "type": "ndarray", + "class_name": "ndarray", + "data": np.array([1, 2, 3, 4], dtype=np.int32), + "attributes": { + "dim": {"type": "integer", "class_name": "integer_vector", "data": np.array([2, 2])}, + "dimnames": {"type": "null"}, + }, + } + res_null_names = 
_as_dense_matrix(dense_null_names) + assert isinstance(res_null_names, np.ndarray) + + dgr_none_names = { + "type": "S4", + "class_name": "dgRMatrix", + "attributes": { + "x": {"type": "double", "class_name": "double_vector", "data": np.array([1.0, 2.0])}, + "i": {"type": "integer", "class_name": "integer_vector", "data": np.array([0, 1])}, + "p": {"type": "integer", "class_name": "integer_vector", "data": np.array([0, 1, 2])}, + "Dim": {"type": "integer", "class_name": "integer_vector", "data": np.array([2, 2])}, + "Dimnames": { + "type": "vector", + "class_name": "vector", + "data": [{"type": "null"}, {"type": "null"}], + "attributes": {}, + }, + }, + } + res_dgr_none_names = _as_sparse_matrix(dgr_none_names) + assert isinstance(res_dgr_none_names, sp.spmatrix) + assert not isinstance(res_dgr_none_names, MatrixWrapper) + + dgr_null_names = { + "type": "S4", + "class_name": "dgRMatrix", + "attributes": { + "x": {"type": "double", "class_name": "double_vector", "data": np.array([1.0, 2.0])}, + "i": {"type": "integer", "class_name": "integer_vector", "data": np.array([0, 1])}, + "p": {"type": "integer", "class_name": "integer_vector", "data": np.array([0, 1, 2])}, + "Dim": {"type": "integer", "class_name": "integer_vector", "data": np.array([2, 2])}, + "Dimnames": {"type": "null"}, + }, + } + res_dgr_null_names = _as_sparse_matrix(dgr_null_names) + assert isinstance(res_dgr_null_names, sp.spmatrix) + assert not isinstance(res_dgr_null_names, MatrixWrapper) + + from rds2py import save_rds + + wrapper_no_names = MatrixWrapper(np.array([[1, 2], [3, 4]]), dimnames=None) + res_no_names = save_rds(wrapper_no_names) + assert isinstance(res_no_names, dict) + assert "dimnames" not in res_no_names["attributes"] From 2187727c47f3ce2f26f1b0189e504bcbe56f36e5 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Sun, 21 Jun 2026 21:00:55 -0700 Subject: [PATCH 09/10] update docs and README --- README.md | 142 ++++++++++++++++++++++------------- docs/custom_serialization.md | 138 
++++++++++++++++++++++++++++++++++ docs/index.md | 19 +++-- docs/tutorial.md | 113 ++++++++++++++++++++++------ 4 files changed, 331 insertions(+), 81 deletions(-) create mode 100644 docs/custom_serialization.md diff --git a/README.md b/README.md index 595e827..81db4ee 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ # rds2py -Parse and save Python objects as **RDS or RData** files. `rds2py` supports various base classes from R, and Bioconductor's `SummarizedExperiment` and `SingleCellExperiment` S4 classes. **_For more details, check out [rds2cpp library](https://github.com/LTLA/rds2cpp)._** +`rds2py` allows you to read and write R's native **RDS** and **RData** files directly in Python. Beyond standard R types, it provides integration with the [BiocPy](https://github.com/biocpy) ecosystem, allowing you to easily roundtrip complex S4 data structures like `SummarizedExperiment`, `SingleCellExperiment`, and `GenomicRanges`. **_For more details, check out [rds2cpp library](https://github.com/LTLA/rds2cpp)._** ## Installation @@ -12,93 +12,133 @@ Package is published to [PyPI](https://pypi.org/project/rds2py/) ```shell pip install rds2py +``` + +To enable automatic conversion to Bioconductor/BiocPy classes, make sure to install the optional dependencies: -# or install optional dependencies +```shell pip install rds2py[optional] ``` -By default, the package does not install packages to convert python representations to BiocPy classes. Please consider installing all optional dependencies. -## Usage +## Quickstart -> [!NOTE] -> -> If you do not have an RDS object handy, feel free to download one from [single-cell-test-files](https://github.com/jkanche/random-test-files/releases). +### 1. Reading RDS and RData files + +Reading an RDS or RData file is as simple as a single function call. 
`rds2py` automatically detects and maps known R/Bioconductor classes to their Python equivalents: ```python from rds2py import read_rds, read_rda -r_obj = read_rds("path/to/file.rds") # or read_rda("path/to/file.rda") + +# Read an RDS file (returns a Python/BiocPy object or dict) +data = read_rds("path/to/file.rds") + +# Read objects from an RData workspace file (returns a dictionary of objects) +workspace = read_rda("path/to/workspace.rda") ``` -The returned `r_obj` either returns an appropriate Python class if a parser is already implemented or returns the dictionary containing the data from the RDS file. +If `rds2py` encounters an S4 class or complex R structure it doesn't have a parser registered for, it falls back to returning a dictionary so you don't lose any data. -### Save RDS/RData files +### 2. Saving to RDS and RData files -You can also construct RDS or RData files from Python objects. `rds2py` supports writing atomic types, generic dictionaries/lists, and **BiocPy objects**. +You can serialize Python objects back to RDS or RData formats. This includes NumPy arrays, SciPy sparse matrices, standard dictionaries/lists, and BiocPy objects: ```python -from rds2py import write_rds, write_rda import numpy as np - -# Write atomic types -write_rds(np.array([1, 2, 3], dtype=np.int32), "path/to/file.rds") - -# Write complex objects +from rds2py import write_rds, write_rda from genomicranges import GenomicRanges from iranges import IRanges -gr = GenomicRanges( - seqnames=["chr1", "chr2"], - ranges=IRanges(start=[1, 2], width=[10, 20]), - strand=["+", "-"] -) -write_rds(gr, "path/to/granges.rds") +# 1. Write an atomic NumPy array +write_rds(np.array([10, 20, 30], dtype=np.int32), "array.rds") + +# 2. Write a complex Bioconductor GenomicRanges object +gr = GenomicRanges(seqnames=["chr1", "chr2"], ranges=IRanges(start=[1, 100], width=[10, 50]), strand=["+", "-"]) +write_rds(gr, "genomic_ranges.rds") + +# 3. 
Write multiple Python objects into a single RData workspace +objects = {"my_array": np.array([1.1, 2.2, 3.3]), "my_granges": gr} +write_rda(objects, "workspace.rda") ``` -### Write-your-own-reader +### 3. Custom Extensions -Reading RDS or RData files as dictionary representations allows users to write their own custom readers into appropriate Python representations. +If you have custom S4 representations or class mapping needs, you can parse the raw RDS structure into Python dictionary representations using `parse_rds`/`parse_rda` and apply your custom deserializers: ```python -from rds2py import parse_rds, parse_rda +from rds2py import parse_rds +from rds2py.read_granges import read_genomic_ranges + +# 1. Parse into a raw dictionary representation of the RDS tree +raw_dict = parse_rds("path/to/file.rds") +print(raw_dict.keys()) # ['type', 'class_name', 'attributes', 'data', ...] -robject = parse_rds("path/to/file.rds") # or use parse_rda for rdata files -print(robject) +# 2. Build or invoke custom parser logic +if raw_dict.get("class_name") == "GRanges": + gr = read_genomic_ranges(raw_dict) + print(gr) ``` -If you know this RDS file contains an `GenomicRanges` object, you can use the built-in reader or write your own reader to convert this dictionary. 
+For writing custom objects, you can register your classes with `rds2py`'s serialization registry using the `save_rds` singledispatch generic: ```python -from rds2py.read_granges import read_genomic_ranges +from rds2py.generics import save_rds + -gr = read_genomic_ranges(robject) -print(gr) +class MyCustomClass: + def __init__(self, value): + self.value = value + + +@save_rds.register(MyCustomClass) +def _serialize_custom(x: MyCustomClass, path=None): + # Construct the raw RDS dictionary representation expected by rds2cpp + converted = { + "type": "integer", + "data": [x.value], + "attributes": {"class": {"type": "string", "data": ["MyCustomRClass"]}}, + } + + # Write to disk when a path is provided; the representation is always returned + if path is not None: + from rds2py.lib_rds_parser import write_rds as write_rds_native + + write_rds_native(converted, path) + return converted ``` + ## Type Conversion Reference -| R Type | Python/NumPy Type | -| ---------- | ------------------------------------ | -| numeric | numpy.ndarray (float64) | -| integer | numpy.ndarray (int32) | -| character | list of str | -| logical | numpy.ndarray (bool) | -| factor | list | -| data.frame | BiocFrame | -| matrix | numpy.ndarray or scipy.sparse matrix | -| dgCMatrix | scipy.sparse.csc_matrix | -| dgRMatrix | scipy.sparse.csr_matrix | - -and integration with BiocPy ecosystem for Bioconductor classes - - SummarizedExperiment - - RangedSummarizedExperiment - - SingleCellExperiment - - GenomicRanges - - MultiAssayExperiment +The table below describes how core R types are mapped to Python/NumPy/SciPy counterparts: + +| R Type / Class | Python / NumPy / SciPy Counterpart | +| :--- | :--- | +| **numeric** | `numpy.ndarray` (`float64`) | +| **integer** | `numpy.ndarray` (`int32`) | +| **logical** | `numpy.ndarray` (`bool`) | +| **character** | `list` of `str` | +| **factor** | `list` / representation levels | +| **matrix (dense)** | `numpy.ndarray` | +| **dgCMatrix** (Column-sparse) | 
`scipy.sparse.csc_matrix` | +| **dgRMatrix** (Row-sparse) | `scipy.sparse.csr_matrix` | +| **data.frame** / **DFrame** | `biocframe.BiocFrame` | + +### Supported Bioconductor Classes +When `rds2py[optional]` is installed, the package fully translates R/S4 classes to their BiocPy equivalents: +- **GenomicRanges** / **GRanges** <-> `genomicranges.GenomicRanges` +- **GenomicRangesList** / **GRangesList** <-> `genomicranges.CompressedGenomicRangesList` +- **SummarizedExperiment** <-> `summarizedexperiment.SummarizedExperiment` +- **RangedSummarizedExperiment** <-> `summarizedexperiment.RangedSummarizedExperiment` +- **SingleCellExperiment** <-> `singlecellexperiment.SingleCellExperiment` +- **MultiAssayExperiment** <-> `multiassayexperiment.MultiAssayExperiment` + +--- ## Developer Notes -This project uses pybind11 to provide bindings to the rds2cpp library. Please make sure necessary C++ compiler is installed on your system. +- `rds2py` uses `pybind11` to bind the core C++ `rds2cpp` library. Compiling from source requires a compatible C++ compiler. +- Tests can be run via `tox` or directly using `pytest`. diff --git a/docs/custom_serialization.md b/docs/custom_serialization.md new file mode 100644 index 0000000..922bbc4 --- /dev/null +++ b/docs/custom_serialization.md @@ -0,0 +1,138 @@ +# Custom Serialization and Deserialization Guide + +This guide shows you how to extend `rds2py` to support custom Python classes. By implementing custom readers and writers, you can serialize your custom Python representations directly into native R RDS/RData structures, and read them back seamlessly. + +`rds2py` achieves this two-way extensibility using: +1. Python's `functools.singledispatch` mechanism for writing/serialization (`save_rds`). +2. A global class mapping registry for reading/deserialization (`read_rds`). + +--- + +## 1. Custom Serialization (Python -> RDS) + +To serialize a custom Python class, you register it with the `save_rds` generic dispatcher. 
Your custom function needs to take your object and convert it into a structured dictionary that matches R's internal representation format. + +### The Structured RDS Representation Format +R objects are represented in Python as nested dictionaries containing the following keys: +- `"type"`: The R type descriptor (e.g., `"S4"`, `"vector"`, `"integer"`, `"double"`, `"string"`, `"logical"`, or `"null"`). +- `"class_name"`: The target R class name (e.g., `"MyCustomRClass"`). +- `"package_name"`: *(Optional, for S4 classes)* The name of the R package where the class is defined. +- `"attributes"`: A dictionary representing R attributes or S4 slots. Each slot value must also be a structured representation dictionary. +- `"data"`: The flat list or array of values for vector/atomic types. + +### Example: Implementing a Custom Serializer + +Let's say we have a custom Python class named `MyFeature`: + +```python +class MyFeature: + def __init__(self, name: str, values: list): + self.name = name + self.values = values +``` + +To serialize `MyFeature` as a native R S4 class called `"MyCustomRClass"` from package `"MyRPackage"`, we register it using `@save_rds.register`: + +```python +from typing import Optional +from rds2py import save_rds + + +@save_rds.register(MyFeature) +def _save_rds_myfeature(x: MyFeature, path: Optional[str] = None): + # Native C++ writer call + from rds2py.lib_rds_parser import write_rds as write_rds_native + + # 1. Structure the Python object into the expected R dictionary format + converted = { + "type": "S4", + "class_name": "MyCustomRClass", + "package_name": "MyRPackage", + "attributes": { + # Recursively call save_rds to serialize internal elements + "featureName": save_rds(x.name), + "featureValues": save_rds(x.values), + }, + } + + # 2. If a save path is specified, write directly using the native writer + if path is not None: + write_rds_native(converted, path) + + return converted +``` + +--- + +## 2. 
Custom Deserialization (RDS -> Python) + +To read custom S4 objects back into Python classes via `read_rds`, you need to: +1. Write a deserialization function that constructs your Python class from the raw parsed dictionary. +2. Register your deserializer function in `rds2py`'s global class mapping registry. + +### Example: Implementing the Reader + +```python +from rds2py.generics import _dispatcher +from rds2py.rdsutils import get_class + + +def read_my_custom_class(robject: dict, **kwargs) -> MyFeature: + # 1. Verify the incoming R class name + cls_name = get_class(robject) + if cls_name != "MyCustomRClass": + raise ValueError(f"Expected class 'MyCustomRClass', but received '{cls_name}'") + + # 2. Extract and parse the slots recursively + # We call the internal _dispatcher helper to parse child structures + feature_name = _dispatcher(robject["attributes"]["featureName"], **kwargs) + feature_values = _dispatcher(robject["attributes"]["featureValues"], **kwargs) + + # 3. Instantiate and return your custom Python class + return MyFeature(name=feature_name, values=list(feature_values)) +``` + +### Registering the Reader +Map your class name to the reader function in the global class registry (`REGISTRY` from `rds2py.generics`): + +```python +from rds2py.generics import REGISTRY + +# Register our custom deserializer in the global map +REGISTRY["MyCustomRClass"] = read_my_custom_class +``` + +--- + +## 3. Full Roundtrip + +Here is how the entire custom serialization and deserialization workflow works together: + +```python +import tempfile +import os +from rds2py import write_rds, read_rds + +# 1. Create a custom instance +feature = MyFeature(name="expression_level", values=[10, 20, 30]) + +# 2. Serialize to a temporary RDS file +with tempfile.NamedTemporaryFile(suffix=".rds", delete=False) as tmp: + path = tmp.name + +try: + # Write custom class to RDS format + write_rds(feature, path) + + # Read the RDS file back into Python + recreated = read_rds(path) + + # 3. 
Verify that the roundtrip correctly recreated the custom class + assert isinstance(recreated, MyFeature) + assert recreated.name == "expression_level" + assert recreated.values == [10, 20, 30] + print("Roundtrip validation successful!") +finally: + if os.path.exists(path): + os.unlink(path) +``` diff --git a/docs/index.md b/docs/index.md index 8f6e12c..54d5799 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,24 +1,31 @@ -# rds2py +# rds2py: R Serialization Formats in Python -Parse, extract and create Python representations for datasets stored in RDS files. It supports Bioconductor's `SummarizedExperiment` and `SingleCellExperiment` objects. This is possible because of [Aaron's rds2cpp library](https://github.com/LTLA/rds2cpp). +`rds2py` is designed to parse, extract, and write R data formats (RDS and RData) directly in Python. It provides native, out-of-the-box integration with the [BiocPy](https://github.com/biocpy) ecosystem, allowing seamless roundtripping of complex S4 datasets like `SummarizedExperiment`, `SingleCellExperiment`, and `GenomicRanges`. -The package uses memory views (except for strings) so that we can access the same memory from C++ space in Python (through Cython of course). This is especially useful for large datasets so we don't make copies of data. +This library is built on top of [Aaron Lun's rds2cpp library](https://github.com/LTLA/rds2cpp). 
-## Install +## Installation -Package is published to [PyPI](https://pypi.org/project/rds2py/) +`rds2py` is available on [PyPI](https://pypi.org/project/rds2py/): ```shell pip install rds2py ``` -## Contents +To enable full conversion support for Bioconductor/BiocPy classes, consider installing the optional dependencies: + +```shell +pip install rds2py[optional] +``` + +## Table of Contents ```{toctree} :maxdepth: 2 Overview Tutorial +Custom Serialization Guide Contributions & Help License Authors diff --git a/docs/tutorial.md b/docs/tutorial.md index dbabf6d..6a9c483 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -1,41 +1,106 @@ -# Tutorial +# Tutorial: Getting Started with rds2py -If you do not have an RDS object handy, feel free to download one from [single-cell-test-files](https://github.com/jkanche/random-test-files/releases). +Welcome to the `rds2py` tutorial! In this guide, we'll walk through how to read and write RDS and RData files in Python, parse objects into raw representations, and work with Python/BiocPy data structures. -### Basic Usage +--- + +## 1. Basic Reading + +Reading an RDS file is designed to be a one-line operation. Under the hood, `rds2py` reads the data format and instantiates corresponding Python or Bioconductor classes (from the `BiocPy` ecosystem) where possible. + +```python +from rds2py import read_rds, read_rda + +# 1. Read a single R object from an RDS file +my_object = read_rds("path/to/file.rds") + +# 2. Read all objects from an RData workspace file +# This returns a dictionary mapping variable names to their Python counterparts +workspace = read_rda("path/to/file.rda") +for name, obj in workspace.items(): + print(f"Loaded object: {name} of type {type(obj)}") +``` + +*Note: If you need an RDS file to experiment with, you can download some pre-made test files from [single-cell-test-files](https://github.com/jkanche/random-test-files/releases).* + +--- + +## 2. 
Basic Writing (Saving) + +You can write Python data structures back to R's native formats. `rds2py` automatically translates standard Python lists, NumPy arrays, SciPy sparse matrices, and BiocPy objects into the corresponding R representations. ```python -from rds2py import read_rds -r_obj = read_rds("path/to/file.rds") +import numpy as np +from rds2py import write_rds, write_rda +from genomicranges import GenomicRanges +from iranges import IRanges + +# 1. Save a NumPy array as an RDS integer vector +data_array = np.array([1, 2, 3, 4], dtype=np.int32) +write_rds(data_array, "vector.rds") + +# 2. Save a BiocPy GenomicRanges object (written as an R GRanges S4 object) +gr = GenomicRanges( + seqnames=["chr1", "chr2"], + ranges=IRanges(start=[1, 100], width=[10, 50]), + strand=["+", "-"] +) +write_rds(gr, "granges.rds") + +# 3. Save multiple objects to an RData workspace file +workspace_objects = { + "my_data": data_array, + "my_ranges": gr +} +write_rda(workspace_objects, "workspace.rda") ``` -The returned `r_obj` either returns an appropriate Python class if a parser is already implemented or returns the dictionary containing the data from the RDS file. +--- -## Write-your-own-reader +## 3. Raw Dictionary Parsing (Custom Readers) -In addition, the package provides the dictionary representation of the RDS file, allowing users to write their own custom readers into appropriate Python representations. +Sometimes you want to inspect the structure of an RDS file without automatically converting it to Python classes, or you might want to write a custom reader for an unsupported S4 class. + +For these scenarios, use `parse_rds` or `parse_rda`.
They return the raw RDS tree structure as nested Python dictionaries: ```python from rds2py import parse_rds +from rds2py.read_granges import read_genomic_ranges -data = parse_rds("path/to/file.rds") -print(data) +# Parse the RDS file into a raw nested dictionary representation +raw_representation = parse_rds("path/to/file.rds") +print(raw_representation) -# now write your own parser to convert this dictionary. +# If you know the underlying S4 class is a GRanges object, +# you can use a parser directly: +if raw_representation.get("class_name") == "GRanges": + gr = read_genomic_ranges(raw_representation) + print(gr) ``` +--- + ## Type Conversion Reference -| R Type | Python/NumPy Type | -|--------|------------------| -| numeric | numpy.ndarray (float64) | -| integer | numpy.ndarray (int32) | -| character | list of str | -| logical | numpy.ndarray (bool) | -| factor | list | -| data.frame | BiocFrame | -| matrix | numpy.ndarray or scipy.sparse matrix | -| dgCMatrix | scipy.sparse.csc_matrix | -| dgRMatrix | scipy.sparse.csr_matrix | - -Check out the module reference for more information on these classes. 
+The following table summarizes how basic R data structures map to Python, NumPy, and SciPy types: + +| R Type / Class | Python / NumPy / SciPy Counterpart | +| :--- | :--- | +| **numeric** | `numpy.ndarray` (`float64`) | +| **integer** | `numpy.ndarray` (`int32`) | +| **logical** | `numpy.ndarray` (`bool`) | +| **character** | `list` of `str` | +| **factor** | `list` / representation levels | +| **matrix (dense)** | `numpy.ndarray` | +| **dgCMatrix** (Column-sparse) | `scipy.sparse.csc_matrix` | +| **dgRMatrix** (Row-sparse) | `scipy.sparse.csr_matrix` | +| **data.frame** / **DFrame** | `biocframe.BiocFrame` | + +### BiocPy Ecosystem Support +If optional dependencies are installed (`pip install rds2py[optional]`), R S4 classes are automatically converted: +- **GenomicRanges** / **GRanges** <-> `genomicranges.GenomicRanges` +- **GenomicRangesList** / **GRangesList** <-> `genomicranges.CompressedGenomicRangesList` +- **SummarizedExperiment** <-> `summarizedexperiment.SummarizedExperiment` +- **RangedSummarizedExperiment** <-> `summarizedexperiment.RangedSummarizedExperiment` +- **SingleCellExperiment** <-> `singlecellexperiment.SingleCellExperiment` +- **MultiAssayExperiment** <-> `multiassayexperiment.MultiAssayExperiment` From 7c32f6b43cf487a262e58ad2f6429dbb283c7475 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Jun 2026 04:01:07 +0000 Subject: [PATCH 10/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/tutorial.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tutorial.md b/docs/tutorial.md index 6a9c483..2a5f9f8 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -59,7 +59,7 @@ write_rda(workspace_objects, "workspace.rda") ## 3. 
Raw Dictionary Parsing (Custom Readers) -Sometimes you want to inspect the structure of an RDS file without automatically converting it to Python classes, or you might want to write a custom reader for an unsupported S4 class. +Sometimes you want to inspect the structure of an RDS file without automatically converting it to Python classes, or you might want to write a custom reader for an unsupported S4 class. For these scenarios, use `parse_rds` or `parse_rda`. They return the raw RDS tree structure as nested Python dictionaries: @@ -71,7 +71,7 @@ from rds2py.read_granges import read_genomic_ranges raw_representation = parse_rds("path/to/file.rds") print(raw_representation) -# If you know the underlying S4 class is a GRanges object, +# If you know the underlying S4 class is a GRanges object, # you can use a parser directly: if raw_representation.get("class_name") == "GRanges": gr = read_genomic_ranges(raw_representation)