Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# Changelog

## Version 0.10.0
## Version 0.10.0 - 0.10.1

- Added methods to write to RDS/RData files.
- Supports atomic types, generic dictionaries/lists, and **BiocPy objects**.
- Read `symbols` registered in RDS objects.
- Fixed an issue where S4 classes were not properly saved as RDS files.

## Version 0.9.0 - 0.9.1

Expand Down
214 changes: 210 additions & 4 deletions lib/src/rdswrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <rds2cpp/rds2cpp.hpp>
#include <stdexcept>
#include <pybind11/iostream.h>
#include <limits>

namespace py = pybind11;

Expand All @@ -28,6 +29,7 @@ class RdsReader {
case rds2cpp::SEXPType::LGL: return "boolean";
case rds2cpp::SEXPType::VEC: return "vector";
case rds2cpp::SEXPType::NIL: return "null";
case rds2cpp::SEXPType::SYM: return "symbol";
default: return "other";
}
}
Expand Down Expand Up @@ -139,6 +141,17 @@ class RdsReader {
return {static_cast<size_t>(dims[0]), static_cast<size_t>(dims[1])};
}

// Returns the name stored in the symbol table for the symbol this reader
// currently points at.
// Throws std::runtime_error when there is no current object, when the object's
// type is not SEXPType::SYM, or when the symbol's index is past the end of the
// symbol table.
std::string get_symbol_name() const {
    const bool is_symbol = ptr && ptr->type() == rds2cpp::SEXPType::SYM;
    if (!is_symbol) {
        throw std::runtime_error("Not a symbol object");
    }

    // The type tag was checked above; the downcast relies on SYM objects
    // being represented as rds2cpp::SymbolIndex.
    const auto& symbol_ref = *static_cast<const rds2cpp::SymbolIndex*>(ptr);
    const auto& table = *symbols_ptr;
    if (symbol_ref.index >= table.size()) {
        throw std::runtime_error("Symbol index out of range");
    }

    return table[symbol_ref.index].name;
}

private:
std::string resolve_symbol(const rds2cpp::SymbolIndex& sym) const {
if (sym.index >= symbols_ptr->size()) {
Expand Down Expand Up @@ -321,18 +334,210 @@ std::unique_ptr<rds2cpp::RObject> py_to_robject(const py::object& obj, std::vect
throw std::runtime_error("Unsupported numpy dtype for RDS writing");
}

// dict -> GenericVector with names attribute
// dict
if (py::isinstance<py::dict>(obj)) {
auto d = obj.cast<py::dict>();
auto gvec = std::make_unique<rds2cpp::GenericVector>();

// If it's a structured R object dictionary:
if (d.contains("type")) {
std::string rtype = d["type"].cast<std::string>();

if (rtype == "S4") {
auto s4 = std::make_unique<rds2cpp::S4Object>();
s4->class_name = d["class_name"].cast<std::string>();
s4->package_name = d["package_name"].cast<std::string>();
if (d.contains("attributes") && !d["attributes"].is_none()) {
auto attrs = d["attributes"].cast<py::dict>();
for (auto& item : attrs) {
auto name_str = item.first.cast<std::string>();
auto name_sym = rds2cpp::register_symbol(name_str, rds2cpp::StringEncoding::UTF8, symbols);
py::object val_py = py::reinterpret_borrow<py::object>(item.second);
std::unique_ptr<rds2cpp::RObject> val_obj;
if (val_py.is_none()) {
val_obj = std::make_unique<rds2cpp::SymbolIndex>(
rds2cpp::register_symbol("\001NULL\001", rds2cpp::StringEncoding::UTF8, symbols)
);
} else if (py::isinstance<py::dict>(val_py) && val_py.cast<py::dict>().contains("type") && py::isinstance<py::str>(val_py.cast<py::dict>()["type"]) && val_py.cast<py::dict>()["type"].cast<std::string>() == "null") {
val_obj = std::make_unique<rds2cpp::SymbolIndex>(
rds2cpp::register_symbol("\001NULL\001", rds2cpp::StringEncoding::UTF8, symbols)
);
} else {
val_obj = py_to_robject(val_py, symbols);
}
s4->attributes.emplace_back(name_sym, std::move(val_obj));
}
}
return s4;
}

if (rtype == "integer") {
auto vec = std::make_unique<rds2cpp::IntegerVector>();
if (d.contains("data") && !d["data"].is_none()) {
auto data_obj = d["data"];
if (py::isinstance<py::array>(data_obj)) {
auto arr = data_obj.cast<py::array_t<int32_t, py::array::c_style | py::array::forcecast>>();
auto r = arr.unchecked<1>();
vec->data.reserve(r.shape(0));
for (ssize_t i = 0; i < r.shape(0); ++i) vec->data.push_back(r(i));
} else {
auto seq = data_obj.cast<py::sequence>();
vec->data.reserve(py::len(seq));
for (size_t i = 0; i < py::len(seq); ++i) {
if (seq[i].is_none()) {
vec->data.push_back(-2147483648);
} else {
vec->data.push_back(seq[i].cast<int32_t>());
}
}
}
}
if (d.contains("attributes") && !d["attributes"].is_none()) {
auto attrs = d["attributes"].cast<py::dict>();
for (auto& item : attrs) {
auto name_str = item.first.cast<std::string>();
auto name_sym = rds2cpp::register_symbol(name_str, rds2cpp::StringEncoding::UTF8, symbols);
auto val_obj = py_to_robject(py::reinterpret_borrow<py::object>(item.second), symbols);
vec->attributes.emplace_back(name_sym, std::move(val_obj));
}
}
return vec;
}

if (rtype == "double" || rtype == "numeric") {
auto vec = std::make_unique<rds2cpp::DoubleVector>();
if (d.contains("data") && !d["data"].is_none()) {
auto data_obj = d["data"];
if (py::isinstance<py::array>(data_obj)) {
auto arr = data_obj.cast<py::array_t<double, py::array::c_style | py::array::forcecast>>();
auto r = arr.unchecked<1>();
vec->data.reserve(r.shape(0));
for (ssize_t i = 0; i < r.shape(0); ++i) vec->data.push_back(r(i));
} else {
auto seq = data_obj.cast<py::sequence>();
vec->data.reserve(py::len(seq));
for (size_t i = 0; i < py::len(seq); ++i) {
if (seq[i].is_none()) {
vec->data.push_back(std::numeric_limits<double>::quiet_NaN());
} else {
vec->data.push_back(seq[i].cast<double>());
}
}
}
}
if (d.contains("attributes") && !d["attributes"].is_none()) {
auto attrs = d["attributes"].cast<py::dict>();
for (auto& item : attrs) {
auto name_str = item.first.cast<std::string>();
auto name_sym = rds2cpp::register_symbol(name_str, rds2cpp::StringEncoding::UTF8, symbols);
auto val_obj = py_to_robject(py::reinterpret_borrow<py::object>(item.second), symbols);
vec->attributes.emplace_back(name_sym, std::move(val_obj));
}
}
return vec;
}

if (rtype == "boolean" || rtype == "logical") {
auto vec = std::make_unique<rds2cpp::LogicalVector>();
if (d.contains("data") && !d["data"].is_none()) {
auto data_obj = d["data"];
if (py::isinstance<py::array>(data_obj)) {
auto arr = data_obj.cast<py::array_t<bool, py::array::c_style | py::array::forcecast>>();
auto r = arr.unchecked<1>();
vec->data.reserve(r.shape(0));
for (ssize_t i = 0; i < r.shape(0); ++i) vec->data.push_back(r(i) ? 1 : 0);
} else {
auto seq = data_obj.cast<py::sequence>();
vec->data.reserve(py::len(seq));
for (size_t i = 0; i < py::len(seq); ++i) {
if (seq[i].is_none()) {
vec->data.push_back(-2147483648);
} else {
vec->data.push_back(seq[i].cast<bool>() ? 1 : 0);
}
}
}
}
if (d.contains("attributes") && !d["attributes"].is_none()) {
auto attrs = d["attributes"].cast<py::dict>();
for (auto& item : attrs) {
auto name_str = item.first.cast<std::string>();
auto name_sym = rds2cpp::register_symbol(name_str, rds2cpp::StringEncoding::UTF8, symbols);
auto val_obj = py_to_robject(py::reinterpret_borrow<py::object>(item.second), symbols);
vec->attributes.emplace_back(name_sym, std::move(val_obj));
}
}
return vec;
}

if (rtype == "string" || rtype == "character") {
auto vec = std::make_unique<rds2cpp::StringVector>();
if (d.contains("data") && !d["data"].is_none()) {
auto lst = d["data"].cast<py::list>();
vec->data.reserve(py::len(lst));
for (size_t i = 0; i < py::len(lst); ++i) {
auto item = lst[i];
if (item.is_none()) {
vec->data.emplace_back();
} else {
vec->data.emplace_back(item.cast<std::string>(), rds2cpp::StringEncoding::UTF8);
}
}
}
if (d.contains("attributes") && !d["attributes"].is_none()) {
auto attrs = d["attributes"].cast<py::dict>();
for (auto& item : attrs) {
auto name_str = item.first.cast<std::string>();
auto name_sym = rds2cpp::register_symbol(name_str, rds2cpp::StringEncoding::UTF8, symbols);
auto val_obj = py_to_robject(py::reinterpret_borrow<py::object>(item.second), symbols);
vec->attributes.emplace_back(name_sym, std::move(val_obj));
}
}
return vec;
}

if (rtype == "vector" || rtype == "list") {
auto vec = std::make_unique<rds2cpp::GenericVector>();
if (d.contains("data") && !d["data"].is_none()) {
auto lst = d["data"].cast<py::list>();
vec->data.reserve(py::len(lst));
for (size_t i = 0; i < py::len(lst); ++i) {
vec->data.push_back(py_to_robject(lst[i].cast<py::object>(), symbols));
}
}
if (d.contains("attributes") && !d["attributes"].is_none()) {
auto attrs = d["attributes"].cast<py::dict>();
for (auto& item : attrs) {
auto name_str = item.first.cast<std::string>();
auto name_sym = rds2cpp::register_symbol(name_str, rds2cpp::StringEncoding::UTF8, symbols);
auto val_obj = py_to_robject(py::reinterpret_borrow<py::object>(item.second), symbols);
vec->attributes.emplace_back(name_sym, std::move(val_obj));
}
}
return vec;
}

if (rtype == "symbol") {
std::string name_str = d["name"].cast<std::string>();
return std::make_unique<rds2cpp::SymbolIndex>(
rds2cpp::register_symbol(name_str, rds2cpp::StringEncoding::UTF8, symbols)
);
}

if (rtype == "null") {
return std::make_unique<rds2cpp::Null>();
}

throw std::runtime_error("Unsupported type for structured RDS writing: " + rtype);
}

// Default dictionary -> GenericVector with names attribute
auto gvec = std::make_unique<rds2cpp::GenericVector>();
py::list keys;
for (auto& item : d) {
keys.append(item.first);
gvec->data.push_back(py_to_robject(py::reinterpret_borrow<py::object>(item.second), symbols));
}
add_names_attribute(gvec->attributes, keys, symbols);

return gvec;
}

Expand Down Expand Up @@ -447,7 +652,8 @@ PYBIND11_MODULE(lib_rds_parser, m) {
.def("load_vec_element", &RdsReader::load_vec_element)
.def("get_package_name", &RdsReader::get_package_name)
.def("get_class_name", &RdsReader::get_class_name)
.def("get_dimensions", &RdsReader::get_dimensions);
.def("get_dimensions", &RdsReader::get_dimensions)
.def("get_symbol_name", &RdsReader::get_symbol_name);

m.def("write_rds", &write_rds_file, "Write a Python object to an RDS file",
py::arg("obj"), py::arg("path"));
Expand Down
7 changes: 7 additions & 0 deletions src/rds2py/PyRdsReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,13 @@ def _process_object(self, obj: RdsReader) -> Dict[str, Any]:
result["data"] = self._process_vector(obj)
result["attributes"] = self._process_attributes(obj)
result["class_name"] = "vector"
elif rtype == "symbol":
symbol_name = obj.get_symbol_name()
if symbol_name == "\001NULL\001":
result = {"type": "null"}
else:
result["name"] = symbol_name
result["class_name"] = "symbol"
elif rtype == "null":
pass
else:
Expand Down
6 changes: 3 additions & 3 deletions src/rds2py/read_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,12 @@ def read_dframe(robject: dict, **kwargs):
data[colname] = _dispatcher(robject["attributes"]["listData"]["data"][idx], **kwargs)

index = None
if robject["attributes"]["rownames"]["data"]:
if "data" in robject["attributes"]["rownames"] and robject["attributes"]["rownames"]["data"]:
index = _dispatcher(robject["attributes"]["rownames"], **kwargs)

nrows = None
if robject["attributes"]["nrows"]["data"]:
nrows = list(_dispatcher(robject["attributes"]["nrows"]), **kwargs)[0]
if "data" in robject["attributes"]["nrows"] and robject["attributes"]["nrows"]["data"]:
nrows = list(_dispatcher(robject["attributes"]["nrows"], **kwargs))[0]

df = BiocFrame(
data,
Expand Down
8 changes: 6 additions & 2 deletions src/rds2py/read_sce.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,12 @@ def read_single_cell_experiment(robject: dict, **kwargs):
idx_col = col_attrs[idx]
idx_value = robject["attributes"]["int_colData"]["attributes"]["listData"]["data"][idx]

if idx_col == "reducedDims" and idx_value.get("data", None) is not None:
robj_reduced_dims = _dispatcher(idx_value, **kwargs)
if idx_col == "reducedDims" and idx_value.get("type", None) != "null":
robj_reduced_dims_frame = _dispatcher(idx_value, **kwargs)
if hasattr(robj_reduced_dims_frame, "to_dict"):
robj_reduced_dims = robj_reduced_dims_frame.to_dict()
else:
robj_reduced_dims = robj_reduced_dims_frame

if idx_col == "altExps":
alt_names = list(_dispatcher(idx_value["attributes"]["listData"]["attributes"]["names"], **kwargs))
Expand Down
25 changes: 14 additions & 11 deletions src/rds2py/read_se.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,21 +52,24 @@ def read_summarized_experiment(robject: dict, **kwargs):

if _cls not in ["SummarizedExperiment"]:
raise RuntimeError(f"`robject` does not contain a 'SummarizedExperiment' object, contains `{_cls}`.")

# parse assays names
robj_asys = {}
assay_dims = None
asy_names = list(
_dispatcher(
robject["attributes"]["assays"]["attributes"]["data"]["attributes"]["listData"]["attributes"]["names"],
**kwargs,
assay_dims = (0, 0)
assays_node = robject["attributes"].get("assays", None)
if assays_node is not None and assays_node.get("type", None) != "null":
asy_names = list(
_dispatcher(
assays_node["attributes"]["data"]["attributes"]["listData"]["attributes"]["names"],
**kwargs,
)
)
)
for idx, asyname in enumerate(asy_names):
idx_asy = robject["attributes"]["assays"]["attributes"]["data"]["attributes"]["listData"]["data"][idx]
for idx, asyname in enumerate(asy_names):
idx_asy = assays_node["attributes"]["data"]["attributes"]["listData"]["data"][idx]

robj_asys[asyname] = _dispatcher(idx_asy, **kwargs)
if assay_dims is None:
assay_dims = robj_asys[asyname].shape
robj_asys[asyname] = _dispatcher(idx_asy, **kwargs)
if assay_dims == (0, 0) and hasattr(robj_asys[asyname], "shape"):
assay_dims = robj_asys[asyname].shape

# parse coldata
robj_coldata = _sanitize_empty_frame(_dispatcher(robject["attributes"]["colData"], **kwargs), assay_dims[1])
Expand Down
Loading
Loading