Skip to content

Commit fd89ab5

Browse files
committed
feat: create simdjson binding
1 parent 30d1d56 commit fd89ab5

File tree

12 files changed

+329
-1510
lines changed

12 files changed

+329
-1510
lines changed

pandas/_libs/json.pyi

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,3 @@ def ujson_dumps(
1414
default_handler: None
1515
| Callable[[Any], str | float | bool | list | dict | None] = ...,
1616
) -> str: ...
17-
def ujson_loads(
18-
s: str,
19-
precise_float: bool = ...,
20-
numpy: bool = ...,
21-
dtype: None = ...,
22-
labelled: bool = ...,
23-
) -> Any: ...

pandas/_libs/meson.build

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,11 +128,10 @@ libs_sources = {
128128
'sources': [
129129
'src/vendored/ujson/python/ujson.c',
130130
'src/vendored/ujson/python/objToJSON.c',
131-
'src/vendored/ujson/python/JSONtoObj.c',
132131
'src/vendored/ujson/lib/ultrajsonenc.c',
133-
'src/vendored/ujson/lib/ultrajsondec.c',
134132
],
135133
},
134+
'simdjson': {'sources': ['src/parser/json.cpp'], 'deps': simdjson_dep},
136135
'ops': {'sources': ['ops.pyx']},
137136
'ops_dispatch': {'sources': ['ops_dispatch.pyx']},
138137
'properties': {'sources': ['properties.pyx']},
@@ -195,6 +194,7 @@ sources_to_install = [
195194
'properties.pyi',
196195
'reshape.pyi',
197196
'sas.pyi',
197+
'simdjson.pyi',
198198
'sparse.pyi',
199199
'testing.pyi',
200200
'tslib.pyi',

pandas/_libs/simdjson.pyi

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from typing import (
2+
Any,
3+
)
4+
5+
def simdjson_loads(
6+
s: str | bytes,
7+
precise_float: bool = ...,
8+
) -> Any: ...

pandas/_libs/src/parser/json.cpp

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
#define PY_SSIZE_T_CLEAN
2+
3+
#include "Python.h"
4+
#include "simdjson.h"
5+
6+
namespace pandas {
7+
namespace json {
8+
using namespace simdjson;
9+
10+
// Single On-Demand parser instance used by json_loads() for every parse
// (it calls pandas::json::parser.iterate(...)).
// NOTE(review): the instance is shared, so concurrent parses would race on it;
// presumably calls are serialized by the GIL since json_loads() never
// releases it - confirm before adding any GIL-free path.
ondemand::parser parser;
11+
12+
// Forward declaration: object_to_dict() and array_to_list() below call this
// function for every child value, and it in turn dispatches back to them for
// nested containers.
static PyObject *build_python_object(ondemand::value element);
13+
14+
static PyObject *object_to_dict(ondemand::object element) {
15+
PyObject *dict = PyDict_New();
16+
for (auto field : element) {
17+
std::string_view key = field.unescaped_key();
18+
PyObject *value = build_python_object(field.value());
19+
20+
if (!value) {
21+
Py_DECREF(dict);
22+
return NULL;
23+
}
24+
25+
PyObject *key_py = PyUnicode_FromStringAndSize(key.data(), key.size());
26+
PyDict_SetItem(dict, key_py, value);
27+
Py_DECREF(key_py);
28+
Py_DECREF(value);
29+
}
30+
31+
return dict;
32+
}
33+
34+
static PyObject *array_to_list(ondemand::array element) {
35+
PyObject *list = PyList_New(0);
36+
for (auto child : element) {
37+
PyObject *tmp = build_python_object(child.value());
38+
if (!tmp) {
39+
Py_DECREF(list);
40+
return NULL;
41+
}
42+
43+
if (PyList_Append(list, tmp) != 0) {
44+
Py_DECREF(list);
45+
Py_DECREF(tmp);
46+
return NULL;
47+
}
48+
49+
Py_DECREF(tmp);
50+
}
51+
return list;
52+
}
53+
54+
static PyObject *big_int_to_pylong(ondemand::value element) {
55+
std::string_view s = element.raw_json_token();
56+
std::string null_terminated_s(s);
57+
return PyLong_FromString(null_terminated_s.c_str(), NULL, 10);
58+
}
59+
60+
static PyObject *json_number_to_pyobject(ondemand::value element) {
61+
ondemand::number num = element.get_number();
62+
switch (num.get_number_type()) {
63+
case ondemand::number_type::signed_integer:
64+
return PyLong_FromLongLong(num.get_int64());
65+
break;
66+
case ondemand::number_type::unsigned_integer:
67+
return PyLong_FromUnsignedLongLong(num.get_uint64());
68+
break;
69+
case ondemand::number_type::floating_point_number:
70+
return PyFloat_FromDouble(num.get_double());
71+
break;
72+
case ondemand::number_type::big_integer:
73+
return big_int_to_pylong(element);
74+
break;
75+
}
76+
}
77+
78+
static PyObject *json_str_to_pyobject(ondemand::value element) {
79+
std::string_view s = element.get_string(true);
80+
return PyUnicode_FromStringAndSize(s.data(), s.size());
81+
}
82+
83+
static PyObject *build_python_object(ondemand::value element) {
84+
switch (element.type()) {
85+
case ondemand::json_type::object:
86+
return object_to_dict(element.get_object());
87+
break;
88+
case ondemand::json_type::array:
89+
return array_to_list(element.get_array());
90+
break;
91+
case ondemand::json_type::boolean:
92+
return element.get_bool() ? Py_True : Py_False;
93+
break;
94+
case ondemand::json_type::null:
95+
return Py_None;
96+
case ondemand::json_type::string:
97+
return json_str_to_pyobject(element);
98+
break;
99+
case ondemand::json_type::number:
100+
return json_number_to_pyobject(element);
101+
break;
102+
case ondemand::json_type::unknown:
103+
// TODO: improve error hadling
104+
PyErr_Format(PyExc_ValueError, "Some error occourred");
105+
break;
106+
}
107+
108+
return NULL;
109+
}
110+
111+
} // namespace json
112+
} // namespace pandas
113+
114+
extern "C" {
115+
116+
// Python entry point: simdjson_loads(obj, precise_float=False).
//
// Parses `obj` (str or bytes-like, read through the "s#" format unit) as a
// JSON document and returns the equivalent Python object.
//
// Arguments:
//   obj            JSON text to parse.
//   precise_float  Accepted and ignored; kept for compatibility with the old
//                  parser.
//
// Returns a new reference, or NULL with an exception set. Every simdjson
// failure is reported as ValueError("JSON parsing error: ...").
//
// NOTE(review): the keyword registered here is "obj", whereas the type stub
// simdjson.pyi declares the first parameter as `s` - confirm which name is
// intended.
PyObject *json_loads(PyObject *Py_UNUSED(self), PyObject *args,
                     PyObject *kwargs) {
  namespace od = simdjson::ondemand;

  static const char *kwlist[] = {"obj", "precise_float", NULL};
  const char *buf = NULL;
  Py_ssize_t len = 0;
  // Unused. It's declared for compatibility with old parser. "p" stores the
  // truth value of the argument into an int.
  int precise_float = 0;
  // The cast keeps this compiling against CPython headers that declare the
  // keyword list as `char **`.
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|p",
                                   const_cast<char **>(kwlist), &buf, &len,
                                   &precise_float)) {
    return NULL;
  }
  (void)precise_float;

  // Must start out NULL: the catch block below can run before any branch has
  // assigned it.
  PyObject *ret = NULL;
  try {
    simdjson::padded_string padded_json(buf, static_cast<size_t>(len));
    od::document doc = pandas::json::parser.iterate(padded_json);
    // Scalar documents are handled here; containers go through
    // build_python_object() in the default branch.
    switch (doc.type()) {
    case od::json_type::null: {
      // type() only looks at the first character; validate the literal.
      const bool is_null = doc.is_null();
      if (!is_null) {
        PyErr_SetString(PyExc_ValueError,
                        "JSON parsing error: invalid null literal");
        return NULL;
      }
      ret = Py_None;
      Py_INCREF(ret); // singleton: return an owned reference
      break;
    }
    case od::json_type::boolean: {
      const bool flag = doc.get_bool();
      ret = flag ? Py_True : Py_False;
      Py_INCREF(ret); // singleton: return an owned reference
      break;
    }
    case od::json_type::number: {
      // Inspect the category first so that an integer too large for 64 bits
      // is reported as "Overflow" before get_number() is attempted.
      // NOTE(review): nested big integers are converted to Python ints by
      // big_int_to_pylong(); only a top-level one is rejected - confirm this
      // asymmetry is intended.
      const od::number_type kind = doc.get_number_type();
      if (kind == od::number_type::big_integer) {
        PyErr_Format(PyExc_ValueError, "Overflow");
        return NULL;
      }
      od::number num = doc.get_number();
      switch (num.get_number_type()) {
      case od::number_type::signed_integer:
        ret = PyLong_FromLongLong(num.get_int64());
        break;
      case od::number_type::unsigned_integer:
        ret = PyLong_FromUnsignedLongLong(num.get_uint64());
        break;
      case od::number_type::floating_point_number:
        ret = PyFloat_FromDouble(num.get_double());
        break;
      case od::number_type::big_integer:
        PyErr_Format(PyExc_ValueError, "Overflow");
        return NULL;
      }
      break;
    }
    case od::json_type::string: {
      std::string_view s = doc.get_string();
      ret = PyUnicode_FromStringAndSize(s.data(),
                                        static_cast<Py_ssize_t>(s.size()));
      break;
    }
    default: {
      od::value val = doc;
      ret = pandas::json::build_python_object(val);
      break;
    }
    }
  } catch (const simdjson::simdjson_error &error) {
    Py_XDECREF(ret);
    ret = NULL;
    // TODO: get location or token where error occurred
    PyErr_Format(PyExc_ValueError, "JSON parsing error: %s", error.what());
    return NULL;
  }

  // Never hand NULL back to the interpreter without an exception set.
  if (ret == NULL && !PyErr_Occurred()) {
    PyErr_SetString(PyExc_ValueError,
                    "JSON parsing error: unsupported value");
  }
  return ret;
}
177+
178+
// Method table of the extension module. It exposes json_loads() to Python
// under the name "simdjson_loads", accepting positional and keyword
// arguments.
static PyMethodDef json_methods[] = {
    {"simdjson_loads",
     // Two-step cast through void (*)(void): json_loads takes three
     // arguments, while the table slot is typed as PyCFunction.
     reinterpret_cast<PyCFunction>(
         reinterpret_cast<void (*)(void)>(json_loads)),
     METH_VARARGS | METH_KEYWORDS, "Parse JSON string using simdjson"},
    {NULL, NULL, 0, NULL} /* sentinel */
};
183+
184+
// Definition of the "pandas._libs.simdjson" extension module. Fields are
// given positionally, in PyModuleDef declaration order, so the initializer
// does not depend on designated-initializer support in the C++ compiler.
static struct PyModuleDef json_module = {
    PyModuleDef_HEAD_INIT,     // m_base
    "pandas._libs.simdjson",   // m_name
    "simdjson python binding", // m_doc
    0,                         // m_size
    json_methods,              // m_methods
    NULL,                      // m_slots
    NULL,                      // m_traverse
    NULL,                      // m_clear
    NULL,                      // m_free
};
195+
196+
// Module initialization hook looked up by the import system; it passes the
// module definition above to PyModuleDef_Init() and returns the result.
PyMODINIT_FUNC PyInit_simdjson(void) {
  PyObject *const definition = PyModuleDef_Init(&json_module);
  return definition;
}
197+
198+
} // extern "C"

0 commit comments

Comments
 (0)