From cb94433727608146013b950925c416f16825b787 Mon Sep 17 00:00:00 2001
From: Ghislain Fourny
Date: Thu, 17 Jul 2025 16:17:45 +0200
Subject: [PATCH 1/2] Test and fix the chaining SQL/JSONiq

---
 README.md             | 47 ++++++++++++++++++++++++++++++++++++++++---
 pyproject.toml        |  2 +-
 src/jsoniq/session.py |  5 ++++-
 3 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 69ab865..fb0d453 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,8 @@ pip install jsoniq
 
 We will make more documentation available as we go. In the meantime, you will find a sample code below that should just run after installing the library.
 
+You can directly copy-paste the code below into a Python file and execute it with Python.
+
 ```
 from jsoniq import RumbleSession
 
@@ -83,13 +85,41 @@ modes = res.availableOutputs();
 for mode in modes:
     print(mode)
 
-###### Parallel access ######
+#########################################################
+###### Manipulating DataFrames with SQL and JSONiq ######
+#########################################################
 
-# This returns a regular data frame that can be further processed with spark.sql() or rumble.jsoniq().
+# If the output of the JSONiq query is structured (i.e., RumbleDB was able to detect a schema),
+# then we can extract a regular DataFrame that can be further processed with spark.sql() or rumble.jsoniq().
 df = res.df();
 df.show();
 
+# We are continuously working on schema detection, and RumbleDB will get better at it over time.
+# JSONiq is a very powerful language and can also produce heterogeneous output "by design". In that case you need
+# to use rdd() instead of df(), or to collect the list of JSON values (see further down). Remember
+# that availableOutputs() tells you what is at your disposal.
+
+# A DataFrame output by JSONiq can be reused as input to a Spark SQL query.
+# (Remember that rumble is a wrapper around a SparkSession object, so you can use rumble.sql() just like spark.sql().)
+df.createTempView("input")
+df2 = rumble.sql("SELECT * FROM input").toDF("name");
+df2.show();
+
+# A DataFrame output by Spark SQL can be reused as input to a JSONiq query.
+rumble.bindDataFrameAsVariable('$b', df2);
+seq2 = rumble.jsoniq("for $i in 1 to 5 return $b");
+df3 = seq2.df();
+df3.show();
+
+# And a DataFrame output by JSONiq can be reused as input to another JSONiq query.
+rumble.bindDataFrameAsVariable('$b', df3);
+seq3 = rumble.jsoniq("$b[position() lt 3]");
+df4 = seq3.df();
+df4.show();
+
+#########################
 ##### Local access ######
+#########################
 
 # This materializes the rows as items.
 # The items are accessed with the RumbleDB Item API.
@@ -103,7 +133,9 @@ while (res.hasNext()):
     print(res.next().getStringValue());
 res.close();
 
+################################################################################################################
 ###### Native Python/JSON Access for bypassing the Item API (but losing on the richer JSONiq type system) ######
+################################################################################################################
 
 # This method directly gets the result as JSON (dict, list, strings, ints, etc).
 jlist = res.json();
@@ -122,6 +154,10 @@ print(rdd.count());
 for str in rdd.take(10):
     print(str);
 
+###################################################
+###### Write back to the disk (or data lake) ######
+###################################################
+
 # It is also possible to write the output to a file locally or on a cluster. The API is similar to that of Spark dataframes.
 # Note that it creates a directory and stores the (potentially very large) output in a sharded directory.
 # RumbleDB was already tested with up to 64 AWS machines and 100s of TBs of data.
@@ -134,7 +170,9 @@ seq.write().mode("overwrite").parquet("outputparquet");
 
 seq = rumble.jsoniq("1+1");
 seq.write().mode("overwrite").text("outputtext");
 
-# A more complex, standalone query
+############################################
+##### More complex, standalone queries #####
+############################################
 
 seq = rumble.jsoniq("""
@@ -181,6 +219,9 @@ return {
 print(seq.json());
 ```
 
+# How to learn JSONiq, and more query examples
+
+Even more queries can be found [here](https://colab.research.google.com/github/RumbleDB/rumble/blob/master/RumbleSandbox.ipynb), and you can look at the [JSONiq documentation](https://www.jsoniq.org) and tutorials.
 
 # Last updates
 
diff --git a/pyproject.toml b/pyproject.toml
index 2ee019c..97b10b6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "jsoniq"
-version = "0.1.0a10"
+version = "0.1.0a11"
 description = "Python edition of RumbleDB, a JSONiq engine"
 requires-python = ">=3.11"
 dependencies = [
diff --git a/src/jsoniq/session.py b/src/jsoniq/session.py
index 81db729..e5cc968 100644
--- a/src/jsoniq/session.py
+++ b/src/jsoniq/session.py
@@ -88,7 +88,10 @@ def bindDataFrameAsVariable(self, name: str, df):
         if not name.startswith("$"):
             raise ValueError("Variable name must start with a dollar symbol ('$').")
         name = name[1:]
-        conf.setExternalVariableValue(name, df._jdf);
+        if hasattr(df, "_get_object_id"):
+            conf.setExternalVariableValue(name, df);
+        else:
+            conf.setExternalVariableValue(name, df._jdf);
         return self;
 
     def jsoniq(self, str):

From 480c0b86c9f995d449a5722189ff75e8f8136415 Mon Sep 17 00:00:00 2001
From: Ghislain Fourny
Date: Thu, 17 Jul 2025 16:19:06 +0200
Subject: [PATCH 2/2] Add release history

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index fb0d453..bc968b8 100644
--- a/README.md
+++ b/README.md
@@ -225,6 +225,9 @@ Even more queries can be found [here](https://colab.research.google.com/github/R
 
 # Last updates
 
+## Version 0.1.0 alpha 11
+- Fix an issue when feeding a DataFrame output by rumble.jsoniq() back into a new JSONiq query (as a variable).
+
 ## Version 0.1.0 alpha 10
 - Add an explicit explanation on stderr if the Java version is not properly set, together with hints.
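A note on the `src/jsoniq/session.py` change in PATCH 1/2: a py4j `JavaObject` exposes `_get_object_id()`, while a regular PySpark `DataFrame` does not and instead carries its Java-side counterpart in `_jdf`. The added branch accepts both shapes, which suggests that a DataFrame coming back from a previous `rumble.jsoniq(...).df()` call may arrive as a Java-side object rather than as a PySpark wrapper. The sketch below isolates that dispatch for illustration; the helper name `unwrap_dataframe` is hypothetical and not part of the library.

```
# A minimal sketch, assuming py4j semantics: JavaObject instances expose _get_object_id(),
# whereas PySpark DataFrames keep the underlying Java DataFrame in _jdf.
# The helper name is hypothetical and only mirrors the check added by the patch.
def unwrap_dataframe(df):
    if hasattr(df, "_get_object_id"):
        # Already a Java-side DataFrame (py4j JavaObject): pass it through unchanged.
        return df
    # A regular PySpark DataFrame: hand over the underlying Java DataFrame instead.
    return df._jdf
```

With that distinction in place, both a DataFrame produced by `rumble.sql(...)` and one obtained from a previous JSONiq query can be bound with `bindDataFrameAsVariable('$b', df)`, which is exactly what the chaining examples added to the README exercise.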