diff --git a/README.md b/README.md
index 32ef89e..d4061f3 100644
--- a/README.md
+++ b/README.md
@@ -349,6 +349,13 @@ Even more queries can be found [here](https://colab.research.google.com/github/R
 
 # Latest updates
 
+## Version 0.3.0 alpha 3
+- Added parameters to the jsoniq magic to select the desired output to print: -j, -df, -pdf.
+- Added an informative error message, with a hint on how to fix the issue, when trying to get a DataFrame and no schema is available.
+- Added the parameter -t to the jsoniq magic to measure the response time.
+- The RumbleSession object now saves the latest result (sequence of items) in a field called lastResult. This is particularly useful in notebooks for post-processing a result in Python after obtaining it through the jsoniq magic.
+- Improved static type detection upon binding a pandas or PySpark DataFrame as an input variable to a JSONiq query.
+
 ## Version 0.2.0 alpha 2
 - You can change the result size cap through to the now accessible Rumble configuration (for example rumble .getRumbleConf().setResultSizeCap(10)). This controls how many items can be retrieved at most with a json() call. You can increase it to whichever number you would like if you reach the cap.
 - Add the JSONiq magic to execute JSONiq queries directly in a notebook cell, using the RumbleDB instance shipped with the library.
diff --git a/pyproject.toml b/pyproject.toml
index b99a64e..3744a4e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "jsoniq"
-version = "0.2.0a2"
+version = "0.2.0a3"
 description = "Python edition of RumbleDB, a JSONiq engine"
 requires-python = ">=3.11"
 dependencies = [
diff --git a/src/jsoniq/jars/rumbledb-1.24.0.jar b/src/jsoniq/jars/rumbledb-1.24.0.jar
index f87e7b7..371abc6 100644
Binary files a/src/jsoniq/jars/rumbledb-1.24.0.jar and b/src/jsoniq/jars/rumbledb-1.24.0.jar differ
diff --git a/src/jsoniq/sequence.py b/src/jsoniq/sequence.py
index 2aa5b46..7d9993a 100644
--- a/src/jsoniq/sequence.py
+++ b/src/jsoniq/sequence.py
@@ -2,8 +2,30 @@ from pyspark.sql import SparkSession
 from pyspark.sql import DataFrame
 import json
+import sys
 
 
 class SequenceOfItems:
+    schema_str = """
+No DataFrame available because no schema was automatically detected. If you still believe the output is structured enough, you can explicitly add a schema and a validate expression to your query.
+
+This is an example of how you can define a schema and wrap your query in a validate expression:
+
+declare type local:mytype as {
+  "product" : "string",
+  "store-number" : "int",
+  "quantity" : "decimal"
+};
+validate type local:mytype* {
+  for $product in json-lines("http://rumbledb.org/samples/products-small.json", 10)
+  where $product.quantity ge 995
+  return $product
+}
+
+RumbleDB keeps improving, and automatic schema detection will get better as new versions are released. But even when RumbleDB fails to detect a schema, you can always declare your own schema as shown above.
+
+For more information, see the documentation at https://docs.rumbledb.org/rumbledb-reference/types
+"""
+
     def __init__(self, sequence, rumblesession):
         self._jsequence = sequence
         self._rumblesession = rumblesession
@@ -28,9 +50,15 @@ def rdd(self):
         return rdd.map(lambda l: json.loads(l))
 
     def df(self):
+        if ("DataFrame" not in self._jsequence.availableOutputs()):
+            sys.stderr.write(self.schema_str)
+            return None
         return DataFrame(self._jsequence.getAsDataFrame(), self._sparksession)
 
     def pdf(self):
+        if ("DataFrame" not in self._jsequence.availableOutputs()):
+            sys.stderr.write(self.schema_str)
+            return None
         return self.df().toPandas()
 
     def count(self):
diff --git a/src/jsoniq/session.py b/src/jsoniq/session.py
index f4c03fe..0d838f2 100644
--- a/src/jsoniq/session.py
+++ b/src/jsoniq/session.py
@@ -169,7 +169,8 @@ def bindDataFrameAsVariable(self, name: str, df):
 
     def jsoniq(self, str):
         sequence = self._jrumblesession.runQuery(str);
-        return SequenceOfItems(sequence, self);
+        self.lastResult = SequenceOfItems(sequence, self);
+        return self.lastResult;
 
     def __getattr__(self, item):
         return getattr(self._sparksession, item)
\ No newline at end of file
diff --git a/src/jsoniqmagic/magic.py b/src/jsoniqmagic/magic.py
index ef02ee7..d47adc5 100644
--- a/src/jsoniqmagic/magic.py
+++ b/src/jsoniqmagic/magic.py
@@ -1,16 +1,36 @@
 from IPython.core.magic import Magics, cell_magic, magics_class
+from IPython.core.magic_arguments import (
+    argument, magic_arguments, parse_argstring
+)
 import time, json
 from jsoniq.session import RumbleSession
 from py4j.protocol import Py4JJavaError
 
 
 @magics_class
 class JSONiqMagic(Magics):
+    @magic_arguments()
+    @argument(
+        '-t', '--timed', action='store_true', help='Measures the execution time.'
+    )
+    @argument(
+        '-df', '--pyspark-data-frame', action='store_true', help='Prints the output as a PySpark DataFrame (if a schema is available).'
+    )
+    @argument(
+        '-pdf', '--pandas-data-frame', action='store_true', help='Prints the output as a pandas DataFrame (if a schema is available).'
+    )
+    @argument(
+        '-j', '--json', action='store_true', help='Prints the output as JSON.'
+    )
+    @argument(
+        '-u', '--apply-updates', action='store_true', help='Applies updates if a PUL is output.'
+    )
     def run(self, line, cell=None, timed=False):
         if cell is None:
             data = line
         else:
             data = cell
+        args = parse_argstring(self.run, line)
         start = time.time()
         try:
             rumble = RumbleSession.builder.getOrCreate();
@@ -28,28 +48,53 @@ def run(self, line, cell=None, timed=False):
             print("Query unsuccessful.")
             print("Usual reasons: firewall, misconfigured proxy.")
             return
 
-        end = time.time()
-        if(timed):
-            print("Response time: %s ms" % (end - start))
-        if ("DataFrame" in response.availableOutputs()):
-            print(response.pdf())
-        elif ("Local" in response.availableOutputs()):
+        schema_str = """
+No DataFrame available because no schema was detected. If you still believe the output is structured enough, you can explicitly add a schema and a validate expression to your query.
+
+This is an example of how you can define a schema and wrap your query in a validate expression:
+
+declare type mytype as {
+  "product" : "string",
+  "store-number" : "int",
+  "quantity" : "decimal"
+};
+validate type mytype* {
+  for $product in json-lines("http://rumbledb.org/samples/products-small.json", 10)
+  where $product.quantity ge 995
+  return $product
+}
+"""
+
+        if (args.pyspark_data_frame):
+            df = response.df();
+            if df is not None:
+                df.show()
+
+        if (args.pandas_data_frame):
+            pdf = response.pdf()
+            if pdf is not None:
+                print(pdf)
+
+        if (args.apply_updates):
+            if ("PUL" in response.availableOutputs()):
+                response.applyPUL()
+                print("Updates applied successfully.")
+            else:
+                print("No Pending Update List (PUL) available to apply.")
+
+        if (args.json or (not args.pandas_data_frame and not args.pyspark_data_frame)):
             capplusone = response.take(rumble.getRumbleConf().getResultSizeCap() + 1)
             if len(capplusone) > rumble.getRumbleConf().getResultSizeCap():
                 count = response.count()
                 print("The query output %s items, which is too many to display. Displaying the first %s items:" % (count, rumble.getRumbleConf().getResultSizeCap()))
             for e in capplusone[:rumble.getRumbleConf().getResultSizeCap()]:
                 print(json.dumps(json.loads(e.serializeAsJSON()), indent=2))
-        elif ("PUL" in response.availableOutputs()):
-            print("The query output a Pending Update List.")
-        else:
-            print("No output available.")
+
+        end = time.time()
+        if (args.timed):
+            print("Response time: %s ms" % (1000 * (end - start)))
 
     @cell_magic
     def jsoniq(self, line, cell=None):
         return self.run(line, cell, False)
-
-    @cell_magic
-    def timedjsoniq(self, line, cell=None):
-        return self.run(line, cell, True)
\ No newline at end of file
diff --git a/tests/test_sample.py b/tests/test_sample.py
index 3acb22e..4232ac1 100644
--- a/tests/test_sample.py
+++ b/tests/test_sample.py
@@ -32,6 +32,7 @@ def test1(self):
         # Generally though, the results may contain zero, one, two, or more items.
         python_list = items.json()
         print(python_list)
+        self.assertTrue(json.dumps(python_list) == json.dumps((2,)))
 
############################################
##### More complex, standalone queries #####
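
A minimal notebook-style sketch of how the pieces introduced in this change are meant to fit together: the output-selection and timing flags of the %%jsoniq cell magic, and the lastResult field on RumbleSession. The query and the post-processing steps below are illustrative only; the flags, lastResult, json() and pdf() are the ones defined in the diff above.

# Illustrative notebook usage sketch (assumes a %%jsoniq cell has already run).
from jsoniq.session import RumbleSession

# The magic and user code share the same underlying session via getOrCreate().
rumble = RumbleSession.builder.getOrCreate()

# In a notebook cell, the magic can now time the query and select the output:
#
#   %%jsoniq -t -j
#   for $i in 1 to 5
#   return { "n" : $i, "square" : $i * $i }
#
# -j prints the result as JSON, -t prints the response time,
# -df / -pdf print a PySpark / pandas DataFrame instead (a schema is required),
# and -u applies a Pending Update List if the query produced one.

# After the cell has run, the latest sequence of items stays on the session,
# so it can be post-processed in plain Python:
items = rumble.lastResult   # SequenceOfItems produced by the last query
values = items.json()       # materialize the items as Python values (subject to the result size cap)
pdf = items.pdf()           # pandas DataFrame, or None plus a hint on stderr if no schema was detected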