diff --git a/extruct/jsonld.py b/extruct/jsonld.py
index e30c6fe6..e43ad9e8 100644
--- a/extruct/jsonld.py
+++ b/extruct/jsonld.py
@@ -4,14 +4,10 @@
 """
 
 import json
-import re
 
-import jstyleson
 import lxml.etree
 
-from extruct.utils import parse_html
-
-HTML_OR_JS_COMMENTLINE = re.compile(r'^\s*(//.*|<!--.*-->)')
+from extruct.utils import parse_html, parse_json
 
 
 class JsonLdExtractor(object):
@@ -29,13 +25,7 @@ def extract_items(self, document, base_url=None):
         ]
 
     def _extract_items(self, node):
-        script = node.xpath('string()')
-        try:
-            # TODO: `strict=False` can be configurable if needed
-            data = json.loads(script, strict=False)
-        except ValueError:
-            # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments
-            data = jstyleson.loads(HTML_OR_JS_COMMENTLINE.sub('', script),strict=False)
+        data = parse_json(node.xpath('string()'))
         if isinstance(data, list):
             return data
         elif isinstance(data, dict):
diff --git a/extruct/rdfa.py b/extruct/rdfa.py
index 1cd1d133..187c774d 100644
--- a/extruct/rdfa.py
+++ b/extruct/rdfa.py
@@ -17,7 +17,7 @@
 from rdflib.plugins.parsers.pyRdfa import pyRdfa as PyRdfa, Options, logger as pyrdfa_logger
 from rdflib.plugins.parsers.pyRdfa.initialcontext import initial_context
 
-from extruct.utils import parse_xmldom_html
+from extruct.utils import parse_json, parse_xmldom_html
 
 
 # silence rdflib/PyRdfa INFO logs
@@ -159,4 +159,4 @@ def extract_items(self, document, base_url=None, expanded=True):
             # it should be disabled once PyRDFA fixes itself
             return self._fix_order(jsonld_string, document)
         except:
-            return json.loads(jsonld_string)
+            return parse_json(jsonld_string)
diff --git a/extruct/utils.py b/extruct/utils.py
index a29a61f6..d36a8351 100644
--- a/extruct/utils.py
+++ b/extruct/utils.py
@@ -1,4 +1,14 @@
 # -*- coding: utf-8 -*-
+
+import json
+import re
+
+try:
+    from json.decoder import JSONDecodeError
+except ImportError:
+    JSONDecodeError = ValueError
+
+import jstyleson
 import lxml.html
 
 from extruct.xmldom import XmlDomHTMLParser
@@ -10,6 +20,35 @@ def parse_html(html, encoding):
     return lxml.html.fromstring(html, parser=parser)
 
 
+HTML_OR_JS_COMMENTLINE = re.compile(r'^\s*(//.*|<!--.*-->)')
+
+
+def parse_json(json_string):
+    try:
+        return json.loads(json_string, strict=False)
+    except ValueError:
+        pass
+
+    # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments
+    json_string = HTML_OR_JS_COMMENTLINE.sub('', json_string)
+
+    while True:
+        try:
+            return jstyleson.loads(json_string, strict=False)
+        except JSONDecodeError as error:
+            if (
+                hasattr(error, 'msg')
+                and error.msg == "Expecting ',' delimiter"
+                and json_string[error.pos-1] == '"'
+            ):
+                insertion_position = error.pos-1
+                prefix = json_string[:insertion_position]
+                suffix = json_string[insertion_position:]
+                json_string = prefix + '\\' + suffix
+                continue
+            raise
+
+
 def parse_xmldom_html(html, encoding):
     """ Parse HTML using XmlDomHTMLParser, return a tree """
     parser = XmlDomHTMLParser(encoding=encoding)
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 00000000..a3a3988c
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,26 @@
+from sys import version_info
+
+from pytest import mark, raises
+
+from extruct.utils import parse_json
+
+
+@mark.parametrize(
+    'input,output',
+    [
+        (
+            '{"a": ["10\'5""]}',
+            {'a': ['10\'5"']},
+        ),
+        (
+            '{"a": ["Say "Hello""]}',
+            {'a': ['Say "Hello"']},
+        ),
+    ]
+)
+def test_parse_json(input, output):
+    if version_info >= (3,):
+        assert parse_json(input) == output
+    else:
+        with raises(ValueError):
+            parse_json(input)