From 44edb078565008089f23b3351d0b9beb6f9a4b02 Mon Sep 17 00:00:00 2001 From: saipraneeth <2506664+msaipraneeth@users.noreply.github.com> Date: Thu, 21 Nov 2024 16:05:07 +0000 Subject: [PATCH 01/11] init validate entity plugin with basic parameters --- .../validate_entities/__init__.py | 1 + .../validate_entities/task.py | 47 +++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 cmem_plugin_validation/validate_entities/__init__.py create mode 100644 cmem_plugin_validation/validate_entities/task.py diff --git a/cmem_plugin_validation/validate_entities/__init__.py b/cmem_plugin_validation/validate_entities/__init__.py new file mode 100644 index 0000000..08f67d9 --- /dev/null +++ b/cmem_plugin_validation/validate_entities/__init__.py @@ -0,0 +1 @@ +"""validate entities.""" diff --git a/cmem_plugin_validation/validate_entities/task.py b/cmem_plugin_validation/validate_entities/task.py new file mode 100644 index 0000000..37db339 --- /dev/null +++ b/cmem_plugin_validation/validate_entities/task.py @@ -0,0 +1,47 @@ +"""Entities validation workflow task""" + +from collections.abc import Sequence + +from cmem_plugin_base.dataintegration.context import ExecutionContext +from cmem_plugin_base.dataintegration.description import Plugin, PluginParameter +from cmem_plugin_base.dataintegration.entity import Entities +from cmem_plugin_base.dataintegration.parameter.dataset import DatasetParameterType +from cmem_plugin_base.dataintegration.plugins import WorkflowPlugin + +DEFAULT_FAIL_ON_VIOLATION = False + + +@Plugin( + label="Validate Entity", + plugin_id="cmem_plugin_validation-validate-ValidateEntity", + description="Use JSON schema to validate entities", + documentation="Sai", + parameters=[ + PluginParameter( + name="json_schema_dataset", + label="JSON Schema Dataset", + description="This dataset holds the resources you want to validate.", + param_type=DatasetParameterType(dataset_type="json"), + ), + PluginParameter( + name="fail_on_violations", + 
label="Fail workflow on violations", + default_value=DEFAULT_FAIL_ON_VIOLATION, + ), + ], +) +class ValidateEntity(WorkflowPlugin): + """Validate entities against a JSON schema""" + + def __init__(self, json_schema_dataset: str, fail_on_violations: bool): + self.json_schema_dataset = json_schema_dataset + self.fail_on_violations = fail_on_violations + + def execute( + self, + inputs: Sequence[Entities], # noqa: ARG002 + context: ExecutionContext, + ) -> Entities | None: + """Run the workflow operator.""" + _ = context + return None From 547fdd24fdf311936ccac748f739181323ae60a2 Mon Sep 17 00:00:00 2001 From: saipraneeth <2506664+msaipraneeth@users.noreply.github.com> Date: Thu, 21 Nov 2024 16:08:20 +0000 Subject: [PATCH 02/11] add jsonschema dependency --- poetry.lock | 182 +++++++++++++++++++++++++++++++++++++++++++++++-- pyproject.toml | 1 + 2 files changed, 176 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index d150aca..5916aa3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,23 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. 
+ +[[package]] +name = "attrs" +version = "24.2.0" +description = "Classes Without Boilerplate" +optional = false +python-versions = ">=3.7" +files = [ + {file = "attrs-24.2.0-py3-none-any.whl", hash = "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"}, + {file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"}, +] + +[package.extras] +benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"] +tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] [[package]] name = "beautifulsoup4" @@ -470,6 +489,41 @@ MarkupSafe = ">=2.0" [package.extras] i18n = ["Babel (>=2.7)"] +[[package]] +name = "jsonschema" +version = "4.23.0" +description = "An implementation of JSON Schema validation for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "jsonschema-4.23.0-py3-none-any.whl", hash = "sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566"}, + {file = "jsonschema-4.23.0.tar.gz", hash = "sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4"}, +] + +[package.dependencies] +attrs = ">=22.2.0" +jsonschema-specifications = ">=2023.03.6" +referencing = ">=0.28.4" +rpds-py = ">=0.7.1" + +[package.extras] +format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", 
"rfc3987", "uri-template", "webcolors (>=1.11)"] +format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=24.6.0)"] + +[[package]] +name = "jsonschema-specifications" +version = "2024.10.1" +description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry" +optional = false +python-versions = ">=3.9" +files = [ + {file = "jsonschema_specifications-2024.10.1-py3-none-any.whl", hash = "sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf"}, + {file = "jsonschema_specifications-2024.10.1.tar.gz", hash = "sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272"}, +] + +[package.dependencies] +referencing = ">=0.31.0" + [[package]] name = "junit-xml" version = "1.9" @@ -1149,6 +1203,21 @@ html = ["html5lib (>=1.0,<2.0)"] lxml = ["lxml (>=4.3.0,<5.0.0)"] networkx = ["networkx (>=2.0.0,<3.0.0)"] +[[package]] +name = "referencing" +version = "0.35.1" +description = "JSON Referencing + Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "referencing-0.35.1-py3-none-any.whl", hash = "sha256:eda6d3234d62814d1c64e305c1331c9a3a6132da475ab6382eaa997b21ee75de"}, + {file = "referencing-0.35.1.tar.gz", hash = "sha256:25b42124a6c8b632a425174f24087783efb348a6f1e0008e63cd4466fedf703c"}, +] + +[package.dependencies] +attrs = ">=22.2.0" +rpds-py = ">=0.7.0" + [[package]] name = "requests" version = "2.32.3" @@ -1202,6 +1271,105 @@ pygments = ">=2.13.0,<3.0.0" [package.extras] jupyter = ["ipywidgets (>=7.5.1,<9)"] +[[package]] +name = "rpds-py" +version = "0.21.0" +description = "Python bindings to Rust's persistent data structures (rpds)" +optional = false +python-versions = ">=3.9" +files = [ + {file = "rpds_py-0.21.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:a017f813f24b9df929674d0332a374d40d7f0162b326562daae8066b502d0590"}, + {file = "rpds_py-0.21.0-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:20cc1ed0bcc86d8e1a7e968cce15be45178fd16e2ff656a243145e0b439bd250"}, + {file = "rpds_py-0.21.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad116dda078d0bc4886cb7840e19811562acdc7a8e296ea6ec37e70326c1b41c"}, + {file = "rpds_py-0.21.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:808f1ac7cf3b44f81c9475475ceb221f982ef548e44e024ad5f9e7060649540e"}, + {file = "rpds_py-0.21.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de552f4a1916e520f2703ec474d2b4d3f86d41f353e7680b597512ffe7eac5d0"}, + {file = "rpds_py-0.21.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:efec946f331349dfc4ae9d0e034c263ddde19414fe5128580f512619abed05f1"}, + {file = "rpds_py-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b80b4690bbff51a034bfde9c9f6bf9357f0a8c61f548942b80f7b66356508bf5"}, + {file = "rpds_py-0.21.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:085ed25baac88953d4283e5b5bd094b155075bb40d07c29c4f073e10623f9f2e"}, + {file = "rpds_py-0.21.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:daa8efac2a1273eed2354397a51216ae1e198ecbce9036fba4e7610b308b6153"}, + {file = "rpds_py-0.21.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:95a5bad1ac8a5c77b4e658671642e4af3707f095d2b78a1fdd08af0dfb647624"}, + {file = "rpds_py-0.21.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3e53861b29a13d5b70116ea4230b5f0f3547b2c222c5daa090eb7c9c82d7f664"}, + {file = "rpds_py-0.21.0-cp310-none-win32.whl", hash = "sha256:ea3a6ac4d74820c98fcc9da4a57847ad2cc36475a8bd9683f32ab6d47a2bd682"}, + {file = "rpds_py-0.21.0-cp310-none-win_amd64.whl", hash = "sha256:b8f107395f2f1d151181880b69a2869c69e87ec079c49c0016ab96860b6acbe5"}, + {file = "rpds_py-0.21.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:5555db3e618a77034954b9dc547eae94166391a98eb867905ec8fcbce1308d95"}, + {file = 
"rpds_py-0.21.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:97ef67d9bbc3e15584c2f3c74bcf064af36336c10d2e21a2131e123ce0f924c9"}, + {file = "rpds_py-0.21.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ab2c2a26d2f69cdf833174f4d9d86118edc781ad9a8fa13970b527bf8236027"}, + {file = "rpds_py-0.21.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4e8921a259f54bfbc755c5bbd60c82bb2339ae0324163f32868f63f0ebb873d9"}, + {file = "rpds_py-0.21.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8a7ff941004d74d55a47f916afc38494bd1cfd4b53c482b77c03147c91ac0ac3"}, + {file = "rpds_py-0.21.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5145282a7cd2ac16ea0dc46b82167754d5e103a05614b724457cffe614f25bd8"}, + {file = "rpds_py-0.21.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de609a6f1b682f70bb7163da745ee815d8f230d97276db049ab447767466a09d"}, + {file = "rpds_py-0.21.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:40c91c6e34cf016fa8e6b59d75e3dbe354830777fcfd74c58b279dceb7975b75"}, + {file = "rpds_py-0.21.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d2132377f9deef0c4db89e65e8bb28644ff75a18df5293e132a8d67748397b9f"}, + {file = "rpds_py-0.21.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:0a9e0759e7be10109645a9fddaaad0619d58c9bf30a3f248a2ea57a7c417173a"}, + {file = "rpds_py-0.21.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9e20da3957bdf7824afdd4b6eeb29510e83e026473e04952dca565170cd1ecc8"}, + {file = "rpds_py-0.21.0-cp311-none-win32.whl", hash = "sha256:f71009b0d5e94c0e86533c0b27ed7cacc1239cb51c178fd239c3cfefefb0400a"}, + {file = "rpds_py-0.21.0-cp311-none-win_amd64.whl", hash = "sha256:e168afe6bf6ab7ab46c8c375606298784ecbe3ba31c0980b7dcbb9631dcba97e"}, + {file = "rpds_py-0.21.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = 
"sha256:30b912c965b2aa76ba5168fd610087bad7fcde47f0a8367ee8f1876086ee6d1d"}, + {file = "rpds_py-0.21.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ca9989d5d9b1b300bc18e1801c67b9f6d2c66b8fd9621b36072ed1df2c977f72"}, + {file = "rpds_py-0.21.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6f54e7106f0001244a5f4cf810ba8d3f9c542e2730821b16e969d6887b664266"}, + {file = "rpds_py-0.21.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fed5dfefdf384d6fe975cc026886aece4f292feaf69d0eeb716cfd3c5a4dd8be"}, + {file = "rpds_py-0.21.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:590ef88db231c9c1eece44dcfefd7515d8bf0d986d64d0caf06a81998a9e8cab"}, + {file = "rpds_py-0.21.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f983e4c2f603c95dde63df633eec42955508eefd8d0f0e6d236d31a044c882d7"}, + {file = "rpds_py-0.21.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b229ce052ddf1a01c67d68166c19cb004fb3612424921b81c46e7ea7ccf7c3bf"}, + {file = "rpds_py-0.21.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ebf64e281a06c904a7636781d2e973d1f0926a5b8b480ac658dc0f556e7779f4"}, + {file = "rpds_py-0.21.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:998a8080c4495e4f72132f3d66ff91f5997d799e86cec6ee05342f8f3cda7dca"}, + {file = "rpds_py-0.21.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:98486337f7b4f3c324ab402e83453e25bb844f44418c066623db88e4c56b7c7b"}, + {file = "rpds_py-0.21.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a78d8b634c9df7f8d175451cfeac3810a702ccb85f98ec95797fa98b942cea11"}, + {file = "rpds_py-0.21.0-cp312-none-win32.whl", hash = "sha256:a58ce66847711c4aa2ecfcfaff04cb0327f907fead8945ffc47d9407f41ff952"}, + {file = "rpds_py-0.21.0-cp312-none-win_amd64.whl", hash = "sha256:e860f065cc4ea6f256d6f411aba4b1251255366e48e972f8a347cf88077b24fd"}, + {file = 
"rpds_py-0.21.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:ee4eafd77cc98d355a0d02f263efc0d3ae3ce4a7c24740010a8b4012bbb24937"}, + {file = "rpds_py-0.21.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:688c93b77e468d72579351a84b95f976bd7b3e84aa6686be6497045ba84be560"}, + {file = "rpds_py-0.21.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c38dbf31c57032667dd5a2f0568ccde66e868e8f78d5a0d27dcc56d70f3fcd3b"}, + {file = "rpds_py-0.21.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2d6129137f43f7fa02d41542ffff4871d4aefa724a5fe38e2c31a4e0fd343fb0"}, + {file = "rpds_py-0.21.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:520ed8b99b0bf86a176271f6fe23024323862ac674b1ce5b02a72bfeff3fff44"}, + {file = "rpds_py-0.21.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aaeb25ccfb9b9014a10eaf70904ebf3f79faaa8e60e99e19eef9f478651b9b74"}, + {file = "rpds_py-0.21.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af04ac89c738e0f0f1b913918024c3eab6e3ace989518ea838807177d38a2e94"}, + {file = "rpds_py-0.21.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b9b76e2afd585803c53c5b29e992ecd183f68285b62fe2668383a18e74abe7a3"}, + {file = "rpds_py-0.21.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5afb5efde74c54724e1a01118c6e5c15e54e642c42a1ba588ab1f03544ac8c7a"}, + {file = "rpds_py-0.21.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:52c041802a6efa625ea18027a0723676a778869481d16803481ef6cc02ea8cb3"}, + {file = "rpds_py-0.21.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ee1e4fc267b437bb89990b2f2abf6c25765b89b72dd4a11e21934df449e0c976"}, + {file = "rpds_py-0.21.0-cp313-none-win32.whl", hash = "sha256:0c025820b78817db6a76413fff6866790786c38f95ea3f3d3c93dbb73b632202"}, + {file = "rpds_py-0.21.0-cp313-none-win_amd64.whl", hash = 
"sha256:320c808df533695326610a1b6a0a6e98f033e49de55d7dc36a13c8a30cfa756e"}, + {file = "rpds_py-0.21.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:2c51d99c30091f72a3c5d126fad26236c3f75716b8b5e5cf8effb18889ced928"}, + {file = "rpds_py-0.21.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:cbd7504a10b0955ea287114f003b7ad62330c9e65ba012c6223dba646f6ffd05"}, + {file = "rpds_py-0.21.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6dcc4949be728ede49e6244eabd04064336012b37f5c2200e8ec8eb2988b209c"}, + {file = "rpds_py-0.21.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f414da5c51bf350e4b7960644617c130140423882305f7574b6cf65a3081cecb"}, + {file = "rpds_py-0.21.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9afe42102b40007f588666bc7de82451e10c6788f6f70984629db193849dced1"}, + {file = "rpds_py-0.21.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b929c2bb6e29ab31f12a1117c39f7e6d6450419ab7464a4ea9b0b417174f044"}, + {file = "rpds_py-0.21.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8404b3717da03cbf773a1d275d01fec84ea007754ed380f63dfc24fb76ce4592"}, + {file = "rpds_py-0.21.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e12bb09678f38b7597b8346983d2323a6482dcd59e423d9448108c1be37cac9d"}, + {file = "rpds_py-0.21.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:58a0e345be4b18e6b8501d3b0aa540dad90caeed814c515e5206bb2ec26736fd"}, + {file = "rpds_py-0.21.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:c3761f62fcfccf0864cc4665b6e7c3f0c626f0380b41b8bd1ce322103fa3ef87"}, + {file = "rpds_py-0.21.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c2b2f71c6ad6c2e4fc9ed9401080badd1469fa9889657ec3abea42a3d6b2e1ed"}, + {file = "rpds_py-0.21.0-cp39-none-win32.whl", hash = "sha256:b21747f79f360e790525e6f6438c7569ddbfb1b3197b9e65043f25c3c9b489d8"}, + {file = "rpds_py-0.21.0-cp39-none-win_amd64.whl", hash = 
"sha256:0626238a43152918f9e72ede9a3b6ccc9e299adc8ade0d67c5e142d564c9a83d"}, + {file = "rpds_py-0.21.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:6b4ef7725386dc0762857097f6b7266a6cdd62bfd209664da6712cb26acef035"}, + {file = "rpds_py-0.21.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:6bc0e697d4d79ab1aacbf20ee5f0df80359ecf55db33ff41481cf3e24f206919"}, + {file = "rpds_py-0.21.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da52d62a96e61c1c444f3998c434e8b263c384f6d68aca8274d2e08d1906325c"}, + {file = "rpds_py-0.21.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:98e4fe5db40db87ce1c65031463a760ec7906ab230ad2249b4572c2fc3ef1f9f"}, + {file = "rpds_py-0.21.0-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:30bdc973f10d28e0337f71d202ff29345320f8bc49a31c90e6c257e1ccef4333"}, + {file = "rpds_py-0.21.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:faa5e8496c530f9c71f2b4e1c49758b06e5f4055e17144906245c99fa6d45356"}, + {file = "rpds_py-0.21.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32eb88c30b6a4f0605508023b7141d043a79b14acb3b969aa0b4f99b25bc7d4a"}, + {file = "rpds_py-0.21.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a89a8ce9e4e75aeb7fa5d8ad0f3fecdee813802592f4f46a15754dcb2fd6b061"}, + {file = "rpds_py-0.21.0-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:241e6c125568493f553c3d0fdbb38c74babf54b45cef86439d4cd97ff8feb34d"}, + {file = "rpds_py-0.21.0-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:3b766a9f57663396e4f34f5140b3595b233a7b146e94777b97a8413a1da1be18"}, + {file = "rpds_py-0.21.0-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:af4a644bf890f56e41e74be7d34e9511e4954894d544ec6b8efe1e21a1a8da6c"}, + {file = "rpds_py-0.21.0-pp310-pypy310_pp73-win_amd64.whl", hash = 
"sha256:3e30a69a706e8ea20444b98a49f386c17b26f860aa9245329bab0851ed100677"}, + {file = "rpds_py-0.21.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:031819f906bb146561af051c7cef4ba2003d28cff07efacef59da973ff7969ba"}, + {file = "rpds_py-0.21.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:b876f2bc27ab5954e2fd88890c071bd0ed18b9c50f6ec3de3c50a5ece612f7a6"}, + {file = "rpds_py-0.21.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc5695c321e518d9f03b7ea6abb5ea3af4567766f9852ad1560f501b17588c7b"}, + {file = "rpds_py-0.21.0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b4de1da871b5c0fd5537b26a6fc6814c3cc05cabe0c941db6e9044ffbb12f04a"}, + {file = "rpds_py-0.21.0-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:878f6fea96621fda5303a2867887686d7a198d9e0f8a40be100a63f5d60c88c9"}, + {file = "rpds_py-0.21.0-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8eeec67590e94189f434c6d11c426892e396ae59e4801d17a93ac96b8c02a6c"}, + {file = "rpds_py-0.21.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ff2eba7f6c0cb523d7e9cff0903f2fe1feff8f0b2ceb6bd71c0e20a4dcee271"}, + {file = "rpds_py-0.21.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a429b99337062877d7875e4ff1a51fe788424d522bd64a8c0a20ef3021fdb6ed"}, + {file = "rpds_py-0.21.0-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:d167e4dbbdac48bd58893c7e446684ad5d425b407f9336e04ab52e8b9194e2ed"}, + {file = "rpds_py-0.21.0-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:4eb2de8a147ffe0626bfdc275fc6563aa7bf4b6db59cf0d44f0ccd6ca625a24e"}, + {file = "rpds_py-0.21.0-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:e78868e98f34f34a88e23ee9ccaeeec460e4eaf6db16d51d7a9b883e5e785a5e"}, + {file = "rpds_py-0.21.0-pp39-pypy39_pp73-win_amd64.whl", hash = 
"sha256:4991ca61656e3160cdaca4851151fd3f4a92e9eba5c7a530ab030d6aee96ec89"}, + {file = "rpds_py-0.21.0.tar.gz", hash = "sha256:ed6378c9d66d0de903763e7706383d60c33829581f0adff47b6535f1802fa6db"}, +] + [[package]] name = "ruff" version = "0.6.9" @@ -1249,23 +1417,23 @@ setuptools = "*" [[package]] name = "setuptools" -version = "75.5.0" +version = "75.6.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.9" files = [ - {file = "setuptools-75.5.0-py3-none-any.whl", hash = "sha256:87cb777c3b96d638ca02031192d40390e0ad97737e27b6b4fa831bea86f2f829"}, - {file = "setuptools-75.5.0.tar.gz", hash = "sha256:5c4ccb41111392671f02bb5f8436dfc5a9a7185e80500531b133f5775c4163ef"}, + {file = "setuptools-75.6.0-py3-none-any.whl", hash = "sha256:ce74b49e8f7110f9bf04883b730f4765b774ef3ef28f722cce7c273d253aaf7d"}, + {file = "setuptools-75.6.0.tar.gz", hash = "sha256:8199222558df7c86216af4f84c30e9b34a61d8ba19366cc914424cdbd28252f6"}, ] [package.extras] check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", "ruff (>=0.7.0)"] -core = ["importlib-metadata (>=6)", "jaraco.collections", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more-itertools", "more-itertools (>=8.8)", "packaging", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] +core = ["importlib_metadata (>=6)", "jaraco.collections", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] enabler = ["pytest-enabler (>=2.2)"] test 
= ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] -type = ["importlib-metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (>=1.12,<1.14)", "pytest-mypy"] +type = ["importlib_metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (>=1.12,<1.14)", "pytest-mypy"] [[package]] name = "six" @@ -1496,4 +1664,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "1d66b3f8ac656dd03b0c801e7384ca34c53a195f1e012f03db3b53aff125ab3f" +content-hash = "75f6ff566b3c5f8d2b9e9b7727bb19eebaa38c17022d5924c506ecb78eae6d72" diff --git a/pyproject.toml b/pyproject.toml index fb7f0f9..7cc0ff8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ homepage = "https://github.com/eccenca/cmem-plugin-validation" [tool.poetry.dependencies]# if you need to change python version here, change it also in .python-version python = "^3.11" +jsonschema = "^4.23.0" [tool.poetry.dependencies.cmem-plugin-base] version = "^4.7.0" From 4df7934e3a49631614b127ab88b8386b8ebb0877 Mon Sep 17 00:00:00 2001 From: saipraneeth <2506664+msaipraneeth@users.noreply.github.com> Date: Mon, 25 Nov 2024 07:43:32 +0000 Subject: [PATCH 03/11] add new parameter for json dataset --- .idea/cmem-plugin-validation.iml | 5 +---- cmem_plugin_validation/validate_entities/task.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.idea/cmem-plugin-validation.iml b/.idea/cmem-plugin-validation.iml index 63247f0..e85cfa8 100644 --- a/.idea/cmem-plugin-validation.iml +++ b/.idea/cmem-plugin-validation.iml @@ -2,12 +2,9 @@ - - - - + diff --git 
a/cmem_plugin_validation/validate_entities/task.py b/cmem_plugin_validation/validate_entities/task.py index 37db339..8546540 100644 --- a/cmem_plugin_validation/validate_entities/task.py +++ b/cmem_plugin_validation/validate_entities/task.py @@ -1,7 +1,6 @@ """Entities validation workflow task""" from collections.abc import Sequence - from cmem_plugin_base.dataintegration.context import ExecutionContext from cmem_plugin_base.dataintegration.description import Plugin, PluginParameter from cmem_plugin_base.dataintegration.entity import Entities @@ -17,6 +16,12 @@ description="Use JSON schema to validate entities", documentation="Sai", parameters=[ + PluginParameter( + name="json_dataset", + label="JSON Dataset", + description="This dataset holds the resources you want to validate.", + param_type=DatasetParameterType(dataset_type="json"), + ), PluginParameter( name="json_schema_dataset", label="JSON Schema Dataset", @@ -33,7 +38,8 @@ class ValidateEntity(WorkflowPlugin): """Validate entities against a JSON schema""" - def __init__(self, json_schema_dataset: str, fail_on_violations: bool): + def __init__(self, json_dataset:str,json_schema_dataset: str, fail_on_violations: bool): + self.json_dataset = json_dataset self.json_schema_dataset = json_schema_dataset self.fail_on_violations = fail_on_violations From 43aec5e64c295563f1c49aa7e1bacc253054b562 Mon Sep 17 00:00:00 2001 From: saipraneeth <2506664+msaipraneeth@users.noreply.github.com> Date: Mon, 25 Nov 2024 22:24:13 +0000 Subject: [PATCH 04/11] update execution report on validation errors --- .../validate_entities/state.py | 19 +++++ .../validate_entities/task.py | 71 ++++++++++++++++++- 2 files changed, 88 insertions(+), 2 deletions(-) create mode 100644 cmem_plugin_validation/validate_entities/state.py diff --git a/cmem_plugin_validation/validate_entities/state.py b/cmem_plugin_validation/validate_entities/state.py new file mode 100644 index 0000000..af7c3f3 --- /dev/null +++ 
b/cmem_plugin_validation/validate_entities/state.py @@ -0,0 +1,19 @@ +"""JSON validation process state""" + + +class State: + """State of a validation process""" + + def __init__(self): + self.total = 0 + self.violations = 0 + self.violations_messages = [] + + def increment_total(self) -> None: + """Increment total""" + self.total += 1 + + def add_violations_message(self, message: str) -> None: + """Add violation message""" + self.violations += 1 + self.violations_messages.append(message) diff --git a/cmem_plugin_validation/validate_entities/task.py b/cmem_plugin_validation/validate_entities/task.py index 8546540..3245ca0 100644 --- a/cmem_plugin_validation/validate_entities/task.py +++ b/cmem_plugin_validation/validate_entities/task.py @@ -1,15 +1,36 @@ """Entities validation workflow task""" from collections.abc import Sequence -from cmem_plugin_base.dataintegration.context import ExecutionContext + +from cmem.cmempy.workspace.projects.resources.resource import get_resource_response +from cmem.cmempy.workspace.tasks import get_task +from cmem_plugin_base.dataintegration.context import ( + ExecutionContext, + ExecutionReport, + UserContext, +) from cmem_plugin_base.dataintegration.description import Plugin, PluginParameter from cmem_plugin_base.dataintegration.entity import Entities from cmem_plugin_base.dataintegration.parameter.dataset import DatasetParameterType from cmem_plugin_base.dataintegration.plugins import WorkflowPlugin +from cmem_plugin_base.dataintegration.utils import ( + setup_cmempy_user_access, + split_task_id, +) +from jsonschema import validate +from jsonschema.exceptions import ValidationError + +from cmem_plugin_validation.validate_entities import state DEFAULT_FAIL_ON_VIOLATION = False +def get_task_metadata(project: str, task: str, context: UserContext) -> dict: + """Get metadata information of a task""" + setup_cmempy_user_access(context=context) + return dict(get_task(project=project, task=task)) + + @Plugin( label="Validate Entity", 
plugin_id="cmem_plugin_validation-validate-ValidateEntity", @@ -38,10 +59,11 @@ class ValidateEntity(WorkflowPlugin): """Validate entities against a JSON schema""" - def __init__(self, json_dataset:str,json_schema_dataset: str, fail_on_violations: bool): + def __init__(self, json_dataset: str, json_schema_dataset: str, fail_on_violations: bool): self.json_dataset = json_dataset self.json_schema_dataset = json_schema_dataset self.fail_on_violations = fail_on_violations + self._state = state.State() def execute( self, @@ -50,4 +72,49 @@ def execute( ) -> Entities | None: """Run the workflow operator.""" _ = context + + json_data_set = self._get_json_dataset_content(context, self.json_dataset) + json_data_set_schema = self._get_json_dataset_content(context, self.json_schema_dataset) + if isinstance(json_data_set, list): + for _ in json_data_set: + self._validate_json(_, json_data_set_schema) # type: ignore[arg-type] + else: + self._validate_json(json_data_set, json_data_set_schema) # type: ignore[arg-type] + _state = self._state + summary: list[tuple[str, str]] = [ + (str(_), message) for _, message in enumerate(_state.violations_messages) + ] + validation_message = None + if _state.violations: + validation_message = f"Found {_state.violations} violations in {_state.total} entities" + context.report.update( + ExecutionReport( + entity_count=_state.total, + operation="read", + operation_desc=" entities validated", + summary=summary, + error=validation_message if self.fail_on_violations else None, + warnings=[validation_message] + if not self.fail_on_violations and _state.violations + else [], + ) + ) return None + + def _validate_json(self, json: dict, schema: dict) -> None: + """Validate JSON""" + try: + self._state.increment_total() + validate(instance=json, schema=schema) + except ValidationError as e: + self._state.add_violations_message(e.message) + + @staticmethod + def _get_json_dataset_content(context: ExecutionContext, dataset: str) -> dict | list[dict]: + 
"""Get json dataset content""" + dataset_id = f"{context.task.project_id()}:{dataset}" + project_id, task_id = split_task_id(dataset_id) + task_meta_data = get_task_metadata(project_id, task_id, context.user) + resource_name = str(task_meta_data["data"]["parameters"]["file"]["value"]) + response = get_resource_response(project_id, resource_name) + return response.json() # type: ignore[no-any-return] From c8123262d76f192e02fa280a480a4722915ac43e Mon Sep 17 00:00:00 2001 From: saipraneeth <2506664+msaipraneeth@users.noreply.github.com> Date: Tue, 26 Nov 2024 18:36:20 +0000 Subject: [PATCH 05/11] validate JSON objects from entities --- .../validate_entities/task.py | 217 ++++++++++++++++-- 1 file changed, 199 insertions(+), 18 deletions(-) diff --git a/cmem_plugin_validation/validate_entities/task.py b/cmem_plugin_validation/validate_entities/task.py index 3245ca0..a4361f4 100644 --- a/cmem_plugin_validation/validate_entities/task.py +++ b/cmem_plugin_validation/validate_entities/task.py @@ -1,6 +1,11 @@ """Entities validation workflow task""" -from collections.abc import Sequence +import io +import json +from collections import OrderedDict +from collections.abc import Generator, Sequence +from types import SimpleNamespace +from typing import Any from cmem.cmempy.workspace.projects.resources.resource import get_resource_response from cmem.cmempy.workspace.tasks import get_task @@ -11,12 +16,20 @@ ) from cmem_plugin_base.dataintegration.description import Plugin, PluginParameter from cmem_plugin_base.dataintegration.entity import Entities +from cmem_plugin_base.dataintegration.parameter.choice import ChoiceParameterType from cmem_plugin_base.dataintegration.parameter.dataset import DatasetParameterType from cmem_plugin_base.dataintegration.plugins import WorkflowPlugin +from cmem_plugin_base.dataintegration.ports import ( + FixedNumberOfInputs, + FlexibleNumberOfInputs, + UnknownSchemaPort, +) from cmem_plugin_base.dataintegration.utils import ( 
setup_cmempy_user_access, split_task_id, + write_to_dataset, ) +from cmem_plugin_base.dataintegration.utils.entity_builder import build_entities_from_data from jsonschema import validate from jsonschema.exceptions import ValidationError @@ -31,22 +44,72 @@ def get_task_metadata(project: str, task: str, context: UserContext) -> dict: return dict(get_task(project=project, task=task)) +SOURCE = SimpleNamespace() +SOURCE.entities = "entities" +SOURCE.file = "dataset" +SOURCE.options = OrderedDict( + { + SOURCE.entities: f"{SOURCE.entities}: " + "Validate content from the input port in a workflow.", + SOURCE.file: f"{SOURCE.file}: " + "Validate content from a project dataset (see advanced options).", + } +) + +TARGET = SimpleNamespace() +TARGET.entities = "entities" +TARGET.dataset = "dataset" +TARGET.options = OrderedDict( + { + TARGET.dataset: f"{TARGET.dataset}: " + "Valid JSON objects will be is saved in a JSON dataset (see advanced options).", + TARGET.entities: f"{TARGET.entities}: " + "Valid JSON objects will be send as entities to the output port.", + } +) + + @Plugin( label="Validate Entity", plugin_id="cmem_plugin_validation-validate-ValidateEntity", - description="Use JSON schema to validate entities", + description="Use JSON schema to validate entities/JSON Dataset", documentation="Sai", parameters=[ PluginParameter( - name="json_dataset", - label="JSON Dataset", - description="This dataset holds the resources you want to validate.", + name="source_mode", + label="Source / Input Mode", + description="", + param_type=ChoiceParameterType(SOURCE.options), + default_value=SOURCE.entities, + ), + PluginParameter( + name="target_mode", + label="Target / Output Mode", + description="", + param_type=ChoiceParameterType(TARGET.options), + default_value=TARGET.entities, + ), + PluginParameter( + name="source_dataset", + label="Source JSON Dataset", + description="This dataset holds the resources you want to validate.", + 
param_type=DatasetParameterType(dataset_type="json"), + advanced=True, + default_value="", + ), + PluginParameter( + name="target_dataset", + label="Target JSON Dataset", + description="This dataset will be used to store the valid JSON objects" + " after validation.", param_type=DatasetParameterType(dataset_type="json"), + default_value="", + advanced=True, ), PluginParameter( name="json_schema_dataset", label="JSON Schema Dataset", - description="This dataset holds the resources you want to validate.", + description="This dataset holds the JSON schema to use for validation.", param_type=DatasetParameterType(dataset_type="json"), ), PluginParameter( @@ -59,27 +122,93 @@ def get_task_metadata(project: str, task: str, context: UserContext) -> dict: class ValidateEntity(WorkflowPlugin): """Validate entities against a JSON schema""" - def __init__(self, json_dataset: str, json_schema_dataset: str, fail_on_violations: bool): - self.json_dataset = json_dataset + source_mode: str + target_mode: str + source_dataset: str + target_dataset: str + + def __init__( # noqa: PLR0913 + self, + source_mode: str, + target_mode: str, + json_schema_dataset: str, + fail_on_violations: bool, + source_dataset: str = "", + target_dataset: str = "", + ): + self.source_mode = source_mode + self.target_mode = target_mode + self.source_dataset = source_dataset + self.target_dataset = target_dataset self.json_schema_dataset = json_schema_dataset self.fail_on_violations = fail_on_violations self._state = state.State() + self._validate_config() + self._set_ports() + + def _raise_error(self, message: str) -> None: + """Send a report and raise an error""" + raise ValueError(message) + + def _validate_config(self) -> None: + """Raise value errors on bad configurations""" + if self.source_mode == SOURCE.file and self.source_dataset == "": + self._raise_error( + f"When using the source mode '{SOURCE.file}', " + "you need to select a Source JSON Dataset." 
+ ) + if self.target_mode == TARGET.dataset and self.target_dataset == "": + self._raise_error( + f"When using the target mode '{TARGET.dataset}', " + "you need to select a Target JSON dataset." + ) + + def _set_ports(self) -> None: + """Define input/output ports based on the configuration""" + match self.source_mode: + case SOURCE.file: + # no input port + self.input_ports = FixedNumberOfInputs([]) + case SOURCE.entities: + self.input_ports = FlexibleNumberOfInputs() + case _: + raise ValueError(f"Unknown source mode: {self.source_mode}") + match self.target_mode: + case TARGET.entities: + # output port with flexible schema + self.output_port = UnknownSchemaPort() + case TARGET.dataset: + # not output port + self.output_port = None + case _: + raise ValueError(f"Unknown target mode: {self.target_mode}") def execute( self, - inputs: Sequence[Entities], # noqa: ARG002 + inputs: Sequence[Entities], context: ExecutionContext, ) -> Entities | None: """Run the workflow operator.""" - _ = context - - json_data_set = self._get_json_dataset_content(context, self.json_dataset) json_data_set_schema = self._get_json_dataset_content(context, self.json_schema_dataset) - if isinstance(json_data_set, list): - for _ in json_data_set: - self._validate_json(_, json_data_set_schema) # type: ignore[arg-type] + valid_json_objects = [] + if self.source_mode == SOURCE.entities: + valid_json_objects += [ + _j + for _j in self._convert_entities_to_json(inputs, {}, "") + if self._validate_json(_j, json_data_set_schema) # type: ignore[arg-type] + ] + else: - self._validate_json(json_data_set, json_data_set_schema) # type: ignore[arg-type] + json_data_set = self._get_json_dataset_content(context, self.source_dataset) + if isinstance(json_data_set, list): + valid_json_objects += [ + _ + for _ in json_data_set + if self._validate_json(_, json_data_set_schema) # type: ignore[arg-type] + ] + elif self._validate_json(json_data_set, json_data_set_schema): # type: ignore[arg-type] + 
valid_json_objects.append(json_data_set) + _state = self._state summary: list[tuple[str, str]] = [ (str(_), message) for _, message in enumerate(_state.violations_messages) @@ -99,15 +228,25 @@ def execute( else [], ) ) - return None + if self.target_mode == TARGET.dataset: + write_to_dataset( + dataset_id=f"{context.task.project_id()}:{self.target_dataset}", + file_resource=io.StringIO(json.dumps(valid_json_objects)), + context=context.user, + ) + return None - def _validate_json(self, json: dict, schema: dict) -> None: + return build_entities_from_data(valid_json_objects) + + def _validate_json(self, json: dict, schema: dict) -> bool: """Validate JSON""" try: self._state.increment_total() validate(instance=json, schema=schema) except ValidationError as e: self._state.add_violations_message(e.message) + return False + return True @staticmethod def _get_json_dataset_content(context: ExecutionContext, dataset: str) -> dict | list[dict]: @@ -118,3 +257,45 @@ def _get_json_dataset_content(context: ExecutionContext, dataset: str) -> dict | resource_name = str(task_meta_data["data"]["parameters"]["file"]["value"]) response = get_resource_response(project_id, resource_name) return response.json() # type: ignore[no-any-return] + + def _convert_entities_to_json( + self, inputs: Sequence[Entities], path_to_entities: dict[str, Entities], path: str = "" + ) -> Generator[dict[str, Any], None, None]: + """Convert a sequence of Entities into JSON-like dictionaries using recursive traversal.""" + for entities in inputs: + # Initialize path-to-entities map for the root level + if not path: + path_to_entities = {"": entities} + + # Map sub-entities to their paths + if entities.sub_entities: + for sub_entity in entities.sub_entities: + sub_path = f"{path}/{sub_entity.schema.path_to_root.path}" + path_to_entities[sub_path] = sub_entity + + # Process individual entities + for item in entities.entities: + json_obj = {} + for index, schema_path in enumerate(entities.schema.paths): + 
value = list(item.values[index]) + + if schema_path.is_relation: + # Handle relational sub-entities + related_entity_path = f"{path}/{schema_path.path}" + related_entity = path_to_entities.get(related_entity_path) + if related_entity: + # Recursively process related entities and fetch the first result + related_gen = self._convert_entities_to_json( + [related_entity], + path_to_entities, + related_entity_path, + ) + value = [next(related_gen)] + + # Assign values based on whether the path is single-value or multi-value + if schema_path.is_single_value: + json_obj[schema_path.path] = value.pop() if value else None + else: + json_obj[schema_path.path] = value + + yield json_obj From ac4c9d4d30030c34f9a330343607face4ff2da2a Mon Sep 17 00:00:00 2001 From: Sebastian Tramp Date: Wed, 27 Nov 2024 09:25:52 +0100 Subject: [PATCH 06/11] fix warnings field output in case no validation_message is there --- cmem_plugin_validation/validate_graph/task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmem_plugin_validation/validate_graph/task.py b/cmem_plugin_validation/validate_graph/task.py index 5485a95..d2246db 100644 --- a/cmem_plugin_validation/validate_graph/task.py +++ b/cmem_plugin_validation/validate_graph/task.py @@ -218,7 +218,7 @@ def execute( operation_desc=f"/ {state.total} Resources have violations", summary=summary, error=validation_message if self.fail_on_violations else None, - warnings=[validation_message] if not self.fail_on_violations else None, + warnings=[validation_message] if not self.fail_on_violations else [], ) ) if not self.output_results: From 735b01a8bedf9fed2293f7cf8044cadee85be347 Mon Sep 17 00:00:00 2001 From: saipraneeth <2506664+msaipraneeth@users.noreply.github.com> Date: Wed, 27 Nov 2024 13:35:48 +0000 Subject: [PATCH 07/11] change on operation_desc --- cmem_plugin_validation/validate_entities/task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmem_plugin_validation/validate_entities/task.py 
b/cmem_plugin_validation/validate_entities/task.py index a4361f4..b6787dd 100644 --- a/cmem_plugin_validation/validate_entities/task.py +++ b/cmem_plugin_validation/validate_entities/task.py @@ -220,7 +220,7 @@ def execute( ExecutionReport( entity_count=_state.total, operation="read", - operation_desc=" entities validated", + operation_desc=f"entities validate ({_state.violations} failed)", summary=summary, error=validation_message if self.fail_on_violations else None, warnings=[validation_message] From 124f8d2cb408d62be0ff02a2c8a73b0c680852b1 Mon Sep 17 00:00:00 2001 From: saipraneeth <2506664+msaipraneeth@users.noreply.github.com> Date: Wed, 27 Nov 2024 14:09:03 +0000 Subject: [PATCH 08/11] extend error report with prefixed position --- cmem_plugin_validation/validate_entities/task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmem_plugin_validation/validate_entities/task.py b/cmem_plugin_validation/validate_entities/task.py index b6787dd..07575d5 100644 --- a/cmem_plugin_validation/validate_entities/task.py +++ b/cmem_plugin_validation/validate_entities/task.py @@ -244,7 +244,7 @@ def _validate_json(self, json: dict, schema: dict) -> bool: self._state.increment_total() validate(instance=json, schema=schema) except ValidationError as e: - self._state.add_violations_message(e.message) + self._state.add_violations_message(f"{e.json_path}: {e.message}") return False return True From 1710e34b95d59f7a8fdfa216cd29a6e1e7401d42 Mon Sep 17 00:00:00 2001 From: saipraneeth <2506664+msaipraneeth@users.noreply.github.com> Date: Wed, 27 Nov 2024 18:36:40 +0000 Subject: [PATCH 09/11] add validation of source mode is entities --- .../validate_entities/task.py | 30 +++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/cmem_plugin_validation/validate_entities/task.py b/cmem_plugin_validation/validate_entities/task.py index 07575d5..e3c3a54 100644 --- a/cmem_plugin_validation/validate_entities/task.py +++ 
b/cmem_plugin_validation/validate_entities/task.py @@ -35,8 +35,11 @@ from cmem_plugin_validation.validate_entities import state -DEFAULT_FAIL_ON_VIOLATION = False +DOCUMENTATION = """ +The JSON Entity Validation Plugin ensures that JSON objects conform to schema standards, validating structure and content for data integrity before processing. +""" +DEFAULT_FAIL_ON_VIOLATION = False def get_task_metadata(project: str, task: str, context: UserContext) -> dict: """Get metadata information of a task""" @@ -73,7 +76,7 @@ def get_task_metadata(project: str, task: str, context: UserContext) -> dict: label="Validate Entity", plugin_id="cmem_plugin_validation-validate-ValidateEntity", description="Use JSON schema to validate entities/JSON Dataset", - documentation="Sai", + documentation=DOCUMENTATION, parameters=[ PluginParameter( name="source_mode", @@ -127,6 +130,9 @@ class ValidateEntity(WorkflowPlugin): source_dataset: str target_dataset: str + inputs: Sequence[Entities] + execution_context: ExecutionContext + def __init__( # noqa: PLR0913 self, source_mode: str, @@ -157,11 +163,28 @@ def _validate_config(self) -> None: f"When using the source mode '{SOURCE.file}', " "you need to select a Source JSON Dataset." ) + if self.source_mode == SOURCE.entities and self.source_dataset != "": + self._raise_error( + f"When using the source mode '{SOURCE.entities}', " + "you don't need to select a Source JSON Dataset." + ) + if self.source_mode == SOURCE.entities: + if hasattr(self, "execution_context") and not self.inputs: + self._raise_error( + f"When using the source mode '{SOURCE.entities}', " + "you need to pass entities to input port." + ) + if self.target_mode == TARGET.dataset and self.target_dataset == "": self._raise_error( f"When using the target mode '{TARGET.dataset}', " "you need to select a Target JSON dataset." 
) + if self.target_mode == SOURCE.entities and self.target_dataset != "": + self._raise_error( + f"When using the source mode '{TARGET.entities}', " + "you don't need to select a Target JSON Dataset." + ) def _set_ports(self) -> None: """Define input/output ports based on the configuration""" @@ -189,6 +212,9 @@ def execute( context: ExecutionContext, ) -> Entities | None: """Run the workflow operator.""" + self.execution_context = context + self.inputs = inputs + self._validate_config() json_data_set_schema = self._get_json_dataset_content(context, self.json_schema_dataset) valid_json_objects = [] if self.source_mode == SOURCE.entities: From 4f0957bc94fe2e507409ed6a52cccee813eb8d10 Mon Sep 17 00:00:00 2001 From: saipraneeth <2506664+msaipraneeth@users.noreply.github.com> Date: Wed, 27 Nov 2024 20:06:22 +0000 Subject: [PATCH 10/11] generate empty dict at the end --- cmem_plugin_validation/validate_entities/task.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmem_plugin_validation/validate_entities/task.py b/cmem_plugin_validation/validate_entities/task.py index e3c3a54..a22466a 100644 --- a/cmem_plugin_validation/validate_entities/task.py +++ b/cmem_plugin_validation/validate_entities/task.py @@ -221,7 +221,7 @@ def execute( valid_json_objects += [ _j for _j in self._convert_entities_to_json(inputs, {}, "") - if self._validate_json(_j, json_data_set_schema) # type: ignore[arg-type] + if _j and self._validate_json(_j, json_data_set_schema) # type: ignore[arg-type] ] else: @@ -325,3 +325,4 @@ def _convert_entities_to_json( json_obj[schema_path.path] = value yield json_obj + yield {} From 2e4386cd8bc09f2e4e7637e145259e68471da698 Mon Sep 17 00:00:00 2001 From: Sebastian Tramp Date: Thu, 28 Nov 2024 10:04:47 +0100 Subject: [PATCH 11/11] nice ups, icon, documentation --- .../validate_entities/icon.svg | 42 ++++++++++++++++ .../validate_entities/task.py | 48 +++++++++++++------ cmem_plugin_validation/validate_graph/task.py | 6 +-- 3 files changed, 
78 insertions(+), 18 deletions(-) create mode 100644 cmem_plugin_validation/validate_entities/icon.svg diff --git a/cmem_plugin_validation/validate_entities/icon.svg b/cmem_plugin_validation/validate_entities/icon.svg new file mode 100644 index 0000000..e75eaad --- /dev/null +++ b/cmem_plugin_validation/validate_entities/icon.svg @@ -0,0 +1,42 @@ + + + + + + + + + diff --git a/cmem_plugin_validation/validate_entities/task.py b/cmem_plugin_validation/validate_entities/task.py index a22466a..5ed4485 100644 --- a/cmem_plugin_validation/validate_entities/task.py +++ b/cmem_plugin_validation/validate_entities/task.py @@ -14,7 +14,7 @@ ExecutionReport, UserContext, ) -from cmem_plugin_base.dataintegration.description import Plugin, PluginParameter +from cmem_plugin_base.dataintegration.description import Icon, Plugin, PluginParameter from cmem_plugin_base.dataintegration.entity import Entities from cmem_plugin_base.dataintegration.parameter.choice import ChoiceParameterType from cmem_plugin_base.dataintegration.parameter.dataset import DatasetParameterType @@ -35,12 +35,25 @@ from cmem_plugin_validation.validate_entities import state -DOCUMENTATION = """ -The JSON Entity Validation Plugin ensures that JSON objects conform to schema standards, validating structure and content for data integrity before processing. +DOCUMENTATION = """[JSON Schema](https://json-schema.org/) specifies a JSON-based format to +define the structure of JSON data for validation, documentation, and interaction control. +It provides a contract for the JSON data required by a given application. + +This workflow task can validate incoming entities or a stand-alone JSON dataset by using a +JSON Schema specification. + +The used JSON Schema needs to be provided as a JSON Dataset in the project. + +Validated data objects can be send to an output port, to further process them in the workflow, +or saved in a JSON dataset in the project. 
+
+The task can either fail instantly if there is a data violation, or just provide a warning in the
+workflow report and allow follow-up tasks to run based on the data which was validated.
 """
 
 DEFAULT_FAIL_ON_VIOLATION = False
 
+
 def get_task_metadata(project: str, task: str, context: UserContext) -> dict:
     """Get metadata information of a task"""
     setup_cmempy_user_access(context=context)
@@ -53,9 +66,9 @@ def get_task_metadata(project: str, task: str, context: UserContext) -> dict:
 SOURCE.options = OrderedDict(
     {
         SOURCE.entities: f"{SOURCE.entities}: "
-        "Validate content from the input port in a workflow.",
+        "Validate entities received from the input port in the workflow.",
         SOURCE.file: f"{SOURCE.file}: "
-        "Validate content from a project dataset (see advanced options).",
+        "Validate a JSON Dataset from a project (see advanced options).",
     }
 )
@@ -73,9 +86,10 @@ def get_task_metadata(project: str, task: str, context: UserContext) -> dict:
 @Plugin(
-    label="Validate Entity",
-    plugin_id="cmem_plugin_validation-validate-ValidateEntity",
-    description="Use JSON schema to validate entities/JSON Dataset",
+    label="Validate Entities",
+    plugin_id="cmem_plugin_validation-validate-ValidateEntities",
+    icon=Icon(file_name="icon.svg", package=__package__),
+    description="Use a JSON schema to validate entities or a JSON dataset.",
     documentation=DOCUMENTATION,
     parameters=[
         PluginParameter(
@@ -117,7 +131,8 @@ def get_task_metadata(project: str, task: str, context: UserContext) -> dict:
         ),
         PluginParameter(
             name="fail_on_violations",
-            label="Fail workflow on violations",
+            label="Fail on violations",
+            description="If enabled, the task will fail on the first data violation.",
             default_value=DEFAULT_FAIL_ON_VIOLATION,
         ),
     ],
@@ -168,12 +183,15 @@ def _validate_config(self) -> None:
                 f"When using the source mode '{SOURCE.entities}', "
                 "you don't need to select a Source JSON Dataset."
) - if self.source_mode == SOURCE.entities: - if hasattr(self, "execution_context") and not self.inputs: - self._raise_error( - f"When using the source mode '{SOURCE.entities}', " - "you need to pass entities to input port." - ) + if ( + self.source_mode == SOURCE.entities + and hasattr(self, "execution_context") + and not self.inputs + ): + self._raise_error( + f"When using the source mode '{SOURCE.entities}', " + "you need to pass entities to input port." + ) if self.target_mode == TARGET.dataset and self.target_dataset == "": self._raise_error( diff --git a/cmem_plugin_validation/validate_graph/task.py b/cmem_plugin_validation/validate_graph/task.py index d2246db..a997ede 100644 --- a/cmem_plugin_validation/validate_graph/task.py +++ b/cmem_plugin_validation/validate_graph/task.py @@ -28,8 +28,8 @@ from cmem_plugin_validation.validate_graph.state import State DOCUMENTATION = """ -A graph validation process verifies, that resources in a specific graph are valid according to -the node shapes in a shape catalog graph. +Start a graph validation process which verifies, that resources in a specific graph are valid +according to the node shapes in a shape catalog graph. """ DEFAULT_SHAPE_GRAPH = "https://vocab.eccenca.com/shacl/" @@ -47,7 +47,7 @@ label="Validate Knowledge Graph", plugin_id="cmem_plugin_validation-validate-ValidateGraph", icon=Icon(file_name="icon.svg", package=__package__), - description="Use shapes to validate resources in a knowledge graph.", + description="Use SHACL shapes to validate resources in a Knowledge Graph.", documentation=DOCUMENTATION, parameters=[ PluginParameter(