From 44e346450b3b7ba78744847a87bc561c3330c906 Mon Sep 17 00:00:00 2001 From: pradhyum6144 Date: Sat, 28 Feb 2026 16:18:57 +0530 Subject: [PATCH 1/3] feat: add Python SDK for ModelPack spec types and validation Signed-off-by: pradhyum6144 --- .gitignore | 7 + specs-python/modelpack/__init__.py | 0 specs-python/modelpack/v1/__init__.py | 91 +++ specs-python/modelpack/v1/annotations.py | 77 ++ specs-python/modelpack/v1/config-schema.json | 168 +++++ specs-python/modelpack/v1/config.py | 285 ++++++++ specs-python/modelpack/v1/mediatype.py | 55 ++ specs-python/modelpack/v1/validator.py | 54 ++ specs-python/setup.py | 33 + specs-python/tests/__init__.py | 0 specs-python/tests/test_annotations.py | 72 ++ specs-python/tests/test_config.py | 268 +++++++ specs-python/tests/test_mediatype.py | 119 ++++ specs-python/tests/test_validator.py | 705 +++++++++++++++++++ 14 files changed, 1934 insertions(+) create mode 100644 specs-python/modelpack/__init__.py create mode 100644 specs-python/modelpack/v1/__init__.py create mode 100644 specs-python/modelpack/v1/annotations.py create mode 100644 specs-python/modelpack/v1/config-schema.json create mode 100644 specs-python/modelpack/v1/config.py create mode 100644 specs-python/modelpack/v1/mediatype.py create mode 100644 specs-python/modelpack/v1/validator.py create mode 100644 specs-python/setup.py create mode 100644 specs-python/tests/__init__.py create mode 100644 specs-python/tests/test_annotations.py create mode 100644 specs-python/tests/test_config.py create mode 100644 specs-python/tests/test_mediatype.py create mode 100644 specs-python/tests/test_validator.py diff --git a/.gitignore b/.gitignore index 71cce7a..5bb9fb7 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,13 @@ # Dependency directories (remove the comment below to include it) vendor/ + +# Python +__pycache__/ +*.pyc +*.egg-info/ +.venv/ +.pytest_cache/ .idea .vscode .cache diff --git a/specs-python/modelpack/__init__.py b/specs-python/modelpack/__init__.py new 
file mode 100644 index 0000000..e69de29 diff --git a/specs-python/modelpack/v1/__init__.py b/specs-python/modelpack/v1/__init__.py new file mode 100644 index 0000000..6a5c8cf --- /dev/null +++ b/specs-python/modelpack/v1/__init__.py @@ -0,0 +1,91 @@ +# Copyright 2025 The CNCF ModelPack Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ModelPack Python SDK - CNCF standard for packaging and distributing AI models.""" + +from modelpack.v1.config import ( + Model, + ModelCapabilities, + ModelConfig, + ModelDescriptor, + ModelFS, + Modality, +) +from modelpack.v1.annotations import ( + ANNOTATION_FILEPATH, + ANNOTATION_FILE_METADATA, + ANNOTATION_MEDIA_TYPE_UNTESTED, + FileMetadata, +) +from modelpack.v1.mediatype import ( + ARTIFACT_TYPE_MODEL_MANIFEST, + MEDIA_TYPE_MODEL_CONFIG, + MEDIA_TYPE_MODEL_WEIGHT_RAW, + MEDIA_TYPE_MODEL_WEIGHT, + MEDIA_TYPE_MODEL_WEIGHT_GZIP, + MEDIA_TYPE_MODEL_WEIGHT_ZSTD, + MEDIA_TYPE_MODEL_WEIGHT_CONFIG_RAW, + MEDIA_TYPE_MODEL_WEIGHT_CONFIG, + MEDIA_TYPE_MODEL_WEIGHT_CONFIG_GZIP, + MEDIA_TYPE_MODEL_WEIGHT_CONFIG_ZSTD, + MEDIA_TYPE_MODEL_DOC_RAW, + MEDIA_TYPE_MODEL_DOC, + MEDIA_TYPE_MODEL_DOC_GZIP, + MEDIA_TYPE_MODEL_DOC_ZSTD, + MEDIA_TYPE_MODEL_CODE_RAW, + MEDIA_TYPE_MODEL_CODE, + MEDIA_TYPE_MODEL_CODE_GZIP, + MEDIA_TYPE_MODEL_CODE_ZSTD, + MEDIA_TYPE_MODEL_DATASET_RAW, + MEDIA_TYPE_MODEL_DATASET, + MEDIA_TYPE_MODEL_DATASET_GZIP, + MEDIA_TYPE_MODEL_DATASET_ZSTD, +) +from modelpack.v1.validator import validate_config + +__all__ = [ + 
"Model", + "ModelCapabilities", + "ModelConfig", + "ModelDescriptor", + "ModelFS", + "Modality", + "FileMetadata", + "ANNOTATION_FILEPATH", + "ANNOTATION_FILE_METADATA", + "ANNOTATION_MEDIA_TYPE_UNTESTED", + "ARTIFACT_TYPE_MODEL_MANIFEST", + "MEDIA_TYPE_MODEL_CONFIG", + "MEDIA_TYPE_MODEL_WEIGHT_RAW", + "MEDIA_TYPE_MODEL_WEIGHT", + "MEDIA_TYPE_MODEL_WEIGHT_GZIP", + "MEDIA_TYPE_MODEL_WEIGHT_ZSTD", + "MEDIA_TYPE_MODEL_WEIGHT_CONFIG_RAW", + "MEDIA_TYPE_MODEL_WEIGHT_CONFIG", + "MEDIA_TYPE_MODEL_WEIGHT_CONFIG_GZIP", + "MEDIA_TYPE_MODEL_WEIGHT_CONFIG_ZSTD", + "MEDIA_TYPE_MODEL_DOC_RAW", + "MEDIA_TYPE_MODEL_DOC", + "MEDIA_TYPE_MODEL_DOC_GZIP", + "MEDIA_TYPE_MODEL_DOC_ZSTD", + "MEDIA_TYPE_MODEL_CODE_RAW", + "MEDIA_TYPE_MODEL_CODE", + "MEDIA_TYPE_MODEL_CODE_GZIP", + "MEDIA_TYPE_MODEL_CODE_ZSTD", + "MEDIA_TYPE_MODEL_DATASET_RAW", + "MEDIA_TYPE_MODEL_DATASET", + "MEDIA_TYPE_MODEL_DATASET_GZIP", + "MEDIA_TYPE_MODEL_DATASET_ZSTD", + "validate_config", +] diff --git a/specs-python/modelpack/v1/annotations.py b/specs-python/modelpack/v1/annotations.py new file mode 100644 index 0000000..e958f7a --- /dev/null +++ b/specs-python/modelpack/v1/annotations.py @@ -0,0 +1,77 @@ +# Copyright 2025 The CNCF ModelPack Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Annotation constants and types matching specs-go/v1/annotations.go.""" + +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime + +# Annotation key for the file path of the layer. +ANNOTATION_FILEPATH = "org.cncf.model.filepath" + +# Annotation key for the file metadata of the layer. +ANNOTATION_FILE_METADATA = "org.cncf.model.file.metadata+json" + +# Annotation key for file media type untested flag of the layer. +ANNOTATION_MEDIA_TYPE_UNTESTED = "org.cncf.model.file.mediatype.untested" + + +@dataclass +class FileMetadata: + """Represents the metadata of a file. + + Mirrors the Go FileMetadata struct in specs-go/v1/annotations.go. + """ + + name: str = "" + mode: int = 0 + uid: int = 0 + gid: int = 0 + size: int = 0 + mod_time: datetime | None = None + typeflag: int = 0 + + def to_dict(self) -> dict: + """Serialize to a dict matching the JSON field names.""" + d: dict = { + "name": self.name, + "mode": self.mode, + "uid": self.uid, + "gid": self.gid, + "size": self.size, + "typeflag": self.typeflag, + } + if self.mod_time is not None: + d["mtime"] = self.mod_time.isoformat() + return d + + @classmethod + def from_dict(cls, data: dict) -> FileMetadata: + """Deserialize from a dict with JSON field names.""" + mod_time = None + if "mtime" in data: + mod_time = datetime.fromisoformat( + data["mtime"].replace("Z", "+00:00") + ) + return cls( + name=data.get("name", ""), + mode=data.get("mode", 0), + uid=data.get("uid", 0), + gid=data.get("gid", 0), + size=data.get("size", 0), + mod_time=mod_time, + typeflag=data.get("typeflag", 0), + ) diff --git a/specs-python/modelpack/v1/config-schema.json b/specs-python/modelpack/v1/config-schema.json new file mode 100644 index 0000000..ce13fcc --- /dev/null +++ b/specs-python/modelpack/v1/config-schema.json @@ -0,0 +1,168 @@ +{ + "description": "Model Artifact Configuration Schema", + "$schema": "http://json-schema.org/draft-04/schema#", + "$id": 
"https://github.com/modelpack/model-spec/config", + "type": "object", + "properties": { + "descriptor": { + "$ref": "#/$defs/ModelDescriptor" + }, + "modelfs": { + "$ref": "#/$defs/ModelFS" + }, + "config": { + "$ref": "#/$defs/ModelConfig" + } + }, + "additionalProperties": false, + "required": [ + "descriptor", + "config", + "modelfs" + ], + "$defs": { + "ModelConfig": { + "type": "object", + "properties": { + "architecture": { + "type": "string" + }, + "format": { + "type": "string" + }, + "paramSize": { + "type": "string" + }, + "precision": { + "type": "string" + }, + "quantization": { + "type": "string" + }, + "capabilities": { + "$ref": "#/$defs/ModelCapabilities" + } + }, + "additionalProperties": false + }, + "ModelDescriptor": { + "type": "object", + "properties": { + "createdAt": { + "type": "string", + "format": "date-time" + }, + "authors": { + "type": "array", + "items": { + "type": "string" + } + }, + "family": { + "type": "string" + }, + "name": { + "type": "string", + "minLength": 1 + }, + "docURL": { + "type": "string" + }, + "sourceURL": { + "type": "string" + }, + "datasetsURL": { + "type": "array", + "items": { + "type": "string" + } + }, + "version": { + "type": "string" + }, + "revision": { + "type": "string" + }, + "vendor": { + "type": "string" + }, + "licenses": { + "type": "array", + "items": { + "type": "string" + } + }, + "title": { + "type": "string" + }, + "description": { + "type": "string" + } + }, + "additionalProperties": false + }, + "ModelFS": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["layers"] + }, + "diffIds": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1 + } + }, + "additionalProperties": false, + "required": [ + "type", + "diffIds" + ] + }, + "ModelCapabilities": { + "type": "object", + "properties": { + "inputTypes": { + "type": "array", + "items": { + "$ref": "#/$defs/Modality" + } + }, + "outputTypes": { + "type": "array", + "items": { + "$ref": 
def _parse_datetime(value: str) -> datetime:
    """Parse an RFC 3339 timestamp string into a ``datetime``.

    ``datetime.fromisoformat`` on Python 3.10 only accepts fractional
    seconds with exactly 3 or 6 digits and does not understand the ``Z``
    suffix, so the string is normalised first.

    Args:
        value: Timestamp such as ``2025-06-15T10:30:00.5Z``.

    Returns:
        The parsed ``datetime``; sub-microsecond digits are truncated.

    Raises:
        ValueError: If the normalised string is not a valid ISO 8601 value.
    """
    import re  # function-scope import: the module itself does not import ``re``

    text = value
    # A trailing "Z" (or "z") means UTC; fromisoformat needs an explicit offset.
    if text[-1:] in ("Z", "z"):
        text = text[:-1] + "+00:00"
    # Pad or truncate the fractional part to exactly six digits (microseconds).
    text = re.sub(
        r"\.(\d+)",
        lambda match: "." + (match.group(1) + "000000")[:6],
        text,
        count=1,
    )
    return datetime.fromisoformat(text)


class Modality(str, Enum):
    """Defines the input and output types of the model.

    Mirrors the Go Modality type in specs-go/v1/config.go.
    """

    TEXT = "text"
    IMAGE = "image"
    AUDIO = "audio"
    VIDEO = "video"
    EMBEDDING = "embedding"
    OTHER = "other"


@dataclass
class ModelCapabilities:
    """Defines the special capabilities that the model supports.

    Mirrors the Go ModelCapabilities struct in specs-go/v1/config.go.
    Every attribute is optional; ``None`` means "not specified" and the
    corresponding key is left out of the serialized dict.
    """

    input_types: Optional[list[Modality]] = None
    output_types: Optional[list[Modality]] = None
    knowledge_cutoff: Optional[datetime] = None
    reasoning: Optional[bool] = None
    tool_usage: Optional[bool] = None
    reward: Optional[bool] = None
    languages: Optional[list[str]] = None

    def to_dict(self) -> dict:
        """Serialize to a dict matching the JSON schema field names.

        Only attributes that are not ``None`` are emitted.  Modalities are
        written as their string values; plain strings naming a valid
        modality are accepted as well.

        Raises:
            ValueError: If a modality entry is not a known ``Modality``.
        """
        d: dict = {}
        if self.input_types is not None:
            d["inputTypes"] = [Modality(m).value for m in self.input_types]
        if self.output_types is not None:
            d["outputTypes"] = [Modality(m).value for m in self.output_types]
        if self.knowledge_cutoff is not None:
            d["knowledgeCutoff"] = self.knowledge_cutoff.isoformat()
        if self.reasoning is not None:
            d["reasoning"] = self.reasoning
        if self.tool_usage is not None:
            d["toolUsage"] = self.tool_usage
        if self.reward is not None:
            d["reward"] = self.reward
        if self.languages is not None:
            # Copy so callers mutating the result do not touch this instance.
            d["languages"] = list(self.languages)
        return d

    @classmethod
    def from_dict(cls, data: dict) -> "ModelCapabilities":
        """Deserialize from a dict with JSON schema field names.

        Missing or null list/timestamp fields are left as ``None``.

        Raises:
            ValueError: If a modality string is unknown or the
                ``knowledgeCutoff`` timestamp cannot be parsed.
        """
        kwargs: dict = {}
        input_types = data.get("inputTypes")
        if input_types is not None:
            kwargs["input_types"] = [Modality(v) for v in input_types]
        output_types = data.get("outputTypes")
        if output_types is not None:
            kwargs["output_types"] = [Modality(v) for v in output_types]
        cutoff = data.get("knowledgeCutoff")
        if cutoff:
            kwargs["knowledge_cutoff"] = _parse_datetime(cutoff)
        # Boolean flags are copied verbatim when the key is present.
        for json_key, attr in (
            ("reasoning", "reasoning"),
            ("toolUsage", "tool_usage"),
            ("reward", "reward"),
        ):
            if json_key in data:
                kwargs[attr] = data[json_key]
        languages = data.get("languages")
        if languages is not None:
            kwargs["languages"] = list(languages)
        return cls(**kwargs)


@dataclass
class ModelConfig:
    """Defines the execution parameters for running a model.

    Mirrors the Go ModelConfig struct in specs-go/v1/config.go.
    Empty strings are treated as "unset" and omitted on serialization.
    """

    architecture: str = ""
    format: str = ""
    param_size: str = ""
    precision: str = ""
    quantization: str = ""
    capabilities: Optional[ModelCapabilities] = None

    def to_dict(self) -> dict:
        """Serialize to a dict matching the JSON schema field names."""
        d: dict = {}
        if self.architecture:
            d["architecture"] = self.architecture
        if self.format:
            d["format"] = self.format
        if self.param_size:
            d["paramSize"] = self.param_size
        if self.precision:
            d["precision"] = self.precision
        if self.quantization:
            d["quantization"] = self.quantization
        if self.capabilities is not None:
            d["capabilities"] = self.capabilities.to_dict()
        return d

    @classmethod
    def from_dict(cls, data: dict) -> "ModelConfig":
        """Deserialize from a dict with JSON schema field names.

        A missing or null ``capabilities`` entry yields ``None``.
        """
        raw_caps = data.get("capabilities")
        caps = None if raw_caps is None else ModelCapabilities.from_dict(raw_caps)
        return cls(
            architecture=data.get("architecture", ""),
            format=data.get("format", ""),
            param_size=data.get("paramSize", ""),
            precision=data.get("precision", ""),
            quantization=data.get("quantization", ""),
            capabilities=caps,
        )


@dataclass
class ModelFS:
    """Describes layer content addresses.

    Mirrors the Go ModelFS struct in specs-go/v1/config.go.
    """

    type: str = ""
    diff_ids: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize to a dict matching the JSON schema field names.

        Both keys are always emitted, even when empty.
        """
        return {
            "type": self.type,
            # Copy so callers mutating the result do not touch this instance.
            "diffIds": list(self.diff_ids),
        }

    @classmethod
    def from_dict(cls, data: dict) -> "ModelFS":
        """Deserialize from a dict with JSON schema field names.

        A missing or null ``diffIds`` becomes an empty list.  The list is
        copied so the instance never shares state with ``data``.
        """
        return cls(
            type=data.get("type", ""),
            diff_ids=list(data.get("diffIds") or []),
        )


@dataclass
class ModelDescriptor:
    """Defines the general information of a model.

    Mirrors the Go ModelDescriptor struct in specs-go/v1/config.go.
    Empty strings and ``None`` values are omitted on serialization.
    """

    created_at: Optional[datetime] = None
    authors: Optional[list[str]] = None
    family: str = ""
    name: str = ""
    doc_url: str = ""
    source_url: str = ""
    datasets_url: Optional[list[str]] = None
    version: str = ""
    revision: str = ""
    vendor: str = ""
    licenses: Optional[list[str]] = None
    title: str = ""
    description: str = ""

    def to_dict(self) -> dict:
        """Serialize to a dict matching the JSON schema field names.

        ``created_at`` is written with ``datetime.isoformat()``; a naive
        datetime therefore serializes without a UTC offset, so callers
        should pass timezone-aware values.
        """
        d: dict = {}
        if self.created_at is not None:
            d["createdAt"] = self.created_at.isoformat()
        if self.authors is not None:
            d["authors"] = list(self.authors)
        if self.family:
            d["family"] = self.family
        if self.name:
            d["name"] = self.name
        if self.doc_url:
            d["docURL"] = self.doc_url
        if self.source_url:
            d["sourceURL"] = self.source_url
        if self.datasets_url is not None:
            d["datasetsURL"] = list(self.datasets_url)
        if self.version:
            d["version"] = self.version
        if self.revision:
            d["revision"] = self.revision
        if self.vendor:
            d["vendor"] = self.vendor
        if self.licenses is not None:
            d["licenses"] = list(self.licenses)
        if self.title:
            d["title"] = self.title
        if self.description:
            d["description"] = self.description
        return d

    @classmethod
    def from_dict(cls, data: dict) -> "ModelDescriptor":
        """Deserialize from a dict with JSON schema field names.

        A missing, null or empty ``createdAt`` yields ``created_at=None``,
        matching how ``ModelCapabilities`` treats ``knowledgeCutoff``.

        Raises:
            ValueError: If ``createdAt`` is present but cannot be parsed.
        """

        def _optional_list(key: str) -> Optional[list[str]]:
            # Preserve the None-vs-empty distinction while detaching from ``data``.
            value = data.get(key)
            return None if value is None else list(value)

        raw_created = data.get("createdAt")
        created_at = _parse_datetime(raw_created) if raw_created else None
        return cls(
            created_at=created_at,
            authors=_optional_list("authors"),
            family=data.get("family", ""),
            name=data.get("name", ""),
            doc_url=data.get("docURL", ""),
            source_url=data.get("sourceURL", ""),
            datasets_url=_optional_list("datasetsURL"),
            version=data.get("version", ""),
            revision=data.get("revision", ""),
            vendor=data.get("vendor", ""),
            licenses=_optional_list("licenses"),
            title=data.get("title", ""),
            description=data.get("description", ""),
        )


@dataclass
class Model:
    """Defines the basic information of a model.

    Provides the application/vnd.cncf.model.config.v1+json mediatype
    when marshalled to JSON.

    Mirrors the Go Model struct in specs-go/v1/config.go.
    """

    descriptor: ModelDescriptor = field(default_factory=ModelDescriptor)
    modelfs: ModelFS = field(default_factory=ModelFS)
    config: ModelConfig = field(default_factory=ModelConfig)

    def to_dict(self) -> dict:
        """Serialize to a dict matching the JSON schema field names.

        All three top-level sections are always present in the result.
        """
        return {
            "descriptor": self.descriptor.to_dict(),
            "modelfs": self.modelfs.to_dict(),
            "config": self.config.to_dict(),
        }

    def to_json(self, indent: Optional[int] = 2) -> str:
        """Serialize to a JSON string.

        Args:
            indent: Indentation passed to ``json.dumps``; ``None`` produces
                a single-line document.
        """
        return json.dumps(self.to_dict(), indent=indent)

    @classmethod
    def from_dict(cls, data: dict) -> "Model":
        """Deserialize from a dict with JSON schema field names.

        A missing or null section is treated as an empty object, so the
        corresponding attribute receives its default-constructed value.
        No schema validation is performed here.
        """
        return cls(
            descriptor=ModelDescriptor.from_dict(data.get("descriptor") or {}),
            modelfs=ModelFS.from_dict(data.get("modelfs") or {}),
            config=ModelConfig.from_dict(data.get("config") or {}),
        )

    @classmethod
    def from_json(cls, json_str: str) -> "Model":
        """Deserialize from a JSON string.

        Raises:
            json.JSONDecodeError: If ``json_str`` is not valid JSON.
        """
        return cls.from_dict(json.loads(json_str))
+ +"""Media type constants matching specs-go/v1/mediatype.go.""" + +# Artifact type for a model manifest. +ARTIFACT_TYPE_MODEL_MANIFEST = "application/vnd.cncf.model.manifest.v1+json" + +# Media type for a model configuration. +MEDIA_TYPE_MODEL_CONFIG = "application/vnd.cncf.model.config.v1+json" + +# Model weight media types. +MEDIA_TYPE_MODEL_WEIGHT_RAW = "application/vnd.cncf.model.weight.v1.raw" +MEDIA_TYPE_MODEL_WEIGHT = "application/vnd.cncf.model.weight.v1.tar" +MEDIA_TYPE_MODEL_WEIGHT_GZIP = "application/vnd.cncf.model.weight.v1.tar+gzip" +MEDIA_TYPE_MODEL_WEIGHT_ZSTD = "application/vnd.cncf.model.weight.v1.tar+zstd" + +# Model weight config media types. +MEDIA_TYPE_MODEL_WEIGHT_CONFIG_RAW = "application/vnd.cncf.model.weight.config.v1.raw" +MEDIA_TYPE_MODEL_WEIGHT_CONFIG = "application/vnd.cncf.model.weight.config.v1.tar" +MEDIA_TYPE_MODEL_WEIGHT_CONFIG_GZIP = ( + "application/vnd.cncf.model.weight.config.v1.tar+gzip" +) +MEDIA_TYPE_MODEL_WEIGHT_CONFIG_ZSTD = ( + "application/vnd.cncf.model.weight.config.v1.tar+zstd" +) + +# Model documentation media types. +MEDIA_TYPE_MODEL_DOC_RAW = "application/vnd.cncf.model.doc.v1.raw" +MEDIA_TYPE_MODEL_DOC = "application/vnd.cncf.model.doc.v1.tar" +MEDIA_TYPE_MODEL_DOC_GZIP = "application/vnd.cncf.model.doc.v1.tar+gzip" +MEDIA_TYPE_MODEL_DOC_ZSTD = "application/vnd.cncf.model.doc.v1.tar+zstd" + +# Model code media types. +MEDIA_TYPE_MODEL_CODE_RAW = "application/vnd.cncf.model.code.v1.raw" +MEDIA_TYPE_MODEL_CODE = "application/vnd.cncf.model.code.v1.tar" +MEDIA_TYPE_MODEL_CODE_GZIP = "application/vnd.cncf.model.code.v1.tar+gzip" +MEDIA_TYPE_MODEL_CODE_ZSTD = "application/vnd.cncf.model.code.v1.tar+zstd" + +# Model dataset media types. 
# Lazily-built validator shared by every validate_config() call, so the
# schema file is read and parsed only once per process.
_validator = None


def _load_schema() -> dict:
    """Load and return the config JSON schema.

    The schema is read from the ``config-schema.json`` resource packaged
    inside ``modelpack.v1``.
    """
    schema_file = importlib.resources.files("modelpack.v1").joinpath(
        "config-schema.json"
    )
    with schema_file.open(encoding="utf-8") as f:
        return json.load(f)


def _get_validator():
    """Return the shared validator, building it on first use."""
    global _validator
    if _validator is None:
        _validator = Draft202012Validator(
            _load_schema(), format_checker=FormatChecker()
        )
    return _validator


def validate_config(data: dict | str | bytes | bytearray) -> None:
    """Validate a model config against the JSON schema.

    Args:
        data: Either an already-decoded dict, or a JSON document as
            ``str``/``bytes``/``bytearray`` representing the model config.

    Raises:
        jsonschema.ValidationError: If the config is invalid.
        json.JSONDecodeError: If data is a string that is not valid JSON.

    Note:
        The schema itself is not re-checked against its metaschema here;
        ``Validator.validate`` only validates the instance.
    """
    if isinstance(data, (str, bytes, bytearray)):
        data = json.loads(data)

    _get_validator().validate(data)
class TestAnnotationConstants:
    """Verify annotation constants match Go definitions exactly."""

    def test_filepath(self):
        assert ANNOTATION_FILEPATH == "org.cncf.model.filepath"

    def test_file_metadata(self):
        assert ANNOTATION_FILE_METADATA == "org.cncf.model.file.metadata+json"

    def test_media_type_untested(self):
        assert (
            ANNOTATION_MEDIA_TYPE_UNTESTED == "org.cncf.model.file.mediatype.untested"
        )


class TestFileMetadata:
    """Tests for FileMetadata serialization."""

    def test_round_trip(self):
        dt = datetime(2025, 1, 1, 12, 0, 0, tzinfo=timezone.utc)
        meta = FileMetadata(
            name="model.bin",
            mode=0o644,
            uid=1000,
            gid=1000,
            size=1024,
            mod_time=dt,
            typeflag=0,
        )
        d = meta.to_dict()
        assert d["name"] == "model.bin"
        assert d["mode"] == 0o644
        assert d["uid"] == 1000
        assert d["gid"] == 1000
        assert d["size"] == 1024
        assert d["typeflag"] == 0
        assert d["mtime"] == dt.isoformat()

        # Compare the whole object so every field, including mod_time,
        # is covered by the round trip.
        restored = FileMetadata.from_dict(d)
        assert restored == meta

    def test_mtime_with_z_suffix(self):
        # A "Z" suffix is the usual RFC 3339 spelling of UTC.
        restored = FileMetadata.from_dict({"mtime": "2025-01-01T12:00:00Z"})
        assert restored.mod_time == datetime(2025, 1, 1, 12, 0, 0, tzinfo=timezone.utc)

    def test_empty(self):
        meta = FileMetadata()
        d = meta.to_dict()
        assert d["name"] == ""
        assert d["size"] == 0
        assert "mtime" not in d

    def test_from_empty_dict(self):
        # Missing keys fall back to the dataclass defaults.
        assert FileMetadata.from_dict({}) == FileMetadata()
class TestModality:
    """Tests for the Modality enum."""

    def test_all_values(self):
        assert Modality.TEXT.value == "text"
        assert Modality.IMAGE.value == "image"
        assert Modality.AUDIO.value == "audio"
        assert Modality.VIDEO.value == "video"
        assert Modality.EMBEDDING.value == "embedding"
        assert Modality.OTHER.value == "other"

    def test_from_string(self):
        assert Modality("text") == Modality.TEXT
        assert Modality("image") == Modality.IMAGE

    def test_invalid_value(self):
        with pytest.raises(ValueError):
            Modality("invalid")


class TestModelCapabilities:
    """Tests for ModelCapabilities serialization."""

    def test_empty(self):
        caps = ModelCapabilities()
        assert caps.to_dict() == {}

    def test_round_trip(self):
        caps = ModelCapabilities(
            input_types=[Modality.TEXT, Modality.IMAGE],
            output_types=[Modality.TEXT],
            reasoning=True,
            tool_usage=False,
            reward=True,
            languages=["en", "fr"],
        )
        d = caps.to_dict()
        assert d["inputTypes"] == ["text", "image"]
        assert d["outputTypes"] == ["text"]
        assert d["reasoning"] is True
        assert d["toolUsage"] is False
        assert d["reward"] is True
        assert d["languages"] == ["en", "fr"]
        # knowledge_cutoff was not set, so its key must be absent.
        assert "knowledgeCutoff" not in d

        # Whole-object comparison covers every field of the round trip.
        restored = ModelCapabilities.from_dict(d)
        assert restored == caps

    def test_knowledge_cutoff(self):
        dt = datetime(2025, 1, 1, tzinfo=timezone.utc)
        caps = ModelCapabilities(knowledge_cutoff=dt)
        d = caps.to_dict()
        assert d["knowledgeCutoff"] == dt.isoformat()

        restored = ModelCapabilities.from_dict(d)
        assert restored.knowledge_cutoff == dt

    def test_knowledge_cutoff_z_suffix(self):
        restored = ModelCapabilities.from_dict(
            {"knowledgeCutoff": "2025-01-01T00:00:00Z"}
        )
        assert restored.knowledge_cutoff == datetime(2025, 1, 1, tzinfo=timezone.utc)


class TestModelConfig:
    """Tests for ModelConfig serialization."""

    def test_empty(self):
        cfg = ModelConfig()
        assert cfg.to_dict() == {}

    def test_round_trip(self):
        cfg = ModelConfig(
            architecture="transformer",
            format="safetensors",
            param_size="8b",
            precision="float16",
            quantization="awq",
        )
        d = cfg.to_dict()
        assert d == {
            "architecture": "transformer",
            "format": "safetensors",
            "paramSize": "8b",
            "precision": "float16",
            "quantization": "awq",
        }

        restored = ModelConfig.from_dict(d)
        assert restored == cfg

    def test_with_capabilities(self):
        cfg = ModelConfig(
            param_size="8b",
            capabilities=ModelCapabilities(
                input_types=[Modality.TEXT],
                output_types=[Modality.TEXT],
            ),
        )
        d = cfg.to_dict()
        assert "capabilities" in d
        assert d["capabilities"]["inputTypes"] == ["text"]
        assert ModelConfig.from_dict(d) == cfg


class TestModelFS:
    """Tests for ModelFS serialization."""

    def test_round_trip(self):
        fs = ModelFS(
            type="layers",
            diff_ids=["sha256:abc123"],
        )
        d = fs.to_dict()
        assert d["type"] == "layers"
        assert d["diffIds"] == ["sha256:abc123"]

        restored = ModelFS.from_dict(d)
        assert restored == fs

    def test_from_empty_dict(self):
        assert ModelFS.from_dict({}) == ModelFS()


class TestModelDescriptor:
    """Tests for ModelDescriptor serialization."""

    def test_empty(self):
        desc = ModelDescriptor()
        assert desc.to_dict() == {}

    def test_round_trip(self):
        desc = ModelDescriptor(
            name="llama3-8b-instruct",
            version="3.1",
            family="llama3",
            authors=["Meta"],
            licenses=["Apache-2.0"],
        )
        d = desc.to_dict()
        assert d["name"] == "llama3-8b-instruct"
        assert d["version"] == "3.1"
        assert d["family"] == "llama3"
        assert d["authors"] == ["Meta"]
        assert d["licenses"] == ["Apache-2.0"]

        restored = ModelDescriptor.from_dict(d)
        assert restored == desc

    def test_created_at(self):
        dt = datetime(2025, 6, 15, 10, 30, 0, tzinfo=timezone.utc)
        desc = ModelDescriptor(name="test", created_at=dt)
        d = desc.to_dict()
        assert d["createdAt"] == dt.isoformat()

        restored = ModelDescriptor.from_dict(d)
        assert restored.created_at == dt

    def test_created_at_z_suffix(self):
        restored = ModelDescriptor.from_dict({"createdAt": "2025-06-15T10:30:00Z"})
        assert restored.created_at == datetime(
            2025, 6, 15, 10, 30, 0, tzinfo=timezone.utc
        )


class TestModel:
    """Tests for Model serialization."""

    def test_minimal(self):
        model = Model(
            descriptor=ModelDescriptor(name="test-model"),
            modelfs=ModelFS(type="layers", diff_ids=["sha256:abc"]),
            config=ModelConfig(param_size="8b"),
        )
        d = model.to_dict()
        assert d == {
            "descriptor": {"name": "test-model"},
            "modelfs": {"type": "layers", "diffIds": ["sha256:abc"]},
            "config": {"paramSize": "8b"},
        }

    def test_json_round_trip(self):
        model = Model(
            descriptor=ModelDescriptor(
                name="llama3-8b-instruct",
                version="3.1",
            ),
            modelfs=ModelFS(
                type="layers",
                diff_ids=[
                    "sha256:1234567890abcdef1234567890abcdef"
                    "1234567890abcdef1234567890abcdef"
                ],
            ),
            config=ModelConfig(param_size="8b"),
        )
        json_str = model.to_json()
        # The output must be parseable JSON with the three top-level sections.
        assert set(json.loads(json_str)) == {"descriptor", "modelfs", "config"}

        restored = Model.from_json(json_str)
        assert restored == model

    def test_from_json_string(self):
        data = json.dumps(
            {
                "descriptor": {"name": "test"},
                "modelfs": {"type": "layers", "diffIds": ["sha256:abc"]},
                "config": {"paramSize": "1b"},
            }
        )
        model = Model.from_json(data)
        assert model.descriptor.name == "test"
        assert model.modelfs.type == "layers"
        assert model.modelfs.diff_ids == ["sha256:abc"]
        assert model.config.param_size == "1b"

    def test_full_model(self):
        model = Model(
            descriptor=ModelDescriptor(
                name="qwen2-vl-72b-instruct",
                version="2.0",
                family="qwen2",
                vendor="Alibaba",
                authors=["Qwen Team"],
                licenses=["Apache-2.0"],
                title="Qwen2 VL 72B Instruct",
                description="A vision-language model",
                doc_url="https://example.com/docs",
                source_url="https://github.com/example/qwen2",
                datasets_url=["https://example.com/dataset"],
            ),
            modelfs=ModelFS(
                type="layers",
                diff_ids=["sha256:aabbcc", "sha256:ddeeff"],
            ),
            config=ModelConfig(
                architecture="transformer",
                format="safetensors",
                param_size="72b",
                precision="bfloat16",
                capabilities=ModelCapabilities(
                    input_types=[Modality.TEXT, Modality.IMAGE],
                    output_types=[Modality.TEXT],
                    reasoning=True,
                    tool_usage=True,
                    languages=["en", "zh"],
                ),
            ),
        )
        d = model.to_dict()
        assert d["descriptor"]["vendor"] == "Alibaba"
        assert d["descriptor"]["docURL"] == "https://example.com/docs"
        assert d["descriptor"]["sourceURL"] == "https://github.com/example/qwen2"
        assert d["descriptor"]["datasetsURL"] == ["https://example.com/dataset"]
        assert d["config"]["capabilities"]["inputTypes"] == ["text", "image"]
        assert d["config"]["capabilities"]["languages"] == ["en", "zh"]

        json_str = model.to_json()
        restored = Model.from_json(json_str)
        assert restored.config.capabilities.input_types == [
            Modality.TEXT,
            Modality.IMAGE,
        ]
        assert restored.config.capabilities.languages == ["en", "zh"]
        # Every populated field must survive the JSON round trip.
        assert restored == model
+ +"""Tests for media type constants matching specs-go/v1/mediatype.go.""" + +from modelpack.v1 import mediatype + + +class TestMediaTypes: + """Verify media type constants match Go definitions exactly.""" + + def test_artifact_type(self): + assert ( + mediatype.ARTIFACT_TYPE_MODEL_MANIFEST + == "application/vnd.cncf.model.manifest.v1+json" + ) + + def test_config(self): + assert ( + mediatype.MEDIA_TYPE_MODEL_CONFIG + == "application/vnd.cncf.model.config.v1+json" + ) + + def test_weight_types(self): + assert ( + mediatype.MEDIA_TYPE_MODEL_WEIGHT_RAW + == "application/vnd.cncf.model.weight.v1.raw" + ) + assert ( + mediatype.MEDIA_TYPE_MODEL_WEIGHT + == "application/vnd.cncf.model.weight.v1.tar" + ) + assert ( + mediatype.MEDIA_TYPE_MODEL_WEIGHT_GZIP + == "application/vnd.cncf.model.weight.v1.tar+gzip" + ) + assert ( + mediatype.MEDIA_TYPE_MODEL_WEIGHT_ZSTD + == "application/vnd.cncf.model.weight.v1.tar+zstd" + ) + + def test_weight_config_types(self): + assert ( + mediatype.MEDIA_TYPE_MODEL_WEIGHT_CONFIG_RAW + == "application/vnd.cncf.model.weight.config.v1.raw" + ) + assert ( + mediatype.MEDIA_TYPE_MODEL_WEIGHT_CONFIG + == "application/vnd.cncf.model.weight.config.v1.tar" + ) + assert ( + mediatype.MEDIA_TYPE_MODEL_WEIGHT_CONFIG_GZIP + == "application/vnd.cncf.model.weight.config.v1.tar+gzip" + ) + assert ( + mediatype.MEDIA_TYPE_MODEL_WEIGHT_CONFIG_ZSTD + == "application/vnd.cncf.model.weight.config.v1.tar+zstd" + ) + + def test_doc_types(self): + assert ( + mediatype.MEDIA_TYPE_MODEL_DOC_RAW + == "application/vnd.cncf.model.doc.v1.raw" + ) + assert mediatype.MEDIA_TYPE_MODEL_DOC == "application/vnd.cncf.model.doc.v1.tar" + assert ( + mediatype.MEDIA_TYPE_MODEL_DOC_GZIP + == "application/vnd.cncf.model.doc.v1.tar+gzip" + ) + assert ( + mediatype.MEDIA_TYPE_MODEL_DOC_ZSTD + == "application/vnd.cncf.model.doc.v1.tar+zstd" + ) + + def test_code_types(self): + assert ( + mediatype.MEDIA_TYPE_MODEL_CODE_RAW + == "application/vnd.cncf.model.code.v1.raw" + ) + assert ( 
+ mediatype.MEDIA_TYPE_MODEL_CODE == "application/vnd.cncf.model.code.v1.tar" + ) + assert ( + mediatype.MEDIA_TYPE_MODEL_CODE_GZIP + == "application/vnd.cncf.model.code.v1.tar+gzip" + ) + assert ( + mediatype.MEDIA_TYPE_MODEL_CODE_ZSTD + == "application/vnd.cncf.model.code.v1.tar+zstd" + ) + + def test_dataset_types(self): + assert ( + mediatype.MEDIA_TYPE_MODEL_DATASET_RAW + == "application/vnd.cncf.model.dataset.v1.raw" + ) + assert ( + mediatype.MEDIA_TYPE_MODEL_DATASET + == "application/vnd.cncf.model.dataset.v1.tar" + ) + assert ( + mediatype.MEDIA_TYPE_MODEL_DATASET_GZIP + == "application/vnd.cncf.model.dataset.v1.tar+gzip" + ) + assert ( + mediatype.MEDIA_TYPE_MODEL_DATASET_ZSTD + == "application/vnd.cncf.model.dataset.v1.tar+zstd" + ) diff --git a/specs-python/tests/test_validator.py b/specs-python/tests/test_validator.py new file mode 100644 index 0000000..5060797 --- /dev/null +++ b/specs-python/tests/test_validator.py @@ -0,0 +1,705 @@ +# Copyright 2025 The CNCF ModelPack Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Validation tests mirroring the Go test cases in schema/config_test.go. + +Each test case matches the corresponding Go test to ensure +consistent validation behavior between the Go and Python SDKs. +""" + +import json + +import pytest +from jsonschema import ValidationError + +from modelpack.v1.validator import validate_config + +# A valid base config used across tests. 
+VALID_CONFIG = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + }, + "config": { + "paramSize": "8b", + }, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef" + ], + }, +} + + +class TestValidConfigCases: + """Tests that valid configs pass validation.""" + + def test_minimal_valid(self): + validate_config(VALID_CONFIG) + + def test_valid_as_json_string(self): + validate_config(json.dumps(VALID_CONFIG)) + + def test_valid_with_all_fields(self): + config = { + "descriptor": { + "name": "llama3-8b-instruct", + "version": "3.1", + "family": "llama3", + "authors": ["Meta"], + "vendor": "Meta", + "licenses": ["Apache-2.0"], + "title": "Llama 3 8B Instruct", + "description": "An instruction-tuned LLM", + "createdAt": "2025-01-01T00:00:00Z", + "docURL": "https://example.com/docs", + "sourceURL": "https://github.com/meta/llama3", + "datasetsURL": ["https://example.com/data"], + "revision": "abc123", + }, + "config": { + "architecture": "transformer", + "format": "safetensors", + "paramSize": "8b", + "precision": "float16", + "quantization": "awq", + "capabilities": { + "inputTypes": ["text"], + "outputTypes": ["text"], + "knowledgeCutoff": "2025-01-01T00:00:00Z", + "reasoning": True, + "toolUsage": True, + "reward": False, + "languages": ["en", "fr"], + }, + }, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:abcdef1234567890abcdef1234567890" + "abcdef1234567890abcdef1234567890" + ], + }, + } + validate_config(config) + + +class TestFailureConfigCases: + """Tests mirroring Go config_test.go failure cases. + + Each test corresponds to a numbered test case in the Go file. 
+ """ + + def test_config_missing(self): + """Go test 0: config is missing.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + }, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_version_is_number(self): + """Go test 1: version is a number.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": 3.1, + }, + "config": {"paramSize": "8b"}, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_revision_is_number(self): + """Go test 2: revision is a number.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + "revision": 1234567890, + }, + "config": {"paramSize": "8b"}, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_created_at_not_rfc3339(self): + """Go test 3: createdAt is not RFC3339 format.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + "createdAt": "2025/01/01T00:00:00Z", + }, + "config": {"paramSize": "8b"}, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_authors_not_array(self): + """Go test 4: authors is not an array.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + "authors": "John Doe", + }, + "config": {"paramSize": "8b"}, + "modelfs": { + "type": "layers", + "diffIds": [ + 
"sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_licenses_not_array(self): + """Go test 5: licenses is not an array.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + "licenses": "Apache-2.0", + }, + "config": {"paramSize": "8b"}, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_doc_url_is_array(self): + """Go test 6: docURL is an array.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + "docURL": ["https://example.com/doc"], + }, + "config": {"paramSize": "8b"}, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_source_url_is_array(self): + """Go test 7: sourceURL is an array.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + "sourceURL": ["https://github.com/xyz/xyz3"], + }, + "config": {"paramSize": "8b"}, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_datasets_url_not_array(self): + """Go test 8: datasetsURL is not an array.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + "sourceURL": "https://github.com/xyz/xyz3", + "datasetsURL": "https://example.com/dataset", + }, + "config": {"paramSize": "8b"}, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + 
validate_config(config) + + def test_param_size_is_number(self): + """Go test 9: paramSize is a number.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + }, + "config": {"paramSize": 8000000}, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_precision_is_number(self): + """Go test 10: precision is a number.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + }, + "config": {"precision": 16}, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_type_not_layers(self): + """Go test 11: type is not 'layers'.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + }, + "config": {"paramSize": "8b"}, + "modelfs": { + "type": "layer", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_diff_ids_not_array(self): + """Go test 12: diffIds is not an array.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + }, + "config": {"paramSize": "8b"}, + "modelfs": { + "type": "layers", + "diffIds": ( + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ), + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_diff_ids_empty(self): + """Go test 13: diffIds is empty.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + }, + "config": {"paramSize": "8b"}, + "modelfs": { + "type": "layers", + "diffIds": [], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def 
test_input_types_not_array(self): + """Go test 14: inputTypes is not an array.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + }, + "config": { + "paramSize": "8b", + "capabilities": {"inputTypes": "text"}, + }, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_output_types_not_array(self): + """Go test 15: outputTypes is not an array.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + }, + "config": { + "paramSize": "8b", + "capabilities": {"outputTypes": "text"}, + }, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_invalid_modality(self): + """Go test 16: invalid modality value.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + }, + "config": { + "paramSize": "8b", + "capabilities": {"inputTypes": ["img"]}, + }, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_knowledge_cutoff_not_rfc3339(self): + """Go test 17: knowledgeCutoff is not RFC3339 format.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + }, + "config": { + "paramSize": "8b", + "capabilities": { + "inputTypes": ["text"], + "outputTypes": ["text"], + "knowledgeCutoff": "2025-01-01", + }, + }, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_reasoning_not_boolean(self): + 
"""Go test 18: reasoning is not boolean.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + }, + "config": { + "paramSize": "8b", + "capabilities": { + "inputTypes": ["text"], + "outputTypes": ["text"], + "reasoning": "true", + }, + }, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_tool_usage_not_boolean(self): + """Go test 19: toolUsage is not boolean.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + }, + "config": { + "paramSize": "8b", + "capabilities": { + "inputTypes": ["text"], + "outputTypes": ["text"], + "toolUsage": "true", + }, + }, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_reward_not_boolean(self): + """Go test 20: reward is not boolean.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + }, + "config": { + "paramSize": "8b", + "capabilities": { + "inputTypes": ["text"], + "outputTypes": ["text"], + "reward": "true", + }, + }, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_languages_not_array(self): + """Go test 21: languages is not an array.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + }, + "config": { + "paramSize": "8b", + "capabilities": { + "inputTypes": ["text"], + "outputTypes": ["text"], + "languages": "en", + }, + }, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with 
pytest.raises(ValidationError): + validate_config(config) + + def test_language_code_not_iso639(self): + """Go test 22: language code is not a two-letter ISO 639 code.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + }, + "config": { + "paramSize": "8b", + "capabilities": { + "inputTypes": ["text"], + "outputTypes": ["text"], + "languages": ["fra"], + }, + }, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_unknown_field_in_capabilities(self): + """Go test 23: unknown field in capabilities.""" + config = { + "descriptor": { + "name": "xyz-3-8B-Instruct", + "version": "3.1", + }, + "config": { + "paramSize": "8b", + "capabilities": { + "inputTypes": ["text"], + "unknownField": True, + }, + }, + "modelfs": { + "type": "layers", + "diffIds": [ + "sha256:1234567890abcdef1234567890abcdef" + "1234567890abcdef1234567890abcdef" + ], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + +class TestEdgeCases: + """Additional edge case tests.""" + + def test_empty_dict(self): + with pytest.raises(ValidationError): + validate_config({}) + + def test_invalid_json_string(self): + with pytest.raises(Exception): + validate_config("{invalid json") + + def test_empty_name(self): + """Name with minLength: 1 should reject empty string.""" + config = { + "descriptor": {"name": "", "version": "1.0"}, + "config": {"paramSize": "8b"}, + "modelfs": { + "type": "layers", + "diffIds": ["sha256:abc"], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_unknown_field_at_root(self): + config = { + "descriptor": {"name": "test", "version": "1.0"}, + "config": {"paramSize": "8b"}, + "modelfs": { + "type": "layers", + "diffIds": ["sha256:abc"], + }, + "extraField": "should fail", + } + with pytest.raises(ValidationError): + 
validate_config(config) + + def test_unknown_field_in_descriptor(self): + config = { + "descriptor": { + "name": "test", + "version": "1.0", + "unknownField": "value", + }, + "config": {"paramSize": "8b"}, + "modelfs": { + "type": "layers", + "diffIds": ["sha256:abc"], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_unknown_field_in_config(self): + config = { + "descriptor": {"name": "test", "version": "1.0"}, + "config": {"paramSize": "8b", "unknownField": "value"}, + "modelfs": { + "type": "layers", + "diffIds": ["sha256:abc"], + }, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_modelfs_missing(self): + config = { + "descriptor": {"name": "test", "version": "1.0"}, + "config": {"paramSize": "8b"}, + } + with pytest.raises(ValidationError): + validate_config(config) + + def test_descriptor_missing(self): + config = { + "config": {"paramSize": "8b"}, + "modelfs": { + "type": "layers", + "diffIds": ["sha256:abc"], + }, + } + with pytest.raises(ValidationError): + validate_config(config) From d177ffef2b313b68567316cf024a158af498c0f5 Mon Sep 17 00:00:00 2001 From: pradhyum6144 Date: Fri, 20 Mar 2026 17:40:16 +0530 Subject: [PATCH 2/3] feat: address review feedback on Python SDK - Remove copied config-schema.json, load from repo root as single source of truth - Fix validator dialect: use Draft4Validator to match schema's draft-04 declaration - Fix timestamp serialization to use 'Z' suffix for UTC, matching Go's RFC 3339 - Fix FileMetadata.mtime to always serialize (matches Go's non-pointer time.Time) - Migrate from setup.py to pyproject.toml - Fix import sorting (ruff) Closes #138 Signed-off-by: pradhyum6144 --- specs-python/modelpack/v1/__init__.py | 44 ++--- specs-python/modelpack/v1/annotations.py | 32 ++-- specs-python/modelpack/v1/config-schema.json | 168 ------------------- specs-python/modelpack/v1/config.py | 12 +- specs-python/modelpack/v1/validator.py | 20 ++- 
specs-python/pyproject.toml | 31 ++++ specs-python/setup.py | 33 ---- specs-python/tests/test_annotations.py | 4 +- specs-python/tests/test_config.py | 2 +- 9 files changed, 98 insertions(+), 248 deletions(-) delete mode 100644 specs-python/modelpack/v1/config-schema.json create mode 100644 specs-python/pyproject.toml delete mode 100644 specs-python/setup.py diff --git a/specs-python/modelpack/v1/__init__.py b/specs-python/modelpack/v1/__init__.py index 6a5c8cf..20a1225 100644 --- a/specs-python/modelpack/v1/__init__.py +++ b/specs-python/modelpack/v1/__init__.py @@ -14,43 +14,43 @@ """ModelPack Python SDK - CNCF standard for packaging and distributing AI models.""" +from modelpack.v1.annotations import ( + ANNOTATION_FILE_METADATA, + ANNOTATION_FILEPATH, + ANNOTATION_MEDIA_TYPE_UNTESTED, + FileMetadata, +) from modelpack.v1.config import ( + Modality, Model, ModelCapabilities, ModelConfig, ModelDescriptor, ModelFS, - Modality, -) -from modelpack.v1.annotations import ( - ANNOTATION_FILEPATH, - ANNOTATION_FILE_METADATA, - ANNOTATION_MEDIA_TYPE_UNTESTED, - FileMetadata, ) from modelpack.v1.mediatype import ( ARTIFACT_TYPE_MODEL_MANIFEST, - MEDIA_TYPE_MODEL_CONFIG, - MEDIA_TYPE_MODEL_WEIGHT_RAW, - MEDIA_TYPE_MODEL_WEIGHT, - MEDIA_TYPE_MODEL_WEIGHT_GZIP, - MEDIA_TYPE_MODEL_WEIGHT_ZSTD, - MEDIA_TYPE_MODEL_WEIGHT_CONFIG_RAW, - MEDIA_TYPE_MODEL_WEIGHT_CONFIG, - MEDIA_TYPE_MODEL_WEIGHT_CONFIG_GZIP, - MEDIA_TYPE_MODEL_WEIGHT_CONFIG_ZSTD, - MEDIA_TYPE_MODEL_DOC_RAW, - MEDIA_TYPE_MODEL_DOC, - MEDIA_TYPE_MODEL_DOC_GZIP, - MEDIA_TYPE_MODEL_DOC_ZSTD, - MEDIA_TYPE_MODEL_CODE_RAW, MEDIA_TYPE_MODEL_CODE, MEDIA_TYPE_MODEL_CODE_GZIP, + MEDIA_TYPE_MODEL_CODE_RAW, MEDIA_TYPE_MODEL_CODE_ZSTD, - MEDIA_TYPE_MODEL_DATASET_RAW, + MEDIA_TYPE_MODEL_CONFIG, MEDIA_TYPE_MODEL_DATASET, MEDIA_TYPE_MODEL_DATASET_GZIP, + MEDIA_TYPE_MODEL_DATASET_RAW, MEDIA_TYPE_MODEL_DATASET_ZSTD, + MEDIA_TYPE_MODEL_DOC, + MEDIA_TYPE_MODEL_DOC_GZIP, + MEDIA_TYPE_MODEL_DOC_RAW, + MEDIA_TYPE_MODEL_DOC_ZSTD, + 
MEDIA_TYPE_MODEL_WEIGHT, + MEDIA_TYPE_MODEL_WEIGHT_CONFIG, + MEDIA_TYPE_MODEL_WEIGHT_CONFIG_GZIP, + MEDIA_TYPE_MODEL_WEIGHT_CONFIG_RAW, + MEDIA_TYPE_MODEL_WEIGHT_CONFIG_ZSTD, + MEDIA_TYPE_MODEL_WEIGHT_GZIP, + MEDIA_TYPE_MODEL_WEIGHT_RAW, + MEDIA_TYPE_MODEL_WEIGHT_ZSTD, ) from modelpack.v1.validator import validate_config diff --git a/specs-python/modelpack/v1/annotations.py b/specs-python/modelpack/v1/annotations.py index e958f7a..3aa85d1 100644 --- a/specs-python/modelpack/v1/annotations.py +++ b/specs-python/modelpack/v1/annotations.py @@ -16,8 +16,8 @@ from __future__ import annotations -from dataclasses import dataclass -from datetime import datetime +from dataclasses import dataclass, field +from datetime import datetime, timezone # Annotation key for the file path of the layer. ANNOTATION_FILEPATH = "org.cncf.model.filepath" @@ -29,6 +29,14 @@ ANNOTATION_MEDIA_TYPE_UNTESTED = "org.cncf.model.file.mediatype.untested" +def _format_datetime(dt: datetime) -> str: + """Format a datetime as RFC 3339 with 'Z' suffix for UTC, matching Go.""" + s = dt.isoformat() + if s.endswith("+00:00"): + s = s[:-6] + "Z" + return s + + @dataclass class FileMetadata: """Represents the metadata of a file. @@ -41,31 +49,33 @@ class FileMetadata: uid: int = 0 gid: int = 0 size: int = 0 - mod_time: datetime | None = None + mod_time: datetime = field( + default_factory=lambda: datetime(1, 1, 1, tzinfo=timezone.utc) + ) typeflag: int = 0 def to_dict(self) -> dict: - """Serialize to a dict matching the JSON field names.""" - d: dict = { + """Serialize to a dict matching the JSON field names. + + All fields are always present, matching Go's FileMetadata + which has no omitempty tags. 
+ """ + return { "name": self.name, "mode": self.mode, "uid": self.uid, "gid": self.gid, "size": self.size, + "mtime": _format_datetime(self.mod_time), "typeflag": self.typeflag, } - if self.mod_time is not None: - d["mtime"] = self.mod_time.isoformat() - return d @classmethod def from_dict(cls, data: dict) -> FileMetadata: """Deserialize from a dict with JSON field names.""" mod_time = None if "mtime" in data: - mod_time = datetime.fromisoformat( - data["mtime"].replace("Z", "+00:00") - ) + mod_time = datetime.fromisoformat(data["mtime"].replace("Z", "+00:00")) return cls( name=data.get("name", ""), mode=data.get("mode", 0), diff --git a/specs-python/modelpack/v1/config-schema.json b/specs-python/modelpack/v1/config-schema.json deleted file mode 100644 index ce13fcc..0000000 --- a/specs-python/modelpack/v1/config-schema.json +++ /dev/null @@ -1,168 +0,0 @@ -{ - "description": "Model Artifact Configuration Schema", - "$schema": "http://json-schema.org/draft-04/schema#", - "$id": "https://github.com/modelpack/model-spec/config", - "type": "object", - "properties": { - "descriptor": { - "$ref": "#/$defs/ModelDescriptor" - }, - "modelfs": { - "$ref": "#/$defs/ModelFS" - }, - "config": { - "$ref": "#/$defs/ModelConfig" - } - }, - "additionalProperties": false, - "required": [ - "descriptor", - "config", - "modelfs" - ], - "$defs": { - "ModelConfig": { - "type": "object", - "properties": { - "architecture": { - "type": "string" - }, - "format": { - "type": "string" - }, - "paramSize": { - "type": "string" - }, - "precision": { - "type": "string" - }, - "quantization": { - "type": "string" - }, - "capabilities": { - "$ref": "#/$defs/ModelCapabilities" - } - }, - "additionalProperties": false - }, - "ModelDescriptor": { - "type": "object", - "properties": { - "createdAt": { - "type": "string", - "format": "date-time" - }, - "authors": { - "type": "array", - "items": { - "type": "string" - } - }, - "family": { - "type": "string" - }, - "name": { - "type": "string", - 
"minLength": 1 - }, - "docURL": { - "type": "string" - }, - "sourceURL": { - "type": "string" - }, - "datasetsURL": { - "type": "array", - "items": { - "type": "string" - } - }, - "version": { - "type": "string" - }, - "revision": { - "type": "string" - }, - "vendor": { - "type": "string" - }, - "licenses": { - "type": "array", - "items": { - "type": "string" - } - }, - "title": { - "type": "string" - }, - "description": { - "type": "string" - } - }, - "additionalProperties": false - }, - "ModelFS": { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": ["layers"] - }, - "diffIds": { - "type": "array", - "items": { - "type": "string" - }, - "minItems": 1 - } - }, - "additionalProperties": false, - "required": [ - "type", - "diffIds" - ] - }, - "ModelCapabilities": { - "type": "object", - "properties": { - "inputTypes": { - "type": "array", - "items": { - "$ref": "#/$defs/Modality" - } - }, - "outputTypes": { - "type": "array", - "items": { - "$ref": "#/$defs/Modality" - } - }, - "knowledgeCutoff": { - "type": "string", - "format": "date-time" - }, - "reasoning": { - "type": "boolean" - }, - "toolUsage": { - "type": "boolean" - }, - "reward": { - "type": "boolean" - }, - "languages": { - "type": "array", - "items": { - "type": "string", - "pattern": "^[a-z]{2}$" - } - } - }, - "additionalProperties": false - }, - "Modality": { - "type": "string", - "enum": ["text", "image", "audio", "video", "embedding", "other"] - } - } -} diff --git a/specs-python/modelpack/v1/config.py b/specs-python/modelpack/v1/config.py index cdeaf77..1dc857a 100644 --- a/specs-python/modelpack/v1/config.py +++ b/specs-python/modelpack/v1/config.py @@ -23,6 +23,14 @@ from typing import Optional +def _format_datetime(dt: datetime) -> str: + """Format a datetime as RFC 3339 with 'Z' suffix for UTC, matching Go.""" + s = dt.isoformat() + if s.endswith("+00:00"): + s = s[:-6] + "Z" + return s + + class Modality(str, Enum): """Defines the input and output types of the 
model. @@ -60,7 +68,7 @@ def to_dict(self) -> dict: if self.output_types is not None: d["outputTypes"] = [m.value for m in self.output_types] if self.knowledge_cutoff is not None: - d["knowledgeCutoff"] = self.knowledge_cutoff.isoformat() + d["knowledgeCutoff"] = _format_datetime(self.knowledge_cutoff) if self.reasoning is not None: d["reasoning"] = self.reasoning if self.tool_usage is not None: @@ -192,7 +200,7 @@ def to_dict(self) -> dict: """Serialize to a dict matching the JSON schema field names.""" d: dict = {} if self.created_at is not None: - d["createdAt"] = self.created_at.isoformat() + d["createdAt"] = _format_datetime(self.created_at) if self.authors is not None: d["authors"] = self.authors if self.family: diff --git a/specs-python/modelpack/v1/validator.py b/specs-python/modelpack/v1/validator.py index 83df26c..9a1873c 100644 --- a/specs-python/modelpack/v1/validator.py +++ b/specs-python/modelpack/v1/validator.py @@ -14,24 +14,26 @@ """JSON schema validation for ModelPack configs. -Uses the same config-schema.json as the Go validator to ensure -consistent validation behavior across languages. +Loads config-schema.json from the repo root (schema/config-schema.json) +as the single source of truth, matching the Go validator. 
""" from __future__ import annotations -import importlib.resources import json +from pathlib import Path -from jsonschema import Draft202012Validator, FormatChecker +from jsonschema import Draft4Validator, FormatChecker def _load_schema() -> dict: - """Load and return the config JSON schema.""" - schema_file = importlib.resources.files("modelpack.v1").joinpath( - "config-schema.json" + """Load and return the config JSON schema from the repo root.""" + schema_path = ( + Path(__file__).resolve().parent.parent.parent.parent + / "schema" + / "config-schema.json" ) - with schema_file.open(encoding="utf-8") as f: + with schema_path.open(encoding="utf-8") as f: return json.load(f) @@ -51,4 +53,4 @@ def validate_config(data: dict | str) -> None: schema = _load_schema() format_checker = FormatChecker() - Draft202012Validator(schema, format_checker=format_checker).validate(data) + Draft4Validator(schema, format_checker=format_checker).validate(data) diff --git a/specs-python/pyproject.toml b/specs-python/pyproject.toml new file mode 100644 index 0000000..6095901 --- /dev/null +++ b/specs-python/pyproject.toml @@ -0,0 +1,31 @@ +[build-system] +requires = ["setuptools>=64"] +build-backend = "setuptools.build_meta" + +[project] +name = "modelpack" +version = "0.1.0" +description = "Python SDK for the CNCF ModelPack specification" +requires-python = ">=3.10" +license = "Apache-2.0" +dependencies = [ + "jsonschema[format]>=4.20.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.0", + "ruff>=0.4.0", +] + +[tool.setuptools.packages.find] +include = ["modelpack*"] + +[tool.ruff] +line-length = 88 + +[tool.ruff.lint] +select = ["E", "F", "I", "W"] + +[tool.pytest.ini_options] +testpaths = ["tests"] diff --git a/specs-python/setup.py b/specs-python/setup.py deleted file mode 100644 index 0634240..0000000 --- a/specs-python/setup.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2025 The CNCF ModelPack Authors -# -# Licensed under the Apache License, Version 2.0 (the 
"License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from setuptools import setup, find_packages - -setup( - name="modelpack", - version="0.1.0", - description="Python SDK for the CNCF ModelPack specification", - packages=find_packages(), - package_data={"modelpack.v1": ["config-schema.json"]}, - python_requires=">=3.10", - install_requires=[ - "jsonschema[format]>=4.20.0", - ], - extras_require={ - "dev": [ - "pytest>=7.0", - "ruff>=0.4.0", - ], - }, -) diff --git a/specs-python/tests/test_annotations.py b/specs-python/tests/test_annotations.py index 4681cfe..cc31ab4 100644 --- a/specs-python/tests/test_annotations.py +++ b/specs-python/tests/test_annotations.py @@ -17,8 +17,8 @@ from datetime import datetime, timezone from modelpack.v1.annotations import ( - ANNOTATION_FILEPATH, ANNOTATION_FILE_METADATA, + ANNOTATION_FILEPATH, ANNOTATION_MEDIA_TYPE_UNTESTED, FileMetadata, ) @@ -69,4 +69,4 @@ def test_empty(self): d = meta.to_dict() assert d["name"] == "" assert d["size"] == 0 - assert "mtime" not in d + assert "mtime" in d diff --git a/specs-python/tests/test_config.py b/specs-python/tests/test_config.py index 2619e18..09fd512 100644 --- a/specs-python/tests/test_config.py +++ b/specs-python/tests/test_config.py @@ -20,12 +20,12 @@ import pytest from modelpack.v1.config import ( + Modality, Model, ModelCapabilities, ModelConfig, ModelDescriptor, ModelFS, - Modality, ) From cea4b91177cbd4ce7bdd3a9a55bd8a7053e74d38 Mon Sep 17 00:00:00 2001 From: pradhyum6144 Date: Mon, 23 Mar 2026 00:35:16 +0530 Subject: [PATCH 3/3] feat: adopt 
schema-driven generation for Python SDK types Replace hand-written dataclasses with auto-generated Pydantic models from schema/config-schema.json using datamodel-code-generator. This keeps Python types in sync with the canonical schema automatically. - Add tools/generate_python_models.py for type generation - Add Makefile target: make generate-python-models - Replace config.py (hand-written) with models.py (auto-generated) - Update tests to use Pydantic model_validate/model_dump API - Add pydantic>=2 dependency to pyproject.toml - Add specs-python/README.md with usage and regeneration docs - All 73 tests pass Signed-off-by: pradhyum6144 --- Makefile | 4 + specs-python/README.md | 58 +++++ specs-python/modelpack/v1/__init__.py | 24 ++- specs-python/modelpack/v1/config.py | 293 -------------------------- specs-python/modelpack/v1/models.py | 79 +++++++ specs-python/pyproject.toml | 2 + specs-python/tests/test_config.py | 204 +++++++++++------- tools/generate_python_models.py | 69 ++++++ 8 files changed, 353 insertions(+), 380 deletions(-) create mode 100644 specs-python/README.md delete mode 100644 specs-python/modelpack/v1/config.py create mode 100644 specs-python/modelpack/v1/models.py create mode 100644 tools/generate_python_models.py diff --git a/Makefile b/Makefile index 803e878..bdc16d2 100644 --- a/Makefile +++ b/Makefile @@ -5,3 +5,7 @@ validate-examples: ## validate examples in the specification markdown files .PHONY: test test: go test ./... + +.PHONY: generate-python-models +generate-python-models: ## generate Python models from JSON schema + python3 tools/generate_python_models.py diff --git a/specs-python/README.md b/specs-python/README.md new file mode 100644 index 0000000..d1cdcc4 --- /dev/null +++ b/specs-python/README.md @@ -0,0 +1,58 @@ +# Python ModelPack Types + +This directory provides Python data structures for the CNCF ModelPack specification. 
+ +The core model types are **auto-generated** from the canonical JSON Schema at `schema/config-schema.json` using [datamodel-code-generator](https://github.com/koxudaxi/datamodel-code-generator), ensuring they stay in sync with the specification automatically. + +## Requirements + +- Python >= 3.10 +- pydantic >= 2 +- jsonschema >= 4.20.0 + +## Installation + +```bash +cd specs-python +pip install -e . +``` + +For development: + +```bash +pip install -e ".[dev]" +``` + +## Usage + +```python +from modelpack.v1 import Model, ModelDescriptor, ModelFS, ModelConfig + +# Create a model from a JSON payload +model = Model.model_validate_json(json_payload) +print(model.descriptor.name) + +# Validate a config dict against the JSON schema +from modelpack.v1 import validate_config +validate_config(config_dict) +``` + +## Regenerate Models + +If the schema changes, regenerate the Pydantic models: + +```bash +pip install datamodel-code-generator +make generate-python-models +``` + +This runs `tools/generate_python_models.py`, which regenerates `specs-python/modelpack/v1/models.py`. + +**Do not edit `models.py` manually.** Update the schema and regenerate instead. + +## Run Tests + +```bash +cd specs-python +pytest +``` diff --git a/specs-python/modelpack/v1/__init__.py b/specs-python/modelpack/v1/__init__.py index 20a1225..3c9b164 100644 --- a/specs-python/modelpack/v1/__init__.py +++ b/specs-python/modelpack/v1/__init__.py @@ -12,7 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""ModelPack Python SDK - CNCF standard for packaging and distributing AI models.""" +"""ModelPack Python SDK - CNCF standard for packaging and distributing AI models. + +Types are auto-generated from schema/config-schema.json using datamodel-code-generator. 
+Do not edit models.py manually — regenerate with: make generate-python-models +""" from modelpack.v1.annotations import ( ANNOTATION_FILE_METADATA, @@ -20,14 +24,6 @@ ANNOTATION_MEDIA_TYPE_UNTESTED, FileMetadata, ) -from modelpack.v1.config import ( - Modality, - Model, - ModelCapabilities, - ModelConfig, - ModelDescriptor, - ModelFS, -) from modelpack.v1.mediatype import ( ARTIFACT_TYPE_MODEL_MANIFEST, MEDIA_TYPE_MODEL_CODE, @@ -52,6 +48,15 @@ MEDIA_TYPE_MODEL_WEIGHT_RAW, MEDIA_TYPE_MODEL_WEIGHT_ZSTD, ) +from modelpack.v1.models import ( + Language, + Modality, + Model, + ModelCapabilities, + ModelConfig, + ModelDescriptor, + ModelFS, +) from modelpack.v1.validator import validate_config __all__ = [ @@ -61,6 +66,7 @@ "ModelDescriptor", "ModelFS", "Modality", + "Language", "FileMetadata", "ANNOTATION_FILEPATH", "ANNOTATION_FILE_METADATA", diff --git a/specs-python/modelpack/v1/config.py b/specs-python/modelpack/v1/config.py deleted file mode 100644 index 1dc857a..0000000 --- a/specs-python/modelpack/v1/config.py +++ /dev/null @@ -1,293 +0,0 @@ -# Copyright 2025 The CNCF ModelPack Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Model configuration types matching the Go structs in specs-go/v1/config.go.""" - -from __future__ import annotations - -import json -from dataclasses import dataclass, field -from datetime import datetime -from enum import Enum -from typing import Optional - - -def _format_datetime(dt: datetime) -> str: - """Format a datetime as RFC 3339 with 'Z' suffix for UTC, matching Go.""" - s = dt.isoformat() - if s.endswith("+00:00"): - s = s[:-6] + "Z" - return s - - -class Modality(str, Enum): - """Defines the input and output types of the model. - - Mirrors the Go Modality type in specs-go/v1/config.go. - """ - - TEXT = "text" - IMAGE = "image" - AUDIO = "audio" - VIDEO = "video" - EMBEDDING = "embedding" - OTHER = "other" - - -@dataclass -class ModelCapabilities: - """Defines the special capabilities that the model supports. - - Mirrors the Go ModelCapabilities struct in specs-go/v1/config.go. - """ - - input_types: Optional[list[Modality]] = None - output_types: Optional[list[Modality]] = None - knowledge_cutoff: Optional[datetime] = None - reasoning: Optional[bool] = None - tool_usage: Optional[bool] = None - reward: Optional[bool] = None - languages: Optional[list[str]] = None - - def to_dict(self) -> dict: - """Serialize to a dict matching the JSON schema field names.""" - d: dict = {} - if self.input_types is not None: - d["inputTypes"] = [m.value for m in self.input_types] - if self.output_types is not None: - d["outputTypes"] = [m.value for m in self.output_types] - if self.knowledge_cutoff is not None: - d["knowledgeCutoff"] = _format_datetime(self.knowledge_cutoff) - if self.reasoning is not None: - d["reasoning"] = self.reasoning - if self.tool_usage is not None: - d["toolUsage"] = self.tool_usage - if self.reward is not None: - d["reward"] = self.reward - if self.languages is not None: - d["languages"] = self.languages - return d - - @classmethod - def from_dict(cls, data: dict) -> ModelCapabilities: - """Deserialize from a dict with JSON schema field 
names.""" - kwargs: dict = {} - if "inputTypes" in data: - kwargs["input_types"] = [Modality(v) for v in data["inputTypes"]] - if "outputTypes" in data: - kwargs["output_types"] = [Modality(v) for v in data["outputTypes"]] - if "knowledgeCutoff" in data and data["knowledgeCutoff"]: - kwargs["knowledge_cutoff"] = datetime.fromisoformat( - data["knowledgeCutoff"].replace("Z", "+00:00") - ) - if "reasoning" in data: - kwargs["reasoning"] = data["reasoning"] - if "toolUsage" in data: - kwargs["tool_usage"] = data["toolUsage"] - if "reward" in data: - kwargs["reward"] = data["reward"] - if "languages" in data: - kwargs["languages"] = data["languages"] - return cls(**kwargs) - - -@dataclass -class ModelConfig: - """Defines the execution parameters for running a model. - - Mirrors the Go ModelConfig struct in specs-go/v1/config.go. - """ - - architecture: str = "" - format: str = "" - param_size: str = "" - precision: str = "" - quantization: str = "" - capabilities: Optional[ModelCapabilities] = None - - def to_dict(self) -> dict: - """Serialize to a dict matching the JSON schema field names.""" - d: dict = {} - if self.architecture: - d["architecture"] = self.architecture - if self.format: - d["format"] = self.format - if self.param_size: - d["paramSize"] = self.param_size - if self.precision: - d["precision"] = self.precision - if self.quantization: - d["quantization"] = self.quantization - if self.capabilities is not None: - d["capabilities"] = self.capabilities.to_dict() - return d - - @classmethod - def from_dict(cls, data: dict) -> ModelConfig: - """Deserialize from a dict with JSON schema field names.""" - caps = None - if "capabilities" in data: - caps = ModelCapabilities.from_dict(data["capabilities"]) - return cls( - architecture=data.get("architecture", ""), - format=data.get("format", ""), - param_size=data.get("paramSize", ""), - precision=data.get("precision", ""), - quantization=data.get("quantization", ""), - capabilities=caps, - ) - - -@dataclass -class 
ModelFS: - """Describes layer content addresses. - - Mirrors the Go ModelFS struct in specs-go/v1/config.go. - """ - - type: str = "" - diff_ids: list[str] = field(default_factory=list) - - def to_dict(self) -> dict: - """Serialize to a dict matching the JSON schema field names.""" - return { - "type": self.type, - "diffIds": self.diff_ids, - } - - @classmethod - def from_dict(cls, data: dict) -> ModelFS: - """Deserialize from a dict with JSON schema field names.""" - return cls( - type=data.get("type", ""), - diff_ids=data.get("diffIds", []), - ) - - -@dataclass -class ModelDescriptor: - """Defines the general information of a model. - - Mirrors the Go ModelDescriptor struct in specs-go/v1/config.go. - """ - - created_at: Optional[datetime] = None - authors: Optional[list[str]] = None - family: str = "" - name: str = "" - doc_url: str = "" - source_url: str = "" - datasets_url: Optional[list[str]] = None - version: str = "" - revision: str = "" - vendor: str = "" - licenses: Optional[list[str]] = None - title: str = "" - description: str = "" - - def to_dict(self) -> dict: - """Serialize to a dict matching the JSON schema field names.""" - d: dict = {} - if self.created_at is not None: - d["createdAt"] = _format_datetime(self.created_at) - if self.authors is not None: - d["authors"] = self.authors - if self.family: - d["family"] = self.family - if self.name: - d["name"] = self.name - if self.doc_url: - d["docURL"] = self.doc_url - if self.source_url: - d["sourceURL"] = self.source_url - if self.datasets_url is not None: - d["datasetsURL"] = self.datasets_url - if self.version: - d["version"] = self.version - if self.revision: - d["revision"] = self.revision - if self.vendor: - d["vendor"] = self.vendor - if self.licenses is not None: - d["licenses"] = self.licenses - if self.title: - d["title"] = self.title - if self.description: - d["description"] = self.description - return d - - @classmethod - def from_dict(cls, data: dict) -> ModelDescriptor: - """Deserialize 
from a dict with JSON schema field names.""" - created_at = None - if "createdAt" in data: - created_at = datetime.fromisoformat( - data["createdAt"].replace("Z", "+00:00") - ) - return cls( - created_at=created_at, - authors=data.get("authors"), - family=data.get("family", ""), - name=data.get("name", ""), - doc_url=data.get("docURL", ""), - source_url=data.get("sourceURL", ""), - datasets_url=data.get("datasetsURL"), - version=data.get("version", ""), - revision=data.get("revision", ""), - vendor=data.get("vendor", ""), - licenses=data.get("licenses"), - title=data.get("title", ""), - description=data.get("description", ""), - ) - - -@dataclass -class Model: - """Defines the basic information of a model. - - Provides the application/vnd.cncf.model.config.v1+json mediatype - when marshalled to JSON. - - Mirrors the Go Model struct in specs-go/v1/config.go. - """ - - descriptor: ModelDescriptor = field(default_factory=ModelDescriptor) - modelfs: ModelFS = field(default_factory=ModelFS) - config: ModelConfig = field(default_factory=ModelConfig) - - def to_dict(self) -> dict: - """Serialize to a dict matching the JSON schema field names.""" - return { - "descriptor": self.descriptor.to_dict(), - "modelfs": self.modelfs.to_dict(), - "config": self.config.to_dict(), - } - - def to_json(self, indent: Optional[int] = 2) -> str: - """Serialize to a JSON string.""" - return json.dumps(self.to_dict(), indent=indent) - - @classmethod - def from_dict(cls, data: dict) -> Model: - """Deserialize from a dict with JSON schema field names.""" - return cls( - descriptor=ModelDescriptor.from_dict(data.get("descriptor", {})), - modelfs=ModelFS.from_dict(data.get("modelfs", {})), - config=ModelConfig.from_dict(data.get("config", {})), - ) - - @classmethod - def from_json(cls, json_str: str) -> Model: - """Deserialize from a JSON string.""" - return cls.from_dict(json.loads(json_str)) diff --git a/specs-python/modelpack/v1/models.py b/specs-python/modelpack/v1/models.py new file mode 
100644 index 0000000..bd89a00 --- /dev/null +++ b/specs-python/modelpack/v1/models.py @@ -0,0 +1,79 @@ +# generated by datamodel-codegen: +# filename: config-schema.json + +from __future__ import annotations + +from typing import Literal + +from pydantic import AwareDatetime, BaseModel, ConfigDict, Field, RootModel + + +class ModelDescriptor(BaseModel): + model_config = ConfigDict( + extra="forbid", + ) + createdAt: AwareDatetime | None = None + authors: list[str] | None = None + family: str | None = None + name: str | None = Field(None, min_length=1) + docURL: str | None = None + sourceURL: str | None = None + datasetsURL: list[str] | None = None + version: str | None = None + revision: str | None = None + vendor: str | None = None + licenses: list[str] | None = None + title: str | None = None + description: str | None = None + + +class ModelFS(BaseModel): + model_config = ConfigDict( + extra="forbid", + ) + type: Literal["layers"] + diffIds: list[str] = Field(..., min_length=1) + + +class Language(RootModel[str]): + root: str = Field(..., pattern="^[a-z]{2}$") + + +class Modality( + RootModel[Literal["text", "image", "audio", "video", "embedding", "other"]] +): + root: Literal["text", "image", "audio", "video", "embedding", "other"] + + +class ModelCapabilities(BaseModel): + model_config = ConfigDict( + extra="forbid", + ) + inputTypes: list[Modality] | None = None + outputTypes: list[Modality] | None = None + knowledgeCutoff: AwareDatetime | None = None + reasoning: bool | None = None + toolUsage: bool | None = None + reward: bool | None = None + languages: list[Language] | None = None + + +class ModelConfig(BaseModel): + model_config = ConfigDict( + extra="forbid", + ) + architecture: str | None = None + format: str | None = None + paramSize: str | None = None + precision: str | None = None + quantization: str | None = None + capabilities: ModelCapabilities | None = None + + +class Model(BaseModel): + model_config = ConfigDict( + extra="forbid", + ) + 
descriptor: ModelDescriptor + modelfs: ModelFS + config: ModelConfig diff --git a/specs-python/pyproject.toml b/specs-python/pyproject.toml index 6095901..d4295a6 100644 --- a/specs-python/pyproject.toml +++ b/specs-python/pyproject.toml @@ -9,6 +9,7 @@ description = "Python SDK for the CNCF ModelPack specification" requires-python = ">=3.10" license = "Apache-2.0" dependencies = [ + "pydantic>=2", "jsonschema[format]>=4.20.0", ] @@ -16,6 +17,7 @@ dependencies = [ dev = [ "pytest>=7.0", "ruff>=0.4.0", + "datamodel-code-generator>=0.25.0", ] [tool.setuptools.packages.find] diff --git a/specs-python/tests/test_config.py b/specs-python/tests/test_config.py index 09fd512..7706f72 100644 --- a/specs-python/tests/test_config.py +++ b/specs-python/tests/test_config.py @@ -12,14 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Tests for model config types - serialization and deserialization.""" +"""Tests for auto-generated Pydantic model types.""" import json from datetime import datetime, timezone import pytest +from pydantic import ValidationError -from modelpack.v1.config import ( +from modelpack.v1.models import ( + Language, Modality, Model, ModelCapabilities, @@ -30,42 +32,56 @@ class TestModality: - """Tests for the Modality enum.""" + """Tests for the Modality RootModel.""" def test_all_values(self): - assert Modality.TEXT.value == "text" - assert Modality.IMAGE.value == "image" - assert Modality.AUDIO.value == "audio" - assert Modality.VIDEO.value == "video" - assert Modality.EMBEDDING.value == "embedding" - assert Modality.OTHER.value == "other" + for val in ("text", "image", "audio", "video", "embedding", "other"): + m = Modality(root=val) + assert m.root == val def test_from_string(self): - assert Modality("text") == Modality.TEXT - assert Modality("image") == Modality.IMAGE + m = Modality.model_validate("text") + assert m.root == "text" def test_invalid_value(self): - with 
pytest.raises(ValueError): - Modality("invalid") + with pytest.raises(ValidationError): + Modality.model_validate("invalid") + + +class TestLanguage: + """Tests for the Language RootModel.""" + + def test_valid(self): + lang = Language.model_validate("en") + assert lang.root == "en" + + def test_invalid_three_letter(self): + with pytest.raises(ValidationError): + Language.model_validate("fra") + + def test_invalid_uppercase(self): + with pytest.raises(ValidationError): + Language.model_validate("EN") class TestModelCapabilities: - """Tests for ModelCapabilities serialization.""" + """Tests for ModelCapabilities Pydantic model.""" def test_empty(self): caps = ModelCapabilities() - assert caps.to_dict() == {} + d = caps.model_dump(exclude_none=True) + assert d == {} def test_round_trip(self): caps = ModelCapabilities( - input_types=[Modality.TEXT, Modality.IMAGE], - output_types=[Modality.TEXT], + inputTypes=[Modality(root="text"), Modality(root="image")], + outputTypes=[Modality(root="text")], reasoning=True, - tool_usage=False, + toolUsage=False, reward=True, - languages=["en", "fr"], + languages=[Language(root="en"), Language(root="fr")], ) - d = caps.to_dict() + d = caps.model_dump(exclude_none=True) assert d["inputTypes"] == ["text", "image"] assert d["outputTypes"] == ["text"] assert d["reasoning"] is True @@ -73,80 +89,90 @@ def test_round_trip(self): assert d["reward"] is True assert d["languages"] == ["en", "fr"] - restored = ModelCapabilities.from_dict(d) - assert restored.input_types == [Modality.TEXT, Modality.IMAGE] + restored = ModelCapabilities.model_validate(d) + assert restored.inputTypes[0].root == "text" assert restored.reasoning is True - assert restored.tool_usage is False + assert restored.toolUsage is False def test_knowledge_cutoff(self): dt = datetime(2025, 1, 1, tzinfo=timezone.utc) - caps = ModelCapabilities(knowledge_cutoff=dt) - d = caps.to_dict() + caps = ModelCapabilities(knowledgeCutoff=dt) + d = caps.model_dump(exclude_none=True, 
mode="json") assert "knowledgeCutoff" in d - restored = ModelCapabilities.from_dict(d) - assert restored.knowledge_cutoff is not None + restored = ModelCapabilities.model_validate(d) + assert restored.knowledgeCutoff is not None class TestModelConfig: - """Tests for ModelConfig serialization.""" + """Tests for ModelConfig Pydantic model.""" def test_empty(self): cfg = ModelConfig() - assert cfg.to_dict() == {} + d = cfg.model_dump(exclude_none=True) + assert d == {} def test_round_trip(self): cfg = ModelConfig( architecture="transformer", format="safetensors", - param_size="8b", + paramSize="8b", precision="float16", quantization="awq", ) - d = cfg.to_dict() + d = cfg.model_dump(exclude_none=True) assert d["architecture"] == "transformer" assert d["paramSize"] == "8b" - restored = ModelConfig.from_dict(d) + restored = ModelConfig.model_validate(d) assert restored.architecture == "transformer" - assert restored.param_size == "8b" + assert restored.paramSize == "8b" def test_with_capabilities(self): cfg = ModelConfig( - param_size="8b", + paramSize="8b", capabilities=ModelCapabilities( - input_types=[Modality.TEXT], - output_types=[Modality.TEXT], + inputTypes=[Modality(root="text")], + outputTypes=[Modality(root="text")], ), ) - d = cfg.to_dict() + d = cfg.model_dump(exclude_none=True) assert "capabilities" in d assert d["capabilities"]["inputTypes"] == ["text"] class TestModelFS: - """Tests for ModelFS serialization.""" + """Tests for ModelFS Pydantic model.""" def test_round_trip(self): fs = ModelFS( type="layers", - diff_ids=["sha256:abc123"], + diffIds=["sha256:abc123"], ) - d = fs.to_dict() + d = fs.model_dump() assert d["type"] == "layers" assert d["diffIds"] == ["sha256:abc123"] - restored = ModelFS.from_dict(d) + restored = ModelFS.model_validate(d) assert restored.type == "layers" - assert restored.diff_ids == ["sha256:abc123"] + assert restored.diffIds == ["sha256:abc123"] + + def test_invalid_type(self): + with pytest.raises(ValidationError): + 
ModelFS(type="invalid", diffIds=["sha256:abc"]) + + def test_empty_diff_ids(self): + with pytest.raises(ValidationError): + ModelFS(type="layers", diffIds=[]) class TestModelDescriptor: - """Tests for ModelDescriptor serialization.""" + """Tests for ModelDescriptor Pydantic model.""" def test_empty(self): desc = ModelDescriptor() - assert desc.to_dict() == {} + d = desc.model_dump(exclude_none=True) + assert d == {} def test_round_trip(self): desc = ModelDescriptor( @@ -156,34 +182,42 @@ def test_round_trip(self): authors=["Meta"], licenses=["Apache-2.0"], ) - d = desc.to_dict() + d = desc.model_dump(exclude_none=True) assert d["name"] == "llama3-8b-instruct" assert d["version"] == "3.1" - restored = ModelDescriptor.from_dict(d) + restored = ModelDescriptor.model_validate(d) assert restored.name == "llama3-8b-instruct" assert restored.authors == ["Meta"] def test_created_at(self): dt = datetime(2025, 6, 15, 10, 30, 0, tzinfo=timezone.utc) - desc = ModelDescriptor(name="test", created_at=dt) - d = desc.to_dict() + desc = ModelDescriptor(name="test", createdAt=dt) + d = desc.model_dump(exclude_none=True, mode="json") assert "createdAt" in d - restored = ModelDescriptor.from_dict(d) - assert restored.created_at is not None + restored = ModelDescriptor.model_validate(d) + assert restored.createdAt is not None + + def test_empty_name_rejected(self): + with pytest.raises(ValidationError): + ModelDescriptor(name="") + + def test_extra_field_rejected(self): + with pytest.raises(ValidationError): + ModelDescriptor.model_validate({"name": "test", "unknownField": "value"}) class TestModel: - """Tests for Model serialization.""" + """Tests for Model Pydantic model.""" def test_minimal(self): model = Model( descriptor=ModelDescriptor(name="test-model"), - modelfs=ModelFS(type="layers", diff_ids=["sha256:abc"]), - config=ModelConfig(param_size="8b"), + modelfs=ModelFS(type="layers", diffIds=["sha256:abc"]), + config=ModelConfig(paramSize="8b"), ) - d = model.to_dict() + d = 
model.model_dump(exclude_none=True) assert d["descriptor"]["name"] == "test-model" assert d["modelfs"]["type"] == "layers" assert d["config"]["paramSize"] == "8b" @@ -196,18 +230,18 @@ def test_json_round_trip(self): ), modelfs=ModelFS( type="layers", - diff_ids=[ + diffIds=[ "sha256:1234567890abcdef1234567890abcdef" "1234567890abcdef1234567890abcdef" ], ), - config=ModelConfig(param_size="8b"), + config=ModelConfig(paramSize="8b"), ) - json_str = model.to_json() - restored = Model.from_json(json_str) + json_str = model.model_dump_json() + restored = Model.model_validate_json(json_str) assert restored.descriptor.name == "llama3-8b-instruct" assert restored.modelfs.type == "layers" - assert restored.config.param_size == "8b" + assert restored.config.paramSize == "8b" def test_from_json_string(self): data = json.dumps( @@ -217,9 +251,9 @@ def test_from_json_string(self): "config": {"paramSize": "1b"}, } ) - model = Model.from_json(data) + model = Model.model_validate_json(data) assert model.descriptor.name == "test" - assert model.config.param_size == "1b" + assert model.config.paramSize == "1b" def test_full_model(self): model = Model( @@ -232,37 +266,51 @@ def test_full_model(self): licenses=["Apache-2.0"], title="Qwen2 VL 72B Instruct", description="A vision-language model", - doc_url="https://example.com/docs", - source_url="https://github.com/example/qwen2", - datasets_url=["https://example.com/dataset"], + docURL="https://example.com/docs", + sourceURL="https://github.com/example/qwen2", + datasetsURL=["https://example.com/dataset"], ), modelfs=ModelFS( type="layers", - diff_ids=["sha256:aabbcc", "sha256:ddeeff"], + diffIds=["sha256:aabbcc", "sha256:ddeeff"], ), config=ModelConfig( architecture="transformer", format="safetensors", - param_size="72b", + paramSize="72b", precision="bfloat16", capabilities=ModelCapabilities( - input_types=[Modality.TEXT, Modality.IMAGE], - output_types=[Modality.TEXT], + inputTypes=[Modality(root="text"), Modality(root="image")], 
+ outputTypes=[Modality(root="text")], reasoning=True, - tool_usage=True, - languages=["en", "zh"], + toolUsage=True, + languages=[Language(root="en"), Language(root="zh")], ), ), ) - d = model.to_dict() + d = model.model_dump(exclude_none=True) assert d["descriptor"]["vendor"] == "Alibaba" assert d["config"]["capabilities"]["inputTypes"] == ["text", "image"] assert d["config"]["capabilities"]["languages"] == ["en", "zh"] - json_str = model.to_json() - restored = Model.from_json(json_str) - assert restored.config.capabilities.input_types == [ - Modality.TEXT, - Modality.IMAGE, - ] - assert restored.config.capabilities.languages == ["en", "zh"] + json_str = model.model_dump_json() + restored = Model.model_validate_json(json_str) + assert restored.config.capabilities.inputTypes[0].root == "text" + assert restored.config.capabilities.inputTypes[1].root == "image" + assert restored.config.capabilities.languages[0].root == "en" + assert restored.config.capabilities.languages[1].root == "zh" + + def test_missing_required_fields(self): + with pytest.raises(ValidationError): + Model.model_validate({}) + + def test_extra_field_at_root(self): + with pytest.raises(ValidationError): + Model.model_validate( + { + "descriptor": {"name": "test"}, + "modelfs": {"type": "layers", "diffIds": ["sha256:abc"]}, + "config": {"paramSize": "8b"}, + "extraField": "should fail", + } + ) diff --git a/tools/generate_python_models.py b/tools/generate_python_models.py new file mode 100644 index 0000000..95bbc30 --- /dev/null +++ b/tools/generate_python_models.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +"""Generate Python models from the canonical ModelPack JSON Schema.""" + +from __future__ import annotations + +import subprocess +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent +SCHEMA_PATH = ROOT / "schema" / "config-schema.json" +OUTPUT_PATH = ROOT / "specs-python" / "modelpack" / "v1" / "models.py" + + +def main() -> int: + try: + import 
datamodel_code_generator # noqa: F401 + except ModuleNotFoundError: + print( + "error: datamodel-code-generator is not installed. " + "Install it with: pip install datamodel-code-generator", + file=sys.stderr, + ) + return 1 + + if not SCHEMA_PATH.is_file(): + print( + f"error: JSON Schema not found at: {SCHEMA_PATH}", + file=sys.stderr, + ) + return 1 + + OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True) + + cmd = [ + sys.executable, + "-m", + "datamodel_code_generator", + "--input", + str(SCHEMA_PATH), + "--output", + str(OUTPUT_PATH), + "--input-file-type", + "jsonschema", + "--output-model-type", + "pydantic_v2.BaseModel", + "--target-python-version", + "3.10", + "--enum-field-as-literal", + "all", + "--field-constraints", + "--disable-timestamp", + ] + + try: + subprocess.run(cmd, check=True) + except subprocess.CalledProcessError as exc: + print( + f"error: datamodel-code-generator failed with exit code {exc.returncode}.", + file=sys.stderr, + ) + return exc.returncode or 1 + else: + print(f"Generated: {OUTPUT_PATH}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main())