# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
"""Build a local COCO keypoints dataset for ``winml eval`` keypoint-detection.

Downloads the COCO person-keypoints annotations (cached once) and a chosen
number of validation images individually (so a small subset does not require
the full ~780 MB image zip), then writes an Arrow dataset to disk via
``datasets.Dataset.save_to_disk``. Point ``winml eval --dataset-path`` at the
output directory.

Each record has:
    - ``image``: the RGB image (datasets ``Image`` feature)
    - ``objects``: dict with parallel per-person lists ``keypoints`` (flat
      ``[x, y, v]`` triplets), ``bbox`` (COCO ``[x, y, w, h]``) and ``area``.

Only images containing at least one labeled-keypoint person are included.

Usage:
    uv run python scripts/build_coco_keypoints.py --output-dir ~/.cache/winml/datasets/coco_keypoints_val2017
    uv run python scripts/build_coco_keypoints.py --output-dir <output-dir> --num-images 100
    uv run python scripts/build_coco_keypoints.py --output-dir <output-dir> --num-images 0  # all images
"""

import argparse
import io
import json
import random
import shutil
import sys
import urllib.request
import zipfile
from pathlib import Path


ANNOTATIONS_URL = "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
ANNOTATION_MEMBER = "annotations/person_keypoints_val2017.json"
IMAGE_URL_TEMPLATE = "http://images.cocodataset.org/val2017/{file_name}"

DEFAULT_CACHE = Path.home() / ".cache" / "winml" / "coco_build"

# Timeout (seconds) applied to each blocking socket operation, so one stalled
# connection cannot hang the whole build.
DOWNLOAD_TIMEOUT_S = 60.0


def _download(url: str, dest: Path) -> None:
    """Download ``url`` to ``dest`` (skips if it already exists).

    The payload is streamed to a sibling ``<name>.part`` file and only renamed
    onto ``dest`` once the transfer completed. An interrupted download
    therefore never leaves a truncated file that a later run would mistake
    for a valid cache entry.

    Args:
        url: Source URL.
        dest: Final location of the downloaded file.

    Raises:
        OSError: If the transfer fails (``urllib.error.URLError`` is a subclass).
    """
    if dest.exists():
        return
    dest.parent.mkdir(parents=True, exist_ok=True)
    print(f"Downloading {url}")
    partial = dest.with_name(dest.name + ".part")
    try:
        with (
            urllib.request.urlopen(url, timeout=DOWNLOAD_TIMEOUT_S) as resp,  # noqa: S310
            partial.open("wb") as fh,
        ):
            shutil.copyfileobj(resp, fh)
        # Same-directory rename: the cache entry appears only when complete.
        partial.replace(dest)
    except BaseException:
        # Also covers Ctrl-C: drop the partial file, then propagate.
        partial.unlink(missing_ok=True)
        raise


def _load_annotations(cache_dir: Path) -> dict:
    """Return the parsed person-keypoints annotation JSON, downloading once.

    Args:
        cache_dir: Directory holding (or receiving) the annotations zip.

    Returns:
        The decoded ``ANNOTATION_MEMBER`` JSON document.
    """
    ann_zip = cache_dir / "annotations_trainval2017.zip"
    _download(ANNOTATIONS_URL, ann_zip)
    print("Reading keypoint annotations...")
    with zipfile.ZipFile(ann_zip) as zf, zf.open(ANNOTATION_MEMBER) as fh:
        return json.load(fh)


def _group_annotations_by_image(annotations: list[dict]) -> dict[int, list[dict]]:
    """Group person annotations by image id, keeping only labeled-keypoint people.

    Annotations with no labeled keypoints (``num_keypoints`` missing or <= 0)
    and crowd annotations (truthy ``iscrowd``) are dropped.

    Args:
        annotations: COCO annotation dicts.

    Returns:
        Mapping of ``image_id`` to the retained annotations, in input order.
    """
    by_image: dict[int, list[dict]] = {}
    for ann in annotations:
        if ann.get("num_keypoints", 0) <= 0 or ann.get("iscrowd", 0):
            continue
        by_image.setdefault(ann["image_id"], []).append(ann)
    return by_image


def _fetch_image(file_name: str) -> bytes:
    """Download one validation image and return its raw bytes."""
    url = IMAGE_URL_TEMPLATE.format(file_name=file_name)
    with urllib.request.urlopen(url, timeout=DOWNLOAD_TIMEOUT_S) as resp:  # noqa: S310
        return resp.read()


def build(output_dir: Path, num_images: int, cache_dir: Path, seed: int = 42) -> None:
    """Build and save the COCO keypoints dataset to ``output_dir``.

    Args:
        output_dir: Directory passed to ``Dataset.save_to_disk``.
        num_images: Number of images to include; values <= 0 keep every image
            that has keypoint annotations.
        cache_dir: Directory used to cache the annotations zip.
        seed: Seed for the subset shuffle (only used when ``num_images > 0``).
    """
    from datasets import Dataset, Features, Image, Sequence, Value
    from PIL import Image as PILImage

    # Expand "~" here: not every shell expands it before the script sees it.
    output_dir = output_dir.expanduser()
    cache_dir = cache_dir.expanduser()

    coco = _load_annotations(cache_dir)
    images_by_id = {img["id"]: img for img in coco["images"]}
    by_image = _group_annotations_by_image(coco["annotations"])

    image_ids = sorted(by_image)
    if num_images > 0:
        # Shuffle before truncating so a small subset is a representative random
        # sample of the validation set rather than the lowest image ids. Seeded
        # so repeated builds produce the same subset.
        random.Random(seed).shuffle(image_ids)
        image_ids = image_ids[:num_images]
    print(f"Building {len(image_ids)} images with keypoint annotations...")

    records = []
    for idx, image_id in enumerate(image_ids, start=1):
        info = images_by_id[image_id]
        try:
            raw = _fetch_image(info["file_name"])
            image = PILImage.open(io.BytesIO(raw)).convert("RGB")
        except Exception as exc:
            # Best effort: one unreachable or corrupt image must not abort the build.
            print(f"  skip {info['file_name']}: {exc}")
            continue

        persons = by_image[image_id]
        records.append(
            {
                "image": image,
                "objects": {
                    "keypoints": [[float(v) for v in p["keypoints"]] for p in persons],
                    "bbox": [[float(v) for v in p["bbox"]] for p in persons],
                    "area": [float(p["area"]) for p in persons],
                },
            }
        )
        if idx % 50 == 0:
            print(f"  {idx}/{len(image_ids)}")

    features = Features(
        {
            "image": Image(),
            "objects": {
                "keypoints": Sequence(Sequence(Value("float32"))),
                "bbox": Sequence(Sequence(Value("float32"))),
                "area": Sequence(Value("float32")),
            },
        }
    )
    dataset = Dataset.from_list(records, features=features)
    output_dir.mkdir(parents=True, exist_ok=True)
    dataset.save_to_disk(str(output_dir))
    print(f"Saved {len(dataset)} samples to {output_dir}")


def main() -> int:
    """Parse command-line arguments and run :func:`build`; returns the exit code."""
    parser = argparse.ArgumentParser(description="Build a local COCO keypoints dataset.")
    parser.add_argument("--output-dir", required=True, type=Path, help="Dataset output directory.")
    parser.add_argument(
        "--num-images",
        type=int,
        default=100,
        help="Number of images to include (0 = all images with keypoints).",
    )
    parser.add_argument(
        "--cache-dir",
        type=Path,
        default=DEFAULT_CACHE,
        help="Where to cache the downloaded annotations zip.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for selecting the image subset (used when --num-images > 0).",
    )
    args = parser.parse_args()
    build(args.output_dir, args.num_images, args.cache_dir, args.seed)
    return 0


if __name__ == "__main__":
    sys.exit(main())
"columns_mapping": { + "input_column": "image", + "annotation_column": "objects", + "keypoints_key": "keypoints", + "bbox_key": "bbox", + "area_key": "area", + "box_format": "xywh" + } + } + }, + { + "hf_id": "usyd-community/vitpose-plus-base", + "task": "keypoint-detection", + "model_type": "vitpose", + "group": "Top200", + "priority": "P3", + "dataset_config": { + "build_script": "scripts/build_coco_keypoints.py", + "path": "~/.cache/winml/datasets/coco_keypoints_val2017", + "split": "validation", + "metric": "map", + "winml_metric_key": "map", + "columns_mapping": { + "input_column": "image", + "annotation_column": "objects", + "keypoints_key": "keypoints", + "bbox_key": "bbox", + "area_key": "area", + "box_format": "xywh" + } + } + }, + { + "hf_id": "usyd-community/vitpose-plus-large", + "task": "keypoint-detection", + "model_type": "vitpose", + "group": "Top200", + "priority": "P3", + "dataset_config": { + "build_script": "scripts/build_coco_keypoints.py", + "path": "~/.cache/winml/datasets/coco_keypoints_val2017", + "split": "validation", + "metric": "map", + "winml_metric_key": "map", + "columns_mapping": { + "input_column": "image", + "annotation_column": "objects", + "keypoints_key": "keypoints", + "bbox_key": "bbox", + "area_key": "area", + "box_format": "xywh" + } + } + }, + { + "hf_id": "usyd-community/vitpose-plus-huge", + "task": "keypoint-detection", + "model_type": "vitpose", + "group": "Top200", + "priority": "P3", + "dataset_config": { + "build_script": "scripts/build_coco_keypoints.py", + "path": "~/.cache/winml/datasets/coco_keypoints_val2017", + "split": "validation", + "metric": "map", + "winml_metric_key": "map", + "columns_mapping": { + "input_column": "image", + "annotation_column": "objects", + "keypoints_key": "keypoints", + "bbox_key": "bbox", + "area_key": "area", + "box_format": "xywh" + } + } } ] diff --git a/src/winml/modelkit/eval/__init__.py b/src/winml/modelkit/eval/__init__.py index efb34f1fd..7aa18048f 100644 --- 
a/src/winml/modelkit/eval/__init__.py +++ b/src/winml/modelkit/eval/__init__.py @@ -25,8 +25,10 @@ from .image_feature_extraction_evaluator import WinMLImageFeatureExtractionEvaluator from .image_segmentation_evaluator import WinMLImageSegmentationEvaluator from .image_to_text_evaluator import WinMLImageToTextEvaluator + from .keypoint_detection_evaluator import WinMLKeypointDetectionEvaluator from .metrics.classification import ClassificationMetric from .metrics.depth import DepthMetric + from .metrics.keypoint import KeypointAPMetric from .metrics.knn_accuracy import KNNAccuracyMetric from .metrics.mean_average_precision import MAPMetric from .metrics.mean_iou import IGNORE_INDEX, MeanIoUMetric @@ -56,6 +58,8 @@ ".image_segmentation_evaluator:WinMLImageSegmentationEvaluator", "WinMLImageToTextEvaluator": ".image_to_text_evaluator:WinMLImageToTextEvaluator", + "WinMLKeypointDetectionEvaluator": + ".keypoint_detection_evaluator:WinMLKeypointDetectionEvaluator", "WinMLObjectDetectionEvaluator": ".object_detection_evaluator:WinMLObjectDetectionEvaluator", "WinMLQuestionAnsweringEvaluator": @@ -75,6 +79,8 @@ ".metrics.classification:ClassificationMetric", "DepthMetric": ".metrics.depth:DepthMetric", + "KeypointAPMetric": + ".metrics.keypoint:KeypointAPMetric", "IGNORE_INDEX": ".metrics.mean_iou:IGNORE_INDEX", "KNNAccuracyMetric": @@ -116,6 +122,7 @@ def __dir__() -> list[str]: "DepthMetric", "EvalResult", "KNNAccuracyMetric", + "KeypointAPMetric", "MAPMetric", "MeanIoUMetric", "PseudoPerplexityMetric", @@ -130,6 +137,7 @@ def __dir__() -> list[str]: "WinMLImageFeatureExtractionEvaluator", "WinMLImageSegmentationEvaluator", "WinMLImageToTextEvaluator", + "WinMLKeypointDetectionEvaluator", "WinMLObjectDetectionEvaluator", "WinMLQuestionAnsweringEvaluator", "WinMLTextClassificationEvaluator", diff --git a/src/winml/modelkit/eval/evaluate.py b/src/winml/modelkit/eval/evaluate.py index c5633f1e3..99ef1be41 100644 --- a/src/winml/modelkit/eval/evaluate.py +++ 
b/src/winml/modelkit/eval/evaluate.py @@ -62,6 +62,8 @@ "winml.modelkit.eval.zero_shot_image_classification_evaluator:WinMLZeroShotImageClassificationEvaluator", "depth-estimation": "winml.modelkit.eval.depth_estimation_evaluator:WinMLDepthEstimationEvaluator", + "keypoint-detection": + "winml.modelkit.eval.keypoint_detection_evaluator:WinMLKeypointDetectionEvaluator", "compare-tensor": "winml.modelkit.eval.tensor_similarity_evaluator:TensorSimilarityEvaluator", } @@ -172,6 +174,21 @@ def get_evaluator_class(config: WinMLEvaluationConfig) -> type[WinMLEvaluator]: # the legacy `nyu_depth_v2.py` loader script. "revision": "refs/convert/parquet", }, + "keypoint-detection": { + # Built locally by scripts/build_coco_keypoints.py (COCO has no + # script-free HF mirror for person keypoints). Run that script first, + # or pass --dataset-path to point at your own build. + "path": "~/.cache/winml/datasets/coco_keypoints_val2017", + "split": "validation", + "columns_mapping": { + "input_column": "image", + "annotation_column": "objects", + "keypoints_key": "keypoints", + "bbox_key": "bbox", + "area_key": "area", + "box_format": "xywh", + }, + }, } diff --git a/src/winml/modelkit/eval/keypoint_detection_evaluator.py b/src/winml/modelkit/eval/keypoint_detection_evaluator.py new file mode 100644 index 000000000..0cbc634bf --- /dev/null +++ b/src/winml/modelkit/eval/keypoint_detection_evaluator.py @@ -0,0 +1,222 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""Keypoint detection (human pose) evaluator using COCO OKS-based AP. + +ViTPose is top-down: it predicts keypoints inside a given person box, and +transformers exposes no ``keypoint-detection`` pipeline. 
So this evaluator +drives the image processor and ONNX model directly — for each ground-truth +person box it runs ``processor.preprocess -> model -> post_process_pose_estimation`` +— and scores the predictions against ground truth with ``KeypointAPMetric``. + +Using ground-truth person boxes isolates pose accuracy from detection quality, +which is the standard COCO top-down evaluation protocol. +""" + +from __future__ import annotations + +import logging +from types import SimpleNamespace +from typing import TYPE_CHECKING, Any + +from .base_evaluator import WinMLEvaluator + + +if TYPE_CHECKING: + from transformers.image_processing_utils import BaseImageProcessor + + from ..models.winml.base import WinMLPreTrainedModel + from .config import WinMLEvaluationConfig + +logger = logging.getLogger(__name__) + + +class WinMLKeypointDetectionEvaluator(WinMLEvaluator): + """Evaluator for keypoint detection using COCO OKS-based AP.""" + + def __init__( + self, + config: WinMLEvaluationConfig, + model: WinMLPreTrainedModel, + ) -> None: + from ..utils.eval_utils import get_default + + mapping = config.dataset.columns_mapping + task = "keypoint-detection" + self._image_col = mapping.get("input_column", get_default(task, "input_column")) + ann_col = mapping.get("annotation_column", get_default(task, "annotation_column")) + keypoints_key = mapping.get("keypoints_key", get_default(task, "keypoints_key")) + bbox_key = mapping.get("bbox_key", get_default(task, "bbox_key")) + area_key = mapping.get("area_key", get_default(task, "area_key")) + box_format = mapping.get("box_format", get_default(task, "box_format")) + assert ann_col is not None, "annotation_column has no default for keypoint-detection" + assert keypoints_key is not None, "keypoints_key has no default for keypoint-detection" + assert bbox_key is not None, "bbox_key has no default for keypoint-detection" + assert area_key is not None, "area_key has no default for keypoint-detection" + assert box_format is not None, 
"box_format has no default for keypoint-detection" + self._annotation_col: str = ann_col + self._keypoints_key: str = keypoints_key + self._bbox_key: str = bbox_key + self._area_key: str = area_key + self._box_format: str = box_format + + # Optional non-COCO keypoint layout: a model with a different keypoint + # set (e.g. SynthPose's 52 anatomical markers) can be scored by this + # same evaluator by supplying matching OKS sigmas and keypoint names + # through the dataset config. Absent -> the metric's COCO 17 defaults. + raw_sigmas = mapping.get("sigmas") + raw_names = mapping.get("keypoint_names") + self._sigmas: tuple[float, ...] | None = ( + tuple(float(s) for s in self._as_list(raw_sigmas)) if raw_sigmas else None + ) + self._keypoint_names: tuple[str, ...] | None = ( + tuple(str(n) for n in self._as_list(raw_names)) if raw_names else None + ) + + super().__init__(config, model) + + def prepare_pipeline(self) -> BaseImageProcessor: + """Load the image processor (no HF pipeline exists for this task). + + The processor size is forced to the exported ONNX input shape so the + preprocessed crops match the static model input. 
+ """ + from transformers import AutoImageProcessor + + processor = AutoImageProcessor.from_pretrained(self.config.model_id) + + io_config = getattr(self.model, "io_config", None) or {} + input_shapes = io_config.get("input_shapes", []) + if input_shapes and len(input_shapes[0]) == 4: + _, _, h, w = input_shapes[0] + processor.size = {"height": h, "width": w} # type: ignore[attr-defined] + + return processor + + def compute(self) -> dict[str, Any]: + """Run keypoint evaluation over all samples and return COCO AP/AR.""" + from tqdm import tqdm + + from .metrics import KeypointAPMetric + + processor = self.pipe + predictions: list[dict[str, Any]] = [] + references: list[dict[str, Any]] = [] + skipped = 0 + + for image_id, sample in enumerate(tqdm(self.data, desc="Evaluating keypoints")): + image = sample.get(self._image_col) + annotation = sample.get(self._annotation_col) + if image is None or not annotation: + skipped += 1 + continue + + boxes = [self._to_xywh(b) for b in annotation[self._bbox_key]] + gt_keypoints = annotation[self._keypoints_key] + areas = annotation[self._area_key] + if not boxes: + skipped += 1 + continue + + pose_results = self._predict_poses(processor, image, boxes) + + for person_idx, pose in enumerate(pose_results): + predictions.append( + { + "image_id": image_id, + "keypoints": self._flatten_prediction(pose), + "score": self._person_score(pose), + } + ) + references.append( + { + "image_id": image_id, + "keypoints": list(gt_keypoints[person_idx]), + "bbox": boxes[person_idx], + "area": float(areas[person_idx]), + } + ) + + if skipped: + logger.warning("Skipped %d samples with missing image or annotations.", skipped) + + metric_kwargs: dict[str, Any] = {} + if self._sigmas is not None: + metric_kwargs["sigmas"] = self._sigmas + if self._keypoint_names is not None: + metric_kwargs["keypoint_names"] = self._keypoint_names + return KeypointAPMetric().compute( + predictions=predictions, references=references, **metric_kwargs + ) + + def 
_predict_poses( + self, + processor: BaseImageProcessor, + image: Any, + boxes: list[list[float]], + ) -> list[dict[str, Any]]: + """Run preprocess -> model -> post_process for one image's person boxes. + + ViTPose is exported with a static batch size of 1, so each person crop + is run separately and the resulting heatmaps are stacked back into one + ``(num_persons, ...)`` batch for post-processing. + """ + import torch + + inputs = processor.preprocess(images=image, boxes=[boxes], return_tensors="pt") + pixel_values = inputs["pixel_values"] + + heatmaps = [] + for i in range(pixel_values.shape[0]): + outputs = self.model(pixel_values=pixel_values[i : i + 1]) + heatmaps.append(self._extract_heatmaps(outputs)) + + wrapped = SimpleNamespace(heatmaps=torch.cat(heatmaps, dim=0)) + # post_process returns one list per image; we pass a single image. + return processor.post_process_pose_estimation(wrapped, boxes=[boxes])[0] + + @staticmethod + def _extract_heatmaps(outputs: Any) -> Any: + """Pull the heatmap tensor from the model output. + + Falls back to the first output when the name differs, so the evaluator + does not depend on a specific ONNX output tensor name. 
+ """ + if not isinstance(outputs, dict): + return outputs.heatmaps + heatmaps = outputs.get("heatmaps") + if heatmaps is None: + heatmaps = next(iter(outputs.values())) + return heatmaps + + @staticmethod + def _as_list(value: Any) -> list[Any]: + """Coerce a comma-separated string or an existing sequence into a list.""" + if isinstance(value, str): + return [item.strip() for item in value.split(",") if item.strip()] + return list(value) + + def _to_xywh(self, box: Any) -> list[float]: + """Normalize a person box to COCO ``[x, y, w, h]``.""" + x0, y0, a, b = (float(v) for v in box) + if self._box_format == "xyxy": + return [x0, y0, a - x0, b - y0] + return [x0, y0, a, b] + + @staticmethod + def _flatten_prediction(pose: dict[str, Any]) -> list[float]: + """Interleave predicted ``(x, y)`` and per-keypoint score to ``[x, y, s, ...]``.""" + keypoints = pose["keypoints"].cpu().numpy() + scores = pose["scores"].cpu().numpy() + flat: list[float] = [] + for (x, y), score in zip(keypoints, scores, strict=False): + flat.extend([float(x), float(y), float(score)]) + return flat + + @staticmethod + def _person_score(pose: dict[str, Any]) -> float: + """Overall person confidence: mean of per-keypoint scores.""" + scores = pose["scores"].cpu().numpy() + return float(scores.mean()) if scores.size else 0.0 diff --git a/src/winml/modelkit/eval/metrics/__init__.py b/src/winml/modelkit/eval/metrics/__init__.py index 2695c3d84..2adfaf237 100644 --- a/src/winml/modelkit/eval/metrics/__init__.py +++ b/src/winml/modelkit/eval/metrics/__init__.py @@ -14,6 +14,7 @@ if TYPE_CHECKING: from .classification import ClassificationMetric from .depth import DepthMetric + from .keypoint import KeypointAPMetric from .knn_accuracy import KNNAccuracyMetric from .mean_average_precision import MAPMetric from .mean_iou import IGNORE_INDEX, MeanIoUMetric @@ -28,6 +29,7 @@ _LAZY_ATTRS: dict[str, str] = { "ClassificationMetric": ".classification:ClassificationMetric", "DepthMetric": ".depth:DepthMetric", + 
"KeypointAPMetric": ".keypoint:KeypointAPMetric", "IGNORE_INDEX": ".mean_iou:IGNORE_INDEX", "KNNAccuracyMetric": ".knn_accuracy:KNNAccuracyMetric", "MAPMetric": ".mean_average_precision:MAPMetric", @@ -59,6 +61,7 @@ def __dir__() -> list[str]: "ClassificationMetric", "DepthMetric", "KNNAccuracyMetric", + "KeypointAPMetric", "MAPMetric", "MeanIoUMetric", "PseudoPerplexityMetric", diff --git a/src/winml/modelkit/eval/metrics/keypoint.py b/src/winml/modelkit/eval/metrics/keypoint.py new file mode 100644 index 000000000..a345e5e39 --- /dev/null +++ b/src/winml/modelkit/eval/metrics/keypoint.py @@ -0,0 +1,209 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""COCO keypoint detection metric: OKS-based Average Precision. + +Computes the official COCO keypoint score — Average Precision averaged over +Object Keypoint Similarity (OKS) thresholds 0.50:0.95 — via ``pycocotools`` +``COCOeval(iouType="keypoints")``. This mirrors how the object-detection +evaluator reuses the COCO mAP protocol, but for pose keypoints. +""" + +from __future__ import annotations + +from typing import Any + + +# Standard COCO 17-keypoint OKS per-keypoint constants (pycocotools default). +# Exposed so non-COCO keypoint layouts can override them. +COCO_KEYPOINT_SIGMAS: tuple[float, ...] = ( + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, + 0.062, 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, +) + +# COCO person keypoint names (order matters; index == keypoint id). +COCO_KEYPOINT_NAMES: tuple[str, ...] 
= ( + "nose", "left_eye", "right_eye", "left_ear", "right_ear", + "left_shoulder", "right_shoulder", "left_elbow", "right_elbow", + "left_wrist", "right_wrist", "left_hip", "right_hip", + "left_knee", "right_knee", "left_ankle", "right_ankle", +) + + +class KeypointAPMetric: + """COCO-standard keypoint AP (OKS) wrapping ``pycocotools`` ``COCOeval``. + + Accepts per-instance predictions and ground truth as plain Python dicts + keyed by ``image_id`` and builds the COCO JSON structures internally. One + instance is one person (top-down pose estimation produces one keypoint set + per person box). + """ + + def compute( + self, + predictions: list[dict[str, Any]], + references: list[dict[str, Any]], + sigmas: tuple[float, ...] = COCO_KEYPOINT_SIGMAS, + keypoint_names: tuple[str, ...] = COCO_KEYPOINT_NAMES, + ) -> dict[str, float]: + """Compute COCO keypoint AP/AR. + + Args: + predictions: Per-person predictions. Each dict has: + - ``image_id``: int grouping key + - ``keypoints``: flat list ``[x1, y1, s1, ...]`` of length + ``3 * num_keypoints`` (``s`` is the per-keypoint score) + - ``score``: overall person confidence (float) + references: Per-person ground truth. Each dict has: + - ``image_id``: int grouping key + - ``keypoints``: flat list ``[x1, y1, v1, ...]`` (``v`` is the + COCO visibility flag 0/1/2) + - ``bbox``: ``[x, y, w, h]`` person box + - ``area``: person area used by the OKS normalization + - ``num_keypoints``: number of labeled keypoints (optional; + derived from visibility flags when absent) + sigmas: Per-keypoint OKS constants. Defaults to the COCO 17. + keypoint_names: Keypoint names for the category definition. + + Returns: + Dict with ``map``, ``map_50``, ``map_75``, ``map_medium``, + ``map_large``, ``mar``, ``mar_50``, ``mar_75``, plus + ``num_predictions``, ``num_ground_truths`` and ``num_images``. + Keys mirror the object-detection ``MAPMetric`` so downstream + reporting treats both COCO metrics the same way. 
+ """ + import contextlib + import io + + import numpy as np + from pycocotools.coco import COCO + from pycocotools.cocoeval import COCOeval + + self._validate_keypoint_counts(predictions, references, len(sigmas)) + + image_ids = sorted( + {int(r["image_id"]) for r in references} | {int(p["image_id"]) for p in predictions} + ) + + gt_dict = { + "images": [{"id": image_id} for image_id in image_ids], + "annotations": self._build_gt_annotations(references), + "categories": [ + {"id": 1, "name": "person", "keypoints": list(keypoint_names), "skeleton": []} + ], + } + + coco_gt = COCO() + coco_gt.dataset = gt_dict + # pycocotools writes progress to stdout; keep eval output quiet. + with contextlib.redirect_stdout(io.StringIO()): + coco_gt.createIndex() + + detections = [ + { + "image_id": int(p["image_id"]), + "category_id": 1, + "keypoints": [float(v) for v in p["keypoints"]], + "score": float(p["score"]), + } + for p in predictions + ] + + if not detections or not gt_dict["annotations"]: + return self._empty_result(predictions, references, image_ids) + + with contextlib.redirect_stdout(io.StringIO()): + coco_dt = coco_gt.loadRes(detections) + coco_eval = COCOeval(coco_gt, coco_dt, iouType="keypoints") + coco_eval.params.kpt_oks_sigmas = np.array(sigmas, dtype=np.float64) + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + stats = coco_eval.stats + return { + "map": float(stats[0]), + "map_50": float(stats[1]), + "map_75": float(stats[2]), + "map_medium": float(stats[3]), + "map_large": float(stats[4]), + "mar": float(stats[5]), + "mar_50": float(stats[6]), + "mar_75": float(stats[7]), + "num_predictions": len(detections), + "num_ground_truths": len(gt_dict["annotations"]), + "num_images": len(image_ids), + } + + @staticmethod + def _validate_keypoint_counts( + predictions: list[dict[str, Any]], + references: list[dict[str, Any]], + num_sigmas: int, + ) -> None: + """Ensure predictions, references and sigmas describe the same layout. 
+ + OKS is only defined when the model's keypoints match the ground-truth + keypoint set. A model with a different layout (e.g. SynthPose's 52 + anatomical markers vs COCO's 17) cannot be scored against COCO ground + truth, so fail early with an actionable message instead of a numpy + broadcast error inside pycocotools. + """ + for kind, items in (("prediction", predictions), ("reference", references)): + for item in items: + count = len(item["keypoints"]) // 3 + if count != num_sigmas: + raise ValueError( + f"Keypoint count mismatch: {kind} has {count} keypoints but the " + f"metric expects {num_sigmas} (from sigmas). The model's keypoint " + f"layout must match the dataset and sigmas. For a non-COCO layout " + f"(e.g. SynthPose's 52 markers), pass matching sigmas and " + f"keypoint_names and use a dataset with the same keypoint definition." + ) + + @staticmethod + def _build_gt_annotations(references: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Convert ground-truth instances to COCO annotation dicts.""" + annotations = [] + for i, ref in enumerate(references): + keypoints = [float(v) for v in ref["keypoints"]] + num_keypoints = ref.get("num_keypoints") + if num_keypoints is None: + # COCO visibility flag is every 3rd value; >0 means labeled. 
+ num_keypoints = sum(1 for v in keypoints[2::3] if v > 0) + annotations.append( + { + "id": i + 1, + "image_id": int(ref["image_id"]), + "category_id": 1, + "keypoints": keypoints, + "num_keypoints": int(num_keypoints), + "bbox": [float(v) for v in ref["bbox"]], + "area": float(ref["area"]), + "iscrowd": 0, + } + ) + return annotations + + @staticmethod + def _empty_result( + predictions: list[dict[str, Any]], + references: list[dict[str, Any]], + image_ids: list[int], + ) -> dict[str, float]: + """Return zeroed metrics when there is nothing to score.""" + return { + "map": 0.0, + "map_50": 0.0, + "map_75": 0.0, + "map_medium": 0.0, + "map_large": 0.0, + "mar": 0.0, + "mar_50": 0.0, + "mar_75": 0.0, + "num_predictions": len(predictions), + "num_ground_truths": len(references), + "num_images": len(image_ids), + } diff --git a/src/winml/modelkit/utils/eval_utils.py b/src/winml/modelkit/utils/eval_utils.py index 5a854cff9..27fdd66e1 100644 --- a/src/winml/modelkit/utils/eval_utils.py +++ b/src/winml/modelkit/utils/eval_utils.py @@ -287,6 +287,54 @@ class TaskSchema: ), ) +_KEYPOINT_DETECTION_SCHEMA = TaskSchema( + columns=( + SchemaItem( + "input_column", "input image (PIL.Image)", + default="image", remap_hint="", + ), + SchemaItem( + "annotation_column", + "annotation dict containing per-person keypoints + bbox + area", + default="objects", remap_hint="", + ), + ), + params=( + SchemaItem( + "keypoints_key", + "keypoints field inside the annotation dict " + "(flat [x, y, v] triplets per person)", + default="keypoints", remap_hint="", + ), + SchemaItem( + "bbox_key", + "person bbox field inside the annotation dict", + default="bbox", remap_hint="", + ), + SchemaItem( + "area_key", + "person area field inside the annotation dict", + default="area", remap_hint="", + ), + SchemaItem( + "box_format", "person bounding box layout", + default="xywh", remap_hint="", + ), + SchemaItem( + "sigmas", + "per-keypoint OKS sigmas as comma-separated floats; " + "defaults to the 
COCO 17-keypoint constants", + default="COCO 17 sigmas", remap_hint="", + ), + SchemaItem( + "keypoint_names", + "keypoint names in index order as comma-separated strings; " + "defaults to the COCO 17 names", + default="COCO 17 names", remap_hint="", + ), + ), +) + TASK_SCHEMAS: dict[str, TaskSchema] = { "image-classification": _IMAGE_CLASSIFICATION_SCHEMA, "text-classification": _TEXT_CLASSIFICATION_SCHEMA, @@ -304,6 +352,7 @@ class TaskSchema: "zero-shot-classification": _ZERO_SHOT_CLASSIFICATION_SCHEMA, "zero-shot-image-classification": _ZERO_SHOT_IMAGE_CLASSIFICATION_SCHEMA, "depth-estimation": _DEPTH_ESTIMATION_SCHEMA, + "keypoint-detection": _KEYPOINT_DETECTION_SCHEMA, } diff --git a/tests/unit/eval/test_keypoint_detection_evaluator.py b/tests/unit/eval/test_keypoint_detection_evaluator.py new file mode 100644 index 000000000..9cfbda7aa --- /dev/null +++ b/tests/unit/eval/test_keypoint_detection_evaluator.py @@ -0,0 +1,122 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""Unit tests for WinMLKeypointDetectionEvaluator. + +The end-to-end pose pipeline is covered by integration runs; these tests +pin the box-format handling, prediction flattening, and the ``compute()`` +loop wiring with a mocked image processor and model. 
def _make_evaluator(box_format: str = "xywh") -> WinMLKeypointDetectionEvaluator:
    """Build a bare evaluator without triggering data/model loading.

    The instance is allocated with ``object.__new__`` so ``__init__`` never
    runs; the private attributes below are then assigned by hand.

    Args:
        box_format: Value stored on ``_box_format`` (the tests in this module
            use ``"xywh"`` and ``"xyxy"``).

    Returns:
        The partially initialised evaluator.
    """
    evaluator = object.__new__(WinMLKeypointDetectionEvaluator)
    attributes = {
        "_image_col": "image",
        "_annotation_col": "objects",
        "_keypoints_key": "keypoints",
        "_bbox_key": "bbox",
        "_area_key": "area",
        "_box_format": box_format,
        "_sigmas": None,
        "_keypoint_names": None,
    }
    for attr_name, attr_value in attributes.items():
        setattr(evaluator, attr_name, attr_value)
    return evaluator
class _MockModel:
    """Stand-in pose model that emits all-zero heatmaps.

    Calling the instance returns ``{"heatmaps": tensor}`` where the tensor has
    shape ``(batch, num_keypoints, 64, 48)`` and ``batch`` is the size of the
    first axis of ``pixel_values``.
    """

    def __init__(self, num_keypoints: int = 17) -> None:
        self._num_keypoints = num_keypoints

    def __call__(self, pixel_values):
        # One heatmap stack per entry along the leading axis of the input.
        heatmap_shape = (pixel_values.shape[0], self._num_keypoints, 64, 48)
        return {"heatmaps": torch.zeros(heatmap_shape)}
def _coco_person_keypoints(cx: float, cy: float) -> list[float]:
    """Return 17 flat ``[x, y, v]`` keypoint triplets placed around a centre.

    Each keypoint sits at a fixed offset from ``(cx, cy)`` and carries
    visibility ``2.0`` (labeled and visible), giving 51 values in total.
    """
    layout = (
        (0, -40), (-5, -45), (5, -45), (-10, -42), (10, -42),
        (-20, -20), (20, -20), (-25, 0), (25, 0),
        (-28, 20), (28, 20), (-15, 25), (15, 25),
        (-15, 55), (15, 55), (-15, 85), (15, 85),
    )
    return [
        value
        for dx, dy in layout
        for value in (cx + dx, cy + dy, 2.0)  # visibility 2 = labeled + visible
    ]
class TestKeypointAPMetricImperfect:
    """Scenarios where predictions are badly misplaced or missing entirely."""

    def test_large_offset_lowers_ap(self) -> None:
        gt_kpts = _coco_person_keypoints(100.0, 100.0)
        # Push every x/y coordinate 60 px away from ground truth so the
        # prediction matches poorly; every third slot is the per-keypoint
        # confidence and is set to 1.0.
        shifted_kpts: list[float] = [
            1.0 if idx % 3 == 2 else value + 60.0 for idx, value in enumerate(gt_kpts)
        ]
        prediction = {"image_id": 1, "keypoints": shifted_kpts, "score": 0.9}
        reference = {
            "image_id": 1,
            "keypoints": gt_kpts,
            "bbox": [60.0, 50.0, 80.0, 110.0],
            "area": 80.0 * 110.0,
        }

        result = KeypointAPMetric().compute(
            predictions=[prediction],
            references=[reference],
        )

        assert result["map"] < 0.5

    def test_no_predictions_returns_zero(self) -> None:
        reference = {
            "image_id": 1,
            "keypoints": _coco_person_keypoints(100.0, 100.0),
            "bbox": [60.0, 50.0, 80.0, 110.0],
            "area": 80.0 * 110.0,
        }

        result = KeypointAPMetric().compute(predictions=[], references=[reference])

        # With nothing predicted the metric reports zero AP but still counts
        # the ground-truth person.
        assert result["map"] == 0.0
        assert result["num_predictions"] == 0
        assert result["num_ground_truths"] == 1
class TestKeypointAPMetricCustomLayout:
    """A non-COCO keypoint layout scores when matching sigmas are supplied."""

    def test_custom_layout_scores_with_matching_sigmas(self) -> None:
        # Five keypoints instead of COCO's 17: once sigmas and keypoint_names
        # describe that layout, perfect predictions should still reach
        # map ~= 1.0. This is what lets one evaluator handle non-COCO models
        # (e.g. SynthPose).
        names = ("a", "b", "c", "d", "e")
        sigmas = (0.05,) * len(names)
        offsets = ((0, 0), (10, 0), (0, 10), (-10, 0), (0, -10))

        # Ground truth: flat [x, y, v] triplets around (100, 100), all visible.
        gt_flat: list[float] = [
            value
            for dx, dy in offsets
            for value in (100.0 + dx, 100.0 + dy, 2.0)
        ]
        # Prediction: identical coordinates, with the third slot replaced by
        # a confidence of 1.0.
        pred_flat = [1.0 if idx % 3 == 2 else value for idx, value in enumerate(gt_flat)]

        prediction = {"image_id": 1, "keypoints": pred_flat, "score": 0.9}
        reference = {
            "image_id": 1,
            "keypoints": gt_flat,
            "bbox": [80.0, 80.0, 40.0, 40.0],
            "area": 1600.0,
        }

        result = KeypointAPMetric().compute(
            predictions=[prediction],
            references=[reference],
            sigmas=sigmas,
            keypoint_names=names,
        )

        assert result["map"] == pytest.approx(1.0, abs=0.01)
        assert result["num_ground_truths"] == 1