diff --git a/pyproject.toml b/pyproject.toml
index 46fa96dc7..fec3f7d54 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -297,6 +297,8 @@ lint.per-file-ignores."examples/**" = [ "ANN", "D100", "D103", "S101", "T20" ]
lint.per-file-ignores."scripts/e2e_eval/**" = [ "ANN", "D103", "E501", "PERF203", "T20" ]
# Download scripts: Allow subprocess, print, missing docstrings
lint.per-file-ignores."scripts/download_rules.py" = [ "D103", "E501", "PERF401", "S603", "S607", "SIM108", "T20" ]
+# COCO dataset build script: Allow print, missing docstrings, broad excepts
+lint.per-file-ignores."scripts/build_coco_keypoints.py" = [ "BLE001", "D103", "E501", "T20" ]
# CLI: Allow print statements
lint.per-file-ignores."src/winml/modelkit/cli.py" = [ "T20", "T201" ]
lint.per-file-ignores."src/winml/modelkit/commands/**" = [ "T20", "T201" ]
diff --git a/scripts/build_coco_keypoints.py b/scripts/build_coco_keypoints.py
new file mode 100644
index 000000000..98c0e5114
--- /dev/null
+++ b/scripts/build_coco_keypoints.py
@@ -0,0 +1,165 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+"""Build a local COCO keypoints dataset for ``winml eval`` keypoint-detection.
+
+Downloads the COCO person-keypoints annotations (cached once) and a chosen
+number of validation images individually (so a small subset does not require
+the full ~780 MB image zip), then writes an Arrow dataset to disk via
+``datasets.Dataset.save_to_disk``. Point ``winml eval --dataset-path`` at the
+output directory.
+
+Each record has:
+ - ``image``: the RGB image (datasets ``Image`` feature)
+ - ``objects``: dict with parallel per-person lists ``keypoints`` (flat
+ ``[x, y, v]`` triplets), ``bbox`` (COCO ``[x, y, w, h]``) and ``area``.
+
+Only images containing at least one labeled-keypoint person are included.
+
+Usage:
+ uv run python scripts/build_coco_keypoints.py --output-dir ~/.cache/winml/datasets/coco_keypoints_val2017
+ uv run python scripts/build_coco_keypoints.py --output-dir OUTPUT_DIR --num-images 100
+ uv run python scripts/build_coco_keypoints.py --output-dir OUTPUT_DIR --num-images 0  # all images
+"""
+
+import argparse
+import io
+import json
+import random
+import shutil
+import sys
+import urllib.request
+import zipfile
+from pathlib import Path
+
+
+# COCO 2017 train/val annotation bundle; only ANNOTATION_MEMBER is read from it.
+ANNOTATIONS_URL = "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
+# Path of the person-keypoints validation JSON inside the annotations zip.
+ANNOTATION_MEMBER = "annotations/person_keypoints_val2017.json"
+# Per-image download URL; ``file_name`` comes from the annotation "images" entries.
+IMAGE_URL_TEMPLATE = "http://images.cocodataset.org/val2017/{file_name}"
+
+# Default directory for the cached annotations zip (overridable via --cache-dir).
+DEFAULT_CACHE = Path.home() / ".cache" / "winml" / "coco_build"
+
+
+def _download(url: str, dest: Path, timeout: float = 60.0) -> None:
+    """Download ``url`` to ``dest`` (skips if it already exists).
+
+    The payload is streamed to a ``.part`` sibling and renamed onto ``dest``
+    only after the transfer completed. An interrupted download therefore never
+    leaves a truncated ``dest`` behind that a later run would mistake for a
+    valid cached file.
+
+    Args:
+        url: Source URL.
+        dest: Final location of the downloaded file.
+        timeout: Socket timeout in seconds for connecting and for each read.
+
+    Raises:
+        OSError: If the download or the local write fails (``urllib`` errors
+            derive from ``OSError``).
+    """
+    if dest.exists():
+        return
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    print(f"Downloading {url}")
+    partial = dest.with_name(dest.name + ".part")
+    try:
+        with (
+            urllib.request.urlopen(url, timeout=timeout) as resp,  # noqa: S310
+            partial.open("wb") as fh,
+        ):
+            shutil.copyfileobj(resp, fh)
+        # Same-directory rename: ``dest`` appears only once it is complete.
+        partial.replace(dest)
+    finally:
+        # No-op after a successful rename; removes the leftover on failure.
+        partial.unlink(missing_ok=True)
+
+
+def _load_annotations(cache_dir: Path) -> dict:
+    """Fetch (once) and parse the COCO person-keypoints validation annotations.
+
+    The annotations zip is cached under ``cache_dir``; only the
+    ``ANNOTATION_MEMBER`` entry is read from it.
+    """
+    archive_path = cache_dir / "annotations_trainval2017.zip"
+    _download(ANNOTATIONS_URL, archive_path)
+    print("Reading keypoint annotations...")
+    with zipfile.ZipFile(archive_path) as archive:
+        with archive.open(ANNOTATION_MEMBER) as member:
+            return json.load(member)
+
+
+def _group_annotations_by_image(annotations: list[dict]) -> dict[int, list[dict]]:
+    """Bucket person annotations by their ``image_id``.
+
+    Crowd annotations and annotations without any labeled keypoint are
+    dropped, so an image only appears in the result when it has at least one
+    usable person. Annotation order within an image is preserved.
+    """
+    grouped: dict[int, list[dict]] = {}
+    for person in annotations:
+        has_keypoints = person.get("num_keypoints", 0) > 0
+        is_crowd = bool(person.get("iscrowd", 0))
+        if has_keypoints and not is_crowd:
+            grouped.setdefault(person["image_id"], []).append(person)
+    return grouped
+
+
+def _fetch_image(file_name: str, timeout: float = 30.0) -> bytes:
+    """Download one validation image and return its raw bytes.
+
+    Args:
+        file_name: Image file name as listed in the annotation ``images``
+            entries; substituted into ``IMAGE_URL_TEMPLATE``.
+        timeout: Socket timeout in seconds, so a single stalled connection
+            cannot hang the whole build.
+
+    Raises:
+        OSError: If the request fails or times out.
+    """
+    url = IMAGE_URL_TEMPLATE.format(file_name=file_name)
+    with urllib.request.urlopen(url, timeout=timeout) as resp:  # noqa: S310
+        return resp.read()
+
+
+def build(output_dir: Path, num_images: int, cache_dir: Path, seed: int = 42) -> None:
+    """Build and save the COCO keypoints dataset to ``output_dir``.
+
+    Args:
+        output_dir: Directory the Arrow dataset is written to. A leading ``~``
+            is expanded, so the documented ``~/.cache/...`` default also works
+            when the shell does not expand it.
+        num_images: Number of images to include; ``0`` (or less) keeps every
+            image that has at least one labeled-keypoint person.
+        cache_dir: Directory holding the cached annotations zip (``~`` is
+            expanded).
+        seed: Seed for the subset shuffle, used only when ``num_images > 0``.
+
+    Raises:
+        RuntimeError: If not a single image could be built, instead of
+            silently saving an empty dataset.
+    """
+    from datasets import Dataset, Features, Image, Sequence, Value
+    from PIL import Image as PILImage
+
+    output_dir = output_dir.expanduser()
+    cache_dir = cache_dir.expanduser()
+
+    coco = _load_annotations(cache_dir)
+    images_by_id = {img["id"]: img for img in coco["images"]}
+    by_image = _group_annotations_by_image(coco["annotations"])
+
+    image_ids = sorted(by_image)
+    if num_images > 0:
+        # Shuffle before truncating so a small subset is a representative random
+        # sample of the validation set rather than the lowest image ids. Seeded
+        # so repeated builds produce the same subset.
+        random.Random(seed).shuffle(image_ids)
+        image_ids = image_ids[:num_images]
+    total = len(image_ids)
+    print(f"Building {total} images with keypoint annotations...")
+
+    records = []
+    skipped = 0
+    for idx, image_id in enumerate(image_ids, start=1):
+        info = images_by_id[image_id]
+        try:
+            raw = _fetch_image(info["file_name"])
+            image = PILImage.open(io.BytesIO(raw)).convert("RGB")
+        except Exception as exc:
+            # Best-effort: one unreachable or corrupt image must not abort the build.
+            skipped += 1
+            print(f" skip {info['file_name']}: {exc}")
+        else:
+            persons = by_image[image_id]
+            records.append(
+                {
+                    "image": image,
+                    "objects": {
+                        "keypoints": [[float(v) for v in p["keypoints"]] for p in persons],
+                        "bbox": [[float(v) for v in p["bbox"]] for p in persons],
+                        "area": [float(p["area"]) for p in persons],
+                    },
+                }
+            )
+        # Report progress for every processed image, including skipped ones.
+        if idx % 50 == 0:
+            print(f" {idx}/{total}")
+
+    if skipped:
+        print(f"Skipped {skipped} of {total} images.")
+    if not records:
+        raise RuntimeError(
+            "No images could be built; refusing to save an empty dataset. "
+            "Check network access to images.cocodataset.org."
+        )
+
+    features = Features(
+        {
+            "image": Image(),
+            "objects": {
+                "keypoints": Sequence(Sequence(Value("float32"))),
+                "bbox": Sequence(Sequence(Value("float32"))),
+                "area": Sequence(Value("float32")),
+            },
+        }
+    )
+    dataset = Dataset.from_list(records, features=features)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    dataset.save_to_disk(str(output_dir))
+    print(f"Saved {len(dataset)} samples to {output_dir}")
+
+
+def main() -> int:
+    """Parse the command line, run ``build`` and return the process exit code."""
+    cli = argparse.ArgumentParser(description="Build a local COCO keypoints dataset.")
+    cli.add_argument("--output-dir", required=True, type=Path, help="Dataset output directory.")
+    cli.add_argument(
+        "--num-images",
+        type=int,
+        default=100,
+        help="Number of images to include (0 = all images with keypoints).",
+    )
+    cli.add_argument(
+        "--cache-dir",
+        type=Path,
+        default=DEFAULT_CACHE,
+        help="Where to cache the downloaded annotations zip.",
+    )
+    cli.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="Random seed for selecting the image subset (used when --num-images > 0).",
+    )
+    options = cli.parse_args()
+    build(
+        output_dir=options.output_dir,
+        num_images=options.num_images,
+        cache_dir=options.cache_dir,
+        seed=options.seed,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/e2e_eval/testsets/models_with_acc.json b/scripts/e2e_eval/testsets/models_with_acc.json
index 49e4b73e5..32bd965b5 100644
--- a/scripts/e2e_eval/testsets/models_with_acc.json
+++ b/scripts/e2e_eval/testsets/models_with_acc.json
@@ -1927,5 +1927,115 @@
"depth_column": "depth_map"
}
}
+ },
+ {
+ "hf_id": "usyd-community/vitpose-base-simple",
+ "task": "keypoint-detection",
+ "model_type": "vitpose",
+ "group": "Top200",
+ "priority": "P3",
+ "dataset_config": {
+ "build_script": "scripts/build_coco_keypoints.py",
+ "path": "~/.cache/winml/datasets/coco_keypoints_val2017",
+ "split": "validation",
+ "metric": "map",
+ "winml_metric_key": "map",
+ "columns_mapping": {
+ "input_column": "image",
+ "annotation_column": "objects",
+ "keypoints_key": "keypoints",
+ "bbox_key": "bbox",
+ "area_key": "area",
+ "box_format": "xywh"
+ }
+ }
+ },
+ {
+ "hf_id": "usyd-community/vitpose-plus-small",
+ "task": "keypoint-detection",
+ "model_type": "vitpose",
+ "group": "Top200",
+ "priority": "P3",
+ "dataset_config": {
+ "build_script": "scripts/build_coco_keypoints.py",
+ "path": "~/.cache/winml/datasets/coco_keypoints_val2017",
+ "split": "validation",
+ "metric": "map",
+ "winml_metric_key": "map",
+ "columns_mapping": {
+ "input_column": "image",
+ "annotation_column": "objects",
+ "keypoints_key": "keypoints",
+ "bbox_key": "bbox",
+ "area_key": "area",
+ "box_format": "xywh"
+ }
+ }
+ },
+ {
+ "hf_id": "usyd-community/vitpose-plus-base",
+ "task": "keypoint-detection",
+ "model_type": "vitpose",
+ "group": "Top200",
+ "priority": "P3",
+ "dataset_config": {
+ "build_script": "scripts/build_coco_keypoints.py",
+ "path": "~/.cache/winml/datasets/coco_keypoints_val2017",
+ "split": "validation",
+ "metric": "map",
+ "winml_metric_key": "map",
+ "columns_mapping": {
+ "input_column": "image",
+ "annotation_column": "objects",
+ "keypoints_key": "keypoints",
+ "bbox_key": "bbox",
+ "area_key": "area",
+ "box_format": "xywh"
+ }
+ }
+ },
+ {
+ "hf_id": "usyd-community/vitpose-plus-large",
+ "task": "keypoint-detection",
+ "model_type": "vitpose",
+ "group": "Top200",
+ "priority": "P3",
+ "dataset_config": {
+ "build_script": "scripts/build_coco_keypoints.py",
+ "path": "~/.cache/winml/datasets/coco_keypoints_val2017",
+ "split": "validation",
+ "metric": "map",
+ "winml_metric_key": "map",
+ "columns_mapping": {
+ "input_column": "image",
+ "annotation_column": "objects",
+ "keypoints_key": "keypoints",
+ "bbox_key": "bbox",
+ "area_key": "area",
+ "box_format": "xywh"
+ }
+ }
+ },
+ {
+ "hf_id": "usyd-community/vitpose-plus-huge",
+ "task": "keypoint-detection",
+ "model_type": "vitpose",
+ "group": "Top200",
+ "priority": "P3",
+ "dataset_config": {
+ "build_script": "scripts/build_coco_keypoints.py",
+ "path": "~/.cache/winml/datasets/coco_keypoints_val2017",
+ "split": "validation",
+ "metric": "map",
+ "winml_metric_key": "map",
+ "columns_mapping": {
+ "input_column": "image",
+ "annotation_column": "objects",
+ "keypoints_key": "keypoints",
+ "bbox_key": "bbox",
+ "area_key": "area",
+ "box_format": "xywh"
+ }
+ }
}
]
diff --git a/src/winml/modelkit/eval/__init__.py b/src/winml/modelkit/eval/__init__.py
index efb34f1fd..7aa18048f 100644
--- a/src/winml/modelkit/eval/__init__.py
+++ b/src/winml/modelkit/eval/__init__.py
@@ -25,8 +25,10 @@
from .image_feature_extraction_evaluator import WinMLImageFeatureExtractionEvaluator
from .image_segmentation_evaluator import WinMLImageSegmentationEvaluator
from .image_to_text_evaluator import WinMLImageToTextEvaluator
+ from .keypoint_detection_evaluator import WinMLKeypointDetectionEvaluator
from .metrics.classification import ClassificationMetric
from .metrics.depth import DepthMetric
+ from .metrics.keypoint import KeypointAPMetric
from .metrics.knn_accuracy import KNNAccuracyMetric
from .metrics.mean_average_precision import MAPMetric
from .metrics.mean_iou import IGNORE_INDEX, MeanIoUMetric
@@ -56,6 +58,8 @@
".image_segmentation_evaluator:WinMLImageSegmentationEvaluator",
"WinMLImageToTextEvaluator":
".image_to_text_evaluator:WinMLImageToTextEvaluator",
+ "WinMLKeypointDetectionEvaluator":
+ ".keypoint_detection_evaluator:WinMLKeypointDetectionEvaluator",
"WinMLObjectDetectionEvaluator":
".object_detection_evaluator:WinMLObjectDetectionEvaluator",
"WinMLQuestionAnsweringEvaluator":
@@ -75,6 +79,8 @@
".metrics.classification:ClassificationMetric",
"DepthMetric":
".metrics.depth:DepthMetric",
+ "KeypointAPMetric":
+ ".metrics.keypoint:KeypointAPMetric",
"IGNORE_INDEX":
".metrics.mean_iou:IGNORE_INDEX",
"KNNAccuracyMetric":
@@ -116,6 +122,7 @@ def __dir__() -> list[str]:
"DepthMetric",
"EvalResult",
"KNNAccuracyMetric",
+ "KeypointAPMetric",
"MAPMetric",
"MeanIoUMetric",
"PseudoPerplexityMetric",
@@ -130,6 +137,7 @@ def __dir__() -> list[str]:
"WinMLImageFeatureExtractionEvaluator",
"WinMLImageSegmentationEvaluator",
"WinMLImageToTextEvaluator",
+ "WinMLKeypointDetectionEvaluator",
"WinMLObjectDetectionEvaluator",
"WinMLQuestionAnsweringEvaluator",
"WinMLTextClassificationEvaluator",
diff --git a/src/winml/modelkit/eval/evaluate.py b/src/winml/modelkit/eval/evaluate.py
index c5633f1e3..99ef1be41 100644
--- a/src/winml/modelkit/eval/evaluate.py
+++ b/src/winml/modelkit/eval/evaluate.py
@@ -62,6 +62,8 @@
"winml.modelkit.eval.zero_shot_image_classification_evaluator:WinMLZeroShotImageClassificationEvaluator",
"depth-estimation":
"winml.modelkit.eval.depth_estimation_evaluator:WinMLDepthEstimationEvaluator",
+ "keypoint-detection":
+ "winml.modelkit.eval.keypoint_detection_evaluator:WinMLKeypointDetectionEvaluator",
"compare-tensor":
"winml.modelkit.eval.tensor_similarity_evaluator:TensorSimilarityEvaluator",
}
@@ -172,6 +174,21 @@ def get_evaluator_class(config: WinMLEvaluationConfig) -> type[WinMLEvaluator]:
# the legacy `nyu_depth_v2.py` loader script.
"revision": "refs/convert/parquet",
},
+ "keypoint-detection": {
+ # Built locally by scripts/build_coco_keypoints.py (COCO has no
+ # script-free HF mirror for person keypoints). Run that script first,
+ # or pass --dataset-path to point at your own build.
+ "path": "~/.cache/winml/datasets/coco_keypoints_val2017",
+ "split": "validation",
+ "columns_mapping": {
+ "input_column": "image",
+ "annotation_column": "objects",
+ "keypoints_key": "keypoints",
+ "bbox_key": "bbox",
+ "area_key": "area",
+ "box_format": "xywh",
+ },
+ },
}
diff --git a/src/winml/modelkit/eval/keypoint_detection_evaluator.py b/src/winml/modelkit/eval/keypoint_detection_evaluator.py
new file mode 100644
index 000000000..0cbc634bf
--- /dev/null
+++ b/src/winml/modelkit/eval/keypoint_detection_evaluator.py
@@ -0,0 +1,222 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+"""Keypoint detection (human pose) evaluator using COCO OKS-based AP.
+
+ViTPose is top-down: it predicts keypoints inside a given person box, and
+transformers exposes no ``keypoint-detection`` pipeline. So this evaluator
+drives the image processor and ONNX model directly — for each ground-truth
+person box it runs ``processor.preprocess -> model -> post_process_pose_estimation``
+— and scores the predictions against ground truth with ``KeypointAPMetric``.
+
+Using ground-truth person boxes isolates pose accuracy from detection quality,
+which is the standard COCO top-down evaluation protocol.
+"""
+
+from __future__ import annotations
+
+import logging
+from types import SimpleNamespace
+from typing import TYPE_CHECKING, Any
+
+from .base_evaluator import WinMLEvaluator
+
+
+if TYPE_CHECKING:
+ from transformers.image_processing_utils import BaseImageProcessor
+
+ from ..models.winml.base import WinMLPreTrainedModel
+ from .config import WinMLEvaluationConfig
+
+logger = logging.getLogger(__name__)
+
+
+class WinMLKeypointDetectionEvaluator(WinMLEvaluator):
+    """Evaluator for keypoint detection using COCO OKS-based AP.
+
+    For every dataset sample the ground-truth person boxes are handed to the
+    image processor, each resulting crop is run through the model, and the
+    decoded poses are scored against the ground-truth keypoints with
+    ``KeypointAPMetric``.
+    """
+
+    # Box layouts that ``_to_xywh`` knows how to normalize.
+    _SUPPORTED_BOX_FORMATS = ("xywh", "xyxy")
+
+    def __init__(
+        self,
+        config: WinMLEvaluationConfig,
+        model: WinMLPreTrainedModel,
+    ) -> None:
+        """Resolve the dataset column mapping, then initialize the base evaluator.
+
+        Raises:
+            ValueError: If a required mapping key has neither a configured
+                value nor a schema default, or if ``box_format`` is not one of
+                ``xywh`` / ``xyxy``.
+        """
+        from ..utils.eval_utils import get_default
+
+        mapping = config.dataset.columns_mapping
+        task = "keypoint-detection"
+
+        def _required(key: str) -> str:
+            # Raised explicitly rather than asserted: ``assert`` statements are
+            # stripped under ``python -O`` and would let ``None`` through.
+            value = mapping.get(key, get_default(task, key))
+            if value is None:
+                raise ValueError(f"{key} has no default for keypoint-detection")
+            return value
+
+        self._image_col = mapping.get("input_column", get_default(task, "input_column"))
+        self._annotation_col: str = _required("annotation_column")
+        self._keypoints_key: str = _required("keypoints_key")
+        self._bbox_key: str = _required("bbox_key")
+        self._area_key: str = _required("area_key")
+
+        box_format = _required("box_format")
+        if box_format not in self._SUPPORTED_BOX_FORMATS:
+            # Any other layout would be silently read as xywh and yield
+            # meaningless crops and scores.
+            raise ValueError(
+                f"Unsupported box_format {box_format!r} for keypoint-detection; "
+                f"expected one of {self._SUPPORTED_BOX_FORMATS}."
+            )
+        self._box_format: str = box_format
+
+        # Optional non-COCO keypoint layout: a model with a different keypoint
+        # set (e.g. SynthPose's 52 anatomical markers) can be scored by this
+        # same evaluator by supplying matching OKS sigmas and keypoint names
+        # through the dataset config. Absent -> the metric's COCO 17 defaults.
+        raw_sigmas = mapping.get("sigmas")
+        raw_names = mapping.get("keypoint_names")
+        self._sigmas: tuple[float, ...] | None = (
+            tuple(float(s) for s in self._as_list(raw_sigmas)) if raw_sigmas else None
+        )
+        self._keypoint_names: tuple[str, ...] | None = (
+            tuple(str(n) for n in self._as_list(raw_names)) if raw_names else None
+        )
+
+        super().__init__(config, model)
+
+    def prepare_pipeline(self) -> BaseImageProcessor:
+        """Load the image processor (no HF pipeline exists for this task).
+
+        When the model exposes a static 4-D input shape, the processor size is
+        forced to its height/width so the preprocessed crops match the model
+        input. Shapes whose spatial dimensions are not positive integers (for
+        example dynamic axes) leave the processor's own size untouched.
+        """
+        from transformers import AutoImageProcessor
+
+        processor = AutoImageProcessor.from_pretrained(self.config.model_id)
+
+        io_config = getattr(self.model, "io_config", None) or {}
+        input_shapes = io_config.get("input_shapes", [])
+        if input_shapes and len(input_shapes[0]) == 4:
+            _, _, height, width = input_shapes[0]
+            if all(isinstance(dim, int) and dim > 0 for dim in (height, width)):
+                processor.size = {"height": height, "width": width}  # type: ignore[attr-defined]
+
+        return processor
+
+    def compute(self) -> dict[str, Any]:
+        """Run keypoint evaluation over all samples and return COCO AP/AR.
+
+        Samples without an image, without annotations or without any person
+        box are skipped (and counted in a warning). The sample's position in
+        the dataset is used as its ``image_id``.
+
+        Raises:
+            ValueError: If a sample's bbox / keypoints / area lists differ in
+                length, or the processor returns a different number of poses
+                than boxes were supplied.
+        """
+        from tqdm import tqdm
+
+        from .metrics import KeypointAPMetric
+
+        processor = self.pipe
+        predictions: list[dict[str, Any]] = []
+        references: list[dict[str, Any]] = []
+        skipped = 0
+
+        for image_id, sample in enumerate(tqdm(self.data, desc="Evaluating keypoints")):
+            image = sample.get(self._image_col)
+            annotation = sample.get(self._annotation_col)
+            if image is None or not annotation:
+                skipped += 1
+                continue
+
+            boxes = [self._to_xywh(b) for b in annotation[self._bbox_key]]
+            gt_keypoints = annotation[self._keypoints_key]
+            areas = annotation[self._area_key]
+            if not boxes:
+                skipped += 1
+                continue
+            if not len(boxes) == len(gt_keypoints) == len(areas):
+                # Misaligned lists would pair a prediction with the wrong
+                # person (or drop ground truth), so fail loudly.
+                raise ValueError(
+                    f"Sample {image_id}: annotation lists are misaligned "
+                    f"({len(boxes)} boxes, {len(gt_keypoints)} keypoint sets, "
+                    f"{len(areas)} areas)."
+                )
+
+            pose_results = self._predict_poses(processor, image, boxes)
+            if len(pose_results) != len(boxes):
+                raise ValueError(
+                    f"Sample {image_id}: got {len(pose_results)} poses for "
+                    f"{len(boxes)} person boxes."
+                )
+
+            for pose, person_keypoints, box, area in zip(
+                pose_results, gt_keypoints, boxes, areas, strict=True
+            ):
+                predictions.append(
+                    {
+                        "image_id": image_id,
+                        "keypoints": self._flatten_prediction(pose),
+                        "score": self._person_score(pose),
+                    }
+                )
+                references.append(
+                    {
+                        "image_id": image_id,
+                        "keypoints": list(person_keypoints),
+                        "bbox": box,
+                        "area": float(area),
+                    }
+                )
+
+        if skipped:
+            logger.warning("Skipped %d samples with missing image or annotations.", skipped)
+
+        # Only forward overrides that were configured; otherwise the metric's
+        # own COCO defaults apply.
+        metric_kwargs: dict[str, Any] = {}
+        if self._sigmas is not None:
+            metric_kwargs["sigmas"] = self._sigmas
+        if self._keypoint_names is not None:
+            metric_kwargs["keypoint_names"] = self._keypoint_names
+        return KeypointAPMetric().compute(
+            predictions=predictions, references=references, **metric_kwargs
+        )
+
+    def _predict_poses(
+        self,
+        processor: BaseImageProcessor,
+        image: Any,
+        boxes: list[list[float]],
+    ) -> list[dict[str, Any]]:
+        """Run preprocess -> model -> post_process for one image's person boxes.
+
+        ViTPose is exported with a static batch size of 1, so each person crop
+        is run separately and the resulting heatmaps are stacked back into one
+        ``(num_persons, ...)`` batch for post-processing.
+        """
+        import torch
+
+        inputs = processor.preprocess(images=image, boxes=[boxes], return_tensors="pt")
+        pixel_values = inputs["pixel_values"]
+
+        heatmaps = []
+        for i in range(pixel_values.shape[0]):
+            outputs = self.model(pixel_values=pixel_values[i : i + 1])
+            heatmaps.append(self._extract_heatmaps(outputs))
+
+        wrapped = SimpleNamespace(heatmaps=torch.cat(heatmaps, dim=0))
+        # post_process returns one list per image; we pass a single image.
+        return processor.post_process_pose_estimation(wrapped, boxes=[boxes])[0]
+
+    @staticmethod
+    def _extract_heatmaps(outputs: Any) -> Any:
+        """Pull the heatmap tensor from the model output.
+
+        Non-dict outputs are read through their ``heatmaps`` attribute. For
+        dict outputs the ``"heatmaps"`` entry is used, falling back to the
+        first value when the name differs, so the evaluator does not depend
+        on a specific ONNX output tensor name.
+        """
+        if not isinstance(outputs, dict):
+            return outputs.heatmaps
+        heatmaps = outputs.get("heatmaps")
+        if heatmaps is None:
+            heatmaps = next(iter(outputs.values()))
+        return heatmaps
+
+    @staticmethod
+    def _as_list(value: Any) -> list[Any]:
+        """Coerce a comma-separated string or an existing sequence into a list.
+
+        String items are stripped and empty items are dropped.
+        """
+        if isinstance(value, str):
+            return [item.strip() for item in value.split(",") if item.strip()]
+        return list(value)
+
+    def _to_xywh(self, box: Any) -> list[float]:
+        """Normalize a person box to COCO ``[x, y, w, h]``.
+
+        ``xyxy`` boxes are converted; every other format is passed through as
+        ``xywh`` (``__init__`` rejects formats other than these two).
+        """
+        x0, y0, a, b = (float(v) for v in box)
+        if self._box_format == "xyxy":
+            return [x0, y0, a - x0, b - y0]
+        return [x0, y0, a, b]
+
+    @staticmethod
+    def _flatten_prediction(pose: dict[str, Any]) -> list[float]:
+        """Interleave predicted ``(x, y)`` and per-keypoint score to ``[x, y, s, ...]``.
+
+        Raises:
+            ValueError: If the number of keypoints and scores differ; a silent
+                truncation would otherwise hide a malformed model output.
+        """
+        keypoints = pose["keypoints"].cpu().numpy()
+        scores = pose["scores"].cpu().numpy()
+        flat: list[float] = []
+        for (x, y), score in zip(keypoints, scores, strict=True):
+            flat.extend([float(x), float(y), float(score)])
+        return flat
+
+    @staticmethod
+    def _person_score(pose: dict[str, Any]) -> float:
+        """Overall person confidence: mean of per-keypoint scores (0.0 if none)."""
+        scores = pose["scores"].cpu().numpy()
+        return float(scores.mean()) if scores.size else 0.0
diff --git a/src/winml/modelkit/eval/metrics/__init__.py b/src/winml/modelkit/eval/metrics/__init__.py
index 2695c3d84..2adfaf237 100644
--- a/src/winml/modelkit/eval/metrics/__init__.py
+++ b/src/winml/modelkit/eval/metrics/__init__.py
@@ -14,6 +14,7 @@
if TYPE_CHECKING:
from .classification import ClassificationMetric
from .depth import DepthMetric
+ from .keypoint import KeypointAPMetric
from .knn_accuracy import KNNAccuracyMetric
from .mean_average_precision import MAPMetric
from .mean_iou import IGNORE_INDEX, MeanIoUMetric
@@ -28,6 +29,7 @@
_LAZY_ATTRS: dict[str, str] = {
"ClassificationMetric": ".classification:ClassificationMetric",
"DepthMetric": ".depth:DepthMetric",
+ "KeypointAPMetric": ".keypoint:KeypointAPMetric",
"IGNORE_INDEX": ".mean_iou:IGNORE_INDEX",
"KNNAccuracyMetric": ".knn_accuracy:KNNAccuracyMetric",
"MAPMetric": ".mean_average_precision:MAPMetric",
@@ -59,6 +61,7 @@ def __dir__() -> list[str]:
"ClassificationMetric",
"DepthMetric",
"KNNAccuracyMetric",
+ "KeypointAPMetric",
"MAPMetric",
"MeanIoUMetric",
"PseudoPerplexityMetric",
diff --git a/src/winml/modelkit/eval/metrics/keypoint.py b/src/winml/modelkit/eval/metrics/keypoint.py
new file mode 100644
index 000000000..a345e5e39
--- /dev/null
+++ b/src/winml/modelkit/eval/metrics/keypoint.py
@@ -0,0 +1,209 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+"""COCO keypoint detection metric: OKS-based Average Precision.
+
+Computes the official COCO keypoint score — Average Precision averaged over
+Object Keypoint Similarity (OKS) thresholds 0.50:0.95 — via ``pycocotools``
+``COCOeval(iouType="keypoints")``. This mirrors how the object-detection
+evaluator reuses the COCO mAP protocol, but for pose keypoints.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+# Standard COCO 17-keypoint OKS per-keypoint constants (pycocotools default).
+# Exposed so non-COCO keypoint layouts can override them.
+# Entry ``i`` is the constant for keypoint ``i`` of COCO_KEYPOINT_NAMES, so the
+# two tuples must stay the same length and in the same order.
+COCO_KEYPOINT_SIGMAS: tuple[float, ...] = (
+    0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072,
+    0.062, 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089,
+)
+
+# COCO person keypoint names (order matters; index == keypoint id).
+# Used only for the ``keypoints`` field of the category definition built by
+# ``KeypointAPMetric.compute``.
+COCO_KEYPOINT_NAMES: tuple[str, ...] = (
+    "nose", "left_eye", "right_eye", "left_ear", "right_ear",
+    "left_shoulder", "right_shoulder", "left_elbow", "right_elbow",
+    "left_wrist", "right_wrist", "left_hip", "right_hip",
+    "left_knee", "right_knee", "left_ankle", "right_ankle",
+)
+
+
+class KeypointAPMetric:
+    """COCO-standard keypoint AP (OKS) wrapping ``pycocotools`` ``COCOeval``.
+
+    Accepts per-instance predictions and ground truth as plain Python dicts
+    keyed by ``image_id`` and builds the COCO JSON structures internally. One
+    instance is one person (top-down pose estimation produces one keypoint set
+    per person box).
+    """
+
+    def compute(
+        self,
+        predictions: list[dict[str, Any]],
+        references: list[dict[str, Any]],
+        sigmas: tuple[float, ...] = COCO_KEYPOINT_SIGMAS,
+        keypoint_names: tuple[str, ...] = COCO_KEYPOINT_NAMES,
+    ) -> dict[str, float]:
+        """Compute COCO keypoint AP/AR.
+
+        Args:
+            predictions: Per-person predictions. Each dict has:
+                - ``image_id``: int grouping key
+                - ``keypoints``: flat list ``[x1, y1, s1, ...]`` of length
+                  ``3 * num_keypoints`` (``s`` is the per-keypoint score)
+                - ``score``: overall person confidence (float)
+            references: Per-person ground truth. Each dict has:
+                - ``image_id``: int grouping key
+                - ``keypoints``: flat list ``[x1, y1, v1, ...]`` (``v`` is the
+                  COCO visibility flag 0/1/2)
+                - ``bbox``: ``[x, y, w, h]`` person box
+                - ``area``: person area used by the OKS normalization
+                - ``num_keypoints``: number of labeled keypoints (optional;
+                  derived from visibility flags when absent)
+            sigmas: Per-keypoint OKS constants. Defaults to the COCO 17.
+            keypoint_names: Keypoint names for the category definition.
+
+        Returns:
+            Dict with ``map``, ``map_50``, ``map_75``, ``map_medium``,
+            ``map_large``, ``mar``, ``mar_50``, ``mar_75``, plus
+            ``num_predictions``, ``num_ground_truths`` and ``num_images``.
+            Keys mirror the object-detection ``MAPMetric`` so downstream
+            reporting treats both COCO metrics the same way. The AP/AR values
+            are passed through from ``COCOeval.stats`` unchanged. All AP/AR
+            values are ``0.0`` when there are no predictions or no ground
+            truth.
+
+        Raises:
+            ValueError: If any keypoint list is not made of complete triplets
+                or does not hold ``len(sigmas)`` keypoints.
+        """
+        self._validate_keypoint_counts(predictions, references, len(sigmas))
+
+        image_ids = sorted(
+            {int(r["image_id"]) for r in references} | {int(p["image_id"]) for p in predictions}
+        )
+        annotations = self._build_gt_annotations(references)
+        detections = [
+            {
+                "image_id": int(p["image_id"]),
+                "category_id": 1,
+                "keypoints": [float(v) for v in p["keypoints"]],
+                "score": float(p["score"]),
+            }
+            for p in predictions
+        ]
+
+        # Nothing to score: answer before importing or indexing with pycocotools.
+        if not detections or not annotations:
+            return self._empty_result(predictions, references, image_ids)
+
+        import contextlib
+        import io
+
+        import numpy as np
+        from pycocotools.coco import COCO
+        from pycocotools.cocoeval import COCOeval
+
+        gt_dict = {
+            "images": [{"id": image_id} for image_id in image_ids],
+            "annotations": annotations,
+            "categories": [
+                {"id": 1, "name": "person", "keypoints": list(keypoint_names), "skeleton": []}
+            ],
+        }
+
+        coco_gt = COCO()
+        coco_gt.dataset = gt_dict
+        # pycocotools writes progress to stdout; keep eval output quiet.
+        with contextlib.redirect_stdout(io.StringIO()):
+            coco_gt.createIndex()
+            coco_dt = coco_gt.loadRes(detections)
+            coco_eval = COCOeval(coco_gt, coco_dt, iouType="keypoints")
+            # Override the built-in sigmas so non-COCO layouts are scored correctly.
+            coco_eval.params.kpt_oks_sigmas = np.array(sigmas, dtype=np.float64)
+            coco_eval.evaluate()
+            coco_eval.accumulate()
+            coco_eval.summarize()
+
+        stats = coco_eval.stats
+        return {
+            "map": float(stats[0]),
+            "map_50": float(stats[1]),
+            "map_75": float(stats[2]),
+            "map_medium": float(stats[3]),
+            "map_large": float(stats[4]),
+            "mar": float(stats[5]),
+            "mar_50": float(stats[6]),
+            "mar_75": float(stats[7]),
+            "num_predictions": len(detections),
+            "num_ground_truths": len(annotations),
+            "num_images": len(image_ids),
+        }
+
+    @staticmethod
+    def _validate_keypoint_counts(
+        predictions: list[dict[str, Any]],
+        references: list[dict[str, Any]],
+        num_sigmas: int,
+    ) -> None:
+        """Ensure predictions, references and sigmas describe the same layout.
+
+        OKS is only defined when the model's keypoints match the ground-truth
+        keypoint set. A model with a different layout (e.g. SynthPose's 52
+        anatomical markers vs COCO's 17) cannot be scored against COCO ground
+        truth, so fail early with an actionable message instead of a numpy
+        broadcast error inside pycocotools.
+
+        Raises:
+            ValueError: If a keypoint list is not a whole number of
+                ``[x, y, v]`` triplets, or its keypoint count differs from
+                ``num_sigmas``.
+        """
+        for kind, items in (("prediction", predictions), ("reference", references)):
+            for item in items:
+                length = len(item["keypoints"])
+                count, remainder = divmod(length, 3)
+                if remainder:
+                    # Floor division alone would accept e.g. 52 values as 17 keypoints.
+                    raise ValueError(
+                        f"Malformed {kind} keypoints: expected a flat list of "
+                        f"[x, y, v] triplets but got {length} values, which is "
+                        f"not a multiple of 3."
+                    )
+                if count != num_sigmas:
+                    raise ValueError(
+                        f"Keypoint count mismatch: {kind} has {count} keypoints but the "
+                        f"metric expects {num_sigmas} (from sigmas). The model's keypoint "
+                        f"layout must match the dataset and sigmas. For a non-COCO layout "
+                        f"(e.g. SynthPose's 52 markers), pass matching sigmas and "
+                        f"keypoint_names and use a dataset with the same keypoint definition."
+                    )
+
+    @staticmethod
+    def _build_gt_annotations(references: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        """Convert ground-truth instances to COCO annotation dicts.
+
+        Annotation ids are assigned sequentially starting at 1, every instance
+        is category 1 and is marked as non-crowd.
+        """
+        annotations = []
+        for i, ref in enumerate(references):
+            keypoints = [float(v) for v in ref["keypoints"]]
+            num_keypoints = ref.get("num_keypoints")
+            if num_keypoints is None:
+                # COCO visibility flag is every 3rd value; >0 means labeled.
+                num_keypoints = sum(1 for v in keypoints[2::3] if v > 0)
+            annotations.append(
+                {
+                    "id": i + 1,
+                    "image_id": int(ref["image_id"]),
+                    "category_id": 1,
+                    "keypoints": keypoints,
+                    "num_keypoints": int(num_keypoints),
+                    "bbox": [float(v) for v in ref["bbox"]],
+                    "area": float(ref["area"]),
+                    "iscrowd": 0,
+                }
+            )
+        return annotations
+
+    @staticmethod
+    def _empty_result(
+        predictions: list[dict[str, Any]],
+        references: list[dict[str, Any]],
+        image_ids: list[int],
+    ) -> dict[str, float]:
+        """Return zeroed metrics when there is nothing to score.
+
+        The count fields still report how many predictions, ground truths and
+        images were supplied.
+        """
+        return {
+            "map": 0.0,
+            "map_50": 0.0,
+            "map_75": 0.0,
+            "map_medium": 0.0,
+            "map_large": 0.0,
+            "mar": 0.0,
+            "mar_50": 0.0,
+            "mar_75": 0.0,
+            "num_predictions": len(predictions),
+            "num_ground_truths": len(references),
+            "num_images": len(image_ids),
+        }
diff --git a/src/winml/modelkit/utils/eval_utils.py b/src/winml/modelkit/utils/eval_utils.py
index 5a854cff9..27fdd66e1 100644
--- a/src/winml/modelkit/utils/eval_utils.py
+++ b/src/winml/modelkit/utils/eval_utils.py
@@ -287,6 +287,54 @@ class TaskSchema:
),
)
+# Schema for the ``keypoint-detection`` task: the dataset columns, plus the
+# keys looked up inside the per-sample annotation dict and the optional OKS
+# layout overrides.
+_KEYPOINT_DETECTION_SCHEMA = TaskSchema(
+    columns=(
+        SchemaItem(
+            "input_column", "input image (PIL.Image)",
+            default="image", remap_hint="",
+        ),
+        SchemaItem(
+            "annotation_column",
+            "annotation dict containing per-person keypoints + bbox + area",
+            default="objects", remap_hint="",
+        ),
+    ),
+    params=(
+        SchemaItem(
+            "keypoints_key",
+            "keypoints field inside the annotation dict "
+            "(flat [x, y, v] triplets per person)",
+            default="keypoints", remap_hint="",
+        ),
+        SchemaItem(
+            "bbox_key",
+            "person bbox field inside the annotation dict",
+            default="bbox", remap_hint="",
+        ),
+        SchemaItem(
+            "area_key",
+            "person area field inside the annotation dict",
+            default="area", remap_hint="",
+        ),
+        SchemaItem(
+            "box_format", "person bounding box layout",
+            default="xywh", remap_hint="",
+        ),
+        # NOTE(review): the defaults of the two items below are human-readable
+        # descriptions, not usable values. The keypoint evaluator reads both
+        # keys straight from ``columns_mapping`` and falls back to the metric's
+        # COCO constants - confirm nothing resolves these defaults
+        # programmatically.
+        SchemaItem(
+            "sigmas",
+            "per-keypoint OKS sigmas as comma-separated floats; "
+            "defaults to the COCO 17-keypoint constants",
+            default="COCO 17 sigmas", remap_hint="",
+        ),
+        SchemaItem(
+            "keypoint_names",
+            "keypoint names in index order as comma-separated strings; "
+            "defaults to the COCO 17 names",
+            default="COCO 17 names", remap_hint="",
+        ),
+    ),
+)
+
TASK_SCHEMAS: dict[str, TaskSchema] = {
"image-classification": _IMAGE_CLASSIFICATION_SCHEMA,
"text-classification": _TEXT_CLASSIFICATION_SCHEMA,
@@ -304,6 +352,7 @@ class TaskSchema:
"zero-shot-classification": _ZERO_SHOT_CLASSIFICATION_SCHEMA,
"zero-shot-image-classification": _ZERO_SHOT_IMAGE_CLASSIFICATION_SCHEMA,
"depth-estimation": _DEPTH_ESTIMATION_SCHEMA,
+ "keypoint-detection": _KEYPOINT_DETECTION_SCHEMA,
}
diff --git a/tests/unit/eval/test_keypoint_detection_evaluator.py b/tests/unit/eval/test_keypoint_detection_evaluator.py
new file mode 100644
index 000000000..9cfbda7aa
--- /dev/null
+++ b/tests/unit/eval/test_keypoint_detection_evaluator.py
@@ -0,0 +1,122 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+"""Unit tests for WinMLKeypointDetectionEvaluator.
+
+The end-to-end pose pipeline is covered by integration runs; these tests
+pin the box-format handling, prediction flattening, and the ``compute()``
+loop wiring with a mocked image processor and model.
+"""
+
+from __future__ import annotations
+
+import pytest
+import torch
+
+from winml.modelkit.eval import WinMLKeypointDetectionEvaluator
+
+
+def _make_evaluator(box_format: str = "xywh") -> WinMLKeypointDetectionEvaluator:
+ """Create an evaluator instance without triggering data/model loading."""
+ ev = object.__new__(WinMLKeypointDetectionEvaluator)
+ ev._image_col = "image"
+ ev._annotation_col = "objects"
+ ev._keypoints_key = "keypoints"
+ ev._bbox_key = "bbox"
+ ev._area_key = "area"
+ ev._box_format = box_format
+ ev._sigmas = None
+ ev._keypoint_names = None
+ return ev
+
+
+class TestBoxFormat:
+ def test_xywh_passthrough(self):
+ ev = _make_evaluator("xywh")
+ assert ev._to_xywh([10.0, 20.0, 30.0, 40.0]) == [10.0, 20.0, 30.0, 40.0]
+
+ def test_xyxy_converted_to_xywh(self):
+ ev = _make_evaluator("xyxy")
+ assert ev._to_xywh([10.0, 20.0, 40.0, 60.0]) == [10.0, 20.0, 30.0, 40.0]
+
+
+class TestPredictionFlattening:
+ def test_flatten_interleaves_xy_and_score(self):
+ pose = {
+ "keypoints": torch.tensor([[1.0, 2.0], [3.0, 4.0]]),
+ "scores": torch.tensor([0.5, 0.9]),
+ }
+ flat = WinMLKeypointDetectionEvaluator._flatten_prediction(pose)
+ assert flat == pytest.approx([1.0, 2.0, 0.5, 3.0, 4.0, 0.9])
+
+ def test_person_score_is_mean(self):
+ pose = {"scores": torch.tensor([0.4, 0.6, 0.8])}
+ assert WinMLKeypointDetectionEvaluator._person_score(pose) == pytest.approx(0.6)
+
+
+class _MockProcessor:
+ """Mock image processor returning fixed pixel values and poses."""
+
+ def __init__(self, num_keypoints: int = 17) -> None:
+ self._num_keypoints = num_keypoints
+
+ def preprocess(self, images, boxes, return_tensors="pt"):
+ num_persons = len(boxes[0])
+ return {"pixel_values": torch.zeros(num_persons, 3, 256, 192)}
+
+ def post_process_pose_estimation(self, outputs, boxes):
+ num_persons = outputs.heatmaps.shape[0]
+ poses = [
+ {
+ "keypoints": torch.ones(self._num_keypoints, 2),
+ "scores": torch.full((self._num_keypoints,), 0.8),
+ }
+ for _ in range(num_persons)
+ ]
+ return [poses]
+
+
+class _MockModel:
+ """Mock model returning one zero heatmap per input batch row."""
+
+ def __init__(self, num_keypoints: int = 17) -> None:
+ self._num_keypoints = num_keypoints
+
+ def __call__(self, pixel_values):
+ batch = pixel_values.shape[0]
+ return {"heatmaps": torch.zeros(batch, self._num_keypoints, 64, 48)}
+
+
+class TestComputeLoop:
+ def test_compute_returns_ap_metrics(self):
+ ev = _make_evaluator("xywh")
+ ev.pipe = _MockProcessor()
+ ev.model = _MockModel()
+ # Two images: one with 2 persons, one with 1.
+ ev.data = [
+ {
+ "image": object(),
+ "objects": {
+ "keypoints": [[1.0, 1.0, 2.0] * 17, [2.0, 2.0, 2.0] * 17],
+ "bbox": [[0.0, 0.0, 50.0, 80.0], [10.0, 10.0, 40.0, 70.0]],
+ "area": [4000.0, 2800.0],
+ },
+ },
+ {
+ "image": object(),
+ "objects": {
+ "keypoints": [[3.0, 3.0, 2.0] * 17],
+ "bbox": [[5.0, 5.0, 30.0, 60.0]],
+ "area": [1800.0],
+ },
+ },
+ ]
+
+ result = ev.compute()
+
+ assert "map" in result
+ assert result["num_images"] == 2
+ assert result["num_predictions"] == 3
+ assert result["num_ground_truths"] == 3
diff --git a/tests/unit/eval/test_keypoint_metric.py b/tests/unit/eval/test_keypoint_metric.py
new file mode 100644
index 000000000..28b2fe623
--- /dev/null
+++ b/tests/unit/eval/test_keypoint_metric.py
@@ -0,0 +1,176 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+"""Tests for KeypointAPMetric (COCO OKS-based AP)."""
+
+from __future__ import annotations
+
+import pytest
+
+from winml.modelkit.eval import KeypointAPMetric
+
+
+def _coco_person_keypoints(cx: float, cy: float) -> list[float]:
+ """Build a plausible 17-keypoint COCO layout around a center, all visible."""
+ offsets = [
+ (0, -40), (-5, -45), (5, -45), (-10, -42), (10, -42),
+ (-20, -20), (20, -20), (-25, 0), (25, 0),
+ (-28, 20), (28, 20), (-15, 25), (15, 25),
+ (-15, 55), (15, 55), (-15, 85), (15, 85),
+ ]
+ flat: list[float] = []
+ for dx, dy in offsets:
+ flat.extend([cx + dx, cy + dy, 2.0]) # visibility 2 = labeled + visible
+ return flat
+
+
+class TestKeypointAPMetricPerfectMatch:
+ """Predictions identical to ground truth should score AP ~= 1.0."""
+
+ def test_single_person_perfect_match(self) -> None:
+ kpts = _coco_person_keypoints(100.0, 100.0)
+ pred_kpts = [v if (i % 3) != 2 else 1.0 for i, v in enumerate(kpts)]
+
+ metric = KeypointAPMetric()
+ result = metric.compute(
+ predictions=[{"image_id": 1, "keypoints": pred_kpts, "score": 0.95}],
+ references=[
+ {
+ "image_id": 1,
+ "keypoints": kpts,
+ "bbox": [60.0, 50.0, 80.0, 110.0],
+ "area": 80.0 * 110.0,
+ }
+ ],
+ )
+
+ assert result["map"] == pytest.approx(1.0, abs=0.01)
+ assert result["map_50"] == pytest.approx(1.0, abs=0.01)
+ assert result["num_predictions"] == 1
+ assert result["num_ground_truths"] == 1
+ assert result["num_images"] == 1
+
+ def test_two_people_two_images_perfect_match(self) -> None:
+ refs = []
+ preds = []
+ for img_id, (cx, cy) in enumerate([(100.0, 100.0), (300.0, 200.0)], start=1):
+ kpts = _coco_person_keypoints(cx, cy)
+ pred_kpts = [v if (i % 3) != 2 else 1.0 for i, v in enumerate(kpts)]
+ refs.append(
+ {
+ "image_id": img_id,
+ "keypoints": kpts,
+ "bbox": [cx - 40, cy - 50, 80.0, 110.0],
+ "area": 80.0 * 110.0,
+ }
+ )
+ preds.append({"image_id": img_id, "keypoints": pred_kpts, "score": 0.9})
+
+ result = KeypointAPMetric().compute(predictions=preds, references=refs)
+
+ assert result["map"] == pytest.approx(1.0, abs=0.01)
+ assert result["num_images"] == 2
+
+
+class TestKeypointAPMetricImperfect:
+ """Large-offset and empty-prediction behavior."""
+
+ def test_large_offset_lowers_ap(self) -> None:
+ kpts = _coco_person_keypoints(100.0, 100.0)
+ # Shift every predicted keypoint far from GT -> low OKS -> low AP.
+ pred_kpts: list[float] = []
+ for i, v in enumerate(kpts):
+ if i % 3 == 0 or i % 3 == 1:
+ pred_kpts.append(v + 60.0)
+ else:
+ pred_kpts.append(1.0)
+
+ result = KeypointAPMetric().compute(
+ predictions=[{"image_id": 1, "keypoints": pred_kpts, "score": 0.9}],
+ references=[
+ {
+ "image_id": 1,
+ "keypoints": kpts,
+ "bbox": [60.0, 50.0, 80.0, 110.0],
+ "area": 80.0 * 110.0,
+ }
+ ],
+ )
+
+ assert result["map"] < 0.5
+
+ def test_no_predictions_returns_zero(self) -> None:
+ kpts = _coco_person_keypoints(100.0, 100.0)
+ result = KeypointAPMetric().compute(
+ predictions=[],
+ references=[
+ {
+ "image_id": 1,
+ "keypoints": kpts,
+ "bbox": [60.0, 50.0, 80.0, 110.0],
+ "area": 80.0 * 110.0,
+ }
+ ],
+ )
+
+ assert result["map"] == 0.0
+ assert result["num_predictions"] == 0
+ assert result["num_ground_truths"] == 1
+
+
+class TestKeypointAPMetricMismatch:
+ """Mismatched prediction/ground-truth keypoint counts must raise ValueError."""
+
+ def test_mismatched_keypoint_count_raises(self):
+ # Model predicts 52 keypoints (e.g. SynthPose) against COCO-17 ground truth.
+ pred_kpts = [0.0, 0.0, 1.0] * 52
+ gt_kpts = _coco_person_keypoints(100.0, 100.0)
+
+ with pytest.raises(ValueError, match="Keypoint count mismatch"):
+ KeypointAPMetric().compute(
+ predictions=[{"image_id": 1, "keypoints": pred_kpts, "score": 0.9}],
+ references=[
+ {
+ "image_id": 1,
+ "keypoints": gt_kpts,
+ "bbox": [60.0, 50.0, 80.0, 110.0],
+ "area": 80.0 * 110.0,
+ }
+ ],
+ )
+
+
+class TestKeypointAPMetricCustomLayout:
+ """A non-COCO keypoint layout scores when matching sigmas are supplied."""
+
+ def test_custom_layout_scores_with_matching_sigmas(self) -> None:
+ # A 5-keypoint layout (not COCO's 17): perfect predictions should still
+ # score map ~= 1.0 once sigmas/keypoint_names describe that layout. This
+ # is what lets one evaluator handle non-COCO models (e.g. SynthPose).
+ sigmas = (0.05, 0.05, 0.05, 0.05, 0.05)
+ names = ("a", "b", "c", "d", "e")
+ offsets = [(0, 0), (10, 0), (0, 10), (-10, 0), (0, -10)]
+ gt_flat: list[float] = []
+ for dx, dy in offsets:
+ gt_flat.extend([100.0 + dx, 100.0 + dy, 2.0])
+ pred_flat = [v if (i % 3) != 2 else 1.0 for i, v in enumerate(gt_flat)]
+
+ result = KeypointAPMetric().compute(
+ predictions=[{"image_id": 1, "keypoints": pred_flat, "score": 0.9}],
+ references=[
+ {
+ "image_id": 1,
+ "keypoints": gt_flat,
+ "bbox": [80.0, 80.0, 40.0, 40.0],
+ "area": 1600.0,
+ }
+ ],
+ sigmas=sigmas,
+ keypoint_names=names,
+ )
+
+ assert result["map"] == pytest.approx(1.0, abs=0.01)
+ assert result["num_ground_truths"] == 1
+