diff --git a/pyproject.toml b/pyproject.toml
index 46fa96dc7..fec3f7d54 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -297,6 +297,8 @@ lint.per-file-ignores."examples/**" = [ "ANN", "D100", "D103", "S101", "T20" ]
 lint.per-file-ignores."scripts/e2e_eval/**" = [ "ANN", "D103", "E501", "PERF203", "T20" ]
 # Download scripts: Allow subprocess, print, missing docstrings
 lint.per-file-ignores."scripts/download_rules.py" = [ "D103", "E501", "PERF401", "S603", "S607", "SIM108", "T20" ]
+# COCO dataset build script: Allow print, missing docstrings, long lines, broad excepts
+lint.per-file-ignores."scripts/build_coco_keypoints.py" = [ "BLE001", "D103", "E501", "T20" ]
 # CLI: Allow print statements
 lint.per-file-ignores."src/winml/modelkit/cli.py" = [ "T20", "T201" ]
 lint.per-file-ignores."src/winml/modelkit/commands/**" = [ "T20", "T201" ]
diff --git a/scripts/build_coco_keypoints.py b/scripts/build_coco_keypoints.py
new file mode 100644
index 000000000..98c0e5114
--- /dev/null
+++ b/scripts/build_coco_keypoints.py
@@ -0,0 +1,165 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+"""Build a local COCO keypoints dataset for ``winml eval`` keypoint-detection.
+
+Downloads the COCO person-keypoints annotations (cached once) and a chosen
+number of validation images individually (so a small subset does not require
+the full ~780 MB image zip), then writes an Arrow dataset to disk via
+``datasets.Dataset.save_to_disk``. Point ``winml eval --dataset-path`` at the
+output directory.
+
+Each record has:
+    - ``image``: the RGB image (datasets ``Image`` feature)
+    - ``objects``: dict with parallel per-person lists ``keypoints`` (flat
+      ``[x, y, v]`` triplets), ``bbox`` (COCO ``[x, y, w, h]``) and ``area``.
+
+Only images containing at least one labeled-keypoint person are included.
+
+Usage:
+    uv run python scripts/build_coco_keypoints.py --output-dir ~/.cache/winml/datasets/coco_keypoints_val2017
+    uv run python scripts/build_coco_keypoints.py --output-dir <dir> --num-images 100
+    uv run python scripts/build_coco_keypoints.py --output-dir <dir> --num-images 0  # all images
+"""
+
+import argparse
+import io
+import json
+import random
+import shutil
+import sys
+import urllib.request
+import zipfile
+from pathlib import Path
+
+
+ANNOTATIONS_URL = "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
+ANNOTATION_MEMBER = "annotations/person_keypoints_val2017.json"
+IMAGE_URL_TEMPLATE = "http://images.cocodataset.org/val2017/{file_name}"
+
+DEFAULT_CACHE = Path.home() / ".cache" / "winml" / "coco_build"
+
+
+def _download(url: str, dest: Path, timeout: float = 60.0) -> None:
+    """Download ``url`` to ``dest`` unless ``dest`` already exists.
+
+    The payload is streamed into a sibling ``.part`` file and only renamed onto
+    ``dest`` once the transfer has finished. The existence check below is the
+    only cache validation, so an interrupted download must never leave a
+    truncated ``dest`` behind for a later run to mistake for a valid file.
+
+    Args:
+        url: Source URL, opened with ``urllib.request.urlopen``.
+        dest: Target path; missing parent directories are created.
+        timeout: Seconds a blocking network operation may take before
+            ``urlopen`` gives up, so a stalled connection cannot hang the build.
+
+    Raises:
+        OSError: If the URL cannot be opened or the file cannot be written
+            (``urllib.error.URLError`` is an ``OSError`` subclass).
+    """
+    if dest.exists():
+        return
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    print(f"Downloading {url}")
+    partial = dest.with_name(f"{dest.name}.part")
+    try:
+        with urllib.request.urlopen(url, timeout=timeout) as resp, partial.open("wb") as fh:  # noqa: S310
+            shutil.copyfileobj(resp, fh)
+        # Same-directory rename: ``dest`` only appears once it is complete.
+        partial.replace(dest)
+    finally:
+        # No-op after a successful rename; removes the fragment on failure.
+        partial.unlink(missing_ok=True)
+
+
+def _load_annotations(cache_dir: Path) -> dict:
+    """Load the COCO val2017 person-keypoints annotations as parsed JSON.
+
+    The annotations archive is fetched into ``cache_dir`` through ``_download``
+    (which returns immediately when the zip is already present); only the
+    keypoints member is read out of it.
+    """
+    archive_path = cache_dir / "annotations_trainval2017.zip"
+    _download(ANNOTATIONS_URL, archive_path)
+    print("Reading keypoint annotations...")
+    with zipfile.ZipFile(archive_path) as archive:
+        payload = archive.read(ANNOTATION_MEMBER)
+    return json.loads(payload)
+
+
+def _group_annotations_by_image(annotations: list[dict]) -> dict[int, list[dict]]:
+    """Bucket person annotations by ``image_id``.
+
+    Annotations without any labeled keypoint (``num_keypoints`` missing or
+    ``<= 0``) and crowd annotations (truthy ``iscrowd``) are left out, so an
+    image id only appears in the result when it has at least one usable person.
+    """
+    grouped: dict[int, list[dict]] = {}
+    for entry in annotations:
+        if entry.get("num_keypoints", 0) <= 0:
+            continue
+        if entry.get("iscrowd", 0):
+            continue
+        grouped.setdefault(entry["image_id"], []).append(entry)
+    return grouped
+
+
+def _fetch_image(file_name: str, timeout: float = 60.0) -> bytes:
+    """Download one validation image and return its raw bytes.
+
+    Args:
+        file_name: Image file name, substituted into ``IMAGE_URL_TEMPLATE``.
+        timeout: Seconds a blocking network operation may take before
+            ``urlopen`` gives up. Without it a single stalled connection would
+            hang the whole build instead of surfacing as a skippable error.
+
+    Raises:
+        OSError: If the image cannot be fetched (``urllib.error.URLError`` and
+            socket timeouts are ``OSError`` subclasses).
+    """
+    url = IMAGE_URL_TEMPLATE.format(file_name=file_name)
+    with urllib.request.urlopen(url, timeout=timeout) as resp:  # noqa: S310
+        return resp.read()
+
+
+def build(output_dir: Path, num_images: int, cache_dir: Path, seed: int = 42) -> None:
+    """Build and save the COCO keypoints dataset to ``output_dir``.
+
+    Images that fail to download or decode are skipped and counted instead of
+    aborting the build; the number of skipped images is reported at the end.
+
+    Args:
+        output_dir: Directory handed to ``Dataset.save_to_disk``; created if
+            missing.
+        num_images: Number of images to sample. Any value ``<= 0`` keeps every
+            image that has at least one labeled-keypoint person.
+        cache_dir: Directory where the annotations zip is cached.
+        seed: Seed for the subset shuffle (only used when ``num_images > 0``).
+
+    Raises:
+        RuntimeError: If not a single image could be collected, so there is
+            nothing meaningful to save.
+    """
+    from datasets import Dataset, Features, Image, Sequence, Value
+    from PIL import Image as PILImage
+
+    coco = _load_annotations(cache_dir)
+    images_by_id = {img["id"]: img for img in coco["images"]}
+    by_image = _group_annotations_by_image(coco["annotations"])
+
+    image_ids = sorted(by_image)
+    if num_images > 0:
+        # Shuffle before truncating so a small subset is a representative random
+        # sample of the validation set rather than the lowest image ids. Seeded
+        # so repeated builds produce the same subset.
+        random.Random(seed).shuffle(image_ids)
+        image_ids = image_ids[:num_images]
+    total = len(image_ids)
+    print(f"Building {total} images with keypoint annotations...")
+
+    records = []
+    skipped = 0
+    for idx, image_id in enumerate(image_ids, start=1):
+        info = images_by_id[image_id]
+        try:
+            raw = _fetch_image(info["file_name"])
+            image = PILImage.open(io.BytesIO(raw)).convert("RGB")
+        except Exception as exc:
+            # Best effort: one broken image must not abort the whole build.
+            skipped += 1
+            print(f"  skip {info['file_name']}: {exc}")
+        else:
+            persons = by_image[image_id]
+            records.append(
+                {
+                    "image": image,
+                    "objects": {
+                        "keypoints": [[float(v) for v in p["keypoints"]] for p in persons],
+                        "bbox": [[float(v) for v in p["bbox"]] for p in persons],
+                        "area": [float(p["area"]) for p in persons],
+                    },
+                }
+            )
+        if idx % 50 == 0:
+            print(f"  {idx}/{total}")
+
+    if not records:
+        # Fail with an actionable message instead of handing an empty record
+        # list to ``Dataset.from_list`` and saving a dataset nothing can score.
+        raise RuntimeError(
+            f"No images were collected ({skipped} of {total} downloads failed); "
+            f"refusing to write an empty dataset to {output_dir}."
+        )
+
+    features = Features(
+        {
+            "image": Image(),
+            "objects": {
+                "keypoints": Sequence(Sequence(Value("float32"))),
+                "bbox": Sequence(Sequence(Value("float32"))),
+                "area": Sequence(Value("float32")),
+            },
+        }
+    )
+    dataset = Dataset.from_list(records, features=features)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    dataset.save_to_disk(str(output_dir))
+    if skipped:
+        print(f"Skipped {skipped} of {total} images that could not be downloaded or decoded.")
+    print(f"Saved {len(dataset)} samples to {output_dir}")
+
+
+def main() -> int:
+    """Parse the command line and build the dataset.
+
+    Returns:
+        Process exit code: ``0`` once ``build`` has completed.
+    """
+    parser = argparse.ArgumentParser(description="Build a local COCO keypoints dataset.")
+    parser.add_argument("--output-dir", required=True, type=Path, help="Dataset output directory.")
+    parser.add_argument(
+        "--num-images",
+        type=int,
+        default=100,
+        help="Number of images to include (0 = all images with keypoints).",
+    )
+    parser.add_argument(
+        "--cache-dir",
+        type=Path,
+        default=DEFAULT_CACHE,
+        help="Where to cache the downloaded annotations zip.",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="Random seed for selecting the image subset (used when --num-images > 0).",
+    )
+    args = parser.parse_args()
+    # The documented usage passes ``~/.cache/...`` paths. Not every shell expands
+    # ``~`` (e.g. when the argument is quoted), so expand it here rather than
+    # creating a literal ``~`` directory under the current working directory.
+    output_dir = args.output_dir.expanduser()
+    cache_dir = args.cache_dir.expanduser()
+    build(output_dir, args.num_images, cache_dir, args.seed)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/e2e_eval/testsets/models_with_acc.json b/scripts/e2e_eval/testsets/models_with_acc.json
index 49e4b73e5..32bd965b5 100644
--- a/scripts/e2e_eval/testsets/models_with_acc.json
+++ b/scripts/e2e_eval/testsets/models_with_acc.json
@@ -1927,5 +1927,115 @@
         "depth_column": "depth_map"
       }
     }
+  },
+  {
+    "hf_id": "usyd-community/vitpose-base-simple",
+    "task": "keypoint-detection",
+    "model_type": "vitpose",
+    "group": "Top200",
+    "priority": "P3",
+    "dataset_config": {
+      "build_script": "scripts/build_coco_keypoints.py",
+      "path": "~/.cache/winml/datasets/coco_keypoints_val2017",
+      "split": "validation",
+      "metric": "map",
+      "winml_metric_key": "map",
+      "columns_mapping": {
+        "input_column": "image",
+        "annotation_column": "objects",
+        "keypoints_key": "keypoints",
+        "bbox_key": "bbox",
+        "area_key": "area",
+        "box_format": "xywh"
+      }
+    }
+  },
+  {
+    "hf_id": "usyd-community/vitpose-plus-small",
+    "task": "keypoint-detection",
+    "model_type": "vitpose",
+    "group": "Top200",
+    "priority": "P3",
+    "dataset_config": {
+      "build_script": "scripts/build_coco_keypoints.py",
+      "path": "~/.cache/winml/datasets/coco_keypoints_val2017",
+      "split": "validation",
+      "metric": "map",
+      "winml_metric_key": "map",
+      "columns_mapping": {
+        "input_column": "image",
+        "annotation_column": "objects",
+        "keypoints_key": "keypoints",
+        "bbox_key": "bbox",
+        "area_key": "area",
+        "box_format": "xywh"
+      }
+    }
+  },
+  {
+    "hf_id": "usyd-community/vitpose-plus-base",
+    "task": "keypoint-detection",
+    "model_type": "vitpose",
+    "group": "Top200",
+    "priority": "P3",
+    "dataset_config": {
+      "build_script": "scripts/build_coco_keypoints.py",
+      "path": "~/.cache/winml/datasets/coco_keypoints_val2017",
+      "split": "validation",
+      "metric": "map",
+      "winml_metric_key": "map",
+      "columns_mapping": {
+        "input_column": "image",
+        "annotation_column": "objects",
+        "keypoints_key": "keypoints",
+        "bbox_key": "bbox",
+        "area_key": "area",
+        "box_format": "xywh"
+      }
+    }
+  },
+  {
+    "hf_id": "usyd-community/vitpose-plus-large",
+    "task": "keypoint-detection",
+    "model_type": "vitpose",
+    "group": "Top200",
+    "priority": "P3",
+    "dataset_config": {
+      "build_script": "scripts/build_coco_keypoints.py",
+      "path": "~/.cache/winml/datasets/coco_keypoints_val2017",
+      "split": "validation",
+      "metric": "map",
+      "winml_metric_key": "map",
+      "columns_mapping": {
+        "input_column": "image",
+        "annotation_column": "objects",
+        "keypoints_key": "keypoints",
+        "bbox_key": "bbox",
+        "area_key": "area",
+        "box_format": "xywh"
+      }
+    }
+  },
+  {
+    "hf_id": "usyd-community/vitpose-plus-huge",
+    "task": "keypoint-detection",
+    "model_type": "vitpose",
+    "group": "Top200",
+    "priority": "P3",
+    "dataset_config": {
+      "build_script": "scripts/build_coco_keypoints.py",
+      "path": "~/.cache/winml/datasets/coco_keypoints_val2017",
+      "split": "validation",
+      "metric": "map",
+      "winml_metric_key": "map",
+      "columns_mapping": {
+        "input_column": "image",
+        "annotation_column": "objects",
+        "keypoints_key": "keypoints",
+        "bbox_key": "bbox",
+        "area_key": "area",
+        "box_format": "xywh"
+      }
+    }
   }
 ]
diff --git a/src/winml/modelkit/eval/__init__.py b/src/winml/modelkit/eval/__init__.py
index efb34f1fd..7aa18048f 100644
--- a/src/winml/modelkit/eval/__init__.py
+++ b/src/winml/modelkit/eval/__init__.py
@@ -25,8 +25,10 @@
     from .image_feature_extraction_evaluator import WinMLImageFeatureExtractionEvaluator
     from .image_segmentation_evaluator import WinMLImageSegmentationEvaluator
     from .image_to_text_evaluator import WinMLImageToTextEvaluator
+    from .keypoint_detection_evaluator import WinMLKeypointDetectionEvaluator
     from .metrics.classification import ClassificationMetric
     from .metrics.depth import DepthMetric
+    from .metrics.keypoint import KeypointAPMetric
     from .metrics.knn_accuracy import KNNAccuracyMetric
     from .metrics.mean_average_precision import MAPMetric
     from .metrics.mean_iou import IGNORE_INDEX, MeanIoUMetric
@@ -56,6 +58,8 @@
         ".image_segmentation_evaluator:WinMLImageSegmentationEvaluator",
     "WinMLImageToTextEvaluator":
         ".image_to_text_evaluator:WinMLImageToTextEvaluator",
+    "WinMLKeypointDetectionEvaluator":
+        ".keypoint_detection_evaluator:WinMLKeypointDetectionEvaluator",
     "WinMLObjectDetectionEvaluator":
         ".object_detection_evaluator:WinMLObjectDetectionEvaluator",
     "WinMLQuestionAnsweringEvaluator":
@@ -75,6 +79,8 @@
         ".metrics.classification:ClassificationMetric",
     "DepthMetric":
         ".metrics.depth:DepthMetric",
+    "KeypointAPMetric":
+        ".metrics.keypoint:KeypointAPMetric",
     "IGNORE_INDEX":
         ".metrics.mean_iou:IGNORE_INDEX",
     "KNNAccuracyMetric":
@@ -116,6 +122,7 @@ def __dir__() -> list[str]:
     "DepthMetric",
     "EvalResult",
     "KNNAccuracyMetric",
+    "KeypointAPMetric",
     "MAPMetric",
     "MeanIoUMetric",
     "PseudoPerplexityMetric",
@@ -130,6 +137,7 @@ def __dir__() -> list[str]:
     "WinMLImageFeatureExtractionEvaluator",
     "WinMLImageSegmentationEvaluator",
     "WinMLImageToTextEvaluator",
+    "WinMLKeypointDetectionEvaluator",
     "WinMLObjectDetectionEvaluator",
     "WinMLQuestionAnsweringEvaluator",
     "WinMLTextClassificationEvaluator",
diff --git a/src/winml/modelkit/eval/evaluate.py b/src/winml/modelkit/eval/evaluate.py
index c5633f1e3..99ef1be41 100644
--- a/src/winml/modelkit/eval/evaluate.py
+++ b/src/winml/modelkit/eval/evaluate.py
@@ -62,6 +62,8 @@
         "winml.modelkit.eval.zero_shot_image_classification_evaluator:WinMLZeroShotImageClassificationEvaluator",
     "depth-estimation":
         "winml.modelkit.eval.depth_estimation_evaluator:WinMLDepthEstimationEvaluator",
+    "keypoint-detection":
+        "winml.modelkit.eval.keypoint_detection_evaluator:WinMLKeypointDetectionEvaluator",
     "compare-tensor":
         "winml.modelkit.eval.tensor_similarity_evaluator:TensorSimilarityEvaluator",
 }
@@ -172,6 +174,21 @@ def get_evaluator_class(config: WinMLEvaluationConfig) -> type[WinMLEvaluator]:
         # the legacy `nyu_depth_v2.py` loader script.
         "revision": "refs/convert/parquet",
     },
+    "keypoint-detection": {
+        # Built locally by scripts/build_coco_keypoints.py (COCO has no
+        # script-free HF mirror for person keypoints). Run that script first,
+        # or pass --dataset-path to point at your own build.
+        "path": "~/.cache/winml/datasets/coco_keypoints_val2017",
+        "split": "validation",
+        "columns_mapping": {
+            "input_column": "image",
+            "annotation_column": "objects",
+            "keypoints_key": "keypoints",
+            "bbox_key": "bbox",
+            "area_key": "area",
+            "box_format": "xywh",
+        },
+    },
 }
 
 
diff --git a/src/winml/modelkit/eval/keypoint_detection_evaluator.py b/src/winml/modelkit/eval/keypoint_detection_evaluator.py
new file mode 100644
index 000000000..0cbc634bf
--- /dev/null
+++ b/src/winml/modelkit/eval/keypoint_detection_evaluator.py
@@ -0,0 +1,222 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+"""Keypoint detection (human pose) evaluator using COCO OKS-based AP.
+
+ViTPose is top-down: it predicts keypoints inside a given person box, and
+transformers exposes no ``keypoint-detection`` pipeline. So this evaluator
+drives the image processor and ONNX model directly — for each ground-truth
+person box it runs ``processor.preprocess -> model -> post_process_pose_estimation``
+— and scores the predictions against ground truth with ``KeypointAPMetric``.
+
+Using ground-truth person boxes isolates pose accuracy from detection quality,
+which is the standard COCO top-down evaluation protocol.
+"""
+
+from __future__ import annotations
+
+import logging
+from types import SimpleNamespace
+from typing import TYPE_CHECKING, Any
+
+from .base_evaluator import WinMLEvaluator
+
+
+if TYPE_CHECKING:
+    from transformers.image_processing_utils import BaseImageProcessor
+
+    from ..models.winml.base import WinMLPreTrainedModel
+    from .config import WinMLEvaluationConfig
+
+logger = logging.getLogger(__name__)
+
+
+class WinMLKeypointDetectionEvaluator(WinMLEvaluator):
+    """Evaluator for keypoint detection using COCO OKS-based AP.
+
+    For each sample the ground-truth person boxes are handed to the image
+    processor, every person crop is run through the model, and the decoded
+    poses are scored against the ground-truth keypoints by ``KeypointAPMetric``.
+    Dataset field names and the box layout come from
+    ``config.dataset.columns_mapping``, falling back to the task defaults.
+    """
+
+    # Box layouts that ``_to_xywh`` knows how to normalize.
+    _SUPPORTED_BOX_FORMATS = ("xywh", "xyxy")
+
+    def __init__(
+        self,
+        config: WinMLEvaluationConfig,
+        model: WinMLPreTrainedModel,
+    ) -> None:
+        """Resolve the dataset column mapping, then initialize the base evaluator.
+
+        Raises:
+            ValueError: If a required mapping entry resolves to ``None`` or
+                ``box_format`` is not one of the supported layouts.
+        """
+        from ..utils.eval_utils import get_default
+
+        mapping = config.dataset.columns_mapping
+        task = "keypoint-detection"
+
+        def required(name: str) -> str:
+            # Explicit raise instead of ``assert`` so the check survives ``python -O``.
+            value = mapping.get(name, get_default(task, name))
+            if value is None:
+                raise ValueError(f"{name} has no default for keypoint-detection")
+            return value
+
+        self._image_col = mapping.get("input_column", get_default(task, "input_column"))
+        self._annotation_col: str = required("annotation_column")
+        self._keypoints_key: str = required("keypoints_key")
+        self._bbox_key: str = required("bbox_key")
+        self._area_key: str = required("area_key")
+        self._box_format: str = required("box_format")
+        if self._box_format not in self._SUPPORTED_BOX_FORMATS:
+            # Anything other than "xyxy" used to be scored silently as xywh.
+            raise ValueError(
+                f"Unsupported box_format {self._box_format!r}; "
+                f"expected one of {self._SUPPORTED_BOX_FORMATS}."
+            )
+
+        # Optional non-COCO keypoint layout: a model with a different keypoint
+        # set (e.g. SynthPose's 52 anatomical markers) can be scored by this
+        # same evaluator by supplying matching OKS sigmas and keypoint names
+        # through the dataset config. Absent -> the metric's COCO 17 defaults.
+        raw_sigmas = mapping.get("sigmas")
+        raw_names = mapping.get("keypoint_names")
+        self._sigmas: tuple[float, ...] | None = (
+            tuple(float(s) for s in self._as_list(raw_sigmas)) if raw_sigmas else None
+        )
+        self._keypoint_names: tuple[str, ...] | None = (
+            tuple(str(n) for n in self._as_list(raw_names)) if raw_names else None
+        )
+
+        # The mapping attributes are set before the base initializer runs, as in
+        # the original ordering (presumably the base class may use them during
+        # setup; verify against WinMLEvaluator).
+        super().__init__(config, model)
+
+    def prepare_pipeline(self) -> BaseImageProcessor:
+        """Load the image processor (no HF pipeline exists for this task).
+
+        The processor size is forced to the exported ONNX input shape so the
+        preprocessed crops match the static model input. The override only
+        happens when the model exposes an ``io_config`` whose first input shape
+        has four dimensions, read here as ``(_, _, height, width)``.
+        """
+        from transformers import AutoImageProcessor
+
+        processor = AutoImageProcessor.from_pretrained(self.config.model_id)
+
+        io_config = getattr(self.model, "io_config", None) or {}
+        input_shapes = io_config.get("input_shapes", [])
+        if input_shapes and len(input_shapes[0]) == 4:
+            _, _, h, w = input_shapes[0]
+            processor.size = {"height": h, "width": w}  # type: ignore[attr-defined]
+
+        return processor
+
+    def compute(self) -> dict[str, Any]:
+        """Run keypoint evaluation over all samples and return COCO AP/AR.
+
+        Samples without an image, without annotations, or without any person
+        box are skipped and reported in a single warning. The sample's position
+        in the dataset is used as its ``image_id``.
+
+        Returns:
+            The result dict produced by ``KeypointAPMetric.compute``.
+        """
+        from tqdm import tqdm
+
+        from .metrics import KeypointAPMetric
+
+        processor = self.pipe
+        predictions: list[dict[str, Any]] = []
+        references: list[dict[str, Any]] = []
+        skipped = 0
+
+        for image_id, sample in enumerate(tqdm(self.data, desc="Evaluating keypoints")):
+            image = sample.get(self._image_col)
+            annotation = sample.get(self._annotation_col)
+            if image is None or not annotation:
+                skipped += 1
+                continue
+
+            boxes = [self._to_xywh(b) for b in annotation[self._bbox_key]]
+            gt_keypoints = annotation[self._keypoints_key]
+            areas = annotation[self._area_key]
+            if not boxes:
+                skipped += 1
+                continue
+
+            pose_results = self._predict_poses(processor, image, boxes)
+            if len(pose_results) != len(boxes):
+                logger.warning(
+                    "Sample %d: got %d poses for %d person boxes.",
+                    image_id,
+                    len(pose_results),
+                    len(boxes),
+                )
+
+            # Every ground-truth person is a reference, whether or not a pose
+            # came back for it: the metric matches by OKS, not by index, and a
+            # dropped ground truth would hide a miss.
+            for person_idx, box in enumerate(boxes):
+                references.append(
+                    {
+                        "image_id": image_id,
+                        "keypoints": list(gt_keypoints[person_idx]),
+                        "bbox": box,
+                        "area": float(areas[person_idx]),
+                    }
+                )
+            for pose in pose_results:
+                predictions.append(
+                    {
+                        "image_id": image_id,
+                        "keypoints": self._flatten_prediction(pose),
+                        "score": self._person_score(pose),
+                    }
+                )
+
+        if skipped:
+            logger.warning("Skipped %d samples with missing image or annotations.", skipped)
+
+        # Only forward overrides that were configured so the metric keeps its
+        # own COCO defaults otherwise.
+        metric_kwargs: dict[str, Any] = {}
+        if self._sigmas is not None:
+            metric_kwargs["sigmas"] = self._sigmas
+        if self._keypoint_names is not None:
+            metric_kwargs["keypoint_names"] = self._keypoint_names
+        return KeypointAPMetric().compute(
+            predictions=predictions, references=references, **metric_kwargs
+        )
+
+    def _predict_poses(
+        self,
+        processor: BaseImageProcessor,
+        image: Any,
+        boxes: list[list[float]],
+    ) -> list[dict[str, Any]]:
+        """Run preprocess -> model -> post_process for one image's person boxes.
+
+        ViTPose is exported with a static batch size of 1, so each person crop
+        is run separately and the resulting heatmaps are stacked back into one
+        ``(num_persons, ...)`` batch for post-processing.
+
+        Args:
+            processor: Image processor returned by ``prepare_pipeline``.
+            image: The sample image.
+            boxes: Person boxes for this image in ``[x, y, w, h]`` layout; must
+                be non-empty (``compute`` skips samples without boxes).
+
+        Returns:
+            The processor's pose results for this single image.
+        """
+        import torch
+
+        inputs = processor.preprocess(images=image, boxes=[boxes], return_tensors="pt")
+        pixel_values = inputs["pixel_values"]
+
+        heatmaps = []
+        for i in range(pixel_values.shape[0]):
+            outputs = self.model(pixel_values=pixel_values[i : i + 1])
+            # ``as_tensor`` is a no-op for tensors and keeps ``torch.cat`` working
+            # should the runtime hand back array-like outputs instead.
+            heatmaps.append(torch.as_tensor(self._extract_heatmaps(outputs)))
+
+        wrapped = SimpleNamespace(heatmaps=torch.cat(heatmaps, dim=0))
+        # post_process returns one list per image; we pass a single image.
+        return processor.post_process_pose_estimation(wrapped, boxes=[boxes])[0]
+
+    @staticmethod
+    def _extract_heatmaps(outputs: Any) -> Any:
+        """Pull the heatmap tensor from the model output.
+
+        Falls back to the first output when the name differs, so the evaluator
+        does not depend on a specific ONNX output tensor name. Non-dict outputs
+        are expected to expose a ``heatmaps`` attribute.
+        """
+        if not isinstance(outputs, dict):
+            return outputs.heatmaps
+        heatmaps = outputs.get("heatmaps")
+        if heatmaps is None:
+            heatmaps = next(iter(outputs.values()))
+        return heatmaps
+
+    @staticmethod
+    def _as_list(value: Any) -> list[Any]:
+        """Coerce a comma-separated string or an existing sequence into a list.
+
+        String items are stripped and empty items are dropped.
+        """
+        if isinstance(value, str):
+            return [item.strip() for item in value.split(",") if item.strip()]
+        return list(value)
+
+    def _to_xywh(self, box: Any) -> list[float]:
+        """Normalize a person box to COCO ``[x, y, w, h]``.
+
+        With ``box_format == "xyxy"`` the last two values are treated as the
+        bottom-right corner and converted to width/height; otherwise the box is
+        returned as floats unchanged.
+        """
+        x0, y0, a, b = (float(v) for v in box)
+        if self._box_format == "xyxy":
+            return [x0, y0, a - x0, b - y0]
+        return [x0, y0, a, b]
+
+    @staticmethod
+    def _flatten_prediction(pose: dict[str, Any]) -> list[float]:
+        """Interleave predicted ``(x, y)`` and per-keypoint score to ``[x, y, s, ...]``."""
+        keypoints = pose["keypoints"].cpu().numpy()
+        # Flatten so a trailing singleton axis on the scores, if the processor
+        # emits one, does not produce 1-element arrays instead of scalars.
+        scores = pose["scores"].cpu().numpy().reshape(-1)
+        flat: list[float] = []
+        for (x, y), score in zip(keypoints, scores, strict=False):
+            flat.extend([float(x), float(y), float(score)])
+        return flat
+
+    @staticmethod
+    def _person_score(pose: dict[str, Any]) -> float:
+        """Overall person confidence: mean of per-keypoint scores (0.0 if none)."""
+        scores = pose["scores"].cpu().numpy()
+        return float(scores.mean()) if scores.size else 0.0
diff --git a/src/winml/modelkit/eval/metrics/__init__.py b/src/winml/modelkit/eval/metrics/__init__.py
index 2695c3d84..2adfaf237 100644
--- a/src/winml/modelkit/eval/metrics/__init__.py
+++ b/src/winml/modelkit/eval/metrics/__init__.py
@@ -14,6 +14,7 @@
 if TYPE_CHECKING:
     from .classification import ClassificationMetric
     from .depth import DepthMetric
+    from .keypoint import KeypointAPMetric
     from .knn_accuracy import KNNAccuracyMetric
     from .mean_average_precision import MAPMetric
     from .mean_iou import IGNORE_INDEX, MeanIoUMetric
@@ -28,6 +29,7 @@
 _LAZY_ATTRS: dict[str, str] = {
     "ClassificationMetric": ".classification:ClassificationMetric",
     "DepthMetric": ".depth:DepthMetric",
+    "KeypointAPMetric": ".keypoint:KeypointAPMetric",
     "IGNORE_INDEX": ".mean_iou:IGNORE_INDEX",
     "KNNAccuracyMetric": ".knn_accuracy:KNNAccuracyMetric",
     "MAPMetric": ".mean_average_precision:MAPMetric",
@@ -59,6 +61,7 @@ def __dir__() -> list[str]:
     "ClassificationMetric",
     "DepthMetric",
     "KNNAccuracyMetric",
+    "KeypointAPMetric",
     "MAPMetric",
     "MeanIoUMetric",
     "PseudoPerplexityMetric",
diff --git a/src/winml/modelkit/eval/metrics/keypoint.py b/src/winml/modelkit/eval/metrics/keypoint.py
new file mode 100644
index 000000000..a345e5e39
--- /dev/null
+++ b/src/winml/modelkit/eval/metrics/keypoint.py
@@ -0,0 +1,209 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+"""COCO keypoint detection metric: OKS-based Average Precision.
+
+Computes the official COCO keypoint score — Average Precision averaged over
+Object Keypoint Similarity (OKS) thresholds 0.50:0.95 — via ``pycocotools``
+``COCOeval(iouType="keypoints")``. This mirrors how the object-detection
+evaluator reuses the COCO mAP protocol, but for pose keypoints.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+# Standard COCO 17-keypoint OKS per-keypoint constants (pycocotools default).
+# Exposed so non-COCO keypoint layouts can override them.
+COCO_KEYPOINT_SIGMAS: tuple[float, ...] = (
+    0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072,
+    0.062, 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089,
+)
+
+# COCO person keypoint names (order matters; index == keypoint id).
+COCO_KEYPOINT_NAMES: tuple[str, ...] = (
+    "nose", "left_eye", "right_eye", "left_ear", "right_ear",
+    "left_shoulder", "right_shoulder", "left_elbow", "right_elbow",
+    "left_wrist", "right_wrist", "left_hip", "right_hip",
+    "left_knee", "right_knee", "left_ankle", "right_ankle",
+)
+
+
+class KeypointAPMetric:
+    """COCO-standard keypoint AP (OKS) wrapping ``pycocotools`` ``COCOeval``.
+
+    Accepts per-instance predictions and ground truth as plain Python dicts
+    keyed by ``image_id`` and builds the COCO JSON structures internally. One
+    instance is one person (top-down pose estimation produces one keypoint set
+    per person box).
+    """
+
+    def compute(
+        self,
+        predictions: list[dict[str, Any]],
+        references: list[dict[str, Any]],
+        sigmas: tuple[float, ...] = COCO_KEYPOINT_SIGMAS,
+        keypoint_names: tuple[str, ...] = COCO_KEYPOINT_NAMES,
+    ) -> dict[str, float]:
+        """Compute COCO keypoint AP/AR.
+
+        Args:
+            predictions: Per-person predictions. Each dict has:
+                - ``image_id``: int grouping key
+                - ``keypoints``: flat list ``[x1, y1, s1, ...]`` of length
+                  ``3 * num_keypoints`` (``s`` is the per-keypoint score)
+                - ``score``: overall person confidence (float)
+            references: Per-person ground truth. Each dict has:
+                - ``image_id``: int grouping key
+                - ``keypoints``: flat list ``[x1, y1, v1, ...]`` (``v`` is the
+                  COCO visibility flag 0/1/2)
+                - ``bbox``: ``[x, y, w, h]`` person box
+                - ``area``: person area used by the OKS normalization
+                - ``num_keypoints``: number of labeled keypoints (optional;
+                  derived from visibility flags when absent)
+            sigmas: Per-keypoint OKS constants. Defaults to the COCO 17.
+            keypoint_names: Keypoint names for the category definition.
+
+        Returns:
+            Dict with ``map``, ``map_50``, ``map_75``, ``map_medium``,
+            ``map_large``, ``mar``, ``mar_50``, ``mar_75``, plus
+            ``num_predictions``, ``num_ground_truths`` and ``num_images``.
+            Keys mirror the object-detection ``MAPMetric`` so downstream
+            reporting treats both COCO metrics the same way. When there are no
+            predictions or no references, all scores are ``0.0``.
+
+        Raises:
+            ValueError: If a keypoint list is not made of whole triplets or its
+                keypoint count differs from ``len(sigmas)``.
+        """
+        self._validate_keypoint_counts(predictions, references, len(sigmas))
+
+        image_ids = sorted(
+            {int(r["image_id"]) for r in references} | {int(p["image_id"]) for p in predictions}
+        )
+
+        # Nothing to score: answer before importing pycocotools/numpy so the
+        # empty case does not depend on them at all.
+        if not predictions or not references:
+            return self._empty_result(predictions, references, image_ids)
+
+        import contextlib
+        import io
+
+        import numpy as np
+        from pycocotools.coco import COCO
+        from pycocotools.cocoeval import COCOeval
+
+        gt_dict = {
+            "images": [{"id": image_id} for image_id in image_ids],
+            "annotations": self._build_gt_annotations(references),
+            "categories": [
+                {"id": 1, "name": "person", "keypoints": list(keypoint_names), "skeleton": []}
+            ],
+        }
+
+        coco_gt = COCO()
+        coco_gt.dataset = gt_dict
+        # pycocotools writes progress to stdout; keep eval output quiet.
+        with contextlib.redirect_stdout(io.StringIO()):
+            coco_gt.createIndex()
+
+        detections = [
+            {
+                "image_id": int(p["image_id"]),
+                "category_id": 1,
+                "keypoints": [float(v) for v in p["keypoints"]],
+                "score": float(p["score"]),
+            }
+            for p in predictions
+        ]
+
+        with contextlib.redirect_stdout(io.StringIO()):
+            coco_dt = coco_gt.loadRes(detections)
+            coco_eval = COCOeval(coco_gt, coco_dt, iouType="keypoints")
+            coco_eval.params.kpt_oks_sigmas = np.array(sigmas, dtype=np.float64)
+            coco_eval.evaluate()
+            coco_eval.accumulate()
+            coco_eval.summarize()
+
+        # NOTE(review): the index-to-name mapping below assumes the keypoint
+        # summary order AP, AP50, AP75, AP(M), AP(L), AR, AR50, AR75 - confirm
+        # against the installed pycocotools.
+        stats = coco_eval.stats
+        return {
+            "map": float(stats[0]),
+            "map_50": float(stats[1]),
+            "map_75": float(stats[2]),
+            "map_medium": float(stats[3]),
+            "map_large": float(stats[4]),
+            "mar": float(stats[5]),
+            "mar_50": float(stats[6]),
+            "mar_75": float(stats[7]),
+            "num_predictions": len(detections),
+            "num_ground_truths": len(gt_dict["annotations"]),
+            "num_images": len(image_ids),
+        }
+
+    @staticmethod
+    def _validate_keypoint_counts(
+        predictions: list[dict[str, Any]],
+        references: list[dict[str, Any]],
+        num_sigmas: int,
+    ) -> None:
+        """Ensure predictions, references and sigmas describe the same layout.
+
+        OKS is only defined when the model's keypoints match the ground-truth
+        keypoint set. A model with a different layout (e.g. SynthPose's 52
+        anatomical markers vs COCO's 17) cannot be scored against COCO ground
+        truth, so fail early with an actionable message instead of a numpy
+        broadcast error inside pycocotools.
+
+        Raises:
+            ValueError: If a keypoint list is not a whole number of triplets,
+                or its keypoint count differs from ``num_sigmas``.
+        """
+        for kind, items in (("prediction", predictions), ("reference", references)):
+            for item in items:
+                num_values = len(item["keypoints"])
+                count, remainder = divmod(num_values, 3)
+                if remainder:
+                    # Floor division alone would accept e.g. 52 values as 17
+                    # keypoints and let a malformed list through.
+                    raise ValueError(
+                        f"Malformed {kind}: keypoints has {num_values} values, which is "
+                        f"not a whole number of 3-value triplets."
+                    )
+                if count != num_sigmas:
+                    raise ValueError(
+                        f"Keypoint count mismatch: {kind} has {count} keypoints but the "
+                        f"metric expects {num_sigmas} (from sigmas). The model's keypoint "
+                        f"layout must match the dataset and sigmas. For a non-COCO layout "
+                        f"(e.g. SynthPose's 52 markers), pass matching sigmas and "
+                        f"keypoint_names and use a dataset with the same keypoint definition."
+                    )
+
+    @staticmethod
+    def _build_gt_annotations(references: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        """Convert ground-truth instances to COCO annotation dicts.
+
+        Annotation ids are 1-based positions in ``references``; every instance
+        gets ``category_id`` 1 and ``iscrowd`` 0.
+        """
+        annotations = []
+        for i, ref in enumerate(references):
+            keypoints = [float(v) for v in ref["keypoints"]]
+            num_keypoints = ref.get("num_keypoints")
+            if num_keypoints is None:
+                # COCO visibility flag is every 3rd value; >0 means labeled.
+                num_keypoints = sum(1 for v in keypoints[2::3] if v > 0)
+            annotations.append(
+                {
+                    "id": i + 1,
+                    "image_id": int(ref["image_id"]),
+                    "category_id": 1,
+                    "keypoints": keypoints,
+                    "num_keypoints": int(num_keypoints),
+                    "bbox": [float(v) for v in ref["bbox"]],
+                    "area": float(ref["area"]),
+                    "iscrowd": 0,
+                }
+            )
+        return annotations
+
+    @staticmethod
+    def _empty_result(
+        predictions: list[dict[str, Any]],
+        references: list[dict[str, Any]],
+        image_ids: list[int],
+    ) -> dict[str, float]:
+        """Return zeroed metrics when there is nothing to score.
+
+        The count fields still report how many predictions, references and
+        images were passed in.
+        """
+        return {
+            "map": 0.0,
+            "map_50": 0.0,
+            "map_75": 0.0,
+            "map_medium": 0.0,
+            "map_large": 0.0,
+            "mar": 0.0,
+            "mar_50": 0.0,
+            "mar_75": 0.0,
+            "num_predictions": len(predictions),
+            "num_ground_truths": len(references),
+            "num_images": len(image_ids),
+        }
diff --git a/src/winml/modelkit/utils/eval_utils.py b/src/winml/modelkit/utils/eval_utils.py
index 5a854cff9..27fdd66e1 100644
--- a/src/winml/modelkit/utils/eval_utils.py
+++ b/src/winml/modelkit/utils/eval_utils.py
@@ -287,6 +287,54 @@ class TaskSchema:
     ),
 )
 
+_KEYPOINT_DETECTION_SCHEMA = TaskSchema(  # registered below as "keypoint-detection"
+    columns=(  # dataset columns: the image and its annotation dict
+        SchemaItem(
+            "input_column", "input image (PIL.Image)",
+            default="image", remap_hint="<your_image_column>",
+        ),
+        SchemaItem(
+            "annotation_column",
+            "annotation dict containing per-person keypoints + bbox + area",
+            default="objects", remap_hint="<your_annotation_column>",
+        ),
+    ),
+    params=(  # annotation-dict field names, box layout, and OKS settings
+        SchemaItem(
+            "keypoints_key",
+            "keypoints field inside the annotation dict "
+            "(flat [x, y, v] triplets per person)",
+            default="keypoints", remap_hint="<keypoints_field>",
+        ),
+        SchemaItem(
+            "bbox_key",
+            "person bbox field inside the annotation dict",
+            default="bbox", remap_hint="<bbox_field>",
+        ),
+        SchemaItem(
+            "area_key",
+            "person area field inside the annotation dict",
+            default="area", remap_hint="<area_field>",
+        ),
+        SchemaItem(
+            "box_format", "person bounding box layout",
+            default="xywh", remap_hint="<xywh|xyxy>",
+        ),
+        SchemaItem(
+            "sigmas",
+            "per-keypoint OKS sigmas as comma-separated floats; "
+            "defaults to the COCO 17-keypoint constants",
+            default="COCO 17 sigmas", remap_hint="<s1,s2,...>",  # descriptive text, not floats
+        ),
+        SchemaItem(
+            "keypoint_names",
+            "keypoint names in index order as comma-separated strings; "
+            "defaults to the COCO 17 names",
+            default="COCO 17 names", remap_hint="<name1,name2,...>",  # descriptive default
+        ),
+    ),
+)
+
 TASK_SCHEMAS: dict[str, TaskSchema] = {
     "image-classification": _IMAGE_CLASSIFICATION_SCHEMA,
     "text-classification": _TEXT_CLASSIFICATION_SCHEMA,
@@ -304,6 +352,7 @@ class TaskSchema:
     "zero-shot-classification": _ZERO_SHOT_CLASSIFICATION_SCHEMA,
     "zero-shot-image-classification": _ZERO_SHOT_IMAGE_CLASSIFICATION_SCHEMA,
     "depth-estimation": _DEPTH_ESTIMATION_SCHEMA,
+    "keypoint-detection": _KEYPOINT_DETECTION_SCHEMA,
 }
 
 
diff --git a/tests/unit/eval/test_keypoint_detection_evaluator.py b/tests/unit/eval/test_keypoint_detection_evaluator.py
new file mode 100644
index 000000000..9cfbda7aa
--- /dev/null
+++ b/tests/unit/eval/test_keypoint_detection_evaluator.py
@@ -0,0 +1,122 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+"""Unit tests for WinMLKeypointDetectionEvaluator.
+
+The end-to-end pose pipeline is covered by integration runs; these tests
+pin the box-format handling, prediction flattening, and the ``compute()``
+loop wiring with a mocked image processor and model.
+"""
+
+from __future__ import annotations
+
+import pytest
+import torch
+
+from winml.modelkit.eval import WinMLKeypointDetectionEvaluator
+
+
+def _make_evaluator(box_format: str = "xywh") -> WinMLKeypointDetectionEvaluator:
+    """Create an evaluator via ``object.__new__`` so no data/model loading is triggered."""
+    ev = object.__new__(WinMLKeypointDetectionEvaluator)
+    ev._image_col = "image"
+    ev._annotation_col = "objects"
+    ev._keypoints_key = "keypoints"
+    ev._bbox_key = "bbox"
+    ev._area_key = "area"
+    ev._box_format = box_format
+    ev._sigmas = None  # presumably falls back to the evaluator's defaults -- confirm
+    ev._keypoint_names = None  # presumably falls back to default names too -- confirm
+    return ev
+
+
+class TestBoxFormat:  # pins _to_xywh for both box_format values
+    def test_xywh_passthrough(self) -> None:
+        ev = _make_evaluator("xywh")  # input is already [x, y, w, h]
+        assert ev._to_xywh([10.0, 20.0, 30.0, 40.0]) == [10.0, 20.0, 30.0, 40.0]
+
+    def test_xyxy_converted_to_xywh(self) -> None:
+        ev = _make_evaluator("xyxy")  # expect w = x2 - x1 and h = y2 - y1
+        assert ev._to_xywh([10.0, 20.0, 40.0, 60.0]) == [10.0, 20.0, 30.0, 40.0]
+
+
+class TestPredictionFlattening:  # pins _flatten_prediction and _person_score
+    def test_flatten_interleaves_xy_and_score(self) -> None:
+        pose = {
+            "keypoints": torch.tensor([[1.0, 2.0], [3.0, 4.0]]),  # one (x, y) row per keypoint
+            "scores": torch.tensor([0.5, 0.9]),  # one score per keypoint
+        }
+        flat = WinMLKeypointDetectionEvaluator._flatten_prediction(pose)
+        assert flat == pytest.approx([1.0, 2.0, 0.5, 3.0, 4.0, 0.9])
+
+    def test_person_score_is_mean(self) -> None:
+        pose = {"scores": torch.tensor([0.4, 0.6, 0.8])}  # mean of these is 0.6
+        assert WinMLKeypointDetectionEvaluator._person_score(pose) == pytest.approx(0.6)
+
+
+class _MockProcessor:
+    """Mock image processor: zero pixel values per box and constant poses per heatmap."""
+
+    def __init__(self, num_keypoints: int = 17) -> None:
+        self._num_keypoints = num_keypoints
+
+    def preprocess(self, images, boxes, return_tensors="pt"):
+        num_persons = len(boxes[0])  # images and return_tensors are ignored
+        return {"pixel_values": torch.zeros(num_persons, 3, 256, 192)}
+
+    def post_process_pose_estimation(self, outputs, boxes):
+        num_persons = outputs.heatmaps.shape[0]  # reads heatmaps as an attribute
+        poses = [
+            {
+                "keypoints": torch.ones(self._num_keypoints, 2),
+                "scores": torch.full((self._num_keypoints,), 0.8),
+            }
+            for _ in range(num_persons)
+        ]
+        return [poses]  # one outer entry wrapping all poses
+
+
+class _MockModel:
+    """Mock model returning a dict of zero heatmaps, one per row of ``pixel_values``."""
+
+    def __init__(self, num_keypoints: int = 17) -> None:
+        self._num_keypoints = num_keypoints
+
+    def __call__(self, pixel_values):  # NOTE(review): dict result vs. outputs.heatmaps -- confirm
+        batch = pixel_values.shape[0]
+        return {"heatmaps": torch.zeros(batch, self._num_keypoints, 64, 48)}
+
+
+class TestComputeLoop:  # runs compute() against the mock processor and model
+    def test_compute_returns_ap_metrics(self) -> None:
+        ev = _make_evaluator("xywh")
+        ev.pipe = _MockProcessor()  # the mock processor is installed as ``pipe``
+        ev.model = _MockModel()
+        # Two images: one with 2 persons, one with 1 (17 [x, y, v] triplets per person).
+        ev.data = [
+            {
+                "image": object(),  # placeholder: _MockProcessor ignores images
+                "objects": {
+                    "keypoints": [[1.0, 1.0, 2.0] * 17, [2.0, 2.0, 2.0] * 17],
+                    "bbox": [[0.0, 0.0, 50.0, 80.0], [10.0, 10.0, 40.0, 70.0]],
+                    "area": [4000.0, 2800.0],
+                },
+            },
+            {
+                "image": object(),  # placeholder: _MockProcessor ignores images
+                "objects": {
+                    "keypoints": [[3.0, 3.0, 2.0] * 17],
+                    "bbox": [[5.0, 5.0, 30.0, 60.0]],
+                    "area": [1800.0],
+                },
+            },
+        ]
+
+        result = ev.compute()
+
+        assert "map" in result
+        assert result["num_images"] == 2
+        assert result["num_predictions"] == 3  # 2 + 1 persons
+        assert result["num_ground_truths"] == 3  # 2 + 1 persons
diff --git a/tests/unit/eval/test_keypoint_metric.py b/tests/unit/eval/test_keypoint_metric.py
new file mode 100644
index 000000000..28b2fe623
--- /dev/null
+++ b/tests/unit/eval/test_keypoint_metric.py
@@ -0,0 +1,176 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+"""Tests for KeypointAPMetric (COCO OKS-based AP)."""
+
+from __future__ import annotations
+
+import pytest
+
+from winml.modelkit.eval import KeypointAPMetric
+
+
+def _coco_person_keypoints(cx: float, cy: float) -> list[float]:
+    """Return 17 flat ``[x, y, 2.0]`` triplets at fixed offsets around ``(cx, cy)``."""
+    offsets = [  # 17 (dx, dy) pairs
+        (0, -40), (-5, -45), (5, -45), (-10, -42), (10, -42),
+        (-20, -20), (20, -20), (-25, 0), (25, 0),
+        (-28, 20), (28, 20), (-15, 25), (15, 25),
+        (-15, 55), (15, 55), (-15, 85), (15, 85),
+    ]
+    flat: list[float] = []
+    for dx, dy in offsets:
+        flat.extend([cx + dx, cy + dy, 2.0])  # visibility 2 = labeled + visible
+    return flat
+
+
+class TestKeypointAPMetricPerfectMatch:
+    """Predictions matching the ground-truth x/y exactly should score AP ~= 1.0."""
+
+    def test_single_person_perfect_match(self) -> None:
+        kpts = _coco_person_keypoints(100.0, 100.0)
+        pred_kpts = [v if (i % 3) != 2 else 1.0 for i, v in enumerate(kpts)]  # 3rd slot -> 1.0
+
+        metric = KeypointAPMetric()
+        result = metric.compute(
+            predictions=[{"image_id": 1, "keypoints": pred_kpts, "score": 0.95}],
+            references=[
+                {
+                    "image_id": 1,
+                    "keypoints": kpts,
+                    "bbox": [60.0, 50.0, 80.0, 110.0],
+                    "area": 80.0 * 110.0,  # product of the bbox's last two values
+                }
+            ],
+        )
+
+        assert result["map"] == pytest.approx(1.0, abs=0.01)
+        assert result["map_50"] == pytest.approx(1.0, abs=0.01)
+        assert result["num_predictions"] == 1
+        assert result["num_ground_truths"] == 1
+        assert result["num_images"] == 1
+
+    def test_two_people_two_images_perfect_match(self) -> None:
+        refs = []  # one ground-truth person per image
+        preds = []  # one matching prediction per image
+        for img_id, (cx, cy) in enumerate([(100.0, 100.0), (300.0, 200.0)], start=1):
+            kpts = _coco_person_keypoints(cx, cy)
+            pred_kpts = [v if (i % 3) != 2 else 1.0 for i, v in enumerate(kpts)]  # 3rd slot -> 1.0
+            refs.append(
+                {
+                    "image_id": img_id,
+                    "keypoints": kpts,
+                    "bbox": [cx - 40, cy - 50, 80.0, 110.0],
+                    "area": 80.0 * 110.0,
+                }
+            )
+            preds.append({"image_id": img_id, "keypoints": pred_kpts, "score": 0.9})
+
+        result = KeypointAPMetric().compute(predictions=preds, references=refs)
+
+        assert result["map"] == pytest.approx(1.0, abs=0.01)
+        assert result["num_images"] == 2
+
+
+class TestKeypointAPMetricImperfect:
+    """Large keypoint offsets and an empty prediction list."""
+
+    def test_large_offset_lowers_ap(self) -> None:
+        kpts = _coco_person_keypoints(100.0, 100.0)
+        # Shift every predicted x and y by +60 away from GT -> low OKS -> low AP.
+        pred_kpts: list[float] = []
+        for i, v in enumerate(kpts):
+            if i % 3 == 0 or i % 3 == 1:
+                pred_kpts.append(v + 60.0)
+            else:
+                pred_kpts.append(1.0)  # 3rd slot of each triplet is set to 1.0
+
+        result = KeypointAPMetric().compute(
+            predictions=[{"image_id": 1, "keypoints": pred_kpts, "score": 0.9}],
+            references=[
+                {
+                    "image_id": 1,
+                    "keypoints": kpts,
+                    "bbox": [60.0, 50.0, 80.0, 110.0],
+                    "area": 80.0 * 110.0,
+                }
+            ],
+        )
+
+        assert result["map"] < 0.5
+
+    def test_no_predictions_returns_zero(self) -> None:
+        kpts = _coco_person_keypoints(100.0, 100.0)
+        result = KeypointAPMetric().compute(
+            predictions=[],  # nothing predicted for the single ground-truth person
+            references=[
+                {
+                    "image_id": 1,
+                    "keypoints": kpts,
+                    "bbox": [60.0, 50.0, 80.0, 110.0],
+                    "area": 80.0 * 110.0,
+                }
+            ],
+        )
+
+        assert result["map"] == 0.0
+        assert result["num_predictions"] == 0
+        assert result["num_ground_truths"] == 1
+
+
+class TestKeypointAPMetricMismatch:
+    """A prediction/ground-truth keypoint-count mismatch must raise ``ValueError``."""
+
+    def test_mismatched_keypoint_count_raises(self) -> None:
+        # Model predicts 52 keypoints (e.g. SynthPose) against COCO-17 ground truth.
+        pred_kpts = [0.0, 0.0, 1.0] * 52
+        gt_kpts = _coco_person_keypoints(100.0, 100.0)
+
+        with pytest.raises(ValueError, match="Keypoint count mismatch"):
+            KeypointAPMetric().compute(
+                predictions=[{"image_id": 1, "keypoints": pred_kpts, "score": 0.9}],
+                references=[
+                    {
+                        "image_id": 1,
+                        "keypoints": gt_kpts,
+                        "bbox": [60.0, 50.0, 80.0, 110.0],
+                        "area": 80.0 * 110.0,
+                    }
+                ],
+            )
+
+
+class TestKeypointAPMetricCustomLayout:
+    """A non-COCO keypoint layout scores when matching sigmas and names are supplied."""
+
+    def test_custom_layout_scores_with_matching_sigmas(self) -> None:
+        # A 5-keypoint layout (not COCO's 17): perfect predictions should still
+        # score map ~= 1.0 once sigmas/keypoint_names describe that layout. This
+        # is what lets one evaluator handle non-COCO models (e.g. SynthPose).
+        sigmas = (0.05, 0.05, 0.05, 0.05, 0.05)  # one sigma per keypoint
+        names = ("a", "b", "c", "d", "e")  # one name per keypoint
+        offsets = [(0, 0), (10, 0), (0, 10), (-10, 0), (0, -10)]
+        gt_flat: list[float] = []
+        for dx, dy in offsets:
+            gt_flat.extend([100.0 + dx, 100.0 + dy, 2.0])
+        pred_flat = [v if (i % 3) != 2 else 1.0 for i, v in enumerate(gt_flat)]
+
+        result = KeypointAPMetric().compute(
+            predictions=[{"image_id": 1, "keypoints": pred_flat, "score": 0.9}],
+            references=[
+                {
+                    "image_id": 1,
+                    "keypoints": gt_flat,
+                    "bbox": [80.0, 80.0, 40.0, 40.0],
+                    "area": 1600.0,  # 40.0 * 40.0
+                }
+            ],
+            sigmas=sigmas,
+            keypoint_names=names,
+        )
+
+        assert result["map"] == pytest.approx(1.0, abs=0.01)
+        assert result["num_ground_truths"] == 1
+