CHANGELOG.md (43 additions, 0 deletions)

@@ -2,6 +2,49 @@

All notable changes to this project will be documented in this file.

## Unreleased

### Added — Model evaluations SDK & CLI

Wraps the public `/{workspace}/model-evals` REST surface
([roboflow/roboflow#11636](https://github.com/roboflow/roboflow/pull/11636))
so users can read evaluation results — mAP, confidence sweep, per-class
performance, confusion matrix, vector clusters, per-image stats,
recommendations — from Python and from the CLI without hitting the API
directly. Companion docs:
[roboflow-dev-reference#18](https://github.com/roboflow/roboflow-dev-reference/pull/18).

**SDK (`roboflow/core/model_eval.py`):**
- `Workspace.evals(project=None, version=None, model=None, status=None, limit=None)` — list evals as `ModelEval` instances pre-populated with metadata from the list response.
- `Workspace.eval(eval_id)` — fetch a single eval (returns a `ModelEval` with `.summary` populated when status is `done`).
- `ModelEval.refresh()` — re-fetch the eval header.
- `ModelEval.map_results()`, `.confidence_sweep()`, `.performance_by_class(split=None)`, `.confusion_matrix(split=None, confidence=None)`, `.vector_analysis(confidence=None)`, `.image_predictions(split=None, confidence=None, limit=None, offset=None)`, `.recommendations()` — one method per panel; each returns the raw JSON dict.
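
A minimal read-path sketch, assuming a configured `Roboflow` client and at least one eval with status `done`; the `points` payload shape used by the helper is illustrative, not a documented schema:

```python
def best_confidence(sweep: dict) -> int:
    """Pick the threshold with the highest F1 from a confidence-sweep payload.

    Assumes an illustrative shape like
    {"points": [{"confidence": 30, "f1": 0.81}, ...]} -- check the real
    response before relying on these keys.
    """
    points = sweep.get("points") or []
    if not points:
        raise ValueError("confidence sweep returned no points")
    return max(points, key=lambda p: p["f1"])["confidence"]


# Hypothetical wiring against the SDK surface listed above:
# rf = roboflow.Roboflow(api_key="...")
# ws = rf.workspace()
# done = ws.evals(status="done", limit=5)   # list finished evals
# sweep = done[0].confidence_sweep()        # raw JSON dict
# print(best_confidence(sweep))
```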

**CLI (`roboflow/cli/handlers/eval.py`):**
- `roboflow eval list [--project P] [--version V] [--model M] [--status S] [--limit N]`
- `roboflow eval get <eval_id>`
- `roboflow eval map-results <eval_id>`
- `roboflow eval confidence-sweep <eval_id>`
- `roboflow eval performance-by-class <eval_id> [--split S]`
- `roboflow eval confusion-matrix <eval_id> [--split S] [--confidence N]`
- `roboflow eval vector-analysis <eval_id> [--confidence N]`
- `roboflow eval image-predictions <eval_id> [--split S] [--confidence N] [--limit N] [--offset N]`
- `roboflow eval recommendations <eval_id>`

Exit codes are stable per error class so shell scripts and AI agents can
react without parsing message strings: `3` for `model_eval_not_found`
(404), `4` for `model_eval_not_done` (409), `5` for `invalid_split` /
`invalid_confidence` (400). Every command supports `--json` for
structured output.
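
Scripted callers can branch on those codes directly. A sketch: only the numeric codes come from the list above; the action labels and the `classify` / `eval_get_status` helpers are this example's own invention:

```python
import subprocess

# Exit codes documented above; action labels are this sketch's own naming.
EXIT_ACTIONS = {
    3: "missing",        # model_eval_not_found (404)
    4: "retry",          # model_eval_not_done (409) -- poll again later
    5: "bad_arguments",  # invalid_split / invalid_confidence (400)
}

def classify(returncode: int) -> str:
    """Map a `roboflow eval` exit code to an action label."""
    if returncode == 0:
        return "ok"
    return EXIT_ACTIONS.get(returncode, "error")

def eval_get_status(eval_id: str) -> str:
    """Run `roboflow eval get` and classify its exit code."""
    result = subprocess.run(
        ["roboflow", "eval", "get", eval_id, "--json"],
        capture_output=True,
    )
    return classify(result.returncode)
```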

**Low-level (`roboflow.adapters.rfapi`):**
- `list_model_evals`, `get_model_eval`, `get_model_eval_map_results`, `get_model_eval_confidence_sweep`, `get_model_eval_performance_by_class`, `get_model_eval_confusion_matrix`, `get_model_eval_vector_analysis`, `get_model_eval_image_predictions`, `get_model_eval_recommendations`.
- New typed exceptions `ModelEvalNotFoundError`, `ModelEvalNotDoneError`, `InvalidSplitError`, `InvalidConfidenceError` (all subclasses of `RoboflowError`) so callers can distinguish "eval doesn't exist" from "eval still running" from "bad argument" without parsing strings.
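
In caller code that maps naturally to a `try`/`except` ladder. The classes below are local stand-ins for the real `rfapi` exceptions, and the fetch callable is injected purely so the sketch stays self-contained:

```python
class RoboflowError(Exception):
    """Stand-in for roboflow.adapters.rfapi.RoboflowError."""

class ModelEvalNotFoundError(RoboflowError):
    """Stand-in: eval id does not exist (404)."""

class ModelEvalNotDoneError(RoboflowError):
    """Stand-in: eval is still running (409)."""

def fetch_or_status(fetch, eval_id):
    """Return (payload, "ok"), or (None, reason) without parsing messages."""
    try:
        return fetch(eval_id), "ok"
    except ModelEvalNotFoundError:
        return None, "not_found"
    except ModelEvalNotDoneError:
        return None, "still_running"
    except RoboflowError:
        return None, "error"
```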

The endpoints require the `model-eval:read` scope. The base URL is
configurable via `API_URL` (set to `https://localapi.roboflow.one` to
test against a local API server).

## 1.3.7

### Added — Soft-delete / Trash support
CLI-COMMANDS.md (26 additions, 0 deletions)

@@ -216,6 +216,31 @@ single item) is intentionally not available from the SDK or CLI — those
actions destroy data irrecoverably and live only in the web UI's Trash
view. Items left in Trash are cleaned up automatically after 30 days.

### Inspect model evaluations

```bash
# List evals in the workspace; filter by project, version, model, or status.
roboflow eval list --status done --limit 10

# Read a single eval's metadata + summary metrics.
roboflow eval get <eval-id>

# Pull each panel — pipe to jq for structured access.
roboflow eval map-results <eval-id> --json | jq '.splits.test.map50'
roboflow eval performance-by-class <eval-id> --split test
roboflow eval confusion-matrix <eval-id> --split test --confidence 30
roboflow eval confidence-sweep <eval-id> --json
roboflow eval vector-analysis <eval-id> --confidence 20 --json
roboflow eval image-predictions <eval-id> --split test --limit 200
roboflow eval recommendations <eval-id> --json
```

Exit codes are stable per error class so scripts and agents can react
without parsing message strings: `3` for `model_eval_not_found` (404),
`4` for `model_eval_not_done` (409 — eval still running), `5` for
`invalid_split` / `invalid_confidence` (400). Requires the
`model-eval:read` scope on the API key.
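
A polling wrapper built on exit code 4 might look like this (Python rather than shell, with the command runner injectable so the loop is testable; `wait_until_done` is not part of the CLI):

```python
import subprocess
import time

def run_eval_get(eval_id: str) -> int:
    """Exit code of `roboflow eval get <eval-id> --json`."""
    return subprocess.run(
        ["roboflow", "eval", "get", eval_id, "--json"],
        capture_output=True,
    ).returncode

def wait_until_done(eval_id, run=run_eval_get, attempts=10, delay=30.0):
    """Poll until the eval finishes.

    Exit code 4 (model_eval_not_done) means "still running", so keep
    polling; any other non-zero code is treated as fatal.
    """
    for _ in range(attempts):
        code = run(eval_id)
        if code == 0:
            return True
        if code != 4:
            raise RuntimeError(f"roboflow eval get exited with code {code}")
        time.sleep(delay)
    return False
```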

### Workspace stats and billing

```bash
@@ -316,6 +341,7 @@ Version numbers are always numeric — that's how `x/y` is disambiguated between
| `search` | Search workspace images (RoboQL), export results |
| `deployment` | Manage dedicated deployments |
| `device` | List, get, create, and observe RFDM devices (v2 deployment API) |
| `eval` | Inspect model evaluation runs (mAP, confusion matrix, recommendations, ...) |
| `workflow` | Manage workflows |
| `folder` | Manage workspace folders |
| `annotation` | Annotation batches and jobs |
roboflow/adapters/rfapi.py (181 additions, 0 deletions)

@@ -1165,3 +1165,184 @@ def restore_trash_item(api_key, workspace_url, item_type, item_id, parent_id=Non
# Note: permanent-delete from Trash (deleteImmediately / empty) is
# intentionally not exposed on the public API — those actions destroy data
# irrecoverably and are only available through the web UI's Trash view.


# ---------------------------------------------------------------------------
# Model evaluations
# ---------------------------------------------------------------------------


class ModelEvalNotFoundError(RoboflowError):
"""Raised when an eval id (or workspace) does not exist (HTTP 404)."""


class ModelEvalNotDoneError(RoboflowError):
"""Raised when reading panel data for an eval whose status is not ``done`` (HTTP 409)."""


class InvalidSplitError(RoboflowError):
"""Raised when ``split`` is not one of the accepted values (HTTP 400)."""


class InvalidConfidenceError(RoboflowError):
"""Raised when ``confidence`` is non-integer or out of range 0-100 (HTTP 400)."""


def _model_eval_error_for(response):
"""Translate a model-eval error response into the right RoboflowError subclass.

The model-eval REST surface returns errors as a flat envelope::

{"error": "<code>", "message": "<human readable>"}

Falls back to plain :class:`RoboflowError` when the body isn't JSON or
the code is unrecognised, so new error codes don't crash older SDK
callers. Status-code fallbacks for 404/409 keep typed exceptions
available even if the server omits the ``error`` field.
"""
code = None
message = response.text
try:
body = response.json()
if isinstance(body, dict):
code = body.get("error")
if not isinstance(code, str):
code = None
message = body.get("message") or code or message
except (ValueError, TypeError):
pass

cls_by_code = {
"model_eval_not_found": ModelEvalNotFoundError,
"model_eval_not_done": ModelEvalNotDoneError,
"invalid_split": InvalidSplitError,
"invalid_confidence": InvalidConfidenceError,
}
cls = cls_by_code.get(code or "")
if cls is not None:
return cls(message)
if response.status_code == 404:
return ModelEvalNotFoundError(message)
if response.status_code == 409:
return ModelEvalNotDoneError(message)
return RoboflowError(message)


def _eval_get(api_key, workspace_url, path, params=None):
"""GET helper for model-eval endpoints with typed error mapping."""
query: Dict[str, Union[str, int]] = {"api_key": api_key}
if params:
for key, value in params.items():
if value is not None:
query[key] = value
url = f"{API_URL}/{workspace_url}/model-evals{path}"
response = requests.get(url, params=query)
if response.status_code != 200:
raise _model_eval_error_for(response)
return response.json()


def list_model_evals(
api_key: str,
workspace_url: str,
*,
project: Optional[str] = None,
version: Optional[Union[str, int]] = None,
model: Optional[str] = None,
status: Optional[str] = None,
limit: Optional[int] = None,
) -> dict:
"""GET /{workspace}/model-evals — list evals in the workspace."""
return _eval_get(
api_key,
workspace_url,
"",
params={"project": project, "version": version, "model": model, "status": status, "limit": limit},
)


def get_model_eval(api_key: str, workspace_url: str, eval_id: str) -> dict:
"""GET /{workspace}/model-evals/{evalId} — fetch a single eval (with summary if done)."""
return _eval_get(api_key, workspace_url, f"/{eval_id}")


def get_model_eval_map_results(api_key: str, workspace_url: str, eval_id: str) -> dict:
"""GET /{workspace}/model-evals/{evalId}/map-results — per-split mAP breakdown."""
return _eval_get(api_key, workspace_url, f"/{eval_id}/map-results")


def get_model_eval_confidence_sweep(api_key: str, workspace_url: str, eval_id: str) -> dict:
"""GET /{workspace}/model-evals/{evalId}/confidence-sweep — F1/precision/recall sweep."""
return _eval_get(api_key, workspace_url, f"/{eval_id}/confidence-sweep")


def get_model_eval_performance_by_class(
api_key: str,
workspace_url: str,
eval_id: str,
*,
split: Optional[str] = None,
) -> dict:
"""GET /{workspace}/model-evals/{evalId}/performance-by-class — per-class metrics.

Server rejects ``split=all`` for this panel; pass one of train/valid/test
or omit to use the server default (test).
"""
return _eval_get(api_key, workspace_url, f"/{eval_id}/performance-by-class", params={"split": split})


def get_model_eval_confusion_matrix(
api_key: str,
workspace_url: str,
eval_id: str,
*,
split: Optional[str] = None,
confidence: Optional[int] = None,
) -> dict:
"""GET /{workspace}/model-evals/{evalId}/confusion-matrix — confusion matrix for split."""
return _eval_get(
api_key,
workspace_url,
f"/{eval_id}/confusion-matrix",
params={"split": split, "confidence": confidence},
)


def get_model_eval_vector_analysis(
api_key: str,
workspace_url: str,
eval_id: str,
*,
confidence: Optional[int] = None,
) -> dict:
"""GET /{workspace}/model-evals/{evalId}/vector-analysis — embedding clusters & metrics."""
return _eval_get(
api_key,
workspace_url,
f"/{eval_id}/vector-analysis",
params={"confidence": confidence},
)


def get_model_eval_image_predictions(
api_key: str,
workspace_url: str,
eval_id: str,
*,
split: Optional[str] = None,
confidence: Optional[int] = None,
limit: Optional[int] = None,
offset: Optional[int] = None,
) -> dict:
"""GET /{workspace}/model-evals/{evalId}/image-predictions — paginated per-image stats."""
return _eval_get(
api_key,
workspace_url,
f"/{eval_id}/image-predictions",
params={"split": split, "confidence": confidence, "limit": limit, "offset": offset},
)


def get_model_eval_recommendations(api_key: str, workspace_url: str, eval_id: str) -> dict:
"""GET /{workspace}/model-evals/{evalId}/recommendations — improvement suggestions."""
return _eval_get(api_key, workspace_url, f"/{eval_id}/recommendations")
roboflow/cli/__init__.py (2 additions, 0 deletions)

@@ -191,6 +191,7 @@ def _walk(group: Any, prefix: str = "") -> None:
from roboflow.cli.handlers.completion import completion_app # noqa: E402
from roboflow.cli.handlers.deployment import deployment_app # noqa: E402
from roboflow.cli.handlers.device import device_app # noqa: E402
from roboflow.cli.handlers.eval import eval_app # noqa: E402
from roboflow.cli.handlers.folder import folder_app # noqa: E402
from roboflow.cli.handlers.image import image_app # noqa: E402
from roboflow.cli.handlers.infer import infer_command # noqa: E402
@@ -214,6 +215,7 @@ def _walk(group: Any, prefix: str = "") -> None:
app.add_typer(completion_app, name="completion")
app.add_typer(deployment_app, name="deployment")
app.add_typer(device_app, name="device")
app.add_typer(eval_app, name="eval")
app.add_typer(folder_app, name="folder")
app.add_typer(image_app, name="image")
