diff --git a/CHANGELOG.md b/CHANGELOG.md index f76ed7a7..c8d963d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,49 @@ All notable changes to this project will be documented in this file. +## Unreleased + +### Added — Model evaluations SDK & CLI + +Wraps the public `/{workspace}/model-evals` REST surface +([roboflow/roboflow#11636](https://github.com/roboflow/roboflow/pull/11636)) +so users can read evaluation results — mAP, confidence sweep, per-class +performance, confusion matrix, vector clusters, per-image stats, +recommendations — from Python and from the CLI without hitting the API +directly. Companion docs: +[roboflow-dev-reference#18](https://github.com/roboflow/roboflow-dev-reference/pull/18). + +**SDK (`roboflow/core/model_eval.py`):** +- `Workspace.evals(project=None, version=None, model=None, status=None, limit=None)` — list evals as `ModelEval` instances pre-populated with metadata from the list response. +- `Workspace.eval(eval_id)` — fetch a single eval (returns a `ModelEval` with `.summary` populated when status is `done`). +- `ModelEval.refresh()` — re-fetch the eval header. +- `ModelEval.map_results()`, `.confidence_sweep()`, `.performance_by_class(split=None)`, `.confusion_matrix(split=None, confidence=None)`, `.vector_analysis(confidence=None)`, `.image_predictions(split=None, confidence=None, limit=None, offset=None)`, `.recommendations()` — one method per panel; each returns the raw JSON dict. + +**CLI (`roboflow/cli/handlers/eval.py`):** +- `roboflow eval list [--project P] [--version V] [--model M] [--status S] [--limit N]` +- `roboflow eval get <eval-id>` +- `roboflow eval map-results <eval-id>` +- `roboflow eval confidence-sweep <eval-id>` +- `roboflow eval performance-by-class <eval-id> [--split S]` +- `roboflow eval confusion-matrix <eval-id> [--split S] [--confidence N]` +- `roboflow eval vector-analysis <eval-id> [--confidence N]` +- `roboflow eval image-predictions <eval-id> [--split S] [--confidence N] [--limit N] [--offset N]` +- `roboflow eval recommendations <eval-id>` + +Exit codes are stable per error class so shell scripts and AI agents can +react without parsing message strings: `3` for `model_eval_not_found` +(404), `4` for `model_eval_not_done` (409), `5` for `invalid_split` / +`invalid_confidence` (400). Every command supports `--json` for +structured output. + +**Low-level (`roboflow.adapters.rfapi`):** +- `list_model_evals`, `get_model_eval`, `get_model_eval_map_results`, `get_model_eval_confidence_sweep`, `get_model_eval_performance_by_class`, `get_model_eval_confusion_matrix`, `get_model_eval_vector_analysis`, `get_model_eval_image_predictions`, `get_model_eval_recommendations`. +- New typed exceptions `ModelEvalNotFoundError`, `ModelEvalNotDoneError`, `InvalidSplitError`, `InvalidConfidenceError` (all subclasses of `RoboflowError`) so callers can distinguish "eval doesn't exist" from "eval still running" from "bad argument" without parsing strings. + +The endpoints require the `model-eval:read` scope. The base URL is +configurable via `API_URL` (set to `https://localapi.roboflow.one` to +test against a local API server). + ## 1.3.7 ### Added — Soft-delete / Trash support diff --git a/CLI-COMMANDS.md b/CLI-COMMANDS.md index fa1c9082..d3532a8d 100644 --- a/CLI-COMMANDS.md +++ b/CLI-COMMANDS.md @@ -216,6 +216,31 @@ single item) is intentionally not available from the SDK or CLI — those actions destroy data irrecoverably and live only in the web UI's Trash view. Items left in Trash are cleaned up automatically after 30 days. 
+### Inspect model evaluations + +```bash +# List evals in the workspace; filter by project, version, model, or status. +roboflow eval list --status done --limit 10 + +# Read a single eval's metadata + summary metrics. +roboflow eval get <eval-id> + +# Pull each panel — pipe to jq for structured access. +roboflow eval map-results <eval-id> --json | jq '.splits.test.map50' +roboflow eval performance-by-class <eval-id> --split test +roboflow eval confusion-matrix <eval-id> --split test --confidence 30 +roboflow eval confidence-sweep <eval-id> --json +roboflow eval vector-analysis <eval-id> --confidence 20 --json +roboflow eval image-predictions <eval-id> --split test --limit 200 +roboflow eval recommendations <eval-id> --json +``` + +Exit codes are stable per error class so scripts and agents can react +without parsing message strings: `3` for `model_eval_not_found` (404), +`4` for `model_eval_not_done` (409 — eval still running), `5` for +`invalid_split` / `invalid_confidence` (400). Requires the +`model-eval:read` scope on the API key. + ### Workspace stats and billing ```bash @@ -316,6 +341,7 @@ Version numbers are always numeric — that's how `x/y` is disambiguated between | `search` | Search workspace images (RoboQL), export results | | `deployment` | Manage dedicated deployments | | `device` | List, get, create, and observe RFDM devices (v2 deployment API) | +| `eval` | Inspect model evaluation runs (mAP, confusion matrix, recommendations, ...) | | `workflow` | Manage workflows | | `folder` | Manage workspace folders | | `annotation` | Annotation batches and jobs | diff --git a/roboflow/adapters/rfapi.py b/roboflow/adapters/rfapi.py index 1f6aa1eb..8d887abb 100644 --- a/roboflow/adapters/rfapi.py +++ b/roboflow/adapters/rfapi.py @@ -1165,3 +1165,184 @@ def restore_trash_item(api_key, workspace_url, item_type, item_id, parent_id=Non # Note: permanent-delete from Trash (deleteImmediately / empty) is # intentionally not exposed on the public API — those actions destroy data # irrecoverably and are only available through the web UI's Trash view. + + +# --------------------------------------------------------------------------- +# Model evaluations +# --------------------------------------------------------------------------- + + +class ModelEvalNotFoundError(RoboflowError): + """Raised when an eval id (or workspace) does not exist (HTTP 404).""" + + +class ModelEvalNotDoneError(RoboflowError): + """Raised when reading panel data for an eval whose status is not ``done`` (HTTP 409).""" + + +class InvalidSplitError(RoboflowError): + """Raised when ``split`` is not one of the accepted values (HTTP 400).""" + + +class InvalidConfidenceError(RoboflowError): + """Raised when ``confidence`` is non-integer or out of range 0-100 (HTTP 400).""" + + +def _model_eval_error_for(response): + """Translate a model-eval error response into the right RoboflowError subclass. + + The model-eval REST surface returns errors as a flat envelope:: + + {"error": "<code>", "message": "<message>"} + + Falls back to plain :class:`RoboflowError` when the body isn't JSON or + the code is unrecognised, so new error codes don't crash older SDK + callers. Status-code fallbacks for 404/409 keep typed exceptions + available even if the server omits the ``error`` field. 
+ """ + code = None + message = response.text + try: + body = response.json() + if isinstance(body, dict): + code = body.get("error") + if not isinstance(code, str): + code = None + message = body.get("message") or code or message + except (ValueError, TypeError): + pass + + cls_by_code = { + "model_eval_not_found": ModelEvalNotFoundError, + "model_eval_not_done": ModelEvalNotDoneError, + "invalid_split": InvalidSplitError, + "invalid_confidence": InvalidConfidenceError, + } + cls = cls_by_code.get(code or "") + if cls is not None: + return cls(message) + if response.status_code == 404: + return ModelEvalNotFoundError(message) + if response.status_code == 409: + return ModelEvalNotDoneError(message) + return RoboflowError(message) + + +def _eval_get(api_key, workspace_url, path, params=None): + """GET helper for model-eval endpoints with typed error mapping.""" + query: Dict[str, Union[str, int]] = {"api_key": api_key} + if params: + for key, value in params.items(): + if value is not None: + query[key] = value + url = f"{API_URL}/{workspace_url}/model-evals{path}" + response = requests.get(url, params=query) + if response.status_code != 200: + raise _model_eval_error_for(response) + return response.json() + + +def list_model_evals( + api_key: str, + workspace_url: str, + *, + project: Optional[str] = None, + version: Optional[Union[str, int]] = None, + model: Optional[str] = None, + status: Optional[str] = None, + limit: Optional[int] = None, +) -> dict: + """GET /{workspace}/model-evals — list evals in the workspace.""" + return _eval_get( + api_key, + workspace_url, + "", + params={"project": project, "version": version, "model": model, "status": status, "limit": limit}, + ) + + +def get_model_eval(api_key: str, workspace_url: str, eval_id: str) -> dict: + """GET /{workspace}/model-evals/{evalId} — fetch a single eval (with summary if done).""" + return _eval_get(api_key, workspace_url, f"/{eval_id}") + + +def get_model_eval_map_results(api_key: str, workspace_url: str, eval_id: str) -> dict: + """GET /{workspace}/model-evals/{evalId}/map-results — per-split mAP breakdown.""" + return _eval_get(api_key, workspace_url, f"/{eval_id}/map-results") + + +def get_model_eval_confidence_sweep(api_key: str, workspace_url: str, eval_id: str) -> dict: + """GET /{workspace}/model-evals/{evalId}/confidence-sweep — F1/precision/recall sweep.""" + return _eval_get(api_key, workspace_url, f"/{eval_id}/confidence-sweep") + + +def get_model_eval_performance_by_class( + api_key: str, + workspace_url: str, + eval_id: str, + *, + split: Optional[str] = None, +) -> dict: + """GET /{workspace}/model-evals/{evalId}/performance-by-class — per-class metrics. + + Server rejects ``split=all`` for this panel; pass one of train/valid/test + or omit to use the server default (test). 
+ """ + return _eval_get(api_key, workspace_url, f"/{eval_id}/performance-by-class", params={"split": split}) + + +def get_model_eval_confusion_matrix( + api_key: str, + workspace_url: str, + eval_id: str, + *, + split: Optional[str] = None, + confidence: Optional[int] = None, +) -> dict: + """GET /{workspace}/model-evals/{evalId}/confusion-matrix — confusion matrix for split.""" + return _eval_get( + api_key, + workspace_url, + f"/{eval_id}/confusion-matrix", + params={"split": split, "confidence": confidence}, + ) + + +def get_model_eval_vector_analysis( + api_key: str, + workspace_url: str, + eval_id: str, + *, + confidence: Optional[int] = None, +) -> dict: + """GET /{workspace}/model-evals/{evalId}/vector-analysis — embedding clusters & metrics.""" + return _eval_get( + api_key, + workspace_url, + f"/{eval_id}/vector-analysis", + params={"confidence": confidence}, + ) + + +def get_model_eval_image_predictions( + api_key: str, + workspace_url: str, + eval_id: str, + *, + split: Optional[str] = None, + confidence: Optional[int] = None, + limit: Optional[int] = None, + offset: Optional[int] = None, +) -> dict: + """GET /{workspace}/model-evals/{evalId}/image-predictions — paginated per-image stats.""" + return _eval_get( + api_key, + workspace_url, + f"/{eval_id}/image-predictions", + params={"split": split, "confidence": confidence, "limit": limit, "offset": offset}, + ) + + +def get_model_eval_recommendations(api_key: str, workspace_url: str, eval_id: str) -> dict: + """GET /{workspace}/model-evals/{evalId}/recommendations — improvement suggestions.""" + return _eval_get(api_key, workspace_url, f"/{eval_id}/recommendations") diff --git a/roboflow/cli/__init__.py b/roboflow/cli/__init__.py index 1fd4a038..c2ff3594 100644 --- a/roboflow/cli/__init__.py +++ b/roboflow/cli/__init__.py @@ -191,6 +191,7 @@ def _walk(group: Any, prefix: str = "") -> None: from roboflow.cli.handlers.completion import completion_app # noqa: E402 from roboflow.cli.handlers.deployment import deployment_app # noqa: E402 from roboflow.cli.handlers.device import device_app # noqa: E402 +from roboflow.cli.handlers.eval import eval_app # noqa: E402 from roboflow.cli.handlers.folder import folder_app # noqa: E402 from roboflow.cli.handlers.image import image_app # noqa: E402 from roboflow.cli.handlers.infer import infer_command # noqa: E402 @@ -214,6 +215,7 @@ def _walk(group: Any, prefix: str = "") -> None: app.add_typer(completion_app, name="completion") app.add_typer(deployment_app, name="deployment") app.add_typer(device_app, name="device") +app.add_typer(eval_app, name="eval") app.add_typer(folder_app, name="folder") app.add_typer(image_app, name="image") diff --git a/roboflow/cli/handlers/eval.py b/roboflow/cli/handlers/eval.py new file mode 100644 index 00000000..0e41cc90 --- /dev/null +++ b/roboflow/cli/handlers/eval.py @@ -0,0 +1,496 @@ +"""Model evaluation commands. + +Wraps the public ``/{workspace}/model-evals`` REST surface — list runs in a +workspace and pull each panel (mAP, confidence sweep, per-class table, +confusion matrix, vector clusters, per-image stats, recommendations). + +The eval-id is opaque (the human in the UI navigates by URL); commands take +it as a positional argument so it composes well with ``--json | jq``. 
+""" + +from __future__ import annotations + +from typing import Annotated, Optional + +import typer + +from roboflow.cli._compat import SortedGroup, ctx_to_args + +eval_app = typer.Typer(cls=SortedGroup, help="Inspect model evaluation runs", no_args_is_help=True) + + +# --------------------------------------------------------------------------- +# Command surface (Typer) +# --------------------------------------------------------------------------- + + +@eval_app.command("list") +def list_evals_cmd( + ctx: typer.Context, + project: Annotated[Optional[str], typer.Option("-p", "--project", help="Filter by project slug or id")] = None, + version: Annotated[Optional[str], typer.Option("-v", "--version", help="Filter by version id")] = None, + model: Annotated[Optional[str], typer.Option("-m", "--model", help="Filter by model id")] = None, + status: Annotated[ + Optional[str], typer.Option("-s", "--status", help="Filter by status (running/done/failed)") + ] = None, + limit: Annotated[Optional[int], typer.Option("-n", "--limit", help="Max results (default 50, max 200)")] = None, +) -> None: + """List model evaluations in the workspace.""" + args = ctx_to_args(ctx, project=project, version=version, model=model, status=status, limit=limit) + _list_evals(args) + + +@eval_app.command("get") +def get_eval_cmd( + ctx: typer.Context, + eval_id: Annotated[str, typer.Argument(help="Eval id (from `roboflow eval list`)")], +) -> None: + """Show a single eval's metadata and summary metrics.""" + args = ctx_to_args(ctx, eval_id=eval_id) + _get_eval(args) + + +@eval_app.command("map-results") +def map_results_cmd( + ctx: typer.Context, + eval_id: Annotated[str, typer.Argument(help="Eval id")], +) -> None: + """Show per-split mAP results (mAP50, mAP50-95, mAP75, by object size, per class).""" + args = ctx_to_args(ctx, eval_id=eval_id) + _map_results(args) + + +@eval_app.command("confidence-sweep") +def confidence_sweep_cmd( + ctx: typer.Context, + eval_id: Annotated[str, typer.Argument(help="Eval id")], +) -> None: + """Show the confidence-threshold sweep (precision/recall/F1) for the test split.""" + args = ctx_to_args(ctx, eval_id=eval_id) + _confidence_sweep(args) + + +@eval_app.command("performance-by-class") +def performance_by_class_cmd( + ctx: typer.Context, + eval_id: Annotated[str, typer.Argument(help="Eval id")], + split: Annotated[ + Optional[str], + typer.Option("-s", "--split", help="Split: train, valid, or test (default test). 
'all' is rejected."), + ] = None, +) -> None: + """Show per-class precision / recall / F1 / mAP for the chosen split.""" + args = ctx_to_args(ctx, eval_id=eval_id, split=split) + _performance_by_class(args) + + +@eval_app.command("confusion-matrix") +def confusion_matrix_cmd( + ctx: typer.Context, + eval_id: Annotated[str, typer.Argument(help="Eval id")], + split: Annotated[ + Optional[str], typer.Option("-s", "--split", help="Split: train, valid, test, or all (default test)") + ] = None, + confidence: Annotated[ + Optional[int], + typer.Option("-c", "--confidence", help="Integer confidence threshold (0-100)"), + ] = None, +) -> None: + """Show the confusion matrix for *split* at *confidence*.""" + args = ctx_to_args(ctx, eval_id=eval_id, split=split, confidence=confidence) + _confusion_matrix(args) + + +@eval_app.command("vector-analysis") +def vector_analysis_cmd( + ctx: typer.Context, + eval_id: Annotated[str, typer.Argument(help="Eval id")], + confidence: Annotated[ + Optional[int], + typer.Option("-c", "--confidence", help="Integer confidence threshold (0-100)"), + ] = None, +) -> None: + """Show embedding-cluster diagnostics (per-cluster sample images + metrics).""" + args = ctx_to_args(ctx, eval_id=eval_id, confidence=confidence) + _vector_analysis(args) + + +@eval_app.command("image-predictions") +def image_predictions_cmd( + ctx: typer.Context, + eval_id: Annotated[str, typer.Argument(help="Eval id")], + split: Annotated[ + Optional[str], typer.Option("-s", "--split", help="Split: train, valid, test, or all (default test)") + ] = None, + confidence: Annotated[ + Optional[int], + typer.Option("-c", "--confidence", help="Integer confidence threshold (0-100)"), + ] = None, + limit: Annotated[ + Optional[int], + typer.Option("-n", "--limit", help="Page size (default 200, max 1000)"), + ] = None, + offset: Annotated[ + Optional[int], + typer.Option("-o", "--offset", help="Pagination offset"), + ] = None, +) -> None: + """Show paginated per-image stats (TP/FP/FN, augmentations, cluster id).""" + args = ctx_to_args(ctx, eval_id=eval_id, split=split, confidence=confidence, limit=limit, offset=offset) + _image_predictions(args) + + +@eval_app.command("recommendations") +def recommendations_cmd( + ctx: typer.Context, + eval_id: Annotated[str, typer.Argument(help="Eval id")], +) -> None: + """Show server-generated suggestions for improving the model.""" + args = ctx_to_args(ctx, eval_id=eval_id) + _recommendations(args) + + +# --------------------------------------------------------------------------- +# Business logic +# --------------------------------------------------------------------------- + + +def _resolve(args): # noqa: ANN001 + from roboflow.cli._resolver import resolve_ws_and_key + + return resolve_ws_and_key(args) + + +def _eval_error_exit_code(exc: Exception) -> int: + """Map a model-eval error to the canonical CLI exit code. + + 1 = general; 2 = auth; 3 = not found; 4 = conflict (eval not done); + 5 = invalid argument (bad split / confidence). Keeping these distinct + lets shell scripts and AI agents react to specific failure modes + without parsing message strings. 
+ """ + from roboflow.adapters import rfapi + + if isinstance(exc, rfapi.ModelEvalNotFoundError): + return 3 + if isinstance(exc, rfapi.ModelEvalNotDoneError): + return 4 + if isinstance(exc, (rfapi.InvalidSplitError, rfapi.InvalidConfidenceError)): + return 5 + return 1 + + +def _hint_for(exc: Exception) -> Optional[str]: + """Per-error actionable hint shown alongside the message in non-JSON mode.""" + from roboflow.adapters import rfapi + + if isinstance(exc, rfapi.ModelEvalNotFoundError): + return "Run 'roboflow eval list' to see eval ids in this workspace." + if isinstance(exc, rfapi.ModelEvalNotDoneError): + return "Wait for the eval to finish (status='done') before reading panel data." + if isinstance(exc, rfapi.InvalidSplitError): + return "Use one of: train, valid, test (or 'all' where supported)." + if isinstance(exc, rfapi.InvalidConfidenceError): + return "Pass an integer between 0 and 100." + return None + + +def _list_evals(args): # noqa: ANN001 + from roboflow.adapters import rfapi + from roboflow.cli._output import output, output_error + from roboflow.cli._table import format_table + + resolved = _resolve(args) + if not resolved: + return + workspace_url, api_key = resolved + + try: + result = rfapi.list_model_evals( + api_key, + workspace_url, + project=args.project, + version=args.version, + model=args.model, + status=args.status, + limit=args.limit, + ) + except Exception as exc: + output_error(args, str(exc), hint=_hint_for(exc), exit_code=_eval_error_exit_code(exc)) + return + + evals = result.get("evals", []) + rows = [ + { + # Prefer DNA's `evalId`; tolerate legacy `id` from older server versions. + "id": e.get("evalId", e.get("id", "")), + "status": e.get("status", ""), + # `project` is the URL slug; the public API does not expose the doc id. + # Tolerate legacy `projectId` for forward-compat against older deploys. + "project": e.get("project") or e.get("projectId", ""), + "version": e.get("versionId", ""), + "model": e.get("modelId", "") or "", + "created": e.get("createdAt", ""), + } + for e in evals + ] + table = format_table( + rows, + columns=["id", "status", "project", "version", "model", "created"], + headers=["ID", "STATUS", "PROJECT", "VERSION", "MODEL", "CREATED"], + ) + output(args, evals, text=table) + + +def _get_eval(args): # noqa: ANN001 + from roboflow.adapters import rfapi + from roboflow.cli._output import output, output_error + + resolved = _resolve(args) + if not resolved: + return + workspace_url, api_key = resolved + + try: + info = rfapi.get_model_eval(api_key, workspace_url, args.eval_id) + except Exception as exc: + output_error(args, str(exc), hint=_hint_for(exc), exit_code=_eval_error_exit_code(exc)) + return + + lines = [ + # Prefer DNA's `evalId`; tolerate legacy `id`. + f"Eval: {info.get('evalId', info.get('id', args.eval_id))}", + f" Status: {info.get('status', '')}", + # `project` is the URL slug; tolerate legacy `projectId` for forward-compat. + f" Project: {info.get('project') or info.get('projectId', '')}", + f" Version: {info.get('versionId', '')}", + f" Model: {info.get('modelId', '') or '(none)'}", + f" Created: {info.get('createdAt', '')}", + ] + summary = info.get("summary") or {} + if summary: + lines.append( + f" Summary: mAP={summary.get('mAP')} precision={summary.get('precision')} recall={summary.get('recall')}" + ) + output(args, info, text="\n".join(lines)) + + +def _emit_dict(args, payload, *, header: Optional[str] = None) -> None: # noqa: ANN001 + """Default text rendering for panel commands: pretty-printed JSON. 
+ + Each panel has a deeply nested per-eval shape that doesn't tabulate + well in the general case (per-class tables exist, but vector clusters + and recommendations don't). For agent ergonomics we lean on --json, + and for humans we just pretty-print so they can pipe to jq or eyeball. + """ + import json as _json + + from roboflow.cli._output import output + + text = _json.dumps(payload, indent=2, default=str) + if header: + text = f"{header}\n{text}" + output(args, payload, text=text) + + +def _map_results(args): # noqa: ANN001 + from roboflow.adapters import rfapi + from roboflow.cli._output import output_error + + resolved = _resolve(args) + if not resolved: + return + workspace_url, api_key = resolved + + try: + data = rfapi.get_model_eval_map_results(api_key, workspace_url, args.eval_id) + except Exception as exc: + output_error(args, str(exc), hint=_hint_for(exc), exit_code=_eval_error_exit_code(exc)) + return + _emit_dict(args, data) + + +def _confidence_sweep(args): # noqa: ANN001 + from roboflow.adapters import rfapi + from roboflow.cli._output import output_error + + resolved = _resolve(args) + if not resolved: + return + workspace_url, api_key = resolved + + try: + data = rfapi.get_model_eval_confidence_sweep(api_key, workspace_url, args.eval_id) + except Exception as exc: + output_error(args, str(exc), hint=_hint_for(exc), exit_code=_eval_error_exit_code(exc)) + return + _emit_dict(args, data) + + +def _performance_by_class(args): # noqa: ANN001 + from roboflow.adapters import rfapi + from roboflow.cli._output import output, output_error + from roboflow.cli._table import format_table + + resolved = _resolve(args) + if not resolved: + return + workspace_url, api_key = resolved + + try: + data = rfapi.get_model_eval_performance_by_class(api_key, workspace_url, args.eval_id, split=args.split) + except Exception as exc: + output_error(args, str(exc), hint=_hint_for(exc), exit_code=_eval_error_exit_code(exc)) + return + + classes = data.get("classes", []) + rows = [] + for c in classes: + rows.append( + { + "class": c.get("className", ""), + "map50": _fmt_float(c.get("map50")), + "map50_95": _fmt_float(c.get("map50_95")), + "map75": _fmt_float(c.get("map75")), + "precision": _fmt_float(c.get("precision")), + "recall": _fmt_float(c.get("recall")), + "f1": _fmt_float(c.get("f1")), + "opt_thresh": _fmt_float(c.get("optimalThreshold")), + } + ) + table = format_table( + rows, + columns=["class", "map50", "map50_95", "map75", "precision", "recall", "f1", "opt_thresh"], + headers=["CLASS", "mAP50", "mAP50-95", "mAP75", "P", "R", "F1", "OPT_THR"], + ) + header = f"Split: {data.get('split', args.split or 'test')}" + output(args, data, text=f"{header}\n{table}") + + +def _fmt_float(value): + """Format a float to 4 decimal places for table output; pass through ``None`` as ''.""" + if value is None: + return "" + try: + return f"{float(value):.4f}" + except (TypeError, ValueError): + return str(value) + + +def _confusion_matrix(args): # noqa: ANN001 + from roboflow.adapters import rfapi + from roboflow.cli._output import output_error + + resolved = _resolve(args) + if not resolved: + return + workspace_url, api_key = resolved + + try: + data = rfapi.get_model_eval_confusion_matrix( + api_key, + workspace_url, + args.eval_id, + split=args.split, + confidence=args.confidence, + ) + except Exception as exc: + output_error(args, str(exc), hint=_hint_for(exc), exit_code=_eval_error_exit_code(exc)) + return + + header = ( + f"Split: {data.get('split', args.split or 'test')} " + f"Confidence: 
{data.get('confidenceThreshold', args.confidence or 'default')}" + ) + _emit_dict(args, data, header=header) + + +def _vector_analysis(args): # noqa: ANN001 + from roboflow.adapters import rfapi + from roboflow.cli._output import output_error + + resolved = _resolve(args) + if not resolved: + return + workspace_url, api_key = resolved + + try: + data = rfapi.get_model_eval_vector_analysis(api_key, workspace_url, args.eval_id, confidence=args.confidence) + except Exception as exc: + output_error(args, str(exc), hint=_hint_for(exc), exit_code=_eval_error_exit_code(exc)) + return + _emit_dict(args, data) + + +def _image_predictions(args): # noqa: ANN001 + from roboflow.adapters import rfapi + from roboflow.cli._output import output, output_error + from roboflow.cli._table import format_table + + resolved = _resolve(args) + if not resolved: + return + workspace_url, api_key = resolved + + try: + data = rfapi.get_model_eval_image_predictions( + api_key, + workspace_url, + args.eval_id, + split=args.split, + confidence=args.confidence, + limit=args.limit, + offset=args.offset, + ) + except Exception as exc: + output_error(args, str(exc), hint=_hint_for(exc), exit_code=_eval_error_exit_code(exc)) + return + + images = data.get("images", []) + rows = [] + for img in images: + # Stats are camelCase per the public API: + # `truePositives`/`falsePositives`/`falseNegatives` (not `tp`/`fp`/`fn`). + stats = img.get("stats") or {} + cluster = img.get("cluster") or {} + cluster_id = cluster.get("id") if isinstance(cluster, dict) else cluster + rows.append( + { + "image": img.get("imageName", img.get("imageId", "")), + "split": img.get("split", ""), + "tp": stats.get("truePositives", ""), + "fp": stats.get("falsePositives", ""), + "fn": stats.get("falseNegatives", ""), + "cluster": cluster_id if cluster_id is not None else "", + } + ) + table = format_table( + rows, + columns=["image", "split", "tp", "fp", "fn", "cluster"], + headers=["IMAGE", "SPLIT", "TP", "FP", "FN", "CLUSTER"], + ) + header = ( + f"Split: {data.get('split', args.split or 'test')} " + f"Confidence: {data.get('confidenceThreshold', args.confidence or 'default')} " + f"Total: {data.get('totalImages', len(images))} " + f"Offset: {data.get('offset', args.offset or 0)} " + f"Limit: {data.get('limit', args.limit or 200)}" + ) + output(args, data, text=f"{header}\n{table}") + + +def _recommendations(args): # noqa: ANN001 + from roboflow.adapters import rfapi + from roboflow.cli._output import output_error + + resolved = _resolve(args) + if not resolved: + return + workspace_url, api_key = resolved + + try: + data = rfapi.get_model_eval_recommendations(api_key, workspace_url, args.eval_id) + except Exception as exc: + output_error(args, str(exc), hint=_hint_for(exc), exit_code=_eval_error_exit_code(exc)) + return + _emit_dict(args, data) diff --git a/roboflow/core/model_eval.py b/roboflow/core/model_eval.py new file mode 100644 index 00000000..49479c72 --- /dev/null +++ b/roboflow/core/model_eval.py @@ -0,0 +1,169 @@ +"""Model evaluation results — wraps the public ``/model-evals`` REST surface. + +A :class:`ModelEval` is a thin lazy wrapper around a single evaluation run. +The constructor accepts the eval id (and optional cached metadata from a list +response); each panel (``map_results``, ``confusion_matrix``, etc.) is fetched +on demand and returned as the raw JSON dict the server emits. + +The shape mirrors the REST endpoints documented at +``docs.roboflow.com/api-reference/model-evaluations``. 
Errors surface as +typed :mod:`roboflow.adapters.rfapi` subclasses so callers can distinguish +"eval doesn't exist" from "eval still running" without parsing strings. +""" + +from __future__ import annotations + +from typing import Any, Dict, List, Optional + +from roboflow.adapters import rfapi + + +class ModelEval: + """A single model-evaluation run. + + Construct via :meth:`roboflow.core.workspace.Workspace.eval` or list via + :meth:`roboflow.core.workspace.Workspace.evals`. Direct construction is + supported when you already hold an eval id:: + + from roboflow.core.model_eval import ModelEval + ev = ModelEval(api_key, "lee-sandbox", "huUF720inUcymARwqAGK") + ev.refresh() # populates .status, .summary, etc. + """ + + def __init__( + self, + api_key: str, + workspace_url: str, + eval_id: str, + info: Optional[Dict[str, Any]] = None, + ) -> None: + self._api_key = api_key + self._workspace_url = workspace_url + self.id = eval_id + # Populate metadata from a cached list/get response when available; the + # caller can still refresh() to re-fetch from the server. + self._apply(info or {}) + + # -- internal ----------------------------------------------------------- + + def _apply(self, info: Dict[str, Any]) -> None: + # Server returns `evalId` (per DNA's identifier-embedding convention, + # consistent with every panel response). Accept legacy `id` for + # forward-compat with cached responses from older server versions. + if info.get("evalId"): + self.id = info["evalId"] + self.status: Optional[str] = info.get("status") + # `project` is the project URL slug — the same identifier the REST API + # uses in URL paths. The internal Firestore doc id is intentionally + # never exposed in the public API. Accept legacy `projectId` for + # forward-compat with older server versions. + self.project: Optional[str] = info.get("project") or info.get("projectId") + self.version_id: Optional[str] = info.get("versionId") + self.model_id: Optional[str] = info.get("modelId") + self.created_at: Optional[str] = info.get("createdAt") + self.summary: Optional[Dict[str, Any]] = info.get("summary") + self._raw: Dict[str, Any] = info + + # -- core --------------------------------------------------------------- + + def refresh(self) -> "ModelEval": + """Re-fetch the eval header (status, summary, …) from the server.""" + info = rfapi.get_model_eval(self._api_key, self._workspace_url, self.id) + self._apply(info) + return self + + # -- panel accessors ---------------------------------------------------- + + def map_results(self) -> Dict[str, Any]: + """Per-split mAP results (mAP50, mAP50-95, mAP75, by object size, per class).""" + return rfapi.get_model_eval_map_results(self._api_key, self._workspace_url, self.id) + + def confidence_sweep(self) -> Dict[str, Any]: + """Confidence-threshold sweep (precision/recall/F1) for the test split.""" + return rfapi.get_model_eval_confidence_sweep(self._api_key, self._workspace_url, self.id) + + def performance_by_class(self, split: Optional[str] = None) -> Dict[str, Any]: + """Per-class precision / recall / F1 / mAP for the chosen split. + + ``split`` defaults to ``"test"`` server-side. Passing ``"all"`` raises + :class:`rfapi.InvalidSplitError` — this panel does not support an + aggregate view. 
+ """ + return rfapi.get_model_eval_performance_by_class(self._api_key, self._workspace_url, self.id, split=split) + + def confusion_matrix( + self, + split: Optional[str] = None, + confidence: Optional[int] = None, + ) -> Dict[str, Any]: + """Confusion matrix (classes + matrix) for *split* at integer *confidence* (0-100).""" + return rfapi.get_model_eval_confusion_matrix( + self._api_key, self._workspace_url, self.id, split=split, confidence=confidence + ) + + def vector_analysis(self, confidence: Optional[int] = None) -> Dict[str, Any]: + """Embedding-cluster diagnostics (per-cluster sample images + metrics).""" + return rfapi.get_model_eval_vector_analysis(self._api_key, self._workspace_url, self.id, confidence=confidence) + + def image_predictions( + self, + split: Optional[str] = None, + confidence: Optional[int] = None, + limit: Optional[int] = None, + offset: Optional[int] = None, + ) -> Dict[str, Any]: + """Paginated per-image stats (TP/FP/FN counts, augmentations, cluster id).""" + return rfapi.get_model_eval_image_predictions( + self._api_key, + self._workspace_url, + self.id, + split=split, + confidence=confidence, + limit=limit, + offset=offset, + ) + + def recommendations(self) -> Dict[str, Any]: + """Server-generated suggestions for improving the model.""" + return rfapi.get_model_eval_recommendations(self._api_key, self._workspace_url, self.id) + + # -- helpers ------------------------------------------------------------ + + # Mapping (json_key, attr_name) used by `to_dict()` to round-trip a + # constructor-only ModelEval (one with no `info=` payload) back into the + # public JSON shape. Same fields the server returns at the top level of + # `modelEvals.get`, in the same order. + _PUBLIC_FIELDS = ( + ("status", "status"), + ("project", "project"), + ("versionId", "version_id"), + ("modelId", "model_id"), + ("createdAt", "created_at"), + ("summary", "summary"), + ) + + def to_dict(self) -> Dict[str, Any]: + """Return the cached eval metadata as a plain dict (evalId + last header fetch). + + When the instance was created from a server payload (the usual path — + via ``Workspace.eval`` or ``Workspace.evals``) the raw payload is + round-tripped, with ``evalId`` overlaid so legacy ``id``-keyed + responses still emit the DNA-aligned field. When the instance was + created without a payload (constructor only — ``ModelEval(key, ws, + eval_id)`` with no ``refresh()``) only the attributes the caller has + set get serialised, omitting any ``None`` fields. 
+ """ + if self._raw: + return {**self._raw, "evalId": self.id} + data: Dict[str, Any] = {"evalId": self.id} + for json_key, attr_name in self._PUBLIC_FIELDS: + value = getattr(self, attr_name, None) + if value is not None: + data[json_key] = value + return data + + def __repr__(self) -> str: # pragma: no cover - debug helper + return f"ModelEval(id={self.id!r}, status={self.status!r}, project={self.project!r})" + + +__all__: List[str] = ["ModelEval"] diff --git a/roboflow/core/workspace.py b/roboflow/core/workspace.py index 89519409..f25292e3 100644 --- a/roboflow/core/workspace.py +++ b/roboflow/core/workspace.py @@ -20,6 +20,7 @@ if TYPE_CHECKING: from roboflow.core.device import Device + from roboflow.core.model_eval import ModelEval class Workspace: @@ -1432,6 +1433,73 @@ def upload_vision_event_image( metadata=metadata, ) + # ----------------------------------------------------------------- + # Model evaluations + # ----------------------------------------------------------------- + + def evals( + self, + *, + project: Optional[str] = None, + version: Optional[str] = None, + model: Optional[str] = None, + status: Optional[str] = None, + limit: Optional[int] = None, + ) -> List["ModelEval"]: + """List model evaluations in this workspace. + + Args: + project: Filter by project slug or id. + version: Filter by version id (or numeric version). + model: Filter by model id. + status: Filter by status — one of ``"running"``, ``"done"``, ``"failed"``. + limit: Max evals to return (server caps at 200; default 50). + + Returns: + A list of :class:`ModelEval` instances pre-populated with the + metadata from the list response (``status``, ``createdAt``, etc.). + Call :meth:`ModelEval.refresh` to re-fetch the header, or any + panel method to load detailed data. + + Example: + >>> ws = rf.workspace("lee-sandbox") + >>> done = ws.evals(status="done", limit=5) + >>> for ev in done: + ... print(ev.id, ev.summary) + """ + from roboflow.core.model_eval import ModelEval + + result = rfapi.list_model_evals( + self.__api_key, + self.url, + project=project, + version=version, + model=model, + status=status, + limit=limit, + ) + # Server returns `evalId` (per DNA); fall back to legacy `id` for forward-compat. + return [ + ModelEval(self.__api_key, self.url, e.get("evalId") or e["id"], info=e) for e in result.get("evals", []) + ] + + def eval(self, eval_id: str) -> "ModelEval": + """Fetch a single model eval by id. + + Raises: + roboflow.adapters.rfapi.ModelEvalNotFoundError: If the id doesn't + exist in this workspace (HTTP 404). + + Example: + >>> ws = rf.workspace("lee-sandbox") + >>> ev = ws.eval("huUF720inUcymARwqAGK") + >>> ev.summary["mAP"] + """ + from roboflow.core.model_eval import ModelEval + + info = rfapi.get_model_eval(self.__api_key, self.url, eval_id) + return ModelEval(self.__api_key, self.url, info.get("id", eval_id), info=info) + def trash(self) -> dict: """ List items currently in the workspace Trash. 
diff --git a/tests/adapters/test_rfapi_model_evals.py b/tests/adapters/test_rfapi_model_evals.py new file mode 100644 index 00000000..41cc6942 --- /dev/null +++ b/tests/adapters/test_rfapi_model_evals.py @@ -0,0 +1,224 @@ +"""Unit tests for the model-eval rfapi helpers (`/{ws}/model-evals/...`).""" + +from __future__ import annotations + +import unittest +from unittest.mock import MagicMock, patch + +from roboflow.adapters import rfapi +from roboflow.config import API_URL + + +def _resp(status: int, body): + """Build a mock requests.Response double for the given status + JSON body.""" + mock = MagicMock(status_code=status) + mock.json.return_value = body + mock.text = repr(body) + return mock + + +class TestListModelEvals(unittest.TestCase): + @patch("roboflow.adapters.rfapi.requests.get") + def test_success_no_filters(self, mock_get): + mock_get.return_value = _resp(200, {"evals": [{"id": "e1", "status": "done"}]}) + + result = rfapi.list_model_evals("k", "ws") + + self.assertEqual(result, {"evals": [{"id": "e1", "status": "done"}]}) + url = mock_get.call_args[0][0] + params = mock_get.call_args.kwargs["params"] + self.assertEqual(url, f"{API_URL}/ws/model-evals") + self.assertEqual(params, {"api_key": "k"}) + + @patch("roboflow.adapters.rfapi.requests.get") + def test_success_with_filters(self, mock_get): + mock_get.return_value = _resp(200, {"evals": []}) + + rfapi.list_model_evals("k", "ws", project="p1", version=3, model="m1", status="done", limit=10) + + params = mock_get.call_args.kwargs["params"] + self.assertEqual( + params, + { + "api_key": "k", + "project": "p1", + "version": 3, + "model": "m1", + "status": "done", + "limit": 10, + }, + ) + + @patch("roboflow.adapters.rfapi.requests.get") + def test_omits_none_filters(self, mock_get): + mock_get.return_value = _resp(200, {"evals": []}) + + rfapi.list_model_evals("k", "ws", status="done", limit=None) + + params = mock_get.call_args.kwargs["params"] + self.assertNotIn("limit", params) + self.assertEqual(params["status"], "done") + + @patch("roboflow.adapters.rfapi.requests.get") + def test_404_raises_not_found(self, mock_get): + mock_get.return_value = _resp(404, {"error": "model_eval_not_found", "message": "nope"}) + + with self.assertRaises(rfapi.ModelEvalNotFoundError) as ctx: + rfapi.list_model_evals("k", "ws") + self.assertIn("nope", str(ctx.exception)) + + +class TestGetModelEval(unittest.TestCase): + @patch("roboflow.adapters.rfapi.requests.get") + def test_success(self, mock_get): + mock_get.return_value = _resp(200, {"id": "e1", "status": "done", "summary": {"mAP": 0.9}}) + + result = rfapi.get_model_eval("k", "ws", "e1") + + self.assertEqual(result["summary"]["mAP"], 0.9) + url = mock_get.call_args[0][0] + self.assertEqual(url, f"{API_URL}/ws/model-evals/e1") + + +class TestPanelEndpoints(unittest.TestCase): + """Each panel endpoint forwards path + params correctly.""" + + @patch("roboflow.adapters.rfapi.requests.get") + def test_map_results_url(self, mock_get): + mock_get.return_value = _resp(200, {"splits": {}}) + + rfapi.get_model_eval_map_results("k", "ws", "e1") + + url = mock_get.call_args[0][0] + self.assertEqual(url, f"{API_URL}/ws/model-evals/e1/map-results") + + @patch("roboflow.adapters.rfapi.requests.get") + def test_confidence_sweep_url(self, mock_get): + mock_get.return_value = _resp(200, {"splits": {}}) + + rfapi.get_model_eval_confidence_sweep("k", "ws", "e1") + + url = mock_get.call_args[0][0] + self.assertEqual(url, f"{API_URL}/ws/model-evals/e1/confidence-sweep") + + 
@patch("roboflow.adapters.rfapi.requests.get") + def test_performance_by_class_passes_split(self, mock_get): + mock_get.return_value = _resp(200, {"split": "valid", "classes": []}) + + rfapi.get_model_eval_performance_by_class("k", "ws", "e1", split="valid") + + params = mock_get.call_args.kwargs["params"] + self.assertEqual(params["split"], "valid") + + @patch("roboflow.adapters.rfapi.requests.get") + def test_confusion_matrix_passes_params(self, mock_get): + mock_get.return_value = _resp(200, {"matrix": []}) + + rfapi.get_model_eval_confusion_matrix("k", "ws", "e1", split="test", confidence=30) + + params = mock_get.call_args.kwargs["params"] + self.assertEqual(params["split"], "test") + self.assertEqual(params["confidence"], 30) + + @patch("roboflow.adapters.rfapi.requests.get") + def test_image_predictions_pagination(self, mock_get): + mock_get.return_value = _resp(200, {"images": []}) + + rfapi.get_model_eval_image_predictions("k", "ws", "e1", split="test", confidence=20, limit=50, offset=100) + + params = mock_get.call_args.kwargs["params"] + self.assertEqual(params["limit"], 50) + self.assertEqual(params["offset"], 100) + + @patch("roboflow.adapters.rfapi.requests.get") + def test_recommendations_url(self, mock_get): + mock_get.return_value = _resp(200, {"recommendations": []}) + + rfapi.get_model_eval_recommendations("k", "ws", "e1") + + url = mock_get.call_args[0][0] + self.assertEqual(url, f"{API_URL}/ws/model-evals/e1/recommendations") + + @patch("roboflow.adapters.rfapi.requests.get") + def test_vector_analysis_passes_confidence(self, mock_get): + mock_get.return_value = _resp(200, {"clusters": []}) + + rfapi.get_model_eval_vector_analysis("k", "ws", "e1", confidence=25) + + params = mock_get.call_args.kwargs["params"] + self.assertEqual(params["confidence"], 25) + + +class TestErrorMapping(unittest.TestCase): + """Typed errors are routed to the right exception subclass.""" + + @patch("roboflow.adapters.rfapi.requests.get") + def test_404_flat_envelope(self, mock_get): + # Server returns the flat shape: {"error": "code", "message": "..."} + mock_get.return_value = _resp(404, {"error": "model_eval_not_found", "message": "Eval 'x' not found"}) + + with self.assertRaises(rfapi.ModelEvalNotFoundError) as ctx: + rfapi.get_model_eval("k", "ws", "x") + self.assertIn("Eval 'x' not found", str(ctx.exception)) + + @patch("roboflow.adapters.rfapi.requests.get") + def test_404_status_code_fallback(self, mock_get): + # No `error` field at all — fall back to the status code mapping. 
+ mock_get.return_value = _resp(404, {"message": "something went wrong"}) + + with self.assertRaises(rfapi.ModelEvalNotFoundError): + rfapi.get_model_eval("k", "ws", "x") + + @patch("roboflow.adapters.rfapi.requests.get") + def test_409_not_done(self, mock_get): + mock_get.return_value = _resp(409, {"error": "model_eval_not_done", "message": "Eval still running"}) + + with self.assertRaises(rfapi.ModelEvalNotDoneError): + rfapi.get_model_eval_map_results("k", "ws", "x") + + @patch("roboflow.adapters.rfapi.requests.get") + def test_400_invalid_split(self, mock_get): + mock_get.return_value = _resp(400, {"error": "invalid_split", "message": "Invalid split"}) + + with self.assertRaises(rfapi.InvalidSplitError): + rfapi.get_model_eval_performance_by_class("k", "ws", "x", split="all") + + @patch("roboflow.adapters.rfapi.requests.get") + def test_400_invalid_confidence(self, mock_get): + mock_get.return_value = _resp(400, {"error": "invalid_confidence", "message": "out of range"}) + + with self.assertRaises(rfapi.InvalidConfidenceError): + rfapi.get_model_eval_confusion_matrix("k", "ws", "x", confidence=200) + + @patch("roboflow.adapters.rfapi.requests.get") + def test_unknown_404_falls_back_to_not_found(self, mock_get): + # 404 without a recognised code still maps by status code (forward-compat). + mock_get.return_value = _resp(404, {"error": "some_new_code", "message": "?"}) + + with self.assertRaises(rfapi.ModelEvalNotFoundError): + rfapi.get_model_eval("k", "ws", "x") + + @patch("roboflow.adapters.rfapi.requests.get") + def test_unknown_500_raises_generic_roboflow_error(self, mock_get): + mock_get.return_value = _resp(500, {"error": "server_oops", "message": "boom"}) + + with self.assertRaises(rfapi.RoboflowError) as ctx: + rfapi.get_model_eval("k", "ws", "x") + # Not one of the typed subclasses + self.assertNotIsInstance(ctx.exception, rfapi.ModelEvalNotFoundError) + self.assertNotIsInstance(ctx.exception, rfapi.ModelEvalNotDoneError) + + @patch("roboflow.adapters.rfapi.requests.get") + def test_non_json_body_falls_back_to_text(self, mock_get): + # Some misbehaving proxies return HTML 502s — make sure we don't crash. 
+ bad = MagicMock(status_code=502, text="Bad Gateway") + bad.json.side_effect = ValueError("not JSON") + mock_get.return_value = bad + + with self.assertRaises(rfapi.RoboflowError) as ctx: + rfapi.get_model_eval("k", "ws", "x") + self.assertIn("Bad Gateway", str(ctx.exception)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/cli/test_eval_handler.py b/tests/cli/test_eval_handler.py new file mode 100644 index 00000000..0d4a4a30 --- /dev/null +++ b/tests/cli/test_eval_handler.py @@ -0,0 +1,413 @@ +"""Tests for the model-eval CLI handler (`roboflow eval ...`).""" + +from __future__ import annotations + +import json +import unittest +from argparse import Namespace +from unittest.mock import patch + +from typer.testing import CliRunner + +from roboflow.cli import app + +runner = CliRunner() + + +# --------------------------------------------------------------------------- +# Registration / discoverability +# --------------------------------------------------------------------------- + + +class TestEvalRegistration(unittest.TestCase): + """`roboflow eval ...` subcommands are registered with valid --help.""" + + def test_eval_app_exists(self) -> None: + from roboflow.cli.handlers.eval import eval_app + + self.assertIsNotNone(eval_app) + + def test_eval_root_help(self) -> None: + result = runner.invoke(app, ["eval", "--help"]) + self.assertEqual(result.exit_code, 0) + + def test_each_subcommand_help(self) -> None: + for cmd in [ + "list", + "get", + "map-results", + "confidence-sweep", + "performance-by-class", + "confusion-matrix", + "vector-analysis", + "image-predictions", + "recommendations", + ]: + with self.subTest(cmd=cmd): + result = runner.invoke(app, ["eval", cmd, "--help"]) + self.assertEqual(result.exit_code, 0, f"{cmd} --help failed: {result.output}") + + +# --------------------------------------------------------------------------- +# Helpers — every test patches the workspace + key resolver so no IO happens. 
+# --------------------------------------------------------------------------- + + +def _args(**overrides): + """Build a Namespace matching what ctx_to_args produces, with sane defaults.""" + base = {"json": False, "workspace": "lee-sandbox", "api_key": None, "quiet": False} + base.update(overrides) + return Namespace(**base) + + +# --------------------------------------------------------------------------- +# `eval list` +# --------------------------------------------------------------------------- + + +class TestEvalListHandler(unittest.TestCase): + @patch("roboflow.adapters.rfapi.list_model_evals") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_list_text_calls_adapter_with_filters(self, _key, _ws, mock_list): + mock_list.return_value = { + "evals": [ + { + "id": "e1", + "status": "done", + "project": "my-project-slug", + "versionId": "3", + "modelId": None, + "createdAt": "2025-01-01", + } + ] + } + args = _args( + workspace=None, + project="my-project-slug", + version="3", + model=None, + status="done", + limit=5, + ) + + from roboflow.cli.handlers.eval import _list_evals + + with patch("builtins.print") as mock_print: + _list_evals(args) + + mock_list.assert_called_once_with( + "key", + "lee-sandbox", + project="my-project-slug", + version="3", + model=None, + status="done", + limit=5, + ) + printed = mock_print.call_args[0][0] + self.assertIn("e1", printed) + self.assertIn("done", printed) + + @patch("roboflow.adapters.rfapi.list_model_evals") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_list_json_emits_evals_array(self, _key, _ws, mock_list): + mock_list.return_value = {"evals": [{"id": "e1", "status": "done"}]} + args = _args(workspace=None, json=True, project=None, version=None, model=None, status=None, limit=None) + + from roboflow.cli.handlers.eval import _list_evals + + with patch("builtins.print") as mock_print: + _list_evals(args) + + printed = mock_print.call_args[0][0] + data = json.loads(printed) + self.assertEqual(data, [{"id": "e1", "status": "done"}]) + + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value=None) + def test_list_no_workspace_exits_2(self, _ws): + args = _args(workspace=None, project=None, version=None, model=None, status=None, limit=None) + + from roboflow.cli.handlers.eval import _list_evals + + with self.assertRaises(SystemExit) as ctx: + _list_evals(args) + self.assertEqual(ctx.exception.code, 2) + + +# --------------------------------------------------------------------------- +# `eval get` +# --------------------------------------------------------------------------- + + +class TestEvalGetHandler(unittest.TestCase): + @patch("roboflow.adapters.rfapi.get_model_eval") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_get_text(self, _key, _ws, mock_get): + mock_get.return_value = { + "id": "e1", + "status": "done", + "project": "my-project-slug", + "versionId": "3", + "modelId": "m1", + "createdAt": "2025-01-01", + "summary": {"mAP": 0.91, "precision": 0.85, "recall": 0.8}, + } + args = _args(workspace=None, eval_id="e1") + + from roboflow.cli.handlers.eval import _get_eval + + with patch("builtins.print") as mock_print: + _get_eval(args) + + printed = 
mock_print.call_args[0][0] + self.assertIn("Eval: e1", printed) + self.assertIn("Status: done", printed) + self.assertIn("mAP=0.91", printed) + mock_get.assert_called_once_with("key", "lee-sandbox", "e1") + + @patch("roboflow.adapters.rfapi.get_model_eval") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_get_404_exits_3(self, _key, _ws, mock_get): + from roboflow.adapters import rfapi + + mock_get.side_effect = rfapi.ModelEvalNotFoundError("not found") + args = _args(workspace=None, eval_id="bad") + + from roboflow.cli.handlers.eval import _get_eval + + with self.assertRaises(SystemExit) as ctx: + _get_eval(args) + self.assertEqual(ctx.exception.code, 3) + + +# --------------------------------------------------------------------------- +# Per-panel handlers — each forwards args to the right adapter function. +# --------------------------------------------------------------------------- + + +class TestPanelHandlers(unittest.TestCase): + @patch("roboflow.adapters.rfapi.get_model_eval_map_results") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_map_results_calls_adapter(self, _key, _ws, mock_fn): + mock_fn.return_value = {"splits": {"test": {"map50": 0.9}}} + args = _args(workspace=None, eval_id="e1") + + from roboflow.cli.handlers.eval import _map_results + + with patch("builtins.print"): + _map_results(args) + mock_fn.assert_called_once_with("key", "lee-sandbox", "e1") + + @patch("roboflow.adapters.rfapi.get_model_eval_confidence_sweep") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_confidence_sweep_calls_adapter(self, _key, _ws, mock_fn): + mock_fn.return_value = {"splits": {}} + args = _args(workspace=None, eval_id="e1") + + from roboflow.cli.handlers.eval import _confidence_sweep + + with patch("builtins.print"): + _confidence_sweep(args) + mock_fn.assert_called_once_with("key", "lee-sandbox", "e1") + + @patch("roboflow.adapters.rfapi.get_model_eval_performance_by_class") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_performance_by_class_passes_split(self, _key, _ws, mock_fn): + mock_fn.return_value = {"split": "valid", "classes": [{"className": "car", "map50": 0.9}]} + args = _args(workspace=None, eval_id="e1", split="valid") + + from roboflow.cli.handlers.eval import _performance_by_class + + with patch("builtins.print"): + _performance_by_class(args) + mock_fn.assert_called_once_with("key", "lee-sandbox", "e1", split="valid") + + @patch("roboflow.adapters.rfapi.get_model_eval_performance_by_class") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_performance_by_class_invalid_split_exits_5(self, _key, _ws, mock_fn): + from roboflow.adapters import rfapi + + mock_fn.side_effect = rfapi.InvalidSplitError("no") + args = _args(workspace=None, eval_id="e1", split="all") + + from roboflow.cli.handlers.eval import _performance_by_class + + with self.assertRaises(SystemExit) as ctx: + _performance_by_class(args) + self.assertEqual(ctx.exception.code, 5) + + 
@patch("roboflow.adapters.rfapi.get_model_eval_confusion_matrix") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_confusion_matrix_passes_args(self, _key, _ws, mock_fn): + mock_fn.return_value = {"matrix": []} + args = _args(workspace=None, eval_id="e1", split="test", confidence=30) + + from roboflow.cli.handlers.eval import _confusion_matrix + + with patch("builtins.print"): + _confusion_matrix(args) + mock_fn.assert_called_once_with("key", "lee-sandbox", "e1", split="test", confidence=30) + + @patch("roboflow.adapters.rfapi.get_model_eval_confusion_matrix") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_confusion_matrix_invalid_confidence_exits_5(self, _key, _ws, mock_fn): + from roboflow.adapters import rfapi + + mock_fn.side_effect = rfapi.InvalidConfidenceError("bad") + args = _args(workspace=None, eval_id="e1", split=None, confidence=999) + + from roboflow.cli.handlers.eval import _confusion_matrix + + with self.assertRaises(SystemExit) as ctx: + _confusion_matrix(args) + self.assertEqual(ctx.exception.code, 5) + + @patch("roboflow.adapters.rfapi.get_model_eval_vector_analysis") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_vector_analysis_calls_adapter(self, _key, _ws, mock_fn): + mock_fn.return_value = {"clusters": []} + args = _args(workspace=None, eval_id="e1", confidence=20) + + from roboflow.cli.handlers.eval import _vector_analysis + + with patch("builtins.print"): + _vector_analysis(args) + mock_fn.assert_called_once_with("key", "lee-sandbox", "e1", confidence=20) + + @patch("roboflow.adapters.rfapi.get_model_eval_image_predictions") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_image_predictions_pagination(self, _key, _ws, mock_fn): + mock_fn.return_value = { + "split": "test", + "confidenceThreshold": 30, + "totalImages": 100, + "offset": 50, + "limit": 10, + "images": [{"imageId": "i1", "imageName": "a.jpg", "split": "test", "stats": {}}], + } + args = _args(workspace=None, eval_id="e1", split="test", confidence=30, limit=10, offset=50) + + from roboflow.cli.handlers.eval import _image_predictions + + with patch("builtins.print") as mock_print: + _image_predictions(args) + mock_fn.assert_called_once_with("key", "lee-sandbox", "e1", split="test", confidence=30, limit=10, offset=50) + printed = mock_print.call_args[0][0] + self.assertIn("a.jpg", printed) + + @patch("roboflow.adapters.rfapi.get_model_eval_image_predictions") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_image_predictions_table_renders_TP_FP_FN_from_camelCase_stats(self, _key, _ws, mock_fn): + # Regression: the public API nests counts under `stats` with camelCase + # keys (`truePositives`/`falsePositives`/`falseNegatives`). Earlier code + # read `stats.tp`/`stats.fp`/`stats.fn` and silently rendered blanks. 
+        mock_fn.return_value = {
+            "split": "test",
+            "confidenceThreshold": 0.2,
+            "totalImages": 1,
+            "offset": 0,
+            "limit": 1,
+            "images": [
+                {
+                    "imageId": "i1",
+                    "imageName": "abc.jpg",
+                    "split": "test",
+                    "augmentations": 2,
+                    "stats": {
+                        "truePositives": 7,
+                        "falsePositives": 2,
+                        "falseNegatives": 1,
+                        "precision": 0.78,
+                        "recall": 0.875,
+                        "f1": 0.824,
+                    },
+                    # The cluster column previously stringified the whole dict;
+                    # we only want the cluster id rendered.
+                    "cluster": {"id": 4, "embedding2D": [1.5, -3.2]},
+                }
+            ],
+        }
+        args = _args(workspace=None, eval_id="e1", split="test", confidence=None, limit=None, offset=None)
+
+        from roboflow.cli.handlers.eval import _image_predictions
+
+        with patch("builtins.print") as mock_print:
+            _image_predictions(args)
+        printed = mock_print.call_args[0][0]
+        # TP/FP/FN counts must appear in the rendered table.
+        self.assertIn("7", printed)  # truePositives
+        self.assertIn("2", printed)  # falsePositives is 2 (augmentations is also 2, so "2" must appear either way)
+        self.assertIn("1", printed)  # falseNegatives
+        # Cluster rendered as the bare id, not the embedding-bearing dict.
+        self.assertIn(" 4 ", printed)
+        self.assertNotIn("embedding2D", printed)
+
+    @patch("roboflow.adapters.rfapi.get_model_eval_recommendations")
+    @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox")
+    @patch("roboflow.config.load_roboflow_api_key", return_value="key")
+    def test_recommendations_calls_adapter(self, _key, _ws, mock_fn):
+        mock_fn.return_value = {"generated": False}
+        args = _args(workspace=None, eval_id="e1")
+
+        from roboflow.cli.handlers.eval import _recommendations
+
+        with patch("builtins.print"):
+            _recommendations(args)
+        mock_fn.assert_called_once_with("key", "lee-sandbox", "e1")
+
+    @patch("roboflow.adapters.rfapi.get_model_eval_map_results")
+    @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox")
+    @patch("roboflow.config.load_roboflow_api_key", return_value="key")
+    def test_panel_409_not_done_exits_4(self, _key, _ws, mock_fn):
+        from roboflow.adapters import rfapi
+
+        mock_fn.side_effect = rfapi.ModelEvalNotDoneError("running")
+        args = _args(workspace=None, eval_id="e1")
+
+        from roboflow.cli.handlers.eval import _map_results
+
+        with self.assertRaises(SystemExit) as ctx:
+            _map_results(args)
+        self.assertEqual(ctx.exception.code, 4)
+
+
+# ---------------------------------------------------------------------------
+# Exit-code mapping helper
+# ---------------------------------------------------------------------------
+
+
+class TestExitCodeMapping(unittest.TestCase):
+    """The handler distinguishes 404/409/400 to give shell scripts useful exit codes."""
+
+    def test_exit_codes(self) -> None:
+        from roboflow.adapters import rfapi
+        from roboflow.cli.handlers.eval import _eval_error_exit_code
+
+        cases = {
+            rfapi.ModelEvalNotFoundError("x"): 3,
+            rfapi.ModelEvalNotDoneError("x"): 4,
+            rfapi.InvalidSplitError("x"): 5,
+            rfapi.InvalidConfidenceError("x"): 5,
+            rfapi.RoboflowError("x"): 1,
+            ValueError("x"): 1,
+        }
+        for exc, expected in cases.items():
+            with self.subTest(exc=type(exc).__name__):
+                self.assertEqual(_eval_error_exit_code(exc), expected)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_model_eval.py b/tests/test_model_eval.py
new file mode 100644
index 00000000..40072bab
--- /dev/null
+++ b/tests/test_model_eval.py
@@ -0,0 +1,318 @@
+"""Unit tests for the ModelEval SDK class and Workspace.evals/eval accessors."""
+
+from __future__ import annotations
+
+import unittest
+from unittest.mock import patch
+
+
+def _make_workspace(api_key="k", url="lee-sandbox"):
+    """Build a Workspace with the minimal info dict its constructor accepts."""
+    from roboflow.core.workspace import Workspace
+
+    info = {
+        "workspace": {
+            "name": "Test",
+            "url": url,
+            "projects": [],
+            "members": [],
+        }
+    }
+    return Workspace(info, api_key=api_key, default_workspace=url, model_format="yolov8")
+
+
+class TestModelEvalConstruction(unittest.TestCase):
+    def test_apply_info_populates_attributes(self):
+        from roboflow.core.model_eval import ModelEval
+
+        info = {
+            "evalId": "e1",
+            "status": "done",
+            "project": "my-project-slug",  # URL slug — the public API only returns the slug
+            "versionId": "3",
+            "modelId": "m1",
+            "createdAt": "2025-01-01",
+            "summary": {"mAP": 0.9, "precision": 0.8, "recall": 0.85},
+        }
+        ev = ModelEval("k", "ws", "e1", info=info)
+
+        self.assertEqual(ev.id, "e1")
+        self.assertEqual(ev.status, "done")
+        self.assertEqual(ev.project, "my-project-slug")
+        self.assertEqual(ev.version_id, "3")
+        self.assertEqual(ev.model_id, "m1")
+        self.assertEqual(ev.created_at, "2025-01-01")
+        self.assertEqual(ev.summary["mAP"], 0.9)
+
+    def test_construction_without_info(self):
+        from roboflow.core.model_eval import ModelEval
+
+        ev = ModelEval("k", "ws", "e1")
+        self.assertEqual(ev.id, "e1")
+        self.assertIsNone(ev.status)
+        self.assertIsNone(ev.summary)
+
+
+class TestModelEvalToDict(unittest.TestCase):
+    """`to_dict()` has two branches: with-payload (round-trip the server response)
+    and without-payload (rebuild from attributes). Both are pinned down here."""
+
+    def test_to_dict_round_trips_raw_payload_with_evalId_overlay(self):
+        from roboflow.core.model_eval import ModelEval
+
+        # Server-payload path: the raw response is round-tripped (including any
+        # extra keys we don't surface as attrs), with `evalId` overlaid so legacy
+        # `id`-keyed responses still emit the canonical `evalId` field.
+        info = {
+            "evalId": "e1",
+            "status": "done",
+            "project": "my-project-slug",
+            "versionId": "3",
+            "modelId": "m1",
+            "createdAt": "2025-01-01",
+            "summary": {"mAP": 0.9, "precision": 0.8, "recall": 0.85},
+            "extraField": "preserved-by-roundtrip",
+        }
+        ev = ModelEval("k", "ws", "e1", info=info)
+        d = ev.to_dict()
+        # Round-trip preserves every server-side field, including ones we don't
+        # surface as attributes.
+        self.assertEqual(d["extraField"], "preserved-by-roundtrip")
+        self.assertEqual(d["project"], "my-project-slug")
+        self.assertEqual(d["evalId"], "e1")
+        self.assertEqual(d["summary"]["mAP"], 0.9)
+
+    def test_to_dict_overlays_evalId_when_payload_used_legacy_id_key(self):
+        from roboflow.core.model_eval import ModelEval
+
+        # Older server versions returned `id` instead of `evalId`. The SDK accepts
+        # both on the way in; on the way out it always emits `evalId`.
+        info = {"id": "e1-legacy", "status": "done", "project": "p"}
+        ev = ModelEval("k", "ws", "e1-legacy", info=info)
+        d = ev.to_dict()
+        self.assertEqual(d["evalId"], "e1-legacy")
+
+    def test_to_dict_no_info_serialises_attrs_only_omitting_None(self):
+        from roboflow.core.model_eval import ModelEval
+
+        # Constructor-only path (no `info=` payload, no `refresh()` call).
+        # Only attributes the caller sets get serialised; everything else is
+        # omitted rather than serialised as `null`.
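+        # Roughly (assumed shape of this branch, inferred from the assertions
+        # in this class — not the actual implementation):
+        #
+        #   out = {"evalId": self.id}
+        #   for attr, key in [("status", "status"), ("project", "project"),
+        #                     ("version_id", "versionId"), ("model_id", "modelId"),
+        #                     ("created_at", "createdAt"), ("summary", "summary")]:
+        #       if getattr(self, attr) is not None:
+        #           out[key] = getattr(self, attr)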
+        ev = ModelEval("k", "ws", "e1")
+        d = ev.to_dict()
+        self.assertEqual(d, {"evalId": "e1"})
+
+    def test_to_dict_no_info_translates_attr_names_back_to_json_keys(self):
+        from roboflow.core.model_eval import ModelEval
+
+        # Hand-construct an instance without an info payload, then mutate
+        # attributes (the way a user might before serialising for logging /
+        # comparison). `to_dict` should emit the JSON-side names, not the
+        # snake_case Python attr names.
+        ev = ModelEval("k", "ws", "e1")
+        ev.status = "done"
+        ev.project = "p"
+        ev.version_id = "3"
+        ev.model_id = "m1"
+        ev.created_at = "2025-01-01"
+        ev.summary = {"mAP": 0.9}
+        d = ev.to_dict()
+        self.assertEqual(
+            d,
+            {
+                "evalId": "e1",
+                "status": "done",
+                "project": "p",
+                "versionId": "3",  # not version_id
+                "modelId": "m1",  # not model_id
+                "createdAt": "2025-01-01",  # not created_at
+                "summary": {"mAP": 0.9},
+            },
+        )
+
+
+class TestModelEvalRefresh(unittest.TestCase):
+    @patch("roboflow.adapters.rfapi.get_model_eval")
+    def test_refresh_updates_status_and_summary(self, mock_get):
+        from roboflow.core.model_eval import ModelEval
+
+        mock_get.return_value = {
+            "evalId": "e1",
+            "status": "done",
+            "summary": {"mAP": 0.95},
+        }
+        ev = ModelEval("k", "ws", "e1")
+        result = ev.refresh()
+
+        self.assertIs(result, ev)  # chainable
+        self.assertEqual(ev.status, "done")
+        self.assertEqual(ev.summary["mAP"], 0.95)
+        mock_get.assert_called_once_with("k", "ws", "e1")
+
+
+class TestModelEvalPanelAccessors(unittest.TestCase):
+    """Each panel method delegates to the matching rfapi function with the right args."""
+
+    @patch("roboflow.adapters.rfapi.get_model_eval_map_results")
+    def test_map_results(self, mock_fn):
+        from roboflow.core.model_eval import ModelEval
+
+        mock_fn.return_value = {"splits": {}}
+        ev = ModelEval("k", "ws", "e1")
+        result = ev.map_results()
+
+        self.assertEqual(result, {"splits": {}})
+        mock_fn.assert_called_once_with("k", "ws", "e1")
+
+    @patch("roboflow.adapters.rfapi.get_model_eval_confidence_sweep")
+    def test_confidence_sweep(self, mock_fn):
+        from roboflow.core.model_eval import ModelEval
+
+        mock_fn.return_value = {"splits": {}}
+        ModelEval("k", "ws", "e1").confidence_sweep()
+
+        mock_fn.assert_called_once_with("k", "ws", "e1")
+
+    @patch("roboflow.adapters.rfapi.get_model_eval_performance_by_class")
+    def test_performance_by_class_default_split(self, mock_fn):
+        from roboflow.core.model_eval import ModelEval
+
+        mock_fn.return_value = {"classes": []}
+        ModelEval("k", "ws", "e1").performance_by_class()
+        mock_fn.assert_called_once_with("k", "ws", "e1", split=None)
+
+    @patch("roboflow.adapters.rfapi.get_model_eval_performance_by_class")
+    def test_performance_by_class_with_split(self, mock_fn):
+        from roboflow.core.model_eval import ModelEval
+
+        mock_fn.return_value = {"classes": []}
+        ModelEval("k", "ws", "e1").performance_by_class(split="valid")
+        mock_fn.assert_called_once_with("k", "ws", "e1", split="valid")
+
+    @patch("roboflow.adapters.rfapi.get_model_eval_confusion_matrix")
+    def test_confusion_matrix(self, mock_fn):
+        from roboflow.core.model_eval import ModelEval
+
+        mock_fn.return_value = {"matrix": []}
+        ModelEval("k", "ws", "e1").confusion_matrix(split="test", confidence=30)
+        mock_fn.assert_called_once_with("k", "ws", "e1", split="test", confidence=30)
+
+    @patch("roboflow.adapters.rfapi.get_model_eval_vector_analysis")
+    def test_vector_analysis(self, mock_fn):
+        from roboflow.core.model_eval import ModelEval
+
+        mock_fn.return_value = {"clusters": []}
+        ModelEval("k", "ws", "e1").vector_analysis(confidence=40)
"e1").vector_analysis(confidence=40) + mock_fn.assert_called_once_with("k", "ws", "e1", confidence=40) + + @patch("roboflow.adapters.rfapi.get_model_eval_image_predictions") + def test_image_predictions(self, mock_fn): + from roboflow.core.model_eval import ModelEval + + mock_fn.return_value = {"images": []} + ModelEval("k", "ws", "e1").image_predictions(split="valid", confidence=20, limit=50, offset=100) + mock_fn.assert_called_once_with("k", "ws", "e1", split="valid", confidence=20, limit=50, offset=100) + + @patch("roboflow.adapters.rfapi.get_model_eval_recommendations") + def test_recommendations(self, mock_fn): + from roboflow.core.model_eval import ModelEval + + mock_fn.return_value = {"recommendations": []} + ModelEval("k", "ws", "e1").recommendations() + mock_fn.assert_called_once_with("k", "ws", "e1") + + +class TestModelEvalErrors(unittest.TestCase): + """Typed errors from the adapter propagate through the SDK accessors.""" + + @patch("roboflow.adapters.rfapi.get_model_eval_map_results") + def test_not_done_error_propagates(self, mock_fn): + from roboflow.adapters import rfapi + from roboflow.core.model_eval import ModelEval + + mock_fn.side_effect = rfapi.ModelEvalNotDoneError("Eval still running") + ev = ModelEval("k", "ws", "e1") + with self.assertRaises(rfapi.ModelEvalNotDoneError): + ev.map_results() + + @patch("roboflow.adapters.rfapi.get_model_eval") + def test_refresh_404_propagates(self, mock_fn): + from roboflow.adapters import rfapi + from roboflow.core.model_eval import ModelEval + + mock_fn.side_effect = rfapi.ModelEvalNotFoundError("nope") + with self.assertRaises(rfapi.ModelEvalNotFoundError): + ModelEval("k", "ws", "e1").refresh() + + +class TestWorkspaceEvalAccessors(unittest.TestCase): + @patch("roboflow.adapters.rfapi.list_model_evals") + def test_evals_returns_modeleval_instances(self, mock_list): + from roboflow.core.model_eval import ModelEval + + mock_list.return_value = { + "evals": [ + {"evalId": "e1", "status": "done", "project": "my-project-slug"}, + {"evalId": "e2", "status": "running", "project": "my-project-slug"}, + ] + } + ws = _make_workspace() + result = ws.evals(status="done", limit=5) + + self.assertEqual(len(result), 2) + self.assertTrue(all(isinstance(e, ModelEval) for e in result)) + self.assertEqual(result[0].id, "e1") + self.assertEqual(result[0].status, "done") + self.assertEqual(result[1].id, "e2") + # Workspace forwards filters to the adapter + mock_list.assert_called_once_with( + "k", "lee-sandbox", project=None, version=None, model=None, status="done", limit=5 + ) + + @patch("roboflow.adapters.rfapi.list_model_evals") + def test_evals_passes_all_filters(self, mock_list): + mock_list.return_value = {"evals": []} + + ws = _make_workspace() + ws.evals(project="p1", version="3", model="m1", status="failed", limit=200) + + mock_list.assert_called_once_with( + "k", "lee-sandbox", project="p1", version="3", model="m1", status="failed", limit=200 + ) + + @patch("roboflow.adapters.rfapi.list_model_evals") + def test_evals_empty_list(self, mock_list): + mock_list.return_value = {"evals": []} + ws = _make_workspace() + self.assertEqual(ws.evals(), []) + + @patch("roboflow.adapters.rfapi.get_model_eval") + def test_eval_returns_populated_modeleval(self, mock_get): + from roboflow.core.model_eval import ModelEval + + mock_get.return_value = { + "evalId": "e1", + "status": "done", + "summary": {"mAP": 0.91}, + } + ws = _make_workspace() + ev = ws.eval("e1") + + self.assertIsInstance(ev, ModelEval) + self.assertEqual(ev.id, "e1") + 
+        self.assertEqual(ev.status, "done")
+        self.assertEqual(ev.summary["mAP"], 0.91)
+        mock_get.assert_called_once_with("k", "lee-sandbox", "e1")
+
+    @patch("roboflow.adapters.rfapi.get_model_eval")
+    def test_eval_propagates_not_found(self, mock_get):
+        from roboflow.adapters import rfapi
+
+        mock_get.side_effect = rfapi.ModelEvalNotFoundError("nope")
+        ws = _make_workspace()
+        with self.assertRaises(rfapi.ModelEvalNotFoundError):
+            ws.eval("bad")
+
+
+if __name__ == "__main__":
+    unittest.main()
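+
+# For orientation, the surface covered above is meant to be driven roughly
+# like this (illustrative sketch only — assumes a valid API key and an eval
+# whose status is `done`):
+#
+#   from roboflow import Roboflow
+#
+#   ws = Roboflow(api_key="...").workspace()
+#   for ev in ws.evals(status="done", limit=5):
+#       print(ev.id, ev.summary)
+#   matrix = ws.eval("<eval-id>").confusion_matrix(split="test", confidence=30)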