From e34963b6cc9aa74034e143d4116055728dcf9c94 Mon Sep 17 00:00:00 2001 From: Lee Clement Date: Thu, 7 May 2026 10:01:20 -0400 Subject: [PATCH 1/5] Add model evaluations SDK and CLI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wraps the public /{workspace}/model-evals REST surface (roboflow/roboflow#11636) so users can read evaluation results — mAP, confidence sweep, per-class performance, confusion matrix, vector clusters, per-image stats, and recommendations — from Python and from the CLI without hitting the API directly. SDK: - Workspace.evals(...) and Workspace.eval(eval_id) accessors return ModelEval instances; ModelEval has one method per panel returning the raw JSON dict. - Typed exceptions (ModelEvalNotFoundError, ModelEvalNotDoneError, InvalidSplitError, InvalidConfidenceError) so callers can distinguish "doesn't exist" from "still running" from "bad argument" without parsing strings. CLI: roboflow eval {list, get, map-results, confidence-sweep, performance-by-class, confusion-matrix, vector-analysis, image-predictions, recommendations} — every command honors --json. Exit codes are stable per error class (3=not found, 4=not done, 5=invalid arg). Tests cover the adapter URL/param plumbing and error mapping (both flat and nested error envelopes), the ModelEval class, the Workspace accessors, and each CLI handler's adapter call + error path. Companion docs in roboflow/roboflow-dev-reference#18. 
--- CHANGELOG.md | 43 ++ CLI-COMMANDS.md | 26 ++ roboflow/adapters/rfapi.py | 190 +++++++++ roboflow/cli/__init__.py | 2 + roboflow/cli/handlers/eval.py | 490 +++++++++++++++++++++++ roboflow/core/model_eval.py | 151 +++++++ roboflow/core/workspace.py | 65 +++ tests/adapters/test_rfapi_model_evals.py | 224 +++++++++++ tests/cli/test_eval_handler.py | 353 ++++++++++++++++ tests/test_model_eval.py | 244 +++++++++++ 10 files changed, 1788 insertions(+) create mode 100644 roboflow/cli/handlers/eval.py create mode 100644 roboflow/core/model_eval.py create mode 100644 tests/adapters/test_rfapi_model_evals.py create mode 100644 tests/cli/test_eval_handler.py create mode 100644 tests/test_model_eval.py diff --git a/CHANGELOG.md b/CHANGELOG.md index f76ed7a7..c8d963d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,49 @@ All notable changes to this project will be documented in this file. +## Unreleased + +### Added — Model evaluations SDK & CLI + +Wraps the public `/{workspace}/model-evals` REST surface +([roboflow/roboflow#11636](https://github.com/roboflow/roboflow/pull/11636)) +so users can read evaluation results — mAP, confidence sweep, per-class +performance, confusion matrix, vector clusters, per-image stats, +recommendations — from Python and from the CLI without hitting the API +directly. Companion docs: +[roboflow-dev-reference#18](https://github.com/roboflow/roboflow-dev-reference/pull/18). + +**SDK (`roboflow/core/model_eval.py`):** +- `Workspace.evals(project=None, version=None, model=None, status=None, limit=None)` — list evals as `ModelEval` instances pre-populated with metadata from the list response. +- `Workspace.eval(eval_id)` — fetch a single eval (returns a `ModelEval` with `.summary` populated when status is `done`). +- `ModelEval.refresh()` — re-fetch the eval header. 
+- `ModelEval.map_results()`, `.confidence_sweep()`, `.performance_by_class(split=None)`, `.confusion_matrix(split=None, confidence=None)`, `.vector_analysis(confidence=None)`, `.image_predictions(split=None, confidence=None, limit=None, offset=None)`, `.recommendations()` — one method per panel; each returns the raw JSON dict. + +**CLI (`roboflow/cli/handlers/eval.py`):** +- `roboflow eval list [--project P] [--version V] [--model M] [--status S] [--limit N]` +- `roboflow eval get <eval-id>` +- `roboflow eval map-results <eval-id>` +- `roboflow eval confidence-sweep <eval-id>` +- `roboflow eval performance-by-class <eval-id> [--split S]` +- `roboflow eval confusion-matrix <eval-id> [--split S] [--confidence N]` +- `roboflow eval vector-analysis <eval-id> [--confidence N]` +- `roboflow eval image-predictions <eval-id> [--split S] [--confidence N] [--limit N] [--offset N]` +- `roboflow eval recommendations <eval-id>` + +Exit codes are stable per error class so shell scripts and AI agents can +react without parsing message strings: `3` for `model_eval_not_found` +(404), `4` for `model_eval_not_done` (409), `5` for `invalid_split` / +`invalid_confidence` (400). Every command supports `--json` for +structured output. + +**Low-level (`roboflow.adapters.rfapi`):** +- `list_model_evals`, `get_model_eval`, `get_model_eval_map_results`, `get_model_eval_confidence_sweep`, `get_model_eval_performance_by_class`, `get_model_eval_confusion_matrix`, `get_model_eval_vector_analysis`, `get_model_eval_image_predictions`, `get_model_eval_recommendations`. +- New typed exceptions `ModelEvalNotFoundError`, `ModelEvalNotDoneError`, `InvalidSplitError`, `InvalidConfidenceError` (all subclasses of `RoboflowError`) so callers can distinguish "eval doesn't exist" from "eval still running" from "bad argument" without parsing strings. + +The endpoints require the `model-eval:read` scope. The base URL is +configurable via `API_URL` (set to `https://localapi.roboflow.one` to +test against a local API server). 
+ ## 1.3.7 ### Added — Soft-delete / Trash support diff --git a/CLI-COMMANDS.md b/CLI-COMMANDS.md index fa1c9082..d3532a8d 100644 --- a/CLI-COMMANDS.md +++ b/CLI-COMMANDS.md @@ -216,6 +216,31 @@ single item) is intentionally not available from the SDK or CLI — those actions destroy data irrecoverably and live only in the web UI's Trash view. Items left in Trash are cleaned up automatically after 30 days. +### Inspect model evaluations + +```bash +# List evals in the workspace; filter by project, version, model, or status. +roboflow eval list --status done --limit 10 + +# Read a single eval's metadata + summary metrics. +roboflow eval get <eval-id> + +# Pull each panel — pipe to jq for structured access. +roboflow eval map-results <eval-id> --json | jq '.splits.test.map50' +roboflow eval performance-by-class <eval-id> --split test +roboflow eval confusion-matrix <eval-id> --split test --confidence 30 +roboflow eval confidence-sweep <eval-id> --json +roboflow eval vector-analysis <eval-id> --confidence 20 --json +roboflow eval image-predictions <eval-id> --split test --limit 200 +roboflow eval recommendations <eval-id> --json +``` + +Exit codes are stable per error class so scripts and agents can react +without parsing message strings: `3` for `model_eval_not_found` (404), +`4` for `model_eval_not_done` (409 — eval still running), `5` for +`invalid_split` / `invalid_confidence` (400). Requires the +`model-eval:read` scope on the API key. + ### Workspace stats and billing ```bash @@ -316,6 +341,7 @@ Version numbers are always numeric — that's how `x/y` is disambiguated between | `search` | Search workspace images (RoboQL), export results | | `deployment` | Manage dedicated deployments | | `device` | List, get, create, and observe RFDM devices (v2 deployment API) | +| `eval` | Inspect model evaluation runs (mAP, confusion matrix, recommendations, ...) 
| | `workflow` | Manage workflows | | `folder` | Manage workspace folders | | `annotation` | Annotation batches and jobs | diff --git a/roboflow/adapters/rfapi.py b/roboflow/adapters/rfapi.py index 1f6aa1eb..2079e8cd 100644 --- a/roboflow/adapters/rfapi.py +++ b/roboflow/adapters/rfapi.py @@ -1165,3 +1165,193 @@ def restore_trash_item(api_key, workspace_url, item_type, item_id, parent_id=Non # Note: permanent-delete from Trash (deleteImmediately / empty) is # intentionally not exposed on the public API — those actions destroy data # irrecoverably and are only available through the web UI's Trash view. + + +# --------------------------------------------------------------------------- +# Model evaluations +# --------------------------------------------------------------------------- + + +class ModelEvalNotFoundError(RoboflowError): + """Raised when an eval id (or workspace) does not exist (HTTP 404).""" + + +class ModelEvalNotDoneError(RoboflowError): + """Raised when reading panel data for an eval whose status is not ``done`` (HTTP 409).""" + + +class InvalidSplitError(RoboflowError): + """Raised when ``split`` is not one of the accepted values (HTTP 400).""" + + +class InvalidConfidenceError(RoboflowError): + """Raised when ``confidence`` is non-integer or out of range 0-100 (HTTP 400).""" + + +def _model_eval_error_for(response): + """Translate a model-eval error response into the right RoboflowError subclass. + + The model-eval REST surface returns errors in the shape:: + + {"error": "", "message": ""} + + Some routes (and earlier drafts of the spec) instead nest the code as + ``{"error": {"code": "...", "message": "..."}}``; we accept both so we + don't churn when the server normalises. Falls back to plain + :class:`RoboflowError` when the body isn't JSON or the code is + unrecognised, so new error codes don't crash older SDK callers. 
+ """ + code = None + message = response.text + try: + body = response.json() + if isinstance(body, dict): + err = body.get("error") + if isinstance(err, str): + # Flat shape: {"error": "code_string", "message": "..."} + code = err + message = body.get("message") or err + elif isinstance(err, dict): + # Nested shape: {"error": {"code": "...", "message": "..."}} + code = err.get("code") + message = err.get("message") or body.get("message") or message + else: + message = body.get("message", message) + except (ValueError, TypeError): + pass + + cls_by_code = { + "model_eval_not_found": ModelEvalNotFoundError, + "model_eval_not_done": ModelEvalNotDoneError, + "invalid_split": InvalidSplitError, + "invalid_confidence": InvalidConfidenceError, + } + cls = cls_by_code.get(code or "") + if cls is not None: + return cls(message) + # Status-code fallbacks for backends that haven't shipped the typed code yet. + if response.status_code == 404: + return ModelEvalNotFoundError(message) + if response.status_code == 409: + return ModelEvalNotDoneError(message) + return RoboflowError(message) + + +def _eval_get(api_key, workspace_url, path, params=None): + """GET helper for model-eval endpoints with typed error mapping.""" + query: Dict[str, Union[str, int]] = {"api_key": api_key} + if params: + for key, value in params.items(): + if value is not None: + query[key] = value + url = f"{API_URL}/{workspace_url}/model-evals{path}" + response = requests.get(url, params=query) + if response.status_code != 200: + raise _model_eval_error_for(response) + return response.json() + + +def list_model_evals( + api_key: str, + workspace_url: str, + *, + project: Optional[str] = None, + version: Optional[Union[str, int]] = None, + model: Optional[str] = None, + status: Optional[str] = None, + limit: Optional[int] = None, +) -> dict: + """GET /{workspace}/model-evals — list evals in the workspace.""" + return _eval_get( + api_key, + workspace_url, + "", + params={"project": project, "version": 
version, "model": model, "status": status, "limit": limit}, + ) + + +def get_model_eval(api_key: str, workspace_url: str, eval_id: str) -> dict: + """GET /{workspace}/model-evals/{evalId} — fetch a single eval (with summary if done).""" + return _eval_get(api_key, workspace_url, f"/{eval_id}") + + +def get_model_eval_map_results(api_key: str, workspace_url: str, eval_id: str) -> dict: + """GET /{workspace}/model-evals/{evalId}/map-results — per-split mAP breakdown.""" + return _eval_get(api_key, workspace_url, f"/{eval_id}/map-results") + + +def get_model_eval_confidence_sweep(api_key: str, workspace_url: str, eval_id: str) -> dict: + """GET /{workspace}/model-evals/{evalId}/confidence-sweep — F1/precision/recall sweep.""" + return _eval_get(api_key, workspace_url, f"/{eval_id}/confidence-sweep") + + +def get_model_eval_performance_by_class( + api_key: str, + workspace_url: str, + eval_id: str, + *, + split: Optional[str] = None, +) -> dict: + """GET /{workspace}/model-evals/{evalId}/performance-by-class — per-class metrics. + + Server rejects ``split=all`` for this panel; pass one of train/valid/test + or omit to use the server default (test). 
+ """ + return _eval_get(api_key, workspace_url, f"/{eval_id}/performance-by-class", params={"split": split}) + + +def get_model_eval_confusion_matrix( + api_key: str, + workspace_url: str, + eval_id: str, + *, + split: Optional[str] = None, + confidence: Optional[int] = None, +) -> dict: + """GET /{workspace}/model-evals/{evalId}/confusion-matrix — confusion matrix for split.""" + return _eval_get( + api_key, + workspace_url, + f"/{eval_id}/confusion-matrix", + params={"split": split, "confidence": confidence}, + ) + + +def get_model_eval_vector_analysis( + api_key: str, + workspace_url: str, + eval_id: str, + *, + confidence: Optional[int] = None, +) -> dict: + """GET /{workspace}/model-evals/{evalId}/vector-analysis — embedding clusters & metrics.""" + return _eval_get( + api_key, + workspace_url, + f"/{eval_id}/vector-analysis", + params={"confidence": confidence}, + ) + + +def get_model_eval_image_predictions( + api_key: str, + workspace_url: str, + eval_id: str, + *, + split: Optional[str] = None, + confidence: Optional[int] = None, + limit: Optional[int] = None, + offset: Optional[int] = None, +) -> dict: + """GET /{workspace}/model-evals/{evalId}/image-predictions — paginated per-image stats.""" + return _eval_get( + api_key, + workspace_url, + f"/{eval_id}/image-predictions", + params={"split": split, "confidence": confidence, "limit": limit, "offset": offset}, + ) + + +def get_model_eval_recommendations(api_key: str, workspace_url: str, eval_id: str) -> dict: + """GET /{workspace}/model-evals/{evalId}/recommendations — improvement suggestions.""" + return _eval_get(api_key, workspace_url, f"/{eval_id}/recommendations") diff --git a/roboflow/cli/__init__.py b/roboflow/cli/__init__.py index 1fd4a038..c2ff3594 100644 --- a/roboflow/cli/__init__.py +++ b/roboflow/cli/__init__.py @@ -191,6 +191,7 @@ def _walk(group: Any, prefix: str = "") -> None: from roboflow.cli.handlers.completion import completion_app # noqa: E402 from roboflow.cli.handlers.deployment 
import deployment_app # noqa: E402 from roboflow.cli.handlers.device import device_app # noqa: E402 +from roboflow.cli.handlers.eval import eval_app # noqa: E402 from roboflow.cli.handlers.folder import folder_app # noqa: E402 from roboflow.cli.handlers.image import image_app # noqa: E402 from roboflow.cli.handlers.infer import infer_command # noqa: E402 @@ -214,6 +215,7 @@ def _walk(group: Any, prefix: str = "") -> None: app.add_typer(completion_app, name="completion") app.add_typer(deployment_app, name="deployment") app.add_typer(device_app, name="device") +app.add_typer(eval_app, name="eval") app.add_typer(folder_app, name="folder") app.add_typer(image_app, name="image") diff --git a/roboflow/cli/handlers/eval.py b/roboflow/cli/handlers/eval.py new file mode 100644 index 00000000..a4593b30 --- /dev/null +++ b/roboflow/cli/handlers/eval.py @@ -0,0 +1,490 @@ +"""Model evaluation commands. + +Wraps the public ``/{workspace}/model-evals`` REST surface — list runs in a +workspace and pull each panel (mAP, confidence sweep, per-class table, +confusion matrix, vector clusters, per-image stats, recommendations). + +The eval-id is opaque (the human in the UI navigates by URL); commands take +it as a positional argument so it composes well with ``--json | jq``. 
+""" + +from __future__ import annotations + +from typing import Annotated, Optional + +import typer + +from roboflow.cli._compat import SortedGroup, ctx_to_args + +eval_app = typer.Typer(cls=SortedGroup, help="Inspect model evaluation runs", no_args_is_help=True) + + +# --------------------------------------------------------------------------- +# Command surface (Typer) +# --------------------------------------------------------------------------- + + +@eval_app.command("list") +def list_evals_cmd( + ctx: typer.Context, + project: Annotated[Optional[str], typer.Option("-p", "--project", help="Filter by project slug or id")] = None, + version: Annotated[Optional[str], typer.Option("-v", "--version", help="Filter by version id")] = None, + model: Annotated[Optional[str], typer.Option("-m", "--model", help="Filter by model id")] = None, + status: Annotated[ + Optional[str], typer.Option("-s", "--status", help="Filter by status (running/done/failed)") + ] = None, + limit: Annotated[Optional[int], typer.Option("-n", "--limit", help="Max results (default 50, max 200)")] = None, +) -> None: + """List model evaluations in the workspace.""" + args = ctx_to_args(ctx, project=project, version=version, model=model, status=status, limit=limit) + _list_evals(args) + + +@eval_app.command("get") +def get_eval_cmd( + ctx: typer.Context, + eval_id: Annotated[str, typer.Argument(help="Eval id (from `roboflow eval list`)")], +) -> None: + """Show a single eval's metadata and summary metrics.""" + args = ctx_to_args(ctx, eval_id=eval_id) + _get_eval(args) + + +@eval_app.command("map-results") +def map_results_cmd( + ctx: typer.Context, + eval_id: Annotated[str, typer.Argument(help="Eval id")], +) -> None: + """Show per-split mAP results (mAP50, mAP50-95, mAP75, by object size, per class).""" + args = ctx_to_args(ctx, eval_id=eval_id) + _map_results(args) + + +@eval_app.command("confidence-sweep") +def confidence_sweep_cmd( + ctx: typer.Context, + eval_id: Annotated[str, 
typer.Argument(help="Eval id")], +) -> None: + """Show the confidence-threshold sweep (precision/recall/F1) for the test split.""" + args = ctx_to_args(ctx, eval_id=eval_id) + _confidence_sweep(args) + + +@eval_app.command("performance-by-class") +def performance_by_class_cmd( + ctx: typer.Context, + eval_id: Annotated[str, typer.Argument(help="Eval id")], + split: Annotated[ + Optional[str], + typer.Option("-s", "--split", help="Split: train, valid, or test (default test). 'all' is rejected."), + ] = None, +) -> None: + """Show per-class precision / recall / F1 / mAP for the chosen split.""" + args = ctx_to_args(ctx, eval_id=eval_id, split=split) + _performance_by_class(args) + + +@eval_app.command("confusion-matrix") +def confusion_matrix_cmd( + ctx: typer.Context, + eval_id: Annotated[str, typer.Argument(help="Eval id")], + split: Annotated[ + Optional[str], typer.Option("-s", "--split", help="Split: train, valid, test, or all (default test)") + ] = None, + confidence: Annotated[ + Optional[int], + typer.Option("-c", "--confidence", help="Integer confidence threshold (0-100)"), + ] = None, +) -> None: + """Show the confusion matrix for *split* at *confidence*.""" + args = ctx_to_args(ctx, eval_id=eval_id, split=split, confidence=confidence) + _confusion_matrix(args) + + +@eval_app.command("vector-analysis") +def vector_analysis_cmd( + ctx: typer.Context, + eval_id: Annotated[str, typer.Argument(help="Eval id")], + confidence: Annotated[ + Optional[int], + typer.Option("-c", "--confidence", help="Integer confidence threshold (0-100)"), + ] = None, +) -> None: + """Show embedding-cluster diagnostics (per-cluster sample images + metrics).""" + args = ctx_to_args(ctx, eval_id=eval_id, confidence=confidence) + _vector_analysis(args) + + +@eval_app.command("image-predictions") +def image_predictions_cmd( + ctx: typer.Context, + eval_id: Annotated[str, typer.Argument(help="Eval id")], + split: Annotated[ + Optional[str], typer.Option("-s", "--split", help="Split: 
train, valid, test, or all (default test)") + ] = None, + confidence: Annotated[ + Optional[int], + typer.Option("-c", "--confidence", help="Integer confidence threshold (0-100)"), + ] = None, + limit: Annotated[ + Optional[int], + typer.Option("-n", "--limit", help="Page size (default 200, max 1000)"), + ] = None, + offset: Annotated[ + Optional[int], + typer.Option("-o", "--offset", help="Pagination offset"), + ] = None, +) -> None: + """Show paginated per-image stats (TP/FP/FN, augmentations, cluster id).""" + args = ctx_to_args(ctx, eval_id=eval_id, split=split, confidence=confidence, limit=limit, offset=offset) + _image_predictions(args) + + +@eval_app.command("recommendations") +def recommendations_cmd( + ctx: typer.Context, + eval_id: Annotated[str, typer.Argument(help="Eval id")], +) -> None: + """Show server-generated suggestions for improving the model.""" + args = ctx_to_args(ctx, eval_id=eval_id) + _recommendations(args) + + +# --------------------------------------------------------------------------- +# Business logic +# --------------------------------------------------------------------------- + + +def _resolve(args): # noqa: ANN001 + from roboflow.cli._resolver import resolve_ws_and_key + + return resolve_ws_and_key(args) + + +def _eval_error_exit_code(exc: Exception) -> int: + """Map a model-eval error to the canonical CLI exit code. + + 1 = general; 2 = auth; 3 = not found; 4 = conflict (eval not done); + 5 = invalid argument (bad split / confidence). Keeping these distinct + lets shell scripts and AI agents react to specific failure modes + without parsing message strings. 
+ """ + from roboflow.adapters import rfapi + + if isinstance(exc, rfapi.ModelEvalNotFoundError): + return 3 + if isinstance(exc, rfapi.ModelEvalNotDoneError): + return 4 + if isinstance(exc, (rfapi.InvalidSplitError, rfapi.InvalidConfidenceError)): + return 5 + return 1 + + +def _hint_for(exc: Exception) -> Optional[str]: + """Per-error actionable hint shown alongside the message in non-JSON mode.""" + from roboflow.adapters import rfapi + + if isinstance(exc, rfapi.ModelEvalNotFoundError): + return "Run 'roboflow eval list' to see eval ids in this workspace." + if isinstance(exc, rfapi.ModelEvalNotDoneError): + return "Wait for the eval to finish (status='done') before reading panel data." + if isinstance(exc, rfapi.InvalidSplitError): + return "Use one of: train, valid, test (or 'all' where supported)." + if isinstance(exc, rfapi.InvalidConfidenceError): + return "Pass an integer between 0 and 100." + return None + + +def _list_evals(args): # noqa: ANN001 + from roboflow.adapters import rfapi + from roboflow.cli._output import output, output_error + from roboflow.cli._table import format_table + + resolved = _resolve(args) + if not resolved: + return + workspace_url, api_key = resolved + + try: + result = rfapi.list_model_evals( + api_key, + workspace_url, + project=args.project, + version=args.version, + model=args.model, + status=args.status, + limit=args.limit, + ) + except Exception as exc: + output_error(args, str(exc), hint=_hint_for(exc), exit_code=_eval_error_exit_code(exc)) + return + + evals = result.get("evals", []) + rows = [ + { + "id": e.get("id", ""), + "status": e.get("status", ""), + "project": e.get("projectId", ""), + "version": e.get("versionId", ""), + "model": e.get("modelId", "") or "", + "created": e.get("createdAt", ""), + } + for e in evals + ] + table = format_table( + rows, + columns=["id", "status", "project", "version", "model", "created"], + headers=["ID", "STATUS", "PROJECT", "VERSION", "MODEL", "CREATED"], + ) + output(args, 
evals, text=table) + + +def _get_eval(args): # noqa: ANN001 + from roboflow.adapters import rfapi + from roboflow.cli._output import output, output_error + + resolved = _resolve(args) + if not resolved: + return + workspace_url, api_key = resolved + + try: + info = rfapi.get_model_eval(api_key, workspace_url, args.eval_id) + except Exception as exc: + output_error(args, str(exc), hint=_hint_for(exc), exit_code=_eval_error_exit_code(exc)) + return + + lines = [ + f"Eval: {info.get('id', args.eval_id)}", + f" Status: {info.get('status', '')}", + f" Project: {info.get('projectId', '')}", + f" Version: {info.get('versionId', '')}", + f" Model: {info.get('modelId', '') or '(none)'}", + f" Created: {info.get('createdAt', '')}", + ] + config = info.get("config") or {} + if config: + lines.append(f" Config: overlap={config.get('overlap')} iouThreshold={config.get('iouThreshold')}") + summary = info.get("summary") or {} + if summary: + lines.append( + f" Summary: mAP={summary.get('mAP')} precision={summary.get('precision')} recall={summary.get('recall')}" + ) + output(args, info, text="\n".join(lines)) + + +def _emit_dict(args, payload, *, header: Optional[str] = None) -> None: # noqa: ANN001 + """Default text rendering for panel commands: pretty-printed JSON. + + Each panel has a deeply nested per-eval shape that doesn't tabulate + well in the general case (per-class tables exist, but vector clusters + and recommendations don't). For agent ergonomics we lean on --json, + and for humans we just pretty-print so they can pipe to jq or eyeball. 
+ """ + import json as _json + + from roboflow.cli._output import output + + text = _json.dumps(payload, indent=2, default=str) + if header: + text = f"{header}\n{text}" + output(args, payload, text=text) + + +def _map_results(args): # noqa: ANN001 + from roboflow.adapters import rfapi + from roboflow.cli._output import output_error + + resolved = _resolve(args) + if not resolved: + return + workspace_url, api_key = resolved + + try: + data = rfapi.get_model_eval_map_results(api_key, workspace_url, args.eval_id) + except Exception as exc: + output_error(args, str(exc), hint=_hint_for(exc), exit_code=_eval_error_exit_code(exc)) + return + _emit_dict(args, data) + + +def _confidence_sweep(args): # noqa: ANN001 + from roboflow.adapters import rfapi + from roboflow.cli._output import output_error + + resolved = _resolve(args) + if not resolved: + return + workspace_url, api_key = resolved + + try: + data = rfapi.get_model_eval_confidence_sweep(api_key, workspace_url, args.eval_id) + except Exception as exc: + output_error(args, str(exc), hint=_hint_for(exc), exit_code=_eval_error_exit_code(exc)) + return + _emit_dict(args, data) + + +def _performance_by_class(args): # noqa: ANN001 + from roboflow.adapters import rfapi + from roboflow.cli._output import output, output_error + from roboflow.cli._table import format_table + + resolved = _resolve(args) + if not resolved: + return + workspace_url, api_key = resolved + + try: + data = rfapi.get_model_eval_performance_by_class(api_key, workspace_url, args.eval_id, split=args.split) + except Exception as exc: + output_error(args, str(exc), hint=_hint_for(exc), exit_code=_eval_error_exit_code(exc)) + return + + classes = data.get("classes", []) + rows = [] + for c in classes: + rows.append( + { + "class": c.get("className", ""), + "map50": _fmt_float(c.get("map50")), + "map50_95": _fmt_float(c.get("map50_95")), + "map75": _fmt_float(c.get("map75")), + "precision": _fmt_float(c.get("precision")), + "recall": 
_fmt_float(c.get("recall")), + "f1": _fmt_float(c.get("f1")), + "opt_thresh": _fmt_float(c.get("optimalThreshold")), + } + ) + table = format_table( + rows, + columns=["class", "map50", "map50_95", "map75", "precision", "recall", "f1", "opt_thresh"], + headers=["CLASS", "mAP50", "mAP50-95", "mAP75", "P", "R", "F1", "OPT_THR"], + ) + header = f"Split: {data.get('split', args.split or 'test')}" + output(args, data, text=f"{header}\n{table}") + + +def _fmt_float(value): + """Format a float to 4 decimal places for table output; pass through ``None`` as ''.""" + if value is None: + return "" + try: + return f"{float(value):.4f}" + except (TypeError, ValueError): + return str(value) + + +def _confusion_matrix(args): # noqa: ANN001 + from roboflow.adapters import rfapi + from roboflow.cli._output import output_error + + resolved = _resolve(args) + if not resolved: + return + workspace_url, api_key = resolved + + try: + data = rfapi.get_model_eval_confusion_matrix( + api_key, + workspace_url, + args.eval_id, + split=args.split, + confidence=args.confidence, + ) + except Exception as exc: + output_error(args, str(exc), hint=_hint_for(exc), exit_code=_eval_error_exit_code(exc)) + return + + header = ( + f"Split: {data.get('split', args.split or 'test')} " + f"Confidence: {data.get('confidenceThreshold', args.confidence or 'default')}" + ) + _emit_dict(args, data, header=header) + + +def _vector_analysis(args): # noqa: ANN001 + from roboflow.adapters import rfapi + from roboflow.cli._output import output_error + + resolved = _resolve(args) + if not resolved: + return + workspace_url, api_key = resolved + + try: + data = rfapi.get_model_eval_vector_analysis(api_key, workspace_url, args.eval_id, confidence=args.confidence) + except Exception as exc: + output_error(args, str(exc), hint=_hint_for(exc), exit_code=_eval_error_exit_code(exc)) + return + _emit_dict(args, data) + + +def _image_predictions(args): # noqa: ANN001 + from roboflow.adapters import rfapi + from 
roboflow.cli._output import output, output_error + from roboflow.cli._table import format_table + + resolved = _resolve(args) + if not resolved: + return + workspace_url, api_key = resolved + + try: + data = rfapi.get_model_eval_image_predictions( + api_key, + workspace_url, + args.eval_id, + split=args.split, + confidence=args.confidence, + limit=args.limit, + offset=args.offset, + ) + except Exception as exc: + output_error(args, str(exc), hint=_hint_for(exc), exit_code=_eval_error_exit_code(exc)) + return + + images = data.get("images", []) + rows = [] + for img in images: + stats = img.get("stats") or {} + rows.append( + { + "image": img.get("imageName", img.get("imageId", "")), + "split": img.get("split", ""), + "tp": stats.get("tp", ""), + "fp": stats.get("fp", ""), + "fn": stats.get("fn", ""), + "cluster": img.get("cluster", ""), + } + ) + table = format_table( + rows, + columns=["image", "split", "tp", "fp", "fn", "cluster"], + headers=["IMAGE", "SPLIT", "TP", "FP", "FN", "CLUSTER"], + ) + header = ( + f"Split: {data.get('split', args.split or 'test')} " + f"Confidence: {data.get('confidenceThreshold', args.confidence or 'default')} " + f"Total: {data.get('totalImages', len(images))} " + f"Offset: {data.get('offset', args.offset or 0)} " + f"Limit: {data.get('limit', args.limit or 200)}" + ) + output(args, data, text=f"{header}\n{table}") + + +def _recommendations(args): # noqa: ANN001 + from roboflow.adapters import rfapi + from roboflow.cli._output import output_error + + resolved = _resolve(args) + if not resolved: + return + workspace_url, api_key = resolved + + try: + data = rfapi.get_model_eval_recommendations(api_key, workspace_url, args.eval_id) + except Exception as exc: + output_error(args, str(exc), hint=_hint_for(exc), exit_code=_eval_error_exit_code(exc)) + return + _emit_dict(args, data) diff --git a/roboflow/core/model_eval.py b/roboflow/core/model_eval.py new file mode 100644 index 00000000..38447fe2 --- /dev/null +++ 
class ModelEval:
    """A single model-evaluation run.

    Construct via :meth:`roboflow.core.workspace.Workspace.eval` or list via
    :meth:`roboflow.core.workspace.Workspace.evals`. Direct construction is
    supported when you already hold an eval id::

        from roboflow.core.model_eval import ModelEval
        ev = ModelEval(api_key, "lee-sandbox", "huUF720inUcymARwqAGK")
        ev.refresh()  # populates .status, .summary, .config, etc.

    Every panel accessor returns the raw JSON dict produced by the matching
    :mod:`roboflow.adapters.rfapi` helper; nothing is cached between calls.
    """

    # (payload key, attribute name) pairs used by to_dict() when no raw header
    # payload has been cached yet.
    _HEADER_FIELDS = (
        ("status", "status"),
        ("projectId", "project_id"),
        ("versionId", "version_id"),
        ("modelId", "model_id"),
        ("createdAt", "created_at"),
        ("config", "config"),
        ("summary", "summary"),
    )

    def __init__(
        self,
        api_key: str,
        workspace_url: str,
        eval_id: str,
        info: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Bind the eval to a workspace and optionally seed it with cached metadata.

        Args:
            api_key: Key forwarded to every adapter call.
            workspace_url: Workspace slug forwarded to every adapter call.
            eval_id: Id of the evaluation run; exposed as ``self.id``.
            info: Optional header payload (from a list/get response) used to
                pre-populate the metadata attributes without a network call.
        """
        self._api_key = api_key
        self._workspace_url = workspace_url
        self.id = eval_id
        # Populate metadata from a cached list/get response when available; the
        # caller can still refresh() to re-fetch from the server.
        self._apply(info or {})

    # -- internal -----------------------------------------------------------

    def _apply(self, info: Dict[str, Any]) -> None:
        """Copy header fields from *info* onto the instance.

        A shallow copy of *info* is kept as ``_raw`` so that later mutation of
        the caller's dict (e.g. the list response the eval was built from)
        cannot make ``to_dict()`` disagree with the attributes set here.
        """
        snapshot: Dict[str, Any] = dict(info)
        self.status: Optional[str] = snapshot.get("status")
        self.project_id: Optional[str] = snapshot.get("projectId")
        self.version_id: Optional[str] = snapshot.get("versionId")
        self.model_id: Optional[str] = snapshot.get("modelId")
        self.created_at: Optional[str] = snapshot.get("createdAt")
        # A missing or null config is normalised to an empty dict.
        self.config: Dict[str, Any] = snapshot.get("config") or {}
        self.summary: Optional[Dict[str, Any]] = snapshot.get("summary")
        self._raw: Dict[str, Any] = snapshot

    # -- core ---------------------------------------------------------------

    def refresh(self) -> "ModelEval":
        """Re-fetch the eval header (status, summary, config) from the server.

        Returns:
            ``self``, so calls can be chained.
        """
        info = rfapi.get_model_eval(self._api_key, self._workspace_url, self.id)
        self._apply(info)
        return self

    # -- panel accessors ----------------------------------------------------

    def map_results(self) -> Dict[str, Any]:
        """Per-split mAP results (mAP50, mAP50-95, mAP75, by object size, per class)."""
        return rfapi.get_model_eval_map_results(self._api_key, self._workspace_url, self.id)

    def confidence_sweep(self) -> Dict[str, Any]:
        """Confidence-threshold sweep (precision/recall/F1) for the test split."""
        return rfapi.get_model_eval_confidence_sweep(self._api_key, self._workspace_url, self.id)

    def performance_by_class(self, split: Optional[str] = None) -> Dict[str, Any]:
        """Per-class precision / recall / F1 / mAP for the chosen split.

        ``split`` defaults to ``"test"`` server-side. Passing ``"all"`` raises
        :class:`rfapi.InvalidSplitError` — this panel does not support an
        aggregate view.
        """
        return rfapi.get_model_eval_performance_by_class(self._api_key, self._workspace_url, self.id, split=split)

    def confusion_matrix(
        self,
        split: Optional[str] = None,
        confidence: Optional[int] = None,
    ) -> Dict[str, Any]:
        """Confusion matrix (classes + matrix) for *split* at integer *confidence* (0-100)."""
        return rfapi.get_model_eval_confusion_matrix(
            self._api_key, self._workspace_url, self.id, split=split, confidence=confidence
        )

    def vector_analysis(self, confidence: Optional[int] = None) -> Dict[str, Any]:
        """Embedding-cluster diagnostics (per-cluster sample images + metrics)."""
        return rfapi.get_model_eval_vector_analysis(self._api_key, self._workspace_url, self.id, confidence=confidence)

    def image_predictions(
        self,
        split: Optional[str] = None,
        confidence: Optional[int] = None,
        limit: Optional[int] = None,
        offset: Optional[int] = None,
    ) -> Dict[str, Any]:
        """Paginated per-image stats (TP/FP/FN counts, augmentations, cluster id)."""
        return rfapi.get_model_eval_image_predictions(
            self._api_key,
            self._workspace_url,
            self.id,
            split=split,
            confidence=confidence,
            limit=limit,
            offset=offset,
        )

    def recommendations(self) -> Dict[str, Any]:
        """Server-generated suggestions for improving the model."""
        return rfapi.get_model_eval_recommendations(self._api_key, self._workspace_url, self.id)

    # -- helpers ------------------------------------------------------------

    def to_dict(self) -> Dict[str, Any]:
        """Return the cached eval metadata as a plain dict (id + last header fetch).

        When a header payload has been cached, a new dict with all of its keys
        is returned (including keys not surfaced as attributes), with ``"id"``
        forced to ``self.id``. Otherwise the dict is rebuilt from the
        attributes, skipping those that are ``None``.
        """
        if self._raw:
            return {**self._raw, "id": self.id}
        data: Dict[str, Any] = {"id": self.id}
        for key, attr in self._HEADER_FIELDS:
            value = getattr(self, attr, None)
            if value is not None:
                data[key] = value
        return data

    def __repr__(self) -> str:  # pragma: no cover - debug helper
        return f"ModelEval(id={self.id!r}, status={self.status!r}, project={self.project_id!r})"
+ + Example: + >>> ws = rf.workspace("lee-sandbox") + >>> done = ws.evals(status="done", limit=5) + >>> for ev in done: + ... print(ev.id, ev.summary) + """ + from roboflow.core.model_eval import ModelEval + + result = rfapi.list_model_evals( + self.__api_key, + self.url, + project=project, + version=version, + model=model, + status=status, + limit=limit, + ) + return [ModelEval(self.__api_key, self.url, e["id"], info=e) for e in result.get("evals", [])] + + def eval(self, eval_id: str) -> "ModelEval": + """Fetch a single model eval by id. + + Raises: + roboflow.adapters.rfapi.ModelEvalNotFoundError: If the id doesn't + exist in this workspace (HTTP 404). + + Example: + >>> ws = rf.workspace("lee-sandbox") + >>> ev = ws.eval("huUF720inUcymARwqAGK") + >>> ev.summary["mAP"] + """ + from roboflow.core.model_eval import ModelEval + + info = rfapi.get_model_eval(self.__api_key, self.url, eval_id) + return ModelEval(self.__api_key, self.url, info.get("id", eval_id), info=info) + def trash(self) -> dict: """ List items currently in the workspace Trash. 
diff --git a/tests/adapters/test_rfapi_model_evals.py b/tests/adapters/test_rfapi_model_evals.py new file mode 100644 index 00000000..e27d40e4 --- /dev/null +++ b/tests/adapters/test_rfapi_model_evals.py @@ -0,0 +1,224 @@ +"""Unit tests for the model-eval rfapi helpers (`/{ws}/model-evals/...`).""" + +from __future__ import annotations + +import unittest +from unittest.mock import MagicMock, patch + +from roboflow.adapters import rfapi +from roboflow.config import API_URL + + +def _resp(status: int, body): + """Build a mock requests.Response double for the given status + JSON body.""" + mock = MagicMock(status_code=status) + mock.json.return_value = body + mock.text = repr(body) + return mock + + +class TestListModelEvals(unittest.TestCase): + @patch("roboflow.adapters.rfapi.requests.get") + def test_success_no_filters(self, mock_get): + mock_get.return_value = _resp(200, {"evals": [{"id": "e1", "status": "done"}]}) + + result = rfapi.list_model_evals("k", "ws") + + self.assertEqual(result, {"evals": [{"id": "e1", "status": "done"}]}) + url = mock_get.call_args[0][0] + params = mock_get.call_args.kwargs["params"] + self.assertEqual(url, f"{API_URL}/ws/model-evals") + self.assertEqual(params, {"api_key": "k"}) + + @patch("roboflow.adapters.rfapi.requests.get") + def test_success_with_filters(self, mock_get): + mock_get.return_value = _resp(200, {"evals": []}) + + rfapi.list_model_evals("k", "ws", project="p1", version=3, model="m1", status="done", limit=10) + + params = mock_get.call_args.kwargs["params"] + self.assertEqual( + params, + { + "api_key": "k", + "project": "p1", + "version": 3, + "model": "m1", + "status": "done", + "limit": 10, + }, + ) + + @patch("roboflow.adapters.rfapi.requests.get") + def test_omits_none_filters(self, mock_get): + mock_get.return_value = _resp(200, {"evals": []}) + + rfapi.list_model_evals("k", "ws", status="done", limit=None) + + params = mock_get.call_args.kwargs["params"] + self.assertNotIn("limit", params) + 
class TestGetModelEval(unittest.TestCase):
    """``get_model_eval`` returns the JSON body and requests the per-eval URL."""

    @patch("roboflow.adapters.rfapi.requests.get")
    def test_success(self, mock_get):
        payload = {"id": "e1", "status": "done", "summary": {"mAP": 0.9}}
        mock_get.return_value = _resp(200, payload)

        fetched = rfapi.get_model_eval("k", "ws", "e1")

        self.assertEqual(fetched["summary"]["mAP"], 0.9)
        # The URL is the first positional argument of the mocked requests.get call.
        requested_url = mock_get.call_args.args[0]
        self.assertEqual(requested_url, f"{API_URL}/ws/model-evals/e1")
mock_get): + mock_get.return_value = _resp(200, {"matrix": []}) + + rfapi.get_model_eval_confusion_matrix("k", "ws", "e1", split="test", confidence=30) + + params = mock_get.call_args.kwargs["params"] + self.assertEqual(params["split"], "test") + self.assertEqual(params["confidence"], 30) + + @patch("roboflow.adapters.rfapi.requests.get") + def test_image_predictions_pagination(self, mock_get): + mock_get.return_value = _resp(200, {"images": []}) + + rfapi.get_model_eval_image_predictions("k", "ws", "e1", split="test", confidence=20, limit=50, offset=100) + + params = mock_get.call_args.kwargs["params"] + self.assertEqual(params["limit"], 50) + self.assertEqual(params["offset"], 100) + + @patch("roboflow.adapters.rfapi.requests.get") + def test_recommendations_url(self, mock_get): + mock_get.return_value = _resp(200, {"recommendations": []}) + + rfapi.get_model_eval_recommendations("k", "ws", "e1") + + url = mock_get.call_args[0][0] + self.assertEqual(url, f"{API_URL}/ws/model-evals/e1/recommendations") + + @patch("roboflow.adapters.rfapi.requests.get") + def test_vector_analysis_passes_confidence(self, mock_get): + mock_get.return_value = _resp(200, {"clusters": []}) + + rfapi.get_model_eval_vector_analysis("k", "ws", "e1", confidence=25) + + params = mock_get.call_args.kwargs["params"] + self.assertEqual(params["confidence"], 25) + + +class TestErrorMapping(unittest.TestCase): + """Typed errors are routed to the right exception subclass.""" + + @patch("roboflow.adapters.rfapi.requests.get") + def test_404_flat_envelope(self, mock_get): + # Server returns the flat shape: {"error": "code", "message": "..."} + mock_get.return_value = _resp(404, {"error": "model_eval_not_found", "message": "Eval 'x' not found"}) + + with self.assertRaises(rfapi.ModelEvalNotFoundError) as ctx: + rfapi.get_model_eval("k", "ws", "x") + self.assertIn("Eval 'x' not found", str(ctx.exception)) + + @patch("roboflow.adapters.rfapi.requests.get") + def 
test_404_nested_envelope_back_compat(self, mock_get): + # Older nested shape: {"error": {"code": "...", "message": "..."}} + mock_get.return_value = _resp(404, {"error": {"code": "model_eval_not_found", "message": "nested"}}) + + with self.assertRaises(rfapi.ModelEvalNotFoundError): + rfapi.get_model_eval("k", "ws", "x") + + @patch("roboflow.adapters.rfapi.requests.get") + def test_409_not_done(self, mock_get): + mock_get.return_value = _resp(409, {"error": "model_eval_not_done", "message": "Eval still running"}) + + with self.assertRaises(rfapi.ModelEvalNotDoneError): + rfapi.get_model_eval_map_results("k", "ws", "x") + + @patch("roboflow.adapters.rfapi.requests.get") + def test_400_invalid_split(self, mock_get): + mock_get.return_value = _resp(400, {"error": "invalid_split", "message": "Invalid split"}) + + with self.assertRaises(rfapi.InvalidSplitError): + rfapi.get_model_eval_performance_by_class("k", "ws", "x", split="all") + + @patch("roboflow.adapters.rfapi.requests.get") + def test_400_invalid_confidence(self, mock_get): + mock_get.return_value = _resp(400, {"error": "invalid_confidence", "message": "out of range"}) + + with self.assertRaises(rfapi.InvalidConfidenceError): + rfapi.get_model_eval_confusion_matrix("k", "ws", "x", confidence=200) + + @patch("roboflow.adapters.rfapi.requests.get") + def test_unknown_404_falls_back_to_not_found(self, mock_get): + # 404 without a recognised code still maps by status code (forward-compat). 
+ mock_get.return_value = _resp(404, {"error": "some_new_code", "message": "?"}) + + with self.assertRaises(rfapi.ModelEvalNotFoundError): + rfapi.get_model_eval("k", "ws", "x") + + @patch("roboflow.adapters.rfapi.requests.get") + def test_unknown_500_raises_generic_roboflow_error(self, mock_get): + mock_get.return_value = _resp(500, {"error": "server_oops", "message": "boom"}) + + with self.assertRaises(rfapi.RoboflowError) as ctx: + rfapi.get_model_eval("k", "ws", "x") + # Not one of the typed subclasses + self.assertNotIsInstance(ctx.exception, rfapi.ModelEvalNotFoundError) + self.assertNotIsInstance(ctx.exception, rfapi.ModelEvalNotDoneError) + + @patch("roboflow.adapters.rfapi.requests.get") + def test_non_json_body_falls_back_to_text(self, mock_get): + # Some misbehaving proxies return HTML 502s — make sure we don't crash. + bad = MagicMock(status_code=502, text="Bad Gateway") + bad.json.side_effect = ValueError("not JSON") + mock_get.return_value = bad + + with self.assertRaises(rfapi.RoboflowError) as ctx: + rfapi.get_model_eval("k", "ws", "x") + self.assertIn("Bad Gateway", str(ctx.exception)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/cli/test_eval_handler.py b/tests/cli/test_eval_handler.py new file mode 100644 index 00000000..b1fe9444 --- /dev/null +++ b/tests/cli/test_eval_handler.py @@ -0,0 +1,353 @@ +"""Tests for the model-eval CLI handler (`roboflow eval ...`).""" + +from __future__ import annotations + +import json +import unittest +from argparse import Namespace +from unittest.mock import patch + +from typer.testing import CliRunner + +from roboflow.cli import app + +runner = CliRunner() + + +# --------------------------------------------------------------------------- +# Registration / discoverability +# --------------------------------------------------------------------------- + + +class TestEvalRegistration(unittest.TestCase): + """`roboflow eval ...` subcommands are registered with valid --help.""" + + def 
test_eval_app_exists(self) -> None: + from roboflow.cli.handlers.eval import eval_app + + self.assertIsNotNone(eval_app) + + def test_eval_root_help(self) -> None: + result = runner.invoke(app, ["eval", "--help"]) + self.assertEqual(result.exit_code, 0) + + def test_each_subcommand_help(self) -> None: + for cmd in [ + "list", + "get", + "map-results", + "confidence-sweep", + "performance-by-class", + "confusion-matrix", + "vector-analysis", + "image-predictions", + "recommendations", + ]: + with self.subTest(cmd=cmd): + result = runner.invoke(app, ["eval", cmd, "--help"]) + self.assertEqual(result.exit_code, 0, f"{cmd} --help failed: {result.output}") + + +# --------------------------------------------------------------------------- +# Helpers — every test patches the workspace + key resolver so no IO happens. +# --------------------------------------------------------------------------- + + +def _args(**overrides): + """Build a Namespace matching what ctx_to_args produces, with sane defaults.""" + base = {"json": False, "workspace": "lee-sandbox", "api_key": None, "quiet": False} + base.update(overrides) + return Namespace(**base) + + +# --------------------------------------------------------------------------- +# `eval list` +# --------------------------------------------------------------------------- + + +class TestEvalListHandler(unittest.TestCase): + @patch("roboflow.adapters.rfapi.list_model_evals") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_list_text_calls_adapter_with_filters(self, _key, _ws, mock_list): + mock_list.return_value = { + "evals": [ + { + "id": "e1", + "status": "done", + "projectId": "p1", + "versionId": "3", + "modelId": None, + "createdAt": "2025-01-01", + } + ] + } + args = _args(workspace=None, project="p1", version="3", model=None, status="done", limit=5) + + from roboflow.cli.handlers.eval import _list_evals + 
+ with patch("builtins.print") as mock_print: + _list_evals(args) + + mock_list.assert_called_once_with( + "key", "lee-sandbox", project="p1", version="3", model=None, status="done", limit=5 + ) + printed = mock_print.call_args[0][0] + self.assertIn("e1", printed) + self.assertIn("done", printed) + + @patch("roboflow.adapters.rfapi.list_model_evals") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_list_json_emits_evals_array(self, _key, _ws, mock_list): + mock_list.return_value = {"evals": [{"id": "e1", "status": "done"}]} + args = _args(workspace=None, json=True, project=None, version=None, model=None, status=None, limit=None) + + from roboflow.cli.handlers.eval import _list_evals + + with patch("builtins.print") as mock_print: + _list_evals(args) + + printed = mock_print.call_args[0][0] + data = json.loads(printed) + self.assertEqual(data, [{"id": "e1", "status": "done"}]) + + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value=None) + def test_list_no_workspace_exits_2(self, _ws): + args = _args(workspace=None, project=None, version=None, model=None, status=None, limit=None) + + from roboflow.cli.handlers.eval import _list_evals + + with self.assertRaises(SystemExit) as ctx: + _list_evals(args) + self.assertEqual(ctx.exception.code, 2) + + +# --------------------------------------------------------------------------- +# `eval get` +# --------------------------------------------------------------------------- + + +class TestEvalGetHandler(unittest.TestCase): + @patch("roboflow.adapters.rfapi.get_model_eval") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_get_text(self, _key, _ws, mock_get): + mock_get.return_value = { + "id": "e1", + "status": "done", + "projectId": "p1", + "versionId": "3", + 
"modelId": "m1", + "createdAt": "2025-01-01", + "config": {"overlap": 30, "iouThreshold": 50}, + "summary": {"mAP": 0.91, "precision": 0.85, "recall": 0.8}, + } + args = _args(workspace=None, eval_id="e1") + + from roboflow.cli.handlers.eval import _get_eval + + with patch("builtins.print") as mock_print: + _get_eval(args) + + printed = mock_print.call_args[0][0] + self.assertIn("Eval: e1", printed) + self.assertIn("Status: done", printed) + self.assertIn("mAP=0.91", printed) + mock_get.assert_called_once_with("key", "lee-sandbox", "e1") + + @patch("roboflow.adapters.rfapi.get_model_eval") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_get_404_exits_3(self, _key, _ws, mock_get): + from roboflow.adapters import rfapi + + mock_get.side_effect = rfapi.ModelEvalNotFoundError("not found") + args = _args(workspace=None, eval_id="bad") + + from roboflow.cli.handlers.eval import _get_eval + + with self.assertRaises(SystemExit) as ctx: + _get_eval(args) + self.assertEqual(ctx.exception.code, 3) + + +# --------------------------------------------------------------------------- +# Per-panel handlers — each forwards args to the right adapter function. 
+# --------------------------------------------------------------------------- + + +class TestPanelHandlers(unittest.TestCase): + @patch("roboflow.adapters.rfapi.get_model_eval_map_results") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_map_results_calls_adapter(self, _key, _ws, mock_fn): + mock_fn.return_value = {"splits": {"test": {"map50": 0.9}}} + args = _args(workspace=None, eval_id="e1") + + from roboflow.cli.handlers.eval import _map_results + + with patch("builtins.print"): + _map_results(args) + mock_fn.assert_called_once_with("key", "lee-sandbox", "e1") + + @patch("roboflow.adapters.rfapi.get_model_eval_confidence_sweep") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_confidence_sweep_calls_adapter(self, _key, _ws, mock_fn): + mock_fn.return_value = {"splits": {}} + args = _args(workspace=None, eval_id="e1") + + from roboflow.cli.handlers.eval import _confidence_sweep + + with patch("builtins.print"): + _confidence_sweep(args) + mock_fn.assert_called_once_with("key", "lee-sandbox", "e1") + + @patch("roboflow.adapters.rfapi.get_model_eval_performance_by_class") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_performance_by_class_passes_split(self, _key, _ws, mock_fn): + mock_fn.return_value = {"split": "valid", "classes": [{"className": "car", "map50": 0.9}]} + args = _args(workspace=None, eval_id="e1", split="valid") + + from roboflow.cli.handlers.eval import _performance_by_class + + with patch("builtins.print"): + _performance_by_class(args) + mock_fn.assert_called_once_with("key", "lee-sandbox", "e1", split="valid") + + @patch("roboflow.adapters.rfapi.get_model_eval_performance_by_class") + 
@patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_performance_by_class_invalid_split_exits_5(self, _key, _ws, mock_fn): + from roboflow.adapters import rfapi + + mock_fn.side_effect = rfapi.InvalidSplitError("no") + args = _args(workspace=None, eval_id="e1", split="all") + + from roboflow.cli.handlers.eval import _performance_by_class + + with self.assertRaises(SystemExit) as ctx: + _performance_by_class(args) + self.assertEqual(ctx.exception.code, 5) + + @patch("roboflow.adapters.rfapi.get_model_eval_confusion_matrix") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_confusion_matrix_passes_args(self, _key, _ws, mock_fn): + mock_fn.return_value = {"matrix": []} + args = _args(workspace=None, eval_id="e1", split="test", confidence=30) + + from roboflow.cli.handlers.eval import _confusion_matrix + + with patch("builtins.print"): + _confusion_matrix(args) + mock_fn.assert_called_once_with("key", "lee-sandbox", "e1", split="test", confidence=30) + + @patch("roboflow.adapters.rfapi.get_model_eval_confusion_matrix") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_confusion_matrix_invalid_confidence_exits_5(self, _key, _ws, mock_fn): + from roboflow.adapters import rfapi + + mock_fn.side_effect = rfapi.InvalidConfidenceError("bad") + args = _args(workspace=None, eval_id="e1", split=None, confidence=999) + + from roboflow.cli.handlers.eval import _confusion_matrix + + with self.assertRaises(SystemExit) as ctx: + _confusion_matrix(args) + self.assertEqual(ctx.exception.code, 5) + + @patch("roboflow.adapters.rfapi.get_model_eval_vector_analysis") + @patch("roboflow.cli._resolver.resolve_default_workspace", 
return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_vector_analysis_calls_adapter(self, _key, _ws, mock_fn): + mock_fn.return_value = {"clusters": []} + args = _args(workspace=None, eval_id="e1", confidence=20) + + from roboflow.cli.handlers.eval import _vector_analysis + + with patch("builtins.print"): + _vector_analysis(args) + mock_fn.assert_called_once_with("key", "lee-sandbox", "e1", confidence=20) + + @patch("roboflow.adapters.rfapi.get_model_eval_image_predictions") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_image_predictions_pagination(self, _key, _ws, mock_fn): + mock_fn.return_value = { + "split": "test", + "confidenceThreshold": 30, + "totalImages": 100, + "offset": 50, + "limit": 10, + "images": [{"imageId": "i1", "imageName": "a.jpg", "split": "test", "stats": {}}], + } + args = _args(workspace=None, eval_id="e1", split="test", confidence=30, limit=10, offset=50) + + from roboflow.cli.handlers.eval import _image_predictions + + with patch("builtins.print") as mock_print: + _image_predictions(args) + mock_fn.assert_called_once_with("key", "lee-sandbox", "e1", split="test", confidence=30, limit=10, offset=50) + printed = mock_print.call_args[0][0] + self.assertIn("a.jpg", printed) + + @patch("roboflow.adapters.rfapi.get_model_eval_recommendations") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_recommendations_calls_adapter(self, _key, _ws, mock_fn): + mock_fn.return_value = {"generated": False} + args = _args(workspace=None, eval_id="e1") + + from roboflow.cli.handlers.eval import _recommendations + + with patch("builtins.print"): + _recommendations(args) + mock_fn.assert_called_once_with("key", "lee-sandbox", "e1") + + 
class TestExitCodeMapping(unittest.TestCase):
    """The handler distinguishes 404/409/400 to give shell scripts useful exit codes."""

    def test_exit_codes(self) -> None:
        from roboflow.adapters import rfapi
        from roboflow.cli.handlers.eval import _eval_error_exit_code

        # (exception instance, expected exit code); anything untyped maps to 1.
        expectations = [
            (rfapi.ModelEvalNotFoundError("x"), 3),
            (rfapi.ModelEvalNotDoneError("x"), 4),
            (rfapi.InvalidSplitError("x"), 5),
            (rfapi.InvalidConfidenceError("x"), 5),
            (rfapi.RoboflowError("x"), 1),
            (ValueError("x"), 1),
        ]
        for error, code in expectations:
            with self.subTest(exc=type(error).__name__):
                self.assertEqual(_eval_error_exit_code(error), code)
class TestModelEvalConstruction(unittest.TestCase):
    """Constructor behaviour of ``ModelEval`` with and without a cached header."""

    def test_apply_info_populates_attributes(self):
        from roboflow.core.model_eval import ModelEval

        header = {
            "id": "e1",
            "status": "done",
            "projectId": "p1",
            "versionId": "3",
            "modelId": "m1",
            "createdAt": "2025-01-01",
            "config": {"overlap": 30, "iouThreshold": 50},
            "summary": {"mAP": 0.9, "precision": 0.8, "recall": 0.85},
        }
        ev = ModelEval("k", "ws", "e1", info=header)

        # Scalar header fields map one-to-one onto snake_case attributes.
        scalar_expectations = (
            ("id", "e1"),
            ("status", "done"),
            ("project_id", "p1"),
            ("version_id", "3"),
            ("model_id", "m1"),
            ("created_at", "2025-01-01"),
        )
        for attr_name, wanted in scalar_expectations:
            self.assertEqual(getattr(ev, attr_name), wanted)
        self.assertEqual(ev.config["overlap"], 30)
        self.assertEqual(ev.summary["mAP"], 0.9)

    def test_construction_without_info(self):
        from roboflow.core.model_eval import ModelEval

        ev = ModelEval("k", "ws", "e1")

        self.assertEqual((ev.id, ev.config), ("e1", {}))
        self.assertIsNone(ev.status)
        self.assertIsNone(ev.summary)
@patch("roboflow.adapters.rfapi.get_model_eval_map_results") + def test_map_results(self, mock_fn): + from roboflow.core.model_eval import ModelEval + + mock_fn.return_value = {"splits": {}} + ev = ModelEval("k", "ws", "e1") + result = ev.map_results() + + self.assertEqual(result, {"splits": {}}) + mock_fn.assert_called_once_with("k", "ws", "e1") + + @patch("roboflow.adapters.rfapi.get_model_eval_confidence_sweep") + def test_confidence_sweep(self, mock_fn): + from roboflow.core.model_eval import ModelEval + + mock_fn.return_value = {"splits": {}} + ModelEval("k", "ws", "e1").confidence_sweep() + + mock_fn.assert_called_once_with("k", "ws", "e1") + + @patch("roboflow.adapters.rfapi.get_model_eval_performance_by_class") + def test_performance_by_class_default_split(self, mock_fn): + from roboflow.core.model_eval import ModelEval + + mock_fn.return_value = {"classes": []} + ModelEval("k", "ws", "e1").performance_by_class() + mock_fn.assert_called_once_with("k", "ws", "e1", split=None) + + @patch("roboflow.adapters.rfapi.get_model_eval_performance_by_class") + def test_performance_by_class_with_split(self, mock_fn): + from roboflow.core.model_eval import ModelEval + + mock_fn.return_value = {"classes": []} + ModelEval("k", "ws", "e1").performance_by_class(split="valid") + mock_fn.assert_called_once_with("k", "ws", "e1", split="valid") + + @patch("roboflow.adapters.rfapi.get_model_eval_confusion_matrix") + def test_confusion_matrix(self, mock_fn): + from roboflow.core.model_eval import ModelEval + + mock_fn.return_value = {"matrix": []} + ModelEval("k", "ws", "e1").confusion_matrix(split="test", confidence=30) + mock_fn.assert_called_once_with("k", "ws", "e1", split="test", confidence=30) + + @patch("roboflow.adapters.rfapi.get_model_eval_vector_analysis") + def test_vector_analysis(self, mock_fn): + from roboflow.core.model_eval import ModelEval + + mock_fn.return_value = {"clusters": []} + ModelEval("k", "ws", "e1").vector_analysis(confidence=40) + 
mock_fn.assert_called_once_with("k", "ws", "e1", confidence=40) + + @patch("roboflow.adapters.rfapi.get_model_eval_image_predictions") + def test_image_predictions(self, mock_fn): + from roboflow.core.model_eval import ModelEval + + mock_fn.return_value = {"images": []} + ModelEval("k", "ws", "e1").image_predictions(split="valid", confidence=20, limit=50, offset=100) + mock_fn.assert_called_once_with("k", "ws", "e1", split="valid", confidence=20, limit=50, offset=100) + + @patch("roboflow.adapters.rfapi.get_model_eval_recommendations") + def test_recommendations(self, mock_fn): + from roboflow.core.model_eval import ModelEval + + mock_fn.return_value = {"recommendations": []} + ModelEval("k", "ws", "e1").recommendations() + mock_fn.assert_called_once_with("k", "ws", "e1") + + +class TestModelEvalErrors(unittest.TestCase): + """Typed errors from the adapter propagate through the SDK accessors.""" + + @patch("roboflow.adapters.rfapi.get_model_eval_map_results") + def test_not_done_error_propagates(self, mock_fn): + from roboflow.adapters import rfapi + from roboflow.core.model_eval import ModelEval + + mock_fn.side_effect = rfapi.ModelEvalNotDoneError("Eval still running") + ev = ModelEval("k", "ws", "e1") + with self.assertRaises(rfapi.ModelEvalNotDoneError): + ev.map_results() + + @patch("roboflow.adapters.rfapi.get_model_eval") + def test_refresh_404_propagates(self, mock_fn): + from roboflow.adapters import rfapi + from roboflow.core.model_eval import ModelEval + + mock_fn.side_effect = rfapi.ModelEvalNotFoundError("nope") + with self.assertRaises(rfapi.ModelEvalNotFoundError): + ModelEval("k", "ws", "e1").refresh() + + +class TestWorkspaceEvalAccessors(unittest.TestCase): + @patch("roboflow.adapters.rfapi.list_model_evals") + def test_evals_returns_modeleval_instances(self, mock_list): + from roboflow.core.model_eval import ModelEval + + mock_list.return_value = { + "evals": [ + {"id": "e1", "status": "done", "projectId": "p1"}, + {"id": "e2", "status": 
"running", "projectId": "p1"}, + ] + } + ws = _make_workspace() + result = ws.evals(status="done", limit=5) + + self.assertEqual(len(result), 2) + self.assertTrue(all(isinstance(e, ModelEval) for e in result)) + self.assertEqual(result[0].id, "e1") + self.assertEqual(result[0].status, "done") + self.assertEqual(result[1].id, "e2") + # Workspace forwards filters to the adapter + mock_list.assert_called_once_with( + "k", "lee-sandbox", project=None, version=None, model=None, status="done", limit=5 + ) + + @patch("roboflow.adapters.rfapi.list_model_evals") + def test_evals_passes_all_filters(self, mock_list): + mock_list.return_value = {"evals": []} + + ws = _make_workspace() + ws.evals(project="p1", version="3", model="m1", status="failed", limit=200) + + mock_list.assert_called_once_with( + "k", "lee-sandbox", project="p1", version="3", model="m1", status="failed", limit=200 + ) + + @patch("roboflow.adapters.rfapi.list_model_evals") + def test_evals_empty_list(self, mock_list): + mock_list.return_value = {"evals": []} + ws = _make_workspace() + self.assertEqual(ws.evals(), []) + + @patch("roboflow.adapters.rfapi.get_model_eval") + def test_eval_returns_populated_modeleval(self, mock_get): + from roboflow.core.model_eval import ModelEval + + mock_get.return_value = { + "id": "e1", + "status": "done", + "summary": {"mAP": 0.91}, + } + ws = _make_workspace() + ev = ws.eval("e1") + + self.assertIsInstance(ev, ModelEval) + self.assertEqual(ev.id, "e1") + self.assertEqual(ev.status, "done") + self.assertEqual(ev.summary["mAP"], 0.91) + mock_get.assert_called_once_with("k", "lee-sandbox", "e1") + + @patch("roboflow.adapters.rfapi.get_model_eval") + def test_eval_propagates_not_found(self, mock_get): + from roboflow.adapters import rfapi + + mock_get.side_effect = rfapi.ModelEvalNotFoundError("nope") + ws = _make_workspace() + with self.assertRaises(rfapi.ModelEvalNotFoundError): + ws.eval("bad") + + +if __name__ == "__main__": + unittest.main() From 
738732306cd0fdaaec9c8ac9c36a42eb950f8e1c Mon Sep 17 00:00:00 2001 From: Lee Clement Date: Thu, 7 May 2026 10:26:14 -0400 Subject: [PATCH 2/5] fix(model-evals): drop dead two-shape error envelope handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The REST API returns a single flat shape {"error": "code", "message": "..."} — the agent's original adapter accepted both flat and nested shapes for forward-compat, but the nested shape never shipped. Drop the dead branch and the corresponding test; replace with a status-code-fallback test that exercises the existing 404/409 fallback paths. Co-Authored-By: Claude Opus 4.7 (1M context) --- roboflow/adapters/rfapi.py | 27 ++++++++---------------- tests/adapters/test_rfapi_model_evals.py | 6 +++--- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/roboflow/adapters/rfapi.py b/roboflow/adapters/rfapi.py index 2079e8cd..8d887abb 100644 --- a/roboflow/adapters/rfapi.py +++ b/roboflow/adapters/rfapi.py @@ -1191,32 +1191,24 @@ class InvalidConfidenceError(RoboflowError): def _model_eval_error_for(response): """Translate a model-eval error response into the right RoboflowError subclass. - The model-eval REST surface returns errors in the shape:: + The model-eval REST surface returns errors as a flat envelope:: {"error": "", "message": ""} - Some routes (and earlier drafts of the spec) instead nest the code as - ``{"error": {"code": "...", "message": "..."}}``; we accept both so we - don't churn when the server normalises. Falls back to plain - :class:`RoboflowError` when the body isn't JSON or the code is - unrecognised, so new error codes don't crash older SDK callers. + Falls back to plain :class:`RoboflowError` when the body isn't JSON or + the code is unrecognised, so new error codes don't crash older SDK + callers. Status-code fallbacks for 404/409 keep typed exceptions + available even if the server omits the ``error`` field. 
""" code = None message = response.text try: body = response.json() if isinstance(body, dict): - err = body.get("error") - if isinstance(err, str): - # Flat shape: {"error": "code_string", "message": "..."} - code = err - message = body.get("message") or err - elif isinstance(err, dict): - # Nested shape: {"error": {"code": "...", "message": "..."}} - code = err.get("code") - message = err.get("message") or body.get("message") or message - else: - message = body.get("message", message) + code = body.get("error") + if not isinstance(code, str): + code = None + message = body.get("message") or code or message except (ValueError, TypeError): pass @@ -1229,7 +1221,6 @@ def _model_eval_error_for(response): cls = cls_by_code.get(code or "") if cls is not None: return cls(message) - # Status-code fallbacks for backends that haven't shipped the typed code yet. if response.status_code == 404: return ModelEvalNotFoundError(message) if response.status_code == 409: diff --git a/tests/adapters/test_rfapi_model_evals.py b/tests/adapters/test_rfapi_model_evals.py index e27d40e4..41cc6942 100644 --- a/tests/adapters/test_rfapi_model_evals.py +++ b/tests/adapters/test_rfapi_model_evals.py @@ -162,9 +162,9 @@ def test_404_flat_envelope(self, mock_get): self.assertIn("Eval 'x' not found", str(ctx.exception)) @patch("roboflow.adapters.rfapi.requests.get") - def test_404_nested_envelope_back_compat(self, mock_get): - # Older nested shape: {"error": {"code": "...", "message": "..."}} - mock_get.return_value = _resp(404, {"error": {"code": "model_eval_not_found", "message": "nested"}}) + def test_404_status_code_fallback(self, mock_get): + # No `error` field at all — fall back to the status code mapping. 
+ mock_get.return_value = _resp(404, {"message": "something went wrong"}) with self.assertRaises(rfapi.ModelEvalNotFoundError): rfapi.get_model_eval("k", "ws", "x") From 267068377bf935c6b5ee80f9afb9a3b10bc19470 Mon Sep 17 00:00:00 2001 From: Lee Clement Date: Thu, 7 May 2026 11:41:41 -0400 Subject: [PATCH 3/5] fix(model-evals): align SDK + CLI with API evalId rename Address PR review on roboflow#11636 affecting the SDK/CLI: - ModelEval._apply reads evalId (legacy id fallback for forward-compat) - to_dict emits evalId - Workspace.evals resolves either field when constructing ModelEval - CLI list/get handlers prefer evalId, fall back to id - Drop the undocumented `config` attribute (not part of public DNA shape) - Tests updated for evalId; 57 pass Co-Authored-By: Claude Opus 4.7 (1M context) --- roboflow/cli/handlers/eval.py | 6 ++++-- roboflow/core/model_eval.py | 16 ++++++++++------ roboflow/core/workspace.py | 5 ++++- tests/test_model_eval.py | 14 +++++--------- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/roboflow/cli/handlers/eval.py b/roboflow/cli/handlers/eval.py index a4593b30..5edbdb54 100644 --- a/roboflow/cli/handlers/eval.py +++ b/roboflow/cli/handlers/eval.py @@ -222,7 +222,8 @@ def _list_evals(args): # noqa: ANN001 evals = result.get("evals", []) rows = [ { - "id": e.get("id", ""), + # Prefer DNA's `evalId`; tolerate legacy `id` from older server versions. + "id": e.get("evalId", e.get("id", "")), "status": e.get("status", ""), "project": e.get("projectId", ""), "version": e.get("versionId", ""), @@ -255,7 +256,8 @@ def _get_eval(args): # noqa: ANN001 return lines = [ - f"Eval: {info.get('id', args.eval_id)}", + # Prefer DNA's `evalId`; tolerate legacy `id`. 
+ f"Eval: {info.get('evalId', info.get('id', args.eval_id))}", f" Status: {info.get('status', '')}", f" Project: {info.get('projectId', '')}", f" Version: {info.get('versionId', '')}", diff --git a/roboflow/core/model_eval.py b/roboflow/core/model_eval.py index 38447fe2..1493eabd 100644 --- a/roboflow/core/model_eval.py +++ b/roboflow/core/model_eval.py @@ -47,12 +47,16 @@ def __init__( # -- internal ----------------------------------------------------------- def _apply(self, info: Dict[str, Any]) -> None: + # Server returns `evalId` (per DNA's identifier-embedding convention, + # consistent with every panel response). Accept legacy `id` for + # forward-compat with cached responses from older server versions. + if info.get("evalId"): + self.id = info["evalId"] self.status: Optional[str] = info.get("status") self.project_id: Optional[str] = info.get("projectId") self.version_id: Optional[str] = info.get("versionId") self.model_id: Optional[str] = info.get("modelId") self.created_at: Optional[str] = info.get("createdAt") - self.config: Dict[str, Any] = info.get("config", {}) or {} self.summary: Optional[Dict[str, Any]] = info.get("summary") self._raw: Dict[str, Any] = info @@ -122,16 +126,16 @@ def recommendations(self) -> Dict[str, Any]: # -- helpers ------------------------------------------------------------ def to_dict(self) -> Dict[str, Any]: - """Return the cached eval metadata as a plain dict (id + last header fetch).""" - data: Dict[str, Any] = {"id": self.id} + """Return the cached eval metadata as a plain dict (evalId + last header fetch).""" + data: Dict[str, Any] = {"evalId": self.id} # Prefer raw payload (preserves keys we don't surface as attrs); fall # back to attributes when only the constructor was called with no info. 
if self._raw: - return {**self._raw, "id": self.id} - for key in ("status", "projectId", "versionId", "modelId", "createdAt", "config", "summary"): + return {**self._raw, "evalId": self.id} + for key in ("status", "projectId", "versionId", "modelId", "createdAt", "summary"): attr = ( key - if key in {"status", "config", "summary"} + if key in {"status", "summary"} else { "projectId": "project_id", "versionId": "version_id", diff --git a/roboflow/core/workspace.py b/roboflow/core/workspace.py index 3fb90061..f25292e3 100644 --- a/roboflow/core/workspace.py +++ b/roboflow/core/workspace.py @@ -1478,7 +1478,10 @@ def evals( status=status, limit=limit, ) - return [ModelEval(self.__api_key, self.url, e["id"], info=e) for e in result.get("evals", [])] + # Server returns `evalId` (per DNA); fall back to legacy `id` for forward-compat. + return [ + ModelEval(self.__api_key, self.url, e.get("evalId") or e["id"], info=e) for e in result.get("evals", []) + ] def eval(self, eval_id: str) -> "ModelEval": """Fetch a single model eval by id. 
diff --git a/tests/test_model_eval.py b/tests/test_model_eval.py index 86b4a4c1..542939f0 100644 --- a/tests/test_model_eval.py +++ b/tests/test_model_eval.py @@ -26,13 +26,12 @@ def test_apply_info_populates_attributes(self): from roboflow.core.model_eval import ModelEval info = { - "id": "e1", + "evalId": "e1", "status": "done", "projectId": "p1", "versionId": "3", "modelId": "m1", "createdAt": "2025-01-01", - "config": {"overlap": 30, "iouThreshold": 50}, "summary": {"mAP": 0.9, "precision": 0.8, "recall": 0.85}, } ev = ModelEval("k", "ws", "e1", info=info) @@ -43,7 +42,6 @@ def test_apply_info_populates_attributes(self): self.assertEqual(ev.version_id, "3") self.assertEqual(ev.model_id, "m1") self.assertEqual(ev.created_at, "2025-01-01") - self.assertEqual(ev.config["overlap"], 30) self.assertEqual(ev.summary["mAP"], 0.9) def test_construction_without_info(self): @@ -52,7 +50,6 @@ def test_construction_without_info(self): ev = ModelEval("k", "ws", "e1") self.assertEqual(ev.id, "e1") self.assertIsNone(ev.status) - self.assertEqual(ev.config, {}) self.assertIsNone(ev.summary) @@ -62,10 +59,9 @@ def test_refresh_updates_status_and_summary(self, mock_get): from roboflow.core.model_eval import ModelEval mock_get.return_value = { - "id": "e1", + "evalId": "e1", "status": "done", "summary": {"mAP": 0.95}, - "config": {}, } ev = ModelEval("k", "ws", "e1") result = ev.refresh() @@ -178,8 +174,8 @@ def test_evals_returns_modeleval_instances(self, mock_list): mock_list.return_value = { "evals": [ - {"id": "e1", "status": "done", "projectId": "p1"}, - {"id": "e2", "status": "running", "projectId": "p1"}, + {"evalId": "e1", "status": "done", "projectId": "p1"}, + {"evalId": "e2", "status": "running", "projectId": "p1"}, ] } ws = _make_workspace() @@ -217,7 +213,7 @@ def test_eval_returns_populated_modeleval(self, mock_get): from roboflow.core.model_eval import ModelEval mock_get.return_value = { - "id": "e1", + "evalId": "e1", "status": "done", "summary": {"mAP": 0.91}, } 
From 19171e30a88a6df75f6f414e03de4eb7da641b6c Mon Sep 17 00:00:00 2001 From: Lee Clement Date: Thu, 7 May 2026 12:53:23 -0400 Subject: [PATCH 4/5] =?UTF-8?q?fix(model-evals):=20rename=20SDK=20attr=20`?= =?UTF-8?q?project=5Fid`=20=E2=86=92=20`project`=20(URL=20slug)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pairs with roboflow#11636 dropping `projectId` from the public response. The SDK previously read `info["projectId"]` (the Firestore doc id) into `ModelEval.project_id`. That field was a doc-id leak — the API now only returns `project` (the URL slug) on the principle that public APIs should not expose storage-layer ids. Rename: `ModelEval.project_id` → `ModelEval.project`. Accept legacy `projectId` from cached older-server responses for forward-compat. CLI list/get handlers also pull from `project` first. Co-Authored-By: Claude Opus 4.7 (1M context) --- roboflow/cli/handlers/eval.py | 7 +++++-- roboflow/core/model_eval.py | 13 ++++++++----- tests/cli/test_eval_handler.py | 22 +++++++++++++++++----- tests/test_model_eval.py | 8 ++++---- 4 files changed, 34 insertions(+), 16 deletions(-) diff --git a/roboflow/cli/handlers/eval.py b/roboflow/cli/handlers/eval.py index 5edbdb54..d0656df4 100644 --- a/roboflow/cli/handlers/eval.py +++ b/roboflow/cli/handlers/eval.py @@ -225,7 +225,9 @@ def _list_evals(args): # noqa: ANN001 # Prefer DNA's `evalId`; tolerate legacy `id` from older server versions. "id": e.get("evalId", e.get("id", "")), "status": e.get("status", ""), - "project": e.get("projectId", ""), + # `project` is the URL slug; the public API does not expose the doc id. + # Tolerate legacy `projectId` for forward-compat against older deploys. + "project": e.get("project") or e.get("projectId", ""), "version": e.get("versionId", ""), "model": e.get("modelId", "") or "", "created": e.get("createdAt", ""), @@ -259,7 +261,8 @@ def _get_eval(args): # noqa: ANN001 # Prefer DNA's `evalId`; tolerate legacy `id`. 
f"Eval: {info.get('evalId', info.get('id', args.eval_id))}", f" Status: {info.get('status', '')}", - f" Project: {info.get('projectId', '')}", + # `project` is the URL slug; tolerate legacy `projectId` for forward-compat. + f" Project: {info.get('project') or info.get('projectId', '')}", f" Version: {info.get('versionId', '')}", f" Model: {info.get('modelId', '') or '(none)'}", f" Created: {info.get('createdAt', '')}", diff --git a/roboflow/core/model_eval.py b/roboflow/core/model_eval.py index 1493eabd..0d7ee9cf 100644 --- a/roboflow/core/model_eval.py +++ b/roboflow/core/model_eval.py @@ -53,7 +53,11 @@ def _apply(self, info: Dict[str, Any]) -> None: if info.get("evalId"): self.id = info["evalId"] self.status: Optional[str] = info.get("status") - self.project_id: Optional[str] = info.get("projectId") + # `project` is the project URL slug — the same identifier the REST API + # uses in URL paths. The internal Firestore doc id is intentionally + # never exposed in the public API. Accept legacy `projectId` for + # forward-compat with older server versions. + self.project: Optional[str] = info.get("project") or info.get("projectId") self.version_id: Optional[str] = info.get("versionId") self.model_id: Optional[str] = info.get("modelId") self.created_at: Optional[str] = info.get("createdAt") @@ -132,12 +136,11 @@ def to_dict(self) -> Dict[str, Any]: # back to attributes when only the constructor was called with no info. 
if self._raw: return {**self._raw, "evalId": self.id} - for key in ("status", "projectId", "versionId", "modelId", "createdAt", "summary"): + for key in ("status", "project", "versionId", "modelId", "createdAt", "summary"): attr = ( key - if key in {"status", "summary"} + if key in {"status", "project", "summary"} else { - "projectId": "project_id", "versionId": "version_id", "modelId": "model_id", "createdAt": "created_at", @@ -149,7 +152,7 @@ def to_dict(self) -> Dict[str, Any]: return data def __repr__(self) -> str: # pragma: no cover - debug helper - return f"ModelEval(id={self.id!r}, status={self.status!r}, project={self.project_id!r})" + return f"ModelEval(id={self.id!r}, status={self.status!r}, project={self.project!r})" __all__: List[str] = ["ModelEval"] diff --git a/tests/cli/test_eval_handler.py b/tests/cli/test_eval_handler.py index b1fe9444..7e19d60c 100644 --- a/tests/cli/test_eval_handler.py +++ b/tests/cli/test_eval_handler.py @@ -75,14 +75,21 @@ def test_list_text_calls_adapter_with_filters(self, _key, _ws, mock_list): { "id": "e1", "status": "done", - "projectId": "p1", + "project": "my-project-slug", "versionId": "3", "modelId": None, "createdAt": "2025-01-01", } ] } - args = _args(workspace=None, project="p1", version="3", model=None, status="done", limit=5) + args = _args( + workspace=None, + project="my-project-slug", + version="3", + model=None, + status="done", + limit=5, + ) from roboflow.cli.handlers.eval import _list_evals @@ -90,7 +97,13 @@ def test_list_text_calls_adapter_with_filters(self, _key, _ws, mock_list): _list_evals(args) mock_list.assert_called_once_with( - "key", "lee-sandbox", project="p1", version="3", model=None, status="done", limit=5 + "key", + "lee-sandbox", + project="my-project-slug", + version="3", + model=None, + status="done", + limit=5, ) printed = mock_print.call_args[0][0] self.assertIn("e1", printed) @@ -136,11 +149,10 @@ def test_get_text(self, _key, _ws, mock_get): mock_get.return_value = { "id": "e1", 
"status": "done", - "projectId": "p1", + "project": "my-project-slug", "versionId": "3", "modelId": "m1", "createdAt": "2025-01-01", - "config": {"overlap": 30, "iouThreshold": 50}, "summary": {"mAP": 0.91, "precision": 0.85, "recall": 0.8}, } args = _args(workspace=None, eval_id="e1") diff --git a/tests/test_model_eval.py b/tests/test_model_eval.py index 542939f0..49db9112 100644 --- a/tests/test_model_eval.py +++ b/tests/test_model_eval.py @@ -28,7 +28,7 @@ def test_apply_info_populates_attributes(self): info = { "evalId": "e1", "status": "done", - "projectId": "p1", + "project": "my-project-slug", # URL slug — the public API only returns the slug "versionId": "3", "modelId": "m1", "createdAt": "2025-01-01", @@ -38,7 +38,7 @@ def test_apply_info_populates_attributes(self): self.assertEqual(ev.id, "e1") self.assertEqual(ev.status, "done") - self.assertEqual(ev.project_id, "p1") + self.assertEqual(ev.project, "my-project-slug") self.assertEqual(ev.version_id, "3") self.assertEqual(ev.model_id, "m1") self.assertEqual(ev.created_at, "2025-01-01") @@ -174,8 +174,8 @@ def test_evals_returns_modeleval_instances(self, mock_list): mock_list.return_value = { "evals": [ - {"evalId": "e1", "status": "done", "projectId": "p1"}, - {"evalId": "e2", "status": "running", "projectId": "p1"}, + {"evalId": "e1", "status": "done", "project": "my-project-slug"}, + {"evalId": "e2", "status": "running", "project": "my-project-slug"}, ] } ws = _make_workspace() From 361d9938ed6f18761a063052e369cd7bcd0fc1e7 Mon Sep 17 00:00:00 2001 From: Lee Clement Date: Thu, 7 May 2026 14:23:51 -0400 Subject: [PATCH 5/5] fix(model-evals): address review comments on PR #475 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three issues raised in review: M1 (BLOCKER) — `eval image-predictions` table dropped TP/FP/FN. 
The public API nests those counts under `stats` with camelCase keys (`truePositives`/`falsePositives`/`falseNegatives`); the renderer was reading `stats.tp`/`stats.fp`/`stats.fn` and silently rendering blanks. Same line was also rendering the whole `cluster` object (including the `embedding2D` array) in the cluster column; now renders `cluster.id` only. Live transcript regression case added. H2 — `ModelEval`'s class docstring promised `.config` was populated by `refresh()`, but `_apply()` never set it. Drop the reference. The `config` field was previously stripped from the public API response (per earlier review item B — `overlap`/`iouThreshold` weren't documented in DNA), so the SDK never has anything to populate. CLI's matching dead "Config: overlap=… iouThreshold=…" line in `eval get` also removed. M4/N1 — `to_dict()` had an untested fallback branch + an awkward inline-conditional dict-lookup mapping json keys to attr names. Refactor to a flat `_PUBLIC_FIELDS = ((json_key, attr_name), ...)` tuple list. Add four tests: - round-trips a server payload with `evalId` overlay - overlays `evalId` when payload used legacy `id` - constructor-only path serialises attrs only, omitting None fields - constructor-only path translates Python attr names back to JSON keys 62 tests pass (was 57). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- roboflow/cli/handlers/eval.py | 15 ++++--- roboflow/core/model_eval.py | 47 ++++++++++++-------- tests/cli/test_eval_handler.py | 48 +++++++++++++++++++++ tests/test_model_eval.py | 78 ++++++++++++++++++++++++++++++++++ 4 files changed, 163 insertions(+), 25 deletions(-) diff --git a/roboflow/cli/handlers/eval.py b/roboflow/cli/handlers/eval.py index d0656df4..0e41cc90 100644 --- a/roboflow/cli/handlers/eval.py +++ b/roboflow/cli/handlers/eval.py @@ -267,9 +267,6 @@ def _get_eval(args): # noqa: ANN001 f" Model: {info.get('modelId', '') or '(none)'}", f" Created: {info.get('createdAt', '')}", ] - config = info.get("config") or {} - if config: - lines.append(f" Config: overlap={config.get('overlap')} iouThreshold={config.get('iouThreshold')}") summary = info.get("summary") or {} if summary: lines.append( @@ -452,15 +449,19 @@ def _image_predictions(args): # noqa: ANN001 images = data.get("images", []) rows = [] for img in images: + # Stats are camelCase per the public API: + # `truePositives`/`falsePositives`/`falseNegatives` (not `tp`/`fp`/`fn`). 
stats = img.get("stats") or {} + cluster = img.get("cluster") or {} + cluster_id = cluster.get("id") if isinstance(cluster, dict) else cluster rows.append( { "image": img.get("imageName", img.get("imageId", "")), "split": img.get("split", ""), - "tp": stats.get("tp", ""), - "fp": stats.get("fp", ""), - "fn": stats.get("fn", ""), - "cluster": img.get("cluster", ""), + "tp": stats.get("truePositives", ""), + "fp": stats.get("falsePositives", ""), + "fn": stats.get("falseNegatives", ""), + "cluster": cluster_id if cluster_id is not None else "", } ) table = format_table( diff --git a/roboflow/core/model_eval.py b/roboflow/core/model_eval.py index 0d7ee9cf..49479c72 100644 --- a/roboflow/core/model_eval.py +++ b/roboflow/core/model_eval.py @@ -27,7 +27,7 @@ class ModelEval: from roboflow.core.model_eval import ModelEval ev = ModelEval(api_key, "lee-sandbox", "huUF720inUcymARwqAGK") - ev.refresh() # populates .status, .summary, .config, etc. + ev.refresh() # populates .status, .summary, etc. """ def __init__( @@ -67,7 +67,7 @@ def _apply(self, info: Dict[str, Any]) -> None: # -- core --------------------------------------------------------------- def refresh(self) -> "ModelEval": - """Re-fetch the eval header (status, summary, config) from the server.""" + """Re-fetch the eval header (status, summary, …) from the server.""" info = rfapi.get_model_eval(self._api_key, self._workspace_url, self.id) self._apply(info) return self @@ -129,26 +129,37 @@ def recommendations(self) -> Dict[str, Any]: # -- helpers ------------------------------------------------------------ + # Mapping (json_key, attr_name) used by `to_dict()` to round-trip a + # constructor-only ModelEval (one with no `info=` payload) back into the + # public JSON shape. Same fields the server returns at the top level of + # `modelEvals.get`, in the same order. 
+ _PUBLIC_FIELDS = ( + ("status", "status"), + ("project", "project"), + ("versionId", "version_id"), + ("modelId", "model_id"), + ("createdAt", "created_at"), + ("summary", "summary"), + ) + def to_dict(self) -> Dict[str, Any]: - """Return the cached eval metadata as a plain dict (evalId + last header fetch).""" - data: Dict[str, Any] = {"evalId": self.id} - # Prefer raw payload (preserves keys we don't surface as attrs); fall - # back to attributes when only the constructor was called with no info. + """Return the cached eval metadata as a plain dict (evalId + last header fetch). + + When the instance was created from a server payload (the usual path — + via ``Workspace.eval`` or ``Workspace.evals``) the raw payload is + round-tripped, with ``evalId`` overlaid so legacy ``id``-keyed + responses still emit the DNA-aligned field. When the instance was + created without a payload (constructor only — ``ModelEval(key, ws, + eval_id)`` with no ``refresh()``) only the attributes the caller has + set get serialised, omitting any ``None`` fields. 
+ """ if self._raw: return {**self._raw, "evalId": self.id} - for key in ("status", "project", "versionId", "modelId", "createdAt", "summary"): - attr = ( - key - if key in {"status", "project", "summary"} - else { - "versionId": "version_id", - "modelId": "model_id", - "createdAt": "created_at", - }[key] - ) - value = getattr(self, attr, None) + data: Dict[str, Any] = {"evalId": self.id} + for json_key, attr_name in self._PUBLIC_FIELDS: + value = getattr(self, attr_name, None) if value is not None: - data[key] = value + data[json_key] = value return data def __repr__(self) -> str: # pragma: no cover - debug helper diff --git a/tests/cli/test_eval_handler.py b/tests/cli/test_eval_handler.py index 7e19d60c..0d4a4a30 100644 --- a/tests/cli/test_eval_handler.py +++ b/tests/cli/test_eval_handler.py @@ -307,6 +307,54 @@ def test_image_predictions_pagination(self, _key, _ws, mock_fn): printed = mock_print.call_args[0][0] self.assertIn("a.jpg", printed) + @patch("roboflow.adapters.rfapi.get_model_eval_image_predictions") + @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") + @patch("roboflow.config.load_roboflow_api_key", return_value="key") + def test_image_predictions_table_renders_TP_FP_FN_from_camelCase_stats(self, _key, _ws, mock_fn): + # Regression: the public API nests counts under `stats` with camelCase + # keys (`truePositives`/`falsePositives`/`falseNegatives`). Earlier code + # read `stats.tp`/`stats.fp`/`stats.fn` and silently rendered blanks. + mock_fn.return_value = { + "split": "test", + "confidenceThreshold": 0.2, + "totalImages": 1, + "offset": 0, + "limit": 1, + "images": [ + { + "imageId": "i1", + "imageName": "abc.jpg", + "split": "test", + "augmentations": 2, + "stats": { + "truePositives": 7, + "falsePositives": 2, + "falseNegatives": 1, + "precision": 0.78, + "recall": 0.875, + "f1": 0.824, + }, + # The cluster column previously stringified the whole dict; + # we only want the cluster id rendered. 
+ "cluster": {"id": 4, "embedding2D": [1.5, -3.2]}, + } + ], + } + args = _args(workspace=None, eval_id="e1", split="test", confidence=None, limit=None, offset=None) + + from roboflow.cli.handlers.eval import _image_predictions + + with patch("builtins.print") as mock_print: + _image_predictions(args) + printed = mock_print.call_args[0][0] + # TP/FP/FN counts must appear in the rendered table. + self.assertIn("7", printed) # truePositives + self.assertIn("2", printed) # falsePositives + augmentations both = 2; either way it should appear + self.assertIn("1", printed) # falseNegatives + # Cluster rendered as the bare id, not the embedding-bearing dict. + self.assertIn(" 4 ", printed) + self.assertNotIn("embedding2D", printed) + @patch("roboflow.adapters.rfapi.get_model_eval_recommendations") @patch("roboflow.cli._resolver.resolve_default_workspace", return_value="lee-sandbox") @patch("roboflow.config.load_roboflow_api_key", return_value="key") diff --git a/tests/test_model_eval.py b/tests/test_model_eval.py index 49db9112..40072bab 100644 --- a/tests/test_model_eval.py +++ b/tests/test_model_eval.py @@ -53,6 +53,84 @@ def test_construction_without_info(self): self.assertIsNone(ev.summary) +class TestModelEvalToDict(unittest.TestCase): + """`to_dict()` has two branches: with-payload (round-trip) and without-payload + (rebuild from attributes). Both need to behave correctly.""" + + def test_to_dict_round_trips_raw_payload_with_evalId_overlay(self): + from roboflow.core.model_eval import ModelEval + + # Server-payload path: the raw response is round-tripped (including any + # extra keys we don't surface as attrs), with `evalId` overlaid so legacy + # `id`-keyed responses still emit the DNA-aligned field. 
+ info = { + "evalId": "e1", + "status": "done", + "project": "my-project-slug", + "versionId": "3", + "modelId": "m1", + "createdAt": "2025-01-01", + "summary": {"mAP": 0.9, "precision": 0.8, "recall": 0.85}, + "extraField": "preserved-by-roundtrip", + } + ev = ModelEval("k", "ws", "e1", info=info) + d = ev.to_dict() + # Round-trip preserves every server-side field, including ones we don't + # surface as attributes. + self.assertEqual(d["extraField"], "preserved-by-roundtrip") + self.assertEqual(d["project"], "my-project-slug") + self.assertEqual(d["evalId"], "e1") + self.assertEqual(d["summary"]["mAP"], 0.9) + + def test_to_dict_overlays_evalId_when_payload_used_legacy_id_key(self): + from roboflow.core.model_eval import ModelEval + + # Older server versions returned `id` instead of `evalId`. The SDK accepts + # both on the way in; on the way out it always emits `evalId`. + info = {"id": "e1-legacy", "status": "done", "project": "p"} + ev = ModelEval("k", "ws", "e1-legacy", info=info) + d = ev.to_dict() + self.assertEqual(d["evalId"], "e1-legacy") + + def test_to_dict_no_info_serialises_attrs_only_omitting_None(self): + from roboflow.core.model_eval import ModelEval + + # Constructor-only path (no `info=` payload, no `refresh()` call). + # Only attributes the caller sets get serialised; everything else is + # omitted rather than serialised as `null`. + ev = ModelEval("k", "ws", "e1") + d = ev.to_dict() + self.assertEqual(d, {"evalId": "e1"}) + + def test_to_dict_no_info_translates_attr_names_back_to_json_keys(self): + from roboflow.core.model_eval import ModelEval + + # Hand-construct an instance without an info payload, then mutate + # attributes (the way a user might before serialising for logging / + # comparison). `to_dict` should emit the JSON-side names, not the + # snake_case Python attr names. 
+ ev = ModelEval("k", "ws", "e1") + ev.status = "done" + ev.project = "p" + ev.version_id = "3" + ev.model_id = "m1" + ev.created_at = "2025-01-01" + ev.summary = {"mAP": 0.9} + d = ev.to_dict() + self.assertEqual( + d, + { + "evalId": "e1", + "status": "done", + "project": "p", + "versionId": "3", # not version_id + "modelId": "m1", # not model_id + "createdAt": "2025-01-01", # not created_at + "summary": {"mAP": 0.9}, + }, + ) + + class TestModelEvalRefresh(unittest.TestCase): @patch("roboflow.adapters.rfapi.get_model_eval") def test_refresh_updates_status_and_summary(self, mock_get):