From 00b2c13dcbe5c07dfdbf9421461edc6c48b074a6 Mon Sep 17 00:00:00 2001 From: Chenhan Yu Date: Wed, 17 Jun 2026 13:27:55 -0700 Subject: [PATCH 1/2] launcher: package as modelopt_launcher; mcp: call console script directly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tools/launcher/__init__.py gains PACKAGE_DIR so the directory is an importable package named modelopt_launcher (via package-dir mapping in pyproject.toml). common/ and examples/ stay in place — no file moves. pyproject.toml adds: - packages = ["modelopt_launcher"] + package-dir = {"modelopt_launcher": "."} - package-data for common/**/* and examples/**/*.{yaml,jinja} - console_scripts: modelopt-launcher = "modelopt_launcher.launch:main" launch.py: - imports from modelopt_launcher.{core,slurm_config} (works in both uv run and installed console-script modes via editable install) - adds _has_modelopt_src flag: skips packaging modelopt source when running as an installed console script (cluster container already has modelopt pre-installed) - adds main() entry point tools/mcp/modelopt_mcp/bridge.py: - deletes _find_launcher_dir() (75-line filesystem walker) and _launcher_dir_not_found_response(); 7 corresponding tests removed - _find_launcher_examples_dir() simplified to 2 strategies: env override → import modelopt_launcher - submit_job_impl and _submit_job_dry_run switch from ["uv", "run", "launch.py"] with cwd=launcher_dir to ["modelopt-launcher"] with no cwd required - _resolve_experiment_dir drops the _find_launcher_dir() fallback - read_cluster_artifact_impl drops cwd= from nemo subprocess tools/mcp/pyproject.toml declares modelopt-launcher as a proper dependency (dev: editable path ../launcher; published: PyPI). Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Chenhan Yu --- tools/launcher/.gitignore | 3 + tools/launcher/__init__.py | 6 +- tools/launcher/launch.py | 62 ++++++++++---- tools/launcher/pyproject.toml | 15 +++- tools/mcp/modelopt_mcp/bridge.py | 138 +++---------------------------- tools/mcp/pyproject.toml | 24 +----- tools/mcp/tests/test_bridge.py | 99 ---------------------- 7 files changed, 80 insertions(+), 267 deletions(-) diff --git a/tools/launcher/.gitignore b/tools/launcher/.gitignore index 3eb4a49079c..3c0eb2aee0d 100644 --- a/tools/launcher/.gitignore +++ b/tools/launcher/.gitignore @@ -13,6 +13,9 @@ local_experiments/ # uv lock (generated, not portable) uv.lock +# Auto-created symlink by launch.py in dev mode +modules/ + # Python cache __pycache__/ diff --git a/tools/launcher/__init__.py b/tools/launcher/__init__.py index 11b92d8b771..88e2af4b450 100644 --- a/tools/launcher/__init__.py +++ b/tools/launcher/__init__.py @@ -13,4 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""ModelOpt Launcher — submit quantization, training, and evaluation jobs to Slurm clusters.""" +"""modelopt_launcher — installable package exposing launcher scripts and examples.""" + +import os as _os + +PACKAGE_DIR: str = _os.path.dirname(_os.path.abspath(__file__)) diff --git a/tools/launcher/launch.py b/tools/launcher/launch.py index fdb867f08aa..d95a84eabbf 100644 --- a/tools/launcher/launch.py +++ b/tools/launcher/launch.py @@ -33,9 +33,16 @@ import subprocess # nosec B404 import warnings +import modelopt_launcher as _pkg import nemo_run as run -from core import SandboxPipeline, get_default_env, register_factory, run_jobs, set_slurm_config_type -from slurm_config import SlurmConfig, slurm_factory +from modelopt_launcher.core import ( + SandboxPipeline, + get_default_env, + register_factory, + run_jobs, + set_slurm_config_type, +) +from modelopt_launcher.slurm_config import SlurmConfig, slurm_factory set_slurm_config_type(SlurmConfig) register_factory("slurm_factory", slurm_factory) @@ -44,33 +51,49 @@ # Launcher-specific configuration # --------------------------------------------------------------------------- -LAUNCHER_DIR = os.path.dirname(os.path.abspath(__file__)) -MODELOPT_ROOT = os.path.dirname(os.path.dirname(LAUNCHER_DIR)) +LAUNCHER_DIR = _pkg.PACKAGE_DIR # tools/launcher/ (dev or installed) -# Ensure modules/Model-Optimizer symlink exists (points to parent Model-Optimizer root) -_mo_symlink = os.path.join(LAUNCHER_DIR, "modules", "Model-Optimizer") -if not os.path.exists(_mo_symlink): - os.makedirs(os.path.join(LAUNCHER_DIR, "modules"), exist_ok=True) - os.symlink(os.path.relpath(MODELOPT_ROOT, os.path.join(LAUNCHER_DIR, "modules")), _mo_symlink) +# MODELOPT_ROOT is only available in a dev checkout (tools/launcher/../..): +# when running as an installed console script the cluster container already +# has modelopt pre-installed, so we skip packaging it from source. +MODELOPT_ROOT = os.path.dirname(os.path.dirname(LAUNCHER_DIR)) +_modelopt_src = os.path.join(LAUNCHER_DIR, "modules", "Model-Optimizer", "modelopt") +_has_modelopt_src = os.path.isdir(_modelopt_src) + +# Ensure modules/Model-Optimizer symlink exists so the packager can resolve +# modules/Model-Optimizer/* patterns in dev mode. +if _has_modelopt_src: + _mo_symlink = os.path.join(LAUNCHER_DIR, "modules", "Model-Optimizer") + if not os.path.exists(_mo_symlink): + os.makedirs(os.path.join(LAUNCHER_DIR, "modules"), exist_ok=True) + os.symlink( + os.path.relpath(MODELOPT_ROOT, os.path.join(LAUNCHER_DIR, "modules")), _mo_symlink + ) EXPERIMENT_TITLE = "cicd" DEFAULT_SLURM_ENV, DEFAULT_LOCAL_ENV = get_default_env(EXPERIMENT_TITLE) -packager = run.PatternPackager( - include_pattern=[ +_include_pattern = ["examples/*", "common/*"] +_relative_path = [LAUNCHER_DIR, LAUNCHER_DIR] + +if _has_modelopt_src: + _include_pattern = [ "modules/Megatron-LM/megatron/*", "modules/Megatron-LM/examples/*", "modules/Megatron-LM/*.py", "modules/Model-Optimizer/modelopt/*", "modules/Model-Optimizer/modelopt_recipes/*", "modules/Model-Optimizer/examples/*", - "examples/*", - "common/*", - ], - relative_path=[LAUNCHER_DIR] * 8, + *_include_pattern, + ] + _relative_path = [LAUNCHER_DIR] * 6 + _relative_path + +packager = run.PatternPackager( + include_pattern=_include_pattern, + relative_path=_relative_path, ) -MODELOPT_SRC_PATH = os.path.join(LAUNCHER_DIR, "modules/Model-Optimizer/modelopt") +MODELOPT_SRC_PATH = _modelopt_src if _has_modelopt_src else None # --------------------------------------------------------------------------- @@ -125,5 +148,10 @@ def launch( ) -if __name__ == "__main__": +def main() -> None: + """Console script entry point for the ``modelopt-launcher`` command.""" run.cli.main(launch) + + +if __name__ == "__main__": + main() diff --git a/tools/launcher/pyproject.toml b/tools/launcher/pyproject.toml index 94577098d93..8ba825776d4 100644 --- a/tools/launcher/pyproject.toml +++ b/tools/launcher/pyproject.toml @@ -8,8 +8,21 @@ dependencies = [ "pyyaml", ] +[project.scripts] +modelopt-launcher = "modelopt_launcher.launch:main" + [tool.setuptools] -py-modules = [] +packages = ["modelopt_launcher"] + +[tool.setuptools.package-dir] +modelopt_launcher = "." + +[tool.setuptools.package-data] +modelopt_launcher = [ + "common/**/*", + "examples/**/*.yaml", + "examples/**/*.jinja", +] [tool.pytest.ini_options] testpaths = ["tests"] diff --git a/tools/mcp/modelopt_mcp/bridge.py b/tools/mcp/modelopt_mcp/bridge.py index 45d8661f290..cadb2a4a088 100644 --- a/tools/mcp/modelopt_mcp/bridge.py +++ b/tools/mcp/modelopt_mcp/bridge.py @@ -54,97 +54,15 @@ ) -def _find_launcher_dir() -> Path | None: - """Resolve the modelopt launcher's directory. - - Tries, in order: - - 1. ``$MODELOPT_LAUNCHER_DIR`` env override — the deterministic - path agents/operators can set when the package layout doesn't - match the in-repo expectation. - 2. ``_THIS_DIR.parent.parent / "launcher"`` — the in-repo layout - (``tools/mcp/modelopt_mcp/bridge.py`` → ``tools/launcher/``). - Works in dev installs (``pip install -e tools/mcp``) and in - direct ``Model-Optimizer`` clones. - 3. Walk up from ``os.getcwd()`` looking for - ``modules/Model-Optimizer/tools/launcher/`` (intern-agent - workspace layout) or ``tools/launcher/`` (direct - Model-Optimizer checkout) at each ancestor. Stops at the - filesystem root. - - Returns ``None`` if no candidate resolves to an existing dir. - Callers surface that as a structured ``launcher_dir_not_found`` - failure with the searched paths in the diagnostic. - - Empirically: when modelopt-mcp is installed via ``uv tool install`` - (intern-agent's CI install pattern, MR !226), ``_THIS_DIR`` lives - inside ``~/.local/share/uv/tools/modelopt-mcp/lib/.../site-packages/`` - and step 2's parent-walk doesn't find the launcher. Step 3 (cwd - walk-up) handles that case — the agent's CWD is always inside - its cloned nmm-sandbox workspace where ``modules/Model-Optimizer/ - tools/launcher/`` does exist. - """ - env = os.environ.get("MODELOPT_LAUNCHER_DIR") - if env: - p = Path(env) - if p.exists(): - return p - - # In-repo layout (dev install / direct clone) - candidate = _THIS_DIR.parent.parent / "launcher" - if candidate.exists(): - return candidate - - # cwd walk-up (uv-tool-install + agent workspace layout) - cwd = Path.cwd().resolve() - for ancestor in (cwd, *cwd.parents): - for rel in ("modules/Model-Optimizer/tools/launcher", "tools/launcher"): - candidate = ancestor / rel - if candidate.exists(): - return candidate - - return None - - -def _launcher_dir_not_found_response(*, dry_run: bool = False) -> dict: - """Structured failure when ``_find_launcher_dir()`` returns None. - - Centralized so the five callsites that need the launcher dir - return a consistent diagnostic listing the searched paths. - """ - env_path = os.environ.get("MODELOPT_LAUNCHER_DIR") or "(unset)" - in_repo = _THIS_DIR.parent.parent / "launcher" - resp: dict = { - "ok": False, - "reason": "launcher_dir_not_found", - "diagnostic": ( - "Could not locate tools/launcher/. Searched:\n" - f" 1. $MODELOPT_LAUNCHER_DIR={env_path}\n" - f" 2. in-repo layout: {in_repo} (exists={in_repo.exists()})\n" - f" 3. cwd walk-up from {Path.cwd().resolve()} looking for " - "modules/Model-Optimizer/tools/launcher or tools/launcher\n" - "Fix: set $MODELOPT_LAUNCHER_DIR to the absolute path of your " - "Model-Optimizer checkout's tools/launcher/, or run modelopt-mcp " - "from inside such a checkout." - ), - } - if dry_run: - resp["dry_run"] = True - return resp - - def _find_launcher_examples_dir() -> Path | None: """Resolve the launcher examples directory. Strategy (in order): 1. ``MODELOPT_LAUNCHER_EXAMPLES_DIR`` env override — for tests + ad-hoc relocations. - 2. ``../../launcher/examples/`` from this file — the in-repo layout - when running from a Model-Optimizer clone (this is the dev mode - AND the uvx-from-git mode, since uvx checks out the whole repo). - 3. Site-packages install: walk back through the modelopt_launcher - package to find its examples/ — fallback for the case where the - launcher was pip-installed standalone. + 2. ``import modelopt_launcher`` — works whether the launcher is + installed via pip/uvx or in editable dev mode; ``PACKAGE_DIR`` + points at ``tools/launcher/``, which contains ``examples/``. Returns None if no candidate exists; callers surface that as a structured failure rather than blowing up. @@ -154,19 +72,10 @@ def _find_launcher_examples_dir() -> Path | None: p = Path(env) return p if p.exists() else None - # In-repo: this file is at tools/mcp/modelopt_mcp/bridge.py; - # examples are at tools/launcher/examples/. - candidate = _THIS_DIR.parent.parent / "launcher" / "examples" - if candidate.exists(): - return candidate - - # Site-packages fallback: the modelopt-launcher package may carry - # its examples next to its core.py. try: import modelopt_launcher - pkg_dir = Path(modelopt_launcher.__file__).resolve().parent - candidate = pkg_dir / "examples" + candidate = Path(modelopt_launcher.PACKAGE_DIR) / "examples" if candidate.exists(): return candidate except ImportError: @@ -604,15 +513,14 @@ def submit_job_impl( # list never goes through a shell, so quoting bakes literal quote chars # into the values that nemo-run's CLI parser sees. Verbatim values # carry spaces / special chars safely. - argv = ["uv", "run", "launch.py", "--yaml", str(abs_yaml), "--yes"] + argv = ["modelopt-launcher", "--yaml", str(abs_yaml), "--yes"] if hf_local: argv.append(f"hf_local={hf_local}") else: - # Slurm mode — `launch.py`'s entrypoint does not accept a - # `cluster_host` arg (see tools/launcher/launch.py:82). The host - # is sourced via the SLURM_HOST env var, consumed by - # `slurm_factory(host=os.environ.get("SLURM_HOST", ""))` in - # tools/launcher/slurm_config.py. Propagate via env, not argv. + # Slurm mode — the launcher entrypoint does not accept a + # `cluster_host` arg. The host is sourced via the SLURM_HOST env + # var, consumed by slurm_factory in slurm_config.py. + # Propagate via env, not argv. if cluster_user: argv.append(f"user={cluster_user}") if identity: @@ -625,11 +533,6 @@ def submit_job_impl( for k, v in (extra_overrides or {}).items(): argv.append(f"{k}={v}") - # Run from the launcher dir so it picks up its own ./core.py etc. - launcher_dir = _find_launcher_dir() - if launcher_dir is None: - return _launcher_dir_not_found_response() - # Propagate env so submit-side and status-side agree on NEMORUN_HOME. # Without this, `launch.py` defaults NEMORUN_HOME to its own cwd # (tools/launcher/), but `_resolve_experiment_dir` later checks the @@ -655,7 +558,6 @@ def submit_job_impl( # B603 false positive — argv is a controlled list built above. proc = subprocess.Popen( # nosec B603 argv, - cwd=str(launcher_dir), env=child_env, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, @@ -682,7 +584,6 @@ def submit_job_impl( try: proc = subprocess.run( # nosec B603 argv, - cwd=str(launcher_dir), env=child_env, capture_output=True, text=True, @@ -825,7 +726,7 @@ def _submit_job_dry_run( # blocks on its confirmation prompt — and since we're capturing # stdout (no TTY), the prompt would hang until the 60-second # timeout fires. - argv = ["uv", "run", "launch.py", "--yaml", str(abs_yaml), "--dryrun", "--yes"] + argv = ["modelopt-launcher", "--yaml", str(abs_yaml), "--dryrun", "--yes"] if hf_local: argv.append(f"hf_local={hf_local}") if cluster_user: @@ -839,10 +740,6 @@ def _submit_job_dry_run( for k, v in (extra_overrides or {}).items(): argv.append(f"{k}={v}") - launcher_dir = _find_launcher_dir() - if launcher_dir is None: - return _launcher_dir_not_found_response(dry_run=True) - # Propagate env so the launcher's factory resolution matches what # the live submit would see (mainly: SLURM_HOST for slurm-factory # default when cluster_host is set). @@ -864,7 +761,6 @@ def _submit_job_dry_run( try: proc = subprocess.run( # nosec B603 argv, - cwd=str(launcher_dir), env=child_env, capture_output=True, text=True, @@ -948,17 +844,6 @@ def _resolve_experiment_dir(experiment_id: str) -> Path | None: candidates.append(Path(nemorun_home) / "experiments" / experiment_id) candidates.append(Path.cwd() / "experiments" / experiment_id) candidates.append(Path.cwd() / "local_experiments" / experiment_id) - # The launcher's own experiments dir — submit_job_impl uses - # cwd=str(launcher_dir) for the subprocess, so when NEMORUN_HOME is - # unset, launch.py defaults to launcher_dir/experiments/. If the - # launcher dir can't be resolved (uv-tool-install without an - # override + the agent's cwd doesn't see a launcher checkout), - # we skip this fallback rather than crashing — the env-vs-cwd - # candidates above still cover the common cases. - launcher_dir = _find_launcher_dir() - if launcher_dir is not None: - candidates.append(launcher_dir / "experiments" / experiment_id) - candidates.append(launcher_dir / "local_experiments" / experiment_id) for c in candidates: if c.exists(): return c @@ -1281,12 +1166,9 @@ def read_cluster_artifact_impl( experiment_id, str(job_idx), ] - launcher_dir = _find_launcher_dir() - cwd = str(launcher_dir) if launcher_dir is not None else None try: proc = subprocess.run( # nosec B603 B607 argv, - cwd=cwd, capture_output=True, text=True, timeout=60, diff --git a/tools/mcp/pyproject.toml b/tools/mcp/pyproject.toml index 4df07bb0eef..3e30411baa1 100644 --- a/tools/mcp/pyproject.toml +++ b/tools/mcp/pyproject.toml @@ -5,20 +5,7 @@ description = "MCP server exposing ModelOpt launcher operations (submit, status, requires-python = ">=3.10" dependencies = [ "mcp>=1.0", - # NOTE on modelopt-launcher: `tools/launcher/pyproject.toml` declares - # the package name as `modelopt-launcher` but configures - # `py-modules = []` — there is NO importable `modelopt_launcher` - # Python package on disk. bridge.py invokes the launcher via - # `uv run launch.py` (subprocess) from `/tools/launcher/` as - # a sibling directory; it does NOT `import modelopt_launcher`. - # Declaring the bare name here would add an unsatisfiable PyPI - # dependency for end users installing via - # `uvx --from "git+...#subdirectory=tools/mcp" modelopt-mcp`. So we - # do NOT declare it. The install relationship is documented in - # README.md as a sibling-checkout layout requirement instead. The - # uvx-from-git path satisfies this naturally because uvx clones - # the whole repo, putting tools/launcher and tools/mcp next to - # each other on disk. + "modelopt-launcher", "pyyaml", "pydantic>=2.0", ] @@ -36,13 +23,8 @@ build-backend = "setuptools.build_meta" where = ["."] include = ["modelopt_mcp*"] -# No [tool.uv.sources] for the launcher — bridge.py uses it via -# `subprocess.run(["uv", "run", "launch.py", ...], cwd=/tools/launcher/)`, -# so the launcher is a file-layout dependency, not a Python import -# dependency. The uvx-from-git path clones the whole repo so the -# sibling tools/launcher/ ends up on disk automatically. For dev: -# uv pip install -e . -# # then run from a clone where ../launcher exists. +[tool.uv.sources] +modelopt-launcher = { path = "../launcher", editable = true } [tool.pytest.ini_options] testpaths = ["tests"] diff --git a/tools/mcp/tests/test_bridge.py b/tools/mcp/tests/test_bridge.py index a605eabe822..4993c57bfc5 100644 --- a/tools/mcp/tests/test_bridge.py +++ b/tools/mcp/tests/test_bridge.py @@ -861,102 +861,3 @@ def fake_run(argv, **kwargs): assert result["ok"] is False assert result["reason"] == "gh_pr_create_failed" assert result["branch_pushed"] is True - - -# --------------------------------------------------------------------------- -# _find_launcher_dir — env override + walk-up search -# --------------------------------------------------------------------------- - - -def test_find_launcher_dir_env_override(monkeypatch, tmp_path): - """`$MODELOPT_LAUNCHER_DIR` wins over in-repo / cwd-walk.""" - launcher = tmp_path / "custom-launcher" - launcher.mkdir() - monkeypatch.setenv("MODELOPT_LAUNCHER_DIR", str(launcher)) - monkeypatch.chdir(tmp_path) # ensure no walk-up match interferes - assert bridge._find_launcher_dir() == launcher - - -def test_find_launcher_dir_env_override_missing_dir_fallthrough(monkeypatch, tmp_path): - """`$MODELOPT_LAUNCHER_DIR` pointing at a nonexistent path → fall through.""" - monkeypatch.setenv("MODELOPT_LAUNCHER_DIR", str(tmp_path / "ghost")) - monkeypatch.chdir(tmp_path) # no walk-up candidate - # In-repo candidate may or may not exist depending on test env; in - # the uv-tool-install case + this cwd it won't, so we get None. - in_repo = bridge._THIS_DIR.parent.parent / "launcher" - result = bridge._find_launcher_dir() - if in_repo.exists(): - assert result == in_repo - else: - assert result is None - - -def test_find_launcher_dir_walk_up_modules_layout(monkeypatch, tmp_path): - """Walk-up finds `modules/Model-Optimizer/tools/launcher/` from a deep cwd.""" - workspace = tmp_path / "nmm-sandbox" - launcher = workspace / "modules" / "Model-Optimizer" / "tools" / "launcher" - launcher.mkdir(parents=True) - # Agent cwds deep inside the workspace - deep_cwd = workspace / "experiments" / "cicd" / "cicd_42" - deep_cwd.mkdir(parents=True) - monkeypatch.chdir(deep_cwd) - monkeypatch.delenv("MODELOPT_LAUNCHER_DIR", raising=False) - - found = bridge._find_launcher_dir() - # In-repo `_THIS_DIR.parent.parent / "launcher"` may also exist in dev mode; - # accept either, but if it doesn't exist we MUST have walked up to find the - # workspace launcher. - in_repo = bridge._THIS_DIR.parent.parent / "launcher" - if in_repo.exists(): - assert found == in_repo - else: - assert found == launcher - - -def test_find_launcher_dir_walk_up_tools_layout(monkeypatch, tmp_path): - """Walk-up finds plain `tools/launcher/` (direct Model-Optimizer checkout).""" - checkout = tmp_path / "Model-Optimizer-clone" - launcher = checkout / "tools" / "launcher" - launcher.mkdir(parents=True) - deep_cwd = checkout / "examples" / "speculative_decoding" - deep_cwd.mkdir(parents=True) - monkeypatch.chdir(deep_cwd) - monkeypatch.delenv("MODELOPT_LAUNCHER_DIR", raising=False) - - found = bridge._find_launcher_dir() - in_repo = bridge._THIS_DIR.parent.parent / "launcher" - if in_repo.exists(): - assert found == in_repo - else: - assert found == launcher - - -def test_find_launcher_dir_returns_none_when_nothing_found(monkeypatch, tmp_path): - """No env, no in-repo, no walk-up candidate → None.""" - monkeypatch.delenv("MODELOPT_LAUNCHER_DIR", raising=False) - isolated = tmp_path / "iso" - isolated.mkdir() - monkeypatch.chdir(isolated) - found = bridge._find_launcher_dir() - # In a dev-install test env, the in-repo path may resolve. Accept - # either None or that specific path — but NEVER something unrelated. - in_repo = bridge._THIS_DIR.parent.parent / "launcher" - assert found is None or found == in_repo - - -def test_launcher_dir_not_found_response_shape(): - """Helper returns the canonical structured-failure dict.""" - resp = bridge._launcher_dir_not_found_response() - assert resp["ok"] is False - assert resp["reason"] == "launcher_dir_not_found" - assert "Searched" in resp["diagnostic"] - assert "MODELOPT_LAUNCHER_DIR" in resp["diagnostic"] - assert "dry_run" not in resp - - -def test_launcher_dir_not_found_response_dry_run_flag(): - """`dry_run=True` adds `dry_run: True` to the response.""" - resp = bridge._launcher_dir_not_found_response(dry_run=True) - assert resp["ok"] is False - assert resp["dry_run"] is True - assert resp["reason"] == "launcher_dir_not_found" From 3aafbc83b10472ae8ad04ad7229e16c6459c055c Mon Sep 17 00:00:00 2001 From: Chenhan Yu Date: Wed, 17 Jun 2026 16:53:04 -0700 Subject: [PATCH 2/2] address review: fix _has_modelopt_src check, _mo_symlink guard, __all__, launcher not found - launch.py: detect dev checkout via MODELOPT_ROOT/modelopt (actual dir) instead of the symlink path which doesn't exist in a clean checkout - launch.py: initialize _mo_symlink = None before the conditional; guard --clean to raise ValueError instead of NameError in installed mode - __init__.py: add __all__ = ["PACKAGE_DIR"] per coding guidelines - bridge.py: add _launcher_not_installed() helper; catch FileNotFoundError on all three modelopt-launcher subprocess call sites (Docker Popen, Slurm run, dry-run run) and return structured ok=false instead of crash Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Chenhan Yu --- tools/launcher/__init__.py | 2 ++ tools/launcher/launch.py | 21 ++++++++++++------- tools/mcp/modelopt_mcp/bridge.py | 35 +++++++++++++++++++++++++------- 3 files changed, 44 insertions(+), 14 deletions(-) diff --git a/tools/launcher/__init__.py b/tools/launcher/__init__.py index 88e2af4b450..baec2f6944a 100644 --- a/tools/launcher/__init__.py +++ b/tools/launcher/__init__.py @@ -17,4 +17,6 @@ import os as _os +__all__ = ["PACKAGE_DIR"] + PACKAGE_DIR: str = _os.path.dirname(_os.path.abspath(__file__)) diff --git a/tools/launcher/launch.py b/tools/launcher/launch.py index d95a84eabbf..99bd868749b 100644 --- a/tools/launcher/launch.py +++ b/tools/launcher/launch.py @@ -53,15 +53,18 @@ LAUNCHER_DIR = _pkg.PACKAGE_DIR # tools/launcher/ (dev or installed) -# MODELOPT_ROOT is only available in a dev checkout (tools/launcher/../..): -# when running as an installed console script the cluster container already -# has modelopt pre-installed, so we skip packaging it from source. +# Detect dev checkout by probing the actual MODELOPT_ROOT, not the symlink +# path (which doesn't exist yet in a clean checkout). When running as an +# installed console script the cluster container already has modelopt +# pre-installed, so we skip packaging it from source. MODELOPT_ROOT = os.path.dirname(os.path.dirname(LAUNCHER_DIR)) -_modelopt_src = os.path.join(LAUNCHER_DIR, "modules", "Model-Optimizer", "modelopt") -_has_modelopt_src = os.path.isdir(_modelopt_src) +_has_modelopt_src = os.path.isdir(os.path.join(MODELOPT_ROOT, "modelopt")) + +# Symlink path used by the PatternPackager to resolve modules/Model-Optimizer/* +# patterns; only valid in dev mode. Initialized to None so --clean in installed +# mode gets a clear error instead of a NameError. +_mo_symlink: str | None = None -# Ensure modules/Model-Optimizer symlink exists so the packager can resolve -# modules/Model-Optimizer/* patterns in dev mode. if _has_modelopt_src: _mo_symlink = os.path.join(LAUNCHER_DIR, "modules", "Model-Optimizer") if not os.path.exists(_mo_symlink): @@ -70,6 +73,8 @@ os.path.relpath(MODELOPT_ROOT, os.path.join(LAUNCHER_DIR, "modules")), _mo_symlink ) +_modelopt_src = os.path.join(LAUNCHER_DIR, "modules", "Model-Optimizer", "modelopt") + EXPERIMENT_TITLE = "cicd" DEFAULT_SLURM_ENV, DEFAULT_LOCAL_ENV = get_default_env(EXPERIMENT_TITLE) @@ -114,6 +119,8 @@ def launch( ) -> None: """Launch ModelOpt jobs on Slurm or locally with Docker.""" if clean: + if _mo_symlink is None: + raise ValueError("--clean requires a dev checkout; modelopt source not found.") examples_dir = os.path.join(_mo_symlink, "examples") print(f"Cleaning {examples_dir} with git clean -xdf ...") subprocess.run(["git", "clean", "-xdf", "."], cwd=examples_dir, check=True) # nosec B603 B607 diff --git a/tools/mcp/modelopt_mcp/bridge.py b/tools/mcp/modelopt_mcp/bridge.py index cadb2a4a088..32290b654c6 100644 --- a/tools/mcp/modelopt_mcp/bridge.py +++ b/tools/mcp/modelopt_mcp/bridge.py @@ -83,6 +83,20 @@ def _find_launcher_examples_dir() -> Path | None: return None +def _launcher_not_installed(argv: list[str]) -> dict: + """Structured failure when the ``modelopt-launcher`` binary is not on PATH.""" + return { + "ok": False, + "reason": "launcher_not_installed", + "diagnostic": ( + "`modelopt-launcher` was not found on PATH. " + "Install it with `pip install modelopt-launcher` or " + "`uv tool install modelopt-launcher` and retry." + ), + "argv": argv, + } + + # --------------------------------------------------------------------------- # list_examples # --------------------------------------------------------------------------- @@ -556,13 +570,16 @@ def submit_job_impl( # group so an MCP server restart / SIGINT doesn't SIGHUP the # in-flight launcher. # B603 false positive — argv is a controlled list built above. - proc = subprocess.Popen( # nosec B603 - argv, - env=child_env, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - start_new_session=True, - ) + try: + proc = subprocess.Popen( # nosec B603 + argv, + env=child_env, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + start_new_session=True, + ) + except FileNotFoundError: + return _launcher_not_installed(argv) return { "ok": True, "executor": "docker", @@ -590,6 +607,8 @@ def submit_job_impl( timeout=300, check=False, ) + except FileNotFoundError: + return _launcher_not_installed(argv) except subprocess.TimeoutExpired as e: return { "ok": False, @@ -767,6 +786,8 @@ def _submit_job_dry_run( timeout=60, check=False, ) + except FileNotFoundError: + return {**_launcher_not_installed(argv), "dry_run": True} except subprocess.TimeoutExpired as e: return { "ok": False,