diff --git a/tools/launcher/.gitignore b/tools/launcher/.gitignore index 3eb4a49079c..3c0eb2aee0d 100644 --- a/tools/launcher/.gitignore +++ b/tools/launcher/.gitignore @@ -13,6 +13,9 @@ local_experiments/ # uv lock (generated, not portable) uv.lock +# Auto-created symlink by launch.py in dev mode +modules/ + # Python cache __pycache__/ diff --git a/tools/launcher/__init__.py b/tools/launcher/__init__.py index 11b92d8b771..baec2f6944a 100644 --- a/tools/launcher/__init__.py +++ b/tools/launcher/__init__.py @@ -13,4 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""ModelOpt Launcher — submit quantization, training, and evaluation jobs to Slurm clusters.""" +"""modelopt_launcher — installable package exposing launcher scripts and examples.""" + +import os as _os + +__all__ = ["PACKAGE_DIR"] + +PACKAGE_DIR: str = _os.path.dirname(_os.path.abspath(__file__)) diff --git a/tools/launcher/launch.py b/tools/launcher/launch.py index fdb867f08aa..99bd868749b 100644 --- a/tools/launcher/launch.py +++ b/tools/launcher/launch.py @@ -33,9 +33,16 @@ import subprocess # nosec B404 import warnings +import modelopt_launcher as _pkg import nemo_run as run -from core import SandboxPipeline, get_default_env, register_factory, run_jobs, set_slurm_config_type -from slurm_config import SlurmConfig, slurm_factory +from modelopt_launcher.core import ( + SandboxPipeline, + get_default_env, + register_factory, + run_jobs, + set_slurm_config_type, +) +from modelopt_launcher.slurm_config import SlurmConfig, slurm_factory set_slurm_config_type(SlurmConfig) register_factory("slurm_factory", slurm_factory) @@ -44,33 +51,54 @@ # Launcher-specific configuration # --------------------------------------------------------------------------- -LAUNCHER_DIR = os.path.dirname(os.path.abspath(__file__)) +LAUNCHER_DIR = _pkg.PACKAGE_DIR # tools/launcher/ (dev or installed) + +# Detect dev checkout by probing the actual MODELOPT_ROOT, not the 
symlink +# path (which doesn't exist yet in a clean checkout). When running as an +# installed console script the cluster container already has modelopt +# pre-installed, so we skip packaging it from source. MODELOPT_ROOT = os.path.dirname(os.path.dirname(LAUNCHER_DIR)) +_has_modelopt_src = os.path.isdir(os.path.join(MODELOPT_ROOT, "modelopt")) + +# Symlink path used by the PatternPackager to resolve modules/Model-Optimizer/* +# patterns; only valid in dev mode. Initialized to None so --clean in installed +# mode gets a clear error instead of a NameError. +_mo_symlink: str | None = None -# Ensure modules/Model-Optimizer symlink exists (points to parent Model-Optimizer root) -_mo_symlink = os.path.join(LAUNCHER_DIR, "modules", "Model-Optimizer") -if not os.path.exists(_mo_symlink): - os.makedirs(os.path.join(LAUNCHER_DIR, "modules"), exist_ok=True) - os.symlink(os.path.relpath(MODELOPT_ROOT, os.path.join(LAUNCHER_DIR, "modules")), _mo_symlink) +if _has_modelopt_src: + _mo_symlink = os.path.join(LAUNCHER_DIR, "modules", "Model-Optimizer") + if not os.path.exists(_mo_symlink): + os.makedirs(os.path.join(LAUNCHER_DIR, "modules"), exist_ok=True) + os.symlink( + os.path.relpath(MODELOPT_ROOT, os.path.join(LAUNCHER_DIR, "modules")), _mo_symlink + ) + +_modelopt_src = os.path.join(LAUNCHER_DIR, "modules", "Model-Optimizer", "modelopt") EXPERIMENT_TITLE = "cicd" DEFAULT_SLURM_ENV, DEFAULT_LOCAL_ENV = get_default_env(EXPERIMENT_TITLE) -packager = run.PatternPackager( - include_pattern=[ +_include_pattern = ["examples/*", "common/*"] +_relative_path = [LAUNCHER_DIR, LAUNCHER_DIR] + +if _has_modelopt_src: + _include_pattern = [ "modules/Megatron-LM/megatron/*", "modules/Megatron-LM/examples/*", "modules/Megatron-LM/*.py", "modules/Model-Optimizer/modelopt/*", "modules/Model-Optimizer/modelopt_recipes/*", "modules/Model-Optimizer/examples/*", - "examples/*", - "common/*", - ], - relative_path=[LAUNCHER_DIR] * 8, + *_include_pattern, + ] + _relative_path = [LAUNCHER_DIR] * 6 + 
_relative_path + +packager = run.PatternPackager( + include_pattern=_include_pattern, + relative_path=_relative_path, ) -MODELOPT_SRC_PATH = os.path.join(LAUNCHER_DIR, "modules/Model-Optimizer/modelopt") +MODELOPT_SRC_PATH = _modelopt_src if _has_modelopt_src else None # --------------------------------------------------------------------------- @@ -91,6 +119,8 @@ def launch( ) -> None: """Launch ModelOpt jobs on Slurm or locally with Docker.""" if clean: + if _mo_symlink is None: + raise ValueError("--clean requires a dev checkout; modelopt source not found.") examples_dir = os.path.join(_mo_symlink, "examples") print(f"Cleaning {examples_dir} with git clean -xdf ...") subprocess.run(["git", "clean", "-xdf", "."], cwd=examples_dir, check=True) # nosec B603 B607 @@ -125,5 +155,10 @@ def launch( ) -if __name__ == "__main__": +def main() -> None: + """Console script entry point for the ``modelopt-launcher`` command.""" run.cli.main(launch) + + +if __name__ == "__main__": + main() diff --git a/tools/launcher/pyproject.toml b/tools/launcher/pyproject.toml index 94577098d93..8ba825776d4 100644 --- a/tools/launcher/pyproject.toml +++ b/tools/launcher/pyproject.toml @@ -8,8 +8,21 @@ dependencies = [ "pyyaml", ] +[project.scripts] +modelopt-launcher = "modelopt_launcher.launch:main" + [tool.setuptools] -py-modules = [] +packages = ["modelopt_launcher"] + +[tool.setuptools.package-dir] +modelopt_launcher = "." + +[tool.setuptools.package-data] +modelopt_launcher = [ + "common/**/*", + "examples/**/*.yaml", + "examples/**/*.jinja", +] [tool.pytest.ini_options] testpaths = ["tests"] diff --git a/tools/mcp/modelopt_mcp/bridge.py b/tools/mcp/modelopt_mcp/bridge.py index 45d8661f290..32290b654c6 100644 --- a/tools/mcp/modelopt_mcp/bridge.py +++ b/tools/mcp/modelopt_mcp/bridge.py @@ -54,97 +54,15 @@ ) -def _find_launcher_dir() -> Path | None: - """Resolve the modelopt launcher's directory. - - Tries, in order: - - 1. 
``$MODELOPT_LAUNCHER_DIR`` env override — the deterministic - path agents/operators can set when the package layout doesn't - match the in-repo expectation. - 2. ``_THIS_DIR.parent.parent / "launcher"`` — the in-repo layout - (``tools/mcp/modelopt_mcp/bridge.py`` → ``tools/launcher/``). - Works in dev installs (``pip install -e tools/mcp``) and in - direct ``Model-Optimizer`` clones. - 3. Walk up from ``os.getcwd()`` looking for - ``modules/Model-Optimizer/tools/launcher/`` (intern-agent - workspace layout) or ``tools/launcher/`` (direct - Model-Optimizer checkout) at each ancestor. Stops at the - filesystem root. - - Returns ``None`` if no candidate resolves to an existing dir. - Callers surface that as a structured ``launcher_dir_not_found`` - failure with the searched paths in the diagnostic. - - Empirically: when modelopt-mcp is installed via ``uv tool install`` - (intern-agent's CI install pattern, MR !226), ``_THIS_DIR`` lives - inside ``~/.local/share/uv/tools/modelopt-mcp/lib/.../site-packages/`` - and step 2's parent-walk doesn't find the launcher. Step 3 (cwd - walk-up) handles that case — the agent's CWD is always inside - its cloned nmm-sandbox workspace where ``modules/Model-Optimizer/ - tools/launcher/`` does exist. - """ - env = os.environ.get("MODELOPT_LAUNCHER_DIR") - if env: - p = Path(env) - if p.exists(): - return p - - # In-repo layout (dev install / direct clone) - candidate = _THIS_DIR.parent.parent / "launcher" - if candidate.exists(): - return candidate - - # cwd walk-up (uv-tool-install + agent workspace layout) - cwd = Path.cwd().resolve() - for ancestor in (cwd, *cwd.parents): - for rel in ("modules/Model-Optimizer/tools/launcher", "tools/launcher"): - candidate = ancestor / rel - if candidate.exists(): - return candidate - - return None - - -def _launcher_dir_not_found_response(*, dry_run: bool = False) -> dict: - """Structured failure when ``_find_launcher_dir()`` returns None. 
- + Centralized so the five callsites that need the launcher dir - return a consistent diagnostic listing the searched paths. - """ - env_path = os.environ.get("MODELOPT_LAUNCHER_DIR") or "(unset)" - in_repo = _THIS_DIR.parent.parent / "launcher" - resp: dict = { - "ok": False, - "reason": "launcher_dir_not_found", - "diagnostic": ( - "Could not locate tools/launcher/. Searched:\n" - f" 1. $MODELOPT_LAUNCHER_DIR={env_path}\n" - f" 2. in-repo layout: {in_repo} (exists={in_repo.exists()})\n" - f" 3. cwd walk-up from {Path.cwd().resolve()} looking for " - "modules/Model-Optimizer/tools/launcher or tools/launcher\n" - "Fix: set $MODELOPT_LAUNCHER_DIR to the absolute path of your " - "Model-Optimizer checkout's tools/launcher/, or run modelopt-mcp " - "from inside such a checkout." - ), - } - if dry_run: - resp["dry_run"] = True - return resp - - def _find_launcher_examples_dir() -> Path | None: """Resolve the launcher examples directory. Strategy (in order): 1. ``MODELOPT_LAUNCHER_EXAMPLES_DIR`` env override — for tests + ad-hoc relocations. - 2. ``../../launcher/examples/`` from this file — the in-repo layout - when running from a Model-Optimizer clone (this is the dev mode - AND the uvx-from-git mode, since uvx checks out the whole repo). - 3. Site-packages install: walk back through the modelopt_launcher - package to find its examples/ — fallback for the case where the - launcher was pip-installed standalone. + 2. ``import modelopt_launcher`` — works for pip/uvx installs and + editable dev mode alike; ``PACKAGE_DIR`` is the package's own directory + (``tools/launcher/`` in a checkout), which contains ``examples/``. Returns None if no candidate exists; callers surface that as a structured failure rather than blowing up. @@ -154,19 +72,10 @@ def _find_launcher_examples_dir() -> Path | None: p = Path(env) return p if p.exists() else None - # In-repo: this file is at tools/mcp/modelopt_mcp/bridge.py; - # examples are at tools/launcher/examples/. 
- candidate = _THIS_DIR.parent.parent / "launcher" / "examples" - if candidate.exists(): - return candidate - - # Site-packages fallback: the modelopt-launcher package may carry - # its examples next to its core.py. try: import modelopt_launcher - pkg_dir = Path(modelopt_launcher.__file__).resolve().parent - candidate = pkg_dir / "examples" + candidate = Path(modelopt_launcher.PACKAGE_DIR) / "examples" if candidate.exists(): return candidate except ImportError: @@ -174,6 +83,20 @@ def _find_launcher_examples_dir() -> Path | None: return None +def _launcher_not_installed(argv: list[str]) -> dict: + """Structured failure when the ``modelopt-launcher`` binary is not on PATH.""" + return { + "ok": False, + "reason": "launcher_not_installed", + "diagnostic": ( + "`modelopt-launcher` was not found on PATH. " + "Install it with `pip install modelopt-launcher` or " + "`uv tool install modelopt-launcher` and retry." + ), + "argv": argv, + } + + # --------------------------------------------------------------------------- # list_examples # --------------------------------------------------------------------------- @@ -604,15 +527,14 @@ def submit_job_impl( # list never goes through a shell, so quoting bakes literal quote chars # into the values that nemo-run's CLI parser sees. Verbatim values # carry spaces / special chars safely. - argv = ["uv", "run", "launch.py", "--yaml", str(abs_yaml), "--yes"] + argv = ["modelopt-launcher", "--yaml", str(abs_yaml), "--yes"] if hf_local: argv.append(f"hf_local={hf_local}") else: - # Slurm mode — `launch.py`'s entrypoint does not accept a - # `cluster_host` arg (see tools/launcher/launch.py:82). The host - # is sourced via the SLURM_HOST env var, consumed by - # `slurm_factory(host=os.environ.get("SLURM_HOST", ""))` in - # tools/launcher/slurm_config.py. Propagate via env, not argv. + # Slurm mode — the launcher entrypoint does not accept a + # `cluster_host` arg. 
The host is sourced via the SLURM_HOST env + # var, consumed by slurm_factory in slurm_config.py. + # Propagate via env, not argv. if cluster_user: argv.append(f"user={cluster_user}") if identity: @@ -625,11 +547,6 @@ def submit_job_impl( for k, v in (extra_overrides or {}).items(): argv.append(f"{k}={v}") - # Run from the launcher dir so it picks up its own ./core.py etc. - launcher_dir = _find_launcher_dir() - if launcher_dir is None: - return _launcher_dir_not_found_response() - # Propagate env so submit-side and status-side agree on NEMORUN_HOME. # Without this, `launch.py` defaults NEMORUN_HOME to its own cwd # (tools/launcher/), but `_resolve_experiment_dir` later checks the @@ -653,14 +570,16 @@ def submit_job_impl( # group so an MCP server restart / SIGINT doesn't SIGHUP the # in-flight launcher. # B603 false positive — argv is a controlled list built above. - proc = subprocess.Popen( # nosec B603 - argv, - cwd=str(launcher_dir), - env=child_env, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - start_new_session=True, - ) + try: + proc = subprocess.Popen( # nosec B603 + argv, + env=child_env, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + start_new_session=True, + ) + except FileNotFoundError: + return _launcher_not_installed(argv) return { "ok": True, "executor": "docker", @@ -682,13 +601,14 @@ def submit_job_impl( try: proc = subprocess.run( # nosec B603 argv, - cwd=str(launcher_dir), env=child_env, capture_output=True, text=True, timeout=300, check=False, ) + except FileNotFoundError: + return _launcher_not_installed(argv) except subprocess.TimeoutExpired as e: return { "ok": False, @@ -825,7 +745,7 @@ def _submit_job_dry_run( # blocks on its confirmation prompt — and since we're capturing # stdout (no TTY), the prompt would hang until the 60-second # timeout fires. 
- argv = ["uv", "run", "launch.py", "--yaml", str(abs_yaml), "--dryrun", "--yes"] + argv = ["modelopt-launcher", "--yaml", str(abs_yaml), "--dryrun", "--yes"] if hf_local: argv.append(f"hf_local={hf_local}") if cluster_user: @@ -839,10 +759,6 @@ def _submit_job_dry_run( for k, v in (extra_overrides or {}).items(): argv.append(f"{k}={v}") - launcher_dir = _find_launcher_dir() - if launcher_dir is None: - return _launcher_dir_not_found_response(dry_run=True) - # Propagate env so the launcher's factory resolution matches what # the live submit would see (mainly: SLURM_HOST for slurm-factory # default when cluster_host is set). @@ -864,13 +780,14 @@ def _submit_job_dry_run( try: proc = subprocess.run( # nosec B603 argv, - cwd=str(launcher_dir), env=child_env, capture_output=True, text=True, timeout=60, check=False, ) + except FileNotFoundError: + return {**_launcher_not_installed(argv), "dry_run": True} except subprocess.TimeoutExpired as e: return { "ok": False, @@ -948,17 +865,6 @@ def _resolve_experiment_dir(experiment_id: str) -> Path | None: candidates.append(Path(nemorun_home) / "experiments" / experiment_id) candidates.append(Path.cwd() / "experiments" / experiment_id) candidates.append(Path.cwd() / "local_experiments" / experiment_id) - # The launcher's own experiments dir — submit_job_impl uses - # cwd=str(launcher_dir) for the subprocess, so when NEMORUN_HOME is - # unset, launch.py defaults to launcher_dir/experiments/. If the - # launcher dir can't be resolved (uv-tool-install without an - # override + the agent's cwd doesn't see a launcher checkout), - # we skip this fallback rather than crashing — the env-vs-cwd - # candidates above still cover the common cases. 
- launcher_dir = _find_launcher_dir() - if launcher_dir is not None: - candidates.append(launcher_dir / "experiments" / experiment_id) - candidates.append(launcher_dir / "local_experiments" / experiment_id) for c in candidates: if c.exists(): return c @@ -1281,12 +1187,9 @@ def read_cluster_artifact_impl( experiment_id, str(job_idx), ] - launcher_dir = _find_launcher_dir() - cwd = str(launcher_dir) if launcher_dir is not None else None try: proc = subprocess.run( # nosec B603 B607 argv, - cwd=cwd, capture_output=True, text=True, timeout=60, diff --git a/tools/mcp/pyproject.toml b/tools/mcp/pyproject.toml index 4df07bb0eef..3e30411baa1 100644 --- a/tools/mcp/pyproject.toml +++ b/tools/mcp/pyproject.toml @@ -5,20 +5,7 @@ description = "MCP server exposing ModelOpt launcher operations (submit, status, requires-python = ">=3.10" dependencies = [ "mcp>=1.0", - # NOTE on modelopt-launcher: `tools/launcher/pyproject.toml` declares - # the package name as `modelopt-launcher` but configures - # `py-modules = []` — there is NO importable `modelopt_launcher` - # Python package on disk. bridge.py invokes the launcher via - # `uv run launch.py` (subprocess) from `/tools/launcher/` as - # a sibling directory; it does NOT `import modelopt_launcher`. - # Declaring the bare name here would add an unsatisfiable PyPI - # dependency for end users installing via - # `uvx --from "git+...#subdirectory=tools/mcp" modelopt-mcp`. So we - # do NOT declare it. The install relationship is documented in - # README.md as a sibling-checkout layout requirement instead. The - # uvx-from-git path satisfies this naturally because uvx clones - # the whole repo, putting tools/launcher and tools/mcp next to - # each other on disk. 
+ "modelopt-launcher", "pyyaml", "pydantic>=2.0", ] @@ -36,13 +23,8 @@ build-backend = "setuptools.build_meta" where = ["."] include = ["modelopt_mcp*"] -# No [tool.uv.sources] for the launcher — bridge.py uses it via -# `subprocess.run(["uv", "run", "launch.py", ...], cwd=/tools/launcher/)`, -# so the launcher is a file-layout dependency, not a Python import -# dependency. The uvx-from-git path clones the whole repo so the -# sibling tools/launcher/ ends up on disk automatically. For dev: -# uv pip install -e . -# # then run from a clone where ../launcher exists. +[tool.uv.sources] +modelopt-launcher = { path = "../launcher", editable = true } [tool.pytest.ini_options] testpaths = ["tests"] diff --git a/tools/mcp/tests/test_bridge.py b/tools/mcp/tests/test_bridge.py index a605eabe822..4993c57bfc5 100644 --- a/tools/mcp/tests/test_bridge.py +++ b/tools/mcp/tests/test_bridge.py @@ -861,102 +861,3 @@ def fake_run(argv, **kwargs): assert result["ok"] is False assert result["reason"] == "gh_pr_create_failed" assert result["branch_pushed"] is True - - -# --------------------------------------------------------------------------- -# _find_launcher_dir — env override + walk-up search -# --------------------------------------------------------------------------- - - -def test_find_launcher_dir_env_override(monkeypatch, tmp_path): - """`$MODELOPT_LAUNCHER_DIR` wins over in-repo / cwd-walk.""" - launcher = tmp_path / "custom-launcher" - launcher.mkdir() - monkeypatch.setenv("MODELOPT_LAUNCHER_DIR", str(launcher)) - monkeypatch.chdir(tmp_path) # ensure no walk-up match interferes - assert bridge._find_launcher_dir() == launcher - - -def test_find_launcher_dir_env_override_missing_dir_fallthrough(monkeypatch, tmp_path): - """`$MODELOPT_LAUNCHER_DIR` pointing at a nonexistent path → fall through.""" - monkeypatch.setenv("MODELOPT_LAUNCHER_DIR", str(tmp_path / "ghost")) - monkeypatch.chdir(tmp_path) # no walk-up candidate - # In-repo candidate may or may not exist depending on 
test env; in - # the uv-tool-install case + this cwd it won't, so we get None. - in_repo = bridge._THIS_DIR.parent.parent / "launcher" - result = bridge._find_launcher_dir() - if in_repo.exists(): - assert result == in_repo - else: - assert result is None - - -def test_find_launcher_dir_walk_up_modules_layout(monkeypatch, tmp_path): - """Walk-up finds `modules/Model-Optimizer/tools/launcher/` from a deep cwd.""" - workspace = tmp_path / "nmm-sandbox" - launcher = workspace / "modules" / "Model-Optimizer" / "tools" / "launcher" - launcher.mkdir(parents=True) - # Agent cwds deep inside the workspace - deep_cwd = workspace / "experiments" / "cicd" / "cicd_42" - deep_cwd.mkdir(parents=True) - monkeypatch.chdir(deep_cwd) - monkeypatch.delenv("MODELOPT_LAUNCHER_DIR", raising=False) - - found = bridge._find_launcher_dir() - # In-repo `_THIS_DIR.parent.parent / "launcher"` may also exist in dev mode; - # accept either, but if it doesn't exist we MUST have walked up to find the - # workspace launcher. 
- in_repo = bridge._THIS_DIR.parent.parent / "launcher" - if in_repo.exists(): - assert found == in_repo - else: - assert found == launcher - - -def test_find_launcher_dir_walk_up_tools_layout(monkeypatch, tmp_path): - """Walk-up finds plain `tools/launcher/` (direct Model-Optimizer checkout).""" - checkout = tmp_path / "Model-Optimizer-clone" - launcher = checkout / "tools" / "launcher" - launcher.mkdir(parents=True) - deep_cwd = checkout / "examples" / "speculative_decoding" - deep_cwd.mkdir(parents=True) - monkeypatch.chdir(deep_cwd) - monkeypatch.delenv("MODELOPT_LAUNCHER_DIR", raising=False) - - found = bridge._find_launcher_dir() - in_repo = bridge._THIS_DIR.parent.parent / "launcher" - if in_repo.exists(): - assert found == in_repo - else: - assert found == launcher - - -def test_find_launcher_dir_returns_none_when_nothing_found(monkeypatch, tmp_path): - """No env, no in-repo, no walk-up candidate → None.""" - monkeypatch.delenv("MODELOPT_LAUNCHER_DIR", raising=False) - isolated = tmp_path / "iso" - isolated.mkdir() - monkeypatch.chdir(isolated) - found = bridge._find_launcher_dir() - # In a dev-install test env, the in-repo path may resolve. Accept - # either None or that specific path — but NEVER something unrelated. 
- in_repo = bridge._THIS_DIR.parent.parent / "launcher" - assert found is None or found == in_repo - - -def test_launcher_dir_not_found_response_shape(): - """Helper returns the canonical structured-failure dict.""" - resp = bridge._launcher_dir_not_found_response() - assert resp["ok"] is False - assert resp["reason"] == "launcher_dir_not_found" - assert "Searched" in resp["diagnostic"] - assert "MODELOPT_LAUNCHER_DIR" in resp["diagnostic"] - assert "dry_run" not in resp - - -def test_launcher_dir_not_found_response_dry_run_flag(): - """`dry_run=True` adds `dry_run: True` to the response.""" - resp = bridge._launcher_dir_not_found_response(dry_run=True) - assert resp["ok"] is False - assert resp["dry_run"] is True - assert resp["reason"] == "launcher_dir_not_found"