From 46e8588bf9f94ce9ae4700100c517bca58402e89 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?=
 <fperezgarcia@microsoft.com>
Date: Tue, 9 Jun 2026 16:29:05 +0100
Subject: [PATCH 01/16] Add explicit per-source data flags; deprecate
 --mount/--download/--output

Replace the string-format inference for command inputs/outputs with explicit
per-source CLI flags, so the source type is stated in the flag name instead of
guessed from the value:

  --mount-asset / --download-asset      alias=name[:version]
  --mount-datastore / --download-datastore  alias=datastore/folder
  --mount-job / --download-job          alias=<job_id>:<path>
  --output-datastore                    alias=datastore/folder
  --output-asset                        alias=name[:version]

This adds raw datastore folders as inputs (issue #14) and registering outputs
as data assets, symmetric across mount/download/output.

The old --mount/--download/--output flags (and their Python equivalents) are
deprecated: they still work but emit a warning and are classified and routed to
the new builders through a single set of parsers.

Closes #14

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 docs/examples.md           |  60 ++++--
 src/submit_aml/__main__.py | 119 +++++++++++-
 src/submit_aml/aml.py      |  26 ++-
 src/submit_aml/data.py     | 381 ++++++++++++++++++++++++++-----------
 tests/test_data.py         | 215 +++++++++++++++++++--
 5 files changed, 652 insertions(+), 149 deletions(-)
diff --git a/docs/examples.md b/docs/examples.md
index 26e671e..e174913 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -168,32 +168,40 @@ Run a grid sweep over hyperparameters:
 
 Datasets are passed to the job as
 [`Input`](https://learn.microsoft.com/en-us/python/api/azure-ai-ml/azure.ai.ml.input?view=azure-python)
-objects.
+objects. There is one flag per source type (registered data asset, datastore
+folder, or previous job output), in either `--mount-*` or `--download-*` form.
 
 === "CLI"
 
-    Mount a dataset:
+    Mount or download a registered data asset:
 
     ```bash
     submit-aml \
         --script train.py \
-        --mount "data=MY-DATASET:2"
+        --mount-asset "data=MY-DATASET:2"
     ```
 
-    Download a dataset:
+    ```bash
+    submit-aml \
+        --script train.py \
+        --download-asset "data=MY-DATASET"
+    ```
+
+    Mount a folder directly from a datastore (no data-asset registration
+    required):
 
     ```bash
     submit-aml \
         --script train.py \
-        --download "data=MY-DATASET"
+        --mount-datastore "ref=mystore/exports/reference"
     ```
 
-    Use outputs from a previous job:
+    Use the outputs of a previous job:
 
     ```bash
     submit-aml \
         --script evaluate.py \
-        --mount "checkpoint=job_dir:my-training-job:models/best.pth"
+        --mount-job "checkpoint=my-training-job:models/best.pth"
     ```
 
 === "Python"
@@ -201,30 +209,49 @@ objects.
     ```python
     submit_to_aml(
         script_path="train.py",
-        datasets_mount=["data=MY-DATASET:2"],
+        mount_asset=["data=MY-DATASET:2"],
     )
 
     # Or download instead of mount
     submit_to_aml(
         script_path="train.py",
-        datasets_download=["data=MY-DATASET"],
+        download_asset=["data=MY-DATASET"],
+    )
+
+    # Mount a datastore folder directly
+    submit_to_aml(
+        script_path="train.py",
+        mount_datastore=["ref=mystore/exports/reference"],
     )
 
     # Use outputs from a previous job
     submit_to_aml(
         script_path="evaluate.py",
-        datasets_mount=["checkpoint=job_dir:my-training-job:models/best.pth"],
+        mount_job=["checkpoint=my-training-job:models/best.pth"],
     )
     ```
 
-Configure an output datastore:
+!!! note "Deprecated flags"
+
+    The `--mount`, `--download` and `--output` flags (and their
+    `datasets_mount`, `datasets_download` and `datasets_output` Python
+    equivalents) are deprecated in favour of the explicit per-source flags
+    above. They still work but emit a deprecation warning.
+
+Write outputs to a datastore folder, or register them as a data asset:
 
 === "CLI"
 
     ```bash
     submit-aml \
         --script train.py \
-        --output "results=mydatastore/experiment-outputs"
+        --output-datastore "results=mydatastore/experiment-outputs"
+    ```
+
+    ```bash
+    submit-aml \
+        --script train.py \
+        --output-asset "results=my-experiment-results"
     ```
 
 === "Python"
@@ -232,7 +259,13 @@ Configure an output datastore:
     ```python
     submit_to_aml(
         script_path="train.py",
-        datasets_output=["results=mydatastore/experiment-outputs"],
+        output_datastore=["results=mydatastore/experiment-outputs"],
+    )
+
+    # Or register the outputs as a data asset
+    submit_to_aml(
+        script_path="train.py",
+        output_asset=["results=my-experiment-results"],
     )
     ```
 
@@ -404,4 +437,3 @@ Submit and wait for the job to complete, streaming logs:
     ```python
     submit_to_aml(script_path="train.py", wait_for_completion=True)
     ```
-
diff --git a/src/submit_aml/__main__.py b/src/submit_aml/__main__.py
index e7c4bc7..907eb25 100644
--- a/src/submit_aml/__main__.py
+++ b/src/submit_aml/__main__.py
@@ -141,12 +141,14 @@ def submit(
         "--download",
         "-d",
         help=(
-            "Azure ML dataset or job output folder to download. To download an Azure ML"
-            " dataset, the argument should take the form: alias, name and version"
-            " of the dataset; for example: 'vindr_dir=VINDR-CXR-V2:1'."
-            " If the version is omitted, the last one will be used."
-            " To download the output folder of a previous job, the argument should take"
-            " the form 'alias=job_dir:<job_id>:<path/in/job/outputs>'; for example:"
+            "[DEPRECATED] Use --download-asset, --download-datastore or"
+            " --download-job instead. Azure ML dataset or job output folder to"
+            " download. To download an Azure ML dataset, the argument should take"
+            " the form: alias, name and version of the dataset; for example:"
+            " 'vindr_dir=VINDR-CXR-V2:1'. If the version is omitted, the last one"
+            " will be used. To download the output folder of a previous job, the"
+            " argument should take the form"
+            " 'alias=job_dir:<job_id>:<path/in/job/outputs>'; for example:"
             " 'checkpoint=job_dir:crusty_hat_43s6lmvb25:outputs/checkpoint-10000'."
             " The alias can be used to pass input datasets to the script, e.g.,"
             r" '${{inputs.vindr_dir}}' or '${{inputs.checkpoint}}'."
@@ -159,7 +161,8 @@ def submit(
         "--mount",
         "-m",
         help=(
-            "Azure ML dataset or job output folder to mount."
+            "[DEPRECATED] Use --mount-asset, --mount-datastore or --mount-job"
+            " instead. Azure ML dataset or job output folder to mount."
             " For an Azure ML dataset, the alias, name and version should be provided"
             " while for a job output folder, the alias, job ID and path in the job"
             " outputs should be provided. See the --download option for more"
@@ -167,13 +170,77 @@ def submit(
         ),
         rich_help_panel=PANEL_DATA,
     ),
+    mount_asset: Optional[List[str]] = typer.Option(  # noqa: UP006, UP007
+        None,
+        "--mount-asset",
+        help=(
+            "Registered Azure ML data asset to mount, expressed as"
+            ' "alias=name[:version]". For example: "vindr_dir=VINDR-CXR-V2:1".'
+            " If the version is omitted, the latest one is used."
+            r" Pass it to the script with '${{inputs.vindr_dir}}'."
+            " This option can be used multiple times."
+        ),
+        rich_help_panel=PANEL_DATA,
+    ),
+    download_asset: Optional[List[str]] = typer.Option(  # noqa: UP006, UP007
+        None,
+        "--download-asset",
+        help=(
+            "Registered Azure ML data asset to download. Same format as"
+            " --mount-asset. This option can be used multiple times."
+        ),
+        rich_help_panel=PANEL_DATA,
+    ),
+    mount_datastore: Optional[List[str]] = typer.Option(  # noqa: UP006, UP007
+        None,
+        "--mount-datastore",
+        help=(
+            "Datastore folder to mount, expressed as"
+            ' "alias=datastore/path/to/folder".'
+            ' For example: "ref=mystore/exports/reference".'
+            r" Pass it to the script with '${{inputs.ref}}'."
+            " This option can be used multiple times."
+        ),
+        rich_help_panel=PANEL_DATA,
+    ),
+    download_datastore: Optional[List[str]] = typer.Option(  # noqa: UP006, UP007
+        None,
+        "--download-datastore",
+        help=(
+            "Datastore folder to download. Same format as --mount-datastore."
+            " This option can be used multiple times."
+        ),
+        rich_help_panel=PANEL_DATA,
+    ),
+    mount_job: Optional[List[str]] = typer.Option(  # noqa: UP006, UP007
+        None,
+        "--mount-job",
+        help=(
+            "Output folder of a previous job to mount, expressed as"
+            ' "alias=<job_id>:<path/in/job/outputs>".'
+            ' For example: "checkpoint=crusty_hat_43s6lmvb25:outputs/best.pth".'
+            r" Pass it to the script with '${{inputs.checkpoint}}'."
+            " This option can be used multiple times."
+        ),
+        rich_help_panel=PANEL_DATA,
+    ),
+    download_job: Optional[List[str]] = typer.Option(  # noqa: UP006, UP007
+        None,
+        "--download-job",
+        help=(
+            "Output folder of a previous job to download. Same format as"
+            " --mount-job. This option can be used multiple times."
+        ),
+        rich_help_panel=PANEL_DATA,
+    ),
     output: Optional[List[str]] = typer.Option(  # noqa: UP006, UP007
         None,
         "--output",
         "-o",
         help=(
-            "Alias, datastore and path to folder into which outputs will be written,"
-            ' expressed as "alias=datastore/path/to/dir".'
+            "[DEPRECATED] Use --output-datastore or --output-asset instead."
+            " Alias, datastore and path to folder into which outputs will be"
+            ' written, expressed as "alias=datastore/path/to/dir".'
             ' For example: "out_dir=mydatastore/my_dataset".'
             " The alias can be used to pass outputs to the script, e.g.,"
             r' "${{outputs.out_dir}}".'
@@ -182,6 +249,32 @@ def submit(
         ),
         rich_help_panel=PANEL_DATA,
     ),
+    output_datastore: Optional[List[str]] = typer.Option(  # noqa: UP006, UP007
+        None,
+        "--output-datastore",
+        help=(
+            "Datastore folder into which outputs will be written, expressed as"
+            ' "alias=datastore/path/to/dir".'
+            ' For example: "out_dir=mydatastore/my_dataset".'
+            r" Pass it to the script with '${{outputs.out_dir}}'."
+            " This option can be used multiple times."
+        ),
+        rich_help_panel=PANEL_DATA,
+    ),
+    output_asset: Optional[List[str]] = typer.Option(  # noqa: UP006, UP007
+        None,
+        "--output-asset",
+        help=(
+            "Register the outputs as an Azure ML data asset, expressed as"
+            ' "alias=name[:version]". For example: "out_dir=my-results".'
+            " The blobs are written to the workspace's default datastore and"
+            " registered as a data asset; if the version is omitted, Azure ML"
+            " auto-increments it."
+            r" Pass it to the script with '${{outputs.out_dir}}'."
+            " This option can be used multiple times."
+        ),
+        rich_help_panel=PANEL_DATA,
+    ),
     command_prefix: str = typer.Option(
         get_default("command_prefix"),
         help="Prefix to prepend to the command. For example, `uv run`.",
@@ -408,6 +501,14 @@ def submit(
             datasets_download=datasets_download,
             datasets_mount=datasets_mount,
             datasets_output=output,
+            mount_asset=mount_asset,
+            download_asset=download_asset,
+            mount_datastore=mount_datastore,
+            download_datastore=download_datastore,
+            mount_job=mount_job,
+            download_job=download_job,
+            output_datastore=output_datastore,
+            output_asset=output_asset,
             debug=debug,
             dependency_groups=dependency_groups,
             description=description,
diff --git a/src/submit_aml/aml.py b/src/submit_aml/aml.py
index c269bab..a38de21 100644
--- a/src/submit_aml/aml.py
+++ b/src/submit_aml/aml.py
@@ -345,6 +345,14 @@ def submit_to_aml(
     datasets_download: TypeOptionalStrList = None,
     datasets_mount: TypeOptionalStrList = None,
     datasets_output: TypeOptionalStrList = None,
+    mount_asset: TypeOptionalStrList = None,
+    download_asset: TypeOptionalStrList = None,
+    mount_datastore: TypeOptionalStrList = None,
+    download_datastore: TypeOptionalStrList = None,
+    mount_job: TypeOptionalStrList = None,
+    download_job: TypeOptionalStrList = None,
+    output_datastore: TypeOptionalStrList = None,
+    output_asset: TypeOptionalStrList = None,
     debug: bool = False,
     dependency_groups: list[str] | None = None,
     description: str | None = None,
@@ -512,8 +520,22 @@ def submit_to_aml(
         add_service_for_tensorboard(services, tensorboard_dir)
 
     # Data
-    inputs = build_command_inputs(ml_client, datasets_download, datasets_mount)
-    outputs = build_command_outputs(datasets_output)
+    inputs = build_command_inputs(
+        ml_client,
+        mount_asset=mount_asset,
+        download_asset=download_asset,
+        mount_datastore=mount_datastore,
+        download_datastore=download_datastore,
+        mount_job=mount_job,
+        download_job=download_job,
+        legacy_mount=datasets_mount,
+        legacy_download=datasets_download,
+    )
+    outputs = build_command_outputs(
+        output_datastore=output_datastore,
+        output_asset=output_asset,
+        legacy_output=datasets_output,
+    )
 
     # Sweep jobs
     is_sweep = sweep_inputs is not None and len(sweep_inputs) > 0
diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py
index 410568b..0cef3eb 100644
--- a/src/submit_aml/data.py
+++ b/src/submit_aml/data.py
@@ -6,6 +6,7 @@
 from azure.ai.ml import Input
 from azure.ai.ml import MLClient
 from azure.ai.ml import Output
+from azure.ai.ml.constants import AssetTypes
 from azure.ai.ml.constants import InputOutputModes
 from azure.ai.ml.entities._job.sweep.search_space import SweepDistribution
 from azure.ai.ml.exceptions import MlException
@@ -17,6 +18,23 @@
 TypeOptionalStrList = list[str] | None
 
 
+def _datastore_uri(datastore: str, path: str) -> str:
+    """Build an Azure ML datastore URI for a folder.
+
+    Args:
+        datastore: Name of the datastore.
+        path: Path to the folder within the datastore.
+
+    Returns:
+        A URI of the form `azureml://datastores/<datastore>/paths/<path>`.
+
+    Examples:
+        >>> _datastore_uri('mystore', 'exports/reference')
+        'azureml://datastores/mystore/paths/exports/reference'
+    """
+    return f"azureml://datastores/{datastore}/paths/{path}"
+
+
 def _extract_alias_path_version(string: str) -> tuple[str, str, str | None]:
     """Get alias, data asset path, and data asset version from a string.
 
@@ -27,12 +45,9 @@ def _extract_alias_path_version(string: str) -> tuple[str, str, str | None]:
         Tuple of alias, path, and version (which may be
         None if version is not provided).
 
-    Raises:
-        ValueError: If the string is not of the expected format.
-
     Examples:
         >>> _extract_alias_path_version('my_data=MIMIC-CXR-V2:2')
-        ('my_data', 'MIMIC-CXR-V2', 2)
+        ('my_data', 'MIMIC-CXR-V2', '2')
         >>> _extract_alias_path_version('my_data=MIMIC-CXR-V2')
         ('my_data', 'MIMIC-CXR-V2', None)
     """
@@ -64,11 +79,8 @@ def _extract_alias_datastore_path(string: str) -> tuple[str, str, str]:
     Returns:
         Tuple of alias, datastore and folder.
 
-    Raises:
-        ValueError: If the string is not of the expected format.
-
     Examples:
-        >>> get_alias_datastore_path('my_data=inereyedata/output_dataset')
+        >>> _extract_alias_datastore_path('my_data=inereyedata/output_dataset')
         ('my_data', 'inereyedata', 'output_dataset')
     """
     pattern = r"(?P<alias>[^=]+)=(?P<datastore>[^/]+)/(?P<folder>.+)"
@@ -84,146 +96,297 @@ def _extract_alias_datastore_path(string: str) -> tuple[str, str, str]:
 
 
 def _extract_alias_job_path(string: str) -> tuple[str, str, str]:
-    """Get alias, job ID, and path from a job directory string.
+    """Get alias, job ID, and path from a job output string.
 
     Args:
-        string: String of the form `'alias=job_dir:<job_id>:<path>'`.
+        string: String of the form `'alias=<job_id>:<path>'`.
 
     Returns:
         Tuple of alias, job_id, and path.
 
-    Raises:
-        ValueError: If the string is not of the expected format.
-
     Examples:
-        >>> _extract_alias_job_path('checkpoint=job_dir:my_job_123:models/best.pth')
+        >>> _extract_alias_job_path('checkpoint=my_job_123:models/best.pth')
         ('checkpoint', 'my_job_123', 'models/best.pth')
     """
-    pattern = r"(?P<alias>[^=]+)=job_dir:(?P<job_id>[^:]+):(?P<path>.+)"
+    pattern = r"(?P<alias>[^=]+)=(?P<job_id>[^:]+):(?P<path>.+)"
     match = re.match(pattern, string)
     if match is None:
         message = (
-            f'Invalid job directory string: "{string}".'
-            ' Expected format: "alias=job_dir:job_id:path".'
+            f'Invalid job output string: "{string}".'
+            ' Expected format: "alias=job_id:path".'
         )
-        raise ValueError(message)
+        logger.error(message)
+        sys.exit(1)
     return match.group("alias"), match.group("job_id"), match.group("path")
 
 
-def _is_alias_path_version_string(string: str) -> bool:
-    try:
-        _extract_alias_path_version(string)
-        return True
-    except ValueError:
-        return False
+def _input_from_asset(
+    ml_client: MLClient,
+    string: str,
+    mode: str,
+) -> tuple[str, Input]:
+    """Build an `Input` from a registered data asset string.
 
+    Args:
+        ml_client: Client used to resolve the data asset.
+        string: String of the form `'alias=name[:version]'`.
+        mode: Either `InputOutputModes.DOWNLOAD` or `InputOutputModes.MOUNT`.
 
-def _is_alias_job_path_string(string: str) -> bool:
-    try:
-        _extract_alias_job_path(string)
-        return True
-    except ValueError:
-        return False
+    Returns:
+        Tuple of alias and the resolved `Input`.
 
+    Raises:
+        ValueError: If the data asset cannot be retrieved.
+    """
+    alias, path, version = _extract_alias_path_version(string)
+
+    if version is None:
+        kwargs = {"label": "latest"}
+    else:
+        kwargs = {"version": version}
+
+    with report_time(
+        f'Retrieving data asset "{path}"...',
+        f'Retrieved data asset "{path}"',
+    ):
+        try:
+            data = ml_client.data.get(name=path, **kwargs)
+        except MlException as e:
+            msg = f'Error getting data asset with name "{path}" and version "{version}"'
+            raise ValueError(msg) from e
+    return alias, Input(path=data.id, mode=mode)
+
+
+def _input_from_datastore(string: str, mode: str) -> tuple[str, Input]:
+    """Build an `Input` from a datastore-path string.
+
+    Args:
+        string: String of the form `'alias=datastore/folder'`.
+        mode: Either `InputOutputModes.DOWNLOAD` or `InputOutputModes.MOUNT`.
+
+    Returns:
+        Tuple of alias and the resulting `Input`.
+    """
+    alias, datastore, folder = _extract_alias_datastore_path(string)
+    azureml_path = _datastore_uri(datastore, folder)
+    logger.info(f'Using datastore path "{azureml_path}"...')
+    return alias, Input(path=azureml_path, mode=mode)
 
-def build_command_inputs(
-    ml_client: MLClient,
-    strings_download: list[str] | None,
-    strings_mount: list[str] | None,
-) -> TypeInputsDict:
-    """Get dictionaries data assets to be mounted or downloaded.
+
+def _input_from_job(string: str, mode: str) -> tuple[str, Input]:
+    """Build an `Input` from a previous job's output string.
 
     Args:
-        strings_download: List of strings of the form `'alias=path:version'` to
-            be downloaded. If `None`, no data assets will be downloaded.
-        strings_mount: List of strings of the form `'alias=path:version'` to
-            be mounted. If `None`, no data assets will be mounted.
+        string: String of the form `'alias=<job_id>:<path>'`.
+        mode: Either `InputOutputModes.DOWNLOAD` or `InputOutputModes.MOUNT`.
+
+    Returns:
+        Tuple of alias and the resulting `Input`.
     """
-    strings_download = [] if strings_download is None else strings_download
-    strings_mount = [] if strings_mount is None else strings_mount
-    datasets_download = _get_data_assets(
-        ml_client,
-        strings_download,
-        InputOutputModes.DOWNLOAD,
+    alias, job_id, path = _extract_alias_job_path(string)
+    azureml_path = _datastore_uri(
+        "workspaceartifactstore",
+        f"ExperimentRun/dcid.{job_id}/{path}",
     )
-    datasets_mount = _get_data_assets(
-        ml_client,
-        strings_mount,
-        InputOutputModes.MOUNT,
+    logger.info(f'Using job output path "{azureml_path}"...')
+    return alias, Input(path=azureml_path, mode=mode)
+
+
+def _output_from_datastore(string: str) -> tuple[str, Output]:
+    """Build an `Output` that writes to a datastore folder.
+
+    Args:
+        string: String of the form `'alias=datastore/folder'`.
+
+    Returns:
+        Tuple of alias and the resulting `Output`.
+    """
+    alias, datastore, folder = _extract_alias_datastore_path(string)
+    return alias, Output(path=_datastore_uri(datastore, folder))
+
+
+def _output_from_asset(string: str) -> tuple[str, Output]:
+    """Build an `Output` that registers a data asset.
+
+    The blobs are written to the workspace's default datastore at an
+    Azure ML-managed location and registered as a data asset.
+
+    Args:
+        string: String of the form `'alias=name[:version]'`. If the version is
+            omitted, Azure ML auto-increments it.
+
+    Returns:
+        Tuple of alias and the resulting `Output`.
+    """
+    alias, name, version = _extract_alias_path_version(string)
+    output = Output(type=AssetTypes.URI_FOLDER, name=name, version=version)
+    return alias, output
+
+
+def _warn_deprecated(old_flag: str, new_flags: str) -> None:
+    """Emit a deprecation warning pointing users to the new flags.
+
+    Args:
+        old_flag: The deprecated flag (e.g. `'--mount'`).
+        new_flags: Human-readable list of replacement flags.
+    """
+    logger.warning(
+        f"{old_flag} is deprecated and will be removed in a future release."
+        f" Use {new_flags} instead.",
     )
-    return {**datasets_download, **datasets_mount}
 
 
-def build_command_outputs(
-    strings_upload: list[str] | None,
-) -> dict[str, Output]:
-    """Get outputs for command.
+def _classify_legacy_input(string: str) -> str:
+    """Classify a legacy `--mount`/`--download` value by source type.
 
     Args:
-        strings_upload: List of strings of the form `'alias=datastore/path/to/dir'` to
-            be uploaded. If `None`, no outputs will be returned.
+        string: A legacy dataset string.
+
+    Returns:
+        One of `'job'`, `'datastore'`, or `'asset'`.
+
+    Examples:
+        >>> _classify_legacy_input('ckpt=job_dir:job123:out/best.pth')
+        'job'
+        >>> _classify_legacy_input('ref=mystore/exports/reference')
+        'datastore'
+        >>> _classify_legacy_input('data=MY-DATASET:2')
+        'asset'
     """
-    strings_upload = [] if strings_upload is None else strings_upload
-    outputs_dict = {}
-    for string in strings_upload:
-        alias, datastore, path = _extract_alias_datastore_path(string)
-        output = Output(
-            path=f"azureml://datastores/{datastore}/paths/{path}",
-        )
-        outputs_dict[alias] = output
-    return outputs_dict
+    if "=" not in string:
+        return "asset"
+    _, rhs = string.split("=", 1)
+    if rhs.startswith("job_dir:"):
+        return "job"
+    if "/" in rhs:
+        return "datastore"
+    return "asset"
 
 
-def _get_data_assets(
+def _legacy_input(
     ml_client: MLClient,
-    datasets: list[str],
+    string: str,
     mode: str,
-) -> dict[str, Input]:
-    """Get data assets from Azure ML.
+) -> tuple[str, Input]:
+    """Route a legacy `--mount`/`--download` value to the right builder.
 
     Args:
-        datasets: List of strings of the form `'alias=path:version'` or
-            `'alias=job_dir:<job_id>:<path>'`.
+        ml_client: Client used to resolve data assets.
+        string: A legacy dataset string.
         mode: Either `InputOutputModes.DOWNLOAD` or `InputOutputModes.MOUNT`.
 
+    Returns:
+        Tuple of alias and the resulting `Input`.
+    """
+    source = _classify_legacy_input(string)
+    if source == "job":
+        # Translate the old "alias=job_dir:<job_id>:<path>" form to the new one.
+        translated = string.replace("=job_dir:", "=", 1)
+        return _input_from_job(translated, mode)
+    if source == "datastore":
+        return _input_from_datastore(string, mode)
+    return _input_from_asset(ml_client, string, mode)
+
+
+def build_command_inputs(
+    ml_client: MLClient,
+    *,
+    mount_asset: list[str] | None = None,
+    download_asset: list[str] | None = None,
+    mount_datastore: list[str] | None = None,
+    download_datastore: list[str] | None = None,
+    mount_job: list[str] | None = None,
+    download_job: list[str] | None = None,
+    legacy_mount: list[str] | None = None,
+    legacy_download: list[str] | None = None,
+) -> TypeInputsDict:
+    """Build the inputs dictionary for a command job.
+
+    Args:
+        ml_client: Client used to resolve data assets.
+        mount_asset: Data assets to mount, as `'alias=name[:version]'`.
+        download_asset: Data assets to download, as `'alias=name[:version]'`.
+        mount_datastore: Datastore folders to mount, as `'alias=datastore/folder'`.
+        download_datastore: Datastore folders to download, as
+            `'alias=datastore/folder'`.
+        mount_job: Previous job outputs to mount, as `'alias=<job_id>:<path>'`.
+        download_job: Previous job outputs to download, as `'alias=<job_id>:<path>'`.
+        legacy_mount: Deprecated `--mount` values, routed by source type.
+        legacy_download: Deprecated `--download` values, routed by source type.
+
     Returns:
         Dictionary of `alias: Input` mappings.
     """
-    inputs = {}
-    for string in datasets:
-        if _is_alias_job_path_string(string):
-            # Handle job directory format
-            alias, job_id, path = _extract_alias_job_path(string)
-            azureml_path = f"azureml://datastores/workspaceartifactstore/paths/ExperimentRun/dcid.{job_id}/{path}"
-            logger.info(f'Using job output path "{azureml_path}"...')
-            inputs[alias] = Input(
-                path=str(azureml_path),
-                mode=mode,
-            )
-        else:
-            # Handle regular data asset format
-            alias, path, version = _extract_alias_path_version(string)
-
-            if version is None:
-                kwargs = {"label": "latest"}
-            else:
-                kwargs = {"version": version}
-
-            with report_time(
-                f'Retrieving data asset "{path}"...',
-                f'Retrieved data asset "{path}"',
-            ):
-                try:
-                    data = ml_client.data.get(name=path, **kwargs)
-                except MlException as e:
-                    msg = (
-                        "Error getting data asset with"
-                        f' name "{path}"'
-                        f' and version "{version}"'
-                    )
-                    raise ValueError(msg) from e
-            inputs[alias] = Input(
-                path=data.id,
-                mode=mode,
-            )
+    inputs: TypeInputsDict = {}
+
+    for string in mount_asset or []:
+        alias, value = _input_from_asset(ml_client, string, InputOutputModes.MOUNT)
+        inputs[alias] = value
+    for string in download_asset or []:
+        alias, value = _input_from_asset(ml_client, string, InputOutputModes.DOWNLOAD)
+        inputs[alias] = value
+
+    for string in mount_datastore or []:
+        alias, value = _input_from_datastore(string, InputOutputModes.MOUNT)
+        inputs[alias] = value
+    for string in download_datastore or []:
+        alias, value = _input_from_datastore(string, InputOutputModes.DOWNLOAD)
+        inputs[alias] = value
+
+    for string in mount_job or []:
+        alias, value = _input_from_job(string, InputOutputModes.MOUNT)
+        inputs[alias] = value
+    for string in download_job or []:
+        alias, value = _input_from_job(string, InputOutputModes.DOWNLOAD)
+        inputs[alias] = value
+
+    if legacy_mount:
+        _warn_deprecated("--mount", "--mount-asset, --mount-datastore or --mount-job")
+        for string in legacy_mount:
+            alias, value = _legacy_input(ml_client, string, InputOutputModes.MOUNT)
+            inputs[alias] = value
+    if legacy_download:
+        _warn_deprecated(
+            "--download",
+            "--download-asset, --download-datastore or --download-job",
+        )
+        for string in legacy_download:
+            alias, value = _legacy_input(ml_client, string, InputOutputModes.DOWNLOAD)
+            inputs[alias] = value
+
     return inputs
+
+
+def build_command_outputs(
+    *,
+    output_datastore: list[str] | None = None,
+    output_asset: list[str] | None = None,
+    legacy_output: list[str] | None = None,
+) -> dict[str, Output]:
+    """Build the outputs dictionary for a command job.
+
+    Args:
+        output_datastore: Datastore folders to write to, as
+            `'alias=datastore/folder'`.
+        output_asset: Data assets to register, as `'alias=name[:version]'`.
+        legacy_output: Deprecated `--output` values (datastore folders).
+
+    Returns:
+        Dictionary of `alias: Output` mappings.
+    """
+    outputs: dict[str, Output] = {}
+
+    for string in output_datastore or []:
+        alias, value = _output_from_datastore(string)
+        outputs[alias] = value
+    for string in output_asset or []:
+        alias, value = _output_from_asset(string)
+        outputs[alias] = value
+
+    if legacy_output:
+        _warn_deprecated("--output", "--output-datastore or --output-asset")
+        for string in legacy_output:
+            alias, value = _output_from_datastore(string)
+            outputs[alias] = value
+
+    return outputs
diff --git a/tests/test_data.py b/tests/test_data.py
index 8f9faab..be33724 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -1,14 +1,35 @@
-"""Tests for data-asset parsing helpers."""
+"""Tests for data-asset parsing helpers and input/output builders."""
 
 from __future__ import annotations
 
+from unittest.mock import Mock
+
 import pytest
+from azure.ai.ml.constants import InputOutputModes
 
+from submit_aml.data import _classify_legacy_input
+from submit_aml.data import _datastore_uri
 from submit_aml.data import _extract_alias_datastore_path
 from submit_aml.data import _extract_alias_job_path
 from submit_aml.data import _extract_alias_path_version
+from submit_aml.data import _input_from_datastore
+from submit_aml.data import _input_from_job
+from submit_aml.data import _output_from_asset
+from submit_aml.data import _output_from_datastore
+from submit_aml.data import build_command_inputs
 from submit_aml.data import build_command_outputs
 
+# ---------------------------------------------------------------------------
+# _datastore_uri
+# ---------------------------------------------------------------------------
+
+
+def test_datastore_uri() -> None:
+    """A datastore and path are joined into an azureml:// URI."""
+    uri = _datastore_uri("mystore", "exports/reference")
+    assert uri == "azureml://datastores/mystore/paths/exports/reference"
+
+
 # ---------------------------------------------------------------------------
 # _extract_alias_path_version
 # ---------------------------------------------------------------------------
@@ -51,35 +72,199 @@ def test_extract_alias_datastore_path_valid() -> None:
 
 
 def test_extract_alias_job_path_valid() -> None:
-    """'alias=job_dir:job_id:path' is parsed correctly."""
+    """'alias=job_id:path' is parsed correctly (no job_dir: prefix)."""
     alias, job_id, path = _extract_alias_job_path(
-        "checkpoint=job_dir:my_job_123:models/best.pth"
+        "checkpoint=my_job_123:models/best.pth"
     )
     assert alias == "checkpoint"
     assert job_id == "my_job_123"
     assert path == "models/best.pth"
 
 
-def test_extract_alias_job_path_invalid_raises() -> None:
-    """Strings not matching the job_dir pattern raise ValueError."""
-    with pytest.raises(ValueError, match="Invalid job directory"):
+def test_extract_alias_job_path_invalid_exits() -> None:
+    """Strings without a path component exit the process."""
+    with pytest.raises(SystemExit):
         _extract_alias_job_path("bad_format")
 
 
+# ---------------------------------------------------------------------------
+# _classify_legacy_input
+# ---------------------------------------------------------------------------
+
+
+def test_classify_legacy_input_job() -> None:
+    """A job_dir: prefix is classified as a job output."""
+    assert _classify_legacy_input("ckpt=job_dir:job123:out/best.pth") == "job"
+
+
+def test_classify_legacy_input_datastore() -> None:
+    """A slash in the right-hand side signals a datastore path."""
+    assert _classify_legacy_input("ref=mystore/exports/reference") == "datastore"
+
+
+def test_classify_legacy_input_asset() -> None:
+    """A plain name[:version] is classified as a data asset."""
+    assert _classify_legacy_input("data=MY-DATASET:2") == "asset"
+
+
+def test_classify_legacy_input_missing_equals() -> None:
+    """A string without '=' falls back to the asset branch."""
+    assert _classify_legacy_input("no-equals-here") == "asset"
+
+
+# ---------------------------------------------------------------------------
+# input builders
+# ---------------------------------------------------------------------------
+
+
+def test_input_from_datastore() -> None:
+    """A datastore string yields an Input with an azureml:// path and mode."""
+    alias, value = _input_from_datastore(
+        "ref=mystore/exports/reference",
+        InputOutputModes.MOUNT,
+    )
+    assert alias == "ref"
+    assert value.path == "azureml://datastores/mystore/paths/exports/reference"
+    assert value.mode == InputOutputModes.MOUNT
+
+
+def test_input_from_job() -> None:
+    """A job string yields an Input pointing at the job's run artifacts."""
+    alias, value = _input_from_job(
+        "checkpoint=my_job_123:models/best.pth",
+        InputOutputModes.DOWNLOAD,
+    )
+    assert alias == "checkpoint"
+    assert "ExperimentRun/dcid.my_job_123/models/best.pth" in value.path
+    assert "workspaceartifactstore" in value.path
+    assert value.mode == InputOutputModes.DOWNLOAD
+
+
+# ---------------------------------------------------------------------------
+# output builders
+# ---------------------------------------------------------------------------
+
+
+def test_output_from_datastore() -> None:
+    """A datastore string yields an Output with an azureml:// path."""
+    alias, output = _output_from_datastore("out_dir=mydatastore/my_dataset")
+    assert alias == "out_dir"
+    assert output.path == "azureml://datastores/mydatastore/paths/my_dataset"
+
+
+def test_output_from_asset_with_version() -> None:
+    """An asset string registers an Output with name and version."""
+    alias, output = _output_from_asset("out_dir=my-results:3")
+    assert alias == "out_dir"
+    assert output.name == "my-results"
+    assert output.version == "3"
+    assert output.type == "uri_folder"
+
+
+def test_output_from_asset_without_version() -> None:
+    """Omitting the version leaves it unset for Azure ML to auto-increment."""
+    _, output = _output_from_asset("out_dir=my-results")
+    assert output.name == "my-results"
+    assert output.version is None
+
+
+# ---------------------------------------------------------------------------
+# build_command_inputs
+# ---------------------------------------------------------------------------
+
+
+def test_build_command_inputs_empty() -> None:
+    """No arguments produce an empty dict and never touch the client."""
+    client = Mock()
+    assert build_command_inputs(client) == {}
+    client.data.get.assert_not_called()
+
+
+def test_build_command_inputs_datastore_and_job_skip_client() -> None:
+    """Datastore and job inputs are built without calling the client."""
+    client = Mock()
+    inputs = build_command_inputs(
+        client,
+        mount_datastore=["ref=mystore/exports/reference"],
+        download_job=["ckpt=my_job_123:models/best.pth"],
+    )
+    assert set(inputs) == {"ref", "ckpt"}
+    assert inputs["ref"].mode == InputOutputModes.MOUNT
+    assert inputs["ckpt"].mode == InputOutputModes.DOWNLOAD
+    client.data.get.assert_not_called()
+
+
+def test_build_command_inputs_asset_calls_client() -> None:
+    """A data-asset input resolves through the client."""
+    client = Mock()
+    client.data.get.return_value = Mock(id="azureml:resolved-asset:1")
+    inputs = build_command_inputs(client, mount_asset=["data=MY-DATASET:2"])
+    client.data.get.assert_called_once()
+    assert inputs["data"].path == "azureml:resolved-asset:1"
+
+
+def test_build_command_inputs_legacy_datastore_routes(
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    """Legacy --mount datastore strings route to the datastore builder."""
+    client = Mock()
+    inputs = build_command_inputs(
+        client,
+        legacy_mount=["ref=mystore/exports/reference"],
+    )
+    assert inputs["ref"].path == (
+        "azureml://datastores/mystore/paths/exports/reference"
+    )
+    client.data.get.assert_not_called()
+    assert "deprecated" in capsys.readouterr().out.lower()
+
+
+def test_build_command_inputs_legacy_job_routes() -> None:
+    """Legacy --download job_dir strings route to the job builder."""
+    client = Mock()
+    inputs = build_command_inputs(
+        client,
+        legacy_download=["ckpt=job_dir:my_job_123:models/best.pth"],
+    )
+    assert "ExperimentRun/dcid.my_job_123/models/best.pth" in inputs["ckpt"].path
+    client.data.get.assert_not_called()
+
+
+def test_build_command_inputs_legacy_asset_calls_client() -> None:
+    """Legacy --mount asset strings still resolve through the client."""
+    client = Mock()
+    client.data.get.return_value = Mock(id="azureml:resolved-asset:1")
+    build_command_inputs(client, legacy_mount=["data=MY-DATASET:2"])
+    client.data.get.assert_called_once()
+
+
 # ---------------------------------------------------------------------------
 # build_command_outputs
 # ---------------------------------------------------------------------------
 
 
-def test_build_command_outputs_none() -> None:
-    """None input produces an empty dict."""
-    assert build_command_outputs(None) == {}
+def test_build_command_outputs_empty() -> None:
+    """No arguments produce an empty dict."""
+    assert build_command_outputs() == {}
+
+
+def test_build_command_outputs_datastore_and_asset() -> None:
+    """Datastore and asset outputs are both built."""
+    outputs = build_command_outputs(
+        output_datastore=["out_dir=mydatastore/my_dataset"],
+        output_asset=["asset_dir=my-results:2"],
+    )
+    assert outputs["out_dir"].path == (
+        "azureml://datastores/mydatastore/paths/my_dataset"
+    )
+    assert outputs["asset_dir"].name == "my-results"
+    assert outputs["asset_dir"].version == "2"
 
 
-def test_build_command_outputs_valid() -> None:
-    """Valid output strings are converted into Output objects."""
-    outputs = build_command_outputs(["out_dir=mydatastore/my_dataset"])
+def test_build_command_outputs_legacy_warns(
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    """Legacy --output strings are built and emit a deprecation warning."""
+    outputs = build_command_outputs(legacy_output=["out_dir=mydatastore/my_dataset"])
     assert "out_dir" in outputs
-    output = outputs["out_dir"]
-    assert "mydatastore" in output.path
-    assert "my_dataset" in output.path
+    assert "deprecated" in capsys.readouterr().out.lower()

From 09ecedd51a2c63674e0eb779c1d770eadbbd688c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?=
 <fperezgarcia@microsoft.com>
Date: Tue, 9 Jun 2026 16:41:04 +0100
Subject: [PATCH 02/16] Address Copilot review: robust legacy classification
 and clearer error

- Legacy --mount/--download values that use the new "job_id:path" form (a colon
  before the first slash) now route to the job-output builder instead of being
  misread as a datastore path.
- The data-asset lookup failure message now reports "latest" when no version was
  given, matching the label used for the lookup.
- Add tests for new-syntax job classification/routing and the error message.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/submit_aml/data.py | 23 +++++++++++++++++++----
 tests/test_data.py     | 35 ++++++++++++++++++++++++++++++++++-
 2 files changed, 53 insertions(+), 5 deletions(-)

diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py
index 0cef3eb..5948836 100644
--- a/src/submit_aml/data.py
+++ b/src/submit_aml/data.py
@@ -152,7 +152,11 @@ def _input_from_asset(
         try:
             data = ml_client.data.get(name=path, **kwargs)
         except MlException as e:
-            msg = f'Error getting data asset with name "{path}" and version "{version}"'
+            version_desc = "latest" if version is None else version
+            msg = (
+                f'Error getting data asset with name "{path}"'
+                f' and version "{version_desc}"'
+            )
             raise ValueError(msg) from e
     return alias, Input(path=data.id, mode=mode)
 
@@ -245,9 +249,16 @@ def _classify_legacy_input(string: str) -> str:
     Returns:
         One of `'job'`, `'datastore'`, or `'asset'`.
 
+    A right-hand side that starts with `job_dir:`, or that has a `:` before its
+    first `/` (the new `job_id:path` form), is a job output. A `/` that comes
+    before any `:` signals a datastore folder. Anything else (a bare `name` or
+    `name:version`) is a data asset.
+
     Examples:
         >>> _classify_legacy_input('ckpt=job_dir:job123:out/best.pth')
         'job'
+        >>> _classify_legacy_input('ckpt=job123:out/best.pth')
+        'job'
         >>> _classify_legacy_input('ref=mystore/exports/reference')
         'datastore'
         >>> _classify_legacy_input('data=MY-DATASET:2')
@@ -258,9 +269,13 @@ def _classify_legacy_input(string: str) -> str:
     _, rhs = string.split("=", 1)
     if rhs.startswith("job_dir:"):
         return "job"
-    if "/" in rhs:
-        return "datastore"
-    return "asset"
+    slash = rhs.find("/")
+    if slash == -1:
+        return "asset"
+    colon = rhs.find(":")
+    if colon != -1 and colon < slash:
+        return "job"
+    return "datastore"
 
 
 def _legacy_input(
diff --git a/tests/test_data.py b/tests/test_data.py
index be33724..db1d602 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -6,12 +6,14 @@
 
 import pytest
 from azure.ai.ml.constants import InputOutputModes
+from azure.ai.ml.exceptions import MlException
 
 from submit_aml.data import _classify_legacy_input
 from submit_aml.data import _datastore_uri
 from submit_aml.data import _extract_alias_datastore_path
 from submit_aml.data import _extract_alias_job_path
 from submit_aml.data import _extract_alias_path_version
+from submit_aml.data import _input_from_asset
 from submit_aml.data import _input_from_datastore
 from submit_aml.data import _input_from_job
 from submit_aml.data import _output_from_asset
@@ -97,11 +99,21 @@ def test_classify_legacy_input_job() -> None:
     assert _classify_legacy_input("ckpt=job_dir:job123:out/best.pth") == "job"
 
 
+def test_classify_legacy_input_job_new_syntax() -> None:
+    """A new-style 'job_id:path' value (colon before slash) is a job output."""
+    assert _classify_legacy_input("ckpt=job123:outputs/best.pth") == "job"
+
+
 def test_classify_legacy_input_datastore() -> None:
-    """A slash in the right-hand side signals a datastore path."""
+    """A slash before any colon signals a datastore path."""
     assert _classify_legacy_input("ref=mystore/exports/reference") == "datastore"
 
 
+def test_classify_legacy_input_datastore_with_colon_in_folder() -> None:
+    """A colon after the first slash is part of the folder, not a job id."""
+    assert _classify_legacy_input("ref=mystore/a:b/c") == "datastore"
+
+
 def test_classify_legacy_input_asset() -> None:
     """A plain name[:version] is classified as a data asset."""
     assert _classify_legacy_input("data=MY-DATASET:2") == "asset"
@@ -140,6 +152,16 @@ def test_input_from_job() -> None:
     assert value.mode == InputOutputModes.DOWNLOAD
 
 
+def test_input_from_asset_missing_version_reports_latest() -> None:
+    """When no version is given, the failure message mentions 'latest'."""
+    client = Mock()
+    client.data.get.side_effect = MlException(
+        message="boom", no_personal_data_message="boom"
+    )
+    with pytest.raises(ValueError, match='version "latest"'):
+        _input_from_asset(client, "data=MY-DATASET", InputOutputModes.MOUNT)
+
+
 # ---------------------------------------------------------------------------
 # output builders
 # ---------------------------------------------------------------------------
@@ -230,6 +252,17 @@ def test_build_command_inputs_legacy_job_routes() -> None:
     client.data.get.assert_not_called()
 
 
+def test_build_command_inputs_legacy_job_new_syntax_routes() -> None:
+    """Legacy values using the new 'job_id:path' form route to the job builder."""
+    client = Mock()
+    inputs = build_command_inputs(
+        client,
+        legacy_mount=["ckpt=my_job_123:models/best.pth"],
+    )
+    assert "ExperimentRun/dcid.my_job_123/models/best.pth" in inputs["ckpt"].path
+    client.data.get.assert_not_called()
+
+
 def test_build_command_inputs_legacy_asset_calls_client() -> None:
     """Legacy --mount asset strings still resolve through the client."""
     client = Mock()

From efc9ba7b2e3bf3151c5f2c7629aabfaef3dd9ea7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?=
 <fperezgarcia@microsoft.com>
Date: Tue, 9 Jun 2026 16:47:58 +0100
Subject: [PATCH 03/16] Address Copilot review: document datastore form in
 deprecated flag help

The deprecated --mount/--download flags route datastore-folder values via
legacy parsing, so their help text now mentions the 'alias=datastore/folder'
form alongside the dataset and job-output forms.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/submit_aml/__main__.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/submit_aml/__main__.py b/src/submit_aml/__main__.py
index 907eb25..ed171b3 100644
--- a/src/submit_aml/__main__.py
+++ b/src/submit_aml/__main__.py
@@ -142,12 +142,13 @@ def submit(
         "-d",
         help=(
             "[DEPRECATED] Use --download-asset, --download-datastore or"
-            " --download-job instead. Azure ML dataset or job output folder to"
-            " download. To download an Azure ML dataset, the argument should take"
-            " the form: alias, name and version of the dataset; for example:"
-            " 'vindr_dir=VINDR-CXR-V2:1'. If the version is omitted, the last one"
-            " will be used. To download the output folder of a previous job, the"
-            " argument should take the form"
+            " --download-job instead. Azure ML dataset, datastore folder or job"
+            " output folder to download. To download an Azure ML dataset, the"
+            " argument should take the form: alias, name and version of the"
+            " dataset; for example: 'vindr_dir=VINDR-CXR-V2:1'. If the version is"
+            " omitted, the last one will be used. To download a datastore folder,"
+            " use 'alias=datastore/folder'. To download the output folder of a"
+            " previous job, the argument should take the form"
             " 'alias=job_dir:<job_id>:<path/in/job/outputs>'; for example:"
             " 'checkpoint=job_dir:crusty_hat_43s6lmvb25:outputs/checkpoint-10000'."
             " The alias can be used to pass input datasets to the script, e.g.,"
@@ -162,9 +163,10 @@ def submit(
         "-m",
         help=(
             "[DEPRECATED] Use --mount-asset, --mount-datastore or --mount-job"
-            " instead. Azure ML dataset or job output folder to mount."
-            " For an Azure ML dataset, the alias, name and version should be provided"
-            " while for a job output folder, the alias, job ID and path in the job"
+            " instead. Azure ML dataset, datastore folder or job output folder to"
+            " mount. For an Azure ML dataset, the alias, name and version should be"
+            " provided; for a datastore folder, use 'alias=datastore/folder'; while"
+            " for a job output folder, the alias, job ID and path in the job"
             " outputs should be provided. See the --download option for more"
             " information."
         ),

From 97af6ac7406c05642d9e235b851df36d17d2493c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?=
 <fperezgarcia@microsoft.com>
Date: Tue, 9 Jun 2026 16:54:38 +0100
Subject: [PATCH 04/16] Address Copilot review: reject legacy job_dir: prefix;
 clarify job path help

- The new --mount-job/--download-job flags now reject values that still carry
  the legacy "job_dir:" prefix with a clear error, instead of silently building
  an invalid ExperimentRun/dcid.job_dir/... URI. (Legacy --mount/--download
  still accept and translate the prefix.)
- Reworded the job help/example to make clear the path may point at any run
  artifact, not only files under outputs/.
- Added a test for the rejected legacy prefix.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/submit_aml/__main__.py | 9 +++++----
 src/submit_aml/data.py     | 8 ++++++++
 tests/test_data.py         | 6 ++++++
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/src/submit_aml/__main__.py b/src/submit_aml/__main__.py
index ed171b3..bf28111 100644
--- a/src/submit_aml/__main__.py
+++ b/src/submit_aml/__main__.py
@@ -218,9 +218,10 @@ def submit(
         None,
         "--mount-job",
         help=(
-            "Output folder of a previous job to mount, expressed as"
-            ' "alias=<job_id>:<path/in/job/outputs>".'
-            ' For example: "checkpoint=crusty_hat_43s6lmvb25:outputs/best.pth".'
+            "Output of a previous job to mount, expressed as"
+            ' "alias=<job_id>:<path/in/run/artifacts>". The path may point at any'
+            " run artifact, not just files under outputs/."
+            ' For example: "checkpoint=crusty_hat_43s6lmvb25:models/best.pth".'
             r" Pass it to the script with '${{inputs.checkpoint}}'."
             " This option can be used multiple times."
         ),
@@ -230,7 +231,7 @@ def submit(
         None,
         "--download-job",
         help=(
-            "Output folder of a previous job to download. Same format as"
+            "Output of a previous job to download. Same format as"
             " --mount-job. This option can be used multiple times."
         ),
         rich_help_panel=PANEL_DATA,
diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py
index 5948836..7a7225b 100644
--- a/src/submit_aml/data.py
+++ b/src/submit_aml/data.py
@@ -117,6 +117,14 @@ def _extract_alias_job_path(string: str) -> tuple[str, str, str]:
         )
         logger.error(message)
         sys.exit(1)
+    if match.group("job_id") == "job_dir":
+        message = (
+            f'Invalid job output string: "{string}".'
+            ' The "job_dir:" prefix is no longer used with the job flags;'
+            ' use "alias=job_id:path" instead.'
+        )
+        logger.error(message)
+        sys.exit(1)
     return match.group("alias"), match.group("job_id"), match.group("path")
 
 
diff --git a/tests/test_data.py b/tests/test_data.py
index db1d602..bf1e8ad 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -89,6 +89,12 @@ def test_extract_alias_job_path_invalid_exits() -> None:
         _extract_alias_job_path("bad_format")
 
 
+def test_extract_alias_job_path_rejects_legacy_prefix() -> None:
+    """The legacy 'job_dir:' prefix is rejected on the new job flags."""
+    with pytest.raises(SystemExit):
+        _extract_alias_job_path("ckpt=job_dir:my_job_123:models/best.pth")
+
+
 # ---------------------------------------------------------------------------
 # _classify_legacy_input
 # ---------------------------------------------------------------------------

From 8f1eeca181fff3df74e4123d6592f66ef30faa00 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?=
 <fperezgarcia@microsoft.com>
Date: Tue, 9 Jun 2026 17:02:29 +0100
Subject: [PATCH 05/16] Address Copilot review: name Python params in
 deprecation warnings

The deprecation warnings now reference both the CLI flag and the equivalent
Python parameter (e.g. "--mount (datasets_mount)"), so the message is accurate
whether the deprecated path was reached via the CLI or the submit_to_aml API.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/submit_aml/data.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py
index 7a7225b..f224d01 100644
--- a/src/submit_aml/data.py
+++ b/src/submit_aml/data.py
@@ -364,14 +364,19 @@ def build_command_inputs(
         inputs[alias] = value
 
     if legacy_mount:
-        _warn_deprecated("--mount", "--mount-asset, --mount-datastore or --mount-job")
+        _warn_deprecated(
+            "--mount (datasets_mount)",
+            "--mount-asset, --mount-datastore or --mount-job"
+            " (mount_asset, mount_datastore or mount_job)",
+        )
         for string in legacy_mount:
             alias, value = _legacy_input(ml_client, string, InputOutputModes.MOUNT)
             inputs[alias] = value
     if legacy_download:
         _warn_deprecated(
-            "--download",
-            "--download-asset, --download-datastore or --download-job",
+            "--download (datasets_download)",
+            "--download-asset, --download-datastore or --download-job"
+            " (download_asset, download_datastore or download_job)",
         )
         for string in legacy_download:
             alias, value = _legacy_input(ml_client, string, InputOutputModes.DOWNLOAD)
@@ -407,7 +412,10 @@ def build_command_outputs(
         outputs[alias] = value
 
     if legacy_output:
-        _warn_deprecated("--output", "--output-datastore or --output-asset")
+        _warn_deprecated(
+            "--output (datasets_output)",
+            "--output-datastore or --output-asset (output_datastore or output_asset)",
+        )
         for string in legacy_output:
             alias, value = _output_from_datastore(string)
             outputs[alias] = value

From 11fcd47381fa6c64f360c8b2ddc0309c2804b4c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?=
 <fperezgarcia@microsoft.com>
Date: Tue, 9 Jun 2026 17:38:35 +0100
Subject: [PATCH 06/16] Document removal plan for deprecated data flags

Add a comment by _warn_deprecated outlining the phased, semver-aligned
removal of the legacy --mount/--download/--output flags (keep visible and
warning through 1.x, remove in 2.0.0).
---
 src/submit_aml/data.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py
index f224d01..293f1f9 100644
--- a/src/submit_aml/data.py
+++ b/src/submit_aml/data.py
@@ -235,6 +235,28 @@ def _output_from_asset(string: str) -> tuple[str, Output]:
     return alias, output
 
 
+# Removal plan for the deprecated data flags (--mount/-m, --download/-d,
+# --output/-o), superseded by the explicit-source flags (--{mount,download}-
+# {asset,datastore,job} and --output-{datastore,asset}):
+#
+#   1. Now (1.x): both flag sets work. The legacy flags carry a [DEPRECATED]
+#      marker in --help and emit `_warn_deprecated` at runtime. This is the
+#      grace period in which users migrate.
+#   2. Before removal: once downstream callers have migrated (grep the known
+#      consumer repos / run scripts for `--mount`, `--download`, `--output`,
+#      `-m `, `-d `, `-o ` and the `datasets_{mount,download,output}` kwargs of
+#      `submit_to_aml`), and the deprecation has shipped in at least one
+#      tagged release, schedule removal for the next MAJOR version (2.0.0) per
+#      semver, since dropping a CLI flag is a breaking change.
+#   3. At removal (2.0.0): delete the `datasets_download`/`datasets_mount`/
+#      `output` typer.Options in `__main__.py`, drop the matching
+#      `submit_to_aml` parameters and the `legacy_*` branches in
+#      `add_inputs`/`add_outputs`, delete the `_legacy_*` helpers and
+#      `_warn_deprecated`, and note the breaking change in the changelog.
+#
+# Until step 3, keep the legacy flags VISIBLE in --help (the [DEPRECATED]
+# marker is how users discover the migration path); only hide them as an
+# optional last step in a release immediately preceding removal.
 def _warn_deprecated(old_flag: str, new_flags: str) -> None:
     """Emit a deprecation warning pointing users to the new flags.
 

From fc34836308de4b8305971a987dba72b6d65ae830 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?=
 <fperezgarcia@microsoft.com>
Date: Tue, 9 Jun 2026 19:56:01 +0100
Subject: [PATCH 07/16] Make legacy data-flag deprecation warnings
 source-specific

Previously --mount/--download/--output emitted one generic warning per
flag listing all replacements. Now each legacy value is classified and
gets a tailored message naming the exact replacement flag and echoing the
corrected invocation, e.g. '--mount my_alias=data_asset' ->
'--mount-asset my_alias=data_asset'. Job values additionally show the
translated form (the redundant job_dir: prefix dropped).

Replace _warn_deprecated with _warn_legacy_input/_warn_legacy_output;
thread the flag base through _legacy_input; warn per value. Add tests
asserting the per-source suggested flag and value.
---
 src/submit_aml/data.py | 85 ++++++++++++++++++++++++++++++------------
 tests/test_data.py     | 44 +++++++++++++++++++++-
 2 files changed, 103 insertions(+), 26 deletions(-)

diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py
index 293f1f9..16175ff 100644
--- a/src/submit_aml/data.py
+++ b/src/submit_aml/data.py
@@ -251,22 +251,59 @@ def _output_from_asset(string: str) -> tuple[str, Output]:
 #   3. At removal (2.0.0): delete the `datasets_download`/`datasets_mount`/
 #      `output` typer.Options in `__main__.py`, drop the matching
 #      `submit_to_aml` parameters and the `legacy_*` branches in
-#      `add_inputs`/`add_outputs`, delete the `_legacy_*` helpers and
-#      `_warn_deprecated`, and note the breaking change in the changelog.
+#      `add_inputs`/`add_outputs`, delete the `_legacy_*` helpers and the
+#      `_warn_legacy_*` warning helpers, and note the breaking change in the
+#      changelog.
 #
 # Until step 3, keep the legacy flags VISIBLE in --help (the [DEPRECATED]
 # marker is how users discover the migration path); only hide them as an
 # optional last step in a release immediately preceding removal.
-def _warn_deprecated(old_flag: str, new_flags: str) -> None:
-    """Emit a deprecation warning pointing users to the new flags.
+
+# Replacement input flag (CLI, Python parameter) for each legacy input flag
+# base and classified source type, used to tailor the deprecation warning.
+_LEGACY_INPUT_FLAGS = {
+    "mount": ("--mount", "datasets_mount"),
+    "download": ("--download", "datasets_download"),
+}
+
+
+def _warn_legacy_input(
+    flag_base: str,
+    source: str,
+    old_value: str,
+    new_value: str,
+) -> None:
+    """Warn that a legacy input flag is deprecated, naming the exact fix.
+
+    Args:
+        flag_base: The legacy flag base, either `'mount'` or `'download'`.
+        source: The classified source type (`'asset'`, `'datastore'`, or
+            `'job'`), used to pick the per-source replacement flag.
+        old_value: The legacy `alias=value` string the user passed.
+        new_value: The value to use with the replacement flag (identical to
+            `old_value` except for job values, which drop the `job_dir:`
+            prefix).
+    """
+    old_cli, old_param = _LEGACY_INPUT_FLAGS[flag_base]
+    new_cli = f"--{flag_base}-{source}"
+    new_param = f"{flag_base}_{source}"
+    logger.warning(
+        f"{old_cli} ({old_param}) is deprecated and will be removed in a future"
+        f" release. Replace '{old_cli} {old_value}' with"
+        f" '{new_cli} {new_value}' ({new_param}).",
+    )
+
+
+def _warn_legacy_output(old_value: str) -> None:
+    """Warn that a legacy `--output` value is deprecated, naming the exact fix.
 
     Args:
-        old_flag: The deprecated flag (e.g. `'--mount'`).
-        new_flags: Human-readable list of replacement flags.
+        old_value: The legacy `alias=datastore/folder` string the user passed.
     """
     logger.warning(
-        f"{old_flag} is deprecated and will be removed in a future release."
-        f" Use {new_flags} instead.",
+        "--output (datasets_output) is deprecated and will be removed in a"
+        f" future release. Replace '--output {old_value}' with"
+        f" '--output-datastore {old_value}' (output_datastore).",
     )
 
 
@@ -312,13 +349,19 @@ def _legacy_input(
     ml_client: MLClient,
     string: str,
     mode: str,
+    flag_base: str,
 ) -> tuple[str, Input]:
     """Route a legacy `--mount`/`--download` value to the right builder.
 
+    Emits a deprecation warning naming the exact replacement flag for the
+    value's classified source type.
+
     Args:
         ml_client: Client used to resolve data assets.
         string: A legacy dataset string.
         mode: Either `InputOutputModes.DOWNLOAD` or `InputOutputModes.MOUNT`.
+        flag_base: The legacy flag base, either `'mount'` or `'download'`,
+            used to tailor the deprecation warning.
 
     Returns:
         Tuple of alias and the resulting `Input`.
@@ -327,9 +370,12 @@ def _legacy_input(
     if source == "job":
         # Translate the old "alias=job_dir:<job_id>:<path>" form to the new one.
         translated = string.replace("=job_dir:", "=", 1)
+        _warn_legacy_input(flag_base, "job", string, translated)
         return _input_from_job(translated, mode)
     if source == "datastore":
+        _warn_legacy_input(flag_base, "datastore", string, string)
         return _input_from_datastore(string, mode)
+    _warn_legacy_input(flag_base, "asset", string, string)
     return _input_from_asset(ml_client, string, mode)
 
 
@@ -386,22 +432,16 @@ def build_command_inputs(
         inputs[alias] = value
 
     if legacy_mount:
-        _warn_deprecated(
-            "--mount (datasets_mount)",
-            "--mount-asset, --mount-datastore or --mount-job"
-            " (mount_asset, mount_datastore or mount_job)",
-        )
         for string in legacy_mount:
-            alias, value = _legacy_input(ml_client, string, InputOutputModes.MOUNT)
+            alias, value = _legacy_input(
+                ml_client, string, InputOutputModes.MOUNT, "mount"
+            )
             inputs[alias] = value
     if legacy_download:
-        _warn_deprecated(
-            "--download (datasets_download)",
-            "--download-asset, --download-datastore or --download-job"
-            " (download_asset, download_datastore or download_job)",
-        )
         for string in legacy_download:
-            alias, value = _legacy_input(ml_client, string, InputOutputModes.DOWNLOAD)
+            alias, value = _legacy_input(
+                ml_client, string, InputOutputModes.DOWNLOAD, "download"
+            )
             inputs[alias] = value
 
     return inputs
@@ -434,11 +474,8 @@ def build_command_outputs(
         outputs[alias] = value
 
     if legacy_output:
-        _warn_deprecated(
-            "--output (datasets_output)",
-            "--output-datastore or --output-asset (output_datastore or output_asset)",
-        )
         for string in legacy_output:
+            _warn_legacy_output(string)
             alias, value = _output_from_datastore(string)
             outputs[alias] = value
 
diff --git a/tests/test_data.py b/tests/test_data.py
index bf1e8ad..30bc2c9 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -277,6 +277,44 @@ def test_build_command_inputs_legacy_asset_calls_client() -> None:
     client.data.get.assert_called_once()
 
 
+def test_build_command_inputs_legacy_asset_warns_mount_asset(
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    """A legacy --mount asset value is told to use --mount-asset specifically."""
+    client = Mock()
+    client.data.get.return_value = Mock(id="azureml:resolved-asset:1")
+    build_command_inputs(client, legacy_mount=["my_alias=data_asset"])
+    message = " ".join(capsys.readouterr().out.split())
+    assert "--mount-asset my_alias=data_asset" in message
+    assert "--mount-datastore" not in message
+    assert "--mount-job" not in message
+
+
+def test_build_command_inputs_legacy_datastore_warns_mount_datastore(
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    """A legacy --mount datastore value is told to use --mount-datastore."""
+    client = Mock()
+    build_command_inputs(client, legacy_mount=["ref=mystore/exports/reference"])
+    message = " ".join(capsys.readouterr().out.split())
+    assert "--mount-datastore ref=mystore/exports/reference" in message
+
+
+def test_build_command_inputs_legacy_job_warns_download_job_translated(
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    """A legacy --download job value is told to use --download-job, sans prefix."""
+    client = Mock()
+    build_command_inputs(
+        client,
+        legacy_download=["ckpt=job_dir:my_job_123:models/best.pth"],
+    )
+    message = " ".join(capsys.readouterr().out.split())
+    assert "--download-job ckpt=my_job_123:models/best.pth" in message
+    # The suggested replacement (after "with") drops the legacy job_dir: prefix.
+    assert "job_dir:" not in message.split("with", 1)[1]
+
+
 # ---------------------------------------------------------------------------
 # build_command_outputs
 # ---------------------------------------------------------------------------
@@ -303,7 +341,9 @@ def test_build_command_outputs_datastore_and_asset() -> None:
 def test_build_command_outputs_legacy_warns(
     capsys: pytest.CaptureFixture[str],
 ) -> None:
-    """Legacy --output strings are built and emit a deprecation warning."""
+    """Legacy --output strings are built and emit a targeted deprecation warning."""
     outputs = build_command_outputs(legacy_output=["out_dir=mydatastore/my_dataset"])
     assert "out_dir" in outputs
-    assert "deprecated" in capsys.readouterr().out.lower()
+    message = " ".join(capsys.readouterr().out.split())
+    assert "deprecated" in message.lower()
+    assert "--output-datastore out_dir=mydatastore/my_dataset" in message

From 56aeb96d57b15ea4c0b704b7da2d90cec5766820 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?=
 <fperezgarcia@microsoft.com>
Date: Tue, 9 Jun 2026 19:59:22 +0100
Subject: [PATCH 08/16] Emit DeprecationWarning for legacy data flags via the
 Python API

The legacy-flag deprecation was previously only a loguru log line, which
serves CLI users but is invisible to callers of the library API. Also emit
a real warnings.warn(..., DeprecationWarning) from the warning helpers so
programmatic users of submit_to_aml / build_command_inputs /
build_command_outputs get the standard, tooling-visible signal. Pass a
stacklevel so the warning is blamed on the API caller, not the helper.

Add tests asserting the DeprecationWarning category for legacy inputs and
outputs.
---
 src/submit_aml/data.py | 40 +++++++++++++++++++++++++++++++---------
 tests/test_data.py     | 13 +++++++++++++
 2 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py
index 16175ff..36e2c49 100644
--- a/src/submit_aml/data.py
+++ b/src/submit_aml/data.py
@@ -2,6 +2,7 @@
 
 import re
 import sys
+import warnings
 
 from azure.ai.ml import Input
 from azure.ai.ml import MLClient
@@ -272,9 +273,14 @@ def _warn_legacy_input(
     source: str,
     old_value: str,
     new_value: str,
+    stacklevel: int = 2,
 ) -> None:
     """Warn that a legacy input flag is deprecated, naming the exact fix.
 
+    Emits both a human-facing log line (for CLI users) and a Python
+    `DeprecationWarning` (for callers of the library API, e.g. `submit_to_aml`
+    or `build_command_inputs`).
+
     Args:
         flag_base: The legacy flag base, either `'mount'` or `'download'`.
         source: The classified source type (`'asset'`, `'datastore'`, or
@@ -283,28 +289,40 @@ def _warn_legacy_input(
         new_value: The value to use with the replacement flag (identical to
             `old_value` except for job values, which drop the `job_dir:`
             prefix).
+        stacklevel: Stack level for the `DeprecationWarning`, so it points at
+            the API caller rather than this helper.
     """
     old_cli, old_param = _LEGACY_INPUT_FLAGS[flag_base]
     new_cli = f"--{flag_base}-{source}"
     new_param = f"{flag_base}_{source}"
-    logger.warning(
+    message = (
         f"{old_cli} ({old_param}) is deprecated and will be removed in a future"
         f" release. Replace '{old_cli} {old_value}' with"
-        f" '{new_cli} {new_value}' ({new_param}).",
+        f" '{new_cli} {new_value}' ({new_param})."
     )
+    logger.warning(message)
+    warnings.warn(message, DeprecationWarning, stacklevel=stacklevel)
 
 
-def _warn_legacy_output(old_value: str) -> None:
+def _warn_legacy_output(old_value: str, stacklevel: int = 2) -> None:
     """Warn that a legacy `--output` value is deprecated, naming the exact fix.
 
+    Emits both a human-facing log line (for CLI users) and a Python
+    `DeprecationWarning` (for callers of the library API, e.g. `submit_to_aml`
+    or `build_command_outputs`).
+
     Args:
         old_value: The legacy `alias=datastore/folder` string the user passed.
+        stacklevel: Stack level for the `DeprecationWarning`, so it points at
+            the API caller rather than this helper.
     """
-    logger.warning(
+    message = (
         "--output (datasets_output) is deprecated and will be removed in a"
         f" future release. Replace '--output {old_value}' with"
-        f" '--output-datastore {old_value}' (output_datastore).",
+        f" '--output-datastore {old_value}' (output_datastore)."
     )
+    logger.warning(message)
+    warnings.warn(message, DeprecationWarning, stacklevel=stacklevel)
 
 
 def _classify_legacy_input(string: str) -> str:
@@ -367,15 +385,17 @@ def _legacy_input(
         Tuple of alias and the resulting `Input`.
     """
     source = _classify_legacy_input(string)
+    # stacklevel=4: warnings.warn -> _warn_legacy_input -> _legacy_input ->
+    # build_command_inputs, so the DeprecationWarning points at the API caller.
     if source == "job":
         # Translate the old "alias=job_dir:<job_id>:<path>" form to the new one.
         translated = string.replace("=job_dir:", "=", 1)
-        _warn_legacy_input(flag_base, "job", string, translated)
+        _warn_legacy_input(flag_base, "job", string, translated, stacklevel=4)
         return _input_from_job(translated, mode)
     if source == "datastore":
-        _warn_legacy_input(flag_base, "datastore", string, string)
+        _warn_legacy_input(flag_base, "datastore", string, string, stacklevel=4)
         return _input_from_datastore(string, mode)
-    _warn_legacy_input(flag_base, "asset", string, string)
+    _warn_legacy_input(flag_base, "asset", string, string, stacklevel=4)
     return _input_from_asset(ml_client, string, mode)
 
 
@@ -475,7 +495,9 @@ def build_command_outputs(
 
     if legacy_output:
         for string in legacy_output:
-            _warn_legacy_output(string)
+            # stacklevel=3: warnings.warn -> _warn_legacy_output ->
+            # build_command_outputs, pointing at the API caller.
+            _warn_legacy_output(string, stacklevel=3)
             alias, value = _output_from_datastore(string)
             outputs[alias] = value
 
diff --git a/tests/test_data.py b/tests/test_data.py
index 30bc2c9..a39b7ee 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -269,6 +269,13 @@ def test_build_command_inputs_legacy_job_new_syntax_routes() -> None:
     client.data.get.assert_not_called()
 
 
+def test_build_command_inputs_legacy_raises_deprecation_warning() -> None:
+    """The Python API raises a DeprecationWarning for legacy input values."""
+    client = Mock()
+    with pytest.warns(DeprecationWarning, match=r"--mount-datastore"):
+        build_command_inputs(client, legacy_mount=["ref=mystore/exports/reference"])
+
+
 def test_build_command_inputs_legacy_asset_calls_client() -> None:
     """Legacy --mount asset strings still resolve through the client."""
     client = Mock()
@@ -347,3 +354,9 @@ def test_build_command_outputs_legacy_warns(
     message = " ".join(capsys.readouterr().out.split())
     assert "deprecated" in message.lower()
     assert "--output-datastore out_dir=mydatastore/my_dataset" in message
+
+
+def test_build_command_outputs_legacy_raises_deprecation_warning() -> None:
+    """The Python API raises a DeprecationWarning for legacy output values."""
+    with pytest.warns(DeprecationWarning, match=r"--output-datastore"):
+        build_command_outputs(legacy_output=["out_dir=mydatastore/my_dataset"])

From c351648ea5680c47295025601631771317826d72 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?=
 <fperezgarcia@microsoft.com>
Date: Tue, 9 Jun 2026 20:22:25 +0100
Subject: [PATCH 09/16] Separate CLI vs Python API deprecation messages

CLI users were shown Python parameter names (e.g. '(datasets_mount)') in
the deprecation log line, which is irrelevant to them. Split the messaging:
the loguru log (CLI-facing) now names only the flags, while the
DeprecationWarning (Python-API-facing) names only the parameters. Update
tests to assert each audience sees only its own surface.
---
 src/submit_aml/data.py | 39 ++++++++++++++++++++++++---------------
 tests/test_data.py     | 14 ++++++++++----
 2 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py
index 36e2c49..3d15f8a 100644
--- a/src/submit_aml/data.py
+++ b/src/submit_aml/data.py
@@ -295,34 +295,43 @@ def _warn_legacy_input(
     old_cli, old_param = _LEGACY_INPUT_FLAGS[flag_base]
     new_cli = f"--{flag_base}-{source}"
     new_param = f"{flag_base}_{source}"
-    message = (
-        f"{old_cli} ({old_param}) is deprecated and will be removed in a future"
-        f" release. Replace '{old_cli} {old_value}' with"
-        f" '{new_cli} {new_value}' ({new_param})."
+    # CLI users see only flags; Python API users see only parameter names.
+    cli_message = (
+        f"{old_cli} is deprecated and will be removed in a future release."
+        f" Replace '{old_cli} {old_value}' with '{new_cli} {new_value}'."
+    )
+    api_message = (
+        f"The '{old_param}' parameter is deprecated and will be removed in a"
+        f" future release. Pass {new_value!r} to '{new_param}' instead."
     )
-    logger.warning(message)
-    warnings.warn(message, DeprecationWarning, stacklevel=stacklevel)
+    logger.warning(cli_message)
+    warnings.warn(api_message, DeprecationWarning, stacklevel=stacklevel)
 
 
 def _warn_legacy_output(old_value: str, stacklevel: int = 2) -> None:
     """Warn that a legacy `--output` value is deprecated, naming the exact fix.
 
-    Emits both a human-facing log line (for CLI users) and a Python
-    `DeprecationWarning` (for callers of the library API, e.g. `submit_to_aml`
-    or `build_command_outputs`).
+    Emits a human-facing log line naming the replacement CLI flag (for CLI
+    users) and a Python `DeprecationWarning` naming the replacement parameter
+    (for callers of the library API, e.g. `submit_to_aml` or
+    `build_command_outputs`).
 
     Args:
         old_value: The legacy `alias=datastore/folder` string the user passed.
         stacklevel: Stack level for the `DeprecationWarning`, so it points at
             the API caller rather than this helper.
     """
-    message = (
-        "--output (datasets_output) is deprecated and will be removed in a"
-        f" future release. Replace '--output {old_value}' with"
-        f" '--output-datastore {old_value}' (output_datastore)."
+    # CLI users see only flags; Python API users see only parameter names.
+    cli_message = (
+        "--output is deprecated and will be removed in a future release."
+        f" Replace '--output {old_value}' with '--output-datastore {old_value}'."
+    )
+    api_message = (
+        "The 'datasets_output' parameter is deprecated and will be removed in a"
+        f" future release. Pass {old_value!r} to 'output_datastore' instead."
     )
-    logger.warning(message)
-    warnings.warn(message, DeprecationWarning, stacklevel=stacklevel)
+    logger.warning(cli_message)
+    warnings.warn(api_message, DeprecationWarning, stacklevel=stacklevel)
 
 
 def _classify_legacy_input(string: str) -> str:
diff --git a/tests/test_data.py b/tests/test_data.py
index a39b7ee..8b4a927 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -270,10 +270,12 @@ def test_build_command_inputs_legacy_job_new_syntax_routes() -> None:
 
 
 def test_build_command_inputs_legacy_raises_deprecation_warning() -> None:
-    """The Python API raises a DeprecationWarning for legacy input values."""
+    """The Python API raises a DeprecationWarning naming the new parameter."""
     client = Mock()
-    with pytest.warns(DeprecationWarning, match=r"--mount-datastore"):
+    with pytest.warns(DeprecationWarning, match=r"mount_datastore") as record:
         build_command_inputs(client, legacy_mount=["ref=mystore/exports/reference"])
+    # The Python-facing warning references parameters, not CLI flags.
+    assert "--mount" not in str(record[0].message)
 
 
 def test_build_command_inputs_legacy_asset_calls_client() -> None:
@@ -295,6 +297,9 @@ def test_build_command_inputs_legacy_asset_warns_mount_asset(
     assert "--mount-asset my_alias=data_asset" in message
     assert "--mount-datastore" not in message
     assert "--mount-job" not in message
+    # CLI users should not see Python parameter names.
+    assert "mount_asset" not in message
+    assert "datasets_mount" not in message
 
 
 def test_build_command_inputs_legacy_datastore_warns_mount_datastore(
@@ -357,6 +362,7 @@ def test_build_command_outputs_legacy_warns(
 
 
 def test_build_command_outputs_legacy_raises_deprecation_warning() -> None:
-    """The Python API raises a DeprecationWarning for legacy output values."""
-    with pytest.warns(DeprecationWarning, match=r"--output-datastore"):
+    """The Python API raises a DeprecationWarning naming the new parameter."""
+    with pytest.warns(DeprecationWarning, match=r"output_datastore") as record:
         build_command_outputs(legacy_output=["out_dir=mydatastore/my_dataset"])
+    assert "--output" not in str(record[0].message)

From 6f87f25fc149a4c9c732d201994958b27b54f6ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?=
 <fperezgarcia@microsoft.com>
Date: Tue, 9 Jun 2026 20:27:56 +0100
Subject: [PATCH 10/16] Fix stale _warn_deprecated reference in removal-plan
 comment

Phase 1 of the comment still named the old _warn_deprecated helper; point
it at _warn_legacy_input / _warn_legacy_output instead.
---
 src/submit_aml/data.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py
index 3d15f8a..eda6911 100644
--- a/src/submit_aml/data.py
+++ b/src/submit_aml/data.py
@@ -241,8 +241,9 @@ def _output_from_asset(string: str) -> tuple[str, Output]:
 # {asset,datastore,job} and --output-{datastore,asset}):
 #
 #   1. Now (1.x): both flag sets work. The legacy flags carry a [DEPRECATED]
-#      marker in --help and emit `_warn_deprecated` at runtime. This is the
-#      grace period in which users migrate.
+#      marker in --help and emit a deprecation warning at runtime (via
+#      `_warn_legacy_input` / `_warn_legacy_output`). This is the grace period
+#      in which users migrate.
 #   2. Before removal: once downstream callers have migrated (grep the known
 #      consumer repos / run scripts for `--mount`, `--download`, `--output`,
 #      `-m `, `-d `, `-o ` and the `datasets_{mount,download,output}` kwargs of

From ec4768541c68ed291ea5176aec8afcdf01e0cf7f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?=
 <fperezgarcia@microsoft.com>
Date: Tue, 9 Jun 2026 20:32:59 +0100
Subject: [PATCH 11/16] Show one-element list in Python API deprecation
 warnings

The replacement parameters are list[str]; the warning previously suggested
passing a bare string, which would iterate over characters. Show the value
wrapped in a one-element list (e.g. ['ref=...']).
---
 src/submit_aml/data.py |  4 ++--
 tests/test_data.py     | 11 ++++++++---
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py
index eda6911..b85f21e 100644
--- a/src/submit_aml/data.py
+++ b/src/submit_aml/data.py
@@ -303,7 +303,7 @@ def _warn_legacy_input(
     )
     api_message = (
         f"The '{old_param}' parameter is deprecated and will be removed in a"
-        f" future release. Pass {new_value!r} to '{new_param}' instead."
+        f" future release. Pass [{new_value!r}] to '{new_param}' instead."
     )
     logger.warning(cli_message)
     warnings.warn(api_message, DeprecationWarning, stacklevel=stacklevel)
@@ -329,7 +329,7 @@ def _warn_legacy_output(old_value: str, stacklevel: int = 2) -> None:
     )
     api_message = (
         "The 'datasets_output' parameter is deprecated and will be removed in a"
-        f" future release. Pass {old_value!r} to 'output_datastore' instead."
+        f" future release. Pass [{old_value!r}] to 'output_datastore' instead."
     )
     logger.warning(cli_message)
     warnings.warn(api_message, DeprecationWarning, stacklevel=stacklevel)
diff --git a/tests/test_data.py b/tests/test_data.py
index 8b4a927..3a78e47 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -274,8 +274,11 @@ def test_build_command_inputs_legacy_raises_deprecation_warning() -> None:
     client = Mock()
     with pytest.warns(DeprecationWarning, match=r"mount_datastore") as record:
         build_command_inputs(client, legacy_mount=["ref=mystore/exports/reference"])
-    # The Python-facing warning references parameters, not CLI flags.
-    assert "--mount" not in str(record[0].message)
+    # The Python-facing warning references parameters, not CLI flags, and shows
+    # a one-element list (the parameters are list[str]).
+    message = str(record[0].message)
+    assert "--mount" not in message
+    assert "['ref=mystore/exports/reference']" in message
 
 
 def test_build_command_inputs_legacy_asset_calls_client() -> None:
@@ -365,4 +368,6 @@ def test_build_command_outputs_legacy_raises_deprecation_warning() -> None:
     """The Python API raises a DeprecationWarning naming the new parameter."""
     with pytest.warns(DeprecationWarning, match=r"output_datastore") as record:
         build_command_outputs(legacy_output=["out_dir=mydatastore/my_dataset"])
-    assert "--output" not in str(record[0].message)
+    message = str(record[0].message)
+    assert "--output" not in message
+    assert "['out_dir=mydatastore/my_dataset']" in message

From 5621096fd7838e411fbe5b5aaa451780d55147b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?=
 <fperezgarcia@microsoft.com>
Date: Tue, 9 Jun 2026 20:39:46 +0100
Subject: [PATCH 12/16] Preserve mount-over-download precedence for duplicate
 aliases

build_command_inputs builds a single dict where later writes win. The new
explicit/legacy flags processed mounts before downloads, so a duplicate
alias resolved to download mode -- reversing main's {**downloads, **mounts}
precedence where mount won. Process downloads first and mounts second (and
legacy after explicit) so mount keeps precedence. Add regression tests.
---
 src/submit_aml/data.py | 36 +++++++++++++++++++++---------------
 tests/test_data.py     | 22 ++++++++++++++++++++++
 2 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py
index b85f21e..454e838 100644
--- a/src/submit_aml/data.py
+++ b/src/submit_aml/data.py
@@ -437,42 +437,48 @@ def build_command_inputs(
 
     Returns:
         Dictionary of `alias: Input` mappings.
+
+    When the same alias appears under both a download and a mount flag, the
+    mount wins: downloads are applied first and mounts overwrite them. This
+    preserves the historical `{**downloads, **mounts}` precedence.
     """
     inputs: TypeInputsDict = {}
 
-    for string in mount_asset or []:
-        alias, value = _input_from_asset(ml_client, string, InputOutputModes.MOUNT)
-        inputs[alias] = value
+    # Downloads first, then mounts, so that mount takes precedence when the same
+    # alias is supplied under both modes (and likewise legacy after explicit).
     for string in download_asset or []:
         alias, value = _input_from_asset(ml_client, string, InputOutputModes.DOWNLOAD)
         inputs[alias] = value
-
-    for string in mount_datastore or []:
-        alias, value = _input_from_datastore(string, InputOutputModes.MOUNT)
+    for string in mount_asset or []:
+        alias, value = _input_from_asset(ml_client, string, InputOutputModes.MOUNT)
         inputs[alias] = value
+
     for string in download_datastore or []:
         alias, value = _input_from_datastore(string, InputOutputModes.DOWNLOAD)
         inputs[alias] = value
-
-    for string in mount_job or []:
-        alias, value = _input_from_job(string, InputOutputModes.MOUNT)
+    for string in mount_datastore or []:
+        alias, value = _input_from_datastore(string, InputOutputModes.MOUNT)
         inputs[alias] = value
+
     for string in download_job or []:
         alias, value = _input_from_job(string, InputOutputModes.DOWNLOAD)
         inputs[alias] = value
+    for string in mount_job or []:
+        alias, value = _input_from_job(string, InputOutputModes.MOUNT)
+        inputs[alias] = value
 
-    if legacy_mount:
-        for string in legacy_mount:
-            alias, value = _legacy_input(
-                ml_client, string, InputOutputModes.MOUNT, "mount"
-            )
-            inputs[alias] = value
     if legacy_download:
         for string in legacy_download:
             alias, value = _legacy_input(
                 ml_client, string, InputOutputModes.DOWNLOAD, "download"
             )
             inputs[alias] = value
+    if legacy_mount:
+        for string in legacy_mount:
+            alias, value = _legacy_input(
+                ml_client, string, InputOutputModes.MOUNT, "mount"
+            )
+            inputs[alias] = value
 
     return inputs
 
diff --git a/tests/test_data.py b/tests/test_data.py
index 3a78e47..8e0b4c6 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -289,6 +289,28 @@ def test_build_command_inputs_legacy_asset_calls_client() -> None:
     client.data.get.assert_called_once()
 
 
+def test_build_command_inputs_mount_overrides_download_for_duplicate_alias() -> None:
+    """When an alias is given under both modes, mount wins (legacy precedence)."""
+    client = Mock()
+    inputs = build_command_inputs(
+        client,
+        download_datastore=["ref=mystore/exports/reference"],
+        mount_datastore=["ref=mystore/exports/reference"],
+    )
+    assert inputs["ref"].mode == InputOutputModes.MOUNT
+
+
+def test_build_command_inputs_legacy_mount_overrides_legacy_download() -> None:
+    """A colliding alias across legacy mount/download resolves to mount."""
+    client = Mock()
+    inputs = build_command_inputs(
+        client,
+        legacy_download=["ref=mystore/exports/reference"],
+        legacy_mount=["ref=mystore/exports/reference"],
+    )
+    assert inputs["ref"].mode == InputOutputModes.MOUNT
+
+
 def test_build_command_inputs_legacy_asset_warns_mount_asset(
     capsys: pytest.CaptureFixture[str],
 ) -> None:

From e988b1c9741e96b00b415f80d0c54c26b1cb32d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?=
 <fperezgarcia@microsoft.com>
Date: Tue, 9 Jun 2026 21:11:55 +0100
Subject: [PATCH 13/16] Raise on duplicate input/output aliases

An alias maps to a single ${{inputs.<alias>}} / ${{outputs.<alias>}}
reference, so reusing one (across modes, source types, or flags) is a user
error. Previously the colliding entry silently overwrote the earlier one.
Add _assign_unique and route all input/output insertions through it, raising
a clear ValueError naming the alias. This supersedes the earlier
mount-over-download precedence (Copilot suggested duplicate-alias validation
as the alternative). Add tests for cross-mode, same-mode, legacy, and output
collisions.
---
 src/submit_aml/data.py | 64 +++++++++++++++++++++++++++++++-----------
 tests/test_data.py     | 51 ++++++++++++++++++++++-----------
 2 files changed, 83 insertions(+), 32 deletions(-)

diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py
index 454e838..5f959de 100644
--- a/src/submit_aml/data.py
+++ b/src/submit_aml/data.py
@@ -409,6 +409,34 @@ def _legacy_input(
     return _input_from_asset(ml_client, string, mode)
 
 
+def _assign_unique(
+    mapping: dict[str, object],
+    alias: str,
+    value: object,
+    *,
+    kind: str,
+) -> None:
+    """Insert `alias -> value`, raising if the alias is already present.
+
+    Each alias becomes a single `${{inputs.<alias>}}` / `${{outputs.<alias>}}`
+    reference, so it must be unique. Reusing an alias (across modes, source
+    types, or flags) is a user error rather than something to silently resolve.
+
+    Args:
+        mapping: The inputs or outputs dict being built.
+        alias: The alias key to insert.
+        value: The `Input` or `Output` to store.
+        kind: Either `'input'` or `'output'`, used in the error message.
+
+    Raises:
+        ValueError: If `alias` is already present in `mapping`.
+    """
+    if alias in mapping:
+        msg = f"Duplicate {kind} alias {alias!r}: each alias must be unique."
+        raise ValueError(msg)
+    mapping[alias] = value
+
+
 def build_command_inputs(
     ml_client: MLClient,
     *,
@@ -438,47 +466,46 @@ def build_command_inputs(
     Returns:
         Dictionary of `alias: Input` mappings.
 
-    When the same alias appears under both a download and a mount flag, the
-    mount wins: downloads are applied first and mounts overwrite them. This
-    preserves the historical `{**downloads, **mounts}` precedence.
+    Raises:
+        ValueError: If the same alias is used more than once across any of the
+            input flags (an alias maps to a single `${{inputs.<alias>}}`
+            reference, so it must be unique).
     """
     inputs: TypeInputsDict = {}
 
-    # Downloads first, then mounts, so that mount takes precedence when the same
-    # alias is supplied under both modes (and likewise legacy after explicit).
     for string in download_asset or []:
         alias, value = _input_from_asset(ml_client, string, InputOutputModes.DOWNLOAD)
-        inputs[alias] = value
+        _assign_unique(inputs, alias, value, kind="input")
     for string in mount_asset or []:
         alias, value = _input_from_asset(ml_client, string, InputOutputModes.MOUNT)
-        inputs[alias] = value
+        _assign_unique(inputs, alias, value, kind="input")
 
     for string in download_datastore or []:
         alias, value = _input_from_datastore(string, InputOutputModes.DOWNLOAD)
-        inputs[alias] = value
+        _assign_unique(inputs, alias, value, kind="input")
     for string in mount_datastore or []:
         alias, value = _input_from_datastore(string, InputOutputModes.MOUNT)
-        inputs[alias] = value
+        _assign_unique(inputs, alias, value, kind="input")
 
     for string in download_job or []:
         alias, value = _input_from_job(string, InputOutputModes.DOWNLOAD)
-        inputs[alias] = value
+        _assign_unique(inputs, alias, value, kind="input")
     for string in mount_job or []:
         alias, value = _input_from_job(string, InputOutputModes.MOUNT)
-        inputs[alias] = value
+        _assign_unique(inputs, alias, value, kind="input")
 
     if legacy_download:
         for string in legacy_download:
             alias, value = _legacy_input(
                 ml_client, string, InputOutputModes.DOWNLOAD, "download"
             )
-            inputs[alias] = value
+            _assign_unique(inputs, alias, value, kind="input")
     if legacy_mount:
         for string in legacy_mount:
             alias, value = _legacy_input(
                 ml_client, string, InputOutputModes.MOUNT, "mount"
             )
-            inputs[alias] = value
+            _assign_unique(inputs, alias, value, kind="input")
 
     return inputs
 
@@ -499,15 +526,20 @@ def build_command_outputs(
 
     Returns:
         Dictionary of `alias: Output` mappings.
+
+    Raises:
+        ValueError: If the same alias is used more than once across any of the
+            output flags (an alias maps to a single `${{outputs.<alias>}}`
+            reference, so it must be unique).
     """
     outputs: dict[str, Output] = {}
 
     for string in output_datastore or []:
         alias, value = _output_from_datastore(string)
-        outputs[alias] = value
+        _assign_unique(outputs, alias, value, kind="output")
     for string in output_asset or []:
         alias, value = _output_from_asset(string)
-        outputs[alias] = value
+        _assign_unique(outputs, alias, value, kind="output")
 
     if legacy_output:
         for string in legacy_output:
@@ -515,6 +547,6 @@ def build_command_outputs(
             # build_command_outputs, pointing at the API caller.
             _warn_legacy_output(string, stacklevel=3)
             alias, value = _output_from_datastore(string)
-            outputs[alias] = value
+            _assign_unique(outputs, alias, value, kind="output")
 
     return outputs
diff --git a/tests/test_data.py b/tests/test_data.py
index 8e0b4c6..34f6f5e 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -289,26 +289,36 @@ def test_build_command_inputs_legacy_asset_calls_client() -> None:
     client.data.get.assert_called_once()
 
 
-def test_build_command_inputs_mount_overrides_download_for_duplicate_alias() -> None:
-    """When an alias is given under both modes, mount wins (legacy precedence)."""
+def test_build_command_inputs_duplicate_alias_across_modes_raises() -> None:
+    """The same alias under both download and mount is a hard error."""
     client = Mock()
-    inputs = build_command_inputs(
-        client,
-        download_datastore=["ref=mystore/exports/reference"],
-        mount_datastore=["ref=mystore/exports/reference"],
-    )
-    assert inputs["ref"].mode == InputOutputModes.MOUNT
+    with pytest.raises(ValueError, match=r"[Dd]uplicate.*alias.*ref"):
+        build_command_inputs(
+            client,
+            download_datastore=["ref=mystore/exports/reference"],
+            mount_datastore=["ref=mystore/exports/reference"],
+        )
 
 
-def test_build_command_inputs_legacy_mount_overrides_legacy_download() -> None:
-    """A colliding alias across legacy mount/download resolves to mount."""
+def test_build_command_inputs_duplicate_alias_same_mode_raises() -> None:
+    """A repeated alias within a single flag is also a hard error."""
     client = Mock()
-    inputs = build_command_inputs(
-        client,
-        legacy_download=["ref=mystore/exports/reference"],
-        legacy_mount=["ref=mystore/exports/reference"],
-    )
-    assert inputs["ref"].mode == InputOutputModes.MOUNT
+    with pytest.raises(ValueError, match=r"[Dd]uplicate.*alias.*ref"):
+        build_command_inputs(
+            client,
+            mount_datastore=["ref=mystore/a", "ref=mystore/b"],
+        )
+
+
+def test_build_command_inputs_legacy_duplicate_alias_raises() -> None:
+    """A colliding alias across legacy mount/download raises."""
+    client = Mock()
+    with pytest.raises(ValueError, match=r"[Dd]uplicate.*alias.*ref"):
+        build_command_inputs(
+            client,
+            legacy_download=["ref=mystore/exports/reference"],
+            legacy_mount=["ref=mystore/exports/reference"],
+        )
 
 
 def test_build_command_inputs_legacy_asset_warns_mount_asset(
@@ -375,6 +385,15 @@ def test_build_command_outputs_datastore_and_asset() -> None:
     assert outputs["asset_dir"].version == "2"
 
 
+def test_build_command_outputs_duplicate_alias_raises() -> None:
+    """The same alias under two output flags is a hard error."""
+    with pytest.raises(ValueError, match=r"[Dd]uplicate.*alias.*out_dir"):
+        build_command_outputs(
+            output_datastore=["out_dir=mydatastore/my_dataset"],
+            output_asset=["out_dir=my-results:2"],
+        )
+
+
 def test_build_command_outputs_legacy_warns(
     capsys: pytest.CaptureFixture[str],
 ) -> None:

From 7e65dd44a7efb4ad7c97e00f5cde43f5dcc5c88a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?=
 <fperezgarcia@microsoft.com>
Date: Tue, 9 Jun 2026 21:18:08 +0100
Subject: [PATCH 14/16] Fix removal-plan comment refs and document new job form
 in --download help

- Removal-plan comment named non-existent add_inputs/add_outputs; point it
  at build_command_inputs/build_command_outputs and the legacy_* kwargs.
- The deprecated --download help only showed the legacy job_dir: syntax;
  document the preferred 'alias=<job_id>:<path>' form (still accepting the
  old one).
---
 src/submit_aml/__main__.py | 7 ++++---
 src/submit_aml/data.py     | 8 ++++----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/submit_aml/__main__.py b/src/submit_aml/__main__.py
index bf28111..3ef228b 100644
--- a/src/submit_aml/__main__.py
+++ b/src/submit_aml/__main__.py
@@ -148,9 +148,10 @@ def submit(
             " dataset; for example: 'vindr_dir=VINDR-CXR-V2:1'. If the version is"
             " omitted, the last one will be used. To download a datastore folder,"
             " use 'alias=datastore/folder'. To download the output folder of a"
-            " previous job, the argument should take the form"
-            " 'alias=job_dir:<job_id>:<path/in/job/outputs>'; for example:"
-            " 'checkpoint=job_dir:crusty_hat_43s6lmvb25:outputs/checkpoint-10000'."
+            " previous job, the preferred form is 'alias=<job_id>:<path>' (the"
+            " same as --download-job); for example,"
+            " 'checkpoint=crusty_hat_43s6lmvb25:outputs/checkpoint-10000'. The"
+            " older 'alias=job_dir:<job_id>:<path>' form is also still accepted."
             " The alias can be used to pass input datasets to the script, e.g.,"
             r" '${{inputs.vindr_dir}}' or '${{inputs.checkpoint}}'."
             " This option can be used multiple times."
diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py
index 5f959de..8d9fcfb 100644
--- a/src/submit_aml/data.py
+++ b/src/submit_aml/data.py
@@ -252,10 +252,10 @@ def _output_from_asset(string: str) -> tuple[str, Output]:
 #      semver, since dropping a CLI flag is a breaking change.
 #   3. At removal (2.0.0): delete the `datasets_download`/`datasets_mount`/
 #      `output` typer.Options in `__main__.py`, drop the matching
-#      `submit_to_aml` parameters and the `legacy_*` branches in
-#      `add_inputs`/`add_outputs`, delete the `_legacy_*` helpers and the
-#      `_warn_legacy_*` warning helpers, and note the breaking change in the
-#      changelog.
+#      `submit_to_aml` parameters and the `legacy_*` keyword arguments and
+#      branches in `build_command_inputs`/`build_command_outputs`, delete the
+#      `_legacy_*` helpers and the `_warn_legacy_*` warning helpers, and note
+#      the breaking change in the changelog.
 #
 # Until step 3, keep the legacy flags VISIBLE in --help (the [DEPRECATED]
 # marker is how users discover the migration path); only hide them as an

From 009aaefc3386818ced4b03935d812e3b5621512c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?=
 <fperezgarcia@microsoft.com>
Date: Tue, 9 Jun 2026 23:52:24 +0100
Subject: [PATCH 15/16] Fix CI: generic _assign_unique type and ANSI-robust
 warning tests

- ty failed because dict is invariant, so passing TypeInputsDict /
  dict[str, Output] to _assign_unique(mapping: dict[str, object]) was
  rejected. Make _assign_unique generic over a TypeVar.
- pytest failed under CI's coloured rich output: the deprecation-message
  tests asserted on captured stdout, where ANSI codes and line wrapping
  split the matched substrings. Assert on the raw message passed to
  logger.warning (patched) instead, via a _deprecation_log helper.
---
 src/submit_aml/data.py |  6 +++--
 tests/test_data.py     | 56 +++++++++++++++++++++++++-----------------
 2 files changed, 37 insertions(+), 25 deletions(-)

diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py
index 8d9fcfb..b8e7965 100644
--- a/src/submit_aml/data.py
+++ b/src/submit_aml/data.py
@@ -3,6 +3,7 @@
 import re
 import sys
 import warnings
+from typing import TypeVar
 
 from azure.ai.ml import Input
 from azure.ai.ml import MLClient
@@ -17,6 +18,7 @@
 
 TypeInputsDict = dict[str, Input | SweepDistribution]
 TypeOptionalStrList = list[str] | None
+_MappingValue = TypeVar("_MappingValue")
 
 
 def _datastore_uri(datastore: str, path: str) -> str:
@@ -410,9 +412,9 @@ def _legacy_input(
 
 
 def _assign_unique(
-    mapping: dict[str, object],
+    mapping: dict[str, _MappingValue],
     alias: str,
-    value: object,
+    value: _MappingValue,
     *,
     kind: str,
 ) -> None:
diff --git a/tests/test_data.py b/tests/test_data.py
index 34f6f5e..06fc25e 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 from unittest.mock import Mock
+from unittest.mock import patch
 
 import pytest
 from azure.ai.ml.constants import InputOutputModes
@@ -21,6 +22,17 @@
 from submit_aml.data import build_command_inputs
 from submit_aml.data import build_command_outputs
 
+
+def _deprecation_log(mock_logger: Mock) -> str:
+    """Join the raw messages passed to `logger.warning`.
+
+    Asserting on the raw message (rather than `capsys`) avoids the rich
+    console's ANSI colouring, highlighting, and line wrapping, which otherwise
+    make multi-token substring checks brittle (and fail under CI's colour mode).
+    """
+    return " ".join(call.args[0] for call in mock_logger.warning.call_args_list)
+
+
 # ---------------------------------------------------------------------------
 # _datastore_uri
 # ---------------------------------------------------------------------------
@@ -321,14 +333,13 @@ def test_build_command_inputs_legacy_duplicate_alias_raises() -> None:
         )
 
 
-def test_build_command_inputs_legacy_asset_warns_mount_asset(
-    capsys: pytest.CaptureFixture[str],
-) -> None:
+def test_build_command_inputs_legacy_asset_warns_mount_asset() -> None:
     """A legacy --mount asset value is told to use --mount-asset specifically."""
     client = Mock()
     client.data.get.return_value = Mock(id="azureml:resolved-asset:1")
-    build_command_inputs(client, legacy_mount=["my_alias=data_asset"])
-    message = " ".join(capsys.readouterr().out.split())
+    with patch("submit_aml.data.logger") as mock_logger:
+        build_command_inputs(client, legacy_mount=["my_alias=data_asset"])
+    message = _deprecation_log(mock_logger)
     assert "--mount-asset my_alias=data_asset" in message
     assert "--mount-datastore" not in message
     assert "--mount-job" not in message
@@ -337,26 +348,24 @@ def test_build_command_inputs_legacy_asset_warns_mount_asset(
     assert "datasets_mount" not in message
 
 
-def test_build_command_inputs_legacy_datastore_warns_mount_datastore(
-    capsys: pytest.CaptureFixture[str],
-) -> None:
+def test_build_command_inputs_legacy_datastore_warns_mount_datastore() -> None:
     """A legacy --mount datastore value is told to use --mount-datastore."""
     client = Mock()
-    build_command_inputs(client, legacy_mount=["ref=mystore/exports/reference"])
-    message = " ".join(capsys.readouterr().out.split())
+    with patch("submit_aml.data.logger") as mock_logger:
+        build_command_inputs(client, legacy_mount=["ref=mystore/exports/reference"])
+    message = _deprecation_log(mock_logger)
     assert "--mount-datastore ref=mystore/exports/reference" in message
 
 
-def test_build_command_inputs_legacy_job_warns_download_job_translated(
-    capsys: pytest.CaptureFixture[str],
-) -> None:
+def test_build_command_inputs_legacy_job_warns_download_job_translated() -> None:
     """A legacy --download job value is told to use --download-job, sans prefix."""
     client = Mock()
-    build_command_inputs(
-        client,
-        legacy_download=["ckpt=job_dir:my_job_123:models/best.pth"],
-    )
-    message = " ".join(capsys.readouterr().out.split())
+    with patch("submit_aml.data.logger") as mock_logger:
+        build_command_inputs(
+            client,
+            legacy_download=["ckpt=job_dir:my_job_123:models/best.pth"],
+        )
+    message = _deprecation_log(mock_logger)
     assert "--download-job ckpt=my_job_123:models/best.pth" in message
     # The suggested replacement (after "with") drops the legacy job_dir: prefix.
     assert "job_dir:" not in message.split("with", 1)[1]
@@ -394,13 +403,14 @@ def test_build_command_outputs_duplicate_alias_raises() -> None:
         )
 
 
-def test_build_command_outputs_legacy_warns(
-    capsys: pytest.CaptureFixture[str],
-) -> None:
+def test_build_command_outputs_legacy_warns() -> None:
     """Legacy --output strings are built and emit a targeted deprecation warning."""
-    outputs = build_command_outputs(legacy_output=["out_dir=mydatastore/my_dataset"])
+    with patch("submit_aml.data.logger") as mock_logger:
+        outputs = build_command_outputs(
+            legacy_output=["out_dir=mydatastore/my_dataset"]
+        )
     assert "out_dir" in outputs
-    message = " ".join(capsys.readouterr().out.split())
+    message = _deprecation_log(mock_logger)
     assert "deprecated" in message.lower()
     assert "--output-datastore out_dir=mydatastore/my_dataset" in message
 

From b6445ad9967f055b8cf74b92540c6b3cd2a833b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?=
 <fperezgarcia@microsoft.com>
Date: Wed, 10 Jun 2026 00:07:08 +0100
Subject: [PATCH 16/16] Address review: accurate warning comments, job help,
 positional compat

- The 'CLI sees only flags / API sees only params' inline comments were
  misleading: both messages are always emitted. Reword to say the log line
  is phrased for CLI users and the DeprecationWarning for API users.
- --download help implied the bare 'alias=<job_id>:<path>' job form works on
  the legacy flag, but it only routes as a job when <path> has a '/'. Steer
  users to --download-job (or the legacy job_dir: form) and note the caveat.
- build_command_inputs/outputs became keyword-only, breaking pre-existing
  positional callers (build_command_inputs(client, downloads, mounts);
  build_command_outputs(uploads)). Keep legacy_download/legacy_mount and
  legacy_output as leading positional params (new flags stay keyword-only).
  Add positional-compat regression tests.
---
 src/submit_aml/__main__.py |  9 +++++----
 src/submit_aml/data.py     | 35 +++++++++++++++++++++++++----------
 tests/test_data.py         | 20 ++++++++++++++++++++
 3 files changed, 50 insertions(+), 14 deletions(-)

diff --git a/src/submit_aml/__main__.py b/src/submit_aml/__main__.py
index 3ef228b..35ed04f 100644
--- a/src/submit_aml/__main__.py
+++ b/src/submit_aml/__main__.py
@@ -148,10 +148,11 @@ def submit(
             " dataset; for example: 'vindr_dir=VINDR-CXR-V2:1'. If the version is"
             " omitted, the last one will be used. To download a datastore folder,"
             " use 'alias=datastore/folder'. To download the output folder of a"
-            " previous job, the preferred form is 'alias=<job_id>:<path>' (the"
-            " same as --download-job); for example,"
-            " 'checkpoint=crusty_hat_43s6lmvb25:outputs/checkpoint-10000'. The"
-            " older 'alias=job_dir:<job_id>:<path>' form is also still accepted."
+            " previous job, prefer --download-job; on this deprecated flag use"
+            " the 'alias=job_dir:<job_id>:<path>' form, for example"
+            " 'checkpoint=job_dir:crusty_hat_43s6lmvb25:outputs/checkpoint-10000'"
+            " (the bare 'alias=<job_id>:<path>' form is only recognised as a job"
+            " when <path> contains a '/', otherwise it is read as a data asset)."
             " The alias can be used to pass input datasets to the script, e.g.,"
             r" '${{inputs.vindr_dir}}' or '${{inputs.checkpoint}}'."
             " This option can be used multiple times."
diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py
index b8e7965..ae5f325 100644
--- a/src/submit_aml/data.py
+++ b/src/submit_aml/data.py
@@ -254,8 +254,8 @@ def _output_from_asset(string: str) -> tuple[str, Output]:
 #      semver, since dropping a CLI flag is a breaking change.
 #   3. At removal (2.0.0): delete the `datasets_download`/`datasets_mount`/
 #      `output` typer.Options in `__main__.py`, drop the matching
-#      `submit_to_aml` parameters and the `legacy_*` keyword arguments and
-#      branches in `build_command_inputs`/`build_command_outputs`, delete the
+#      `submit_to_aml` parameters and the `legacy_*` parameters and branches in
+#      `build_command_inputs`/`build_command_outputs`, delete the
 #      `_legacy_*` helpers and the `_warn_legacy_*` warning helpers, and note
 #      the breaking change in the changelog.
 #
@@ -298,7 +298,9 @@ def _warn_legacy_input(
     old_cli, old_param = _LEGACY_INPUT_FLAGS[flag_base]
     new_cli = f"--{flag_base}-{source}"
     new_param = f"{flag_base}_{source}"
-    # CLI users see only flags; Python API users see only parameter names.
+    # Both are always emitted: the log line is phrased for CLI users (it names
+    # flags) and the DeprecationWarning for Python API users (it names
+    # parameters).
     cli_message = (
         f"{old_cli} is deprecated and will be removed in a future release."
         f" Replace '{old_cli} {old_value}' with '{new_cli} {new_value}'."
@@ -324,7 +326,9 @@ def _warn_legacy_output(old_value: str, stacklevel: int = 2) -> None:
         stacklevel: Stack level for the `DeprecationWarning`, so it points at
             the API caller rather than this helper.
     """
-    # CLI users see only flags; Python API users see only parameter names.
+    # Both are always emitted: the log line is phrased for CLI users (it names
+    # flags) and the DeprecationWarning for Python API users (it names
+    # parameters).
     cli_message = (
         "--output is deprecated and will be removed in a future release."
         f" Replace '--output {old_value}' with '--output-datastore {old_value}'."
@@ -441,6 +445,8 @@ def _assign_unique(
 
 def build_command_inputs(
     ml_client: MLClient,
+    legacy_download: list[str] | None = None,
+    legacy_mount: list[str] | None = None,
     *,
     mount_asset: list[str] | None = None,
     download_asset: list[str] | None = None,
@@ -448,13 +454,19 @@ def build_command_inputs(
     download_datastore: list[str] | None = None,
     mount_job: list[str] | None = None,
     download_job: list[str] | None = None,
-    legacy_mount: list[str] | None = None,
-    legacy_download: list[str] | None = None,
 ) -> TypeInputsDict:
     """Build the inputs dictionary for a command job.
 
+    `legacy_download` / `legacy_mount` are kept as the first positional
+    parameters (in that order) so existing positional callers from before the
+    explicit-source flags, i.e. `build_command_inputs(client, downloads,
+    mounts)`, keep working during the 1.x deprecation window. The new
+    explicit-source flags are keyword-only.
+
     Args:
         ml_client: Client used to resolve data assets.
+        legacy_download: Deprecated `--download` values, routed by source type.
+        legacy_mount: Deprecated `--mount` values, routed by source type.
         mount_asset: Data assets to mount, as `'alias=name[:version]'`.
         download_asset: Data assets to download, as `'alias=name[:version]'`.
         mount_datastore: Datastore folders to mount, as `'alias=datastore/folder'`.
@@ -462,8 +474,6 @@ def build_command_inputs(
             `'alias=datastore/folder'`.
         mount_job: Previous job outputs to mount, as `'alias=<job_id>:<path>'`.
         download_job: Previous job outputs to download, as `'alias=<job_id>:<path>'`.
-        legacy_mount: Deprecated `--mount` values, routed by source type.
-        legacy_download: Deprecated `--download` values, routed by source type.
 
     Returns:
         Dictionary of `alias: Input` mappings.
@@ -513,18 +523,23 @@ def build_command_inputs(
 
 
 def build_command_outputs(
+    legacy_output: list[str] | None = None,
     *,
     output_datastore: list[str] | None = None,
     output_asset: list[str] | None = None,
-    legacy_output: list[str] | None = None,
 ) -> dict[str, Output]:
     """Build the outputs dictionary for a command job.
 
+    `legacy_output` is kept as the first positional parameter so existing
+    positional callers from before the explicit-target flags, i.e.
+    `build_command_outputs(uploads)`, keep working during the 1.x deprecation
+    window. The new explicit-target flags are keyword-only.
+
     Args:
+        legacy_output: Deprecated `--output` values (datastore folders).
         output_datastore: Datastore folders to write to, as
             `'alias=datastore/folder'`.
         output_asset: Data assets to register, as `'alias=name[:version]'`.
-        legacy_output: Deprecated `--output` values (datastore folders).
 
     Returns:
         Dictionary of `alias: Output` mappings.
diff --git a/tests/test_data.py b/tests/test_data.py
index 06fc25e..381e23a 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -293,6 +293,26 @@ def test_build_command_inputs_legacy_raises_deprecation_warning() -> None:
     assert "['ref=mystore/exports/reference']" in message
 
 
+def test_build_command_inputs_legacy_positional_compat() -> None:
+    """Positional `(client, downloads, mounts)` calls still work (1.x compat)."""
+    client = Mock()
+    inputs = build_command_inputs(
+        client,
+        ["dl=mystore/down"],
+        ["mn=mystore/mount"],
+    )
+    assert inputs["dl"].mode == InputOutputModes.DOWNLOAD
+    assert inputs["mn"].mode == InputOutputModes.MOUNT
+
+
+def test_build_command_outputs_legacy_positional_compat() -> None:
+    """Positional `(uploads)` calls still work (1.x compat)."""
+    outputs = build_command_outputs(["out_dir=mydatastore/my_dataset"])
+    assert outputs["out_dir"].path == (
+        "azureml://datastores/mydatastore/paths/my_dataset"
+    )
+
+
 def test_build_command_inputs_legacy_asset_calls_client() -> None:
     """Legacy --mount asset strings still resolve through the client."""
     client = Mock()