From 46e8588bf9f94ce9ae4700100c517bca58402e89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?= Date: Tue, 9 Jun 2026 16:29:05 +0100 Subject: [PATCH 01/16] Add explicit per-source data flags; deprecate --mount/--download/--output Replace the string-format inference for command inputs/outputs with explicit per-source CLI flags, so the source type is stated in the flag name instead of guessed from the value: --mount-asset / --download-asset alias=name[:version] --mount-datastore / --download-datastore alias=datastore/folder --mount-job / --download-job alias=: --output-datastore alias=datastore/folder --output-asset alias=name[:version] This adds raw datastore folders as inputs (issue #14) and registering outputs as data assets, symmetric across mount/download/output. The old --mount/--download/--output flags (and their Python equivalents) are deprecated: they still work but emit a warning and are classified and routed to the new builders through a single set of parsers. Closes #14 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/examples.md | 60 ++++-- src/submit_aml/__main__.py | 119 +++++++++++- src/submit_aml/aml.py | 26 ++- src/submit_aml/data.py | 381 ++++++++++++++++++++++++++----------- tests/test_data.py | 215 +++++++++++++++++++-- 5 files changed, 652 insertions(+), 149 deletions(-) diff --git a/docs/examples.md b/docs/examples.md index 26e671e..e174913 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -168,32 +168,40 @@ Run a grid sweep over hyperparameters: Datasets are passed to the job as [`Input`](https://learn.microsoft.com/en-us/python/api/azure-ai-ml/azure.ai.ml.input?view=azure-python) -objects. +objects. There is one flag per source type (registered data asset, datastore +folder, or previous job output), in either `--mount-*` or `--download-*` form. === "CLI" - Mount a dataset: + Mount or download a registered data asset: ```bash submit-aml \ --script train.py \ - --mount "data=MY-DATASET:2" + --mount-asset "data=MY-DATASET:2" ``` - Download a dataset: + ```bash + submit-aml \ + --script train.py \ + --download-asset "data=MY-DATASET" + ``` + + Mount a folder directly from a datastore (no data-asset registration + required): ```bash submit-aml \ --script train.py \ - --download "data=MY-DATASET" + --mount-datastore "ref=mystore/exports/reference" ``` - Use outputs from a previous job: + Use the outputs of a previous job: ```bash submit-aml \ --script evaluate.py \ - --mount "checkpoint=job_dir:my-training-job:models/best.pth" + --mount-job "checkpoint=my-training-job:models/best.pth" ``` === "Python" @@ -201,30 +209,49 @@ objects. ```python submit_to_aml( script_path="train.py", - datasets_mount=["data=MY-DATASET:2"], + mount_asset=["data=MY-DATASET:2"], ) # Or download instead of mount submit_to_aml( script_path="train.py", - datasets_download=["data=MY-DATASET"], + download_asset=["data=MY-DATASET"], + ) + + # Mount a datastore folder directly + submit_to_aml( + script_path="train.py", + mount_datastore=["ref=mystore/exports/reference"], ) # Use outputs from a previous job submit_to_aml( script_path="evaluate.py", - datasets_mount=["checkpoint=job_dir:my-training-job:models/best.pth"], + mount_job=["checkpoint=my-training-job:models/best.pth"], ) ``` -Configure an output datastore: +!!! note "Deprecated flags" + + The `--mount`, `--download` and `--output` flags (and their + `datasets_mount`, `datasets_download` and `datasets_output` Python + equivalents) are deprecated in favour of the explicit per-source flags + above. They still work but emit a deprecation warning. + +Write outputs to a datastore folder, or register them as a data asset: === "CLI" ```bash submit-aml \ --script train.py \ - --output "results=mydatastore/experiment-outputs" + --output-datastore "results=mydatastore/experiment-outputs" + ``` + + ```bash + submit-aml \ + --script train.py \ + --output-asset "results=my-experiment-results" ``` === "Python" @@ -232,7 +259,13 @@ Configure an output datastore: ```python submit_to_aml( script_path="train.py", - datasets_output=["results=mydatastore/experiment-outputs"], + output_datastore=["results=mydatastore/experiment-outputs"], + ) + + # Or register the outputs as a data asset + submit_to_aml( + script_path="train.py", + output_asset=["results=my-experiment-results"], ) ``` @@ -404,4 +437,3 @@ Submit and wait for the job to complete, streaming logs: ```python submit_to_aml(script_path="train.py", wait_for_completion=True) ``` - diff --git a/src/submit_aml/__main__.py b/src/submit_aml/__main__.py index e7c4bc7..907eb25 100644 --- a/src/submit_aml/__main__.py +++ b/src/submit_aml/__main__.py @@ -141,12 +141,14 @@ def submit( "--download", "-d", help=( - "Azure ML dataset or job output folder to download. To download an Azure ML" - " dataset, the argument should take the form: alias, name and version" - " of the dataset; for example: 'vindr_dir=VINDR-CXR-V2:1'." - " If the version is omitted, the last one will be used." - " To download the output folder of a previous job, the argument should take" - " the form 'alias=job_dir::'; for example:" + "[DEPRECATED] Use --download-asset, --download-datastore or" + " --download-job instead. Azure ML dataset or job output folder to" + " download. To download an Azure ML dataset, the argument should take" + " the form: alias, name and version of the dataset; for example:" + " 'vindr_dir=VINDR-CXR-V2:1'. If the version is omitted, the last one" + " will be used. To download the output folder of a previous job, the" + " argument should take the form" + " 'alias=job_dir::'; for example:" " 'checkpoint=job_dir:crusty_hat_43s6lmvb25:outputs/checkpoint-10000'." " The alias can be used to pass input datasets to the script, e.g.," r" '${{inputs.vindr_dir}}' or '${{inputs.checkpoint}}'." @@ -159,7 +161,8 @@ def submit( "--mount", "-m", help=( - "Azure ML dataset or job output folder to mount." + "[DEPRECATED] Use --mount-asset, --mount-datastore or --mount-job" + " instead. Azure ML dataset or job output folder to mount." " For an Azure ML dataset, the alias, name and version should be provided" " while for a job output folder, the alias, job ID and path in the job" " outputs should be provided. See the --download option for more" @@ -167,13 +170,77 @@ def submit( ), rich_help_panel=PANEL_DATA, ), + mount_asset: Optional[List[str]] = typer.Option( # noqa: UP006, UP007 + None, + "--mount-asset", + help=( + "Registered Azure ML data asset to mount, expressed as" + ' "alias=name[:version]". For example: "vindr_dir=VINDR-CXR-V2:1".' + " If the version is omitted, the latest one is used." + r" Pass it to the script with '${{inputs.vindr_dir}}'." + " This option can be used multiple times." + ), + rich_help_panel=PANEL_DATA, + ), + download_asset: Optional[List[str]] = typer.Option( # noqa: UP006, UP007 + None, + "--download-asset", + help=( + "Registered Azure ML data asset to download. Same format as" + " --mount-asset. This option can be used multiple times." + ), + rich_help_panel=PANEL_DATA, + ), + mount_datastore: Optional[List[str]] = typer.Option( # noqa: UP006, UP007 + None, + "--mount-datastore", + help=( + "Datastore folder to mount, expressed as" + ' "alias=datastore/path/to/folder".' + ' For example: "ref=mystore/exports/reference".' + r" Pass it to the script with '${{inputs.ref}}'." + " This option can be used multiple times." + ), + rich_help_panel=PANEL_DATA, + ), + download_datastore: Optional[List[str]] = typer.Option( # noqa: UP006, UP007 + None, + "--download-datastore", + help=( + "Datastore folder to download. Same format as --mount-datastore." + " This option can be used multiple times." + ), + rich_help_panel=PANEL_DATA, + ), + mount_job: Optional[List[str]] = typer.Option( # noqa: UP006, UP007 + None, + "--mount-job", + help=( + "Output folder of a previous job to mount, expressed as" + ' "alias=:".' + ' For example: "checkpoint=crusty_hat_43s6lmvb25:outputs/best.pth".' + r" Pass it to the script with '${{inputs.checkpoint}}'." + " This option can be used multiple times." + ), + rich_help_panel=PANEL_DATA, + ), + download_job: Optional[List[str]] = typer.Option( # noqa: UP006, UP007 + None, + "--download-job", + help=( + "Output folder of a previous job to download. Same format as" + " --mount-job. This option can be used multiple times." + ), + rich_help_panel=PANEL_DATA, + ), output: Optional[List[str]] = typer.Option( # noqa: UP006, UP007 None, "--output", "-o", help=( - "Alias, datastore and path to folder into which outputs will be written," - ' expressed as "alias=datastore/path/to/dir".' + "[DEPRECATED] Use --output-datastore or --output-asset instead." + " Alias, datastore and path to folder into which outputs will be" + ' written, expressed as "alias=datastore/path/to/dir".' ' For example: "out_dir=mydatastore/my_dataset".' " The alias can be used to pass outputs to the script, e.g.," r' "${{outputs.out_dir}}".' @@ -182,6 +249,32 @@ def submit( ), rich_help_panel=PANEL_DATA, ), + output_datastore: Optional[List[str]] = typer.Option( # noqa: UP006, UP007 + None, + "--output-datastore", + help=( + "Datastore folder into which outputs will be written, expressed as" + ' "alias=datastore/path/to/dir".' + ' For example: "out_dir=mydatastore/my_dataset".' + r" Pass it to the script with '${{outputs.out_dir}}'." + " This option can be used multiple times." + ), + rich_help_panel=PANEL_DATA, + ), + output_asset: Optional[List[str]] = typer.Option( # noqa: UP006, UP007 + None, + "--output-asset", + help=( + "Register the outputs as an Azure ML data asset, expressed as" + ' "alias=name[:version]". For example: "out_dir=my-results".' + " The blobs are written to the workspace's default datastore and" + " registered as a data asset; if the version is omitted, Azure ML" + " auto-increments it." + r" Pass it to the script with '${{outputs.out_dir}}'." + " This option can be used multiple times." + ), + rich_help_panel=PANEL_DATA, + ), command_prefix: str = typer.Option( get_default("command_prefix"), help="Prefix to prepend to the command. For example, `uv run`.", @@ -408,6 +501,14 @@ def submit( datasets_download=datasets_download, datasets_mount=datasets_mount, datasets_output=output, + mount_asset=mount_asset, + download_asset=download_asset, + mount_datastore=mount_datastore, + download_datastore=download_datastore, + mount_job=mount_job, + download_job=download_job, + output_datastore=output_datastore, + output_asset=output_asset, debug=debug, dependency_groups=dependency_groups, description=description, diff --git a/src/submit_aml/aml.py b/src/submit_aml/aml.py index c269bab..a38de21 100644 --- a/src/submit_aml/aml.py +++ b/src/submit_aml/aml.py @@ -345,6 +345,14 @@ def submit_to_aml( datasets_download: TypeOptionalStrList = None, datasets_mount: TypeOptionalStrList = None, datasets_output: TypeOptionalStrList = None, + mount_asset: TypeOptionalStrList = None, + download_asset: TypeOptionalStrList = None, + mount_datastore: TypeOptionalStrList = None, + download_datastore: TypeOptionalStrList = None, + mount_job: TypeOptionalStrList = None, + download_job: TypeOptionalStrList = None, + output_datastore: TypeOptionalStrList = None, + output_asset: TypeOptionalStrList = None, debug: bool = False, dependency_groups: list[str] | None = None, description: str | None = None, @@ -512,8 +520,22 @@ def submit_to_aml( add_service_for_tensorboard(services, tensorboard_dir) # Data - inputs = build_command_inputs(ml_client, datasets_download, datasets_mount) - outputs = build_command_outputs(datasets_output) + inputs = build_command_inputs( + ml_client, + mount_asset=mount_asset, + download_asset=download_asset, + mount_datastore=mount_datastore, + download_datastore=download_datastore, + mount_job=mount_job, + download_job=download_job, + legacy_mount=datasets_mount, + legacy_download=datasets_download, + ) + outputs = build_command_outputs( + output_datastore=output_datastore, + output_asset=output_asset, + legacy_output=datasets_output, + ) # Sweep jobs is_sweep = sweep_inputs is not None and len(sweep_inputs) > 0 diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py index 410568b..0cef3eb 100644 --- a/src/submit_aml/data.py +++ b/src/submit_aml/data.py @@ -6,6 +6,7 @@ from azure.ai.ml import Input from azure.ai.ml import MLClient from azure.ai.ml import Output +from azure.ai.ml.constants import AssetTypes from azure.ai.ml.constants import InputOutputModes from azure.ai.ml.entities._job.sweep.search_space import SweepDistribution from azure.ai.ml.exceptions import MlException @@ -17,6 +18,23 @@ TypeOptionalStrList = list[str] | None +def _datastore_uri(datastore: str, path: str) -> str: + """Build an Azure ML datastore URI for a folder. + + Args: + datastore: Name of the datastore. + path: Path to the folder within the datastore. + + Returns: + A URI of the form `azureml://datastores//paths/`. + + Examples: + >>> _datastore_uri('mystore', 'exports/reference') + 'azureml://datastores/mystore/paths/exports/reference' + """ + return f"azureml://datastores/{datastore}/paths/{path}" + + def _extract_alias_path_version(string: str) -> tuple[str, str, str | None]: """Get alias, data asset path, and data asset version from a string. @@ -27,12 +45,9 @@ def _extract_alias_path_version(string: str) -> tuple[str, str, str | None]: Tuple of alias, path, and version (which may be None if version is not provided). - Raises: - ValueError: If the string is not of the expected format. - Examples: >>> _extract_alias_path_version('my_data=MIMIC-CXR-V2:2') - ('my_data', 'MIMIC-CXR-V2', 2) + ('my_data', 'MIMIC-CXR-V2', '2') >>> _extract_alias_path_version('my_data=MIMIC-CXR-V2') ('my_data', 'MIMIC-CXR-V2', None) """ @@ -64,11 +79,8 @@ def _extract_alias_datastore_path(string: str) -> tuple[str, str, str]: Returns: Tuple of alias, datastore and folder. - Raises: - ValueError: If the string is not of the expected format. - Examples: - >>> get_alias_datastore_path('my_data=inereyedata/output_dataset') + >>> _extract_alias_datastore_path('my_data=inereyedata/output_dataset') ('my_data', 'inereyedata', 'output_dataset') """ pattern = r"(?P[^=]+)=(?P[^/]+)/(?P.+)" @@ -84,146 +96,297 @@ def _extract_alias_datastore_path(string: str) -> tuple[str, str, str]: def _extract_alias_job_path(string: str) -> tuple[str, str, str]: - """Get alias, job ID, and path from a job directory string. + """Get alias, job ID, and path from a job output string. Args: - string: String of the form `'alias=job_dir::'`. + string: String of the form `'alias=:'`. Returns: Tuple of alias, job_id, and path. - Raises: - ValueError: If the string is not of the expected format. - Examples: - >>> _extract_alias_job_path('checkpoint=job_dir:my_job_123:models/best.pth') + >>> _extract_alias_job_path('checkpoint=my_job_123:models/best.pth') ('checkpoint', 'my_job_123', 'models/best.pth') """ - pattern = r"(?P[^=]+)=job_dir:(?P[^:]+):(?P.+)" + pattern = r"(?P[^=]+)=(?P[^:]+):(?P.+)" match = re.match(pattern, string) if match is None: message = ( - f'Invalid job directory string: "{string}".' - ' Expected format: "alias=job_dir:job_id:path".' + f'Invalid job output string: "{string}".' + ' Expected format: "alias=job_id:path".' ) - raise ValueError(message) + logger.error(message) + sys.exit(1) return match.group("alias"), match.group("job_id"), match.group("path") -def _is_alias_path_version_string(string: str) -> bool: - try: - _extract_alias_path_version(string) - return True - except ValueError: - return False +def _input_from_asset( + ml_client: MLClient, + string: str, + mode: str, +) -> tuple[str, Input]: + """Build an `Input` from a registered data asset string. + Args: + ml_client: Client used to resolve the data asset. + string: String of the form `'alias=name[:version]'`. + mode: Either `InputOutputModes.DOWNLOAD` or `InputOutputModes.MOUNT`. -def _is_alias_job_path_string(string: str) -> bool: - try: - _extract_alias_job_path(string) - return True - except ValueError: - return False + Returns: + Tuple of alias and the resolved `Input`. + Raises: + ValueError: If the data asset cannot be retrieved. + """ + alias, path, version = _extract_alias_path_version(string) + + if version is None: + kwargs = {"label": "latest"} + else: + kwargs = {"version": version} + + with report_time( + f'Retrieving data asset "{path}"...', + f'Retrieved data asset "{path}"', + ): + try: + data = ml_client.data.get(name=path, **kwargs) + except MlException as e: + msg = f'Error getting data asset with name "{path}" and version "{version}"' + raise ValueError(msg) from e + return alias, Input(path=data.id, mode=mode) + + +def _input_from_datastore(string: str, mode: str) -> tuple[str, Input]: + """Build an `Input` from a datastore-path string. + + Args: + string: String of the form `'alias=datastore/folder'`. + mode: Either `InputOutputModes.DOWNLOAD` or `InputOutputModes.MOUNT`. + + Returns: + Tuple of alias and the resulting `Input`. + """ + alias, datastore, folder = _extract_alias_datastore_path(string) + azureml_path = _datastore_uri(datastore, folder) + logger.info(f'Using datastore path "{azureml_path}"...') + return alias, Input(path=azureml_path, mode=mode) -def build_command_inputs( - ml_client: MLClient, - strings_download: list[str] | None, - strings_mount: list[str] | None, -) -> TypeInputsDict: - """Get dictionaries data assets to be mounted or downloaded. + +def _input_from_job(string: str, mode: str) -> tuple[str, Input]: + """Build an `Input` from a previous job's output string. Args: - strings_download: List of strings of the form `'alias=path:version'` to - be downloaded. If `None`, no data assets will be downloaded. - strings_mount: List of strings of the form `'alias=path:version'` to - be mounted. If `None`, no data assets will be mounted. + string: String of the form `'alias=:'`. + mode: Either `InputOutputModes.DOWNLOAD` or `InputOutputModes.MOUNT`. + + Returns: + Tuple of alias and the resulting `Input`. """ - strings_download = [] if strings_download is None else strings_download - strings_mount = [] if strings_mount is None else strings_mount - datasets_download = _get_data_assets( - ml_client, - strings_download, - InputOutputModes.DOWNLOAD, + alias, job_id, path = _extract_alias_job_path(string) + azureml_path = _datastore_uri( + "workspaceartifactstore", + f"ExperimentRun/dcid.{job_id}/{path}", ) - datasets_mount = _get_data_assets( - ml_client, - strings_mount, - InputOutputModes.MOUNT, + logger.info(f'Using job output path "{azureml_path}"...') + return alias, Input(path=azureml_path, mode=mode) + + +def _output_from_datastore(string: str) -> tuple[str, Output]: + """Build an `Output` that writes to a datastore folder. + + Args: + string: String of the form `'alias=datastore/folder'`. + + Returns: + Tuple of alias and the resulting `Output`. + """ + alias, datastore, folder = _extract_alias_datastore_path(string) + return alias, Output(path=_datastore_uri(datastore, folder)) + + +def _output_from_asset(string: str) -> tuple[str, Output]: + """Build an `Output` that registers a data asset. + + The blobs are written to the workspace's default datastore at an + Azure ML-managed location and registered as a data asset. + + Args: + string: String of the form `'alias=name[:version]'`. If the version is + omitted, Azure ML auto-increments it. + + Returns: + Tuple of alias and the resulting `Output`. + """ + alias, name, version = _extract_alias_path_version(string) + output = Output(type=AssetTypes.URI_FOLDER, name=name, version=version) + return alias, output + + +def _warn_deprecated(old_flag: str, new_flags: str) -> None: + """Emit a deprecation warning pointing users to the new flags. + + Args: + old_flag: The deprecated flag (e.g. `'--mount'`). + new_flags: Human-readable list of replacement flags. + """ + logger.warning( + f"{old_flag} is deprecated and will be removed in a future release." + f" Use {new_flags} instead.", ) - return {**datasets_download, **datasets_mount} -def build_command_outputs( - strings_upload: list[str] | None, -) -> dict[str, Output]: - """Get outputs for command. +def _classify_legacy_input(string: str) -> str: + """Classify a legacy `--mount`/`--download` value by source type. Args: - strings_upload: List of strings of the form `'alias=datastore/path/to/dir'` to - be uploaded. If `None`, no outputs will be returned. + string: A legacy dataset string. + + Returns: + One of `'job'`, `'datastore'`, or `'asset'`. + + Examples: + >>> _classify_legacy_input('ckpt=job_dir:job123:out/best.pth') + 'job' + >>> _classify_legacy_input('ref=mystore/exports/reference') + 'datastore' + >>> _classify_legacy_input('data=MY-DATASET:2') + 'asset' """ - strings_upload = [] if strings_upload is None else strings_upload - outputs_dict = {} - for string in strings_upload: - alias, datastore, path = _extract_alias_datastore_path(string) - output = Output( - path=f"azureml://datastores/{datastore}/paths/{path}", - ) - outputs_dict[alias] = output - return outputs_dict + if "=" not in string: + return "asset" + _, rhs = string.split("=", 1) + if rhs.startswith("job_dir:"): + return "job" + if "/" in rhs: + return "datastore" + return "asset" -def _get_data_assets( +def _legacy_input( ml_client: MLClient, - datasets: list[str], + string: str, mode: str, -) -> dict[str, Input]: - """Get data assets from Azure ML. +) -> tuple[str, Input]: + """Route a legacy `--mount`/`--download` value to the right builder. Args: - datasets: List of strings of the form `'alias=path:version'` or - `'alias=job_dir::'`. + ml_client: Client used to resolve data assets. + string: A legacy dataset string. mode: Either `InputOutputModes.DOWNLOAD` or `InputOutputModes.MOUNT`. + Returns: + Tuple of alias and the resulting `Input`. + """ + source = _classify_legacy_input(string) + if source == "job": + # Translate the old "alias=job_dir::" form to the new one. + translated = string.replace("=job_dir:", "=", 1) + return _input_from_job(translated, mode) + if source == "datastore": + return _input_from_datastore(string, mode) + return _input_from_asset(ml_client, string, mode) + + +def build_command_inputs( + ml_client: MLClient, + *, + mount_asset: list[str] | None = None, + download_asset: list[str] | None = None, + mount_datastore: list[str] | None = None, + download_datastore: list[str] | None = None, + mount_job: list[str] | None = None, + download_job: list[str] | None = None, + legacy_mount: list[str] | None = None, + legacy_download: list[str] | None = None, +) -> TypeInputsDict: + """Build the inputs dictionary for a command job. + + Args: + ml_client: Client used to resolve data assets. + mount_asset: Data assets to mount, as `'alias=name[:version]'`. + download_asset: Data assets to download, as `'alias=name[:version]'`. + mount_datastore: Datastore folders to mount, as `'alias=datastore/folder'`. + download_datastore: Datastore folders to download, as + `'alias=datastore/folder'`. + mount_job: Previous job outputs to mount, as `'alias=:'`. + download_job: Previous job outputs to download, as `'alias=:'`. + legacy_mount: Deprecated `--mount` values, routed by source type. + legacy_download: Deprecated `--download` values, routed by source type. + Returns: Dictionary of `alias: Input` mappings. """ - inputs = {} - for string in datasets: - if _is_alias_job_path_string(string): - # Handle job directory format - alias, job_id, path = _extract_alias_job_path(string) - azureml_path = f"azureml://datastores/workspaceartifactstore/paths/ExperimentRun/dcid.{job_id}/{path}" - logger.info(f'Using job output path "{azureml_path}"...') - inputs[alias] = Input( - path=str(azureml_path), - mode=mode, - ) - else: - # Handle regular data asset format - alias, path, version = _extract_alias_path_version(string) - - if version is None: - kwargs = {"label": "latest"} - else: - kwargs = {"version": version} - - with report_time( - f'Retrieving data asset "{path}"...', - f'Retrieved data asset "{path}"', - ): - try: - data = ml_client.data.get(name=path, **kwargs) - except MlException as e: - msg = ( - "Error getting data asset with" - f' name "{path}"' - f' and version "{version}"' - ) - raise ValueError(msg) from e - inputs[alias] = Input( - path=data.id, - mode=mode, - ) + inputs: TypeInputsDict = {} + + for string in mount_asset or []: + alias, value = _input_from_asset(ml_client, string, InputOutputModes.MOUNT) + inputs[alias] = value + for string in download_asset or []: + alias, value = _input_from_asset(ml_client, string, InputOutputModes.DOWNLOAD) + inputs[alias] = value + + for string in mount_datastore or []: + alias, value = _input_from_datastore(string, InputOutputModes.MOUNT) + inputs[alias] = value + for string in download_datastore or []: + alias, value = _input_from_datastore(string, InputOutputModes.DOWNLOAD) + inputs[alias] = value + + for string in mount_job or []: + alias, value = _input_from_job(string, InputOutputModes.MOUNT) + inputs[alias] = value + for string in download_job or []: + alias, value = _input_from_job(string, InputOutputModes.DOWNLOAD) + inputs[alias] = value + + if legacy_mount: + _warn_deprecated("--mount", "--mount-asset, --mount-datastore or --mount-job") + for string in legacy_mount: + alias, value = _legacy_input(ml_client, string, InputOutputModes.MOUNT) + inputs[alias] = value + if legacy_download: + _warn_deprecated( + "--download", + "--download-asset, --download-datastore or --download-job", + ) + for string in legacy_download: + alias, value = _legacy_input(ml_client, string, InputOutputModes.DOWNLOAD) + inputs[alias] = value + return inputs + + +def build_command_outputs( + *, + output_datastore: list[str] | None = None, + output_asset: list[str] | None = None, + legacy_output: list[str] | None = None, +) -> dict[str, Output]: + """Build the outputs dictionary for a command job. + + Args: + output_datastore: Datastore folders to write to, as + `'alias=datastore/folder'`. + output_asset: Data assets to register, as `'alias=name[:version]'`. + legacy_output: Deprecated `--output` values (datastore folders). + + Returns: + Dictionary of `alias: Output` mappings. + """ + outputs: dict[str, Output] = {} + + for string in output_datastore or []: + alias, value = _output_from_datastore(string) + outputs[alias] = value + for string in output_asset or []: + alias, value = _output_from_asset(string) + outputs[alias] = value + + if legacy_output: + _warn_deprecated("--output", "--output-datastore or --output-asset") + for string in legacy_output: + alias, value = _output_from_datastore(string) + outputs[alias] = value + + return outputs diff --git a/tests/test_data.py b/tests/test_data.py index 8f9faab..be33724 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -1,14 +1,35 @@ -"""Tests for data-asset parsing helpers.""" +"""Tests for data-asset parsing helpers and input/output builders.""" from __future__ import annotations +from unittest.mock import Mock + import pytest +from azure.ai.ml.constants import InputOutputModes +from submit_aml.data import _classify_legacy_input +from submit_aml.data import _datastore_uri from submit_aml.data import _extract_alias_datastore_path from submit_aml.data import _extract_alias_job_path from submit_aml.data import _extract_alias_path_version +from submit_aml.data import _input_from_datastore +from submit_aml.data import _input_from_job +from submit_aml.data import _output_from_asset +from submit_aml.data import _output_from_datastore +from submit_aml.data import build_command_inputs from submit_aml.data import build_command_outputs +# --------------------------------------------------------------------------- +# _datastore_uri +# --------------------------------------------------------------------------- + + +def test_datastore_uri() -> None: + """A datastore and path are joined into an azureml:// URI.""" + uri = _datastore_uri("mystore", "exports/reference") + assert uri == "azureml://datastores/mystore/paths/exports/reference" + + # --------------------------------------------------------------------------- # _extract_alias_path_version # --------------------------------------------------------------------------- @@ -51,35 +72,199 @@ def test_extract_alias_datastore_path_valid() -> None: def test_extract_alias_job_path_valid() -> None: - """'alias=job_dir:job_id:path' is parsed correctly.""" + """'alias=job_id:path' is parsed correctly (no job_dir: prefix).""" alias, job_id, path = _extract_alias_job_path( - "checkpoint=job_dir:my_job_123:models/best.pth" + "checkpoint=my_job_123:models/best.pth" ) assert alias == "checkpoint" assert job_id == "my_job_123" assert path == "models/best.pth" -def test_extract_alias_job_path_invalid_raises() -> None: - """Strings not matching the job_dir pattern raise ValueError.""" - with pytest.raises(ValueError, match="Invalid job directory"): +def test_extract_alias_job_path_invalid_exits() -> None: + """Strings without a path component exit the process.""" + with pytest.raises(SystemExit): _extract_alias_job_path("bad_format") +# --------------------------------------------------------------------------- +# _classify_legacy_input +# --------------------------------------------------------------------------- + + +def test_classify_legacy_input_job() -> None: + """A job_dir: prefix is classified as a job output.""" + assert _classify_legacy_input("ckpt=job_dir:job123:out/best.pth") == "job" + + +def test_classify_legacy_input_datastore() -> None: + """A slash in the right-hand side signals a datastore path.""" + assert _classify_legacy_input("ref=mystore/exports/reference") == "datastore" + + +def test_classify_legacy_input_asset() -> None: + """A plain name[:version] is classified as a data asset.""" + assert _classify_legacy_input("data=MY-DATASET:2") == "asset" + + +def test_classify_legacy_input_missing_equals() -> None: + """A string without '=' falls back to the asset branch.""" + assert _classify_legacy_input("no-equals-here") == "asset" + + +# --------------------------------------------------------------------------- +# input builders +# --------------------------------------------------------------------------- + + +def test_input_from_datastore() -> None: + """A datastore string yields an Input with an azureml:// path and mode.""" + alias, value = _input_from_datastore( + "ref=mystore/exports/reference", + InputOutputModes.MOUNT, + ) + assert alias == "ref" + assert value.path == "azureml://datastores/mystore/paths/exports/reference" + assert value.mode == InputOutputModes.MOUNT + + +def test_input_from_job() -> None: + """A job string yields an Input pointing at the job's run artifacts.""" + alias, value = _input_from_job( + "checkpoint=my_job_123:models/best.pth", + InputOutputModes.DOWNLOAD, + ) + assert alias == "checkpoint" + assert "ExperimentRun/dcid.my_job_123/models/best.pth" in value.path + assert "workspaceartifactstore" in value.path + assert value.mode == InputOutputModes.DOWNLOAD + + +# --------------------------------------------------------------------------- +# output builders +# --------------------------------------------------------------------------- + + +def test_output_from_datastore() -> None: + """A datastore string yields an Output with an azureml:// path.""" + alias, output = _output_from_datastore("out_dir=mydatastore/my_dataset") + assert alias == "out_dir" + assert output.path == "azureml://datastores/mydatastore/paths/my_dataset" + + +def test_output_from_asset_with_version() -> None: + """An asset string registers an Output with name and version.""" + alias, output = _output_from_asset("out_dir=my-results:3") + assert alias == "out_dir" + assert output.name == "my-results" + assert output.version == "3" + assert output.type == "uri_folder" + + +def test_output_from_asset_without_version() -> None: + """Omitting the version leaves it unset for Azure ML to auto-increment.""" + _, output = _output_from_asset("out_dir=my-results") + assert output.name == "my-results" + assert output.version is None + + +# --------------------------------------------------------------------------- +# build_command_inputs +# --------------------------------------------------------------------------- + + +def test_build_command_inputs_empty() -> None: + """No arguments produce an empty dict and never touch the client.""" + client = Mock() + assert build_command_inputs(client) == {} + client.data.get.assert_not_called() + + +def test_build_command_inputs_datastore_and_job_skip_client() -> None: + """Datastore and job inputs are built without calling the client.""" + client = Mock() + inputs = build_command_inputs( + client, + mount_datastore=["ref=mystore/exports/reference"], + download_job=["ckpt=my_job_123:models/best.pth"], + ) + assert set(inputs) == {"ref", "ckpt"} + assert inputs["ref"].mode == InputOutputModes.MOUNT + assert inputs["ckpt"].mode == InputOutputModes.DOWNLOAD + client.data.get.assert_not_called() + + +def test_build_command_inputs_asset_calls_client() -> None: + """A data-asset input resolves through the client.""" + client = Mock() + client.data.get.return_value = Mock(id="azureml:resolved-asset:1") + inputs = build_command_inputs(client, mount_asset=["data=MY-DATASET:2"]) + client.data.get.assert_called_once() + assert inputs["data"].path == "azureml:resolved-asset:1" + + +def test_build_command_inputs_legacy_datastore_routes( + capsys: pytest.CaptureFixture[str], +) -> None: + """Legacy --mount datastore strings route to the datastore builder.""" + client = Mock() + inputs = build_command_inputs( + client, + legacy_mount=["ref=mystore/exports/reference"], + ) + assert inputs["ref"].path == ( + "azureml://datastores/mystore/paths/exports/reference" + ) + client.data.get.assert_not_called() + assert "deprecated" in capsys.readouterr().out.lower() + + +def test_build_command_inputs_legacy_job_routes() -> None: + """Legacy --download job_dir strings route to the job builder.""" + client = Mock() + inputs = build_command_inputs( + client, + legacy_download=["ckpt=job_dir:my_job_123:models/best.pth"], + ) + assert "ExperimentRun/dcid.my_job_123/models/best.pth" in inputs["ckpt"].path + client.data.get.assert_not_called() + + +def test_build_command_inputs_legacy_asset_calls_client() -> None: + """Legacy --mount asset strings still resolve through the client.""" + client = Mock() + client.data.get.return_value = Mock(id="azureml:resolved-asset:1") + build_command_inputs(client, legacy_mount=["data=MY-DATASET:2"]) + client.data.get.assert_called_once() + + # --------------------------------------------------------------------------- # build_command_outputs # --------------------------------------------------------------------------- -def test_build_command_outputs_none() -> None: - """None input produces an empty dict.""" - assert build_command_outputs(None) == {} +def test_build_command_outputs_empty() -> None: + """No arguments produce an empty dict.""" + assert build_command_outputs() == {} + + +def test_build_command_outputs_datastore_and_asset() -> None: + """Datastore and asset outputs are both built.""" + outputs = build_command_outputs( + output_datastore=["out_dir=mydatastore/my_dataset"], + output_asset=["asset_dir=my-results:2"], + ) + assert outputs["out_dir"].path == ( + "azureml://datastores/mydatastore/paths/my_dataset" + ) + assert outputs["asset_dir"].name == "my-results" + assert outputs["asset_dir"].version == "2" -def test_build_command_outputs_valid() -> None: - """Valid output strings are converted into Output objects.""" - outputs = build_command_outputs(["out_dir=mydatastore/my_dataset"]) +def test_build_command_outputs_legacy_warns( + capsys: pytest.CaptureFixture[str], +) -> None: + """Legacy --output strings are built and emit a deprecation warning.""" + outputs = build_command_outputs(legacy_output=["out_dir=mydatastore/my_dataset"]) assert "out_dir" in outputs - output = outputs["out_dir"] - assert "mydatastore" in output.path - assert "my_dataset" in output.path + assert "deprecated" in capsys.readouterr().out.lower() From 09ecedd51a2c63674e0eb779c1d770eadbbd688c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?= Date: Tue, 9 Jun 2026 16:41:04 +0100 Subject: [PATCH 02/16] Address Copilot review: robust legacy classification and clearer error - Legacy --mount/--download values that use the new "job_id:path" form (a colon before the first slash) now route to the job-output builder instead of being misread as a datastore path. - The data-asset lookup failure message now reports "latest" when no version was given, matching the label used for the lookup. - Add tests for new-syntax job classification/routing and the error message. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/submit_aml/data.py | 23 +++++++++++++++++++---- tests/test_data.py | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py index 0cef3eb..5948836 100644 --- a/src/submit_aml/data.py +++ b/src/submit_aml/data.py @@ -152,7 +152,11 @@ def _input_from_asset( try: data = ml_client.data.get(name=path, **kwargs) except MlException as e: - msg = f'Error getting data asset with name "{path}" and version "{version}"' + version_desc = "latest" if version is None else version + msg = ( + f'Error getting data asset with name "{path}"' + f' and version "{version_desc}"' + ) raise ValueError(msg) from e return alias, Input(path=data.id, mode=mode) @@ -245,9 +249,16 @@ def _classify_legacy_input(string: str) -> str: Returns: One of `'job'`, `'datastore'`, or `'asset'`. + A right-hand side that starts with `job_dir:`, or that has a `:` before its + first `/` (the new `job_id:path` form), is a job output. A `/` that comes + before any `:` signals a datastore folder. Anything else (a bare `name` or + `name:version`) is a data asset. + Examples: >>> _classify_legacy_input('ckpt=job_dir:job123:out/best.pth') 'job' + >>> _classify_legacy_input('ckpt=job123:out/best.pth') + 'job' >>> _classify_legacy_input('ref=mystore/exports/reference') 'datastore' >>> _classify_legacy_input('data=MY-DATASET:2') @@ -258,9 +269,13 @@ def _classify_legacy_input(string: str) -> str: _, rhs = string.split("=", 1) if rhs.startswith("job_dir:"): return "job" - if "/" in rhs: - return "datastore" - return "asset" + slash = rhs.find("/") + if slash == -1: + return "asset" + colon = rhs.find(":") + if colon != -1 and colon < slash: + return "job" + return "datastore" def _legacy_input( diff --git a/tests/test_data.py b/tests/test_data.py index be33724..db1d602 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -6,12 +6,14 @@ import pytest from azure.ai.ml.constants import InputOutputModes +from azure.ai.ml.exceptions import MlException from submit_aml.data import _classify_legacy_input from submit_aml.data import _datastore_uri from submit_aml.data import _extract_alias_datastore_path from submit_aml.data import _extract_alias_job_path from submit_aml.data import _extract_alias_path_version +from submit_aml.data import _input_from_asset from submit_aml.data import _input_from_datastore from submit_aml.data import _input_from_job from submit_aml.data import _output_from_asset @@ -97,11 +99,21 @@ def test_classify_legacy_input_job() -> None: assert _classify_legacy_input("ckpt=job_dir:job123:out/best.pth") == "job" +def test_classify_legacy_input_job_new_syntax() -> None: + """A new-style 'job_id:path' value (colon before slash) is a job output.""" + assert _classify_legacy_input("ckpt=job123:outputs/best.pth") == "job" + + def test_classify_legacy_input_datastore() -> None: - """A slash in the right-hand side signals a datastore path.""" + """A slash before any colon signals a datastore path.""" assert _classify_legacy_input("ref=mystore/exports/reference") == "datastore" +def test_classify_legacy_input_datastore_with_colon_in_folder() -> None: + """A colon after the first slash is part of the folder, not a job id.""" + assert _classify_legacy_input("ref=mystore/a:b/c") == "datastore" + + def test_classify_legacy_input_asset() -> None: """A plain name[:version] is classified as a data asset.""" assert _classify_legacy_input("data=MY-DATASET:2") == "asset" @@ -140,6 +152,16 @@ def test_input_from_job() -> None: assert value.mode == InputOutputModes.DOWNLOAD +def test_input_from_asset_missing_version_reports_latest() -> None: + """When no version is given, the failure message mentions 'latest'.""" + client = Mock() + client.data.get.side_effect = MlException( + message="boom", no_personal_data_message="boom" + ) + with pytest.raises(ValueError, match='version "latest"'): + _input_from_asset(client, "data=MY-DATASET", InputOutputModes.MOUNT) + + # --------------------------------------------------------------------------- # output builders # --------------------------------------------------------------------------- @@ -230,6 +252,17 @@ def test_build_command_inputs_legacy_job_routes() -> None: client.data.get.assert_not_called() +def test_build_command_inputs_legacy_job_new_syntax_routes() -> None: + """Legacy values using the new 'job_id:path' form route to the job builder.""" + client = Mock() + inputs = build_command_inputs( + client, + legacy_mount=["ckpt=my_job_123:models/best.pth"], + ) + assert "ExperimentRun/dcid.my_job_123/models/best.pth" in inputs["ckpt"].path + client.data.get.assert_not_called() + + def test_build_command_inputs_legacy_asset_calls_client() -> None: """Legacy --mount asset strings still resolve through the client.""" client = Mock() From efc9ba7b2e3bf3151c5f2c7629aabfaef3dd9ea7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?= Date: Tue, 9 Jun 2026 16:47:58 +0100 Subject: [PATCH 03/16] Address Copilot review: document datastore form in deprecated flag help The deprecated --mount/--download flags route datastore-folder values via legacy parsing, so their help text now mentions the 'alias=datastore/folder' form alongside the dataset and job-output forms. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/submit_aml/__main__.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/submit_aml/__main__.py b/src/submit_aml/__main__.py index 907eb25..ed171b3 100644 --- a/src/submit_aml/__main__.py +++ b/src/submit_aml/__main__.py @@ -142,12 +142,13 @@ def submit( "-d", help=( "[DEPRECATED] Use --download-asset, --download-datastore or" - " --download-job instead. Azure ML dataset or job output folder to" - " download. To download an Azure ML dataset, the argument should take" - " the form: alias, name and version of the dataset; for example:" - " 'vindr_dir=VINDR-CXR-V2:1'. If the version is omitted, the last one" - " will be used. To download the output folder of a previous job, the" - " argument should take the form" + " --download-job instead. Azure ML dataset, datastore folder or job" + " output folder to download. To download an Azure ML dataset, the" + " argument should take the form: alias, name and version of the" + " dataset; for example: 'vindr_dir=VINDR-CXR-V2:1'. If the version is" + " omitted, the last one will be used. To download a datastore folder," + " use 'alias=datastore/folder'. To download the output folder of a" + " previous job, the argument should take the form" " 'alias=job_dir::'; for example:" " 'checkpoint=job_dir:crusty_hat_43s6lmvb25:outputs/checkpoint-10000'." " The alias can be used to pass input datasets to the script, e.g.," @@ -162,9 +163,10 @@ def submit( "-m", help=( "[DEPRECATED] Use --mount-asset, --mount-datastore or --mount-job" - " instead. Azure ML dataset or job output folder to mount." - " For an Azure ML dataset, the alias, name and version should be provided" - " while for a job output folder, the alias, job ID and path in the job" + " instead. Azure ML dataset, datastore folder or job output folder to" + " mount. For an Azure ML dataset, the alias, name and version should be" + " provided; for a datastore folder, use 'alias=datastore/folder'; while" + " for a job output folder, the alias, job ID and path in the job" " outputs should be provided. See the --download option for more" " information." ), From 97af6ac7406c05642d9e235b851df36d17d2493c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?= Date: Tue, 9 Jun 2026 16:54:38 +0100 Subject: [PATCH 04/16] Address Copilot review: reject legacy job_dir: prefix; clarify job path help - The new --mount-job/--download-job flags now reject values that still carry the legacy "job_dir:" prefix with a clear error, instead of silently building an invalid ExperimentRun/dcid.job_dir/... URI. (Legacy --mount/--download still accept and translate the prefix.) - Reworded the job help/example to make clear the path may point at any run artifact, not only files under outputs/. - Added a test for the rejected legacy prefix. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/submit_aml/__main__.py | 9 +++++---- src/submit_aml/data.py | 8 ++++++++ tests/test_data.py | 6 ++++++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/src/submit_aml/__main__.py b/src/submit_aml/__main__.py index ed171b3..bf28111 100644 --- a/src/submit_aml/__main__.py +++ b/src/submit_aml/__main__.py @@ -218,9 +218,10 @@ def submit( None, "--mount-job", help=( - "Output folder of a previous job to mount, expressed as" - ' "alias=:".' - ' For example: "checkpoint=crusty_hat_43s6lmvb25:outputs/best.pth".' + "Output of a previous job to mount, expressed as" + ' "alias=:". The path may point at any' + " run artifact, not just files under outputs/." + ' For example: "checkpoint=crusty_hat_43s6lmvb25:models/best.pth".' r" Pass it to the script with '${{inputs.checkpoint}}'." " This option can be used multiple times." ), @@ -230,7 +231,7 @@ def submit( None, "--download-job", help=( - "Output folder of a previous job to download. Same format as" + "Output of a previous job to download. Same format as" " --mount-job. This option can be used multiple times." ), rich_help_panel=PANEL_DATA, diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py index 5948836..7a7225b 100644 --- a/src/submit_aml/data.py +++ b/src/submit_aml/data.py @@ -117,6 +117,14 @@ def _extract_alias_job_path(string: str) -> tuple[str, str, str]: ) logger.error(message) sys.exit(1) + if match.group("job_id") == "job_dir": + message = ( + f'Invalid job output string: "{string}".' + ' The "job_dir:" prefix is no longer used with the job flags;' + ' use "alias=job_id:path" instead.' + ) + logger.error(message) + sys.exit(1) return match.group("alias"), match.group("job_id"), match.group("path") diff --git a/tests/test_data.py b/tests/test_data.py index db1d602..bf1e8ad 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -89,6 +89,12 @@ def test_extract_alias_job_path_invalid_exits() -> None: _extract_alias_job_path("bad_format") +def test_extract_alias_job_path_rejects_legacy_prefix() -> None: + """The legacy 'job_dir:' prefix is rejected on the new job flags.""" + with pytest.raises(SystemExit): + _extract_alias_job_path("ckpt=job_dir:my_job_123:models/best.pth") + + # --------------------------------------------------------------------------- # _classify_legacy_input # --------------------------------------------------------------------------- From 8f1eeca181fff3df74e4123d6592f66ef30faa00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?= Date: Tue, 9 Jun 2026 17:02:29 +0100 Subject: [PATCH 05/16] Address Copilot review: name Python params in deprecation warnings The deprecation warnings now reference both the CLI flag and the equivalent Python parameter (e.g. "--mount (datasets_mount)"), so the message is accurate whether the deprecated path was reached via the CLI or the submit_to_aml API. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/submit_aml/data.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py index 7a7225b..f224d01 100644 --- a/src/submit_aml/data.py +++ b/src/submit_aml/data.py @@ -364,14 +364,19 @@ def build_command_inputs( inputs[alias] = value if legacy_mount: - _warn_deprecated("--mount", "--mount-asset, --mount-datastore or --mount-job") + _warn_deprecated( + "--mount (datasets_mount)", + "--mount-asset, --mount-datastore or --mount-job" + " (mount_asset, mount_datastore or mount_job)", + ) for string in legacy_mount: alias, value = _legacy_input(ml_client, string, InputOutputModes.MOUNT) inputs[alias] = value if legacy_download: _warn_deprecated( - "--download", - "--download-asset, --download-datastore or --download-job", + "--download (datasets_download)", + "--download-asset, --download-datastore or --download-job" + " (download_asset, download_datastore or download_job)", ) for string in legacy_download: alias, value = _legacy_input(ml_client, string, InputOutputModes.DOWNLOAD) @@ -407,7 +412,10 @@ def build_command_outputs( outputs[alias] = value if legacy_output: - _warn_deprecated("--output", "--output-datastore or --output-asset") + _warn_deprecated( + "--output (datasets_output)", + "--output-datastore or --output-asset (output_datastore or output_asset)", + ) for string in legacy_output: alias, value = _output_from_datastore(string) outputs[alias] = value From 11fcd47381fa6c64f360c8b2ddc0309c2804b4c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?= Date: Tue, 9 Jun 2026 17:38:35 +0100 Subject: [PATCH 06/16] Document removal plan for deprecated data flags Add a comment by _warn_deprecated outlining the phased, semver-aligned removal of the legacy --mount/--download/--output flags (keep visible and warning through 1.x, remove in 2.0.0). --- src/submit_aml/data.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py index f224d01..293f1f9 100644 --- a/src/submit_aml/data.py +++ b/src/submit_aml/data.py @@ -235,6 +235,28 @@ def _output_from_asset(string: str) -> tuple[str, Output]: return alias, output +# Removal plan for the deprecated data flags (--mount/-m, --download/-d, +# --output/-o), superseded by the explicit-source flags (--{mount,download}- +# {asset,datastore,job} and --output-{datastore,asset}): +# +# 1. Now (1.x): both flag sets work. The legacy flags carry a [DEPRECATED] +# marker in --help and emit `_warn_deprecated` at runtime. This is the +# grace period in which users migrate. +# 2. Before removal: once downstream callers have migrated (grep the known +# consumer repos / run scripts for `--mount`, `--download`, `--output`, +# `-m `, `-d `, `-o ` and the `datasets_{mount,download,output}` kwargs of +# `submit_to_aml`), and the deprecation has shipped in at least one +# tagged release, schedule removal for the next MAJOR version (2.0.0) per +# semver, since dropping a CLI flag is a breaking change. +# 3. At removal (2.0.0): delete the `datasets_download`/`datasets_mount`/ +# `output` typer.Options in `__main__.py`, drop the matching +# `submit_to_aml` parameters and the `legacy_*` branches in +# `add_inputs`/`add_outputs`, delete the `_legacy_*` helpers and +# `_warn_deprecated`, and note the breaking change in the changelog. +# +# Until step 3, keep the legacy flags VISIBLE in --help (the [DEPRECATED] +# marker is how users discover the migration path); only hide them as an +# optional last step in a release immediately preceding removal. def _warn_deprecated(old_flag: str, new_flags: str) -> None: """Emit a deprecation warning pointing users to the new flags. From fc34836308de4b8305971a987dba72b6d65ae830 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?= Date: Tue, 9 Jun 2026 19:56:01 +0100 Subject: [PATCH 07/16] Make legacy data-flag deprecation warnings source-specific Previously --mount/--download/--output emitted one generic warning per flag listing all replacements. Now each legacy value is classified and gets a tailored message naming the exact replacement flag and echoing the corrected invocation, e.g. '--mount my_alias=data_asset' -> '--mount-asset my_alias=data_asset'. Job values additionally show the translated form (the redundant job_dir: prefix dropped). Replace _warn_deprecated with _warn_legacy_input/_warn_legacy_output; thread the flag base through _legacy_input; warn per value. Add tests asserting the per-source suggested flag and value. --- src/submit_aml/data.py | 85 ++++++++++++++++++++++++++++++------------ tests/test_data.py | 44 +++++++++++++++++++++- 2 files changed, 103 insertions(+), 26 deletions(-) diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py index 293f1f9..16175ff 100644 --- a/src/submit_aml/data.py +++ b/src/submit_aml/data.py @@ -251,22 +251,59 @@ def _output_from_asset(string: str) -> tuple[str, Output]: # 3. At removal (2.0.0): delete the `datasets_download`/`datasets_mount`/ # `output` typer.Options in `__main__.py`, drop the matching # `submit_to_aml` parameters and the `legacy_*` branches in -# `add_inputs`/`add_outputs`, delete the `_legacy_*` helpers and -# `_warn_deprecated`, and note the breaking change in the changelog. +# `add_inputs`/`add_outputs`, delete the `_legacy_*` helpers and the +# `_warn_legacy_*` warning helpers, and note the breaking change in the +# changelog. # # Until step 3, keep the legacy flags VISIBLE in --help (the [DEPRECATED] # marker is how users discover the migration path); only hide them as an # optional last step in a release immediately preceding removal. -def _warn_deprecated(old_flag: str, new_flags: str) -> None: - """Emit a deprecation warning pointing users to the new flags. + +# Replacement input flag (CLI, Python parameter) for each legacy input flag +# base and classified source type, used to tailor the deprecation warning. +_LEGACY_INPUT_FLAGS = { + "mount": ("--mount", "datasets_mount"), + "download": ("--download", "datasets_download"), +} + + +def _warn_legacy_input( + flag_base: str, + source: str, + old_value: str, + new_value: str, +) -> None: + """Warn that a legacy input flag is deprecated, naming the exact fix. + + Args: + flag_base: The legacy flag base, either `'mount'` or `'download'`. + source: The classified source type (`'asset'`, `'datastore'`, or + `'job'`), used to pick the per-source replacement flag. + old_value: The legacy `alias=value` string the user passed. + new_value: The value to use with the replacement flag (identical to + `old_value` except for job values, which drop the `job_dir:` + prefix). + """ + old_cli, old_param = _LEGACY_INPUT_FLAGS[flag_base] + new_cli = f"--{flag_base}-{source}" + new_param = f"{flag_base}_{source}" + logger.warning( + f"{old_cli} ({old_param}) is deprecated and will be removed in a future" + f" release. Replace '{old_cli} {old_value}' with" + f" '{new_cli} {new_value}' ({new_param}).", + ) + + +def _warn_legacy_output(old_value: str) -> None: + """Warn that a legacy `--output` value is deprecated, naming the exact fix. Args: - old_flag: The deprecated flag (e.g. `'--mount'`). - new_flags: Human-readable list of replacement flags. + old_value: The legacy `alias=datastore/folder` string the user passed. """ logger.warning( - f"{old_flag} is deprecated and will be removed in a future release." - f" Use {new_flags} instead.", + "--output (datasets_output) is deprecated and will be removed in a" + f" future release. Replace '--output {old_value}' with" + f" '--output-datastore {old_value}' (output_datastore).", ) @@ -312,13 +349,19 @@ def _legacy_input( ml_client: MLClient, string: str, mode: str, + flag_base: str, ) -> tuple[str, Input]: """Route a legacy `--mount`/`--download` value to the right builder. + Emits a deprecation warning naming the exact replacement flag for the + value's classified source type. + Args: ml_client: Client used to resolve data assets. string: A legacy dataset string. mode: Either `InputOutputModes.DOWNLOAD` or `InputOutputModes.MOUNT`. + flag_base: The legacy flag base, either `'mount'` or `'download'`, + used to tailor the deprecation warning. Returns: Tuple of alias and the resulting `Input`. @@ -327,9 +370,12 @@ def _legacy_input( if source == "job": # Translate the old "alias=job_dir::" form to the new one. translated = string.replace("=job_dir:", "=", 1) + _warn_legacy_input(flag_base, "job", string, translated) return _input_from_job(translated, mode) if source == "datastore": + _warn_legacy_input(flag_base, "datastore", string, string) return _input_from_datastore(string, mode) + _warn_legacy_input(flag_base, "asset", string, string) return _input_from_asset(ml_client, string, mode) @@ -386,22 +432,16 @@ def build_command_inputs( inputs[alias] = value if legacy_mount: - _warn_deprecated( - "--mount (datasets_mount)", - "--mount-asset, --mount-datastore or --mount-job" - " (mount_asset, mount_datastore or mount_job)", - ) for string in legacy_mount: - alias, value = _legacy_input(ml_client, string, InputOutputModes.MOUNT) + alias, value = _legacy_input( + ml_client, string, InputOutputModes.MOUNT, "mount" + ) inputs[alias] = value if legacy_download: - _warn_deprecated( - "--download (datasets_download)", - "--download-asset, --download-datastore or --download-job" - " (download_asset, download_datastore or download_job)", - ) for string in legacy_download: - alias, value = _legacy_input(ml_client, string, InputOutputModes.DOWNLOAD) + alias, value = _legacy_input( + ml_client, string, InputOutputModes.DOWNLOAD, "download" + ) inputs[alias] = value return inputs @@ -434,11 +474,8 @@ def build_command_outputs( outputs[alias] = value if legacy_output: - _warn_deprecated( - "--output (datasets_output)", - "--output-datastore or --output-asset (output_datastore or output_asset)", - ) for string in legacy_output: + _warn_legacy_output(string) alias, value = _output_from_datastore(string) outputs[alias] = value diff --git a/tests/test_data.py b/tests/test_data.py index bf1e8ad..30bc2c9 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -277,6 +277,44 @@ def test_build_command_inputs_legacy_asset_calls_client() -> None: client.data.get.assert_called_once() +def test_build_command_inputs_legacy_asset_warns_mount_asset( + capsys: pytest.CaptureFixture[str], +) -> None: + """A legacy --mount asset value is told to use --mount-asset specifically.""" + client = Mock() + client.data.get.return_value = Mock(id="azureml:resolved-asset:1") + build_command_inputs(client, legacy_mount=["my_alias=data_asset"]) + message = " ".join(capsys.readouterr().out.split()) + assert "--mount-asset my_alias=data_asset" in message + assert "--mount-datastore" not in message + assert "--mount-job" not in message + + +def test_build_command_inputs_legacy_datastore_warns_mount_datastore( + capsys: pytest.CaptureFixture[str], +) -> None: + """A legacy --mount datastore value is told to use --mount-datastore.""" + client = Mock() + build_command_inputs(client, legacy_mount=["ref=mystore/exports/reference"]) + message = " ".join(capsys.readouterr().out.split()) + assert "--mount-datastore ref=mystore/exports/reference" in message + + +def test_build_command_inputs_legacy_job_warns_download_job_translated( + capsys: pytest.CaptureFixture[str], +) -> None: + """A legacy --download job value is told to use --download-job, sans prefix.""" + client = Mock() + build_command_inputs( + client, + legacy_download=["ckpt=job_dir:my_job_123:models/best.pth"], + ) + message = " ".join(capsys.readouterr().out.split()) + assert "--download-job ckpt=my_job_123:models/best.pth" in message + # The suggested replacement (after "with") drops the legacy job_dir: prefix. + assert "job_dir:" not in message.split("with", 1)[1] + + # --------------------------------------------------------------------------- # build_command_outputs # --------------------------------------------------------------------------- @@ -303,7 +341,9 @@ def test_build_command_outputs_datastore_and_asset() -> None: def test_build_command_outputs_legacy_warns( capsys: pytest.CaptureFixture[str], ) -> None: - """Legacy --output strings are built and emit a deprecation warning.""" + """Legacy --output strings are built and emit a targeted deprecation warning.""" outputs = build_command_outputs(legacy_output=["out_dir=mydatastore/my_dataset"]) assert "out_dir" in outputs - assert "deprecated" in capsys.readouterr().out.lower() + message = " ".join(capsys.readouterr().out.split()) + assert "deprecated" in message.lower() + assert "--output-datastore out_dir=mydatastore/my_dataset" in message From 56aeb96d57b15ea4c0b704b7da2d90cec5766820 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?= Date: Tue, 9 Jun 2026 19:59:22 +0100 Subject: [PATCH 08/16] Emit DeprecationWarning for legacy data flags via the Python API The legacy-flag deprecation was previously only a loguru log line, which serves CLI users but is invisible to callers of the library API. Also emit a real warnings.warn(..., DeprecationWarning) from the warning helpers so programmatic users of submit_to_aml / build_command_inputs / build_command_outputs get the standard, tooling-visible signal. Pass a stacklevel so the warning is blamed on the API caller, not the helper. Add tests asserting the DeprecationWarning category for legacy inputs and outputs. --- src/submit_aml/data.py | 40 +++++++++++++++++++++++++++++++--------- tests/test_data.py | 13 +++++++++++++ 2 files changed, 44 insertions(+), 9 deletions(-) diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py index 16175ff..36e2c49 100644 --- a/src/submit_aml/data.py +++ b/src/submit_aml/data.py @@ -2,6 +2,7 @@ import re import sys +import warnings from azure.ai.ml import Input from azure.ai.ml import MLClient @@ -272,9 +273,14 @@ def _warn_legacy_input( source: str, old_value: str, new_value: str, + stacklevel: int = 2, ) -> None: """Warn that a legacy input flag is deprecated, naming the exact fix. + Emits both a human-facing log line (for CLI users) and a Python + `DeprecationWarning` (for callers of the library API, e.g. `submit_to_aml` + or `build_command_inputs`). + Args: flag_base: The legacy flag base, either `'mount'` or `'download'`. source: The classified source type (`'asset'`, `'datastore'`, or @@ -283,28 +289,40 @@ def _warn_legacy_input( new_value: The value to use with the replacement flag (identical to `old_value` except for job values, which drop the `job_dir:` prefix). + stacklevel: Stack level for the `DeprecationWarning`, so it points at + the API caller rather than this helper. """ old_cli, old_param = _LEGACY_INPUT_FLAGS[flag_base] new_cli = f"--{flag_base}-{source}" new_param = f"{flag_base}_{source}" - logger.warning( + message = ( f"{old_cli} ({old_param}) is deprecated and will be removed in a future" f" release. Replace '{old_cli} {old_value}' with" - f" '{new_cli} {new_value}' ({new_param}).", + f" '{new_cli} {new_value}' ({new_param})." ) + logger.warning(message) + warnings.warn(message, DeprecationWarning, stacklevel=stacklevel) -def _warn_legacy_output(old_value: str) -> None: +def _warn_legacy_output(old_value: str, stacklevel: int = 2) -> None: """Warn that a legacy `--output` value is deprecated, naming the exact fix. + Emits both a human-facing log line (for CLI users) and a Python + `DeprecationWarning` (for callers of the library API, e.g. `submit_to_aml` + or `build_command_outputs`). + Args: old_value: The legacy `alias=datastore/folder` string the user passed. + stacklevel: Stack level for the `DeprecationWarning`, so it points at + the API caller rather than this helper. """ - logger.warning( + message = ( "--output (datasets_output) is deprecated and will be removed in a" f" future release. Replace '--output {old_value}' with" - f" '--output-datastore {old_value}' (output_datastore).", + f" '--output-datastore {old_value}' (output_datastore)." ) + logger.warning(message) + warnings.warn(message, DeprecationWarning, stacklevel=stacklevel) def _classify_legacy_input(string: str) -> str: @@ -367,15 +385,17 @@ def _legacy_input( Tuple of alias and the resulting `Input`. """ source = _classify_legacy_input(string) + # stacklevel=4: warnings.warn -> _warn_legacy_input -> _legacy_input -> + # build_command_inputs, so the DeprecationWarning points at the API caller. if source == "job": # Translate the old "alias=job_dir::" form to the new one. translated = string.replace("=job_dir:", "=", 1) - _warn_legacy_input(flag_base, "job", string, translated) + _warn_legacy_input(flag_base, "job", string, translated, stacklevel=4) return _input_from_job(translated, mode) if source == "datastore": - _warn_legacy_input(flag_base, "datastore", string, string) + _warn_legacy_input(flag_base, "datastore", string, string, stacklevel=4) return _input_from_datastore(string, mode) - _warn_legacy_input(flag_base, "asset", string, string) + _warn_legacy_input(flag_base, "asset", string, string, stacklevel=4) return _input_from_asset(ml_client, string, mode) @@ -475,7 +495,9 @@ def build_command_outputs( if legacy_output: for string in legacy_output: - _warn_legacy_output(string) + # stacklevel=3: warnings.warn -> _warn_legacy_output -> + # build_command_outputs, pointing at the API caller. + _warn_legacy_output(string, stacklevel=3) alias, value = _output_from_datastore(string) outputs[alias] = value diff --git a/tests/test_data.py b/tests/test_data.py index 30bc2c9..a39b7ee 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -269,6 +269,13 @@ def test_build_command_inputs_legacy_job_new_syntax_routes() -> None: client.data.get.assert_not_called() +def test_build_command_inputs_legacy_raises_deprecation_warning() -> None: + """The Python API raises a DeprecationWarning for legacy input values.""" + client = Mock() + with pytest.warns(DeprecationWarning, match=r"--mount-datastore"): + build_command_inputs(client, legacy_mount=["ref=mystore/exports/reference"]) + + def test_build_command_inputs_legacy_asset_calls_client() -> None: """Legacy --mount asset strings still resolve through the client.""" client = Mock() @@ -347,3 +354,9 @@ def test_build_command_outputs_legacy_warns( message = " ".join(capsys.readouterr().out.split()) assert "deprecated" in message.lower() assert "--output-datastore out_dir=mydatastore/my_dataset" in message + + +def test_build_command_outputs_legacy_raises_deprecation_warning() -> None: + """The Python API raises a DeprecationWarning for legacy output values.""" + with pytest.warns(DeprecationWarning, match=r"--output-datastore"): + build_command_outputs(legacy_output=["out_dir=mydatastore/my_dataset"]) From c351648ea5680c47295025601631771317826d72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?= Date: Tue, 9 Jun 2026 20:22:25 +0100 Subject: [PATCH 09/16] Separate CLI vs Python API deprecation messages CLI users were shown Python parameter names (e.g. '(datasets_mount)') in the deprecation log line, which is irrelevant to them. Split the messaging: the loguru log (CLI-facing) now names only the flags, while the DeprecationWarning (Python-API-facing) names only the parameters. Update tests to assert each audience sees only its own surface. --- src/submit_aml/data.py | 39 ++++++++++++++++++++++++--------------- tests/test_data.py | 14 ++++++++++---- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py index 36e2c49..3d15f8a 100644 --- a/src/submit_aml/data.py +++ b/src/submit_aml/data.py @@ -295,34 +295,43 @@ def _warn_legacy_input( old_cli, old_param = _LEGACY_INPUT_FLAGS[flag_base] new_cli = f"--{flag_base}-{source}" new_param = f"{flag_base}_{source}" - message = ( - f"{old_cli} ({old_param}) is deprecated and will be removed in a future" - f" release. Replace '{old_cli} {old_value}' with" - f" '{new_cli} {new_value}' ({new_param})." + # CLI users see only flags; Python API users see only parameter names. + cli_message = ( + f"{old_cli} is deprecated and will be removed in a future release." + f" Replace '{old_cli} {old_value}' with '{new_cli} {new_value}'." + ) + api_message = ( + f"The '{old_param}' parameter is deprecated and will be removed in a" + f" future release. Pass {new_value!r} to '{new_param}' instead." ) - logger.warning(message) - warnings.warn(message, DeprecationWarning, stacklevel=stacklevel) + logger.warning(cli_message) + warnings.warn(api_message, DeprecationWarning, stacklevel=stacklevel) def _warn_legacy_output(old_value: str, stacklevel: int = 2) -> None: """Warn that a legacy `--output` value is deprecated, naming the exact fix. - Emits both a human-facing log line (for CLI users) and a Python - `DeprecationWarning` (for callers of the library API, e.g. `submit_to_aml` - or `build_command_outputs`). + Emits a human-facing log line naming the replacement CLI flag (for CLI + users) and a Python `DeprecationWarning` naming the replacement parameter + (for callers of the library API, e.g. `submit_to_aml` or + `build_command_outputs`). Args: old_value: The legacy `alias=datastore/folder` string the user passed. stacklevel: Stack level for the `DeprecationWarning`, so it points at the API caller rather than this helper. """ - message = ( - "--output (datasets_output) is deprecated and will be removed in a" - f" future release. Replace '--output {old_value}' with" - f" '--output-datastore {old_value}' (output_datastore)." + # CLI users see only flags; Python API users see only parameter names. + cli_message = ( + "--output is deprecated and will be removed in a future release." + f" Replace '--output {old_value}' with '--output-datastore {old_value}'." + ) + api_message = ( + "The 'datasets_output' parameter is deprecated and will be removed in a" + f" future release. Pass {old_value!r} to 'output_datastore' instead." ) - logger.warning(message) - warnings.warn(message, DeprecationWarning, stacklevel=stacklevel) + logger.warning(cli_message) + warnings.warn(api_message, DeprecationWarning, stacklevel=stacklevel) def _classify_legacy_input(string: str) -> str: diff --git a/tests/test_data.py b/tests/test_data.py index a39b7ee..8b4a927 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -270,10 +270,12 @@ def test_build_command_inputs_legacy_job_new_syntax_routes() -> None: def test_build_command_inputs_legacy_raises_deprecation_warning() -> None: - """The Python API raises a DeprecationWarning for legacy input values.""" + """The Python API raises a DeprecationWarning naming the new parameter.""" client = Mock() - with pytest.warns(DeprecationWarning, match=r"--mount-datastore"): + with pytest.warns(DeprecationWarning, match=r"mount_datastore") as record: build_command_inputs(client, legacy_mount=["ref=mystore/exports/reference"]) + # The Python-facing warning references parameters, not CLI flags. + assert "--mount" not in str(record[0].message) def test_build_command_inputs_legacy_asset_calls_client() -> None: @@ -295,6 +297,9 @@ def test_build_command_inputs_legacy_asset_warns_mount_asset( assert "--mount-asset my_alias=data_asset" in message assert "--mount-datastore" not in message assert "--mount-job" not in message + # CLI users should not see Python parameter names. + assert "mount_asset" not in message + assert "datasets_mount" not in message def test_build_command_inputs_legacy_datastore_warns_mount_datastore( @@ -357,6 +362,7 @@ def test_build_command_outputs_legacy_warns( def test_build_command_outputs_legacy_raises_deprecation_warning() -> None: - """The Python API raises a DeprecationWarning for legacy output values.""" - with pytest.warns(DeprecationWarning, match=r"--output-datastore"): + """The Python API raises a DeprecationWarning naming the new parameter.""" + with pytest.warns(DeprecationWarning, match=r"output_datastore") as record: build_command_outputs(legacy_output=["out_dir=mydatastore/my_dataset"]) + assert "--output" not in str(record[0].message) From 6f87f25fc149a4c9c732d201994958b27b54f6ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?= Date: Tue, 9 Jun 2026 20:27:56 +0100 Subject: [PATCH 10/16] Fix stale _warn_deprecated reference in removal-plan comment Phase 1 of the comment still named the old _warn_deprecated helper; point it at _warn_legacy_input / _warn_legacy_output instead. --- src/submit_aml/data.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py index 3d15f8a..eda6911 100644 --- a/src/submit_aml/data.py +++ b/src/submit_aml/data.py @@ -241,8 +241,9 @@ def _output_from_asset(string: str) -> tuple[str, Output]: # {asset,datastore,job} and --output-{datastore,asset}): # # 1. Now (1.x): both flag sets work. The legacy flags carry a [DEPRECATED] -# marker in --help and emit `_warn_deprecated` at runtime. This is the -# grace period in which users migrate. +# marker in --help and emit a deprecation warning at runtime (via +# `_warn_legacy_input` / `_warn_legacy_output`). This is the grace period +# in which users migrate. # 2. Before removal: once downstream callers have migrated (grep the known # consumer repos / run scripts for `--mount`, `--download`, `--output`, # `-m `, `-d `, `-o ` and the `datasets_{mount,download,output}` kwargs of From ec4768541c68ed291ea5176aec8afcdf01e0cf7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?= Date: Tue, 9 Jun 2026 20:32:59 +0100 Subject: [PATCH 11/16] Show one-element list in Python API deprecation warnings The replacement parameters are list[str]; the warning previously suggested passing a bare string, which would iterate over characters. Show the value wrapped in a one-element list (e.g. ['ref=...']). --- src/submit_aml/data.py | 4 ++-- tests/test_data.py | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py index eda6911..b85f21e 100644 --- a/src/submit_aml/data.py +++ b/src/submit_aml/data.py @@ -303,7 +303,7 @@ def _warn_legacy_input( ) api_message = ( f"The '{old_param}' parameter is deprecated and will be removed in a" - f" future release. Pass {new_value!r} to '{new_param}' instead." + f" future release. Pass [{new_value!r}] to '{new_param}' instead." ) logger.warning(cli_message) warnings.warn(api_message, DeprecationWarning, stacklevel=stacklevel) @@ -329,7 +329,7 @@ def _warn_legacy_output(old_value: str, stacklevel: int = 2) -> None: ) api_message = ( "The 'datasets_output' parameter is deprecated and will be removed in a" - f" future release. Pass {old_value!r} to 'output_datastore' instead." + f" future release. Pass [{old_value!r}] to 'output_datastore' instead." ) logger.warning(cli_message) warnings.warn(api_message, DeprecationWarning, stacklevel=stacklevel) diff --git a/tests/test_data.py b/tests/test_data.py index 8b4a927..3a78e47 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -274,8 +274,11 @@ def test_build_command_inputs_legacy_raises_deprecation_warning() -> None: client = Mock() with pytest.warns(DeprecationWarning, match=r"mount_datastore") as record: build_command_inputs(client, legacy_mount=["ref=mystore/exports/reference"]) - # The Python-facing warning references parameters, not CLI flags. - assert "--mount" not in str(record[0].message) + # The Python-facing warning references parameters, not CLI flags, and shows + # a one-element list (the parameters are list[str]). + message = str(record[0].message) + assert "--mount" not in message + assert "['ref=mystore/exports/reference']" in message def test_build_command_inputs_legacy_asset_calls_client() -> None: @@ -365,4 +368,6 @@ def test_build_command_outputs_legacy_raises_deprecation_warning() -> None: """The Python API raises a DeprecationWarning naming the new parameter.""" with pytest.warns(DeprecationWarning, match=r"output_datastore") as record: build_command_outputs(legacy_output=["out_dir=mydatastore/my_dataset"]) - assert "--output" not in str(record[0].message) + message = str(record[0].message) + assert "--output" not in message + assert "['out_dir=mydatastore/my_dataset']" in message From 5621096fd7838e411fbe5b5aaa451780d55147b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?= Date: Tue, 9 Jun 2026 20:39:46 +0100 Subject: [PATCH 12/16] Preserve mount-over-download precedence for duplicate aliases build_command_inputs builds a single dict where later writes win. The new explicit/legacy flags processed mounts before downloads, so a duplicate alias resolved to download mode -- reversing main's {**downloads, **mounts} precedence where mount won. Process downloads first and mounts second (and legacy after explicit) so mount keeps precedence. Add regression tests. --- src/submit_aml/data.py | 36 +++++++++++++++++++++--------------- tests/test_data.py | 22 ++++++++++++++++++++++ 2 files changed, 43 insertions(+), 15 deletions(-) diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py index b85f21e..454e838 100644 --- a/src/submit_aml/data.py +++ b/src/submit_aml/data.py @@ -437,42 +437,48 @@ def build_command_inputs( Returns: Dictionary of `alias: Input` mappings. + + When the same alias appears under both a download and a mount flag, the + mount wins: downloads are applied first and mounts overwrite them. This + preserves the historical `{**downloads, **mounts}` precedence. """ inputs: TypeInputsDict = {} - for string in mount_asset or []: - alias, value = _input_from_asset(ml_client, string, InputOutputModes.MOUNT) - inputs[alias] = value + # Downloads first, then mounts, so that mount takes precedence when the same + # alias is supplied under both modes (and likewise legacy after explicit). for string in download_asset or []: alias, value = _input_from_asset(ml_client, string, InputOutputModes.DOWNLOAD) inputs[alias] = value - - for string in mount_datastore or []: - alias, value = _input_from_datastore(string, InputOutputModes.MOUNT) + for string in mount_asset or []: + alias, value = _input_from_asset(ml_client, string, InputOutputModes.MOUNT) inputs[alias] = value + for string in download_datastore or []: alias, value = _input_from_datastore(string, InputOutputModes.DOWNLOAD) inputs[alias] = value - - for string in mount_job or []: - alias, value = _input_from_job(string, InputOutputModes.MOUNT) + for string in mount_datastore or []: + alias, value = _input_from_datastore(string, InputOutputModes.MOUNT) inputs[alias] = value + for string in download_job or []: alias, value = _input_from_job(string, InputOutputModes.DOWNLOAD) inputs[alias] = value + for string in mount_job or []: + alias, value = _input_from_job(string, InputOutputModes.MOUNT) + inputs[alias] = value - if legacy_mount: - for string in legacy_mount: - alias, value = _legacy_input( - ml_client, string, InputOutputModes.MOUNT, "mount" - ) - inputs[alias] = value if legacy_download: for string in legacy_download: alias, value = _legacy_input( ml_client, string, InputOutputModes.DOWNLOAD, "download" ) inputs[alias] = value + if legacy_mount: + for string in legacy_mount: + alias, value = _legacy_input( + ml_client, string, InputOutputModes.MOUNT, "mount" + ) + inputs[alias] = value return inputs diff --git a/tests/test_data.py b/tests/test_data.py index 3a78e47..8e0b4c6 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -289,6 +289,28 @@ def test_build_command_inputs_legacy_asset_calls_client() -> None: client.data.get.assert_called_once() +def test_build_command_inputs_mount_overrides_download_for_duplicate_alias() -> None: + """When an alias is given under both modes, mount wins (legacy precedence).""" + client = Mock() + inputs = build_command_inputs( + client, + download_datastore=["ref=mystore/exports/reference"], + mount_datastore=["ref=mystore/exports/reference"], + ) + assert inputs["ref"].mode == InputOutputModes.MOUNT + + +def test_build_command_inputs_legacy_mount_overrides_legacy_download() -> None: + """A colliding alias across legacy mount/download resolves to mount.""" + client = Mock() + inputs = build_command_inputs( + client, + legacy_download=["ref=mystore/exports/reference"], + legacy_mount=["ref=mystore/exports/reference"], + ) + assert inputs["ref"].mode == InputOutputModes.MOUNT + + def test_build_command_inputs_legacy_asset_warns_mount_asset( capsys: pytest.CaptureFixture[str], ) -> None: From e988b1c9741e96b00b415f80d0c54c26b1cb32d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?= Date: Tue, 9 Jun 2026 21:11:55 +0100 Subject: [PATCH 13/16] Raise on duplicate input/output aliases An alias maps to a single ${{inputs.}} / ${{outputs.}} reference, so reusing one (across modes, source types, or flags) is a user error. Previously the colliding entry silently overwrote the earlier one. Add _assign_unique and route all input/output insertions through it, raising a clear ValueError naming the alias. This supersedes the earlier mount-over-download precedence (Copilot suggested duplicate-alias validation as the alternative). Add tests for cross-mode, same-mode, legacy, and output collisions. --- src/submit_aml/data.py | 64 +++++++++++++++++++++++++++++++----------- tests/test_data.py | 51 ++++++++++++++++++++++----------- 2 files changed, 83 insertions(+), 32 deletions(-) diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py index 454e838..5f959de 100644 --- a/src/submit_aml/data.py +++ b/src/submit_aml/data.py @@ -409,6 +409,34 @@ def _legacy_input( return _input_from_asset(ml_client, string, mode) +def _assign_unique( + mapping: dict[str, object], + alias: str, + value: object, + *, + kind: str, +) -> None: + """Insert `alias -> value`, raising if the alias is already present. + + Each alias becomes a single `${{inputs.}}` / `${{outputs.}}` + reference, so it must be unique. Reusing an alias (across modes, source + types, or flags) is a user error rather than something to silently resolve. + + Args: + mapping: The inputs or outputs dict being built. + alias: The alias key to insert. + value: The `Input` or `Output` to store. + kind: Either `'input'` or `'output'`, used in the error message. + + Raises: + ValueError: If `alias` is already present in `mapping`. + """ + if alias in mapping: + msg = f"Duplicate {kind} alias {alias!r}: each alias must be unique." + raise ValueError(msg) + mapping[alias] = value + + def build_command_inputs( ml_client: MLClient, *, @@ -438,47 +466,46 @@ def build_command_inputs( Returns: Dictionary of `alias: Input` mappings. - When the same alias appears under both a download and a mount flag, the - mount wins: downloads are applied first and mounts overwrite them. This - preserves the historical `{**downloads, **mounts}` precedence. + Raises: + ValueError: If the same alias is used more than once across any of the + input flags (an alias maps to a single `${{inputs.}}` + reference, so it must be unique). """ inputs: TypeInputsDict = {} - # Downloads first, then mounts, so that mount takes precedence when the same - # alias is supplied under both modes (and likewise legacy after explicit). for string in download_asset or []: alias, value = _input_from_asset(ml_client, string, InputOutputModes.DOWNLOAD) - inputs[alias] = value + _assign_unique(inputs, alias, value, kind="input") for string in mount_asset or []: alias, value = _input_from_asset(ml_client, string, InputOutputModes.MOUNT) - inputs[alias] = value + _assign_unique(inputs, alias, value, kind="input") for string in download_datastore or []: alias, value = _input_from_datastore(string, InputOutputModes.DOWNLOAD) - inputs[alias] = value + _assign_unique(inputs, alias, value, kind="input") for string in mount_datastore or []: alias, value = _input_from_datastore(string, InputOutputModes.MOUNT) - inputs[alias] = value + _assign_unique(inputs, alias, value, kind="input") for string in download_job or []: alias, value = _input_from_job(string, InputOutputModes.DOWNLOAD) - inputs[alias] = value + _assign_unique(inputs, alias, value, kind="input") for string in mount_job or []: alias, value = _input_from_job(string, InputOutputModes.MOUNT) - inputs[alias] = value + _assign_unique(inputs, alias, value, kind="input") if legacy_download: for string in legacy_download: alias, value = _legacy_input( ml_client, string, InputOutputModes.DOWNLOAD, "download" ) - inputs[alias] = value + _assign_unique(inputs, alias, value, kind="input") if legacy_mount: for string in legacy_mount: alias, value = _legacy_input( ml_client, string, InputOutputModes.MOUNT, "mount" ) - inputs[alias] = value + _assign_unique(inputs, alias, value, kind="input") return inputs @@ -499,15 +526,20 @@ def build_command_outputs( Returns: Dictionary of `alias: Output` mappings. + + Raises: + ValueError: If the same alias is used more than once across any of the + output flags (an alias maps to a single `${{outputs.}}` + reference, so it must be unique). """ outputs: dict[str, Output] = {} for string in output_datastore or []: alias, value = _output_from_datastore(string) - outputs[alias] = value + _assign_unique(outputs, alias, value, kind="output") for string in output_asset or []: alias, value = _output_from_asset(string) - outputs[alias] = value + _assign_unique(outputs, alias, value, kind="output") if legacy_output: for string in legacy_output: @@ -515,6 +547,6 @@ def build_command_outputs( # build_command_outputs, pointing at the API caller. _warn_legacy_output(string, stacklevel=3) alias, value = _output_from_datastore(string) - outputs[alias] = value + _assign_unique(outputs, alias, value, kind="output") return outputs diff --git a/tests/test_data.py b/tests/test_data.py index 8e0b4c6..34f6f5e 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -289,26 +289,36 @@ def test_build_command_inputs_legacy_asset_calls_client() -> None: client.data.get.assert_called_once() -def test_build_command_inputs_mount_overrides_download_for_duplicate_alias() -> None: - """When an alias is given under both modes, mount wins (legacy precedence).""" +def test_build_command_inputs_duplicate_alias_across_modes_raises() -> None: + """The same alias under both download and mount is a hard error.""" client = Mock() - inputs = build_command_inputs( - client, - download_datastore=["ref=mystore/exports/reference"], - mount_datastore=["ref=mystore/exports/reference"], - ) - assert inputs["ref"].mode == InputOutputModes.MOUNT + with pytest.raises(ValueError, match=r"[Dd]uplicate.*alias.*ref"): + build_command_inputs( + client, + download_datastore=["ref=mystore/exports/reference"], + mount_datastore=["ref=mystore/exports/reference"], + ) -def test_build_command_inputs_legacy_mount_overrides_legacy_download() -> None: - """A colliding alias across legacy mount/download resolves to mount.""" +def test_build_command_inputs_duplicate_alias_same_mode_raises() -> None: + """A repeated alias within a single flag is also a hard error.""" client = Mock() - inputs = build_command_inputs( - client, - legacy_download=["ref=mystore/exports/reference"], - legacy_mount=["ref=mystore/exports/reference"], - ) - assert inputs["ref"].mode == InputOutputModes.MOUNT + with pytest.raises(ValueError, match=r"[Dd]uplicate.*alias.*ref"): + build_command_inputs( + client, + mount_datastore=["ref=mystore/a", "ref=mystore/b"], + ) + + +def test_build_command_inputs_legacy_duplicate_alias_raises() -> None: + """A colliding alias across legacy mount/download raises.""" + client = Mock() + with pytest.raises(ValueError, match=r"[Dd]uplicate.*alias.*ref"): + build_command_inputs( + client, + legacy_download=["ref=mystore/exports/reference"], + legacy_mount=["ref=mystore/exports/reference"], + ) def test_build_command_inputs_legacy_asset_warns_mount_asset( @@ -375,6 +385,15 @@ def test_build_command_outputs_datastore_and_asset() -> None: assert outputs["asset_dir"].version == "2" +def test_build_command_outputs_duplicate_alias_raises() -> None: + """The same alias under two output flags is a hard error.""" + with pytest.raises(ValueError, match=r"[Dd]uplicate.*alias.*out_dir"): + build_command_outputs( + output_datastore=["out_dir=mydatastore/my_dataset"], + output_asset=["out_dir=my-results:2"], + ) + + def test_build_command_outputs_legacy_warns( capsys: pytest.CaptureFixture[str], ) -> None: From 7e65dd44a7efb4ad7c97e00f5cde43f5dcc5c88a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?= Date: Tue, 9 Jun 2026 21:18:08 +0100 Subject: [PATCH 14/16] Fix removal-plan comment refs and document new job form in --download help - Removal-plan comment named non-existent add_inputs/add_outputs; point it at build_command_inputs/build_command_outputs and the legacy_* kwargs. - The deprecated --download help only showed the legacy job_dir: syntax; document the preferred 'alias=:' form (still accepting the old one). --- src/submit_aml/__main__.py | 7 ++++--- src/submit_aml/data.py | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/submit_aml/__main__.py b/src/submit_aml/__main__.py index bf28111..3ef228b 100644 --- a/src/submit_aml/__main__.py +++ b/src/submit_aml/__main__.py @@ -148,9 +148,10 @@ def submit( " dataset; for example: 'vindr_dir=VINDR-CXR-V2:1'. If the version is" " omitted, the last one will be used. To download a datastore folder," " use 'alias=datastore/folder'. To download the output folder of a" - " previous job, the argument should take the form" - " 'alias=job_dir::'; for example:" - " 'checkpoint=job_dir:crusty_hat_43s6lmvb25:outputs/checkpoint-10000'." + " previous job, the preferred form is 'alias=:' (the" + " same as --download-job); for example," + " 'checkpoint=crusty_hat_43s6lmvb25:outputs/checkpoint-10000'. The" + " older 'alias=job_dir::' form is also still accepted." " The alias can be used to pass input datasets to the script, e.g.," r" '${{inputs.vindr_dir}}' or '${{inputs.checkpoint}}'." " This option can be used multiple times." diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py index 5f959de..8d9fcfb 100644 --- a/src/submit_aml/data.py +++ b/src/submit_aml/data.py @@ -252,10 +252,10 @@ def _output_from_asset(string: str) -> tuple[str, Output]: # semver, since dropping a CLI flag is a breaking change. # 3. At removal (2.0.0): delete the `datasets_download`/`datasets_mount`/ # `output` typer.Options in `__main__.py`, drop the matching -# `submit_to_aml` parameters and the `legacy_*` branches in -# `add_inputs`/`add_outputs`, delete the `_legacy_*` helpers and the -# `_warn_legacy_*` warning helpers, and note the breaking change in the -# changelog. +# `submit_to_aml` parameters and the `legacy_*` keyword arguments and +# branches in `build_command_inputs`/`build_command_outputs`, delete the +# `_legacy_*` helpers and the `_warn_legacy_*` warning helpers, and note +# the breaking change in the changelog. # # Until step 3, keep the legacy flags VISIBLE in --help (the [DEPRECATED] # marker is how users discover the migration path); only hide them as an From 009aaefc3386818ced4b03935d812e3b5621512c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?= Date: Tue, 9 Jun 2026 23:52:24 +0100 Subject: [PATCH 15/16] Fix CI: generic _assign_unique type and ANSI-robust warning tests - ty failed because dict is invariant, so passing TypeInputsDict / dict[str, Output] to _assign_unique(mapping: dict[str, object]) was rejected. Make _assign_unique generic over a TypeVar. - pytest failed under CI's coloured rich output: the deprecation-message tests asserted on captured stdout, where ANSI codes and line wrapping split the matched substrings. Assert on the raw message passed to logger.warning (patched) instead, via a _deprecation_log helper. --- src/submit_aml/data.py | 6 +++-- tests/test_data.py | 56 +++++++++++++++++++++++++----------------- 2 files changed, 37 insertions(+), 25 deletions(-) diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py index 8d9fcfb..b8e7965 100644 --- a/src/submit_aml/data.py +++ b/src/submit_aml/data.py @@ -3,6 +3,7 @@ import re import sys import warnings +from typing import TypeVar from azure.ai.ml import Input from azure.ai.ml import MLClient @@ -17,6 +18,7 @@ TypeInputsDict = dict[str, Input | SweepDistribution] TypeOptionalStrList = list[str] | None +_MappingValue = TypeVar("_MappingValue") def _datastore_uri(datastore: str, path: str) -> str: @@ -410,9 +412,9 @@ def _legacy_input( def _assign_unique( - mapping: dict[str, object], + mapping: dict[str, _MappingValue], alias: str, - value: object, + value: _MappingValue, *, kind: str, ) -> None: diff --git a/tests/test_data.py b/tests/test_data.py index 34f6f5e..06fc25e 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -3,6 +3,7 @@ from __future__ import annotations from unittest.mock import Mock +from unittest.mock import patch import pytest from azure.ai.ml.constants import InputOutputModes @@ -21,6 +22,17 @@ from submit_aml.data import build_command_inputs from submit_aml.data import build_command_outputs + +def _deprecation_log(mock_logger: Mock) -> str: + """Join the raw messages passed to `logger.warning`. + + Asserting on the raw message (rather than `capsys`) avoids the rich + console's ANSI colouring, highlighting, and line wrapping, which otherwise + make multi-token substring checks brittle (and fail under CI's colour mode). + """ + return " ".join(call.args[0] for call in mock_logger.warning.call_args_list) + + # --------------------------------------------------------------------------- # _datastore_uri # --------------------------------------------------------------------------- @@ -321,14 +333,13 @@ def test_build_command_inputs_legacy_duplicate_alias_raises() -> None: ) -def test_build_command_inputs_legacy_asset_warns_mount_asset( - capsys: pytest.CaptureFixture[str], -) -> None: +def test_build_command_inputs_legacy_asset_warns_mount_asset() -> None: """A legacy --mount asset value is told to use --mount-asset specifically.""" client = Mock() client.data.get.return_value = Mock(id="azureml:resolved-asset:1") - build_command_inputs(client, legacy_mount=["my_alias=data_asset"]) - message = " ".join(capsys.readouterr().out.split()) + with patch("submit_aml.data.logger") as mock_logger: + build_command_inputs(client, legacy_mount=["my_alias=data_asset"]) + message = _deprecation_log(mock_logger) assert "--mount-asset my_alias=data_asset" in message assert "--mount-datastore" not in message assert "--mount-job" not in message @@ -337,26 +348,24 @@ def test_build_command_inputs_legacy_asset_warns_mount_asset( assert "datasets_mount" not in message -def test_build_command_inputs_legacy_datastore_warns_mount_datastore( - capsys: pytest.CaptureFixture[str], -) -> None: +def test_build_command_inputs_legacy_datastore_warns_mount_datastore() -> None: """A legacy --mount datastore value is told to use --mount-datastore.""" client = Mock() - build_command_inputs(client, legacy_mount=["ref=mystore/exports/reference"]) - message = " ".join(capsys.readouterr().out.split()) + with patch("submit_aml.data.logger") as mock_logger: + build_command_inputs(client, legacy_mount=["ref=mystore/exports/reference"]) + message = _deprecation_log(mock_logger) assert "--mount-datastore ref=mystore/exports/reference" in message -def test_build_command_inputs_legacy_job_warns_download_job_translated( - capsys: pytest.CaptureFixture[str], -) -> None: +def test_build_command_inputs_legacy_job_warns_download_job_translated() -> None: """A legacy --download job value is told to use --download-job, sans prefix.""" client = Mock() - build_command_inputs( - client, - legacy_download=["ckpt=job_dir:my_job_123:models/best.pth"], - ) - message = " ".join(capsys.readouterr().out.split()) + with patch("submit_aml.data.logger") as mock_logger: + build_command_inputs( + client, + legacy_download=["ckpt=job_dir:my_job_123:models/best.pth"], + ) + message = _deprecation_log(mock_logger) assert "--download-job ckpt=my_job_123:models/best.pth" in message # The suggested replacement (after "with") drops the legacy job_dir: prefix. assert "job_dir:" not in message.split("with", 1)[1] @@ -394,13 +403,14 @@ def test_build_command_outputs_duplicate_alias_raises() -> None: ) -def test_build_command_outputs_legacy_warns( - capsys: pytest.CaptureFixture[str], -) -> None: +def test_build_command_outputs_legacy_warns() -> None: """Legacy --output strings are built and emit a targeted deprecation warning.""" - outputs = build_command_outputs(legacy_output=["out_dir=mydatastore/my_dataset"]) + with patch("submit_aml.data.logger") as mock_logger: + outputs = build_command_outputs( + legacy_output=["out_dir=mydatastore/my_dataset"] + ) assert "out_dir" in outputs - message = " ".join(capsys.readouterr().out.split()) + message = _deprecation_log(mock_logger) assert "deprecated" in message.lower() assert "--output-datastore out_dir=mydatastore/my_dataset" in message From b6445ad9967f055b8cf74b92540c6b3cd2a833b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?= Date: Wed, 10 Jun 2026 00:07:08 +0100 Subject: [PATCH 16/16] Address review: accurate warning comments, job help, positional compat - The 'CLI sees only flags / API sees only params' inline comments were misleading: both messages are always emitted. Reword to say the log line is phrased for CLI users and the DeprecationWarning for API users. - --download help implied the bare 'alias=:' job form works on the legacy flag, but it only routes as a job when has a '/'. Steer users to --download-job (or the legacy job_dir: form) and note the caveat. - build_command_inputs/outputs became keyword-only, breaking pre-existing positional callers (build_command_inputs(client, downloads, mounts); build_command_outputs(uploads)). Keep legacy_download/legacy_mount and legacy_output as leading positional params (new flags stay keyword-only). Add positional-compat regression tests. --- src/submit_aml/__main__.py | 9 +++++---- src/submit_aml/data.py | 35 +++++++++++++++++++++++++---------- tests/test_data.py | 20 ++++++++++++++++++++ 3 files changed, 50 insertions(+), 14 deletions(-) diff --git a/src/submit_aml/__main__.py b/src/submit_aml/__main__.py index 3ef228b..35ed04f 100644 --- a/src/submit_aml/__main__.py +++ b/src/submit_aml/__main__.py @@ -148,10 +148,11 @@ def submit( " dataset; for example: 'vindr_dir=VINDR-CXR-V2:1'. If the version is" " omitted, the last one will be used. To download a datastore folder," " use 'alias=datastore/folder'. To download the output folder of a" - " previous job, the preferred form is 'alias=:' (the" - " same as --download-job); for example," - " 'checkpoint=crusty_hat_43s6lmvb25:outputs/checkpoint-10000'. The" - " older 'alias=job_dir::' form is also still accepted." + " previous job, prefer --download-job; on this deprecated flag use" + " the 'alias=job_dir::' form, for example" + " 'checkpoint=job_dir:crusty_hat_43s6lmvb25:outputs/checkpoint-10000'" + " (the bare 'alias=:' form is only recognised as a job" + " when contains a '/', otherwise it is read as a data asset)." " The alias can be used to pass input datasets to the script, e.g.," r" '${{inputs.vindr_dir}}' or '${{inputs.checkpoint}}'." " This option can be used multiple times." diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py index b8e7965..ae5f325 100644 --- a/src/submit_aml/data.py +++ b/src/submit_aml/data.py @@ -254,8 +254,8 @@ def _output_from_asset(string: str) -> tuple[str, Output]: # semver, since dropping a CLI flag is a breaking change. # 3. At removal (2.0.0): delete the `datasets_download`/`datasets_mount`/ # `output` typer.Options in `__main__.py`, drop the matching -# `submit_to_aml` parameters and the `legacy_*` keyword arguments and -# branches in `build_command_inputs`/`build_command_outputs`, delete the +# `submit_to_aml` parameters and the `legacy_*` parameters and branches in +# `build_command_inputs`/`build_command_outputs`, delete the # `_legacy_*` helpers and the `_warn_legacy_*` warning helpers, and note # the breaking change in the changelog. # @@ -298,7 +298,9 @@ def _warn_legacy_input( old_cli, old_param = _LEGACY_INPUT_FLAGS[flag_base] new_cli = f"--{flag_base}-{source}" new_param = f"{flag_base}_{source}" - # CLI users see only flags; Python API users see only parameter names. + # Both are always emitted: the log line is phrased for CLI users (it names + # flags) and the DeprecationWarning for Python API users (it names + # parameters). cli_message = ( f"{old_cli} is deprecated and will be removed in a future release." f" Replace '{old_cli} {old_value}' with '{new_cli} {new_value}'." @@ -324,7 +326,9 @@ def _warn_legacy_output(old_value: str, stacklevel: int = 2) -> None: stacklevel: Stack level for the `DeprecationWarning`, so it points at the API caller rather than this helper. """ - # CLI users see only flags; Python API users see only parameter names. + # Both are always emitted: the log line is phrased for CLI users (it names + # flags) and the DeprecationWarning for Python API users (it names + # parameters). cli_message = ( "--output is deprecated and will be removed in a future release." f" Replace '--output {old_value}' with '--output-datastore {old_value}'." @@ -441,6 +445,8 @@ def _assign_unique( def build_command_inputs( ml_client: MLClient, + legacy_download: list[str] | None = None, + legacy_mount: list[str] | None = None, *, mount_asset: list[str] | None = None, download_asset: list[str] | None = None, @@ -448,13 +454,19 @@ def build_command_inputs( download_datastore: list[str] | None = None, mount_job: list[str] | None = None, download_job: list[str] | None = None, - legacy_mount: list[str] | None = None, - legacy_download: list[str] | None = None, ) -> TypeInputsDict: """Build the inputs dictionary for a command job. + `legacy_download` / `legacy_mount` are kept as the first positional + parameters (in that order) so existing positional callers from before the + explicit-source flags, i.e. `build_command_inputs(client, downloads, + mounts)`, keep working during the 1.x deprecation window. The new + explicit-source flags are keyword-only. + Args: ml_client: Client used to resolve data assets. + legacy_download: Deprecated `--download` values, routed by source type. + legacy_mount: Deprecated `--mount` values, routed by source type. mount_asset: Data assets to mount, as `'alias=name[:version]'`. download_asset: Data assets to download, as `'alias=name[:version]'`. mount_datastore: Datastore folders to mount, as `'alias=datastore/folder'`. @@ -462,8 +474,6 @@ def build_command_inputs( `'alias=datastore/folder'`. mount_job: Previous job outputs to mount, as `'alias=:'`. download_job: Previous job outputs to download, as `'alias=:'`. - legacy_mount: Deprecated `--mount` values, routed by source type. - legacy_download: Deprecated `--download` values, routed by source type. Returns: Dictionary of `alias: Input` mappings. @@ -513,18 +523,23 @@ def build_command_inputs( def build_command_outputs( + legacy_output: list[str] | None = None, *, output_datastore: list[str] | None = None, output_asset: list[str] | None = None, - legacy_output: list[str] | None = None, ) -> dict[str, Output]: """Build the outputs dictionary for a command job. + `legacy_output` is kept as the first positional parameter so existing + positional callers from before the explicit-target flags, i.e. + `build_command_outputs(uploads)`, keep working during the 1.x deprecation + window. The new explicit-target flags are keyword-only. + Args: + legacy_output: Deprecated `--output` values (datastore folders). output_datastore: Datastore folders to write to, as `'alias=datastore/folder'`. output_asset: Data assets to register, as `'alias=name[:version]'`. - legacy_output: Deprecated `--output` values (datastore folders). Returns: Dictionary of `alias: Output` mappings. diff --git a/tests/test_data.py b/tests/test_data.py index 06fc25e..381e23a 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -293,6 +293,26 @@ def test_build_command_inputs_legacy_raises_deprecation_warning() -> None: assert "['ref=mystore/exports/reference']" in message +def test_build_command_inputs_legacy_positional_compat() -> None: + """Positional `(client, downloads, mounts)` calls still work (1.x compat).""" + client = Mock() + inputs = build_command_inputs( + client, + ["dl=mystore/down"], + ["mn=mystore/mount"], + ) + assert inputs["dl"].mode == InputOutputModes.DOWNLOAD + assert inputs["mn"].mode == InputOutputModes.MOUNT + + +def test_build_command_outputs_legacy_positional_compat() -> None: + """Positional `(uploads)` calls still work (1.x compat).""" + outputs = build_command_outputs(["out_dir=mydatastore/my_dataset"]) + assert outputs["out_dir"].path == ( + "azureml://datastores/mydatastore/paths/my_dataset" + ) + + def test_build_command_inputs_legacy_asset_calls_client() -> None: """Legacy --mount asset strings still resolve through the client.""" client = Mock()