From 9b22e5500bf5281006393118c9b2bf685dbd55e5 Mon Sep 17 00:00:00 2001
From: Stephanie Hyland <stephanie.hyland@microsoft.com>
Date: Tue, 9 Jun 2026 06:29:25 -0700
Subject: [PATCH] Support raw datastore-path folders as command inputs

Add a datastore-path branch to `_get_data_assets` so inputs accept
`alias=datastore/folder` and mount/download `azureml://datastores/...`
URIs directly, symmetric with the existing output behaviour. This
removes the need to pre-register a data asset for transient datastore
folders.

- Add `_is_alias_datastore_path_string` predicate (pure string check,
  since `_extract_alias_datastore_path` exits rather than raises).
- Extract shared `_datastore_uri` helper, used by inputs and outputs.
- Document the new form in the --download/--mount CLI help.
- Add tests for the predicate and the input routing.

Closes #14

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 docs/examples.md           | 14 ++++++++
 src/submit_aml/__main__.py |  7 +++-
 src/submit_aml/data.py     | 58 +++++++++++++++++++++++++++----
 tests/test_data.py         | 71 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 142 insertions(+), 8 deletions(-)

diff --git a/docs/examples.md b/docs/examples.md
index 26e671e..088a864 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -196,6 +196,14 @@ objects.
         --mount "checkpoint=job_dir:my-training-job:models/best.pth"
     ```
 
+    Mount a raw folder on a registered datastore:
+
+    ```bash
+    submit-aml \
+        --script evaluate.py \
+        --mount "ref_dir=mydatastore/exports/reference"
+    ```
+
 === "Python"
 
     ```python
@@ -215,6 +223,12 @@ objects.
         script_path="evaluate.py",
         datasets_mount=["checkpoint=job_dir:my-training-job:models/best.pth"],
     )
+
+    # Mount a raw folder on a registered datastore
+    submit_to_aml(
+        script_path="evaluate.py",
+        datasets_mount=["ref_dir=mydatastore/exports/reference"],
+    )
     ```
 
 Configure an output datastore:
diff --git a/src/submit_aml/__main__.py b/src/submit_aml/__main__.py
index e7c4bc7..98c8007 100644
--- a/src/submit_aml/__main__.py
+++ b/src/submit_aml/__main__.py
@@ -145,6 +145,9 @@ def submit(
             " dataset, the argument should take the form: alias, name and version"
             " of the dataset; for example: 'vindr_dir=VINDR-CXR-V2:1'."
             " If the version is omitted, the last one will be used."
+            " To download a raw folder on a registered datastore, the argument should"
+            " take the form 'alias=datastore/path/to/dir'; for example:"
+            " 'ref_dir=mydatastore/exports/reference'."
             " To download the output folder of a previous job, the argument should take"
             " the form 'alias=job_dir:<job_id>:<path/in/job/outputs>'; for example:"
             " 'checkpoint=job_dir:crusty_hat_43s6lmvb25:outputs/checkpoint-10000'."
@@ -160,7 +163,9 @@ def submit(
         "-m",
         help=(
             "Azure ML dataset or job output folder to mount."
-            " For an Azure ML dataset, the alias, name and version should be provided"
+            " For an Azure ML dataset, the alias, name and version should be provided;"
+            " for a raw datastore folder, the alias, datastore and path should be"
+            " provided (e.g. 'ref_dir=mydatastore/exports/reference');"
             " while for a job output folder, the alias, job ID and path in the job"
             " outputs should be provided. See the --download option for more"
             " information."
diff --git a/src/submit_aml/data.py b/src/submit_aml/data.py
index 410568b..2f93e9f 100644
--- a/src/submit_aml/data.py
+++ b/src/submit_aml/data.py
@@ -55,6 +55,20 @@ def _extract_alias_path_version(string: str) -> tuple[str, str, str | None]:
     sys.exit(1)
 
 
+def _datastore_uri(datastore: str, path: str) -> str:
+    """Build an Azure ML datastore URI for a folder on a datastore.
+
+    Args:
+        datastore: Name of the registered datastore.
+        path: Folder path within the datastore.
+
+    Returns:
+        An `azureml://` URI of the form
+        `azureml://datastores/<datastore>/paths/<path>`.
+    """
+    return f"azureml://datastores/{datastore}/paths/{path}"
+
+
 def _extract_alias_datastore_path(string: str) -> tuple[str, str, str]:
     """Get alias, datastore name and folder path from a string.
 
@@ -126,6 +140,24 @@ def _is_alias_job_path_string(string: str) -> bool:
         return False
 
 
+def _is_alias_datastore_path_string(string: str) -> bool:
+    """Return True if the string refers to a raw datastore-path folder.
+
+    A datastore-path string has the form `'alias=datastore/folder'`. It is
+    distinguished from a data-asset name (`'alias=name[:version]'`) by the
+    presence of a `/` in the right-hand side, and from a job-output directory
+    by not starting with `job_dir:`.
+
+    This is intentionally a pure string check: `_extract_alias_datastore_path`
+    calls `sys.exit(1)` on a non-match rather than raising, so it cannot be
+    wrapped in try/except the way `_is_alias_job_path_string` is.
+    """
+    if "=" not in string:
+        return False
+    _, rhs = string.split("=", 1)
+    return "/" in rhs and not rhs.startswith("job_dir:")
+
+
 def build_command_inputs(
     ml_client: MLClient,
     strings_download: list[str] | None,
@@ -134,10 +166,13 @@ def build_command_inputs(
     """Get dictionaries data assets to be mounted or downloaded.
 
     Args:
-        strings_download: List of strings of the form `'alias=path:version'` to
-            be downloaded. If `None`, no data assets will be downloaded.
-        strings_mount: List of strings of the form `'alias=path:version'` to
-            be mounted. If `None`, no data assets will be mounted.
+        strings_download: List of strings to be downloaded. Each is of the form
+            `'alias=name[:version]'` (registered data asset),
+            `'alias=datastore/folder'` (raw datastore path), or
+            `'alias=job_dir:<job_id>:<path>'` (previous job output).
+            If `None`, no data assets will be downloaded.
+        strings_mount: List of strings to be mounted, in the same forms as
+            `strings_download`. If `None`, no data assets will be mounted.
     """
     strings_download = [] if strings_download is None else strings_download
     strings_mount = [] if strings_mount is None else strings_mount
@@ -168,7 +203,7 @@ def build_command_outputs(
     for string in strings_upload:
         alias, datastore, path = _extract_alias_datastore_path(string)
         output = Output(
-            path=f"azureml://datastores/{datastore}/paths/{path}",
+            path=_datastore_uri(datastore, path),
         )
         outputs_dict[alias] = output
     return outputs_dict
@@ -182,8 +217,8 @@ def _get_data_assets(
     """Get data assets from Azure ML.
 
     Args:
-        datasets: List of strings of the form `'alias=path:version'` or
-            `'alias=job_dir:<job_id>:<path>'`.
+        datasets: List of strings of the form `'alias=path:version'`,
+            `'alias=datastore/folder'`, or `'alias=job_dir:<job_id>:<path>'`.
         mode: Either `InputOutputModes.DOWNLOAD` or `InputOutputModes.MOUNT`.
 
     Returns:
@@ -200,6 +235,15 @@ def _get_data_assets(
                 path=str(azureml_path),
                 mode=mode,
             )
+        elif _is_alias_datastore_path_string(string):
+            # Handle raw datastore-path folder format
+            alias, datastore, folder = _extract_alias_datastore_path(string)
+            azureml_path = _datastore_uri(datastore, folder)
+            logger.info(f'Using datastore path "{azureml_path}"...')
+            inputs[alias] = Input(
+                path=azureml_path,
+                mode=mode,
+            )
         else:
             # Handle regular data asset format
             alias, path, version = _extract_alias_path_version(string)
diff --git a/tests/test_data.py b/tests/test_data.py
index 8f9faab..441da24 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -2,11 +2,16 @@
 
 from __future__ import annotations
 
+from unittest.mock import MagicMock
+
 import pytest
+from azure.ai.ml.constants import InputOutputModes
 
 from submit_aml.data import _extract_alias_datastore_path
 from submit_aml.data import _extract_alias_job_path
 from submit_aml.data import _extract_alias_path_version
+from submit_aml.data import _is_alias_datastore_path_string
+from submit_aml.data import build_command_inputs
 from submit_aml.data import build_command_outputs
 
 # ---------------------------------------------------------------------------
@@ -83,3 +88,69 @@ def test_build_command_outputs_valid() -> None:
     output = outputs["out_dir"]
     assert "mydatastore" in output.path
     assert "my_dataset" in output.path
+
+
+# ---------------------------------------------------------------------------
+# _is_alias_datastore_path_string
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    ("string", "expected"),
+    [
+        ("ref=mystore/exports/reference", True),
+        ("ref=mystore/folder", True),
+        ("my_data=MIMIC-CXR-V2", False),
+        ("my_data=MIMIC-CXR-V2:2", False),
+        ("checkpoint=job_dir:my_job_123:models/best.pth", False),
+        ("no_equals_sign", False),
+    ],
+)
+def test_is_alias_datastore_path_string(string: str, expected: bool) -> None:
+    """Only 'alias=datastore/folder' strings are recognised as datastore paths."""
+    assert _is_alias_datastore_path_string(string) is expected
+
+
+# ---------------------------------------------------------------------------
+# build_command_inputs (datastore-path branch)
+# ---------------------------------------------------------------------------
+
+
+def test_build_command_inputs_datastore_path_mount() -> None:
+    """A raw datastore-path string builds an azureml:// Input without AML lookup."""
+    ml_client = MagicMock()
+    inputs = build_command_inputs(
+        ml_client,
+        strings_download=None,
+        strings_mount=["ref=mystore/exports/reference"],
+    )
+    assert "ref" in inputs
+    ref = inputs["ref"]
+    assert ref.path == "azureml://datastores/mystore/paths/exports/reference"
+    assert ref.mode == InputOutputModes.MOUNT
+    ml_client.data.get.assert_not_called()
+
+
+def test_build_command_inputs_datastore_path_download() -> None:
+    """The datastore-path branch honours the download mode."""
+    ml_client = MagicMock()
+    inputs = build_command_inputs(
+        ml_client,
+        strings_download=["ref=mystore/exports/reference"],
+        strings_mount=None,
+    )
+    assert inputs["ref"].mode == InputOutputModes.DOWNLOAD
+    ml_client.data.get.assert_not_called()
+
+
+def test_build_command_inputs_data_asset_routes_to_get() -> None:
+    """A 'name:version' string still resolves via ml_client.data.get."""
+    ml_client = MagicMock()
+    ml_client.data.get.return_value = MagicMock(id="azureml://data-asset-id")
+    inputs = build_command_inputs(
+        ml_client,
+        strings_download=None,
+        strings_mount=["my_data=MIMIC-CXR-V2:2"],
+    )
+    ml_client.data.get.assert_called_once_with(name="MIMIC-CXR-V2", version="2")
+    assert inputs["my_data"].path == "azureml://data-asset-id"