From 6cc554b1e237ae42438564bacdd8a7f6a1e34a19 Mon Sep 17 00:00:00 2001 From: Zhao Wang Date: Tue, 9 Jun 2026 13:12:03 +0200 Subject: [PATCH 1/3] fix: resolve MLflowExpManager FileLock to absolute path for file:// URIs `_get_or_create_exp` built the lock path with `os.path.join(pr.netloc, pr.path.lstrip("/"), "filelock")`. For an absolute `file://` URI (including the default `QSettings` URI) `urlparse` yields an empty netloc and an absolute path, so `lstrip("/")` turned it into a CWD-relative path. The FileLock and its parent dirs were then created under whatever directory the process happened to run in, leaving stray trees and an ineffective lock that no longer serialized concurrent experiment creation. Use `url2pathname(pr.path)` so the lock resolves to the location the URI actually names (also correct for Windows drive-letter URIs), and ensure the directory exists before locking. Refs microsoft/qlib#2252 --- qlib/workflow/expm.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/qlib/workflow/expm.py b/qlib/workflow/expm.py index cb48d156acf..17b3378ee57 100644 --- a/qlib/workflow/expm.py +++ b/qlib/workflow/expm.py @@ -2,11 +2,11 @@ # Licensed under the MIT License. from urllib.parse import urlparse +from urllib.request import url2pathname import mlflow from filelock import FileLock from mlflow.exceptions import MlflowException, RESOURCE_ALREADY_EXISTS, ErrorCode from mlflow.entities import ViewType -import os from typing import Optional, Text from pathlib import Path @@ -233,7 +233,14 @@ def _get_or_create_exp(self, experiment_id=None, experiment_name=None) -> (objec # So we supported it in the interface wrapper pr = urlparse(self.uri) if pr.scheme == "file": - with FileLock(Path(os.path.join(pr.netloc, pr.path.lstrip("/"), "filelock"))): # pylint: disable=E0110 + # `pr.path` of an absolute file:// URI is already an absolute path. The previous + # `os.path.join(pr.netloc, pr.path.lstrip("/"), ...)` stripped the leading "/" and + # produced a CWD-relative lock path, so the lock (and its parent dirs) landed wherever + # the process happened to be running instead of at the URI's location. `url2pathname` + # restores the absolute path and also handles Windows drive-letter URIs correctly. + lock_dir = Path(url2pathname(pr.path)) + lock_dir.mkdir(parents=True, exist_ok=True) + with FileLock(lock_dir / "filelock"): # pylint: disable=E0110 return self.create_exp(experiment_name), True # NOTE: for other schemes like http, we double check to avoid create exp conflicts try: From 74f25d5ad81faca001e3427e038c1459271518ef Mon Sep 17 00:00:00 2001 From: Zhao Wang Date: Tue, 9 Jun 2026 13:12:03 +0200 Subject: [PATCH 2/3] fix: skip git code snapshot quietly when CWD is not a git work tree `_log_uncommitted_code` ran `git diff/status/diff --cached` via `subprocess.check_output(cmd, shell=True)` without capturing stderr. When the CWD was not a git repository (containers, CI sandboxes, a tempdir) git wrote a multi-line "usage: git diff --no-index ..." banner straight to the parent's stderr, bypassing qlib's logger, and each failure emitted a noisy INFO record. Probe `git rev-parse --is-inside-work-tree` first and skip silently (DEBUG) when there is no work tree or git is unavailable. Drop `shell=True` in favor of argument lists, capture/suppress stderr, and demote the per-command failure log to DEBUG. This hook is opportunistic reproducibility metadata, not a precondition for running an experiment. Refs microsoft/qlib#2252 --- qlib/workflow/recorder.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/qlib/workflow/recorder.py b/qlib/workflow/recorder.py index 5fd99c0769f..208af67cd28 100644 --- a/qlib/workflow/recorder.py +++ b/qlib/workflow/recorder.py @@ -364,18 +364,38 @@ def _log_uncommitted_code(self): Mlflow only log the commit id of the current repo. But usually, user will have a lot of uncommitted changes. So this tries to automatically to log them all. """ + # This is an opportunistic reproducibility hook, not a precondition for running an experiment. + # When the CWD is not inside a git work tree (containers, CI sandboxes, a tempdir, ...) the git + # commands below fail; previously `shell=True` without capturing stderr leaked git's multi-line + # "usage: git diff --no-index ..." banner to the parent's stderr (bypassing this logger), and each + # failure emitted a noisy INFO record. So we first check for a work tree and skip silently if absent. + try: + probe = subprocess.run( + ["git", "rev-parse", "--is-inside-work-tree"], + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + check=False, + ) + except OSError: + # git is not installed / not on PATH + logger.debug("Skip logging uncommitted code: git is not available.") + return + if probe.returncode != 0 or probe.stdout.strip() != b"true": + logger.debug(f"Skip logging uncommitted code: $CWD({os.getcwd()}) is not a git work tree.") + return + # TODO: the sub-directories maybe git repos. # So it will be better if we can walk the sub-directories and log the uncommitted changes. for cmd, fname in [ - ("git diff", "code_diff.txt"), - ("git status", "code_status.txt"), - ("git diff --cached", "code_cached.txt"), + (["git", "diff"], "code_diff.txt"), + (["git", "status"], "code_status.txt"), + (["git", "diff", "--cached"], "code_cached.txt"), ]: try: - out = subprocess.check_output(cmd, shell=True) + out = subprocess.check_output(cmd, stderr=subprocess.DEVNULL) self.client.log_text(self.id, out.decode(), fname) # this behaves same as above except subprocess.CalledProcessError: - logger.info(f"Fail to log the uncommitted code of $CWD({os.getcwd()}) when run {cmd}.") + logger.debug(f"Fail to log the uncommitted code of $CWD({os.getcwd()}) when run {' '.join(cmd)}.") def end_run(self, status: str = Recorder.STATUS_S): assert status in [ From 760b1013fa62e5b80f43d2bdf3f244370cb0dd61 Mon Sep 17 00:00:00 2001 From: Zhao Wang Date: Tue, 9 Jun 2026 13:40:35 +0200 Subject: [PATCH 3/3] test: cover CWD-safety of MLflowExpManager FileLock and git snapshot Regression tests for the two fixes in this branch: - MLflowExpManager places its FileLock at the absolute file:// path the URI names and leaves an unrelated working directory clean. - MLflowRecorder._log_uncommitted_code stays silent outside a git work tree (no git banner leaked to stderr, no artifacts logged) and still logs code_diff/status/cached inside one. Both CWD-failure tests fail on the pre-fix code and pass with the fixes. Refs microsoft/qlib#2252 --- tests/test_workflow_cwd.py | 152 +++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 tests/test_workflow_cwd.py diff --git a/tests/test_workflow_cwd.py b/tests/test_workflow_cwd.py new file mode 100644 index 00000000000..e1a990da87c --- /dev/null +++ b/tests/test_workflow_cwd.py @@ -0,0 +1,152 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +""" +Regression tests for the current-working-directory assumptions in ``qlib.workflow``. + +See https://github.com/microsoft/qlib/issues/2252: + +1. ``MLflowExpManager`` built its ``FileLock`` path from ``file://`` URIs in a way that + produced a CWD-relative path, so the lock (and its parent dirs) was created wherever + the process happened to be running instead of at the location the URI named. +2. ``MLflowRecorder._log_uncommitted_code`` shelled out to ``git`` without capturing + stderr; outside a git work tree git's usage banner leaked to the parent's stderr and + each command emitted a noisy log record. +""" + +import os +import shutil +import tempfile +import unittest +import subprocess +from contextlib import contextmanager +from pathlib import Path + +import mlflow + +from qlib.config import C +from qlib.workflow.expm import MLflowExpManager +from qlib.workflow.recorder import MLflowRecorder + + +@contextmanager +def chdir(path): + """Temporarily change the working directory.""" + old = os.getcwd() + os.chdir(str(path)) + try: + yield + finally: + os.chdir(old) + + +@contextmanager +def capture_fd_stderr(): + """ + Capture OS-level file descriptor 2. + + A child process spawned with ``shell=True`` and no ``stderr=`` redirection writes to + the inherited fd 2, which ``contextlib.redirect_stderr`` (it only swaps ``sys.stderr``) + would not see. We redirect the real fd so the test observes exactly what a user's + terminal / log pipe would. + """ + saved_fd = os.dup(2) + tmp = tempfile.TemporaryFile(mode="w+b") + os.dup2(tmp.fileno(), 2) + try: + yield tmp + finally: + os.dup2(saved_fd, 2) + os.close(saved_fd) + + +class MLflowExpManagerCWDTest(unittest.TestCase): + """The experiment FileLock must live at the absolute path the file:// URI names.""" + + def setUp(self) -> None: + self.store = Path(tempfile.mkdtemp(prefix="qlib-store-")) + self.unrelated_cwd = Path(tempfile.mkdtemp(prefix="qlib-cwd-")) + # Constructing MLflowExpManager mutates the global config; snapshot to restore it. + self._saved_uri = C.exp_manager.get("kwargs", {}).get("uri") + + def tearDown(self) -> None: + if self._saved_uri is not None: + C.exp_manager.setdefault("kwargs", {})["uri"] = self._saved_uri + shutil.rmtree(self.store, ignore_errors=True) + shutil.rmtree(self.unrelated_cwd, ignore_errors=True) + + def test_filelock_resolves_to_absolute_uri_not_cwd(self): + mlruns = self.store / "mlruns" # absolute path that does not exist yet + expm = MLflowExpManager(uri="file:" + str(mlruns), default_exp_name="Experiment") + + with chdir(self.unrelated_cwd): + expm._get_or_create_exp(experiment_name="cwd-test") + # The buggy code created `//filelock`, + # i.e. a stray tree under the (unrelated) working directory. Nothing may land here. + leftovers = sorted(p.name for p in Path.cwd().iterdir()) + self.assertEqual( + leftovers, + [], + f"FileLock created files relative to CWD instead of the absolute URI: {leftovers}", + ) + + # And the lock/experiment store must exist at the absolute location the URI named. + self.assertTrue(mlruns.exists(), f"experiment store was not created at {mlruns}") + + +class MLflowRecorderGitSnapshotTest(unittest.TestCase): + """`_log_uncommitted_code` must be quiet outside a repo and still work inside one.""" + + def setUp(self) -> None: + self.store = Path(tempfile.mkdtemp(prefix="qlib-store-")) + self.uri = "file:" + str(self.store / "mlruns") + self.client = mlflow.tracking.MlflowClient(tracking_uri=self.uri) + self.exp_id = self.client.create_experiment("git-snapshot-test") + self.non_git = Path(tempfile.mkdtemp(prefix="qlib-nongit-")) + self.git_repo = Path(tempfile.mkdtemp(prefix="qlib-gitrepo-")) + + def tearDown(self) -> None: + for d in (self.store, self.non_git, self.git_repo): + shutil.rmtree(d, ignore_errors=True) + + def _new_recorder(self) -> MLflowRecorder: + rec = MLflowRecorder(self.exp_id, self.uri) + run = rec.client.create_run(self.exp_id) + rec.id = run.info.run_id + return rec + + def test_non_git_cwd_is_silent_and_logs_no_artifacts(self): + rec = self._new_recorder() + with chdir(self.non_git), capture_fd_stderr() as err: + rec._log_uncommitted_code() + err.seek(0) + leaked = err.read() + + # git's "Not a git repository" / "usage: git ..." banner must not leak to stderr. + self.assertNotIn(b"Not a git repository", leaked) + self.assertNotIn(b"usage: git", leaked) + # No code snapshot artifacts should be produced outside a work tree. + self.assertEqual([a.path for a in rec.client.list_artifacts(rec.id)], []) + + def test_git_work_tree_logs_code_snapshot(self): + env = { + **os.environ, + "GIT_AUTHOR_NAME": "qlib-test", + "GIT_AUTHOR_EMAIL": "qlib-test@example.com", + "GIT_COMMITTER_NAME": "qlib-test", + "GIT_COMMITTER_EMAIL": "qlib-test@example.com", + } + subprocess.run(["git", "init", "-q"], cwd=self.git_repo, check=True, env=env) + (self.git_repo / "a.txt").write_text("hello\n") + subprocess.run(["git", "add", "a.txt"], cwd=self.git_repo, check=True, env=env) + (self.git_repo / "a.txt").write_text("hello\nworld\n") # unstaged change on top + + rec = self._new_recorder() + with chdir(self.git_repo): + rec._log_uncommitted_code() + + artifacts = sorted(a.path for a in rec.client.list_artifacts(rec.id)) + self.assertEqual(artifacts, ["code_cached.txt", "code_diff.txt", "code_status.txt"]) + + +if __name__ == "__main__": + unittest.main()