From e8a6d4421d5b9fcd6064c7ae610e57d0923ee984 Mon Sep 17 00:00:00 2001 From: Shivanshu07 Date: Mon, 27 Apr 2026 23:45:30 +0530 Subject: [PATCH] feat(core): per-port lockfile with stale-lock reclaim (PER-7855) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 2 of PER-7855 CLI QoS hardening — short-circuit "Percy already running" at command entry instead of failing late and noisily with EADDRINUSE on `server.listen()`. New module `core/src/lock.js`: - `acquireLock({port})` writes `~/.percy/agent-<port>.lock` atomically via `wx`. Payload is `{pid, port, startedAt}`; mode `0o600` on the file, `0o700` on the parent dir. - `LockHeldError` carries `{meta, lockPath}` so the refusal message can name the live pid + lock path for manual cleanup. - Stale-lock reclaim: `process.kill(pid, 0)` liveness probe; ESRCH treated as dead, EPERM as alive-but-foreign. A self-pid lock (left over by an earlier in-process invocation) is reclaimed without consulting `process.kill` — we cannot conflict with ourselves. - Reclaim is unlink + retry-`wx`, NOT rename-based: Windows CI is pinned to Node 14 (`.github/workflows/windows.yml:15`), where `fs.renameSync` over an existing target is unreliable. `Percy.start()`: - Acquires the lock as the first step inside `try {` (before monitoring, proxy detection, queue starts), so a held-lock fails fast. - Registers a one-shot `process.on('exit')` synchronous unlink as last-chance cleanup if the process exits without a normal `stop()`. Phase 3 will replace this with a signal-driven drain. `Percy.stop()`: - Releases the lock in the `finally` block, alongside monitoring teardown. Idempotent: re-running release on an already-released handle is a no-op. 
Backwards compatibility: when the lock is held, the start() catch maps `LockHeldError` to the legacy "Percy is already running or the port X is in use" message string (downstream tooling may grep for it) AND also logs the actionable detail (live pid, lockfile path) via `log.error` so users can recover. Test infrastructure (`core/test/helpers/index.js`): - Added `~/.percy/agent-*` to the mockfs `$bypass` list so lock files go through the real fs rather than the in-memory mock. Files are cleaned by `Percy.stop()`'s release path; the self-pid stale optimization handles same-process collisions during sequential Jasmine runs. Tests added: 13 unit specs (`core/test/unit/lock.test.js`) covering SC3 stale reclaim, SC4 live-foreign refusal, SC5 multi-port, EPERM-as-alive, corrupt-payload recovery, mkdir-p, mode bits on POSIX, release idempotency, re-acquire after release. Origin: docs/brainstorms/2026-04-24-per-7855-cli-qos-hardening-requirements.md Plan: docs/plans/2026-04-27-001-feat-per-7855-cli-qos-hardening-plan.md Phase 1: commit e135e9a4 (network refactors + redaction + hint) Phase 3 next: signal drain + unhandled-rejection handlers (PER-7855) Co-Authored-By: Claude Opus 4.7 (1M context, extended thinking) --- packages/core/src/lock.js | 135 +++++++++++++++++++++ packages/core/src/percy.js | 51 +++++++- packages/core/test/helpers/index.js | 7 ++ packages/core/test/unit/lock.test.js | 172 +++++++++++++++++++++++++++ 4 files changed, 363 insertions(+), 2 deletions(-) create mode 100644 packages/core/src/lock.js create mode 100644 packages/core/test/unit/lock.test.js diff --git a/packages/core/src/lock.js b/packages/core/src/lock.js new file mode 100644 index 000000000..a5d077e3e --- /dev/null +++ b/packages/core/src/lock.js @@ -0,0 +1,135 @@ +// Per-port lock file for Percy agent processes (PER-7855 Phase 2). +// +// Why: a stale ~/.percy directory after a crash currently surfaces as a +// late, opaque EADDRINUSE on the next `percy start`. 
The lock file lets +// us short-circuit at command entry with a clear, actionable refusal +// message and lets us auto-reclaim a stale lock whose recorded pid is +// dead. +// +// Cross-platform note: `fs.renameSync` over an existing target is +// unreliable on Node 14 Windows (Percy's Windows CI is pinned to +// node-version: 14, see .github/workflows/windows.yml). We therefore +// reclaim via unlink + retry-`wx` rather than rename-based reclaim. + +import { mkdirSync, writeFileSync, readFileSync, unlinkSync } from 'fs'; +import { join } from 'path'; +// Use a default import so tests can `spyOn(os, 'homedir')` to redirect +// the lock dir into a tmpdir without touching the user's $HOME. +// (Babel's namespace import is frozen and not spy-able.) +import os from 'os'; + +const LOCK_DIR_MODE = 0o700; +const LOCK_FILE_MODE = 0o600; + +export class LockHeldError extends Error { + constructor(meta, lockPath) { + super( + `Percy is already running on port ${meta.port} ` + + `(pid ${meta.pid}, started ${meta.startedAt}).\n` + + `If you believe this is stale, remove ${lockPath} and try again.` + ); + this.name = 'LockHeldError'; + this.meta = meta; + this.lockPath = lockPath; + } +} + +export function lockPathFor(port) { + return join(os.homedir(), '.percy', `agent-${port}.lock`); +} + +// `process.kill(pid, 0)` returns truthy for living processes, throws +// ESRCH if the pid is gone, and throws EPERM if the pid exists but +// belongs to another user (treat as alive — we cannot reclaim it). +function livenessCheck(pid) { + try { + process.kill(pid, 0); + return 'alive'; + } catch (err) { + if (err.code === 'ESRCH') return 'dead'; + if (err.code === 'EPERM') return 'alive'; + /* istanbul ignore next: defensive — every other Node error code + (ENOSYS, EINVAL, …) implies we cannot determine liveness, so + refusing to reclaim is the safer default. */ + return 'alive'; + } +} + +// Acquire a per-port lock. 
On success, returns a handle whose `path` +// the caller must eventually pass to `releaseLockSync`. Throws +// `LockHeldError` if another live process holds the lock. +export function acquireLock({ port }) { + const dir = join(os.homedir(), '.percy'); + const path = lockPathFor(port); + const payload = JSON.stringify({ + pid: process.pid, + port, + startedAt: new Date().toISOString() + }); + + mkdirSync(dir, { recursive: true, mode: LOCK_DIR_MODE }); + + // Fast path: atomic exclusive create. + try { + writeFileSync(path, payload, { flag: 'wx', mode: LOCK_FILE_MODE }); + return { path, payload }; + } catch (err) { + /* istanbul ignore if: any non-EEXIST error from `wx` is unexpected + (e.g. EACCES on a read-only $HOME) — propagate. */ + if (err.code !== 'EEXIST') throw err; + } + + // Lock exists. Inspect, then either refuse or reclaim once. + let existing; + try { + existing = JSON.parse(readFileSync(path, 'utf-8')); + } catch (parseErr) { + // Corrupt or truncated payload (a previous process was killed + // mid-write): treat as stale, unlink, and retry. + existing = null; + } + + // A lock recorded with OUR pid means we leaked a previous lock from + // the same process (e.g., a test that forgot to release in afterEach, + // or a code path that bypassed the normal stop). Reclaiming is safe + // because we are that process — we cannot conflict with ourselves. + if (existing && existing.pid !== process.pid && livenessCheck(existing.pid) === 'alive') { + throw new LockHeldError(existing, path); + } + + // Stale (or corrupt). Unlink and retry exclusive create. If a third + // process raced in and won, the second `wx` fails with EEXIST and + // we surface their info — their lock is the legitimate one. + try { + unlinkSync(path); + } catch (e) { + /* istanbul ignore next: race window — another reclaimer beat us + to the unlink. 
*/ + if (e.code !== 'ENOENT') throw e; + } + + try { + writeFileSync(path, payload, { flag: 'wx', mode: LOCK_FILE_MODE }); + return { path, payload }; + } catch (err) { + /* istanbul ignore else */ + if (err.code === 'EEXIST') { + const winner = JSON.parse(readFileSync(path, 'utf-8')); + throw new LockHeldError(winner, path); + } + throw err; + } +} + +// Synchronous release for use in normal teardown AND in +// `process.on('exit')` (which only runs synchronous handlers). +export function releaseLockSync(handle) { + if (!handle?.path) return; + try { + unlinkSync(handle.path); + } catch (e) { + /* istanbul ignore next: lock already gone (manually removed, + race with another reclaimer) — nothing to do. */ + if (e.code !== 'ENOENT') throw e; + } +} diff --git a/packages/core/src/percy.js b/packages/core/src/percy.js index 3f1a9eb48..5a44b933c 100644 --- a/packages/core/src/percy.js +++ b/packages/core/src/percy.js @@ -3,6 +3,7 @@ import PercyConfig from '@percy/config'; import logger from '@percy/logger'; import { getProxy } from '@percy/client/utils'; import Browser from './browser.js'; +import { acquireLock, releaseLockSync } from './lock.js'; import Pako from 'pako'; import { base64encode, @@ -220,6 +221,21 @@ export class Percy { this.cliStartTime = new Date().toISOString(); try { + // PER-7855 Phase 2: per-port lock fast-fail. Acquire BEFORE any + // expensive setup (monitoring, proxy detection, hostname loads) + // so a second `percy start` on the same port refuses cheaply. + // Skipped when no server is configured (lock represents a port + // claim). + if (this.server) { + this._lockHandle = acquireLock({ port: this.port }); + // Synchronous unlink as last-chance cleanup if the process + // exits without a normal stop() (Phase 3 will wire the + // signal-driven path; this `exit` handler covers crash and + // uncaught-exception cases until then). 
+ this._lockExitHandler = () => releaseLockSync(this._lockHandle); + process.on('exit', this._lockExitHandler); + } + // started monitoring system metrics if (this.systemMonitoringEnabled()) { @@ -263,11 +279,25 @@ export class Percy { await this.#discovery.end(); await this.#snapshots.end(); + // PER-7855 Phase 2: release the lock on failed start so a retry + // doesn't see this aborted attempt as "already running." + this._releaseLock(); + // mark this instance as closed unless aborting this.readyState = error.name !== 'AbortError' ? 3 : null; - // throw an easier-to-understand error when the port is in use - if (error.code === 'EADDRINUSE') { + // throw an easier-to-understand error when the port is in use. + // PER-7855 Phase 2: a held lockfile fails before server.listen, + // so EADDRINUSE no longer fires for the "Percy already running" + // case — surface LockHeldError under the same legacy message + // (downstream tools may grep for it) but ALSO log the actionable + // detail (pid + lock path) so users can recover. + if (error.name === 'LockHeldError') { + this.log.error(error.message); + let errMsg = `Percy is already running or the port ${this.port} is in use`; + await this.suggestionsForFix(errMsg); + throw new Error(errMsg); + } else if (error.code === 'EADDRINUSE') { let errMsg = `Percy is already running or the port ${this.port} is in use`; await this.suggestionsForFix(errMsg); throw new Error(errMsg); @@ -278,6 +308,17 @@ export class Percy { } } + // PER-7855 Phase 2: idempotent lock release used by both the + // success and failure paths in start/stop. 
+ _releaseLock() { + if (this._lockExitHandler) { + process.off('exit', this._lockExitHandler); + this._lockExitHandler = null; + } + releaseLockSync(this._lockHandle); + this._lockHandle = null; + } + // Resolves once snapshot and upload queues are idle async *idle() { yield* this.#discovery.idle(); @@ -373,6 +414,12 @@ export class Percy { this.monitoring.stopMonitoring(); clearTimeout(this.resetMonitoringId); + // PER-7855 Phase 2: release per-port lock after the server + // socket is closed (the unlink itself is sync, but ordering + // after `server?.close()` keeps the post-condition that "lock + // present ⇒ server bound" until the very end). + this._releaseLock(); + // This issue doesn't comes under regular error logs, // it's detected if we just and stop percy server await this.checkForNoSnapshotCommandError(); diff --git a/packages/core/test/helpers/index.js b/packages/core/test/helpers/index.js index 11a54e699..96553bdb3 100644 --- a/packages/core/test/helpers/index.js +++ b/packages/core/test/helpers/index.js @@ -12,6 +12,13 @@ export function mockfs(initial) { path.resolve(url.fileURLToPath(import.meta.url), '/../../../dom/dist/bundle.js'), path.resolve(url.fileURLToPath(import.meta.url), '../secretPatterns.yml'), p => p.includes?.('.local-chromium'), + // PER-7855 Phase 2: per-port lockfiles live under ~/.percy/. They + // are infrastructure (not test fixture data), so route them through + // the real fs. Tests on a developer machine may briefly see lock + // files appear under ~/.percy/ during a run; they are cleaned up in + // Percy.stop() and are guarded against same-process collision by + // the self-pid stale optimization in lock.js. + p => typeof p === 'string' && p.includes('/.percy/agent-'), ...(initial?.$bypass ?? 
[]) ] }); diff --git a/packages/core/test/unit/lock.test.js b/packages/core/test/unit/lock.test.js new file mode 100644 index 000000000..5f8b63708 --- /dev/null +++ b/packages/core/test/unit/lock.test.js @@ -0,0 +1,172 @@ +import { setupTest } from '../helpers/index.js'; +import { mkdirSync, mkdtempSync, writeFileSync, readFileSync, existsSync, statSync, rmSync } from 'fs'; +import { join } from 'path'; +import os from 'os'; +import { acquireLock, releaseLockSync, lockPathFor, LockHeldError } from '../../src/lock.js'; + +describe('Unit / Lock', () => { + let fakeHome; + + beforeEach(async () => { + await setupTest(); + + // Redirect `os.homedir()` to a per-test tmp dir so we never touch + // the real $HOME. mkdtempSync gives a unique, writable dir. + fakeHome = mkdtempSync(join(os.tmpdir(), 'percy-lock-test-')); + spyOn(os, 'homedir').and.returnValue(fakeHome); + }); + + afterEach(() => { + /* istanbul ignore next: best-effort cleanup */ + try { rmSync(fakeHome, { recursive: true, force: true }); } catch {} + }); + + describe('acquireLock', () => { + it('writes a lock with our pid, port and an ISO startedAt', () => { + let handle = acquireLock({ port: 5338 }); + let parsed = JSON.parse(readFileSync(handle.path, 'utf-8')); + + expect(parsed.pid).toBe(process.pid); + expect(parsed.port).toBe(5338); + expect(parsed.startedAt).toMatch(/^\d{4}-\d{2}-\d{2}T/); + expect(handle.path).toBe(lockPathFor(5338)); + }); + + it('creates ~/.percy/ if it does not exist (mkdir -p)', () => { + let dir = join(fakeHome, '.percy'); + expect(existsSync(dir)).toBe(false); + acquireLock({ port: 5338 }); + expect(existsSync(dir)).toBe(true); + }); + + // SC3 — stale-lock reclaim + it('reclaims a stale lock whose recorded pid is dead', () => { + // PID 99999999 is reliably non-existent (Linux pid_max is ~4M). 
+ let stalePath = lockPathFor(5338); + mkdirSync(join(fakeHome, '.percy'), { recursive: true }); + writeFileSync(stalePath, JSON.stringify({ pid: 99999999, port: 5338, startedAt: '1970-01-01T00:00:00.000Z' })); + + let handle = acquireLock({ port: 5338 }); + + let parsed = JSON.parse(readFileSync(handle.path, 'utf-8')); + expect(parsed.pid).toBe(process.pid); + }); + + // SC4 — live-lock refusal with actionable message. + // We mock process.kill so we don't depend on a specific live pid + // existing on the test host, and so the recorded pid (12345) is + // distinguishable from process.pid (otherwise the self-pid stale + // optimization would kick in and reclaim). + it('throws LockHeldError when a live foreign process holds the lock', () => { + let livePid = 12345; + let path = lockPathFor(5338); + mkdirSync(join(fakeHome, '.percy'), { recursive: true }); + writeFileSync(path, JSON.stringify({ pid: livePid, port: 5338, startedAt: '2026-04-27T10:00:00.000Z' })); + spyOn(process, 'kill').and.returnValue(true); + + let err; + try { acquireLock({ port: 5338 }); } catch (e) { err = e; } + + expect(err).toBeInstanceOf(LockHeldError); + expect(err.meta.pid).toBe(livePid); + expect(err.meta.port).toBe(5338); + expect(err.lockPath).toBe(path); + expect(err.message).toContain(`pid ${livePid}`); + expect(err.message).toContain(path); + }); + + it('reclaims a self-pid lock (leaked from earlier in same process)', () => { + let path = lockPathFor(5338); + mkdirSync(join(fakeHome, '.percy'), { recursive: true }); + writeFileSync(path, JSON.stringify({ pid: process.pid, port: 5338, startedAt: '2026-04-27T10:00:00.000Z' })); + + let handle = acquireLock({ port: 5338 }); + + let parsed = JSON.parse(readFileSync(handle.path, 'utf-8')); + expect(parsed.pid).toBe(process.pid); + expect(parsed.startedAt).not.toBe('2026-04-27T10:00:00.000Z'); + }); + + it('treats EPERM from process.kill as alive (cross-user pid)', () => { + let path = lockPathFor(5338); + mkdirSync(join(fakeHome, 
'.percy'), { recursive: true }); + writeFileSync(path, JSON.stringify({ pid: 1, port: 5338, startedAt: '2026-04-27T10:00:00.000Z' })); + + // Stub process.kill to throw EPERM (different-user-owned pid). + spyOn(process, 'kill').and.callFake(() => { + let err = new Error('Operation not permitted'); + err.code = 'EPERM'; + throw err; + }); + + expect(() => acquireLock({ port: 5338 })).toThrowMatching(e => e instanceof LockHeldError); + }); + + it('reclaims a corrupt-payload lock (truncated JSON)', () => { + let path = lockPathFor(5338); + mkdirSync(join(fakeHome, '.percy'), { recursive: true }); + writeFileSync(path, '{not valid json'); // simulate mid-write crash + + let handle = acquireLock({ port: 5338 }); + + let parsed = JSON.parse(readFileSync(handle.path, 'utf-8')); + expect(parsed.pid).toBe(process.pid); + }); + + // SC5 — parallel multi-port: two locks on different ports coexist. + it('allows distinct ports to lock concurrently', () => { + let h1 = acquireLock({ port: 5338 }); + let h2 = acquireLock({ port: 5339 }); + + expect(h1.path).not.toBe(h2.path); + expect(existsSync(h1.path)).toBe(true); + expect(existsSync(h2.path)).toBe(true); + }); + + // POSIX-only: mode bits aren't faithfully represented on Windows. 
+ it('writes lock file with mode 0o600 and parent dir 0o700', () => { + if (process.platform.startsWith('win')) { + pending('mode bits not preserved on Windows'); + return; + } + + let handle = acquireLock({ port: 5338 }); + let fileMode = statSync(handle.path).mode & 0o777; + let dirMode = statSync(join(fakeHome, '.percy')).mode & 0o777; + + expect(fileMode).toBe(0o600); + expect(dirMode).toBe(0o700); + }); + }); + + describe('releaseLockSync', () => { + it('removes the lock file', () => { + let handle = acquireLock({ port: 5338 }); + expect(existsSync(handle.path)).toBe(true); + + releaseLockSync(handle); + + expect(existsSync(handle.path)).toBe(false); + }); + + it('is a no-op for a missing handle', () => { + expect(() => releaseLockSync(undefined)).not.toThrow(); + expect(() => releaseLockSync({})).not.toThrow(); + expect(() => releaseLockSync(null)).not.toThrow(); + }); + + it('is a no-op when the lock file is already gone', () => { + let handle = acquireLock({ port: 5338 }); + releaseLockSync(handle); + expect(() => releaseLockSync(handle)).not.toThrow(); + }); + + it('lets a fresh process re-acquire after release', () => { + let h1 = acquireLock({ port: 5338 }); + releaseLockSync(h1); + + let h2 = acquireLock({ port: 5338 }); + expect(h2.path).toBe(h1.path); + }); + }); +});