fix: use companion watcher process for async job completion

The async startJob previously relied on Node.js event listeners in the
CLI process to capture child output and finalize the job file. But the
CLI process exits immediately after returning the job ID, killing the
event loop before the close handler fires — leaving jobs stuck at
'running' forever.

Fix: startJob now spawns a companion watcher process (job-watcher.ts)
that is itself detached and outlives the CLI. The watcher:
- Spawns the actual client (codex/claude/opencode)
- Captures stdout/stderr
- Writes the final job record to disk on child exit
- Has its own 10-minute timeout safety net

The CLI returns the job ID immediately. The watcher independently
finalizes the job. The CLI no longer needs to stay alive.

Also updates the tests to mock the watcher spawn via an injectable
spawnWatcher option.
This commit is contained in:
2026-05-20 14:08:44 -05:00
parent 017eb1b410
commit 33c898ff9a
3 changed files with 265 additions and 137 deletions
+108 -3
View File
@@ -137,6 +137,95 @@ function mockSpawn(scenarios: Map<string, MockScenario>): any {
};
}
/**
 * Creates a mock spawnWatcher that simulates the job-watcher.ts behavior:
 * spawns the mock child, captures output, and writes the final job record.
 *
 * @param scenarios - Mock scenarios keyed by the command and its arguments
 *   joined with single spaces. A missing key is treated as a scenario whose
 *   error is "spawn ENOENT".
 * @param fs - Mock filesystem that holds the job record being updated.
 * @param opts - Optional settings. `watcherTimeoutMs` is the delay before a
 *   hanging job that is still "running" is marked "timed_out" (default 30).
 * @returns A function with the spawnWatcher shape. It returns a fake watcher
 *   handle with a fixed pid of 99999 and a no-op `unref`.
 */
function createMockWatcher(
  scenarios: Map<string, MockScenario>,
  fs: ReturnType<typeof createMockFs>,
  opts?: { watcherTimeoutMs?: number }
): (jobFilePath: string, command: string, args: string[]) => { pid: number; unref: () => void } {
  return (jobFilePath: string, command: string, clientArgs: string[]) => {
    const key = [command, ...clientArgs].join(" ");
    const scenario = scenarios.get(key) ?? { error: Object.assign(new Error("spawn ENOENT"), { code: "ENOENT" }) };
    const child = createMockChildProcess(scenario);
    const watcherPid = 99999;

    // Accumulate everything the mock child writes so it can be stored in the record.
    let stdout = "";
    let stderr = "";
    child.stdout?.on("data", (chunk: Buffer | string) => { stdout += chunk.toString(); });
    child.stderr?.on("data", (chunk: Buffer | string) => { stderr += chunk.toString(); });

    child.on("close", (code: number | null) => {
      // Read current record, update with results
      let record: JobRecord;
      try {
        record = JSON.parse(fs.readFileSync(jobFilePath)) as JobRecord;
      } catch {
        // The job file is missing or unparsable: nothing to finalize.
        return;
      }
      const durationMs = Date.now() - new Date(record.startedAt).getTime();
      // An exit code of 0 or null is recorded as success; anything else as failure.
      const status = code === 0 || code === null ? "completed" : "failed";
      record.status = status;
      record.stdout = stdout;
      record.stderr = stderr;
      record.completedAt = new Date().toISOString();
      record.result = { stdout, stderr, exitCode: code ?? 0, client: record.client, durationMs };
      // A scenario-level error always wins over the exit code.
      if (scenario.error) {
        record.status = "failed";
        record.error = scenario.error.message;
      }
      fs.writeFileSync(jobFilePath, JSON.stringify(record));
    });

    child.on("error", (err: NodeJS.ErrnoException) => {
      let record: JobRecord;
      try {
        record = JSON.parse(fs.readFileSync(jobFilePath)) as JobRecord;
      } catch {
        return;
      }
      // Spawn-level failure: mark the job failed without attaching a result.
      record.status = "failed";
      record.error = err.message;
      record.completedAt = new Date().toISOString();
      fs.writeFileSync(jobFilePath, JSON.stringify(record));
    });

    // For hang scenarios, simulate a timeout by writing timed_out after a delay
    if (scenario.hang) {
      const watcherTimeout = opts?.watcherTimeoutMs ?? 30;
      const timeoutTimer = setTimeout(() => {
        try {
          const existing = JSON.parse(fs.readFileSync(jobFilePath)) as JobRecord;
          // Don't overwrite if already cancelled/completed
          if (existing.status !== "running") return;
          existing.status = "timed_out";
          existing.completedAt = new Date().toISOString();
          existing.result = { stdout, stderr, exitCode: -1, client: existing.client, durationMs: watcherTimeout };
          fs.writeFileSync(jobFilePath, JSON.stringify(existing));
        } catch { /* ignore */ }
      }, watcherTimeout);
      // A long simulated timeout (e.g. 5000 ms) must not keep the test process
      // alive after the test has finished; the timer still fires while other
      // work keeps the event loop running.
      timeoutTimer.unref();
    }

    // Simulate watcher updating the PID in the job file
    try {
      const record = JSON.parse(fs.readFileSync(jobFilePath)) as JobRecord;
      record.pid = child.pid;
      fs.writeFileSync(jobFilePath, JSON.stringify(record));
    } catch { /* ignore */ }

    return { pid: watcherPid, unref: () => {} };
  };
}
/**
 * Bundles the mocks a job test needs: a fresh mock filesystem, a mock spawn
 * and a mock watcher built from the same scenarios, plus the job directory
 * passed through unchanged.
 */
function createJobTestHelper(scenarios: Map<string, MockScenario>, jobDir: string) {
  const mockFs = createMockFs();
  // The watcher shares the filesystem instance that is returned to the caller.
  const watcher = createMockWatcher(scenarios, mockFs);
  const spawner = mockSpawn(scenarios);
  return {
    fs: mockFs,
    spawn: spawner,
    spawnWatcher: watcher,
    jobDir,
  };
}
/**
 * Reads the file at `path` from the mock filesystem and parses it as a
 * JSON-encoded JobRecord. The parsed value is not validated.
 */
function readJobRecord(fs: ReturnType<typeof createMockFs>, path: string): JobRecord {
  const serialized = fs.readFileSync(path);
  const parsed = JSON.parse(serialized) as JobRecord;
  return parsed;
}
@@ -156,6 +245,7 @@ describe("startJob", () => {
const job = await startJob("codex", "hello world", {
jobDir,
spawn: mockSpawn(scenarios),
spawnWatcher: createMockWatcher(scenarios, fs),
fs,
});
@@ -188,11 +278,13 @@ describe("startJob", () => {
const job1 = await startJob("codex", "a", {
jobDir,
spawn: mockSpawn(scenarios),
spawnWatcher: createMockWatcher(scenarios, fs),
fs,
});
const job2 = await startJob("codex", "b", {
jobDir,
spawn: mockSpawn(scenarios),
spawnWatcher: createMockWatcher(scenarios, fs),
fs,
});
@@ -209,6 +301,7 @@ describe("startJob", () => {
const job = await startJob("codex", "slow", {
jobDir,
spawn: mockSpawn(scenarios),
spawnWatcher: createMockWatcher(scenarios, fs),
fs,
timeoutMs: 20,
});
@@ -230,6 +323,7 @@ describe("startJob", () => {
const job = await startJob("codex", "fail", {
jobDir,
spawn: mockSpawn(scenarios),
spawnWatcher: createMockWatcher(scenarios, fs),
fs,
});
@@ -249,6 +343,7 @@ describe("startJob", () => {
const job = await startJob("codex", "hello", {
jobDir,
spawn: mockSpawn(scenarios),
spawnWatcher: createMockWatcher(scenarios, fs),
fs,
});
@@ -263,7 +358,7 @@ describe("startJob", () => {
describe("getJob", () => {
it("returns the current Job state from disk", async () => {
const scenarios = new Map<string, MockScenario>([
["codex exec --yolo hello", { stdout: "ok", exitCode: 0 }],
["codex exec --yolo hello", { hang: true }],
]);
const fs = createMockFs();
const jobDir = "/tmp/jobs";
@@ -271,6 +366,7 @@ describe("getJob", () => {
const job = await startJob("codex", "hello", {
jobDir,
spawn: mockSpawn(scenarios),
spawnWatcher: createMockWatcher(scenarios, fs, { watcherTimeoutMs: 5000 }),
fs,
});
@@ -280,9 +376,9 @@ describe("getJob", () => {
await delay(50);
// Job should still be running since watcher timeout is 5s
const after = getJob(job.id, { jobDir, fs });
assert.strictEqual(after.status, "completed");
assert.strictEqual(after.result?.exitCode, 0);
assert.strictEqual(after.status, "running");
});
it("throws JobNotFoundError for nonexistent job", () => {
@@ -305,6 +401,7 @@ describe("getJobResult", () => {
const job = await startJob("codex", "hello", {
jobDir,
spawn: mockSpawn(scenarios),
spawnWatcher: createMockWatcher(scenarios, fs),
fs,
});
@@ -325,6 +422,7 @@ describe("getJobResult", () => {
const job = await startJob("codex", "hello", {
jobDir,
spawn: mockSpawn(scenarios),
spawnWatcher: createMockWatcher(scenarios, fs),
fs,
timeoutMs: 50,
});
@@ -345,6 +443,7 @@ describe("getJobResult", () => {
const job = await startJob("codex", "fail", {
jobDir,
spawn: mockSpawn(scenarios),
spawnWatcher: createMockWatcher(scenarios, fs),
fs,
});
@@ -376,6 +475,7 @@ describe("cancelJob", () => {
const job = await startJob("codex", "hello", {
jobDir,
spawn: mockSpawn(scenarios),
spawnWatcher: createMockWatcher(scenarios, fs, { watcherTimeoutMs: 5000 }),
fs,
});
@@ -439,12 +539,14 @@ describe("listJobs", () => {
const job1 = await startJob("codex", "a", {
jobDir,
spawn: mockSpawn(scenarios),
spawnWatcher: createMockWatcher(scenarios, fs),
fs,
});
await delay(20);
const job2 = await startJob("codex", "b", {
jobDir,
spawn: mockSpawn(scenarios),
spawnWatcher: createMockWatcher(scenarios, fs),
fs,
});
@@ -577,6 +679,7 @@ describe("cleanupJobs", () => {
const oldJob = await startJob("codex", "old", {
jobDir,
spawn: mockSpawn(scenarios),
spawnWatcher: createMockWatcher(scenarios, fs),
fs,
});
await delay(50);
@@ -584,6 +687,7 @@ describe("cleanupJobs", () => {
const newJob = await startJob("codex", "new", {
jobDir,
spawn: mockSpawn(scenarios),
spawnWatcher: createMockWatcher(scenarios, fs),
fs,
});
await delay(50);
@@ -610,6 +714,7 @@ describe("cleanupJobs", () => {
const job = await startJob("codex", "old", {
jobDir,
spawn: mockSpawn(scenarios),
spawnWatcher: createMockWatcher(scenarios, fs),
fs,
});
await delay(50);