fix(cli-provider): add timeout protection for external CLI processes

Prevent builds from hanging indefinitely when CLI provider subprocess
is unresponsive. Default 2h for runTaskInWorkflow, 1h for watchWorkflow.
Graceful SIGTERM with 10s grace before SIGKILL.

- Added RUN_TASK_TIMEOUT_MS (2 hours) and WATCH_WORKFLOW_TIMEOUT_MS (1 hour)
- Added gracefulKill helper: SIGTERM first, SIGKILL after 10s grace period
- runTaskInWorkflow and watchWorkflow now have timeout protection
- Existing execute() method upgraded to use gracefulKill
- core.error() called with clear human-readable timeout message
- Added comprehensive tests: timeout triggers, SIGKILL escalation,
  grace period cancellation on voluntary exit, normal completion

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
frostebite
2026-03-05 12:55:14 +00:00
parent 007852a800
commit fe63d7b32d
2 changed files with 173 additions and 7 deletions

View File

@@ -20,6 +20,7 @@ jest.mock('@actions/core', () => ({
jest.mock('../provider-git-manager');
import { spawn } from 'child_process';
import * as core from '@actions/core';
import CliProvider from './cli-provider';
const mockSpawn = spawn as jest.MockedFunction<typeof spawn>;
@@ -215,6 +216,18 @@ describe('CliProvider', () => {
const result = await promise;
expect(result).toBe('line 1\nline 2');
});
it('rejects on spawn error', async () => {
const child = createMockChildProcess();
mockSpawn.mockReturnValue(child);
const provider = new CliProvider('/nonexistent/path', {} as any);
const promise = provider.runTaskInWorkflow('guid', 'image', 'cmd', '/mnt', '/work', [], []);
child.emit('error', new Error('ENOENT'));
await expect(promise).rejects.toThrow('failed to spawn executable');
});
});
describe('cleanupWorkflow', () => {
@@ -404,7 +417,7 @@ describe('CliProvider', () => {
jest.useRealTimers();
});
it('rejects and kills process when command times out', async () => {
it('rejects and kills process when execute command times out', async () => {
const child = createMockChildProcess();
mockSpawn.mockReturnValue(child);
@@ -417,6 +430,97 @@ describe('CliProvider', () => {
await expect(promise).rejects.toThrow('timed out');
expect(child.kill).toHaveBeenCalledWith('SIGTERM');
});
it('rejects and kills process when runTaskInWorkflow times out', async () => {
const child = createMockChildProcess();
mockSpawn.mockReturnValue(child);
const provider = new CliProvider('/path/to/exe', {} as any);
const promise = provider.runTaskInWorkflow('guid', 'image', 'cmd', '/mnt', '/work', [], []);
// Advance past the 2-hour timeout (7_200_000ms)
jest.advanceTimersByTime(7_200_001);
await expect(promise).rejects.toThrow('run-task timed out');
expect(child.kill).toHaveBeenCalledWith('SIGTERM');
expect(core.error).toHaveBeenCalledWith(expect.stringContaining('CLI provider timed out after 120 minutes'));
});
it('rejects and kills process when watchWorkflow times out', async () => {
const child = createMockChildProcess();
mockSpawn.mockReturnValue(child);
const provider = new CliProvider('/path/to/exe', {} as any);
const promise = provider.watchWorkflow();
// Advance past the 1-hour timeout (3_600_000ms)
jest.advanceTimersByTime(3_600_001);
await expect(promise).rejects.toThrow('watch-workflow timed out');
expect(child.kill).toHaveBeenCalledWith('SIGTERM');
expect(core.error).toHaveBeenCalledWith(expect.stringContaining('CLI provider timed out after 60 minutes'));
});
it('escalates to SIGKILL after grace period on runTaskInWorkflow timeout', async () => {
const child = createMockChildProcess();
mockSpawn.mockReturnValue(child);
const provider = new CliProvider('/path/to/exe', {} as any);
const promise = provider.runTaskInWorkflow('guid', 'image', 'cmd', '/mnt', '/work', [], []);
// Trigger the timeout
jest.advanceTimersByTime(7_200_001);
await expect(promise).rejects.toThrow('timed out');
// SIGTERM was sent
expect(child.kill).toHaveBeenCalledWith('SIGTERM');
// Advance past the 10s grace period — SIGKILL should fire
jest.advanceTimersByTime(10_001);
expect(child.kill).toHaveBeenCalledWith('SIGKILL');
});
it('does not send SIGKILL if process exits before grace period', async () => {
const child = createMockChildProcess();
mockSpawn.mockReturnValue(child);
const provider = new CliProvider('/path/to/exe', {} as any);
const promise = provider.runTaskInWorkflow('guid', 'image', 'cmd', '/mnt', '/work', [], []);
// Trigger the timeout
jest.advanceTimersByTime(7_200_001);
await expect(promise).rejects.toThrow('timed out');
// Process exits voluntarily after SIGTERM
child.emit('close', 143);
// Advance past the grace period — SIGKILL should NOT fire because process already exited
jest.advanceTimersByTime(10_001);
expect(child.kill).toHaveBeenCalledWith('SIGTERM');
// SIGKILL should not have been called because the close event cleared the timer
expect(child.kill).not.toHaveBeenCalledWith('SIGKILL');
});
it('clears timeout when runTaskInWorkflow completes normally', async () => {
const child = createMockChildProcess();
mockSpawn.mockReturnValue(child);
const provider = new CliProvider('/path/to/exe', {} as any);
const promise = provider.runTaskInWorkflow('guid', 'image', 'cmd', '/mnt', '/work', [], []);
// Process completes before timeout
child.stdout.emit('data', Buffer.from(JSON.stringify({ success: true, output: 'done' }) + '\n'));
child.emit('close', 0);
const result = await promise;
expect(result).toBe('done');
// Advance far past timeout — should NOT reject
jest.advanceTimersByTime(8_000_000);
expect(child.kill).not.toHaveBeenCalled();
});
});
describe('available providers list', () => {

View File

@@ -1,4 +1,5 @@
import { spawn } from 'child_process';
import { spawn, ChildProcess } from 'child_process';
import * as core from '@actions/core';
import { ProviderInterface } from '../provider-interface';
import BuildParameters from '../../../build-parameters';
import OrchestratorEnvironmentVariable from '../../options/orchestrator-environment-variable';
@@ -9,6 +10,29 @@ import OrchestratorLogger from '../../services/core/orchestrator-logger';
import { CliProviderRequest, CliProviderResponse, CliProviderSubcommand } from './cli-provider-protocol';
const DEFAULT_TIMEOUT_MS = 300_000; // 300 seconds
const RUN_TASK_TIMEOUT_MS = 7_200_000; // 2 hours
const WATCH_WORKFLOW_TIMEOUT_MS = 3_600_000; // 1 hour
const SIGKILL_GRACE_MS = 10_000; // 10 seconds grace period before SIGKILL
/**
* Gracefully kill a child process: SIGTERM first, then SIGKILL after a grace period.
*/
function gracefulKill(child: ChildProcess, graceMs: number = SIGKILL_GRACE_MS): void {
child.kill('SIGTERM');
const forceKillTimer = setTimeout(() => {
try {
child.kill('SIGKILL');
} catch {
// Process may already be dead
}
}, graceMs);
// Clear the force-kill timer if the process exits on its own
child.on('close', () => {
clearTimeout(forceKillTimer);
});
}
class CliProvider implements ProviderInterface {
private readonly executablePath: string;
@@ -74,6 +98,8 @@ class CliProvider implements ProviderInterface {
},
};
const timeoutMs = RUN_TASK_TIMEOUT_MS;
return new Promise<string>((resolve, reject) => {
const child = spawn(this.executablePath, ['run-task'], {
stdio: ['pipe', 'pipe', 'pipe'],
@@ -83,6 +109,17 @@ class CliProvider implements ProviderInterface {
let lastJsonResponse: CliProviderResponse | undefined;
const outputLines: string[] = [];
let stderrOutput = '';
let timedOut = false;
// Set up timeout to prevent indefinite hangs
const timer = setTimeout(() => {
timedOut = true;
const minutes = Math.round(timeoutMs / 60_000);
const message = `CLI provider timed out after ${minutes} minutes. The external provider may be unresponsive.`;
core.error(message);
gracefulKill(child);
reject(new Error(`CliProvider run-task timed out after ${timeoutMs}ms`));
}, timeoutMs);
child.stdin.write(JSON.stringify(request));
child.stdin.end();
@@ -123,10 +160,16 @@ class CliProvider implements ProviderInterface {
});
child.on('error', (error: Error) => {
reject(new Error(`CliProvider: failed to spawn executable '${this.executablePath}': ${error.message}`));
clearTimeout(timer);
if (!timedOut) {
reject(new Error(`CliProvider: failed to spawn executable '${this.executablePath}': ${error.message}`));
}
});
child.on('close', (code: number | null) => {
clearTimeout(timer);
if (timedOut) return;
if (lastJsonResponse) {
if (lastJsonResponse.success) {
resolve(lastJsonResponse.output || outputLines.join('\n'));
@@ -182,6 +225,8 @@ class CliProvider implements ProviderInterface {
params: {},
};
const timeoutMs = WATCH_WORKFLOW_TIMEOUT_MS;
return new Promise<string>((resolve, reject) => {
const child = spawn(this.executablePath, ['watch-workflow'], {
stdio: ['pipe', 'pipe', 'pipe'],
@@ -190,6 +235,17 @@ class CliProvider implements ProviderInterface {
let lastJsonResponse: CliProviderResponse | undefined;
const outputLines: string[] = [];
let timedOut = false;
// Set up timeout to prevent indefinite hangs
const timer = setTimeout(() => {
timedOut = true;
const minutes = Math.round(timeoutMs / 60_000);
const message = `CLI provider timed out after ${minutes} minutes. The external provider may be unresponsive.`;
core.error(message);
gracefulKill(child);
reject(new Error(`CliProvider watch-workflow timed out after ${timeoutMs}ms`));
}, timeoutMs);
child.stdin.write(JSON.stringify(request));
child.stdin.end();
@@ -225,10 +281,16 @@ class CliProvider implements ProviderInterface {
});
child.on('error', (error: Error) => {
reject(new Error(`CliProvider: failed to spawn executable '${this.executablePath}': ${error.message}`));
clearTimeout(timer);
if (!timedOut) {
reject(new Error(`CliProvider: failed to spawn executable '${this.executablePath}': ${error.message}`));
}
});
child.on('close', (code: number | null) => {
clearTimeout(timer);
if (timedOut) return;
if (lastJsonResponse) {
if (lastJsonResponse.success) {
resolve(lastJsonResponse.output || outputLines.join('\n'));
@@ -246,7 +308,7 @@ class CliProvider implements ProviderInterface {
/**
* Execute a CLI provider subcommand with a default timeout.
* Used for all methods except runTaskInWorkflow and watchWorkflow (which have no timeout).
* Timeout applies a graceful SIGTERM followed by SIGKILL after a grace period.
*/
private execute(
command: CliProviderSubcommand,
@@ -265,10 +327,10 @@ class CliProvider implements ProviderInterface {
let stderrData = '';
let timedOut = false;
// Set up timeout
// Set up timeout with graceful kill
const timer = setTimeout(() => {
timedOut = true;
child.kill('SIGTERM');
gracefulKill(child);
reject(new Error(`CliProvider: command '${command}' timed out after ${timeoutMs}ms`));
}, timeoutMs);