feat(orchestrator): add retry-on-fallback and provider init timeout

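Adds retryOnFallback (retry a failed build once on the configured fallback
provider) and providerInitTimeout (fail provider initialization if it takes
longer than the given number of seconds, which triggers the fallback retry
when retryOnFallback is enabled). Refactors run() into
run()/runWithProvider() to support the retry.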

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
frostebite
2026-03-05 07:45:54 +00:00
parent 786ee3799c
commit 8194790728
6 changed files with 133 additions and 8 deletions
+4
View File
@@ -58,6 +58,8 @@ class BuildParameters {
public runnerCheckEnabled!: boolean;
public runnerCheckLabels!: string[];
public runnerCheckMinAvailable!: number;
public retryOnFallback!: boolean;
public providerInitTimeout!: number;
public gitPrivateToken!: string;
public awsStackName!: string;
public awsEndpoint?: string;
@@ -202,6 +204,8 @@ class BuildParameters {
runnerCheckEnabled: OrchestratorOptions.runnerCheckEnabled,
runnerCheckLabels: OrchestratorOptions.runnerCheckLabels,
runnerCheckMinAvailable: OrchestratorOptions.runnerCheckMinAvailable,
retryOnFallback: OrchestratorOptions.retryOnFallback,
providerInitTimeout: OrchestratorOptions.providerInitTimeout,
buildPlatform: OrchestratorOptions.buildPlatform,
kubeConfig: OrchestratorOptions.kubeConfig,
containerMemory: OrchestratorOptions.containerMemory,
@@ -156,6 +156,14 @@ class OrchestratorOptions {
return Number(OrchestratorOptions.getInput('runnerCheckMinAvailable')) || 1;
}
/**
 * Whether a failed build should be retried on the fallback provider.
 * Enabled only when the `retryOnFallback` input is exactly the string 'true'.
 */
static get retryOnFallback(): boolean {
  const rawValue = OrchestratorOptions.getInput('retryOnFallback');

  return rawValue === 'true';
}
/**
 * Provider initialization timeout, in seconds, read from the
 * `providerInitTimeout` input. Falls back to 0 when the input is missing,
 * not numeric, or zero.
 */
static get providerInitTimeout(): number {
  const parsedTimeout = Number(OrchestratorOptions.getInput('providerInitTimeout'));

  // NaN and 0 are both falsy, so either one collapses to the 0 default.
  return parsedTimeout ? parsedTimeout : 0;
}
/**
 * Value of the `containerCpu` input, or `1024` when the input is unset or empty.
 */
static get containerCpu(): string {
  const configuredCpu = OrchestratorOptions.getInput('containerCpu');
  if (configuredCpu) {
    return configuredCpu;
  }

  return `1024`;
}
+61 -6
View File
@@ -217,6 +217,30 @@ class Orchestrator {
if (baseImage.includes(`undefined`)) {
throw new Error(`baseImage is undefined`);
}
try {
return await Orchestrator.runWithProvider(buildParameters, baseImage);
} catch (primaryError: any) {
// Retry on fallback provider if enabled and a fallback is configured
const fallback = buildParameters.fallbackProviderStrategy;
const alreadyOnFallback = buildParameters.providerStrategy === fallback;
if (buildParameters.retryOnFallback && fallback && !alreadyOnFallback) {
OrchestratorLogger.log(
`Primary provider '${buildParameters.providerStrategy}' failed: ${primaryError.message}`,
);
OrchestratorLogger.log(`Retrying build on fallback provider '${fallback}'...`);
buildParameters.providerStrategy = fallback;
core.setOutput('providerFallbackUsed', 'true');
core.setOutput('providerFallbackReason', `Primary provider failed: ${primaryError.message}`);
return await Orchestrator.runWithProvider(buildParameters, baseImage);
}
throw primaryError;
}
}
private static async runWithProvider(buildParameters: BuildParameters, baseImage: string) {
await Orchestrator.setup(buildParameters);
// When aws-local mode is enabled, validate AWS CloudFormation templates
@@ -224,12 +248,10 @@ class Orchestrator {
if (Orchestrator.validateAwsTemplates) {
await Orchestrator.validateAwsCloudFormationTemplates();
}
await Orchestrator.Provider.setupWorkflow(
Orchestrator.buildParameters.buildGuid,
Orchestrator.buildParameters,
Orchestrator.buildParameters.branch,
Orchestrator.defaultSecrets,
);
// Setup workflow with optional init timeout
await Orchestrator.setupWorkflowWithTimeout();
try {
if (buildParameters.maxRetainedWorkspaces > 0) {
Orchestrator.lockedWorkspace = SharedWorkspaceLocking.NewWorkspaceName();
@@ -310,6 +332,39 @@ class Orchestrator {
}
}
/**
 * Runs the provider's setupWorkflow, optionally bounded by providerInitTimeout.
 *
 * When `buildParameters.providerInitTimeout` is a positive number of seconds and
 * the provider has not finished initializing within that time, this rejects with
 * a "Provider initialization timed out" error. The error propagates out of
 * runWithProvider, which lets run() retry on the fallback provider when
 * retryOnFallback is enabled.
 *
 * Note: on timeout the underlying setupWorkflow call is not cancelled; it is
 * merely no longer awaited and may keep running in the background.
 *
 * @throws Error when initialization exceeds the configured timeout, or whatever
 *   error setupWorkflow itself rejects with.
 */
private static async setupWorkflowWithTimeout(): Promise<void> {
  const timeoutSeconds = Orchestrator.buildParameters.providerInitTimeout;

  const setupPromise = Orchestrator.Provider.setupWorkflow(
    Orchestrator.buildParameters.buildGuid,
    Orchestrator.buildParameters,
    Orchestrator.buildParameters.branch,
    Orchestrator.defaultSecrets,
  );

  // Treat anything that is not a strictly positive number (0, negative, NaN,
  // undefined) as "no timeout". A plain `<= 0` check would let NaN/undefined
  // through, and setTimeout with a NaN delay fires almost immediately.
  if (!(timeoutSeconds > 0)) {
    await setupPromise;

    return;
  }

  OrchestratorLogger.log(`Provider init timeout: ${timeoutSeconds}s`);

  let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
  const timeoutPromise = new Promise<never>((_, reject) => {
    timeoutHandle = setTimeout(
      () => reject(new Error(`Provider initialization timed out after ${timeoutSeconds}s`)),
      timeoutSeconds * 1000,
    );
  });

  try {
    await Promise.race([setupPromise, timeoutPromise]);
  } finally {
    // Always cancel the timer: if setup won the race, a pending timer would
    // otherwise keep the Node event loop alive for the rest of the timeout.
    clearTimeout(timeoutHandle);
  }
}
private static async updateStatusWithBuildParameters() {
const content = { ...Orchestrator.buildParameters };
content.gitPrivateToken = ``;