pr feedback

This commit is contained in:
Frostebite
2025-12-06 23:00:43 +00:00
parent c61c9f8373
commit a99defafbc
18 changed files with 190 additions and 152 deletions
@@ -22,7 +22,7 @@ class KubernetesJobSpecFactory {
containerName: string,
ip: string = '',
) {
const endpointEnvNames = new Set([
const endpointEnvironmentNames = new Set([
'AWS_S3_ENDPOINT',
'AWS_ENDPOINT',
'AWS_CLOUD_FORMATION_ENDPOINT',
@@ -36,7 +36,7 @@ class KubernetesJobSpecFactory {
let value = x.value;
if (
typeof value === 'string' &&
endpointEnvNames.has(x.name) &&
endpointEnvironmentNames.has(x.name) &&
(value.startsWith('http://localhost') || value.startsWith('http://127.0.0.1'))
) {
// Replace localhost with host.k3d.internal so pods can access host services
@@ -45,6 +45,7 @@ class KubernetesJobSpecFactory {
.replace('http://localhost', 'http://host.k3d.internal')
.replace('http://127.0.0.1', 'http://host.k3d.internal');
}
return { name: x.name, value } as CloudRunnerEnvironmentVariable;
});
@@ -19,8 +19,7 @@ class KubernetesPods {
}));
const errorDetails: string[] = [];
errorDetails.push(`Pod: ${podName}`);
errorDetails.push(`Phase: ${phase}`);
errorDetails.push(`Pod: ${podName}`, `Phase: ${phase}`);
if (conditions.length > 0) {
errorDetails.push(
@@ -36,10 +35,10 @@ class KubernetesPods {
let containerSucceeded = false;
if (containerStatuses.length > 0) {
containerStatuses.forEach((cs, idx) => {
for (const [index, cs] of containerStatuses.entries()) {
if (cs.state?.waiting) {
errorDetails.push(
`Container ${idx} (${cs.name}) waiting: ${cs.state.waiting.reason} - ${cs.state.waiting.message || ''}`,
`Container ${index} (${cs.name}) waiting: ${cs.state.waiting.reason} - ${cs.state.waiting.message || ''}`,
);
}
if (cs.state?.terminated) {
@@ -49,12 +48,12 @@ class KubernetesPods {
containerSucceeded = true;
}
errorDetails.push(
`Container ${idx} (${cs.name}) terminated: ${cs.state.terminated.reason} - ${
`Container ${index} (${cs.name}) terminated: ${cs.state.terminated.reason} - ${
cs.state.terminated.message || ''
} (exit code: ${exitCode})`,
);
}
});
}
}
if (events.length > 0) {
@@ -80,6 +79,7 @@ class KubernetesPods {
);
}
CloudRunnerLogger.log(`Pod details: ${errorDetails.join('\n')}`);
// Don't throw error - container succeeded, PreStopHook failure is non-critical
return false; // Pod is not running, but we don't treat it as a failure
}
@@ -90,8 +90,9 @@ class KubernetesPods {
CloudRunnerLogger.log(
`Pod ${podName} was killed with PreStopHook failure. Waiting for container status to determine if container succeeded...`,
);
// Wait a bit for container status to become available (up to 30 seconds)
for (let i = 0; i < 6; i++) {
for (let index = 0; index < 6; index++) {
await new Promise((resolve) => setTimeout(resolve, 5000));
try {
const updatedPod = (await kubeClient.listNamespacedPod(namespace)).body.items.find(
@@ -105,6 +106,7 @@ class KubernetesPods {
CloudRunnerLogger.logWarning(
`Pod ${podName} container succeeded (exit code 0) after waiting. PreStopHook failure is non-fatal.`,
);
return false; // Pod is not running, but container succeeded
} else {
CloudRunnerLogger.log(
@@ -121,12 +123,14 @@ class KubernetesPods {
CloudRunnerLogger.log(`Error while waiting for container status: ${waitError}`);
}
}
// If we still don't have container status after waiting, but only PreStopHook failed,
// be lenient - the container might have succeeded but status wasn't updated
if (containerExitCode === undefined && hasPreStopHookFailure && !hasExceededGracePeriod) {
CloudRunnerLogger.logWarning(
`Pod ${podName} container status not available after waiting, but only PreStopHook failed (no ExceededGracePeriod). Assuming container may have succeeded.`,
);
return false; // Be lenient - PreStopHook failure alone is not fatal
}
CloudRunnerLogger.log(
@@ -139,6 +143,7 @@ class KubernetesPods {
CloudRunnerLogger.logWarning(
`Pod ${podName} has PreStopHook failure but no container failure detected. Treating as non-fatal.`,
);
return false; // PreStopHook failure alone is not fatal if container status is unclear
}
@@ -149,8 +154,10 @@ class KubernetesPods {
CloudRunnerLogger.logWarning(
`Pod ${podName} was killed (exit code 137 - likely OOM or resource limit) with PreStopHook/grace period issues. This may be a resource constraint issue rather than a build failure.`,
);
// Still log the details but don't fail the test - the build might have succeeded before being killed
CloudRunnerLogger.log(`Pod details: ${errorDetails.join('\n')}`);
return false; // Don't treat system kills as test failures if only PreStopHook issues
}
@@ -62,25 +62,30 @@ class KubernetesTaskRunner {
true,
callback,
);
// If we successfully got logs, check for end of transmission
if (FollowLogStreamService.DidReceiveEndOfTransmission) {
CloudRunnerLogger.log('end of log stream');
break;
}
// If we got logs but no end marker, continue trying (might be more logs)
if (retriesAfterFinish < KubernetesTaskRunner.maxRetry) {
retriesAfterFinish++;
continue;
}
// If we've exhausted retries, break
break;
} catch (fallbackError: any) {
CloudRunnerLogger.log(`Fallback log fetch also failed: ${fallbackError}`);
// If both fail, continue retrying if we haven't exhausted retries
if (retriesAfterFinish < KubernetesTaskRunner.maxRetry) {
retriesAfterFinish++;
continue;
}
// Only break if we've exhausted all retries
CloudRunnerLogger.logWarning(
`Could not fetch any container logs after ${KubernetesTaskRunner.maxRetry} retries`,
@@ -101,6 +106,7 @@ class KubernetesTaskRunner {
if (!error?.message?.includes('previous terminated container')) {
throw error;
}
// For previous container errors, we've already tried fallback, so just break
CloudRunnerLogger.logWarning(
`Could not fetch previous container logs after retries, but continuing with available logs`,