feat: update codex live runtime and restart flow
This commit is contained in:
@@ -28,20 +28,6 @@ const runnerLogTrimIntervalMs = Math.max(
|
||||
15_000,
|
||||
Number(process.env.SERVER_COMMAND_RUNNER_LOG_TRIM_INTERVAL_MS?.trim() || '60000'),
|
||||
);
|
||||
/**
 * Reads a numeric CPU-watchdog setting from the environment.
 *
 * Falls back to `fallback` when the variable is unset, blank, or does not
 * parse to a finite number. Without this guard a typo (e.g. "60s") becomes
 * NaN, and `Math.max(floor, NaN)` is also NaN, so the configured floor would
 * not protect the value.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is missing or invalid.
 * @returns {number} A finite number.
 */
function readCpuWatchdogNumber(name, fallback) {
  const raw = process.env[name]?.trim();

  if (!raw) {
    return fallback;
  }

  const parsed = Number(raw);
  return Number.isFinite(parsed) ? parsed : fallback;
}

// The watchdog is on unless the variable is exactly 'false' (after trimming).
const cpuWatchdogEnabled = process.env.SERVER_COMMAND_CPU_WATCHDOG_ENABLED?.trim() !== 'false';

// Sampling period in milliseconds; never below 15 s. Default 60 s.
const cpuWatchdogIntervalMs = Math.max(
  15_000,
  readCpuWatchdogNumber('SERVER_COMMAND_CPU_WATCHDOG_INTERVAL_MS', 60_000),
);

// CPU percentage at or above which a sample counts as a breach; never below 10. Default 120.
const cpuWatchdogThresholdPercent = Math.max(
  10,
  readCpuWatchdogNumber('SERVER_COMMAND_CPU_WATCHDOG_THRESHOLD_PERCENT', 120),
);

// Number of consecutive breaching samples required before a restart; rounded, never below 2. Default 8.
const cpuWatchdogConsecutiveLimit = Math.max(
  2,
  Math.round(readCpuWatchdogNumber('SERVER_COMMAND_CPU_WATCHDOG_CONSECUTIVE_LIMIT', 8)),
);

// Minimum time in milliseconds between two restarts of the same container; never below 60 s. Default 20 min.
const cpuWatchdogCooldownMs = Math.max(
  60_000,
  readCpuWatchdogNumber('SERVER_COMMAND_CPU_WATCHDOG_COOLDOWN_MS', 1_200_000),
);
|
||||
// Cap of 256 * 1024 (262144) applied to captured stream output.
// NOTE(review): the unit (bytes vs. characters) depends on the usage site, which is not shown here — confirm.
const STREAM_CAPTURE_LIMIT = 256 * 1024;
|
||||
const CODEX_HOME_RUNTIME_PATHS = [
|
||||
'auth.json',
|
||||
@@ -78,6 +64,18 @@ const commandDefinitions = {
|
||||
},
|
||||
restartStrategy: 'deferred',
|
||||
},
|
||||
prod: {
|
||||
label: 'PROD',
|
||||
scriptPath: path.join(projectRoot, 'etc', 'commands', 'server-command', 'restart-prod.sh'),
|
||||
workingDirectory: projectRoot,
|
||||
env: {
|
||||
MAIN_PROJECT_ROOT: projectRoot,
|
||||
SERVER_COMMAND_COMPOSE_FILE: path.join(projectRoot, 'docker-compose.yml'),
|
||||
SERVER_COMMAND_SERVICE: process.env.SERVER_COMMAND_PROD_SERVICE?.trim() || 'prod-app',
|
||||
SERVER_COMMAND_CONTAINER_NAME: process.env.SERVER_COMMAND_PROD_CONTAINER_NAME?.trim() || 'ai-code-app-prod',
|
||||
},
|
||||
restartStrategy: 'deferred',
|
||||
},
|
||||
'work-server': {
|
||||
label: 'WORK-SERVER',
|
||||
scriptPath: path.join(projectRoot, 'etc', 'commands', 'server-command', 'restart-work-server.sh'),
|
||||
@@ -133,46 +131,6 @@ function translateWorkspacePathToHost(inputPath) {
|
||||
return normalizedInput;
|
||||
}
|
||||
|
||||
// Containers observed by the CPU watchdog.
//   name          - label reported for the target
//   containerName - Docker container name; also the key into the per-container state map
//   restartMode   - 'command' restarts through the restart command identified by restartKey;
//                   any other value restarts the container directly with Docker
//   restartKey    - restart command key; only present on 'command' targets
const cpuWatchdogTargets = [
  { name: 'test-app', containerName: 'ai-code-app-app-1', restartMode: 'command', restartKey: 'test' },
  { name: 'release-app', containerName: 'ai-code-app-release', restartMode: 'command', restartKey: 'rel' },
  // No restartKey here: this target is restarted with Docker directly.
  { name: 'prod-app', containerName: 'ai-code-app-prod', restartMode: 'docker' },
  { name: 'work-server', containerName: 'work-server', restartMode: 'command', restartKey: 'work-server' },
];
|
||||
|
||||
// Mutable per-container watchdog bookkeeping, keyed by container name.
//   lastCpuPercent    - most recent parsed CPU reading, or null
//   breachCount       - consecutive samples at or above the threshold
//   lastSampleAt      - ISO timestamp of the most recent sample, or null
//   lastRestartAt     - ISO timestamp of the most recent watchdog restart, or null
//   lastRestartReason - text describing why that restart happened, or null
const cpuWatchdogState = new Map();

for (const { containerName } of cpuWatchdogTargets) {
  cpuWatchdogState.set(containerName, {
    lastCpuPercent: null,
    breachCount: 0,
    lastSampleAt: null,
    lastRestartAt: null,
    lastRestartReason: null,
  });
}

// True while a sampling pass is running, so passes never overlap.
let cpuWatchdogBusy = false;
|
||||
|
||||
function trimOutput(value, maxLength = 400) {
|
||||
const normalized = value.replace(/\s+/g, ' ').trim();
|
||||
if (!normalized) {
|
||||
@@ -223,26 +181,6 @@ async function writeHeartbeat() {
|
||||
cwd: projectRoot,
|
||||
startedAt,
|
||||
updatedAt: new Date().toISOString(),
|
||||
cpuWatchdog: {
|
||||
enabled: cpuWatchdogEnabled,
|
||||
intervalMs: cpuWatchdogIntervalMs,
|
||||
thresholdPercent: cpuWatchdogThresholdPercent,
|
||||
consecutiveLimit: cpuWatchdogConsecutiveLimit,
|
||||
cooldownMs: cpuWatchdogCooldownMs,
|
||||
targets: cpuWatchdogTargets.map((target) => {
|
||||
const state = cpuWatchdogState.get(target.containerName);
|
||||
return {
|
||||
name: target.name,
|
||||
containerName: target.containerName,
|
||||
restartMode: target.restartMode,
|
||||
lastCpuPercent: state?.lastCpuPercent ?? null,
|
||||
breachCount: state?.breachCount ?? 0,
|
||||
lastSampleAt: state?.lastSampleAt ?? null,
|
||||
lastRestartAt: state?.lastRestartAt ?? null,
|
||||
lastRestartReason: state?.lastRestartReason ?? null,
|
||||
};
|
||||
}),
|
||||
},
|
||||
},
|
||||
null,
|
||||
2,
|
||||
@@ -251,127 +189,6 @@ async function writeHeartbeat() {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Parses a CPU percentage such as "12.34%" into a number.
 *
 * @param {unknown} value - Raw reading; a single '%' sign and surrounding whitespace are ignored.
 * @returns {number | null} The finite numeric value, or null when the reading
 *   is missing, blank, or not a finite number.
 */
function parseCpuPercentage(value) {
  const text = String(value ?? '').replace('%', '').trim();

  // Number('') is 0, so a missing or blank reading has to be rejected
  // explicitly; otherwise "no data" would be reported as 0% CPU.
  if (text === '') {
    return null;
  }

  const numeric = Number(text);
  return Number.isFinite(numeric) ? numeric : null;
}
|
||||
|
||||
/**
 * Restarts a single container by running `docker restart <containerName>`.
 *
 * The command runs from the project root with a 30 s timeout and a 1 MiB
 * output buffer. Any rejection from `execFileAsync` propagates to the caller.
 *
 * @param {string} containerName - Name of the container to restart.
 * @returns {Promise<void>}
 */
async function restartContainerByDocker(containerName) {
  const dockerArgs = ['restart', containerName];
  const execOptions = {
    cwd: projectRoot,
    timeout: 30_000,
    maxBuffer: 1024 * 1024,
  };

  await execFileAsync('docker', dockerArgs, execOptions);
}
|
||||
|
||||
/**
 * Runs one CPU watchdog pass over all configured targets.
 *
 * Takes a single `docker stats --no-stream` snapshot, updates each target's
 * state (last reading, sample time, consecutive breach count) and restarts a
 * target once it has breached the threshold for `cpuWatchdogConsecutiveLimit`
 * consecutive samples and its cooldown has elapsed. Targets missing from the
 * snapshot have their reading and breach count cleared.
 *
 * Does nothing when the watchdog is disabled, a pass is already running, or
 * there are no targets. Never rejects: failures are logged to stdout.
 *
 * @returns {Promise<void>}
 */
async function sampleCpuWatchdog() {
  if (!cpuWatchdogEnabled || cpuWatchdogBusy || cpuWatchdogTargets.length === 0) {
    return;
  }

  cpuWatchdogBusy = true;

  try {
    const { stdout } = await execFileAsync(
      'docker',
      [
        'stats',
        '--no-stream',
        '--format',
        '{{json .}}',
        ...cpuWatchdogTargets.map((target) => target.containerName),
      ],
      {
        cwd: projectRoot,
        timeout: 15_000,
        maxBuffer: 1024 * 1024,
      },
    );
    const now = new Date().toISOString();
    const sampledContainers = new Set();

    for (const line of stdout.split('\n')) {
      const trimmedLine = line.trim();

      if (!trimmedLine) {
        continue;
      }

      let parsed;

      try {
        parsed = JSON.parse(trimmedLine);
      } catch {
        // Not a JSON record; ignore the line.
        continue;
      }

      // Optional chaining: a line such as `null` parses successfully but has no fields.
      const containerName = String(parsed?.Name ?? '').trim();
      const cpuPercent = parseCpuPercentage(parsed?.CPUPerc);
      const target = cpuWatchdogTargets.find((entry) => entry.containerName === containerName);
      const state = cpuWatchdogState.get(containerName);

      if (!target || !state) {
        continue;
      }

      sampledContainers.add(containerName);
      state.lastCpuPercent = cpuPercent;
      state.lastSampleAt = now;
      // Any sample below the threshold (or unreadable) resets the streak.
      state.breachCount = cpuPercent != null && cpuPercent >= cpuWatchdogThresholdPercent ? state.breachCount + 1 : 0;

      const cooldownPassed =
        !state.lastRestartAt || Date.now() - new Date(state.lastRestartAt).getTime() >= cpuWatchdogCooldownMs;

      if (state.breachCount < cpuWatchdogConsecutiveLimit || !cooldownPassed) {
        continue;
      }

      const restartReason = `cpu ${cpuPercent?.toFixed(1) ?? '?'}% sustained for ${
        state.breachCount
      } samples`;

      process.stdout.write(
        `[cpu-watchdog] restarting ${target.containerName} because ${restartReason} (threshold ${cpuWatchdogThresholdPercent}%)\n`,
      );

      try {
        if (target.restartMode === 'command' && target.restartKey) {
          await runRestartCommand(target.restartKey);
        } else {
          await restartContainerByDocker(target.containerName);
        }
      } catch (error) {
        // A failed restart of one target must not abort the pass: the other
        // containers still need evaluating and unsampled targets still need
        // resetting. The breach count and cooldown are left untouched, so the
        // restart is attempted again on the next breaching sample.
        process.stdout.write(
          `[cpu-watchdog] restart of ${target.containerName} failed: ${
            error instanceof Error ? error.message : String(error)
          }\n`,
        );
        continue;
      }

      state.breachCount = 0;
      state.lastRestartAt = new Date().toISOString();
      state.lastRestartReason = restartReason;
      // Best effort: publish the restart right away; the periodic heartbeat
      // will report it anyway if this write fails.
      await writeHeartbeat().catch(() => {
        // noop
      });
    }

    // Targets absent from the snapshot get their reading and streak cleared.
    for (const target of cpuWatchdogTargets) {
      if (sampledContainers.has(target.containerName)) {
        continue;
      }

      const state = cpuWatchdogState.get(target.containerName);

      if (!state) {
        continue;
      }

      state.lastCpuPercent = null;
      state.lastSampleAt = now;
      state.breachCount = 0;
    }
  } catch (error) {
    process.stdout.write(
      `[cpu-watchdog] sample failed: ${error instanceof Error ? error.message : String(error)}\n`,
    );
  } finally {
    cpuWatchdogBusy = false;
  }
}
|
||||
|
||||
void trimRunnerLogIfNeeded();
|
||||
setInterval(() => {
|
||||
void trimRunnerLogIfNeeded();
|
||||
@@ -1030,12 +847,5 @@ server.listen(port, host, () => {
|
||||
});
|
||||
}, 10_000);
|
||||
heartbeatTimer.unref();
|
||||
if (cpuWatchdogEnabled) {
|
||||
const cpuWatchdogTimer = setInterval(() => {
|
||||
void sampleCpuWatchdog();
|
||||
}, cpuWatchdogIntervalMs);
|
||||
cpuWatchdogTimer.unref();
|
||||
void sampleCpuWatchdog();
|
||||
}
|
||||
process.stdout.write(`server-command-runner listening on http://${host}:${port}\n`);
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user