feat: update codex live runtime and restart flow

This commit is contained in:
2026-04-23 18:10:43 +09:00
parent b0b9980a6c
commit 6e863feafd
36 changed files with 1636 additions and 358 deletions

View File

@@ -28,20 +28,6 @@ const runnerLogTrimIntervalMs = Math.max(
15_000,
Number(process.env.SERVER_COMMAND_RUNNER_LOG_TRIM_INTERVAL_MS?.trim() || '60000'),
);
/**
 * Reads a numeric CPU-watchdog setting from the environment.
 *
 * Unset, blank, non-numeric, or non-finite values resolve to `fallback`, so a
 * typo in the environment can never produce NaN/Infinity (timers treat such a
 * delay as 1 ms, which would make the watchdog spin).
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is missing or invalid.
 * @returns {number} The parsed value, or `fallback`.
 */
function readCpuWatchdogNumber(name, fallback) {
  const rawValue = process.env[name]?.trim();
  const parsed = Number(rawValue || fallback);
  return Number.isFinite(parsed) ? parsed : fallback;
}

// The watchdog is on unless the variable is exactly 'false' (after trimming).
const cpuWatchdogEnabled = process.env.SERVER_COMMAND_CPU_WATCHDOG_ENABLED?.trim() !== 'false';
// Sampling period; never shorter than 15 seconds.
const cpuWatchdogIntervalMs = Math.max(
  15_000,
  readCpuWatchdogNumber('SERVER_COMMAND_CPU_WATCHDOG_INTERVAL_MS', 60_000),
);
// CPU percentage at or above which a sample counts as a breach; floor of 10.
const cpuWatchdogThresholdPercent = Math.max(
  10,
  readCpuWatchdogNumber('SERVER_COMMAND_CPU_WATCHDOG_THRESHOLD_PERCENT', 120),
);
// Number of consecutive breaching samples required before a restart; at least 2, always an integer.
const cpuWatchdogConsecutiveLimit = Math.max(
  2,
  Math.round(readCpuWatchdogNumber('SERVER_COMMAND_CPU_WATCHDOG_CONSECUTIVE_LIMIT', 8)),
);
// Minimum time between two restarts of the same container; never below 60 seconds.
const cpuWatchdogCooldownMs = Math.max(
  60_000,
  readCpuWatchdogNumber('SERVER_COMMAND_CPU_WATCHDOG_COOLDOWN_MS', 1_200_000),
);
// 256 KiB (262,144). NOTE(review): presumably the cap on how much stream output is
// retained per capture — confirm at the usage site, which is not visible here.
const STREAM_CAPTURE_LIMIT = 256 * 1024;
const CODEX_HOME_RUNTIME_PATHS = [
'auth.json',
@@ -78,6 +64,18 @@ const commandDefinitions = {
},
restartStrategy: 'deferred',
},
// Command definition for the PROD target: points at restart-prod.sh, uses the
// project root as the working directory, and supplies the compose file, service
// name, and container name through `env`.
prod: {
label: 'PROD',
scriptPath: path.join(projectRoot, 'etc', 'commands', 'server-command', 'restart-prod.sh'),
workingDirectory: projectRoot,
env: {
MAIN_PROJECT_ROOT: projectRoot,
SERVER_COMMAND_COMPOSE_FILE: path.join(projectRoot, 'docker-compose.yml'),
// Service and container names may be overridden via SERVER_COMMAND_PROD_SERVICE /
// SERVER_COMMAND_PROD_CONTAINER_NAME; unset or blank values fall back to the defaults.
SERVER_COMMAND_SERVICE: process.env.SERVER_COMMAND_PROD_SERVICE?.trim() || 'prod-app',
SERVER_COMMAND_CONTAINER_NAME: process.env.SERVER_COMMAND_PROD_CONTAINER_NAME?.trim() || 'ai-code-app-prod',
},
// NOTE(review): same strategy value as the preceding entry; what 'deferred' means is
// decided where restartStrategy is consumed — confirm there.
restartStrategy: 'deferred',
},
'work-server': {
label: 'WORK-SERVER',
scriptPath: path.join(projectRoot, 'etc', 'commands', 'server-command', 'restart-work-server.sh'),
@@ -133,46 +131,6 @@ function translateWorkspacePathToHost(inputPath) {
return normalizedInput;
}
// Containers watched by the CPU watchdog. `restartMode: 'command'` entries carry the
// `restartKey` handed to the restart command; the 'docker' entry has no key and is
// restarted directly by container name.
const cpuWatchdogTargets = [
  { name: 'test-app', containerName: 'ai-code-app-app-1', restartMode: 'command', restartKey: 'test' },
  { name: 'release-app', containerName: 'ai-code-app-release', restartMode: 'command', restartKey: 'rel' },
  { name: 'prod-app', containerName: 'ai-code-app-prod', restartMode: 'docker' },
  { name: 'work-server', containerName: 'work-server', restartMode: 'command', restartKey: 'work-server' },
];

// Mutable per-container bookkeeping, keyed by container name. Every target starts
// from its own blank record (no sample yet, no breach, no restart).
const cpuWatchdogState = new Map();
for (const { containerName } of cpuWatchdogTargets) {
  cpuWatchdogState.set(containerName, {
    lastCpuPercent: null,
    breachCount: 0,
    lastSampleAt: null,
    lastRestartAt: null,
    lastRestartReason: null,
  });
}

// Re-entrancy flag: true while a sampling pass is in flight.
let cpuWatchdogBusy = false;
function trimOutput(value, maxLength = 400) {
const normalized = value.replace(/\s+/g, ' ').trim();
if (!normalized) {
@@ -223,26 +181,6 @@ async function writeHeartbeat() {
cwd: projectRoot,
startedAt,
updatedAt: new Date().toISOString(),
cpuWatchdog: {
enabled: cpuWatchdogEnabled,
intervalMs: cpuWatchdogIntervalMs,
thresholdPercent: cpuWatchdogThresholdPercent,
consecutiveLimit: cpuWatchdogConsecutiveLimit,
cooldownMs: cpuWatchdogCooldownMs,
targets: cpuWatchdogTargets.map((target) => {
const state = cpuWatchdogState.get(target.containerName);
return {
name: target.name,
containerName: target.containerName,
restartMode: target.restartMode,
lastCpuPercent: state?.lastCpuPercent ?? null,
breachCount: state?.breachCount ?? 0,
lastSampleAt: state?.lastSampleAt ?? null,
lastRestartAt: state?.lastRestartAt ?? null,
lastRestartReason: state?.lastRestartReason ?? null,
};
}),
},
},
null,
2,
@@ -251,127 +189,6 @@ async function writeHeartbeat() {
);
}
/**
 * Converts a CPU percentage such as "12.34%" into a number.
 *
 * @param {unknown} value - Raw value; it is stringified, the first '%' is removed,
 *   and surrounding whitespace is trimmed before parsing.
 * @returns {number | null} The finite numeric value, or `null` when the input is
 *   missing, blank, or not a finite number.
 */
function parseCpuPercentage(value) {
  const text = String(value ?? '').replace('%', '').trim();
  // Number('') is 0, so a missing or blank reading must be rejected explicitly;
  // otherwise "no data" would be indistinguishable from a genuine 0% sample.
  if (text === '') {
    return null;
  }

  const numeric = Number(text);
  return Number.isFinite(numeric) ? numeric : null;
}
/**
 * Restarts one container by running `docker restart <containerName>`.
 *
 * The command runs with the project root as its working directory, a 30 second
 * timeout, and a 1 MiB output buffer. Any rejection from the underlying exec
 * call propagates to the caller.
 *
 * @param {string} containerName - Name of the container to restart.
 * @returns {Promise<void>}
 */
async function restartContainerByDocker(containerName) {
  const dockerArgs = ['restart', containerName];
  const execOptions = {
    cwd: projectRoot,
    timeout: 30_000,
    maxBuffer: 1024 * 1024,
  };

  await execFileAsync('docker', dockerArgs, execOptions);
}
/**
 * Runs one CPU-watchdog pass over all configured targets.
 *
 * Executes a single `docker stats --no-stream` call for every target, updates the
 * per-container state (last CPU reading, sample time, consecutive breach count),
 * and restarts any container whose breach count has reached the configured limit
 * and whose cooldown has elapsed. Targets that produced no stats line have their
 * reading and breach count reset.
 *
 * The pass is skipped when the watchdog is disabled, when a previous pass is still
 * running, or when there are no targets. Errors never propagate: a failure of the
 * stats call is logged as a failed sample, and a failed restart is logged for that
 * container only so the remaining containers are still evaluated.
 *
 * @returns {Promise<void>}
 */
async function sampleCpuWatchdog() {
  if (!cpuWatchdogEnabled || cpuWatchdogBusy || cpuWatchdogTargets.length === 0) {
    return;
  }

  cpuWatchdogBusy = true;

  try {
    const { stdout } = await execFileAsync(
      'docker',
      [
        'stats',
        '--no-stream',
        '--format',
        '{{json .}}',
        ...cpuWatchdogTargets.map((target) => target.containerName),
      ],
      {
        cwd: projectRoot,
        timeout: 15_000,
        maxBuffer: 1024 * 1024,
      },
    );
    const now = new Date().toISOString();
    const sampledContainers = new Set();

    for (const line of stdout.split('\n')) {
      const trimmedLine = line.trim();

      if (!trimmedLine) {
        continue;
      }

      let parsed;

      try {
        parsed = JSON.parse(trimmedLine);
      } catch {
        // Lines that are not valid JSON are ignored rather than failing the pass.
        continue;
      }

      const containerName = String(parsed.Name ?? '').trim();
      const cpuPercent = parseCpuPercentage(parsed.CPUPerc);
      const target = cpuWatchdogTargets.find((entry) => entry.containerName === containerName);
      const state = cpuWatchdogState.get(containerName);

      if (!target || !state) {
        continue;
      }

      sampledContainers.add(containerName);
      state.lastCpuPercent = cpuPercent;
      state.lastSampleAt = now;
      // Any sample below the threshold (or without a reading) resets the streak.
      state.breachCount = cpuPercent != null && cpuPercent >= cpuWatchdogThresholdPercent ? state.breachCount + 1 : 0;

      const cooldownPassed =
        !state.lastRestartAt || Date.now() - new Date(state.lastRestartAt).getTime() >= cpuWatchdogCooldownMs;

      if (state.breachCount < cpuWatchdogConsecutiveLimit || !cooldownPassed) {
        continue;
      }

      const restartReason = `cpu ${cpuPercent?.toFixed(1) ?? '?'}% sustained for ${
        state.breachCount
      } samples`;
      process.stdout.write(
        `[cpu-watchdog] restarting ${target.containerName} because ${restartReason} (threshold ${cpuWatchdogThresholdPercent}%)\n`,
      );

      try {
        if (target.restartMode === 'command' && target.restartKey) {
          await runRestartCommand(target.restartKey);
        } else {
          await restartContainerByDocker(target.containerName);
        }
      } catch (error) {
        // Contain the failure to this container: the breach count is left untouched so
        // the restart is attempted again on the next breaching sample, and the loop
        // carries on so the other containers are still evaluated in this pass.
        process.stdout.write(
          `[cpu-watchdog] restart of ${target.containerName} failed: ${
            error instanceof Error ? error.message : String(error)
          }\n`,
        );
        continue;
      }

      state.breachCount = 0;
      state.lastRestartAt = new Date().toISOString();
      state.lastRestartReason = restartReason;
      // Best effort: a heartbeat write failure must not abort the pass.
      await writeHeartbeat().catch(() => {
        // noop
      });
    }

    // Targets missing from the stats output get their reading and streak cleared.
    for (const target of cpuWatchdogTargets) {
      if (sampledContainers.has(target.containerName)) {
        continue;
      }

      const state = cpuWatchdogState.get(target.containerName);
      if (!state) {
        continue;
      }

      state.lastCpuPercent = null;
      state.lastSampleAt = now;
      state.breachCount = 0;
    }
  } catch (error) {
    process.stdout.write(
      `[cpu-watchdog] sample failed: ${error instanceof Error ? error.message : String(error)}\n`,
    );
  } finally {
    cpuWatchdogBusy = false;
  }
}
void trimRunnerLogIfNeeded();
setInterval(() => {
void trimRunnerLogIfNeeded();
@@ -1030,12 +847,5 @@ server.listen(port, host, () => {
});
}, 10_000);
heartbeatTimer.unref();
if (cpuWatchdogEnabled) {
const cpuWatchdogTimer = setInterval(() => {
void sampleCpuWatchdog();
}, cpuWatchdogIntervalMs);
cpuWatchdogTimer.unref();
void sampleCpuWatchdog();
}
process.stdout.write(`server-command-runner listening on http://${host}:${port}\n`);
});