fix(runner): registreer worker-presence + 10s heartbeat in run-one-job

Tot nu toe schreef de NAS-runner nooit naar `claude_workers`, waardoor
de UI de worker als offline toonde ondanks gezonde container-health.
Direct na `getAuth()` doen we nu een UPSERT via `registerWorker` en
starten we een 10s heartbeat die `last_seen_at` vers houdt tijdens
quota-backoff, LISTEN-wait, claude-spawn en cleanup.

De heartbeat stopt via try/finally op elk exit-pad. Bewust geen
`unregisterWorker`: tussen iteraties zou dat UI-flicker geven, en
abnormale exits worden door de UI's eigen 60s-prune opgevangen.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Madhura68 2026-05-11 02:30:24 +02:00
parent 38c0e5f103
commit e8c4518abb

View file

@ -38,6 +38,8 @@ import {
import { releaseLocksOnTerminal } from '/opt/scrum4me-mcp/src/git/job-locks.js'
import { mapBudgetToEffort } from '/opt/scrum4me-mcp/src/lib/job-config.js'
import { getKindPromptText } from '/opt/scrum4me-mcp/src/lib/kind-prompts.js'
import { registerWorker } from '/opt/scrum4me-mcp/src/presence/worker.js'
import { startHeartbeat } from '/opt/scrum4me-mcp/src/presence/heartbeat.js'
// ----- logging --------------------------------------------------------
const log = (msg: string) =>
@ -49,6 +51,7 @@ const logError = (msg: string) =>
// ----- tunables and fixed paths ---------------------------------------
// Wait deadline in seconds (4.5 minutes).
const WAIT_DEADLINE_SECONDS = 270 // comfortably within MAX_WAIT_SECONDS of wait_for_job
// Poll interval in milliseconds (5 s).
// NOTE(review): the polling code is outside this view — confirm what is polled.
const POLL_INTERVAL_MS = 5000
// 60 s heartbeat interval. Not to be confused with WORKER_HEARTBEAT_INTERVAL_MS below.
// NOTE(review): presumably the per-job heartbeat; its usage is not visible here — confirm.
const HEARTBEAT_INTERVAL_MS = 60_000
// Interval (10 s) passed as `intervalMs` to startHeartbeat() in main() for
// worker presence. main() notes that the UI prunes rows older than 60 s, so
// this leaves room for several beats inside that window.
const WORKER_HEARTBEAT_INTERVAL_MS = 10_000
// Absolute path of the MCP config JSON file.
// NOTE(review): presumably handed to the claude process; confirm at the call site.
const MCP_CONFIG = '/opt/agent/mcp-config.json'
// Absolute path of the quota-probe shell script.
const QUOTA_PROBE_PATH = '/opt/agent/bin/worker-quota-probe.sh'
// 30 minutes expressed in milliseconds.
// NOTE(review): by its name, the upper bound for quota backoff — confirm at the use site.
const QUOTA_BACKOFF_CAP_MS = 30 * 60 * 1000
@ -152,6 +155,23 @@ async function main(): Promise<number> {
const { userId, tokenId } = await getAuth()
log(`auth ok user_id=${userId} token_id=${tokenId}`)
// Worker presence — UI leest claude_workers.last_seen_at.
// UPSERT zodat de rij tussen iteraties blijft bestaan (geen flicker),
// en heartbeat houdt last_seen_at vers tijdens quota-backoff,
// LISTEN-wait, claude-spawn, en cleanup. Niet unregisteren bij exit:
// de UI prunet zelf rijen ouder dan 60s.
try {
await registerWorker({ userId, tokenId })
} catch (err) {
logError(`registerWorker failed (non-fatal): ${(err as Error).message}`)
}
const workerHeartbeat = startHeartbeat({
userId,
tokenId,
intervalMs: WORKER_HEARTBEAT_INTERVAL_MS,
})
try {
// 1. Quota probe (gate vóór elke claim).
try {
await quotaProbe(userId)
@ -371,6 +391,9 @@ async function main(): Promise<number> {
if (tokenExpired) return 3
return exitCode ?? 1
} finally {
workerHeartbeat.stop()
}
}
// ----- entry ----------------------------------------------------------