4 changed files with 11 additions and 64 deletions
--- a/.env.example
+++ b/.env.example
@@ -110,9 +110,3 @@ AGENT_BACKOFF_MAX=300
 AGENT_LOG_GZIP_AFTER_HOURS=24
 # Hoeveel dagen ge-gzipte logs bewaren voor we ze verwijderen.
 AGENT_LOG_DELETE_AFTER_DAYS=30
 # Claude CLI --output-format. Default 'stream-json' streamt de volledige
 # event-stream (tool-calls, berichten) live naar de run-log; 'text' geeft
 # alleen Claude's eind-samenvatting (terser, maar geen live-meekijken).
 # stream-json maakt de run-log JSONL — gebruik jq of een viewer.
 AGENT_CLAUDE_OUTPUT_FORMAT=stream-json
--- a/bin/log-cleanup.sh
+++ b/bin/log-cleanup.sh
@@ -18,9 +18,4 @@ find "${AGENT_LOG_DIR}" -type f \
    \( -name '*.log' -o -name '*.log.gz' -o -name '*.txt' -o -name '*.json' \) \
    -mtime "+${AGENT_LOG_HARD_DELETE_DAYS}" -delete 2>/dev/null || true
 # Prune dangling per-job symlinks: jobs/<job_id>.log -> runs/<ts>.log waarvan
 # het doel door rotatie is gegzipt of verwijderd. De -type f hierboven raakt
 # symlinks niet, dus broken links worden hier expliciet opgeruimd (-xtype l).
 find "${AGENT_LOG_DIR}/jobs" -maxdepth 1 -xtype l -delete 2>/dev/null || true
 find "${AGENT_LOG_DIR}/jobs" -mindepth 1 -type d -empty -delete 2>/dev/null || true
--- a/bin/run-agent.sh
+++ b/bin/run-agent.sh
@@ -68,9 +68,7 @@ while true; do
    # claimt zelf via tryClaimJob, leest JobConfig (PBI-67), bouwt de
    # juiste Claude CLI-args, spawnt 'claude', wacht, sluit af.
    set +e
-    # RUN_LOG laat run-one-job.ts een jobs/<job_id>.log symlink leggen naar
+    tsx /opt/agent/bin/run-one-job.ts > "${run_log}" 2>&1
    # dit run-log, zodat de output van een job op job-id vindbaar is.
    RUN_LOG="${run_log}" tsx /opt/agent/bin/run-one-job.ts > "${run_log}" 2>&1
    exit_code=$?
    set -e
@@ -80,12 +78,8 @@ while true; do
    # Token-expiry detectie: run-one-job.ts retourneert exit 3 wanneer het
    # bekende auth-error-strings in Claude's output ziet. We checken óók de
    # log-tekst voor het geval een ander pad het patroon raakt (bv. Prisma-
-    # connection-error met OAuth-expired in error-body) — maar alléén bij een
+    # connection-error met OAuth-expired in error-body).
-    # niet-nul exit. Het run-log bevat de volledige stream-json output (incl.
+    if [[ "$exit_code" -eq 3 ]] || grep -qE '(invalid_api_key|authentication.*failed|401.*unauthor|OAuth.*expired)' "${run_log}"; then
    # tool-results én run-one-job's eigen "TOKEN_EXPIRED detected"-logregel),
    # dus een geslaagde job die toevallig "401 unauthorized" in z'n output
    # heeft mag de grep-fallback niet triggeren.
    if [[ "$exit_code" -eq 3 ]] || { [[ "$exit_code" -ne 0 ]] && grep -qE '(invalid_api_key|authentication.*failed|401.*unauthor|OAuth.*expired)' "${run_log}"; }; then
        log "AUTH FAILURE detected (exit=$exit_code or pattern in log) — marking TOKEN_EXPIRED"
        touch "${AGENT_STATE_DIR}/TOKEN_EXPIRED"
        write_state "$(jq -n \
--- a/bin/run-one-job.ts
+++ b/bin/run-one-job.ts
@@ -22,8 +22,7 @@
 //   3  = TOKEN_EXPIRED detected → run-agent.sh schrijft TOKEN_EXPIRED marker
 import { spawn, spawnSync } from 'node:child_process'
-import { mkdirSync, rmSync, symlinkSync, writeFileSync } from 'node:fs'
+import { mkdirSync, rmSync, writeFileSync } from 'node:fs'
 import { basename, join } from 'node:path'
 import { Client as PgClient } from 'pg'
@@ -197,25 +196,6 @@ async function main(): Promise<number> {
    log(`claimed job_id=${jobId}`)
    // Per-job log: symlink jobs/<jobId>.log -> the runs/<timestamp>.log of
    // this iteration. runs/ files are timestamp-named, so without this a job's
    // output is only findable by grepping. run-agent.sh passes the run-log
    // path via RUN_LOG. Relative target so it survives the host bind-mount.
    // Best-effort — never fail the job over a log convenience. Dangling links
    // (after the runs/ file is gzipped/deleted) are pruned by log-cleanup.sh.
    const runLog = process.env.RUN_LOG
    if (runLog) {
      try {
        const jobsDir = join(process.env.AGENT_LOG_DIR ?? '/var/log/agent', 'jobs')
        mkdirSync(jobsDir, { recursive: true })
        const linkPath = join(jobsDir, `${jobId}.log`)
        rmSync(linkPath, { force: true })
        symlinkSync(join('..', 'runs', basename(runLog)), linkPath)
      } catch (err) {
        log(`per-job log symlink skipped for ${jobId}: ${(err as Error).message}`)
      }
    }
    // 3. Resolve full context.
    let ctx: Awaited<ReturnType<typeof getFullJobContext>> = null
    try {
@@ -292,13 +272,6 @@ async function main(): Promise<number> {
    // 7. Build CLI args.
    const promptText = getKindPromptText(ctx.kind).replace('$PAYLOAD_PATH', payloadPath)
    // --output-format is configureerbaar via env. Default 'stream-json' geeft
    // de volledige event-stream (elke tool-call, elk bericht) live in de
    // run-log, i.p.v. alleen Claude's eind-samenvatting. stream-json vereist
    // --verbose in print-mode. Zet AGENT_CLAUDE_OUTPUT_FORMAT=text terug voor
    // de oude terse output. TOKEN_EXPIRED-detectie werkt ongewijzigd: de
    // auth-error-strings staan ook binnen de JSON-events.
    const outputFormat = process.env.AGENT_CLAUDE_OUTPUT_FORMAT ?? 'stream-json'
    const args: string[] = [
      '-p',
      promptText,
@@ -313,9 +286,8 @@ async function main(): Promise<number> {
      '--add-dir',
      '/opt/agent',
      '--output-format',
-      outputFormat,
+      'text',
    ]
    if (outputFormat === 'stream-json') args.push('--verbose')
    if (effort) args.push('--effort', effort)
    const cwd = worktreePath ?? '/opt/agent'
@@ -383,21 +355,13 @@ async function main(): Promise<number> {
        `duration_ms=${durationMs} wall_clock_seconds=${Math.round(durationMs / 1000)}`,
    )
-    // 10. Token-expiry detection — alleen als Claude zelf non-zero eindigde.
+    // 10. Token-expiry detection.
    // stdoutBuf bevat de volledige stream-json output incl. álle tool-results,
    // dus de auth-error-strings kunnen ook agent-werk-content zijn (een doc
    // over 401-handling gelezen, een endpoint getest). Een echte credential-
    // fout laat 'claude' non-zero exiten; een geslaagde run (exit 0) is per
    // definitie geen token-expiry. Zonder deze gate legt zulke content de
    // worker onterecht plat (run-agent.sh → TOKEN_EXPIRED marker + sleep).
    let tokenExpired = false
-    if (exitCode !== 0) {
+    for (const pat of TOKEN_EXPIRY_PATTERNS) {
-      for (const pat of TOKEN_EXPIRY_PATTERNS) {
+      if (pat.test(stdoutBuf)) {
-        if (pat.test(stdoutBuf)) {
+        tokenExpired = true
-          tokenExpired = true
+        log(`TOKEN_EXPIRED detected pattern="${pat.source}" exiting code=3`)
-          log(`TOKEN_EXPIRED detected pattern="${pat.source}" exiting code=3`)
+        break
          break
        }
      }
    }