#!/usr/bin/env bash
# run-agent.sh — daemon loop
#
# Strategy:
#   - Run a one-time pre-flight token check; if it fails the job loop never
#     starts.
#   - Loop: tsx /opt/agent/bin/run-one-job.ts (one claimed job per iteration)
#       - exit 0   → the queue was empty or the job finished; sleep briefly,
#                    repeat
#       - exit 3   → run-one-job detected TOKEN_EXPIRED in the Claude output
#       - exit ≠ 0 → exponential backoff, log, write state, repeat
#   - After AGENT_MAX_FAILURES consecutive failures → write the UNHEALTHY
#     marker and stop looping. The process stays alive (sleep infinity) so the
#     container can be inspected; per the original notes the health endpoint
#     then reports 503.
#   - On detected token expiry → write the TOKEN_EXPIRED marker and stop
#     looping. The process stays alive (sleep infinity) instead of exiting, so
#     the service is not restarted; the user has to rebuild.
#
# The claim/exec loop lives in bin/run-one-job.ts (Node + tsx); this shell
# script only handles daemon/backoff/health/log-rotation concerns. See
# docs/plans/queue-loop-extraction.md in the Scrum4Me repo.
#
# Tunables (environment, with defaults):
#   AGENT_MAX_FAILURES    consecutive failures before UNHEALTHY   (5)
#   AGENT_BACKOFF_START   first backoff delay in seconds          (5)
#   AGENT_BACKOFF_FACTOR  multiplier applied after each failure   (2)
#   AGENT_BACKOFF_MAX     upper bound for the delay in seconds    (300)
#
# Also relies on AGENT_LOG_DIR, AGENT_STATE_DIR, log and write_state, which are
# not defined in this file — presumably provided by _lib.sh (verify there).

# NOTE: deliberately no -e anywhere in this script. Exit codes are inspected
# by hand and the daemon must survive failing helper commands.
set -uo pipefail

source /opt/agent/bin/_lib.sh

: "${AGENT_MAX_FAILURES:=5}"
: "${AGENT_BACKOFF_START:=5}"
: "${AGENT_BACKOFF_FACTOR:=2}"
: "${AGENT_BACKOFF_MAX:=300}"

mkdir -p "${AGENT_LOG_DIR}/runs"

# ----- pre-flight -------------------------------------------------------
log "pre-flight token check"
if ! /opt/agent/bin/check-tokens.sh; then
  log "pre-flight failed — see check-tokens output above"
  write_state '{"status":"unhealthy","reason":"preflight-failed"}'
  touch "${AGENT_STATE_DIR}/UNHEALTHY"
  # Park here so the health endpoint stays available for debugging and the
  # whole compose service is not restarted needlessly.
  sleep infinity
fi

# Pre-flight passed: clear markers left behind by a previous run.
rm -f "${AGENT_STATE_DIR}/UNHEALTHY" "${AGENT_STATE_DIR}/TOKEN_EXPIRED"

# Log rotation once at start-up, then again after every iteration.
# Best effort: a failing rotation must not stop the daemon.
/opt/agent/bin/rotate-logs.sh || true
/opt/agent/bin/log-cleanup.sh || true

# No seed prompt and no ALLOWED_TOOLS string any more: the per-job CLI flags
# (incl. --model, --permission-mode, --effort, --allowedTools and the
# kind-specific prompt) are built by run-one-job.ts from JobConfig
# (resolved via PBI-67's resolveJobConfig).

CONSEC_FAILURES=0
BACKOFF=${AGENT_BACKOFF_START}

# ----- main loop --------------------------------------------------------
while true; do
  iteration_start=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  run_log="${AGENT_LOG_DIR}/runs/$(date -u +%Y%m%dT%H%M%SZ).log"

  write_state "$(jq -n \
    --arg started "$iteration_start" \
    --argjson failures "$CONSEC_FAILURES" \
    '{status:"running", currentBatchStartedAt:$started, consecutiveFailures:$failures}')"

  log "starting iteration (log: ${run_log})"

  # One iteration = one claimed job (or "no job" → exit 0). The runner claims
  # the job itself via tryClaimJob, reads JobConfig (PBI-67), builds the right
  # Claude CLI args, spawns 'claude', waits and shuts down.
  #
  # RUN_LOG is exported to run-one-job.ts only; per the original notes it uses
  # it to place a per-job symlink under jobs/ that points at this run log, so
  # a job's output can be found by job id.
  #
  # The status is captured with `|| exit_code=$?` instead of a
  # `set +e … set -e` bracket: the bracket would leave errexit switched ON for
  # the rest of the daemon's life, contradicting the no-errexit policy above
  # and letting any failing helper (write_state, jq, touch, …) kill the loop.
  exit_code=0
  RUN_LOG="${run_log}" tsx /opt/agent/bin/run-one-job.ts > "${run_log}" 2>&1 \
    || exit_code=$?

  iteration_end=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  log "batch ended exit=${exit_code}"

  # Token-expiry detection: run-one-job.ts returns exit 3 when it sees known
  # auth-error strings in Claude's output. The log text is checked as well in
  # case another code path hits the pattern (e.g. a Prisma connection error
  # carrying "OAuth expired" in its error body).
  if [[ "$exit_code" -eq 3 ]] \
    || grep -qE '(invalid_api_key|authentication.*failed|401.*unauthor|OAuth.*expired)' "${run_log}"; then
    log "AUTH FAILURE detected (exit=$exit_code or pattern in log) — marking TOKEN_EXPIRED"
    touch "${AGENT_STATE_DIR}/TOKEN_EXPIRED"
    write_state "$(jq -n \
      --arg endedAt "$iteration_end" \
      --argjson exit "$exit_code" \
      '{status:"token-expired", lastBatchAt:$endedAt, lastBatchExit:$exit}')"
    # Park here — no restart; the user has to rebuild.
    sleep infinity
  fi

  if [[ "$exit_code" -eq 0 ]]; then
    # Success (or empty queue): reset the failure streak and the backoff delay.
    CONSEC_FAILURES=0
    BACKOFF=${AGENT_BACKOFF_START}
    write_state "$(jq -n \
      --arg endedAt "$iteration_end" \
      '{status:"idle", lastBatchAt:$endedAt, lastBatchExit:0, consecutiveFailures:0}')"
    log "queue empty — sleep 2s"
    sleep 2
  else
    CONSEC_FAILURES=$((CONSEC_FAILURES + 1))
    write_state "$(jq -n \
      --arg endedAt "$iteration_end" \
      --argjson exit "$exit_code" \
      --argjson failures "$CONSEC_FAILURES" \
      '{status:"backoff", lastBatchAt:$endedAt, lastBatchExit:$exit, consecutiveFailures:$failures}')"

    if [[ "$CONSEC_FAILURES" -ge "$AGENT_MAX_FAILURES" ]]; then
      log "too many consecutive failures (${CONSEC_FAILURES}) — marking UNHEALTHY"
      touch "${AGENT_STATE_DIR}/UNHEALTHY"
      # Give up retrying but keep the process alive for diagnosis.
      sleep infinity
    fi

    log "backing off ${BACKOFF}s before retry"
    sleep "$BACKOFF"

    # Grow the delay for the next failure, capped at AGENT_BACKOFF_MAX.
    BACKOFF=$((BACKOFF * AGENT_BACKOFF_FACTOR))
    if [[ "$BACKOFF" -gt "$AGENT_BACKOFF_MAX" ]]; then
      BACKOFF=$AGENT_BACKOFF_MAX
    fi
  fi

  /opt/agent/bin/rotate-logs.sh || true
  /opt/agent/bin/log-cleanup.sh || true
done