initial: NAS agent runner setup

2026-05-02 15:43:59 +02:00 · 2026-05-02 15:43:59 +02:00 · 9d8a7fe237
commit 9d8a7fe237
16 changed files with 1121 additions and 0 deletions
--- a/.env.example
+++ b/.env.example
@ -0,0 +1,75 @@
 # ============================================================
 # scrum4me-agent-runner — environment configuratie
 # ============================================================
 # Kopieer naar .env en vul in. Houd .env buiten git (zie .gitignore).
 # Permissies: chmod 600 .env
 # ----- Claude Code authenticatie ----------------------------
 # Genereer op je werkstation met: `claude setup-token`
 # Output is een sk-ant-oat01-... token; geldig 1 jaar; gebruikt
 # je Pro/Max subscription quota in plaats van per-token billing.
 #
 # Alternatief: ANTHROPIC_API_KEY (sk-ant-api03-...) voor pay-per-use
 # via console.anthropic.com. Niet beide tegelijk zetten.
 CLAUDE_CODE_OAUTH_TOKEN=sk-ant-oat01-vervang-mij
 # ANTHROPIC_API_KEY=
 # ----- Scrum4Me API token -----------------------------------
 # Bearer-token van de DEDICATED agent-user (niet je persoonlijke
 # account). Aanmaken: log in als de agent-user → /settings/tokens
 # → label "NAS-runner". Token wordt eenmalig getoond.
 #
 # Als deze ge-revoked wordt: rebuild + redeploy (zie README).
 SCRUM4ME_TOKEN=vervang-mij
 # ----- Scrum4Me database ------------------------------------
 # Beide URLs uit het Neon-dashboard. DATABASE_URL is pooled,
 # DIRECT_URL is unpooled — scrum4me-mcp gebruikt DATABASE_URL
 # voor reads/writes; DIRECT_URL is alleen nodig als je de mcp
 # uitbreidt met LISTEN/NOTIFY (op dit moment niet).
 DATABASE_URL=postgresql://user:pass@host/dbname?sslmode=require
 DIRECT_URL=postgresql://user:pass@host/dbname?sslmode=require
 # ----- Scrum4Me API host ------------------------------------
 # Voor token-validatie via /api/health en /api/products
 # voorafgaand aan het starten van de daemon-loop. Gebruik je
 # productie-URL.
 SCRUM4ME_BASE_URL=https://scrum4me.example.com
 # ----- Scrum4Me MCP versie ----------------------------------
 # Pin een specifieke commit-sha of tag van scrum4me-mcp.
 # `main` werkt voor ontwikkeling; in productie altijd vastpinnen.
 MCP_GIT_REF=main
 # Idem voor Claude Code zelf. `latest` of een specifieke versie.
 CLAUDE_CODE_VERSION=latest
 # ----- NAS paths --------------------------------------------
 # Basis-share waaronder cache/, logs/ en state/ vallen.
 # Pas aan als je een andere share-naam gebruikt op QNAP.
 NAS_BASE=/share/Agent
 # UID/GID matchen met de share-eigenaar op QNAP (admin = 1000
 # bij standaard QTS-installatie).
 AGENT_UID=1000
 AGENT_GID=1000
 # ----- Health endpoint --------------------------------------
 # Externe poort op de NAS waarop /health bereikbaar is.
 # Wijzig als 8080 al in gebruik is door iets anders op QTS.
 AGENT_HEALTH_PORT_HOST=8080
 # ----- Daemon-loop tuning -----------------------------------
 # Maximum opeenvolgende mislukte batches voordat de container
 # zichzelf "unhealthy" markeert.
 AGENT_MAX_FAILURES=5
 # Backoff in seconden: start, factor, max
 AGENT_BACKOFF_START=5
 AGENT_BACKOFF_FACTOR=2
 AGENT_BACKOFF_MAX=300
 # Hoeveel uur logs bewaren voor we comprimeren (gzip).
 AGENT_LOG_GZIP_AFTER_HOURS=24
 # Hoeveel dagen ge-gzipte logs bewaren voor we ze verwijderen.
 AGENT_LOG_DELETE_AFTER_DAYS=30
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,19 @@
 # Secrets
 .env
 *.env.local
 # Logs (lokaal testen)
 *.log
 *.log.gz
 # OS
 .DS_Store
 Thumbs.db
 # Editor
 .vscode/
 .idea/
 *.swp
 # Node (mocht je lokaal iets uitproberen)
 node_modules/
--- a/CLAUDE.md
+++ b/CLAUDE.md
@ -0,0 +1,80 @@
 # CLAUDE.md — Scrum4Me NAS-runner
 Je draait als headless worker op een QNAP NAS. Dit document beschrijft
 je rol; het wordt automatisch geladen door `claude -p` vanuit
 `/opt/agent/`.
 ## Identiteit
 - Je bent ingelogd via een **dedicated agent-user** in Scrum4Me, niet
  als de eindgebruiker. Commits, story-logs en `claude_jobs.claimed_by_token_id`
  zullen jouw token tonen.
 - Je hebt **geen push-rechten**. Geen SSH-keys op deze container, geen
  `~/.gitconfig` met push-credentials. Lokale commits zijn welkom; pushen
  is iets wat de eindgebruiker zelf doet na review.
 - Je opereert binnen `/tmp/job-<id>` per job. Buiten die directory en
  buiten `/var/log/agent` heb je niets te zoeken.
 ## Operationele loop (verplicht)
 Wanneer je geseed wordt met *"Pak de volgende job uit de Scrum4Me-queue"*
 of equivalent:
 1. Roep `mcp__scrum4me__wait_for_job` aan. Geen argumenten, geen wait-time
   tweaken — de tool blokt zelf tot 600 s.
 2. Als er een job geclaimd wordt:
   1. Roep `bash /opt/agent/bin/job-prepare.sh <job_id> <repo_url>` aan
      via Bash. Output is het pad van de working tree.
   2. `cd` naar dat pad.
   3. Lees de project-CLAUDE.md (`./CLAUDE.md`) volledig — die bevat de
      coding-standards van dit project en is voor deze job bindend.
   4. Voer het `implementation_plan` uit dat je van `wait_for_job` kreeg.
      Volg de Commit Strategy uit de project-CLAUDE.md (commit per laag,
      ST-code in de titel).
   5. Voer de project-verificaties uit die de project-CLAUDE.md voorschrijft
      (typisch `npm run lint && npm test && npm run build`).
   6. **Niet pushen.** Lokaal committen op een feature-branch is goed.
   7. Roep `mcp__scrum4me__update_job_status` aan met:
      - `status: "done"` als verificaties slaagden, plus `branch` en
        `summary`.
      - `status: "failed"` met `error` als iets onomkeerbaar misging.
   8. Roep `bash /opt/agent/bin/job-cleanup.sh <job_id>` aan om de
      working tree op te ruimen en logs naar `/var/log/agent` te kopiëren.
 3. Roep **direct opnieuw** `wait_for_job` aan. Stop niet, vraag niets.
 4. Pas wanneer `wait_for_job` na de volledige block-time terugkomt zonder
   claim, sluit de turn af met een korte recap (aantal jobs, success/fail).
 ## Foutscenario's
 - **`job-prepare.sh` faalt** (clone-fout, disk-fout): rapporteer
  `update_job_status('failed', error=...)` en ga door met de volgende job.
  Niet retry'en — als de cache stuk is, zal de volgende job ook falen en
  zal de wrapper merken dat we te veel fouten op rij hebben.
 - **Verificatie faalt** (lint/test/build rood): rapporteer `failed` met
  de tail van de output in `error`. Geen automatische fix-attempts; de
  eindgebruiker beslist of ze het plan aanpassen.
 - **Onverwachte runtime-fout** in de tools: laat de exception propageren.
  De wrapper-loop schrijft een run-log en herstart `claude -p` met backoff.
 ## Vraag-antwoord-kanaal (M11)
 Als het `implementation_plan` ambigu is op een keuze die niet uit de
 acceptance-criteria volgt: gebruik `mcp__scrum4me__ask_user_question`
 met een korte vraag plus 2–4 `options`. Geef `wait_seconds: 600` mee
 zodat de tool blijft wachten. Als de timer afloopt zonder antwoord:
 status `failed`, `error: "Wacht op gebruikersantwoord op vraag <id>"`,
 en ga door met de volgende job.
 Niet gokken. Niet aannemen.
 ## Wat je NIET doet
 - Geen `git push`, ook niet naar `origin/<branch>` van een feature-branch.
 - Geen `npm publish`, `vercel deploy`, of welke release-actie dan ook.
 - Geen edits buiten `/tmp/job-*` (geen `~/.bashrc`, geen `/etc/...`,
  geen andere shares).
 - Geen credentials uitprinten of in commit-messages stoppen — `.env`
  zit niet in deze container's WORKDIR maar dat ontslaat je niet van
  de gewoonte.
 - Geen long-running shell-processes starten (servers, watchers). Builds
  en tests moeten zelfstandig terminate'n.
--- a/85
+++ b/85
@ -0,0 +1,85 @@
 # syntax=docker/dockerfile:1.6
 FROM ubuntu:22.04
 # ----- system deps -------------------------------------------------------
 # Non-interactive apt, Europe/Amsterdam timezone and a UTF-8 locale.
 ENV DEBIAN_FRONTEND=noninteractive \
    TZ=Europe/Amsterdam \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8
 # Base tooling. Within this repo: tini is the ENTRYPOINT, gosu is used by
 # entrypoint.sh to drop privileges, jq by _lib.sh, curl by check-tokens.sh
 # and the compose healthcheck. build-essential/python3 are presumably for
 # native npm addons — TODO confirm.
 RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates curl git tini gosu jq xz-utils \
        build-essential python3 \
        tzdata logrotate \
    && ln -fs /usr/share/zoneinfo/$TZ /etc/localtime \
    && dpkg-reconfigure --frontend=noninteractive tzdata \
    && rm -rf /var/lib/apt/lists/*
 # ----- node 22 LTS -------------------------------------------------------
 # Needed for scrum4me-mcp (runs on tsx) and for the health-server. Per the
 # original note, the Claude Code native installer itself does not need node.
 # Also installs pnpm 9 and tsx 4 globally.
 RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - \
    && apt-get install -y --no-install-recommends nodejs \
    && rm -rf /var/lib/apt/lists/* \
    && npm install -g pnpm@9 tsx@4 \
    && npm cache clean --force
 # ----- claude code via native installer ---------------------------------
 # The installer runs as root here; the resulting binary is then copied from
 # /root/.local/bin to /usr/local/bin so it is on PATH for every user
 # (HOME is switched to /home/agent further down). The final `claude --version`
 # makes the build fail early if the install did not produce a working binary.
 ARG CLAUDE_CODE_VERSION=latest
 RUN curl -fsSL https://claude.ai/install.sh | bash -s ${CLAUDE_CODE_VERSION} \
    && cp /root/.local/bin/claude /usr/local/bin/claude \
    && chmod +x /usr/local/bin/claude \
    && claude --version
 # ----- scrum4me-mcp ------------------------------------------------------
 # Fetch scrum4me-mcp at a pinned ref, including submodules so that
 # vendor/scrum4me (Prisma schema) comes along.
 # MCP_GIT_REF may be a branch, a tag or a full commit SHA. `git clone --branch`
 # only accepts branch/tag names, so the ref is fetched explicitly instead
 # (fetching a bare SHA requires server support; GitHub allows it).
 # Default = main; in production always pin a commit SHA.
 # The `npm ci || npm install` fallback is grouped so that a failed
 # fetch/checkout aborts the build instead of falling through to `npm install`.
 ARG MCP_GIT_REPO=https://github.com/madhura68/scrum4me-mcp.git
 ARG MCP_GIT_REF=main
 RUN git init -q /opt/scrum4me-mcp \
    && cd /opt/scrum4me-mcp \
    && git remote add origin "${MCP_GIT_REPO}" \
    && git fetch --depth 1 origin "${MCP_GIT_REF}" \
    && git checkout -q --detach FETCH_HEAD \
    && git submodule update --init --recursive \
    && (npm ci --omit=dev --omit=optional || npm install --omit=dev) \
    && (npm run sync-schema || true) \
    && npx prisma generate
 # ----- non-root user -----------------------------------------------------
 # UID/GID default to 1000 so that write permissions on the bind-mounted
 # /share/Agent/* match the share owner on the QNAP. Override via build-arg
 # if you use a different UID/GID.
 ARG AGENT_UID=1000
 ARG AGENT_GID=1000
 RUN groupadd -g ${AGENT_GID} agent \
    && useradd  -u ${AGENT_UID} -g ${AGENT_GID} -m -s /bin/bash agent \
    && mkdir -p /var/cache/repos /var/cache/npm /var/log/agent /var/run/agent \
    && chown -R agent:agent /var/cache /var/log/agent /var/run/agent /home/agent
 # ----- runner files ------------------------------------------------------
 # Scripts, config and the agent role document live in /opt/agent, owned by
 # the agent user; all shell scripts in bin/ are made executable.
 WORKDIR /opt/agent
 COPY --chown=agent:agent bin/        ./bin/
 COPY --chown=agent:agent etc/        ./etc/
 COPY --chown=agent:agent CLAUDE.md   ./
 COPY --chown=agent:agent mcp-config.json ./
 RUN chmod +x ./bin/*.sh
 # ----- runtime config ----------------------------------------------------
 # Defaults for the paths and port that the scripts in bin/ read from the
 # environment.
 ENV PATH=/opt/agent/bin:/usr/local/bin:/usr/bin:/bin \
    HOME=/home/agent \
    NPM_CONFIG_CACHE=/var/cache/npm \
    PNPM_HOME=/var/cache/pnpm \
    AGENT_STATE_DIR=/var/run/agent \
    AGENT_LOG_DIR=/var/log/agent \
    AGENT_REPO_CACHE=/var/cache/repos \
    AGENT_JOB_ROOT=/tmp \
    AGENT_HEALTH_PORT=8080
 EXPOSE 8080
 # tini as PID 1 → correct signal handling, no zombie processes
 ENTRYPOINT ["/usr/bin/tini", "--"]
 CMD ["/opt/agent/bin/entrypoint.sh"]
--- a/README.md
+++ b/README.md
@ -0,0 +1,147 @@
 # scrum4me-agent-runner
 Headless Claude Code worker die de Scrum4Me job-queue (M13) leegtrekt vanaf
 een QNAP NAS via Container Station. Geen Vercel, geen browser, geen
 toetsenbord — Claude Code draait als daemon, claimt jobs uit
 `mcp__scrum4me__wait_for_job`, voert ze uit in een per-job clone, en pusht
 nooit zelf.
 ## Architectuur in één plaatje
 ```
 ┌─ QNAP TS-664 (Container Station) ─────────────────────────────┐
 │                                                                │
 │  ┌─ container: agent-runner ────────────────────────────────┐  │
 │  │  PID 1: tini → run-agent.sh (daemon-loop)                │  │
 │  │            ├─ health-server.js  (8080 → host)            │  │
 │  │            └─ claude -p (per-batch, met MCP via stdio)   │  │
 │  │                  └─ scrum4me-mcp → Neon Postgres         │  │
 │  │                                                          │  │
 │  │  /tmp/job-<id>      ephemeral working trees              │  │
 │  │  /var/cache/repos   bare git mirrors  (volume)           │  │
 │  │  /var/cache/npm     npm cache         (volume)           │  │
 │  │  /var/log/agent     run + job logs    (volume)           │  │
 │  └──────────────────────────────────────────────────────────┘  │
 │                                                                │
 │  /share/Agent/cache  /share/Agent/logs  /share/Agent/state     │
 └────────────────────────────────────────────────────────────────┘
                              │
                              ▼  HTTPS
                   Neon Postgres (Scrum4Me DB)
                              ▲
                              │
                   Vercel ─── Scrum4Me UI (gebruikers enqueueen jobs)
 ```
 Eén `claude -p`-invocation roept intern `wait_for_job` aan totdat de
 queue leeg is (≈600 s lege block-time → afsluiten). De wrapper start
 `claude -p` opnieuw zodra hij eindigt, met exponentiële backoff bij
 fouten.
 ## Wat zit waar
 | Bestand                  | Doel                                                            |
 |--------------------------|-----------------------------------------------------------------|
 | `Dockerfile`             | Ubuntu 22.04 + Node 22 + Claude Code + scrum4me-mcp + scripts   |
 | `docker-compose.yml`     | Service-definitie, volumes, env-file, restart-policy, limits    |
 | `package.json`           | Npm-dependencies van de runner zelf (alleen `scrum4me-mcp` pin) |
 | `mcp-config.json`        | Claude Code MCP-config (verwijst stdio naar scrum4me-mcp)       |
 | `CLAUDE.md`              | Agent-rol-instructies, auto-geladen door `claude -p`            |
 | `bin/entrypoint.sh`      | Container-startup: dirs, health-server, daemon-loop             |
 | `bin/run-agent.sh`       | Daemon-loop met backoff, exit-code-routing en state-writes      |
 | `bin/check-tokens.sh`    | Pre-flight: API-token, OAuth-token, DB-bereikbaarheid           |
 | `bin/job-prepare.sh`     | Per-job: bare-fetch + clone-via-reference naar `/tmp/job-<id>`  |
 | `bin/job-cleanup.sh`     | Per-job: logs naar `/var/log`, working tree weg                 |
 | `bin/health-server.js`   | HTTP-endpoint op 8080 dat state.json en marker-files leest      |
 | `bin/rotate-logs.sh`     | Compress/cleanup van oude `.log`-bestanden                      |
 | `.env.example`           | Alle env-vars met uitleg                                        |
 ## Vereisten op de NAS
 - Container Station 2+ (Docker compose v2)
 - Drie shares aangemaakt: `/share/Agent/cache`, `/share/Agent/logs`, `/share/Agent/state`
 - Of één share `/share/Agent` waaronder de drie subdirs vallen
 - Internet-uitgang naar `api.anthropic.com`, `github.com`, je Neon-host, `registry.npmjs.org`
 ## Deploy
 ```bash
 # 1. Op je werkstation: token's regelen
 #    a. CLAUDE_CODE_OAUTH_TOKEN  →  draai `claude setup-token` (browser-flow)
 #    b. SCRUM4ME_TOKEN           →  log in als de dedicated agent-user in
 #                                   Scrum4Me, /settings/tokens, label "NAS-runner"
 #    c. DATABASE_URL/DIRECT_URL  →  Neon dashboard
 # 2. Repo op de NAS plaatsen
 ssh admin@nas
 cd /share/Agent
 git clone https://github.com/<jij>/scrum4me-agent-runner.git
 cd scrum4me-agent-runner
 # 3. Env aanmaken
 cp .env.example .env
 chmod 600 .env
 vi .env   # vul alle waarden in
 # 4. Build + start
 docker compose build
 docker compose up -d
 # 5. Verifiëren
 curl http://nas.local:8080/health
 docker compose logs -f
 ```
 ## Updaten (handmatig, bewust)
 `SCRUM4ME_TOKEN` of `CLAUDE_CODE_OAUTH_TOKEN` rouleer je via een rebuild:
 ```bash
 cd /share/Agent/scrum4me-agent-runner
 git pull
 vi .env                   # nieuwe waarden
 docker compose build      # nieuwe scrum4me-mcp-versie als dat veranderd is
 docker compose up -d
 ```
 Dezelfde flow voor schema-drift in scrum4me-mcp: pin een nieuwe
 `MCP_GIT_REF` in `.env` of in `docker-compose.yml`, rebuild.
 ## Health-endpoint
 `GET http://<nas>:8080/health` retourneert:
 ```json
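 {
  "status": "running",            // o.a. starting | running | idle | unhealthy | token-expired | stale
  "rawStatus": "running",
  "startedAt": "2026-05-01T12:00:00Z",
  "heartbeatAt": "2026-05-01T12:35:10Z",
  "heartbeatAgeSeconds": 12,
  "lastBatchAt": "2026-05-01T12:34:56Z",
  "lastBatchExit": 0,
  "consecutiveFailures": 0,
  "markers": { "tokenExpired": false, "unhealthy": false }
 }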
 ```
 HTTP-status: `200` als running/idle, `503` bij token-expired, bij de
 `UNHEALTHY`-marker of als de laatste heartbeat ouder is dan 6 minuten (360 s).
 ## Filesystem-grenzen
 De agent-user heeft geen SSH-keys, geen `~/.gitconfig` met push-credentials,
 en geen toegang tot andere shares dan `/share/Agent/*`. Commits worden
 lokaal in de per-job clone gemaakt; pushen gebeurt door jou op je
 werkstation na review (CLAUDE.md regel: *"`git push` is altijd expliciet"*).
 ## Bekende grenzen
 - **Eén actieve job tegelijk.** De wrapper-loop is sequentieel. Voor
  parallellisme zou je meerdere containers met dezelfde `SCRUM4ME_TOKEN`
  kunnen draaien — `wait_for_job` gebruikt `FOR UPDATE SKIP LOCKED` dus
  dat is veilig op DB-niveau, maar dan moet je je `node_modules`-cache
  per container scheiden.
 - **OAuth-token: 1 jaar geldig.** Bij verloop schrijft de wrapper een
  `TOKEN_EXPIRED`-marker en wordt de container `unhealthy`. Geen
  auto-rotatie.
 - **`npm install` per job** kost op een N5095 ~30–60 s per Next.js-clone,
  óók met de pnpm-store. Voor zeer kleine fixes is dat de dominante
  factor. Kan later vervangen worden door een persistente warm-`node_modules`
  per repo als dat een knelpunt wordt.
--- a/_lib.sh
+++ b/_lib.sh
@ -0,0 +1,36 @@
 #!/usr/bin/env bash
 # _lib.sh — shared helpers; meant to be sourced from the other scripts.
 # Defaults for the state and log directories when the environment does not
 # provide them.
 : "${AGENT_STATE_DIR:=/var/run/agent}"
 : "${AGENT_LOG_DIR:=/var/log/agent}"
 # Print one log line to stderr: "[<UTC timestamp>] [<script name>] <message>".
 # Arguments: all arguments are joined into the message.
 log() {
    local stamp
    stamp=$(date -u +%Y-%m-%dT%H:%M:%SZ)
    printf '[%s] [%s] %s\n' "$stamp" "${0##*/}" "$*" >&2
 }
 # Write the JSON state file atomically (write a temp file, then rename it).
 # Arguments:
 #   $1 - JSON object (as a string) with the keys to set.
 # Behaviour:
 #   Keys already present in ${AGENT_STATE_DIR}/state.json are kept unless $1
 #   overrides them; heartbeatAt is always set to the current UTC time.
 #   A missing, empty or corrupt state.json is treated as '{}' so that one bad
 #   file cannot make every later update fail.
 # Returns:
 #   0 on success; non-zero when $1 is not a JSON object or the temp file
 #   cannot be written — in that case the existing state.json is left untouched.
 write_state() {
    local payload="$1"
    local tmp="${AGENT_STATE_DIR}/state.json.tmp.$$"
    local final="${AGENT_STATE_DIR}/state.json"
    local existing='{}'
    if [[ -f "$final" ]]; then
        # Slurp the file: anything other than exactly one JSON object (empty
        # file, several documents, a non-object) collapses to '{}'.
        existing=$(jq -cs \
            'if length == 1 and (.[0] | type) == "object" then .[0] else {} end' \
            "$final" 2>/dev/null) || existing='{}'
        [[ -n "$existing" ]] || existing='{}'
    fi
    # Only replace the state file when jq produced a complete document.
    if ! jq -n \
        --argjson existing "$existing" \
        --argjson payload "$payload" \
        --arg now "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
        '$existing + $payload + {heartbeatAt: $now}' \
        > "$tmp"; then
        rm -f -- "$tmp"
        return 1
    fi
    mv -- "$tmp" "$final"
 }
--- a/check-tokens.sh
+++ b/check-tokens.sh
@ -0,0 +1,94 @@
 #!/usr/bin/env bash
 # check-tokens.sh — validate credentials BEFORE the daemon loop starts
 #
 # Checks:
 #   1. CLAUDE_CODE_OAUTH_TOKEN or ANTHROPIC_API_KEY is present
 #   2. SCRUM4ME_TOKEN is present and works against ${SCRUM4ME_BASE_URL}/api/products
 #   3. DATABASE_URL host is reachable (best-effort: plain TCP connect to
 #      host:port; no SQL-level login is attempted)
 #
 # Exit 0 on success, 1 on any failure.
 # Note: no `set -e` — every check runs and records its result in $ok, so one
 # failing check does not hide the others.
 set -uo pipefail
 source /opt/agent/bin/_lib.sh
 ok=true
 # ----- 1. Anthropic credentials ----------------------------------------
 # Build a tiny fingerprint of which credentials are non-empty:
 #   ""  → none, "o" → OAuth only, "a" → API key only, "oa" → both.
 case "${CLAUDE_CODE_OAUTH_TOKEN:+o}${ANTHROPIC_API_KEY:+a}" in
    "")
        log "FAIL: neither CLAUDE_CODE_OAUTH_TOKEN nor ANTHROPIC_API_KEY is set"
        ok=false
        ;;
    oa)
        log "WARN: both CLAUDE_CODE_OAUTH_TOKEN and ANTHROPIC_API_KEY are set; Claude Code will pick one and warn"
        log "OK: anthropic credential present"
        ;;
    *)
        log "OK: anthropic credential present"
        ;;
 esac
 # ----- 2. Scrum4Me API token -------------------------------------------
 # Calls GET ${SCRUM4ME_BASE_URL}/api/products with the bearer token and maps
 # the HTTP status to OK/FAIL. Skipped with a warning when no base URL is set.
 if [[ -z "${SCRUM4ME_TOKEN:-}" ]]; then
    log "FAIL: SCRUM4ME_TOKEN is not set"
    ok=false
 elif [[ -z "${SCRUM4ME_BASE_URL:-}" ]]; then
    log "WARN: SCRUM4ME_BASE_URL not set — skipping API token validation"
 elif ! products_out=$(mktemp "${TMPDIR:-/tmp}/check-products.XXXXXX"); then
    # Unpredictable temp name instead of a fixed /tmp path.
    log "FAIL: could not create a temp file for the API token check"
    ok=false
 else
    log "checking SCRUM4ME_TOKEN against ${SCRUM4ME_BASE_URL}/api/products"
    # On a transport error curl itself already prints "000" via -w, so the
    # fallback must REPLACE the value rather than append a second "000".
    # Timeouts keep the pre-flight from hanging on an unresponsive host.
    http_code=$(curl -sS --connect-timeout 10 --max-time 30 \
        -o "$products_out" -w '%{http_code}' \
        -H "Authorization: Bearer ${SCRUM4ME_TOKEN}" \
        "${SCRUM4ME_BASE_URL}/api/products") || http_code="000"
    case "$http_code" in
        200)
            count=$(jq 'length' "$products_out" 2>/dev/null || echo "?")
            log "OK: SCRUM4ME_TOKEN works (${count} accessible products)"
            ;;
        401)
            log "FAIL: SCRUM4ME_TOKEN returned 401 — token revoked or wrong"
            ok=false
            ;;
        403)
            log "FAIL: SCRUM4ME_TOKEN returned 403 — likely a demo-token; create a non-demo agent-user"
            ok=false
            ;;
        000)
            log "FAIL: could not reach ${SCRUM4ME_BASE_URL} — network or DNS issue"
            ok=false
            ;;
        *)
            log "FAIL: unexpected status ${http_code} from ${SCRUM4ME_BASE_URL}/api/products"
            cat "$products_out" >&2 || true
            ok=false
            ;;
    esac
    rm -f -- "$products_out"
 fi
 # ----- 3. Database reachability ----------------------------------------
 # psql is deliberately not installed (avoids dependency bloat).
 # Best-effort: parse host+port from DATABASE_URL and do a TCP connect.
 #
 # parse_db_url <url>
 #   Sets the globals db_host and db_port from a URL of the form
 #   scheme://[user[:pass]@]host[:port][/path][?query]. The port defaults to
 #   5432; brackets around an IPv6 literal are stripped. db_host is empty when
 #   no host could be found.
 parse_db_url() {
    local authority="${1#*://}"          # drop the scheme
    authority="${authority%%[/?#]*}"     # drop path, query and fragment
    authority="${authority##*@}"         # drop userinfo (may itself contain ':')
    db_host="$authority"
    db_port=5432
    if [[ "$authority" =~ ^(.*):([0-9]+)$ ]]; then
        db_host="${BASH_REMATCH[1]}"
        db_port="${BASH_REMATCH[2]}"
    fi
    db_host="${db_host#\[}"
    db_host="${db_host%\]}"
 }
 if [[ -z "${DATABASE_URL:-}" ]]; then
    log "FAIL: DATABASE_URL not set"
    ok=false
 else
    parse_db_url "$DATABASE_URL"
    if [[ -z "$db_host" ]]; then
        log "WARN: could not parse host from DATABASE_URL — skipping reachability check"
    else
        log "checking TCP connect to ${db_host}:${db_port}"
        # Host and port are passed as arguments, not spliced into the command
        # string, so odd characters in the URL cannot alter the command.
        if timeout 5 bash -c '</dev/tcp/"$1"/"$2"' _ "$db_host" "$db_port" 2>/dev/null; then
            log "OK: ${db_host}:${db_port} reachable"
        else
            log "FAIL: cannot reach ${db_host}:${db_port}"
            ok=false
        fi
    fi
 fi
 # Final verdict: exit 1 as soon as any check above flipped $ok to false.
 if ! $ok; then
    log "pre-flight failed"
    exit 1
 fi
 log "all pre-flight checks passed"
 exit 0
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -0,0 +1,56 @@
 services:
  agent:
    build:
      context: .
      args:
        # Pin een specifieke commit van scrum4me-mcp in productie.
        # Aanpassen + `docker compose build` om te roteren.
        MCP_GIT_REF: ${MCP_GIT_REF:-main}
        CLAUDE_CODE_VERSION: ${CLAUDE_CODE_VERSION:-latest}
        AGENT_UID: ${AGENT_UID:-1000}
        AGENT_GID: ${AGENT_GID:-1000}
    image: scrum4me-agent-runner:local
    container_name: scrum4me-agent
    env_file:
      - .env
    # Volumes: drie persistent op de NAS-share, één tmpfs voor de per-job
    # working trees zodat ze nooit op de NAS-share belanden.
    volumes:
      - ${NAS_BASE:-/share/Agent}/cache:/var/cache
      - ${NAS_BASE:-/share/Agent}/logs:/var/log/agent
      - ${NAS_BASE:-/share/Agent}/state:/var/run/agent
    tmpfs:
      - /tmp:size=4g,mode=1777
    ports:
      - "${AGENT_HEALTH_PORT_HOST:-8080}:8080"
    restart: unless-stopped
    # N5095 heeft 4 cores. Geef agent er 3, laat 1 voor QTS.
    # Memory: 4GB is ruim voor één Claude Code sessie + één npm install.
    deploy:
      resources:
        limits:
          cpus: "3.0"
          memory: 4g
        reservations:
          cpus: "0.5"
          memory: 512m
    healthcheck:
      test: ["CMD", "curl", "-fsS", "http://localhost:8080/health"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 60s
    # Logging: laat docker-driver de stdout/stderr afvangen, gecapped op
    # ~50MB totaal. De daemon-loop schrijft eigen run-logs naar /var/log/agent.
    logging:
      driver: json-file
      options:
        max-size: "10m"
        max-file: "5"
--- a/entrypoint.sh
+++ b/entrypoint.sh
@ -0,0 +1,66 @@
 #!/usr/bin/env bash
 # entrypoint.sh — container startup
 #
 # Responsibilities:
 #   1. Guarantee writable directories on the bind-mounts (UID/GID matching)
 #   2. Start the health-server as a background process
 #   3. gosu to the agent user and start the daemon loop
 #
 # Runs as root up to step 3 — root is needed to fix up the bind-mounts when
 # the share was created with different ownership.
 set -euo pipefail
 # Local logger: prefixes every message with "[entrypoint]" and writes to stderr.
 log() { printf '[entrypoint] %s\n' "$*" >&2; }
 # Defaults for anything the environment does not provide.
 : "${AGENT_UID:=1000}"
 : "${AGENT_GID:=1000}"
 : "${AGENT_STATE_DIR:=/var/run/agent}"
 : "${AGENT_LOG_DIR:=/var/log/agent}"
 : "${AGENT_REPO_CACHE:=/var/cache/repos}"
 : "${AGENT_HEALTH_PORT:=8080}"
 # ----- 1. dirs on bind-mounts -------------------------------------------
 log "ensuring directories on bind-mounts"
 # Every directory the runner writes to; created if missing.
 agent_dirs=(
    "${AGENT_STATE_DIR}"
    "${AGENT_LOG_DIR}/runs"
    "${AGENT_LOG_DIR}/jobs"
    "${AGENT_REPO_CACHE}"
    /var/cache/npm
    /var/cache/pnpm
 )
 mkdir -p "${agent_dirs[@]}"
 # Fix ownership in case the share was created by another user. Deliberately
 # non-recursive: /var/cache/repos can be large and a recursive chown would
 # slow down the first boot. Failures are ignored on purpose (best-effort).
 chown "${AGENT_UID}:${AGENT_GID}" \
    "${AGENT_LOG_DIR}" \
    "${agent_dirs[@]}" 2>/dev/null || true
 # ----- 2. health-server in the background -------------------------------
 log "starting health-server on :${AGENT_HEALTH_PORT}"
 gosu agent node /opt/agent/bin/health-server.js \
    > "${AGENT_LOG_DIR}/health-server.log" 2>&1 &
 HEALTH_PID=$!
 log "health-server pid=${HEALTH_PID}"
 # Initial state: starting.
 # The target path is handed to the child shell as an argument: the defaults
 # set with `: "${VAR:=...}"` in this script are not exported, so a child that
 # expanded ${AGENT_STATE_DIR} itself would see it empty whenever the variable
 # did not come from the environment. The file is written as the agent user.
 started_at=$(date -u +%Y-%m-%dT%H:%M:%SZ)
 printf '{\n  "status": "starting",\n  "startedAt": "%s",\n  "lastBatchAt": null,\n  "lastBatchExit": null,\n  "consecutiveFailures": 0\n}\n' \
    "$started_at" \
    | gosu agent /bin/bash -c 'cat > "$1"' _ "${AGENT_STATE_DIR}/state.json"
 # ----- 3. drop privileges and start the daemon loop ---------------------
 # `exec` replaces this shell, so run-agent.sh takes over this process's PID
 # and receives signals directly.
 log "dropping to agent user and starting run-agent.sh"
 exec gosu agent /opt/agent/bin/run-agent.sh
--- a/health-server.js
+++ b/health-server.js
@ -0,0 +1,129 @@
 #!/usr/bin/env node
 // health-server.js
 //
 // A minimal HTTP endpoint that reports the daemon state:
 //   - /health   → JSON with status, last batch, failures and marker flags
 //   - /healthz  → 200 OK (minimal liveness probe; note that the compose
 //                 healthcheck in this repo targets /health)
 //
 // State is read from ${AGENT_STATE_DIR}/state.json. Marker files:
 //   - UNHEALTHY      → 503, too many consecutive failures
 //   - TOKEN_EXPIRED  → 503, a credential has expired
 // A heartbeat older than STALE_HEARTBEAT_SECONDS also yields 503 ("stale").
 //
 // No external deps — uses node built-ins only.
 const http = require('node:http');
 const fs = require('node:fs/promises');
 const path = require('node:path');
 const STATE_DIR = process.env.AGENT_STATE_DIR || '/var/run/agent';
 const PORT = Number(process.env.AGENT_HEALTH_PORT || 8080);
 // If the heartbeat is older than this, the daemon is considered a zombie.
 const STALE_HEARTBEAT_SECONDS = 360;  // 6 min — per the original note the wrapper writes state every iteration (not verifiable from this file)
 const STATE_FILE = path.join(STATE_DIR, 'state.json');
 const UNHEALTHY_MARKER = path.join(STATE_DIR, 'UNHEALTHY');
 const TOKEN_EXPIRED_MARKER = path.join(STATE_DIR, 'TOKEN_EXPIRED');
 /**
  * Report whether a path is accessible.
  * @param {string} p - Path to probe.
  * @returns {Promise<boolean>} true when fs.access succeeds, false on any error.
  */
 async function exists(p) {
    return fs.access(p).then(
        () => true,
        () => false,
    );
 }
 /**
  * Load and parse the state file.
  * @returns {Promise<object>} The parsed state, or
  *   `{ status: 'unknown', error }` when the file cannot be read or parsed.
  */
 async function readState() {
    let parsed;
    try {
        parsed = JSON.parse(await fs.readFile(STATE_FILE, 'utf8'));
    } catch (cause) {
        parsed = { status: 'unknown', error: String(cause) };
    }
    return parsed;
 }
 /**
  * Age of a timestamp in whole seconds, relative to now.
  * @param {string|null|undefined} iso - Timestamp parseable by Date.parse.
  * @returns {number|null} Elapsed seconds (floored), or null when the value
  *   is missing or not parseable.
  */
 function ageSeconds(iso) {
    const parsedMs = iso ? Date.parse(iso) : Number.NaN;
    return Number.isNaN(parsedMs)
        ? null
        : Math.floor((Date.now() - parsedMs) / 1000);
 }
 /**
  * Combine state.json, the marker files and the heartbeat age into one
  * health verdict.
  *
  * Precedence of degraded states: TOKEN_EXPIRED marker, then UNHEALTHY marker,
  * then a stale heartbeat. Any of them yields HTTP 503; otherwise 200 with the
  * status taken from state.json (or 'unknown' when absent).
  *
  * @returns {Promise<{httpStatus: number, body: object}>}
  */
 async function buildResponse() {
    const snapshot = await readState();
    const tokenExpired = await exists(TOKEN_EXPIRED_MARKER);
    const unhealthy = await exists(UNHEALTHY_MARKER);
    const heartbeatAgeS = ageSeconds(snapshot.heartbeatAt);
    // A missing/unparseable heartbeat (null age) is never treated as stale.
    const isStale =
        heartbeatAgeS !== null && heartbeatAgeS > STALE_HEARTBEAT_SECONDS;
    let degraded = null;
    if (tokenExpired) degraded = 'token-expired';
    else if (unhealthy) degraded = 'unhealthy';
    else if (isStale) degraded = 'stale';
    const httpStatus = degraded === null ? 200 : 503;
    const effectiveStatus = degraded ?? (snapshot.status || 'unknown');
    return {
        httpStatus,
        body: {
            status: effectiveStatus,
            rawStatus: snapshot.status,
            startedAt: snapshot.startedAt,
            heartbeatAt: snapshot.heartbeatAt,
            heartbeatAgeSeconds: heartbeatAgeS,
            lastBatchAt: snapshot.lastBatchAt,
            lastBatchExit: snapshot.lastBatchExit,
            currentBatchStartedAt: snapshot.currentBatchStartedAt,
            consecutiveFailures: snapshot.consecutiveFailures ?? 0,
            markers: { tokenExpired, unhealthy },
        },
    };
 }
 // Request router: /healthz is a constant liveness reply, /health reports the
 // full verdict from buildResponse(), everything else is a 404.
 const server = http.createServer(async (req, res) => {
    switch (req.url) {
        case '/healthz':
            res.writeHead(200, { 'content-type': 'text/plain' });
            res.end('ok\n');
            break;
        case '/health':
            try {
                const verdict = await buildResponse();
                res.writeHead(verdict.httpStatus, {
                    'content-type': 'application/json',
                    'cache-control': 'no-store',
                });
                res.end(JSON.stringify(verdict.body, null, 2));
            } catch (err) {
                res.writeHead(500, { 'content-type': 'application/json' });
                res.end(JSON.stringify({ status: 'error', error: String(err) }));
            }
            break;
        default:
            res.writeHead(404, { 'content-type': 'text/plain' });
            res.end('not found\n');
    }
 });
 // Listen on all interfaces so the port published by the container reaches us.
 server.listen(PORT, '0.0.0.0', () => {
    console.log(`[health-server] listening on :${PORT}`);
 });
 // Graceful shutdown: stop accepting connections and exit 0 once the server
 // has closed; force exit 1 after 5 s if it has not. The timer is unref'd so
 // it does not keep the process alive by itself.
 for (const sig of ['SIGTERM', 'SIGINT']) {
    process.on(sig, () => {
        console.log(`[health-server] received ${sig}, shutting down`);
        server.close(() => process.exit(0));
        setTimeout(() => process.exit(1), 5000).unref();
    });
 }
--- a/job-cleanup.sh
+++ b/job-cleanup.sh
@ -0,0 +1,49 @@
 #!/usr/bin/env bash
 # job-cleanup.sh — close out a per-job working tree
 #
 # Usage: job-cleanup.sh <job_id>
 #
 # - Saves `git log` and `git diff` of the feature branch to
 #   /var/log/agent/jobs/<job_id>/ so you can see afterwards what the agent
 #   did without cluttering the NAS share with clones.
 # - Removes the working tree.
 #
 # Exit codes: 2 on a missing or malformed job_id.
 set -uo pipefail
 source /opt/agent/bin/_lib.sh
 : "${AGENT_JOB_ROOT:=/tmp}"
 : "${AGENT_LOG_DIR:=/var/log/agent}"
 JOB_ID="${1:-}"
 if [[ -z "$JOB_ID" ]]; then
    log "usage: $0 <job_id>"
    exit 2
 fi
 # The id is spliced into paths that are later created and removed with
 # `rm -rf`; refuse anything that could escape the job/log roots
 # (slashes, leading dots, whitespace, ...).
 if [[ ! "$JOB_ID" =~ ^[A-Za-z0-9][A-Za-z0-9._-]*$ ]]; then
    log "invalid job_id: ${JOB_ID}"
    exit 2
 fi
 WORK_DIR="${AGENT_JOB_ROOT}/job-${JOB_ID}"
 ARCHIVE_DIR="${AGENT_LOG_DIR}/jobs/${JOB_ID}"
 mkdir -p "$ARCHIVE_DIR"
 # Archive what the agent committed, relative to the remote's default branch.
 # All git calls are best-effort: a failure leaves an empty/partial artifact
 # but never blocks the removal of the working tree below.
 if [[ -d "$WORK_DIR/.git" ]]; then
    log "archiving git artifacts to ${ARCHIVE_DIR}"
    (
        # Never run the git commands in the wrong directory.
        cd "$WORK_DIR" || exit 1
        # Use the default branch recorded by `git clone` (origin/HEAD) so
        # repositories whose default branch is not "main" still produce a
        # meaningful log/diff; fall back to origin/main when it is unknown.
        base=$(git symbolic-ref --quiet --short refs/remotes/origin/HEAD 2>/dev/null) \
            || base="origin/main"
        git log --oneline --decorate -n 50 "${base}..HEAD" \
            > "$ARCHIVE_DIR/commits.txt" 2>/dev/null || true
        git diff "${base}..HEAD" \
            > "$ARCHIVE_DIR/diff.patch" 2>/dev/null || true
        git rev-parse HEAD \
            > "$ARCHIVE_DIR/HEAD" 2>/dev/null || true
        git branch --show-current \
            > "$ARCHIVE_DIR/branch" 2>/dev/null || true
    )
 fi
 if [[ -d "$WORK_DIR" ]]; then
    log "removing ${WORK_DIR}"
    # `:?` aborts instead of expanding to an empty path.
    rm -rf -- "${WORK_DIR:?}"
 fi
 log "cleanup complete for job ${JOB_ID}"
--- a/job-prepare.sh
+++ b/job-prepare.sh
@ -0,0 +1,90 @@
 #!/usr/bin/env bash
 # job-prepare.sh — set up a per-job working tree
 #
 # Usage: job-prepare.sh <job_id> <repo_url>
 #
 # Strategy:
 #   1. Maintain a bare cache of the repo in $AGENT_REPO_CACHE/<slug>.git
 #   2. Fetch before cloning (latest main state)
 #   3. `git clone --reference --dissociate` so objects are shared
 #      (fast clone, clean working tree)
 #   4. `npm install` with a shared cache via $NPM_CONFIG_CACHE
 #
 # Output (last line): path of the working tree, for use with `cd`.
 # Exit code 2 on missing or invalid arguments.
 set -uo pipefail
 source /opt/agent/bin/_lib.sh
 : "${AGENT_REPO_CACHE:=/var/cache/repos}"
 : "${AGENT_JOB_ROOT:=/tmp}"
 : "${AGENT_LOG_DIR:=/var/log/agent}"
 JOB_ID="${1:-}"
 REPO_URL="${2:-}"
 if [[ -z "$JOB_ID" || -z "$REPO_URL" ]]; then
    log "usage: $0 <job_id> <repo_url>"
    exit 2
 fi
 # The id is interpolated into the working-tree path (which is `rm -rf`'d
 # when stale) and into the job-log path; refuse values that could escape
 # those directories.
 if [[ "$JOB_ID" == */* || "$JOB_ID" == "." || "$JOB_ID" == ".." ]]; then
    log "invalid job_id: ${JOB_ID}"
    exit 2
 fi
 # Derive the cache name from the repo URL:
 #   "github.com/foo/bar.git" → "foo_bar"
 # Works for https, ssh:// and scp-style (git@host:owner/repo) URLs.
 # A trailing "/" and the ".git" suffix are stripped in separate steps:
 # POSIX ERE has no lazy quantifier, so a single pattern with `[^/]+?`
 # is greedy and would keep ".git" in the slug, giving the same repo two
 # different cache directories depending on how the URL was spelled.
 # Arguments: $1 - repository URL
 # Outputs:   the slug ("<owner>_<repo>") on stdout
 _repo_slug() {
    printf '%s\n' "$1" \
        | sed -E \
            -e 's#/$##' \
            -e 's#\.git$##' \
            -e 's#^.*[:/]([^/]+/[^/]+)$#\1#' \
        | tr '/' '_'
 }
 SLUG=$(_repo_slug "$REPO_URL")
 CACHE_DIR="${AGENT_REPO_CACHE}/${SLUG}.git"
 WORK_DIR="${AGENT_JOB_ROOT}/job-${JOB_ID}"
 JOB_LOG="${AGENT_LOG_DIR}/jobs/${JOB_ID}.log"
 # If the job log cannot be opened, the redirected group below would be
 # skipped entirely and the script would still print the path; fail early.
 if ! mkdir -p "$(dirname "$JOB_LOG")" || ! true >> "$JOB_LOG"; then
    log "ERROR: cannot write job log ${JOB_LOG}"
    exit 1
 fi
 # Everything inside this group is appended to the job log, so stdout stays
 # reserved for the final working-tree path. The group runs in the current
 # shell: an `exit` inside it ends the script before the path is printed.
 {
    log "preparing job ${JOB_ID} from ${REPO_URL}"
    log "cache=${CACHE_DIR} work=${WORK_DIR}"
    # ----- 1. ensure bare cache ------------------------------------
    if [[ ! -d "$CACHE_DIR" ]]; then
        log "cache miss — bare-cloning ${REPO_URL}"
        if ! git clone --bare --filter=blob:none "$REPO_URL" "$CACHE_DIR"; then
            log "ERROR: bare clone of ${REPO_URL} failed"
            exit 1
        fi
    else
        log "cache hit — fetching latest"
        # A stale cache only makes the clone below slower, so keep going.
        git -C "$CACHE_DIR" fetch --all --prune --quiet \
            || log "WARN: cache fetch failed (continuing with stale cache)"
    fi
    # ----- 2. fresh working tree -----------------------------------
    if [[ -d "$WORK_DIR" ]]; then
        log "stale work dir found — removing"
        rm -rf -- "$WORK_DIR"
    fi
    log "cloning via reference"
    if ! git clone \
        --reference "$CACHE_DIR" \
        --dissociate \
        --quiet \
        "$REPO_URL" "$WORK_DIR"; then
        log "ERROR: clone of ${REPO_URL} into ${WORK_DIR} failed"
        exit 1
    fi
    # ----- 3. branch -----------------------------------------------
    BRANCH="agent/job-${JOB_ID}"
    if ! git -C "$WORK_DIR" checkout -B "$BRANCH"; then
        log "ERROR: could not check out ${BRANCH}"
        exit 1
    fi
    log "checked out ${BRANCH} from $(git -C "$WORK_DIR" rev-parse --short HEAD)"
    # ----- 4. install deps -----------------------------------------
    # Dependency installation is best-effort: a failure is logged and the
    # working tree is still handed to the agent.
    if [[ -f "$WORK_DIR/package-lock.json" ]]; then
        # Default expansion: under `set -u` an unset NPM_CONFIG_CACHE would
        # otherwise abort the script in this log line.
        log "running npm ci (cache=${NPM_CONFIG_CACHE:-npm default})"
        ( cd "$WORK_DIR" && npm ci --no-audit --no-fund --prefer-offline ) \
            || log "WARN: npm ci failed (continuing — agent kan zelf decide)"
    elif [[ -f "$WORK_DIR/package.json" ]]; then
        log "no lockfile — running npm install"
        ( cd "$WORK_DIR" && npm install --no-audit --no-fund --prefer-offline ) \
            || log "WARN: npm install failed"
    else
        log "no package.json — skipping install"
    fi
    log "ready: ${WORK_DIR}"
 } >> "$JOB_LOG" 2>&1
 # Print the path on stdout so the caller (Claude) can parse it.
 printf '%s\n' "$WORK_DIR"
--- a/mcp-config.json
+++ b/mcp-config.json
@ -0,0 +1,14 @@
 {
  "mcpServers": {
    "scrum4me": {
      "type": "stdio",
      "command": "npx",
      "args": ["tsx", "/opt/scrum4me-mcp/src/index.ts"],
      "env": {
        "SCRUM4ME_TOKEN": "${SCRUM4ME_TOKEN}",
        "DATABASE_URL": "${DATABASE_URL}",
        "DIRECT_URL": "${DIRECT_URL}"
      }
    }
  }
 }
--- a/package.json
+++ b/package.json
@ -0,0 +1,17 @@
 {
  "name": "scrum4me-agent-runner",
  "version": "0.1.0",
  "private": true,
  "description": "Headless Claude Code worker dat de Scrum4Me job-queue leegt vanaf een NAS",
  "scripts": {
    "build": "docker compose build",
    "up": "docker compose up -d",
    "down": "docker compose down",
    "logs": "docker compose logs -f",
    "rebuild": "docker compose build --no-cache && docker compose up -d",
    "health": "curl -fsS http://localhost:${AGENT_HEALTH_PORT_HOST:-8080}/health | jq ."
  },
  "engines": {
    "node": ">=22"
  }
 }
--- a/rotate-logs.sh
+++ b/rotate-logs.sh
@ -0,0 +1,36 @@
 #!/usr/bin/env bash
 # rotate-logs.sh — compress old run logs and delete very old ones
 #
 # Called on every daemon iteration. Operates on:
 #   - ${AGENT_LOG_DIR}/runs/*.log     (one per `claude -p` invocation)
 #   - ${AGENT_LOG_DIR}/jobs/<id>/     (per-job archive directories)
 #   - ${AGENT_LOG_DIR}/jobs/<id>.log  (per-job prepare logs)
 #
 # Configurable via env:
 #   AGENT_LOG_GZIP_AFTER_HOURS (default 24)
 #   AGENT_LOG_DELETE_AFTER_DAYS (default 30)
 #
 # Rotation is best-effort: missing directories and individual failures are
 # ignored and the script always exits 0.
 set -uo pipefail
 source /opt/agent/bin/_lib.sh
 : "${AGENT_LOG_DIR:=/var/log/agent}"
 : "${AGENT_LOG_GZIP_AFTER_HOURS:=24}"
 : "${AGENT_LOG_DELETE_AFTER_DAYS:=30}"
 # Both settings feed shell arithmetic / find predicates. Anything that is
 # not a plain non-negative integer would silently disable rotation, so
 # fall back to the defaults instead.
 if [[ ! "$AGENT_LOG_GZIP_AFTER_HOURS" =~ ^[0-9]+$ ]]; then
    log "WARN: invalid AGENT_LOG_GZIP_AFTER_HOURS — using 24"
    AGENT_LOG_GZIP_AFTER_HOURS=24
 fi
 if [[ ! "$AGENT_LOG_DELETE_AFTER_DAYS" =~ ^[0-9]+$ ]]; then
    log "WARN: invalid AGENT_LOG_DELETE_AFTER_DAYS — using 30"
    AGENT_LOG_DELETE_AFTER_DAYS=30
 fi
 # Convert hours → minutes for find -mmin (10# avoids octal parsing of "08").
 GZIP_AFTER_MIN=$(( 10#$AGENT_LOG_GZIP_AFTER_HOURS * 60 ))
 # 1. Gzip old .log files in runs/. NUL-delimited so file names containing
 #    whitespace or backslashes are passed to gzip unchanged.
 while IFS= read -r -d '' old_log; do
    gzip -q -- "$old_log" </dev/null || true
 done < <(find "${AGENT_LOG_DIR}/runs" -maxdepth 1 -type f -name '*.log' \
    -mmin "+${GZIP_AFTER_MIN}" -print0 2>/dev/null)
 # 2. Delete very old .gz files in runs/
 find "${AGENT_LOG_DIR}/runs" -maxdepth 1 -type f -name '*.log.gz' \
    -mtime "+${AGENT_LOG_DELETE_AFTER_DAYS}" -delete 2>/dev/null || true
 # 3. Delete very old job archive directories
 find "${AGENT_LOG_DIR}/jobs" -maxdepth 1 -mindepth 1 -type d \
    -mtime "+${AGENT_LOG_DELETE_AFTER_DAYS}" -exec rm -rf {} + 2>/dev/null || true
 # 4. Delete very old per-job prepare logs (jobs/<id>.log); these are plain
 #    files next to the archive directories and are not covered by step 3.
 find "${AGENT_LOG_DIR}/jobs" -maxdepth 1 -type f -name '*.log' \
    -mtime "+${AGENT_LOG_DELETE_AFTER_DAYS}" -delete 2>/dev/null || true
--- a/run-agent.sh
+++ b/run-agent.sh
@ -0,0 +1,128 @@
 #!/usr/bin/env bash
 # run-agent.sh — daemon loop
 #
 # Strategy:
 #   - First a pre-flight token check (once; blocks start-up on failure)
 #   - Loop: claude -p with the seed prompt
 #   - Exit 0 → the queue was empty; sleep briefly, repeat
 #   - Exit ≠ 0 → exponential backoff, log, write state, repeat
 #   - After N consecutive failures → write the UNHEALTHY marker and stop
 #     iterating; the container keeps running for diagnosis
 #     (NOTE(review): the health endpoint is expected to answer 503 in
 #     that state — confirm in the health server)
 #   - On a detected token expiry → write the TOKEN_EXPIRED marker and
 #     stay idle (`sleep infinity`); the script does not exit, so compose
 #     does not restart it and the marker stays in place until a rebuild
 set -uo pipefail   # note: no -e, we want to inspect exit codes ourselves
 source /opt/agent/bin/_lib.sh
 # Same default as the sibling scripts; keeps `set -u` from aborting below
 # when the variable is not provided by the environment.
 : "${AGENT_LOG_DIR:=/var/log/agent}"
 : "${AGENT_MAX_FAILURES:=5}"
 : "${AGENT_BACKOFF_START:=5}"
 : "${AGENT_BACKOFF_FACTOR:=2}"
 : "${AGENT_BACKOFF_MAX:=300}"
 mkdir -p "${AGENT_LOG_DIR}/runs"
 # ----- pre-flight -------------------------------------------------------
 log "pre-flight token check"
 if ! /opt/agent/bin/check-tokens.sh; then
    log "pre-flight failed — see check-tokens output above"
    write_state '{"status":"unhealthy","reason":"preflight-failed"}'
    touch "${AGENT_STATE_DIR}/UNHEALTHY"
    # Stay alive so the health endpoint remains debuggable, without
    # needlessly restarting the whole compose service.
    sleep infinity
 fi
 # Pre-flight passed: clear markers left behind by a previous run.
 rm -f "${AGENT_STATE_DIR}/UNHEALTHY" "${AGENT_STATE_DIR}/TOKEN_EXPIRED"
 # Log rotation once at start-up, then after every iteration.
 /opt/agent/bin/rotate-logs.sh || true
 # ----- seed prompt ------------------------------------------------------
 SEED_PROMPT='Pak de volgende job uit de Scrum4Me-queue en draai de queue leeg volgens de loop in /opt/agent/CLAUDE.md. Niet stoppen tussen jobs door. Sluit pas af zodra wait_for_job na de volledige block-time terugkomt zonder claim.'
 # Tools allowlist: the MCP tools offered by scrum4me-mcp plus the standard
 # file/bash tools. No WebFetch, no WebSearch — the agent does not need
 # them and leaving them out reduces the attack surface.
 ALLOWED_TOOLS='Read,Edit,Write,Bash,Grep,Glob,mcp__scrum4me__health,mcp__scrum4me__list_products,mcp__scrum4me__get_claude_context,mcp__scrum4me__wait_for_job,mcp__scrum4me__update_job_status,mcp__scrum4me__update_task_status,mcp__scrum4me__update_task_plan,mcp__scrum4me__log_implementation,mcp__scrum4me__log_test_result,mcp__scrum4me__log_commit,mcp__scrum4me__create_pbi,mcp__scrum4me__create_story,mcp__scrum4me__create_task,mcp__scrum4me__create_todo,mcp__scrum4me__ask_user_question,mcp__scrum4me__get_question_answer,mcp__scrum4me__list_open_questions,mcp__scrum4me__cancel_question'
 # Loop state: consecutive failed batches and the current backoff delay (s).
 CONSEC_FAILURES=0
 BACKOFF=${AGENT_BACKOFF_START}
 # Main daemon loop: one `claude -p` batch per iteration, forever.
 while true; do
    iteration_start=$(date -u +%Y-%m-%dT%H:%M:%SZ)
    run_log="${AGENT_LOG_DIR}/runs/$(date -u +%Y%m%dT%H%M%SZ).log"
    write_state "$(jq -n \
        --arg started "$iteration_start" \
        --argjson failures "$CONSEC_FAILURES" \
        '{status:"running", currentBatchStartedAt:$started, consecutiveFailures:$failures}')"
    log "starting batch (log: ${run_log})"
    # claude -p with our MCP config and allowlist.
    # NOTE(review): the original comment expects cwd = /opt/agent so that
    # CLAUDE.md is picked up automatically; this script does not `cd`
    # itself — confirm the container's working directory.
    #
    # --permission-mode bypassPermissions: switches off all remaining
    # permission prompts. Considered acceptable per the deployment design
    # (non-root agent user, no push credentials, writes limited to
    # /tmp/job-*); the allowlist above remains as a second filter.
    #
    # errexit is never enabled in this script (see `set -uo pipefail` at
    # the top), so the exit code can be captured directly. Do NOT wrap
    # this in `set +e` / `set -e`: the trailing `set -e` would switch
    # errexit on for the rest of the daemon's lifetime and let any failing
    # helper (touch, write_state, log, ...) kill the loop.
    claude -p "${SEED_PROMPT}" \
        --mcp-config /opt/agent/mcp-config.json \
        --allowedTools "${ALLOWED_TOOLS}" \
        --permission-mode bypassPermissions \
        --output-format text \
        > "${run_log}" 2>&1
    exit_code=$?
    iteration_end=$(date -u +%Y-%m-%dT%H:%M:%SZ)
    log "batch ended exit=${exit_code}"
    # Token-expiry detection: scan the combined stdout/stderr for known
    # strings. Case-insensitive so that e.g. "401 Unauthorized" and
    # "Authentication failed" are recognised as well.
    if grep -qiE '(invalid_api_key|authentication.*failed|401.*unauthor|OAuth.*expired)' "${run_log}"; then
        log "AUTH FAILURE detected in run log — marking TOKEN_EXPIRED"
        touch "${AGENT_STATE_DIR}/TOKEN_EXPIRED"
        write_state "$(jq -n \
            --arg endedAt "$iteration_end" \
            --argjson exit "$exit_code" \
            '{status:"token-expired", lastBatchAt:$endedAt, lastBatchExit:$exit}')"
        # Stay idle — no restart; the user has to rebuild with a new token.
        sleep infinity
    fi
    if [[ "$exit_code" -eq 0 ]]; then
        # Clean exit: reset the failure counter and the backoff delay.
        CONSEC_FAILURES=0
        BACKOFF=${AGENT_BACKOFF_START}
        write_state "$(jq -n \
            --arg endedAt "$iteration_end" \
            '{status:"idle", lastBatchAt:$endedAt, lastBatchExit:0, consecutiveFailures:0}')"
        log "queue empty — sleep 2s"
        sleep 2
    else
        CONSEC_FAILURES=$((CONSEC_FAILURES + 1))
        write_state "$(jq -n \
            --arg endedAt "$iteration_end" \
            --argjson exit "$exit_code" \
            --argjson failures "$CONSEC_FAILURES" \
            '{status:"backoff", lastBatchAt:$endedAt, lastBatchExit:$exit, consecutiveFailures:$failures}')"
        if [[ "$CONSEC_FAILURES" -ge "$AGENT_MAX_FAILURES" ]]; then
            log "too many consecutive failures (${CONSEC_FAILURES}) — marking UNHEALTHY"
            touch "${AGENT_STATE_DIR}/UNHEALTHY"
            # Stop iterating but keep the process alive for diagnosis.
            sleep infinity
        fi
        log "backing off ${BACKOFF}s before retry"
        sleep "$BACKOFF"
        # Exponential backoff, capped at AGENT_BACKOFF_MAX seconds.
        BACKOFF=$(( BACKOFF * AGENT_BACKOFF_FACTOR ))
        if [[ "$BACKOFF" -gt "$AGENT_BACKOFF_MAX" ]]; then
            BACKOFF=$AGENT_BACKOFF_MAX
        fi
    fi
    /opt/agent/bin/rotate-logs.sh || true
 done