scrum4me-docker/bin/run-agent.sh
Janpeter Visser 834e7912e7 feat(ST-mmuwreer): add check_queue_empty stap + allowedTools
CLAUDE.md: nieuwe stap 8 in operationele loop — agent roept
check_queue_empty aan na update_job_status('done'). Bij empty=true
exit batch direct ipv 600s wait_for_job-poll.

bin/run-agent.sh: voeg mcp__scrum4me__check_queue_empty toe aan
ALLOWED_TOOLS zodat de agent de tool ook daadwerkelijk mag aanroepen.

Vereist: scrum4me-mcp v0.3.0+ in MCP_GIT_REF (na merge bumpen + rebuild).

Re-doet werk uit `bd6b91e` dat in eerdere agent-run verloren ging
omdat verify_task_against_plan errorde (origin/main hard-coded; bug
in scrum4me-mcp opgevangen in PBI cmoq1j2e2001dvt17scif1flj).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 19:37:43 +02:00

128 lines
5.5 KiB
Bash

#!/usr/bin/env bash
# run-agent.sh — daemon loop
#
# Strategy:
#   - First a one-time pre-flight token check (blocks start-up on failure).
#   - Loop: claude -p with the seed prompt.
#       - Exit 0   → the queue was empty; sleep briefly, repeat.
#       - Exit ≠ 0 → exponential backoff, log, write state, repeat.
#   - After N consecutive failures → write the UNHEALTHY marker; the health
#     endpoint goes to 503 and the container keeps running for diagnosis.
#   - On detected token expiry → write the TOKEN_EXPIRED marker and park
#     the process (sleep infinity). The script does NOT exit in that case,
#     so the service is not restarted; the marker stays in place until the
#     next successful pre-flight removes it.
#
# Tunables (environment, defaults assigned below):
#   AGENT_MAX_FAILURES    consecutive failed batches before UNHEALTHY
#   AGENT_BACKOFF_START   first backoff delay in seconds
#   AGENT_BACKOFF_FACTOR  multiplier applied after every failed batch
#   AGENT_BACKOFF_MAX     upper bound for the backoff delay in seconds
set -uo pipefail # note: deliberately no -e, we want to inspect exit codes

# The helper library provides log and write_state. Without it every later
# step would fail with "command not found", so stop right away with a clear
# message instead of limping on.
if [[ ! -r /opt/agent/bin/_lib.sh ]]; then
  printf 'run-agent: cannot read /opt/agent/bin/_lib.sh\n' >&2
  exit 1
fi
source /opt/agent/bin/_lib.sh

# Both directories are used throughout the script but are not assigned in
# this file (presumably they come from _lib.sh or the container environment).
# An empty value would silently resolve to paths under "/", so reject it.
: "${AGENT_LOG_DIR:?AGENT_LOG_DIR must be set and non-empty}"
: "${AGENT_STATE_DIR:?AGENT_STATE_DIR must be set and non-empty}"

# Defaults for the tunables; values already present in the environment win.
: "${AGENT_MAX_FAILURES:=5}"
: "${AGENT_BACKOFF_START:=5}"
: "${AGENT_BACKOFF_FACTOR:=2}"
: "${AGENT_BACKOFF_MAX:=300}"

# Every batch writes its output to its own file below runs/.
mkdir -p "${AGENT_LOG_DIR}/runs"
# ----- pre-flight -------------------------------------------------------
# One-time token check before the loop starts. When it fails the state is
# recorded as unhealthy, the UNHEALTHY marker is created and the process
# parks itself instead of exiting.
log "pre-flight token check"
/opt/agent/bin/check-tokens.sh || {
  log "pre-flight failed — see check-tokens output above"
  write_state '{"status":"unhealthy","reason":"preflight-failed"}'
  touch "${AGENT_STATE_DIR}/UNHEALTHY"
  # Stay alive so the health endpoint remains debuggable, without making
  # the whole compose service restart needlessly.
  sleep infinity
}

# Drop markers that a previous run may have left behind.
rm -f "${AGENT_STATE_DIR}"/{UNHEALTHY,TOKEN_EXPIRED}

# Rotate logs once up front; after that it happens on every iteration.
# Rotation is best-effort: a failure here must not stop the daemon.
/opt/agent/bin/rotate-logs.sh || :
# ----- seed prompt ------------------------------------------------------
# Instruction handed to `claude -p` at the start of every batch.
SEED_PROMPT='Pak de volgende job uit de Scrum4Me-queue en draai de queue leeg volgens de loop in /opt/agent/CLAUDE.md. Niet stoppen tussen jobs door. Sluit pas af zodra wait_for_job na de volledige block-time terugkomt zonder claim.'

# Tool allowlist: the scrum4me MCP tools plus the standard file/bash tools.
# No WebFetch and no WebSearch — the agent does not need them and leaving
# them out reduces the exposed surface.
allowed_tool_list=(
  Read
  Edit
  Write
  Bash
  Grep
  Glob
  mcp__scrum4me__health
  mcp__scrum4me__list_products
  mcp__scrum4me__get_claude_context
  mcp__scrum4me__wait_for_job
  mcp__scrum4me__check_queue_empty
  mcp__scrum4me__update_job_status
  mcp__scrum4me__update_task_status
  mcp__scrum4me__update_task_plan
  mcp__scrum4me__log_implementation
  mcp__scrum4me__log_test_result
  mcp__scrum4me__log_commit
  mcp__scrum4me__create_pbi
  mcp__scrum4me__create_story
  mcp__scrum4me__create_task
  mcp__scrum4me__create_todo
  mcp__scrum4me__ask_user_question
  mcp__scrum4me__get_question_answer
  mcp__scrum4me__list_open_questions
  mcp__scrum4me__cancel_question
)
# Join the entries into one comma-separated string: printf repeats the
# format for every element, then the trailing comma is trimmed.
printf -v ALLOWED_TOOLS '%s,' "${allowed_tool_list[@]}"
ALLOWED_TOOLS=${ALLOWED_TOOLS%,}

# Loop bookkeeping: current retry delay and the failure streak so far.
BACKOFF="${AGENT_BACKOFF_START}"
CONSEC_FAILURES=0
# Main daemon loop. Every pass runs one `claude -p` batch, records the
# outcome through write_state and then either idles briefly (exit 0), backs
# off exponentially (exit != 0) or parks the process for good (auth failure
# found in the run log, or AGENT_MAX_FAILURES consecutive failures).
while true; do
  iteration_start=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  run_log="${AGENT_LOG_DIR}/runs/$(date -u +%Y%m%dT%H%M%SZ).log"
  write_state "$(jq -n \
    --arg started "$iteration_start" \
    --argjson failures "$CONSEC_FAILURES" \
    '{status:"running", currentBatchStartedAt:$started, consecutiveFailures:$failures}')"
  log "starting batch (log: ${run_log})"

  # claude -p with our MCP config and allowlist.
  # NOTE(review): the original comment expects cwd = /opt/agent so that the
  # CLAUDE.md there is auto-loaded, but this script never changes directory;
  # presumably the image or entrypoint sets the working directory — confirm.
  #
  # --permission-mode bypassPermissions switches off all remaining
  # permission prompts. The original author's rationale for this being safe
  # in the container: (1) the agent runs as a non-root user, (2) there are
  # no push credentials, (3) writes are limited to /tmp/job-*. The allowlist
  # stays in place as a belt-and-braces second filter.
  #
  # The exit code is captured with `||` instead of a `set +e` / `set -e`
  # pair: that pair left errexit switched ON for the rest of the daemon's
  # life, contradicting the "no -e" design of this script, so any failing
  # helper (date, touch, log, write_state) would have killed the loop.
  exit_code=0
  claude -p "${SEED_PROMPT}" \
    --mcp-config /opt/agent/mcp-config.json \
    --allowedTools "${ALLOWED_TOOLS}" \
    --permission-mode bypassPermissions \
    --output-format text \
    > "${run_log}" 2>&1 || exit_code=$?

  iteration_end=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  log "batch ended exit=${exit_code}"

  # Token-expiry detection: scan the combined stdout/stderr of the batch for
  # known auth-failure strings. This runs regardless of the exit code.
  # NOTE(review): the match is case-sensitive, so e.g. "401 Unauthorized"
  # (capital U) does not hit the '401.*unauthor' alternative — confirm
  # whether that is intended before widening the pattern.
  if grep -qE '(invalid_api_key|authentication.*failed|401.*unauthor|OAuth.*expired)' "${run_log}"; then
    log "AUTH FAILURE detected in run log — marking TOKEN_EXPIRED"
    touch "${AGENT_STATE_DIR}/TOKEN_EXPIRED"
    write_state "$(jq -n \
      --arg endedAt "$iteration_end" \
      --argjson exit "$exit_code" \
      '{status:"token-expired", lastBatchAt:$endedAt, lastBatchExit:$exit}')"
    # Park here — no restart; the user has to rebuild with fresh credentials.
    sleep infinity
  fi

  if [[ "$exit_code" -eq 0 ]]; then
    # Clean batch: reset the failure streak and the backoff delay.
    CONSEC_FAILURES=0
    BACKOFF=${AGENT_BACKOFF_START}
    write_state "$(jq -n \
      --arg endedAt "$iteration_end" \
      '{status:"idle", lastBatchAt:$endedAt, lastBatchExit:0, consecutiveFailures:0}')"
    log "queue empty — sleep 2s"
    sleep 2
  else
    CONSEC_FAILURES=$((CONSEC_FAILURES + 1))
    write_state "$(jq -n \
      --arg endedAt "$iteration_end" \
      --argjson exit "$exit_code" \
      --argjson failures "$CONSEC_FAILURES" \
      '{status:"backoff", lastBatchAt:$endedAt, lastBatchExit:$exit, consecutiveFailures:$failures}')"

    if [[ "$CONSEC_FAILURES" -ge "$AGENT_MAX_FAILURES" ]]; then
      log "too many consecutive failures (${CONSEC_FAILURES}) — marking UNHEALTHY"
      touch "${AGENT_STATE_DIR}/UNHEALTHY"
      # Record the terminal state as well, mirroring the pre-flight failure
      # branch; otherwise the state file keeps claiming "backoff" while the
      # process is parked forever.
      write_state "$(jq -n \
        --arg endedAt "$iteration_end" \
        --argjson exit "$exit_code" \
        --argjson failures "$CONSEC_FAILURES" \
        '{status:"unhealthy", reason:"too-many-failures", lastBatchAt:$endedAt, lastBatchExit:$exit, consecutiveFailures:$failures}')"
      sleep infinity
    fi

    log "backing off ${BACKOFF}s before retry"
    sleep "$BACKOFF"
    # Grow the delay for the next failure, capped at AGENT_BACKOFF_MAX.
    BACKOFF=$(( BACKOFF * AGENT_BACKOFF_FACTOR ))
    if [[ "$BACKOFF" -gt "$AGENT_BACKOFF_MAX" ]]; then
      BACKOFF=$AGENT_BACKOFF_MAX
    fi
  fi

  # Best-effort log rotation after every batch; never fatal.
  /opt/agent/bin/rotate-logs.sh || true
done