Drie fixes om de container lokaal (en op de NAS) te kunnen builden en draaien:
- Dockerfile: clone scrum4me-mcp zonder --recurse-submodules. Het Prisma-schema zit al gecommit in het scrum4me-mcp repo; de vendor/scrum4me submodule is alleen nodig voor schema-updates en wijst naar een privaat repo dat tijdens docker build niet bereikbaar is.
- Dockerfile: voeg /usr/sbin en /sbin toe aan PATH zodat gosu (in /usr/sbin/gosu na apt-install) gevonden wordt door entrypoint.sh. Zonder dit faalt de container in een restart loop.
- Verplaats alle runner scripts naar bin/ en maak etc/ aan, zodat COPY bin/ en COPY etc/ in de Dockerfile bestanden vinden.
Verder:
- .gitattributes om CRLF-corruptie van shell scripts op Windows te voorkomen (core.autocrlf=true is default actief).
- .gitignore: docker-compose.override.yml uitsluiten zodat lokale dev-overrides niet worden gecommit.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
128 lines
5.5 KiB
Bash
128 lines
5.5 KiB
Bash
#!/usr/bin/env bash
# run-agent.sh — daemon loop
#
# Strategy:
#   - Run a one-time pre-flight token check; if it fails the batch loop is
#     never started.
#   - Loop: run `claude -p` with the seed prompt.
#       - Exit 0   → the queue was empty; sleep briefly and repeat.
#       - Exit ≠ 0 → log, write state, back off exponentially, repeat.
#   - After AGENT_MAX_FAILURES consecutive failures → write the UNHEALTHY
#     marker; per the original design the health endpoint then answers 503
#     while the container keeps running for diagnosis.
#   - On a detected token expiry → write the TOKEN_EXPIRED marker and park
#     (sleep forever) rather than exit, so the service is not restarted
#     over and over with the same expired token.
#
# Environment:
#   AGENT_LOG_DIR, AGENT_STATE_DIR  — must be set; they are not defined in
#     this file (presumably by the container environment or _lib.sh —
#     TODO confirm).
#   AGENT_MAX_FAILURES, AGENT_BACKOFF_START, AGENT_BACKOFF_FACTOR,
#   AGENT_BACKOFF_MAX               — optional, defaults below.

# NOTE: errexit (-e) is deliberately NOT enabled — the loop inspects exit
# codes itself and must survive failing commands.
set -uo pipefail

# Shared helpers: `log` and `write_state` are used below but not defined in
# this file (presumably they come from _lib.sh). Because errexit is off, a
# missing library would otherwise go unnoticed and the daemon would run
# without logging or state reporting, so fail fast instead.
if [[ ! -r /opt/agent/bin/_lib.sh ]]; then
  printf 'run-agent: cannot read /opt/agent/bin/_lib.sh\n' >&2
  exit 1
fi
# shellcheck source=/dev/null
source /opt/agent/bin/_lib.sh

# Tunables; a value already present in the environment wins.
: "${AGENT_MAX_FAILURES:=5}"    # consecutive failures before UNHEALTHY
: "${AGENT_BACKOFF_START:=5}"   # initial retry delay (passed to sleep)
: "${AGENT_BACKOFF_FACTOR:=2}"  # delay multiplier after each failure
: "${AGENT_BACKOFF_MAX:=300}"   # upper bound for the retry delay

mkdir -p "${AGENT_LOG_DIR}/runs"

# ----- pre-flight -------------------------------------------------------
log "pre-flight token check"
if ! /opt/agent/bin/check-tokens.sh; then
  log "pre-flight failed — see check-tokens output above"
  write_state '{"status":"unhealthy","reason":"preflight-failed"}'
  touch "${AGENT_STATE_DIR}/UNHEALTHY"
  # Park here so the health endpoint stays reachable for debugging and the
  # compose service is not restarted needlessly.
  sleep infinity
fi

# Pre-flight passed: clear markers left behind by a previous run.
rm -f "${AGENT_STATE_DIR}/UNHEALTHY" "${AGENT_STATE_DIR}/TOKEN_EXPIRED"

# Rotate logs once at startup; the main loop rotates again after every
# iteration. Best-effort: a rotation failure must not stop the agent.
/opt/agent/bin/rotate-logs.sh || true

# ----- seed prompt ------------------------------------------------------
# Prompt handed to every `claude -p` run. This is runtime text and is kept
# byte-for-byte (in Dutch) on purpose.
SEED_PROMPT='Pak de volgende job uit de Scrum4Me-queue en draai de queue leeg volgens de loop in /opt/agent/CLAUDE.md. Niet stoppen tussen jobs door. Sluit pas af zodra wait_for_job na de volledige block-time terugkomt zonder claim.'

# Tool allowlist: the standard file/bash tools plus the scrum4me MCP tools.
# WebFetch and WebSearch are left out on purpose — the agent does not need
# them and excluding them shrinks the attack surface.
# Kept as one tool per line so additions/removals show up as clean diffs;
# joined into the comma-separated string that --allowedTools expects.
allowed_tools_list=(
  Read
  Edit
  Write
  Bash
  Grep
  Glob
  mcp__scrum4me__health
  mcp__scrum4me__list_products
  mcp__scrum4me__get_claude_context
  mcp__scrum4me__wait_for_job
  mcp__scrum4me__update_job_status
  mcp__scrum4me__update_task_status
  mcp__scrum4me__update_task_plan
  mcp__scrum4me__log_implementation
  mcp__scrum4me__log_test_result
  mcp__scrum4me__log_commit
  mcp__scrum4me__create_pbi
  mcp__scrum4me__create_story
  mcp__scrum4me__create_task
  mcp__scrum4me__create_todo
  mcp__scrum4me__ask_user_question
  mcp__scrum4me__get_question_answer
  mcp__scrum4me__list_open_questions
  mcp__scrum4me__cancel_question
)
# printf repeats the format per element ("a,b,c,"); strip the final comma.
printf -v ALLOWED_TOOLS '%s,' "${allowed_tools_list[@]}"
ALLOWED_TOOLS=${ALLOWED_TOOLS%,}
unset allowed_tools_list
readonly SEED_PROMPT ALLOWED_TOOLS

# Loop state: consecutive failed batches and the current retry delay.
CONSEC_FAILURES=0
BACKOFF=${AGENT_BACKOFF_START}

# ----- main loop --------------------------------------------------------
# One iteration = one `claude -p` batch. The loop never terminates on its
# own; on unrecoverable conditions it parks with `sleep infinity`.
while true; do
  iteration_start=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  run_log="${AGENT_LOG_DIR}/runs/$(date -u +%Y%m%dT%H%M%SZ).log"

  write_state "$(jq -n \
    --arg started "$iteration_start" \
    --argjson failures "$CONSEC_FAILURES" \
    '{status:"running", currentBatchStartedAt:$started, consecutiveFailures:$failures}')"

  log "starting batch (log: ${run_log})"

  # claude -p with our MCP config and tool allowlist.
  # NOTE(review): the original author expects cwd to be /opt/agent so that
  # its CLAUDE.md is auto-loaded; this script does not cd there itself —
  # confirm the caller sets the working directory.
  #
  # --permission-mode bypassPermissions switches off the remaining
  # permission prompts. Author's rationale for why that is acceptable in
  # this container: (1) it runs as a non-root agent user, (2) there are no
  # push credentials, (3) writes are limited to /tmp/job-*. The allowlist
  # stays in place as a belt-and-braces second filter.
  #
  # The exit status is captured with `|| exit_code=$?` instead of toggling
  # `set +e` / `set -e`: the script runs without errexit on purpose, and a
  # trailing `set -e` would enable it for every later statement, letting
  # any failing helper kill the daemon.
  exit_code=0
  claude -p "${SEED_PROMPT}" \
    --mcp-config /opt/agent/mcp-config.json \
    --allowedTools "${ALLOWED_TOOLS}" \
    --permission-mode bypassPermissions \
    --output-format text \
    > "${run_log}" 2>&1 || exit_code=$?

  iteration_end=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  log "batch ended exit=${exit_code}"

  # Token-expiry detection: scan the combined stdout/stderr of the run for
  # known authentication-failure strings.
  # NOTE(review): this also runs for successful batches, so agent output
  # that merely mentions such a string would park the daemon — confirm
  # whether the check should be limited to non-zero exits.
  if grep -qE '(invalid_api_key|authentication.*failed|401.*unauthor|OAuth.*expired)' "${run_log}"; then
    log "AUTH FAILURE detected in run log — marking TOKEN_EXPIRED"
    touch "${AGENT_STATE_DIR}/TOKEN_EXPIRED"
    write_state "$(jq -n \
      --arg endedAt "$iteration_end" \
      --argjson exit "$exit_code" \
      '{status:"token-expired", lastBatchAt:$endedAt, lastBatchExit:$exit}')"
    # Park — no restart; the user has to rebuild with fresh credentials.
    sleep infinity
  fi

  if (( exit_code == 0 )); then
    # Clean exit: the queue was drained. Reset failure tracking.
    CONSEC_FAILURES=0
    BACKOFF=${AGENT_BACKOFF_START}
    write_state "$(jq -n \
      --arg endedAt "$iteration_end" \
      '{status:"idle", lastBatchAt:$endedAt, lastBatchExit:0, consecutiveFailures:0}')"
    log "queue empty — sleep 2s"
    sleep 2
  else
    CONSEC_FAILURES=$(( CONSEC_FAILURES + 1 ))
    write_state "$(jq -n \
      --arg endedAt "$iteration_end" \
      --argjson exit "$exit_code" \
      --argjson failures "$CONSEC_FAILURES" \
      '{status:"backoff", lastBatchAt:$endedAt, lastBatchExit:$exit, consecutiveFailures:$failures}')"

    if (( CONSEC_FAILURES >= AGENT_MAX_FAILURES )); then
      log "too many consecutive failures (${CONSEC_FAILURES}) — marking UNHEALTHY"
      touch "${AGENT_STATE_DIR}/UNHEALTHY"
      # NOTE(review): the state written above still says "backoff" here;
      # only the marker file signals the unhealthy condition.
      sleep infinity
    fi

    log "backing off ${BACKOFF}s before retry"
    sleep "$BACKOFF"

    # Grow the delay for the next failure, capped at AGENT_BACKOFF_MAX.
    BACKOFF=$(( BACKOFF * AGENT_BACKOFF_FACTOR ))
    if (( BACKOFF > AGENT_BACKOFF_MAX )); then
      BACKOFF=$AGENT_BACKOFF_MAX
    fi
  fi

  # Best-effort log rotation after every iteration.
  /opt/agent/bin/rotate-logs.sh || true
done