fix: lokale Docker build werkend krijgen
Drie fixes om de container lokaal (en op de NAS) te kunnen builden en draaien: - Dockerfile: clone scrum4me-mcp zonder --recurse-submodules. Het Prisma-schema zit al gecommit in het scrum4me-mcp repo; de vendor/scrum4me submodule is alleen nodig voor schema-updates en wijst naar een privaat repo dat tijdens docker build niet bereikbaar is. - Dockerfile: voeg /usr/sbin en /sbin toe aan PATH zodat gosu (in /usr/sbin/gosu na apt-install) gevonden wordt door entrypoint.sh. Zonder dit faalt de container in een restart loop. - Verplaats alle runner scripts naar bin/ en maak etc/ aan, zodat COPY bin/ en COPY etc/ in de Dockerfile bestanden vinden. Verder: - .gitattributes om CRLF-corruptie van shell scripts op Windows te voorkomen (core.autocrlf=true is default actief). - .gitignore: docker-compose.override.yml uitsluiten zodat lokale dev-overrides niet worden gecommit. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
9d8a7fe237
commit
47b1de93db
12 changed files with 17 additions and 5 deletions
128
bin/run-agent.sh
Normal file
128
bin/run-agent.sh
Normal file
|
|
@ -0,0 +1,128 @@
|
|||
#!/usr/bin/env bash
# run-agent.sh — daemon loop
#
# Strategy:
#   - Run a one-time pre-flight token check first; a failure blocks the start.
#   - Loop: run `claude -p` with the seed prompt.
#   - Exit 0   → the queue was empty: sleep briefly, then repeat.
#   - Exit ≠ 0 → log, write state, back off exponentially, then repeat.
#   - After N consecutive failures → write the UNHEALTHY marker and idle; the
#     health endpoint is expected to answer 503 while the container keeps
#     running for diagnosis.
#   - On detected token expiry → write the TOKEN_EXPIRED marker and idle
#     forever (sleep infinity) rather than exiting, so the marker stays
#     visible to the health server.

# Deliberately no -e: exit codes are inspected by hand further down.
set -u
set -o pipefail

# Shared helpers.
# NOTE(review): log and write_state are presumably defined in this file — confirm.
. /opt/agent/bin/_lib.sh

# Tunables; a value already present in the environment wins over the default.
AGENT_MAX_FAILURES="${AGENT_MAX_FAILURES:-5}"
AGENT_BACKOFF_START="${AGENT_BACKOFF_START:-5}"
AGENT_BACKOFF_FACTOR="${AGENT_BACKOFF_FACTOR:-2}"
AGENT_BACKOFF_MAX="${AGENT_BACKOFF_MAX:-300}"

# Every batch writes its own log file below this directory.
mkdir -p "${AGENT_LOG_DIR}/runs"
|
||||
|
||||
# ----- pre-flight -------------------------------------------------------
# One-time token check before the loop starts. On failure the state is set to
# unhealthy, the UNHEALTHY marker is created and the script parks itself.
log "pre-flight token check"
/opt/agent/bin/check-tokens.sh || {
  log "pre-flight failed — see check-tokens output above"
  write_state '{"status":"unhealthy","reason":"preflight-failed"}'
  touch "${AGENT_STATE_DIR}/UNHEALTHY"
  # Park here so the health endpoint stays available for debugging, without
  # needlessly restarting the whole compose service.
  sleep infinity
}

# Pre-flight passed: remove any stale markers left by an earlier run.
rm -f "${AGENT_STATE_DIR}/UNHEALTHY" "${AGENT_STATE_DIR}/TOKEN_EXPIRED"
|
||||
|
||||
# Rotate logs once up front; the main loop rotates again after each iteration.
# Rotation is best-effort: a failure must not stop the agent.
/opt/agent/bin/rotate-logs.sh || true

# ----- seed prompt ------------------------------------------------------
# Prompt handed to `claude -p` at the start of every batch.
SEED_PROMPT='Pak de volgende job uit de Scrum4Me-queue en draai de queue leeg volgens de loop in /opt/agent/CLAUDE.md. Niet stoppen tussen jobs door. Sluit pas af zodra wait_for_job na de volledige block-time terugkomt zonder claim.'

# Tool allowlist: the standard file/bash tools plus every MCP tool offered by
# scrum4me-mcp. No WebFetch and no WebSearch — the agent does not need them
# and leaving them out shrinks the attack surface.
allowed=(Read Edit Write Bash Grep Glob)
for mcp_tool in \
  health \
  list_products \
  get_claude_context \
  wait_for_job \
  update_job_status \
  update_task_status \
  update_task_plan \
  log_implementation \
  log_test_result \
  log_commit \
  create_pbi \
  create_story \
  create_task \
  create_todo \
  ask_user_question \
  get_question_answer \
  list_open_questions \
  cancel_question; do
  allowed+=("mcp__scrum4me__${mcp_tool}")
done
# Join with commas into the single string that --allowedTools receives.
ALLOWED_TOOLS=$(IFS=,; printf '%s' "${allowed[*]}")
unset allowed mcp_tool

# Loop bookkeeping: consecutive failure count and the current backoff delay
# in seconds.
CONSEC_FAILURES=0
BACKOFF="${AGENT_BACKOFF_START}"
|
||||
|
||||
# ----- main loop --------------------------------------------------------
# Each iteration runs one `claude -p` batch into its own log file, then:
#   - auth failure found in the log → TOKEN_EXPIRED marker, idle forever
#   - exit 0                        → reset counters, short sleep, repeat
#   - exit ≠ 0                      → count the failure, back off, repeat;
#                                     at AGENT_MAX_FAILURES → UNHEALTHY, idle
# Logs are rotated (best-effort) at the end of every iteration.
while true; do
  iteration_start=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  run_log="${AGENT_LOG_DIR}/runs/$(date -u +%Y%m%dT%H%M%SZ).log"

  write_state "$(jq -n \
    --arg started "$iteration_start" \
    --argjson failures "$CONSEC_FAILURES" \
    '{status:"running", currentBatchStartedAt:$started, consecutiveFailures:$failures}')"

  log "starting batch (log: ${run_log})"

  # claude -p with our MCP config and allowlist.
  # NOTE(review): assumes the caller starts this script with cwd=/opt/agent so
  # that CLAUDE.md there is auto-loaded — this script does not cd itself.
  #
  # --permission-mode bypassPermissions switches off all remaining permission
  # prompts. Per the container design this is considered safe because (1) we
  # run as the non-root agent user, (2) there are no push credentials and
  # (3) writes are limited to /tmp/job-*. The allowlist stays in place as a
  # belt-and-braces second filter.
  #
  # The exit status is captured with `|| exit_code=$?` instead of toggling
  # `set +e` / `set -e`: this script runs without errexit on purpose, and
  # switching it on here would let any later failing helper kill the daemon.
  exit_code=0
  claude -p "${SEED_PROMPT}" \
    --mcp-config /opt/agent/mcp-config.json \
    --allowedTools "${ALLOWED_TOOLS}" \
    --permission-mode bypassPermissions \
    --output-format text \
    > "${run_log}" 2>&1 || exit_code=$?

  iteration_end=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  log "batch ended exit=${exit_code}"

  # Token-expiry detection: scan the combined stdout/stderr for known strings.
  # Case-insensitive so that e.g. "401 Unauthorized" is matched as well.
  if grep -qiE '(invalid_api_key|authentication.*failed|401.*unauthor|OAuth.*expired)' "${run_log}"; then
    log "AUTH FAILURE detected in run log — marking TOKEN_EXPIRED"
    touch "${AGENT_STATE_DIR}/TOKEN_EXPIRED"
    write_state "$(jq -n \
      --arg endedAt "$iteration_end" \
      --argjson exit "$exit_code" \
      '{status:"token-expired", lastBatchAt:$endedAt, lastBatchExit:$exit}')"
    # Park here — no restart; the user has to rebuild.
    sleep infinity
  fi

  if [[ "$exit_code" -eq 0 ]]; then
    # Clean exit: reset the failure streak and the backoff delay.
    CONSEC_FAILURES=0
    BACKOFF=${AGENT_BACKOFF_START}
    write_state "$(jq -n \
      --arg endedAt "$iteration_end" \
      '{status:"idle", lastBatchAt:$endedAt, lastBatchExit:0, consecutiveFailures:0}')"
    log "queue empty — sleep 2s"
    sleep 2
  else
    CONSEC_FAILURES=$((CONSEC_FAILURES + 1))
    write_state "$(jq -n \
      --arg endedAt "$iteration_end" \
      --argjson exit "$exit_code" \
      --argjson failures "$CONSEC_FAILURES" \
      '{status:"backoff", lastBatchAt:$endedAt, lastBatchExit:$exit, consecutiveFailures:$failures}')"

    if [[ "$CONSEC_FAILURES" -ge "$AGENT_MAX_FAILURES" ]]; then
      log "too many consecutive failures (${CONSEC_FAILURES}) — marking UNHEALTHY"
      touch "${AGENT_STATE_DIR}/UNHEALTHY"
      # Keep the state file in line with the marker (same "unhealthy" status
      # the pre-flight failure path writes) instead of leaving it at "backoff".
      write_state "$(jq -n \
        --arg endedAt "$iteration_end" \
        --argjson exit "$exit_code" \
        --argjson failures "$CONSEC_FAILURES" \
        '{status:"unhealthy", reason:"too-many-failures", lastBatchAt:$endedAt, lastBatchExit:$exit, consecutiveFailures:$failures}')"
      # Park here so the container stays up for diagnosis.
      sleep infinity
    fi

    log "backing off ${BACKOFF}s before retry"
    sleep "$BACKOFF"
    # Exponential backoff, capped at AGENT_BACKOFF_MAX seconds.
    BACKOFF=$(( BACKOFF * AGENT_BACKOFF_FACTOR ))
    if [[ "$BACKOFF" -gt "$AGENT_BACKOFF_MAX" ]]; then
      BACKOFF=$AGENT_BACKOFF_MAX
    fi
  fi

  # Best-effort rotation; a failure here must not stop the loop.
  /opt/agent/bin/rotate-logs.sh || true
done
|
||||
Loading…
Add table
Add a link
Reference in a new issue