scrum4me-docker/bin/entrypoint.sh
Madhura68 847ba96870 fix(entrypoint): ensure_writable bind-mounts + add log-cleanup (>2d)
- entrypoint.sh: chown → chmod a+rwX → fail-fast met diagnostiek voor
  AGENT_STATE_DIR en AGENT_LOG_DIR. Lost stille state.json permission
  denied op QNAP-share op (NAS-ACL blokkeert chown vanuit container).
- bin/log-cleanup.sh: nieuwe hard-delete >2d (env-tunable) naast de
  conservatievere rotate-logs.sh (gzip 24u, delete 30d).
- run-agent.sh: roept log-cleanup.sh aan bij startup en elke iteratie.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-05 16:18:10 +02:00

123 lines
4.4 KiB
Bash

#!/usr/bin/env bash
# entrypoint.sh — container startup
#
# Responsibilities:
# 1. Guarantee writable directories on the bind-mounts (UID/GID matching)
# 2. Start the health server as a background process
# 3. gosu to the agent user and start the daemon loop
#
# Runs as root until step 3 — until then root is needed to set up the
# bind-mounts correctly when the share was created with different
# ownership.
set -euo pipefail
# Write one diagnostic line to stderr, prefixed with "[entrypoint] ".
# All arguments are joined into a single message via "$*".
log() {
  local message="$*"
  printf '[entrypoint] %s\n' "$message" >&2
}
# Defaults for the settings this script reads. A value that is already set
# (and non-empty) in the environment wins; otherwise the default is assigned.
: "${AGENT_UID:=1000}"
: "${AGENT_GID:=1000}"
: "${AGENT_STATE_DIR:=/var/run/agent}"
: "${AGENT_LOG_DIR:=/var/log/agent}"
: "${AGENT_REPO_CACHE:=/var/cache/repos}"
: "${AGENT_HEALTH_PORT:=8080}"
# `:=` only assigns a shell variable; a default that did not come from the
# environment would stay invisible to child processes. Export the effective
# values so everything started from this script sees the same configuration.
export AGENT_UID AGENT_GID AGENT_STATE_DIR AGENT_LOG_DIR \
  AGENT_REPO_CACHE AGENT_HEALTH_PORT
# ----- 0. preflight: /var/cache mount-type + writable --------------------
# Determine the filesystem type of /var/cache. `stat -f -c %T` prints the
# type name directly; if that form fails, fall back to parsing the "Type:"
# field of plain `stat -f` output. The trailing `|| true` keeps `set -e` +
# `pipefail` from aborting the script without any message when both forms
# fail — the write test below then reports the problem explicitly.
_cache_fs=$(
  stat -f -c %T /var/cache 2>/dev/null \
    || stat -f /var/cache 2>/dev/null | awk '/Type:/{print $NF}'
) || true
_cache_fs=${_cache_fs:-unknown}
if [ "$_cache_fs" = "tmpfs" ]; then
  log "FATAL: /var/cache is tmpfs (likely missing bind-mount). Fix docker-compose.yml en doe \`compose up -d --force-recreate\`."
  exit 1
fi
# Prove writability for the current user by creating and removing a probe file.
if ! touch /var/cache/.write-test 2>/dev/null; then
  log "FATAL: /var/cache niet writable als user $(id -u)."
  exit 1
fi
rm -f /var/cache/.write-test
log "/var/cache OK (fs=${_cache_fs})"
# Lighter, warning-only check for the log/state mounts. Uses the configured
# directories (not the hard-coded defaults) so an overridden AGENT_LOG_DIR /
# AGENT_STATE_DIR is the path that actually gets inspected.
_logdir_fs=$(stat -f -c %T "${AGENT_LOG_DIR}" 2>/dev/null || echo unknown)
if [ "$_logdir_fs" = "tmpfs" ]; then
  log "WARN: ${AGENT_LOG_DIR} is tmpfs — overleeft geen container-herstart."
fi
_statedir_fs=$(stat -f -c %T "${AGENT_STATE_DIR}" 2>/dev/null || echo unknown)
if [ "$_statedir_fs" = "tmpfs" ]; then
  log "WARN: ${AGENT_STATE_DIR} is tmpfs — overleeft geen container-herstart."
fi
# ----- 1. directories on bind-mounts ------------------------------------
log "ensuring directories on bind-mounts"
# Helper: guarantee that directory $1 is writable for the agent user.
# Escalation: chown → chmod a+rwX → fail fast with diagnostics.
# Needed because bind-mounts from the NAS share override the chown done in
# the Dockerfile, and QNAP ACLs often block a second chown.
#
# Arguments: $1 - directory to create (if missing) and make writable
# Returns:   0 once the agent user can write to the directory
# Exits:     1 (whole script) when neither chown nor chmod made it writable
ensure_writable() {
  local target="$1"
  mkdir -p "$target"

  # Attempt 1: hand ownership to the agent UID/GID (best effort).
  chown "${AGENT_UID}:${AGENT_GID}" "$target" 2>/dev/null || true
  if ! gosu agent test -w "$target"; then
    # Attempt 2: open up the permission bits instead (best effort).
    chmod a+rwX "$target" 2>/dev/null || true
    if ! gosu agent test -w "$target"; then
      # Nothing worked: print what we know plus host-side remedies, then abort.
      log "FATAL: $target niet writable als UID=${AGENT_UID}"
      log " huidige stat: $(stat -c '%U:%G %a' "$target" 2>/dev/null || echo '<onbekend>')"
      log " fix op de NAS-host:"
      log " chown -R ${AGENT_UID}:${AGENT_GID} <host-pad voor $target>"
      log " of: chmod -R 0775 <host-pad voor $target>"
      log " (zie docker-compose.yml volumes-mapping voor het host-pad)"
      exit 1
    fi
    log "WARN: $target was niet writable — chmod a+rwX toegepast"
    return 0
  fi

  log "$target OK (writable als UID=${AGENT_UID})"
  return 0
}
# The state and log directories are critical: abort when they cannot be
# made writable.
ensure_writable "${AGENT_STATE_DIR}"
ensure_writable "${AGENT_LOG_DIR}"
# Sub-directories and cache paths — not critical enough for fail-fast, so
# the chown is best effort. Writability of /var/cache was already checked
# during preflight.
_aux_dirs=(
  "${AGENT_LOG_DIR}/runs"
  "${AGENT_LOG_DIR}/jobs"
  "${AGENT_REPO_CACHE}"
  /var/cache/npm
  /var/cache/pnpm
)
mkdir -p "${_aux_dirs[@]}"
chown "${AGENT_UID}:${AGENT_GID}" "${_aux_dirs[@]}" 2>/dev/null || true
# ----- 2. health-server in the background --------------------------------
# The redirection is opened by this (root) shell; the server itself runs as
# the agent user. `>` truncates the log file on every container start.
_health_log="${AGENT_LOG_DIR}/health-server.log"
log "starting health-server on :${AGENT_HEALTH_PORT}"
gosu agent node /opt/agent/bin/health-server.js >"${_health_log}" 2>&1 &
HEALTH_PID=$!
log "health-server pid=${HEALTH_PID}"
# Initial state: starting
# Seed state.json as the agent user. The target directory is handed to the
# child shell as a positional argument ($1) instead of being expanded there
# from the environment: a default assigned with `:=` is not exported, so
# inside the child `${AGENT_STATE_DIR}` could be empty and the file would be
# written to /state.json. The here-document (timestamp included) is expanded
# by this shell and fed to the child's stdin.
gosu agent /bin/bash -c 'cat > "$1/state.json"' _ "${AGENT_STATE_DIR}" <<EOF
{
"status": "starting",
"startedAt": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
"lastBatchAt": null,
"lastBatchExit": null,
"consecutiveFailures": 0
}
EOF
# ----- 3. drop privileges, bootstrap repos, start daemon-loop -----------
# repo-bootstrap.sh runs as agent (NOT root) so that the cloned repos
# are owned by the agent user and live under ~agent/Projects/<name> —
# that is exactly where scrum4me-mcp's resolveRepoRoot looks via its
# convention fallback.
log "dropping to agent user and bootstrapping repos"
# A failed bootstrap is only reported, never fatal; its output is appended
# to repo-bootstrap.log.
if ! gosu agent /opt/agent/bin/repo-bootstrap.sh \
  >>"${AGENT_LOG_DIR}/repo-bootstrap.log" 2>&1; then
  log "WARN: repo-bootstrap returned non-zero (continuing)"
fi
log "starting run-agent.sh"
# exec replaces this shell, so run-agent.sh takes over the entrypoint process.
exec gosu agent /opt/agent/bin/run-agent.sh