Merge pull request #3 from madhura68/feat/story-mmuwreer
feat: /var/cache → /srv/agent-cache + entrypoint preflight + health cache-check
This commit is contained in:
commit
c5bf226b3d
5 changed files with 73 additions and 4 deletions
15
CLAUDE.md
15
CLAUDE.md
|
|
@ -38,11 +38,20 @@ of equivalent:
|
||||||
- `status: "done"` als verificaties slaagden, plus `branch` en
|
- `status: "done"` als verificaties slaagden, plus `branch` en
|
||||||
`summary`.
|
`summary`.
|
||||||
- `status: "failed"` met `error` als iets onomkeerbaar misging.
|
- `status: "failed"` met `error` als iets onomkeerbaar misging.
|
||||||
8. Roep `bash /opt/agent/bin/job-cleanup.sh <job_id>` aan om de
|
8. Roep `mcp__scrum4me__check_queue_empty` aan (geen args). Dit is een
|
||||||
|
synchrone non-blocking poll die in één keer teruggeeft of er nog
|
||||||
|
werk in de queue staat:
|
||||||
|
- `empty: false` → voer eerst stap 9 (cleanup) uit en ga dan naar stap 3 (`wait_for_job` opnieuw).
|
||||||
|
- `empty: true` → batch is klaar; voer eerst stap 9 (cleanup) uit, geef dan recap en exit. Geen extra
|
||||||
|
`wait_for_job`-call die 600 s blokt.
|
||||||
|
9. Roep `bash /opt/agent/bin/job-cleanup.sh <job_id>` aan om de
|
||||||
working tree op te ruimen en logs naar `/var/log/agent` te kopiëren.
|
working tree op te ruimen en logs naar `/var/log/agent` te kopiëren.
|
||||||
3. Roep **direct opnieuw** `wait_for_job` aan. Stop niet, vraag niets.
|
3. Op basis van stap 8: bij `empty: false` opnieuw `wait_for_job`; bij
|
||||||
|
`empty: true` direct naar stap 4. Stop niet midden in de loop, vraag
|
||||||
|
niets.
|
||||||
4. Pas wanneer `wait_for_job` na de volledige block-time terugkomt zonder
|
4. Pas wanneer `wait_for_job` na de volledige block-time terugkomt zonder
|
||||||
claim, sluit de turn af met een korte recap (aantal jobs, success/fail).
|
claim, óf `check_queue_empty` empty=true retourneerde, sluit de turn
|
||||||
|
af met een korte recap (aantal jobs, success/fail).
|
||||||
|
|
||||||
## Foutscenario's
|
## Foutscenario's
|
||||||
|
|
||||||
|
|
|
||||||
20
README.md
20
README.md
|
|
@ -114,6 +114,26 @@ docker compose up -d
|
||||||
Dezelfde flow voor schema-drift in scrum4me-mcp: pin een nieuwe
|
Dezelfde flow voor schema-drift in scrum4me-mcp: pin een nieuwe
|
||||||
`MCP_GIT_REF` in `.env` of in `docker-compose.yml`, rebuild.
|
`MCP_GIT_REF` in `.env` of in `docker-compose.yml`, rebuild.
|
||||||
|
|
||||||
|
### Wijzigingen in `docker-compose.yml` (volumes, tmpfs, env_file, ports)
|
||||||
|
|
||||||
|
> **Let op:** `docker compose restart` herstart alleen het proces in de
|
||||||
|
> bestaande container met de **oude** config. Wijzigingen in volumes,
|
||||||
|
> tmpfs-mounts, env_file of ports worden daarmee **niet** doorgevoerd.
|
||||||
|
|
||||||
|
Gebruik altijd `--force-recreate` als je `docker-compose.yml` is veranderd:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose up -d --force-recreate agent
|
||||||
|
```
|
||||||
|
|
||||||
|
Verifieer daarna dat `/var/cache` op de NAS-overlay staat en **niet** op tmpfs:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker exec scrum4me-agent df -h /var/cache
|
||||||
|
# Verwacht: Filesystem op /dev/mapper/cachedev* of een NAS-share
|
||||||
|
# Fout: tmpfs 16M ... (dan is force-recreate niet uitgevoerd)
|
||||||
|
```
|
||||||
|
|
||||||
## Health-endpoint
|
## Health-endpoint
|
||||||
|
|
||||||
`GET http://<nas>:8080/health` retourneert:
|
`GET http://<nas>:8080/health` retourneert:
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,29 @@ log() { printf '[entrypoint] %s\n' "$*" >&2; }
|
||||||
: "${AGENT_REPO_CACHE:=/var/cache/repos}"
|
: "${AGENT_REPO_CACHE:=/var/cache/repos}"
|
||||||
: "${AGENT_HEALTH_PORT:=8080}"
|
: "${AGENT_HEALTH_PORT:=8080}"
|
||||||
|
|
||||||
|
# ----- 0. preflight: /var/cache mount-type + writable --------------------
# Fail fast when /var/cache is not usable: a tmpfs there is treated as a
# missing bind-mount, and a failed write test as a permissions problem.

# _fs_type DIR
#   Print the filesystem type name of DIR, or "unknown" when it cannot be
#   determined. Tries `stat -f -c %T` first and falls back to the "Type:"
#   field of plain `stat -f` output. Every probe is guarded with `||` so a
#   failing probe yields "unknown" instead of a non-zero status.
_fs_type() {
  _fs_type_out=$(stat -f -c %T "$1" 2>/dev/null) || _fs_type_out=""
  if [ -z "$_fs_type_out" ]; then
    _fs_type_out=$(stat -f "$1" 2>/dev/null | awk '/Type:/{print $NF}') \
      || _fs_type_out=""
  fi
  printf '%s\n' "${_fs_type_out:-unknown}"
}

_cache_fs=$(_fs_type /var/cache)
if [ "$_cache_fs" = "tmpfs" ]; then
  # The hint spells out the full command so it can be copy-pasted as-is.
  log "FATAL: /var/cache is tmpfs (likely missing bind-mount). Fix docker-compose.yml en doe \`docker compose up -d --force-recreate\`."
  exit 1
fi

# Write test: create and remove a marker file as the current user.
if ! touch /var/cache/.write-test 2>/dev/null; then
  log "FATAL: /var/cache niet writable als user $(id -u)."
  exit 1
fi
rm -f /var/cache/.write-test
log "/var/cache OK (fs=${_cache_fs})"

# Lighter, warning-only check for the log/state mounts: tmpfs there is
# reported but does not abort startup.
_logdir_fs=$(_fs_type /var/log/agent)
if [ "$_logdir_fs" = "tmpfs" ]; then
  log "WARN: /var/log/agent is tmpfs — overleeft geen container-herstart."
fi
_statedir_fs=$(_fs_type /var/run/agent)
if [ "$_statedir_fs" = "tmpfs" ]; then
  log "WARN: /var/run/agent is tmpfs — overleeft geen container-herstart."
fi
|
||||||
|
|
||||||
# ----- 1. dirs op bind-mounts -------------------------------------------
|
# ----- 1. dirs op bind-mounts -------------------------------------------
|
||||||
log "ensuring directories on bind-mounts"
|
log "ensuring directories on bind-mounts"
|
||||||
mkdir -p \
|
mkdir -p \
|
||||||
|
|
|
||||||
|
|
@ -14,6 +14,7 @@
|
||||||
const http = require('node:http');
|
const http = require('node:http');
|
||||||
const fs = require('node:fs/promises');
|
const fs = require('node:fs/promises');
|
||||||
const path = require('node:path');
|
const path = require('node:path');
|
||||||
|
const { execSync } = require('node:child_process');
|
||||||
|
|
||||||
const STATE_DIR = process.env.AGENT_STATE_DIR || '/var/run/agent';
|
const STATE_DIR = process.env.AGENT_STATE_DIR || '/var/run/agent';
|
||||||
const PORT = Number(process.env.AGENT_HEALTH_PORT || 8080);
|
const PORT = Number(process.env.AGENT_HEALTH_PORT || 8080);
|
||||||
|
|
@ -25,6 +26,13 @@ const STATE_FILE = path.join(STATE_DIR, 'state.json');
|
||||||
const UNHEALTHY_MARKER = path.join(STATE_DIR, 'UNHEALTHY');
|
const UNHEALTHY_MARKER = path.join(STATE_DIR, 'UNHEALTHY');
|
||||||
const TOKEN_EXPIRED_MARKER = path.join(STATE_DIR, 'TOKEN_EXPIRED');
|
const TOKEN_EXPIRED_MARKER = path.join(STATE_DIR, 'TOKEN_EXPIRED');
|
||||||
|
|
||||||
|
const CACHE_LOW_BYTES = 100 * 1024 * 1024; // 100 MB
|
||||||
|
|
||||||
|
/**
 * Extract the "Available" column (bytes) from `df -PB1` output.
 *
 * The data line is matched on its three consecutive numeric columns
 * (1-blocks, Used, Available) followed by the Capacity column, instead of
 * splitting on whitespace and taking a fixed index. A filesystem name that
 * contains spaces (e.g. a network share) would otherwise shift the columns.
 *
 * @param {string} dfOutput - Raw stdout of `df -PB1 <dir>`.
 * @returns {number} Available bytes.
 * @throws {Error} When the second output line does not look like a df row.
 */
function parseDfAvailableBytes(dfOutput) {
  const dataLine = String(dfOutput).split('\n')[1] || '';
  const match = /\s(\d+)\s+(\d+)\s+(\d+)\s+(?:\d+%|-)(?:\s|$)/.exec(dataLine);
  if (match === null) {
    throw new Error(`unexpected df output: ${JSON.stringify(dataLine)}`);
  }
  return Number.parseInt(match[3], 10);
}

/**
 * Return the number of free bytes on the filesystem that holds `dir`,
 * as reported by `df -PB1`.
 *
 * Runs synchronously (callers use the return value directly). A timeout
 * bounds how long the call may block; on timeout `execSync` throws.
 *
 * @param {string} [dir='/var/cache'] - Directory whose filesystem is checked.
 * @returns {number} Available bytes (always a finite integer).
 * @throws {Error} When `df` fails, times out, or prints unparseable output.
 */
function cacheBytesFree(dir = '/var/cache') {
  const dfTimeoutMs = 5000;
  // Single-quote the path for the shell; embedded quotes become '\''.
  const quotedDir = `'${String(dir).replace(/'/g, "'\\''")}'`;
  const output = execSync(`df -PB1 ${quotedDir}`, {
    encoding: 'utf8',
    timeout: dfTimeoutMs,
    // Capture stderr instead of inheriting it, so df diagnostics end up on
    // the thrown error rather than in the server's own stderr stream.
    stdio: ['ignore', 'pipe', 'pipe'],
  });
  return parseDfAvailableBytes(output);
}
|
||||||
|
|
||||||
async function exists(p) {
|
async function exists(p) {
|
||||||
try {
|
try {
|
||||||
await fs.access(p);
|
await fs.access(p);
|
||||||
|
|
@ -59,6 +67,14 @@ async function buildResponse() {
|
||||||
const heartbeatStale =
|
const heartbeatStale =
|
||||||
heartbeatAgeS !== null && heartbeatAgeS > STALE_HEARTBEAT_SECONDS;
|
heartbeatAgeS !== null && heartbeatAgeS > STALE_HEARTBEAT_SECONDS;
|
||||||
|
|
||||||
|
const cacheFreeBytes = cacheBytesFree();
|
||||||
|
if (cacheFreeBytes < CACHE_LOW_BYTES) {
|
||||||
|
return {
|
||||||
|
httpStatus: 503,
|
||||||
|
body: { status: 'unhealthy', reason: 'cache-low' },
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
let httpStatus = 200;
|
let httpStatus = 200;
|
||||||
let effectiveStatus = state.status || 'unknown';
|
let effectiveStatus = state.status || 'unknown';
|
||||||
|
|
||||||
|
|
@ -87,6 +103,7 @@ async function buildResponse() {
|
||||||
tokenExpired,
|
tokenExpired,
|
||||||
unhealthy,
|
unhealthy,
|
||||||
},
|
},
|
||||||
|
cache_free_bytes: cacheFreeBytes,
|
||||||
};
|
};
|
||||||
|
|
||||||
return { httpStatus, body };
|
return { httpStatus, body };
|
||||||
|
|
|
||||||
|
|
@ -45,7 +45,7 @@ SEED_PROMPT='Pak de volgende job uit de Scrum4Me-queue en draai de queue leeg vo
|
||||||
# Tools-allowlist: alle MCP-tools die scrum4me-mcp aanbiedt + standaard
|
# Tools-allowlist: alle MCP-tools die scrum4me-mcp aanbiedt + standaard
|
||||||
# file/bash-tools. Geen WebFetch, geen WebSearch — de agent heeft die
|
# file/bash-tools. Geen WebFetch, geen WebSearch — de agent heeft die
|
||||||
# niet nodig en uitsluiting verkleint het surface.
|
# niet nodig en uitsluiting verkleint het surface.
|
||||||
ALLOWED_TOOLS='Read,Edit,Write,Bash,Grep,Glob,mcp__scrum4me__health,mcp__scrum4me__list_products,mcp__scrum4me__get_claude_context,mcp__scrum4me__wait_for_job,mcp__scrum4me__update_job_status,mcp__scrum4me__update_task_status,mcp__scrum4me__update_task_plan,mcp__scrum4me__log_implementation,mcp__scrum4me__log_test_result,mcp__scrum4me__log_commit,mcp__scrum4me__create_pbi,mcp__scrum4me__create_story,mcp__scrum4me__create_task,mcp__scrum4me__create_todo,mcp__scrum4me__ask_user_question,mcp__scrum4me__get_question_answer,mcp__scrum4me__list_open_questions,mcp__scrum4me__cancel_question'
|
ALLOWED_TOOLS='Read,Edit,Write,Bash,Grep,Glob,mcp__scrum4me__health,mcp__scrum4me__list_products,mcp__scrum4me__get_claude_context,mcp__scrum4me__wait_for_job,mcp__scrum4me__check_queue_empty,mcp__scrum4me__update_job_status,mcp__scrum4me__update_task_status,mcp__scrum4me__update_task_plan,mcp__scrum4me__log_implementation,mcp__scrum4me__log_test_result,mcp__scrum4me__log_commit,mcp__scrum4me__create_pbi,mcp__scrum4me__create_story,mcp__scrum4me__create_task,mcp__scrum4me__create_todo,mcp__scrum4me__ask_user_question,mcp__scrum4me__get_question_answer,mcp__scrum4me__list_open_questions,mcp__scrum4me__cancel_question'
|
||||||
|
|
||||||
CONSEC_FAILURES=0
|
CONSEC_FAILURES=0
|
||||||
BACKOFF=${AGENT_BACKOFF_START}
|
BACKOFF=${AGENT_BACKOFF_START}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue