Compare commits
7 commits
feat/forge
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| 5529f3850d | |||
|
|
28ef6818a3 | ||
|
|
a051bb00d4 | ||
|
|
1a87bee280 | ||
|
|
c64c0278f2 | ||
|
|
794ad7faaa | ||
|
|
0b5a044ea5 |
4 changed files with 64 additions and 11 deletions
|
|
@ -110,3 +110,9 @@ AGENT_BACKOFF_MAX=300
|
|||
AGENT_LOG_GZIP_AFTER_HOURS=24
|
||||
# Hoeveel dagen ge-gzipte logs bewaren voor we ze verwijderen.
|
||||
AGENT_LOG_DELETE_AFTER_DAYS=30
|
||||
|
||||
# Claude CLI --output-format. Default 'stream-json' streamt de volledige
|
||||
# event-stream (tool-calls, berichten) live naar de run-log; 'text' geeft
|
||||
# alleen Claude's eind-samenvatting (terser, maar geen live-meekijken).
|
||||
# stream-json maakt de run-log JSONL — gebruik jq of een viewer.
|
||||
AGENT_CLAUDE_OUTPUT_FORMAT=stream-json
|
||||
|
|
|
|||
|
|
@ -18,4 +18,9 @@ find "${AGENT_LOG_DIR}" -type f \
|
|||
\( -name '*.log' -o -name '*.log.gz' -o -name '*.txt' -o -name '*.json' \) \
|
||||
-mtime "+${AGENT_LOG_HARD_DELETE_DAYS}" -delete 2>/dev/null || true
|
||||
|
||||
# Prune dangling per-job symlinks: jobs/<job_id>.log -> runs/<ts>.log waarvan
|
||||
# het doel door rotatie is gegzipt of verwijderd. De -type f hierboven raakt
|
||||
# symlinks niet, dus broken links worden hier expliciet opgeruimd (-xtype l).
|
||||
find "${AGENT_LOG_DIR}/jobs" -maxdepth 1 -xtype l -delete 2>/dev/null || true
|
||||
|
||||
find "${AGENT_LOG_DIR}/jobs" -mindepth 1 -type d -empty -delete 2>/dev/null || true
|
||||
|
|
|
|||
|
|
@ -68,7 +68,9 @@ while true; do
|
|||
# claimt zelf via tryClaimJob, leest JobConfig (PBI-67), bouwt de
|
||||
# juiste Claude CLI-args, spawnt 'claude', wacht, sluit af.
|
||||
set +e
|
||||
tsx /opt/agent/bin/run-one-job.ts > "${run_log}" 2>&1
|
||||
# RUN_LOG laat run-one-job.ts een jobs/<job_id>.log symlink leggen naar
|
||||
# dit run-log, zodat de output van een job op job-id vindbaar is.
|
||||
RUN_LOG="${run_log}" tsx /opt/agent/bin/run-one-job.ts > "${run_log}" 2>&1
|
||||
exit_code=$?
|
||||
set -e
|
||||
|
||||
|
|
@ -78,8 +80,12 @@ while true; do
|
|||
# Token-expiry detectie: run-one-job.ts retourneert exit 3 wanneer het
|
||||
# bekende auth-error-strings in Claude's output ziet. We checken óók de
|
||||
# log-tekst voor het geval een ander pad het patroon raakt (bv. Prisma-
|
||||
# connection-error met OAuth-expired in error-body).
|
||||
if [[ "$exit_code" -eq 3 ]] || grep -qE '(invalid_api_key|authentication.*failed|401.*unauthor|OAuth.*expired)' "${run_log}"; then
|
||||
# connection-error met OAuth-expired in error-body) — maar alléén bij een
|
||||
# niet-nul exit. Het run-log bevat de volledige stream-json output (incl.
|
||||
# tool-results én run-one-job's eigen "TOKEN_EXPIRED detected"-logregel),
|
||||
# dus een geslaagde job die toevallig "401 unauthorized" in z'n output
|
||||
# heeft mag de grep-fallback niet triggeren.
|
||||
if [[ "$exit_code" -eq 3 ]] || { [[ "$exit_code" -ne 0 ]] && grep -qE '(invalid_api_key|authentication.*failed|401.*unauthor|OAuth.*expired)' "${run_log}"; }; then
|
||||
log "AUTH FAILURE detected (exit=$exit_code or pattern in log) — marking TOKEN_EXPIRED"
|
||||
touch "${AGENT_STATE_DIR}/TOKEN_EXPIRED"
|
||||
write_state "$(jq -n \
|
||||
|
|
|
|||
|
|
@ -22,7 +22,8 @@
|
|||
// 3 = TOKEN_EXPIRED detected → run-agent.sh schrijft TOKEN_EXPIRED marker
|
||||
|
||||
import { spawn, spawnSync } from 'node:child_process'
|
||||
import { mkdirSync, rmSync, writeFileSync } from 'node:fs'
|
||||
import { mkdirSync, rmSync, symlinkSync, writeFileSync } from 'node:fs'
|
||||
import { basename, join } from 'node:path'
|
||||
|
||||
import { Client as PgClient } from 'pg'
|
||||
|
||||
|
|
@ -196,6 +197,25 @@ async function main(): Promise<number> {
|
|||
|
||||
log(`claimed job_id=${jobId}`)
|
||||
|
||||
// Per-job log: symlink jobs/<jobId>.log -> the runs/<timestamp>.log of
|
||||
// this iteration. runs/ files are timestamp-named, so without this a job's
|
||||
// output is only findable by grepping. run-agent.sh passes the run-log
|
||||
// path via RUN_LOG. Relative target so it survives the host bind-mount.
|
||||
// Best-effort — never fail the job over a log convenience. Dangling links
|
||||
// (after the runs/ file is gzipped/deleted) are pruned by log-cleanup.sh.
|
||||
const runLog = process.env.RUN_LOG
|
||||
if (runLog) {
|
||||
try {
|
||||
const jobsDir = join(process.env.AGENT_LOG_DIR ?? '/var/log/agent', 'jobs')
|
||||
mkdirSync(jobsDir, { recursive: true })
|
||||
const linkPath = join(jobsDir, `${jobId}.log`)
|
||||
rmSync(linkPath, { force: true })
|
||||
symlinkSync(join('..', 'runs', basename(runLog)), linkPath)
|
||||
} catch (err) {
|
||||
log(`per-job log symlink skipped for ${jobId}: ${(err as Error).message}`)
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Resolve full context.
|
||||
let ctx: Awaited<ReturnType<typeof getFullJobContext>> = null
|
||||
try {
|
||||
|
|
@ -272,6 +292,13 @@ async function main(): Promise<number> {
|
|||
|
||||
// 7. Build CLI args.
|
||||
const promptText = getKindPromptText(ctx.kind).replace('$PAYLOAD_PATH', payloadPath)
|
||||
// --output-format is configureerbaar via env. Default 'stream-json' geeft
|
||||
// de volledige event-stream (elke tool-call, elk bericht) live in de
|
||||
// run-log, i.p.v. alleen Claude's eind-samenvatting. stream-json vereist
|
||||
// --verbose in print-mode. Zet AGENT_CLAUDE_OUTPUT_FORMAT=text terug voor
|
||||
// de oude terse output. TOKEN_EXPIRED-detectie blijft werken: de
|
||||
// auth-error-strings staan ook binnen de JSON-events. Omdat die events ook tool-results bevatten, draait de patroon-check alleen bij een non-zero exit.
|
||||
const outputFormat = process.env.AGENT_CLAUDE_OUTPUT_FORMAT ?? 'stream-json'
|
||||
const args: string[] = [
|
||||
'-p',
|
||||
promptText,
|
||||
|
|
@ -286,8 +313,9 @@ async function main(): Promise<number> {
|
|||
'--add-dir',
|
||||
'/opt/agent',
|
||||
'--output-format',
|
||||
'text',
|
||||
outputFormat,
|
||||
]
|
||||
if (outputFormat === 'stream-json') args.push('--verbose')
|
||||
if (effort) args.push('--effort', effort)
|
||||
|
||||
const cwd = worktreePath ?? '/opt/agent'
|
||||
|
|
@ -355,13 +383,21 @@ async function main(): Promise<number> {
|
|||
`duration_ms=${durationMs} wall_clock_seconds=${Math.round(durationMs / 1000)}`,
|
||||
)
|
||||
|
||||
// 10. Token-expiry detection.
|
||||
// 10. Token-expiry detection — alleen als Claude zelf non-zero eindigde.
|
||||
// stdoutBuf bevat de volledige stream-json output incl. álle tool-results,
|
||||
// dus de auth-error-strings kunnen ook agent-werk-content zijn (een doc
|
||||
// over 401-handling gelezen, een endpoint getest). Een echte credential-
|
||||
// fout laat 'claude' non-zero exiten; een geslaagde run (exit 0) is per
|
||||
// definitie geen token-expiry. Zonder deze gate legt zulke content de
|
||||
// worker onterecht plat (run-agent.sh → TOKEN_EXPIRED marker + sleep).
|
||||
let tokenExpired = false
|
||||
for (const pat of TOKEN_EXPIRY_PATTERNS) {
|
||||
if (pat.test(stdoutBuf)) {
|
||||
tokenExpired = true
|
||||
log(`TOKEN_EXPIRED detected pattern="${pat.source}" exiting code=3`)
|
||||
break
|
||||
if (exitCode !== 0) {
|
||||
for (const pat of TOKEN_EXPIRY_PATTERNS) {
|
||||
if (pat.test(stdoutBuf)) {
|
||||
tokenExpired = true
|
||||
log(`TOKEN_EXPIRED detected pattern="${pat.source}" exiting code=3`)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue