Compare commits
14 commits · feat/sprin... → main

| Author | SHA1 | Date |
|---|---|---|
| | 0d76fc32ca | |
| | 2b03ee02e0 | |
| | f01bd555d1 | |
| | ec7c5a616a | |
| | 20de584759 | |
| | c179c356b3 | |
| | ab87c0fada | |
| | 27cba872a8 | |
| | 7e049ebdef | |
| | 84b3afbefa | |
| | 2b746af1a3 | |
| | 5c1f047259 | |
| | 68c4d037cf | |
| | e0c2536a8c | |
42 changed files with 5130 additions and 179 deletions
@@ -7,3 +7,5 @@ OPS_AGENT_URL="http://127.0.0.1:3099"
 REPO_PATHS="/srv/scrum4me/repos/scrum4me,/srv/ops/repos/ops-dashboard"
 # Comma-separated list of systemd unit names to show on the /systemd page (must match commands.yml allowed list)
 SYSTEMD_UNITS="scrum4me-web,ops-agent"
+# Worker run-logs directory inside the container (read-only bind mount; see docker-compose.yml)
+WORKER_LOGS_DIR="/var/worker-logs/idea"
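For context on how WORKER_LOGS_DIR is presumably consumed: a minimal sketch, assuming a helper in lib/worker-logs (not shown in this compare) resolves and validates file names against this directory. The helper name and checks below are illustrative, not the branch's actual implementation.

// Hypothetical helper, for illustration only — the real logic lives in lib/worker-logs.
import path from 'path'

const WORKER_LOGS_DIR = process.env.WORKER_LOGS_DIR ?? '/var/worker-logs/idea'

// Only accept plain *.log file names so a request can never escape the
// read-only bind mount (e.g. via "../").
export function resolveRunLogPath(name: string): string {
  if (!/^[A-Za-z0-9._-]+\.log$/.test(name) || name.includes('..')) {
    throw new Error(`invalid run-log name: ${name}`)
  }
  return path.join(WORKER_LOGS_DIR, name)
}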
32  app/api/worker-logs/[name]/route.ts  Normal file
@@ -0,0 +1,32 @@
import { NextRequest } from 'next/server'
import { getCurrentUser } from '@/lib/session'
import { readRunLog, WorkerLogError } from '@/lib/worker-logs'
import { parseRunLog } from '@/lib/parse-worker-log'

export const dynamic = 'force-dynamic'

// GET /api/worker-logs/<file>.log — full parsed timeline for one run-log.
export async function GET(
  _request: NextRequest,
  { params }: { params: Promise<{ name: string }> },
) {
  const user = await getCurrentUser()
  if (!user) {
    return Response.json({ error: 'unauthorized' }, { status: 401 })
  }

  const { name: rawName } = await params
  const name = decodeURIComponent(rawName)

  try {
    const raw = await readRunLog(name)
    return Response.json(parseRunLog(raw, name))
  } catch (err) {
    if (err instanceof WorkerLogError) {
      const status = err.code === 'invalid' ? 400 : err.code === 'not-found' ? 404 : 500
      return Response.json({ error: err.message }, { status })
    }
    const message = err instanceof Error ? err.message : 'failed to read worker log'
    return Response.json({ error: message }, { status: 500 })
  }
}
25  app/api/worker-logs/route.ts  Normal file
@@ -0,0 +1,25 @@
import { NextRequest } from 'next/server'
import { getCurrentUser } from '@/lib/session'
import { listRunLogs } from '@/lib/worker-logs'

export const dynamic = 'force-dynamic'

// GET /api/worker-logs?limit=10 — newest-first run-log summaries for the table.
export async function GET(request: NextRequest) {
  const user = await getCurrentUser()
  if (!user) {
    return Response.json({ error: 'unauthorized' }, { status: 401 })
  }

  const limitParam = request.nextUrl.searchParams.get('limit')
  const limit = limitParam ? Number(limitParam) : 10

  try {
    const logs = await listRunLogs(limit)
    return Response.json({ logs })
  } catch (err) {
    // Surfaces a missing bind mount legibly (e.g. WORKER_LOGS_DIR not mounted).
    const message = err instanceof Error ? err.message : 'failed to list worker logs'
    return Response.json({ error: message }, { status: 500 })
  }
}
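A minimal sketch of how a dashboard page might consume the two routes above. The shape of each list entry (e.g. a `name` field) and of the parsed timeline are assumptions here, since lib/worker-logs and lib/parse-worker-log are not part of this excerpt.

// Illustrative client-side usage of the worker-logs API (response shapes are assumed).
async function loadWorkerLogs() {
  const listRes = await fetch('/api/worker-logs?limit=10')
  if (!listRes.ok) throw new Error(`listing run-logs failed: ${listRes.status}`)
  const { logs } = (await listRes.json()) as { logs: Array<{ name: string }> }

  if (logs.length === 0) return null

  // Full parsed timeline for the newest run-log.
  const detailRes = await fetch(`/api/worker-logs/${encodeURIComponent(logs[0].name)}`)
  if (!detailRes.ok) throw new Error(`reading run-log failed: ${detailRes.status}`)
  return detailRes.json()
}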
@@ -5,6 +5,11 @@ import { getCurrentUser } from '@/lib/session'
 export const dynamic = 'force-dynamic'
 
 const FLOWS = [
+  {
+    href: '/flows/redeploy-all',
+    title: 'Redeploy All',
+    desc: 'Volledige stack-redeploy: scrum4me-web + MCP-worker (cache-busted)',
+  },
   {
     href: '/flows/update-scrum4me-web',
     title: 'Update Scrum4Me website',
@@ -15,6 +20,11 @@ const FLOWS = [
     title: 'Update Caddy config',
     desc: 'Reload Caddy met nieuwe Caddyfile + cert renewal check',
   },
+  {
+    href: '/flows/server-backup',
+    title: 'Server backup',
+    desc: 'pg_dumpall + restic naar NAS én B2 — handmatige backup of restore-test',
+  },
 ]
 
 export default async function FlowsIndex() {
135  app/flows/redeploy-all/_components/flow-panel.tsx  Normal file
@@ -0,0 +1,135 @@
'use client'

import { useState, useCallback } from 'react'
import Link from 'next/link'
import { useFlowRun } from '@/hooks/useFlowRun'
import StreamingTerminal from '@/components/StreamingTerminal'
import ConfirmDialog from '@/components/ConfirmDialog'

const FLOW_KEY = 'redeploy_all'

const STEPS = [
  'git status Scrum4Me (show current state)',
  'git fetch Scrum4Me (fetch remote refs)',
  'git log (commits ahead of upstream)',
  'git pull --ff-only Scrum4Me (aborts if dirty)',
  'npm ci (install dependencies)',
  'prisma migrate deploy (apply migrations)',
  'npm run build (build application)',
  'systemctl restart scrum4me-web',
  'smoke test: curl /api/products (expect 200 or 401)',
  'git status scrum4me-docker (show current state)',
  'git fetch scrum4me-docker (fetch remote refs)',
  'git pull --ff-only scrum4me-docker (aborts if dirty)',
  'git pull --ff-only scrum4me-mcp (lokale sync)',
  'rebuild worker image — cache-busted MCP clone',
  'docker compose up -d --force-recreate worker-idea',
  'wait for worker pre-flight to pass',
]

export default function FlowPanel() {
  const [pendingDryRun, setPendingDryRun] = useState<boolean | null>(null)
  const [completedFlowRunId, setCompletedFlowRunId] = useState<string | null>(null)

  const handleComplete = useCallback((flowRunId: string) => {
    setCompletedFlowRunId(flowRunId)
  }, [])

  const flowRun = useFlowRun(handleComplete)

  const handleConfirm = useCallback(() => {
    if (pendingDryRun === null) return
    const dryRun = pendingDryRun
    setPendingDryRun(null)
    setCompletedFlowRunId(null)
    flowRun.startFlow(FLOW_KEY, dryRun)
  }, [pendingDryRun, flowRun])

  const handleReset = useCallback(() => {
    flowRun.reset()
    setCompletedFlowRunId(null)
  }, [flowRun])

  return (
    <div className="space-y-6">
      <div className="rounded-lg border border-border p-5 space-y-4">
        <div>
          <p className="text-sm text-muted-foreground">
            Volledige stack-redeploy: eerst de hoofd-app (scrum4me-web — pull,
            migrate, build, restart), dan de MCP-worker (cache-busted image
            rebuild zodat de nieuwe scrum4me-mcp code wordt opgepikt).
          </p>
          <p className="mt-1 text-xs text-muted-foreground font-mono">
            repos: Scrum4Me · scrum4me-docker · scrum4me-mcp
          </p>
        </div>
        <ol className="space-y-1">
          {STEPS.map((step, i) => (
            <li key={i} className="flex gap-2 text-xs font-mono text-muted-foreground">
              <span className="text-border min-w-[1.5rem]">{i + 1}.</span>
              <span>{step}</span>
            </li>
          ))}
        </ol>
      </div>

      <div className="flex items-center gap-3">
        <button
          onClick={() => setPendingDryRun(false)}
          disabled={flowRun.status === 'running'}
          className="rounded-lg bg-foreground text-background px-4 py-2 text-sm font-medium hover:opacity-90 disabled:opacity-50 transition-opacity"
        >
          Run
        </button>
        <button
          onClick={() => setPendingDryRun(true)}
          disabled={flowRun.status === 'running'}
          className="rounded-lg border border-border px-4 py-2 text-sm hover:bg-muted/50 disabled:opacity-50 transition-colors"
        >
          Dry Run
        </button>
        {flowRun.status !== 'idle' && flowRun.status !== 'running' && (
          <button
            onClick={handleReset}
            className="text-xs text-muted-foreground hover:text-foreground transition-colors"
          >
            Reset
          </button>
        )}
      </div>

      {flowRun.status !== 'idle' && (
        <div className="space-y-2">
          <div className="flex items-center justify-between">
            <span className="text-sm font-medium">Output</span>
            {completedFlowRunId && (
              <Link
                href={`/audit/${completedFlowRunId}`}
                className="text-xs text-muted-foreground hover:text-foreground transition-colors"
              >
                View in audit log →
              </Link>
            )}
          </div>
          <StreamingTerminal
            lines={flowRun.lines}
            status={flowRun.status}
            error={flowRun.error}
          />
        </div>
      )}

      <ConfirmDialog
        open={pendingDryRun !== null}
        title={pendingDryRun ? 'Dry Run: Redeploy All' : 'Run: Redeploy All'}
        commandPreview={
          pendingDryRun
            ? `[DRY RUN] flow: ${FLOW_KEY}\n\nAll steps will be shown without executing.`
            : `flow: ${FLOW_KEY}\n\nSteps:\n${STEPS.map((s, i) => ` ${i + 1}. ${s}`).join('\n')}`
        }
        onConfirm={handleConfirm}
        onCancel={() => setPendingDryRun(null)}
      />
    </div>
  )
}
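All flow panels in this compare consume the same useFlowRun hook, which itself is not shown here. The following is a rough sketch of its contract, reconstructed from how the panels call it; the field types and the exact set of status values are assumptions.

// Sketch only — reconstructed from usage; the real hook lives in hooks/useFlowRun.
// 'idle', 'running' and 'done' appear in the panels above; an error state is assumed.
type FlowRunStatus = 'idle' | 'running' | 'done' | 'error'

interface FlowRunHandle {
  status: FlowRunStatus
  lines: unknown[]            // streamed output rendered by <StreamingTerminal>
  error: string | null
  startFlow: (flowKey: string, dryRun: boolean) => void
  reset: () => void
}

declare function useFlowRun(onComplete: (flowRunId: string) => void): FlowRunHandle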
27  app/flows/redeploy-all/page.tsx  Normal file
@@ -0,0 +1,27 @@
import Link from 'next/link'
import { redirect } from 'next/navigation'
import { getCurrentUser } from '@/lib/session'
import FlowPanel from './_components/flow-panel'

export const dynamic = 'force-dynamic'

export default async function RedeployAllPage() {
  const user = await getCurrentUser()
  if (!user) redirect('/login')

  return (
    <div className="min-h-screen bg-background p-6">
      <div className="mx-auto max-w-4xl space-y-6">
        <div className="flex items-center gap-3">
          <Link href="/" className="text-sm text-muted-foreground hover:text-foreground">
            ← Home
          </Link>
          <span className="text-muted-foreground">/</span>
          <h1 className="text-2xl font-semibold tracking-tight">Redeploy All</h1>
        </div>

        <FlowPanel />
      </div>
    </div>
  )
}
178  app/flows/server-backup/_components/flow-panel.tsx  Normal file
@@ -0,0 +1,178 @@
'use client'

import { useState, useCallback } from 'react'
import Link from 'next/link'
import { useFlowRun } from '@/hooks/useFlowRun'
import StreamingTerminal from '@/components/StreamingTerminal'
import ConfirmDialog from '@/components/ConfirmDialog'

// One panel runs either flow; we switch step-list + description based on the
// currently active or last-triggered action. Wording mirrors the existing
// /settings/backups → server-backup-section so the two entry points stay
// consistent. The actual work in both flows runs out-of-band via systemd
// (server-backup.service) — the ops-agent flow just kicks it off and tails
// the resulting log / status file.

type Kind = 'backup' | 'restore'

type FlowSpec = {
  flowKey: string
  buttonLabel: string
  shortDescription: string
  steps: string[]
  confirmTitle: string
  confirmBody: string
}

const FLOWS: Record<Kind, FlowSpec> = {
  backup: {
    flowKey: 'server_backup_full',
    buttonLabel: 'Backup now',
    shortDescription:
      'Volledige server-backup: pg_dumpall van alle databases + restic snapshot naar NAS én Backblaze B2 (Object Lock). Draait dagelijks via timer; deze knop triggert handmatig.',
    steps: [
      'trigger_server_backup (systemctl start server-backup.service)',
      'tail_backup_log_today (live log mee-stream)',
      'read_backup_status (lees status.json met repo-totalen + duur)',
    ],
    confirmTitle: 'Trigger server backup',
    confirmBody:
      'flow: server_backup_full\n\nSteps:\n 1. trigger_server_backup (systemctl start server-backup.service)\n 2. tail_backup_log_today\n 3. read_backup_status\n\nThe actual work happens in systemd; this flow kicks it off and tails the log.',
  },
  restore: {
    flowKey: 'server_backup_restore_test',
    buttonLabel: 'Run restore test',
    shortDescription:
      'Non-destructieve restore-test: haalt de laatste snapshot uit de NAS-repo terug naar /tmp/restore-test en verifieert dat kritieke files er zijn. Raakt niets in de live stack.',
    steps: [
      'trigger_restore_test (restore latest NAS snapshot to /tmp/restore-test/)',
      'read_backup_status (lees assertions + per-file outcome)',
    ],
    confirmTitle: 'Run restore test (NAS)',
    confirmBody:
      'flow: server_backup_restore_test\n\nSteps:\n 1. trigger_restore_test (restore latest NAS snapshot to /tmp/restore-test/)\n 2. read_backup_status\n\nNon-destructive — restores into /tmp only and asserts critical files exist.',
  },
}

export default function FlowPanel() {
  // `displayKind` drives the steps/description card; updated optimistically
  // when the user presses a button so the displayed flow matches the pending
  // confirm. `activeKind` only flips once the flow actually starts.
  const [displayKind, setDisplayKind] = useState<Kind>('backup')
  const [pendingKind, setPendingKind] = useState<Kind | null>(null)
  const [activeKind, setActiveKind] = useState<Kind | null>(null)
  const [completedFlowRunId, setCompletedFlowRunId] = useState<string | null>(null)

  const handleComplete = useCallback((flowRunId: string) => {
    setCompletedFlowRunId(flowRunId)
  }, [])

  const flowRun = useFlowRun(handleComplete)

  const handleClickKind = useCallback((kind: Kind) => {
    setDisplayKind(kind)
    setPendingKind(kind)
  }, [])

  const handleConfirm = useCallback(() => {
    if (pendingKind === null) return
    const kind = pendingKind
    setPendingKind(null)
    setCompletedFlowRunId(null)
    setActiveKind(kind)
    flowRun.startFlow(FLOWS[kind].flowKey, false)
  }, [pendingKind, flowRun])

  const handleReset = useCallback(() => {
    flowRun.reset()
    setCompletedFlowRunId(null)
    setActiveKind(null)
  }, [flowRun])

  const spec = FLOWS[displayKind]

  return (
    <div className="space-y-6">
      <div className="rounded-lg border border-border p-5 space-y-4">
        <div>
          <p className="text-sm text-muted-foreground">{spec.shortDescription}</p>
          <p className="mt-1 text-xs text-muted-foreground font-mono">
            flow: {spec.flowKey}
          </p>
        </div>
        <ol className="space-y-1">
          {spec.steps.map((step, i) => (
            <li key={i} className="flex gap-2 text-xs font-mono text-muted-foreground">
              <span className="text-border min-w-[1.5rem]">{i + 1}.</span>
              <span>{step}</span>
            </li>
          ))}
        </ol>
      </div>

      <div className="flex items-center gap-3">
        <button
          onClick={() => handleClickKind('backup')}
          disabled={flowRun.status === 'running'}
          className={`rounded-lg px-4 py-2 text-sm font-medium transition-opacity disabled:opacity-50 ${
            displayKind === 'backup'
              ? 'bg-foreground text-background hover:opacity-90'
              : 'border border-border hover:bg-muted/50'
          }`}
        >
          {FLOWS.backup.buttonLabel}
        </button>
        <button
          onClick={() => handleClickKind('restore')}
          disabled={flowRun.status === 'running'}
          className={`rounded-lg px-4 py-2 text-sm font-medium transition-opacity disabled:opacity-50 ${
            displayKind === 'restore'
              ? 'bg-foreground text-background hover:opacity-90'
              : 'border border-border hover:bg-muted/50'
          }`}
        >
          {FLOWS.restore.buttonLabel}
        </button>
        {flowRun.status !== 'idle' && flowRun.status !== 'running' && (
          <button
            onClick={handleReset}
            className="text-xs text-muted-foreground hover:text-foreground transition-colors"
          >
            Reset
          </button>
        )}
      </div>

      {flowRun.status !== 'idle' && (
        <div className="space-y-2">
          <div className="flex items-center justify-between">
            <span className="text-sm font-medium">
              Output{activeKind ? ` — ${FLOWS[activeKind].buttonLabel}` : ''}
            </span>
            {completedFlowRunId && (
              <Link
                href={`/audit/${completedFlowRunId}`}
                className="text-xs text-muted-foreground hover:text-foreground transition-colors"
              >
                View in audit log →
              </Link>
            )}
          </div>
          <StreamingTerminal
            lines={flowRun.lines}
            status={flowRun.status}
            error={flowRun.error}
          />
        </div>
      )}

      <ConfirmDialog
        open={pendingKind !== null}
        title={pendingKind ? FLOWS[pendingKind].confirmTitle : ''}
        commandPreview={pendingKind ? FLOWS[pendingKind].confirmBody : ''}
        onConfirm={handleConfirm}
        onCancel={() => setPendingKind(null)}
      />
    </div>
  )
}
31  app/flows/server-backup/page.tsx  Normal file
@@ -0,0 +1,31 @@
import Link from 'next/link'
import { redirect } from 'next/navigation'
import { getCurrentUser } from '@/lib/session'
import FlowPanel from './_components/flow-panel'

export const dynamic = 'force-dynamic'

export default async function ServerBackupPage() {
  const user = await getCurrentUser()
  if (!user) redirect('/login')

  return (
    <div className="min-h-screen bg-background p-6">
      <div className="mx-auto max-w-4xl space-y-6">
        <div className="flex items-center gap-3">
          <Link href="/" className="text-sm text-muted-foreground hover:text-foreground">
            ← Home
          </Link>
          <span className="text-muted-foreground">/</span>
          <Link href="/flows" className="text-sm text-muted-foreground hover:text-foreground">
            Flows
          </Link>
          <span className="text-muted-foreground">/</span>
          <h1 className="text-2xl font-semibold tracking-tight">Server backup</h1>
        </div>

        <FlowPanel />
      </div>
    </div>
  )
}
@@ -1,171 +1,52 @@
 'use client'
 
-import { useState, useCallback } from 'react'
-import Link from 'next/link'
-import { useFlowRun } from '@/hooks/useFlowRun'
-import StreamingTerminal from '@/components/StreamingTerminal'
-import ConfirmDialog from '@/components/ConfirmDialog'
 import type { BackupFile } from '../page'
-
-function formatSize(bytes: number): string {
-  if (bytes === 0) return '—'
-  if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`
-  return `${(bytes / (1024 * 1024)).toFixed(1)} MB`
-}
+import type {
+  BackupStatusEnvelope,
+  ResticSnapshot,
+  ResticStats,
+} from '../_lib/types'
+import DatabaseBackupsSection from './database-backups-section'
+import ServerBackupSection from './server-backup-section'
 
 type Props = {
   backups: BackupFile[]
   listError: string | null
+  envelope: BackupStatusEnvelope
+  nasSnapshots: ResticSnapshot[]
+  b2Snapshots: ResticSnapshot[]
+  nasStats: ResticStats | null
+  b2Stats: ResticStats | null
+  serverBackupErrors: {
+    status?: string
+    nasSnapshots?: string
+    b2Snapshots?: string
+    nasStats?: string
+    b2Stats?: string
+  }
 }
 
-export default function BackupsPanel({ backups, listError }: Props) {
-  const [pending, setPending] = useState(false)
-  const [completedFlowRunId, setCompletedFlowRunId] = useState<string | null>(null)
-
-  const handleComplete = useCallback((flowRunId: string) => {
-    setCompletedFlowRunId(flowRunId)
-  }, [])
-
-  const flowRun = useFlowRun(handleComplete)
-
-  const handleConfirm = useCallback(() => {
-    setPending(false)
-    setCompletedFlowRunId(null)
-    flowRun.startFlow('backup_ops_db', false)
-  }, [flowRun])
-
-  const handleReset = useCallback(() => {
-    flowRun.reset()
-    setCompletedFlowRunId(null)
-  }, [flowRun])
+export default function BackupsPanel({
+  backups,
+  listError,
+  envelope,
+  nasSnapshots,
+  b2Snapshots,
+  nasStats,
+  b2Stats,
+  serverBackupErrors,
+}: Props) {
 
   return (
-    <div className="space-y-6">
-      {/* Description */}
-      <div className="rounded-lg border border-border p-5 space-y-3">
-        <p className="text-sm text-muted-foreground">
-          Backs up the <code className="font-mono text-xs">ops_dashboard</code> database using{' '}
-          <code className="font-mono text-xs">pg_dump</code>. Dumps are stored in{' '}
-          <code className="font-mono text-xs">/srv/ops/backups/</code> and retained for 30 days.
-          For automated daily backups, enable the systemd timer:{' '}
-          <code className="font-mono text-xs">deploy/ops-agent/ops-db-backup.timer</code>.
-        </p>
-        <ol className="space-y-0.5">
-          <li className="flex gap-2 text-xs font-mono text-muted-foreground">
-            <span className="text-border min-w-[1.5rem]">1.</span>
-            <span>pg_dump ops_dashboard → /srv/ops/backups/ops_db_YYYYMMDD_HHMM.dump</span>
-          </li>
-          <li className="flex gap-2 text-xs font-mono text-muted-foreground">
-            <span className="text-border min-w-[1.5rem]">2.</span>
-            <span>cleanup: delete backup files older than 30 days</span>
-          </li>
-        </ol>
-      </div>
-
-      {/* Action buttons */}
-      <div className="flex items-center gap-3">
-        <button
-          onClick={() => setPending(true)}
-          disabled={flowRun.status === 'running'}
-          className="rounded-lg bg-foreground text-background px-4 py-2 text-sm font-medium hover:opacity-90 disabled:opacity-50 transition-opacity"
-        >
-          Backup now
-        </button>
-        {flowRun.status !== 'idle' && flowRun.status !== 'running' && (
-          <button
-            onClick={handleReset}
-            className="text-xs text-muted-foreground hover:text-foreground transition-colors"
-          >
-            Reset
-          </button>
-        )}
-      </div>
-
-      {/* Terminal output */}
-      {flowRun.status !== 'idle' && (
-        <div className="space-y-2">
-          <div className="flex items-center justify-between">
-            <span className="text-sm font-medium">Output</span>
-            {completedFlowRunId && (
-              <Link
-                href={`/audit/${completedFlowRunId}`}
-                className="text-xs text-muted-foreground hover:text-foreground transition-colors"
-              >
-                View in audit log →
-              </Link>
-            )}
-          </div>
-          <StreamingTerminal
-            lines={flowRun.lines}
-            status={flowRun.status}
-            error={flowRun.error}
-          />
-          {flowRun.status === 'done' && (
-            <p className="text-xs text-muted-foreground">
-              Reload this page to see the updated backup list.
-            </p>
-          )}
-        </div>
-      )}
-
-      {/* Backup list */}
-      <div className="space-y-3">
-        <h2 className="text-sm font-semibold">Existing backups</h2>
-
-        {listError && (
-          <div className="rounded-lg border border-destructive/50 bg-destructive/10 p-4 text-sm text-destructive">
-            Could not list backups: {listError}
-          </div>
-        )}
-
-        {!listError && backups.length === 0 && (
-          <div className="rounded-lg border border-border px-4 py-6 text-sm text-muted-foreground text-center">
-            No backups found in /srv/ops/backups/
-          </div>
-        )}
-
-        {!listError && backups.length > 0 && (
-          <div className="rounded-lg border border-border overflow-hidden">
-            <table className="w-full text-xs font-mono">
-              <thead>
-                <tr className="border-b border-border bg-muted/30">
-                  <th className="text-left px-4 py-2 font-medium text-muted-foreground">
-                    Timestamp
-                  </th>
-                  <th className="text-left px-4 py-2 font-medium text-muted-foreground">File</th>
-                  <th className="text-right px-4 py-2 font-medium text-muted-foreground">Size</th>
-                </tr>
-              </thead>
-              <tbody>
-                {backups.map((b, i) => (
-                  <tr key={b.name} className={i % 2 === 0 ? '' : 'bg-muted/10'}>
-                    <td className="px-4 py-2 text-muted-foreground">{b.label}</td>
-                    <td className="px-4 py-2">{b.name}</td>
-                    <td className="px-4 py-2 text-right text-muted-foreground">
-                      {formatSize(b.sizeBytes)}
-                    </td>
-                  </tr>
-                ))}
-              </tbody>
-            </table>
-          </div>
-        )}
-
-        <p className="text-xs text-muted-foreground">
-          Backups older than 30 days are removed automatically by the cleanup step.
-        </p>
-      </div>
-
-      {/* Confirm dialog */}
-      <ConfirmDialog
-        open={pending}
-        title="Backup ops_dashboard database"
-        commandPreview={
-          'flow: backup_ops_db\n\nSteps:\n 1. pg_dump ops_dashboard → /srv/ops/backups/ops_db_YYYYMMDD_HHMM.dump\n 2. cleanup: delete backups older than 30 days'
-        }
-        onConfirm={handleConfirm}
-        onCancel={() => setPending(false)}
+    <div className="space-y-12">
+      <DatabaseBackupsSection backups={backups} listError={listError} />
+      <div className="h-px bg-border" />
+      <ServerBackupSection
+        envelope={envelope}
+        nasSnapshots={nasSnapshots}
+        b2Snapshots={b2Snapshots}
+        nasStats={nasStats}
+        b2Stats={b2Stats}
+        errors={serverBackupErrors}
       />
     </div>
   )
||||||
172  app/settings/backups/_components/database-backups-section.tsx  Normal file
@@ -0,0 +1,172 @@
'use client'

import { useCallback, useState } from 'react'
import Link from 'next/link'
import { useFlowRun } from '@/hooks/useFlowRun'
import StreamingTerminal from '@/components/StreamingTerminal'
import ConfirmDialog from '@/components/ConfirmDialog'
import type { BackupFile } from '../page'

function formatSize(bytes: number): string {
  if (bytes === 0) return '—'
  if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`
  return `${(bytes / (1024 * 1024)).toFixed(1)} MB`
}

type Props = {
  backups: BackupFile[]
  listError: string | null
}

export default function DatabaseBackupsSection({ backups, listError }: Props) {
  const [pending, setPending] = useState(false)
  const [completedFlowRunId, setCompletedFlowRunId] = useState<string | null>(null)

  const handleComplete = useCallback((flowRunId: string) => {
    setCompletedFlowRunId(flowRunId)
  }, [])

  const flowRun = useFlowRun(handleComplete)

  const handleConfirm = useCallback(() => {
    setPending(false)
    setCompletedFlowRunId(null)
    flowRun.startFlow('backup_ops_db', false)
  }, [flowRun])

  const handleReset = useCallback(() => {
    flowRun.reset()
    setCompletedFlowRunId(null)
  }, [flowRun])

  return (
    <section className="space-y-6">
      <div className="flex items-baseline justify-between">
        <h2 className="text-lg font-semibold tracking-tight">Database backups</h2>
        <span className="text-xs text-muted-foreground">flow: backup_ops_db</span>
      </div>

      <div className="rounded-lg border border-border p-5 space-y-3">
        <p className="text-sm text-muted-foreground">
          Backs up the <code className="font-mono text-xs">ops_dashboard</code> database using{' '}
          <code className="font-mono text-xs">pg_dump</code>. Dumps are stored in{' '}
          <code className="font-mono text-xs">/srv/ops/backups/</code> and retained for 30 days.
          For automated daily backups, enable the systemd timer:{' '}
          <code className="font-mono text-xs">deploy/ops-agent/ops-db-backup.timer</code>.
        </p>

        <ol className="space-y-0.5">
          <li className="flex gap-2 text-xs font-mono text-muted-foreground">
            <span className="text-border min-w-[1.5rem]">1.</span>
            <span>pg_dump ops_dashboard → /srv/ops/backups/ops_db_YYYYMMDD_HHMM.dump</span>
          </li>
          <li className="flex gap-2 text-xs font-mono text-muted-foreground">
            <span className="text-border min-w-[1.5rem]">2.</span>
            <span>cleanup: delete backup files older than 30 days</span>
          </li>
        </ol>
      </div>

      <div className="flex items-center gap-3">
        <button
          onClick={() => setPending(true)}
          disabled={flowRun.status === 'running'}
          className="rounded-lg bg-foreground text-background px-4 py-2 text-sm font-medium hover:opacity-90 disabled:opacity-50 transition-opacity"
        >
          Backup now
        </button>
        {flowRun.status !== 'idle' && flowRun.status !== 'running' && (
          <button
            onClick={handleReset}
            className="text-xs text-muted-foreground hover:text-foreground transition-colors"
          >
            Reset
          </button>
        )}
      </div>

      {flowRun.status !== 'idle' && (
        <div className="space-y-2">
          <div className="flex items-center justify-between">
            <span className="text-sm font-medium">Output</span>
            {completedFlowRunId && (
              <Link
                href={`/audit/${completedFlowRunId}`}
                className="text-xs text-muted-foreground hover:text-foreground transition-colors"
              >
                View in audit log →
              </Link>
            )}
          </div>
          <StreamingTerminal
            lines={flowRun.lines}
            status={flowRun.status}
            error={flowRun.error}
          />
          {flowRun.status === 'done' && (
            <p className="text-xs text-muted-foreground">
              Reload this page to see the updated backup list.
            </p>
          )}
        </div>
      )}

      <div className="space-y-3">
        <h3 className="text-sm font-semibold">Existing backups</h3>

        {listError && (
          <div className="rounded-lg border border-destructive/50 bg-destructive/10 p-4 text-sm text-destructive">
            Could not list backups: {listError}
          </div>
        )}

        {!listError && backups.length === 0 && (
          <div className="rounded-lg border border-border px-4 py-6 text-sm text-muted-foreground text-center">
            No backups found in /srv/ops/backups/
          </div>
        )}

        {!listError && backups.length > 0 && (
          <div className="rounded-lg border border-border overflow-hidden">
            <table className="w-full text-xs font-mono">
              <thead>
                <tr className="border-b border-border bg-muted/30">
                  <th className="text-left px-4 py-2 font-medium text-muted-foreground">
                    Timestamp
                  </th>
                  <th className="text-left px-4 py-2 font-medium text-muted-foreground">File</th>
                  <th className="text-right px-4 py-2 font-medium text-muted-foreground">Size</th>
                </tr>
              </thead>
              <tbody>
                {backups.map((b, i) => (
                  <tr key={b.name} className={i % 2 === 0 ? '' : 'bg-muted/10'}>
                    <td className="px-4 py-2 text-muted-foreground">{b.label}</td>
                    <td className="px-4 py-2">{b.name}</td>
                    <td className="px-4 py-2 text-right text-muted-foreground">
                      {formatSize(b.sizeBytes)}
                    </td>
                  </tr>
                ))}
              </tbody>
            </table>
          </div>
        )}

        <p className="text-xs text-muted-foreground">
          Backups older than 30 days are removed automatically by the cleanup step.
        </p>
      </div>

      <ConfirmDialog
        open={pending}
        title="Backup ops_dashboard database"
        commandPreview={
          'flow: backup_ops_db\n\nSteps:\n 1. pg_dump ops_dashboard → /srv/ops/backups/ops_db_YYYYMMDD_HHMM.dump\n 2. cleanup: delete backups older than 30 days'
        }
        onConfirm={handleConfirm}
        onCancel={() => setPending(false)}
      />
    </section>
  )
}
447  app/settings/backups/_components/server-backup-section.tsx  Normal file
@@ -0,0 +1,447 @@
|
'use client'
|
||||||
|
|
||||||
|
import { useCallback, useState } from 'react'
|
||||||
|
import Link from 'next/link'
|
||||||
|
import { useFlowRun } from '@/hooks/useFlowRun'
|
||||||
|
import StreamingTerminal from '@/components/StreamingTerminal'
|
||||||
|
import ConfirmDialog from '@/components/ConfirmDialog'
|
||||||
|
import type {
|
||||||
|
BackupPhase,
|
||||||
|
BackupStatus,
|
||||||
|
BackupStatusEnvelope,
|
||||||
|
OverallStatus,
|
||||||
|
PhaseStatus,
|
||||||
|
ResticSnapshot,
|
||||||
|
ResticStats,
|
||||||
|
} from '../_lib/types'
|
||||||
|
|
||||||
|
type Props = {
|
||||||
|
envelope: BackupStatusEnvelope
|
||||||
|
nasSnapshots: ResticSnapshot[]
|
||||||
|
b2Snapshots: ResticSnapshot[]
|
||||||
|
nasStats: ResticStats | null
|
||||||
|
b2Stats: ResticStats | null
|
||||||
|
errors: {
|
||||||
|
status?: string
|
||||||
|
nasSnapshots?: string
|
||||||
|
b2Snapshots?: string
|
||||||
|
nasStats?: string
|
||||||
|
b2Stats?: string
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type ActiveFlow = 'backup' | 'restore' | null
|
||||||
|
|
||||||
|
function formatBytes(bytes: number | null | undefined): string {
|
||||||
|
if (bytes == null) return '—'
|
||||||
|
if (bytes < 1024) return `${bytes} B`
|
||||||
|
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`
|
||||||
|
if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)} MB`
|
||||||
|
return `${(bytes / (1024 * 1024 * 1024)).toFixed(2)} GB`
|
||||||
|
}
|
||||||
|
|
||||||
|
function formatDuration(seconds: number | null | undefined): string {
|
||||||
|
if (seconds == null || seconds === 0) return '—'
|
||||||
|
if (seconds < 60) return `${seconds}s`
|
||||||
|
if (seconds < 3600) return `${Math.floor(seconds / 60)}m ${seconds % 60}s`
|
||||||
|
const h = Math.floor(seconds / 3600)
|
||||||
|
const m = Math.floor((seconds % 3600) / 60)
|
||||||
|
return `${h}h ${m}m`
|
||||||
|
}
|
||||||
|
|
||||||
|
function formatTimestamp(iso: string | null | undefined): string {
|
||||||
|
if (!iso) return '—'
|
||||||
|
try {
|
||||||
|
const d = new Date(iso)
|
||||||
|
if (Number.isNaN(d.getTime())) return iso
|
||||||
|
const yyyy = d.getFullYear()
|
||||||
|
const mm = String(d.getMonth() + 1).padStart(2, '0')
|
||||||
|
const dd = String(d.getDate()).padStart(2, '0')
|
||||||
|
const hh = String(d.getHours()).padStart(2, '0')
|
||||||
|
const mi = String(d.getMinutes()).padStart(2, '0')
|
||||||
|
return `${yyyy}-${mm}-${dd} ${hh}:${mi}`
|
||||||
|
} catch {
|
||||||
|
return iso
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function overallBadgeClass(status: OverallStatus): string {
|
||||||
|
switch (status) {
|
||||||
|
case 'success':
|
||||||
|
return 'bg-green-500/15 text-green-500 border-green-500/30'
|
||||||
|
case 'partial_failure':
|
||||||
|
return 'bg-amber-500/15 text-amber-500 border-amber-500/30'
|
||||||
|
case 'failed':
|
||||||
|
return 'bg-destructive/15 text-destructive border-destructive/30'
|
||||||
|
default:
|
||||||
|
return 'bg-muted/50 text-muted-foreground border-border'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function phaseIcon(status: PhaseStatus): { glyph: string; color: string } {
|
||||||
|
switch (status) {
|
||||||
|
case 'success':
|
||||||
|
return { glyph: '✓', color: 'text-green-500' }
|
||||||
|
case 'skipped':
|
||||||
|
return { glyph: '–', color: 'text-muted-foreground' }
|
||||||
|
case 'degraded':
|
||||||
|
return { glyph: '!', color: 'text-amber-500' }
|
||||||
|
case 'failed':
|
||||||
|
return { glyph: '✗', color: 'text-destructive' }
|
||||||
|
case 'pending':
|
||||||
|
default:
|
||||||
|
return { glyph: '○', color: 'text-muted-foreground/50' }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function phaseDurationSeconds(phase: BackupPhase): number | null {
|
||||||
|
if (!phase.startedAt || !phase.completedAt) return null
|
||||||
|
const start = new Date(phase.startedAt).getTime()
|
||||||
|
const end = new Date(phase.completedAt).getTime()
|
||||||
|
if (Number.isNaN(start) || Number.isNaN(end)) return null
|
||||||
|
return Math.max(0, Math.round((end - start) / 1000))
|
||||||
|
}
|
||||||
|
|
||||||
|
function StatusCard({ status }: { status: BackupStatus | null }) {
|
||||||
|
if (!status) {
|
||||||
|
return (
|
||||||
|
<div className="rounded-lg border border-border px-4 py-3 text-sm text-muted-foreground">
|
||||||
|
No backup run recorded yet. Trigger one with the "Backup now" button below.
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
return (
|
||||||
|
<div className="rounded-lg border border-border p-4 space-y-3">
|
||||||
|
<div className="flex items-center justify-between flex-wrap gap-2">
|
||||||
|
<div className="flex items-center gap-3">
|
||||||
|
<span
|
||||||
|
className={`inline-flex items-center gap-1.5 rounded-md border px-2 py-0.5 text-xs font-medium uppercase tracking-wide ${overallBadgeClass(status.overallStatus)}`}
|
||||||
|
>
|
||||||
|
{status.overallStatus.replace('_', ' ')}
|
||||||
|
</span>
|
||||||
|
<span className="text-sm text-muted-foreground">
|
||||||
|
Last run {formatTimestamp(status.completedAt)} on{' '}
|
||||||
|
<code className="font-mono text-xs">{status.host || '—'}</code>
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<span className="text-xs text-muted-foreground">
|
||||||
|
duration {formatDuration(status.durationSeconds)}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<div className="grid grid-cols-2 gap-1 sm:grid-cols-4">
|
||||||
|
{status.phases.map((p) => {
|
||||||
|
const icon = phaseIcon(p.status)
|
||||||
|
const dur = phaseDurationSeconds(p)
|
||||||
|
return (
|
||||||
|
<div
|
||||||
|
key={p.name}
|
||||||
|
className="flex items-center gap-2 rounded-md border border-border/60 bg-muted/20 px-2 py-1.5"
|
||||||
|
title={p.error ?? p.status}
|
||||||
|
>
|
||||||
|
<span className={`font-mono text-sm ${icon.color}`}>{icon.glyph}</span>
|
||||||
|
<div className="flex flex-col leading-tight min-w-0">
|
||||||
|
<span className="truncate text-xs font-medium">{p.name}</span>
|
||||||
|
<span className="text-[10px] text-muted-foreground">
|
||||||
|
{p.status}
|
||||||
|
{dur != null ? ` · ${formatDuration(dur)}` : ''}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
})}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
function StatsBlock({ stats, label, error }: { stats: ResticStats | null; label: string; error?: string }) {
|
||||||
|
if (error) {
|
||||||
|
return (
|
||||||
|
<div className="rounded-lg border border-destructive/50 bg-destructive/10 p-3 text-xs text-destructive">
|
||||||
|
{label}: {error}
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
if (!stats) {
|
||||||
|
return (
|
||||||
|
<div className="rounded-lg border border-border p-3 text-xs text-muted-foreground">
|
||||||
|
{label}: no stats yet
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
const dedup =
|
||||||
|
stats.dedupRatio != null && Number.isFinite(stats.dedupRatio)
|
||||||
|
? `${stats.dedupRatio.toFixed(2)}×`
|
||||||
|
: '—'
|
||||||
|
return (
|
||||||
|
<div className="rounded-lg border border-border p-3 space-y-1.5">
|
||||||
|
<div className="flex items-center justify-between">
|
||||||
|
<span className="text-xs font-semibold uppercase tracking-wide text-muted-foreground">
|
||||||
|
{label}
|
||||||
|
</span>
|
||||||
|
<span className="text-xs text-muted-foreground">
|
||||||
|
{stats.snapshotsCount} snapshot{stats.snapshotsCount === 1 ? '' : 's'}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<dl className="grid grid-cols-2 gap-x-3 gap-y-0.5 text-xs font-mono">
|
||||||
|
<dt className="text-muted-foreground">restore size</dt>
|
||||||
|
<dd className="text-right">{formatBytes(stats.restoreSizeBytes)}</dd>
|
||||||
|
<dt className="text-muted-foreground">raw data</dt>
|
||||||
|
<dd className="text-right">{formatBytes(stats.rawDataBytes)}</dd>
|
||||||
|
<dt className="text-muted-foreground">dedup ratio</dt>
|
||||||
|
<dd className="text-right">{dedup}</dd>
|
||||||
|
</dl>
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
function SnapshotsTable({
|
||||||
|
snapshots,
|
||||||
|
label,
|
||||||
|
error,
|
||||||
|
}: {
|
||||||
|
snapshots: ResticSnapshot[]
|
||||||
|
label: string
|
||||||
|
error?: string
|
||||||
|
}) {
|
||||||
|
return (
|
||||||
|
<div className="space-y-2">
|
||||||
|
<div className="flex items-center justify-between">
|
||||||
|
<h3 className="text-sm font-semibold">{label}</h3>
|
||||||
|
<span className="text-xs text-muted-foreground">{snapshots.length} shown</span>
|
||||||
|
</div>
|
||||||
|
{error ? (
|
||||||
|
<div className="rounded-lg border border-destructive/50 bg-destructive/10 p-3 text-xs text-destructive">
|
||||||
|
{error}
|
||||||
|
</div>
|
||||||
|
) : snapshots.length === 0 ? (
|
||||||
|
<div className="rounded-lg border border-border px-4 py-6 text-xs text-muted-foreground text-center">
|
||||||
|
No snapshots in this repo yet.
|
||||||
|
</div>
|
||||||
|
) : (
|
||||||
|
<div className="rounded-lg border border-border overflow-hidden">
|
||||||
|
<table className="w-full text-xs font-mono">
|
||||||
|
<thead>
|
||||||
|
<tr className="border-b border-border bg-muted/30">
|
||||||
|
<th className="text-left px-3 py-2 font-medium text-muted-foreground">Time</th>
|
||||||
|
<th className="text-left px-3 py-2 font-medium text-muted-foreground">ID</th>
|
||||||
|
<th className="text-left px-3 py-2 font-medium text-muted-foreground">Tags</th>
|
||||||
|
<th className="text-right px-3 py-2 font-medium text-muted-foreground">
|
||||||
|
Files / size added
|
||||||
|
</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{snapshots.map((s, i) => (
|
||||||
|
<tr key={s.id} className={i % 2 === 0 ? '' : 'bg-muted/10'}>
|
||||||
|
<td className="px-3 py-1.5 text-muted-foreground">{formatTimestamp(s.time)}</td>
|
||||||
|
<td className="px-3 py-1.5">{s.shortId}</td>
|
||||||
|
<td className="px-3 py-1.5 text-muted-foreground truncate max-w-[12rem]">
|
||||||
|
{s.tags.join(', ') || '—'}
|
||||||
|
</td>
|
||||||
|
<td className="px-3 py-1.5 text-right text-muted-foreground">
|
||||||
|
{s.summary?.files_new != null
|
||||||
|
? `${s.summary.files_new} new · ${formatBytes(s.summary.data_added ?? 0)}`
|
||||||
|
: '—'}
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
))}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
export default function ServerBackupSection({
|
||||||
|
envelope,
|
||||||
|
nasSnapshots,
|
||||||
|
b2Snapshots,
|
||||||
|
nasStats,
|
||||||
|
b2Stats,
|
||||||
|
errors,
|
||||||
|
}: Props) {
|
||||||
|
const [pending, setPending] = useState<ActiveFlow>(null)
|
||||||
|
const [completedFlowRunId, setCompletedFlowRunId] = useState<string | null>(null)
|
||||||
|
const [activeFlow, setActiveFlow] = useState<ActiveFlow>(null)
|
||||||
|
|
||||||
|
const handleComplete = useCallback((flowRunId: string) => {
|
||||||
|
setCompletedFlowRunId(flowRunId)
|
||||||
|
}, [])
|
||||||
|
|
||||||
|
const flowRun = useFlowRun(handleComplete)
|
||||||
|
|
||||||
|
const startFlow = useCallback(
|
||||||
|
(kind: 'backup' | 'restore') => {
|
||||||
|
setPending(null)
|
||||||
|
setCompletedFlowRunId(null)
|
||||||
|
setActiveFlow(kind)
|
||||||
|
flowRun.startFlow(
|
||||||
|
kind === 'backup' ? 'server_backup_full' : 'server_backup_restore_test',
|
||||||
|
false,
|
||||||
|
)
|
||||||
|
},
|
||||||
|
[flowRun],
|
||||||
|
)
|
||||||
|
|
||||||
|
const handleReset = useCallback(() => {
|
||||||
|
flowRun.reset()
|
||||||
|
setCompletedFlowRunId(null)
|
||||||
|
setActiveFlow(null)
|
||||||
|
}, [flowRun])
|
||||||
|
|
||||||
|
return (
|
||||||
|
<section className="space-y-6">
|
||||||
|
<div className="flex items-baseline justify-between">
|
||||||
|
<h2 className="text-lg font-semibold tracking-tight">Server backup (restic)</h2>
|
||||||
|
<span className="text-xs text-muted-foreground">flows: server_backup_full · restore_test</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div className="rounded-lg border border-border p-5 space-y-3">
|
||||||
|
<p className="text-sm text-muted-foreground">
|
||||||
|
Daily server-wide backup at 03:30: <code className="font-mono text-xs">pg_dumpall</code> +
|
||||||
|
Forgejo dump, then restic to <strong>NAS</strong> (local) and <strong>Backblaze B2</strong>{' '}
|
||||||
|
(offsite, Object Lock). Authoritative restore sources are the database dumps; live datadirs
|
||||||
|
are excluded. See{' '}
|
||||||
|
<Link
|
||||||
|
href="https://github.com/Madhura68/Ops-dashboard/blob/main/docs/runbooks/server-backup.md"
|
||||||
|
className="underline hover:text-foreground"
|
||||||
|
>
|
||||||
|
docs/runbooks/server-backup.md
|
||||||
|
</Link>{' '}
|
||||||
|
for the full procedure.
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<StatusCard status={envelope.lastRun} />
|
||||||
|
{errors.status && (
|
||||||
|
<div className="rounded-lg border border-amber-500/50 bg-amber-500/10 p-3 text-xs text-amber-500">
|
||||||
|
Could not read backup status: {errors.status}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
<div className="grid gap-3 md:grid-cols-2">
|
||||||
|
<StatsBlock stats={nasStats} label="NAS repo" error={errors.nasStats} />
|
||||||
|
<StatsBlock stats={b2Stats} label="B2 repo" error={errors.b2Stats} />
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div className="flex items-center gap-3 flex-wrap">
|
||||||
|
<button
|
||||||
|
onClick={() => setPending('backup')}
|
||||||
|
disabled={flowRun.status === 'running'}
|
||||||
|
className="rounded-lg bg-foreground text-background px-4 py-2 text-sm font-medium hover:opacity-90 disabled:opacity-50 transition-opacity"
|
||||||
|
>
|
||||||
|
Backup now
|
||||||
|
</button>
|
||||||
|
<button
|
||||||
|
onClick={() => setPending('restore')}
|
||||||
|
disabled={flowRun.status === 'running'}
|
||||||
|
className="rounded-lg border border-border px-4 py-2 text-sm font-medium hover:bg-muted/50 disabled:opacity-50 transition-colors"
|
||||||
|
>
|
||||||
|
Run restore test
|
||||||
|
</button>
|
||||||
|
{flowRun.status !== 'idle' && flowRun.status !== 'running' && (
|
||||||
|
<button
|
||||||
|
onClick={handleReset}
|
||||||
|
className="text-xs text-muted-foreground hover:text-foreground transition-colors"
|
||||||
|
>
|
||||||
|
Reset
|
||||||
|
</button>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{flowRun.status !== 'idle' && (
|
||||||
|
<div className="space-y-2">
|
||||||
|
<div className="flex items-center justify-between">
|
||||||
|
<span className="text-sm font-medium">
|
||||||
|
Output {activeFlow ? `(${activeFlow === 'backup' ? 'backup' : 'restore test'})` : ''}
|
||||||
|
</span>
|
||||||
|
{completedFlowRunId && (
|
||||||
|
<Link
|
||||||
|
href={`/audit/${completedFlowRunId}`}
|
||||||
|
className="text-xs text-muted-foreground hover:text-foreground transition-colors"
|
||||||
|
>
|
||||||
|
View in audit log →
|
||||||
|
</Link>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
<StreamingTerminal
|
||||||
|
lines={flowRun.lines}
|
||||||
|
status={flowRun.status}
|
||||||
|
error={flowRun.error}
|
||||||
|
/>
|
||||||
|
{flowRun.status === 'done' && (
|
||||||
|
<p className="text-xs text-muted-foreground">
|
||||||
|
Reload this page to see the updated status, snapshots, and stats.
|
||||||
|
</p>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
<div className="grid gap-6 lg:grid-cols-2">
|
||||||
|
<SnapshotsTable
|
||||||
|
snapshots={nasSnapshots}
|
||||||
|
label="NAS snapshots"
|
||||||
|
error={errors.nasSnapshots}
|
||||||
|
/>
|
||||||
|
<SnapshotsTable
|
||||||
|
snapshots={b2Snapshots}
|
||||||
|
label="B2 snapshots"
|
||||||
|
error={errors.b2Snapshots}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{envelope.lastRestoreTest && (
|
||||||
|
<div className="rounded-lg border border-border p-4 space-y-2">
|
||||||
|
<div className="flex items-center justify-between flex-wrap gap-2">
|
||||||
|
<h3 className="text-sm font-semibold">Last restore test</h3>
|
||||||
|
<span
|
||||||
|
className={`inline-flex items-center rounded-md border px-2 py-0.5 text-xs font-medium uppercase tracking-wide ${overallBadgeClass(envelope.lastRestoreTest.overallStatus)}`}
|
||||||
|
>
|
||||||
|
{envelope.lastRestoreTest.overallStatus.replace('_', ' ')}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<p className="text-xs text-muted-foreground">
|
||||||
|
{formatTimestamp(envelope.lastRestoreTest.completedAt)} · repo{' '}
|
||||||
|
<code className="font-mono">{envelope.lastRestoreTest.repo}</code> · snapshot{' '}
|
||||||
|
<code className="font-mono">
|
||||||
|
{envelope.lastRestoreTest.snapshotId?.slice(0, 8) ?? '—'}
|
||||||
|
</code>{' '}
|
||||||
|
· {envelope.lastRestoreTest.assertions.length} assertions
|
||||||
|
</p>
|
||||||
|
{envelope.lastRestoreTest.assertions.some((a) => a.status !== 'ok') && (
|
||||||
|
<ul className="space-y-0.5">
|
||||||
|
{envelope.lastRestoreTest.assertions
|
||||||
|
.filter((a) => a.status !== 'ok')
|
||||||
|
.map((a) => (
|
||||||
|
<li key={a.path} className="text-xs font-mono text-amber-500">
|
||||||
|
{a.status === 'missing' ? '✗ missing' : '! empty'} · {a.path}
|
||||||
|
</li>
|
||||||
|
))}
|
||||||
|
</ul>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
<ConfirmDialog
|
||||||
|
open={pending === 'backup'}
|
||||||
|
title="Trigger server backup"
|
||||||
|
commandPreview={
|
||||||
|
'flow: server_backup_full\n\nSteps:\n 1. trigger_server_backup (systemctl start server-backup.service)\n 2. tail_backup_log_today\n 3. read_backup_status\n\nThe actual work happens in systemd; this flow kicks it off and tails the log.'
|
||||||
|
}
|
||||||
|
onConfirm={() => startFlow('backup')}
|
||||||
|
onCancel={() => setPending(null)}
|
||||||
|
/>
|
||||||
|
<ConfirmDialog
|
||||||
|
open={pending === 'restore'}
|
||||||
|
title="Run restore test (NAS)"
|
||||||
|
commandPreview={
|
||||||
|
'flow: server_backup_restore_test\n\nSteps:\n 1. trigger_restore_test (restore latest NAS snapshot to /tmp/restore-test/)\n 2. read_backup_status\n\nNon-destructive — restores into /tmp only and asserts critical files exist.'
|
||||||
|
}
|
||||||
|
onConfirm={() => startFlow('restore')}
|
||||||
|
onCancel={() => setPending(null)}
|
||||||
|
/>
|
||||||
|
</section>
|
||||||
|
)
|
||||||
|
}
|
191  app/settings/backups/_lib/parse.ts  Normal file
@@ -0,0 +1,191 @@
import type {
  BackupPhase,
  BackupStatus,
  BackupStatusEnvelope,
  OverallStatus,
  PhaseStatus,
  ResticSnapshot,
  ResticStats,
  RestoreTestAssertion,
  RestoreTestStatus,
} from './types'

const PHASE_ORDER = [
  'postgres_dump',
  'forgejo_dump',
  'forgejo_db_dump',
  'restic_nas',
  'restic_b2',
  'forget_nas',
  'check_nas',
  'check_b2',
] as const

function isRecord(v: unknown): v is Record<string, unknown> {
  return typeof v === 'object' && v !== null && !Array.isArray(v)
}

function asString(v: unknown): string | null {
  return typeof v === 'string' ? v : null
}

function asNumber(v: unknown): number | null {
  return typeof v === 'number' && Number.isFinite(v) ? v : null
}

function asPhaseStatus(v: unknown): PhaseStatus {
  if (
    v === 'success' ||
    v === 'skipped' ||
    v === 'degraded' ||
    v === 'failed' ||
    v === 'pending'
  ) {
    return v
  }
  return 'pending'
}

function asOverallStatus(v: unknown): OverallStatus {
  if (v === 'success' || v === 'partial_failure' || v === 'failed') return v
  return 'unknown'
}

function parsePhase(name: string, raw: unknown): BackupPhase {
  if (!isRecord(raw)) {
    return {
      name,
      status: 'pending',
      exitCode: null,
      startedAt: null,
      completedAt: null,
      error: null,
    }
  }
  return {
    name,
    status: asPhaseStatus(raw.status),
    exitCode: asNumber(raw.exit_code),
    startedAt: asString(raw.started_at),
    completedAt: asString(raw.completed_at),
    error: asString(raw.error),
    snapshotId: asString(raw.snapshot_id) ?? undefined,
    filesNew: asNumber(raw.files_new),
    dataAddedBytes: asNumber(raw.data_added_bytes),
    outputFile: asString(raw.output_file) ?? undefined,
    bytes: asNumber(raw.bytes),
  }
}

function parseBackupStatus(raw: unknown): BackupStatus | null {
  if (!isRecord(raw)) return null
  const phasesRaw = isRecord(raw.phases) ? raw.phases : {}
  const phases = PHASE_ORDER.map((name) => parsePhase(name, phasesRaw[name]))
  return {
    schemaVersion: asNumber(raw.schema_version) ?? 1,
    overallStatus: asOverallStatus(raw.overall_status),
    startedAt: asString(raw.started_at) ?? '',
    completedAt: asString(raw.completed_at) ?? '',
    durationSeconds: asNumber(raw.duration_seconds) ?? 0,
    host: asString(raw.host) ?? '',
    phases,
  }
}

function parseRestoreTestAssertion(raw: unknown): RestoreTestAssertion | null {
  if (!isRecord(raw)) return null
  const status = raw.status
  if (status !== 'ok' && status !== 'empty' && status !== 'missing') return null
  return {
    path: asString(raw.path) ?? '',
    status,
    bytes: asNumber(raw.bytes) ?? 0,
  }
}

function parseRestoreTestStatus(raw: unknown): RestoreTestStatus | null {
  if (!isRecord(raw)) return null
  const assertionsRaw = Array.isArray(raw.assertions) ? raw.assertions : []
  const assertions: RestoreTestAssertion[] = []
  for (const a of assertionsRaw) {
    const parsed = parseRestoreTestAssertion(a)
    if (parsed) assertions.push(parsed)
  }
  return {
    schemaVersion: asNumber(raw.schema_version) ?? 1,
    overallStatus: asOverallStatus(raw.overall_status),
    startedAt: asString(raw.started_at) ?? '',
    completedAt: asString(raw.completed_at) ?? '',
    durationSeconds: asNumber(raw.duration_seconds) ?? 0,
    repo: asString(raw.repo) ?? '',
    snapshotId: asString(raw.snapshot_id),
    restoreExitCode: asNumber(raw.restore_exit_code),
    target: asString(raw.target) ?? undefined,
    assertions,
    error: asString(raw.error) ?? undefined,
  }
}

export function parseStatusEnvelope(output: string): BackupStatusEnvelope {
  try {
    const trimmed = output.trim()
    if (!trimmed) return { lastRun: null, lastRestoreTest: null }
    const parsed: unknown = JSON.parse(trimmed)
    if (!isRecord(parsed)) return { lastRun: null, lastRestoreTest: null }
    return {
      lastRun: parseBackupStatus(parsed.last_run),
      lastRestoreTest: parseRestoreTestStatus(parsed.last_restore_test),
    }
  } catch {
    return { lastRun: null, lastRestoreTest: null }
  }
}
|
||||||
|
|
||||||
|
export function parseResticSnapshots(output: string, repo: 'nas' | 'b2'): ResticSnapshot[] {
|
||||||
|
try {
|
||||||
|
const trimmed = output.trim()
|
||||||
|
if (!trimmed) return []
|
||||||
|
const parsed: unknown = JSON.parse(trimmed)
|
||||||
|
if (!Array.isArray(parsed)) return []
|
||||||
|
const result: ResticSnapshot[] = []
|
||||||
|
for (const s of parsed) {
|
||||||
|
if (!isRecord(s)) continue
|
||||||
|
const id = asString(s.id)
|
||||||
|
if (!id) continue
|
||||||
|
const shortId = asString(s.short_id) ?? id.slice(0, 8)
|
||||||
|
const time = asString(s.time) ?? ''
|
||||||
|
const hostname = asString(s.hostname) ?? ''
|
||||||
|
const tags = Array.isArray(s.tags)
|
||||||
|
? s.tags.filter((t): t is string => typeof t === 'string')
|
||||||
|
: []
|
||||||
|
const paths = Array.isArray(s.paths)
|
||||||
|
? s.paths.filter((p): p is string => typeof p === 'string')
|
||||||
|
: []
|
||||||
|
const summary = isRecord(s.summary) ? (s.summary as ResticSnapshot['summary']) : null
|
||||||
|
result.push({ id, shortId, time, hostname, tags, paths, repo, summary })
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
} catch {
|
||||||
|
return []
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export function parseResticStats(output: string, repo: 'nas' | 'b2'): ResticStats | null {
|
||||||
|
try {
|
||||||
|
const trimmed = output.trim()
|
||||||
|
if (!trimmed) return null
|
||||||
|
const parsed: unknown = JSON.parse(trimmed)
|
||||||
|
if (!isRecord(parsed)) return null
|
||||||
|
return {
|
||||||
|
repo,
|
||||||
|
snapshotsCount: asNumber(parsed.snapshots_count) ?? 0,
|
||||||
|
restoreSizeBytes: asNumber(parsed.restore_size_bytes),
|
||||||
|
restoreSizeFiles: asNumber(parsed.restore_size_files),
|
||||||
|
rawDataBytes: asNumber(parsed.raw_data_bytes),
|
||||||
|
rawBlobCount: asNumber(parsed.raw_blob_count),
|
||||||
|
dedupRatio: asNumber(parsed.dedup_ratio),
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
return null
|
||||||
|
}
|
||||||
|
}
|
||||||
78  app/settings/backups/_lib/types.ts  Normal file
|
|
@ -0,0 +1,78 @@
export type PhaseStatus = 'success' | 'skipped' | 'degraded' | 'failed' | 'pending'
export type OverallStatus = 'success' | 'partial_failure' | 'failed' | 'unknown'

export interface BackupPhase {
  name: string
  status: PhaseStatus
  exitCode: number | null
  startedAt: string | null
  completedAt: string | null
  error: string | null
  snapshotId?: string
  filesNew?: number | null
  dataAddedBytes?: number | null
  outputFile?: string
  bytes?: number | null
}

export interface BackupStatus {
  schemaVersion: number
  overallStatus: OverallStatus
  startedAt: string
  completedAt: string
  durationSeconds: number
  host: string
  phases: BackupPhase[]
}

export interface RestoreTestAssertion {
  path: string
  status: 'ok' | 'empty' | 'missing'
  bytes: number
}

export interface RestoreTestStatus {
  schemaVersion: number
  overallStatus: OverallStatus
  startedAt: string
  completedAt: string
  durationSeconds: number
  repo: string
  snapshotId: string | null
  restoreExitCode: number | null
  target?: string
  assertions: RestoreTestAssertion[]
  error?: string
}

export interface BackupStatusEnvelope {
  lastRun: BackupStatus | null
  lastRestoreTest: RestoreTestStatus | null
}

export interface ResticSnapshot {
  id: string
  shortId: string
  time: string
  hostname: string
  tags: string[]
  paths: string[]
  repo: 'nas' | 'b2'
  summary?: {
    files_new?: number
    files_changed?: number
    data_added?: number
    total_files_processed?: number
    total_bytes_processed?: number
  } | null
}

export interface ResticStats {
  repo: 'nas' | 'b2'
  snapshotsCount: number
  restoreSizeBytes: number | null
  restoreSizeFiles: number | null
  rawDataBytes: number | null
  rawBlobCount: number | null
  dedupRatio: number | null
}
|
||||||
|
|
@ -3,6 +3,16 @@ import { redirect } from 'next/navigation'
|
||||||
import { getCurrentUser } from '@/lib/session'
|
import { getCurrentUser } from '@/lib/session'
|
||||||
import { execAgent } from '@/lib/agent-client'
|
import { execAgent } from '@/lib/agent-client'
|
||||||
import BackupsPanel from './_components/backups-panel'
|
import BackupsPanel from './_components/backups-panel'
|
||||||
|
import {
|
||||||
|
parseResticSnapshots,
|
||||||
|
parseResticStats,
|
||||||
|
parseStatusEnvelope,
|
||||||
|
} from './_lib/parse'
|
||||||
|
import type {
|
||||||
|
BackupStatusEnvelope,
|
||||||
|
ResticSnapshot,
|
||||||
|
ResticStats,
|
||||||
|
} from './_lib/types'
|
||||||
|
|
||||||
export const dynamic = 'force-dynamic'
|
export const dynamic = 'force-dynamic'
|
||||||
|
|
||||||
|
|
@ -27,23 +37,74 @@ function parseBackupList(output: string): BackupFile[] {
|
||||||
.filter((b) => b.name)
|
.filter((b) => b.name)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function errorMessage(err: unknown): string {
|
||||||
|
return err instanceof Error ? err.message : 'agent call failed'
|
||||||
|
}
|
||||||
|
|
||||||
|
async function tryExec(command: string): Promise<{ output: string | null; error: string | null }> {
|
||||||
|
try {
|
||||||
|
const output = await execAgent(command)
|
||||||
|
return { output, error: null }
|
||||||
|
} catch (err) {
|
||||||
|
return { output: null, error: errorMessage(err) }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
export default async function BackupsPage() {
|
export default async function BackupsPage() {
|
||||||
const user = await getCurrentUser()
|
const user = await getCurrentUser()
|
||||||
if (!user) redirect('/login')
|
if (!user) redirect('/login')
|
||||||
|
|
||||||
let backups: BackupFile[] = []
|
// Run all agent calls in parallel; per-call error isolation so one failure
|
||||||
let listError: string | null = null
|
// does not blank the entire page.
|
||||||
|
const [
|
||||||
|
backupListResult,
|
||||||
|
statusResult,
|
||||||
|
nasSnapshotsResult,
|
||||||
|
b2SnapshotsResult,
|
||||||
|
nasStatsResult,
|
||||||
|
b2StatsResult,
|
||||||
|
] = await Promise.all([
|
||||||
|
tryExec('list_ops_backups'),
|
||||||
|
tryExec('read_backup_status'),
|
||||||
|
tryExec('restic_snapshots_nas'),
|
||||||
|
tryExec('restic_snapshots_b2'),
|
||||||
|
tryExec('restic_stats_nas'),
|
||||||
|
tryExec('restic_stats_b2'),
|
||||||
|
])
|
||||||
|
|
||||||
try {
|
const backups: BackupFile[] = backupListResult.output
|
||||||
const output = await execAgent('list_ops_backups')
|
? parseBackupList(backupListResult.output)
|
||||||
backups = parseBackupList(output)
|
: []
|
||||||
} catch (err) {
|
const listError = backupListResult.error
|
||||||
listError = err instanceof Error ? err.message : 'failed to list backups'
|
|
||||||
|
const envelope: BackupStatusEnvelope = statusResult.output
|
||||||
|
? parseStatusEnvelope(statusResult.output)
|
||||||
|
: { lastRun: null, lastRestoreTest: null }
|
||||||
|
|
||||||
|
const nasSnapshots: ResticSnapshot[] = nasSnapshotsResult.output
|
||||||
|
? parseResticSnapshots(nasSnapshotsResult.output, 'nas')
|
||||||
|
: []
|
||||||
|
const b2Snapshots: ResticSnapshot[] = b2SnapshotsResult.output
|
||||||
|
? parseResticSnapshots(b2SnapshotsResult.output, 'b2')
|
||||||
|
: []
|
||||||
|
const nasStats: ResticStats | null = nasStatsResult.output
|
||||||
|
? parseResticStats(nasStatsResult.output, 'nas')
|
||||||
|
: null
|
||||||
|
const b2Stats: ResticStats | null = b2StatsResult.output
|
||||||
|
? parseResticStats(b2StatsResult.output, 'b2')
|
||||||
|
: null
|
||||||
|
|
||||||
|
const serverBackupErrors = {
|
||||||
|
status: statusResult.error ?? undefined,
|
||||||
|
nasSnapshots: nasSnapshotsResult.error ?? undefined,
|
||||||
|
b2Snapshots: b2SnapshotsResult.error ?? undefined,
|
||||||
|
nasStats: nasStatsResult.error ?? undefined,
|
||||||
|
b2Stats: b2StatsResult.error ?? undefined,
|
||||||
}
|
}
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div className="min-h-screen bg-background p-6">
|
<div className="min-h-screen bg-background p-6">
|
||||||
<div className="mx-auto max-w-4xl space-y-6">
|
<div className="mx-auto max-w-6xl space-y-6">
|
||||||
<div className="flex items-center gap-3">
|
<div className="flex items-center gap-3">
|
||||||
<Link href="/" className="text-sm text-muted-foreground hover:text-foreground">
|
<Link href="/" className="text-sm text-muted-foreground hover:text-foreground">
|
||||||
← Home
|
← Home
|
||||||
|
|
@ -52,7 +113,16 @@ export default async function BackupsPage() {
|
||||||
<h1 className="text-2xl font-semibold tracking-tight">Backups</h1>
|
<h1 className="text-2xl font-semibold tracking-tight">Backups</h1>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<BackupsPanel backups={backups} listError={listError} />
|
<BackupsPanel
|
||||||
|
backups={backups}
|
||||||
|
listError={listError}
|
||||||
|
envelope={envelope}
|
||||||
|
nasSnapshots={nasSnapshots}
|
||||||
|
b2Snapshots={b2Snapshots}
|
||||||
|
nasStats={nasStats}
|
||||||
|
b2Stats={b2Stats}
|
||||||
|
serverBackupErrors={serverBackupErrors}
|
||||||
|
/>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
)
|
)
|
||||||
|
|
|
||||||
291  app/worker-logs/_components/run-log-detail.tsx  Normal file
|
|
@ -0,0 +1,291 @@
|
||||||
|
'use client'
|
||||||
|
|
||||||
|
import { useCallback, useEffect, useState, type ReactElement } from 'react'
|
||||||
|
import type { LogEvent, MetaTag, ParsedRunLog } from '@/lib/parse-worker-log'
|
||||||
|
import { cn, formatDuration } from '@/lib/utils'
|
||||||
|
|
||||||
|
async function fetchDetail(fileName: string): Promise<ParsedRunLog> {
|
||||||
|
const res = await fetch(`/api/worker-logs/${encodeURIComponent(fileName)}`, { cache: 'no-store' })
|
||||||
|
const body = await res.json().catch(() => ({}))
|
||||||
|
if (!res.ok) throw new Error(body?.error ?? `request failed (${res.status})`)
|
||||||
|
return body as ParsedRunLog
|
||||||
|
}
|
||||||
|
|
||||||
|
const META_TAG_STYLES: Record<MetaTag, string> = {
|
||||||
|
claim: 'text-muted-foreground',
|
||||||
|
auth: 'text-muted-foreground',
|
||||||
|
quota: 'text-muted-foreground',
|
||||||
|
'no-job': 'text-muted-foreground',
|
||||||
|
claimed: 'text-blue-600 dark:text-blue-400',
|
||||||
|
worktree: 'text-muted-foreground',
|
||||||
|
config: 'text-blue-600 dark:text-blue-400',
|
||||||
|
payload: 'text-muted-foreground',
|
||||||
|
spawn: 'text-blue-600 dark:text-blue-400',
|
||||||
|
'claude-done': 'text-blue-600 dark:text-blue-400',
|
||||||
|
cleanup: 'text-muted-foreground',
|
||||||
|
exit: 'text-muted-foreground',
|
||||||
|
error: 'text-destructive',
|
||||||
|
'token-expired': 'text-destructive',
|
||||||
|
timeout: 'text-muted-foreground',
|
||||||
|
other: 'text-muted-foreground',
|
||||||
|
}
|
||||||
|
|
||||||
|
function timeOnly(ts: string | null): string {
|
||||||
|
if (!ts) return ''
|
||||||
|
const d = new Date(ts)
|
||||||
|
return isNaN(d.getTime()) ? '' : d.toLocaleTimeString()
|
||||||
|
}
|
||||||
|
|
||||||
|
function inputPreview(input: string): string {
|
||||||
|
const oneLine = input.replace(/\s+/g, ' ').trim()
|
||||||
|
return oneLine.length > 100 ? `${oneLine.slice(0, 100)}…` : oneLine
|
||||||
|
}
|
||||||
|
|
||||||
|
function TruncNote({ chars }: { chars?: number }) {
|
||||||
|
return (
|
||||||
|
<div className="mt-0.5 text-[11px] italic text-muted-foreground">
|
||||||
|
— afgekapt{chars != null ? ` (${chars} chars totaal)` : ''}
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
function EventBlock({ event }: { event: LogEvent }): ReactElement {
|
||||||
|
switch (event.kind) {
|
||||||
|
case 'meta':
|
||||||
|
return (
|
||||||
|
<div className="flex gap-2 py-0.5 font-mono text-[11px] leading-relaxed">
|
||||||
|
<span className="shrink-0 text-muted-foreground/60">{timeOnly(event.ts)}</span>
|
||||||
|
<span className={cn('shrink-0 uppercase tracking-wide', META_TAG_STYLES[event.tag])}>
|
||||||
|
{event.tag}
|
||||||
|
</span>
|
||||||
|
<span className="break-all text-muted-foreground">{event.text}</span>
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
|
||||||
|
case 'system-init':
|
||||||
|
return (
|
||||||
|
<div className="my-2 rounded-lg border border-border bg-card p-3 text-xs">
|
||||||
|
<div className="mb-1 font-medium text-foreground">Sessie gestart</div>
|
||||||
|
<div className="grid grid-cols-2 gap-x-4 gap-y-1 text-muted-foreground sm:grid-cols-3">
|
||||||
|
<div>
|
||||||
|
<span className="text-foreground/70">model</span> {event.model}
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<span className="text-foreground/70">permission</span> {event.permissionMode}
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<span className="text-foreground/70">claude</span> v{event.version || '?'}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{event.cwd && (
|
||||||
|
<div className="mt-1 break-all font-mono text-[11px] text-muted-foreground">
|
||||||
|
cwd: {event.cwd}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
{(event.tools.length > 0 || event.mcpServers.length > 0) && (
|
||||||
|
<details className="mt-2">
|
||||||
|
<summary className="cursor-pointer text-muted-foreground hover:text-foreground">
|
||||||
|
{event.tools.length} tools · {event.mcpServers.length} MCP-server(s)
|
||||||
|
</summary>
|
||||||
|
<div className="mt-1 font-mono text-[11px] text-muted-foreground">
|
||||||
|
{event.mcpServers.length > 0 && <div>mcp: {event.mcpServers.join(', ')}</div>}
|
||||||
|
<div className="break-words">{event.tools.join(', ')}</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
|
||||||
|
case 'assistant-text':
|
||||||
|
return (
|
||||||
|
<div className="my-1.5 border-l-2 border-blue-300 pl-3 dark:border-blue-700">
|
||||||
|
<div className="whitespace-pre-wrap text-sm text-foreground">{event.text}</div>
|
||||||
|
{event.truncated && <TruncNote />}
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
|
||||||
|
case 'thinking':
|
||||||
|
return (
|
||||||
|
<details className="my-1 pl-3">
|
||||||
|
<summary className="cursor-pointer text-xs italic text-muted-foreground hover:text-foreground">
|
||||||
|
thinking…
|
||||||
|
</summary>
|
||||||
|
<div className="mt-1 whitespace-pre-wrap border-l-2 border-border pl-3 text-xs italic text-muted-foreground">
|
||||||
|
{event.text}
|
||||||
|
{event.truncated && <TruncNote />}
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
)
|
||||||
|
|
||||||
|
case 'tool-call':
|
||||||
|
return (
|
||||||
|
<details open className="my-1">
|
||||||
|
<summary className="cursor-pointer list-none">
|
||||||
|
<span className="inline-flex max-w-full items-center gap-2 rounded-md bg-muted px-2 py-1 text-xs">
|
||||||
|
<span className="shrink-0 font-medium text-foreground">▸ {event.name}</span>
|
||||||
|
<span className="truncate font-mono text-[11px] text-muted-foreground">
|
||||||
|
{inputPreview(event.input)}
|
||||||
|
</span>
|
||||||
|
</span>
|
||||||
|
</summary>
|
||||||
|
<pre className="ml-2 mt-1 overflow-x-auto rounded-md border border-border bg-muted/30 p-2 font-mono text-[11px] leading-relaxed">
|
||||||
|
{event.input}
|
||||||
|
</pre>
|
||||||
|
{event.truncated && <TruncNote />}
|
||||||
|
</details>
|
||||||
|
)
|
||||||
|
|
||||||
|
case 'tool-result':
|
||||||
|
return (
|
||||||
|
<details className="my-1">
|
||||||
|
<summary className="cursor-pointer list-none">
|
||||||
|
<span
|
||||||
|
className={cn(
|
||||||
|
'inline-flex items-center gap-2 rounded-md px-2 py-1 text-xs',
|
||||||
|
event.isError
|
||||||
|
? 'bg-destructive/10 text-destructive'
|
||||||
|
: 'bg-muted text-muted-foreground',
|
||||||
|
)}
|
||||||
|
>
|
||||||
|
<span>{event.isError ? '✕ result (error)' : '◂ result'}</span>
|
||||||
|
<span className="text-[11px] opacity-70">{event.fullLength} chars</span>
|
||||||
|
</span>
|
||||||
|
</summary>
|
||||||
|
<pre
|
||||||
|
className={cn(
|
||||||
|
'ml-2 mt-1 max-h-80 overflow-auto whitespace-pre-wrap break-all rounded-md border p-2 font-mono text-[11px] leading-relaxed',
|
||||||
|
event.isError ? 'border-destructive/30 bg-destructive/5' : 'border-border bg-muted/30',
|
||||||
|
)}
|
||||||
|
>
|
||||||
|
{event.body || '(body weggelaten — timeline ingekort)'}
|
||||||
|
</pre>
|
||||||
|
{event.truncated && <TruncNote chars={event.fullLength} />}
|
||||||
|
</details>
|
||||||
|
)
|
||||||
|
|
||||||
|
case 'rate-limit':
|
||||||
|
return (
|
||||||
|
<div className="my-1 text-xs">
|
||||||
|
<span className="rounded-md bg-amber-100 px-2 py-0.5 text-amber-800 dark:bg-amber-900/30 dark:text-amber-400">
|
||||||
|
rate limit: {event.status}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
|
||||||
|
case 'result':
|
||||||
|
return (
|
||||||
|
<div
|
||||||
|
className={cn(
|
||||||
|
'my-2 rounded-lg border p-3',
|
||||||
|
event.isError
|
||||||
|
? 'border-destructive/30 bg-destructive/10'
|
||||||
|
: 'border-green-300 bg-green-50 dark:border-green-800 dark:bg-green-900/20',
|
||||||
|
)}
|
||||||
|
>
|
||||||
|
<div className="flex flex-wrap items-center gap-3 text-xs">
|
||||||
|
<span className="font-medium text-foreground">Resultaat: {event.subtype}</span>
|
||||||
|
{event.durationMs != null && (
|
||||||
|
<span className="text-muted-foreground">{formatDuration(event.durationMs)}</span>
|
||||||
|
)}
|
||||||
|
{event.numTurns != null && (
|
||||||
|
<span className="text-muted-foreground">{event.numTurns} turns</span>
|
||||||
|
)}
|
||||||
|
{event.totalCostUsd != null && (
|
||||||
|
<span className="text-muted-foreground">${event.totalCostUsd.toFixed(2)}</span>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
{event.resultText && (
|
||||||
|
<div className="mt-2 whitespace-pre-wrap text-sm text-foreground">
|
||||||
|
{event.resultText}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
{event.resultTruncated && <TruncNote />}
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
|
||||||
|
case 'raw':
|
||||||
|
return (
|
||||||
|
<div className="break-all py-0.5 font-mono text-[11px] text-muted-foreground/70">
|
||||||
|
{event.text}
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export default function RunLogDetail({ fileName }: { fileName: string }) {
|
||||||
|
const [data, setData] = useState<ParsedRunLog | null>(null)
|
||||||
|
const [error, setError] = useState<string | null>(null)
|
||||||
|
const [loading, setLoading] = useState(true)
|
||||||
|
|
||||||
|
const load = useCallback(async () => {
|
||||||
|
try {
|
||||||
|
const d = await fetchDetail(fileName)
|
||||||
|
setData(d)
|
||||||
|
setError(null)
|
||||||
|
} catch (err) {
|
||||||
|
setError(err instanceof Error ? err.message : 'kon log niet laden')
|
||||||
|
} finally {
|
||||||
|
setLoading(false)
|
||||||
|
}
|
||||||
|
}, [fileName])
|
||||||
|
|
||||||
|
useEffect(() => {
|
||||||
|
setLoading(true)
|
||||||
|
setData(null)
|
||||||
|
setError(null)
|
||||||
|
load()
|
||||||
|
}, [load])
|
||||||
|
|
||||||
|
// Keep refreshing while the run is still in progress.
|
||||||
|
useEffect(() => {
|
||||||
|
if (!data?.inProgress) return
|
||||||
|
const id = setInterval(load, 5000)
|
||||||
|
return () => clearInterval(id)
|
||||||
|
}, [data?.inProgress, load])
|
||||||
|
|
||||||
|
if (loading) {
|
||||||
|
return <div className="animate-pulse text-xs text-muted-foreground">log laden…</div>
|
||||||
|
}
|
||||||
|
if (error) {
|
||||||
|
return (
|
||||||
|
<div className="rounded-md border border-destructive/30 bg-destructive/10 px-3 py-2 text-xs text-destructive">
|
||||||
|
{error}
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
if (!data) return null
|
||||||
|
|
||||||
|
const { summary, events } = data
|
||||||
|
|
||||||
|
return (
|
||||||
|
<div className="space-y-2">
|
||||||
|
<div className="flex flex-wrap items-center gap-3 text-xs text-muted-foreground">
|
||||||
|
<span className="font-mono text-foreground">{summary.fileName}</span>
|
||||||
|
{summary.jobId && <span className="font-mono">job {summary.jobId}</span>}
|
||||||
|
{summary.model && <span>{summary.model}</span>}
|
||||||
|
{summary.permissionMode && <span>{summary.permissionMode}</span>}
|
||||||
|
{summary.durationMs != null && <span>{formatDuration(summary.durationMs)}</span>}
|
||||||
|
{data.inProgress && (
|
||||||
|
<span className="animate-pulse text-amber-600 dark:text-amber-400">● running…</span>
|
||||||
|
)}
|
||||||
|
{data.responseTruncated && (
|
||||||
|
<span className="italic">timeline ingekort (zeer grote log)</span>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{summary.errorSummary && (
|
||||||
|
<div className="rounded-md border border-destructive/30 bg-destructive/10 px-3 py-2 text-xs text-destructive">
|
||||||
|
{summary.errorSummary}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
<div className="rounded-lg border border-border bg-background p-3">
|
||||||
|
{events.length === 0 ? (
|
||||||
|
<div className="text-xs text-muted-foreground">geen events</div>
|
||||||
|
) : (
|
||||||
|
events.map((event, i) => <EventBlock key={i} event={event} />)
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
202  app/worker-logs/_components/worker-logs-view.tsx  Normal file
|
|
@ -0,0 +1,202 @@
|
||||||
|
'use client'
|
||||||
|
|
||||||
|
import { Fragment, useCallback, useEffect, useState } from 'react'
|
||||||
|
import type { RunLogSummary, RunStatus } from '@/lib/parse-worker-log'
|
||||||
|
import { cn, formatDuration, relativeTime } from '@/lib/utils'
|
||||||
|
import RunLogDetail from './run-log-detail'
|
||||||
|
|
||||||
|
const LIMIT_OPTIONS = [10, 25, 50, 100]
|
||||||
|
const COLUMN_COUNT = 7
|
||||||
|
|
||||||
|
const STATUS_STYLES: Record<RunStatus, { badge: string; dot: string }> = {
|
||||||
|
idle: {
|
||||||
|
badge: 'bg-zinc-100 text-zinc-600 dark:bg-zinc-800 dark:text-zinc-400',
|
||||||
|
dot: 'bg-zinc-400 dark:bg-zinc-500',
|
||||||
|
},
|
||||||
|
running: {
|
||||||
|
badge: 'bg-amber-100 text-amber-700 dark:bg-amber-900/30 dark:text-amber-400',
|
||||||
|
dot: 'bg-amber-500 dark:bg-amber-400',
|
||||||
|
},
|
||||||
|
success: {
|
||||||
|
badge: 'bg-green-100 text-green-800 dark:bg-green-900/30 dark:text-green-400',
|
||||||
|
dot: 'bg-green-500 dark:bg-green-400',
|
||||||
|
},
|
||||||
|
error: {
|
||||||
|
badge: 'bg-red-100 text-red-700 dark:bg-red-900/30 dark:text-red-400',
|
||||||
|
dot: 'bg-red-500 dark:bg-red-400',
|
||||||
|
},
|
||||||
|
'token-expired': {
|
||||||
|
badge: 'bg-red-100 text-red-700 dark:bg-red-900/30 dark:text-red-400',
|
||||||
|
dot: 'bg-red-500 dark:bg-red-400',
|
||||||
|
},
|
||||||
|
unknown: {
|
||||||
|
badge: 'bg-zinc-100 text-zinc-600 dark:bg-zinc-800 dark:text-zinc-400',
|
||||||
|
dot: 'bg-zinc-400 dark:bg-zinc-500',
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
export function StatusBadge({ status }: { status: RunStatus }) {
|
||||||
|
const s = STATUS_STYLES[status]
|
||||||
|
return (
|
||||||
|
<span
|
||||||
|
className={cn(
|
||||||
|
'inline-flex items-center gap-1.5 rounded-full px-2 py-0.5 text-xs font-medium',
|
||||||
|
s.badge,
|
||||||
|
)}
|
||||||
|
>
|
||||||
|
<span className={cn('size-1.5 rounded-full', s.dot)} />
|
||||||
|
{status}
|
||||||
|
</span>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
async function fetchLogs(limit: number): Promise<RunLogSummary[]> {
|
||||||
|
const res = await fetch(`/api/worker-logs?limit=${limit}`, { cache: 'no-store' })
|
||||||
|
const body = await res.json().catch(() => ({}))
|
||||||
|
if (!res.ok) throw new Error(body?.error ?? `request failed (${res.status})`)
|
||||||
|
return (body.logs ?? []) as RunLogSummary[]
|
||||||
|
}
|
||||||
|
|
||||||
|
type Props = {
|
||||||
|
initialLogs: RunLogSummary[]
|
||||||
|
initialError: string | null
|
||||||
|
}
|
||||||
|
|
||||||
|
export default function WorkerLogsView({ initialLogs, initialError }: Props) {
|
||||||
|
const [logs, setLogs] = useState<RunLogSummary[]>(initialLogs)
|
||||||
|
const [limit, setLimit] = useState(10)
|
||||||
|
const [selected, setSelected] = useState<string | null>(null)
|
||||||
|
const [error, setError] = useState<string | null>(initialError)
|
||||||
|
const [refreshing, setRefreshing] = useState(false)
|
||||||
|
const [lastUpdated, setLastUpdated] = useState<Date>(new Date())
|
||||||
|
|
||||||
|
const refresh = useCallback(async () => {
|
||||||
|
setRefreshing(true)
|
||||||
|
try {
|
||||||
|
const data = await fetchLogs(limit)
|
||||||
|
setLogs(data)
|
||||||
|
setError(null)
|
||||||
|
setLastUpdated(new Date())
|
||||||
|
} catch (err) {
|
||||||
|
setError(err instanceof Error ? err.message : 'refresh failed')
|
||||||
|
} finally {
|
||||||
|
setRefreshing(false)
|
||||||
|
}
|
||||||
|
}, [limit])
|
||||||
|
|
||||||
|
useEffect(() => {
|
||||||
|
refresh()
|
||||||
|
const id = setInterval(refresh, 10000)
|
||||||
|
return () => clearInterval(id)
|
||||||
|
}, [refresh])
|
||||||
|
|
||||||
|
return (
|
||||||
|
<div className="space-y-4">
|
||||||
|
<div className="flex items-center justify-between">
|
||||||
|
<div className="flex items-center gap-2">
|
||||||
|
<span className="text-xs text-muted-foreground">toon</span>
|
||||||
|
{LIMIT_OPTIONS.map((opt) => (
|
||||||
|
<button
|
||||||
|
key={opt}
|
||||||
|
onClick={() => setLimit(opt)}
|
||||||
|
className={cn(
|
||||||
|
'rounded-md border px-2 py-1 text-xs transition-colors',
|
||||||
|
limit === opt
|
||||||
|
? 'border-foreground/30 bg-muted font-medium text-foreground'
|
||||||
|
: 'border-border text-muted-foreground hover:bg-muted/50',
|
||||||
|
)}
|
||||||
|
>
|
||||||
|
{opt}
|
||||||
|
</button>
|
||||||
|
))}
|
||||||
|
{refreshing && (
|
||||||
|
<span className="text-xs text-muted-foreground animate-pulse">refreshing…</span>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
<span className="text-xs text-muted-foreground">
|
||||||
|
updated {lastUpdated.toLocaleTimeString()} · auto-refreshes every 10s
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{error && (
|
||||||
|
<div className="rounded-lg border border-destructive/30 bg-destructive/10 px-4 py-3 text-sm text-destructive">
|
||||||
|
{error}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
<div className="overflow-x-auto rounded-lg border border-border">
|
||||||
|
<table className="w-full text-sm">
|
||||||
|
<thead>
|
||||||
|
<tr className="border-b border-border bg-muted/50">
|
||||||
|
<th className="px-4 py-3 text-left font-medium text-muted-foreground">Started</th>
|
||||||
|
<th className="px-4 py-3 text-left font-medium text-muted-foreground">Status</th>
|
||||||
|
<th className="px-4 py-3 text-left font-medium text-muted-foreground">Job</th>
|
||||||
|
<th className="px-4 py-3 text-left font-medium text-muted-foreground">Model</th>
|
||||||
|
<th className="px-4 py-3 text-left font-medium text-muted-foreground">Turns</th>
|
||||||
|
<th className="px-4 py-3 text-left font-medium text-muted-foreground">Duration</th>
|
||||||
|
<th className="px-4 py-3 text-left font-medium text-muted-foreground">Cost</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{logs.length === 0 && !error ? (
|
||||||
|
<tr>
|
||||||
|
<td colSpan={COLUMN_COUNT} className="px-4 py-8 text-center text-muted-foreground">
|
||||||
|
No worker runs found
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
) : (
|
||||||
|
logs.map((log) => {
|
||||||
|
const isSelected = selected === log.fileName
|
||||||
|
return (
|
||||||
|
<Fragment key={log.fileName}>
|
||||||
|
<tr
|
||||||
|
onClick={() => setSelected(isSelected ? null : log.fileName)}
|
||||||
|
title={log.errorSummary ?? undefined}
|
||||||
|
className={cn(
|
||||||
|
'cursor-pointer border-b border-border transition-colors',
|
||||||
|
isSelected ? 'bg-muted/50' : 'hover:bg-muted/30',
|
||||||
|
)}
|
||||||
|
>
|
||||||
|
<td className="px-4 py-3 text-xs">
|
||||||
|
{log.startedAt ? (
|
||||||
|
<span title={new Date(log.startedAt).toLocaleString()}>
|
||||||
|
{relativeTime(new Date(log.startedAt))}
|
||||||
|
</span>
|
||||||
|
) : (
|
||||||
|
<span className="font-mono">{log.runId}</span>
|
||||||
|
)}
|
||||||
|
</td>
|
||||||
|
<td className="px-4 py-3">
|
||||||
|
<StatusBadge status={log.status} />
|
||||||
|
</td>
|
||||||
|
<td className="px-4 py-3 font-mono text-xs text-muted-foreground">
|
||||||
|
{log.jobId ? `…${log.jobId.slice(-8)}` : '—'}
|
||||||
|
</td>
|
||||||
|
<td className="px-4 py-3 text-xs text-muted-foreground">{log.model ?? '—'}</td>
|
||||||
|
<td className="px-4 py-3 text-xs text-muted-foreground">
|
||||||
|
{log.numTurns ?? '—'}
|
||||||
|
</td>
|
||||||
|
<td className="px-4 py-3 text-xs text-muted-foreground">
|
||||||
|
{log.durationMs != null ? formatDuration(log.durationMs) : '—'}
|
||||||
|
</td>
|
||||||
|
<td className="px-4 py-3 text-xs text-muted-foreground">
|
||||||
|
{log.totalCostUsd != null ? `$${log.totalCostUsd.toFixed(2)}` : '—'}
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
{isSelected && (
|
||||||
|
<tr className="border-b border-border bg-muted/20">
|
||||||
|
<td colSpan={COLUMN_COUNT} className="px-4 py-4">
|
||||||
|
<RunLogDetail fileName={log.fileName} />
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
)}
|
||||||
|
</Fragment>
|
||||||
|
)
|
||||||
|
})
|
||||||
|
)}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
34  app/worker-logs/page.tsx  Normal file
|
|
@ -0,0 +1,34 @@
import { redirect } from 'next/navigation'
import { getCurrentUser } from '@/lib/session'
import { listRunLogs } from '@/lib/worker-logs'
import type { RunLogSummary } from '@/lib/parse-worker-log'
import WorkerLogsView from './_components/worker-logs-view'

export const dynamic = 'force-dynamic'

export default async function WorkerLogsPage() {
  const user = await getCurrentUser()
  if (!user) redirect('/login')

  let initialLogs: RunLogSummary[] = []
  let initialError: string | null = null
  try {
    initialLogs = await listRunLogs(10)
  } catch (err) {
    initialError = err instanceof Error ? err.message : 'Failed to read worker logs'
  }

  return (
    <div className="min-h-screen bg-background p-6">
      <div className="mx-auto max-w-6xl space-y-6">
        <div>
          <h1 className="text-2xl font-semibold tracking-tight">Worker Logs</h1>
          <p className="text-sm text-muted-foreground">
            Recente runs van de Scrum4Me-worker — klik een rij voor de uitgewerkte timeline
          </p>
        </div>
        <WorkerLogsView initialLogs={initialLogs} initialError={initialError} />
      </div>
    </div>
  )
}
|
||||||
|
|
@ -12,6 +12,7 @@ const NAV_ITEMS = [
|
||||||
   { href: '/caddy', label: 'Caddy' },
   { href: '/flows', label: 'Flows' },
   { href: '/audit', label: 'Audit' },
+  { href: '/worker-logs', label: 'Worker Logs' },
   { href: '/settings', label: 'Settings' },
 ]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,19 @@
|
||||||
 # /etc/sudoers.d/ops-agent
-# NOPASSWD for explicit systemctl restart invocations by the ops-agent service account.
-# Only the service names whitelisted in commands.yml are listed here.
+# NOPASSWD for explicit invocations by the ops-agent service account.
+# Only the service names + wrapper scripts whitelisted in commands.yml are listed here.
 # Installed by deploy/ops-agent/setup.sh.

 ops-agent ALL=(root) NOPASSWD: \
   /usr/bin/systemctl restart scrum4me-web, \
   /usr/bin/systemctl restart ops-agent, \
-  /usr/bin/systemctl restart caddy
+  /usr/bin/systemctl restart caddy, \
+  /srv/backups/scripts/wrappers/read-status.sh, \
+  /srv/backups/scripts/wrappers/restic-snapshots.sh nas, \
+  /srv/backups/scripts/wrappers/restic-snapshots.sh b2, \
+  /srv/backups/scripts/wrappers/restic-stats.sh nas, \
+  /srv/backups/scripts/wrappers/restic-stats.sh b2, \
+  /srv/backups/scripts/wrappers/restic-check.sh nas, \
+  /srv/backups/scripts/wrappers/restic-check.sh b2, \
+  /srv/backups/scripts/wrappers/trigger-backup.sh, \
+  /srv/backups/scripts/wrappers/trigger-restore-test.sh nas, \
+  /srv/backups/scripts/wrappers/trigger-restore-test.sh b2
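A quick, hedged sanity check of the resulting sudoers drop-in (standard sudo tooling only; nothing project-specific is assumed):

```bash
# Validate syntax of the drop-in and list what ops-agent may run without a password
sudo visudo -c -f /etc/sudoers.d/ops-agent
sudo -l -U ops-agent
```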
|
||||||
|
|
|
||||||
141  deploy/server-backup/README.md  Normal file
|
|
@ -0,0 +1,141 @@
|
||||||
|
# Server backup — deploy artefacts

Daily server-wide backup with restic to **NAS** (local) and **Backblaze B2** (offsite, Object Lock), including a structured status file that the ops dashboard can read.

The full description — prerequisites, B2 keys, Object Lock, Forgejo restore test, integrity schedule — lives in [`docs/runbooks/server-backup.md`](../../docs/runbooks/server-backup.md).

## Files

| File | Purpose | Location on host |
|---|---|---|
| `server-backup.sh` | main script (phase-based, flock, status file) | `/srv/backups/scripts/server-backup.sh` |
| `restore-test.sh` | restore latest snapshot + check critical files | `/srv/backups/scripts/restore-test.sh` |
| `server-backup.service` | systemd oneshot | `/etc/systemd/system/server-backup.service` |
| `server-backup.timer` | daily 03:30 + 10 min jitter | `/etc/systemd/system/server-backup.timer` |
| `restic-backup.env.example` | env template (repos, B2 keys, Forgejo) | copy to `/etc/restic-backup.env` |

Also to be created (not in this repo, because they are secrets):

- `/etc/restic-backup.password` — only the restic password (mode `0400 root:root`).

## Quick installation (see the runbook for full context)

```bash
# 1. Tools and directories
sudo apt update && sudo apt install -y restic jq

sudo mkdir -p /srv/backups/scripts /srv/backups/logs /srv/backups/status \
  /var/backups/databases
sudo chmod 0750 /srv/backups/logs /srv/backups/status

# 2. Install the scripts
sudo cp deploy/server-backup/server-backup.sh /srv/backups/scripts/
sudo cp deploy/server-backup/restore-test.sh /srv/backups/scripts/
sudo chmod 0750 /srv/backups/scripts/*.sh
sudo chown root:root /srv/backups/scripts/*.sh

# 3. Env + password
sudo cp deploy/server-backup/restic-backup.env.example /etc/restic-backup.env
sudo chmod 0600 /etc/restic-backup.env
sudo chown root:root /etc/restic-backup.env
# Generate the password — ALSO store it in your password manager.
sudo sh -c 'openssl rand -hex 24 > /etc/restic-backup.password'
sudo chmod 0400 /etc/restic-backup.password

# 4. Fill in /etc/restic-backup.env (RESTIC_REPO_NAS, RESTIC_REPO_B2,
#    B2_ACCOUNT_ID, B2_ACCOUNT_KEY, FORGEJO_*). See runbook parts A+B.

# 5. Initialise the repos (see runbook part C for Object Lock + key capabilities)
sudo -E bash -c 'set -a; . /etc/restic-backup.env; set +a; \
  export RESTIC_PASSWORD_FILE=/etc/restic-backup.password; \
  restic -r "$RESTIC_REPO_NAS" init && \
  restic -r "$RESTIC_REPO_B2" init'

# 6. Systemd
sudo cp deploy/server-backup/server-backup.service /etc/systemd/system/
sudo cp deploy/server-backup/server-backup.timer /etc/systemd/system/
sudo systemctl daemon-reload
sudo systemctl enable --now server-backup.timer
systemctl list-timers | grep server-backup

# 7. First run by hand (follow along via journalctl)
sudo systemctl start server-backup.service
journalctl -u server-backup.service -f
```
|
||||||
|
|
||||||
|
### Ops-agent wiring (after steps 1-7)

For the **/flows/server-backup** page and **/settings/backups** in the dashboard,
ops-agent also needs to know about the wrappers, commands, flow YAMLs and the
NOPASSWD sudoers rules. An idempotent install script takes care of that:

```bash
sudo bash deploy/server-backup/install-flows.sh
```

What it does set up (and what it deliberately does **not** do) is documented in the
script's header. Safe to re-run; backups of `commands.yml` and `sudoers.d/ops-agent`
are kept with a `.bak.<timestamp>` suffix. After that, the UI at
`/flows/server-backup` is ready to use.
|
||||||
|
|
||||||
|
## Verify

```bash
# Status file
sudo jq . /srv/backups/status/last-run.json

# Snapshots
sudo -E bash -c 'set -a; . /etc/restic-backup.env; set +a; \
  export RESTIC_PASSWORD_FILE=/etc/restic-backup.password; \
  restic -r "$RESTIC_REPO_NAS" snapshots; \
  restic -r "$RESTIC_REPO_B2" snapshots'

# Restore test (NAS, non-destructive — restores into /tmp/restore-test)
sudo /srv/backups/scripts/restore-test.sh nas
sudo jq . /srv/backups/status/last-restore-test.json
```
|
||||||
|
|
||||||
|
## Status-file schema

The script writes `/srv/backups/status/last-run.json` after every run (success or failure), atomically via a temp file + `mv`. The ops dashboard reads this file via `read_backup_status` (see `ops-agent/commands.yml.example`).

```json
{
  "schema_version": 1,
  "overall_status": "success | partial_failure | failed",
  "started_at": "2026-05-15T03:30:00+02:00",
  "completed_at": "2026-05-15T03:48:21+02:00",
  "duration_seconds": 1101,
  "host": "scrum4me-srv",
  "phases": {
    "postgres_dump": { "status": "success", "exit_code": 0, "...": "..." },
    "forgejo_dump": { "status": "skipped", "exit_code": 99, "...": "..." },
    "forgejo_db_dump": { "status": "skipped", "exit_code": 99 },
    "restic_nas": { "status": "success", "exit_code": 0, "snapshot_id": "abc123" },
    "restic_b2": { "status": "degraded", "exit_code": 3, "error": "1 file unreadable" },
    "forget_nas": { "status": "success", "exit_code": 0 },
    "check_nas": { "status": "success", "exit_code": 0 },
    "check_b2": { "status": "success", "exit_code": 0 }
  }
}
```
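A minimal sketch of the atomic write described above — stage the JSON next to the target, then `mv` it into place so readers never see a half-written file. `$status_json` is an illustrative placeholder, not the script's actual variable name.

```bash
# Stage the JSON in the same directory, then rename it into place atomically.
tmp=$(mktemp /srv/backups/status/last-run.json.XXXXXX)
printf '%s\n' "$status_json" > "$tmp"
chmod 0640 "$tmp"
mv "$tmp" /srv/backups/status/last-run.json
```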
Per-phase `status` values:

| status | meaning | counts as |
|---|---|---|
| `success` | exit 0 | success |
| `skipped` | exit 99 — phase not applicable (e.g. Forgejo not installed) | success |
| `degraded` | exit 3 — the restic snapshot was created but some files were unreadable | partial_failure |
| `failed` | any other non-zero exit | partial_failure or failed (see `overall_status`) |
| `pending` | phase never ran (script aborted before this phase) | partial_failure |

`overall_status` rules (a quick way to check them against the status file is sketched below):

- **`failed`** if `postgres_dump` fails (the DB dump is authoritative), or if **both** restic repos fail.
- **`partial_failure`** on any `failed` or `degraded` phase that is not critical (e.g. one restic repo down, or forgejo_dump failing while postgres succeeds).
- **`success`** if no phase is `failed` or `degraded`.
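A hedged `jq` spot-check of those rules; the field names come from the schema above, but which phases you treat as critical is your call:

```bash
# Overall verdict of the last run
sudo jq -r '.overall_status' /srv/backups/status/last-run.json

# Every phase that is not success/skipped, with its exit code and error (if any)
sudo jq -r '.phases | to_entries[]
       | select(.value.status != "success" and .value.status != "skipped")
       | "\(.key): \(.value.status) (exit \(.value.exit_code)) \(.value.error // "")"' \
  /srv/backups/status/last-run.json
```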
## Ordering relative to the existing `ops-db-backup.timer`

The existing `deploy/ops-agent/ops-db-backup.timer` runs at **02:00** and only does a `pg_dump ops_dashboard` into `/srv/ops/backups/`. The new `server-backup.timer` runs at **03:30** and sweeps that directory up into its restic backup. Both keep running side by side; a quick schedule check is sketched below.
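To confirm both timers are scheduled as described (plain systemd tooling, nothing project-specific):

```bash
systemctl list-timers --no-pager | grep -E 'ops-db-backup|server-backup'
```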
|
||||||
264  deploy/server-backup/install-flows.sh  Executable file
|
|
@ -0,0 +1,264 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# Idempotent installer that wires the server-backup flow into ops-agent.
|
||||||
|
#
|
||||||
|
# What this DOES install:
|
||||||
|
# 1. /srv/backups/scripts/wrappers/*.sh (wrapper scripts used by ops-agent)
|
||||||
|
# 2. /etc/ops-agent/flows/server_backup_*.yml (flow YAMLs for full + restore-test)
|
||||||
|
# 3. /etc/ops-agent/commands.yml (appends backup commands if missing)
|
||||||
|
# 4. /etc/sudoers.d/ops-agent (appends wrapper allowlist, visudo-validated)
|
||||||
|
# 5. systemctl restart ops-agent (pick up new commands/flows)
|
||||||
|
# 6. systemctl enable --now server-backup.timer (daily backup)
|
||||||
|
#
|
||||||
|
# What this DOES NOT do (do manually first — see README "Snelle installatie"):
|
||||||
|
# - Create /etc/restic-backup.env (with NAS path, B2 keys, Forgejo container name)
|
||||||
|
# - Create /etc/restic-backup.password
|
||||||
|
# - Initialise the restic repos (NAS + B2)
|
||||||
|
# - Install /srv/backups/scripts/{server-backup.sh,restore-test.sh}
|
||||||
|
# - Install /etc/systemd/system/server-backup.{service,timer}
|
||||||
|
#
|
||||||
|
# Re-run safe: each step checks for prior state and skips. Backups of mutated
|
||||||
|
# files (commands.yml, sudoers) are kept with a .bak.<timestamp> suffix.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# sudo bash deploy/server-backup/install-flows.sh
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Resolve repo root from this script's location, so it works regardless of cwd.
|
||||||
|
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
|
||||||
|
REPO="$(cd -- "$SCRIPT_DIR/../.." &>/dev/null && pwd)"
|
||||||
|
|
||||||
|
WRAPPERS_SRC="$REPO/deploy/server-backup/wrappers"
|
||||||
|
FLOWS_SRC="$REPO/ops-agent/flows.example"
|
||||||
|
COMMANDS_SRC="$REPO/ops-agent/commands.yml.example"
|
||||||
|
|
||||||
|
WRAPPERS_DST=/srv/backups/scripts/wrappers
|
||||||
|
FLOWS_DST=/etc/ops-agent/flows
|
||||||
|
COMMANDS_DST=/etc/ops-agent/commands.yml
|
||||||
|
SUDOERS_DST=/etc/sudoers.d/ops-agent
|
||||||
|
|
||||||
|
if [[ $EUID -ne 0 ]]; then
|
||||||
|
echo "ERROR: run as root (sudo)." >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
step() { echo; echo "── $* ──"; }
|
||||||
|
ok() { echo " ✓ $*"; }
|
||||||
|
skip() { echo " · $* (already in place)"; }
|
||||||
|
note() { echo " ! $*"; }
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
step "1. Install wrappers to $WRAPPERS_DST"
|
||||||
|
|
||||||
|
mkdir -p "$WRAPPERS_DST"
|
||||||
|
chown root:root "$WRAPPERS_DST"
|
||||||
|
chmod 0750 "$WRAPPERS_DST"
|
||||||
|
|
||||||
|
if [[ ! -d "$WRAPPERS_SRC" ]]; then
|
||||||
|
echo "ERROR: $WRAPPERS_SRC not found — repo state unexpected (expected at $REPO)" >&2
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
for src in "$WRAPPERS_SRC"/*.sh; do
|
||||||
|
name=$(basename "$src")
|
||||||
|
dst="$WRAPPERS_DST/$name"
|
||||||
|
if [[ -f "$dst" ]] && cmp -s "$src" "$dst"; then
|
||||||
|
skip "$name"
|
||||||
|
else
|
||||||
|
install -o root -g root -m 0750 "$src" "$dst"
|
||||||
|
ok "$name installed"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
step "2. Install flow YAMLs to $FLOWS_DST"
|
||||||
|
|
||||||
|
mkdir -p "$FLOWS_DST"
|
||||||
|
|
||||||
|
for f in server_backup_full.yml server_backup_restore_test.yml; do
|
||||||
|
src="$FLOWS_SRC/$f"
|
||||||
|
dst="$FLOWS_DST/$f"
|
||||||
|
if [[ ! -f "$src" ]]; then
|
||||||
|
echo "ERROR: $src missing — repo state unexpected" >&2
|
||||||
|
exit 3
|
||||||
|
fi
|
||||||
|
if [[ -f "$dst" ]] && cmp -s "$src" "$dst"; then
|
||||||
|
skip "$f"
|
||||||
|
else
|
||||||
|
install -o root -g root -m 0644 "$src" "$dst"
|
||||||
|
ok "$f installed"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
step "3. Append missing commands to $COMMANDS_DST"
|
||||||
|
|
||||||
|
# Commands we want to ensure exist. Names must match the YAML in commands.yml.example.
|
||||||
|
NEEDED_CMDS=(
|
||||||
|
trigger_server_backup
|
||||||
|
trigger_restore_test
|
||||||
|
tail_backup_log_today
|
||||||
|
read_backup_status
|
||||||
|
restic_snapshots_nas
|
||||||
|
restic_snapshots_b2
|
||||||
|
restic_stats_nas
|
||||||
|
restic_stats_b2
|
||||||
|
)
|
||||||
|
|
||||||
|
if [[ ! -f "$COMMANDS_DST" ]]; then
|
||||||
|
echo "ERROR: $COMMANDS_DST missing — run base ops-agent install first" >&2
|
||||||
|
exit 4
|
||||||
|
fi
|
||||||
|
|
||||||
|
TS=$(date +%Y%m%d-%H%M%S)
|
||||||
|
cp -p "$COMMANDS_DST" "${COMMANDS_DST}.bak.${TS}"
|
||||||
|
|
||||||
|
# Check which commands are missing
|
||||||
|
missing_cmds=()
|
||||||
|
for cmd in "${NEEDED_CMDS[@]}"; do
|
||||||
|
if grep -qE "^ ${cmd}:" "$COMMANDS_DST"; then
|
||||||
|
skip "command $cmd already in commands.yml"
|
||||||
|
else
|
||||||
|
missing_cmds+=("$cmd")
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [[ ${#missing_cmds[@]} -eq 0 ]]; then
|
||||||
|
skip "no commands to add"
|
||||||
|
rm "${COMMANDS_DST}.bak.${TS}" # no-op edit, drop the backup
|
||||||
|
else
|
||||||
|
# Extract each missing command's YAML block from commands.yml.example.
|
||||||
|
# The block-detection regex MUST include digits — command names like
|
||||||
|
# restic_snapshots_b2 / restic_stats_b2 contain digits, otherwise the
|
||||||
|
# following block (e.g. restic_stats_nas) would swallow them.
|
||||||
|
tmp=$(mktemp)
|
||||||
|
python3 - "$COMMANDS_SRC" "${missing_cmds[@]}" >> "$tmp" <<'PY'
|
||||||
|
import sys, re
|
||||||
|
src_path = sys.argv[1]
|
||||||
|
wanted = sys.argv[2:]
|
||||||
|
with open(src_path) as f:
|
||||||
|
src = f.read()
|
||||||
|
# Top-level command blocks: each starts at column-2 with "<name>:" line.
|
||||||
|
# A block ends at the next sibling-key line (same indentation) or EOF.
|
||||||
|
pattern = re.compile(r"(^ [a-z0-9_]+:[\s\S]*?)(?=^ [a-z0-9_]+:|\Z)", re.M)
|
||||||
|
blocks = {}
|
||||||
|
for m in pattern.finditer(src):
|
||||||
|
block = m.group(1)
|
||||||
|
name_match = re.match(r"^ ([a-z0-9_]+):", block)
|
||||||
|
if name_match:
|
||||||
|
blocks[name_match.group(1)] = block.rstrip() + "\n"
|
||||||
|
exit_code = 0
|
||||||
|
for cmd in wanted:
|
||||||
|
if cmd in blocks:
|
||||||
|
sys.stdout.write("\n" + blocks[cmd])
|
||||||
|
else:
|
||||||
|
sys.stderr.write(f"ERROR: {cmd} not found in {src_path}\n")
|
||||||
|
exit_code = 1
|
||||||
|
sys.exit(exit_code)
|
||||||
|
PY
|
||||||
|
|
||||||
|
if [[ -s "$tmp" ]]; then
|
||||||
|
cat "$tmp" >> "$COMMANDS_DST"
|
||||||
|
rm "$tmp"
|
||||||
|
ok "appended ${#missing_cmds[@]} commands: ${missing_cmds[*]}"
|
||||||
|
note "backup at ${COMMANDS_DST}.bak.${TS}"
|
||||||
|
else
|
||||||
|
rm "$tmp"
|
||||||
|
echo "ERROR: extraction produced empty output — aborting" >&2
|
||||||
|
exit 5
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
step "4. Ensure sudoers allows ops-agent to run wrappers"
|
||||||
|
|
||||||
|
WRAPPER_PATHS=(
|
||||||
|
/srv/backups/scripts/wrappers/trigger-backup.sh
|
||||||
|
/srv/backups/scripts/wrappers/trigger-restore-test.sh
|
||||||
|
/srv/backups/scripts/wrappers/read-status.sh
|
||||||
|
/srv/backups/scripts/wrappers/restic-snapshots.sh
|
||||||
|
/srv/backups/scripts/wrappers/restic-stats.sh
|
||||||
|
/srv/backups/scripts/wrappers/restic-check.sh
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build proposed sudoers content: existing file + missing wrapper-NOPASSWD lines.
|
||||||
|
SUDOERS_TMP=$(mktemp /tmp/sudoers-ops-agent.XXXXXX)
|
||||||
|
chmod 0440 "$SUDOERS_TMP"
|
||||||
|
cp "$SUDOERS_DST" "$SUDOERS_TMP"
|
||||||
|
|
||||||
|
added_lines=0
|
||||||
|
for path in "${WRAPPER_PATHS[@]}"; do
|
||||||
|
pattern="NOPASSWD:[[:space:]]*${path//\//\\/}\\b"
|
||||||
|
if grep -qE "$pattern" "$SUDOERS_TMP"; then
|
||||||
|
skip "$(basename "$path") already in sudoers"
|
||||||
|
else
|
||||||
|
echo "ops-agent ALL=(root) NOPASSWD: $path *" >> "$SUDOERS_TMP"
|
||||||
|
ok "added NOPASSWD for $(basename "$path")"
|
||||||
|
added_lines=$((added_lines + 1))
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [[ $added_lines -gt 0 ]]; then
|
||||||
|
# Validate with visudo before swapping in — bail loud if invalid (prevents lockout).
|
||||||
|
if visudo -c -f "$SUDOERS_TMP" >/dev/null; then
|
||||||
|
cp -p "$SUDOERS_DST" "${SUDOERS_DST}.bak.${TS}"
|
||||||
|
install -o root -g root -m 0440 "$SUDOERS_TMP" "$SUDOERS_DST"
|
||||||
|
rm "$SUDOERS_TMP"
|
||||||
|
ok "sudoers updated (visudo-validated); backup at ${SUDOERS_DST}.bak.${TS}"
|
||||||
|
else
|
||||||
|
echo "ERROR: visudo validation failed — sudoers not modified" >&2
|
||||||
|
echo " check $SUDOERS_TMP" >&2
|
||||||
|
exit 6
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
rm "$SUDOERS_TMP"
|
||||||
|
skip "sudoers already complete"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
step "5. Restart ops-agent (reload commands.yml + flows)"
|
||||||
|
|
||||||
|
systemctl restart ops-agent
|
||||||
|
sleep 1
|
||||||
|
if systemctl is-active --quiet ops-agent; then
|
||||||
|
ok "ops-agent restarted ($(systemctl show -p ActiveEnterTimestamp ops-agent --value))"
|
||||||
|
else
|
||||||
|
echo "ERROR: ops-agent failed to start — check 'journalctl -u ops-agent -n 50'" >&2
|
||||||
|
exit 7
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
step "6. Enable server-backup.timer"
|
||||||
|
|
||||||
|
if systemctl is-enabled --quiet server-backup.timer; then
|
||||||
|
skip "server-backup.timer already enabled"
|
||||||
|
else
|
||||||
|
systemctl enable server-backup.timer
|
||||||
|
ok "server-backup.timer enabled"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if systemctl is-active --quiet server-backup.timer; then
|
||||||
|
skip "server-backup.timer already active"
|
||||||
|
else
|
||||||
|
systemctl start server-backup.timer
|
||||||
|
ok "server-backup.timer started"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Show next-firing
|
||||||
|
echo
|
||||||
|
note "next scheduled runs:"
|
||||||
|
systemctl list-timers --no-pager | grep -E "NEXT|server-backup" | head -5
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
step "Done"
|
||||||
|
|
||||||
|
echo
|
||||||
|
echo "Test via the UI:"
|
||||||
|
echo " /flows/server-backup → click 'Run restore test' (non-destructive)"
|
||||||
|
echo
|
||||||
|
echo "Or test via curl on this host:"
|
||||||
|
echo " TOKEN=\$(cat /etc/ops-agent/secret)"
|
||||||
|
echo " curl -sS -H \"Authorization: Bearer \$TOKEN\" \\"
|
||||||
|
echo " -H 'Content-Type: application/json' \\"
|
||||||
|
echo " -X POST http://127.0.0.1:3099/agent/v1/flow \\"
|
||||||
|
echo " --data '{\"flow_key\":\"server_backup_restore_test\",\"dry_run\":false}'"
|
||||||
44  deploy/server-backup/restic-backup.env.example  Normal file
|
|
@ -0,0 +1,44 @@
|
||||||
|
# Copy to /etc/restic-backup.env on the host. Permissions: 0600 root:root.
|
||||||
|
# RESTIC_PASSWORD lives in /etc/restic-backup.password (mode 0400 root:root)
|
||||||
|
# — the backup script sets RESTIC_PASSWORD_FILE from there, so the password
|
||||||
|
# never appears in the process listing or this env file.
|
||||||
|
|
||||||
|
# ── Restic repositories ────────────────────────────────────────────────────
|
||||||
|
# Local NAS path (must be mounted before the timer fires; see runbook).
|
||||||
|
RESTIC_REPO_NAS=/mnt/nas/backups/restic/scrum4me-srv
|
||||||
|
|
||||||
|
# Backblaze B2 repo, format: b2:<bucket-name>:<prefix>
|
||||||
|
# Bucket must have Object Lock (Governance) with default retention >= 30 days.
|
||||||
|
RESTIC_REPO_B2=b2:scrum4me-srv-backup:scrum4me-srv
|
||||||
|
|
||||||
|
# ── Backblaze B2 server key ────────────────────────────────────────────────
|
||||||
|
# Capabilities REQUIRED: listBuckets, listFiles, readFiles, writeFiles
|
||||||
|
# Capabilities FORBIDDEN: deleteFiles, deleteKeys, bypassGovernance
|
||||||
|
# Create with:
|
||||||
|
# b2 application-key create \
|
||||||
|
# --bucket scrum4me-srv-backup \
|
||||||
|
# --name-prefix scrum4me-srv \
|
||||||
|
# server-backup-key \
|
||||||
|
# listBuckets,listFiles,readFiles,writeFiles
|
||||||
|
B2_ACCOUNT_ID=REPLACE_WITH_B2_KEY_ID
|
||||||
|
B2_ACCOUNT_KEY=REPLACE_WITH_B2_APPLICATION_KEY
|
||||||
|
|
||||||
|
# ── Forgejo backup target (optional — set to skip if Forgejo not deployed) ─
|
||||||
|
# Container name as it appears in `docker ps`. Set to "" or comment out to
|
||||||
|
# skip the Forgejo phases entirely.
|
||||||
|
FORGEJO_CONTAINER=scrum4me-forgejo
|
||||||
|
# Path to app.ini INSIDE the Forgejo container (used by `forgejo dump -c`).
|
||||||
|
FORGEJO_CONFIG=/data/gitea/conf/app.ini
|
||||||
|
# Postgres database name for Forgejo (empty = use SQLite, skip forgejo_db_dump).
|
||||||
|
FORGEJO_DB_NAME=forgejo
|
||||||
|
# Postgres container + role for Forgejo's DB (defaults match scrum4me stack).
|
||||||
|
FORGEJO_DB_CONTAINER=scrum4me-postgres
|
||||||
|
FORGEJO_DB_USER=scrum4me
|
||||||
|
|
||||||
|
# ── Scrum4Me Postgres (required for postgres_dump phase) ───────────────────
|
||||||
|
PG_CONTAINER=scrum4me-postgres
|
||||||
|
PG_DUMPALL_USER=scrum4me
|
||||||
|
|
||||||
|
# ── Optional bandwidth limit for restic B2 upload (KiB/s; 0 = unlimited) ──
|
||||||
|
# Translated by the script into `restic --limit-upload "$BACKUP_LIMIT_UPLOAD_KIB"`.
|
||||||
|
# BACKUP_LIMIT_UPLOAD_KIB=5000
|
||||||
187
deploy/server-backup/restore-test.sh
Normal file
|
|
@ -0,0 +1,187 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# Restore the latest restic snapshot to /tmp/restore-test/ and assert that a
|
||||||
|
# small set of critical files came back intact. Used by the monthly maintenance
|
||||||
|
# check and by the dashboard's "Restore test" button.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# server-backup-restore-test.sh [nas|b2]
|
||||||
|
#
|
||||||
|
# Default repo is "nas" (faster, no B2 download fees).
|
||||||
|
|
||||||
|
umask 077
|
||||||
|
set -uo pipefail
|
||||||
|
|
||||||
|
REPO_LABEL="${1:-nas}"
|
||||||
|
RESTORE_DIR="${RESTORE_DIR:-/tmp/restore-test}"
|
||||||
|
RESTIC_PASSWORD_FILE_PATH="${RESTIC_PASSWORD_FILE_PATH:-/etc/restic-backup.password}"
|
||||||
|
STATUS_FILE="${STATUS_FILE:-/srv/backups/status/last-restore-test.json}"
|
||||||
|
STATUS_DIR="$(dirname "$STATUS_FILE")"
|
||||||
|
STARTED_AT="$(date -Is)"
|
||||||
|
SECONDS=0
|
||||||
|
|
||||||
|
# Load env (idempotent: ok if already in environment).
|
||||||
|
if [ -z "${RESTIC_REPO_NAS:-}" ] && [ -r /etc/restic-backup.env ]; then
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
set -a; . /etc/restic-backup.env; set +a
|
||||||
|
fi
|
||||||
|
|
||||||
|
case "$REPO_LABEL" in
|
||||||
|
nas) REPO="${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set}" ;;
|
||||||
|
b2) REPO="${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set}" ;;
|
||||||
|
*) echo "ERROR: repo label must be 'nas' or 'b2', got '$REPO_LABEL'" >&2; exit 2 ;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
if [ ! -r "$RESTIC_PASSWORD_FILE_PATH" ]; then
|
||||||
|
echo "ERROR: restic password file $RESTIC_PASSWORD_FILE_PATH not readable" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
export RESTIC_PASSWORD_FILE="$RESTIC_PASSWORD_FILE_PATH"
|
||||||
|
|
||||||
|
for tool in jq restic; do
|
||||||
|
command -v "$tool" >/dev/null 2>&1 || { echo "ERROR: '$tool' not on PATH" >&2; exit 1; }
|
||||||
|
done
|
||||||
|
|
||||||
|
mkdir -p "$STATUS_DIR"
|
||||||
|
chmod 0750 "$STATUS_DIR"
|
||||||
|
|
||||||
|
echo "════════════════════════════════════════════════════════════════"
|
||||||
|
echo " Restore test — started $STARTED_AT"
|
||||||
|
echo " Repo: $REPO_LABEL ($REPO)"
|
||||||
|
echo " Target: $RESTORE_DIR"
|
||||||
|
echo "════════════════════════════════════════════════════════════════"
|
||||||
|
|
||||||
|
# Clean previous attempt to keep results unambiguous.
|
||||||
|
rm -rf "$RESTORE_DIR"
|
||||||
|
mkdir -p "$RESTORE_DIR"
|
||||||
|
|
||||||
|
# Find latest snapshot id.
|
||||||
|
SNAPSHOT_ID=$(restic -r "$REPO" snapshots --json --latest 1 2>/dev/null \
|
||||||
|
| jq -r '.[0].short_id // .[0].id // empty')
|
||||||
|
|
||||||
|
if [ -z "$SNAPSHOT_ID" ]; then
|
||||||
|
echo "ERROR: no snapshots found in $REPO_LABEL repo"
|
||||||
|
jq -n \
|
||||||
|
--arg started "$STARTED_AT" \
|
||||||
|
--arg completed "$(date -Is)" \
|
||||||
|
--argjson duration "$SECONDS" \
|
||||||
|
--arg repo "$REPO_LABEL" \
|
||||||
|
'{
|
||||||
|
schema_version: 1,
|
||||||
|
overall_status: "failed",
|
||||||
|
started_at: $started,
|
||||||
|
completed_at: $completed,
|
||||||
|
duration_seconds: $duration,
|
||||||
|
repo: $repo,
|
||||||
|
snapshot_id: null,
|
||||||
|
error: "no snapshots in repo",
|
||||||
|
assertions: []
|
||||||
|
}' > "$STATUS_FILE"
|
||||||
|
chmod 0644 "$STATUS_FILE"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Restoring snapshot $SNAPSHOT_ID (filtered) …"
|
||||||
|
# Restore ONLY the paths we assert on: a full restore would need disk space
# equal to the snapshot's restore size (hundreds of GiB), which is unnecessary
# for a correctness test. /tmp is often tmpfs or small, so a full restore
# there would fail almost immediately with ENOSPC.
# Keep this list in sync with ASSERTION_PATHS below.
|
||||||
|
RESTORE_RC=0
|
||||||
|
restic -r "$REPO" restore "$SNAPSHOT_ID" --target "$RESTORE_DIR" \
|
||||||
|
--include /srv/scrum4me/compose/docker-compose.yml \
|
||||||
|
--include /srv/scrum4me/caddy/Caddyfile \
|
||||||
|
--include /etc/restic-backup.env \
|
||||||
|
--include /var/backups/databases \
|
||||||
|
|| RESTORE_RC=$?
|
||||||
|
|
||||||
|
if [ "$RESTORE_RC" -ne 0 ]; then
|
||||||
|
echo "ERROR: restic restore exited $RESTORE_RC"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Assertions: each is a path that MUST exist and be non-empty.
|
||||||
|
# Adjust to your stack after first run (and update the runbook addendum).
|
||||||
|
ASSERTION_PATHS=(
|
||||||
|
"$RESTORE_DIR/srv/scrum4me/compose/docker-compose.yml"
|
||||||
|
"$RESTORE_DIR/srv/scrum4me/caddy/Caddyfile"
|
||||||
|
"$RESTORE_DIR/etc/restic-backup.env"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Latest postgres dump — match the newest file (glob may resolve to zero).
|
||||||
|
shopt -s nullglob
|
||||||
|
PG_DUMPS=("$RESTORE_DIR/var/backups/databases/"postgres-*.sql.gz)
|
||||||
|
shopt -u nullglob
|
||||||
|
if [ "${#PG_DUMPS[@]}" -gt 0 ]; then
|
||||||
|
# pick lexicographic last (= newest date, ISO format)
|
||||||
|
LATEST_PG="${PG_DUMPS[-1]}"
|
||||||
|
ASSERTION_PATHS+=("$LATEST_PG")
|
||||||
|
fi
|
||||||
|
|
||||||
|
ASSERTIONS_JSON='[]'
|
||||||
|
ANY_FAILED=0
|
||||||
|
for p in "${ASSERTION_PATHS[@]}"; do
|
||||||
|
if [ -s "$p" ]; then
|
||||||
|
status="ok"
|
||||||
|
bytes=$(stat -c %s "$p")
|
||||||
|
echo " ✓ $p ($bytes bytes)"
|
||||||
|
elif [ -e "$p" ]; then
|
||||||
|
status="empty"
|
||||||
|
bytes=0
|
||||||
|
ANY_FAILED=1
|
||||||
|
echo " ✗ $p (exists but empty)"
|
||||||
|
else
|
||||||
|
status="missing"
|
||||||
|
bytes=0
|
||||||
|
ANY_FAILED=1
|
||||||
|
echo " ✗ $p (missing)"
|
||||||
|
fi
|
||||||
|
ASSERTIONS_JSON=$(jq -c \
|
||||||
|
--arg path "$p" \
|
||||||
|
--arg status "$status" \
|
||||||
|
--argjson bytes "$bytes" \
|
||||||
|
'. + [{path: $path, status: $status, bytes: $bytes}]' \
|
||||||
|
<<< "$ASSERTIONS_JSON")
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ "$RESTORE_RC" -ne 0 ]; then
|
||||||
|
OVERALL="failed"
|
||||||
|
elif [ "$ANY_FAILED" -ne 0 ]; then
|
||||||
|
OVERALL="partial_failure"
|
||||||
|
else
|
||||||
|
OVERALL="success"
|
||||||
|
fi
|
||||||
|
|
||||||
|
jq -n \
|
||||||
|
--arg started "$STARTED_AT" \
|
||||||
|
--arg completed "$(date -Is)" \
|
||||||
|
--argjson duration "$SECONDS" \
|
||||||
|
--arg repo "$REPO_LABEL" \
|
||||||
|
--arg snapshot "$SNAPSHOT_ID" \
|
||||||
|
--arg overall "$OVERALL" \
|
||||||
|
--argjson restore_exit "$RESTORE_RC" \
|
||||||
|
--argjson assertions "$ASSERTIONS_JSON" \
|
||||||
|
'{
|
||||||
|
schema_version: 1,
|
||||||
|
overall_status: $overall,
|
||||||
|
started_at: $started,
|
||||||
|
completed_at: $completed,
|
||||||
|
duration_seconds: $duration,
|
||||||
|
repo: $repo,
|
||||||
|
snapshot_id: $snapshot,
|
||||||
|
restore_exit_code: $restore_exit,
|
||||||
|
target: "'"$RESTORE_DIR"'",
|
||||||
|
assertions: $assertions
|
||||||
|
}' > "$STATUS_FILE"
|
||||||
|
chmod 0644 "$STATUS_FILE"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "════════════════════════════════════════════════════════════════"
|
||||||
|
echo " Restore test — finished $(date -Is)"
|
||||||
|
echo " Overall: $OVERALL"
|
||||||
|
echo " Status file: $STATUS_FILE"
|
||||||
|
echo "════════════════════════════════════════════════════════════════"
|
||||||
|
|
||||||
|
case "$OVERALL" in
|
||||||
|
success) exit 0 ;;
|
||||||
|
partial_failure) exit 75 ;;
|
||||||
|
failed|*) exit 1 ;;
|
||||||
|
esac
|
||||||
38
deploy/server-backup/server-backup.service
Normal file
|
|
@ -0,0 +1,38 @@
|
||||||
|
[Unit]
|
||||||
|
Description=Server-wide backup (pg_dumpall + restic to NAS + B2)
|
||||||
|
Documentation=file:///srv/scrum4me/ops-dashboard/docs/runbooks/server-backup.md
|
||||||
|
After=network-online.target docker.service
|
||||||
|
Wants=network-online.target
|
||||||
|
# The NAS mount must be available before restic writes to the NAS repo;
# this triggers the cifs automount for /mnt/nas/backups if it is not active yet.
|
||||||
|
RequiresMountsFor=/mnt/nas/backups
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=oneshot
|
||||||
|
EnvironmentFile=/etc/restic-backup.env
|
||||||
|
ExecStart=/srv/backups/scripts/server-backup.sh
|
||||||
|
TimeoutStartSec=4h
|
||||||
|
RuntimeMaxSec=6h
|
||||||
|
Nice=10
|
||||||
|
IOSchedulingClass=best-effort
|
||||||
|
IOSchedulingPriority=7
|
||||||
|
# Sandboxing — backup needs root for /etc + docker exec, but limit the rest.
|
||||||
|
# /mnt/nas/backups MUST be in ReadWritePaths, otherwise ProtectSystem=strict
# prevents restic from writing to the NAS repo.
|
||||||
|
ProtectSystem=strict
|
||||||
|
ReadWritePaths=/var/backups /srv/backups /run /tmp /mnt/nas/backups
|
||||||
|
ProtectHome=read-only
|
||||||
|
NoNewPrivileges=yes
|
||||||
|
PrivateTmp=yes
|
||||||
|
ProtectKernelTunables=yes
|
||||||
|
ProtectKernelModules=yes
|
||||||
|
ProtectControlGroups=yes
|
||||||
|
StandardOutput=journal
|
||||||
|
StandardError=journal
|
||||||
|
SyslogIdentifier=server-backup
|
||||||
|
|
||||||
|
# Exit code semantics from server-backup.sh:
|
||||||
|
# 0 = success (all phases ok)
|
||||||
|
# 75 = partial_failure (some non-critical phase failed/degraded)
|
||||||
|
# 1 = failed (a critical dump phase failed or both restic repos failed)
|
||||||
|
SuccessExitStatus=75
|
||||||
504
deploy/server-backup/server-backup.sh
Normal file
|
|
@ -0,0 +1,504 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# Daily server-wide backup: dumps databases, runs restic to NAS + B2,
|
||||||
|
# writes a structured statusfile that the ops-dashboard can read.
|
||||||
|
#
|
||||||
|
# Install:
|
||||||
|
# cp deploy/server-backup/server-backup.sh /srv/backups/scripts/server-backup.sh
|
||||||
|
# chmod 0750 /srv/backups/scripts/server-backup.sh
|
||||||
|
# chown root:root /srv/backups/scripts/server-backup.sh
|
||||||
|
#
|
||||||
|
# Requires: bash, jq, flock, restic, docker, gzip. See runbook for setup.
|
||||||
|
|
||||||
|
umask 077
|
||||||
|
set -uo pipefail
|
||||||
|
|
||||||
|
# ── Configuration ──────────────────────────────────────────────────────────
|
||||||
|
STATUS_DIR="${STATUS_DIR:-/srv/backups/status}"
|
||||||
|
LOG_DIR="${LOG_DIR:-/srv/backups/logs}"
|
||||||
|
DB_DUMP_DIR="${DB_DUMP_DIR:-/var/backups/databases}"
|
||||||
|
RESTIC_PASSWORD_FILE_PATH="${RESTIC_PASSWORD_FILE_PATH:-/etc/restic-backup.password}"
|
||||||
|
LOCKFILE="${LOCKFILE:-/run/server-backup.lock}"
|
||||||
|
RUN_DATE="$(date +%F)"
|
||||||
|
STARTED_AT="$(date -Is)"
|
||||||
|
SECONDS=0
|
||||||
|
|
||||||
|
# Phase order — must match write_status_json + determine_exit_code expectations.
|
||||||
|
PHASE_ORDER=(
|
||||||
|
postgres_dump
|
||||||
|
forgejo_dump
|
||||||
|
forgejo_db_dump
|
||||||
|
restic_nas
|
||||||
|
restic_b2
|
||||||
|
forget_nas
|
||||||
|
check_nas
|
||||||
|
check_b2
|
||||||
|
)
|
||||||
|
|
||||||
|
declare -A PHASE_STATUS PHASE_EXIT PHASE_START PHASE_END PHASE_ERR PHASE_EXTRA
|
||||||
|
OVERALL_STATUS="unknown"
|
||||||
|
|
||||||
|
# ── Single-instance lock ───────────────────────────────────────────────────
|
||||||
|
exec 9>"$LOCKFILE" || { echo "ERROR: cannot open lockfile $LOCKFILE" >&2; exit 1; }
|
||||||
|
if ! flock -n 9; then
|
||||||
|
echo "ERROR: another server-backup is already running (lock $LOCKFILE held)" >&2
|
||||||
|
exit 75
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── Env + secret loading ───────────────────────────────────────────────────
|
||||||
|
# When invoked via systemd, EnvironmentFile=/etc/restic-backup.env has already
|
||||||
|
# been loaded. When invoked manually for testing, source it ourselves.
|
||||||
|
if [ -z "${RESTIC_REPO_NAS:-}" ] && [ -r /etc/restic-backup.env ]; then
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
set -a; . /etc/restic-backup.env; set +a
|
||||||
|
fi
|
||||||
|
|
||||||
|
: "${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set (see /etc/restic-backup.env)}"
|
||||||
|
: "${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set (see /etc/restic-backup.env)}"
|
||||||
|
|
||||||
|
if [ ! -r "$RESTIC_PASSWORD_FILE_PATH" ]; then
|
||||||
|
echo "ERROR: restic password file $RESTIC_PASSWORD_FILE_PATH not readable" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
export RESTIC_PASSWORD_FILE="$RESTIC_PASSWORD_FILE_PATH"
|
||||||
|
|
||||||
|
# Required tooling
|
||||||
|
for tool in jq restic docker gzip flock; do
|
||||||
|
if ! command -v "$tool" >/dev/null 2>&1; then
|
||||||
|
echo "ERROR: required tool '$tool' not on PATH" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# ── Logging ────────────────────────────────────────────────────────────────
|
||||||
|
mkdir -p "$LOG_DIR" "$STATUS_DIR" "$DB_DUMP_DIR"
|
||||||
|
chmod 0750 "$LOG_DIR" "$STATUS_DIR"
|
||||||
|
LOG_FILE="$LOG_DIR/server-backup-$RUN_DATE.log"
|
||||||
|
# Mirror everything to LOG_FILE and the journal.
|
||||||
|
exec > >(tee -a "$LOG_FILE") 2>&1
|
||||||
|
|
||||||
|
echo "════════════════════════════════════════════════════════════════"
|
||||||
|
echo " Server backup — started $STARTED_AT"
|
||||||
|
echo " Host: $(hostname)"
|
||||||
|
echo " NAS repo: $RESTIC_REPO_NAS"
|
||||||
|
echo " B2 repo: $RESTIC_REPO_B2"
|
||||||
|
echo "════════════════════════════════════════════════════════════════"
|
||||||
|
|
||||||
|
# ── Phase runner ───────────────────────────────────────────────────────────
|
||||||
|
# Runs the function passed as first arg, captures stdout+stderr into a phase
|
||||||
|
# buffer, records status / exit_code / timestamps / error tail.
|
||||||
|
run_phase() {
|
||||||
|
local name="$1"; shift
|
||||||
|
local phase_buf
|
||||||
|
phase_buf=$(mktemp -t "backup-phase-${name}.XXXXXX")
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "─── phase: $name ─── $(date -Is)"
|
||||||
|
PHASE_START[$name]=$(date -Is)
|
||||||
|
|
||||||
|
local rc=0
|
||||||
|
# Run in a sub-shell so set -e inside callees doesn't kill us.
|
||||||
|
(
|
||||||
|
"$@"
|
||||||
|
) 2>&1 | tee "$phase_buf"
|
||||||
|
rc=${PIPESTATUS[0]}
|
||||||
|
|
||||||
|
PHASE_EXIT[$name]=$rc
|
||||||
|
case "$rc" in
|
||||||
|
0) PHASE_STATUS[$name]=success ;;
|
||||||
|
3) PHASE_STATUS[$name]=degraded ;; # restic: snapshot created but some files unreadable
|
||||||
|
99) PHASE_STATUS[$name]=skipped ;; # our convention for "not applicable"
|
||||||
|
*) PHASE_STATUS[$name]=failed ;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
if [ "$rc" -ne 0 ] && [ "$rc" -ne 99 ] && [ -s "$phase_buf" ]; then
|
||||||
|
# Keep last few non-empty lines as a compact error summary.
|
||||||
|
PHASE_ERR[$name]=$(tail -n 5 "$phase_buf" | tr '\n' ' ' | head -c 500)
|
||||||
|
fi
|
||||||
|
|
||||||
|
PHASE_END[$name]=$(date -Is)
|
||||||
|
rm -f "$phase_buf"
|
||||||
|
echo "─── end $name (exit=$rc, status=${PHASE_STATUS[$name]})"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Convention: a phase function returns 99 to mark itself "skipped" — the
|
||||||
|
# overall outcome treats this as success.
|
||||||
|
SKIPPED=99
|
||||||
|
|
||||||
|
# ── Phase 1: pg_dumpall (Scrum4Me Postgres cluster) ────────────────────────
|
||||||
|
dump_postgres_all() {
|
||||||
|
local pg_container="${PG_CONTAINER:-scrum4me-postgres}"
|
||||||
|
local pg_user="${PG_DUMPALL_USER:-scrum4me}"
|
||||||
|
|
||||||
|
if ! docker ps --format '{{.Names}}' | grep -qx "$pg_container"; then
|
||||||
|
echo "Postgres container '$pg_container' not running — cannot continue."
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
local tmp="$DB_DUMP_DIR/.postgres-$RUN_DATE.sql.gz.tmp"
|
||||||
|
local final="$DB_DUMP_DIR/postgres-$RUN_DATE.sql.gz"
|
||||||
|
rm -f "$tmp"
|
||||||
|
|
||||||
|
set -o pipefail
|
||||||
|
docker exec "$pg_container" pg_dumpall -U "$pg_user" --clean --if-exists \
|
||||||
|
| gzip -c > "$tmp"
|
||||||
|
local rc=$?
|
||||||
|
set +o pipefail
|
||||||
|
|
||||||
|
if [ "$rc" -ne 0 ]; then
|
||||||
|
rm -f "$tmp"
|
||||||
|
return "$rc"
|
||||||
|
fi
|
||||||
|
|
||||||
|
mv "$tmp" "$final"
|
||||||
|
chmod 0640 "$final"
|
||||||
|
local bytes
|
||||||
|
bytes=$(stat -c %s "$final" 2>/dev/null || echo 0)
|
||||||
|
PHASE_EXTRA[postgres_dump]="output_file=$final;bytes=$bytes"
|
||||||
|
echo "wrote $final ($bytes bytes)"
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── Phase 2: Forgejo dump (filesystem + repos) ─────────────────────────────
|
||||||
|
dump_forgejo() {
|
||||||
|
local fj="${FORGEJO_CONTAINER:-}"
|
||||||
|
if [ -z "$fj" ]; then
|
||||||
|
echo "FORGEJO_CONTAINER unset — skipping Forgejo dump."
|
||||||
|
return "$SKIPPED"
|
||||||
|
fi
|
||||||
|
if ! docker ps --format '{{.Names}}' | grep -qx "$fj"; then
|
||||||
|
echo "Forgejo container '$fj' not running — skipping."
|
||||||
|
return "$SKIPPED"
|
||||||
|
fi
|
||||||
|
|
||||||
|
local config="${FORGEJO_CONFIG:-/data/gitea/conf/app.ini}"
|
||||||
|
local tmp="$DB_DUMP_DIR/.forgejo-$RUN_DATE.zip.tmp"
|
||||||
|
local final="$DB_DUMP_DIR/forgejo-$RUN_DATE.zip"
|
||||||
|
rm -f "$tmp"
|
||||||
|
|
||||||
|
# `forgejo dump -f -` streams the zip to stdout. We run as the `git` user
|
||||||
|
# inside the container (standard Forgejo image convention).
|
||||||
|
#
|
||||||
|
# NB: Forgejo 11.x has NO `--skip-db` flag (removed after the Gitea fork), so
# the DB dump ends up inside the zip as well. Our separate `forgejo_db_dump`
# phase remains the authoritative restore source; the in-zip DB dump is a
# redundant copy.
|
||||||
|
set -o pipefail
|
||||||
|
docker exec -u git "$fj" forgejo dump -c "$config" --type zip -f - > "$tmp"
|
||||||
|
local rc=$?
|
||||||
|
set +o pipefail
|
||||||
|
|
||||||
|
if [ "$rc" -ne 0 ]; then
|
||||||
|
rm -f "$tmp"
|
||||||
|
return "$rc"
|
||||||
|
fi
|
||||||
|
|
||||||
|
mv "$tmp" "$final"
|
||||||
|
chmod 0640 "$final"
|
||||||
|
local bytes
|
||||||
|
bytes=$(stat -c %s "$final" 2>/dev/null || echo 0)
|
||||||
|
PHASE_EXTRA[forgejo_dump]="output_file=$final;bytes=$bytes"
|
||||||
|
echo "wrote $final ($bytes bytes)"
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── Phase 3: Forgejo Postgres DB dump (authoritative for DB restore) ───────
|
||||||
|
dump_forgejo_db() {
|
||||||
|
local db_name="${FORGEJO_DB_NAME:-}"
|
||||||
|
if [ -z "$db_name" ]; then
|
||||||
|
echo "FORGEJO_DB_NAME unset — skipping Forgejo DB dump (assume SQLite)."
|
||||||
|
return "$SKIPPED"
|
||||||
|
fi
|
||||||
|
local db_container="${FORGEJO_DB_CONTAINER:-scrum4me-postgres}"
|
||||||
|
local db_user="${FORGEJO_DB_USER:-scrum4me}"
|
||||||
|
|
||||||
|
if ! docker ps --format '{{.Names}}' | grep -qx "$db_container"; then
|
||||||
|
echo "DB container '$db_container' not running — skipping Forgejo DB dump."
|
||||||
|
return "$SKIPPED"
|
||||||
|
fi
|
||||||
|
|
||||||
|
local tmp="$DB_DUMP_DIR/.forgejo-db-$RUN_DATE.sql.gz.tmp"
|
||||||
|
local final="$DB_DUMP_DIR/forgejo-db-$RUN_DATE.sql.gz"
|
||||||
|
rm -f "$tmp"
|
||||||
|
|
||||||
|
set -o pipefail
|
||||||
|
docker exec "$db_container" pg_dump -U "$db_user" --clean --if-exists "$db_name" \
|
||||||
|
| gzip -c > "$tmp"
|
||||||
|
local rc=$?
|
||||||
|
set +o pipefail
|
||||||
|
|
||||||
|
if [ "$rc" -ne 0 ]; then
|
||||||
|
rm -f "$tmp"
|
||||||
|
return "$rc"
|
||||||
|
fi
|
||||||
|
|
||||||
|
mv "$tmp" "$final"
|
||||||
|
chmod 0640 "$final"
|
||||||
|
local bytes
|
||||||
|
bytes=$(stat -c %s "$final" 2>/dev/null || echo 0)
|
||||||
|
PHASE_EXTRA[forgejo_db_dump]="output_file=$final;bytes=$bytes"
|
||||||
|
echo "wrote $final ($bytes bytes)"
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── Phases 4 + 5: restic backup to NAS / B2 ────────────────────────────────
|
||||||
|
# Live Docker datadirs are excluded — dumps (above) are the authoritative
|
||||||
|
# restore source for Postgres and Forgejo.
|
||||||
|
RESTIC_BACKUP_PATHS=(
|
||||||
|
/etc
|
||||||
|
/home/janpeter
|
||||||
|
/root
|
||||||
|
/opt
|
||||||
|
/srv
|
||||||
|
/usr/local/bin
|
||||||
|
"$DB_DUMP_DIR"
|
||||||
|
/srv/ops/backups
|
||||||
|
)
|
||||||
|
RESTIC_EXCLUDES=(
|
||||||
|
--exclude='**/node_modules'
|
||||||
|
--exclude='**/.next/cache'
|
||||||
|
--exclude='**/.cache'
|
||||||
|
--exclude='**/.git/objects/pack'
|
||||||
|
--exclude='/srv/backups/logs'
|
||||||
|
--exclude='/tmp'
|
||||||
|
--exclude='/var/tmp'
|
||||||
|
--exclude='/srv/scrum4me/postgres' # live Postgres datadir — non-authoritative
|
||||||
|
--exclude='/srv/forgejo/data/git' # live Forgejo git objects — non-authoritative
|
||||||
|
--exclude='/srv/forgejo/data/lfs'
|
||||||
|
--exclude='/srv/forgejo/data/queues'
|
||||||
|
)
|
||||||
|
|
||||||
|
restic_backup_to() {
|
||||||
|
local repo="$1"; local label="$2"
|
||||||
|
local extra_args=()
|
||||||
|
if [ "$label" = "b2" ] && [ -n "${BACKUP_LIMIT_UPLOAD_KIB:-}" ]; then
|
||||||
|
extra_args+=(--limit-upload "$BACKUP_LIMIT_UPLOAD_KIB")
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Capture restic JSON output so we can extract the snapshot id.
|
||||||
|
local json_out
|
||||||
|
json_out=$(mktemp -t "restic-backup-${label}.XXXXXX.json")
|
||||||
|
|
||||||
|
# Note: --skip-if-unchanged is deliberately not passed, so a snapshot is
# recorded even on unchanged days and the dashboard sees a daily entry.
|
||||||
|
restic -r "$repo" backup \
|
||||||
|
--tag scheduled \
|
||||||
|
--tag "host=$(hostname)" \
|
||||||
|
--json \
|
||||||
|
"${extra_args[@]}" \
|
||||||
|
"${RESTIC_EXCLUDES[@]}" \
|
||||||
|
"${RESTIC_BACKUP_PATHS[@]}" \
|
||||||
|
| tee "$json_out"
|
||||||
|
local rc=${PIPESTATUS[0]}
|
||||||
|
|
||||||
|
# Extract snapshot id from the final summary line (last JSON object of type=summary).
|
||||||
|
local snap
|
||||||
|
snap=$(jq -rs 'map(select(.message_type=="summary")) | last | .snapshot_id // empty' < "$json_out" 2>/dev/null || true)
|
||||||
|
local files_new
|
||||||
|
files_new=$(jq -rs 'map(select(.message_type=="summary")) | last | .files_new // empty' < "$json_out" 2>/dev/null || true)
|
||||||
|
local data_added
|
||||||
|
data_added=$(jq -rs 'map(select(.message_type=="summary")) | last | .data_added // empty' < "$json_out" 2>/dev/null || true)
|
||||||
|
|
||||||
|
if [ -n "$snap" ]; then
|
||||||
|
PHASE_EXTRA["restic_$label"]="snapshot_id=$snap;files_new=${files_new:-0};data_added_bytes=${data_added:-0}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
rm -f "$json_out"
|
||||||
|
return "$rc"
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── Phase 6: prune NAS only (B2 is Object Lock — pruning runs off-server) ──
|
||||||
|
restic_forget_nas() {
|
||||||
|
restic -r "$RESTIC_REPO_NAS" forget \
|
||||||
|
--keep-daily 7 \
|
||||||
|
--keep-weekly 4 \
|
||||||
|
--keep-monthly 12 \
|
||||||
|
--prune
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── Phase 7: integrity check (light daily; weekly read-data-subset on Sun) ─
|
||||||
|
is_sunday() {
|
||||||
|
[ "$(date +%u)" = "7" ]
|
||||||
|
}
|
||||||
|
|
||||||
|
restic_check_nas() {
|
||||||
|
if is_sunday; then
|
||||||
|
restic -r "$RESTIC_REPO_NAS" check --read-data-subset=2.5%
|
||||||
|
else
|
||||||
|
restic -r "$RESTIC_REPO_NAS" check
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
restic_check_b2() {
|
||||||
|
if is_sunday; then
|
||||||
|
# On B2 a read-data-subset costs bandwidth + B2 download fees. Keep the
|
||||||
|
# subset tiny on Sundays; deeper checks run monthly off-server.
|
||||||
|
restic -r "$RESTIC_REPO_B2" check --read-data-subset=1%
|
||||||
|
else
|
||||||
|
restic -r "$RESTIC_REPO_B2" check
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── Statusfile writer ──────────────────────────────────────────────────────
|
||||||
|
# Builds a structured JSON statusfile in /srv/backups/status/last-run.json
|
||||||
|
# atomically (write to tmp, then mv).
|
||||||
|
write_status_json() {
|
||||||
|
local tmpfile
|
||||||
|
tmpfile=$(mktemp -t "backup-status.XXXXXX.json")
|
||||||
|
|
||||||
|
# Build the phases object incrementally with jq for safe escaping.
|
||||||
|
local phases_json='{}'
|
||||||
|
local name status exit_code started ended err extra
|
||||||
|
local snapshot_id files_new data_added output_file bytes
|
||||||
|
for name in "${PHASE_ORDER[@]}"; do
|
||||||
|
status="${PHASE_STATUS[$name]:-pending}"
|
||||||
|
exit_code="${PHASE_EXIT[$name]:-}"
|
||||||
|
started="${PHASE_START[$name]:-}"
|
||||||
|
ended="${PHASE_END[$name]:-}"
|
||||||
|
err="${PHASE_ERR[$name]:-}"
|
||||||
|
extra="${PHASE_EXTRA[$name]:-}"
|
||||||
|
|
||||||
|
snapshot_id=""
|
||||||
|
files_new=""
|
||||||
|
data_added=""
|
||||||
|
output_file=""
|
||||||
|
bytes=""
|
||||||
|
if [ -n "$extra" ]; then
|
||||||
|
# extra is a semicolon-separated list of key=value pairs
|
||||||
|
local pair key val
|
||||||
|
IFS=';' read -ra pairs <<< "$extra"
|
||||||
|
for pair in "${pairs[@]}"; do
|
||||||
|
key="${pair%%=*}"
|
||||||
|
val="${pair#*=}"
|
||||||
|
case "$key" in
|
||||||
|
snapshot_id) snapshot_id="$val" ;;
|
||||||
|
files_new) files_new="$val" ;;
|
||||||
|
data_added_bytes) data_added="$val" ;;
|
||||||
|
output_file) output_file="$val" ;;
|
||||||
|
bytes) bytes="$val" ;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
|
||||||
|
# exit_code as JSON number when present, null otherwise.
|
||||||
|
local exit_arg='null'
|
||||||
|
if [ -n "$exit_code" ]; then
|
||||||
|
exit_arg="$exit_code"
|
||||||
|
fi
|
||||||
|
|
||||||
|
phases_json=$(
|
||||||
|
jq -c -n \
|
||||||
|
--argjson base "$phases_json" \
|
||||||
|
--arg name "$name" \
|
||||||
|
--arg status "$status" \
|
||||||
|
--argjson exit_code "$exit_arg" \
|
||||||
|
--arg started "$started" \
|
||||||
|
--arg ended "$ended" \
|
||||||
|
--arg err "$err" \
|
||||||
|
--arg snapshot_id "$snapshot_id" \
|
||||||
|
--arg files_new "$files_new" \
|
||||||
|
--arg data_added "$data_added" \
|
||||||
|
--arg output_file "$output_file" \
|
||||||
|
--arg bytes "$bytes" \
|
||||||
|
'
|
||||||
|
$base + {
|
||||||
|
($name): ({
|
||||||
|
status: $status,
|
||||||
|
exit_code: $exit_code,
|
||||||
|
started_at: (if $started == "" then null else $started end),
|
||||||
|
completed_at: (if $ended == "" then null else $ended end),
|
||||||
|
error: (if $err == "" then null else $err end)
|
||||||
|
}
|
||||||
|
+ (if $snapshot_id != "" then { snapshot_id: $snapshot_id } else {} end)
|
||||||
|
+ (if $files_new != "" then { files_new: ($files_new | tonumber? // null) } else {} end)
|
||||||
|
+ (if $data_added != "" then { data_added_bytes: ($data_added | tonumber? // null) } else {} end)
|
||||||
|
+ (if $output_file != "" then { output_file: $output_file } else {} end)
|
||||||
|
+ (if $bytes != "" then { bytes: ($bytes | tonumber? // null) } else {} end))
|
||||||
|
}'
|
||||||
|
)
|
||||||
|
done
|
||||||
|
|
||||||
|
jq -n \
|
||||||
|
--arg overall "$OVERALL_STATUS" \
|
||||||
|
--arg started "$STARTED_AT" \
|
||||||
|
--arg completed "$(date -Is)" \
|
||||||
|
--argjson duration "$SECONDS" \
|
||||||
|
--arg host "$(hostname)" \
|
||||||
|
--argjson phases "$phases_json" \
|
||||||
|
'{
|
||||||
|
schema_version: 1,
|
||||||
|
overall_status: $overall,
|
||||||
|
started_at: $started,
|
||||||
|
completed_at: $completed,
|
||||||
|
duration_seconds: $duration,
|
||||||
|
host: $host,
|
||||||
|
phases: $phases
|
||||||
|
}' > "$tmpfile"
|
||||||
|
|
||||||
|
mv "$tmpfile" "$STATUS_DIR/last-run.json"
|
||||||
|
chmod 0644 "$STATUS_DIR/last-run.json"
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── Outcome aggregation ────────────────────────────────────────────────────
|
||||||
|
# success → exit 0
|
||||||
|
# partial_failure → exit 75 (visible but distinguishable from hard failure)
|
||||||
|
# failed → exit 1
|
||||||
|
determine_exit_code() {
|
||||||
|
local critical_failure=false
|
||||||
|
local has_failure=false
|
||||||
|
local has_degraded=false
|
||||||
|
local name status
|
||||||
|
|
||||||
|
for name in "${PHASE_ORDER[@]}"; do
|
||||||
|
status="${PHASE_STATUS[$name]:-pending}"
|
||||||
|
case "$status" in
|
||||||
|
success|skipped) ;;
|
||||||
|
degraded) has_degraded=true ;;
|
||||||
|
failed)
|
||||||
|
has_failure=true
|
||||||
|
case "$name" in
|
||||||
|
postgres_dump) critical_failure=true ;; # losing the DB dump is catastrophic
|
||||||
|
esac
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
# Losing BOTH restic repos is also catastrophic.
|
||||||
|
if [ "${PHASE_STATUS[restic_nas]:-}" = "failed" ] \
|
||||||
|
&& [ "${PHASE_STATUS[restic_b2]:-}" = "failed" ]; then
|
||||||
|
critical_failure=true
|
||||||
|
fi
|
||||||
|
|
||||||
|
# NB: this function is called directly (not via $(...)); otherwise the
# OVERALL_STATUS assignments would be lost in the subshell and write_status_json
# would record "unknown", as would the final banner.
|
||||||
|
if [ "$critical_failure" = true ]; then
|
||||||
|
OVERALL_STATUS="failed"
|
||||||
|
EXIT_CODE=1
|
||||||
|
elif [ "$has_failure" = true ] || [ "$has_degraded" = true ]; then
|
||||||
|
OVERALL_STATUS="partial_failure"
|
||||||
|
EXIT_CODE=75
|
||||||
|
else
|
||||||
|
OVERALL_STATUS="success"
|
||||||
|
EXIT_CODE=0
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── Main sequence ──────────────────────────────────────────────────────────
|
||||||
|
run_phase postgres_dump dump_postgres_all
|
||||||
|
run_phase forgejo_dump dump_forgejo
|
||||||
|
run_phase forgejo_db_dump dump_forgejo_db
|
||||||
|
run_phase restic_nas restic_backup_to "$RESTIC_REPO_NAS" nas
|
||||||
|
run_phase restic_b2 restic_backup_to "$RESTIC_REPO_B2" b2
|
||||||
|
run_phase forget_nas restic_forget_nas
|
||||||
|
run_phase check_nas restic_check_nas
|
||||||
|
run_phase check_b2 restic_check_b2
|
||||||
|
|
||||||
|
determine_exit_code # sets OVERALL_STATUS + EXIT_CODE in this shell
|
||||||
|
write_status_json
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "════════════════════════════════════════════════════════════════"
|
||||||
|
echo " Server backup — finished $(date -Is)"
|
||||||
|
echo " Overall status: $OVERALL_STATUS (exit $EXIT_CODE)"
|
||||||
|
echo " Duration: ${SECONDS}s"
|
||||||
|
echo " Status file: $STATUS_DIR/last-run.json"
|
||||||
|
echo " Log file: $LOG_FILE"
|
||||||
|
echo "════════════════════════════════════════════════════════════════"
|
||||||
|
|
||||||
|
exit "$EXIT_CODE"
|
||||||
12
deploy/server-backup/server-backup.timer
Normal file
|
|
@ -0,0 +1,12 @@
|
||||||
|
[Unit]
|
||||||
|
Description=Daily server-wide backup (timer)
|
||||||
|
|
||||||
|
[Timer]
|
||||||
|
# Daily at 03:30 local. After ops-db-backup.timer (02:00) so the ops_dashboard
|
||||||
|
# pg_dump from /srv/ops/backups/ is fresh when restic picks it up.
|
||||||
|
OnCalendar=*-*-* 03:30:00
|
||||||
|
Persistent=true
|
||||||
|
RandomizedDelaySec=600
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=timers.target
|
||||||
25
deploy/server-backup/wrappers/read-status.sh
Normal file
|
|
@ -0,0 +1,25 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# Read /srv/backups/status/last-run.json. Returns "{}" if missing, so the
|
||||||
|
# dashboard can render an "unknown" state instead of erroring.
|
||||||
|
|
||||||
|
set -uo pipefail
|
||||||
|
|
||||||
|
STATUS_FILE="${STATUS_FILE:-/srv/backups/status/last-run.json}"
|
||||||
|
RESTORE_STATUS_FILE="${RESTORE_STATUS_FILE:-/srv/backups/status/last-restore-test.json}"
|
||||||
|
|
||||||
|
# We emit a small wrapper object with both files so the UI can render the
|
||||||
|
# server-backup status AND the most recent restore-test status from one call.
|
||||||
|
last_run='{}'
|
||||||
|
if [ -r "$STATUS_FILE" ]; then
|
||||||
|
last_run=$(cat "$STATUS_FILE")
|
||||||
|
fi
|
||||||
|
|
||||||
|
last_restore='null'
|
||||||
|
if [ -r "$RESTORE_STATUS_FILE" ]; then
|
||||||
|
last_restore=$(cat "$RESTORE_STATUS_FILE")
|
||||||
|
fi
|
||||||
|
|
||||||
|
jq -n \
|
||||||
|
--argjson last_run "$last_run" \
|
||||||
|
--argjson last_restore "$last_restore" \
|
||||||
|
'{ last_run: $last_run, last_restore_test: $last_restore }'
|
||||||
24
deploy/server-backup/wrappers/restic-check.sh
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# Run a light restic integrity check on the given repo.
|
||||||
|
# Usage: restic-check.sh nas|b2
|
||||||
|
|
||||||
|
set -uo pipefail
|
||||||
|
|
||||||
|
LABEL="${1:-}"
|
||||||
|
if [ "$LABEL" != "nas" ] && [ "$LABEL" != "b2" ]; then
|
||||||
|
echo "label must be nas or b2" >&2
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "${RESTIC_REPO_NAS:-}" ] && [ -r /etc/restic-backup.env ]; then
|
||||||
|
set -a; . /etc/restic-backup.env; set +a
|
||||||
|
fi
|
||||||
|
|
||||||
|
case "$LABEL" in
|
||||||
|
nas) REPO="${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set}" ;;
|
||||||
|
b2) REPO="${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set}" ;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
export RESTIC_PASSWORD_FILE="${RESTIC_PASSWORD_FILE:-/etc/restic-backup.password}"
|
||||||
|
|
||||||
|
restic -r "$REPO" check
|
||||||
39
deploy/server-backup/wrappers/restic-snapshots.sh
Normal file
|
|
@ -0,0 +1,39 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# List recent restic snapshots from a labelled repo. Output: JSON array.
|
||||||
|
# Usage: restic-snapshots.sh nas|b2
|
||||||
|
|
||||||
|
set -uo pipefail
|
||||||
|
|
||||||
|
LABEL="${1:-}"
|
||||||
|
if [ "$LABEL" != "nas" ] && [ "$LABEL" != "b2" ]; then
|
||||||
|
echo '{"error":"label must be nas or b2"}' >&2
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Load env (idempotent — systemd already loaded it for service contexts).
|
||||||
|
if [ -z "${RESTIC_REPO_NAS:-}" ] && [ -r /etc/restic-backup.env ]; then
|
||||||
|
set -a; . /etc/restic-backup.env; set +a
|
||||||
|
fi
|
||||||
|
|
||||||
|
case "$LABEL" in
|
||||||
|
nas) REPO="${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set}" ;;
|
||||||
|
b2) REPO="${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set}" ;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
export RESTIC_PASSWORD_FILE="${RESTIC_PASSWORD_FILE:-/etc/restic-backup.password}"
|
||||||
|
|
||||||
|
# Show last 30 snapshots, newest first, with the fields the UI needs.
|
||||||
|
restic -r "$REPO" snapshots --json 2>/dev/null \
|
||||||
|
| jq --arg repo "$LABEL" '
|
||||||
|
sort_by(.time) | reverse | .[0:30]
|
||||||
|
| map({
|
||||||
|
id: .id,
|
||||||
|
short_id: (.short_id // (.id[0:8])),
|
||||||
|
time: .time,
|
||||||
|
hostname: .hostname,
|
||||||
|
tags: (.tags // []),
|
||||||
|
paths: (.paths // []),
|
||||||
|
summary: (.summary // null),
|
||||||
|
repo: $repo
|
||||||
|
})
|
||||||
|
'
|
||||||
51
deploy/server-backup/wrappers/restic-stats.sh
Normal file
|
|
@ -0,0 +1,51 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# Repo stats: combines restic stats in two modes plus snapshot count.
|
||||||
|
# Output: JSON object with restore_size_bytes, raw_data_bytes, dedup_ratio.
|
||||||
|
# Usage: restic-stats.sh nas|b2
|
||||||
|
|
||||||
|
set -uo pipefail
|
||||||
|
|
||||||
|
LABEL="${1:-}"
|
||||||
|
if [ "$LABEL" != "nas" ] && [ "$LABEL" != "b2" ]; then
|
||||||
|
echo '{"error":"label must be nas or b2"}' >&2
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "${RESTIC_REPO_NAS:-}" ] && [ -r /etc/restic-backup.env ]; then
|
||||||
|
set -a; . /etc/restic-backup.env; set +a
|
||||||
|
fi
|
||||||
|
|
||||||
|
case "$LABEL" in
|
||||||
|
nas) REPO="${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set}" ;;
|
||||||
|
b2) REPO="${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set}" ;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
export RESTIC_PASSWORD_FILE="${RESTIC_PASSWORD_FILE:-/etc/restic-backup.password}"
|
||||||
|
|
||||||
|
# restore-size: total bytes if every file in every snapshot were re-extracted.
|
||||||
|
restore_json=$(restic -r "$REPO" stats --mode restore-size --json 2>/dev/null || echo '{}')
|
||||||
|
# raw-data: total unique blob bytes after dedup + compression.
|
||||||
|
raw_json=$(restic -r "$REPO" stats --mode raw-data --json 2>/dev/null || echo '{}')
|
||||||
|
# Snapshot count for the same repo.
|
||||||
|
snap_count=$(restic -r "$REPO" snapshots --json 2>/dev/null | jq 'length // 0')
|
||||||
|
|
||||||
|
jq -n \
|
||||||
|
--arg repo "$LABEL" \
|
||||||
|
--argjson restore "$restore_json" \
|
||||||
|
--argjson raw "$raw_json" \
|
||||||
|
--argjson snap_count "${snap_count:-0}" \
|
||||||
|
'
|
||||||
|
{
|
||||||
|
repo: $repo,
|
||||||
|
snapshots_count: $snap_count,
|
||||||
|
restore_size_bytes: ($restore.total_size // null),
|
||||||
|
restore_size_files: ($restore.total_file_count // null),
|
||||||
|
raw_data_bytes: ($raw.total_size // null),
|
||||||
|
raw_blob_count: ($raw.total_blob_count // null),
|
||||||
|
dedup_ratio: (
|
||||||
|
if ($restore.total_size != null) and ($raw.total_size != null) and ($raw.total_size > 0)
|
||||||
|
then (($restore.total_size | tonumber) / ($raw.total_size | tonumber))
|
||||||
|
else null
|
||||||
|
end
|
||||||
|
)
|
||||||
|
}'
|
||||||
18
deploy/server-backup/wrappers/trigger-backup.sh
Normal file
|
|
@ -0,0 +1,18 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# Trigger server-backup.service ad-hoc. Refuses if a run is already active
|
||||||
|
# (the script itself also flock's, but checking here gives a friendlier error).
|
||||||
|
|
||||||
|
set -uo pipefail
|
||||||
|
|
||||||
|
UNIT=server-backup.service
|
||||||
|
|
||||||
|
active=$(systemctl is-active "$UNIT" 2>/dev/null || true)
|
||||||
|
if [ "$active" = "active" ] || [ "$active" = "activating" ]; then
|
||||||
|
echo "ERROR: $UNIT is already $active — refusing to trigger." >&2
|
||||||
|
exit 75
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Use --no-block so we return immediately; the dashboard will poll via
|
||||||
|
# read-status.sh and tail the log to follow progress.
|
||||||
|
systemctl start --no-block "$UNIT"
|
||||||
|
echo "Triggered $UNIT. Follow with: journalctl -u $UNIT -f"
|
||||||
15
deploy/server-backup/wrappers/trigger-restore-test.sh
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# Run a non-destructive restore test against the NAS repo. Streams output to
|
||||||
|
# stdout (so the dashboard's StreamingTerminal can render it) and writes the
|
||||||
|
# structured result to /srv/backups/status/last-restore-test.json.
|
||||||
|
|
||||||
|
set -uo pipefail
|
||||||
|
|
||||||
|
REPO_LABEL="${1:-nas}"
|
||||||
|
|
||||||
|
if [ ! -x /srv/backups/scripts/restore-test.sh ]; then
|
||||||
|
echo "ERROR: /srv/backups/scripts/restore-test.sh not installed" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
exec /srv/backups/scripts/restore-test.sh "$REPO_LABEL"
|
||||||
562
docs/runbooks/server-backup.md
Normal file
|
|
@ -0,0 +1,562 @@
|
||||||
|
# Server-wide backup (restic + NAS + B2, dashboard-operated)

## Context

`scrum4me-srv` runs a Docker stack (Scrum4Me-web, worker-idea, ops-dashboard,
postgres-17, caddy) plus Forgejo. The current backup coverage, only a
`pg_dump ops_dashboard` to `/srv/ops/backups/` with 30 days of retention on a
single disk, leaves **everything else** uncovered: Scrum4Me data, Forgejo,
Caddy certs, Docker volumes and `/etc` are gone after fire, theft, ransomware
or a disk failure.

Goal: make the server **rebuildable** from an encrypted, deduplicated,
versioned backup with two independent copies, **NAS** locally and
**Backblaze B2** offsite, operated from the ops-dashboard. The existing
`backup_ops_db` flow keeps running; restic picks up its dump directory.

**Key design choices** (explained in detail in the review under
`/Users/janpetervisser/Development/Scrum4Me/docs/recommendations/server-backup-plan-review-2026-05-15.md`):

- **B2 Object Lock + a server key without `deleteFiles`**: an attacker with root on the server cannot remove B2 snapshots until the Object Lock retention expires. That is the ransomware protection. Pruning on B2 happens monthly from the laptop with a separate high-capability maintenance key.
- **Authoritative restore source = dumps, not live datadirs.** The Postgres and Forgejo data directories are explicitly `--exclude`'d from restic; `pg_dumpall` and `forgejo dump` plus a separate `pg_dump <forgejo_db>` are the authoritative sources.
- **Phase-based script with a structured status file.** One failing phase lets the rest continue; per-phase status / exit code / timestamps / error tail end up in `/srv/backups/status/last-run.json`, which the dashboard reads live (a sketch of reading that file follows after this list).
- **Single-instance lock** via `flock /run/server-backup.lock`: the UI button and the systemd timer cannot overlap each other.

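As a quick orientation on that status file: the sketch below assumes only the fields that `write_status_json` in `server-backup.sh` emits (`overall_status`, `started_at`, `duration_seconds` and a `phases` object keyed by phase name) and uses `jq` to summarise them.

```bash
# Summarise the most recent run.
jq '{overall_status, started_at, duration_seconds, phases: (.phases | keys)}' \
  /srv/backups/status/last-run.json

# List every phase that did not end in success or skipped.
jq -r '.phases | to_entries[]
       | select(.value.status != "success" and .value.status != "skipped")
       | "\(.key): \(.value.status) (exit \(.value.exit_code // "n/a"))"' \
  /srv/backups/status/last-run.json
```
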
## Prerequisites (verifiably met before execution)

- [ ] Bash, jq, restic, docker, gzip, flock on `$PATH` (`apt install restic jq` covers the two that are not installed by default).
- [ ] The Scrum4Me stack is running in Docker (`docker ps | grep scrum4me-postgres`).
- [ ] `/srv/scrum4me/compose/docker-compose.yml` exists (otherwise revise the exclude path in `server-backup.sh`).
- [ ] Time is in sync (`timedatectl status`); backups use ISO timestamps.

A quick check script for these items follows at the end of this section.

## Prerequisites (user input required)

- **NAS mount**: a path such as `/mnt/nas/backups` with enough space (initially ≥ 100 GB; restic deduplicates, so growth after that is slow).
- **Backblaze B2 account**: credit card registered; creating the bucket requires an operator action.
- **Restic password**: `openssl rand -hex 24`, stored in your password manager **and** in `/etc/restic-backup.password` on the server. Both copies are needed; if the password only exists in one place and that copy is lost, the repos are unreadable.
- **B2 maintenance key**: stored only on your laptop, in the password manager. Never on the server.

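A minimal sketch of that check, assuming the container name `scrum4me-postgres` from the env example and a systemd recent enough for `timedatectl show`:

```bash
for t in bash jq restic docker gzip flock; do
  command -v "$t" >/dev/null 2>&1 || echo "missing on PATH: $t"
done
docker ps --format '{{.Names}}' | grep -qx scrum4me-postgres \
  && echo "scrum4me-postgres: running" || echo "scrum4me-postgres: NOT running"
[ -f /srv/scrum4me/compose/docker-compose.yml ] || echo "compose file missing"
echo "NTP synchronised: $(timedatectl show -p NTPSynchronized --value)"
```
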
---

## Part A — Preparation on `scrum4me-srv`

To be executed as `root` on `scrum4me-srv`.

1. **Install tools**

```bash
sudo apt update
sudo apt install -y restic jq
restic version
```

2. **Create directories**

```bash
sudo mkdir -p /srv/backups/scripts /srv/backups/logs /srv/backups/status \
              /var/backups/databases
sudo chmod 0750 /srv/backups/logs /srv/backups/status
```

3. **Create the NAS mount**: a new mount on `/mnt/nas/backups` that points to
the `backups` subdirectory of the existing `ssd` share on the NAS. No new
Samba share on the NAS is needed; the cifs `prefixpath` option mounts the
subpath directly.

```bash
# 1. Create the subdir on the NAS via the existing ssd mount
sudo mkdir -p /mnt/nas/ssd/backups

# 2. cifs-utils installed? (it already is for the other /mnt/nas shares)
dpkg -l cifs-utils 2>/dev/null | grep -q '^ii' || sudo apt install -y cifs-utils

# 3. Add the fstab entry: uid/gid=0 + mode 0700/0600 = root-only
sudo tee -a /etc/fstab <<'EOF'
//192.168.0.155/ssd /mnt/nas/backups cifs credentials=/etc/samba/credentials-nas,uid=0,gid=0,iocharset=utf8,vers=3.0,nofail,_netdev,x-systemd.automount,prefixpath=backups,file_mode=0600,dir_mode=0700 0 0
EOF

# 4. systemd reload + mount
sudo systemctl daemon-reload
sudo mount /mnt/nas/backups
mountpoint -q /mnt/nas/backups && echo "OK" || echo "FAIL"
df -h /mnt/nas/backups
```

`_netdev,x-systemd.automount,nofail` makes the mount come back automatically
after a reboot without hanging the boot if the NAS happens to be unavailable.
The `RequiresMountsFor=/mnt/nas/backups` in `server-backup.service` additionally
triggers the automount for the timer run.

4. **Generate and place the restic password**

```bash
sudo sh -c 'openssl rand -hex 24 > /etc/restic-backup.password'
sudo chmod 0400 /etc/restic-backup.password
sudo chown root:root /etc/restic-backup.password
```

**Copy the same string into your password manager** before you continue. A generated password that only lives on the server is not a password; it is a ticking time bomb.

A combined sanity check for Part A follows below.

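A minimal sketch of that check (expected results noted in the comments):

```bash
mountpoint -q /mnt/nas/backups && echo "NAS mount: ok" || echo "NAS mount: MISSING"
# Expect "400 root:root" for the password file.
stat -c '%a %U:%G  %n' /etc/restic-backup.password
ls -ld /srv/backups/scripts /srv/backups/logs /srv/backups/status /var/backups/databases
```
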
---

## Part B — Setting up Backblaze B2 (Object Lock + scoped keys)

Goal: a bucket whose **existing** snapshots cannot be deleted by the server, plus two separate keys: one for the server (read/write only) and one for the operator (all rights, used only from the laptop).

1. **Create the bucket** in the Backblaze UI or via the `b2` CLI:
- Name: `scrum4me-srv-backup` (or a variant; record it in `/etc/restic-backup.env`).
- Privacy: **Private**.
- **File Lock: Enabled, Governance mode, default retention = 30 days**. Governance means a key with `bypassGovernance` can override locks; we grant that capability **only** to the maintenance key.
- Lifecycle rules: **none** (lifecycle rules conflict with Object Lock).
- Encryption: leave server-side encryption on (the B2 default).

2. **Create the server key** (goes into `/etc/restic-backup.env` on the server):

```bash
# via the b2 CLI:
b2 application-key create \
  --bucket scrum4me-srv-backup \
  --name-prefix scrum4me-srv \
  server-backup-key \
  listBuckets,listFiles,readFiles,writeFiles
```

Store the output (`keyID` + `applicationKey`). Verify in the UI that the key has **no** `deleteFiles`, **no** `deleteKeys` and **no** `bypassGovernance`.

3. **Create the maintenance key** (goes into your password manager on the laptop):

```bash
b2 application-key create \
  --bucket scrum4me-srv-backup \
  scrum4me-srv-maintenance-key \
  listBuckets,listFiles,readFiles,writeFiles,deleteFiles,bypassGovernance
```

This key **never** goes onto the server. Use it only for `restic forget --prune` from your laptop (see Part H).

4. **Create `/etc/restic-backup.env`**

```bash
sudo cp /srv/scrum4me/ops-dashboard/deploy/server-backup/restic-backup.env.example \
        /etc/restic-backup.env
sudo chmod 0600 /etc/restic-backup.env
sudo chown root:root /etc/restic-backup.env
sudo nano /etc/restic-backup.env
```

Fill in: `RESTIC_REPO_NAS`, `RESTIC_REPO_B2`, `B2_ACCOUNT_ID` (= keyID), `B2_ACCOUNT_KEY` (= applicationKey). The Forgejo fields come in Part F. A quick way to confirm the variables are set follows below.

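A minimal sketch of that confirmation; it only reports whether each variable is non-empty, never the value itself:

```bash
sudo bash -c '
  set -a; . /etc/restic-backup.env; set +a
  for v in RESTIC_REPO_NAS RESTIC_REPO_B2 B2_ACCOUNT_ID B2_ACCOUNT_KEY; do
    if [ -n "${!v:-}" ]; then echo "$v: set"; else echo "$v: MISSING"; fi
  done
'
```
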
**Threat model**

| Threat | Covered by this design? |
|---|---|
| Disk failure / corruption | ✓ NAS + B2 = 2× redundancy |
| Fire / theft / water damage | ✓ B2 is offsite |
| Ransomware on the server | ✓ B2 Object Lock: existing snapshots are immutable until retention expires |
| Server compromise (root) | ✓ the server key cannot delete B2 files |
| Laptop and server compromised simultaneously | ✗ the maintenance key is then also in the attacker's hands; no defence |
| Backblaze account compromise | ✗ out of scope; mitigate with 2FA and an audit trail |
| Loss of the restic password | ✗ repos become unreadable; keep the password in your password manager as well |

---

## Part C — Initialising the restic repos

1. **NAS repo init**

```bash
sudo -E bash -c '
  set -a; . /etc/restic-backup.env; set +a
  export RESTIC_PASSWORD_FILE=/etc/restic-backup.password
  restic -r "$RESTIC_REPO_NAS" init
'
```

2. **B2 repo init**

```bash
sudo -E bash -c '
  set -a; . /etc/restic-backup.env; set +a
  export RESTIC_PASSWORD_FILE=/etc/restic-backup.password
  restic -r "$RESTIC_REPO_B2" init
'
```

3. **Retention dry run**: check that the forget policy is not overly aggressive on a repo that only holds its first snapshot. (On a fresh repo `forget` removes nothing, but this proves that all paths and auth work.)

```bash
sudo -E bash -c '
  set -a; . /etc/restic-backup.env; set +a
  export RESTIC_PASSWORD_FILE=/etc/restic-backup.password
  restic -r "$RESTIC_REPO_NAS" forget --keep-daily 7 --keep-weekly 4 --keep-monthly 12 --dry-run
'
```

A check that both repos open with the password file follows below.

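A small sketch for that check: `restic cat config` prints the repository configuration and fails if the password or the B2 credentials are wrong, which makes it a cheap smoke test right after init.

```bash
sudo -E bash -c '
  set -a; . /etc/restic-backup.env; set +a
  export RESTIC_PASSWORD_FILE=/etc/restic-backup.password
  restic -r "$RESTIC_REPO_NAS" cat config >/dev/null && echo "NAS repo: opens"
  restic -r "$RESTIC_REPO_B2" cat config >/dev/null && echo "B2 repo: opens"
'
```
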
---

## Part D — Installing the scripts and systemd units

1. **Copy the scripts**

```bash
sudo cp /srv/scrum4me/ops-dashboard/deploy/server-backup/server-backup.sh /srv/backups/scripts/
sudo cp /srv/scrum4me/ops-dashboard/deploy/server-backup/restore-test.sh /srv/backups/scripts/
sudo chmod 0750 /srv/backups/scripts/*.sh
sudo chown root:root /srv/backups/scripts/*.sh
```

2. **Copy the systemd units**

```bash
sudo cp /srv/scrum4me/ops-dashboard/deploy/server-backup/server-backup.service /etc/systemd/system/
sudo cp /srv/scrum4me/ops-dashboard/deploy/server-backup/server-backup.timer /etc/systemd/system/
sudo systemctl daemon-reload
sudo systemctl enable --now server-backup.timer
```

3. **Verify the timer**

```bash
systemctl list-timers | grep server-backup
```

Shows the next run tomorrow at 03:30 (plus a randomized delay of up to 10 minutes). An optional lint of the unit files follows below.

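A sketch of that optional lint; `systemd-analyze verify` flags unknown directives and missing executables before the first real run.

```bash
systemd-analyze verify /etc/systemd/system/server-backup.service \
                       /etc/systemd/system/server-backup.timer
systemctl cat server-backup.service | head -n 20
```
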
---

## Part E — First manual run + status-file verification

1. **Trigger**

```bash
sudo systemctl start server-backup.service
```

2. **Follow live**

```bash
journalctl -u server-backup.service -f
```

Expected: 8 phases (postgres_dump, forgejo_dump, forgejo_db_dump, restic_nas, restic_b2, forget_nas, check_nas, check_b2), each with a `─── phase: X ───` start line and a `─── end X (exit=N, status=S)` end line.

3. **Status file**

```bash
sudo jq . /srv/backups/status/last-run.json
```

Expected: `overall_status: "success"` and every non-Forgejo phase `success` (the Forgejo phases may be `skipped` if Forgejo has not been configured yet). A scriptable variant of this check follows below.

4. **Snapshots**

```bash
sudo -E bash -c '
  set -a; . /etc/restic-backup.env; set +a
  export RESTIC_PASSWORD_FILE=/etc/restic-backup.password
  restic -r "$RESTIC_REPO_NAS" snapshots
  restic -r "$RESTIC_REPO_B2" snapshots
'
```

Both show one snapshot with the tags `scheduled` and `host=scrum4me-srv`.

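A minimal sketch of the scriptable check (field name per `write_status_json`); handy for a cron job or a shell alias.

```bash
if jq -e '.overall_status == "success"' /srv/backups/status/last-run.json >/dev/null; then
  echo "last run: success"
else
  echo "last run: $(jq -r '.overall_status // "unknown"' /srv/backups/status/last-run.json)"
fi
```
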
---

## Deel F — Forgejo subplan
|
||||||
|
|
||||||
|
Vóór de eerste full-backup run: inventariseer Forgejo en bevestig (of corrigeer) de defaults in `restic-backup.env`. Bij twijfel — zet `FORGEJO_CONTAINER=` (leeg) zodat de Forgejo-fases als `skipped` markeren tot je verifieerd hebt.
|
||||||
|
|
||||||
|
### F1. Inventarisatie
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker ps --format 'table {{.Names}}\t{{.Image}}\t{{.Status}}' | grep -i forgejo
|
||||||
|
```
|
||||||
|
|
||||||
|
Noteer:
|
||||||
|
- container-naam (vermoedelijk `forgejo`).
|
||||||
|
- image-versie (`codeberg.org/forgejo/forgejo:<versie>`).
|
||||||
|
|
||||||
|
### F2. Configpaden in de container
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker inspect scrum4me-forgejo --format '{{ range .Mounts }}{{ .Source }} -> {{ .Destination }}{{ println }}{{ end }}'
|
||||||
|
docker exec scrum4me-forgejo ls -la /data/gitea/conf/app.ini
|
||||||
|
```
|
||||||
|
|
||||||
|
Standaard: `app.ini` in `/data/gitea/conf/app.ini` binnen de container. Wijkt dat af, pas `FORGEJO_CONFIG=` in `/etc/restic-backup.env` aan.
|
||||||
|
|
||||||
|
### F3. DB-koppeling controleren
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker exec scrum4me-forgejo grep -E '^DB_TYPE|^HOST|^NAME|^USER' /data/gitea/conf/app.ini
|
||||||
|
```
|
||||||
|
|
||||||
|
- `DB_TYPE=postgres` met `NAME=forgejo` ⇒ zet `FORGEJO_DB_NAME=forgejo`, en als de Postgres-container niet `scrum4me-postgres` is: `FORGEJO_DB_CONTAINER=...`.
|
||||||
|
- `DB_TYPE=sqlite` ⇒ laat `FORGEJO_DB_NAME=` leeg; SQLite-DB komt mee in `forgejo dump`.
|
||||||
|
|
||||||
|
### F4. Dump-strategie
|
||||||
|
|
||||||
|
Het script doet **drie** dingen voor Forgejo:
|
||||||
|
|
||||||
|
1. `forgejo dump --skip-db -c <config> --type zip -f -` — codebases, attachments, hooks, LFS metadata, etc.
|
||||||
|
2. Separate `pg_dump <forgejo_db>` — autoritatieve DB-restore-bron (Forgejo docs documenteren bekende import-issues bij DB-inhoud uit `forgejo dump`, daarom `--skip-db`).
|
||||||
|
3. Live datadirs (`/srv/forgejo/data/git`, `/srv/forgejo/data/lfs`, `/srv/forgejo/data/queues`) worden **niet** door restic gekopieerd — dat zijn live B-Trees waar een file-level kopie inconsistent zou zijn.
|
||||||
|
|
||||||
|
### F5. Restore-test in geïsoleerde compose-stack
|
||||||
|
|
||||||
|
Vóór je de Forgejo-restore voor real nodig hebt: test hem een keer. Maak een tijdelijke directory met een verse Forgejo + Postgres, voer de dumps in, draai `forgejo doctor check --all`.
|
||||||
|
|
||||||
|
```bash
# Minimal restore-test recipe (fill in based on your Forgejo version)
RESTORE_DIR=/tmp/forgejo-restore-test
mkdir -p "$RESTORE_DIR"
cd "$RESTORE_DIR"

# 1. compose stack with a blank Forgejo + Postgres
cat > docker-compose.yml <<'YAML'
services:
  forgejo:
    image: codeberg.org/forgejo/forgejo:<fill-in-version>
    ports: [ "127.0.0.1:3000:3000" ]   # needed so the health check in step 4 works from the host
    volumes: [ "./forgejo-data:/data" ]
    depends_on: [ db ]
  db:
    image: postgres:17
    environment:
      POSTGRES_USER: forgejo
      POSTGRES_PASSWORD: testtest
      POSTGRES_DB: forgejo
    volumes: [ "./db-data:/var/lib/postgresql/data" ]
YAML

docker compose up -d

# 2. restore the DB dump
gunzip < /var/backups/databases/forgejo-db-$(date +%F).sql.gz \
  | docker compose exec -T db psql -U forgejo forgejo

# 3. unpack the Forgejo dump into the data volume
docker compose stop forgejo
unzip /var/backups/databases/forgejo-$(date +%F).zip -d forgejo-data/
docker compose start forgejo

# 4. health checks
docker compose exec forgejo forgejo doctor check --all
curl -fsS http://localhost:3000/api/v1/version
```
|
||||||
|
|
||||||
|
If `forgejo doctor check --all` passes and the `/api/v1/version` endpoint answers, your Forgejo restore works. Tear-down: `docker compose down -v && rm -rf "$RESTORE_DIR"`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Deel G — Restore-procedure in productie
|
||||||
|
|
||||||
|
### G1. Files uit een snapshot terughalen
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Snapshot kiezen
|
||||||
|
sudo -E bash -c '
|
||||||
|
set -a; . /etc/restic-backup.env; set +a
|
||||||
|
export RESTIC_PASSWORD_FILE=/etc/restic-backup.password
|
||||||
|
restic -r "$RESTIC_REPO_NAS" snapshots
|
||||||
|
'
|
||||||
|
|
||||||
|
# Restore (latest, alleen /etc — voorbeeld)
|
||||||
|
sudo -E bash -c '
|
||||||
|
set -a; . /etc/restic-backup.env; set +a
|
||||||
|
export RESTIC_PASSWORD_FILE=/etc/restic-backup.password
|
||||||
|
restic -r "$RESTIC_REPO_NAS" restore latest --target /tmp/restore --include /etc
|
||||||
|
'
|
||||||
|
```
|
||||||
|
|
||||||
|
### G2. Postgres herstellen (Scrum4Me-cluster)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Stop de apps die met de DB praten
|
||||||
|
docker compose -f /srv/scrum4me/compose/docker-compose.yml stop scrum4me-web ops-dashboard worker-idea
|
||||||
|
|
||||||
|
# Restore dumpall (drop + recreate alle DBs in de cluster — vandaar --clean --if-exists in de dump)
|
||||||
|
gunzip < /var/backups/databases/postgres-2026-05-15.sql.gz \
|
||||||
|
| docker exec -i scrum4me-postgres psql -U scrum4me
|
||||||
|
|
||||||
|
# Apps weer aan
|
||||||
|
docker compose -f /srv/scrum4me/compose/docker-compose.yml start scrum4me-web ops-dashboard worker-idea
|
||||||
|
```
|
||||||
|
|
||||||
|
For a partial restore (a single database): extract that database's section from the dumpall text — note that `pg_restore` only reads custom-format dumps, so for this plain-SQL dumpall you need text extraction (`awk`/`sed`) instead. For restoring only `ops_dashboard`, the existing [recovery.md](recovery.md) section 2a remains the primary procedure. A hedged extraction sketch follows below.
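
A minimal sketch, assuming the dumpall text separates the per-database sections with plain `\connect forgejo` markers (verify against your actual dump; cluster-wide objects such as roles at the top of the file are not included):

```bash
gunzip < /var/backups/databases/postgres-2026-05-15.sql.gz \
  | awk '/^\\connect forgejo$/{keep=1} /^\\connect / && !/ forgejo$/{keep=0} keep' \
  > /tmp/forgejo-only.sql
# review the file, then apply it to the target database only:
docker exec -i scrum4me-postgres psql -U scrum4me -d forgejo < /tmp/forgejo-only.sql
```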
|
||||||
|
|
||||||
|
### G3. Forgejo herstellen
|
||||||
|
|
||||||
|
Follow [F5](#f5-restore-test-in-geïsoleerde-compose-stack), but with the real Forgejo compose stack and without the tear-down. Important: stop the live Forgejo first, replace `/srv/forgejo/data` completely (note from the addendum: on this server the Forgejo data actually lives in the named volume `forgejo_forgejo-data`, not under `/srv/forgejo/data`), restore the DB, start Forgejo, then run `forgejo doctor check --all`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Deel H — Maintenance vanaf de laptop (maandelijks)
|
||||||
|
|
||||||
|
Goal: actually prune B2 snapshots that fall outside the retention policy, plus a deeper integrity check that would be too expensive to run on the server.
|
||||||
|
|
||||||
|
1. **Voorbereiding** (eenmalig op laptop):
|
||||||
|
```bash
|
||||||
|
brew install restic jq
|
||||||
|
# Maintenance-key uit password manager
|
||||||
|
export B2_ACCOUNT_ID=<maintenance-key-id>
|
||||||
|
export B2_ACCOUNT_KEY=<maintenance-app-key>
|
||||||
|
export RESTIC_REPOSITORY=b2:scrum4me-srv-backup:scrum4me-srv
|
||||||
|
read -rs RESTIC_PASSWORD < /dev/tty # uit password manager
|
||||||
|
export RESTIC_PASSWORD
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Prune-check** (eerst dry-run om te zien wat er zou gebeuren):
|
||||||
|
```bash
|
||||||
|
restic forget --keep-daily 7 --keep-weekly 4 --keep-monthly 12 --dry-run
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Daadwerkelijke prune** (vereist `bypassGovernance` capability — alleen via maintenance-key):
|
||||||
|
```bash
|
||||||
|
restic forget --keep-daily 7 --keep-weekly 4 --keep-monthly 12 --prune
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Diepere check**:
|
||||||
|
```bash
|
||||||
|
restic check --read-data-subset=10%
|
||||||
|
```
|
||||||
|
B2-bandbreedte: 10% van een 50 GB repo = 5 GB download, B2-prijs ~ $0.05 (gratis 1 GB/dag).
|
||||||
|
|
||||||
|
5. **Clean up the environment** — close the shell, or run `unset RESTIC_PASSWORD B2_ACCOUNT_ID B2_ACCOUNT_KEY` (a glob like `B2_ACCOUNT_*` is not expanded by `unset`).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Deel I — Integriteits-schedule (samenvatting)
|
||||||
|
|
||||||
|
| Cadans | Wie | Wat | Waarom |
|
||||||
|
|---|---|---|---|
|
||||||
|
| Dagelijks 03:30 | server (systemd timer) | `restic check` op beide repos | snelle metadata-/structure-validatie |
|
||||||
|
| Wekelijks (zondag) | server (zelfde script) | `restic check --read-data-subset=2.5%` op NAS, `1%` op B2 | sample-based data-integrity |
|
||||||
|
| Maandelijks | operator (laptop) | `restic check --read-data-subset=10%` + `forget --prune` op B2 | diepere check + prune (B2 server-key heeft geen delete-rechten) |
|
||||||
|
| Maandelijks | operator (server) | `/srv/backups/scripts/restore-test.sh nas` + handmatige Forgejo-stack-restore (F5) | end-to-end restore-verificatie |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Te wijzigen / nieuw aangemaakte bestanden
|
||||||
|
|
||||||
|
**Op `scrum4me-srv`** (alleen via deploy uit deze repo, geen handmatige edits):
|
||||||
|
|
||||||
|
- `/srv/backups/scripts/server-backup.sh` (uit `deploy/server-backup/`).
|
||||||
|
- `/srv/backups/scripts/restore-test.sh` (idem).
|
||||||
|
- `/etc/systemd/system/server-backup.service`, `server-backup.timer` (uit `deploy/server-backup/`).
|
||||||
|
- `/etc/restic-backup.env` — secrets, niet in repo.
|
||||||
|
- `/etc/restic-backup.password` — secret, niet in repo.
|
||||||
|
|
||||||
|
**In deze repo (`ops-dashboard`)**, nieuw aangemaakt:
|
||||||
|
|
||||||
|
- `deploy/server-backup/*` — alle deploy-artefacten.
|
||||||
|
- `docs/runbooks/server-backup.md` — dit document.
|
||||||
|
- Later (Fase 3+4): `ops-agent/commands.yml.example`-uitbreiding, `ops-agent/flows.example/server_backup_*.yml`, `app/settings/backups/_components/server-backup-section.tsx`.
|
||||||
|
|
||||||
|
**Op de laptop**, in password manager:
|
||||||
|
|
||||||
|
- restic-wachtwoord (identiek aan `/etc/restic-backup.password`).
|
||||||
|
- B2 maintenance-key (keyID + applicationKey).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Veelvoorkomende fouten
|
||||||
|
|
||||||
|
| Symptoom | Oorzaak | Fix |
|
||||||
|
|---|---|---|
|
||||||
|
| `unable to open repository ... no such file or directory` (NAS) | NAS-mount weg na reboot | `mountpoint -q /mnt/nas/backups` — fix `fstab`/`autofs`; herstart `server-backup.service` |
|
||||||
|
| `unable to open repository ... AccessDenied` (B2) | server-key heeft verkeerde capabilities of bucket-prefix | check `b2 application-key list`; capabilities moeten `listBuckets,listFiles,readFiles,writeFiles` zijn, name-prefix moet matchen |
|
||||||
|
| `Object Lock In Place` bij `forget --prune` op B2 | server probeert ten onrechte B2 te prunen (heeft die capability niet) | het script prune'd alleen NAS — als deze fout opduikt: handmatige `restic forget` op B2 gedraaid (zou off-server moeten); gebruik maintenance-key |
|
||||||
|
| `restic snapshot tag scheduled` ontbreekt in UI | run heeft `--tag scheduled` niet meegekregen | check script — `restic_backup_to` zet beide tags hardcoded |
|
||||||
|
| `forgejo dump` faalt met permission denied | container-user niet `git` | pas `dump_forgejo` aan: `docker exec -u <correct-user>` |
|
||||||
|
| restic exit code 3 in statusfile | sommige files waren niet leesbaar tijdens snapshot (open file lock) | non-fataal — log toont welke files; meestal logs of sockets; eventueel toevoegen aan `RESTIC_EXCLUDES` |
|
||||||
|
| `another server-backup is already running` exit 75 | timer and UI button fired at the same time, or a previous run is hung | `systemctl status server-backup.service`; if hung: `systemctl kill server-backup.service` and clear the lockfile `/run/server-backup.lock` (see the sketch below this table) |
|
||||||
|
| `last-run.json` niet geüpdatet | script gecrashed vóór `write_status_json` | `journalctl -u server-backup.service --since=today` — meestal env-file of password-file probleem |
|
||||||
|
| Postgres-datadir in restic snapshot terug te zien | excludes verkeerd geconfigureerd | check `RESTIC_EXCLUDES` in script — moet `/srv/scrum4me/postgres` bevatten |
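
A minimal sketch of the hung-run cleanup referenced in the `exit 75` row above — only clear the lockfile once you are sure no run is still active:

```bash
systemctl status server-backup.service        # is a run genuinely still active?
sudo systemctl kill server-backup.service     # only if it is actually hung
sudo rm -f /run/server-backup.lock            # then remove the stale lockfile
sudo systemctl start server-backup.service    # optional: kick off a fresh run
```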
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Verificatie (end-to-end)
|
||||||
|
|
||||||
|
1. **Eerste run slaagt** — Deel E groen, statusfile `overall_status: success`.
|
||||||
|
2. **Snapshots zichtbaar** op beide repos via `restic snapshots`.
|
||||||
|
3. **Restore-test slaagt** — `restore-test.sh nas` → `overall_status: success` in `/srv/backups/status/last-restore-test.json`, alle assertions `ok`.
|
||||||
|
4. **Forgejo-restore-stack** (F5) — `forgejo doctor check --all` rond zonder errors, `/api/v1/version` antwoordt.
|
||||||
|
5. **Reboot test** — reboot the server; `systemctl list-timers` shows `server-backup.timer` with a next run scheduled, and the NAS mount comes back automatically (commands sketched after this list).
|
||||||
|
6. **Failure-injectie**:
|
||||||
|
- NAS unmount → script eindigt met `overall_status: partial_failure`, `phases.restic_nas.status: failed`, B2-snapshot wel aanwezig, systemd exit 75.
|
||||||
|
- B2-key tijdelijk ongeldig → `phases.restic_b2.status: failed`, NAS-snapshot wel, exit 75.
|
||||||
|
- Beide repos onbereikbaar → `overall_status: failed`, exit 1.
|
||||||
|
7. **Concurrency** — tweede `systemctl start server-backup.service` tijdens lopende run → exit 75, log toont `another server-backup is already running`.
|
||||||
|
8. **Maandelijkse maintenance** — eerst keer succesvol uitgevoerd vanaf laptop, B2 `forget --prune` slaagt zonder Object Lock-fouten.
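
A minimal sketch of the reboot-test checks from item 5 — both commands appear elsewhere in this runbook, only the combination is new:

```bash
systemctl list-timers server-backup.timer              # next run must be scheduled
mountpoint -q /mnt/nas/backups && echo "NAS mount OK"  # automount came back after reboot
```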
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
# Addendum — uitvoering 2026-05-15
|
||||||
|
|
||||||
|
First install on `scrum4me-srv`. At the start of this session only `restic` + `jq`
were installed — Parts A2 through E plus the restore test were run in this session.
|
||||||
|
|
||||||
|
## Vastgestelde topologie en concrete waarden
|
||||||
|
|
||||||
|
| Plan-placeholder | Werkelijkheid op `scrum4me-srv` |
|
||||||
|
|---|---|
|
||||||
|
| Repo-pad in deploy-stappen | `/srv/scrum4me/ops-dashboard/` (runbook had `/srv/ops/repos/...` — bestaat niet op deze server). Ook in `recovery.md` en `deploy/ops-dashboard-updater/update.sh` staan nog `/srv/ops/repos`-verwijzingen — losse cleanup-taak. |
|
||||||
|
| NAS-mount | `/mnt/nas/backups` via cifs-`prefixpath=backups` op `//192.168.0.155/ssd`. **Geen aparte Samba-share aangemaakt** — de subdir `backups/` op de bestaande `ssd`-share is genoeg dankzij `prefixpath`. fstab-regel: `uid=0,gid=0,prefixpath=backups,file_mode=0600,dir_mode=0700,_netdev,x-systemd.automount,nofail`. |
|
||||||
|
| B2-bucket-naam | **`ScrumForMeSrvBackup`** (PascalCase) — niet de in het plan voorgestelde `scrum4me-srv-backup`. `RESTIC_REPO_B2=b2:ScrumForMeSrvBackup:scrum4me-srv` (case-sensitive). |
|
||||||
|
| B2-bucket-instellingen | Object Lock = Enabled, Mode = Governance, Default Retention = 30 days. Geen lifecycle rules. |
|
||||||
|
| B2 server-key capabilities | `listBuckets,listFiles,readFiles,writeFiles` (gemaakt via webportal als "Read and Write" — daar zat de juiste capability-set automatisch in). Geen `deleteFiles`, geen `bypassGovernance`. |
|
||||||
|
| B2 storage-cap | $10/maand. Bij 16 GB op B2 (zie cijfers onder) is dat $0,10/maand storage — ruim binnen de cap. |
|
||||||
|
| B2 maintenance-key | **Nog niet aangemaakt** — pas nodig bij eerste maandelijkse prune. Aanmaken vanaf laptop, met `Allow file deletes` en `Allow bypass governance retention` aangevinkt. |
|
||||||
|
| Forgejo-container | **`scrum4me-forgejo`** (image `codeberg.org/forgejo/forgejo:11`). |
|
||||||
|
| Forgejo `git`-user | uid 1000, bestaat ✓ — `docker exec -u git scrum4me-forgejo` werkt. |
|
||||||
|
| Forgejo data-locatie | docker named volume `forgejo_forgejo-data` (NIET `/srv/forgejo/data/...` zoals het runbook nog noemt — die paden bestaan niet maar de excludes zijn no-ops). |
|
||||||
|
| Forgejo-DB | rol `forgejo`, db `forgejo`, in `scrum4me-postgres`-container (zelfde Postgres als ops_dashboard, scrum4me). |
|
||||||
|
| Postgres data live | bind-mount `/srv/scrum4me/postgres/` (excluded). |
|
||||||
|
| restic-password locatie | `/etc/restic-backup.password` (mode 0400, root:root). Óók in passwordmanager onder "restic — scrum4me-srv". |
|
||||||
|
| systemd-timer | `server-backup.timer` enabled, dagelijks 03:30 + max 10 min randomized delay. |
|
||||||
|
|
||||||
|
## Changes to the code merged in commit `ab87c0f`

Over the course of this session four small fixes turned out to be needed — all of them in this PR:
|
||||||
|
|
||||||
|
- **`server-backup.sh`**: removed `--skip-db` from `dump_forgejo`. Forgejo 11.x no longer has that flag (dropped after the Gitea fork). Output of `forgejo dump --help`: only `--skip-repository|-log|-custom-dir|-lfs-data|-attachment-data|-package-data|-index|-repo-archives`. The DB is now included in the zip — redundant with the `forgejo_db_dump` phase, but harmless.
- **`server-backup.sh`**: subshell bug in `determine_exit_code` — it was invoked as `EXIT_CODE=$(determine_exit_code)`, so `OVERALL_STATUS=...` was set inside the subshell and never reached the parent. Result: `last-run.json` always recorded `overall_status: "unknown"`, and the closing banner likewise. Fix: a direct call that sets both `OVERALL_STATUS` and `EXIT_CODE` in the parent shell (a minimal illustration of the pitfall follows after this list).
- **`restore-test.sh`**: performed a **full restore** without an `--include` filter — it tried to write 476 GiB to `/tmp` (a 7.6 GB tmpfs): ENOSPC, 3.3M errors, and every assertion "missing". Fixed with `--include` limited to the assertion paths (`/etc/restic-backup.env`, `/srv/scrum4me/{compose/docker-compose.yml,caddy/Caddyfile}`, `/var/backups/databases`). The restore is now 59 MB in 10 s.
- **`server-backup.service`**: added `RequiresMountsFor=/mnt/nas/backups` (triggers the cifs automount when the timer fires), extended `ReadWritePaths` with `/mnt/nas/backups` (`ProtectSystem=strict` otherwise blocks writes to the NAS repo), and corrected the `Documentation=` URL. The pre-existing `RuntimeMaxSec= has no effect with Type=oneshot` warning is cosmetic and was left alone.
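
A minimal illustration of the subshell pitfall behind the second fix — function and variable names here are stand-ins, not the actual script code:

```bash
status_check() { OVERALL_STATUS="success"; echo 0; }

EXIT_CODE=$(status_check)          # command substitution runs in a subshell…
echo "${OVERALL_STATUS:-unknown}"  # …so this still prints "unknown"

status_check > /dev/null           # direct call: the assignment lands in the parent shell
echo "$OVERALL_STATUS"             # prints "success"
```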
|
||||||
|
|
||||||
|
## Onderweg geleerde quirks
|
||||||
|
|
||||||
|
- **`sudo -E` werkt niet op deze sudoers** — geeft warning `preserving the entire environment is not supported`. Niet erg: de scripts sourcen het env-file binnen de sudo'd shell zelf (`sudo bash -c '. /etc/restic-backup.env; ...'`), dus `-E` is overbodig.
|
||||||
|
- **B2 401-error op `b2_list_buckets` was misleidend** — keys waren prima (`b2_authorize_account` werkte), het probleem was dat `RESTIC_REPO_B2` een andere bucket-naam had dan waar de key voor scoped is. B2 geeft dan 401 i.p.v. 403/404.
|
||||||
|
- **B2 cap-error verschijnt als `403: Cannot upload files, storage cap exceeded`** — niet 402/payment-related. Cap kan op nul staan voor accounts die nog nooit een bucket vol hadden; verhogen via *Account → Caps & Alerts → Storage Cap*.
|
||||||
|
- **47× dedup** op de eerste snapshot — vooral door de drie git-repos in `/srv/scrum4me/repos/` plus de ~12k worker-log files in `/srv/scrum4me/worker-logs/idea/runs/` met veel overlap.
|
||||||
|
|
||||||
|
## Eerste-run-cijfers
|
||||||
|
|
||||||
|
```
|
||||||
|
NAS: 16 GB op disk (du)
|
||||||
|
Restore-size: 974 GiB (over 2 snapshots; ~487 GiB per snap)
|
||||||
|
Raw-data: 20.6 GiB (post-dedup)
|
||||||
|
On-disk: 15.6 GiB (post-compressie, 1.32x)
|
||||||
|
Snapshots: 2 (eerste run + post-fix re-run)
|
||||||
|
|
||||||
|
B2: ~16 GB op disk (vergelijkbare dedup + compressie)
|
||||||
|
Snapshots: 1 (na cap-bump + script-fix)
|
||||||
|
Storage-kost: ≈ $0,10/maand bij huidige grootte
|
||||||
|
|
||||||
|
Eerste run (forgejo+B2 faalden): 47:42 wall-clock
|
||||||
|
Tweede run (alles success): ~15 min
|
||||||
|
Restore-test (na --include fix): 10s, 59 MiB gerestored
|
||||||
|
Files in snapshot: ~2.1M (1.9M unique blobs)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Verificatie-status
|
||||||
|
|
||||||
|
| Plan-stap | Status |
|
||||||
|
|---|---|
|
||||||
|
| 1. Eerste run slaagt | ✓ (na 2 attempts; success in run 2 om 15:23) |
|
||||||
|
| 2. Snapshots zichtbaar | ✓ NAS×2, B2×1 |
|
||||||
|
| 3. Restore-test slaagt | ✓ 4/4 assertions ok in 10s |
|
||||||
|
| 4. Forgejo-restore-stack (F5) | ✗ niet uitgevoerd — separate vervolg |
|
||||||
|
| 5. Reboot-test | ✗ niet uitgevoerd — productie-reboot, los moment |
|
||||||
|
| 6. Failure-injectie | ✗ niet bewust uitgevoerd; we hebben **wel** organisch failure paths gezien (B2-cap, forgejo `--skip-db`) en die rapporteerden zoals verwacht (exit 75, juiste per-phase-status) |
|
||||||
|
| 7. Concurrency | ✗ niet getest — `flock`-pad zit in script |
|
||||||
|
| 8. Maandelijkse maintenance vanaf laptop | — over een maand, met dan-aan-te-maken maintenance-key |
|
||||||
|
|
||||||
|
## Te bewerken bestanden op `scrum4me-srv`
|
||||||
|
|
||||||
|
- `/etc/fstab` — extra cifs-regel voor `/mnt/nas/backups` (zie Deel A3).
|
||||||
|
- `/etc/restic-backup.env` — secrets, mode 0600 root:root.
|
||||||
|
- `/etc/restic-backup.password` — mode 0400 root:root, óók in passwordmanager.
|
||||||
|
- `/etc/systemd/system/server-backup.{service,timer}` — uit de repo's
|
||||||
|
`deploy/server-backup/`.
|
||||||
|
- `/srv/backups/{scripts,logs,status}/`, `/var/backups/databases/`.
|
||||||
|
- `/srv/scrum4me/ops-dashboard/deploy/server-backup/*` — code uit deze PR
|
||||||
|
(na `git pull` op de server).
|
||||||
446
docs/runbooks/tailscale-setup.md
Normal file

@ -0,0 +1,446 @@
|
||||||
|
# Ubuntu-omgeving (Postgres + app) via Tailscale bereikbaar maken vanaf de Mac
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
Er zijn twee Scrum4Me-omgevingen:
|
||||||
|
- **Omgeving 1** — productie: Vercel + Neon (managed Postgres).
|
||||||
|
- **Omgeving 2** — nieuw: een eigen Ubuntu-server (`scrum4me-srv`) die de
|
||||||
|
**volledige Scrum4Me-app (Next.js achter een reverse proxy) + zelf-gehoste
|
||||||
|
Postgres** gaat draaien.
|
||||||
|
|
||||||
|
Het doel: vanaf de Mac (`janpeters-macbook-pro`) omgeving 2 kunnen gebruiken —
|
||||||
|
voor (1) een DB-client (psql/GUI), (2) de `scrum4me-docker` runner lokaal in
|
||||||
|
Docker, en (3) lokale dev van de hoofd-Scrum4Me-app.
|
||||||
|
|
||||||
|
Tailscale is al geïnstalleerd en verbonden op beide machines:
|
||||||
|
- `janpeters-macbook-pro` → `100.73.234.116`
|
||||||
|
- `scrum4me-srv` → `100.118.195.120` (Linux, SSH aan)
|
||||||
|
|
||||||
|
Wat nog ontbreekt: zowel Postgres als de Next.js-app op de Ubuntu-server
|
||||||
|
luisteren standaard alleen op `localhost` en zijn nog niet bereikbaar over de
|
||||||
|
Tailscale-interface. De database zelf is **al volledig ingericht** (schema +
|
||||||
|
data) — er is geen migratie- of seed-werk nodig, alleen netwerk-, auth- en
|
||||||
|
connectie-configuratie.
|
||||||
|
|
||||||
|
**Beslissingen (van de gebruiker):**
|
||||||
|
- App-deploy op Ubuntu: **reverse proxy** (nginx/Caddy) vóór Next.js.
|
||||||
|
- DB-toegang: **hele tailnet** mag erbij (`100.64.0.0/10`) — bewuste keuze;
|
||||||
|
later eventueel te versmallen via Tailscale ACLs/groups.
|
||||||
|
- Postgres-rol: **nog onzeker** — het plan voegt een controle toe en adviseert
|
||||||
|
een dedicated rol.
|
||||||
|
|
||||||
|
**Canonieke `SCRUM4ME_BASE_URL`:** `http://100.118.195.120` (reverse proxy op
|
||||||
|
poort 80 op de Tailscale-interface, **plain HTTP**). Tailscale (WireGuard)
|
||||||
|
verzorgt de transportencryptie binnen de tailnet, dus een tweede TLS-laag is
|
||||||
|
hier niet nodig. Dit raw-IP-adres resolvet ook vanuit een Docker-container
|
||||||
|
(geen MagicDNS-afhankelijkheid). HTTPS op de proxy is optionele hardening — zie
|
||||||
|
de noot onderaan; kies je daarvoor, gebruik dan een hostnaam die óók vanuit
|
||||||
|
Docker oplost en pas álle URL's hieronder consistent aan.
|
||||||
|
|
||||||
|
**Bevinding uit de codebase:** in `scrum4me-docker` is de DB-koppeling puur
|
||||||
|
config. Zowel `bin/run-one-job.ts` (regel 30, 115) als de MCP-server
|
||||||
|
(`mcp-config.json` regel 9-10) lezen `DATABASE_URL` / `DIRECT_URL` uit de
|
||||||
|
omgeving. `bin/check-tokens.sh` (regel 35-38) doet bovendien een harde
|
||||||
|
`curl ${SCRUM4ME_BASE_URL}/api/products` — onbereikbaarheid is fataal
|
||||||
|
(regel 52-57). Er zijn **geen code-wijzigingen** nodig — alleen `.env`.
|
||||||
|
|
||||||
|
## Voorwaarden (aantoonbaar voldaan vóór uitvoering)
|
||||||
|
|
||||||
|
- [ ] Tailscale actief op beide machines (`tailscale status` toont beide nodes)
|
||||||
|
- [ ] SSH naar scrum4me-srv werkt (`ssh scrum4me-srv echo ok`)
|
||||||
|
- [ ] DB-schema aanwezig (tabellen + data) — géén migratie nodig
|
||||||
|
|
||||||
|
## Voorwaarden (input van de gebruiker nodig)
|
||||||
|
|
||||||
|
- Postgres-rol + wachtwoord + databasenaam op de Ubuntu-server (de "USER",
|
||||||
|
"PASS", "DBNAME" hieronder). Niet in de chat delen — alleen lokaal invullen.
|
||||||
|
- De reverse proxy biedt de app aan op `http://100.118.195.120` (poort 80).
|
||||||
|
Wijkt dit af, pas dan overal de canonieke URL consistent aan.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Deel A — Ubuntu: Postgres openstellen op de Tailscale-interface
|
||||||
|
|
||||||
|
Uit te voeren op `scrum4me-srv` (via `ssh scrum4me-srv` of `tailscale ssh`).
|
||||||
|
|
||||||
|
1. **Tailscale-IP bevestigen**
|
||||||
|
```bash
|
||||||
|
tailscale status
|
||||||
|
tailscale ip -4 # verwacht: 100.118.195.120
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **`listen_addresses` uitbreiden** — Postgres bindt standaard alleen aan
|
||||||
|
localhost. Vind het configbestand en pas aan:
|
||||||
|
```bash
|
||||||
|
sudo -u postgres psql -c 'SHOW config_file;' # bv. /etc/postgresql/16/main/postgresql.conf
|
||||||
|
```
|
||||||
|
Zet in dat bestand:
|
||||||
|
```
|
||||||
|
listen_addresses = 'localhost,100.118.195.120'
|
||||||
|
```
|
||||||
|
Bewust **niet** `'*'` — zo bindt Postgres alleen aan localhost + het
|
||||||
|
Tailscale-adres, nooit aan de publieke interface.
|
||||||
|
|
||||||
|
> ⚠️ **Boot-order:** door aan `100.118.195.120` te binden moet `tailscale0`
|
||||||
|
> al bestaan bij boot. Stap A6 maakt de systemd-ordering verplicht — sla A6
|
||||||
|
> niet over, anders faalt Postgres na een reboot.
|
||||||
|
|
||||||
|
3. **Rol, auth-methode en grants controleren/instellen** (voorkomt een
|
||||||
|
login- of permission-fout ná goede netwerkconfig). De rol is nog onzeker,
|
||||||
|
dus eerst inventariseren:
|
||||||
|
```bash
|
||||||
|
sudo -u postgres psql -c '\du'
|
||||||
|
sudo -u postgres psql -c 'SHOW password_encryption;'
|
||||||
|
```
|
||||||
|
**Advies:** gebruik (of maak) een **dedicated runtime-rol** die alleen de
|
||||||
|
rechten heeft die de app/runner nodig heeft — geen superuser:
|
||||||
|
```sql
|
||||||
|
CREATE ROLE scrum4me_app LOGIN PASSWORD 'lokaal-wachtwoord';
|
||||||
|
GRANT CONNECT ON DATABASE DBNAME TO scrum4me_app;
|
||||||
|
|
||||||
|
-- runtime-rechten op het bestaande (gevulde) public-schema:
|
||||||
|
GRANT USAGE ON SCHEMA public TO scrum4me_app;
|
||||||
|
GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA public TO scrum4me_app;
|
||||||
|
GRANT USAGE, SELECT, UPDATE ON ALL SEQUENCES IN SCHEMA public TO scrum4me_app;
|
||||||
|
|
||||||
|
-- zodat ook later toegevoegde tabellen/sequences werken:
|
||||||
|
ALTER DEFAULT PRIVILEGES IN SCHEMA public
|
||||||
|
GRANT SELECT, INSERT, UPDATE, DELETE ON TABLES TO scrum4me_app;
|
||||||
|
ALTER DEFAULT PRIVILEGES IN SCHEMA public
|
||||||
|
GRANT USAGE, SELECT, UPDATE ON SEQUENCES TO scrum4me_app;
|
||||||
|
```
|
||||||
|
**Migraties:** deze runtime-rol krijgt bewust géén DDL-rechten. De DB is al
|
||||||
|
ingericht, dus in normale operatie draaien er geen migraties via deze rol.
|
||||||
|
Moet je later vanaf de Mac toch een Prisma-migratie draaien, gebruik dan de
|
||||||
|
DB-owner-rol (apart wachtwoord), niet `scrum4me_app`.
|
||||||
|
|
||||||
|
**SCRAM-verifier:** stap A4 kiest `scram-sha-256`. Een rol waarvan het
|
||||||
|
wachtwoord nog als **md5** is opgeslagen kan dan niet inloggen. Forceer een
|
||||||
|
SCRAM-verifier door het wachtwoord opnieuw te zetten (alleen lokaal op de
|
||||||
|
server, niet in chat/docs delen):
|
||||||
|
```sql
|
||||||
|
ALTER ROLE scrum4me_app WITH PASSWORD 'lokaal-wachtwoord';
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **`pg_hba.conf` — toegang vanaf de tailnet toestaan**
|
||||||
|
```bash
|
||||||
|
sudo -u postgres psql -c 'SHOW hba_file;'
|
||||||
|
```
|
||||||
|
Voeg een regel toe (boven de bestaande `host`-regels). De gebruiker koos
|
||||||
|
bewust voor toegang vanaf de **hele tailnet**; maak rol en database wel
|
||||||
|
expliciet i.p.v. `all all`:
|
||||||
|
```
|
||||||
|
# Scrum4Me clients via Tailscale
|
||||||
|
host DBNAME scrum4me_app 100.64.0.0/10 scram-sha-256
|
||||||
|
```
|
||||||
|
Let op: hiermee mag elke tailnet-node mét geldige credentials verbinden.
|
||||||
|
Wil je dat later inperken, doe dat via Tailscale ACLs/groups of versmal
|
||||||
|
het CIDR naar specifieke node-IP's.
|
||||||
|
|
||||||
|
5. **Firewall (defense-in-depth)** — alleen relevant als `ufw` actief is:
|
||||||
|
```bash
|
||||||
|
sudo ufw status
|
||||||
|
sudo ufw allow in on tailscale0 to any port 5432 proto tcp
|
||||||
|
```
|
||||||
|
Open 5432 **nooit** generiek (`sudo ufw allow 5432` zonder interface) —
|
||||||
|
dat zou de DB internet-breed openstellen.
|
||||||
|
|
||||||
|
6. **Boot-order — VERPLICHT.** Postgres bindt aan `100.118.195.120`, een adres
|
||||||
|
dat pas bestaat nadat `tailscaled` `tailscale0` heeft opgezet.
|
||||||
|
**Zonder deze override faalt Postgres bij reboot** ("cannot assign requested
|
||||||
|
address"). Voeg een systemd-override toe:
|
||||||
|
```bash
|
||||||
|
sudo systemctl edit postgresql # of postgresql@<versie>-main
|
||||||
|
```
|
||||||
|
```ini
|
||||||
|
[Unit]
|
||||||
|
After=tailscaled.service
|
||||||
|
Requires=tailscaled.service
|
||||||
|
```
|
||||||
|
|
||||||
|
7. **Postgres herstarten en verifiëren**
|
||||||
|
```bash
|
||||||
|
sudo systemctl restart postgresql
|
||||||
|
sudo ss -tlnp | grep 5432 # moet 127.0.0.1:5432 én 100.118.195.120:5432 tonen
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Deel B — Ubuntu: de Scrum4Me-app (reverse proxy) bereikbaar maken op Tailscale
|
||||||
|
|
||||||
|
De runner-tokencheck (`check-tokens.sh`) cURL't `${SCRUM4ME_BASE_URL}/api/products`
|
||||||
|
en faalt hard als die URL onbereikbaar is. De Next.js-app draait achter een
|
||||||
|
reverse proxy, dus de **proxy** moet op het Tailscale-adres luisteren — niet
|
||||||
|
alleen op `localhost`. Canoniek: `http://100.118.195.120` (poort 80, plain HTTP).
|
||||||
|
|
||||||
|
1. **Reverse proxy op de Tailscale-interface laten luisteren (poort 80)**
|
||||||
|
- **nginx:** in het server-block het `listen`-adres aan het Tailscale-IP
|
||||||
|
binden:
|
||||||
|
```
|
||||||
|
listen 100.118.195.120:80;
|
||||||
|
```
|
||||||
|
`sudo nginx -t` → `sudo systemctl reload nginx`.
|
||||||
|
- **Caddy:** site-adres `http://100.118.195.120:80` in de `Caddyfile`.
|
||||||
|
- Next.js zelf mag op `127.0.0.1:<intern>` blijven; alleen de proxy is
|
||||||
|
extern bereikbaar.
|
||||||
|
|
||||||
|
2. **Boot-order — VERPLICHT.** Net als Postgres bindt de proxy aan een adres
|
||||||
|
dat `tailscaled` eerst moet aanmaken. **Zonder deze override faalt nginx/Caddy
|
||||||
|
bij reboot.**
|
||||||
|
```bash
|
||||||
|
sudo systemctl edit nginx # of caddy
|
||||||
|
```
|
||||||
|
```ini
|
||||||
|
[Unit]
|
||||||
|
After=tailscaled.service
|
||||||
|
Requires=tailscaled.service
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Firewall voor poort 80** — alleen bij actieve `ufw`:
|
||||||
|
```bash
|
||||||
|
sudo ufw allow in on tailscale0 to any port 80 proto tcp
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Lokaal op de server verifiëren**
|
||||||
|
```bash
|
||||||
|
curl -fsS -H "Authorization: Bearer $SCRUM4ME_TOKEN" \
|
||||||
|
http://100.118.195.120/api/products >/dev/null && echo OK
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Deel C — Mac: connectiviteit verifiëren (DB én app)
|
||||||
|
|
||||||
|
1. **Tailscale-bereik** (al bevestigd: `scrum4me-srv` zichtbaar op
|
||||||
|
`100.118.195.120`):
|
||||||
|
```bash
|
||||||
|
tailscale status
|
||||||
|
tailscale ping scrum4me-srv
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **MagicDNS check** — kan de Mac de server op hostnaam bereiken?
|
||||||
|
```bash
|
||||||
|
ping -c1 scrum4me-srv
|
||||||
|
```
|
||||||
|
Zo ja: native macOS-clients mogen `scrum4me-srv` als host gebruiken.
|
||||||
|
|
||||||
|
3. **Postgres — TCP- en psql-test**
|
||||||
|
```bash
|
||||||
|
nc -vz 100.118.195.120 5432
|
||||||
|
psql "postgresql://USER:PASS@100.118.195.120:5432/DBNAME?sslmode=disable" -c '\dt'
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **App — vanaf de Mac én vanuit een Docker-container** (Docker Desktop heeft
|
||||||
|
geen Tailscale-MagicDNS; daarom de canonieke raw-IP-URL):
|
||||||
|
```bash
|
||||||
|
# vanaf de Mac
|
||||||
|
curl -fsS -H "Authorization: Bearer $SCRUM4ME_TOKEN" \
|
||||||
|
http://100.118.195.120/api/products >/dev/null && echo "mac OK"
|
||||||
|
|
||||||
|
# vanuit een container (simuleert de runner)
|
||||||
|
docker run --rm --env SCRUM4ME_TOKEN alpine sh -lc \
|
||||||
|
'wget -qO- --header "Authorization: Bearer $SCRUM4ME_TOKEN" \
|
||||||
|
http://100.118.195.120/api/products >/dev/null && echo "docker OK"'
|
||||||
|
```
|
||||||
|
Slaagt de container-test niet (geen route naar de tailnet vanuit Docker
|
||||||
|
Desktop), dan moet Tailscale in/naast de runner-container draaien — apart
|
||||||
|
uit te zoeken; eerst de directe route testen.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Deel D — De drie consumenten koppelen
|
||||||
|
|
||||||
|
Welke host elke consument gebruikt verschilt — MagicDNS werkt wél native op
|
||||||
|
macOS, maar niet binnen een Docker-container:
|
||||||
|
|
||||||
|
| Consumer | Host | Reden |
|
||||||
|
|---|---|---|
|
||||||
|
| DB-client (psql/TablePlus) native | `scrum4me-srv` | MagicDNS werkt op macOS |
|
||||||
|
| scrum4me-docker runner (Docker) | `100.118.195.120` | Docker Desktop heeft geen MagicDNS |
|
||||||
|
| Hoofd-app lokale dev | `scrum4me-srv` | MagicDNS werkt op macOS |
|
||||||
|
|
||||||
|
### 1. DB-client (psql / TablePlus / DBeaver) — native op macOS
|
||||||
|
Connection-string:
|
||||||
|
```
|
||||||
|
postgresql://USER:PASS@scrum4me-srv:5432/DBNAME?sslmode=disable
|
||||||
|
```
|
||||||
|
GUI-clients: host `scrum4me-srv` (of `100.118.195.120`), poort `5432`,
|
||||||
|
SSL uit.
|
||||||
|
|
||||||
|
### 2. scrum4me-docker runner — bewerk `/Users/janpetervisser/Development/scrum4me-docker/.env`
|
||||||
|
```
|
||||||
|
DATABASE_URL=postgresql://USER:PASS@100.118.195.120:5432/DBNAME?sslmode=disable
|
||||||
|
DIRECT_URL=postgresql://USER:PASS@100.118.195.120:5432/DBNAME?sslmode=disable
|
||||||
|
SCRUM4ME_BASE_URL=http://100.118.195.120
|
||||||
|
```
|
||||||
|
Belangrijk:
|
||||||
|
- **Gebruik het rauwe Tailscale-IP, niet `scrum4me-srv`.** Docker Desktop-
|
||||||
|
containers krijgen geen Tailscale-MagicDNS; de hostnaam resolvet niet
|
||||||
|
binnen de container.
|
||||||
|
- Laat de Neon-specifieke params (`channel_binding=require`,
|
||||||
|
`sslmode=verify-full`) weg — die gelden niet voor zelf-gehoste Postgres.
|
||||||
|
- Zorg dat `SCRUM4ME_TOKEN` een token van **omgeving 2** is — de tokencheck
|
||||||
|
loopt tegen de Ubuntu-app, niet meer tegen Vercel.
|
||||||
|
|
||||||
|
> `SCRUM4ME_TOKEN` haal je op via de Ubuntu-app: Settings → API Tokens → nieuw
|
||||||
|
> token aanmaken. Een bestaand Vercel-token werkt **niet** tegen de
|
||||||
|
> Ubuntu-omgeving.
|
||||||
|
|
||||||
|
### 3. Hoofd-Scrum4Me-app (lokale dev) — bewerk `/Users/janpetervisser/Development/Scrum4Me`
|
||||||
|
Dit is een **andere repo** dan `scrum4me-docker`. In die repo de `.env.local`
|
||||||
|
(of `.env`) aanpassen. De app draait native op macOS, dus de MagicDNS-hostnaam
|
||||||
|
`scrum4me-srv` mag hier wél:
|
||||||
|
```
|
||||||
|
DATABASE_URL=postgresql://USER:PASS@scrum4me-srv:5432/DBNAME?sslmode=disable
|
||||||
|
DIRECT_URL=postgresql://USER:PASS@scrum4me-srv:5432/DBNAME?sslmode=disable
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## SSL-keuze
|
||||||
|
|
||||||
|
Aanbeveling: **`sslmode=disable`** voor Postgres en **plain HTTP** voor de app-
|
||||||
|
URL. Tailscale (WireGuard) versleutelt het transport al end-to-end binnen de
|
||||||
|
tailnet; een tweede TLS-laag op een zelf-gehoste Postgres of op de proxy levert
|
||||||
|
hier vooral configuratie-gedoe op.
|
||||||
|
|
||||||
|
Optionele hardening (later, samenhangend uit te voeren):
|
||||||
|
- TLS-certs op Postgres + `sslmode=require`.
|
||||||
|
- HTTPS op de reverse proxy. Doe dit dan met een **DNS-naam die ook vanuit
|
||||||
|
Docker oplost** (raw-IP + cert geeft validatiefouten), en pas `SCRUM4ME_BASE_URL`
|
||||||
|
én alle verificatie-`curl`s consistent aan naar die `https://`-hostnaam.
|
||||||
|
|
||||||
|
## Twee omgevingen naast elkaar houden
|
||||||
|
|
||||||
|
Omdat omgeving 1 (Neon) blijft bestaan: bewaar twee env-varianten, bv.
|
||||||
|
`.env.neon` en `.env.ubuntu`, en symlink de actieve naar `.env`:
|
||||||
|
```bash
|
||||||
|
ln -sf .env.ubuntu .env # activeer Ubuntu-omgeving
|
||||||
|
ln -sf .env.neon .env # activeer Neon-omgeving
|
||||||
|
```
|
||||||
|
Lichtgewicht en voorkomt dat je per ongeluk de verkeerde DB raakt.
|
||||||
|
|
||||||
|
## Te wijzigen bestanden
|
||||||
|
|
||||||
|
**Op `scrum4me-srv`:**
|
||||||
|
- `postgresql.conf` — `listen_addresses`.
|
||||||
|
- `pg_hba.conf` — tailnet-regel voor `DBNAME` + dedicated rol.
|
||||||
|
- Postgres-rol — dedicated `scrum4me_app`-rol + grants + `ALTER ROLE ... PASSWORD`.
|
||||||
|
- nginx/Caddy-config — `listen` op `100.118.195.120:80`.
|
||||||
|
- systemd-overrides — `After=/Requires=tailscaled.service` voor `postgresql`
|
||||||
|
en de proxy.
|
||||||
|
- evt. `ufw`-regels voor poort 5432 en 80 op `tailscale0`.
|
||||||
|
|
||||||
|
**Op de Mac:**
|
||||||
|
- `/Users/janpetervisser/Development/scrum4me-docker/.env` — `DATABASE_URL`,
|
||||||
|
`DIRECT_URL`, `SCRUM4ME_BASE_URL`, `SCRUM4ME_TOKEN`.
|
||||||
|
- `/Users/janpetervisser/Development/Scrum4Me/.env.local` — `DATABASE_URL`,
|
||||||
|
`DIRECT_URL` (andere repo).
|
||||||
|
- Géén codewijzigingen in `scrum4me-docker`.
|
||||||
|
|
||||||
|
## Verificatie (end-to-end)
|
||||||
|
|
||||||
|
1. **Netwerk:** `nc -vz 100.118.195.120 5432` vanaf de Mac slaagt.
|
||||||
|
2. **DB-client:** `psql ".../DBNAME?sslmode=disable" -c '\dt'` toont de
|
||||||
|
Scrum4Me-tabellen; een test-`INSERT`/`SELECT` bevestigt dat de
|
||||||
|
`scrum4me_app`-grants kloppen.
|
||||||
|
3. **App-bereik:** de `curl`/`docker run`-tests uit Deel C-4 geven beide `OK`.
|
||||||
|
4. **Reboot-test:** herstart `scrum4me-srv`; controleer daarna met
|
||||||
|
`sudo ss -tlnp` dat Postgres én de proxy weer op `100.118.195.120` luisteren,
|
||||||
|
en herhaal de Mac/Docker-connectiviteitstests.
|
||||||
|
5. **Runner:** na `.env`-update `docker compose up -d --force-recreate`,
|
||||||
|
dan `docker compose logs -f` — `check-tokens.sh` moet
|
||||||
|
"OK: 100.118.195.120:5432 reachable" én "OK: SCRUM4ME_TOKEN works" loggen,
|
||||||
|
en de daemon-loop moet een job kunnen claimen uit de Ubuntu-DB.
|
||||||
|
6. **Hoofd-app:** lokale dev-server in `/Users/janpetervisser/Development/Scrum4Me`
|
||||||
|
start en leest data uit de Ubuntu-DB.
|
||||||
|
|
||||||
|
## Veelvoorkomende fouten
|
||||||
|
|
||||||
|
| Fout | Oorzaak | Fix |
|
||||||
|
|---|---|---|
|
||||||
|
| `could not translate host name "scrum4me-srv"` | MagicDNS niet actief (Docker) | Gebruik raw IP `100.118.195.120` |
|
||||||
|
| `cannot assign requested address` bij Postgres-start | `tailscale0` bestaat nog niet | A6 systemd-override toevoegen |
|
||||||
|
| `FATAL: password authentication failed` | SCRAM-verifier niet bijgewerkt | `ALTER ROLE scrum4me_app WITH PASSWORD '...'` herhalen |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
# Addendum — uitvoering Ubuntu-kant 2026-05-14
|
||||||
|
|
||||||
|
> Deel A + B zijn uitgevoerd. De server bleek een **andere topologie** te hebben
|
||||||
|
> dan dit plan aannam: Postgres én de reverse proxy draaien als **Docker
|
||||||
|
> containers**, niet host-geïnstalleerd. Dit addendum beschrijft wat er feitelijk
|
||||||
|
> is gebeurd. Deel C + D (Mac-kant) staan nog open.
|
||||||
|
|
||||||
|
## Vastgestelde topologie (wijkt af van de aannames)
|
||||||
|
|
||||||
|
| Plan nam aan | Werkelijkheid op `scrum4me-srv` |
|
||||||
|
|---|---|
|
||||||
|
| Host-Postgres (`/etc/postgresql/...`, `systemctl postgresql`) | Docker container `scrum4me-postgres` (postgres:17), data-volume `/srv/scrum4me/postgres` |
|
||||||
|
| Host nginx/Caddy | Docker container `scrum4me-caddy` (caddy:2), al luisterend op `0.0.0.0:80` + `:443` |
|
||||||
|
| Migratie/seed mogelijk nodig | Bevestigd niet nodig — db `scrum4me` was gevuld |
|
||||||
|
|
||||||
|
Concrete waarden die het plan openliet:
|
||||||
|
- **Host** = dit ís `scrum4me-srv` (`100.118.195.120`) — Deel A/B dus direct uitgevoerd, niet via SSH.
|
||||||
|
- **DBNAME** = `scrum4me`
|
||||||
|
- **Rol** = `scrum4me_app` aangemaakt (non-superuser, DML-only), wachtwoord lokaal gegenereerd via `openssl rand -hex 24`.
|
||||||
|
|
||||||
|
## Deel A — zoals feitelijk uitgevoerd (Docker-variant)
|
||||||
|
|
||||||
|
| Plan-stap | Aanpassing |
|
||||||
|
|---|---|
|
||||||
|
| **A2** `listen_addresses` in `postgresql.conf` | **N.v.t.** — de container luistert intern al op `0.0.0.0`. Host-exposure = Docker port-mapping. In `/srv/scrum4me/compose/docker-compose.yml` toegevoegd: `- "100.118.195.120:5432:5432"` náást de bestaande `127.0.0.1:5432:5432`. Specifiek IP i.p.v. `0.0.0.0` — Docker's iptables-DNAT scoped dan op dat IP, publiek blijft dicht. |
|
||||||
|
| **A3** rol + grants | Identiek SQL, maar uitgevoerd via `docker exec -i scrum4me-postgres psql -U scrum4me -d scrum4me`. Idempotent script (`CREATE ROLE` of `ALTER ROLE ... PASSWORD`). De `ALTER ROLE ... PASSWORD` zet meteen een SCRAM-verifier. **Let op:** `CREATE ROLE` op de gedeelde productie-DB wordt door de auto-mode classifier geblokkeerd — moet via een script dat de gebruiker zelf draait. |
|
||||||
|
| **A4** `pg_hba.conf` | Bestand zit in het data-volume: host-pad `/srv/scrum4me/postgres/pg_hba.conf` (root-owned, sudo nodig). Regel toegevoegd onderaan (append is veilig — first-match, geen conflict). **Bevinding:** de postgres-image heeft al een catch-all `host all all all scram-sha-256` — onze scoped regel is dáárdoor strikt genomen redundant. Echte bescherming = IP-scoped port-binding + ufw. Catch-all strakker maken = aparte taak (hij draagt de docker-netwerk-clients). |
|
||||||
|
| **A5** ufw | Identiek: `ufw allow in on tailscale0 to any port 5432 proto tcp`. |
|
||||||
|
| **A6** boot order | **Not** `postgresql.service` (does not exist) but `docker.service`. Drop-in `/etc/systemd/system/docker.service.d/tailscale-order.conf` (see the sketch after this table). Deliberately `After=tailscaled.service` + **`Wants=`** instead of the `Requires=` the plan proposed — `Requires` on `docker.service` is fragile (if tailscaled ever fails, the whole docker stack refuses to start). `After=` solves the race; `Wants=` pulls tailscaled in without a hard fail. |
|
||||||
|
| **A7** restart + verify | `docker compose up -d postgres` (recreate — `restart` pakt port-wijzigingen niet). `ss -tln` toont nu `127.0.0.1:5432` én `100.118.195.120:5432`. Verificatie met een wegwerp-container: `docker run --rm --network host postgres:17 psql "postgresql://scrum4me_app:...@100.118.195.120:5432/scrum4me?sslmode=disable" -c '\dt'` — `--network host` simuleert exact hoe de Mac het ziet. |
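
A minimal sketch of the drop-in from row A6 — the path and directives come from the table above; the `tee`/`daemon-reload` wrapper is only one way to write it:

```bash
sudo mkdir -p /etc/systemd/system/docker.service.d
sudo tee /etc/systemd/system/docker.service.d/tailscale-order.conf >/dev/null <<'EOF'
[Unit]
After=tailscaled.service
Wants=tailscaled.service
EOF
sudo systemctl daemon-reload
```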
|
||||||
|
|
||||||
|
## Deel B — zoals feitelijk uitgevoerd (Docker-variant)
|
||||||
|
|
||||||
|
| Plan-stap | Aanpassing |
|
||||||
|
|---|---|
|
||||||
|
| **B1** proxy op tailscale-interface | **Grotendeels al gedaan** — de Caddy-container publiceert al `0.0.0.0:80`, dus luistert al op `tailscale0`. Alleen een site-block toegevoegd aan `/srv/scrum4me/caddy/Caddyfile`: `100.118.195.120:80 { reverse_proxy 172.18.0.1:3000 }`. |
|
||||||
|
| **B2** boot-order proxy | **Niet nodig.** Caddy bindt aan `0.0.0.0:80`, niet aan een IP-specifiek adres — er is geen `tailscale0`-race. (Alleen Postgres had de IP-specifieke binding, vandaar dat A6 wél nodig was.) |
|
||||||
|
| **B3** ufw poort 80 | **Niet nodig.** Poort 80 stond al op `ALLOW IN Anywhere`. |
|
||||||
|
| **B4** verifiëren | `curl -sI http://100.118.195.120/` → `200 OK`, geen redirect. `/api/products` → 401 (bereikbaar, auth vereist). |
|
||||||
|
|
||||||
|
## Bugs / pitfalls encountered during execution

1. **A single-file bind mount of the Caddyfile goes stale after an atomic-rename edit.**
   `/srv/scrum4me/caddy/Caddyfile` is bind-mounted as a single file. Editors
   (and the Edit tooling) often write via write-temp + rename = a new inode.
   The container keeps pointing at the old inode, so `caddy reload` reads the
   **old** content, apparently without error. The symptom here: the new site
   block seemed to be "silently dropped" by Caddy's adapter, but the container
   simply never saw the block. (A minimal illustration of the inode change
   follows after this list.)
   **Fix / rule:** after a Caddyfile edit run `docker compose up -d --force-recreate
   caddy` (or `restart`) — **not** `caddy reload`. The recreate re-binds the
   mount to the new inode. (Earlier in the project a Caddyfile edit did work,
   precisely because a `restart` happened to follow it.)

2. **`http://<IP>` vs `<IP>:80` syntax — turned out to be a red herring.**
   At first Caddy's Caddyfile adapter appeared to drop `http://100.118.195.120`.
   Tested in isolation, both syntaxes worked fine; the real problem was bug #1
   (the stale mount). The final config uses `100.118.195.120:80` —
   unambiguously plain HTTP on port 80.
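
A minimal illustration of the inode change behind bug #1 — the `cp`/`mv` pair stands in for what an atomic-rename editor does:

```bash
stat -c '%i' /srv/scrum4me/caddy/Caddyfile    # inode the running container was started with
cp /srv/scrum4me/caddy/Caddyfile /tmp/Caddyfile.new            # edit the copy…
mv /tmp/Caddyfile.new /srv/scrum4me/caddy/Caddyfile            # …atomic rename = new inode
stat -c '%i' /srv/scrum4me/caddy/Caddyfile    # different inode; the container still sees the old file
docker compose up -d --force-recreate caddy   # re-binds the mount to the new inode
```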
|
||||||
|
|
||||||
|
## Verificatie-status
|
||||||
|
|
||||||
|
| Plan verificatie-stap | Status |
|
||||||
|
|---|---|
|
||||||
|
| 1. `nc`/TCP naar 5432 | ✓ `psql` als `scrum4me_app` via `100.118.195.120:5432` werkt, ziet tabellen |
|
||||||
|
| 2. DB-client grants | ✓ `SELECT` op `idea_products` werkt onder `scrum4me_app` |
|
||||||
|
| 3. App-bereik | ✓ `http://100.118.195.120/` → 200, `/api/products` → 401 |
|
||||||
|
| 4. **Reboot-test** | ✗ **Nog niet gedaan** — productie-server niet herstart. Handmatig uitvoeren op rustig moment; check daarna `ss -tlnp \| grep 5432`. |
|
||||||
|
| 5. Runner | — Mac-kant (Deel C/D), nog open |
|
||||||
|
| 6. Hoofd-app lokale dev | — Mac-kant, nog open |
|
||||||
|
|
||||||
|
## Gewijzigde bestanden op `scrum4me-srv`
|
||||||
|
|
||||||
|
- `/srv/scrum4me/compose/docker-compose.yml` — postgres `ports`: extra `100.118.195.120:5432:5432`
|
||||||
|
- `/srv/scrum4me/caddy/Caddyfile` — site-block `100.118.195.120:80`
|
||||||
|
- `/srv/scrum4me/postgres/pg_hba.conf` — tailnet-regel (+ `.bak-<timestamp>`)
|
||||||
|
- `/etc/systemd/system/docker.service.d/tailscale-order.conf` — boot-order drop-in (nieuw)
|
||||||
|
- ufw — regel `5432/tcp on tailscale0`
|
||||||
|
- Postgres-rol `scrum4me_app` — aangemaakt met grants op db `scrum4me`
|
||||||
444
lib/parse-worker-log.ts
Normal file

@ -0,0 +1,444 @@
|
||||||
|
// lib/parse-worker-log.ts
|
||||||
|
//
|
||||||
|
// Parser for Scrum4Me worker run-logs (/srv/scrum4me/worker-logs/idea/runs/*.log).
|
||||||
|
// Each file is produced by `tsx run-one-job.ts > run_log 2>&1` and is a mix of
|
||||||
|
// plain-text `[run-one-job]` annotation lines and Claude Code `stream-json`
|
||||||
|
// event lines (the worker spawns `claude --output-format stream-json --verbose`).
|
||||||
|
//
|
||||||
|
// Two entry points:
|
||||||
|
// summarizeRunLog(raw, fileName) — one cheap line scan, for the table.
|
||||||
|
// parseRunLog(raw, fileName) — full event timeline, for the detail panel.
|
||||||
|
//
|
||||||
|
// Pure module, no dependencies — mirrors lib/parse-docker.ts / lib/parse-systemd.ts.
|
||||||
|
|
||||||
|
export type RunStatus = 'idle' | 'running' | 'success' | 'error' | 'token-expired' | 'unknown'
|
||||||
|
|
||||||
|
export interface RunLogSummary {
|
||||||
|
fileName: string
|
||||||
|
runId: string
|
||||||
|
startedAt: string | null
|
||||||
|
status: RunStatus
|
||||||
|
jobId: string | null
|
||||||
|
model: string | null
|
||||||
|
permissionMode: string | null
|
||||||
|
durationMs: number | null
|
||||||
|
numTurns: number | null
|
||||||
|
totalCostUsd: number | null
|
||||||
|
exitCode: number | null
|
||||||
|
eventCount: number
|
||||||
|
inProgress: boolean
|
||||||
|
errorSummary: string | null
|
||||||
|
}
|
||||||
|
|
||||||
|
export type MetaTag =
|
||||||
|
| 'claim'
|
||||||
|
| 'auth'
|
||||||
|
| 'quota'
|
||||||
|
| 'no-job'
|
||||||
|
| 'claimed'
|
||||||
|
| 'worktree'
|
||||||
|
| 'config'
|
||||||
|
| 'payload'
|
||||||
|
| 'spawn'
|
||||||
|
| 'claude-done'
|
||||||
|
| 'cleanup'
|
||||||
|
| 'exit'
|
||||||
|
| 'error'
|
||||||
|
| 'token-expired'
|
||||||
|
| 'timeout'
|
||||||
|
| 'other'
|
||||||
|
|
||||||
|
export type LogEvent =
|
||||||
|
| { kind: 'meta'; ts: string | null; tag: MetaTag; text: string }
|
||||||
|
| {
|
||||||
|
kind: 'system-init'
|
||||||
|
ts: string | null
|
||||||
|
model: string
|
||||||
|
permissionMode: string
|
||||||
|
tools: string[]
|
||||||
|
mcpServers: string[]
|
||||||
|
sessionId: string
|
||||||
|
cwd: string
|
||||||
|
version: string
|
||||||
|
}
|
||||||
|
| { kind: 'assistant-text'; ts: string | null; text: string; truncated: boolean }
|
||||||
|
| { kind: 'thinking'; ts: string | null; text: string; truncated: boolean }
|
||||||
|
| { kind: 'tool-call'; ts: string | null; id: string; name: string; input: string; truncated: boolean }
|
||||||
|
| {
|
||||||
|
kind: 'tool-result'
|
||||||
|
ts: string | null
|
||||||
|
toolUseId: string
|
||||||
|
isError: boolean
|
||||||
|
body: string
|
||||||
|
truncated: boolean
|
||||||
|
fullLength: number
|
||||||
|
}
|
||||||
|
| { kind: 'rate-limit'; ts: string | null; status: string }
|
||||||
|
| {
|
||||||
|
kind: 'result'
|
||||||
|
ts: string | null
|
||||||
|
subtype: string
|
||||||
|
isError: boolean
|
||||||
|
durationMs: number | null
|
||||||
|
numTurns: number | null
|
||||||
|
totalCostUsd: number | null
|
||||||
|
resultText: string
|
||||||
|
resultTruncated: boolean
|
||||||
|
}
|
||||||
|
| { kind: 'raw'; ts: string | null; text: string }
|
||||||
|
|
||||||
|
export interface ParsedRunLog {
|
||||||
|
summary: RunLogSummary
|
||||||
|
events: LogEvent[]
|
||||||
|
inProgress: boolean
|
||||||
|
responseTruncated: boolean
|
||||||
|
}
|
||||||
|
|
||||||
|
// Per-item caps keep the detail payload bounded even for ~350 KB raw logs.
|
||||||
|
const TOOL_RESULT_CAP = 8 * 1024
|
||||||
|
const TEXT_CAP = 16 * 1024
|
||||||
|
const TOOL_INPUT_CAP = 4 * 1024
|
||||||
|
const RESPONSE_CAP = 1_500_000
|
||||||
|
|
||||||
|
const META_RE = /^(\S+)\s+\[run-one-job\]\s+(.*)$/
|
||||||
|
|
||||||
|
function cap(s: string, max: number): { text: string; truncated: boolean } {
|
||||||
|
if (s.length <= max) return { text: s, truncated: false }
|
||||||
|
return { text: s.slice(0, max), truncated: true }
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Strip the `.log` / `.log.gz` suffix — the run id is the timestamp filename. */
|
||||||
|
export function runIdFromFileName(fileName: string): string {
|
||||||
|
return fileName.replace(/\.log(\.gz)?$/, '')
|
||||||
|
}
|
||||||
|
|
||||||
|
/** run-agent.sh names each file `$(date -u +%Y%m%dT%H%M%SZ).log`, so the name is the start time. */
|
||||||
|
function startedAtFromRunId(runId: string): string | null {
|
||||||
|
const m = runId.match(/^(\d{4})(\d{2})(\d{2})T(\d{2})(\d{2})(\d{2})Z$/)
|
||||||
|
if (!m) return null
|
||||||
|
return `${m[1]}-${m[2]}-${m[3]}T${m[4]}:${m[5]}:${m[6]}Z`
|
||||||
|
}
|
||||||
|
|
||||||
|
function classifyMeta(msg: string): MetaTag {
|
||||||
|
if (msg.startsWith('claim attempt')) return 'claim'
|
||||||
|
if (msg.startsWith('auth ok')) return 'auth'
|
||||||
|
if (msg.startsWith('quota probe')) return 'quota'
|
||||||
|
if (msg.startsWith('no job claimed')) return 'no-job'
|
||||||
|
if (msg.startsWith('claimed job_id=')) return 'claimed'
|
||||||
|
if (msg.startsWith('worktree path=')) return 'worktree'
|
||||||
|
if (msg.startsWith('config ')) return 'config'
|
||||||
|
if (msg.startsWith('payload written')) return 'payload'
|
||||||
|
if (msg.startsWith('spawn claude')) return 'spawn'
|
||||||
|
if (msg.startsWith('claude done')) return 'claude-done'
|
||||||
|
if (msg.startsWith('cleanup')) return 'cleanup'
|
||||||
|
if (msg.startsWith('exit code=')) return 'exit'
|
||||||
|
if (msg.startsWith('ERROR')) return 'error'
|
||||||
|
if (msg.startsWith('TOKEN_EXPIRED detected')) return 'token-expired'
|
||||||
|
if (msg.startsWith('claim timeout')) return 'timeout'
|
||||||
|
return 'other'
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Cheap single-pass summary for the table — at most one JSON.parse (the result line). */
|
||||||
|
export function summarizeRunLog(raw: string, fileName: string): RunLogSummary {
|
||||||
|
const runId = runIdFromFileName(fileName)
|
||||||
|
const lines = raw.split('\n')
|
||||||
|
|
||||||
|
let jobId: string | null = null
|
||||||
|
let model: string | null = null
|
||||||
|
let permissionMode: string | null = null
|
||||||
|
let claudeExit: number | null = null
|
||||||
|
let runExit: number | null = null
|
||||||
|
let durationMs: number | null = null
|
||||||
|
let numTurns: number | null = null
|
||||||
|
let totalCostUsd: number | null = null
|
||||||
|
let eventCount = 0
|
||||||
|
let hasResult = false
|
||||||
|
let resultIsError = false
|
||||||
|
let resultSubtype: string | null = null
|
||||||
|
let tokenExpired = false
|
||||||
|
let hasErrorLine = false
|
||||||
|
let firstErrorMsg: string | null = null
|
||||||
|
|
||||||
|
for (const line of lines) {
|
||||||
|
if (!line) continue
|
||||||
|
const m = line.match(META_RE)
|
||||||
|
if (m) {
|
||||||
|
const msg = m[2]
|
||||||
|
if (msg.startsWith('claimed job_id=')) {
|
||||||
|
jobId = msg.slice('claimed job_id='.length).trim() || jobId
|
||||||
|
} else if (msg.startsWith('config ')) {
|
||||||
|
model = /\bmodel=(\S+)/.exec(msg)?.[1] ?? model
|
||||||
|
permissionMode = /\bpermission_mode=(\S+)/.exec(msg)?.[1] ?? permissionMode
|
||||||
|
} else if (msg.startsWith('claude done')) {
|
||||||
|
const e = /\bexit_code=(-?\d+)/.exec(msg)
|
||||||
|
if (e) claudeExit = Number(e[1])
|
||||||
|
const d = /\bduration_ms=(\d+)/.exec(msg)
|
||||||
|
if (d) durationMs = Number(d[1])
|
||||||
|
} else if (msg.startsWith('exit code=')) {
|
||||||
|
const e = /exit code=(-?\d+)/.exec(msg)
|
||||||
|
if (e) runExit = Number(e[1])
|
||||||
|
} else if (msg.startsWith('TOKEN_EXPIRED detected')) {
|
||||||
|
tokenExpired = true
|
||||||
|
} else if (msg.startsWith('ERROR')) {
|
||||||
|
hasErrorLine = true
|
||||||
|
if (!firstErrorMsg) firstErrorMsg = msg.replace(/^ERROR\s*/, '').slice(0, 300)
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
const trimmed = line.trimStart()
|
||||||
|
if (trimmed.startsWith('{')) {
|
||||||
|
eventCount++
|
||||||
|
if (!hasResult && trimmed.startsWith('{"type":"result"')) {
|
||||||
|
try {
|
||||||
|
const obj = JSON.parse(trimmed)
|
||||||
|
hasResult = true
|
||||||
|
resultIsError = !!obj.is_error
|
||||||
|
resultSubtype = typeof obj.subtype === 'string' ? obj.subtype : null
|
||||||
|
if (typeof obj.num_turns === 'number') numTurns = obj.num_turns
|
||||||
|
if (typeof obj.total_cost_usd === 'number') totalCostUsd = obj.total_cost_usd
|
||||||
|
if (durationMs == null && typeof obj.duration_ms === 'number') durationMs = obj.duration_ms
|
||||||
|
} catch {
|
||||||
|
// malformed result line — ignore
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const exitCode = claudeExit ?? runExit
|
||||||
|
const terminal = runExit != null || hasResult || hasErrorLine || tokenExpired
|
||||||
|
const inProgress = !terminal
|
||||||
|
|
||||||
|
let status: RunStatus
|
||||||
|
if (tokenExpired) {
|
||||||
|
status = 'token-expired'
|
||||||
|
} else if (jobId) {
|
||||||
|
if (inProgress) {
|
||||||
|
status = 'running'
|
||||||
|
} else if (
|
||||||
|
resultIsError ||
|
||||||
|
hasErrorLine ||
|
||||||
|
(claudeExit != null && claudeExit !== 0) ||
|
||||||
|
(runExit != null && runExit !== 0)
|
||||||
|
) {
|
||||||
|
status = 'error'
|
||||||
|
} else {
|
||||||
|
status = 'success'
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// No job was claimed this iteration — the worker was idle / waiting.
|
||||||
|
status = 'idle'
|
||||||
|
}
|
||||||
|
|
||||||
|
let errorSummary: string | null = null
|
||||||
|
if (status === 'error' || status === 'token-expired') {
|
||||||
|
errorSummary =
|
||||||
|
firstErrorMsg ??
|
||||||
|
(tokenExpired ? 'TOKEN_EXPIRED detected in output' : null) ??
|
||||||
|
(resultIsError ? `result: ${resultSubtype ?? 'error'}` : null) ??
|
||||||
|
(exitCode != null ? `exit code ${exitCode}` : null)
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
fileName,
|
||||||
|
runId,
|
||||||
|
startedAt: startedAtFromRunId(runId),
|
||||||
|
status,
|
||||||
|
jobId,
|
||||||
|
model,
|
||||||
|
permissionMode,
|
||||||
|
durationMs,
|
||||||
|
numTurns,
|
||||||
|
totalCostUsd,
|
||||||
|
exitCode,
|
||||||
|
eventCount,
|
||||||
|
inProgress,
|
||||||
|
errorSummary,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function normalizeContent(content: unknown): string {
|
||||||
|
if (typeof content === 'string') return content
|
||||||
|
if (Array.isArray(content)) {
|
||||||
|
return content
|
||||||
|
.map((b) => {
|
||||||
|
if (typeof b === 'string') return b
|
||||||
|
if (b && typeof b === 'object' && typeof (b as { text?: unknown }).text === 'string') {
|
||||||
|
return (b as { text: string }).text
|
||||||
|
}
|
||||||
|
return JSON.stringify(b)
|
||||||
|
})
|
||||||
|
.join('\n')
|
||||||
|
}
|
||||||
|
if (content == null) return ''
|
||||||
|
return JSON.stringify(content)
|
||||||
|
}
|
||||||
|
|
||||||
|
/* eslint-disable @typescript-eslint/no-explicit-any -- stream-json events are genuinely dynamic */
function pushJsonEvent(events: LogEvent[], obj: any): void {
  const type = obj?.type
  const ts: string | null = typeof obj?.timestamp === 'string' ? obj.timestamp : null

  if (type === 'system') {
    const mcp = Array.isArray(obj.mcp_servers)
      ? obj.mcp_servers.map((s: any) => (typeof s?.name === 'string' ? s.name : String(s)))
      : []
    events.push({
      kind: 'system-init',
      ts,
      model: typeof obj.model === 'string' ? obj.model : '—',
      permissionMode: typeof obj.permissionMode === 'string' ? obj.permissionMode : '—',
      tools: Array.isArray(obj.tools) ? obj.tools.filter((t: unknown) => typeof t === 'string') : [],
      mcpServers: mcp,
      sessionId: typeof obj.session_id === 'string' ? obj.session_id : '',
      cwd: typeof obj.cwd === 'string' ? obj.cwd : '',
      version: typeof obj.claude_code_version === 'string' ? obj.claude_code_version : '',
    })
    return
  }

  if (type === 'rate_limit_event') {
    events.push({
      kind: 'rate-limit',
      ts,
      status: typeof obj.rate_limit_info?.status === 'string' ? obj.rate_limit_info.status : 'unknown',
    })
    return
  }

  if (type === 'assistant') {
    const content = obj?.message?.content
    if (Array.isArray(content)) {
      for (const block of content) {
        if (block?.type === 'text' && typeof block.text === 'string') {
          const c = cap(block.text, TEXT_CAP)
          events.push({ kind: 'assistant-text', ts, text: c.text, truncated: c.truncated })
        } else if (block?.type === 'thinking' && typeof block.thinking === 'string') {
          const c = cap(block.thinking, TEXT_CAP)
          events.push({ kind: 'thinking', ts, text: c.text, truncated: c.truncated })
        } else if (block?.type === 'tool_use') {
          let inputStr: string
          try {
            inputStr = JSON.stringify(block.input, null, 2)
          } catch {
            inputStr = String(block.input)
          }
          const c = cap(inputStr, TOOL_INPUT_CAP)
          events.push({
            kind: 'tool-call',
            ts,
            id: typeof block.id === 'string' ? block.id : '',
            name: typeof block.name === 'string' ? block.name : 'tool',
            input: c.text,
            truncated: c.truncated,
          })
        }
      }
    }
    return
  }

  if (type === 'user') {
    const content = obj?.message?.content
    if (Array.isArray(content)) {
      for (const block of content) {
        if (block?.type === 'tool_result') {
          const body = normalizeContent(block.content)
          const c = cap(body, TOOL_RESULT_CAP)
          events.push({
            kind: 'tool-result',
            ts,
            toolUseId: typeof block.tool_use_id === 'string' ? block.tool_use_id : '',
            isError: !!block.is_error,
            body: c.text,
            truncated: c.truncated,
            fullLength: body.length,
          })
        }
      }
    }
    return
  }

  if (type === 'result') {
    const c = cap(typeof obj.result === 'string' ? obj.result : '', TEXT_CAP)
    events.push({
      kind: 'result',
      ts,
      subtype: typeof obj.subtype === 'string' ? obj.subtype : 'unknown',
      isError: !!obj.is_error,
      durationMs: typeof obj.duration_ms === 'number' ? obj.duration_ms : null,
      numTurns: typeof obj.num_turns === 'number' ? obj.num_turns : null,
      totalCostUsd: typeof obj.total_cost_usd === 'number' ? obj.total_cost_usd : null,
      resultText: c.text,
      resultTruncated: c.truncated,
    })
    return
  }

  // Unknown event type — keep a compact raw note so nothing is silently dropped.
  events.push({ kind: 'raw', ts, text: cap(`${type ?? 'event'}: ${JSON.stringify(obj)}`, 2048).text })
}
/* eslint-enable @typescript-eslint/no-explicit-any */

function estimateSize(e: LogEvent): number {
  switch (e.kind) {
    case 'assistant-text':
    case 'thinking':
    case 'raw':
      return e.text.length
    case 'tool-call':
      return e.input.length
    case 'tool-result':
      return e.body.length
    case 'result':
      return e.resultText.length
    default:
      return 64
  }
}

/** Bound the whole payload — drop tool-result bodies oldest-first if still too large. */
function enforceResponseCap(events: LogEvent[]): boolean {
  let total = 0
  for (const e of events) total += estimateSize(e)
  if (total <= RESPONSE_CAP) return false
  for (const e of events) {
    if (total <= RESPONSE_CAP) break
    if (e.kind === 'tool-result' && e.body) {
      total -= e.body.length
      e.body = ''
      e.truncated = true
    }
  }
  return true
}

/** Full event timeline for the detail panel. */
export function parseRunLog(raw: string, fileName: string): ParsedRunLog {
  const summary = summarizeRunLog(raw, fileName)
  const events: LogEvent[] = []

  for (const line of raw.split('\n')) {
    if (!line.trim()) continue
    const m = line.match(META_RE)
    if (m) {
      events.push({ kind: 'meta', ts: m[1], tag: classifyMeta(m[2]), text: m[2] })
      continue
    }
    const trimmed = line.trimStart()
    if (trimmed.startsWith('{')) {
      try {
        pushJsonEvent(events, JSON.parse(trimmed))
      } catch {
        // partial / malformed JSON line (e.g. a log read mid-write) — keep it raw
        events.push({ kind: 'raw', ts: null, text: cap(line, TOOL_RESULT_CAP).text })
      }
      continue
    }
    // Non-JSON, non-meta noise (e.g. a bare `Warning: ...` from claude).
    events.push({ kind: 'raw', ts: null, text: cap(line, TOOL_RESULT_CAP).text })
  }

  const responseTruncated = enforceResponseCap(events)
  return { summary, events, inProgress: summary.inProgress, responseTruncated }
}

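For reference, a minimal usage sketch of the parser above. The input lines are hypothetical stream-json events (field names follow the checks in pushJsonEvent); the file name is an invented example in the `YYYYMMDDTHHMMSSZ.log` format the run-log listing expects.

import { parseRunLog } from '@/lib/parse-worker-log'

// Two hypothetical stream-json lines: a system init and a final result event.
const raw = [
  '{"type":"system","model":"claude-sonnet","permissionMode":"default","tools":["Bash"],"session_id":"s1","cwd":"/work","claude_code_version":"1.0.0"}',
  '{"type":"result","subtype":"success","is_error":false,"duration_ms":1234,"num_turns":3,"total_cost_usd":0.01,"result":"done"}',
].join('\n')

const parsed = parseRunLog(raw, '20240101T000000Z.log')
// parsed.events            -> [{ kind: 'system-init', ... }, { kind: 'result', ... }]
// parsed.summary           -> the same RunLogSummary the table endpoint returns
// parsed.responseTruncated -> only true once the events together exceed RESPONSE_CAP
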
12 lib/utils.ts

@ -14,3 +14,15 @@ export function relativeTime(date: Date): string {
  if (hours < 24) return `${hours}u geleden`
  return `${Math.floor(hours / 24)}d geleden`
}

/** Human-readable duration from a millisecond count. */
export function formatDuration(ms: number): string {
  if (ms < 1000) return `${ms}ms`
  const totalSec = Math.round(ms / 1000)
  if (totalSec < 60) return `${totalSec}s`
  const minutes = Math.floor(totalSec / 60)
  const seconds = totalSec % 60
  if (minutes < 60) return `${minutes}m ${seconds}s`
  const hours = Math.floor(minutes / 60)
  return `${hours}u ${minutes % 60}m`
}

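A few illustrative return values for the new helper (the `u` suffix follows the Dutch abbreviations already used by relativeTime):

formatDuration(850)        // '850ms'
formatDuration(42_000)     // '42s'
formatDuration(93_000)     // '1m 33s'
formatDuration(5_400_000)  // '1u 30m'
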
116 lib/worker-logs.ts (Normal file)

@ -0,0 +1,116 @@
// lib/worker-logs.ts
//
// Server-only filesystem access to the worker run-logs. The directory is
// mounted read-only into the ops-dashboard container (see docker-compose.yml:
// `/srv/scrum4me/worker-logs:/var/worker-logs:ro`). Path configurable via the
// WORKER_LOGS_DIR env var.
//
// Only imported by server components and route handlers — never by a
// 'use client' file.

import 'server-only'
import { readdir, readFile } from 'node:fs/promises'
import { gunzipSync } from 'node:zlib'
import { join, resolve } from 'node:path'
import { summarizeRunLog, type RunLogSummary } from './parse-worker-log'

const WORKER_LOGS_DIR = process.env.WORKER_LOGS_DIR ?? '/var/worker-logs/idea'
const RUNS_DIR = join(WORKER_LOGS_DIR, 'runs')

/** Selectable row counts for the table. */
export const LIMIT_OPTIONS = [10, 25, 50, 100] as const
const DEFAULT_LIMIT = 10

// Filenames are `$(date -u +%Y%m%dT%H%M%SZ).log` — no slashes, no dots beyond
// the literal suffix, so this regex alone rules out path traversal.
const NAME_RE = /^\d{8}T\d{6}Z\.log(\.gz)?$/

export type WorkerLogErrorCode = 'invalid' | 'not-found' | 'unavailable'

export class WorkerLogError extends Error {
  readonly code: WorkerLogErrorCode
  constructor(message: string, code: WorkerLogErrorCode) {
    super(message)
    this.name = 'WorkerLogError'
    this.code = code
  }
}

/** Clamp an arbitrary requested limit down to the largest allowed option. */
export function clampLimit(n: number): number {
  if (!Number.isFinite(n)) return DEFAULT_LIMIT
  let chosen: number = DEFAULT_LIMIT
  for (const opt of LIMIT_OPTIONS) {
    if (n >= opt) chosen = opt
  }
  return chosen
}

export function isValidLogName(name: string): boolean {
  return NAME_RE.test(name)
}

function resolveLogPath(name: string): string {
  if (!isValidLogName(name)) {
    throw new WorkerLogError(`invalid log name: ${name}`, 'invalid')
  }
  const base = resolve(RUNS_DIR)
  const full = resolve(base, name)
  // Defense-in-depth: the regex already forbids traversal, but confirm anyway.
  if (full !== join(base, name)) {
    throw new WorkerLogError(`path escapes worker logs dir: ${name}`, 'invalid')
  }
  return full
}

async function readLogFile(name: string): Promise<string> {
  const full = resolveLogPath(name)
  if (name.endsWith('.gz')) {
    const buf = await readFile(full)
    return gunzipSync(buf).toString('utf8')
  }
  return readFile(full, 'utf8')
}

/** Newest-first summaries for the table. Sorts by filename, slices, then reads. */
export async function listRunLogs(limit: number): Promise<RunLogSummary[]> {
  const n = clampLimit(limit)

  let entries: string[]
  try {
    entries = await readdir(RUNS_DIR)
  } catch (err) {
    throw new WorkerLogError(
      `cannot read worker logs dir ${RUNS_DIR}: ${(err as Error).message}`,
      'unavailable',
    )
  }

  // Filename is `YYYYMMDDTHHMMSSZ` — lexicographic order == chronological order.
  // Sort + slice BEFORE touching file content (the dir holds ~12k files).
  const names = entries.filter(isValidLogName).sort().reverse().slice(0, n)

  return Promise.all(
    names.map(async (name) => {
      try {
        return summarizeRunLog(await readLogFile(name), name)
      } catch {
        // A single unreadable / mid-rotation file must not break the table.
        return { ...summarizeRunLog('', name), status: 'unknown' as const, inProgress: false }
      }
    }),
  )
}

/** Raw contents of one run-log (gunzipped if needed). */
export async function readRunLog(name: string): Promise<string> {
  try {
    return await readLogFile(name)
  } catch (err) {
    if (err instanceof WorkerLogError) throw err
    if ((err as NodeJS.ErrnoException).code === 'ENOENT') {
      throw new WorkerLogError(`log not found: ${name}`, 'not-found')
    }
    throw new WorkerLogError(`cannot read log ${name}: ${(err as Error).message}`, 'unavailable')
  }
}

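A short sketch of how the exported helpers behave, based on the constants above (the log names below are hypothetical):

clampLimit(7)      // 10  (below the smallest option, falls back to DEFAULT_LIMIT)
clampLimit(30)     // 25  (largest allowed option <= 30)
clampLimit(9999)   // 100

isValidLogName('20240101T000000Z.log.gz')  // true
isValidLogName('../../etc/passwd')         // false (rejected before any fs access)

// Inside an async server context: newest-first summaries, capped to an allowed limit.
const rows = await listRunLogs(25)
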
@ -107,6 +107,20 @@ commands:
      - ops-dashboard
    description: "Build a docker compose service image"

  docker_compose_build_worker_fresh:
    # The worker-idea Dockerfile clones scrum4me-mcp from GitHub in a separate
    # layer. A plain `docker compose build` reuses that layer as long as
    # MCP_GIT_REF stays the same (it is always 'main'), so new MCP commits are
    # NOT picked up. Passing MCP_CACHE_BUST with a fresh timestamp invalidates
    # the clone layer. `sh -c` is needed to evaluate $(date) (no shell
    # injection risk: fixed string, no external input).
    cmd:
      - sh
      - -c
      - "docker compose build --build-arg MCP_CACHE_BUST=$(date +%s) worker-idea"
    cwd: "/srv/scrum4me/compose"
    description: "Rebuild worker-idea image, busting the scrum4me-mcp clone cache so the latest MCP code is pulled"

  docker_compose_up:
    cmd: ["docker", "compose", "up", "-d"]
    cwd: "/srv/scrum4me/compose"

@ -236,3 +250,51 @@ commands:
      - -delete
      - -print
    description: "Delete ops_dashboard backup files older than 30 days"

  # ── Server-wide backup (restic + NAS + B2) ────────────────────────────────
  # All wrappers live under /srv/backups/scripts/wrappers/ and read
  # /etc/restic-backup.env (mode 0600 root:root) which the ops-agent user
  # cannot read directly — hence the sudo prefix. See deploy/ops-agent/sudoers
  # for the corresponding NOPASSWD entries.

  read_backup_status:
    cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/read-status.sh"]
    description: "Read /srv/backups/status/last-run.json + last-restore-test.json (JSON)"

  restic_snapshots_nas:
    cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/restic-snapshots.sh", "nas"]
    description: "Restic snapshots from the NAS repo (JSON array, newest first)"

  restic_snapshots_b2:
    cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/restic-snapshots.sh", "b2"]
    description: "Restic snapshots from the B2 repo (JSON array, newest first)"

  restic_stats_nas:
    cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/restic-stats.sh", "nas"]
    description: "Restic stats for the NAS repo (restore-size + raw-data + dedup ratio)"

  restic_stats_b2:
    cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/restic-stats.sh", "b2"]
    description: "Restic stats for the B2 repo (restore-size + raw-data + dedup ratio)"

  list_backup_logs:
    cmd:
      - sh
      - -c
      - "ls -lt /srv/backups/logs/*.log 2>/dev/null | head -10 || echo 'no logs yet'"
    description: "List the 10 most recent server-backup logs"

  tail_backup_log_today:
    cmd:
      - sh
      - -c
      - "f=/srv/backups/logs/server-backup-$(date +%F).log; [ -f \"$f\" ] && tail -200 \"$f\" || echo 'no log for today'"
    description: "Tail the last 200 lines of today's server-backup log"

  trigger_server_backup:
    cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/trigger-backup.sh"]
    description: "Trigger server-backup.service ad-hoc (refuses if already running)"

  trigger_restore_test:
    cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/trigger-restore-test.sh", "nas"]
    description: "Run restore-test.sh against the NAS repo (non-destructive, writes /tmp/restore-test/)"

86 ops-agent/flows.example/redeploy_all.yml (Normal file)

@ -0,0 +1,86 @@
# Full redeploy of the Scrum4Me stack: all three repos in one flow.
# Copy to /etc/ops-agent/flows/redeploy_all.yml on the host.
#
# This is the combined procedure: first the main app (scrum4me-web),
# then the worker (the scrum4me-docker image with a fresh scrum4me-mcp clone).
# Equivalent to update_scrum4me_web.yml followed by update_mcp_worker.yml,
# but as a single atomic flow with an audit trail.
#
# Ordering rationale:
# - Web first: the DB migration (step 6) is additive and non-breaking, so it
#   is safe to run while the old worker is still up.
# - Worker second: the new MCP code may depend on the new DB columns/enums
#   introduced by the web migration.
#
# Steps:
#  1-9.   scrum4me-web: status, fetch, log-ahead, pull, npm ci, migrate,
#         build, restart service, smoke-test
#  10-16. worker: status + fetch + pull scrum4me-docker, pull scrum4me-mcp,
#         cache-busted image rebuild, container recreate, health-wait
#
# Note: the worker rebuild MUST use docker_compose_build_worker_fresh, not
# docker_compose_build; otherwise the scrum4me-mcp clone layer stays cached
# and new MCP code is missed.

name: Redeploy All
description: Full stack redeploy: scrum4me-web (pull/migrate/build/restart) followed by the MCP worker (cache-busted image rebuild)
steps:
  # --- scrum4me-web -------------------------------------------------------
  - command_key: git_status
    args: ["/srv/scrum4me/repos/Scrum4Me"]
    on_failure: continue

  - command_key: git_fetch
    args: ["/srv/scrum4me/repos/Scrum4Me"]
    on_failure: abort

  - command_key: git_log_ahead
    args: ["/srv/scrum4me/repos/Scrum4Me"]
    on_failure: continue

  - command_key: git_pull
    args: ["/srv/scrum4me/repos/Scrum4Me"]
    on_failure: abort

  - command_key: npm_ci
    on_failure: abort

  - command_key: prisma_migrate_deploy
    on_failure: abort

  - command_key: npm_run_build
    on_failure: abort

  - command_key: systemctl_restart
    args: ["scrum4me-web"]
    on_failure: abort

  - command_key: curl_smoke_scrum4me_thuis
    on_failure: continue

  # --- MCP worker ---------------------------------------------------------
  - command_key: git_status
    args: ["/srv/scrum4me/repos/scrum4me-docker"]
    on_failure: continue

  - command_key: git_fetch
    args: ["/srv/scrum4me/repos/scrum4me-docker"]
    on_failure: abort

  - command_key: git_pull
    args: ["/srv/scrum4me/repos/scrum4me-docker"]
    on_failure: abort

  - command_key: git_pull
    args: ["/srv/scrum4me/repos/scrum4me-mcp"]
    on_failure: continue

  - command_key: docker_compose_build_worker_fresh
    on_failure: abort

  - command_key: docker_compose_up_recreate
    args: ["worker-idea"]
    on_failure: abort

  - command_key: wait_for_health_worker
    on_failure: continue

21 ops-agent/flows.example/server_backup_full.yml (Normal file)

@ -0,0 +1,21 @@
# Trigger a full server-wide backup (pg_dumpall + restic to NAS + B2).
# Runs out-of-band via systemd; this flow just kicks it off and then tails
# today's log + reads the structured status file so the dashboard can render
# progress and the final result.
#
# Copy to /etc/ops-agent/flows/server_backup_full.yml on the host.
# Triggered manually via /settings/backups → "Backup now" or by the daily
# server-backup.timer (which runs server-backup.service directly, skipping
# this flow).

name: Server backup (full)
description: Daily full server backup — pg_dumpall + restic to NAS + B2 (Object Lock)
steps:
  - command_key: trigger_server_backup
    on_failure: abort

  - command_key: tail_backup_log_today
    on_failure: continue

  - command_key: read_backup_status
    on_failure: continue

14 ops-agent/flows.example/server_backup_restore_test.yml (Normal file)

@ -0,0 +1,14 @@
# Run a non-destructive restore test against the NAS repo. Restores the latest
# snapshot to /tmp/restore-test/ and asserts that critical files came back
# intact. Used to verify backups periodically without touching the live stack.
#
# Copy to /etc/ops-agent/flows/server_backup_restore_test.yml on the host.

name: Server backup — restore test
description: Restore latest snapshot to /tmp/restore-test and assert critical files
steps:
  - command_key: trigger_restore_test
    on_failure: continue

  - command_key: read_backup_status
    on_failure: continue

@ -2,15 +2,21 @@
 # Copy to /etc/ops-agent/flows/update_mcp_worker.yml on the host.
 #
 # Steps:
-# 1. Show current git status (informational)
-# 2. Fetch remote refs
-# 3. Fast-forward pull (aborts if working tree is dirty)
-# 4. Rebuild the Docker image
-# 5. Recreate the container in detached mode (force-recreate picks up new image)
-# 6. Wait for worker pre-flight to pass (checks /var/log/agent/current)
+# 1. Show current git status of scrum4me-docker (informational)
+# 2. Fetch remote refs for scrum4me-docker
+# 3. Fast-forward pull scrum4me-docker (aborts if working tree is dirty)
+# 4. Fast-forward pull scrum4me-mcp, to keep the local repo in sync. The image
+#    build clones MCP from GitHub itself, so this is only a local reference;
+#    on_failure: continue so a dirty mcp tree does not block the deploy.
+# 5. Rebuild the worker image WITH a cache-bust. A plain build reuses the
+#    scrum4me-mcp clone layer (MCP_GIT_REF stays 'main'), so new MCP commits
+#    would be missed. docker_compose_build_worker_fresh forces a fresh clone
+#    via MCP_CACHE_BUST.
+# 6. Recreate the container (force-recreate picks up the new image)
+# 7. Wait for worker pre-flight to pass (checks /var/log/agent/current)

 name: Update MCP Worker
-description: Pull latest code, rebuild Docker image, and restart the MCP worker service
+description: Pull latest code, rebuild the worker image with a fresh scrum4me-mcp clone, and recreate the worker container
 steps:
   - command_key: git_status
     args: ["/srv/scrum4me/repos/scrum4me-docker"]

@ -24,8 +30,11 @@ steps:
     args: ["/srv/scrum4me/repos/scrum4me-docker"]
     on_failure: abort

-  - command_key: docker_compose_build
-    args: ["worker-idea"]
+  - command_key: git_pull
+    args: ["/srv/scrum4me/repos/scrum4me-mcp"]
+    on_failure: continue
+
+  - command_key: docker_compose_build_worker_fresh
     on_failure: abort

   - command_key: docker_compose_up_recreate