diff --git a/app/settings/backups/_components/backups-panel.tsx b/app/settings/backups/_components/backups-panel.tsx index b2707d9..bf0a2ef 100644 --- a/app/settings/backups/_components/backups-panel.tsx +++ b/app/settings/backups/_components/backups-panel.tsx @@ -1,171 +1,52 @@ 'use client' -import { useState, useCallback } from 'react' -import Link from 'next/link' -import { useFlowRun } from '@/hooks/useFlowRun' -import StreamingTerminal from '@/components/StreamingTerminal' -import ConfirmDialog from '@/components/ConfirmDialog' import type { BackupFile } from '../page' - -function formatSize(bytes: number): string { - if (bytes === 0) return '—' - if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB` - return `${(bytes / (1024 * 1024)).toFixed(1)} MB` -} +import type { + BackupStatusEnvelope, + ResticSnapshot, + ResticStats, +} from '../_lib/types' +import DatabaseBackupsSection from './database-backups-section' +import ServerBackupSection from './server-backup-section' type Props = { backups: BackupFile[] listError: string | null + envelope: BackupStatusEnvelope + nasSnapshots: ResticSnapshot[] + b2Snapshots: ResticSnapshot[] + nasStats: ResticStats | null + b2Stats: ResticStats | null + serverBackupErrors: { + status?: string + nasSnapshots?: string + b2Snapshots?: string + nasStats?: string + b2Stats?: string + } } -export default function BackupsPanel({ backups, listError }: Props) { - const [pending, setPending] = useState(false) - const [completedFlowRunId, setCompletedFlowRunId] = useState(null) - - const handleComplete = useCallback((flowRunId: string) => { - setCompletedFlowRunId(flowRunId) - }, []) - - const flowRun = useFlowRun(handleComplete) - - const handleConfirm = useCallback(() => { - setPending(false) - setCompletedFlowRunId(null) - flowRun.startFlow('backup_ops_db', false) - }, [flowRun]) - - const handleReset = useCallback(() => { - flowRun.reset() - setCompletedFlowRunId(null) - }, [flowRun]) - +export default function BackupsPanel({ + backups, + listError, + envelope, + nasSnapshots, + b2Snapshots, + nasStats, + b2Stats, + serverBackupErrors, +}: Props) { return ( -
- {/* Description */} -
-

- Backs up the ops_dashboard database using{' '} - pg_dump. Dumps are stored in{' '} - /srv/ops/backups/ and retained for 30 days. - For automated daily backups, enable the systemd timer:{' '} - deploy/ops-agent/ops-db-backup.timer. -

- -
    -
  1. - 1. - pg_dump ops_dashboard → /srv/ops/backups/ops_db_YYYYMMDD_HHMM.dump -
  2. -
  3. - 2. - cleanup: delete backup files older than 30 days -
  4. -
-
- - {/* Action buttons */} -
- - {flowRun.status !== 'idle' && flowRun.status !== 'running' && ( - - )} -
- - {/* Terminal output */} - {flowRun.status !== 'idle' && ( -
-
- Output - {completedFlowRunId && ( - - View in audit log → - - )} -
- - {flowRun.status === 'done' && ( -

- Reload this page to see the updated backup list. -

- )} -
- )} - - {/* Backup list */} -
-

Existing backups

- - {listError && ( -
- Could not list backups: {listError} -
- )} - - {!listError && backups.length === 0 && ( -
- No backups found in /srv/ops/backups/ -
- )} - - {!listError && backups.length > 0 && ( -
- - - - - - - - - - {backups.map((b, i) => ( - - - - - - ))} - -
- Timestamp - FileSize
{b.label}{b.name} - {formatSize(b.sizeBytes)} -
-
- )} - -

- Backups older than 30 days are removed automatically by the cleanup step. -

-
- - {/* Confirm dialog */} - setPending(false)} +
+ +
+
) diff --git a/app/settings/backups/_components/database-backups-section.tsx b/app/settings/backups/_components/database-backups-section.tsx new file mode 100644 index 0000000..81a0285 --- /dev/null +++ b/app/settings/backups/_components/database-backups-section.tsx @@ -0,0 +1,172 @@ +'use client' + +import { useCallback, useState } from 'react' +import Link from 'next/link' +import { useFlowRun } from '@/hooks/useFlowRun' +import StreamingTerminal from '@/components/StreamingTerminal' +import ConfirmDialog from '@/components/ConfirmDialog' +import type { BackupFile } from '../page' + +function formatSize(bytes: number): string { + if (bytes === 0) return '—' + if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB` + return `${(bytes / (1024 * 1024)).toFixed(1)} MB` +} + +type Props = { + backups: BackupFile[] + listError: string | null +} + +export default function DatabaseBackupsSection({ backups, listError }: Props) { + const [pending, setPending] = useState(false) + const [completedFlowRunId, setCompletedFlowRunId] = useState(null) + + const handleComplete = useCallback((flowRunId: string) => { + setCompletedFlowRunId(flowRunId) + }, []) + + const flowRun = useFlowRun(handleComplete) + + const handleConfirm = useCallback(() => { + setPending(false) + setCompletedFlowRunId(null) + flowRun.startFlow('backup_ops_db', false) + }, [flowRun]) + + const handleReset = useCallback(() => { + flowRun.reset() + setCompletedFlowRunId(null) + }, [flowRun]) + + return ( +
+
+

Database backups

+ flow: backup_ops_db +
+ +
+

+ Backs up the ops_dashboard database using{' '} + pg_dump. Dumps are stored in{' '} + /srv/ops/backups/ and retained for 30 days. + For automated daily backups, enable the systemd timer:{' '} + deploy/ops-agent/ops-db-backup.timer. +

+ +
    +
  1. + 1. + pg_dump ops_dashboard → /srv/ops/backups/ops_db_YYYYMMDD_HHMM.dump +
  2. +
  3. + 2. + cleanup: delete backup files older than 30 days +
  4. +
+
+ +
+ + {flowRun.status !== 'idle' && flowRun.status !== 'running' && ( + + )} +
+ + {flowRun.status !== 'idle' && ( +
+
+ Output + {completedFlowRunId && ( + + View in audit log → + + )} +
+ + {flowRun.status === 'done' && ( +

+ Reload this page to see the updated backup list. +

+ )} +
+ )} + +
+

Existing backups

+ + {listError && ( +
+ Could not list backups: {listError} +
+ )} + + {!listError && backups.length === 0 && ( +
+ No backups found in /srv/ops/backups/ +
+ )} + + {!listError && backups.length > 0 && ( +
+ + + + + + + + + + {backups.map((b, i) => ( + + + + + + ))} + +
+ Timestamp + FileSize
{b.label}{b.name} + {formatSize(b.sizeBytes)} +
+
+ )} + +

+ Backups older than 30 days are removed automatically by the cleanup step. +

+
+ + setPending(false)} + /> +
+ ) +} diff --git a/app/settings/backups/_components/server-backup-section.tsx b/app/settings/backups/_components/server-backup-section.tsx new file mode 100644 index 0000000..bd64cb9 --- /dev/null +++ b/app/settings/backups/_components/server-backup-section.tsx @@ -0,0 +1,447 @@ +'use client' + +import { useCallback, useState } from 'react' +import Link from 'next/link' +import { useFlowRun } from '@/hooks/useFlowRun' +import StreamingTerminal from '@/components/StreamingTerminal' +import ConfirmDialog from '@/components/ConfirmDialog' +import type { + BackupPhase, + BackupStatus, + BackupStatusEnvelope, + OverallStatus, + PhaseStatus, + ResticSnapshot, + ResticStats, +} from '../_lib/types' + +type Props = { + envelope: BackupStatusEnvelope + nasSnapshots: ResticSnapshot[] + b2Snapshots: ResticSnapshot[] + nasStats: ResticStats | null + b2Stats: ResticStats | null + errors: { + status?: string + nasSnapshots?: string + b2Snapshots?: string + nasStats?: string + b2Stats?: string + } +} + +type ActiveFlow = 'backup' | 'restore' | null + +function formatBytes(bytes: number | null | undefined): string { + if (bytes == null) return '—' + if (bytes < 1024) return `${bytes} B` + if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB` + if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)} MB` + return `${(bytes / (1024 * 1024 * 1024)).toFixed(2)} GB` +} + +function formatDuration(seconds: number | null | undefined): string { + if (seconds == null || seconds === 0) return '—' + if (seconds < 60) return `${seconds}s` + if (seconds < 3600) return `${Math.floor(seconds / 60)}m ${seconds % 60}s` + const h = Math.floor(seconds / 3600) + const m = Math.floor((seconds % 3600) / 60) + return `${h}h ${m}m` +} + +function formatTimestamp(iso: string | null | undefined): string { + if (!iso) return '—' + try { + const d = new Date(iso) + if (Number.isNaN(d.getTime())) return iso + const yyyy = d.getFullYear() + const mm = String(d.getMonth() + 1).padStart(2, '0') + const dd = String(d.getDate()).padStart(2, '0') + const hh = String(d.getHours()).padStart(2, '0') + const mi = String(d.getMinutes()).padStart(2, '0') + return `${yyyy}-${mm}-${dd} ${hh}:${mi}` + } catch { + return iso + } +} + +function overallBadgeClass(status: OverallStatus): string { + switch (status) { + case 'success': + return 'bg-green-500/15 text-green-500 border-green-500/30' + case 'partial_failure': + return 'bg-amber-500/15 text-amber-500 border-amber-500/30' + case 'failed': + return 'bg-destructive/15 text-destructive border-destructive/30' + default: + return 'bg-muted/50 text-muted-foreground border-border' + } +} + +function phaseIcon(status: PhaseStatus): { glyph: string; color: string } { + switch (status) { + case 'success': + return { glyph: '✓', color: 'text-green-500' } + case 'skipped': + return { glyph: '–', color: 'text-muted-foreground' } + case 'degraded': + return { glyph: '!', color: 'text-amber-500' } + case 'failed': + return { glyph: '✗', color: 'text-destructive' } + case 'pending': + default: + return { glyph: '○', color: 'text-muted-foreground/50' } + } +} + +function phaseDurationSeconds(phase: BackupPhase): number | null { + if (!phase.startedAt || !phase.completedAt) return null + const start = new Date(phase.startedAt).getTime() + const end = new Date(phase.completedAt).getTime() + if (Number.isNaN(start) || Number.isNaN(end)) return null + return Math.max(0, Math.round((end - start) / 1000)) +} + +function StatusCard({ status }: { status: BackupStatus | null }) { + if 
(!status) { + return ( +
+ No backup run recorded yet. Trigger one with the "Backup now" button below. +
+ ) + } + return ( +
+
+
+ + {status.overallStatus.replace('_', ' ')} + + + Last run {formatTimestamp(status.completedAt)} on{' '} + {status.host || '—'} + +
+ + duration {formatDuration(status.durationSeconds)} + +
+
+ {status.phases.map((p) => { + const icon = phaseIcon(p.status) + const dur = phaseDurationSeconds(p) + return ( +
+ {icon.glyph} +
+ {p.name} + + {p.status} + {dur != null ? ` · ${formatDuration(dur)}` : ''} + +
+
+ ) + })} +
+
+ ) +} + +function StatsBlock({ stats, label, error }: { stats: ResticStats | null; label: string; error?: string }) { + if (error) { + return ( +
+ {label}: {error} +
+ ) + } + if (!stats) { + return ( +
+ {label}: no stats yet +
+ ) + } + const dedup = + stats.dedupRatio != null && Number.isFinite(stats.dedupRatio) + ? `${stats.dedupRatio.toFixed(2)}×` + : '—' + return ( +
+
+ + {label} + + + {stats.snapshotsCount} snapshot{stats.snapshotsCount === 1 ? '' : 's'} + +
+
+
restore size
+
{formatBytes(stats.restoreSizeBytes)}
+
raw data
+
{formatBytes(stats.rawDataBytes)}
+
dedup ratio
+
{dedup}
+
+
+ ) +} + +function SnapshotsTable({ + snapshots, + label, + error, +}: { + snapshots: ResticSnapshot[] + label: string + error?: string +}) { + return ( +
+
+

{label}

+ {snapshots.length} shown +
+ {error ? ( +
+ {error} +
+ ) : snapshots.length === 0 ? ( +
+ No snapshots in this repo yet. +
+ ) : ( +
+ + + + + + + + + + + {snapshots.map((s, i) => ( + + + + + + + ))} + +
TimeIDTags + Files / size added +
{formatTimestamp(s.time)}{s.shortId} + {s.tags.join(', ') || '—'} + + {s.summary?.files_new != null + ? `${s.summary.files_new} new · ${formatBytes(s.summary.data_added ?? 0)}` + : '—'} +
+
+ )} +
+ ) +} + +export default function ServerBackupSection({ + envelope, + nasSnapshots, + b2Snapshots, + nasStats, + b2Stats, + errors, +}: Props) { + const [pending, setPending] = useState(null) + const [completedFlowRunId, setCompletedFlowRunId] = useState(null) + const [activeFlow, setActiveFlow] = useState(null) + + const handleComplete = useCallback((flowRunId: string) => { + setCompletedFlowRunId(flowRunId) + }, []) + + const flowRun = useFlowRun(handleComplete) + + const startFlow = useCallback( + (kind: 'backup' | 'restore') => { + setPending(null) + setCompletedFlowRunId(null) + setActiveFlow(kind) + flowRun.startFlow( + kind === 'backup' ? 'server_backup_full' : 'server_backup_restore_test', + false, + ) + }, + [flowRun], + ) + + const handleReset = useCallback(() => { + flowRun.reset() + setCompletedFlowRunId(null) + setActiveFlow(null) + }, [flowRun]) + + return ( +
+
+

Server backup (restic)

+ flows: server_backup_full · restore_test +
+ +
+

+ Daily server-wide backup at 03:30: pg_dumpall + + Forgejo dump, then restic to NAS (local) and Backblaze B2{' '} + (offsite, Object Lock). Authoritative restore sources are the database dumps; live datadirs + are excluded. See{' '} + + docs/runbooks/server-backup.md + {' '} + for the full procedure. +

+
+ + + {errors.status && ( +
+ Could not read backup status: {errors.status} +
+ )} + +
+ + +
+ +
+ + + {flowRun.status !== 'idle' && flowRun.status !== 'running' && ( + + )} +
+ + {flowRun.status !== 'idle' && ( +
+
+ + Output {activeFlow ? `(${activeFlow === 'backup' ? 'backup' : 'restore test'})` : ''} + + {completedFlowRunId && ( + + View in audit log → + + )} +
+ + {flowRun.status === 'done' && ( +

+ Reload this page to see the updated status, snapshots, and stats. +

+ )} +
+ )} + +
+ + +
+ + {envelope.lastRestoreTest && ( +
+
+

Last restore test

+ + {envelope.lastRestoreTest.overallStatus.replace('_', ' ')} + +
+

+ {formatTimestamp(envelope.lastRestoreTest.completedAt)} · repo{' '} + {envelope.lastRestoreTest.repo} · snapshot{' '} + + {envelope.lastRestoreTest.snapshotId?.slice(0, 8) ?? '—'} + {' '} + · {envelope.lastRestoreTest.assertions.length} assertions +

+ {envelope.lastRestoreTest.assertions.some((a) => a.status !== 'ok') && ( +
    + {envelope.lastRestoreTest.assertions + .filter((a) => a.status !== 'ok') + .map((a) => ( +
  • + {a.status === 'missing' ? '✗ missing' : '! empty'} · {a.path} +
  • + ))} +
+ )} +
+ )} + + startFlow('backup')} + onCancel={() => setPending(null)} + /> + startFlow('restore')} + onCancel={() => setPending(null)} + /> +
+ ) +} diff --git a/app/settings/backups/_lib/parse.ts b/app/settings/backups/_lib/parse.ts new file mode 100644 index 0000000..70d23fe --- /dev/null +++ b/app/settings/backups/_lib/parse.ts @@ -0,0 +1,191 @@ +import type { + BackupPhase, + BackupStatus, + BackupStatusEnvelope, + OverallStatus, + PhaseStatus, + ResticSnapshot, + ResticStats, + RestoreTestAssertion, + RestoreTestStatus, +} from './types' + +const PHASE_ORDER = [ + 'postgres_dump', + 'forgejo_dump', + 'forgejo_db_dump', + 'restic_nas', + 'restic_b2', + 'forget_nas', + 'check_nas', + 'check_b2', +] as const + +function isRecord(v: unknown): v is Record { + return typeof v === 'object' && v !== null && !Array.isArray(v) +} + +function asString(v: unknown): string | null { + return typeof v === 'string' ? v : null +} + +function asNumber(v: unknown): number | null { + return typeof v === 'number' && Number.isFinite(v) ? v : null +} + +function asPhaseStatus(v: unknown): PhaseStatus { + if ( + v === 'success' || + v === 'skipped' || + v === 'degraded' || + v === 'failed' || + v === 'pending' + ) { + return v + } + return 'pending' +} + +function asOverallStatus(v: unknown): OverallStatus { + if (v === 'success' || v === 'partial_failure' || v === 'failed') return v + return 'unknown' +} + +function parsePhase(name: string, raw: unknown): BackupPhase { + if (!isRecord(raw)) { + return { + name, + status: 'pending', + exitCode: null, + startedAt: null, + completedAt: null, + error: null, + } + } + return { + name, + status: asPhaseStatus(raw.status), + exitCode: asNumber(raw.exit_code), + startedAt: asString(raw.started_at), + completedAt: asString(raw.completed_at), + error: asString(raw.error), + snapshotId: asString(raw.snapshot_id) ?? undefined, + filesNew: asNumber(raw.files_new), + dataAddedBytes: asNumber(raw.data_added_bytes), + outputFile: asString(raw.output_file) ?? undefined, + bytes: asNumber(raw.bytes), + } +} + +function parseBackupStatus(raw: unknown): BackupStatus | null { + if (!isRecord(raw)) return null + const phasesRaw = isRecord(raw.phases) ? raw.phases : {} + const phases = PHASE_ORDER.map((name) => parsePhase(name, phasesRaw[name])) + return { + schemaVersion: asNumber(raw.schema_version) ?? 1, + overallStatus: asOverallStatus(raw.overall_status), + startedAt: asString(raw.started_at) ?? '', + completedAt: asString(raw.completed_at) ?? '', + durationSeconds: asNumber(raw.duration_seconds) ?? 0, + host: asString(raw.host) ?? '', + phases, + } +} + +function parseRestoreTestAssertion(raw: unknown): RestoreTestAssertion | null { + if (!isRecord(raw)) return null + const status = raw.status + if (status !== 'ok' && status !== 'empty' && status !== 'missing') return null + return { + path: asString(raw.path) ?? '', + status, + bytes: asNumber(raw.bytes) ?? 0, + } +} + +function parseRestoreTestStatus(raw: unknown): RestoreTestStatus | null { + if (!isRecord(raw)) return null + const assertionsRaw = Array.isArray(raw.assertions) ? raw.assertions : [] + const assertions: RestoreTestAssertion[] = [] + for (const a of assertionsRaw) { + const parsed = parseRestoreTestAssertion(a) + if (parsed) assertions.push(parsed) + } + return { + schemaVersion: asNumber(raw.schema_version) ?? 1, + overallStatus: asOverallStatus(raw.overall_status), + startedAt: asString(raw.started_at) ?? '', + completedAt: asString(raw.completed_at) ?? '', + durationSeconds: asNumber(raw.duration_seconds) ?? 0, + repo: asString(raw.repo) ?? 
'', + snapshotId: asString(raw.snapshot_id), + restoreExitCode: asNumber(raw.restore_exit_code), + target: asString(raw.target) ?? undefined, + assertions, + error: asString(raw.error) ?? undefined, + } +} + +export function parseStatusEnvelope(output: string): BackupStatusEnvelope { + try { + const trimmed = output.trim() + if (!trimmed) return { lastRun: null, lastRestoreTest: null } + const parsed: unknown = JSON.parse(trimmed) + if (!isRecord(parsed)) return { lastRun: null, lastRestoreTest: null } + return { + lastRun: parseBackupStatus(parsed.last_run), + lastRestoreTest: parseRestoreTestStatus(parsed.last_restore_test), + } + } catch { + return { lastRun: null, lastRestoreTest: null } + } +} + +export function parseResticSnapshots(output: string, repo: 'nas' | 'b2'): ResticSnapshot[] { + try { + const trimmed = output.trim() + if (!trimmed) return [] + const parsed: unknown = JSON.parse(trimmed) + if (!Array.isArray(parsed)) return [] + const result: ResticSnapshot[] = [] + for (const s of parsed) { + if (!isRecord(s)) continue + const id = asString(s.id) + if (!id) continue + const shortId = asString(s.short_id) ?? id.slice(0, 8) + const time = asString(s.time) ?? '' + const hostname = asString(s.hostname) ?? '' + const tags = Array.isArray(s.tags) + ? s.tags.filter((t): t is string => typeof t === 'string') + : [] + const paths = Array.isArray(s.paths) + ? s.paths.filter((p): p is string => typeof p === 'string') + : [] + const summary = isRecord(s.summary) ? (s.summary as ResticSnapshot['summary']) : null + result.push({ id, shortId, time, hostname, tags, paths, repo, summary }) + } + return result + } catch { + return [] + } +} + +export function parseResticStats(output: string, repo: 'nas' | 'b2'): ResticStats | null { + try { + const trimmed = output.trim() + if (!trimmed) return null + const parsed: unknown = JSON.parse(trimmed) + if (!isRecord(parsed)) return null + return { + repo, + snapshotsCount: asNumber(parsed.snapshots_count) ?? 
0, + restoreSizeBytes: asNumber(parsed.restore_size_bytes), + restoreSizeFiles: asNumber(parsed.restore_size_files), + rawDataBytes: asNumber(parsed.raw_data_bytes), + rawBlobCount: asNumber(parsed.raw_blob_count), + dedupRatio: asNumber(parsed.dedup_ratio), + } + } catch { + return null + } +} diff --git a/app/settings/backups/_lib/types.ts b/app/settings/backups/_lib/types.ts new file mode 100644 index 0000000..5b768f0 --- /dev/null +++ b/app/settings/backups/_lib/types.ts @@ -0,0 +1,78 @@ +export type PhaseStatus = 'success' | 'skipped' | 'degraded' | 'failed' | 'pending' +export type OverallStatus = 'success' | 'partial_failure' | 'failed' | 'unknown' + +export interface BackupPhase { + name: string + status: PhaseStatus + exitCode: number | null + startedAt: string | null + completedAt: string | null + error: string | null + snapshotId?: string + filesNew?: number | null + dataAddedBytes?: number | null + outputFile?: string + bytes?: number | null +} + +export interface BackupStatus { + schemaVersion: number + overallStatus: OverallStatus + startedAt: string + completedAt: string + durationSeconds: number + host: string + phases: BackupPhase[] +} + +export interface RestoreTestAssertion { + path: string + status: 'ok' | 'empty' | 'missing' + bytes: number +} + +export interface RestoreTestStatus { + schemaVersion: number + overallStatus: OverallStatus + startedAt: string + completedAt: string + durationSeconds: number + repo: string + snapshotId: string | null + restoreExitCode: number | null + target?: string + assertions: RestoreTestAssertion[] + error?: string +} + +export interface BackupStatusEnvelope { + lastRun: BackupStatus | null + lastRestoreTest: RestoreTestStatus | null +} + +export interface ResticSnapshot { + id: string + shortId: string + time: string + hostname: string + tags: string[] + paths: string[] + repo: 'nas' | 'b2' + summary?: { + files_new?: number + files_changed?: number + data_added?: number + total_files_processed?: number + total_bytes_processed?: number + } | null +} + +export interface ResticStats { + repo: 'nas' | 'b2' + snapshotsCount: number + restoreSizeBytes: number | null + restoreSizeFiles: number | null + rawDataBytes: number | null + rawBlobCount: number | null + dedupRatio: number | null +} diff --git a/app/settings/backups/page.tsx b/app/settings/backups/page.tsx index 89f72c7..3a17103 100644 --- a/app/settings/backups/page.tsx +++ b/app/settings/backups/page.tsx @@ -3,6 +3,16 @@ import { redirect } from 'next/navigation' import { getCurrentUser } from '@/lib/session' import { execAgent } from '@/lib/agent-client' import BackupsPanel from './_components/backups-panel' +import { + parseResticSnapshots, + parseResticStats, + parseStatusEnvelope, +} from './_lib/parse' +import type { + BackupStatusEnvelope, + ResticSnapshot, + ResticStats, +} from './_lib/types' export const dynamic = 'force-dynamic' @@ -27,23 +37,74 @@ function parseBackupList(output: string): BackupFile[] { .filter((b) => b.name) } +function errorMessage(err: unknown): string { + return err instanceof Error ? 
err.message : 'agent call failed' +} + +async function tryExec(command: string): Promise<{ output: string | null; error: string | null }> { + try { + const output = await execAgent(command) + return { output, error: null } + } catch (err) { + return { output: null, error: errorMessage(err) } + } +} + export default async function BackupsPage() { const user = await getCurrentUser() if (!user) redirect('/login') - let backups: BackupFile[] = [] - let listError: string | null = null + // Run all agent calls in parallel; per-call error isolation so one failure + // does not blank the entire page. + const [ + backupListResult, + statusResult, + nasSnapshotsResult, + b2SnapshotsResult, + nasStatsResult, + b2StatsResult, + ] = await Promise.all([ + tryExec('list_ops_backups'), + tryExec('read_backup_status'), + tryExec('restic_snapshots_nas'), + tryExec('restic_snapshots_b2'), + tryExec('restic_stats_nas'), + tryExec('restic_stats_b2'), + ]) - try { - const output = await execAgent('list_ops_backups') - backups = parseBackupList(output) - } catch (err) { - listError = err instanceof Error ? err.message : 'failed to list backups' + const backups: BackupFile[] = backupListResult.output + ? parseBackupList(backupListResult.output) + : [] + const listError = backupListResult.error + + const envelope: BackupStatusEnvelope = statusResult.output + ? parseStatusEnvelope(statusResult.output) + : { lastRun: null, lastRestoreTest: null } + + const nasSnapshots: ResticSnapshot[] = nasSnapshotsResult.output + ? parseResticSnapshots(nasSnapshotsResult.output, 'nas') + : [] + const b2Snapshots: ResticSnapshot[] = b2SnapshotsResult.output + ? parseResticSnapshots(b2SnapshotsResult.output, 'b2') + : [] + const nasStats: ResticStats | null = nasStatsResult.output + ? parseResticStats(nasStatsResult.output, 'nas') + : null + const b2Stats: ResticStats | null = b2StatsResult.output + ? parseResticStats(b2StatsResult.output, 'b2') + : null + + const serverBackupErrors = { + status: statusResult.error ?? undefined, + nasSnapshots: nasSnapshotsResult.error ?? undefined, + b2Snapshots: b2SnapshotsResult.error ?? undefined, + nasStats: nasStatsResult.error ?? undefined, + b2Stats: b2StatsResult.error ?? undefined, } return (
-
+
← Home @@ -52,7 +113,16 @@ export default async function BackupsPage() {

Backups

- +
<BackupsPanel
+          backups={backups}
+          listError={listError}
+          envelope={envelope}
+          nasSnapshots={nasSnapshots}
+          b2Snapshots={b2Snapshots}
+          nasStats={nasStats}
+          b2Stats={b2Stats}
+          serverBackupErrors={serverBackupErrors}
+        />
       </div>
     </div>
   )
diff --git a/deploy/ops-agent/sudoers b/deploy/ops-agent/sudoers
index 93c5646..4927d1e 100644
--- a/deploy/ops-agent/sudoers
+++ b/deploy/ops-agent/sudoers
@@ -1,9 +1,19 @@
 # /etc/sudoers.d/ops-agent
-# NOPASSWD for explicit systemctl restart invocations by the ops-agent service account.
-# Only the service names whitelisted in commands.yml are listed here.
+# NOPASSWD for explicit invocations by the ops-agent service account.
+# Only the service names + wrapper scripts whitelisted in commands.yml are listed here.
 # Installed by deploy/ops-agent/setup.sh.

 ops-agent ALL=(root) NOPASSWD: \
     /usr/bin/systemctl restart scrum4me-web, \
     /usr/bin/systemctl restart ops-agent, \
-    /usr/bin/systemctl restart caddy
+    /usr/bin/systemctl restart caddy, \
+    /srv/backups/scripts/wrappers/read-status.sh, \
+    /srv/backups/scripts/wrappers/restic-snapshots.sh nas, \
+    /srv/backups/scripts/wrappers/restic-snapshots.sh b2, \
+    /srv/backups/scripts/wrappers/restic-stats.sh nas, \
+    /srv/backups/scripts/wrappers/restic-stats.sh b2, \
+    /srv/backups/scripts/wrappers/restic-check.sh nas, \
+    /srv/backups/scripts/wrappers/restic-check.sh b2, \
+    /srv/backups/scripts/wrappers/trigger-backup.sh, \
+    /srv/backups/scripts/wrappers/trigger-restore-test.sh nas, \
+    /srv/backups/scripts/wrappers/trigger-restore-test.sh b2
diff --git a/deploy/server-backup/README.md b/deploy/server-backup/README.md
new file mode 100644
index 0000000..bb78780
--- /dev/null
+++ b/deploy/server-backup/README.md
@@ -0,0 +1,126 @@
+# Server backup — deploy artefacts
+
+Daily server-wide backup with restic to **NAS** (local) and **Backblaze B2** (offsite, Object Lock), including a structured status file that the ops-dashboard can read.
+
+The full description — prerequisites, B2 keys, Object Lock, the Forgejo restore test, the integrity-check schedule — lives in [`docs/runbooks/server-backup.md`](../../docs/runbooks/server-backup.md).
+
+## Files
+
+| File | Purpose | Location on host |
+|---|---|---|
+| `server-backup.sh` | main script (phase-based, flock, status file) | `/srv/backups/scripts/server-backup.sh` |
+| `restore-test.sh` | restore latest snapshot + check critical files | `/srv/backups/scripts/restore-test.sh` |
+| `server-backup.service` | systemd oneshot | `/etc/systemd/system/server-backup.service` |
+| `server-backup.timer` | daily 03:30 + 10 min jitter | `/etc/systemd/system/server-backup.timer` |
+| `restic-backup.env.example` | env template (repos, B2 keys, Forgejo) | copy to `/etc/restic-backup.env` |
+
+Also to be created by hand (not in this repo, because they are secrets):
+
+- `/etc/restic-backup.password` — just the restic password (mode `0400 root:root`).
+
+## Quick install (see the runbook for full context)
+
+```bash
+# 1. Tools and directories
+sudo apt update && sudo apt install -y restic jq
+
+sudo mkdir -p /srv/backups/scripts /srv/backups/logs /srv/backups/status \
+  /var/backups/databases
+sudo chmod 0750 /srv/backups/logs /srv/backups/status
+
+# 2. Put the scripts in place
+sudo cp deploy/server-backup/server-backup.sh /srv/backups/scripts/
+sudo cp deploy/server-backup/restore-test.sh /srv/backups/scripts/
+sudo chmod 0750 /srv/backups/scripts/*.sh
+sudo chown root:root /srv/backups/scripts/*.sh
+
+# 3. Env + password
+sudo cp deploy/server-backup/restic-backup.env.example /etc/restic-backup.env
+sudo chmod 0600 /etc/restic-backup.env
+sudo chown root:root /etc/restic-backup.env
+# Generate the password — ALSO keep a copy in your password manager.
+sudo sh -c 'openssl rand -hex 24 > /etc/restic-backup.password'
+sudo chmod 0400 /etc/restic-backup.password
+
+# 4. Fill in /etc/restic-backup.env (RESTIC_REPO_NAS, RESTIC_REPO_B2,
+#    B2_ACCOUNT_ID, B2_ACCOUNT_KEY, FORGEJO_*). See runbook parts A+B.
+
+# 5. Initialise the repos (see runbook part C for Object Lock + key capabilities)
+sudo -E bash -c 'set -a; . /etc/restic-backup.env; set +a; \
+  export RESTIC_PASSWORD_FILE=/etc/restic-backup.password; \
+  restic -r "$RESTIC_REPO_NAS" init && \
+  restic -r "$RESTIC_REPO_B2" init'
+
+# 6. Systemd
+sudo cp deploy/server-backup/server-backup.service /etc/systemd/system/
+sudo cp deploy/server-backup/server-backup.timer /etc/systemd/system/
+sudo systemctl daemon-reload
+sudo systemctl enable --now server-backup.timer
+systemctl list-timers | grep server-backup
+
+# 7. First run by hand (follow along via journalctl)
+sudo systemctl start server-backup.service
+journalctl -u server-backup.service -f
+```
+
+## Verify
+
+```bash
+# Status file
+sudo jq . /srv/backups/status/last-run.json
+
+# Snapshots
+sudo -E bash -c 'set -a; . /etc/restic-backup.env; set +a; \
+  export RESTIC_PASSWORD_FILE=/etc/restic-backup.password; \
+  restic -r "$RESTIC_REPO_NAS" snapshots; \
+  restic -r "$RESTIC_REPO_B2" snapshots'
+
+# Restore test (NAS, non-destructive — restores to /tmp/restore-test)
+sudo /srv/backups/scripts/restore-test.sh nas
+sudo jq . /srv/backups/status/last-restore-test.json
+```
+
+## Status file schema
+
+The script writes `/srv/backups/status/last-run.json` after every run (success or failure), atomically via temp + `mv`. The ops-dashboard reads this file via `read_backup_status` (see `ops-agent/commands.yml.example`).
+
+```json
+{
+  "schema_version": 1,
+  "overall_status": "success | partial_failure | failed",
+  "started_at": "2026-05-15T03:30:00+02:00",
+  "completed_at": "2026-05-15T03:48:21+02:00",
+  "duration_seconds": 1101,
+  "host": "scrum4me-srv",
+  "phases": {
+    "postgres_dump": { "status": "success", "exit_code": 0, "...": "..." },
+    "forgejo_dump": { "status": "skipped", "exit_code": 99, "...": "..." },
+    "forgejo_db_dump": { "status": "skipped", "exit_code": 99 },
+    "restic_nas": { "status": "success", "exit_code": 0, "snapshot_id": "abc123" },
+    "restic_b2": { "status": "degraded", "exit_code": 3, "error": "1 file unreadable" },
+    "forget_nas": { "status": "success", "exit_code": 0 },
+    "check_nas": { "status": "success", "exit_code": 0 },
+    "check_b2": { "status": "success", "exit_code": 0 }
+  }
+}
+```
+
+Per-phase `status` values:
+
+| status | meaning | counts as |
+|---|---|---|
+| `success` | exit 0 | success |
+| `skipped` | exit 99 — phase not applicable (e.g. Forgejo not installed) | success |
+| `degraded` | exit 3 — restic made the snapshot but some files were unreadable | partial_failure |
+| `failed` | any other non-zero exit | partial_failure or failed (see `overall_status`) |
+| `pending` | phase never ran (script aborted before this phase) | partial_failure |
+
+`overall_status` rules:
+
+- **`failed`** if `postgres_dump` fails (the DB dump is authoritative), or if **both** restic repos fail.
+- **`partial_failure`** on any non-critical `failed` or `degraded` phase (e.g. one restic repo down, or forgejo_dump failing while postgres succeeds).
+- **`success`** when no phase is `failed` or `degraded`.
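+
+For monitoring, the `overall_status` field can be probed directly. A minimal sketch (assumes `jq` is installed and the status file is readable; wire the non-zero exit into whatever alerting you already run):
+
+```bash
+# Exits non-zero unless the last run was a full success — cron/healthcheck friendly.
+sudo jq -e '.overall_status == "success"' /srv/backups/status/last-run.json >/dev/null
+```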
+## Ordering relative to the existing `ops-db-backup.timer`
+
+The existing `deploy/ops-agent/ops-db-backup.timer` runs at **02:00** and only does a `pg_dump ops_dashboard` into `/srv/ops/backups/`.
+This new `server-backup.timer` runs at **03:30** and sweeps that directory into its restic backup. The two coexist side by side.
diff --git a/deploy/server-backup/restic-backup.env.example b/deploy/server-backup/restic-backup.env.example
new file mode 100644
index 0000000..e138d80
--- /dev/null
+++ b/deploy/server-backup/restic-backup.env.example
@@ -0,0 +1,44 @@
+# Copy to /etc/restic-backup.env on the host. Permissions: 0600 root:root.
+# RESTIC_PASSWORD lives in /etc/restic-backup.password (mode 0400 root:root)
+# — the backup script sets RESTIC_PASSWORD_FILE from there, so the password
+# never appears in the process listing or this env file.
+
+# ── Restic repositories ────────────────────────────────────────────────────
+# Local NAS path (must be mounted before the timer fires; see runbook).
+RESTIC_REPO_NAS=/mnt/backup-server/restic/scrum4me-srv
+
+# Backblaze B2 repo, format: b2:<bucket>:<path>
+# Bucket must have Object Lock (Governance) with default retention >= 30 days.
+RESTIC_REPO_B2=b2:scrum4me-srv-backup:scrum4me-srv
+
+# ── Backblaze B2 server key ────────────────────────────────────────────────
+# Capabilities REQUIRED:  listBuckets, listFiles, readFiles, writeFiles
+# Capabilities FORBIDDEN: deleteFiles, deleteKeys, bypassGovernance
+# Create with:
+#   b2 application-key create \
+#     --bucket scrum4me-srv-backup \
+#     --name-prefix scrum4me-srv \
+#     server-backup-key \
+#     listBuckets,listFiles,readFiles,writeFiles
+B2_ACCOUNT_ID=REPLACE_WITH_B2_KEY_ID
+B2_ACCOUNT_KEY=REPLACE_WITH_B2_APPLICATION_KEY
+
+# ── Forgejo backup target (optional — set to skip if Forgejo not deployed) ─
+# Container name as it appears in `docker ps`. Set to "" or comment out to
+# skip the Forgejo phases entirely.
+FORGEJO_CONTAINER=forgejo
+# Path to app.ini INSIDE the Forgejo container (used by `forgejo dump -c`).
+FORGEJO_CONFIG=/data/gitea/conf/app.ini
+# Postgres database name for Forgejo (empty = use SQLite, skip forgejo_db_dump).
+FORGEJO_DB_NAME=forgejo
+# Postgres container + role for Forgejo's DB (defaults match scrum4me stack).
+FORGEJO_DB_CONTAINER=scrum4me-postgres
+FORGEJO_DB_USER=scrum4me
+
+# ── Scrum4Me Postgres (required for postgres_dump phase) ───────────────────
+PG_CONTAINER=scrum4me-postgres
+PG_DUMPALL_USER=scrum4me
+
+# ── Optional bandwidth limit for restic B2 upload (KiB/s; 0 = unlimited) ──
+# Translated by the script into `restic --limit-upload "$BACKUP_LIMIT_UPLOAD_KIB"`.
+# BACKUP_LIMIT_UPLOAD_KIB=5000
diff --git a/deploy/server-backup/restore-test.sh b/deploy/server-backup/restore-test.sh
new file mode 100644
index 0000000..d05ae3d
--- /dev/null
+++ b/deploy/server-backup/restore-test.sh
@@ -0,0 +1,177 @@
+#!/usr/bin/env bash
+# Restore the latest restic snapshot to /tmp/restore-test/ and assert that a
+# small set of critical files came back intact. Used by the monthly maintenance
+# check and by the dashboard's "Restore test" button.
+#
+# Usage:
+#   restore-test.sh [nas|b2]
+#
+# Default repo is "nas" (faster, no B2 download fees).
+
+umask 077
+set -uo pipefail
+
+REPO_LABEL="${1:-nas}"
+RESTORE_DIR="${RESTORE_DIR:-/tmp/restore-test}"
+RESTIC_PASSWORD_FILE_PATH="${RESTIC_PASSWORD_FILE_PATH:-/etc/restic-backup.password}"
+STATUS_FILE="${STATUS_FILE:-/srv/backups/status/last-restore-test.json}"
+STATUS_DIR="$(dirname "$STATUS_FILE")"
+STARTED_AT="$(date -Is)"
+SECONDS=0
+
+# Load env (idempotent: ok if already in environment).
+if [ -z "${RESTIC_REPO_NAS:-}" ] && [ -r /etc/restic-backup.env ]; then
+  # shellcheck disable=SC1091
+  set -a; . 
/etc/restic-backup.env; set +a +fi + +case "$REPO_LABEL" in + nas) REPO="${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set}" ;; + b2) REPO="${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set}" ;; + *) echo "ERROR: repo label must be 'nas' or 'b2', got '$REPO_LABEL'" >&2; exit 2 ;; +esac + +if [ ! -r "$RESTIC_PASSWORD_FILE_PATH" ]; then + echo "ERROR: restic password file $RESTIC_PASSWORD_FILE_PATH not readable" >&2 + exit 1 +fi +export RESTIC_PASSWORD_FILE="$RESTIC_PASSWORD_FILE_PATH" + +for tool in jq restic; do + command -v "$tool" >/dev/null 2>&1 || { echo "ERROR: '$tool' not on PATH" >&2; exit 1; } +done + +mkdir -p "$STATUS_DIR" +chmod 0750 "$STATUS_DIR" + +echo "════════════════════════════════════════════════════════════════" +echo " Restore test — started $STARTED_AT" +echo " Repo: $REPO_LABEL ($REPO)" +echo " Target: $RESTORE_DIR" +echo "════════════════════════════════════════════════════════════════" + +# Clean previous attempt to keep results unambiguous. +rm -rf "$RESTORE_DIR" +mkdir -p "$RESTORE_DIR" + +# Find latest snapshot id. +SNAPSHOT_ID=$(restic -r "$REPO" snapshots --json --latest 1 2>/dev/null \ + | jq -r '.[0].short_id // .[0].id // empty') + +if [ -z "$SNAPSHOT_ID" ]; then + echo "ERROR: no snapshots found in $REPO_LABEL repo" + jq -n \ + --arg started "$STARTED_AT" \ + --arg completed "$(date -Is)" \ + --argjson duration "$SECONDS" \ + --arg repo "$REPO_LABEL" \ + '{ + schema_version: 1, + overall_status: "failed", + started_at: $started, + completed_at: $completed, + duration_seconds: $duration, + repo: $repo, + snapshot_id: null, + error: "no snapshots in repo", + assertions: [] + }' > "$STATUS_FILE" + chmod 0644 "$STATUS_FILE" + exit 1 +fi + +echo "Restoring snapshot $SNAPSHOT_ID …" +RESTORE_RC=0 +restic -r "$REPO" restore "$SNAPSHOT_ID" --target "$RESTORE_DIR" || RESTORE_RC=$? + +if [ "$RESTORE_RC" -ne 0 ]; then + echo "ERROR: restic restore exited $RESTORE_RC" +fi + +# Assertions: each is a path that MUST exist and be non-empty. +# Adjust to your stack after first run (and update the runbook addendum). +ASSERTION_PATHS=( + "$RESTORE_DIR/srv/scrum4me/compose/docker-compose.yml" + "$RESTORE_DIR/srv/scrum4me/caddy/Caddyfile" + "$RESTORE_DIR/etc/restic-backup.env" +) + +# Latest postgres dump — match the newest file (glob may resolve to zero). +shopt -s nullglob +PG_DUMPS=("$RESTORE_DIR/var/backups/databases/"postgres-*.sql.gz) +shopt -u nullglob +if [ "${#PG_DUMPS[@]}" -gt 0 ]; then + # pick lexicographic last (= newest date, ISO format) + LATEST_PG="${PG_DUMPS[-1]}" + ASSERTION_PATHS+=("$LATEST_PG") +fi + +ASSERTIONS_JSON='[]' +ANY_FAILED=0 +for p in "${ASSERTION_PATHS[@]}"; do + if [ -s "$p" ]; then + status="ok" + bytes=$(stat -c %s "$p") + echo " ✓ $p ($bytes bytes)" + elif [ -e "$p" ]; then + status="empty" + bytes=0 + ANY_FAILED=1 + echo " ✗ $p (exists but empty)" + else + status="missing" + bytes=0 + ANY_FAILED=1 + echo " ✗ $p (missing)" + fi + ASSERTIONS_JSON=$(jq -c \ + --arg path "$p" \ + --arg status "$status" \ + --argjson bytes "$bytes" \ + '. 
+ [{path: $path, status: $status, bytes: $bytes}]' \
+    <<< "$ASSERTIONS_JSON")
+done
+
+if [ "$RESTORE_RC" -ne 0 ]; then
+  OVERALL="failed"
+elif [ "$ANY_FAILED" -ne 0 ]; then
+  OVERALL="partial_failure"
+else
+  OVERALL="success"
+fi
+
+jq -n \
+  --arg started "$STARTED_AT" \
+  --arg completed "$(date -Is)" \
+  --argjson duration "$SECONDS" \
+  --arg repo "$REPO_LABEL" \
+  --arg snapshot "$SNAPSHOT_ID" \
+  --arg overall "$OVERALL" \
+  --arg target "$RESTORE_DIR" \
+  --argjson restore_exit "$RESTORE_RC" \
+  --argjson assertions "$ASSERTIONS_JSON" \
+  '{
+    schema_version: 1,
+    overall_status: $overall,
+    started_at: $started,
+    completed_at: $completed,
+    duration_seconds: $duration,
+    repo: $repo,
+    snapshot_id: $snapshot,
+    restore_exit_code: $restore_exit,
+    target: $target,
+    assertions: $assertions
+  }' > "$STATUS_FILE"
+chmod 0644 "$STATUS_FILE"
+
+echo ""
+echo "════════════════════════════════════════════════════════════════"
+echo " Restore test — finished $(date -Is)"
+echo " Overall:     $OVERALL"
+echo " Status file: $STATUS_FILE"
+echo "════════════════════════════════════════════════════════════════"
+
+case "$OVERALL" in
+  success)         exit 0 ;;
+  partial_failure) exit 75 ;;
+  failed|*)        exit 1 ;;
+esac
diff --git a/deploy/server-backup/server-backup.service b/deploy/server-backup/server-backup.service
new file mode 100644
index 0000000..6d4fc4b
--- /dev/null
+++ b/deploy/server-backup/server-backup.service
@@ -0,0 +1,33 @@
+[Unit]
+Description=Server-wide backup (pg_dumpall + restic to NAS + B2)
+Documentation=file:///srv/ops/repos/ops-dashboard/docs/runbooks/server-backup.md
+After=network-online.target docker.service
+Wants=network-online.target
+
+[Service]
+Type=oneshot
+EnvironmentFile=/etc/restic-backup.env
+ExecStart=/srv/backups/scripts/server-backup.sh
+TimeoutStartSec=4h
+RuntimeMaxSec=6h
+Nice=10
+IOSchedulingClass=best-effort
+IOSchedulingPriority=7
+# Sandboxing — backup needs root for /etc + docker exec, but limit the rest.
+ProtectSystem=strict
+ReadWritePaths=/var/backups /srv/backups /run /tmp
+ProtectHome=read-only
+NoNewPrivileges=yes
+PrivateTmp=yes
+ProtectKernelTunables=yes
+ProtectKernelModules=yes
+ProtectControlGroups=yes
+StandardOutput=journal
+StandardError=journal
+SyslogIdentifier=server-backup
+
+# Exit code semantics from server-backup.sh:
+#   0  = success (all phases ok)
+#   75 = partial_failure (some non-critical phase failed/degraded)
+#   1  = failed (a critical dump phase failed or both restic repos failed)
+SuccessExitStatus=75
diff --git a/deploy/server-backup/server-backup.sh b/deploy/server-backup/server-backup.sh
new file mode 100644
index 0000000..96042eb
--- /dev/null
+++ b/deploy/server-backup/server-backup.sh
@@ -0,0 +1,497 @@
+#!/usr/bin/env bash
+# Daily server-wide backup: dumps databases, runs restic to NAS + B2,
+# writes a structured statusfile that the ops-dashboard can read.
+#
+# Install:
+#   cp deploy/server-backup/server-backup.sh /srv/backups/scripts/server-backup.sh
+#   chmod 0750 /srv/backups/scripts/server-backup.sh
+#   chown root:root /srv/backups/scripts/server-backup.sh
+#
+# Requires: bash, jq, flock, restic, docker, gzip. See runbook for setup.
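+#
+# Exit codes (mirrored by SuccessExitStatus=75 in server-backup.service):
+#   0 = success · 75 = partial_failure · 1 = failed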
+
+umask 077
+set -uo pipefail
+
+# ── Configuration ──────────────────────────────────────────────────────────
+STATUS_DIR="${STATUS_DIR:-/srv/backups/status}"
+LOG_DIR="${LOG_DIR:-/srv/backups/logs}"
+DB_DUMP_DIR="${DB_DUMP_DIR:-/var/backups/databases}"
+RESTIC_PASSWORD_FILE_PATH="${RESTIC_PASSWORD_FILE_PATH:-/etc/restic-backup.password}"
+LOCKFILE="${LOCKFILE:-/run/server-backup.lock}"
+RUN_DATE="$(date +%F)"
+STARTED_AT="$(date -Is)"
+SECONDS=0
+
+# Phase order — must match write_status_json + determine_exit_code expectations.
+PHASE_ORDER=(
+  postgres_dump
+  forgejo_dump
+  forgejo_db_dump
+  restic_nas
+  restic_b2
+  forget_nas
+  check_nas
+  check_b2
+)
+
+declare -A PHASE_STATUS PHASE_EXIT PHASE_START PHASE_END PHASE_ERR PHASE_EXTRA
+OVERALL_STATUS="unknown"
+
+# ── Single-instance lock ───────────────────────────────────────────────────
+exec 9>"$LOCKFILE" || { echo "ERROR: cannot open lockfile $LOCKFILE" >&2; exit 1; }
+if ! flock -n 9; then
+  echo "ERROR: another server-backup is already running (lock $LOCKFILE held)" >&2
+  exit 75
+fi
+
+# ── Env + secret loading ───────────────────────────────────────────────────
+# When invoked via systemd, EnvironmentFile=/etc/restic-backup.env has already
+# been loaded. When invoked manually for testing, source it ourselves.
+if [ -z "${RESTIC_REPO_NAS:-}" ] && [ -r /etc/restic-backup.env ]; then
+  # shellcheck disable=SC1091
+  set -a; . /etc/restic-backup.env; set +a
+fi
+
+: "${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set (see /etc/restic-backup.env)}"
+: "${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set (see /etc/restic-backup.env)}"
+
+if [ ! -r "$RESTIC_PASSWORD_FILE_PATH" ]; then
+  echo "ERROR: restic password file $RESTIC_PASSWORD_FILE_PATH not readable" >&2
+  exit 1
+fi
+export RESTIC_PASSWORD_FILE="$RESTIC_PASSWORD_FILE_PATH"
+
+# Required tooling
+for tool in jq restic docker gzip flock; do
+  if ! command -v "$tool" >/dev/null 2>&1; then
+    echo "ERROR: required tool '$tool' not on PATH" >&2
+    exit 1
+  fi
+done
+
+# ── Logging ────────────────────────────────────────────────────────────────
+mkdir -p "$LOG_DIR" "$STATUS_DIR" "$DB_DUMP_DIR"
+chmod 0750 "$LOG_DIR" "$STATUS_DIR"
+LOG_FILE="$LOG_DIR/server-backup-$RUN_DATE.log"
+# Mirror everything to LOG_FILE and the journal.
+exec > >(tee -a "$LOG_FILE") 2>&1
+
+echo "════════════════════════════════════════════════════════════════"
+echo " Server backup — started $STARTED_AT"
+echo " Host:     $(hostname)"
+echo " NAS repo: $RESTIC_REPO_NAS"
+echo " B2 repo:  $RESTIC_REPO_B2"
+echo "════════════════════════════════════════════════════════════════"
+
+# ── Phase runner ─────────────────────────────────────────────────────────
+# Runs the function passed as first arg, captures stdout+stderr into a phase
+# buffer, records status / exit_code / timestamps / error tail.
+run_phase() {
+  local name="$1"; shift
+  local phase_buf
+  phase_buf=$(mktemp -t "backup-phase-${name}.XXXXXX")
+
+  echo ""
+  echo "─── phase: $name ─── $(date -Is)"
+  PHASE_START[$name]=$(date -Is)
+
+  local rc=0
+  # The phase runs in a sub-shell (so a stray set -e inside a callee can't
+  # kill us) — which also means variable assignments made inside it are lost.
+  # Phase functions therefore write their extras (snapshot id, output file,
+  # sizes) to $PHASE_EXTRA_FILE, which we read back after the pipeline.
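+  # Note: ${PIPESTATUS[0]} captured below is the exit code of the phase
+  # function itself, not of tee — tee's own exit status is deliberately ignored.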
+  local PHASE_EXTRA_FILE
+  PHASE_EXTRA_FILE=$(mktemp -t "backup-extra-${name}.XXXXXX")
+  (
+    "$@"
+  ) 2>&1 | tee "$phase_buf"
+  rc=${PIPESTATUS[0]}
+
+  if [ -s "$PHASE_EXTRA_FILE" ]; then
+    PHASE_EXTRA[$name]=$(cat "$PHASE_EXTRA_FILE")
+  fi
+  rm -f "$PHASE_EXTRA_FILE"
+
+  PHASE_EXIT[$name]=$rc
+  case "$rc" in
+    0)  PHASE_STATUS[$name]=success ;;
+    3)  PHASE_STATUS[$name]=degraded ;;  # restic: snapshot created but some files unreadable
+    99) PHASE_STATUS[$name]=skipped ;;   # our convention for "not applicable"
+    *)  PHASE_STATUS[$name]=failed ;;
+  esac
+
+  if [ "$rc" -ne 0 ] && [ "$rc" -ne 99 ] && [ -s "$phase_buf" ]; then
+    # Keep last few non-empty lines as a compact error summary.
+    PHASE_ERR[$name]=$(tail -n 5 "$phase_buf" | tr '\n' ' ' | head -c 500)
+  fi
+
+  PHASE_END[$name]=$(date -Is)
+  rm -f "$phase_buf"
+  echo "─── end $name (exit=$rc, status=${PHASE_STATUS[$name]})"
+}
+
+# Convention: a phase function returns 99 to mark itself "skipped" — the
+# overall outcome treats this as success.
+SKIPPED=99
+
+# ── Phase 1: pg_dumpall (Scrum4Me Postgres cluster) ────────────────────────
+dump_postgres_all() {
+  local pg_container="${PG_CONTAINER:-scrum4me-postgres}"
+  local pg_user="${PG_DUMPALL_USER:-scrum4me}"
+
+  if ! docker ps --format '{{.Names}}' | grep -qx "$pg_container"; then
+    echo "Postgres container '$pg_container' not running — cannot continue."
+    return 1
+  fi
+
+  local tmp="$DB_DUMP_DIR/.postgres-$RUN_DATE.sql.gz.tmp"
+  local final="$DB_DUMP_DIR/postgres-$RUN_DATE.sql.gz"
+  rm -f "$tmp"
+
+  set -o pipefail
+  docker exec "$pg_container" pg_dumpall -U "$pg_user" --clean --if-exists \
+    | gzip -c > "$tmp"
+  local rc=$?
+  set +o pipefail
+
+  if [ "$rc" -ne 0 ]; then
+    rm -f "$tmp"
+    return "$rc"
+  fi
+
+  mv "$tmp" "$final"
+  chmod 0640 "$final"
+  local bytes
+  bytes=$(stat -c %s "$final" 2>/dev/null || echo 0)
+  echo "output_file=$final;bytes=$bytes" > "$PHASE_EXTRA_FILE"
+  echo "wrote $final ($bytes bytes)"
+}
+
+# ── Phase 2: Forgejo dump (filesystem + repos) ─────────────────────────────
+dump_forgejo() {
+  local fj="${FORGEJO_CONTAINER:-}"
+  if [ -z "$fj" ]; then
+    echo "FORGEJO_CONTAINER unset — skipping Forgejo dump."
+    return "$SKIPPED"
+  fi
+  if ! docker ps --format '{{.Names}}' | grep -qx "$fj"; then
+    echo "Forgejo container '$fj' not running — skipping."
+    return "$SKIPPED"
+  fi
+
+  local config="${FORGEJO_CONFIG:-/data/gitea/conf/app.ini}"
+  local tmp="$DB_DUMP_DIR/.forgejo-$RUN_DATE.zip.tmp"
+  local final="$DB_DUMP_DIR/forgejo-$RUN_DATE.zip"
+  rm -f "$tmp"
+
+  # `forgejo dump -f -` streams the zip to stdout. We run as the `git` user
+  # inside the container (standard Forgejo image convention).
+  set -o pipefail
+  docker exec -u git "$fj" forgejo dump --skip-db -c "$config" --type zip -f - > "$tmp"
+  local rc=$?
+  set +o pipefail
+
+  if [ "$rc" -ne 0 ]; then
+    rm -f "$tmp"
+    return "$rc"
+  fi
+
+  mv "$tmp" "$final"
+  chmod 0640 "$final"
+  local bytes
+  bytes=$(stat -c %s "$final" 2>/dev/null || echo 0)
+  echo "output_file=$final;bytes=$bytes" > "$PHASE_EXTRA_FILE"
+  echo "wrote $final ($bytes bytes)"
+}
+
+# ── Phase 3: Forgejo Postgres DB dump (authoritative for DB restore) ───────
+dump_forgejo_db() {
+  local db_name="${FORGEJO_DB_NAME:-}"
+  if [ -z "$db_name" ]; then
+    echo "FORGEJO_DB_NAME unset — skipping Forgejo DB dump (assume SQLite)."
+    return "$SKIPPED"
+  fi
+  local db_container="${FORGEJO_DB_CONTAINER:-scrum4me-postgres}"
+  local db_user="${FORGEJO_DB_USER:-scrum4me}"
+
+  if ! docker ps --format '{{.Names}}' | grep -qx "$db_container"; then
+    echo "DB container '$db_container' not running — skipping Forgejo DB dump."
+    return "$SKIPPED"
+  fi
+
+  local tmp="$DB_DUMP_DIR/.forgejo-db-$RUN_DATE.sql.gz.tmp"
+  local final="$DB_DUMP_DIR/forgejo-db-$RUN_DATE.sql.gz"
+  rm -f "$tmp"
+
+  set -o pipefail
+  docker exec "$db_container" pg_dump -U "$db_user" --clean --if-exists "$db_name" \
+    | gzip -c > "$tmp"
+  local rc=$?
+  set +o pipefail
+
+  if [ "$rc" -ne 0 ]; then
+    rm -f "$tmp"
+    return "$rc"
+  fi
+
+  mv "$tmp" "$final"
+  chmod 0640 "$final"
+  local bytes
+  bytes=$(stat -c %s "$final" 2>/dev/null || echo 0)
+  echo "output_file=$final;bytes=$bytes" > "$PHASE_EXTRA_FILE"
+  echo "wrote $final ($bytes bytes)"
+}
+
+# ── Phases 4 + 5: restic backup to NAS / B2 ────────────────────────────────
+# Live Docker datadirs are excluded — dumps (above) are the authoritative
+# restore source for Postgres and Forgejo.
+RESTIC_BACKUP_PATHS=(
+  /etc
+  /home/janpeter
+  /root
+  /opt
+  /srv
+  /usr/local/bin
+  "$DB_DUMP_DIR"
+  /srv/ops/backups
+)
+RESTIC_EXCLUDES=(
+  --exclude='**/node_modules'
+  --exclude='**/.next/cache'
+  --exclude='**/.cache'
+  --exclude='**/.git/objects/pack'
+  --exclude='/srv/backups/logs'
+  --exclude='/tmp'
+  --exclude='/var/tmp'
+  --exclude='/srv/scrum4me/postgres'   # live Postgres datadir — non-authoritative
+  --exclude='/srv/forgejo/data/git'    # live Forgejo git objects — non-authoritative
+  --exclude='/srv/forgejo/data/lfs'
+  --exclude='/srv/forgejo/data/queues'
+)
+
+restic_backup_to() {
+  local repo="$1"; local label="$2"
+  local extra_args=()
+  if [ "$label" = "b2" ] && [ -n "${BACKUP_LIMIT_UPLOAD_KIB:-}" ]; then
+    extra_args+=(--limit-upload "$BACKUP_LIMIT_UPLOAD_KIB")
+  fi
+
+  # Capture restic JSON output so we can extract the snapshot id.
+  local json_out
+  json_out=$(mktemp -t "restic-backup-${label}.XXXXXX.json")
+
+  # Deliberately no --skip-if-unchanged: restic then records a snapshot on
+  # every run, so the dashboard sees a fresh daily entry even on quiet days.
+  restic -r "$repo" backup \
+    --tag scheduled \
+    --tag "host=$(hostname)" \
+    --json \
+    "${extra_args[@]}" \
+    "${RESTIC_EXCLUDES[@]}" \
+    "${RESTIC_BACKUP_PATHS[@]}" \
+    | tee "$json_out"
+  local rc=${PIPESTATUS[0]}
+
+  # Extract snapshot id from the final summary line (last JSON object of type=summary).
+  local snap
+  snap=$(jq -rs 'map(select(.message_type=="summary")) | last | .snapshot_id // empty' < "$json_out" 2>/dev/null || true)
+  local files_new
+  files_new=$(jq -rs 'map(select(.message_type=="summary")) | last | .files_new // empty' < "$json_out" 2>/dev/null || true)
+  local data_added
+  data_added=$(jq -rs 'map(select(.message_type=="summary")) | last | .data_added // empty' < "$json_out" 2>/dev/null || true)
+
+  if [ -n "$snap" ]; then
+    echo "snapshot_id=$snap;files_new=${files_new:-0};data_added_bytes=${data_added:-0}" > "$PHASE_EXTRA_FILE"
+  fi
+
+  rm -f "$json_out"
+  return "$rc"
+}
+
+# ── Phase 6: prune NAS only (B2 is Object Lock — pruning runs off-server) ──
+restic_forget_nas() {
+  restic -r "$RESTIC_REPO_NAS" forget \
+    --keep-daily 7 \
+    --keep-weekly 4 \
+    --keep-monthly 12 \
+    --prune
+}
+
+# ── Phase 7: integrity check (light daily; weekly read-data-subset on Sun) ─
+is_sunday() {
+  [ "$(date +%u)" = "7" ]
+}
+
+restic_check_nas() {
+  if is_sunday; then
+    restic -r "$RESTIC_REPO_NAS" check --read-data-subset=2.5%
+  else
+    restic -r "$RESTIC_REPO_NAS" check
+  fi
+}
+
+restic_check_b2() {
+  if is_sunday; then
+    # On B2 a read-data-subset costs bandwidth + B2 download fees. Keep the
+    # subset tiny on Sundays; deeper checks run monthly off-server.
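+    # Back-of-envelope: with a ~50 GiB repo (an illustrative figure, not a
+    # measurement), a 1% subset reads roughly 0.5 GiB per Sunday.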
+ restic -r "$RESTIC_REPO_B2" check --read-data-subset=1% + else + restic -r "$RESTIC_REPO_B2" check + fi +} + +# ── Statusfile writer ────────────────────────────────────────────────────── +# Builds a structured JSON statusfile in /srv/backups/status/last-run.json +# atomically (write to tmp, then mv). +write_status_json() { + local tmpfile + tmpfile=$(mktemp -t "backup-status.XXXXXX.json") + + # Build the phases object incrementally with jq for safe escaping. + local phases_json='{}' + local name status exit_code started ended err extra + local snapshot_id files_new data_added output_file bytes + for name in "${PHASE_ORDER[@]}"; do + status="${PHASE_STATUS[$name]:-pending}" + exit_code="${PHASE_EXIT[$name]:-}" + started="${PHASE_START[$name]:-}" + ended="${PHASE_END[$name]:-}" + err="${PHASE_ERR[$name]:-}" + extra="${PHASE_EXTRA[$name]:-}" + + snapshot_id="" + files_new="" + data_added="" + output_file="" + bytes="" + if [ -n "$extra" ]; then + # extra is a semicolon-separated list of key=value pairs + local pair key val + IFS=';' read -ra pairs <<< "$extra" + for pair in "${pairs[@]}"; do + key="${pair%%=*}" + val="${pair#*=}" + case "$key" in + snapshot_id) snapshot_id="$val" ;; + files_new) files_new="$val" ;; + data_added_bytes) data_added="$val" ;; + output_file) output_file="$val" ;; + bytes) bytes="$val" ;; + esac + done + fi + + # exit_code as JSON number when present, null otherwise. + local exit_arg='null' + if [ -n "$exit_code" ]; then + exit_arg="$exit_code" + fi + + phases_json=$( + jq -c -n \ + --argjson base "$phases_json" \ + --arg name "$name" \ + --arg status "$status" \ + --argjson exit_code "$exit_arg" \ + --arg started "$started" \ + --arg ended "$ended" \ + --arg err "$err" \ + --arg snapshot_id "$snapshot_id" \ + --arg files_new "$files_new" \ + --arg data_added "$data_added" \ + --arg output_file "$output_file" \ + --arg bytes "$bytes" \ + ' + $base + { + ($name): ({ + status: $status, + exit_code: $exit_code, + started_at: (if $started == "" then null else $started end), + completed_at: (if $ended == "" then null else $ended end), + error: (if $err == "" then null else $err end) + } + + (if $snapshot_id != "" then { snapshot_id: $snapshot_id } else {} end) + + (if $files_new != "" then { files_new: ($files_new | tonumber? // null) } else {} end) + + (if $data_added != "" then { data_added_bytes: ($data_added | tonumber? // null) } else {} end) + + (if $output_file != "" then { output_file: $output_file } else {} end) + + (if $bytes != "" then { bytes: ($bytes | tonumber? 
// null) } else {} end))
+      }'
+    )
+  done
+
+  jq -n \
+    --arg overall "$OVERALL_STATUS" \
+    --arg started "$STARTED_AT" \
+    --arg completed "$(date -Is)" \
+    --argjson duration "$SECONDS" \
+    --arg host "$(hostname)" \
+    --argjson phases "$phases_json" \
+    '{
+      schema_version: 1,
+      overall_status: $overall,
+      started_at: $started,
+      completed_at: $completed,
+      duration_seconds: $duration,
+      host: $host,
+      phases: $phases
+    }' > "$tmpfile"
+
+  mv "$tmpfile" "$STATUS_DIR/last-run.json"
+  chmod 0644 "$STATUS_DIR/last-run.json"
+}
+
+# ── Outcome aggregation ────────────────────────────────────────────────────
+# success         → exit 0
+# partial_failure → exit 75 (visible but distinguishable from hard failure)
+# failed          → exit 1
+# Sets OVERALL_STATUS and EXIT_CODE in the current shell — deliberately NOT
+# invoked via $(…): a command substitution runs in a subshell, so the
+# OVERALL_STATUS assignment would never reach write_status_json.
+determine_exit_code() {
+  local critical_failure=false
+  local has_failure=false
+  local has_degraded=false
+  local name status
+
+  for name in "${PHASE_ORDER[@]}"; do
+    status="${PHASE_STATUS[$name]:-pending}"
+    case "$status" in
+      success|skipped) ;;
+      degraded) has_degraded=true ;;
+      failed)
+        has_failure=true
+        case "$name" in
+          postgres_dump) critical_failure=true ;;  # losing the DB dump is catastrophic
+        esac
+        ;;
+    esac
+  done
+
+  # Losing BOTH restic repos is also catastrophic.
+  if [ "${PHASE_STATUS[restic_nas]:-}" = "failed" ] \
+     && [ "${PHASE_STATUS[restic_b2]:-}" = "failed" ]; then
+    critical_failure=true
+  fi
+
+  if [ "$critical_failure" = true ]; then
+    OVERALL_STATUS="failed"
+    EXIT_CODE=1
+  elif [ "$has_failure" = true ] || [ "$has_degraded" = true ]; then
+    OVERALL_STATUS="partial_failure"
+    EXIT_CODE=75
+  else
+    OVERALL_STATUS="success"
+    EXIT_CODE=0
+  fi
+}
+
+# ── Main sequence ────────────────────────────────────────────────────────
+run_phase postgres_dump   dump_postgres_all
+run_phase forgejo_dump    dump_forgejo
+run_phase forgejo_db_dump dump_forgejo_db
+run_phase restic_nas      restic_backup_to "$RESTIC_REPO_NAS" nas
+run_phase restic_b2       restic_backup_to "$RESTIC_REPO_B2" b2
+run_phase forget_nas      restic_forget_nas
+run_phase check_nas       restic_check_nas
+run_phase check_b2        restic_check_b2
+
+determine_exit_code
+write_status_json
+
+echo ""
+echo "════════════════════════════════════════════════════════════════"
+echo " Server backup — finished $(date -Is)"
+echo " Overall status: $OVERALL_STATUS (exit $EXIT_CODE)"
+echo " Duration:       ${SECONDS}s"
+echo " Status file:    $STATUS_DIR/last-run.json"
+echo " Log file:       $LOG_FILE"
+echo "════════════════════════════════════════════════════════════════"
+
+exit "$EXIT_CODE"
diff --git a/deploy/server-backup/server-backup.timer b/deploy/server-backup/server-backup.timer
new file mode 100644
index 0000000..ea80dc2
--- /dev/null
+++ b/deploy/server-backup/server-backup.timer
@@ -0,0 +1,12 @@
+[Unit]
+Description=Daily server-wide backup (timer)
+
+[Timer]
+# Daily at 03:30 local. After ops-db-backup.timer (02:00) so the ops_dashboard
+# pg_dump from /srv/ops/backups/ is fresh when restic picks it up.
+OnCalendar=*-*-* 03:30:00
+Persistent=true
+RandomizedDelaySec=600
+
+[Install]
+WantedBy=timers.target
diff --git a/deploy/server-backup/wrappers/read-status.sh b/deploy/server-backup/wrappers/read-status.sh
new file mode 100644
index 0000000..b1e0081
--- /dev/null
+++ b/deploy/server-backup/wrappers/read-status.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Read /srv/backups/status/last-run.json. Returns "{}" if missing, so the
+# dashboard can render an "unknown" state instead of erroring.
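+#
+# Envelope shape emitted below (values illustrative — the real contents come
+# from server-backup.sh and restore-test.sh):
+#   { "last_run":          { "overall_status": "...", "phases": { ... } },
+#     "last_restore_test": { "overall_status": "...", "assertions": [ ... ] } }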
+ +set -uo pipefail + +STATUS_FILE="${STATUS_FILE:-/srv/backups/status/last-run.json}" +RESTORE_STATUS_FILE="${RESTORE_STATUS_FILE:-/srv/backups/status/last-restore-test.json}" + +# We emit a small wrapper object with both files so the UI can render the +# server-backup status AND the most recent restore-test status from one call. +last_run='{}' +if [ -r "$STATUS_FILE" ]; then + last_run=$(cat "$STATUS_FILE") +fi + +last_restore='null' +if [ -r "$RESTORE_STATUS_FILE" ]; then + last_restore=$(cat "$RESTORE_STATUS_FILE") +fi + +jq -n \ + --argjson last_run "$last_run" \ + --argjson last_restore "$last_restore" \ + '{ last_run: $last_run, last_restore_test: $last_restore }' diff --git a/deploy/server-backup/wrappers/restic-check.sh b/deploy/server-backup/wrappers/restic-check.sh new file mode 100644 index 0000000..ecd8eb6 --- /dev/null +++ b/deploy/server-backup/wrappers/restic-check.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# Run a light restic integrity check on the given repo. +# Usage: restic-check.sh nas|b2 + +set -uo pipefail + +LABEL="${1:-}" +if [ "$LABEL" != "nas" ] && [ "$LABEL" != "b2" ]; then + echo "label must be nas or b2" >&2 + exit 2 +fi + +if [ -z "${RESTIC_REPO_NAS:-}" ] && [ -r /etc/restic-backup.env ]; then + set -a; . /etc/restic-backup.env; set +a +fi + +case "$LABEL" in + nas) REPO="${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set}" ;; + b2) REPO="${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set}" ;; +esac + +export RESTIC_PASSWORD_FILE="${RESTIC_PASSWORD_FILE:-/etc/restic-backup.password}" + +restic -r "$REPO" check diff --git a/deploy/server-backup/wrappers/restic-snapshots.sh b/deploy/server-backup/wrappers/restic-snapshots.sh new file mode 100644 index 0000000..bd6e6a8 --- /dev/null +++ b/deploy/server-backup/wrappers/restic-snapshots.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# List recent restic snapshots from a labelled repo. Output: JSON array. +# Usage: restic-snapshots.sh nas|b2 + +set -uo pipefail + +LABEL="${1:-}" +if [ "$LABEL" != "nas" ] && [ "$LABEL" != "b2" ]; then + echo '{"error":"label must be nas or b2"}' >&2 + exit 2 +fi + +# Load env (idempotent — systemd already loaded it for service contexts). +if [ -z "${RESTIC_REPO_NAS:-}" ] && [ -r /etc/restic-backup.env ]; then + set -a; . /etc/restic-backup.env; set +a +fi + +case "$LABEL" in + nas) REPO="${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set}" ;; + b2) REPO="${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set}" ;; +esac + +export RESTIC_PASSWORD_FILE="${RESTIC_PASSWORD_FILE:-/etc/restic-backup.password}" + +# Show last 30 snapshots, newest first, with the fields the UI needs. +restic -r "$REPO" snapshots --json 2>/dev/null \ + | jq --arg repo "$LABEL" ' + sort_by(.time) | reverse | .[0:30] + | map({ + id: .id, + short_id: (.short_id // (.id[0:8])), + time: .time, + hostname: .hostname, + tags: (.tags // []), + paths: (.paths // []), + summary: (.summary // null), + repo: $repo + }) + ' diff --git a/deploy/server-backup/wrappers/restic-stats.sh b/deploy/server-backup/wrappers/restic-stats.sh new file mode 100644 index 0000000..4eeea6d --- /dev/null +++ b/deploy/server-backup/wrappers/restic-stats.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# Repo stats: combines restic stats in two modes plus snapshot count. +# Output: JSON object with restore_size_bytes, raw_data_bytes, dedup_ratio. 
+# Usage: restic-stats.sh nas|b2 + +set -uo pipefail + +LABEL="${1:-}" +if [ "$LABEL" != "nas" ] && [ "$LABEL" != "b2" ]; then + echo '{"error":"label must be nas or b2"}' >&2 + exit 2 +fi + +if [ -z "${RESTIC_REPO_NAS:-}" ] && [ -r /etc/restic-backup.env ]; then + set -a; . /etc/restic-backup.env; set +a +fi + +case "$LABEL" in + nas) REPO="${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set}" ;; + b2) REPO="${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set}" ;; +esac + +export RESTIC_PASSWORD_FILE="${RESTIC_PASSWORD_FILE:-/etc/restic-backup.password}" + +# restore-size: total bytes if every file in every snapshot were re-extracted. +restore_json=$(restic -r "$REPO" stats --mode restore-size --json 2>/dev/null || echo '{}') +# raw-data: total unique blob bytes after dedup + compression. +raw_json=$(restic -r "$REPO" stats --mode raw-data --json 2>/dev/null || echo '{}') +# Snapshot count for the same repo. +snap_count=$(restic -r "$REPO" snapshots --json 2>/dev/null | jq 'length // 0') + +jq -n \ + --arg repo "$LABEL" \ + --argjson restore "$restore_json" \ + --argjson raw "$raw_json" \ + --argjson snap_count "${snap_count:-0}" \ + ' + { + repo: $repo, + snapshots_count: $snap_count, + restore_size_bytes: ($restore.total_size // null), + restore_size_files: ($restore.total_file_count // null), + raw_data_bytes: ($raw.total_size // null), + raw_blob_count: ($raw.total_blob_count // null), + dedup_ratio: ( + if ($restore.total_size != null) and ($raw.total_size != null) and ($raw.total_size > 0) + then (($restore.total_size | tonumber) / ($raw.total_size | tonumber)) + else null + end + ) + }' diff --git a/deploy/server-backup/wrappers/trigger-backup.sh b/deploy/server-backup/wrappers/trigger-backup.sh new file mode 100644 index 0000000..6a2ee61 --- /dev/null +++ b/deploy/server-backup/wrappers/trigger-backup.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# Trigger server-backup.service ad-hoc. Refuses if a run is already active +# (the script itself also flock's, but checking here gives a friendlier error). + +set -uo pipefail + +UNIT=server-backup.service + +active=$(systemctl is-active "$UNIT" 2>/dev/null || true) +if [ "$active" = "active" ] || [ "$active" = "activating" ]; then + echo "ERROR: $UNIT is already $active — refusing to trigger." >&2 + exit 75 +fi + +# Use --no-block so we return immediately; the dashboard will poll via +# read-status.sh and tail the log to follow progress. +systemctl start --no-block "$UNIT" +echo "Triggered $UNIT. Follow with: journalctl -u $UNIT -f" diff --git a/deploy/server-backup/wrappers/trigger-restore-test.sh b/deploy/server-backup/wrappers/trigger-restore-test.sh new file mode 100644 index 0000000..d0368b3 --- /dev/null +++ b/deploy/server-backup/wrappers/trigger-restore-test.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# Run a non-destructive restore test against the NAS repo. Streams output to +# stdout (so the dashboard's StreamingTerminal can render it) and writes the +# structured result to /srv/backups/status/last-restore-test.json. + +set -uo pipefail + +REPO_LABEL="${1:-nas}" + +if [ ! 
-x /srv/backups/scripts/restore-test.sh ]; then
+  echo "ERROR: /srv/backups/scripts/restore-test.sh not installed" >&2
+  exit 1
+fi
+
+exec /srv/backups/scripts/restore-test.sh "$REPO_LABEL"
diff --git a/docs/runbooks/server-backup.md b/docs/runbooks/server-backup.md
new file mode 100644
index 0000000..dd1fe11
--- /dev/null
+++ b/docs/runbooks/server-backup.md
@@ -0,0 +1,462 @@
+# Server-wide backup (restic + NAS + B2, dashboard-operated)
+
+## Context
+
+`scrum4me-srv` runs a Docker stack (Scrum4Me-web, worker-idea, ops-dashboard,
+postgres-17, caddy) plus Forgejo. The current backup coverage — only
+`pg_dump ops_dashboard` to `/srv/ops/backups/` with 30-day retention on a
+single disk — leaves **everything else** uncovered: Scrum4Me data, Forgejo,
+Caddy certs, Docker volumes and `/etc` are gone after fire, theft, ransomware
+or disk failure.
+
+Goal: make the server **rebuildable** from an encrypted, deduplicated,
+versioned backup with two independent copies — **NAS** locally and
+**Backblaze B2** offsite — operated from the ops dashboard. The existing
+`backup_ops_db` flow keeps running; restic sweeps its dump directory into the
+snapshot.
+
+**Key design choices** (discussed in detail in the review under
+`/Users/janpetervisser/Development/Scrum4Me/docs/recommendations/server-backup-plan-review-2026-05-15.md`):
+
+- **B2 Object Lock + a server key without `deleteFiles`** — an attacker with
+  root on the server cannot remove B2 snapshots until the Object Lock
+  retention expires. That is the ransomware protection. Pruning on B2 happens
+  monthly from the laptop with a separate high-capability maintenance key.
+- **Authoritative restore source = dumps, not live data dirs.** The Postgres
+  and Forgejo data directories are explicitly `--exclude`'d from restic;
+  `pg_dumpall` and `forgejo dump` plus a separate `pg_dump <forgejo-db>` are
+  the authoritative sources.
+- **Phase-based script with a structured statusfile.** One failing phase lets
+  the rest continue; per-phase status / exit code / timestamps / error tail
+  land in `/srv/backups/status/last-run.json`, which the dashboard reads live.
+- **Single-instance lock** via `flock /run/server-backup.lock` — the UI button
+  and the systemd timer cannot overlap each other.
+
+## Prerequisites (demonstrably met before execution)
+
+- [ ] Bash, jq, restic, docker, gzip, flock on `$PATH` (`apt install restic jq` covers restic and jq — the rest ships with the OS).
+- [ ] The Scrum4Me stack is running in Docker (`docker ps | grep scrum4me-postgres`).
+- [ ] `/srv/scrum4me/compose/docker-compose.yml` exists (otherwise revise the exclude path in `server-backup.sh`).
+- [ ] The clock is in sync (`timedatectl status`) — backups use ISO timestamps.
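+
+This checklist can be verified mechanically. A minimal preflight sketch
+(the binaries, container name and compose path are exactly the ones listed
+above; adjust if yours differ):
+
+```bash
+#!/usr/bin/env bash
+# preflight.sh: sketch only; checks the "demonstrably met" list above.
+set -u
+fail=0
+for bin in bash jq restic docker gzip flock; do
+  command -v "$bin" >/dev/null || { echo "MISSING: $bin"; fail=1; }
+done
+docker ps --format '{{.Names}}' | grep -q scrum4me-postgres \
+  || { echo "MISSING: scrum4me-postgres container"; fail=1; }
+[ -f /srv/scrum4me/compose/docker-compose.yml ] \
+  || { echo "MISSING: compose file"; fail=1; }
+timedatectl show -p NTPSynchronized --value | grep -q yes \
+  || echo "WARN: clock not NTP-synced"
+exit "$fail"
+```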
+
+## Prerequisites (input needed from the user)
+
+- **NAS mount** — a path such as `/mnt/backup-server` with enough room (initially ≥ 100 GB; restic deduplicates, so growth is slow after that).
+- **Backblaze B2 account** — credit card registered; creating the bucket requires an operator action.
+- **Restic password** — `openssl rand -hex 24`, stored in your password manager **and** in `/etc/restic-backup.password` on the server. Keep both copies — a password that survives in only one place is one incident away from an unreadable repo.
+- **B2 maintenance key** — kept only on your laptop, in the password manager. Never on the server.
+
+---
+
+## Part A — Preparation on `scrum4me-srv`
+
+To be executed as `root` on `scrum4me-srv`.
+
+1. **Install tools**
+   ```bash
+   sudo apt update
+   sudo apt install -y restic jq
+   restic version
+   ```
+
+2. **Create directories**
+   ```bash
+   sudo mkdir -p /srv/backups/scripts /srv/backups/logs /srv/backups/status \
+     /var/backups/databases
+   sudo chmod 0750 /srv/backups/logs /srv/backups/status
+   ```
+
+3. **Check / create the NAS mount**
+   ```bash
+   mountpoint -q /mnt/backup-server && echo "OK" || echo "NOT mounted"
+   ```
+   If not: add an `fstab` entry, `systemctl daemon-reload`, `mount -a`. Make sure the mount comes back automatically after a reboot — otherwise the first backup run after a reboot will fail.
+
+4. **Generate and install the restic password**
+   ```bash
+   sudo sh -c 'openssl rand -hex 24 > /etc/restic-backup.password'
+   sudo chmod 0400 /etc/restic-backup.password
+   sudo chown root:root /etc/restic-backup.password
+   ```
+   **Copy the same string into your password manager** before you continue. A generated password that exists only on the server is not a password — it is a ticking time bomb.
+
+---
+
+## Part B — Setting up Backblaze B2 (Object Lock + scoped keys)
+
+Goal: a bucket whose **existing** snapshots cannot be deleted by the server, plus two separate keys: one for the server (read/write only) and one for the operator (all rights, used only from the laptop).
+
+1. **Create the bucket** in the Backblaze UI or via the `b2` CLI:
+   - Name: `scrum4me-srv-backup` (or a variant; record it in `/etc/restic-backup.env`).
+   - Privacy: **Private**.
+   - **File Lock: Enabled, Governance mode, default retention = 30 days**. Governance means: a key with `bypassGovernance` can override locks — we grant that capability **only** to the maintenance key.
+   - Lifecycle rules: **none** (lifecycle rules conflict with Object Lock).
+   - Encryption: leave server-side encryption on (the B2 default).
+
+2. **Create the server key** (goes into `/etc/restic-backup.env` on the server):
+   ```bash
+   # via the b2 CLI:
+   b2 application-key create \
+     --bucket scrum4me-srv-backup \
+     --name-prefix scrum4me-srv \
+     server-backup-key \
+     listBuckets,listFiles,readFiles,writeFiles
+   ```
+   Save the output (`keyID` + `applicationKey`). Verify in the UI that the key has **no** `deleteFiles`, **no** `deleteKeys`, **no** `bypassGovernance`.
+
+3. **Create the maintenance key** (goes into your password manager on the laptop):
+   ```bash
+   b2 application-key create \
+     --bucket scrum4me-srv-backup \
+     scrum4me-srv-maintenance-key \
+     listBuckets,listFiles,readFiles,writeFiles,deleteFiles,bypassGovernance
+   ```
+   This key **never** lands on the server. Use it only for `restic forget --prune` from your laptop (see Part H).
+
+4. **Create `/etc/restic-backup.env`**
+   ```bash
+   sudo cp /srv/ops/repos/ops-dashboard/deploy/server-backup/restic-backup.env.example \
+     /etc/restic-backup.env
+   sudo chmod 0600 /etc/restic-backup.env
+   sudo chown root:root /etc/restic-backup.env
+   sudo nano /etc/restic-backup.env
+   ```
+   Fill in: `RESTIC_REPO_NAS`, `RESTIC_REPO_B2`, `B2_ACCOUNT_ID` (= keyID), `B2_ACCOUNT_KEY` (= applicationKey). The Forgejo fields come in Part F. A sketch of the filled-in file follows below.
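+
+What the filled-in file might look like (a sketch: the NAS sub-path and all
+`<...>` values are placeholders; the Forgejo values follow from Part F):
+
+```bash
+# /etc/restic-backup.env: illustrative values, mode 0600 root:root
+RESTIC_REPO_NAS=/mnt/backup-server/restic/scrum4me-srv
+RESTIC_REPO_B2=b2:scrum4me-srv-backup:scrum4me-srv
+B2_ACCOUNT_ID=<server keyID>
+B2_ACCOUNT_KEY=<server applicationKey>
+# Forgejo (Part F): leave FORGEJO_CONTAINER empty to skip those phases.
+FORGEJO_CONTAINER=forgejo
+FORGEJO_CONFIG=/data/gitea/conf/app.ini
+FORGEJO_DB_CONTAINER=scrum4me-postgres
+FORGEJO_DB_NAME=forgejo
+```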
+
+**Threat model**
+
+| Threat | Covered by this design? |
+|---|---|
+| Disk failure / corruption | ✓ NAS + B2 = 2× redundancy |
+| Fire / theft / water damage | ✓ B2 is offsite |
+| Ransomware on the server | ✓ B2 Object Lock — existing snapshots immutable until retention expires |
+| Server compromise (root) | ✓ the server key cannot delete B2 files |
+| Laptop compromise + server compromise simultaneously | ✗ the maintenance key is then also in the attacker's hands — no defence |
+| Backblaze account compromise | ✗ — out of scope; mitigate with 2FA and the audit trail |
+| Loss of the restic password | ✗ — repos unreadable; keep the password in the password manager as well |
+
+---
+
+## Part C — Initialising the restic repos
+
+1. **NAS repo init**
+   ```bash
+   sudo -E bash -c '
+     set -a; . /etc/restic-backup.env; set +a
+     export RESTIC_PASSWORD_FILE=/etc/restic-backup.password
+     restic -r "$RESTIC_REPO_NAS" init
+   '
+   ```
+
+2. **B2 repo init**
+   ```bash
+   sudo -E bash -c '
+     set -a; . /etc/restic-backup.env; set +a
+     export RESTIC_PASSWORD_FILE=/etc/restic-backup.password
+     restic -r "$RESTIC_REPO_B2" init
+   '
+   ```
+
+3. **Retention dry run** — check that the forget policy is not overly aggressive on a repo that only holds its first snapshot. (On a fresh repo `forget` removes nothing, but this proves that all paths + auth work.)
+   ```bash
+   sudo -E bash -c '
+     set -a; . /etc/restic-backup.env; set +a
+     export RESTIC_PASSWORD_FILE=/etc/restic-backup.password
+     restic -r "$RESTIC_REPO_NAS" forget --keep-daily 7 --keep-weekly 4 --keep-monthly 12 --dry-run
+   '
+   ```
+
+---
+
+## Part D — Installing the scripts and systemd units
+
+1. **Copy the scripts**
+   ```bash
+   sudo cp /srv/ops/repos/ops-dashboard/deploy/server-backup/server-backup.sh /srv/backups/scripts/
+   sudo cp /srv/ops/repos/ops-dashboard/deploy/server-backup/restore-test.sh /srv/backups/scripts/
+   sudo chmod 0750 /srv/backups/scripts/*.sh
+   sudo chown root:root /srv/backups/scripts/*.sh
+   ```
+
+2. **Copy the systemd units**
+   ```bash
+   sudo cp /srv/ops/repos/ops-dashboard/deploy/server-backup/server-backup.service /etc/systemd/system/
+   sudo cp /srv/ops/repos/ops-dashboard/deploy/server-backup/server-backup.timer /etc/systemd/system/
+   sudo systemctl daemon-reload
+   sudo systemctl enable --now server-backup.timer
+   ```
+
+3. **Verify the timer**
+   ```bash
+   systemctl list-timers | grep server-backup
+   ```
+   Shows the next run tomorrow at 03:30 (+ randomized delay of up to 10 min).
+
+---
+
+## Part E — First manual run + statusfile verification
+
+1. **Trigger**
+   ```bash
+   sudo systemctl start server-backup.service
+   ```
+
+2. **Follow live**
+   ```bash
+   journalctl -u server-backup.service -f
+   ```
+   Expected: 8 phases (postgres_dump, forgejo_dump, forgejo_db_dump, restic_nas, restic_b2, forget_nas, check_nas, check_b2), each with a `─── phase: X ───` start line and a `─── end X (exit=N, status=S)` end line.
+
+3. **Statusfile**
+   ```bash
+   sudo jq . /srv/backups/status/last-run.json
+   ```
+   Expected: `overall_status: "success"`, all six mandatory phases `success` (the two Forgejo phases may be `skipped` if Forgejo is not configured yet). A scriptable one-liner for this check follows after this list.
+
+4. **Snapshots**
+   ```bash
+   sudo -E bash -c '
+     set -a; . /etc/restic-backup.env; set +a
+     export RESTIC_PASSWORD_FILE=/etc/restic-backup.password
+     restic -r "$RESTIC_REPO_NAS" snapshots
+     restic -r "$RESTIC_REPO_B2" snapshots
+   '
+   ```
+   Both show one snapshot with `host=scrum4me-srv` and the `scheduled` tag.
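+
+The step-3 statusfile check can be scripted as a single assertion; a sketch
+(`jq -e` turns the boolean result into the exit status):
+
+```bash
+# Exits 0 only if the last run fully succeeded and no phase failed.
+sudo jq -e '
+  .overall_status == "success"
+  and ([.phases[] | select(.status == "failed")] | length == 0)
+' /srv/backups/status/last-run.json >/dev/null \
+  && echo "backup OK" || echo "backup NOT OK"
+```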
+
+---
+
+## Part F — Forgejo subplan
+
+Before the first full-backup run: inventory Forgejo and confirm (or correct) the defaults in `restic-backup.env`. When in doubt — set `FORGEJO_CONTAINER=` (empty) so the Forgejo phases mark themselves `skipped` until you have verified.
+
+### F1. Inventory
+
+```bash
+docker ps --format 'table {{.Names}}\t{{.Image}}\t{{.Status}}' | grep -i forgejo
+```
+
+Record:
+- the container name (presumably `forgejo`).
+- the image version (`codeberg.org/forgejo/forgejo:<version>`).
+
+### F2. Config paths inside the container
+
+```bash
+docker inspect --format '{{ range .Mounts }}{{ .Source }} -> {{ .Destination }}{{ println }}{{ end }}' <container>
+docker exec <container> ls -la /data/gitea/conf/app.ini
+```
+
+Default: `app.ini` at `/data/gitea/conf/app.ini` inside the container. If that differs, adjust `FORGEJO_CONFIG=` in `/etc/restic-backup.env`.
+
+### F3. Check the DB coupling
+
+```bash
+docker exec <container> grep -E '^DB_TYPE|^HOST|^NAME|^USER' /data/gitea/conf/app.ini
+```
+
+- `DB_TYPE=postgres` with `NAME=forgejo` ⇒ set `FORGEJO_DB_NAME=forgejo`, and if the Postgres container is not `scrum4me-postgres`: `FORGEJO_DB_CONTAINER=...`.
+- `DB_TYPE=sqlite` ⇒ leave `FORGEJO_DB_NAME=` empty; the SQLite DB is included in `forgejo dump`.
+
+### F4. Dump strategy
+
+The script does **three** things for Forgejo:
+
+1. `forgejo dump --skip-db -c <config> --type zip -f -` — repositories, attachments, hooks, LFS metadata, etc.
+2. A separate `pg_dump <forgejo-db>` — the authoritative DB restore source (the Forgejo docs describe known import issues with DB content coming out of `forgejo dump`, hence `--skip-db`).
+3. The live data dirs (`/srv/forgejo/data/git`, `/srv/forgejo/data/lfs`, `/srv/forgejo/data/queues`) are **not** copied by restic — those are live B-trees where a file-level copy would be inconsistent.
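+
+For orientation, a condensed sketch of what the first two steps boil down to
+inside the backup script (the authoritative version is `dump_forgejo` /
+`dump_forgejo_db` in `server-backup.sh`; the container names, the `git` user
+and the config path below are the F1–F3 defaults and may differ on your host):
+
+```bash
+# Sketch only; see server-backup.sh for the real, phase-wrapped version.
+FORGEJO_CONTAINER=forgejo
+FORGEJO_DB_CONTAINER=scrum4me-postgres
+FORGEJO_DB_NAME=forgejo
+TODAY=$(date +%F)
+
+# 1. Application dump without the DB (repos, attachments, hooks, LFS metadata).
+docker exec -u git "$FORGEJO_CONTAINER" \
+  forgejo dump --skip-db -c /data/gitea/conf/app.ini --type zip -f - \
+  > "/var/backups/databases/forgejo-$TODAY.zip"
+
+# 2. Authoritative DB dump straight out of Postgres.
+docker exec "$FORGEJO_DB_CONTAINER" \
+  pg_dump -U forgejo "$FORGEJO_DB_NAME" \
+  | gzip > "/var/backups/databases/forgejo-db-$TODAY.sql.gz"
+```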
+
+### F5. Restore test in an isolated compose stack
+
+Before you need the Forgejo restore for real: test it once. Create a temporary directory with a fresh Forgejo + Postgres, feed in the dumps, run `forgejo doctor check --all`.
+
+```bash
+# Minimal restore-test recipe (fill in based on your Forgejo version)
+RESTORE_DIR=/tmp/forgejo-restore-test
+mkdir -p "$RESTORE_DIR"
+cd "$RESTORE_DIR"
+
+# 1. compose stack with a blank Forgejo + Postgres
+cat > docker-compose.yml <<'YAML'
+services:
+  forgejo:
+    image: codeberg.org/forgejo/forgejo:<version>
+    volumes: [ "./forgejo-data:/data" ]
+    depends_on: [ db ]
+  db:
+    image: postgres:17
+    environment:
+      POSTGRES_USER: forgejo
+      POSTGRES_PASSWORD: testtest
+      POSTGRES_DB: forgejo
+    volumes: [ "./db-data:/var/lib/postgresql/data" ]
+YAML
+
+docker compose up -d
+
+# 2. restore the DB dump
+gunzip < /var/backups/databases/forgejo-db-$(date +%F).sql.gz \
+  | docker compose exec -T db psql -U forgejo forgejo
+
+# 3. unpack the Forgejo dump into the data volume
+docker compose stop forgejo
+unzip /var/backups/databases/forgejo-$(date +%F).zip -d forgejo-data/
+docker compose start forgejo
+
+# 4. health checks
+docker compose exec forgejo forgejo doctor check --all
+curl -fsS http://localhost:3000/api/v1/version
+```
+
+Does `forgejo doctor check --all` pass and does the `/api/v1/version` endpoint answer? Then your Forgejo restore works. Tear-down: `docker compose down -v && rm -rf "$RESTORE_DIR"`.
+
+---
+
+## Part G — Restore procedure in production
+
+### G1. Pulling files back from a snapshot
+
+```bash
+# Pick a snapshot
+sudo -E bash -c '
+  set -a; . /etc/restic-backup.env; set +a
+  export RESTIC_PASSWORD_FILE=/etc/restic-backup.password
+  restic -r "$RESTIC_REPO_NAS" snapshots
+'
+
+# Restore (latest, only /etc — example)
+sudo -E bash -c '
+  set -a; . /etc/restic-backup.env; set +a
+  export RESTIC_PASSWORD_FILE=/etc/restic-backup.password
+  restic -r "$RESTIC_REPO_NAS" restore latest --target /tmp/restore --include /etc
+'
+```
+
+### G2. Restoring Postgres (the Scrum4Me cluster)
+
+```bash
+# Stop the apps that talk to the DB
+docker compose -f /srv/scrum4me/compose/docker-compose.yml stop scrum4me-web ops-dashboard worker-idea
+
+# Restore the dumpall (drops + recreates every DB in the cluster — hence --clean --if-exists in the dump)
+gunzip < /var/backups/databases/postgres-2026-05-15.sql.gz \
+  | docker exec -i scrum4me-postgres psql -U scrum4me
+
+# Apps back on
+docker compose -f /srv/scrum4me/compose/docker-compose.yml start scrum4me-web ops-dashboard worker-idea
+```
+
+For a partial restore (a single database): extract that database's block from the dumpall text with `awk` or `sed` (the dumpall output is plain SQL, so `pg_restore` does not apply to it). For `ops_dashboard` alone, section 2a of the existing [recovery.md](recovery.md) is the primary route.
+
+### G3. Restoring Forgejo
+
+Follow [F5](#f5-restore-test-in-an-isolated-compose-stack) but with the real Forgejo compose stack and without the tear-down. Important: stop the live Forgejo first, replace `/srv/forgejo/data` completely, restore the DB, start Forgejo, `forgejo doctor check --all`.
+
+---
+
+## Part H — Maintenance from the laptop (monthly)
+
+Goal: actually prune B2 snapshots that have aged out of the retention policy, plus a deeper integrity check that would be too expensive on the server.
+
+1. **Preparation** (once, on the laptop):
+   ```bash
+   brew install restic jq
+   # Maintenance key from the password manager
+   export B2_ACCOUNT_ID=<maintenance keyID>
+   export B2_ACCOUNT_KEY=<maintenance applicationKey>
+   export RESTIC_REPOSITORY=b2:scrum4me-srv-backup:scrum4me-srv
+   read -rs RESTIC_PASSWORD < /dev/tty   # from the password manager
+   export RESTIC_PASSWORD
+   ```
+
+2. **Prune check** (dry run first, to see what would happen):
+   ```bash
+   restic forget --keep-daily 7 --keep-weekly 4 --keep-monthly 12 --dry-run
+   ```
+
+3. **Actual prune** (requires the `bypassGovernance` capability — maintenance key only):
+   ```bash
+   restic forget --keep-daily 7 --keep-weekly 4 --keep-monthly 12 --prune
+   ```
+
+4. **Deeper check**:
+   ```bash
+   restic check --read-data-subset=10%
+   ```
+   B2 bandwidth: 10% of a 50 GB repo = 5 GB of download, B2 price ~ $0.05 (1 GB/day is free).
+
+5. **Clean up the environment** — close the shell or `unset RESTIC_PASSWORD B2_ACCOUNT_ID B2_ACCOUNT_KEY`.
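+
+The five steps above are easy to fumble when performed by hand a few times a
+year. A laptop-side sketch that strings them together (the filename is
+illustrative; the exported key values still come from your password manager):
+
+```bash
+#!/usr/bin/env bash
+# b2-maintenance.sh: illustrative wrapper for the Part H ritual.
+set -euo pipefail
+
+: "${B2_ACCOUNT_ID:?export the maintenance keyID first}"
+: "${B2_ACCOUNT_KEY:?export the maintenance applicationKey first}"
+export RESTIC_REPOSITORY="b2:scrum4me-srv-backup:scrum4me-srv"
+read -rs -p "restic password: " RESTIC_PASSWORD < /dev/tty; echo
+export RESTIC_PASSWORD
+
+KEEP=(--keep-daily 7 --keep-weekly 4 --keep-monthly 12)
+restic forget "${KEEP[@]}" --dry-run
+read -rp "Apply this prune? [y/N] " answer
+if [ "$answer" = "y" ]; then
+  restic forget "${KEEP[@]}" --prune
+fi
+
+restic check --read-data-subset=10%
+unset RESTIC_PASSWORD B2_ACCOUNT_ID B2_ACCOUNT_KEY
+```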
+
+---
+
+## Part I — Integrity schedule (summary)
+
+| Cadence | Who | What | Why |
+|---|---|---|---|
+| Daily 03:30 | server (systemd timer) | `restic check` on both repos | fast metadata/structure validation |
+| Weekly (Sunday) | server (same script) | `restic check --read-data-subset=2.5%` on NAS, `1%` on B2 | sample-based data integrity |
+| Monthly | operator (laptop) | `restic check --read-data-subset=10%` + `forget --prune` on B2 | deeper check + prune (the B2 server key has no delete rights) |
+| Monthly | operator (server) | `/srv/backups/scripts/restore-test.sh nas` + manual Forgejo stack restore (F5) | end-to-end restore verification |
+
+---
+
+## Files changed / newly created
+
+**On `scrum4me-srv`** (only via deploys from this repo, no manual edits):
+
+- `/srv/backups/scripts/server-backup.sh` (from `deploy/server-backup/`).
+- `/srv/backups/scripts/restore-test.sh` (ditto).
+- `/etc/systemd/system/server-backup.service`, `server-backup.timer` (from `deploy/server-backup/`).
+- `/etc/restic-backup.env` — secrets, not in the repo.
+- `/etc/restic-backup.password` — secret, not in the repo.
+
+**In this repo (`ops-dashboard`)**, newly created:
+
+- `deploy/server-backup/*` — all deploy artefacts.
+- `docs/runbooks/server-backup.md` — this document.
+- Later (Phases 3+4): the `ops-agent/commands.yml.example` extension, `ops-agent/flows.example/server_backup_*.yml`, `app/settings/backups/_components/server-backup-section.tsx`.
+
+**On the laptop**, in the password manager:
+
+- the restic password (identical to `/etc/restic-backup.password`).
+- the B2 maintenance key (keyID + applicationKey).
+
+---
+
+## Common failures
+
+| Symptom | Cause | Fix |
+|---|---|---|
+| `unable to open repository ... no such file or directory` (NAS) | NAS mount gone after a reboot | `mountpoint -q /mnt/backup-server` — fix `fstab`/`autofs`; restart `server-backup.service` |
+| `unable to open repository ... AccessDenied` (B2) | server key has the wrong capabilities or bucket prefix | check `b2 application-key list`; capabilities must be `listBuckets,listFiles,readFiles,writeFiles`, and the name prefix must match |
+| `Object Lock In Place` on `forget --prune` against B2 | the server wrongly tried to prune B2 (it lacks that capability) | the script prunes NAS only — if this error shows up, a manual `restic forget` was run against B2 (should happen off-server); use the maintenance key |
+| `restic snapshot tag scheduled` missing in the UI | the run was started without `--tag scheduled` | check the script — `restic_backup_to` sets both tags hardcoded |
+| `forgejo dump` fails with permission denied | the container user is not `git` | adjust `dump_forgejo`: `docker exec -u <user>` |
+| restic exit code 3 in the statusfile | some files were unreadable during the snapshot (open file lock) | non-fatal — the log shows which files; usually logs or sockets; optionally add them to `RESTIC_EXCLUDES` |
+| `another server-backup is already running` exit 75 | timer and UI button at the same time, or a previous run is hanging | `systemctl status server-backup.service`; if hung: `systemctl kill server-backup.service`, clean up the lockfile `/run/server-backup.lock` |
+| `last-run.json` not updated | the script crashed before `write_status_json` | `journalctl -u server-backup.service --since=today` — usually an env-file or password-file problem |
+| the Postgres datadir shows up in a restic snapshot | excludes misconfigured | check `RESTIC_EXCLUDES` in the script — it must contain `/srv/scrum4me/postgres` |
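+
+For reference when working through the table above, a successful
+`last-run.json` has roughly this shape (field set as produced by
+`write_status_json`; all values below are illustrative, and six of the
+eight phases are elided):
+
+```json
+{
+  "schema_version": 1,
+  "overall_status": "success",
+  "started_at": "2026-05-15T03:30:04+02:00",
+  "completed_at": "2026-05-15T03:41:12+02:00",
+  "duration_seconds": 668,
+  "host": "scrum4me-srv",
+  "phases": {
+    "postgres_dump": {
+      "status": "success",
+      "exit_code": 0,
+      "started_at": "2026-05-15T03:30:04+02:00",
+      "completed_at": "2026-05-15T03:31:40+02:00",
+      "error": null,
+      "output_file": "/var/backups/databases/postgres-2026-05-15.sql.gz"
+    },
+    "restic_nas": {
+      "status": "success",
+      "exit_code": 0,
+      "started_at": "2026-05-15T03:33:02+02:00",
+      "completed_at": "2026-05-15T03:38:11+02:00",
+      "error": null,
+      "snapshot_id": "1a2b3c4d",
+      "files_new": 42,
+      "data_added_bytes": 104857600
+    }
+  }
+}
+```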
+
+---
+
+## Verification (end-to-end)
+
+1. **First run succeeds** — Part E green, statusfile `overall_status: success`.
+2. **Snapshots visible** on both repos via `restic snapshots`.
+3. **Restore test succeeds** — `restore-test.sh nas` → `overall_status: success` in `/srv/backups/status/last-restore-test.json`, all assertions `ok`.
+4. **Forgejo restore stack** (F5) — `forgejo doctor check --all` completes without errors, `/api/v1/version` answers.
+5. **Reboot test** — reboot the server; `systemctl list-timers` shows `server-backup.timer` with a next run scheduled; the NAS mount comes back automatically.
+6. **Failure injection**:
+   - NAS unmounted → the script finishes with `overall_status: partial_failure`, `phases.restic_nas.status: failed`, the B2 snapshot is still present, systemd exit 75.
+   - B2 key temporarily invalid → `phases.restic_b2.status: failed`, the NAS snapshot is still present, exit 75.
+   - Both repos unreachable → `overall_status: failed`, exit 1.
+7. **Concurrency** — a second `systemctl start server-backup.service` during a running run → exit 75, the log shows `another server-backup is already running`.
+8. **Monthly maintenance** — performed successfully from the laptop for the first time; B2 `forget --prune` succeeds without Object Lock errors.
+
+---
+
+# Addendum — execution
+
+> After the first execution, fill this section with every deviation from the
+> plan above: the exact Forgejo container name, the image version, any paths
+> that turned out different, the exact sudoers rules, the Object Lock
+> retention you chose, B2 key IDs (redacted), the times of the first runs,
+> etc. Same discipline as [tailscale-setup.md](tailscale-setup.md).
diff --git a/ops-agent/commands.yml.example b/ops-agent/commands.yml.example
index 87c17dc..7100079 100644
--- a/ops-agent/commands.yml.example
+++ b/ops-agent/commands.yml.example
@@ -250,3 +250,51 @@ commands:
       - -delete
       - -print
     description: "Delete ops_dashboard backup files older than 30 days"
+
+  # ── Server-wide backup (restic + NAS + B2) ────────────────────────────────
+  # All wrappers live under /srv/backups/scripts/wrappers/ and read
+  # /etc/restic-backup.env (mode 0600 root:root) which the ops-agent user
+  # cannot read directly — hence the sudo prefix. See deploy/ops-agent/sudoers
+  # for the corresponding NOPASSWD entries.
+
+  read_backup_status:
+    cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/read-status.sh"]
+    description: "Read /srv/backups/status/last-run.json + last-restore-test.json (JSON)"
+
+  restic_snapshots_nas:
+    cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/restic-snapshots.sh", "nas"]
+    description: "Restic snapshots from the NAS repo (JSON array, newest first)"
+
+  restic_snapshots_b2:
+    cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/restic-snapshots.sh", "b2"]
+    description: "Restic snapshots from the B2 repo (JSON array, newest first)"
+
+  restic_stats_nas:
+    cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/restic-stats.sh", "nas"]
+    description: "Restic stats for the NAS repo (restore-size + raw-data + dedup ratio)"
+
+  restic_stats_b2:
+    cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/restic-stats.sh", "b2"]
+    description: "Restic stats for the B2 repo (restore-size + raw-data + dedup ratio)"
+
+  list_backup_logs:
+    cmd:
+      - sh
+      - -c
+      - "ls -lt /srv/backups/logs/*.log 2>/dev/null | head -10 || echo 'no logs yet'"
+    description: "List the 10 most recent server-backup logs"
+
+  tail_backup_log_today:
+    cmd:
+      - sh
+      - -c
+      - "f=/srv/backups/logs/server-backup-$(date +%F).log; [ -f \"$f\" ] && tail -200 \"$f\" || echo 'no log for today'"
+    description: "Tail the last 200 lines of today's server-backup log"
+
+  trigger_server_backup:
+    cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/trigger-backup.sh"]
+    description: "Trigger server-backup.service ad-hoc (refuses if already running)"
+
+  trigger_restore_test:
+    cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/trigger-restore-test.sh", "nas"]
+    description: "Run restore-test.sh against the NAS repo (non-destructive, writes /tmp/restore-test/)"
diff --git a/ops-agent/flows.example/server_backup_full.yml b/ops-agent/flows.example/server_backup_full.yml
new file mode 100644
index 0000000..9beb0f9
--- /dev/null
+++ b/ops-agent/flows.example/server_backup_full.yml
@@ -0,0 +1,21 @@
+# Trigger a full server-wide backup (pg_dumpall + restic to NAS + B2).
+# Runs out-of-band via systemd; this flow just kicks it off and then tails
+# today's log + reads the structured statusfile so the dashboard can render
+# progress and final result.
+#
+# Copy to /etc/ops-agent/flows/server_backup_full.yml on the host.
+# Triggered manually via /settings/backups → "Backup now" or by the daily +# server-backup.timer (which runs server-backup.service directly, skipping +# this flow). + +name: Server backup (full) +description: Daily full server backup — pg_dumpall + restic to NAS + B2 (Object Lock) +steps: + - command_key: trigger_server_backup + on_failure: abort + + - command_key: tail_backup_log_today + on_failure: continue + + - command_key: read_backup_status + on_failure: continue diff --git a/ops-agent/flows.example/server_backup_restore_test.yml b/ops-agent/flows.example/server_backup_restore_test.yml new file mode 100644 index 0000000..1ed5b31 --- /dev/null +++ b/ops-agent/flows.example/server_backup_restore_test.yml @@ -0,0 +1,14 @@ +# Run a non-destructive restore test against the NAS repo. Restores the latest +# snapshot to /tmp/restore-test/ and asserts that critical files came back +# intact. Used to verify backups periodically without touching the live stack. +# +# Copy to /etc/ops-agent/flows/server_backup_restore_test.yml on the host. + +name: Server backup — restore test +description: Restore latest snapshot to /tmp/restore-test and assert critical files +steps: + - command_key: trigger_restore_test + on_failure: continue + + - command_key: read_backup_status + on_failure: continue