feat(server-backup): restic dual-repo backup (NAS + B2) with dashboard UI
Adds a server-wide backup capability beyond the existing ops_dashboard pg_dump flow: - Daily systemd timer (03:30) runs pg_dumpall + Forgejo dump, then restic to a local NAS repo and an offsite Backblaze B2 repo with Object Lock. Phase-based script with single-instance flock, structured statusfile, systemd hardening, and live-datadir excludes (Postgres / Forgejo) so the dumps stay authoritative. - Ops-agent gets nine new read-only/trigger commands (snapshots, stats, status, logs, plus two triggers) backed by sudoers-whitelisted wrapper scripts that source /etc/restic-backup.env so the agent never sees the restic password or B2 keys. - Two new flows (server_backup_full, server_backup_restore_test) drive the dashboard's "Backup now" and "Restore test" buttons. - /settings/backups gains a Server backup section with overall + per-phase status, NAS / B2 snapshot tables, restore-size / raw-data / dedup-ratio stats, and the last restore-test result. The existing pg_dump section is preserved unchanged. - Runbook docs/runbooks/server-backup.md follows the tailscale-setup pattern (plan + addendum) and covers B2 Object Lock + scoped keys, Forgejo subplan with isolated restore-test stack, the off-server maintenance flow for B2 prune, and the integrity-check schedule. Code-only change — installation on scrum4me-srv follows the runbook. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
27cba872a8
commit
ab87c0fada
23 changed files with 2625 additions and 170 deletions
|
|
@@ -1,171 +1,52 @@
|
|||
'use client'
|
||||
|
||||
import { useState, useCallback } from 'react'
|
||||
import Link from 'next/link'
|
||||
import { useFlowRun } from '@/hooks/useFlowRun'
|
||||
import StreamingTerminal from '@/components/StreamingTerminal'
|
||||
import ConfirmDialog from '@/components/ConfirmDialog'
|
||||
import type { BackupFile } from '../page'
|
||||
|
||||
function formatSize(bytes: number): string {
|
||||
if (bytes === 0) return '—'
|
||||
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`
|
||||
return `${(bytes / (1024 * 1024)).toFixed(1)} MB`
|
||||
}
|
||||
import type {
|
||||
BackupStatusEnvelope,
|
||||
ResticSnapshot,
|
||||
ResticStats,
|
||||
} from '../_lib/types'
|
||||
import DatabaseBackupsSection from './database-backups-section'
|
||||
import ServerBackupSection from './server-backup-section'
|
||||
|
||||
type Props = {
|
||||
backups: BackupFile[]
|
||||
listError: string | null
|
||||
envelope: BackupStatusEnvelope
|
||||
nasSnapshots: ResticSnapshot[]
|
||||
b2Snapshots: ResticSnapshot[]
|
||||
nasStats: ResticStats | null
|
||||
b2Stats: ResticStats | null
|
||||
serverBackupErrors: {
|
||||
status?: string
|
||||
nasSnapshots?: string
|
||||
b2Snapshots?: string
|
||||
nasStats?: string
|
||||
b2Stats?: string
|
||||
}
|
||||
}
|
||||
|
||||
export default function BackupsPanel({ backups, listError }: Props) {
|
||||
const [pending, setPending] = useState(false)
|
||||
const [completedFlowRunId, setCompletedFlowRunId] = useState<string | null>(null)
|
||||
|
||||
const handleComplete = useCallback((flowRunId: string) => {
|
||||
setCompletedFlowRunId(flowRunId)
|
||||
}, [])
|
||||
|
||||
const flowRun = useFlowRun(handleComplete)
|
||||
|
||||
const handleConfirm = useCallback(() => {
|
||||
setPending(false)
|
||||
setCompletedFlowRunId(null)
|
||||
flowRun.startFlow('backup_ops_db', false)
|
||||
}, [flowRun])
|
||||
|
||||
const handleReset = useCallback(() => {
|
||||
flowRun.reset()
|
||||
setCompletedFlowRunId(null)
|
||||
}, [flowRun])
|
||||
|
||||
export default function BackupsPanel({
|
||||
backups,
|
||||
listError,
|
||||
envelope,
|
||||
nasSnapshots,
|
||||
b2Snapshots,
|
||||
nasStats,
|
||||
b2Stats,
|
||||
serverBackupErrors,
|
||||
}: Props) {
|
||||
return (
|
||||
<div className="space-y-6">
|
||||
{/* Description */}
|
||||
<div className="rounded-lg border border-border p-5 space-y-3">
|
||||
<p className="text-sm text-muted-foreground">
|
||||
Backs up the <code className="font-mono text-xs">ops_dashboard</code> database using{' '}
|
||||
<code className="font-mono text-xs">pg_dump</code>. Dumps are stored in{' '}
|
||||
<code className="font-mono text-xs">/srv/ops/backups/</code> and retained for 30 days.
|
||||
For automated daily backups, enable the systemd timer:{' '}
|
||||
<code className="font-mono text-xs">deploy/ops-agent/ops-db-backup.timer</code>.
|
||||
</p>
|
||||
|
||||
<ol className="space-y-0.5">
|
||||
<li className="flex gap-2 text-xs font-mono text-muted-foreground">
|
||||
<span className="text-border min-w-[1.5rem]">1.</span>
|
||||
<span>pg_dump ops_dashboard → /srv/ops/backups/ops_db_YYYYMMDD_HHMM.dump</span>
|
||||
</li>
|
||||
<li className="flex gap-2 text-xs font-mono text-muted-foreground">
|
||||
<span className="text-border min-w-[1.5rem]">2.</span>
|
||||
<span>cleanup: delete backup files older than 30 days</span>
|
||||
</li>
|
||||
</ol>
|
||||
</div>
|
||||
|
||||
{/* Action buttons */}
|
||||
<div className="flex items-center gap-3">
|
||||
<button
|
||||
onClick={() => setPending(true)}
|
||||
disabled={flowRun.status === 'running'}
|
||||
className="rounded-lg bg-foreground text-background px-4 py-2 text-sm font-medium hover:opacity-90 disabled:opacity-50 transition-opacity"
|
||||
>
|
||||
Backup now
|
||||
</button>
|
||||
{flowRun.status !== 'idle' && flowRun.status !== 'running' && (
|
||||
<button
|
||||
onClick={handleReset}
|
||||
className="text-xs text-muted-foreground hover:text-foreground transition-colors"
|
||||
>
|
||||
Reset
|
||||
</button>
|
||||
)}
|
||||
</div>
|
||||
|
||||
{/* Terminal output */}
|
||||
{flowRun.status !== 'idle' && (
|
||||
<div className="space-y-2">
|
||||
<div className="flex items-center justify-between">
|
||||
<span className="text-sm font-medium">Output</span>
|
||||
{completedFlowRunId && (
|
||||
<Link
|
||||
href={`/audit/${completedFlowRunId}`}
|
||||
className="text-xs text-muted-foreground hover:text-foreground transition-colors"
|
||||
>
|
||||
View in audit log →
|
||||
</Link>
|
||||
)}
|
||||
</div>
|
||||
<StreamingTerminal
|
||||
lines={flowRun.lines}
|
||||
status={flowRun.status}
|
||||
error={flowRun.error}
|
||||
/>
|
||||
{flowRun.status === 'done' && (
|
||||
<p className="text-xs text-muted-foreground">
|
||||
Reload this page to see the updated backup list.
|
||||
</p>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Backup list */}
|
||||
<div className="space-y-3">
|
||||
<h2 className="text-sm font-semibold">Existing backups</h2>
|
||||
|
||||
{listError && (
|
||||
<div className="rounded-lg border border-destructive/50 bg-destructive/10 p-4 text-sm text-destructive">
|
||||
Could not list backups: {listError}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{!listError && backups.length === 0 && (
|
||||
<div className="rounded-lg border border-border px-4 py-6 text-sm text-muted-foreground text-center">
|
||||
No backups found in /srv/ops/backups/
|
||||
</div>
|
||||
)}
|
||||
|
||||
{!listError && backups.length > 0 && (
|
||||
<div className="rounded-lg border border-border overflow-hidden">
|
||||
<table className="w-full text-xs font-mono">
|
||||
<thead>
|
||||
<tr className="border-b border-border bg-muted/30">
|
||||
<th className="text-left px-4 py-2 font-medium text-muted-foreground">
|
||||
Timestamp
|
||||
</th>
|
||||
<th className="text-left px-4 py-2 font-medium text-muted-foreground">File</th>
|
||||
<th className="text-right px-4 py-2 font-medium text-muted-foreground">Size</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{backups.map((b, i) => (
|
||||
<tr key={b.name} className={i % 2 === 0 ? '' : 'bg-muted/10'}>
|
||||
<td className="px-4 py-2 text-muted-foreground">{b.label}</td>
|
||||
<td className="px-4 py-2">{b.name}</td>
|
||||
<td className="px-4 py-2 text-right text-muted-foreground">
|
||||
{formatSize(b.sizeBytes)}
|
||||
</td>
|
||||
</tr>
|
||||
))}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
)}
|
||||
|
||||
<p className="text-xs text-muted-foreground">
|
||||
Backups older than 30 days are removed automatically by the cleanup step.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
{/* Confirm dialog */}
|
||||
<ConfirmDialog
|
||||
open={pending}
|
||||
title="Backup ops_dashboard database"
|
||||
commandPreview={
|
||||
'flow: backup_ops_db\n\nSteps:\n 1. pg_dump ops_dashboard → /srv/ops/backups/ops_db_YYYYMMDD_HHMM.dump\n 2. cleanup: delete backups older than 30 days'
|
||||
}
|
||||
onConfirm={handleConfirm}
|
||||
onCancel={() => setPending(false)}
|
||||
<div className="space-y-12">
|
||||
<DatabaseBackupsSection backups={backups} listError={listError} />
|
||||
<div className="h-px bg-border" />
|
||||
<ServerBackupSection
|
||||
envelope={envelope}
|
||||
nasSnapshots={nasSnapshots}
|
||||
b2Snapshots={b2Snapshots}
|
||||
nasStats={nasStats}
|
||||
b2Stats={b2Stats}
|
||||
errors={serverBackupErrors}
|
||||
/>
|
||||
</div>
|
||||
)
|
||||
|
|
|
|||
172
app/settings/backups/_components/database-backups-section.tsx
Normal file
172
app/settings/backups/_components/database-backups-section.tsx
Normal file
|
|
@@ -0,0 +1,172 @@
|
|||
'use client'
|
||||
|
||||
import { useCallback, useState } from 'react'
|
||||
import Link from 'next/link'
|
||||
import { useFlowRun } from '@/hooks/useFlowRun'
|
||||
import StreamingTerminal from '@/components/StreamingTerminal'
|
||||
import ConfirmDialog from '@/components/ConfirmDialog'
|
||||
import type { BackupFile } from '../page'
|
||||
|
||||
function formatSize(bytes: number): string {
|
||||
if (bytes === 0) return '—'
|
||||
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`
|
||||
return `${(bytes / (1024 * 1024)).toFixed(1)} MB`
|
||||
}
|
||||
|
||||
type Props = {
|
||||
backups: BackupFile[]
|
||||
listError: string | null
|
||||
}
|
||||
|
||||
/**
 * "Database backups" card: the pg_dump flow for the ops_dashboard database
 * (mirrors the previous BackupsPanel implementation). Renders a description,
 * a "Backup now" trigger behind a confirm dialog, the live flow output, and
 * the table of existing dump files passed in from the server component.
 */
export default function DatabaseBackupsSection({ backups, listError }: Props) {
  // Confirm-dialog visibility for the "Backup now" action.
  const [pending, setPending] = useState(false)
  // Set when the flow completes so we can deep-link into the audit log.
  const [completedFlowRunId, setCompletedFlowRunId] = useState<string | null>(null)

  const handleComplete = useCallback((flowRunId: string) => {
    setCompletedFlowRunId(flowRunId)
  }, [])

  const flowRun = useFlowRun(handleComplete)

  // Dialog confirmed: clear stale state, then kick off the backup flow.
  const handleConfirm = useCallback(() => {
    setPending(false)
    setCompletedFlowRunId(null)
    flowRun.startFlow('backup_ops_db', false)
  }, [flowRun])

  // Clear terminal output and the audit-log link after a finished run.
  const handleReset = useCallback(() => {
    flowRun.reset()
    setCompletedFlowRunId(null)
  }, [flowRun])

  return (
    <section className="space-y-6">
      {/* Section header */}
      <div className="flex items-baseline justify-between">
        <h2 className="text-lg font-semibold tracking-tight">Database backups</h2>
        <span className="text-xs text-muted-foreground">flow: backup_ops_db</span>
      </div>

      {/* Description of what the flow does */}
      <div className="rounded-lg border border-border p-5 space-y-3">
        <p className="text-sm text-muted-foreground">
          Backs up the <code className="font-mono text-xs">ops_dashboard</code> database using{' '}
          <code className="font-mono text-xs">pg_dump</code>. Dumps are stored in{' '}
          <code className="font-mono text-xs">/srv/ops/backups/</code> and retained for 30 days.
          For automated daily backups, enable the systemd timer:{' '}
          <code className="font-mono text-xs">deploy/ops-agent/ops-db-backup.timer</code>.
        </p>

        <ol className="space-y-0.5">
          <li className="flex gap-2 text-xs font-mono text-muted-foreground">
            <span className="text-border min-w-[1.5rem]">1.</span>
            <span>pg_dump ops_dashboard → /srv/ops/backups/ops_db_YYYYMMDD_HHMM.dump</span>
          </li>
          <li className="flex gap-2 text-xs font-mono text-muted-foreground">
            <span className="text-border min-w-[1.5rem]">2.</span>
            <span>cleanup: delete backup files older than 30 days</span>
          </li>
        </ol>
      </div>

      {/* Action buttons */}
      <div className="flex items-center gap-3">
        <button
          onClick={() => setPending(true)}
          disabled={flowRun.status === 'running'}
          className="rounded-lg bg-foreground text-background px-4 py-2 text-sm font-medium hover:opacity-90 disabled:opacity-50 transition-opacity"
        >
          Backup now
        </button>
        {flowRun.status !== 'idle' && flowRun.status !== 'running' && (
          <button
            onClick={handleReset}
            className="text-xs text-muted-foreground hover:text-foreground transition-colors"
          >
            Reset
          </button>
        )}
      </div>

      {/* Live terminal output while/after the flow runs */}
      {flowRun.status !== 'idle' && (
        <div className="space-y-2">
          <div className="flex items-center justify-between">
            <span className="text-sm font-medium">Output</span>
            {completedFlowRunId && (
              <Link
                href={`/audit/${completedFlowRunId}`}
                className="text-xs text-muted-foreground hover:text-foreground transition-colors"
              >
                View in audit log →
              </Link>
            )}
          </div>
          <StreamingTerminal
            lines={flowRun.lines}
            status={flowRun.status}
            error={flowRun.error}
          />
          {flowRun.status === 'done' && (
            <p className="text-xs text-muted-foreground">
              Reload this page to see the updated backup list.
            </p>
          )}
        </div>
      )}

      {/* Existing backup files (server-rendered list) */}
      <div className="space-y-3">
        <h3 className="text-sm font-semibold">Existing backups</h3>

        {listError && (
          <div className="rounded-lg border border-destructive/50 bg-destructive/10 p-4 text-sm text-destructive">
            Could not list backups: {listError}
          </div>
        )}

        {!listError && backups.length === 0 && (
          <div className="rounded-lg border border-border px-4 py-6 text-sm text-muted-foreground text-center">
            No backups found in /srv/ops/backups/
          </div>
        )}

        {!listError && backups.length > 0 && (
          <div className="rounded-lg border border-border overflow-hidden">
            <table className="w-full text-xs font-mono">
              <thead>
                <tr className="border-b border-border bg-muted/30">
                  <th className="text-left px-4 py-2 font-medium text-muted-foreground">
                    Timestamp
                  </th>
                  <th className="text-left px-4 py-2 font-medium text-muted-foreground">File</th>
                  <th className="text-right px-4 py-2 font-medium text-muted-foreground">Size</th>
                </tr>
              </thead>
              <tbody>
                {backups.map((b, i) => (
                  <tr key={b.name} className={i % 2 === 0 ? '' : 'bg-muted/10'}>
                    <td className="px-4 py-2 text-muted-foreground">{b.label}</td>
                    <td className="px-4 py-2">{b.name}</td>
                    <td className="px-4 py-2 text-right text-muted-foreground">
                      {formatSize(b.sizeBytes)}
                    </td>
                  </tr>
                ))}
              </tbody>
            </table>
          </div>
        )}

        <p className="text-xs text-muted-foreground">
          Backups older than 30 days are removed automatically by the cleanup step.
        </p>
      </div>

      {/* Confirmation before triggering the flow */}
      <ConfirmDialog
        open={pending}
        title="Backup ops_dashboard database"
        commandPreview={
          'flow: backup_ops_db\n\nSteps:\n 1. pg_dump ops_dashboard → /srv/ops/backups/ops_db_YYYYMMDD_HHMM.dump\n 2. cleanup: delete backups older than 30 days'
        }
        onConfirm={handleConfirm}
        onCancel={() => setPending(false)}
      />
    </section>
  )
}
|
||||
447
app/settings/backups/_components/server-backup-section.tsx
Normal file
447
app/settings/backups/_components/server-backup-section.tsx
Normal file
|
|
@@ -0,0 +1,447 @@
|
|||
'use client'
|
||||
|
||||
import { useCallback, useState } from 'react'
|
||||
import Link from 'next/link'
|
||||
import { useFlowRun } from '@/hooks/useFlowRun'
|
||||
import StreamingTerminal from '@/components/StreamingTerminal'
|
||||
import ConfirmDialog from '@/components/ConfirmDialog'
|
||||
import type {
|
||||
BackupPhase,
|
||||
BackupStatus,
|
||||
BackupStatusEnvelope,
|
||||
OverallStatus,
|
||||
PhaseStatus,
|
||||
ResticSnapshot,
|
||||
ResticStats,
|
||||
} from '../_lib/types'
|
||||
|
||||
type Props = {
|
||||
envelope: BackupStatusEnvelope
|
||||
nasSnapshots: ResticSnapshot[]
|
||||
b2Snapshots: ResticSnapshot[]
|
||||
nasStats: ResticStats | null
|
||||
b2Stats: ResticStats | null
|
||||
errors: {
|
||||
status?: string
|
||||
nasSnapshots?: string
|
||||
b2Snapshots?: string
|
||||
nasStats?: string
|
||||
b2Stats?: string
|
||||
}
|
||||
}
|
||||
|
||||
type ActiveFlow = 'backup' | 'restore' | null
|
||||
|
||||
function formatBytes(bytes: number | null | undefined): string {
|
||||
if (bytes == null) return '—'
|
||||
if (bytes < 1024) return `${bytes} B`
|
||||
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`
|
||||
if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)} MB`
|
||||
return `${(bytes / (1024 * 1024 * 1024)).toFixed(2)} GB`
|
||||
}
|
||||
|
||||
function formatDuration(seconds: number | null | undefined): string {
|
||||
if (seconds == null || seconds === 0) return '—'
|
||||
if (seconds < 60) return `${seconds}s`
|
||||
if (seconds < 3600) return `${Math.floor(seconds / 60)}m ${seconds % 60}s`
|
||||
const h = Math.floor(seconds / 3600)
|
||||
const m = Math.floor((seconds % 3600) / 60)
|
||||
return `${h}h ${m}m`
|
||||
}
|
||||
|
||||
function formatTimestamp(iso: string | null | undefined): string {
|
||||
if (!iso) return '—'
|
||||
try {
|
||||
const d = new Date(iso)
|
||||
if (Number.isNaN(d.getTime())) return iso
|
||||
const yyyy = d.getFullYear()
|
||||
const mm = String(d.getMonth() + 1).padStart(2, '0')
|
||||
const dd = String(d.getDate()).padStart(2, '0')
|
||||
const hh = String(d.getHours()).padStart(2, '0')
|
||||
const mi = String(d.getMinutes()).padStart(2, '0')
|
||||
return `${yyyy}-${mm}-${dd} ${hh}:${mi}`
|
||||
} catch {
|
||||
return iso
|
||||
}
|
||||
}
|
||||
|
||||
function overallBadgeClass(status: OverallStatus): string {
|
||||
switch (status) {
|
||||
case 'success':
|
||||
return 'bg-green-500/15 text-green-500 border-green-500/30'
|
||||
case 'partial_failure':
|
||||
return 'bg-amber-500/15 text-amber-500 border-amber-500/30'
|
||||
case 'failed':
|
||||
return 'bg-destructive/15 text-destructive border-destructive/30'
|
||||
default:
|
||||
return 'bg-muted/50 text-muted-foreground border-border'
|
||||
}
|
||||
}
|
||||
|
||||
function phaseIcon(status: PhaseStatus): { glyph: string; color: string } {
|
||||
switch (status) {
|
||||
case 'success':
|
||||
return { glyph: '✓', color: 'text-green-500' }
|
||||
case 'skipped':
|
||||
return { glyph: '–', color: 'text-muted-foreground' }
|
||||
case 'degraded':
|
||||
return { glyph: '!', color: 'text-amber-500' }
|
||||
case 'failed':
|
||||
return { glyph: '✗', color: 'text-destructive' }
|
||||
case 'pending':
|
||||
default:
|
||||
return { glyph: '○', color: 'text-muted-foreground/50' }
|
||||
}
|
||||
}
|
||||
|
||||
function phaseDurationSeconds(phase: BackupPhase): number | null {
|
||||
if (!phase.startedAt || !phase.completedAt) return null
|
||||
const start = new Date(phase.startedAt).getTime()
|
||||
const end = new Date(phase.completedAt).getTime()
|
||||
if (Number.isNaN(start) || Number.isNaN(end)) return null
|
||||
return Math.max(0, Math.round((end - start) / 1000))
|
||||
}
|
||||
|
||||
/**
 * Summary card for the most recent server-backup run: overall status badge,
 * completion time, host, total duration, and one small tile per phase.
 * Renders a placeholder prompt when no run has been recorded yet.
 */
function StatusCard({ status }: { status: BackupStatus | null }) {
  if (!status) {
    return (
      <div className="rounded-lg border border-border px-4 py-3 text-sm text-muted-foreground">
        No backup run recorded yet. Trigger one with the "Backup now" button below.
      </div>
    )
  }
  return (
    <div className="rounded-lg border border-border p-4 space-y-3">
      {/* Header row: badge + "last run" summary on the left, duration on the right */}
      <div className="flex items-center justify-between flex-wrap gap-2">
        <div className="flex items-center gap-3">
          <span
            className={`inline-flex items-center gap-1.5 rounded-md border px-2 py-0.5 text-xs font-medium uppercase tracking-wide ${overallBadgeClass(status.overallStatus)}`}
          >
            {status.overallStatus.replace('_', ' ')}
          </span>
          <span className="text-sm text-muted-foreground">
            Last run {formatTimestamp(status.completedAt)} on{' '}
            <code className="font-mono text-xs">{status.host || '—'}</code>
          </span>
        </div>
        <span className="text-xs text-muted-foreground">
          duration {formatDuration(status.durationSeconds)}
        </span>
      </div>
      {/* Per-phase tiles; hovering shows the phase error (if any) via title */}
      <div className="grid grid-cols-2 gap-1 sm:grid-cols-4">
        {status.phases.map((p) => {
          const icon = phaseIcon(p.status)
          const dur = phaseDurationSeconds(p)
          return (
            <div
              key={p.name}
              className="flex items-center gap-2 rounded-md border border-border/60 bg-muted/20 px-2 py-1.5"
              title={p.error ?? p.status}
            >
              <span className={`font-mono text-sm ${icon.color}`}>{icon.glyph}</span>
              <div className="flex flex-col leading-tight min-w-0">
                <span className="truncate text-xs font-medium">{p.name}</span>
                <span className="text-[10px] text-muted-foreground">
                  {p.status}
                  {dur != null ? ` · ${formatDuration(dur)}` : ''}
                </span>
              </div>
            </div>
          )
        })}
      </div>
    </div>
  )
}
|
||||
|
||||
/**
 * Compact stats card for one restic repo (NAS or B2): snapshot count,
 * restore size, raw data size, and dedup ratio. An error message takes
 * precedence over stats; missing stats render a "no stats yet" placeholder.
 */
function StatsBlock({ stats, label, error }: { stats: ResticStats | null; label: string; error?: string }) {
  if (error) {
    return (
      <div className="rounded-lg border border-destructive/50 bg-destructive/10 p-3 text-xs text-destructive">
        {label}: {error}
      </div>
    )
  }
  if (!stats) {
    return (
      <div className="rounded-lg border border-border p-3 text-xs text-muted-foreground">
        {label}: no stats yet
      </div>
    )
  }
  // Guard against null/NaN/Infinity before formatting the ratio.
  const dedup =
    stats.dedupRatio != null && Number.isFinite(stats.dedupRatio)
      ? `${stats.dedupRatio.toFixed(2)}×`
      : '—'
  return (
    <div className="rounded-lg border border-border p-3 space-y-1.5">
      <div className="flex items-center justify-between">
        <span className="text-xs font-semibold uppercase tracking-wide text-muted-foreground">
          {label}
        </span>
        <span className="text-xs text-muted-foreground">
          {stats.snapshotsCount} snapshot{stats.snapshotsCount === 1 ? '' : 's'}
        </span>
      </div>
      <dl className="grid grid-cols-2 gap-x-3 gap-y-0.5 text-xs font-mono">
        <dt className="text-muted-foreground">restore size</dt>
        <dd className="text-right">{formatBytes(stats.restoreSizeBytes)}</dd>
        <dt className="text-muted-foreground">raw data</dt>
        <dd className="text-right">{formatBytes(stats.rawDataBytes)}</dd>
        <dt className="text-muted-foreground">dedup ratio</dt>
        <dd className="text-right">{dedup}</dd>
      </dl>
    </div>
  )
}
|
||||
|
||||
/**
 * Table of restic snapshots for one repo: time, short ID, tags, and the
 * files/bytes added per snapshot (when the snapshot summary is present).
 * Shows an error panel or an empty-state placeholder as appropriate.
 */
function SnapshotsTable({
  snapshots,
  label,
  error,
}: {
  snapshots: ResticSnapshot[]
  label: string
  error?: string
}) {
  return (
    <div className="space-y-2">
      <div className="flex items-center justify-between">
        <h3 className="text-sm font-semibold">{label}</h3>
        <span className="text-xs text-muted-foreground">{snapshots.length} shown</span>
      </div>
      {error ? (
        <div className="rounded-lg border border-destructive/50 bg-destructive/10 p-3 text-xs text-destructive">
          {error}
        </div>
      ) : snapshots.length === 0 ? (
        <div className="rounded-lg border border-border px-4 py-6 text-xs text-muted-foreground text-center">
          No snapshots in this repo yet.
        </div>
      ) : (
        <div className="rounded-lg border border-border overflow-hidden">
          <table className="w-full text-xs font-mono">
            <thead>
              <tr className="border-b border-border bg-muted/30">
                <th className="text-left px-3 py-2 font-medium text-muted-foreground">Time</th>
                <th className="text-left px-3 py-2 font-medium text-muted-foreground">ID</th>
                <th className="text-left px-3 py-2 font-medium text-muted-foreground">Tags</th>
                <th className="text-right px-3 py-2 font-medium text-muted-foreground">
                  Files / size added
                </th>
              </tr>
            </thead>
            <tbody>
              {snapshots.map((s, i) => (
                <tr key={s.id} className={i % 2 === 0 ? '' : 'bg-muted/10'}>
                  <td className="px-3 py-1.5 text-muted-foreground">{formatTimestamp(s.time)}</td>
                  <td className="px-3 py-1.5">{s.shortId}</td>
                  <td className="px-3 py-1.5 text-muted-foreground truncate max-w-[12rem]">
                    {s.tags.join(', ') || '—'}
                  </td>
                  {/* summary fields use restic's snake_case JSON keys */}
                  <td className="px-3 py-1.5 text-right text-muted-foreground">
                    {s.summary?.files_new != null
                      ? `${s.summary.files_new} new · ${formatBytes(s.summary.data_added ?? 0)}`
                      : '—'}
                  </td>
                </tr>
              ))}
            </tbody>
          </table>
        </div>
      )}
    </div>
  )
}
|
||||
|
||||
/**
 * "Server backup (restic)" section of /settings/backups: last-run status,
 * per-repo stats, trigger buttons for the server_backup_full and
 * server_backup_restore_test flows (each behind a confirm dialog), live
 * flow output, NAS/B2 snapshot tables, and the last restore-test result.
 * All data arrives pre-fetched from the server component via props.
 */
export default function ServerBackupSection({
  envelope,
  nasSnapshots,
  b2Snapshots,
  nasStats,
  b2Stats,
  errors,
}: Props) {
  // Which confirm dialog (if any) is open.
  const [pending, setPending] = useState<ActiveFlow>(null)
  // Set when a flow completes so we can deep-link into the audit log.
  const [completedFlowRunId, setCompletedFlowRunId] = useState<string | null>(null)
  // Which flow the terminal output belongs to (label only).
  const [activeFlow, setActiveFlow] = useState<ActiveFlow>(null)

  const handleComplete = useCallback((flowRunId: string) => {
    setCompletedFlowRunId(flowRunId)
  }, [])

  const flowRun = useFlowRun(handleComplete)

  // Dialog confirmed: clear stale state and start the chosen flow.
  const startFlow = useCallback(
    (kind: 'backup' | 'restore') => {
      setPending(null)
      setCompletedFlowRunId(null)
      setActiveFlow(kind)
      flowRun.startFlow(
        kind === 'backup' ? 'server_backup_full' : 'server_backup_restore_test',
        false,
      )
    },
    [flowRun],
  )

  const handleReset = useCallback(() => {
    flowRun.reset()
    setCompletedFlowRunId(null)
    setActiveFlow(null)
  }, [flowRun])

  return (
    <section className="space-y-6">
      {/* Section header */}
      <div className="flex items-baseline justify-between">
        <h2 className="text-lg font-semibold tracking-tight">Server backup (restic)</h2>
        <span className="text-xs text-muted-foreground">flows: server_backup_full · restore_test</span>
      </div>

      {/* Description with a link to the runbook */}
      <div className="rounded-lg border border-border p-5 space-y-3">
        <p className="text-sm text-muted-foreground">
          Daily server-wide backup at 03:30: <code className="font-mono text-xs">pg_dumpall</code> +
          Forgejo dump, then restic to <strong>NAS</strong> (local) and <strong>Backblaze B2</strong>{' '}
          (offsite, Object Lock). Authoritative restore sources are the database dumps; live datadirs
          are excluded. See{' '}
          <Link
            href="https://github.com/Madhura68/Ops-dashboard/blob/main/docs/runbooks/server-backup.md"
            className="underline hover:text-foreground"
          >
            docs/runbooks/server-backup.md
          </Link>{' '}
          for the full procedure.
        </p>
      </div>

      {/* Last-run status; the status-read error is shown alongside, not instead */}
      <StatusCard status={envelope.lastRun} />
      {errors.status && (
        <div className="rounded-lg border border-amber-500/50 bg-amber-500/10 p-3 text-xs text-amber-500">
          Could not read backup status: {errors.status}
        </div>
      )}

      {/* Per-repo restic stats */}
      <div className="grid gap-3 md:grid-cols-2">
        <StatsBlock stats={nasStats} label="NAS repo" error={errors.nasStats} />
        <StatsBlock stats={b2Stats} label="B2 repo" error={errors.b2Stats} />
      </div>

      {/* Trigger buttons; disabled while any flow is running */}
      <div className="flex items-center gap-3 flex-wrap">
        <button
          onClick={() => setPending('backup')}
          disabled={flowRun.status === 'running'}
          className="rounded-lg bg-foreground text-background px-4 py-2 text-sm font-medium hover:opacity-90 disabled:opacity-50 transition-opacity"
        >
          Backup now
        </button>
        <button
          onClick={() => setPending('restore')}
          disabled={flowRun.status === 'running'}
          className="rounded-lg border border-border px-4 py-2 text-sm font-medium hover:bg-muted/50 disabled:opacity-50 transition-colors"
        >
          Run restore test
        </button>
        {flowRun.status !== 'idle' && flowRun.status !== 'running' && (
          <button
            onClick={handleReset}
            className="text-xs text-muted-foreground hover:text-foreground transition-colors"
          >
            Reset
          </button>
        )}
      </div>

      {/* Live terminal output while/after a flow runs */}
      {flowRun.status !== 'idle' && (
        <div className="space-y-2">
          <div className="flex items-center justify-between">
            <span className="text-sm font-medium">
              Output {activeFlow ? `(${activeFlow === 'backup' ? 'backup' : 'restore test'})` : ''}
            </span>
            {completedFlowRunId && (
              <Link
                href={`/audit/${completedFlowRunId}`}
                className="text-xs text-muted-foreground hover:text-foreground transition-colors"
              >
                View in audit log →
              </Link>
            )}
          </div>
          <StreamingTerminal
            lines={flowRun.lines}
            status={flowRun.status}
            error={flowRun.error}
          />
          {flowRun.status === 'done' && (
            <p className="text-xs text-muted-foreground">
              Reload this page to see the updated status, snapshots, and stats.
            </p>
          )}
        </div>
      )}

      {/* Snapshot listings, one table per repo */}
      <div className="grid gap-6 lg:grid-cols-2">
        <SnapshotsTable
          snapshots={nasSnapshots}
          label="NAS snapshots"
          error={errors.nasSnapshots}
        />
        <SnapshotsTable
          snapshots={b2Snapshots}
          label="B2 snapshots"
          error={errors.b2Snapshots}
        />
      </div>

      {/* Last restore-test result, with failed assertions listed if any */}
      {envelope.lastRestoreTest && (
        <div className="rounded-lg border border-border p-4 space-y-2">
          <div className="flex items-center justify-between flex-wrap gap-2">
            <h3 className="text-sm font-semibold">Last restore test</h3>
            <span
              className={`inline-flex items-center rounded-md border px-2 py-0.5 text-xs font-medium uppercase tracking-wide ${overallBadgeClass(envelope.lastRestoreTest.overallStatus)}`}
            >
              {envelope.lastRestoreTest.overallStatus.replace('_', ' ')}
            </span>
          </div>
          <p className="text-xs text-muted-foreground">
            {formatTimestamp(envelope.lastRestoreTest.completedAt)} · repo{' '}
            <code className="font-mono">{envelope.lastRestoreTest.repo}</code> · snapshot{' '}
            <code className="font-mono">
              {envelope.lastRestoreTest.snapshotId?.slice(0, 8) ?? '—'}
            </code>{' '}
            · {envelope.lastRestoreTest.assertions.length} assertions
          </p>
          {envelope.lastRestoreTest.assertions.some((a) => a.status !== 'ok') && (
            <ul className="space-y-0.5">
              {envelope.lastRestoreTest.assertions
                .filter((a) => a.status !== 'ok')
                .map((a) => (
                  <li key={a.path} className="text-xs font-mono text-amber-500">
                    {a.status === 'missing' ? '✗ missing' : '! empty'} · {a.path}
                  </li>
                ))}
            </ul>
          )}
        </div>
      )}

      {/* Confirm dialogs for the two flows */}
      <ConfirmDialog
        open={pending === 'backup'}
        title="Trigger server backup"
        commandPreview={
          'flow: server_backup_full\n\nSteps:\n 1. trigger_server_backup (systemctl start server-backup.service)\n 2. tail_backup_log_today\n 3. read_backup_status\n\nThe actual work happens in systemd; this flow kicks it off and tails the log.'
        }
        onConfirm={() => startFlow('backup')}
        onCancel={() => setPending(null)}
      />
      <ConfirmDialog
        open={pending === 'restore'}
        title="Run restore test (NAS)"
        commandPreview={
          'flow: server_backup_restore_test\n\nSteps:\n 1. trigger_restore_test (restore latest NAS snapshot to /tmp/restore-test/)\n 2. read_backup_status\n\nNon-destructive — restores into /tmp only and asserts critical files exist.'
        }
        onConfirm={() => startFlow('restore')}
        onCancel={() => setPending(null)}
      />
    </section>
  )
}
|
||||
191
app/settings/backups/_lib/parse.ts
Normal file
191
app/settings/backups/_lib/parse.ts
Normal file
|
|
@ -0,0 +1,191 @@
|
|||
import type {
|
||||
BackupPhase,
|
||||
BackupStatus,
|
||||
BackupStatusEnvelope,
|
||||
OverallStatus,
|
||||
PhaseStatus,
|
||||
ResticSnapshot,
|
||||
ResticStats,
|
||||
RestoreTestAssertion,
|
||||
RestoreTestStatus,
|
||||
} from './types'
|
||||
|
||||
// Canonical phase order for rendering — mirrors the PHASE_ORDER array in
// deploy/server-backup/server-backup.sh, so the dashboard shows phases in
// execution order even when the statusfile omits some (they render 'pending').
const PHASE_ORDER = [
  'postgres_dump',
  'forgejo_dump',
  'forgejo_db_dump',
  'restic_nas',
  'restic_b2',
  'forget_nas',
  'check_nas',
  'check_b2',
] as const
|
||||
|
||||
function isRecord(v: unknown): v is Record<string, unknown> {
|
||||
return typeof v === 'object' && v !== null && !Array.isArray(v)
|
||||
}
|
||||
|
||||
function asString(v: unknown): string | null {
|
||||
return typeof v === 'string' ? v : null
|
||||
}
|
||||
|
||||
function asNumber(v: unknown): number | null {
|
||||
return typeof v === 'number' && Number.isFinite(v) ? v : null
|
||||
}
|
||||
|
||||
function asPhaseStatus(v: unknown): PhaseStatus {
|
||||
if (
|
||||
v === 'success' ||
|
||||
v === 'skipped' ||
|
||||
v === 'degraded' ||
|
||||
v === 'failed' ||
|
||||
v === 'pending'
|
||||
) {
|
||||
return v
|
||||
}
|
||||
return 'pending'
|
||||
}
|
||||
|
||||
function asOverallStatus(v: unknown): OverallStatus {
|
||||
if (v === 'success' || v === 'partial_failure' || v === 'failed') return v
|
||||
return 'unknown'
|
||||
}
|
||||
|
||||
function parsePhase(name: string, raw: unknown): BackupPhase {
|
||||
if (!isRecord(raw)) {
|
||||
return {
|
||||
name,
|
||||
status: 'pending',
|
||||
exitCode: null,
|
||||
startedAt: null,
|
||||
completedAt: null,
|
||||
error: null,
|
||||
}
|
||||
}
|
||||
return {
|
||||
name,
|
||||
status: asPhaseStatus(raw.status),
|
||||
exitCode: asNumber(raw.exit_code),
|
||||
startedAt: asString(raw.started_at),
|
||||
completedAt: asString(raw.completed_at),
|
||||
error: asString(raw.error),
|
||||
snapshotId: asString(raw.snapshot_id) ?? undefined,
|
||||
filesNew: asNumber(raw.files_new),
|
||||
dataAddedBytes: asNumber(raw.data_added_bytes),
|
||||
outputFile: asString(raw.output_file) ?? undefined,
|
||||
bytes: asNumber(raw.bytes),
|
||||
}
|
||||
}
|
||||
|
||||
function parseBackupStatus(raw: unknown): BackupStatus | null {
|
||||
if (!isRecord(raw)) return null
|
||||
const phasesRaw = isRecord(raw.phases) ? raw.phases : {}
|
||||
const phases = PHASE_ORDER.map((name) => parsePhase(name, phasesRaw[name]))
|
||||
return {
|
||||
schemaVersion: asNumber(raw.schema_version) ?? 1,
|
||||
overallStatus: asOverallStatus(raw.overall_status),
|
||||
startedAt: asString(raw.started_at) ?? '',
|
||||
completedAt: asString(raw.completed_at) ?? '',
|
||||
durationSeconds: asNumber(raw.duration_seconds) ?? 0,
|
||||
host: asString(raw.host) ?? '',
|
||||
phases,
|
||||
}
|
||||
}
|
||||
|
||||
function parseRestoreTestAssertion(raw: unknown): RestoreTestAssertion | null {
|
||||
if (!isRecord(raw)) return null
|
||||
const status = raw.status
|
||||
if (status !== 'ok' && status !== 'empty' && status !== 'missing') return null
|
||||
return {
|
||||
path: asString(raw.path) ?? '',
|
||||
status,
|
||||
bytes: asNumber(raw.bytes) ?? 0,
|
||||
}
|
||||
}
|
||||
|
||||
function parseRestoreTestStatus(raw: unknown): RestoreTestStatus | null {
|
||||
if (!isRecord(raw)) return null
|
||||
const assertionsRaw = Array.isArray(raw.assertions) ? raw.assertions : []
|
||||
const assertions: RestoreTestAssertion[] = []
|
||||
for (const a of assertionsRaw) {
|
||||
const parsed = parseRestoreTestAssertion(a)
|
||||
if (parsed) assertions.push(parsed)
|
||||
}
|
||||
return {
|
||||
schemaVersion: asNumber(raw.schema_version) ?? 1,
|
||||
overallStatus: asOverallStatus(raw.overall_status),
|
||||
startedAt: asString(raw.started_at) ?? '',
|
||||
completedAt: asString(raw.completed_at) ?? '',
|
||||
durationSeconds: asNumber(raw.duration_seconds) ?? 0,
|
||||
repo: asString(raw.repo) ?? '',
|
||||
snapshotId: asString(raw.snapshot_id),
|
||||
restoreExitCode: asNumber(raw.restore_exit_code),
|
||||
target: asString(raw.target) ?? undefined,
|
||||
assertions,
|
||||
error: asString(raw.error) ?? undefined,
|
||||
}
|
||||
}
|
||||
|
||||
export function parseStatusEnvelope(output: string): BackupStatusEnvelope {
|
||||
try {
|
||||
const trimmed = output.trim()
|
||||
if (!trimmed) return { lastRun: null, lastRestoreTest: null }
|
||||
const parsed: unknown = JSON.parse(trimmed)
|
||||
if (!isRecord(parsed)) return { lastRun: null, lastRestoreTest: null }
|
||||
return {
|
||||
lastRun: parseBackupStatus(parsed.last_run),
|
||||
lastRestoreTest: parseRestoreTestStatus(parsed.last_restore_test),
|
||||
}
|
||||
} catch {
|
||||
return { lastRun: null, lastRestoreTest: null }
|
||||
}
|
||||
}
|
||||
|
||||
export function parseResticSnapshots(output: string, repo: 'nas' | 'b2'): ResticSnapshot[] {
|
||||
try {
|
||||
const trimmed = output.trim()
|
||||
if (!trimmed) return []
|
||||
const parsed: unknown = JSON.parse(trimmed)
|
||||
if (!Array.isArray(parsed)) return []
|
||||
const result: ResticSnapshot[] = []
|
||||
for (const s of parsed) {
|
||||
if (!isRecord(s)) continue
|
||||
const id = asString(s.id)
|
||||
if (!id) continue
|
||||
const shortId = asString(s.short_id) ?? id.slice(0, 8)
|
||||
const time = asString(s.time) ?? ''
|
||||
const hostname = asString(s.hostname) ?? ''
|
||||
const tags = Array.isArray(s.tags)
|
||||
? s.tags.filter((t): t is string => typeof t === 'string')
|
||||
: []
|
||||
const paths = Array.isArray(s.paths)
|
||||
? s.paths.filter((p): p is string => typeof p === 'string')
|
||||
: []
|
||||
const summary = isRecord(s.summary) ? (s.summary as ResticSnapshot['summary']) : null
|
||||
result.push({ id, shortId, time, hostname, tags, paths, repo, summary })
|
||||
}
|
||||
return result
|
||||
} catch {
|
||||
return []
|
||||
}
|
||||
}
|
||||
|
||||
export function parseResticStats(output: string, repo: 'nas' | 'b2'): ResticStats | null {
|
||||
try {
|
||||
const trimmed = output.trim()
|
||||
if (!trimmed) return null
|
||||
const parsed: unknown = JSON.parse(trimmed)
|
||||
if (!isRecord(parsed)) return null
|
||||
return {
|
||||
repo,
|
||||
snapshotsCount: asNumber(parsed.snapshots_count) ?? 0,
|
||||
restoreSizeBytes: asNumber(parsed.restore_size_bytes),
|
||||
restoreSizeFiles: asNumber(parsed.restore_size_files),
|
||||
rawDataBytes: asNumber(parsed.raw_data_bytes),
|
||||
rawBlobCount: asNumber(parsed.raw_blob_count),
|
||||
dedupRatio: asNumber(parsed.dedup_ratio),
|
||||
}
|
||||
} catch {
|
||||
return null
|
||||
}
|
||||
}
|
||||
78
app/settings/backups/_lib/types.ts
Normal file
78
app/settings/backups/_lib/types.ts
Normal file
|
|
@ -0,0 +1,78 @@
|
|||
/** Per-phase outcome as recorded in the statusfile ('pending' = phase never ran). */
export type PhaseStatus = 'success' | 'skipped' | 'degraded' | 'failed' | 'pending'
/** Whole-run outcome; 'unknown' is the parser's fallback for unrecognized values. */
export type OverallStatus = 'success' | 'partial_failure' | 'failed' | 'unknown'

/** One entry of the statusfile `phases` map, camelCased for the UI. */
export interface BackupPhase {
  // Phase key, e.g. 'postgres_dump' or 'restic_nas'.
  name: string
  status: PhaseStatus
  // Exit code of the phase command; null when the phase never ran.
  exitCode: number | null
  // ISO-8601 timestamps; null when the phase never started/finished.
  startedAt: string | null
  completedAt: string | null
  error: string | null
  // Extras populated for restic backup phases.
  snapshotId?: string
  filesNew?: number | null
  dataAddedBytes?: number | null
  // Extras populated for dump phases.
  outputFile?: string
  bytes?: number | null
}

/** Parsed /srv/backups/status/last-run.json (schema in deploy/server-backup/README.md). */
export interface BackupStatus {
  schemaVersion: number
  overallStatus: OverallStatus
  startedAt: string
  completedAt: string
  durationSeconds: number
  host: string
  // Always in canonical phase order; phases missing from the file render as 'pending'.
  phases: BackupPhase[]
}

/** One critical-file check performed by restore-test.sh. */
export interface RestoreTestAssertion {
  path: string
  status: 'ok' | 'empty' | 'missing'
  bytes: number
}

/** Parsed /srv/backups/status/last-restore-test.json. */
export interface RestoreTestStatus {
  schemaVersion: number
  overallStatus: OverallStatus
  startedAt: string
  completedAt: string
  durationSeconds: number
  // Repo label the test ran against ('nas' or 'b2').
  repo: string
  snapshotId: string | null
  restoreExitCode: number | null
  // Directory the snapshot was restored into, e.g. /tmp/restore-test.
  target?: string
  assertions: RestoreTestAssertion[]
  error?: string
}

/** Combined view for the dashboard: last backup run + last restore test. */
export interface BackupStatusEnvelope {
  lastRun: BackupStatus | null
  lastRestoreTest: RestoreTestStatus | null
}

/** One row of `restic snapshots --json` output, tagged with the repo it came from. */
export interface ResticSnapshot {
  id: string
  shortId: string
  time: string
  hostname: string
  tags: string[]
  paths: string[]
  repo: 'nas' | 'b2'
  // Per-snapshot summary; snake_case mirrors restic's JSON field names.
  summary?: {
    files_new?: number
    files_changed?: number
    data_added?: number
    total_files_processed?: number
    total_bytes_processed?: number
  } | null
}

/** Parsed `restic stats --json` output for one repo; null fields = not reported. */
export interface ResticStats {
  repo: 'nas' | 'b2'
  snapshotsCount: number
  restoreSizeBytes: number | null
  restoreSizeFiles: number | null
  rawDataBytes: number | null
  rawBlobCount: number | null
  dedupRatio: number | null
}
|
||||
|
|
@ -3,6 +3,16 @@ import { redirect } from 'next/navigation'
|
|||
import { getCurrentUser } from '@/lib/session'
|
||||
import { execAgent } from '@/lib/agent-client'
|
||||
import BackupsPanel from './_components/backups-panel'
|
||||
import {
|
||||
parseResticSnapshots,
|
||||
parseResticStats,
|
||||
parseStatusEnvelope,
|
||||
} from './_lib/parse'
|
||||
import type {
|
||||
BackupStatusEnvelope,
|
||||
ResticSnapshot,
|
||||
ResticStats,
|
||||
} from './_lib/types'
|
||||
|
||||
export const dynamic = 'force-dynamic'
|
||||
|
||||
|
|
@ -27,23 +37,74 @@ function parseBackupList(output: string): BackupFile[] {
|
|||
.filter((b) => b.name)
|
||||
}
|
||||
|
||||
function errorMessage(err: unknown): string {
|
||||
return err instanceof Error ? err.message : 'agent call failed'
|
||||
}
|
||||
|
||||
async function tryExec(command: string): Promise<{ output: string | null; error: string | null }> {
|
||||
try {
|
||||
const output = await execAgent(command)
|
||||
return { output, error: null }
|
||||
} catch (err) {
|
||||
return { output: null, error: errorMessage(err) }
|
||||
}
|
||||
}
|
||||
|
||||
export default async function BackupsPage() {
|
||||
const user = await getCurrentUser()
|
||||
if (!user) redirect('/login')
|
||||
|
||||
let backups: BackupFile[] = []
|
||||
let listError: string | null = null
|
||||
// Run all agent calls in parallel; per-call error isolation so one failure
|
||||
// does not blank the entire page.
|
||||
const [
|
||||
backupListResult,
|
||||
statusResult,
|
||||
nasSnapshotsResult,
|
||||
b2SnapshotsResult,
|
||||
nasStatsResult,
|
||||
b2StatsResult,
|
||||
] = await Promise.all([
|
||||
tryExec('list_ops_backups'),
|
||||
tryExec('read_backup_status'),
|
||||
tryExec('restic_snapshots_nas'),
|
||||
tryExec('restic_snapshots_b2'),
|
||||
tryExec('restic_stats_nas'),
|
||||
tryExec('restic_stats_b2'),
|
||||
])
|
||||
|
||||
try {
|
||||
const output = await execAgent('list_ops_backups')
|
||||
backups = parseBackupList(output)
|
||||
} catch (err) {
|
||||
listError = err instanceof Error ? err.message : 'failed to list backups'
|
||||
const backups: BackupFile[] = backupListResult.output
|
||||
? parseBackupList(backupListResult.output)
|
||||
: []
|
||||
const listError = backupListResult.error
|
||||
|
||||
const envelope: BackupStatusEnvelope = statusResult.output
|
||||
? parseStatusEnvelope(statusResult.output)
|
||||
: { lastRun: null, lastRestoreTest: null }
|
||||
|
||||
const nasSnapshots: ResticSnapshot[] = nasSnapshotsResult.output
|
||||
? parseResticSnapshots(nasSnapshotsResult.output, 'nas')
|
||||
: []
|
||||
const b2Snapshots: ResticSnapshot[] = b2SnapshotsResult.output
|
||||
? parseResticSnapshots(b2SnapshotsResult.output, 'b2')
|
||||
: []
|
||||
const nasStats: ResticStats | null = nasStatsResult.output
|
||||
? parseResticStats(nasStatsResult.output, 'nas')
|
||||
: null
|
||||
const b2Stats: ResticStats | null = b2StatsResult.output
|
||||
? parseResticStats(b2StatsResult.output, 'b2')
|
||||
: null
|
||||
|
||||
const serverBackupErrors = {
|
||||
status: statusResult.error ?? undefined,
|
||||
nasSnapshots: nasSnapshotsResult.error ?? undefined,
|
||||
b2Snapshots: b2SnapshotsResult.error ?? undefined,
|
||||
nasStats: nasStatsResult.error ?? undefined,
|
||||
b2Stats: b2StatsResult.error ?? undefined,
|
||||
}
|
||||
|
||||
return (
|
||||
<div className="min-h-screen bg-background p-6">
|
||||
<div className="mx-auto max-w-4xl space-y-6">
|
||||
<div className="mx-auto max-w-6xl space-y-6">
|
||||
<div className="flex items-center gap-3">
|
||||
<Link href="/" className="text-sm text-muted-foreground hover:text-foreground">
|
||||
← Home
|
||||
|
|
@ -52,7 +113,16 @@ export default async function BackupsPage() {
|
|||
<h1 className="text-2xl font-semibold tracking-tight">Backups</h1>
|
||||
</div>
|
||||
|
||||
<BackupsPanel backups={backups} listError={listError} />
|
||||
<BackupsPanel
|
||||
backups={backups}
|
||||
listError={listError}
|
||||
envelope={envelope}
|
||||
nasSnapshots={nasSnapshots}
|
||||
b2Snapshots={b2Snapshots}
|
||||
nasStats={nasStats}
|
||||
b2Stats={b2Stats}
|
||||
serverBackupErrors={serverBackupErrors}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,9 +1,19 @@
|
|||
# /etc/sudoers.d/ops-agent
|
||||
# NOPASSWD for explicit systemctl restart invocations by the ops-agent service account.
|
||||
# Only the service names whitelisted in commands.yml are listed here.
|
||||
# NOPASSWD for explicit invocations by the ops-agent service account.
|
||||
# Only the service names + wrapper scripts whitelisted in commands.yml are listed here.
|
||||
# Installed by deploy/ops-agent/setup.sh.
|
||||
|
||||
ops-agent ALL=(root) NOPASSWD: \
|
||||
/usr/bin/systemctl restart scrum4me-web, \
|
||||
/usr/bin/systemctl restart ops-agent, \
|
||||
/usr/bin/systemctl restart caddy
|
||||
/usr/bin/systemctl restart caddy, \
|
||||
/srv/backups/scripts/wrappers/read-status.sh, \
|
||||
/srv/backups/scripts/wrappers/restic-snapshots.sh nas, \
|
||||
/srv/backups/scripts/wrappers/restic-snapshots.sh b2, \
|
||||
/srv/backups/scripts/wrappers/restic-stats.sh nas, \
|
||||
/srv/backups/scripts/wrappers/restic-stats.sh b2, \
|
||||
/srv/backups/scripts/wrappers/restic-check.sh nas, \
|
||||
/srv/backups/scripts/wrappers/restic-check.sh b2, \
|
||||
/srv/backups/scripts/wrappers/trigger-backup.sh, \
|
||||
/srv/backups/scripts/wrappers/trigger-restore-test.sh nas, \
|
||||
/srv/backups/scripts/wrappers/trigger-restore-test.sh b2
|
||||
|
|
|
|||
126
deploy/server-backup/README.md
Normal file
126
deploy/server-backup/README.md
Normal file
|
|
@ -0,0 +1,126 @@
|
|||
# Server backup — deploy artefacten
|
||||
|
||||
Dagelijkse server-brede backup met restic naar **NAS** (lokaal) en **Backblaze B2** (offsite, Object Lock). Inclusief structured statusfile die de ops-dashboard kan lezen.
|
||||
|
||||
De volledige beschrijving — voorwaarden, B2 keys, Object Lock, Forgejo-restore-test, integriteits-schedule — staat in [`docs/runbooks/server-backup.md`](../../docs/runbooks/server-backup.md).
|
||||
|
||||
## Bestanden
|
||||
|
||||
| Bestand | Doel | Plek op host |
|
||||
|---|---|---|
|
||||
| `server-backup.sh` | hoofd-script (phase-based, flock, statusfile) | `/srv/backups/scripts/server-backup.sh` |
|
||||
| `restore-test.sh` | restore latest snapshot + check critical files | `/srv/backups/scripts/restore-test.sh` |
|
||||
| `server-backup.service` | systemd oneshot | `/etc/systemd/system/server-backup.service` |
|
||||
| `server-backup.timer` | daily 03:30 + 10 min jitter | `/etc/systemd/system/server-backup.timer` |
|
||||
| `restic-backup.env.example` | env-template (repos, B2 keys, Forgejo) | kopiëren naar `/etc/restic-backup.env` |
|
||||
|
||||
Bovendien aan te maken (niet in deze repo, omdat het secrets zijn):
|
||||
|
||||
- `/etc/restic-backup.password` — alleen het restic-wachtwoord (mode `0400 root:root`).
|
||||
|
||||
## Snelle installatie (zie runbook voor alle context)
|
||||
|
||||
```bash
|
||||
# 1. Tools en directories
|
||||
sudo apt update && sudo apt install -y restic jq
|
||||
|
||||
sudo mkdir -p /srv/backups/scripts /srv/backups/logs /srv/backups/status \
|
||||
/var/backups/databases
|
||||
sudo chmod 0750 /srv/backups/logs /srv/backups/status
|
||||
|
||||
# 2. Scripts plaatsen
|
||||
sudo cp deploy/server-backup/server-backup.sh /srv/backups/scripts/
|
||||
sudo cp deploy/server-backup/restore-test.sh /srv/backups/scripts/
|
||||
sudo chmod 0750 /srv/backups/scripts/*.sh
|
||||
sudo chown root:root /srv/backups/scripts/*.sh
|
||||
|
||||
# 3. Env + password
|
||||
sudo cp deploy/server-backup/restic-backup.env.example /etc/restic-backup.env
|
||||
sudo chmod 0600 /etc/restic-backup.env
|
||||
sudo chown root:root /etc/restic-backup.env
|
||||
# Genereer wachtwoord — bewaar dit OOK in je password manager.
|
||||
sudo sh -c 'openssl rand -hex 24 > /etc/restic-backup.password'
|
||||
sudo chmod 0400 /etc/restic-backup.password
|
||||
|
||||
# 4. Vul /etc/restic-backup.env (RESTIC_REPO_NAS, RESTIC_REPO_B2,
|
||||
# B2_ACCOUNT_ID, B2_ACCOUNT_KEY, FORGEJO_*). Zie runbook deel A+B.
|
||||
|
||||
# 5. Repos initialiseren (zie runbook deel C voor Object Lock + key-capabilities)
|
||||
sudo -E bash -c 'set -a; . /etc/restic-backup.env; set +a; \
|
||||
export RESTIC_PASSWORD_FILE=/etc/restic-backup.password; \
|
||||
restic -r "$RESTIC_REPO_NAS" init && \
|
||||
restic -r "$RESTIC_REPO_B2" init'
|
||||
|
||||
# 6. Systemd
|
||||
sudo cp deploy/server-backup/server-backup.service /etc/systemd/system/
|
||||
sudo cp deploy/server-backup/server-backup.timer /etc/systemd/system/
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl enable --now server-backup.timer
|
||||
systemctl list-timers | grep server-backup
|
||||
|
||||
# 7. Eerste run handmatig (volgen via journalctl)
|
||||
sudo systemctl start server-backup.service
|
||||
journalctl -u server-backup.service -f
|
||||
```
|
||||
|
||||
## Verifiëren
|
||||
|
||||
```bash
|
||||
# Statusfile
|
||||
sudo jq . /srv/backups/status/last-run.json
|
||||
|
||||
# Snapshots
|
||||
sudo -E bash -c 'set -a; . /etc/restic-backup.env; set +a; \
|
||||
export RESTIC_PASSWORD_FILE=/etc/restic-backup.password; \
|
||||
restic -r "$RESTIC_REPO_NAS" snapshots; \
|
||||
restic -r "$RESTIC_REPO_B2" snapshots'
|
||||
|
||||
# Restore-test (NAS, niet-destructief — restored naar /tmp/restore-test)
|
||||
sudo /srv/backups/scripts/restore-test.sh nas
|
||||
sudo jq . /srv/backups/status/last-restore-test.json
|
||||
```
|
||||
|
||||
## Statusfile-schema
|
||||
|
||||
Het script schrijft `/srv/backups/status/last-run.json` na elke run (success of failure), atomisch via temp + `mv`. De ops-dashboard leest deze file via `read_backup_status` (zie `ops-agent/commands.yml.example`).
|
||||
|
||||
```json
|
||||
{
|
||||
"schema_version": 1,
|
||||
"overall_status": "success | partial_failure | failed",
|
||||
"started_at": "2026-05-15T03:30:00+02:00",
|
||||
"completed_at": "2026-05-15T03:48:21+02:00",
|
||||
"duration_seconds": 1101,
|
||||
"host": "scrum4me-srv",
|
||||
"phases": {
|
||||
"postgres_dump": { "status": "success", "exit_code": 0, "...": "..." },
|
||||
"forgejo_dump": { "status": "skipped", "exit_code": 99, "...": "..." },
|
||||
"forgejo_db_dump": { "status": "skipped", "exit_code": 99 },
|
||||
"restic_nas": { "status": "success", "exit_code": 0, "snapshot_id": "abc123" },
|
||||
"restic_b2": { "status": "degraded", "exit_code": 3, "error": "1 file unreadable" },
|
||||
"forget_nas": { "status": "success", "exit_code": 0 },
|
||||
"check_nas": { "status": "success", "exit_code": 0 },
|
||||
"check_b2": { "status": "success", "exit_code": 0 }
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Per phase `status`:
|
||||
|
||||
| status | betekenis | telt mee als |
|
||||
|---|---|---|
|
||||
| `success` | exit 0 | success |
|
||||
| `skipped` | exit 99 — phase niet van toepassing (bv. Forgejo niet geïnstalleerd) | success |
|
||||
| `degraded` | exit 3 — restic snapshot is gemaakt maar bepaalde files waren onleesbaar | partial_failure |
|
||||
| `failed` | andere non-zero exit | partial_failure of failed (zie `overall_status`) |
|
||||
| `pending` | phase niet gerund (script aborted vóór deze phase) | partial_failure |
|
||||
|
||||
`overall_status` regels:
|
||||
|
||||
- **`failed`** als `postgres_dump` faalt (DB-dump is autoritatief), of als **beide** restic repos falen.
|
||||
- **`partial_failure`** bij enige `failed` of `degraded` phase die niet kritisch is (bv. één restic repo down, of forgejo_dump faalt terwijl postgres lukt).
|
||||
- **`success`** als geen enkele phase `failed` of `degraded` is.
|
||||
|
||||
## Volgorde tov bestaande `ops-db-backup.timer`
|
||||
|
||||
De bestaande `deploy/ops-agent/ops-db-backup.timer` draait om **02:00** en doet alleen `pg_dump ops_dashboard` naar `/srv/ops/backups/`. Deze nieuwe `server-backup.timer` draait om **03:30** en pickt die map mee in zijn restic-backup. Beide blijven naast elkaar bestaan.
|
||||
44
deploy/server-backup/restic-backup.env.example
Normal file
44
deploy/server-backup/restic-backup.env.example
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
# Copy to /etc/restic-backup.env on the host. Permissions: 0600 root:root.
|
||||
# RESTIC_PASSWORD lives in /etc/restic-backup.password (mode 0400 root:root)
|
||||
# — the backup script sets RESTIC_PASSWORD_FILE from there, so the password
|
||||
# never appears in the process listing or this env file.
|
||||
|
||||
# ── Restic repositories ────────────────────────────────────────────────────
|
||||
# Local NAS path (must be mounted before the timer fires; see runbook).
|
||||
RESTIC_REPO_NAS=/mnt/backup-server/restic/scrum4me-srv
|
||||
|
||||
# Backblaze B2 repo, format: b2:<bucket-name>:<prefix>
|
||||
# Bucket must have Object Lock (Governance) with default retention >= 30 days.
|
||||
RESTIC_REPO_B2=b2:scrum4me-srv-backup:scrum4me-srv
|
||||
|
||||
# ── Backblaze B2 server key ────────────────────────────────────────────────
|
||||
# Capabilities REQUIRED: listBuckets, listFiles, readFiles, writeFiles
|
||||
# Capabilities FORBIDDEN: deleteFiles, deleteKeys, bypassGovernance
|
||||
# Create with:
|
||||
# b2 application-key create \
|
||||
# --bucket scrum4me-srv-backup \
|
||||
# --name-prefix scrum4me-srv \
|
||||
# server-backup-key \
|
||||
# listBuckets,listFiles,readFiles,writeFiles
|
||||
B2_ACCOUNT_ID=REPLACE_WITH_B2_KEY_ID
|
||||
B2_ACCOUNT_KEY=REPLACE_WITH_B2_APPLICATION_KEY
|
||||
|
||||
# ── Forgejo backup target (optional — set to skip if Forgejo not deployed) ─
|
||||
# Container name as it appears in `docker ps`. Set to "" or comment out to
|
||||
# skip the Forgejo phases entirely.
|
||||
FORGEJO_CONTAINER=forgejo
|
||||
# Path to app.ini INSIDE the Forgejo container (used by `forgejo dump -c`).
|
||||
FORGEJO_CONFIG=/data/gitea/conf/app.ini
|
||||
# Postgres database name for Forgejo (empty = use SQLite, skip forgejo_db_dump).
|
||||
FORGEJO_DB_NAME=forgejo
|
||||
# Postgres container + role for Forgejo's DB (defaults match scrum4me stack).
|
||||
FORGEJO_DB_CONTAINER=scrum4me-postgres
|
||||
FORGEJO_DB_USER=scrum4me
|
||||
|
||||
# ── Scrum4Me Postgres (required for postgres_dump phase) ───────────────────
|
||||
PG_CONTAINER=scrum4me-postgres
|
||||
PG_DUMPALL_USER=scrum4me
|
||||
|
||||
# ── Optional bandwidth limit for restic B2 upload (KiB/s; 0 = unlimited) ──
|
||||
# Translated by the script into `restic --limit-upload "$BACKUP_LIMIT_UPLOAD_KIB"`.
|
||||
# BACKUP_LIMIT_UPLOAD_KIB=5000
|
||||
177
deploy/server-backup/restore-test.sh
Normal file
177
deploy/server-backup/restore-test.sh
Normal file
|
|
@ -0,0 +1,177 @@
|
|||
#!/usr/bin/env bash
# Restore the latest restic snapshot to /tmp/restore-test/ and assert that a
# small set of critical files came back intact. Used by the monthly maintenance
# check and by the dashboard's "Restore test" button.
#
# Usage:
#   restore-test.sh [nas|b2]
#
# Default repo is "nas" (faster, no B2 download fees).
#
# Exit codes: 0 = success, 75 = partial_failure (restore ok, assertion failed),
# 1 = failed (no snapshot / restore error), 2 = bad usage.

umask 077
# Deliberately no `set -e`: restic/assertion failures are captured per step and
# reported through the statusfile instead of aborting mid-run.
set -uo pipefail

REPO_LABEL="${1:-nas}"
RESTORE_DIR="${RESTORE_DIR:-/tmp/restore-test}"
RESTIC_PASSWORD_FILE_PATH="${RESTIC_PASSWORD_FILE_PATH:-/etc/restic-backup.password}"
STATUS_FILE="${STATUS_FILE:-/srv/backups/status/last-restore-test.json}"
STATUS_DIR="$(dirname "$STATUS_FILE")"
STARTED_AT="$(date -Is)"
SECONDS=0

# Write the statusfile atomically (temp + mv, same as server-backup.sh) so a
# concurrent dashboard read never sees a half-written JSON document.
write_status_file() {
  local tmp
  tmp="$(mktemp "$STATUS_DIR/.last-restore-test.XXXXXX")"
  cat > "$tmp"
  chmod 0644 "$tmp"
  mv -f "$tmp" "$STATUS_FILE"
}

# Load env (idempotent: ok if already in environment).
if [ -z "${RESTIC_REPO_NAS:-}" ] && [ -r /etc/restic-backup.env ]; then
  # shellcheck disable=SC1091
  set -a; . /etc/restic-backup.env; set +a
fi

case "$REPO_LABEL" in
  nas) REPO="${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set}" ;;
  b2) REPO="${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set}" ;;
  *) echo "ERROR: repo label must be 'nas' or 'b2', got '$REPO_LABEL'" >&2; exit 2 ;;
esac

if [ ! -r "$RESTIC_PASSWORD_FILE_PATH" ]; then
  echo "ERROR: restic password file $RESTIC_PASSWORD_FILE_PATH not readable" >&2
  exit 1
fi
export RESTIC_PASSWORD_FILE="$RESTIC_PASSWORD_FILE_PATH"

for tool in jq restic; do
  command -v "$tool" >/dev/null 2>&1 || { echo "ERROR: '$tool' not on PATH" >&2; exit 1; }
done

mkdir -p "$STATUS_DIR"
chmod 0750 "$STATUS_DIR"

echo "════════════════════════════════════════════════════════════════"
echo " Restore test — started $STARTED_AT"
echo " Repo: $REPO_LABEL ($REPO)"
echo " Target: $RESTORE_DIR"
echo "════════════════════════════════════════════════════════════════"

# Clean previous attempt to keep results unambiguous.
rm -rf "$RESTORE_DIR"
mkdir -p "$RESTORE_DIR"

# Find latest snapshot id.
SNAPSHOT_ID=$(restic -r "$REPO" snapshots --json --latest 1 2>/dev/null \
  | jq -r '.[0].short_id // .[0].id // empty')

if [ -z "$SNAPSHOT_ID" ]; then
  echo "ERROR: no snapshots found in $REPO_LABEL repo"
  jq -n \
    --arg started "$STARTED_AT" \
    --arg completed "$(date -Is)" \
    --argjson duration "$SECONDS" \
    --arg repo "$REPO_LABEL" \
    '{
      schema_version: 1,
      overall_status: "failed",
      started_at: $started,
      completed_at: $completed,
      duration_seconds: $duration,
      repo: $repo,
      snapshot_id: null,
      error: "no snapshots in repo",
      assertions: []
    }' | write_status_file
  exit 1
fi

echo "Restoring snapshot $SNAPSHOT_ID …"
RESTORE_RC=0
restic -r "$REPO" restore "$SNAPSHOT_ID" --target "$RESTORE_DIR" || RESTORE_RC=$?

if [ "$RESTORE_RC" -ne 0 ]; then
  echo "ERROR: restic restore exited $RESTORE_RC"
fi

# Assertions: each is a path that MUST exist and be non-empty.
# Adjust to your stack after first run (and update the runbook addendum).
ASSERTION_PATHS=(
  "$RESTORE_DIR/srv/scrum4me/compose/docker-compose.yml"
  "$RESTORE_DIR/srv/scrum4me/caddy/Caddyfile"
  "$RESTORE_DIR/etc/restic-backup.env"
)

# Latest postgres dump — match the newest file (glob may resolve to zero).
shopt -s nullglob
PG_DUMPS=("$RESTORE_DIR/var/backups/databases/"postgres-*.sql.gz)
shopt -u nullglob
if [ "${#PG_DUMPS[@]}" -gt 0 ]; then
  # pick lexicographic last (= newest date, ISO format)
  LATEST_PG="${PG_DUMPS[-1]}"
  ASSERTION_PATHS+=("$LATEST_PG")
fi

ASSERTIONS_JSON='[]'
ANY_FAILED=0
for p in "${ASSERTION_PATHS[@]}"; do
  if [ -s "$p" ]; then
    status="ok"
    bytes=$(stat -c %s "$p")
    echo "  ✓ $p ($bytes bytes)"
  elif [ -e "$p" ]; then
    status="empty"
    bytes=0
    ANY_FAILED=1
    echo "  ✗ $p (exists but empty)"
  else
    status="missing"
    bytes=0
    ANY_FAILED=1
    echo "  ✗ $p (missing)"
  fi
  ASSERTIONS_JSON=$(jq -c \
    --arg path "$p" \
    --arg status "$status" \
    --argjson bytes "$bytes" \
    '. + [{path: $path, status: $status, bytes: $bytes}]' \
    <<< "$ASSERTIONS_JSON")
done

if [ "$RESTORE_RC" -ne 0 ]; then
  OVERALL="failed"
elif [ "$ANY_FAILED" -ne 0 ]; then
  OVERALL="partial_failure"
else
  OVERALL="success"
fi

# $RESTORE_DIR is bound with --arg (not spliced into the single-quoted jq
# program) so a path containing quotes or backslashes cannot break the JSON
# or inject into the statusfile.
jq -n \
  --arg started "$STARTED_AT" \
  --arg completed "$(date -Is)" \
  --argjson duration "$SECONDS" \
  --arg repo "$REPO_LABEL" \
  --arg snapshot "$SNAPSHOT_ID" \
  --arg overall "$OVERALL" \
  --arg target "$RESTORE_DIR" \
  --argjson restore_exit "$RESTORE_RC" \
  --argjson assertions "$ASSERTIONS_JSON" \
  '{
    schema_version: 1,
    overall_status: $overall,
    started_at: $started,
    completed_at: $completed,
    duration_seconds: $duration,
    repo: $repo,
    snapshot_id: $snapshot,
    restore_exit_code: $restore_exit,
    target: $target,
    assertions: $assertions
  }' | write_status_file

echo ""
echo "════════════════════════════════════════════════════════════════"
echo " Restore test — finished $(date -Is)"
echo " Overall: $OVERALL"
echo " Status file: $STATUS_FILE"
echo "════════════════════════════════════════════════════════════════"

case "$OVERALL" in
  success) exit 0 ;;
  partial_failure) exit 75 ;;
  failed|*) exit 1 ;;
esac
|
||||
33
deploy/server-backup/server-backup.service
Normal file
33
deploy/server-backup/server-backup.service
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
[Unit]
|
||||
Description=Server-wide backup (pg_dumpall + restic to NAS + B2)
|
||||
Documentation=file:///srv/ops/repos/ops-dashboard/docs/runbooks/server-backup.md
|
||||
After=network-online.target docker.service
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
EnvironmentFile=/etc/restic-backup.env
|
||||
ExecStart=/srv/backups/scripts/server-backup.sh
|
||||
TimeoutStartSec=4h
|
||||
RuntimeMaxSec=6h
|
||||
Nice=10
|
||||
IOSchedulingClass=best-effort
|
||||
IOSchedulingPriority=7
|
||||
# Sandboxing — backup needs root for /etc + docker exec, but limit the rest.
|
||||
ProtectSystem=strict
|
||||
ReadWritePaths=/var/backups /srv/backups /run /tmp
|
||||
ProtectHome=read-only
|
||||
NoNewPrivileges=yes
|
||||
PrivateTmp=yes
|
||||
ProtectKernelTunables=yes
|
||||
ProtectKernelModules=yes
|
||||
ProtectControlGroups=yes
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
SyslogIdentifier=server-backup
|
||||
|
||||
# Exit code semantics from server-backup.sh:
|
||||
# 0 = success (all phases ok)
|
||||
# 75 = partial_failure (some non-critical phase failed/degraded)
|
||||
# 1 = failed (a critical dump phase failed or both restic repos failed)
|
||||
SuccessExitStatus=75
|
||||
497
deploy/server-backup/server-backup.sh
Normal file
497
deploy/server-backup/server-backup.sh
Normal file
|
|
@ -0,0 +1,497 @@
|
|||
#!/usr/bin/env bash
|
||||
# Daily server-wide backup: dumps databases, runs restic to NAS + B2,
|
||||
# writes a structured statusfile that the ops-dashboard can read.
|
||||
#
|
||||
# Install:
|
||||
# cp deploy/server-backup/server-backup.sh /srv/backups/scripts/server-backup.sh
|
||||
# chmod 0750 /srv/backups/scripts/server-backup.sh
|
||||
# chown root:root /srv/backups/scripts/server-backup.sh
|
||||
#
|
||||
# Requires: bash, jq, flock, restic, docker, gzip. See runbook for setup.
|
||||
|
||||
umask 077
|
||||
set -uo pipefail
|
||||
|
||||
# ── Configuration ──────────────────────────────────────────────────────────
|
||||
STATUS_DIR="${STATUS_DIR:-/srv/backups/status}"
|
||||
LOG_DIR="${LOG_DIR:-/srv/backups/logs}"
|
||||
DB_DUMP_DIR="${DB_DUMP_DIR:-/var/backups/databases}"
|
||||
RESTIC_PASSWORD_FILE_PATH="${RESTIC_PASSWORD_FILE_PATH:-/etc/restic-backup.password}"
|
||||
LOCKFILE="${LOCKFILE:-/run/server-backup.lock}"
|
||||
RUN_DATE="$(date +%F)"
|
||||
STARTED_AT="$(date -Is)"
|
||||
SECONDS=0
|
||||
|
||||
# Phase order — must match write_status_json + determine_exit_code expectations.
|
||||
PHASE_ORDER=(
|
||||
postgres_dump
|
||||
forgejo_dump
|
||||
forgejo_db_dump
|
||||
restic_nas
|
||||
restic_b2
|
||||
forget_nas
|
||||
check_nas
|
||||
check_b2
|
||||
)
|
||||
|
||||
declare -A PHASE_STATUS PHASE_EXIT PHASE_START PHASE_END PHASE_ERR PHASE_EXTRA
|
||||
OVERALL_STATUS="unknown"
|
||||
|
||||
# ── Single-instance lock ───────────────────────────────────────────────────
|
||||
exec 9>"$LOCKFILE" || { echo "ERROR: cannot open lockfile $LOCKFILE" >&2; exit 1; }
|
||||
if ! flock -n 9; then
|
||||
echo "ERROR: another server-backup is already running (lock $LOCKFILE held)" >&2
|
||||
exit 75
|
||||
fi
|
||||
|
||||
# ── Env + secret loading ───────────────────────────────────────────────────
|
||||
# When invoked via systemd, EnvironmentFile=/etc/restic-backup.env has already
|
||||
# been loaded. When invoked manually for testing, source it ourselves.
|
||||
if [ -z "${RESTIC_REPO_NAS:-}" ] && [ -r /etc/restic-backup.env ]; then
|
||||
# shellcheck disable=SC1091
|
||||
set -a; . /etc/restic-backup.env; set +a
|
||||
fi
|
||||
|
||||
: "${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set (see /etc/restic-backup.env)}"
|
||||
: "${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set (see /etc/restic-backup.env)}"
|
||||
|
||||
if [ ! -r "$RESTIC_PASSWORD_FILE_PATH" ]; then
|
||||
echo "ERROR: restic password file $RESTIC_PASSWORD_FILE_PATH not readable" >&2
|
||||
exit 1
|
||||
fi
|
||||
export RESTIC_PASSWORD_FILE="$RESTIC_PASSWORD_FILE_PATH"
|
||||
|
||||
# Required tooling
|
||||
for tool in jq restic docker gzip flock; do
|
||||
if ! command -v "$tool" >/dev/null 2>&1; then
|
||||
echo "ERROR: required tool '$tool' not on PATH" >&2
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
# ── Logging ────────────────────────────────────────────────────────────────
|
||||
mkdir -p "$LOG_DIR" "$STATUS_DIR" "$DB_DUMP_DIR"
|
||||
chmod 0750 "$LOG_DIR" "$STATUS_DIR"
|
||||
LOG_FILE="$LOG_DIR/server-backup-$RUN_DATE.log"
|
||||
# Mirror everything to LOG_FILE and the journal.
|
||||
exec > >(tee -a "$LOG_FILE") 2>&1
|
||||
|
||||
echo "════════════════════════════════════════════════════════════════"
|
||||
echo " Server backup — started $STARTED_AT"
|
||||
echo " Host: $(hostname)"
|
||||
echo " NAS repo: $RESTIC_REPO_NAS"
|
||||
echo " B2 repo: $RESTIC_REPO_B2"
|
||||
echo "════════════════════════════════════════════════════════════════"
|
||||
|
||||
# ── Phase runner ───────────────────────────────────────────────────────────
|
||||
# Runs the function passed as first arg, captures stdout+stderr into a phase
|
||||
# buffer, records status / exit_code / timestamps / error tail.
|
||||
run_phase() {
|
||||
local name="$1"; shift
|
||||
local phase_buf
|
||||
phase_buf=$(mktemp -t "backup-phase-${name}.XXXXXX")
|
||||
|
||||
echo ""
|
||||
echo "─── phase: $name ─── $(date -Is)"
|
||||
PHASE_START[$name]=$(date -Is)
|
||||
|
||||
local rc=0
|
||||
# Run in a sub-shell so set -e inside callees doesn't kill us.
|
||||
(
|
||||
"$@"
|
||||
) 2>&1 | tee "$phase_buf"
|
||||
rc=${PIPESTATUS[0]}
|
||||
|
||||
PHASE_EXIT[$name]=$rc
|
||||
case "$rc" in
|
||||
0) PHASE_STATUS[$name]=success ;;
|
||||
3) PHASE_STATUS[$name]=degraded ;; # restic: snapshot created but some files unreadable
|
||||
99) PHASE_STATUS[$name]=skipped ;; # our convention for "not applicable"
|
||||
*) PHASE_STATUS[$name]=failed ;;
|
||||
esac
|
||||
|
||||
if [ "$rc" -ne 0 ] && [ "$rc" -ne 99 ] && [ -s "$phase_buf" ]; then
|
||||
# Keep last few non-empty lines as a compact error summary.
|
||||
PHASE_ERR[$name]=$(tail -n 5 "$phase_buf" | tr '\n' ' ' | head -c 500)
|
||||
fi
|
||||
|
||||
PHASE_END[$name]=$(date -Is)
|
||||
rm -f "$phase_buf"
|
||||
echo "─── end $name (exit=$rc, status=${PHASE_STATUS[$name]})"
|
||||
}
|
||||
|
||||
# Convention: a phase function returns 99 to mark itself "skipped" — the
|
||||
# overall outcome treats this as success.
|
||||
SKIPPED=99
|
||||
|
||||
# ── Phase 1: pg_dumpall (Scrum4Me Postgres cluster) ────────────────────────
|
||||
dump_postgres_all() {
|
||||
local pg_container="${PG_CONTAINER:-scrum4me-postgres}"
|
||||
local pg_user="${PG_DUMPALL_USER:-scrum4me}"
|
||||
|
||||
if ! docker ps --format '{{.Names}}' | grep -qx "$pg_container"; then
|
||||
echo "Postgres container '$pg_container' not running — cannot continue."
|
||||
return 1
|
||||
fi
|
||||
|
||||
local tmp="$DB_DUMP_DIR/.postgres-$RUN_DATE.sql.gz.tmp"
|
||||
local final="$DB_DUMP_DIR/postgres-$RUN_DATE.sql.gz"
|
||||
rm -f "$tmp"
|
||||
|
||||
set -o pipefail
|
||||
docker exec "$pg_container" pg_dumpall -U "$pg_user" --clean --if-exists \
|
||||
| gzip -c > "$tmp"
|
||||
local rc=$?
|
||||
set +o pipefail
|
||||
|
||||
if [ "$rc" -ne 0 ]; then
|
||||
rm -f "$tmp"
|
||||
return "$rc"
|
||||
fi
|
||||
|
||||
mv "$tmp" "$final"
|
||||
chmod 0640 "$final"
|
||||
local bytes
|
||||
bytes=$(stat -c %s "$final" 2>/dev/null || echo 0)
|
||||
PHASE_EXTRA[postgres_dump]="output_file=$final;bytes=$bytes"
|
||||
echo "wrote $final ($bytes bytes)"
|
||||
}
|
||||
|
||||
# ── Phase 2: Forgejo dump (filesystem + repos) ─────────────────────────────
|
||||
dump_forgejo() {
|
||||
local fj="${FORGEJO_CONTAINER:-}"
|
||||
if [ -z "$fj" ]; then
|
||||
echo "FORGEJO_CONTAINER unset — skipping Forgejo dump."
|
||||
return "$SKIPPED"
|
||||
fi
|
||||
if ! docker ps --format '{{.Names}}' | grep -qx "$fj"; then
|
||||
echo "Forgejo container '$fj' not running — skipping."
|
||||
return "$SKIPPED"
|
||||
fi
|
||||
|
||||
local config="${FORGEJO_CONFIG:-/data/gitea/conf/app.ini}"
|
||||
local tmp="$DB_DUMP_DIR/.forgejo-$RUN_DATE.zip.tmp"
|
||||
local final="$DB_DUMP_DIR/forgejo-$RUN_DATE.zip"
|
||||
rm -f "$tmp"
|
||||
|
||||
# `forgejo dump -f -` streams the zip to stdout. We run as the `git` user
|
||||
# inside the container (standard Forgejo image convention).
|
||||
set -o pipefail
|
||||
docker exec -u git "$fj" forgejo dump --skip-db -c "$config" --type zip -f - > "$tmp"
|
||||
local rc=$?
|
||||
set +o pipefail
|
||||
|
||||
if [ "$rc" -ne 0 ]; then
|
||||
rm -f "$tmp"
|
||||
return "$rc"
|
||||
fi
|
||||
|
||||
mv "$tmp" "$final"
|
||||
chmod 0640 "$final"
|
||||
local bytes
|
||||
bytes=$(stat -c %s "$final" 2>/dev/null || echo 0)
|
||||
PHASE_EXTRA[forgejo_dump]="output_file=$final;bytes=$bytes"
|
||||
echo "wrote $final ($bytes bytes)"
|
||||
}
|
||||
|
||||
# ── Phase 3: Forgejo Postgres DB dump (authoritative for DB restore) ───────
|
||||
dump_forgejo_db() {
|
||||
local db_name="${FORGEJO_DB_NAME:-}"
|
||||
if [ -z "$db_name" ]; then
|
||||
echo "FORGEJO_DB_NAME unset — skipping Forgejo DB dump (assume SQLite)."
|
||||
return "$SKIPPED"
|
||||
fi
|
||||
local db_container="${FORGEJO_DB_CONTAINER:-scrum4me-postgres}"
|
||||
local db_user="${FORGEJO_DB_USER:-scrum4me}"
|
||||
|
||||
if ! docker ps --format '{{.Names}}' | grep -qx "$db_container"; then
|
||||
echo "DB container '$db_container' not running — skipping Forgejo DB dump."
|
||||
return "$SKIPPED"
|
||||
fi
|
||||
|
||||
local tmp="$DB_DUMP_DIR/.forgejo-db-$RUN_DATE.sql.gz.tmp"
|
||||
local final="$DB_DUMP_DIR/forgejo-db-$RUN_DATE.sql.gz"
|
||||
rm -f "$tmp"
|
||||
|
||||
set -o pipefail
|
||||
docker exec "$db_container" pg_dump -U "$db_user" --clean --if-exists "$db_name" \
|
||||
| gzip -c > "$tmp"
|
||||
local rc=$?
|
||||
set +o pipefail
|
||||
|
||||
if [ "$rc" -ne 0 ]; then
|
||||
rm -f "$tmp"
|
||||
return "$rc"
|
||||
fi
|
||||
|
||||
mv "$tmp" "$final"
|
||||
chmod 0640 "$final"
|
||||
local bytes
|
||||
bytes=$(stat -c %s "$final" 2>/dev/null || echo 0)
|
||||
PHASE_EXTRA[forgejo_db_dump]="output_file=$final;bytes=$bytes"
|
||||
echo "wrote $final ($bytes bytes)"
|
||||
}
|
||||
|
||||
# ── Phases 4 + 5: restic backup to NAS / B2 ────────────────────────────────
|
||||
# Live Docker datadirs are excluded — dumps (above) are the authoritative
|
||||
# restore source for Postgres and Forgejo.
|
||||
RESTIC_BACKUP_PATHS=(
|
||||
/etc
|
||||
/home/janpeter
|
||||
/root
|
||||
/opt
|
||||
/srv
|
||||
/usr/local/bin
|
||||
"$DB_DUMP_DIR"
|
||||
/srv/ops/backups
|
||||
)
|
||||
RESTIC_EXCLUDES=(
|
||||
--exclude='**/node_modules'
|
||||
--exclude='**/.next/cache'
|
||||
--exclude='**/.cache'
|
||||
--exclude='**/.git/objects/pack'
|
||||
--exclude='/srv/backups/logs'
|
||||
--exclude='/tmp'
|
||||
--exclude='/var/tmp'
|
||||
--exclude='/srv/scrum4me/postgres' # live Postgres datadir — non-authoritative
|
||||
--exclude='/srv/forgejo/data/git' # live Forgejo git objects — non-authoritative
|
||||
--exclude='/srv/forgejo/data/lfs'
|
||||
--exclude='/srv/forgejo/data/queues'
|
||||
)
|
||||
|
||||
restic_backup_to() {
|
||||
local repo="$1"; local label="$2"
|
||||
local extra_args=()
|
||||
if [ "$label" = "b2" ] && [ -n "${BACKUP_LIMIT_UPLOAD_KIB:-}" ]; then
|
||||
extra_args+=(--limit-upload "$BACKUP_LIMIT_UPLOAD_KIB")
|
||||
fi
|
||||
|
||||
# Capture restic JSON output so we can extract the snapshot id.
|
||||
local json_out
|
||||
json_out=$(mktemp -t "restic-backup-${label}.XXXXXX.json")
|
||||
|
||||
# --no-scan keeps the lockfile interaction light; --skip-if-unchanged still
|
||||
# records a snapshot per restic semantics so the dashboard sees a daily entry.
|
||||
restic -r "$repo" backup \
|
||||
--tag scheduled \
|
||||
--tag "host=$(hostname)" \
|
||||
--json \
|
||||
"${extra_args[@]}" \
|
||||
"${RESTIC_EXCLUDES[@]}" \
|
||||
"${RESTIC_BACKUP_PATHS[@]}" \
|
||||
| tee "$json_out"
|
||||
local rc=${PIPESTATUS[0]}
|
||||
|
||||
# Extract snapshot id from the final summary line (last JSON object of type=summary).
|
||||
local snap
|
||||
snap=$(jq -rs 'map(select(.message_type=="summary")) | last | .snapshot_id // empty' < "$json_out" 2>/dev/null || true)
|
||||
local files_new
|
||||
files_new=$(jq -rs 'map(select(.message_type=="summary")) | last | .files_new // empty' < "$json_out" 2>/dev/null || true)
|
||||
local data_added
|
||||
data_added=$(jq -rs 'map(select(.message_type=="summary")) | last | .data_added // empty' < "$json_out" 2>/dev/null || true)
|
||||
|
||||
if [ -n "$snap" ]; then
|
||||
PHASE_EXTRA["restic_$label"]="snapshot_id=$snap;files_new=${files_new:-0};data_added_bytes=${data_added:-0}"
|
||||
fi
|
||||
|
||||
rm -f "$json_out"
|
||||
return "$rc"
|
||||
}
|
||||
|
||||
# ── Phase 6: prune NAS only (B2 is Object Lock — pruning runs off-server) ──
|
||||
restic_forget_nas() {
|
||||
restic -r "$RESTIC_REPO_NAS" forget \
|
||||
--keep-daily 7 \
|
||||
--keep-weekly 4 \
|
||||
--keep-monthly 12 \
|
||||
--prune
|
||||
}
|
||||
|
||||
# ── Phase 7: integrity check (light daily; weekly read-data-subset on Sun) ─
|
||||
is_sunday() {
|
||||
[ "$(date +%u)" = "7" ]
|
||||
}
|
||||
|
||||
restic_check_nas() {
|
||||
if is_sunday; then
|
||||
restic -r "$RESTIC_REPO_NAS" check --read-data-subset=2.5%
|
||||
else
|
||||
restic -r "$RESTIC_REPO_NAS" check
|
||||
fi
|
||||
}
|
||||
|
||||
restic_check_b2() {
|
||||
if is_sunday; then
|
||||
# On B2 a read-data-subset costs bandwidth + B2 download fees. Keep the
|
||||
# subset tiny on Sundays; deeper checks run monthly off-server.
|
||||
restic -r "$RESTIC_REPO_B2" check --read-data-subset=1%
|
||||
else
|
||||
restic -r "$RESTIC_REPO_B2" check
|
||||
fi
|
||||
}
|
||||
|
||||
# ── Statusfile writer ──────────────────────────────────────────────────────
|
||||
# Builds a structured JSON statusfile in /srv/backups/status/last-run.json
|
||||
# atomically (write to tmp, then mv).
|
||||
write_status_json() {
|
||||
local tmpfile
|
||||
tmpfile=$(mktemp -t "backup-status.XXXXXX.json")
|
||||
|
||||
# Build the phases object incrementally with jq for safe escaping.
|
||||
local phases_json='{}'
|
||||
local name status exit_code started ended err extra
|
||||
local snapshot_id files_new data_added output_file bytes
|
||||
for name in "${PHASE_ORDER[@]}"; do
|
||||
status="${PHASE_STATUS[$name]:-pending}"
|
||||
exit_code="${PHASE_EXIT[$name]:-}"
|
||||
started="${PHASE_START[$name]:-}"
|
||||
ended="${PHASE_END[$name]:-}"
|
||||
err="${PHASE_ERR[$name]:-}"
|
||||
extra="${PHASE_EXTRA[$name]:-}"
|
||||
|
||||
snapshot_id=""
|
||||
files_new=""
|
||||
data_added=""
|
||||
output_file=""
|
||||
bytes=""
|
||||
if [ -n "$extra" ]; then
|
||||
# extra is a semicolon-separated list of key=value pairs
|
||||
local pair key val
|
||||
IFS=';' read -ra pairs <<< "$extra"
|
||||
for pair in "${pairs[@]}"; do
|
||||
key="${pair%%=*}"
|
||||
val="${pair#*=}"
|
||||
case "$key" in
|
||||
snapshot_id) snapshot_id="$val" ;;
|
||||
files_new) files_new="$val" ;;
|
||||
data_added_bytes) data_added="$val" ;;
|
||||
output_file) output_file="$val" ;;
|
||||
bytes) bytes="$val" ;;
|
||||
esac
|
||||
done
|
||||
fi
|
||||
|
||||
# exit_code as JSON number when present, null otherwise.
|
||||
local exit_arg='null'
|
||||
if [ -n "$exit_code" ]; then
|
||||
exit_arg="$exit_code"
|
||||
fi
|
||||
|
||||
phases_json=$(
|
||||
jq -c -n \
|
||||
--argjson base "$phases_json" \
|
||||
--arg name "$name" \
|
||||
--arg status "$status" \
|
||||
--argjson exit_code "$exit_arg" \
|
||||
--arg started "$started" \
|
||||
--arg ended "$ended" \
|
||||
--arg err "$err" \
|
||||
--arg snapshot_id "$snapshot_id" \
|
||||
--arg files_new "$files_new" \
|
||||
--arg data_added "$data_added" \
|
||||
--arg output_file "$output_file" \
|
||||
--arg bytes "$bytes" \
|
||||
'
|
||||
$base + {
|
||||
($name): ({
|
||||
status: $status,
|
||||
exit_code: $exit_code,
|
||||
started_at: (if $started == "" then null else $started end),
|
||||
completed_at: (if $ended == "" then null else $ended end),
|
||||
error: (if $err == "" then null else $err end)
|
||||
}
|
||||
+ (if $snapshot_id != "" then { snapshot_id: $snapshot_id } else {} end)
|
||||
+ (if $files_new != "" then { files_new: ($files_new | tonumber? // null) } else {} end)
|
||||
+ (if $data_added != "" then { data_added_bytes: ($data_added | tonumber? // null) } else {} end)
|
||||
+ (if $output_file != "" then { output_file: $output_file } else {} end)
|
||||
+ (if $bytes != "" then { bytes: ($bytes | tonumber? // null) } else {} end))
|
||||
}'
|
||||
)
|
||||
done
|
||||
|
||||
jq -n \
|
||||
--arg overall "$OVERALL_STATUS" \
|
||||
--arg started "$STARTED_AT" \
|
||||
--arg completed "$(date -Is)" \
|
||||
--argjson duration "$SECONDS" \
|
||||
--arg host "$(hostname)" \
|
||||
--argjson phases "$phases_json" \
|
||||
'{
|
||||
schema_version: 1,
|
||||
overall_status: $overall,
|
||||
started_at: $started,
|
||||
completed_at: $completed,
|
||||
duration_seconds: $duration,
|
||||
host: $host,
|
||||
phases: $phases
|
||||
}' > "$tmpfile"
|
||||
|
||||
mv "$tmpfile" "$STATUS_DIR/last-run.json"
|
||||
chmod 0644 "$STATUS_DIR/last-run.json"
|
||||
}
|
||||
|
||||
# ── Outcome aggregation ────────────────────────────────────────────────────
|
||||
# success → exit 0
|
||||
# partial_failure → exit 75 (visible but distinguishable from hard failure)
|
||||
# failed → exit 1
|
||||
determine_exit_code() {
|
||||
local critical_failure=false
|
||||
local has_failure=false
|
||||
local has_degraded=false
|
||||
local name status
|
||||
|
||||
for name in "${PHASE_ORDER[@]}"; do
|
||||
status="${PHASE_STATUS[$name]:-pending}"
|
||||
case "$status" in
|
||||
success|skipped) ;;
|
||||
degraded) has_degraded=true ;;
|
||||
failed)
|
||||
has_failure=true
|
||||
case "$name" in
|
||||
postgres_dump) critical_failure=true ;; # losing the DB dump is catastrophic
|
||||
esac
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Losing BOTH restic repos is also catastrophic.
|
||||
if [ "${PHASE_STATUS[restic_nas]:-}" = "failed" ] \
|
||||
&& [ "${PHASE_STATUS[restic_b2]:-}" = "failed" ]; then
|
||||
critical_failure=true
|
||||
fi
|
||||
|
||||
if [ "$critical_failure" = true ]; then
|
||||
OVERALL_STATUS="failed"
|
||||
echo 1
|
||||
elif [ "$has_failure" = true ] || [ "$has_degraded" = true ]; then
|
||||
OVERALL_STATUS="partial_failure"
|
||||
echo 75
|
||||
else
|
||||
OVERALL_STATUS="success"
|
||||
echo 0
|
||||
fi
|
||||
}
|
||||
|
||||
# ── Main sequence ──────────────────────────────────────────────────────────
|
||||
run_phase postgres_dump dump_postgres_all
|
||||
run_phase forgejo_dump dump_forgejo
|
||||
run_phase forgejo_db_dump dump_forgejo_db
|
||||
run_phase restic_nas restic_backup_to "$RESTIC_REPO_NAS" nas
|
||||
run_phase restic_b2 restic_backup_to "$RESTIC_REPO_B2" b2
|
||||
run_phase forget_nas restic_forget_nas
|
||||
run_phase check_nas restic_check_nas
|
||||
run_phase check_b2 restic_check_b2
|
||||
|
||||
EXIT_CODE=$(determine_exit_code)
|
||||
write_status_json
|
||||
|
||||
echo ""
|
||||
echo "════════════════════════════════════════════════════════════════"
|
||||
echo " Server backup — finished $(date -Is)"
|
||||
echo " Overall status: $OVERALL_STATUS (exit $EXIT_CODE)"
|
||||
echo " Duration: ${SECONDS}s"
|
||||
echo " Status file: $STATUS_DIR/last-run.json"
|
||||
echo " Log file: $LOG_FILE"
|
||||
echo "════════════════════════════════════════════════════════════════"
|
||||
|
||||
exit "$EXIT_CODE"
|
||||
12
deploy/server-backup/server-backup.timer
Normal file
12
deploy/server-backup/server-backup.timer
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
[Unit]
|
||||
Description=Daily server-wide backup (timer)
|
||||
|
||||
[Timer]
|
||||
# Daily at 03:30 local. After ops-db-backup.timer (02:00) so the ops_dashboard
|
||||
# pg_dump from /srv/ops/backups/ is fresh when restic picks it up.
|
||||
OnCalendar=*-*-* 03:30:00
|
||||
Persistent=true
|
||||
RandomizedDelaySec=600
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
25
deploy/server-backup/wrappers/read-status.sh
Normal file
25
deploy/server-backup/wrappers/read-status.sh
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
#!/usr/bin/env bash
|
||||
# Read /srv/backups/status/last-run.json. Returns "{}" if missing, so the
|
||||
# dashboard can render an "unknown" state instead of erroring.
|
||||
|
||||
set -uo pipefail
|
||||
|
||||
STATUS_FILE="${STATUS_FILE:-/srv/backups/status/last-run.json}"
|
||||
RESTORE_STATUS_FILE="${RESTORE_STATUS_FILE:-/srv/backups/status/last-restore-test.json}"
|
||||
|
||||
# We emit a small wrapper object with both files so the UI can render the
|
||||
# server-backup status AND the most recent restore-test status from one call.
|
||||
last_run='{}'
|
||||
if [ -r "$STATUS_FILE" ]; then
|
||||
last_run=$(cat "$STATUS_FILE")
|
||||
fi
|
||||
|
||||
last_restore='null'
|
||||
if [ -r "$RESTORE_STATUS_FILE" ]; then
|
||||
last_restore=$(cat "$RESTORE_STATUS_FILE")
|
||||
fi
|
||||
|
||||
jq -n \
|
||||
--argjson last_run "$last_run" \
|
||||
--argjson last_restore "$last_restore" \
|
||||
'{ last_run: $last_run, last_restore_test: $last_restore }'
|
||||
24
deploy/server-backup/wrappers/restic-check.sh
Normal file
24
deploy/server-backup/wrappers/restic-check.sh
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
#!/usr/bin/env bash
|
||||
# Run a light restic integrity check on the given repo.
|
||||
# Usage: restic-check.sh nas|b2
|
||||
|
||||
set -uo pipefail
|
||||
|
||||
LABEL="${1:-}"
|
||||
if [ "$LABEL" != "nas" ] && [ "$LABEL" != "b2" ]; then
|
||||
echo "label must be nas or b2" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
if [ -z "${RESTIC_REPO_NAS:-}" ] && [ -r /etc/restic-backup.env ]; then
|
||||
set -a; . /etc/restic-backup.env; set +a
|
||||
fi
|
||||
|
||||
case "$LABEL" in
|
||||
nas) REPO="${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set}" ;;
|
||||
b2) REPO="${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set}" ;;
|
||||
esac
|
||||
|
||||
export RESTIC_PASSWORD_FILE="${RESTIC_PASSWORD_FILE:-/etc/restic-backup.password}"
|
||||
|
||||
restic -r "$REPO" check
|
||||
39
deploy/server-backup/wrappers/restic-snapshots.sh
Normal file
39
deploy/server-backup/wrappers/restic-snapshots.sh
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
#!/usr/bin/env bash
|
||||
# List recent restic snapshots from a labelled repo. Output: JSON array.
|
||||
# Usage: restic-snapshots.sh nas|b2
|
||||
|
||||
set -uo pipefail
|
||||
|
||||
LABEL="${1:-}"
|
||||
if [ "$LABEL" != "nas" ] && [ "$LABEL" != "b2" ]; then
|
||||
echo '{"error":"label must be nas or b2"}' >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
# Load env (idempotent — systemd already loaded it for service contexts).
|
||||
if [ -z "${RESTIC_REPO_NAS:-}" ] && [ -r /etc/restic-backup.env ]; then
|
||||
set -a; . /etc/restic-backup.env; set +a
|
||||
fi
|
||||
|
||||
case "$LABEL" in
|
||||
nas) REPO="${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set}" ;;
|
||||
b2) REPO="${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set}" ;;
|
||||
esac
|
||||
|
||||
export RESTIC_PASSWORD_FILE="${RESTIC_PASSWORD_FILE:-/etc/restic-backup.password}"
|
||||
|
||||
# Show last 30 snapshots, newest first, with the fields the UI needs.
|
||||
restic -r "$REPO" snapshots --json 2>/dev/null \
|
||||
| jq --arg repo "$LABEL" '
|
||||
sort_by(.time) | reverse | .[0:30]
|
||||
| map({
|
||||
id: .id,
|
||||
short_id: (.short_id // (.id[0:8])),
|
||||
time: .time,
|
||||
hostname: .hostname,
|
||||
tags: (.tags // []),
|
||||
paths: (.paths // []),
|
||||
summary: (.summary // null),
|
||||
repo: $repo
|
||||
})
|
||||
'
|
||||
51
deploy/server-backup/wrappers/restic-stats.sh
Normal file
51
deploy/server-backup/wrappers/restic-stats.sh
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
#!/usr/bin/env bash
|
||||
# Repo stats: combines restic stats in two modes plus snapshot count.
|
||||
# Output: JSON object with restore_size_bytes, raw_data_bytes, dedup_ratio.
|
||||
# Usage: restic-stats.sh nas|b2
|
||||
|
||||
set -uo pipefail
|
||||
|
||||
LABEL="${1:-}"
|
||||
if [ "$LABEL" != "nas" ] && [ "$LABEL" != "b2" ]; then
|
||||
echo '{"error":"label must be nas or b2"}' >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
if [ -z "${RESTIC_REPO_NAS:-}" ] && [ -r /etc/restic-backup.env ]; then
|
||||
set -a; . /etc/restic-backup.env; set +a
|
||||
fi
|
||||
|
||||
case "$LABEL" in
|
||||
nas) REPO="${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set}" ;;
|
||||
b2) REPO="${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set}" ;;
|
||||
esac
|
||||
|
||||
export RESTIC_PASSWORD_FILE="${RESTIC_PASSWORD_FILE:-/etc/restic-backup.password}"
|
||||
|
||||
# restore-size: total bytes if every file in every snapshot were re-extracted.
|
||||
restore_json=$(restic -r "$REPO" stats --mode restore-size --json 2>/dev/null || echo '{}')
|
||||
# raw-data: total unique blob bytes after dedup + compression.
|
||||
raw_json=$(restic -r "$REPO" stats --mode raw-data --json 2>/dev/null || echo '{}')
|
||||
# Snapshot count for the same repo.
|
||||
snap_count=$(restic -r "$REPO" snapshots --json 2>/dev/null | jq 'length // 0')
|
||||
|
||||
jq -n \
|
||||
--arg repo "$LABEL" \
|
||||
--argjson restore "$restore_json" \
|
||||
--argjson raw "$raw_json" \
|
||||
--argjson snap_count "${snap_count:-0}" \
|
||||
'
|
||||
{
|
||||
repo: $repo,
|
||||
snapshots_count: $snap_count,
|
||||
restore_size_bytes: ($restore.total_size // null),
|
||||
restore_size_files: ($restore.total_file_count // null),
|
||||
raw_data_bytes: ($raw.total_size // null),
|
||||
raw_blob_count: ($raw.total_blob_count // null),
|
||||
dedup_ratio: (
|
||||
if ($restore.total_size != null) and ($raw.total_size != null) and ($raw.total_size > 0)
|
||||
then (($restore.total_size | tonumber) / ($raw.total_size | tonumber))
|
||||
else null
|
||||
end
|
||||
)
|
||||
}'
|
||||
18
deploy/server-backup/wrappers/trigger-backup.sh
Normal file
18
deploy/server-backup/wrappers/trigger-backup.sh
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
#!/usr/bin/env bash
|
||||
# Trigger server-backup.service ad-hoc. Refuses if a run is already active
|
||||
# (the script itself also flock's, but checking here gives a friendlier error).
|
||||
|
||||
set -uo pipefail
|
||||
|
||||
UNIT=server-backup.service
|
||||
|
||||
active=$(systemctl is-active "$UNIT" 2>/dev/null || true)
|
||||
if [ "$active" = "active" ] || [ "$active" = "activating" ]; then
|
||||
echo "ERROR: $UNIT is already $active — refusing to trigger." >&2
|
||||
exit 75
|
||||
fi
|
||||
|
||||
# Use --no-block so we return immediately; the dashboard will poll via
|
||||
# read-status.sh and tail the log to follow progress.
|
||||
systemctl start --no-block "$UNIT"
|
||||
echo "Triggered $UNIT. Follow with: journalctl -u $UNIT -f"
|
||||
15
deploy/server-backup/wrappers/trigger-restore-test.sh
Normal file
15
deploy/server-backup/wrappers/trigger-restore-test.sh
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
#!/usr/bin/env bash
|
||||
# Run a non-destructive restore test against the NAS repo. Streams output to
|
||||
# stdout (so the dashboard's StreamingTerminal can render it) and writes the
|
||||
# structured result to /srv/backups/status/last-restore-test.json.
|
||||
|
||||
set -uo pipefail
|
||||
|
||||
REPO_LABEL="${1:-nas}"
|
||||
|
||||
if [ ! -x /srv/backups/scripts/restore-test.sh ]; then
|
||||
echo "ERROR: /srv/backups/scripts/restore-test.sh not installed" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
exec /srv/backups/scripts/restore-test.sh "$REPO_LABEL"
|
||||
462
docs/runbooks/server-backup.md
Normal file
462
docs/runbooks/server-backup.md
Normal file
|
|
@ -0,0 +1,462 @@
|
|||
# Server-brede backup (restic + NAS + B2, dashboard-bediend)
|
||||
|
||||
## Context
|
||||
|
||||
`scrum4me-srv` draait een Docker-stack (Scrum4Me-web, worker-idea, ops-dashboard,
|
||||
postgres-17, caddy) plus Forgejo. De huidige backup-dekking — alleen
|
||||
`pg_dump ops_dashboard` naar `/srv/ops/backups/` met 30 dagen retentie op één
|
||||
disk — laat **alles anders** vallen: Scrum4Me-data, Forgejo, Caddy-certs,
|
||||
Docker-volumes en `/etc` zijn weg bij brand, diefstal, ransomware of disk-fail.
|
||||
|
||||
Doel: de server **herbouwbaar** maken vanuit een encrypted, gededupliceerde,
|
||||
versioned backup met twee onafhankelijke kopieën — **NAS** lokaal en
|
||||
**Backblaze B2** offsite — bediend vanuit de ops-dashboard. De bestaande
|
||||
`backup_ops_db`-flow blijft draaien; restic pickt zijn dump-directory mee.
|
||||
|
||||
**Belangrijke ontwerpkeuzes** (uitgebreid toegelicht in de review onder
|
||||
`/Users/janpetervisser/Development/Scrum4Me/docs/recommendations/server-backup-plan-review-2026-05-15.md`):
|
||||
|
||||
- **B2 Object Lock + server-key zonder `deleteFiles`** — een aanvaller met root
|
||||
op de server kan geen B2-snapshots weghalen tot Object Lock-retention
|
||||
verloopt. Dat is de ransomware-bescherming. Prune op B2 gebeurt maandelijks
|
||||
vanaf de laptop met een aparte hoge-cap maintenance-key.
|
||||
- **Authoritative restore-bron = dumps, niet live datadirs.** Postgres- en
|
||||
Forgejo-data-directories zijn expliciet `--exclude`'d uit restic;
|
||||
`pg_dumpall` en `forgejo dump` + aparte `pg_dump <forgejo_db>` zijn de
|
||||
autoritatieve bronnen.
|
||||
- **Phase-based script met structured statusfile.** Eén falende fase laat de
|
||||
rest doorlopen; per-phase status / exit-code / timestamps / error-tail komen
|
||||
in `/srv/backups/status/last-run.json` die de dashboard live leest.
|
||||
- **Single-instance lock** via `flock /run/server-backup.lock` — UI-knop en
|
||||
systemd-timer kunnen elkaar niet overlappen.
|
||||
|
||||
## Voorwaarden (aantoonbaar voldaan vóór uitvoering)
|
||||
|
||||
- [ ] Bash, jq, restic, docker, gzip, flock op `$PATH` (`apt install restic jq` voor de eerste twee — de rest zit standaard).
|
||||
- [ ] De Scrum4Me-stack draait in Docker (`docker ps | grep scrum4me-postgres`).
|
||||
- [ ] `/srv/scrum4me/compose/docker-compose.yml` bestaat (anders herzie je het exclude-pad in `server-backup.sh`).
|
||||
- [ ] Tijd loopt synchroon (`timedatectl status`) — backups gebruiken ISO-timestamps.
|
||||
|
||||
## Voorwaarden (input van de gebruiker nodig)
|
||||
|
||||
- **NAS-mount** — pad zoals `/mnt/backup-server` met genoeg ruimte (initieel ≥ 100 GB; restic is gededupliceerd, dus daarna groeit het traag).
|
||||
- **Backblaze B2-account** — credit-card geregistreerd, bucket aanmaken vereist een operator-actie.
|
||||
- **Restic-wachtwoord** — `openssl rand -hex 24`, bewaard in je password manager **én** in `/etc/restic-backup.password` op de server. Beide nodig — kwijt op één plek = repo onleesbaar.
|
||||
- **B2 maintenance-key** — bewaard alleen op je laptop in passwordmanager. Niet op de server.
|
||||
|
||||
---
|
||||
|
||||
## Deel A — Voorbereiding op `scrum4me-srv`
|
||||
|
||||
Uit te voeren als `root` op `scrum4me-srv`.
|
||||
|
||||
1. **Tools installeren**
|
||||
```bash
|
||||
sudo apt update
|
||||
sudo apt install -y restic jq
|
||||
restic version
|
||||
```
|
||||
|
||||
2. **Directories aanmaken**
|
||||
```bash
|
||||
sudo mkdir -p /srv/backups/scripts /srv/backups/logs /srv/backups/status \
|
||||
/var/backups/databases
|
||||
sudo chmod 0750 /srv/backups/logs /srv/backups/status
|
||||
```
|
||||
|
||||
3. **NAS-mount controleren / aanmaken**
|
||||
```bash
|
||||
mountpoint -q /mnt/backup-server && echo "OK" || echo "NIET gemount"
|
||||
```
|
||||
Zo nee: `fstab`-regel toevoegen, `systemctl daemon-reload`, `mount -a`. Zorg dat de mount automatisch terugkomt bij reboot — anders crashed de eerste backup-run na een reboot.
|
||||
|
||||
4. **Restic-wachtwoord genereren en plaatsen**
|
||||
```bash
|
||||
sudo sh -c 'openssl rand -hex 24 > /etc/restic-backup.password'
|
||||
sudo chmod 0400 /etc/restic-backup.password
|
||||
sudo chown root:root /etc/restic-backup.password
|
||||
```
|
||||
**Kopieer dezelfde string naar je password manager** vóór je verder gaat. Een gegeneerd wachtwoord dat alleen op de server staat is geen wachtwoord — het is een ticking time bomb.
|
||||
|
||||
---
|
||||
|
||||
## Deel B — Backblaze B2 inrichten (Object Lock + scoped keys)
|
||||
|
||||
Doel: een bucket waarvan **bestaande** snapshots niet door de server gewist kunnen worden, plus twee separate keys: één voor de server (alleen schrijven/lezen) en één voor de operator (alle rechten, alleen vanaf laptop gebruikt).
|
||||
|
||||
1. **Bucket aanmaken** in de Backblaze-UI of via `b2` CLI:
|
||||
- Naam: `scrum4me-srv-backup` (of een variant; vermeld in `/etc/restic-backup.env`).
|
||||
- Privacy: **Private**.
|
||||
- **File Lock: Enabled, Governance mode, default retention = 30 days**. Governance betekent: een key met `bypassGovernance` kan locks omzeilen — die capability geven we **alleen** aan de maintenance-key.
|
||||
- Lifecycle rules: **geen** (lifecycle conflicts met Object Lock).
|
||||
- Encryption: server-side encryption aanlaten (B2 standaard).
|
||||
|
||||
2. **Server-key** aanmaken (gaat naar `/etc/restic-backup.env` op de server):
|
||||
```bash
|
||||
# via b2 CLI:
|
||||
b2 application-key create \
|
||||
--bucket scrum4me-srv-backup \
|
||||
--name-prefix scrum4me-srv \
|
||||
server-backup-key \
|
||||
listBuckets,listFiles,readFiles,writeFiles
|
||||
```
|
||||
Bewaar de output (`keyID` + `applicationKey`). Verifieer in de UI dat de key **niet** `deleteFiles`, **niet** `deleteKeys`, **niet** `bypassGovernance` heeft.
|
||||
|
||||
3. **Maintenance-key** aanmaken (gaat in je password manager op de laptop):
|
||||
```bash
|
||||
b2 application-key create \
|
||||
--bucket scrum4me-srv-backup \
|
||||
scrum4me-srv-maintenance-key \
|
||||
listBuckets,listFiles,readFiles,writeFiles,deleteFiles,bypassGovernance
|
||||
```
|
||||
Deze key komt **nooit** op de server. Gebruik alleen voor `restic forget --prune` vanaf je laptop (zie Deel H).
|
||||
|
||||
4. **`/etc/restic-backup.env` aanmaken**
|
||||
```bash
|
||||
sudo cp /srv/ops/repos/ops-dashboard/deploy/server-backup/restic-backup.env.example \
|
||||
/etc/restic-backup.env
|
||||
sudo chmod 0600 /etc/restic-backup.env
|
||||
sudo chown root:root /etc/restic-backup.env
|
||||
sudo nano /etc/restic-backup.env
|
||||
```
|
||||
Vul in: `RESTIC_REPO_NAS`, `RESTIC_REPO_B2`, `B2_ACCOUNT_ID` (= keyID), `B2_ACCOUNT_KEY` (= applicationKey). Forgejo-velden in Deel F.
|
||||
|
||||
**Dreigingsmodel**
|
||||
|
||||
| Dreiging | Gedekt door dit ontwerp? |
|
||||
|---|---|
|
||||
| Disk-fail / corruptie | ✓ NAS + B2 = 2× redundancy |
|
||||
| Brand / diefstal / waterschade | ✓ B2 is offsite |
|
||||
| Ransomware op de server | ✓ B2 Object Lock — bestaande snapshots immutable tot retention verloopt |
|
||||
| Server-compromise (root) | ✓ server-key kan geen B2-files verwijderen |
|
||||
| Laptop-compromise + server-compromise simultaan | ✗ maintenance-key dan ook in handen van aanvaller — geen verdediging |
|
||||
| Backblaze account-compromise | ✗ — buiten scope; mitigeer met 2FA en audit-trail |
|
||||
| Verlies restic-wachtwoord | ✗ — repos onleesbaar; bewaar wachtwoord óók in password manager |
|
||||
|
||||
---
|
||||
|
||||
## Deel C — Restic-repos initialiseren
|
||||
|
||||
1. **NAS-repo init**
|
||||
```bash
|
||||
sudo -E bash -c '
|
||||
set -a; . /etc/restic-backup.env; set +a
|
||||
export RESTIC_PASSWORD_FILE=/etc/restic-backup.password
|
||||
restic -r "$RESTIC_REPO_NAS" init
|
||||
'
|
||||
```
|
||||
|
||||
2. **B2-repo init**
|
||||
```bash
|
||||
sudo -E bash -c '
|
||||
set -a; . /etc/restic-backup.env; set +a
|
||||
export RESTIC_PASSWORD_FILE=/etc/restic-backup.password
|
||||
restic -r "$RESTIC_REPO_B2" init
|
||||
'
|
||||
```
|
||||
|
||||
3. **Retentie droogtest** — controleer dat het forget-beleid niet té agressief is op een eerste-snapshot-only repo. (Op een verse repo verwijdert `forget` niets, maar dit toont dat alle paden + auth werken.)
|
||||
```bash
|
||||
sudo -E bash -c '
|
||||
set -a; . /etc/restic-backup.env; set +a
|
||||
export RESTIC_PASSWORD_FILE=/etc/restic-backup.password
|
||||
restic -r "$RESTIC_REPO_NAS" forget --keep-daily 7 --keep-weekly 4 --keep-monthly 12 --dry-run
|
||||
'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Deel D — Scripts en systemd-units plaatsen
|
||||
|
||||
1. **Scripts kopiëren**
|
||||
```bash
|
||||
sudo cp /srv/ops/repos/ops-dashboard/deploy/server-backup/server-backup.sh /srv/backups/scripts/
|
||||
sudo cp /srv/ops/repos/ops-dashboard/deploy/server-backup/restore-test.sh /srv/backups/scripts/
|
||||
sudo chmod 0750 /srv/backups/scripts/*.sh
|
||||
sudo chown root:root /srv/backups/scripts/*.sh
|
||||
```
|
||||
|
||||
2. **Systemd-units kopiëren**
|
||||
```bash
|
||||
sudo cp /srv/ops/repos/ops-dashboard/deploy/server-backup/server-backup.service /etc/systemd/system/
|
||||
sudo cp /srv/ops/repos/ops-dashboard/deploy/server-backup/server-backup.timer /etc/systemd/system/
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl enable --now server-backup.timer
|
||||
```
|
||||
|
||||
3. **Timer verifiëren**
|
||||
```bash
|
||||
systemctl list-timers | grep server-backup
|
||||
```
|
||||
Toont next-run morgen 03:30 (+ randomized delay tot 10 min).
|
||||
|
||||
---
|
||||
|
||||
## Deel E — Eerste run handmatig + statusfile-verificatie
|
||||
|
||||
1. **Trigger**
|
||||
```bash
|
||||
sudo systemctl start server-backup.service
|
||||
```
|
||||
|
||||
2. **Live volgen**
|
||||
```bash
|
||||
journalctl -u server-backup.service -f
|
||||
```
|
||||
Verwacht: 8 fasen (postgres_dump, forgejo_dump, forgejo_db_dump, restic_nas, restic_b2, forget_nas, check_nas, check_b2), elk met een `─── phase: X ───` start- en `─── end X (exit=N, status=S)` eindregel.
|
||||
|
||||
3. **Statusfile**
|
||||
```bash
|
||||
sudo jq . /srv/backups/status/last-run.json
|
||||
```
|
||||
Verwacht: `overall_status: "success"`, alle verplichte fasen `success` (de twee Forgejo-fasen mogen `skipped` zijn als Forgejo nog niet geconfigureerd is).
|
||||
|
||||
4. **Snapshots**
|
||||
```bash
|
||||
sudo -E bash -c '
|
||||
set -a; . /etc/restic-backup.env; set +a
|
||||
export RESTIC_PASSWORD_FILE=/etc/restic-backup.password
|
||||
restic -r "$RESTIC_REPO_NAS" snapshots
|
||||
restic -r "$RESTIC_REPO_B2" snapshots
|
||||
'
|
||||
```
|
||||
Beide tonen één snapshot met `host=scrum4me-srv` en tags `scheduled`.
|
||||
|
||||
---
|
||||
|
||||
## Deel F — Forgejo subplan
|
||||
|
||||
Vóór de eerste full-backup run: inventariseer Forgejo en bevestig (of corrigeer) de defaults in `restic-backup.env`. Bij twijfel — zet `FORGEJO_CONTAINER=` (leeg) zodat de Forgejo-fases als `skipped` markeren tot je geverifieerd hebt.
|
||||
|
||||
### F1. Inventarisatie
|
||||
|
||||
```bash
|
||||
docker ps --format 'table {{.Names}}\t{{.Image}}\t{{.Status}}' | grep -i forgejo
|
||||
```
|
||||
|
||||
Noteer:
|
||||
- container-naam (vermoedelijk `forgejo`).
|
||||
- image-versie (`codeberg.org/forgejo/forgejo:<versie>`).
|
||||
|
||||
### F2. Configpaden in de container
|
||||
|
||||
```bash
|
||||
docker inspect <forgejo> --format '{{ range .Mounts }}{{ .Source }} -> {{ .Destination }}{{ println }}{{ end }}'
|
||||
docker exec <forgejo> ls -la /data/gitea/conf/app.ini
|
||||
```
|
||||
|
||||
Standaard: `app.ini` in `/data/gitea/conf/app.ini` binnen de container. Wijkt dat af, pas `FORGEJO_CONFIG=` in `/etc/restic-backup.env` aan.
|
||||
|
||||
### F3. DB-koppeling controleren
|
||||
|
||||
```bash
|
||||
docker exec <forgejo> grep -E '^DB_TYPE|^HOST|^NAME|^USER' /data/gitea/conf/app.ini
|
||||
```
|
||||
|
||||
- `DB_TYPE=postgres` met `NAME=forgejo` ⇒ zet `FORGEJO_DB_NAME=forgejo`, en als de Postgres-container niet `scrum4me-postgres` is: `FORGEJO_DB_CONTAINER=...`.
|
||||
- `DB_TYPE=sqlite` ⇒ laat `FORGEJO_DB_NAME=` leeg; SQLite-DB komt mee in `forgejo dump`.
|
||||
|
||||
### F4. Dump-strategie
|
||||
|
||||
Het script doet **drie** dingen voor Forgejo:
|
||||
|
||||
1. `forgejo dump --skip-db -c <config> --type zip -f -` — codebases, attachments, hooks, LFS metadata, etc.
|
||||
2. Separate `pg_dump <forgejo_db>` — autoritatieve DB-restore-bron (Forgejo docs documenteren bekende import-issues bij DB-inhoud uit `forgejo dump`, daarom `--skip-db`).
|
||||
3. Live datadirs (`/srv/forgejo/data/git`, `/srv/forgejo/data/lfs`, `/srv/forgejo/data/queues`) worden **niet** door restic gekopieerd — dat zijn live B-Trees waar een file-level kopie inconsistent zou zijn.
|
||||
|
||||
### F5. Restore-test in geïsoleerde compose-stack
|
||||
|
||||
Vóór je de Forgejo-restore echt nodig hebt: test hem een keer. Maak een tijdelijke directory met een verse Forgejo + Postgres, voer de dumps in, draai `forgejo doctor check --all`.
|
||||
|
||||
```bash
|
||||
# Minimaal restore-test-recept (vul in op basis van je Forgejo-versie)
|
||||
RESTORE_DIR=/tmp/forgejo-restore-test
|
||||
mkdir -p "$RESTORE_DIR"
|
||||
cd "$RESTORE_DIR"
|
||||
|
||||
# 1. compose-stack met blanco Forgejo + Postgres
|
||||
cat > docker-compose.yml <<'YAML'
|
||||
services:
|
||||
forgejo:
|
||||
image: codeberg.org/forgejo/forgejo:<vul-versie-in>
|
||||
volumes: [ "./forgejo-data:/data" ]
|
||||
depends_on: [ db ]
|
||||
db:
|
||||
image: postgres:17
|
||||
environment:
|
||||
POSTGRES_USER: forgejo
|
||||
POSTGRES_PASSWORD: testtest
|
||||
POSTGRES_DB: forgejo
|
||||
volumes: [ "./db-data:/var/lib/postgresql/data" ]
|
||||
YAML
|
||||
|
||||
docker compose up -d
|
||||
|
||||
# 2. DB-dump terugzetten
|
||||
gunzip < /var/backups/databases/forgejo-db-$(date +%F).sql.gz \
|
||||
| docker compose exec -T db psql -U forgejo forgejo
|
||||
|
||||
# 3. Forgejo-dump uitpakken in de data-volume
|
||||
docker compose stop forgejo
|
||||
unzip /var/backups/databases/forgejo-$(date +%F).zip -d forgejo-data/
|
||||
docker compose start forgejo
|
||||
|
||||
# 4. Health-checks
|
||||
docker compose exec forgejo forgejo doctor check --all
|
||||
curl -fsS http://localhost:3000/api/v1/version
|
||||
```
|
||||
|
||||
Slaagt `forgejo doctor check --all` en het `/api/v1/version`-endpoint? Dan is je Forgejo-restore werkend. Tear-down: `docker compose down -v && rm -rf "$RESTORE_DIR"`.
|
||||
|
||||
---
|
||||
|
||||
## Deel G — Restore-procedure in productie
|
||||
|
||||
### G1. Files uit een snapshot terughalen
|
||||
|
||||
```bash
|
||||
# Snapshot kiezen
|
||||
sudo -E bash -c '
|
||||
set -a; . /etc/restic-backup.env; set +a
|
||||
export RESTIC_PASSWORD_FILE=/etc/restic-backup.password
|
||||
restic -r "$RESTIC_REPO_NAS" snapshots
|
||||
'
|
||||
|
||||
# Restore (latest, alleen /etc — voorbeeld)
|
||||
sudo -E bash -c '
|
||||
set -a; . /etc/restic-backup.env; set +a
|
||||
export RESTIC_PASSWORD_FILE=/etc/restic-backup.password
|
||||
restic -r "$RESTIC_REPO_NAS" restore latest --target /tmp/restore --include /etc
|
||||
'
|
||||
```
|
||||
|
||||
### G2. Postgres herstellen (Scrum4Me-cluster)
|
||||
|
||||
```bash
|
||||
# Stop de apps die met de DB praten
|
||||
docker compose -f /srv/scrum4me/compose/docker-compose.yml stop scrum4me-web ops-dashboard worker-idea
|
||||
|
||||
# Restore dumpall (drop + recreate alle DBs in de cluster — vandaar --clean --if-exists in de dump)
|
||||
gunzip < /var/backups/databases/postgres-2026-05-15.sql.gz \
|
||||
| docker exec -i scrum4me-postgres psql -U scrum4me
|
||||
|
||||
# Apps weer aan
|
||||
docker compose -f /srv/scrum4me/compose/docker-compose.yml start scrum4me-web ops-dashboard worker-idea
|
||||
```
|
||||
|
||||
Voor partial restore (alleen één database): pak die DB uit de dumpall-tekst met `awk`/`sed`-block extractie (let op: `pg_restore` werkt níét op plain-SQL dumps zoals `pg_dumpall` produceert — alleen op custom-/tar-/directory-formaat archieven). Voor alleen `ops_dashboard` is de bestaande [recovery.md](recovery.md) sectie 2a primair.
|
||||
|
||||
### G3. Forgejo herstellen
|
||||
|
||||
Volg [F5](#f5-restore-test-in-geïsoleerde-compose-stack) maar dan met de echte Forgejo-compose-stack en zonder tear-down. Belangrijk: stop de live Forgejo eerst, vervang `/srv/forgejo/data` volledig, restore DB, start Forgejo, `forgejo doctor check --all`.
|
||||
|
||||
---
|
||||
|
||||
## Deel H — Maintenance vanaf de laptop (maandelijks)
|
||||
|
||||
Doel: B2-snapshots ouder dan de retention-policy daadwerkelijk prunen, plus een diepere integriteits-check die op de server te duur zou zijn.
|
||||
|
||||
1. **Voorbereiding** (eenmalig op laptop):
|
||||
```bash
|
||||
brew install restic jq
|
||||
# Maintenance-key uit password manager
|
||||
export B2_ACCOUNT_ID=<maintenance-key-id>
|
||||
export B2_ACCOUNT_KEY=<maintenance-app-key>
|
||||
export RESTIC_REPOSITORY=b2:scrum4me-srv-backup:scrum4me-srv
|
||||
read -rs RESTIC_PASSWORD < /dev/tty # uit password manager
|
||||
export RESTIC_PASSWORD
|
||||
```
|
||||
|
||||
2. **Prune-check** (eerst dry-run om te zien wat er zou gebeuren):
|
||||
```bash
|
||||
restic forget --keep-daily 7 --keep-weekly 4 --keep-monthly 12 --dry-run
|
||||
```
|
||||
|
||||
3. **Daadwerkelijke prune** (vereist `bypassGovernance` capability — alleen via maintenance-key):
|
||||
```bash
|
||||
restic forget --keep-daily 7 --keep-weekly 4 --keep-monthly 12 --prune
|
||||
```
|
||||
|
||||
4. **Diepere check**:
|
||||
```bash
|
||||
restic check --read-data-subset=10%
|
||||
```
|
||||
B2-bandbreedte: 10% van een 50 GB repo = 5 GB download, B2-prijs ~ $0.05 (gratis 1 GB/dag).
|
||||
|
||||
5. **Cleanup environment** — sluit shell of `unset RESTIC_PASSWORD B2_ACCOUNT_*`.
|
||||
|
||||
---
|
||||
|
||||
## Deel I — Integriteits-schedule (samenvatting)
|
||||
|
||||
| Cadans | Wie | Wat | Waarom |
|
||||
|---|---|---|---|
|
||||
| Dagelijks 03:30 | server (systemd timer) | `restic check` op beide repos | snelle metadata-/structure-validatie |
|
||||
| Wekelijks (zondag) | server (zelfde script) | `restic check --read-data-subset=2.5%` op NAS, `1%` op B2 | sample-based data-integrity |
|
||||
| Maandelijks | operator (laptop) | `restic check --read-data-subset=10%` + `forget --prune` op B2 | diepere check + prune (B2 server-key heeft geen delete-rechten) |
|
||||
| Maandelijks | operator (server) | `/srv/backups/scripts/restore-test.sh nas` + handmatige Forgejo-stack-restore (F5) | end-to-end restore-verificatie |
|
||||
|
||||
---
|
||||
|
||||
## Te wijzigen / nieuw aangemaakte bestanden
|
||||
|
||||
**Op `scrum4me-srv`** (alleen via deploy uit deze repo, geen handmatige edits):
|
||||
|
||||
- `/srv/backups/scripts/server-backup.sh` (uit `deploy/server-backup/`).
|
||||
- `/srv/backups/scripts/restore-test.sh` (idem).
|
||||
- `/etc/systemd/system/server-backup.service`, `server-backup.timer` (uit `deploy/server-backup/`).
|
||||
- `/etc/restic-backup.env` — secrets, niet in repo.
|
||||
- `/etc/restic-backup.password` — secret, niet in repo.
|
||||
|
||||
**In deze repo (`ops-dashboard`)**, nieuw aangemaakt:
|
||||
|
||||
- `deploy/server-backup/*` — alle deploy-artefacten.
|
||||
- `docs/runbooks/server-backup.md` — dit document.
|
||||
- Later (Fase 3+4): `ops-agent/commands.yml.example`-uitbreiding, `ops-agent/flows.example/server_backup_*.yml`, `app/settings/backups/_components/server-backup-section.tsx`.
|
||||
|
||||
**Op de laptop**, in password manager:
|
||||
|
||||
- restic-wachtwoord (identiek aan `/etc/restic-backup.password`).
|
||||
- B2 maintenance-key (keyID + applicationKey).
|
||||
|
||||
---
|
||||
|
||||
## Veelvoorkomende fouten
|
||||
|
||||
| Symptoom | Oorzaak | Fix |
|
||||
|---|---|---|
|
||||
| `unable to open repository ... no such file or directory` (NAS) | NAS-mount weg na reboot | `mountpoint -q /mnt/backup-server` — fix `fstab`/`autofs`; herstart `server-backup.service` |
|
||||
| `unable to open repository ... AccessDenied` (B2) | server-key heeft verkeerde capabilities of bucket-prefix | check `b2 application-key list`; capabilities moeten `listBuckets,listFiles,readFiles,writeFiles` zijn, name-prefix moet matchen |
|
||||
| `Object Lock In Place` bij `forget --prune` op B2 | server probeert ten onrechte B2 te prunen (heeft die capability niet) | het script prune'd alleen NAS — als deze fout opduikt: handmatige `restic forget` op B2 gedraaid (zou off-server moeten); gebruik maintenance-key |
|
||||
| `restic snapshot tag scheduled` ontbreekt in UI | run heeft `--tag scheduled` niet meegekregen | check script — `restic_backup_to` zet beide tags hardcoded |
|
||||
| `forgejo dump` faalt met permission denied | container-user niet `git` | pas `dump_forgejo` aan: `docker exec -u <correct-user>` |
|
||||
| restic exit code 3 in statusfile | sommige files waren niet leesbaar tijdens snapshot (open file lock) | non-fataal — log toont welke files; meestal logs of sockets; eventueel toevoegen aan `RESTIC_EXCLUDES` |
|
||||
| `another server-backup is already running` exit 75 | timer en UI-knop tegelijk, of vorige run hangt | `systemctl status server-backup.service`; bij hang: `systemctl kill server-backup.service`, lockfile `/run/server-backup.lock` opruimen |
|
||||
| `last-run.json` niet geüpdatet | script gecrashed vóór `write_status_json` | `journalctl -u server-backup.service --since=today` — meestal env-file of password-file probleem |
|
||||
| Postgres-datadir in restic snapshot terug te zien | excludes verkeerd geconfigureerd | check `RESTIC_EXCLUDES` in script — moet `/srv/scrum4me/postgres` bevatten |
|
||||
|
||||
---
|
||||
|
||||
## Verificatie (end-to-end)
|
||||
|
||||
1. **Eerste run slaagt** — Deel E groen, statusfile `overall_status: success`.
|
||||
2. **Snapshots zichtbaar** op beide repos via `restic snapshots`.
|
||||
3. **Restore-test slaagt** — `restore-test.sh nas` → `overall_status: success` in `/srv/backups/status/last-restore-test.json`, alle assertions `ok`.
|
||||
4. **Forgejo-restore-stack** (F5) — `forgejo doctor check --all` rond zonder errors, `/api/v1/version` antwoordt.
|
||||
5. **Reboot-test** — server reboot, `systemctl list-timers` toont `server-backup.timer` met next-run gepland; NAS-mount automatisch terug.
|
||||
6. **Failure-injectie**:
|
||||
- NAS unmount → script eindigt met `overall_status: partial_failure`, `phases.restic_nas.status: failed`, B2-snapshot wel aanwezig, systemd exit 75.
|
||||
- B2-key tijdelijk ongeldig → `phases.restic_b2.status: failed`, NAS-snapshot wel, exit 75.
|
||||
- Beide repos onbereikbaar → `overall_status: failed`, exit 1.
|
||||
7. **Concurrency** — tweede `systemctl start server-backup.service` tijdens lopende run → exit 75, log toont `another server-backup is already running`.
|
||||
8. **Maandelijkse maintenance** — eerst keer succesvol uitgevoerd vanaf laptop, B2 `forget --prune` slaagt zonder Object Lock-fouten.
|
||||
|
||||
---
|
||||
|
||||
# Addendum — uitvoering
|
||||
|
||||
> Vul deze sectie na de eerste uitvoering met alle afwijkingen van het plan
|
||||
> hierboven: exacte Forgejo container-naam, image-versie, eventuele paden die
|
||||
> anders bleken, sudoers-precieze regels, Object Lock-retention die je gekozen
|
||||
> hebt, B2 key-IDs (geredacteerd), tijden van eerste runs, etc. Zelfde
|
||||
> discipline als [tailscale-setup.md](tailscale-setup.md).
|
||||
|
|
@ -250,3 +250,51 @@ commands:
|
|||
- -delete
|
||||
- -print
|
||||
description: "Delete ops_dashboard backup files older than 30 days"
|
||||
|
||||
# ── Server-wide backup (restic + NAS + B2) ────────────────────────────────
|
||||
# All wrappers live under /srv/backups/scripts/wrappers/ and read
|
||||
# /etc/restic-backup.env (mode 0600 root:root) which the ops-agent user
|
||||
# cannot read directly — hence the sudo prefix. See deploy/ops-agent/sudoers
|
||||
# for the corresponding NOPASSWD entries.
|
||||
|
||||
read_backup_status:
|
||||
cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/read-status.sh"]
|
||||
description: "Read /srv/backups/status/last-run.json + last-restore-test.json (JSON)"
|
||||
|
||||
restic_snapshots_nas:
|
||||
cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/restic-snapshots.sh", "nas"]
|
||||
description: "Restic snapshots from the NAS repo (JSON array, newest first)"
|
||||
|
||||
restic_snapshots_b2:
|
||||
cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/restic-snapshots.sh", "b2"]
|
||||
description: "Restic snapshots from the B2 repo (JSON array, newest first)"
|
||||
|
||||
restic_stats_nas:
|
||||
cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/restic-stats.sh", "nas"]
|
||||
description: "Restic stats for the NAS repo (restore-size + raw-data + dedup ratio)"
|
||||
|
||||
restic_stats_b2:
|
||||
cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/restic-stats.sh", "b2"]
|
||||
description: "Restic stats for the B2 repo (restore-size + raw-data + dedup ratio)"
|
||||
|
||||
list_backup_logs:
|
||||
cmd:
|
||||
- sh
|
||||
- -c
|
||||
- "ls -lt /srv/backups/logs/*.log 2>/dev/null | head -10 || echo 'no logs yet'"
|
||||
description: "List the 10 most recent server-backup logs"
|
||||
|
||||
tail_backup_log_today:
|
||||
cmd:
|
||||
- sh
|
||||
- -c
|
||||
- "f=/srv/backups/logs/server-backup-$(date +%F).log; [ -f \"$f\" ] && tail -200 \"$f\" || echo 'no log for today'"
|
||||
description: "Tail the last 200 lines of today's server-backup log"
|
||||
|
||||
trigger_server_backup:
|
||||
cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/trigger-backup.sh"]
|
||||
description: "Trigger server-backup.service ad-hoc (refuses if already running)"
|
||||
|
||||
trigger_restore_test:
|
||||
cmd: ["sudo", "-n", "/srv/backups/scripts/wrappers/trigger-restore-test.sh", "nas"]
|
||||
description: "Run restore-test.sh against the NAS repo (non-destructive, writes /tmp/restore-test/)"
|
||||
|
|
|
|||
21
ops-agent/flows.example/server_backup_full.yml
Normal file
21
ops-agent/flows.example/server_backup_full.yml
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
# Trigger a full server-wide backup (pg_dumpall + restic to NAS + B2).
|
||||
# Runs out-of-band via systemd; this flow just kicks it off and then tails
|
||||
# today's log + reads the structured statusfile so the dashboard can render
|
||||
# progress and final result.
|
||||
#
|
||||
# Copy to /etc/ops-agent/flows/server_backup_full.yml on the host.
|
||||
# Triggered manually via /settings/backups → "Backup now" or by the daily
|
||||
# server-backup.timer (which runs server-backup.service directly, skipping
|
||||
# this flow).
|
||||
|
||||
name: Server backup (full)
|
||||
description: Daily full server backup — pg_dumpall + restic to NAS + B2 (Object Lock)
|
||||
steps:
|
||||
- command_key: trigger_server_backup
|
||||
on_failure: abort
|
||||
|
||||
- command_key: tail_backup_log_today
|
||||
on_failure: continue
|
||||
|
||||
- command_key: read_backup_status
|
||||
on_failure: continue
|
||||
14
ops-agent/flows.example/server_backup_restore_test.yml
Normal file
14
ops-agent/flows.example/server_backup_restore_test.yml
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
# Run a non-destructive restore test against the NAS repo. Restores the latest
|
||||
# snapshot to /tmp/restore-test/ and asserts that critical files came back
|
||||
# intact. Used to verify backups periodically without touching the live stack.
|
||||
#
|
||||
# Copy to /etc/ops-agent/flows/server_backup_restore_test.yml on the host.
|
||||
|
||||
name: Server backup — restore test
|
||||
description: Restore latest snapshot to /tmp/restore-test and assert critical files
|
||||
steps:
|
||||
- command_key: trigger_restore_test
|
||||
on_failure: continue
|
||||
|
||||
- command_key: read_backup_status
|
||||
on_failure: continue
|
||||
Loading…
Add table
Add a link
Reference in a new issue