Ops-dashboard/deploy/server-backup/restore-test.sh
Madhura68 ab87c0fada feat(server-backup): restic dual-repo backup (NAS + B2) with dashboard UI
Adds a server-wide backup capability beyond the existing ops_dashboard
pg_dump flow:

- Daily systemd timer (03:30) runs pg_dumpall + Forgejo dump, then restic
  to a local NAS repo and an offsite Backblaze B2 repo with Object Lock.
  Phase-based script with single-instance flock, structured statusfile,
  systemd hardening, and live-datadir excludes (Postgres / Forgejo) so
  the dumps stay authoritative.
- Ops-agent gets nine new read-only/trigger commands (snapshots, stats,
  status, logs, plus two triggers) backed by sudoers-whitelisted wrapper
  scripts that source /etc/restic-backup.env so the agent never sees the
  restic password or B2 keys.
- Two new flows (server_backup_full, server_backup_restore_test) drive
  the dashboard's "Backup now" and "Restore test" buttons.
- /settings/backups gains a Server backup section with overall + per-phase
  status, NAS / B2 snapshot tables, restore-size / raw-data / dedup-ratio
  stats, and the last restore-test result. The existing pg_dump section
  is preserved unchanged.
- Runbook docs/runbooks/server-backup.md follows the tailscale-setup
  pattern (plan + addendum) and covers B2 Object Lock + scoped keys,
  Forgejo subplan with isolated restore-test stack, the off-server
  maintenance flow for B2 prune, and the integrity-check schedule.

Code-only change — installation on scrum4me-srv follows the runbook.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-15 13:03:00 +02:00

177 lines
5.5 KiB
Bash

#!/usr/bin/env bash
# Restore the latest restic snapshot to /tmp/restore-test/ and assert that a
# small set of critical files came back intact. Used by the monthly maintenance
# check and by the dashboard's "Restore test" button.
#
# Usage:
# server-backup-restore-test.sh [nas|b2]
#
# Default repo is "nas" (faster, no B2 download fees).
# Files and directories created below default to owner-only access.
umask 077
# Unset variables and failing pipeline stages are errors. `-e` is not enabled:
# failures are handled explicitly so a status file can still be written.
set -uo pipefail

# ── Configuration (each value may be overridden from the environment) ──────
REPO_LABEL="${1:-nas}"
RESTORE_DIR="${RESTORE_DIR:-/tmp/restore-test}"
RESTIC_PASSWORD_FILE_PATH="${RESTIC_PASSWORD_FILE_PATH:-/etc/restic-backup.password}"
STATUS_FILE="${STATUS_FILE:-/srv/backups/status/last-restore-test.json}"
STATUS_DIR="$(dirname "$STATUS_FILE")"
STARTED_AT="$(date -Is)"
# Reset bash's built-in timer; it is read later as the run duration in seconds.
SECONDS=0

# Pull in the repo settings from the env file, but only when they are not
# already present in the environment and the file is readable.
if [[ -z "${RESTIC_REPO_NAS:-}" && -r /etc/restic-backup.env ]]; then
  set -a
  # shellcheck disable=SC1091
  . /etc/restic-backup.env
  set +a
fi

# Map the repo label onto a repository location; `:?` aborts when the
# matching variable is unset or empty.
if [[ "$REPO_LABEL" == "nas" ]]; then
  REPO="${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set}"
elif [[ "$REPO_LABEL" == "b2" ]]; then
  REPO="${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set}"
else
  echo "ERROR: repo label must be 'nas' or 'b2', got '$REPO_LABEL'" >&2
  exit 2
fi

# restic reads the repository password from the file named in
# RESTIC_PASSWORD_FILE, so refuse to continue if that file is unreadable.
[[ -r "$RESTIC_PASSWORD_FILE_PATH" ]] || {
  echo "ERROR: restic password file $RESTIC_PASSWORD_FILE_PATH not readable" >&2
  exit 1
}
export RESTIC_PASSWORD_FILE="$RESTIC_PASSWORD_FILE_PATH"

# Both external tools are required further down.
for bin in jq restic; do
  if ! command -v "$bin" >/dev/null 2>&1; then
    echo "ERROR: '$bin' not on PATH" >&2
    exit 1
  fi
done

# Make sure the status directory exists; 0750 overrides the umask above.
mkdir -p "$STATUS_DIR"
chmod 0750 "$STATUS_DIR"
# Opening banner: when the run started, which repo is used, where data lands.
BANNER_RULE="════════════════════════════════════════════════════════════════"
printf '%s\n' \
  "$BANNER_RULE" \
  " Restore test — started $STARTED_AT" \
  " Repo: $REPO_LABEL ($REPO)" \
  " Target: $RESTORE_DIR" \
  "$BANNER_RULE"

# Start from an empty target so files left over from an earlier run cannot
# satisfy this run's assertions.
rm -rf "$RESTORE_DIR"
mkdir -p "$RESTORE_DIR"
# Find the newest snapshot id.
#
# restic's stderr is left visible and its exit status is captured, so an
# unreachable repo or a wrong password is reported as a listing failure
# instead of being mistaken for an empty repository.
LIST_RC=0
SNAPSHOTS_JSON=$(restic -r "$REPO" snapshots --json --latest 1) || LIST_RC=$?

SNAPSHOT_ID=""
if [ "$LIST_RC" -ne 0 ]; then
  SNAPSHOT_ERROR="restic snapshots failed (exit $LIST_RC)"
  echo "ERROR: could not list snapshots in $REPO_LABEL repo (restic exited $LIST_RC)"
else
  SNAPSHOT_ERROR="no snapshots in repo"
  # `--latest 1` keeps one snapshot per host/path group, so the list may hold
  # several entries; pick the one with the greatest timestamp rather than
  # relying on list order. `(. // [])` tolerates a `null` document.
  # NOTE(review): timestamps are compared as strings, which assumes all
  # snapshots were written with the same UTC offset — confirm for multi-host repos.
  SNAPSHOT_ID=$(jq -r '(. // []) | max_by(.time) | .short_id // .id // empty' \
    <<< "$SNAPSHOTS_JSON")
  if [ -z "$SNAPSHOT_ID" ]; then
    echo "ERROR: no snapshots found in $REPO_LABEL repo"
  fi
fi

# Without a snapshot there is nothing to restore: record a failed run in the
# status file and stop.
if [ -z "$SNAPSHOT_ID" ]; then
  jq -n \
    --arg started "$STARTED_AT" \
    --arg completed "$(date -Is)" \
    --argjson duration "$SECONDS" \
    --arg repo "$REPO_LABEL" \
    --arg error "$SNAPSHOT_ERROR" \
    '{
      schema_version: 1,
      overall_status: "failed",
      started_at: $started,
      completed_at: $completed,
      duration_seconds: $duration,
      repo: $repo,
      snapshot_id: null,
      error: $error,
      assertions: []
    }' > "$STATUS_FILE"
  # The umask would leave the file owner-only; make it world-readable.
  chmod 0644 "$STATUS_FILE"
  exit 1
fi
# Restore the selected snapshot into the target directory. The exit code is
# kept (not fatal here) so the assertions below still run and the final
# status file can report it.
echo "Restoring snapshot $SNAPSHOT_ID"
if restic -r "$REPO" restore "$SNAPSHOT_ID" --target "$RESTORE_DIR"; then
  RESTORE_RC=0
else
  RESTORE_RC=$?
  echo "ERROR: restic restore exited $RESTORE_RC"
fi
# Assertions: each entry is a path that MUST exist and be non-empty after the
# restore. Adjust to your stack after first run (and update the runbook addendum).
ASSERTION_PATHS=(
  "$RESTORE_DIR/srv/scrum4me/compose/docker-compose.yml"
  "$RESTORE_DIR/srv/scrum4me/caddy/Caddyfile"
  "$RESTORE_DIR/etc/restic-backup.env"
)

# Latest postgres dump. nullglob is enabled only around the glob so that zero
# matches give an empty array instead of the literal pattern.
PG_DUMP_PATTERN="$RESTORE_DIR/var/backups/databases/postgres-*.sql.gz"
shopt -s nullglob
PG_DUMPS=("$RESTORE_DIR/var/backups/databases/"postgres-*.sql.gz)
shopt -u nullglob
if [ "${#PG_DUMPS[@]}" -gt 0 ]; then
  # Glob results are sorted, so the last entry is the lexicographically
  # greatest name (the newest dump, assuming ISO-dated file names).
  LATEST_PG="${PG_DUMPS[${#PG_DUMPS[@]}-1]}"
  ASSERTION_PATHS+=("$LATEST_PG")
else
  # No dump came back at all. Record the pattern itself so the loop below
  # reports it as "missing" instead of silently skipping the database check.
  ASSERTION_PATHS+=("$PG_DUMP_PATTERN")
fi

# Check every path and accumulate one {path, status, bytes} object per entry.
ASSERTIONS_JSON='[]'
ANY_FAILED=0
for p in "${ASSERTION_PATHS[@]}"; do
  if [ -s "$p" ]; then
    status="ok"
    bytes=$(stat -c %s "$p")
    echo "$p ($bytes bytes)"
  elif [ -e "$p" ]; then
    status="empty"
    bytes=0
    ANY_FAILED=1
    echo "$p (exists but empty)"
  else
    status="missing"
    bytes=0
    ANY_FAILED=1
    echo "$p (missing)"
  fi
  # Append via jq so the path is JSON-escaped correctly.
  ASSERTIONS_JSON=$(jq -c \
    --arg path "$p" \
    --arg status "$status" \
    --argjson bytes "$bytes" \
    '. + [{path: $path, status: $status, bytes: $bytes}]' \
    <<< "$ASSERTIONS_JSON")
done
# Overall verdict: a failed restore outranks failed assertions.
if [ "$RESTORE_RC" -ne 0 ]; then
  OVERALL="failed"
elif [ "$ANY_FAILED" -ne 0 ]; then
  OVERALL="partial_failure"
else
  OVERALL="success"
fi

# Write the status file. Every dynamic value, including the restore target,
# is handed to jq with --arg/--argjson so it is JSON-escaped by jq; splicing
# a path into the program text would break on quotes or backslashes.
jq -n \
  --arg started "$STARTED_AT" \
  --arg completed "$(date -Is)" \
  --argjson duration "$SECONDS" \
  --arg repo "$REPO_LABEL" \
  --arg snapshot "$SNAPSHOT_ID" \
  --arg overall "$OVERALL" \
  --argjson restore_exit "$RESTORE_RC" \
  --arg target "$RESTORE_DIR" \
  --argjson assertions "$ASSERTIONS_JSON" \
  '{
    schema_version: 1,
    overall_status: $overall,
    started_at: $started,
    completed_at: $completed,
    duration_seconds: $duration,
    repo: $repo,
    snapshot_id: $snapshot,
    restore_exit_code: $restore_exit,
    target: $target,
    assertions: $assertions
  }' > "$STATUS_FILE"
# The umask would leave the file owner-only; make it world-readable.
chmod 0644 "$STATUS_FILE"
# Closing banner: finish time, verdict, and where the status file was written.
FOOTER_RULE="════════════════════════════════════════════════════════════════"
printf '%s\n' \
  "" \
  "$FOOTER_RULE" \
  " Restore test — finished $(date -Is)" \
  " Overall: $OVERALL" \
  " Status file: $STATUS_FILE" \
  "$FOOTER_RULE"

# Exit code mirrors the verdict: 0 for success, 75 for partial_failure,
# 1 for failed or any unexpected value.
if [[ "$OVERALL" == "success" ]]; then
  exit 0
fi
if [[ "$OVERALL" == "partial_failure" ]]; then
  exit 75
fi
exit 1