#!/usr/bin/env bash
# Restore the latest restic snapshot to /tmp/restore-test/ and assert that a
# small set of critical files came back intact. Used by the monthly maintenance
# check and by the dashboard's "Restore test" button.
#
# Usage:
#   server-backup-restore-test.sh [nas|b2]
#
# Default repo is "nas" (faster, no B2 download fees).
#
# Environment overrides (defaults are set below):
#   RESTORE_DIR                 directory the snapshot is restored into
#   RESTIC_PASSWORD_FILE_PATH   file holding the restic repository password
#   STATUS_FILE                 JSON status report written by this script
#
# Exit codes produced by the preflight section:
#   2  invalid repo label or unsafe RESTORE_DIR
#   1  missing repo variable, unreadable password file, or missing tool

# Files created by this script are owner-only unless chmod'ed explicitly.
umask 077
# `-e` is not enabled: later steps capture exit codes themselves so that a
# status report can still be written after a failure.
set -uo pipefail

REPO_LABEL="${1:-nas}"
RESTORE_DIR="${RESTORE_DIR:-/tmp/restore-test}"
RESTIC_PASSWORD_FILE_PATH="${RESTIC_PASSWORD_FILE_PATH:-/etc/restic-backup.password}"
STATUS_FILE="${STATUS_FILE:-/srv/backups/status/last-restore-test.json}"
STATUS_DIR="$(dirname "$STATUS_FILE")"

STARTED_AT="$(date -Is)"
# Bash increments SECONDS once per second; resetting it here makes it the
# elapsed run time used for duration_seconds in the status report.
SECONDS=0

# Validate the label first so the env-file check below knows which variable
# the selected repo needs.
case "$REPO_LABEL" in
  nas) REPO_VAR="RESTIC_REPO_NAS" ;;
  b2)  REPO_VAR="RESTIC_REPO_B2" ;;
  *)
    echo "ERROR: repo label must be 'nas' or 'b2', got '$REPO_LABEL'" >&2
    exit 2
    ;;
esac

# RESTORE_DIR can come from the environment and is removed recursively below,
# so refuse values that are empty or consist only of slashes.
if [[ "$RESTORE_DIR" =~ ^/*$ ]]; then
  echo "ERROR: refusing to use '$RESTORE_DIR' as restore target" >&2
  exit 2
fi

# Load env (idempotent: ok if already in environment). The file is read when
# either RESTIC_REPO_NAS or the variable for the selected repo is missing;
# checking RESTIC_REPO_NAS alone would skip the file for a `b2` run in which
# only the NAS variable had been exported.
if [[ -z "${RESTIC_REPO_NAS:-}" || -z "${!REPO_VAR:-}" ]] \
  && [[ -r /etc/restic-backup.env ]]; then
  # shellcheck disable=SC1091
  set -a; . /etc/restic-backup.env; set +a
fi

# `:?` aborts the script with the given message when the variable is unset
# or empty.
case "$REPO_LABEL" in
  nas) REPO="${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set}" ;;
  b2)  REPO="${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set}" ;;
esac

if [[ ! -r "$RESTIC_PASSWORD_FILE_PATH" ]]; then
  echo "ERROR: restic password file $RESTIC_PASSWORD_FILE_PATH not readable" >&2
  exit 1
fi
export RESTIC_PASSWORD_FILE="$RESTIC_PASSWORD_FILE_PATH"

for tool in jq restic; do
  command -v "$tool" >/dev/null 2>&1 || {
    echo "ERROR: '$tool' not on PATH" >&2
    exit 1
  }
done

mkdir -p -- "$STATUS_DIR"
chmod 0750 -- "$STATUS_DIR"

echo "════════════════════════════════════════════════════════════════"
echo " Restore test — started $STARTED_AT"
echo " Repo: $REPO_LABEL ($REPO)"
echo " Target: $RESTORE_DIR"
echo "════════════════════════════════════════════════════════════════"

# Clean previous attempt to keep results unambiguous.
rm -rf -- "${RESTORE_DIR:?}"
mkdir -p -- "$RESTORE_DIR"

# Find latest snapshot id.
# `snapshots --latest 1` keeps one snapshot per host/path group, so the JSON
# list can hold several entries; pick the one with the greatest timestamp
# instead of whichever entry happens to be first. `(. // [])` tolerates a
# literal `null` document. Timestamps are compared as strings, which assumes
# all snapshots carry the same UTC offset.
# restic's stderr is left visible so the log shows why a lookup failed.
SNAPSHOTS_RC=0
SNAPSHOTS_JSON=$(restic -r "$REPO" snapshots --json --latest 1) || SNAPSHOTS_RC=$?
SNAPSHOT_ID=$(jq -r '(. // []) | max_by(.time) | .short_id // .id // empty' \
  <<< "$SNAPSHOTS_JSON")

if [[ -z "$SNAPSHOT_ID" ]]; then
  # Distinguish "restic could not list snapshots" from "repo is empty" so the
  # status report does not mislabel an access problem as an empty repo.
  if (( SNAPSHOTS_RC != 0 )); then
    LOOKUP_ERROR="restic snapshots failed (exit $SNAPSHOTS_RC)"
    echo "ERROR: could not list snapshots in $REPO_LABEL repo (restic exit $SNAPSHOTS_RC)"
  else
    LOOKUP_ERROR="no snapshots in repo"
    echo "ERROR: no snapshots found in $REPO_LABEL repo"
  fi
  jq -n \
    --arg started "$STARTED_AT" \
    --arg completed "$(date -Is)" \
    --argjson duration "$SECONDS" \
    --arg repo "$REPO_LABEL" \
    --arg error "$LOOKUP_ERROR" \
    '{
      schema_version: 1,
      overall_status: "failed",
      started_at: $started,
      completed_at: $completed,
      duration_seconds: $duration,
      repo: $repo,
      snapshot_id: null,
      error: $error,
      assertions: []
    }' > "$STATUS_FILE"
  chmod 0644 "$STATUS_FILE"
  exit 1
fi

echo "Restoring snapshot $SNAPSHOT_ID (filtered) …"

# Restore ONLY the paths that are asserted on — a full restore would need disk
# space equal to the snapshot's restore size (hundreds of GiB) and is
# unnecessary for a correctness test. /tmp is often tmpfs or small, which is
# why a full restore there immediately runs into ENOSPC.
#
# Assertions: each entry is a path that MUST exist and be non-empty after the
# restore. Adjust to your stack after first run (and update the runbook
# addendum). The restore filter and the assertion list are both derived from
# this one array so they cannot drift apart.
CRITICAL_FILES=(
  /srv/scrum4me/compose/docker-compose.yml
  /srv/scrum4me/caddy/Caddyfile
  /etc/restic-backup.env
)

RESTORE_ARGS=()
ASSERTION_PATHS=()
for src in "${CRITICAL_FILES[@]}"; do
  RESTORE_ARGS+=(--include "$src")
  # Restored files keep their absolute path below the target directory.
  ASSERTION_PATHS+=("$RESTORE_DIR$src")
done
# The database dump directory is restored as a whole; the dump to assert on is
# picked after the restore.
RESTORE_ARGS+=(--include /var/backups/databases)

# Capture the exit code instead of aborting so the assertions and the status
# report still run after a failed restore.
RESTORE_RC=0
restic -r "$REPO" restore "$SNAPSHOT_ID" --target "$RESTORE_DIR" \
  "${RESTORE_ARGS[@]}" \
  || RESTORE_RC=$?

if (( RESTORE_RC != 0 )); then
  echo "ERROR: restic restore exited $RESTORE_RC"
fi

# Latest postgres dump — match the newest file (glob may resolve to zero).
# With nullglob an unmatched pattern expands to nothing instead of staying a
# literal string, so PG_DUMPS is empty when no dump was restored.
PG_DUMP_PATTERN="$RESTORE_DIR/var/backups/databases/postgres-*.sql.gz"
shopt -s nullglob
PG_DUMPS=("$RESTORE_DIR/var/backups/databases/"postgres-*.sql.gz)
shopt -u nullglob
if (( ${#PG_DUMPS[@]} > 0 )); then
  # Pick the lexicographically last match (= newest date, assuming the file
  # names embed an ISO-formatted date).
  LATEST_PG="${PG_DUMPS[-1]}"
  ASSERTION_PATHS+=("$LATEST_PG")
else
  # No dump came back at all. Assert on the (unexpanded) pattern so the run
  # records it as "missing" instead of passing without checking any dump.
  ASSERTION_PATHS+=("$PG_DUMP_PATTERN")
fi

# Check every asserted path and collect one JSON object per path:
#   ok      – exists and is non-empty
#   empty   – exists but has size zero
#   missing – does not exist
ASSERTIONS_JSON='[]'
ANY_FAILED=0
for p in "${ASSERTION_PATHS[@]}"; do
  bytes=0
  if [[ -s "$p" ]]; then
    status="ok"
    # Fall back to 0 if stat fails; an empty value would break --argjson.
    bytes=$(stat -c %s -- "$p") || bytes=0
    echo " ✓ $p ($bytes bytes)"
  elif [[ -e "$p" ]]; then
    status="empty"
    ANY_FAILED=1
    echo " ✗ $p (exists but empty)"
  else
    status="missing"
    ANY_FAILED=1
    echo " ✗ $p (missing)"
  fi
  ASSERTIONS_JSON=$(jq -c \
    --arg path "$p" \
    --arg status "$status" \
    --argjson bytes "$bytes" \
    '. + [{path: $path, status: $status, bytes: $bytes}]' \
    <<< "$ASSERTIONS_JSON")
done

# A non-zero restic exit code outranks individual assertion failures.
if (( RESTORE_RC != 0 )); then
  OVERALL="failed"
elif (( ANY_FAILED != 0 )); then
  OVERALL="partial_failure"
else
  OVERALL="success"
fi

# Every value is handed to jq via --arg/--argjson; nothing is spliced into the
# filter text, so paths containing quotes or backslashes stay valid JSON.
jq -n \
  --arg started "$STARTED_AT" \
  --arg completed "$(date -Is)" \
  --argjson duration "$SECONDS" \
  --arg repo "$REPO_LABEL" \
  --arg snapshot "$SNAPSHOT_ID" \
  --arg overall "$OVERALL" \
  --argjson restore_exit "$RESTORE_RC" \
  --arg target "$RESTORE_DIR" \
  --argjson assertions "$ASSERTIONS_JSON" \
  '{
    schema_version: 1,
    overall_status: $overall,
    started_at: $started,
    completed_at: $completed,
    duration_seconds: $duration,
    repo: $repo,
    snapshot_id: $snapshot,
    restore_exit_code: $restore_exit,
    target: $target,
    assertions: $assertions
  }' > "$STATUS_FILE"
chmod 0644 "$STATUS_FILE"

echo ""
echo "════════════════════════════════════════════════════════════════"
echo " Restore test — finished $(date -Is)"
echo " Overall: $OVERALL"
echo " Status file: $STATUS_FILE"
echo "════════════════════════════════════════════════════════════════"

# Exit codes: 0 = all assertions passed, 75 = restore ran but at least one
# assertion failed, 1 = restore itself failed (or unexpected state).
case "$OVERALL" in
  success) exit 0 ;;
  partial_failure) exit 75 ;;
  failed|*) exit 1 ;;
esac