Ops-dashboard/deploy/server-backup/restore-test.sh
Janpeter Visser 20de584759 fix(server-backup): host-paths + script bugs uit eerste install
Kleine correcties bovenop ab87c0f, gevonden tijdens de eerste install
op scrum4me-srv (zie docs/runbooks/server-backup.md addendum):

- restic-backup.env.example: NAS-pad → /mnt/nas/backups/restic/scrum4me-srv,
  Forgejo-container → scrum4me-forgejo (waren placeholders die niet matchten
  met de actuele server-state).
- server-backup.service: ReadWritePaths uitgebreid met /mnt/nas/backups —
  ProtectSystem=strict blokkeerde anders schrijven naar de NAS-repo.
  RequiresMountsFor=/mnt/nas/backups toegevoegd om cifs-automount te triggeren
  bij timer-fire. Documentation=-URL gecorrigeerd naar /srv/scrum4me/.
- server-backup.sh: --skip-db verwijderd uit forgejo dump (Forgejo 11.x heeft
  die flag niet meer; DB komt nu mee in de zip, redundant met de aparte
  forgejo_db_dump-fase maar onschuldig).
- server-backup.sh: subshell-bug in determine_exit_code gefixt — werd
  aangeroepen via $(...), dus OVERALL_STATUS lekte niet naar de parent
  en write_status_json schreef altijd "unknown".
- restore-test.sh: --include filter toegevoegd op de assertion-paden — een
  full restore (~476 GiB logical) liep direct vol op /tmp (7.6 GB tmpfs)
  met 3.3M ENOSPC-errors. Nu 59 MiB in 10s.
- runbook: paden /srv/ops/repos/... → /srv/scrum4me/ops-dashboard/...,
  <forgejo>-placeholders → scrum4me-forgejo, concrete cifs-prefixpath
  fstab-regel in Deel A3, en een gevuld addendum met alle bevindingen
  van de eerste install (B2-bucket-naam ScrumForMeSrvBackup, sudo -E quirk,
  storage-cap incident, dedup-cijfers).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-15 16:34:21 +02:00

187 lines
6 KiB
Bash

#!/usr/bin/env bash
# Restore-test driver for the restic server backup.
#
# Pulls the latest restic snapshot into a scratch directory
# (/tmp/restore-test/ unless RESTORE_DIR overrides it) and asserts that a small
# set of critical files came back intact. Invoked by the monthly maintenance
# check and by the "Restore test" button on the dashboard.
#
# Usage:
#   server-backup-restore-test.sh [nas|b2]
#
# Without an argument the "nas" repo is used (faster, no B2 download fees).

# Files and directories are created private; permissions are widened
# explicitly with chmod where needed.
umask 077
# Unset variables are errors and a pipeline fails if any stage fails.
# errexit (-e) is intentionally left off.
set -uo pipefail

# --- Configuration (each value may be overridden through the environment) ---
REPO_LABEL="${1:-nas}"
: "${RESTORE_DIR:=/tmp/restore-test}"
: "${RESTIC_PASSWORD_FILE_PATH:=/etc/restic-backup.password}"
: "${STATUS_FILE:=/srv/backups/status/last-restore-test.json}"
STATUS_DIR="$(dirname "$STATUS_FILE")"

# Wall-clock start plus bash's SECONDS counter reset for the duration field.
STARTED_AT="$(date -Is)"
SECONDS=0

# --- Environment file ---
# Only sourced when RESTIC_REPO_NAS is not already present, so running with a
# pre-populated environment is fine. `set -a` exports everything the file sets.
if [[ -z "${RESTIC_REPO_NAS:-}" && -r /etc/restic-backup.env ]]; then
  set -a
  # shellcheck disable=SC1091
  source /etc/restic-backup.env
  set +a
fi

# --- Repository selection ---
if [[ "$REPO_LABEL" == "nas" ]]; then
  REPO="${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set}"
elif [[ "$REPO_LABEL" == "b2" ]]; then
  REPO="${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set}"
else
  echo "ERROR: repo label must be 'nas' or 'b2', got '$REPO_LABEL'" >&2
  exit 2
fi

# --- Preflight checks ---
[[ -r "$RESTIC_PASSWORD_FILE_PATH" ]] || {
  echo "ERROR: restic password file $RESTIC_PASSWORD_FILE_PATH not readable" >&2
  exit 1
}
# restic picks the password file up from this variable.
export RESTIC_PASSWORD_FILE="$RESTIC_PASSWORD_FILE_PATH"

for required_cmd in jq restic; do
  if ! command -v "$required_cmd" >/dev/null 2>&1; then
    echo "ERROR: '$required_cmd' not on PATH" >&2
    exit 1
  fi
done

# The status directory must exist before any status JSON is written.
mkdir -p "$STATUS_DIR"
chmod 0750 "$STATUS_DIR"
# Start banner: shows which repo is tested and where the files will land.
echo "════════════════════════════════════════════════════════════════"
echo " Restore test — started $STARTED_AT"
echo " Repo: $REPO_LABEL ($REPO)"
echo " Target: $RESTORE_DIR"
echo "════════════════════════════════════════════════════════════════"

# Clean previous attempt to keep results unambiguous.
# RESTORE_DIR can come from the environment, so guard the destructive call:
# `:?` aborts on an empty value and `--` stops a leading '-' from being parsed
# as an option to rm/mkdir.
rm -rf -- "${RESTORE_DIR:?RESTORE_DIR must not be empty}"
mkdir -p -- "$RESTORE_DIR"
# Find the id of the newest snapshot.
#
# `--latest 1` keeps the newest snapshot *per host/path group*, so the JSON
# array can hold several entries (for example after the backup path set was
# changed). Taking element 0 could therefore select a stale snapshot; pick the
# entry with the greatest timestamp instead. `(. // [])` covers restic builds
# that print `null` for an empty list. restic's stderr is discarded, so a
# failing restic call also ends up in the "no snapshots" branch below.
# NOTE(review): .time values are compared as strings, which assumes all
# snapshots carry the same UTC offset — confirm if hosts in different
# timezones share this repo.
SNAPSHOT_ID=$(restic -r "$REPO" snapshots --json --latest 1 2>/dev/null \
  | jq -r '(. // []) | max_by(.time) | .short_id // .id // empty')

if [ -z "$SNAPSHOT_ID" ]; then
  echo "ERROR: no snapshots found in $REPO_LABEL repo"
  # Record the failure so the status file never shows a stale success.
  jq -n \
    --arg started "$STARTED_AT" \
    --arg completed "$(date -Is)" \
    --argjson duration "$SECONDS" \
    --arg repo "$REPO_LABEL" \
    '{
      schema_version: 1,
      overall_status: "failed",
      started_at: $started,
      completed_at: $completed,
      duration_seconds: $duration,
      repo: $repo,
      snapshot_id: null,
      error: "no snapshots in repo",
      assertions: []
    }' > "$STATUS_FILE"
  # umask 077 would leave the file owner-only; make it world-readable.
  chmod 0644 "$STATUS_FILE"
  exit 1
fi
echo "Restoring snapshot $SNAPSHOT_ID (filtered) …"

# Restore ONLY the paths that are asserted on. A full restore would need disk
# space equal to the snapshot's restore size (hundreds of GiB) and is
# unnecessary for a correctness test. /tmp is often tmpfs or small, which is
# why a full restore there fails immediately with ENOSPC.
# Keep this list in sync with ASSERTION_PATHS further down.
RESTORE_INCLUDE_ARGS=(
  --include /srv/scrum4me/compose/docker-compose.yml
  --include /srv/scrum4me/caddy/Caddyfile
  --include /etc/restic-backup.env
  --include /var/backups/databases
)

# Capture restic's exit status instead of aborting, so the assertions still
# run and the status file still gets written.
RESTORE_RC=0
restic -r "$REPO" restore "$SNAPSHOT_ID" --target "$RESTORE_DIR" \
  "${RESTORE_INCLUDE_ARGS[@]}" || RESTORE_RC=$?

if (( RESTORE_RC != 0 )); then
  echo "ERROR: restic restore exited $RESTORE_RC"
fi
# Assertions: each is a path that MUST exist and be non-empty.
# Adjust to your stack after first run (and update the runbook addendum).
ASSERTION_PATHS=(
  "$RESTORE_DIR/srv/scrum4me/compose/docker-compose.yml"
  "$RESTORE_DIR/srv/scrum4me/caddy/Caddyfile"
  "$RESTORE_DIR/etc/restic-backup.env"
)

# Latest postgres dump — match the newest file. nullglob makes an unmatched
# glob expand to nothing instead of the literal pattern; it is switched off
# again right away so the rest of the script sees default globbing.
PG_DUMP_PATTERN="$RESTORE_DIR/var/backups/databases/postgres-*.sql.gz"
shopt -s nullglob
PG_DUMPS=("$RESTORE_DIR/var/backups/databases/"postgres-*.sql.gz)
shopt -u nullglob

if [ "${#PG_DUMPS[@]}" -gt 0 ]; then
  # Glob results are sorted, so the last element is the lexicographically
  # greatest name (= newest date, ISO format).
  LATEST_PG="${PG_DUMPS[$(( ${#PG_DUMPS[@]} - 1 ))]}"
  ASSERTION_PATHS+=("$LATEST_PG")
else
  # No dump came back at all. Assert on the unexpanded pattern so the run
  # reports it as missing rather than passing with the database dump absent.
  ASSERTION_PATHS+=("$PG_DUMP_PATTERN")
fi
# Check every assertion path and accumulate one JSON object per path.
# ANY_FAILED flips to 1 as soon as a path is missing or empty.
ASSERTIONS_JSON='[]'
ANY_FAILED=0
for target_path in "${ASSERTION_PATHS[@]}"; do
  size_bytes=0
  if [[ ! -e "$target_path" ]]; then
    verdict="missing"
    echo "$target_path (missing)"
  elif [[ ! -s "$target_path" ]]; then
    verdict="empty"
    echo "$target_path (exists but empty)"
  else
    verdict="ok"
    size_bytes=$(stat -c %s "$target_path")
    echo "$target_path ($size_bytes bytes)"
  fi

  [[ "$verdict" == "ok" ]] || ANY_FAILED=1

  # Append this result to the running array; jq handles all JSON escaping.
  ASSERTIONS_JSON=$(jq -c \
    --arg path "$target_path" \
    --arg status "$verdict" \
    --argjson bytes "$size_bytes" \
    '. + [{path: $path, status: $status, bytes: $bytes}]' \
    <<< "$ASSERTIONS_JSON")
done
# Overall verdict: a restic error outranks assertion failures.
if [ "$RESTORE_RC" -ne 0 ]; then
  OVERALL="failed"
elif [ "$ANY_FAILED" -ne 0 ]; then
  OVERALL="partial_failure"
else
  OVERALL="success"
fi

# Write the status report. Every shell value is handed to jq through
# --arg/--argjson so jq does the escaping; RESTORE_DIR in particular must not
# be spliced into the program text, where a quote or backslash in the path
# would break the filter or inject jq code.
if jq -n \
    --arg started "$STARTED_AT" \
    --arg completed "$(date -Is)" \
    --argjson duration "$SECONDS" \
    --arg repo "$REPO_LABEL" \
    --arg snapshot "$SNAPSHOT_ID" \
    --arg overall "$OVERALL" \
    --argjson restore_exit "$RESTORE_RC" \
    --arg target "$RESTORE_DIR" \
    --argjson assertions "$ASSERTIONS_JSON" \
    '{
      schema_version: 1,
      overall_status: $overall,
      started_at: $started,
      completed_at: $completed,
      duration_seconds: $duration,
      repo: $repo,
      snapshot_id: $snapshot,
      restore_exit_code: $restore_exit,
      target: $target,
      assertions: $assertions
    }' > "$STATUS_FILE"; then
  # umask 077 would leave the file owner-only; make it world-readable.
  chmod 0644 "$STATUS_FILE"
else
  # A run whose report could not be written must not exit 0.
  echo "ERROR: could not write status file $STATUS_FILE" >&2
  OVERALL="failed"
fi

echo ""
echo "════════════════════════════════════════════════════════════════"
echo " Restore test — finished $(date -Is)"
echo " Overall: $OVERALL"
echo " Status file: $STATUS_FILE"
echo "════════════════════════════════════════════════════════════════"

# Exit code contract: 0 = success, 75 = some assertions failed,
# 1 = restore failed (or any unexpected state).
case "$OVERALL" in
  success) exit 0 ;;
  partial_failure) exit 75 ;;
  failed|*) exit 1 ;;
esac