Kleine correcties bovenop ab87c0f, gevonden tijdens de eerste install
op scrum4me-srv (zie docs/runbooks/server-backup.md addendum):
- restic-backup.env.example: NAS-pad → /mnt/nas/backups/restic/scrum4me-srv,
Forgejo-container → scrum4me-forgejo (waren placeholders die niet matchten
met de actuele server-state).
- server-backup.service: ReadWritePaths uitgebreid met /mnt/nas/backups —
ProtectSystem=strict blokkeerde anders schrijven naar de NAS-repo.
RequiresMountsFor=/mnt/nas/backups toegevoegd om cifs-automount te triggeren
bij timer-fire. Documentation=-URL gecorrigeerd naar /srv/scrum4me/.
- server-backup.sh: --skip-db verwijderd uit forgejo dump (Forgejo 11.x heeft
die flag niet meer; DB komt nu mee in de zip, redundant met de aparte
forgejo_db_dump-fase maar onschuldig).
- server-backup.sh: subshell-bug in determine_exit_code gefixt — werd
aangeroepen via $(...), dus OVERALL_STATUS lekte niet naar de parent
en write_status_json schreef altijd "unknown".
- restore-test.sh: --include filter toegevoegd op de assertion-paden — een
full restore (~476 GiB logical) liep direct vol op /tmp (7.6 GB tmpfs)
met 3.3M ENOSPC-errors. Nu 59 MiB in 10s.
- runbook: paden /srv/ops/repos/... → /srv/scrum4me/ops-dashboard/...,
<forgejo>-placeholders → scrum4me-forgejo, concrete cifs-prefixpath
fstab-regel in Deel A3, en een gevuld addendum met alle bevindingen
van de eerste install (B2-bucket-naam ScrumForMeSrvBackup, sudo -E quirk,
storage-cap incident, dedup-cijfers).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
187 lines
6 KiB
Bash
187 lines
6 KiB
Bash
#!/usr/bin/env bash
|
|
# Restore the latest restic snapshot to /tmp/restore-test/ and assert that a
|
|
# small set of critical files came back intact. Used by the monthly maintenance
|
|
# check and by the dashboard's "Restore test" button.
|
|
#
|
|
# Usage:
|
|
# server-backup-restore-test.sh [nas|b2]
|
|
#
|
|
# Default repo is "nas" (faster, no B2 download fees).
|
|
|
|
# Everything this script creates starts out accessible to the owner only.
umask 077
# Unset variables and failing pipeline stages count as errors. `-e` is not
# enabled, so a failing command does not abort the script by itself.
set -u -o pipefail

# Repo selector comes from the first CLI argument; "nas" when omitted.
REPO_LABEL=${1:-nas}

# Each of these may be pre-set in the environment; otherwise the default
# on the right-hand side is used.
: "${RESTORE_DIR:=/tmp/restore-test}"
: "${RESTIC_PASSWORD_FILE_PATH:=/etc/restic-backup.password}"
: "${STATUS_FILE:=/srv/backups/status/last-restore-test.json}"

STATUS_DIR=$(dirname "$STATUS_FILE")
STARTED_AT=$(date -Is)
# Restart bash's built-in elapsed-seconds counter at zero.
SECONDS=0
|
|
|
|
# Source /etc/restic-backup.env only when RESTIC_REPO_NAS is unset or empty
# and the file is readable; a pre-populated environment is left untouched.
if [[ -z "${RESTIC_REPO_NAS:-}" && -r /etc/restic-backup.env ]]; then
  # allexport is active while sourcing, so every variable the file assigns
  # is exported; it is switched off again right after.
  set -a
  # shellcheck disable=SC1091
  . /etc/restic-backup.env
  set +a
fi
|
|
|
|
# Map the repo label to its repository variable. A missing variable aborts
# via the ${VAR:?message} expansion; an unknown label exits with status 2.
if [[ "$REPO_LABEL" == "nas" ]]; then
  REPO="${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set}"
elif [[ "$REPO_LABEL" == "b2" ]]; then
  REPO="${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set}"
else
  echo "ERROR: repo label must be 'nas' or 'b2', got '$REPO_LABEL'" >&2
  exit 2
fi
|
|
|
|
# Stop early when the password file cannot be read.
[[ -r "$RESTIC_PASSWORD_FILE_PATH" ]] || {
  echo "ERROR: restic password file $RESTIC_PASSWORD_FILE_PATH not readable" >&2
  exit 1
}

# Export the path as RESTIC_PASSWORD_FILE for the restic calls in this script.
RESTIC_PASSWORD_FILE="$RESTIC_PASSWORD_FILE_PATH"
export RESTIC_PASSWORD_FILE
|
|
|
|
# Both external tools must be on PATH; the first one missing ends the run.
for required in jq restic; do
  if ! command -v "$required" >/dev/null 2>&1; then
    echo "ERROR: '$required' not on PATH" >&2
    exit 1
  fi
done
|
|
|
|
# Make sure the status directory exists and force its mode to 0750.
mkdir -p "$STATUS_DIR"
chmod 0750 "$STATUS_DIR"

# Start-of-run banner.
banner_rule="════════════════════════════════════════════════════════════════"
printf '%s\n' \
  "$banner_rule" \
  " Restore test — started $STARTED_AT" \
  " Repo: $REPO_LABEL ($REPO)" \
  " Target: $RESTORE_DIR" \
  "$banner_rule"

# Remove whatever a previous attempt left behind so that every file found
# under the target afterwards comes from this run.
rm -rf "$RESTORE_DIR"
mkdir -p "$RESTORE_DIR"
|
|
|
|
# Find the newest snapshot id.
#
# `restic snapshots --latest 1` reports the newest snapshot per host/path
# group, so the JSON array can hold several entries; the one with the
# greatest timestamp is selected instead of blindly taking the first.
# restic's stderr is left visible and its exit code is captured, so a
# listing failure (bad password, unreachable repo) is reported as such
# rather than as an empty repository.
LIST_RC=0
SNAPSHOTS_JSON=$(restic -r "$REPO" snapshots --json --latest 1) || LIST_RC=$?

SNAPSHOT_ID=""
if [ "$LIST_RC" -eq 0 ]; then
  # Anything that is not a non-empty array yields no id at all.
  SNAPSHOT_ID=$(jq -r '
      if type == "array" and length > 0
      then max_by(.time) | (.short_id // .id // empty)
      else empty
      end' <<< "$SNAPSHOTS_JSON") || SNAPSHOT_ID=""
fi

if [ -z "$SNAPSHOT_ID" ]; then
  if [ "$LIST_RC" -ne 0 ]; then
    SNAPSHOT_ERROR="restic snapshots exited $LIST_RC"
    echo "ERROR: could not list snapshots in $REPO_LABEL repo (restic exited $LIST_RC)"
  else
    SNAPSHOT_ERROR="no snapshots in repo"
    echo "ERROR: no snapshots found in $REPO_LABEL repo"
  fi

  # Record the failure in the status file (same schema as a full run,
  # minus the restore-specific fields) and stop.
  jq -n \
    --arg started "$STARTED_AT" \
    --arg completed "$(date -Is)" \
    --argjson duration "$SECONDS" \
    --arg repo "$REPO_LABEL" \
    --arg error "$SNAPSHOT_ERROR" \
    '{
      schema_version: 1,
      overall_status: "failed",
      started_at: $started,
      completed_at: $completed,
      duration_seconds: $duration,
      repo: $repo,
      snapshot_id: null,
      error: $error,
      assertions: []
    }' > "$STATUS_FILE"
  chmod 0644 "$STATUS_FILE"
  exit 1
fi
|
|
|
|
echo "Restoring snapshot $SNAPSHOT_ID (filtered) …"

# Restore ONLY the paths that are asserted on. A full restore would need as
# much disk as the snapshot's restore size (hundreds of GiB) and is not
# needed for a correctness test. /tmp is often a tmpfs or simply small,
# which is why a full restore there runs into ENOSPC right away.
# Keep this list in sync with ASSERTION_PATHS below.
include_paths=(
  /srv/scrum4me/compose/docker-compose.yml
  /srv/scrum4me/caddy/Caddyfile
  /etc/restic-backup.env
  /var/backups/databases
)

# Expand the list into repeated `--include <path>` arguments.
include_args=()
for inc in "${include_paths[@]}"; do
  include_args+=(--include "$inc")
done

# Remember the exit code instead of aborting; the assertions still run.
RESTORE_RC=0
restic -r "$REPO" restore "$SNAPSHOT_ID" --target "$RESTORE_DIR" \
  "${include_args[@]}" \
  || RESTORE_RC=$?

if (( RESTORE_RC != 0 )); then
  echo "ERROR: restic restore exited $RESTORE_RC"
fi
|
|
|
|
# Assertions: each is a path that MUST exist and be non-empty.
# Adjust to your stack after first run (and update the runbook addendum).
ASSERTION_PATHS=(
  "$RESTORE_DIR/srv/scrum4me/compose/docker-compose.yml"
  "$RESTORE_DIR/srv/scrum4me/caddy/Caddyfile"
  "$RESTORE_DIR/etc/restic-backup.env"
)

# Latest postgres dump: collect the restored dumps with nullglob on, so an
# unmatched pattern expands to nothing instead of staying literal.
PG_DUMP_DIR="$RESTORE_DIR/var/backups/databases"
shopt -s nullglob
PG_DUMPS=("$PG_DUMP_DIR"/postgres-*.sql.gz)
shopt -u nullglob

if (( ${#PG_DUMPS[@]} > 0 )); then
  # Glob results are sorted, so the last entry is the lexicographically
  # greatest name (= newest date, ISO format).
  LATEST_PG="${PG_DUMPS[-1]}"
  ASSERTION_PATHS+=("$LATEST_PG")
else
  # No dump came back at all. Register the unmatched pattern itself (quoted,
  # so it stays literal) so the run reports it as missing instead of
  # silently passing without any database assertion.
  ASSERTION_PATHS+=("$PG_DUMP_DIR/postgres-*.sql.gz")
fi
|
|
|
|
# Check every asserted path and collect one JSON record per path:
#   ok      - exists with size > 0 (size taken from stat)
#   empty   - exists but has no content
#   missing - does not exist
# Any result other than "ok" sets ANY_FAILED.
ASSERTIONS_JSON='[]'
ANY_FAILED=0
for target in "${ASSERTION_PATHS[@]}"; do
  size=0
  if [[ ! -e "$target" ]]; then
    verdict="missing"
    ANY_FAILED=1
    echo " ✗ $target (missing)"
  elif [[ ! -s "$target" ]]; then
    verdict="empty"
    ANY_FAILED=1
    echo " ✗ $target (exists but empty)"
  else
    verdict="ok"
    size=$(stat -c %s "$target")
    echo " ✓ $target ($size bytes)"
  fi

  # Append this record to the accumulated JSON array.
  ASSERTIONS_JSON=$(jq -c \
    --arg path "$target" \
    --arg status "$verdict" \
    --argjson bytes "$size" \
    '. + [{path: $path, status: $status, bytes: $bytes}]' \
    <<< "$ASSERTIONS_JSON")
done
|
|
|
|
# Overall verdict: start from "success"; a failed assertion downgrades it
# to "partial_failure", and a non-zero restic exit code overrides both.
OVERALL="success"
if (( ANY_FAILED != 0 )); then
  OVERALL="partial_failure"
fi
if (( RESTORE_RC != 0 )); then
  OVERALL="failed"
fi
|
|
|
|
# Write the result of this run to the status file.
# Every value, including the restore target path, is handed to jq through
# --arg/--argjson so jq does the JSON escaping; nothing from the shell is
# spliced into the jq program text.
jq -n \
  --arg started "$STARTED_AT" \
  --arg completed "$(date -Is)" \
  --argjson duration "$SECONDS" \
  --arg repo "$REPO_LABEL" \
  --arg snapshot "$SNAPSHOT_ID" \
  --arg overall "$OVERALL" \
  --argjson restore_exit "$RESTORE_RC" \
  --arg target "$RESTORE_DIR" \
  --argjson assertions "$ASSERTIONS_JSON" \
  '{
    schema_version: 1,
    overall_status: $overall,
    started_at: $started,
    completed_at: $completed,
    duration_seconds: $duration,
    repo: $repo,
    snapshot_id: $snapshot,
    restore_exit_code: $restore_exit,
    target: $target,
    assertions: $assertions
  }' > "$STATUS_FILE"
# umask 077 would leave the file owner-only; open it up for readers.
chmod 0644 "$STATUS_FILE"
|
|
|
|
# End-of-run summary.
footer_rule="════════════════════════════════════════════════════════════════"
printf '%s\n' \
  "" \
  "$footer_rule" \
  " Restore test — finished $(date -Is)" \
  " Overall: $OVERALL" \
  " Status file: $STATUS_FILE" \
  "$footer_rule"

# Exit status: 0 on success, 75 when the restore ran but an assertion
# failed, 1 for anything else.
if [[ "$OVERALL" == "success" ]]; then
  exit 0
elif [[ "$OVERALL" == "partial_failure" ]]; then
  exit 75
else
  exit 1
fi
|