#!/usr/bin/env bash
# Daily server-wide backup: dumps databases, runs restic to NAS + B2,
# writes a structured statusfile that the ops-dashboard can read.
#
# Install:
#   cp deploy/server-backup/server-backup.sh /srv/backups/scripts/server-backup.sh
#   chmod 0750 /srv/backups/scripts/server-backup.sh
#   chown root:root /srv/backups/scripts/server-backup.sh
#
# Requires: bash, jq, flock, restic, docker, gzip. See runbook for setup.

umask 077
# Deliberately no `-e`: each phase's failure is captured by run_phase and
# aggregated at the end instead of aborting the whole run.
set -uo pipefail

# ── Configuration ──────────────────────────────────────────────────────────
# All paths are overridable via the environment for manual test runs.
STATUS_DIR="${STATUS_DIR:-/srv/backups/status}"
LOG_DIR="${LOG_DIR:-/srv/backups/logs}"
DB_DUMP_DIR="${DB_DUMP_DIR:-/var/backups/databases}"
RESTIC_PASSWORD_FILE_PATH="${RESTIC_PASSWORD_FILE_PATH:-/etc/restic-backup.password}"
LOCKFILE="${LOCKFILE:-/run/server-backup.lock}"

RUN_DATE="$(date +%F)"
STARTED_AT="$(date -Is)"
SECONDS=0   # bash builtin: counts wall-clock seconds since this assignment

# Phase order — must match write_status_json + determine_exit_code expectations.
PHASE_ORDER=(
  postgres_dump
  forgejo_dump
  forgejo_db_dump
  restic_nas
  restic_b2
  forget_nas
  check_nas
  check_b2
)

# Per-phase bookkeeping, keyed by phase name; filled in by run_phase.
declare -A PHASE_STATUS PHASE_EXIT PHASE_START PHASE_END PHASE_ERR PHASE_EXTRA
OVERALL_STATUS="unknown"

# ── Single-instance lock ───────────────────────────────────────────────────
# fd 9 stays open for the lifetime of the script, holding the flock.
exec 9>"$LOCKFILE" || { echo "ERROR: cannot open lockfile $LOCKFILE" >&2; exit 1; }
if ! flock -n 9; then
  echo "ERROR: another server-backup is already running (lock $LOCKFILE held)" >&2
  exit 75
fi

# ── Env + secret loading ───────────────────────────────────────────────────
# When invoked via systemd, EnvironmentFile=/etc/restic-backup.env has already
# been loaded. When invoked manually for testing, source it ourselves.
if [ -z "${RESTIC_REPO_NAS:-}" ] && [ -r /etc/restic-backup.env ]; then
  # shellcheck disable=SC1091
  set -a; . /etc/restic-backup.env; set +a
fi

: "${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set (see /etc/restic-backup.env)}"
: "${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set (see /etc/restic-backup.env)}"

if [ ! -r "$RESTIC_PASSWORD_FILE_PATH" ]; then
  echo "ERROR: restic password file $RESTIC_PASSWORD_FILE_PATH not readable" >&2
  exit 1
fi
export RESTIC_PASSWORD_FILE="$RESTIC_PASSWORD_FILE_PATH"

# Required tooling
for tool in jq restic docker gzip flock; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "ERROR: required tool '$tool' not on PATH" >&2
    exit 1
  fi
done

# ── Logging ────────────────────────────────────────────────────────────────
mkdir -p "$LOG_DIR" "$STATUS_DIR" "$DB_DUMP_DIR"
chmod 0750 "$LOG_DIR" "$STATUS_DIR"
LOG_FILE="$LOG_DIR/server-backup-$RUN_DATE.log"

# Mirror everything to LOG_FILE and the journal.
exec > >(tee -a "$LOG_FILE") 2>&1

echo "════════════════════════════════════════════════════════════════"
echo " Server backup — started $STARTED_AT"
echo " Host: $(hostname)"
echo " NAS repo: $RESTIC_REPO_NAS"
echo " B2 repo: $RESTIC_REPO_B2"
echo "════════════════════════════════════════════════════════════════"

# ── Phase runner ───────────────────────────────────────────────────────────
# run_phase NAME CMD [ARGS...]
# Runs CMD in a subshell, mirrors its stdout+stderr to the log AND a per-phase
# buffer, then records status / exit_code / timestamps / error tail in the
# PHASE_* associative arrays under key NAME.
run_phase() {
  local name="$1"; shift
  local phase_buf
  phase_buf=$(mktemp -t "backup-phase-${name}.XXXXXX")
  echo ""
  echo "─── phase: $name ─── $(date -Is)"
  PHASE_START[$name]=$(date -Is)
  local rc=0
  # Run in a sub-shell so set -e inside callees doesn't kill us.
  ( "$@" ) 2>&1 | tee "$phase_buf"
  # tee is the last pipeline stage; PIPESTATUS[0] is the phase's real exit.
  rc=${PIPESTATUS[0]}
  PHASE_EXIT[$name]=$rc
  case "$rc" in
    0)  PHASE_STATUS[$name]=success ;;
    3)  PHASE_STATUS[$name]=degraded ;;  # restic: snapshot created but some files unreadable
    99) PHASE_STATUS[$name]=skipped ;;   # our convention for "not applicable"
    *)  PHASE_STATUS[$name]=failed ;;
  esac
  if [ "$rc" -ne 0 ] && [ "$rc" -ne 99 ] && [ -s "$phase_buf" ]; then
    # Keep the last few lines (capped at 500 bytes) as a compact error summary.
    PHASE_ERR[$name]=$(tail -n 5 "$phase_buf" | tr '\n' ' ' | head -c 500)
  fi
  PHASE_END[$name]=$(date -Is)
  rm -f "$phase_buf"
  echo "─── end $name (exit=$rc, status=${PHASE_STATUS[$name]})"
}

# Convention: a phase function returns 99 to mark itself "skipped" — the
# overall outcome treats this as success.
SKIPPED=99

# ── Phase 1: pg_dumpall (Scrum4Me Postgres cluster) ────────────────────────
# Dumps the whole cluster via the running container; writes
# $DB_DUMP_DIR/postgres-YYYY-MM-DD.sql.gz atomically (tmp file + mv).
# A missing container is a hard failure (return 1), never a skip.
dump_postgres_all() {
  local pg_container="${PG_CONTAINER:-scrum4me-postgres}"
  local pg_user="${PG_DUMPALL_USER:-scrum4me}"

  # -F: match the container name as a fixed string, not a regex
  # (a name containing '.' would otherwise match too loosely).
  if ! docker ps --format '{{.Names}}' | grep -Fqx "$pg_container"; then
    echo "Postgres container '$pg_container' not running — cannot continue."
    return 1
  fi

  local tmp="$DB_DUMP_DIR/.postgres-$RUN_DATE.sql.gz.tmp"
  local final="$DB_DUMP_DIR/postgres-$RUN_DATE.sql.gz"
  rm -f "$tmp"

  # pipefail is already on globally; kept here defensively so this function
  # stays correct even if the global options change.
  set -o pipefail
  docker exec "$pg_container" pg_dumpall -U "$pg_user" --clean --if-exists \
    | gzip -c > "$tmp"
  # NB: must be `local rc=$?` on one line — $? is expanded before `local`
  # runs, so this captures the pipeline's status. A separate `local rc`
  # first would clobber $?.
  local rc=$?
  set +o pipefail

  if [ "$rc" -ne 0 ]; then
    rm -f "$tmp"
    return "$rc"
  fi

  mv "$tmp" "$final"
  chmod 0640 "$final"
  local bytes
  bytes=$(stat -c %s "$final" 2>/dev/null || echo 0)
  PHASE_EXTRA[postgres_dump]="output_file=$final;bytes=$bytes"
  echo "wrote $final ($bytes bytes)"
}

# ── Phase 2: Forgejo dump (filesystem + repos) ─────────────────────────────
# Streams `forgejo dump --skip-db` (zip) to the host and installs it
# atomically. Skips (exit 99) when FORGEJO_CONTAINER is unset or not running.
dump_forgejo() {
  local fj="${FORGEJO_CONTAINER:-}"
  if [ -z "$fj" ]; then
    echo "FORGEJO_CONTAINER unset — skipping Forgejo dump."
    return "$SKIPPED"
  fi
  if ! docker ps --format '{{.Names}}' | grep -Fqx "$fj"; then
    echo "Forgejo container '$fj' not running — skipping."
    return "$SKIPPED"
  fi

  local config="${FORGEJO_CONFIG:-/data/gitea/conf/app.ini}"
  local tmp="$DB_DUMP_DIR/.forgejo-$RUN_DATE.zip.tmp"
  local final="$DB_DUMP_DIR/forgejo-$RUN_DATE.zip"
  rm -f "$tmp"

  # `forgejo dump -f -` streams the zip to stdout. We run as the `git` user
  # inside the container (standard Forgejo image convention).
  set -o pipefail
  docker exec -u git "$fj" forgejo dump --skip-db -c "$config" --type zip -f - > "$tmp"
  local rc=$?   # captured immediately; see note in dump_postgres_all
  set +o pipefail
  if [ "$rc" -ne 0 ]; then
    rm -f "$tmp"
    return "$rc"
  fi
  mv "$tmp" "$final"
  chmod 0640 "$final"
  local bytes
  bytes=$(stat -c %s "$final" 2>/dev/null || echo 0)
  PHASE_EXTRA[forgejo_dump]="output_file=$final;bytes=$bytes"
  echo "wrote $final ($bytes bytes)"
}

# ── Phase 3: Forgejo Postgres DB dump (authoritative for DB restore) ───────
# Only runs when FORGEJO_DB_NAME is set (otherwise Forgejo is assumed to use
# SQLite, which is covered by the filesystem dump above).
dump_forgejo_db() {
  local db_name="${FORGEJO_DB_NAME:-}"
  if [ -z "$db_name" ]; then
    echo "FORGEJO_DB_NAME unset — skipping Forgejo DB dump (assume SQLite)."
    return "$SKIPPED"
  fi
  local db_container="${FORGEJO_DB_CONTAINER:-scrum4me-postgres}"
  local db_user="${FORGEJO_DB_USER:-scrum4me}"

  if ! docker ps --format '{{.Names}}' | grep -Fqx "$db_container"; then
    echo "DB container '$db_container' not running — skipping Forgejo DB dump."
    return "$SKIPPED"
  fi

  local tmp="$DB_DUMP_DIR/.forgejo-db-$RUN_DATE.sql.gz.tmp"
  local final="$DB_DUMP_DIR/forgejo-db-$RUN_DATE.sql.gz"
  rm -f "$tmp"

  set -o pipefail
  docker exec "$db_container" pg_dump -U "$db_user" --clean --if-exists "$db_name" \
    | gzip -c > "$tmp"
  local rc=$?   # captured immediately; see note in dump_postgres_all
  set +o pipefail

  if [ "$rc" -ne 0 ]; then
    rm -f "$tmp"
    return "$rc"
  fi

  mv "$tmp" "$final"
  chmod 0640 "$final"
  local bytes
  bytes=$(stat -c %s "$final" 2>/dev/null || echo 0)
  PHASE_EXTRA[forgejo_db_dump]="output_file=$final;bytes=$bytes"
  echo "wrote $final ($bytes bytes)"
}

# ── Phases 4 + 5: restic backup to NAS / B2 ────────────────────────────────
# Live Docker datadirs are excluded — dumps (above) are the authoritative
# restore source for Postgres and Forgejo.
RESTIC_BACKUP_PATHS=(
  /etc
  /home/janpeter
  /root
  /opt
  /srv
  /usr/local/bin
  "$DB_DUMP_DIR"
  /srv/ops/backups
)

RESTIC_EXCLUDES=(
  --exclude='**/node_modules'
  --exclude='**/.next/cache'
  --exclude='**/.cache'
  --exclude='**/.git/objects/pack'
  --exclude='/srv/backups/logs'
  --exclude='/tmp'
  --exclude='/var/tmp'
  --exclude='/srv/scrum4me/postgres'   # live Postgres datadir — non-authoritative
  --exclude='/srv/forgejo/data/git'    # live Forgejo git objects — non-authoritative
  --exclude='/srv/forgejo/data/lfs'
  --exclude='/srv/forgejo/data/queues'
)

# restic_backup_to REPO LABEL
# Runs one restic backup against REPO. LABEL ("nas"/"b2") keys the
# PHASE_EXTRA entry and selects the optional B2 upload throttle.
# Parses restic's --json stream to record snapshot_id / files_new /
# data_added for the dashboard. Returns restic's exit code (3 = some
# files unreadable → "degraded" per run_phase).
restic_backup_to() {
  local repo="$1"; local label="$2"
  local extra_args=()
  if [ "$label" = "b2" ] && [ -n "${BACKUP_LIMIT_UPLOAD_KIB:-}" ]; then
    extra_args+=(--limit-upload "$BACKUP_LIMIT_UPLOAD_KIB")
  fi

  # Capture restic JSON output so we can extract the snapshot id.
  local json_out
  json_out=$(mktemp -t "restic-backup-${label}.XXXXXX.json")

  # NOTE(review): an earlier comment here mentioned --no-scan and
  # --skip-if-unchanged, but neither flag is passed below. A plain backup
  # always records a snapshot, which is what the dashboard's daily entry
  # relies on; add flags here only after updating that expectation.
  restic -r "$repo" backup \
    --tag scheduled \
    --tag "host=$(hostname)" \
    --json \
    "${extra_args[@]}" \
    "${RESTIC_EXCLUDES[@]}" \
    "${RESTIC_BACKUP_PATHS[@]}" \
    | tee "$json_out"
  # tee is the last stage; PIPESTATUS[0] is restic's own exit code.
  local rc=${PIPESTATUS[0]}

  # Extract fields from the final summary line (last JSON object with
  # message_type == "summary"); empty on parse failure, never fatal.
  local snap
  snap=$(jq -rs 'map(select(.message_type=="summary")) | last | .snapshot_id // empty' < "$json_out" 2>/dev/null || true)
  local files_new
  files_new=$(jq -rs 'map(select(.message_type=="summary")) | last | .files_new // empty' < "$json_out" 2>/dev/null || true)
  local data_added
  data_added=$(jq -rs 'map(select(.message_type=="summary")) | last | .data_added // empty' < "$json_out" 2>/dev/null || true)

  if [ -n "$snap" ]; then
    PHASE_EXTRA["restic_$label"]="snapshot_id=$snap;files_new=${files_new:-0};data_added_bytes=${data_added:-0}"
  fi
  rm -f "$json_out"
  return "$rc"
}

# ── Phase 6: prune NAS only (B2 is Object Lock — pruning runs off-server) ──
restic_forget_nas() {
  restic -r "$RESTIC_REPO_NAS" forget \
    --keep-daily 7 \
    --keep-weekly 4 \
    --keep-monthly 12 \
    --prune
}

# ── Phase 7: integrity check (light daily; weekly read-data-subset on Sun) ─
is_sunday() {
  [ "$(date +%u)" = "7" ]
}

restic_check_nas() {
  if is_sunday; then
    restic -r "$RESTIC_REPO_NAS" check --read-data-subset=2.5%
  else
    restic -r "$RESTIC_REPO_NAS" check
  fi
}

restic_check_b2() {
  if is_sunday; then
    # On B2 a read-data-subset costs bandwidth + B2 download fees. Keep the
    # subset tiny on Sundays; deeper checks run monthly off-server.
    restic -r "$RESTIC_REPO_B2" check --read-data-subset=1%
  else
    restic -r "$RESTIC_REPO_B2" check
  fi
}

# ── Statusfile writer ──────────────────────────────────────────────────────
# Builds a structured JSON statusfile in $STATUS_DIR/last-run.json
# atomically (write to tmp, then mv). Reads the PHASE_* arrays filled in by
# run_phase and OVERALL_STATUS set by determine_exit_code — call it AFTER
# determine_exit_code.
write_status_json() {
  local tmpfile
  tmpfile=$(mktemp -t "backup-status.XXXXXX.json")

  # Build the phases object incrementally with jq for safe escaping.
  local phases_json='{}'
  local name status exit_code started ended err extra
  local snapshot_id files_new data_added output_file bytes
  for name in "${PHASE_ORDER[@]}"; do
    status="${PHASE_STATUS[$name]:-pending}"
    exit_code="${PHASE_EXIT[$name]:-}"
    started="${PHASE_START[$name]:-}"
    ended="${PHASE_END[$name]:-}"
    err="${PHASE_ERR[$name]:-}"
    extra="${PHASE_EXTRA[$name]:-}"

    snapshot_id="" files_new="" data_added="" output_file="" bytes=""
    if [ -n "$extra" ]; then
      # extra is a semicolon-separated list of key=value pairs
      local pairs pair key val
      IFS=';' read -ra pairs <<< "$extra"
      for pair in "${pairs[@]}"; do
        key="${pair%%=*}"
        val="${pair#*=}"
        case "$key" in
          snapshot_id)      snapshot_id="$val" ;;
          files_new)        files_new="$val" ;;
          data_added_bytes) data_added="$val" ;;
          output_file)      output_file="$val" ;;
          bytes)            bytes="$val" ;;
        esac
      done
    fi

    # exit_code as JSON number when present, null otherwise.
    local exit_arg='null'
    if [ -n "$exit_code" ]; then
      exit_arg="$exit_code"
    fi

    phases_json=$(
      jq -c -n \
        --argjson base "$phases_json" \
        --arg name "$name" \
        --arg status "$status" \
        --argjson exit_code "$exit_arg" \
        --arg started "$started" \
        --arg ended "$ended" \
        --arg err "$err" \
        --arg snapshot_id "$snapshot_id" \
        --arg files_new "$files_new" \
        --arg data_added "$data_added" \
        --arg output_file "$output_file" \
        --arg bytes "$bytes" \
        '
        $base + {
          ($name): ({
            status: $status,
            exit_code: $exit_code,
            started_at: (if $started == "" then null else $started end),
            completed_at: (if $ended == "" then null else $ended end),
            error: (if $err == "" then null else $err end)
          }
          + (if $snapshot_id != "" then { snapshot_id: $snapshot_id } else {} end)
          + (if $files_new != "" then { files_new: ($files_new | tonumber? // null) } else {} end)
          + (if $data_added != "" then { data_added_bytes: ($data_added | tonumber? // null) } else {} end)
          + (if $output_file != "" then { output_file: $output_file } else {} end)
          + (if $bytes != "" then { bytes: ($bytes | tonumber? // null) } else {} end))
        }'
    )
  done

  jq -n \
    --arg overall "$OVERALL_STATUS" \
    --arg started "$STARTED_AT" \
    --arg completed "$(date -Is)" \
    --argjson duration "$SECONDS" \
    --arg host "$(hostname)" \
    --argjson phases "$phases_json" \
    '{
      schema_version: 1,
      overall_status: $overall,
      started_at: $started,
      completed_at: $completed,
      duration_seconds: $duration,
      host: $host,
      phases: $phases
    }' > "$tmpfile"

  mv "$tmpfile" "$STATUS_DIR/last-run.json"
  chmod 0644 "$STATUS_DIR/last-run.json"
}

# ── Outcome aggregation ────────────────────────────────────────────────────
# Sets the globals OVERALL_STATUS and EXIT_CODE from the PHASE_STATUS map:
#   success         → exit 0
#   partial_failure → exit 75 (visible but distinguishable from hard failure)
#   failed          → exit 1
#
# BUGFIX: this used to `echo` the code and be invoked as
# `EXIT_CODE=$(determine_exit_code)` — the command substitution ran it in a
# subshell, so its OVERALL_STATUS assignment was silently lost and the
# statusfile/banner always reported "unknown". It now writes both globals
# directly and must be called without a substitution.
determine_exit_code() {
  local critical_failure=false
  local has_failure=false
  local has_degraded=false
  local name status
  for name in "${PHASE_ORDER[@]}"; do
    status="${PHASE_STATUS[$name]:-pending}"
    case "$status" in
      success|skipped) ;;
      degraded) has_degraded=true ;;
      failed)
        has_failure=true
        case "$name" in
          postgres_dump) critical_failure=true ;;  # losing the DB dump is catastrophic
        esac
        ;;
    esac
  done

  # Losing BOTH restic repos is also catastrophic.
  if [ "${PHASE_STATUS[restic_nas]:-}" = "failed" ] \
    && [ "${PHASE_STATUS[restic_b2]:-}" = "failed" ]; then
    critical_failure=true
  fi

  if [ "$critical_failure" = true ]; then
    OVERALL_STATUS="failed"
    EXIT_CODE=1
  elif [ "$has_failure" = true ] || [ "$has_degraded" = true ]; then
    OVERALL_STATUS="partial_failure"
    EXIT_CODE=75
  else
    OVERALL_STATUS="success"
    EXIT_CODE=0
  fi
}

# ── Main sequence ──────────────────────────────────────────────────────────
run_phase postgres_dump   dump_postgres_all
run_phase forgejo_dump    dump_forgejo
run_phase forgejo_db_dump dump_forgejo_db
run_phase restic_nas      restic_backup_to "$RESTIC_REPO_NAS" nas
run_phase restic_b2       restic_backup_to "$RESTIC_REPO_B2" b2
run_phase forget_nas      restic_forget_nas
run_phase check_nas       restic_check_nas
run_phase check_b2        restic_check_b2

# Runs in THIS shell (no command substitution) so OVERALL_STATUS and
# EXIT_CODE are visible to write_status_json and the banner below.
determine_exit_code
write_status_json

echo ""
echo "════════════════════════════════════════════════════════════════"
echo " Server backup — finished $(date -Is)"
echo " Overall status: $OVERALL_STATUS (exit $EXIT_CODE)"
echo " Duration: ${SECONDS}s"
echo " Status file: $STATUS_DIR/last-run.json"
echo " Log file: $LOG_FILE"
echo "════════════════════════════════════════════════════════════════"

exit "$EXIT_CODE"