Ops-dashboard/deploy/server-backup/server-backup.sh

#!/usr/bin/env bash
# Daily server-wide backup: dumps databases, runs restic to NAS + B2,
# writes a structured statusfile that the ops-dashboard can read.
#
# Install:
#   cp deploy/server-backup/server-backup.sh /srv/backups/scripts/server-backup.sh
#   chmod 0750 /srv/backups/scripts/server-backup.sh
#   chown root:root /srv/backups/scripts/server-backup.sh
#
# Requires: bash, jq, flock, restic, docker, gzip. See runbook for setup.

# Everything this script creates is owner-only by default; files that need
# wider permissions are chmod'ed explicitly after they are written.
umask 077
# Unset variables are errors and a pipeline fails when any stage fails.
# errexit (-e) is not enabled here.
set -u
set -o pipefail

# ── Configuration ──────────────────────────────────────────────────────────
# Each location can be overridden from the environment; the values below are
# only applied when the variable is unset or empty.
: "${STATUS_DIR:=/srv/backups/status}"
: "${LOG_DIR:=/srv/backups/logs}"
: "${DB_DUMP_DIR:=/var/backups/databases}"
: "${RESTIC_PASSWORD_FILE_PATH:=/etc/restic-backup.password}"
: "${LOCKFILE:=/run/server-backup.lock}"

RUN_DATE=$(date +%F)      # YYYY-MM-DD, used in dump and log file names
STARTED_AT=$(date -Is)    # ISO-8601 start timestamp (banner + statusfile)
SECONDS=0                 # restart bash's elapsed-seconds counter

# Canonical phase sequence. write_status_json and determine_exit_code both
# iterate over this list, so it has to stay in sync with the phases that the
# main sequence actually runs.
PHASE_ORDER=(
  postgres_dump forgejo_dump forgejo_db_dump   # dumps
  restic_nas restic_b2                         # snapshots
  forget_nas                                   # retention
  check_nas check_b2                           # integrity checks
)

# Per-phase bookkeeping, keyed by phase name.
declare -A PHASE_STATUS   # success | degraded | skipped | failed
declare -A PHASE_EXIT     # exit code of the phase function
declare -A PHASE_START    # timestamp taken before the phase ran
declare -A PHASE_END      # timestamp taken after the phase ran
declare -A PHASE_ERR      # short tail of the phase output on failure
declare -A PHASE_EXTRA    # "key=value;key=value" details for the statusfile

# Overall verdict; stays "unknown" until determine_exit_code has run.
OVERALL_STATUS="unknown"

# ── Single-instance lock ───────────────────────────────────────────────────
# Take an exclusive, non-blocking flock on fd 9. The descriptor is left open
# for the rest of the script, so the lock is held until the process ends.
#
# flock is probed right here because it is used before the general tooling
# check runs: without this probe a missing flock binary makes `flock -n 9`
# fail with "command not found", which would be misreported below as
# "another server-backup is already running" (exit 75).
if ! command -v flock >/dev/null 2>&1; then
  echo "ERROR: required tool 'flock' not on PATH" >&2
  exit 1
fi
exec 9>"$LOCKFILE" || { echo "ERROR: cannot open lockfile $LOCKFILE" >&2; exit 1; }
if ! flock -n 9; then
  echo "ERROR: another server-backup is already running (lock $LOCKFILE held)" >&2
  exit 75
fi

# ── Env + secret loading ───────────────────────────────────────────────────
# Under systemd the unit's EnvironmentFile=/etc/restic-backup.env has already
# populated the environment. For a manual run (RESTIC_REPO_NAS still empty) the
# same file is sourced here, with auto-export switched on so that every
# variable it defines is exported to child processes.
if [[ -z "${RESTIC_REPO_NAS:-}" && -r /etc/restic-backup.env ]]; then
  set -a
  # shellcheck disable=SC1091
  source /etc/restic-backup.env
  set +a
fi

# Both repositories are mandatory; abort with a pointer to the env file.
: "${RESTIC_REPO_NAS:?RESTIC_REPO_NAS not set (see /etc/restic-backup.env)}"
: "${RESTIC_REPO_B2:?RESTIC_REPO_B2 not set (see /etc/restic-backup.env)}"

# The repository password is handed to restic via RESTIC_PASSWORD_FILE.
[[ -r "$RESTIC_PASSWORD_FILE_PATH" ]] || {
  echo "ERROR: restic password file $RESTIC_PASSWORD_FILE_PATH not readable" >&2
  exit 1
}
RESTIC_PASSWORD_FILE="$RESTIC_PASSWORD_FILE_PATH"
export RESTIC_PASSWORD_FILE

# Required tooling — stop at the first executable that cannot be found on PATH.
for tool in jq restic docker gzip flock; do
  command -v "$tool" >/dev/null 2>&1 && continue
  echo "ERROR: required tool '$tool' not on PATH" >&2
  exit 1
done

# ── Logging ────────────────────────────────────────────────────────────────
# Create the working directories and verify the log file is writable *before*
# stdout/stderr are rerouted through tee. If tee cannot open the log it exits,
# and every later write to the pipe would kill the script without a message.
if ! mkdir -p "$LOG_DIR" "$STATUS_DIR" "$DB_DUMP_DIR"; then
  echo "ERROR: cannot create $LOG_DIR, $STATUS_DIR or $DB_DUMP_DIR" >&2
  exit 1
fi
chmod 0750 "$LOG_DIR" "$STATUS_DIR"
LOG_FILE="$LOG_DIR/server-backup-$RUN_DATE.log"
# Probe in a sub-shell so a failed redirection cannot affect this shell.
if ! ( : >> "$LOG_FILE" ) 2>/dev/null; then
  echo "ERROR: cannot write log file $LOG_FILE" >&2
  exit 1
fi
# From here on, everything written to stdout or stderr is appended to LOG_FILE
# and also passed through to the original stdout (the journal under systemd).
exec > >(tee -a "$LOG_FILE") 2>&1

echo "════════════════════════════════════════════════════════════════"
echo " Server backup — started $STARTED_AT"
echo " Host: $(hostname)"
echo " NAS repo: $RESTIC_REPO_NAS"
echo " B2 repo:  $RESTIC_REPO_B2"
echo "════════════════════════════════════════════════════════════════"

# ── Phase runner ───────────────────────────────────────────────────────────
# Exit-code convention shared by run_phase and the phase functions: a phase
# returns $SKIPPED to mark itself "not applicable"; the overall outcome treats
# that as success.
SKIPPED=99

# run_phase NAME CMD [ARGS...]
#
# Runs CMD in a sub-shell, mirrors its stdout+stderr to our stdout, and records
# the outcome under key NAME:
#   PHASE_START / PHASE_END  timestamps around the run
#   PHASE_EXIT               exit code of CMD
#   PHASE_STATUS             0 → success, 3 → degraded, $SKIPPED → skipped,
#                            anything else → failed
#   PHASE_ERR                last non-empty output lines (max 500 bytes) when
#                            the phase neither succeeded nor was skipped
#   PHASE_EXTRA              whatever CMD stored in PHASE_EXTRA[NAME]
#
# Because CMD runs in a sub-shell, any PHASE_EXTRA assignment it makes would
# vanish with that sub-shell. The value is therefore written to a temp file
# inside the sub-shell and read back here.
#
# Always returns 0; the phase result is reported through the arrays only.
run_phase() {
  local name="$1"; shift
  local phase_buf="" extra_buf=""

  echo ""
  echo "─── phase: $name ─── $(date -Is)"
  PHASE_START[$name]=$(date -Is)

  if ! phase_buf=$(mktemp -t "backup-phase-${name}.XXXXXX") \
     || ! extra_buf=$(mktemp -t "backup-extra-${name}.XXXXXX"); then
    # Without the buffers the phase cannot be captured; record it as failed
    # rather than running it blind.
    if [ -n "$phase_buf" ]; then
      rm -f -- "$phase_buf"
    fi
    PHASE_EXIT[$name]=1
    PHASE_STATUS[$name]=failed
    PHASE_ERR[$name]="could not create temporary files for phase output"
    PHASE_END[$name]=$(date -Is)
    echo "─── end $name (exit=1, status=failed)"
    return 0
  fi

  local rc=0
  # Run in a sub-shell so set -e / exit inside callees doesn't kill us.
  (
    "$@"
    rc=$?
    # Hand the phase's PHASE_EXTRA entry back to the parent shell.
    printf '%s' "${PHASE_EXTRA[$name]:-}" > "$extra_buf"
    exit "$rc"
  ) 2>&1 | tee "$phase_buf"
  rc=${PIPESTATUS[0]}

  if [ -s "$extra_buf" ]; then
    PHASE_EXTRA[$name]=$(<"$extra_buf")
  fi

  PHASE_EXIT[$name]=$rc
  case "$rc" in
    0)          PHASE_STATUS[$name]=success ;;
    3)          PHASE_STATUS[$name]=degraded ;;   # restic: snapshot created but some files unreadable
    "$SKIPPED") PHASE_STATUS[$name]=skipped ;;    # our convention for "not applicable"
    *)          PHASE_STATUS[$name]=failed ;;
  esac

  if [ "$rc" -ne 0 ] && [ "$rc" -ne "$SKIPPED" ] && [ -s "$phase_buf" ]; then
    # Keep the last few non-empty lines as a compact, single-line summary.
    PHASE_ERR[$name]=$(grep -v '^[[:space:]]*$' "$phase_buf" \
      | tail -n 5 | tr '\n' ' ' | head -c 500)
  fi

  PHASE_END[$name]=$(date -Is)
  rm -f -- "$phase_buf" "$extra_buf"
  echo "─── end $name (exit=$rc, status=${PHASE_STATUS[$name]})"
}

# ── Phase 1: pg_dumpall (Scrum4Me Postgres cluster) ────────────────────────
# Streams `pg_dumpall` from the Postgres container through gzip into
# $DB_DUMP_DIR/postgres-$RUN_DATE.sql.gz (written to a hidden .tmp file first
# and renamed only after the dump succeeded).
#
# Environment: PG_CONTAINER (default scrum4me-postgres),
#              PG_DUMPALL_USER (default scrum4me).
# Globals:     reads DB_DUMP_DIR, RUN_DATE; writes PHASE_EXTRA[postgres_dump].
# Returns:     0 on success; 1 if the container is not running or the dump
#              could not be moved into place; otherwise the exit status of the
#              failing pipeline stage.
dump_postgres_all() {
  local pg_container="${PG_CONTAINER:-scrum4me-postgres}"
  local pg_user="${PG_DUMPALL_USER:-scrum4me}"

  # Capture the container list first instead of piping into `grep -q`: under
  # pipefail an early-exiting grep can SIGPIPE `docker ps` and turn a match
  # into a false "not running". -F keeps the name from being read as a regex.
  local running
  if ! running=$(docker ps --format '{{.Names}}'); then
    echo "Cannot list running containers (docker ps failed) — cannot continue."
    return 1
  fi
  if ! grep -Fqx -- "$pg_container" <<< "$running"; then
    echo "Postgres container '$pg_container' not running — cannot continue."
    return 1
  fi

  local tmp="$DB_DUMP_DIR/.postgres-$RUN_DATE.sql.gz.tmp"
  local final="$DB_DUMP_DIR/postgres-$RUN_DATE.sql.gz"
  rm -f -- "$tmp"

  docker exec "$pg_container" pg_dumpall -U "$pg_user" --clean --if-exists \
    | gzip -c > "$tmp"
  # Inspect every stage explicitly so the result does not depend on (or
  # change) the shell's pipefail setting. The right-most failure wins.
  local -a stage_rc=("${PIPESTATUS[@]}")
  local rc=0 stage
  for stage in "${stage_rc[@]}"; do
    if [ "$stage" -ne 0 ]; then
      rc=$stage
    fi
  done

  if [ "$rc" -ne 0 ]; then
    rm -f -- "$tmp"
    return "$rc"
  fi

  # A failed rename must fail the phase: otherwise no dump exists for today
  # while the phase reports success.
  if ! mv -- "$tmp" "$final"; then
    rm -f -- "$tmp"
    return 1
  fi
  chmod 0640 "$final"
  local bytes
  bytes=$(stat -c %s "$final" 2>/dev/null || echo 0)
  PHASE_EXTRA[postgres_dump]="output_file=$final;bytes=$bytes"
  echo "wrote $final ($bytes bytes)"
}

# ── Phase 2: Forgejo dump (filesystem + repos) ─────────────────────────────
# Writes `forgejo dump` output to $DB_DUMP_DIR/forgejo-$RUN_DATE.zip (via a
# hidden .tmp file that is renamed only after the dump succeeded).
#
# Environment: FORGEJO_CONTAINER (phase is skipped when unset/empty),
#              FORGEJO_CONFIG (default /data/gitea/conf/app.ini).
# Globals:     reads DB_DUMP_DIR, RUN_DATE, SKIPPED; writes
#              PHASE_EXTRA[forgejo_dump].
# Returns:     0 on success; $SKIPPED when Forgejo is not configured or its
#              container is not running; 1 when Docker cannot be queried or
#              the dump could not be moved into place; otherwise the exit
#              status of `docker exec`.
dump_forgejo() {
  local fj="${FORGEJO_CONTAINER:-}"
  if [ -z "$fj" ]; then
    echo "FORGEJO_CONTAINER unset — skipping Forgejo dump."
    return "$SKIPPED"
  fi

  # Capture the container list first instead of piping into `grep -q`: under
  # pipefail an early-exiting grep can SIGPIPE `docker ps` and turn a match
  # into a false "not running". -F keeps the name from being read as a regex.
  # A failing `docker ps` is a real error, not a reason to skip.
  local running
  if ! running=$(docker ps --format '{{.Names}}'); then
    echo "Cannot list running containers (docker ps failed)."
    return 1
  fi
  if ! grep -Fqx -- "$fj" <<< "$running"; then
    echo "Forgejo container '$fj' not running — skipping."
    return "$SKIPPED"
  fi

  local config="${FORGEJO_CONFIG:-/data/gitea/conf/app.ini}"
  local tmp="$DB_DUMP_DIR/.forgejo-$RUN_DATE.zip.tmp"
  local final="$DB_DUMP_DIR/forgejo-$RUN_DATE.zip"
  rm -f -- "$tmp"

  # `forgejo dump -f -` streams the zip to stdout. We run as the `git` user
  # inside the container (standard Forgejo image convention).
  #
  # NB: Forgejo 11.x has NO `--skip-db` flag (removed after the Gitea fork),
  # so the database ends up in the zip as well. Our separate
  # `forgejo_db_dump` phase remains the authoritative restore source — the
  # in-zip DB dump is a redundant copy.
  docker exec -u git "$fj" forgejo dump -c "$config" --type zip -f - > "$tmp"
  local rc=$?

  if [ "$rc" -ne 0 ]; then
    rm -f -- "$tmp"
    return "$rc"
  fi

  # A failed rename must fail the phase: otherwise no dump exists for today
  # while the phase reports success.
  if ! mv -- "$tmp" "$final"; then
    rm -f -- "$tmp"
    return 1
  fi
  chmod 0640 "$final"
  local bytes
  bytes=$(stat -c %s "$final" 2>/dev/null || echo 0)
  PHASE_EXTRA[forgejo_dump]="output_file=$final;bytes=$bytes"
  echo "wrote $final ($bytes bytes)"
}

# ── Phase 3: Forgejo Postgres DB dump (authoritative for DB restore) ───────
# Streams `pg_dump` of the Forgejo database through gzip into
# $DB_DUMP_DIR/forgejo-db-$RUN_DATE.sql.gz (via a hidden .tmp file that is
# renamed only after the dump succeeded).
#
# Environment: FORGEJO_DB_NAME (phase is skipped when unset/empty),
#              FORGEJO_DB_CONTAINER (default scrum4me-postgres),
#              FORGEJO_DB_USER (default scrum4me).
# Globals:     reads DB_DUMP_DIR, RUN_DATE, SKIPPED; writes
#              PHASE_EXTRA[forgejo_db_dump].
# Returns:     0 on success; $SKIPPED when no DB name is configured or the DB
#              container is not running; 1 when Docker cannot be queried or
#              the dump could not be moved into place; otherwise the exit
#              status of the failing pipeline stage.
dump_forgejo_db() {
  local db_name="${FORGEJO_DB_NAME:-}"
  if [ -z "$db_name" ]; then
    echo "FORGEJO_DB_NAME unset — skipping Forgejo DB dump (assume SQLite)."
    return "$SKIPPED"
  fi
  local db_container="${FORGEJO_DB_CONTAINER:-scrum4me-postgres}"
  local db_user="${FORGEJO_DB_USER:-scrum4me}"

  # Capture the container list first instead of piping into `grep -q`: under
  # pipefail an early-exiting grep can SIGPIPE `docker ps` and turn a match
  # into a false "not running". -F keeps the name from being read as a regex.
  # A failing `docker ps` is a real error, not a reason to skip.
  local running
  if ! running=$(docker ps --format '{{.Names}}'); then
    echo "Cannot list running containers (docker ps failed)."
    return 1
  fi
  if ! grep -Fqx -- "$db_container" <<< "$running"; then
    echo "DB container '$db_container' not running — skipping Forgejo DB dump."
    return "$SKIPPED"
  fi

  local tmp="$DB_DUMP_DIR/.forgejo-db-$RUN_DATE.sql.gz.tmp"
  local final="$DB_DUMP_DIR/forgejo-db-$RUN_DATE.sql.gz"
  rm -f -- "$tmp"

  docker exec "$db_container" pg_dump -U "$db_user" --clean --if-exists "$db_name" \
    | gzip -c > "$tmp"
  # Inspect every stage explicitly so the result does not depend on (or
  # change) the shell's pipefail setting. The right-most failure wins.
  local -a stage_rc=("${PIPESTATUS[@]}")
  local rc=0 stage
  for stage in "${stage_rc[@]}"; do
    if [ "$stage" -ne 0 ]; then
      rc=$stage
    fi
  done

  if [ "$rc" -ne 0 ]; then
    rm -f -- "$tmp"
    return "$rc"
  fi

  # A failed rename must fail the phase: otherwise no dump exists for today
  # while the phase reports success.
  if ! mv -- "$tmp" "$final"; then
    rm -f -- "$tmp"
    return 1
  fi
  chmod 0640 "$final"
  local bytes
  bytes=$(stat -c %s "$final" 2>/dev/null || echo 0)
  PHASE_EXTRA[forgejo_db_dump]="output_file=$final;bytes=$bytes"
  echo "wrote $final ($bytes bytes)"
}

# ── Phases 4 + 5: restic backup to NAS / B2 ────────────────────────────────
# Directory trees that go into every snapshot. Live Docker datadirs are left
# out through the exclude list below; the dumps produced by the dump phases are
# the authoritative restore source for Postgres and Forgejo.
RESTIC_BACKUP_PATHS=(/etc /home/janpeter /root /opt /srv /usr/local/bin)
RESTIC_BACKUP_PATHS+=("$DB_DUMP_DIR")
RESTIC_BACKUP_PATHS+=(/srv/ops/backups)

# Exclude patterns, written once without the flag; each entry is turned into
# a `--exclude=<pattern>` argument below.
_exclude_patterns=(
  # caches and other reproducible artefacts
  '**/node_modules'
  '**/.next/cache'
  '**/.cache'
  '**/.git/objects/pack'
  # logs and scratch space
  '/srv/backups/logs'
  '/tmp'
  '/var/tmp'
  # live service data — non-authoritative, covered by the dumps
  '/srv/scrum4me/postgres'     # live Postgres datadir
  '/srv/forgejo/data/git'      # live Forgejo git objects
  '/srv/forgejo/data/lfs'
  '/srv/forgejo/data/queues'
)
RESTIC_EXCLUDES=("${_exclude_patterns[@]/#/--exclude=}")
unset _exclude_patterns

# restic_backup_to REPO LABEL
#
# Runs `restic backup` of RESTIC_BACKUP_PATHS (minus RESTIC_EXCLUDES) into
# REPO. LABEL is "nas" or "b2"; for "b2" an optional upload limit is taken
# from BACKUP_LIMIT_UPLOAD_KIB.
#
# Globals: reads RESTIC_BACKUP_PATHS, RESTIC_EXCLUDES; on a usable summary
#          writes PHASE_EXTRA[restic_<LABEL>] as
#          "snapshot_id=…;files_new=…;data_added_bytes=…".
# Returns: restic's own exit status, or 1 if no temp file could be created.
restic_backup_to() {
  local repo="$1" label="$2"
  local -a extra_args=()
  if [ "$label" = "b2" ] && [ -n "${BACKUP_LIMIT_UPLOAD_KIB:-}" ]; then
    extra_args+=(--limit-upload "$BACKUP_LIMIT_UPLOAD_KIB")
  fi

  # restic's --json output is mirrored to stdout and also kept in a temp file
  # so the snapshot id can be extracted once the run has finished.
  local json_out
  if ! json_out=$(mktemp -t "restic-backup-${label}.XXXXXX"); then
    echo "ERROR: cannot create temp file for restic output" >&2
    return 1
  fi

  # ${arr[@]+"${arr[@]}"} expands to nothing for an empty array without
  # tripping `set -u` on bash versions before 4.4.
  restic -r "$repo" backup \
    --tag scheduled \
    --tag "host=$(hostname)" \
    --json \
    ${extra_args[@]+"${extra_args[@]}"} \
    "${RESTIC_EXCLUDES[@]}" \
    "${RESTIC_BACKUP_PATHS[@]}" \
    | tee "$json_out"
  local rc=${PIPESTATUS[0]}

  # Parse line by line (fromjson?) so a stray non-JSON line cannot hide the
  # summary; keep the last summary object that carries a snapshot id.
  local summary
  summary=$(
    jq -rR '
      fromjson?
      | select(type == "object"
               and .message_type == "summary"
               and (.snapshot_id // "") != "")
      | "snapshot_id=\(.snapshot_id);files_new=\(.files_new // 0);data_added_bytes=\(.data_added // 0)"
    ' "$json_out" 2>/dev/null | tail -n 1
  )
  if [ -n "$summary" ]; then
    PHASE_EXTRA["restic_$label"]=$summary
  fi

  rm -f -- "$json_out"
  return "$rc"
}

# ── Phase 6: prune NAS only (B2 is Object Lock — pruning runs off-server) ──
# Applies the retention policy (7 daily, 4 weekly, 12 monthly) to the NAS
# repository and prunes unreferenced data in the same run. Returns restic's
# exit status.
restic_forget_nas() {
  local -a retention=(
    --keep-daily 7
    --keep-weekly 4
    --keep-monthly 12
  )
  restic -r "$RESTIC_REPO_NAS" forget "${retention[@]}" --prune
}

# ── Phase 7: integrity check (light daily; weekly read-data-subset on Sun) ─
# Succeeds (status 0) only when today's ISO weekday number is 7, i.e. Sunday.
is_sunday() {
  local weekday
  weekday=$(date +%u)
  [[ "$weekday" == 7 ]]
}

# Integrity check of the NAS repository: a plain `restic check` on weekdays,
# plus a 2.5% read-data subset on Sundays. Returns restic's exit status.
restic_check_nas() {
  local -a check_args=(check)
  if is_sunday; then
    check_args+=(--read-data-subset=2.5%)
  fi
  restic -r "$RESTIC_REPO_NAS" "${check_args[@]}"
}

# Integrity check of the B2 repository. Returns restic's exit status.
restic_check_b2() {
  local -a check_args=(check)
  if is_sunday; then
    # Reading data back from B2 costs bandwidth and download fees, so the
    # Sunday subset is kept tiny; deeper checks run monthly off-server.
    check_args+=(--read-data-subset=1%)
  fi
  restic -r "$RESTIC_REPO_B2" "${check_args[@]}"
}

# ── Statusfile writer ──────────────────────────────────────────────────────
# Builds the structured JSON statusfile $STATUS_DIR/last-run.json.
#
# The document is assembled in a temp file *inside STATUS_DIR* and then
# renamed, so the final mv stays on one filesystem and readers never see a
# half-written file. If encoding fails at any point the previous statusfile
# is left untouched.
#
# Globals: reads PHASE_ORDER, PHASE_STATUS, PHASE_EXIT, PHASE_START,
#          PHASE_END, PHASE_ERR, PHASE_EXTRA, OVERALL_STATUS, STARTED_AT,
#          SECONDS, STATUS_DIR.
# Returns: 0 when the statusfile was replaced, 1 otherwise.
write_status_json() {
  local target="$STATUS_DIR/last-run.json"
  local tmpfile
  if ! tmpfile=$(mktemp "$STATUS_DIR/.last-run.json.XXXXXX"); then
    echo "ERROR: cannot create temp file in $STATUS_DIR" >&2
    return 1
  fi

  # Build the phases object incrementally with jq for safe escaping.
  local phases_json='{}' next_json
  local name status exit_code started ended err extra
  local snapshot_id files_new data_added output_file bytes
  local pair key val exit_arg
  local -a pairs
  for name in "${PHASE_ORDER[@]}"; do
    status="${PHASE_STATUS[$name]:-pending}"
    exit_code="${PHASE_EXIT[$name]:-}"
    started="${PHASE_START[$name]:-}"
    ended="${PHASE_END[$name]:-}"
    err="${PHASE_ERR[$name]:-}"
    extra="${PHASE_EXTRA[$name]:-}"

    snapshot_id=""
    files_new=""
    data_added=""
    output_file=""
    bytes=""
    if [ -n "$extra" ]; then
      # extra is a semicolon-separated list of key=value pairs; only the
      # first '=' splits, so values may themselves contain '='.
      IFS=';' read -ra pairs <<< "$extra"
      for pair in "${pairs[@]}"; do
        key="${pair%%=*}"
        val="${pair#*=}"
        case "$key" in
          snapshot_id) snapshot_id="$val" ;;
          files_new) files_new="$val" ;;
          data_added_bytes) data_added="$val" ;;
          output_file) output_file="$val" ;;
          bytes) bytes="$val" ;;
        esac
      done
    fi

    # exit_code as JSON number when present, null otherwise.
    exit_arg='null'
    if [ -n "$exit_code" ]; then
      exit_arg="$exit_code"
    fi

    if ! next_json=$(
      jq -c -n \
        --argjson base "$phases_json" \
        --arg name "$name" \
        --arg status "$status" \
        --argjson exit_code "$exit_arg" \
        --arg started "$started" \
        --arg ended "$ended" \
        --arg err "$err" \
        --arg snapshot_id "$snapshot_id" \
        --arg files_new "$files_new" \
        --arg data_added "$data_added" \
        --arg output_file "$output_file" \
        --arg bytes "$bytes" \
        '
        $base + {
          ($name): ({
            status: $status,
            exit_code: $exit_code,
            started_at: (if $started == "" then null else $started end),
            completed_at: (if $ended == "" then null else $ended end),
            error: (if $err == "" then null else $err end)
          }
          + (if $snapshot_id != "" then { snapshot_id: $snapshot_id } else {} end)
          + (if $files_new != ""   then { files_new: ($files_new | tonumber? // null) } else {} end)
          + (if $data_added != ""  then { data_added_bytes: ($data_added | tonumber? // null) } else {} end)
          + (if $output_file != "" then { output_file: $output_file } else {} end)
          + (if $bytes != ""       then { bytes: ($bytes | tonumber? // null) } else {} end))
        }'
    ); then
      echo "ERROR: could not encode status of phase '$name'" >&2
      rm -f -- "$tmpfile"
      return 1
    fi
    phases_json=$next_json
  done

  if ! jq -n \
    --arg overall "$OVERALL_STATUS" \
    --arg started "$STARTED_AT" \
    --arg completed "$(date -Is)" \
    --argjson duration "$SECONDS" \
    --arg host "$(hostname)" \
    --argjson phases "$phases_json" \
    '{
      schema_version: 1,
      overall_status: $overall,
      started_at: $started,
      completed_at: $completed,
      duration_seconds: $duration,
      host: $host,
      phases: $phases
    }' > "$tmpfile"; then
    echo "ERROR: could not render $target" >&2
    rm -f -- "$tmpfile"
    return 1
  fi

  # Set the final mode before the rename so the file is never visible with
  # the restrictive mktemp permissions.
  if ! chmod 0644 "$tmpfile" || ! mv -f -- "$tmpfile" "$target"; then
    echo "ERROR: could not install $target" >&2
    rm -f -- "$tmpfile"
    return 1
  fi
}

# ── Outcome aggregation ────────────────────────────────────────────────────
# success         → exit 0
# partial_failure → exit 75 (visible but distinguishable from hard failure)
# failed          → exit 1
#
# Walks PHASE_ORDER and sets the globals OVERALL_STATUS and EXIT_CODE.
#   - success / skipped phases are fine
#   - a degraded phase yields partial_failure
#   - a failed phase yields partial_failure, unless it is critical
#   - a phase with no recorded status (it never ran) or an unrecognised status
#     is treated like a failure, so a run can never be reported as "success"
#     while a listed phase is missing
# Critical: postgres_dump not succeeding, or both restic backups failing.
determine_exit_code() {
  local critical_failure=false
  local has_failure=false
  local has_degraded=false
  local name status

  for name in "${PHASE_ORDER[@]}"; do
    status="${PHASE_STATUS[$name]:-pending}"
    case "$status" in
      success|skipped) ;;
      degraded) has_degraded=true ;;
      *)
        # failed, pending (never ran) or unknown
        has_failure=true
        case "$name" in
          postgres_dump) critical_failure=true ;;   # losing the DB dump is catastrophic
        esac
        ;;
    esac
  done

  # Losing BOTH restic repos is also catastrophic.
  if [ "${PHASE_STATUS[restic_nas]:-}" = "failed" ] \
     && [ "${PHASE_STATUS[restic_b2]:-}" = "failed" ]; then
    critical_failure=true
  fi

  # NB: this function is called directly (not via $(...)); otherwise the
  # OVERALL_STATUS assignments would be lost in the subshell —
  # write_status_json would then write "unknown", and so would the final
  # banner.
  if [ "$critical_failure" = true ]; then
    OVERALL_STATUS="failed"
    EXIT_CODE=1
  elif [ "$has_failure" = true ] || [ "$has_degraded" = true ]; then
    OVERALL_STATUS="partial_failure"
    EXIT_CODE=75
  else
    OVERALL_STATUS="success"
    EXIT_CODE=0
  fi
}

# ── Main sequence ──────────────────────────────────────────────────────────
# Phases run strictly in this order; each run_phase call records its own
# result, so a failing phase never stops the ones after it.
run_phase postgres_dump   dump_postgres_all
run_phase forgejo_dump    dump_forgejo
run_phase forgejo_db_dump dump_forgejo_db
run_phase restic_nas      restic_backup_to "$RESTIC_REPO_NAS" nas
run_phase restic_b2       restic_backup_to "$RESTIC_REPO_B2"  b2
run_phase forget_nas      restic_forget_nas
run_phase check_nas       restic_check_nas
run_phase check_b2        restic_check_b2

determine_exit_code   # sets OVERALL_STATUS + EXIT_CODE in this shell

# A statusfile that could not be written leaves stale data for whoever reads
# it, so an otherwise clean run must not exit 0 in that case.
if ! write_status_json; then
  echo "ERROR: could not write status file $STATUS_DIR/last-run.json" >&2
  if [ "$EXIT_CODE" -eq 0 ]; then
    OVERALL_STATUS="partial_failure"
    EXIT_CODE=75
  fi
fi

echo ""
echo "════════════════════════════════════════════════════════════════"
echo " Server backup — finished $(date -Is)"
echo " Overall status: $OVERALL_STATUS (exit $EXIT_CODE)"
echo " Duration: ${SECONDS}s"
echo " Status file: $STATUS_DIR/last-run.json"
echo " Log file:    $LOG_FILE"
echo "════════════════════════════════════════════════════════════════"

exit "$EXIT_CODE"