This prompt is designed for DevOps engineers. It generates professional, reliable automated backup scripts for different databases and system types. By analyzing the backup target's technical characteristics, storage requirements, and recovery strategy, it produces a script solution with complete backup logic, error handling, and monitoring/alerting. It covers backup scenarios for the major databases and file systems, ensures the scripts are reliable and secure enough for production deployment, and helps users quickly build an enterprise-grade data protection solution.
This solution targets large-scale MySQL production environments and uses hot physical backups (Percona XtraBackup 8.0 series). It supports full and incremental backups, parallel compression, upload to object storage with server-side encryption, binary-log archiving, local retention, and monitoring/alerting.
The script is written in Bash and combines awscli, xtrabackup, pigz, and related tools. No sensitive information is hard-coded: access is secured through a client credentials file plus IAM roles, and data at rest is encrypted with KMS.
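The tools above must be present on the backup host. A minimal installation sketch, assuming a Debian/Ubuntu host with the Percona APT repository already configured (package names vary by distribution and version):

# Percona XtraBackup 8.0, parallel gzip, AWS CLI and jq
apt-get update
apt-get install -y percona-xtrabackup-80 pigz awscli jq
# Verify the binaries the script expects
xtrabackup --version && pigz -V && aws --version && jq --version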
Performance-related parameters (parallelism, optional IOPS throttling, and I/O/CPU priority for compression and upload) are configurable; tune them to the host's CPU and I/O capacity before production rollout.
The following script is a complete, executable example. Save it as /usr/local/bin/mysql_backup.sh and make it executable. The default example uses AWS S3; for other S3-compatible providers, configure the corresponding endpoint and credentials.
#!/usr/bin/env bash
# mysql_backup.sh
# Reliable MySQL Hot Backup to Cloud Storage (XtraBackup + S3)
# Author: DevOps Backup Expert
# License: MIT
set -Eeuo pipefail
# ========== User configuration (via environment variables or edit here) ==========
# MySQL
MYSQL_DEFAULTS_FILE="${MYSQL_DEFAULTS_FILE:-/root/.my.cnf}" # [client] user/password/socket
MYSQL_CNF="${MYSQL_CNF:-/etc/my.cnf}" # main MySQL config path (parsed by xtrabackup)
MYSQL_CLIENT="${MYSQL_CLIENT:-mysql}"
# Backup policy
BACKUP_ROOT="${BACKUP_ROOT:-/var/backups/mysql}"
MODE="${MODE:-manual}" # manual|full|incr
FULL_PREPARE="${FULL_PREPARE:-true}" # prepare the full backup right after taking it (speeds up restore)
THREADS="${THREADS:-4}" # xtrabackup --parallel
THROTTLE_IOPS="${THROTTLE_IOPS:-}" # xtrabackup --throttle, may be empty
IONICE_CLASS="${IONICE_CLASS:-2}" # low IO priority for compression/upload
IONICE_PRIO="${IONICE_PRIO:-7}"
NICE_PRIO="${NICE_PRIO:-10}"
RETAIN_LOCAL_DAYS="${RETAIN_LOCAL_DAYS:-3}"
# Cloud storage (AWS S3 example)
S3_BUCKET="${S3_BUCKET:-s3://your-bucket-name}"
S3_PREFIX="${S3_PREFIX:-backups/mysql}"
S3_STORAGE_CLASS="${S3_STORAGE_CLASS:-STANDARD_IA}" # or INTELLIGENT_TIERING
S3_SSE="${S3_SSE:-aws:kms}" # aws:kms or AES256
AWS_KMS_KEY_ID="${AWS_KMS_KEY_ID:-}" # may be empty to use the default KMS key
AWS_CLI="${AWS_CLI:-aws}"
AWS_EXTRA_OPTS="${AWS_EXTRA_OPTS:-}" # e.g. --endpoint-url https://minio.example.com
# Monitoring / alerting (optional)
MONITORING_WEBHOOK="${MONITORING_WEBHOOK:-}" # e.g. https://ops.example.com/backup-webhook
PUSHGATEWAY_URL="${PUSHGATEWAY_URL:-}" # e.g. http://pushgateway:9091/metrics/job/mysql_backup
# Miscellaneous
DRYRUN="${DRYRUN:-false}"
LOG_DIR="${LOG_DIR:-/var/log/mysql-backup}"
UMASK_VALUE="${UMASK_VALUE:-077}" # strict permissions
HOSTNAME_SHORT="$(hostname -s)"
DATE_TAG="$(date +%Y%m%d-%H%M%S)"
# ========== Utility functions ==========
# Log to stderr (and the log file) so that functions whose stdout is captured
# via command substitution are not polluted by log lines.
log() { printf '%s %s\n' "$(date +'%F %T')" "$*" | tee -a "${LOG_DIR}/backup-${DATE_TAG}.log" >&2; }
fail() { log "ERROR: $*"; exit 1; }
run() {
log "+ $*"
if [[ "${DRYRUN}" == "true" ]]; then return 0; fi
"$@"
}
retry() {
local max=${1:-3}; shift
local delay=${1:-5}; shift
local n=0
until "$@"; do
n=$((n+1))
if (( n >= max )); then return 1; fi
log "WARN: retry $n/$max after ${delay}s: $*"
sleep "${delay}"
done
}
json_escape() { python3 -c 'import json,sys; print(json.dumps(sys.stdin.read()))'; }
cleanup() {
  local rc=$?
  set +e
  if (( rc != 0 )); then
    log "ERROR: backup terminated with exit code ${rc}" || true
    # Notify on failure if the notify function is already defined (it is not during early preflight)
    if declare -F notify >/dev/null 2>&1; then notify failed || true; fi
    # Discard the working directory only on failure; successful work dirs are kept as the
    # base for incrementals and are expired by prune_local after RETAIN_LOCAL_DAYS.
    if [[ -n "${WORKDIR:-}" && -d "${WORKDIR}" && "${KEEP_WORKDIR_ON_FAIL:-false}" != "true" ]]; then
      rm -rf --one-file-system "${WORKDIR}" || true
    fi
  fi
}
trap cleanup EXIT
# ========== Preflight checks ==========
umask "${UMASK_VALUE}"
mkdir -p "${BACKUP_ROOT}" "${LOG_DIR}"
command -v xtrabackup >/dev/null 2>&1 || fail "xtrabackup not found"
command -v "${MYSQL_CLIENT}" >/dev/null 2>&1 || fail "mysql client not found"
command -v "${AWS_CLI}" >/dev/null 2>&1 || fail "aws cli not found"
command -v pigz >/dev/null 2>&1 || fail "pigz not found"
command -v jq >/dev/null 2>&1 || fail "jq not found"
command -v sha256sum >/dev/null 2>&1 || fail "sha256sum not found"
command -v ionice >/dev/null 2>&1 || log "WARN: ionice not found, continuing"
command -v nice >/dev/null 2>&1 || log "WARN: nice not found, continuing"
[[ -r "${MYSQL_DEFAULTS_FILE}" ]] || fail "Cannot read MYSQL_DEFAULTS_FILE=${MYSQL_DEFAULTS_FILE}"
# Check MySQL connectivity
MYSQL_CMD=( "${MYSQL_CLIENT}" --defaults-file="${MYSQL_DEFAULTS_FILE}" --batch --skip-column-names )
MYSQL_VERSION="$("${MYSQL_CMD[@]}" -e "SELECT VERSION();" 2>/dev/null || true)"
[[ -n "${MYSQL_VERSION}" ]] || fail "Cannot connect to MySQL using ${MYSQL_DEFAULTS_FILE}"
# Discover datadir and binary log settings
DATADIR="$("${MYSQL_CMD[@]}" -e "SHOW VARIABLES LIKE 'datadir';" | awk '{print $2}')"
[[ -d "${DATADIR}" ]] || fail "MySQL datadir not found: ${DATADIR}"
BINLOG_ENABLED="$("${MYSQL_CMD[@]}" -e "SHOW VARIABLES LIKE 'log_bin';" | awk '{print $2}')"
if [[ "${BINLOG_ENABLED}" != "ON" ]]; then
log "WARN: binary log is not enabled; point-in-time recovery will be limited"
fi
BINLOG_BASE="$("${MYSQL_CMD[@]}" -e "SHOW VARIABLES LIKE 'log_bin_basename';" | awk '{print $2}')"
if [[ -z "${BINLOG_BASE}" ]]; then
# Fall back to datadir + log_bin value
BINLOG_FILE="$("${MYSQL_CMD[@]}" -e "SHOW VARIABLES LIKE 'log_bin';" | awk '{print $2}')"
if [[ "${BINLOG_FILE}" != "ON" && -n "${BINLOG_FILE}" ]]; then
BINLOG_BASE="${DATADIR%/}/${BINLOG_FILE}"
fi
fi
BINLOG_DIR="$(dirname "${BINLOG_BASE:-$DATADIR}")"
# Local free-space check
DATA_BYTES=$(du -sb "${DATADIR}" | awk '{print $1}')
FS_AVAIL=$(df --output=avail -B1 "${BACKUP_ROOT}" | tail -1)
NEEDED=$(( DATA_BYTES + DATA_BYTES/5 )) # +20%
if (( FS_AVAIL < NEEDED )); then
log "WARN: free space may be insufficient. need~${NEEDED}B, avail=${FS_AVAIL}B"
fi
# ========== Directories and file names ==========
WORKDIR="$(mktemp -d "${BACKUP_ROOT}/work-${DATE_TAG}-XXXX")"
BACKUP_TYPE="${MODE}" # expected values: full|incr ("manual" is a placeholder and must be overridden)
ARCHIVE_NAME=""
META_FILE="${WORKDIR}/meta.json"
START_TIME="$(date +%s)"
# ========== Core functions ==========
prepare_full_backup() {
local target="$1"
if [[ "${FULL_PREPARE}" == "true" ]]; then
run xtrabackup --prepare --target-dir="${target}"
else
# Apply the redo log only, so the backup stays usable as a base for incrementals
run xtrabackup --prepare --apply-log-only --target-dir="${target}"
fi
}
do_full_backup() {
local target="${WORKDIR}/full-${DATE_TAG}"
mkdir -p "${target}"
local xb_cmd=( xtrabackup --defaults-file="${MYSQL_CNF}" --defaults-group=mysqld
--backup --target-dir="${target}" --parallel="${THREADS}" --binlog-info=ON )
if [[ -n "${THROTTLE_IOPS}" ]]; then xb_cmd+=( --throttle="${THROTTLE_IOPS}" ); fi
run "${xb_cmd[@]}"
# Pre-prepare (speeds up restore)
prepare_full_backup "${target}"
# Update local pointers
ln -sfn "${target}" "${BACKUP_ROOT}/latest_full"
echo "${target}" > "${BACKUP_ROOT}/latest_full.path"
echo "${target}"
}
do_incremental_backup() {
local base="${1:-${BACKUP_ROOT}/latest_full}"
[[ -d "${base}" ]] || fail "incremental base not found: ${base}"
local target="${WORKDIR}/incr-${DATE_TAG}"
mkdir -p "${target}"
local xb_cmd=( xtrabackup --defaults-file="${MYSQL_CNF}" --defaults-group=mysqld
--backup --incremental-basedir="${base}" --target-dir="${target}"
--parallel="${THREADS}" --binlog-info=ON )
if [[ -n "${THROTTLE_IOPS}" ]]; then xb_cmd+=( --throttle="${THROTTLE_IOPS}" ); fi
run "${xb_cmd[@]}"
# Record the incremental chain
ln -sfn "${target}" "${BACKUP_ROOT}/latest_incr"
echo "${base}" > "${target}/base.path"
echo "${target}"
}
pack_and_hash() {
local src="$1"
ARCHIVE_NAME="${HOSTNAME_SHORT}-${DATE_TAG}-$(basename "${src}").tar.gz"
local archive="${WORKDIR}/${ARCHIVE_NAME}"
# Pack and compress (at low priority)
if command -v ionice >/dev/null 2>&1; then
IONICE_WRAPPER=( ionice -c "${IONICE_CLASS}" -n "${IONICE_PRIO}" )
else
IONICE_WRAPPER=()
fi
if command -v nice >/dev/null 2>&1; then
NICE_WRAPPER=( nice -n "${NICE_PRIO}" )
else
NICE_WRAPPER=()
fi
run "${IONICE_WRAPPER[@]}" "${NICE_WRAPPER[@]}" tar -I "pigz -p ${THREADS} -9" -cf "${archive}" -C "$(dirname "${src}")" "$(basename "${src}")"
run sha256sum "${archive}" > "${archive}.sha256"
echo "${archive}"
}
s3_cp() {
local src="$1" dst="$2"
local sse_opts=()
if [[ "${S3_SSE}" == "aws:kms" ]]; then
sse_opts+=( --sse aws:kms )
[[ -n "${AWS_KMS_KEY_ID}" ]] && sse_opts+=( --sse-kms-key-id "${AWS_KMS_KEY_ID}" )
elif [[ "${S3_SSE}" == "AES256" ]]; then
sse_opts+=( --sse AES256 )
fi
retry 5 10 \
${AWS_CLI} s3 cp ${AWS_EXTRA_OPTS} --only-show-errors --storage-class "${S3_STORAGE_CLASS}" \
"${src}" "${dst}" "${sse_opts[@]}"
}
upload_archive() {
local archive="$1"
local s3path="${S3_BUCKET}/${S3_PREFIX}/${HOSTNAME_SHORT}/${DATE_TAG}/"
run mkdir -p "${WORKDIR}/upload"
run cp -a "${archive}" "${archive}.sha256" "${WORKDIR}/upload/"
retry 3 5 ${AWS_CLI} s3api head-bucket ${AWS_EXTRA_OPTS} --bucket "$(echo "${S3_BUCKET}" | sed 's#^s3://##')" || fail "S3 bucket not accessible"
s3_cp "${WORKDIR}/upload/$(basename "${archive}")" "${s3path}"
s3_cp "${WORKDIR}/upload/$(basename "${archive}.sha256")" "${s3path}"
echo "${s3path}$(basename "${archive}")"
}
sync_binlogs() {
[[ "${BINLOG_ENABLED}" == "ON" ]] || return 0
# Rotate the logs once so that all previous binlog files can be uploaded
run mysqladmin --defaults-file="${MYSQL_DEFAULTS_FILE}" flush-logs
# Identify the currently active binlog file
local current_bin="$("${MYSQL_CMD[@]}" -e "SHOW MASTER STATUS;" | awk 'NR==1{print $1}')"
[[ -n "${current_bin}" ]] || { log "WARN: cannot get current active binlog"; return 0; }
local s3bin="${S3_BUCKET}/${S3_PREFIX}/${HOSTNAME_SHORT}/binlog/"
# Exclude the currently active file and the index file
local sync_cmd=( ${AWS_CLI} s3 sync ${AWS_EXTRA_OPTS} --only-show-errors --delete
--exclude "$(basename "${current_bin}")" --exclude "*.index"
--storage-class "${S3_STORAGE_CLASS}" )
# SSE options
if [[ "${S3_SSE}" == "aws:kms" ]]; then
sync_cmd+=( --sse aws:kms )
[[ -n "${AWS_KMS_KEY_ID}" ]] && sync_cmd+=( --sse-kms-key-id "${AWS_KMS_KEY_ID}" )
elif [[ "${S3_SSE}" == "AES256" ]]; then
sync_cmd+=( --sse AES256 )
fi
# Perform the upload (at low priority)
if command -v ionice >/dev/null 2>&1; then
IONICE_WRAPPER=( ionice -c "${IONICE_CLASS}" -n "${IONICE_PRIO}" )
else
IONICE_WRAPPER=()
fi
if command -v nice >/dev/null 2>&1; then
NICE_WRAPPER=( nice -n "${NICE_PRIO}" )
else
NICE_WRAPPER=()
fi
retry 5 10 "${IONICE_WRAPPER[@]}" "${NICE_WRAPPER[@]}" "${sync_cmd[@]}" "${BINLOG_DIR}/" "${s3bin}"
}
write_meta() {
local type="$1" srcdir="$2" archive="$3" s3obj="$4"
local size_bytes=$(stat -c%s "${archive}")
local stop_lsn="$(grep -E '^to_lsn = ' "${srcdir}/xtrabackup_checkpoints" | awk '{print $3}')"
local from_lsn="$(grep -E '^from_lsn = ' "${srcdir}/xtrabackup_checkpoints" | awk '{print $3}')"
# xtrabackup_info records e.g.: binlog_pos = filename 'mysql-bin.000123', position '45678'
local binlog_line="$(grep -E '^binlog_pos = ' "${srcdir}/xtrabackup_info" 2>/dev/null || true)"
local binlog_file="$(printf '%s' "${binlog_line}" | sed -nE "s/.*filename '([^']+)'.*/\1/p")"
local binlog_pos="$(printf '%s' "${binlog_line}" | sed -nE "s/.*position '?([0-9]+)'?.*/\1/p")"
jq -n \
--arg host "${HOSTNAME_SHORT}" \
--arg started "$(date -d "@${START_TIME}" --iso-8601=seconds)" \
--arg finished "$(date --iso-8601=seconds)" \
--arg type "${type}" \
--arg datadir "${DATADIR}" \
--arg archive "$(basename "${archive}")" \
--arg s3 "${s3obj}" \
--arg mysql_version "${MYSQL_VERSION}" \
--arg binlog_file "${binlog_file:-}" \
--argjson binlog_pos "${binlog_pos:-0}" \
--argjson size_bytes "${size_bytes}" \
--argjson from_lsn "${from_lsn:-0}" \
--argjson to_lsn "${stop_lsn:-0}" \
'{host:$host, started:$started, finished:$finished, type:$type, mysql_version:$mysql_version,
datadir:$datadir, archive:$archive, s3:$s3, size_bytes:$size_bytes,
lsn:{from:$from_lsn,to:$to_lsn}, binlog:{file:$binlog_file,pos:$binlog_pos}}' \
> "${META_FILE}"
# Also upload meta.json next to the archive
s3_cp "${META_FILE}" "${s3obj%/*}/meta-${DATE_TAG}.json"
echo "${META_FILE}"
}
notify() {
local status="$1"
local meta_json="$(cat "${META_FILE}" 2>/dev/null || echo '{}')"
# Webhook
if [[ -n "${MONITORING_WEBHOOK}" ]]; then
curl -sS -m 10 -H "Content-Type: application/json" -X POST \
-d "{\"status\":\"${status}\",\"meta\":${meta_json}}" \
"${MONITORING_WEBHOOK}" >/dev/null 2>&1 || true
fi
# Pushgateway (optional)
if [[ -n "${PUSHGATEWAY_URL}" ]]; then
end_time="$(date +%s)"
duration=$(( end_time - START_TIME ))
size_bytes="$(jq -r '.size_bytes // 0' "${META_FILE}" 2>/dev/null || echo 0)"
cat <<EOF | curl -sS --data-binary @- "${PUSHGATEWAY_URL}" >/dev/null 2>&1 || true
# TYPE mysql_backup_duration_seconds gauge
mysql_backup_duration_seconds{host="${HOSTNAME_SHORT}"} ${duration}
# TYPE mysql_backup_size_bytes gauge
mysql_backup_size_bytes{host="${HOSTNAME_SHORT}"} ${size_bytes}
# TYPE mysql_backup_status gauge
mysql_backup_status{host="${HOSTNAME_SHORT}"} $( [[ "${status}" == "ok" ]] && echo 1 || echo 0 )
EOF
fi
}
prune_local() {
find "${BACKUP_ROOT}" -maxdepth 1 -type d -name "work-*" -mtime +"${RETAIN_LOCAL_DAYS}" -prune -exec rm -rf {} + || true
find "${BACKUP_ROOT}" -maxdepth 1 -type l -name "latest_*" -mtime +"$((RETAIN_LOCAL_DAYS*30))" -exec true {} + >/dev/null 2>&1 || true
}
# ========== Main workflow ==========
main() {
log "MySQL Backup Started. Mode=${MODE}"
local srcdir archive s3obj
case "${MODE}" in
full)
srcdir="$(do_full_backup)"
;;
incr)
srcdir="$(do_incremental_backup)"
;;
*)
fail "Unsupported MODE='${MODE}'. Set MODE=full or MODE=incr."
;;
esac
archive="$(pack_and_hash "${srcdir}")"
s3obj="$(upload_archive "${archive}")"
write_meta "${MODE}" "${srcdir}" "${archive}" "${s3obj}" >/dev/null
sync_binlogs
prune_local
notify ok
log "MySQL Backup Completed Successfully."
}
# ========== Execution ==========
# Failure notification and work-directory handling are done by the EXIT trap (cleanup),
# so main runs unguarded and errexit stays effective inside it.
main
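After installing the script, a first full backup followed by an incremental can be run manually before the timers below are enabled (a sketch; the bucket name is a placeholder):

install -o root -g root -m 0750 mysql_backup.sh /usr/local/bin/mysql_backup.sh
# First full backup (creates the local base used by later incrementals)
MODE=full S3_BUCKET=s3://prod-backups S3_PREFIX=backups/mysql /usr/local/bin/mysql_backup.sh
# Incremental against the latest full backup
MODE=incr S3_BUCKET=s3://prod-backups S3_PREFIX=backups/mysql /usr/local/bin/mysql_backup.sh
# Logs are written to LOG_DIR (default /var/log/mysql-backup/)
tail -n 50 /var/log/mysql-backup/backup-*.log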
MYSQL_DEFAULTS_FILE: MySQL client credentials file (mode 600). Example content:
[client]
user=backup_user
password=YOUR_STRONG_PASSWORD
socket=/var/lib/mysql/mysql.sock
MYSQL_CNF: main MySQL configuration file, parsed by xtrabackup for instance parameters.
MODE: backup mode, full or incr; typically invoked separately by the system timers.
FULL_PREPARE: when true, runs a full prepare after the full backup to speed up restores (suitable for minute-level RTO). Note that a fully prepared full backup can no longer have incrementals applied to it at restore time, so set it to false (apply-log-only) if you restore through the incremental chain; see the restore sketch after this list.
THREADS: xtrabackup parallelism and compression (pigz) threads; usually the number of CPU cores or half of it.
THROTTLE_IOPS: optional xtrabackup IOPS throttling, e.g. 200 for roughly 200 IOPS; evaluate the impact before enabling in production.
BACKUP_ROOT: local staging directory; reserve enough free space.
S3_BUCKET/S3_PREFIX: cloud backup path, e.g. s3://prod-backups/backups/mysql.
S3_STORAGE_CLASS: object storage class; STANDARD_IA or INTELLIGENT_TIERING is recommended.
S3_SSE/AWS_KMS_KEY_ID: server-side encryption; prefer KMS (the default KMS key is used if the key ID is empty), or AES256.
AWS_EXTRA_OPTS: for S3-compatible storage, add e.g. --endpoint-url=https://minio.example.com.
MONITORING_WEBHOOK: optional HTTP endpoint that receives a JSON report.
PUSHGATEWAY_URL: optional Prometheus Pushgateway address; duration and size are reported automatically.
RETAIN_LOCAL_DAYS: number of days to keep local work directories.
LOG_DIR: directory for backup logs.
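For reference, restoring a full archive (optionally followed by incrementals) with XtraBackup typically looks like the sketch below; host names, dates, and paths are placeholders, and the exact sequence should be validated in a regular restore drill:

# Download and unpack the archive into a restore directory
aws s3 cp s3://prod-backups/backups/mysql/HOST/DATE/HOST-DATE-full-DATE.tar.gz .
mkdir -p /var/backups/mysql/restore && tar -xzf HOST-DATE-full-DATE.tar.gz -C /var/backups/mysql/restore
FULL=/var/backups/mysql/restore/full-DATE
# If the archive was taken with FULL_PREPARE=false, apply incrementals first:
#   xtrabackup --prepare --apply-log-only --target-dir="$FULL"
#   xtrabackup --prepare --apply-log-only --target-dir="$FULL" --incremental-dir=/path/to/incr-1
# ...then run a final prepare without --apply-log-only (skip if FULL_PREPARE=true already prepared it)
xtrabackup --prepare --target-dir="$FULL"
systemctl stop mysqld
mv /var/lib/mysql /var/lib/mysql.old && mkdir /var/lib/mysql
xtrabackup --copy-back --target-dir="$FULL" --datadir=/var/lib/mysql
chown -R mysql:mysql /var/lib/mysql
systemctl start mysqld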
/etc/systemd/system/mysql-backup-full.service
[Unit]
Description=MySQL Full Backup
[Service]
Type=oneshot
Environment="MODE=full" "S3_BUCKET=s3://prod-backups" "S3_PREFIX=backups/mysql" "S3_SSE=aws:kms" "AWS_KMS_KEY_ID=alias/backup-kms-key"
ExecStart=/usr/local/bin/mysql_backup.sh

/etc/systemd/system/mysql-backup-full.timer
[Unit]
Description=Daily MySQL Full Backup at 02:00
[Timer]
OnCalendar=*-*-* 02:00:00
Persistent=true
[Install]
WantedBy=timers.target

/etc/systemd/system/mysql-backup-incr.service
[Unit]
Description=MySQL Incremental Backup
[Service]
Type=oneshot
Environment="MODE=incr" "S3_BUCKET=s3://prod-backups" "S3_PREFIX=backups/mysql"
ExecStart=/usr/local/bin/mysql_backup.sh

/etc/systemd/system/mysql-backup-incr.timer
[Unit]
Description=Hourly MySQL Incremental Backup
[Timer]
OnCalendar=hourly
Persistent=true
[Install]
WantedBy=timers.target
Enable:
systemctl daemon-reload
systemctl enable --now mysql-backup-full.timer mysql-backup-incr.timer
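After enabling, the timers and the most recent runs can be checked with standard systemd tooling, for example:

systemctl list-timers 'mysql-backup-*'
journalctl -u mysql-backup-full.service -n 50 --no-pager
journalctl -u mysql-backup-incr.service -n 50 --no-pager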
If you need to extend this script for multi-instance or multi-tenant backups, cross-region replication, object lock (WORM), or adaptive throttling within a backup window, it can be further modularized and parameterized on this basis.
This solution targets production backups of medium-sized file systems and uses a hybrid storage architecture of a local repository plus remote object storage (S3-compatible). Restic is the backup engine, providing deduplicated and encrypted snapshots, local-first backups copied to the remote repository, retention via forget/prune, integrity checks on a read subset, Prometheus textfile metrics, and webhook alerting.
Typical use cases include configuration directories, application data, and shared files where both fast local restores and off-site durability are required.
The following is a complete, executable Bash backup script with retries, logging, monitoring, and alerting. Read the configuration notes and complete the configuration file before deploying.
#!/usr/bin/env bash
# filesystem_hybrid_backup.sh
# Hybrid (local + remote) filesystem backup with restic, robust error handling and monitoring.
set -uo pipefail
IFS=$'\n\t'
umask 027
VERSION="1.2.0"
SCRIPT_NAME="$(basename "$0")"
HOST_SHORT="$(hostname -s || echo unknown-host)"
START_TS="$(date +%s)"
# Default config path (override by env CONFIG_FILE or --config)
CONFIG_FILE="${CONFIG_FILE:-/etc/backup.d/filesystem_backup.env}"
# Lock to prevent concurrent runs
LOCK_FILE="/var/run/filesystem_backup.lock"
# Global defaults (can be overridden by config file)
LOG_DIR="/var/log/backup"
LOG_FILE="${LOG_DIR}/filesystem_backup.log"
METRICS_FILE="/var/lib/node_exporter/textfile_collector/backup_metrics.prom"
# Restic repositories (override in config)
REPO_LOCAL=""
REPO_REMOTE=""
# Password file (do not store secrets in script)
RESTIC_PASSWORD_FILE="/etc/backup.d/restic_password.txt"
# Include/exclude
INCLUDE_PATHS=("/etc")
EXCLUDE_FILE="/etc/backup.d/backup_excludes.txt"
ONE_FILE_SYSTEM=true
# Retention policy
KEEP_DAILY=7
KEEP_WEEKLY=4
KEEP_MONTHLY=6
KEEP_YEARLY=1
PRUNE_AFTER_FORGET=true
# Check policy (read subset to reduce IO and egress)
CHECK_READ_SUBSET_LOCAL="5%"
CHECK_READ_SUBSET_REMOTE="1%"
# Performance and safety
USE_IONICE=true
IONICE_CLASS=2
IONICE_PRIORITY=7
USE_NICE=true
NICE_PRIORITY=10
READ_CONCURRENCY="" # e.g., 4
LIMIT_UPLOAD_KBPS="" # e.g., 10240 (10 MB/s)
LIMIT_DOWNLOAD_KBPS="" # e.g., 10240
DRY_RUN=false
# Retries
RETRY_MAX=3
RETRY_BASE_SLEEP=5
# Alerting (optional)
ALERT_WEBHOOK_URL="" # Generic JSON webhook endpoint
ALERT_ON_SUCCESS=false
# Tags
TAG_TYPE="filesystem"
TAG_ENV="${ENVIRONMENT:-prod}"
EXTRA_TAGS="" # comma-separated, e.g., "app:erp,team:sre"
# Misc
INIT_IF_MISSING=true
VERBOSE=false
# Utilities
log_setup() {
mkdir -p "$LOG_DIR" || true
touch "$LOG_FILE" || true
chmod 640 "$LOG_FILE" || true
}
log() {
local level="$1"; shift
local msg="$*"
local ts
ts="$(date '+%Y-%m-%d %H:%M:%S')"
echo "$ts [$level] $SCRIPT_NAME[$$]: $msg" | tee -a "$LOG_FILE" >&2
logger -t "$SCRIPT_NAME" -p "daemon.${level,,}" -- "$msg" || true
}
info() { log "INFO" "$*"; }
warn() { log "WARN" "$*"; }
error() { log "ERR" "$*"; }
debug() { $VERBOSE && log "DEBUG" "$*" || true; }
die() {
error "$*"
exit 1
}
# Load config
load_config() {
if [[ -n "${CUSTOM_CONFIG_FILE:-}" ]]; then
CONFIG_FILE="$CUSTOM_CONFIG_FILE"
fi
if [[ -f "$CONFIG_FILE" ]]; then
# shellcheck source=/dev/null
. "$CONFIG_FILE"
else
warn "Config file not found: $CONFIG_FILE (using built-in defaults)"
fi
}
# Helpers
have_cmd() { command -v "$1" >/dev/null 2>&1; }
required_binaries=(
restic jq logger date cat mkdir tee
)
preflight_checks() {
for b in "${required_binaries[@]}"; do
have_cmd "$b" || die "Missing required binary: $b"
done
[[ -n "$REPO_LOCAL" ]] || die "REPO_LOCAL is not set"
[[ -n "$REPO_REMOTE" ]] || die "REPO_REMOTE is not set"
[[ -f "$RESTIC_PASSWORD_FILE" ]] || die "RESTIC_PASSWORD_FILE not found: $RESTIC_PASSWORD_FILE"
# enforce secure perms on password file
local perm
perm="$(stat -c '%a' "$RESTIC_PASSWORD_FILE" 2>/dev/null || echo "600")"
if [[ "$perm" != "600" && "$perm" != "400" ]]; then
warn "RESTIC_PASSWORD_FILE should be chmod 600 or 400 (current: $perm). Adjusting to 600."
chmod 600 "$RESTIC_PASSWORD_FILE" || die "Failed to chmod 600 $RESTIC_PASSWORD_FILE"
fi
mkdir -p "$LOG_DIR" || die "Cannot create LOG_DIR: $LOG_DIR"
mkdir -p "$(dirname "$METRICS_FILE")" || warn "Cannot create metrics dir: $(dirname "$METRICS_FILE")"
mkdir -p "$REPO_LOCAL" || die "Cannot create local repo dir: $REPO_LOCAL"
# Optional: basic local free space check (warn if < 10 GB)
local avail_kb
avail_kb=$(df -Pk "$REPO_LOCAL" | awk 'NR==2{print $4}')
if [[ -n "$avail_kb" && "$avail_kb" -lt $((10*1024*1024)) ]]; then
warn "Local repo filesystem free space < 10GB; current: $((avail_kb/1024/1024)) GB"
fi
# Validate include paths
if [[ "${#INCLUDE_PATHS[@]}" -eq 0 ]]; then
die "INCLUDE_PATHS is empty. Configure at least one path to back up."
fi
for p in "${INCLUDE_PATHS[@]}"; do
[[ -e "$p" ]] || warn "Include path does not exist: $p"
done
# Create lock
exec 9>"$LOCK_FILE" || die "Cannot open lock file $LOCK_FILE"
if ! flock -n 9; then
die "Another backup process is running (lock: $LOCK_FILE)"
fi
}
# Build tag list for restic --tag multiple args
build_tag_args() {
local tags=()
tags+=( "--tag" "type:${TAG_TYPE}" )
tags+=( "--tag" "env:${TAG_ENV}" )
tags+=( "--tag" "host:${HOST_SHORT}" )
if [[ -n "$EXTRA_TAGS" ]]; then
IFS=',' read -r -a arr <<< "$EXTRA_TAGS"
for t in "${arr[@]}"; do
[[ -n "$t" ]] && tags+=( "--tag" "$t" )
done
fi
printf '%s\n' "${tags[@]}"
}
metric_put() {
local key="$1" value="$2" repo="$3"
local ts
ts="$(date +%s)"
{
echo "backup_${key}{job=\"filesystem\",repo=\"${repo}\",host=\"${HOST_SHORT}\"} ${value}"
echo "backup_${key}_timestamp{job=\"filesystem\",repo=\"${repo}\",host=\"${HOST_SHORT}\"} ${ts}" | sed 's/ //g' >/dev/null
} >> "$METRICS_FILE".tmp 2>/dev/null || true
}
metrics_begin() {
: > "${METRICS_FILE}.tmp" || true
echo "# HELP backup metrics for filesystem restic job" >> "${METRICS_FILE}.tmp" || true
echo "# TYPE backup_last_success_timestamp_seconds gauge" >> "${METRICS_FILE}.tmp" || true
}
metrics_commit() {
if [[ -f "${METRICS_FILE}.tmp" ]]; then
mv "${METRICS_FILE}.tmp" "$METRICS_FILE" 2>/dev/null || true
fi
}
notify_webhook() {
local status="$1"; shift
local text="$*"
[[ -n "$ALERT_WEBHOOK_URL" ]] || return 0
if [[ "$status" == "success" && "$ALERT_ON_SUCCESS" != "true" ]]; then
return 0
fi
local payload
payload=$(jq -n --arg status "$status" --arg host "$HOST_SHORT" --arg msg "$text" \
'{status:$status, host:$host, message:$msg, ts:now|tonumber}')
curl -sS -m 10 -H "Content-Type: application/json" -d "$payload" "$ALERT_WEBHOOK_URL" >/dev/null 2>&1 || true
}
with_nice() {
local cmd=( "$@" )
if $USE_IONICE && have_cmd ionice; then
cmd=( ionice -c "$IONICE_CLASS" -n "$IONICE_PRIORITY" "${cmd[@]}" )
fi
if $USE_NICE && have_cmd nice; then
cmd=( nice -n "$NICE_PRIORITY" "${cmd[@]}" )
fi
"${cmd[@]}"
}
run_with_retry() {
local -a cmd=( "$@" )
local attempt=1
local rc=0
while :; do
debug "Attempt $attempt/${RETRY_MAX}: ${cmd[*]}"
"${cmd[@]}"
rc=$?
if [[ $rc -eq 0 ]]; then
return 0
fi
if [[ $attempt -ge $RETRY_MAX ]]; then
return $rc
fi
local sleep_s=$(( RETRY_BASE_SLEEP * 2**(attempt-1) ))
warn "Command failed (rc=$rc). Retry in ${sleep_s}s: ${cmd[*]}"
sleep "$sleep_s"
attempt=$((attempt+1))
done
}
restic_base_args() {
  local args=()
  [[ -n "$READ_CONCURRENCY" ]] && args+=( "--read-concurrency" "$READ_CONCURRENCY" )
  [[ -n "$LIMIT_UPLOAD_KBPS" ]] && args+=( "--limit-upload" "$LIMIT_UPLOAD_KBPS" )
  [[ -n "$LIMIT_DOWNLOAD_KBPS" ]] && args+=( "--limit-download" "$LIMIT_DOWNLOAD_KBPS" )
  $VERBOSE && args+=( "-v" )
  # Print nothing when no options are set, so mapfile callers do not pick up an empty argument
  [[ ${#args[@]} -gt 0 ]] && printf '%s\n' "${args[@]}"
  return 0
}
restic_env_local() {
export RESTIC_REPOSITORY="$REPO_LOCAL"
export RESTIC_PASSWORD_FILE="$RESTIC_PASSWORD_FILE"
}
restic_env_remote() {
export RESTIC_REPOSITORY="$REPO_REMOTE"
export RESTIC_PASSWORD_FILE="$RESTIC_PASSWORD_FILE"
# For S3-compatible, use environment or IAM role:
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_SESSION_TOKEN / AWS_DEFAULT_REGION
}
restic_repo_exists() {
local repo="$1"
RESTIC_REPOSITORY="$repo" RESTIC_PASSWORD_FILE="$RESTIC_PASSWORD_FILE" \
restic snapshots >/dev/null 2>&1
}
restic_repo_init() {
local repo="$1"
info "Initializing restic repository: $repo"
RESTIC_REPOSITORY="$repo" RESTIC_PASSWORD_FILE="$RESTIC_PASSWORD_FILE" \
run_with_retry restic init
}
ensure_repos() {
if ! restic_repo_exists "$REPO_LOCAL"; then
if $INIT_IF_MISSING; then
restic_repo_init "$REPO_LOCAL" || die "Failed to init local repo"
else
die "Local repo not initialized: $REPO_LOCAL"
fi
fi
if ! restic_repo_exists "$REPO_REMOTE"; then
if $INIT_IF_MISSING; then
restic_repo_init "$REPO_REMOTE" || die "Failed to init remote repo"
else
die "Remote repo not initialized: $REPO_REMOTE"
fi
fi
}
backup_to_local() {
info "Starting local backup to $REPO_LOCAL"
restic_env_local
local tag_args
mapfile -t tag_args < <(build_tag_args)
local base_args
mapfile -t base_args < <(restic_base_args)
local args=( backup --json )
args+=( "${base_args[@]}" )
args+=( "${tag_args[@]}" )
$ONE_FILE_SYSTEM && args+=( "--one-file-system" )
[[ -f "$EXCLUDE_FILE" ]] && args+=( "--exclude-file" "$EXCLUDE_FILE" )
args+=( "--exclude-caches" )
if $DRY_RUN; then
args+=( "--dry-run" )
fi
# Append include paths
for p in "${INCLUDE_PATHS[@]}"; do
args+=( "$p" )
done
local tmp_json
tmp_json="$(mktemp /tmp/restic_backup_XXXX.json)"
local rc=0
# Capture JSON lines for parsing
  # Pass the argument array to a child shell as positional parameters; restic's JSON stdout
  # is captured in tmp_json for parsing while a copy is echoed to stderr for the logs.
  with_nice bash -c 'set -o pipefail; restic "$@" | tee "'"$tmp_json"'" >&2' restic "${args[@]}" || rc=$?
if [[ $rc -ne 0 ]]; then
error "Local backup failed (rc=$rc)"
metric_put "last_exit_code" "$rc" "local"
metric_put "last_error_timestamp_seconds" "$(date +%s)" "local"
notify_webhook "failure" "Local backup failed on $HOST_SHORT (rc=$rc)"
rm -f "$tmp_json"
return $rc
fi
# Parse summary json
local files_processed bytes_processed duration sec_now
files_processed="$(jq -r 'select(.message_type=="summary")|.files_new+.files_changed+.files_unmodified' "$tmp_json" | tail -n1)"
bytes_processed="$(jq -r 'select(.message_type=="summary")|.total_bytes_processed' "$tmp_json" | tail -n1)"
duration="$(jq -r 'select(.message_type=="summary")|.total_duration' "$tmp_json" | tail -n1)"
sec_now="$(date +%s)"
metric_put "last_success_timestamp_seconds" "$sec_now" "local"
metric_put "last_exit_code" "0" "local"
[[ -n "$bytes_processed" && "$bytes_processed" != "null" ]] && metric_put "last_bytes_processed" "$bytes_processed" "local"
# restic reports total_duration as seconds (e.g. "123.45"); record an integer approximation
if [[ "$duration" =~ ^([0-9]+)(\.[0-9]+)?s?$ ]]; then
metric_put "last_duration_seconds" "${BASH_REMATCH[1]}" "local"
fi
info "Local backup completed: files=${files_processed:-?}, bytes=${bytes_processed:-?}, duration=${duration:-?}"
rm -f "$tmp_json"
return 0
}
copy_local_to_remote() {
info "Copying snapshots from local ($REPO_LOCAL) to remote ($REPO_REMOTE)"
restic_env_remote
local base_args
mapfile -t base_args < <(restic_base_args)
local args=( copy )
args+=( "${base_args[@]}" )
args+=( "--from-repo" "$REPO_LOCAL" "--from-password-file" "$RESTIC_PASSWORD_FILE" )
args+=( "--host" "$HOST_SHORT" )
local rc=0
local ts0 ts1
ts0=$(date +%s)
run_with_retry with_nice restic "${args[@]}" || rc=$?
ts1=$(date +%s)
local dur=$((ts1 - ts0))
metric_put "copy_last_duration_seconds" "$dur" "remote"
metric_put "copy_last_exit_code" "$rc" "remote"
if [[ $rc -ne 0 ]]; then
error "Restic copy failed (rc=$rc)"
notify_webhook "failure" "Remote copy failed on $HOST_SHORT (rc=$rc)"
return $rc
fi
info "Copy to remote completed in ${dur}s"
return 0
}
apply_retention() {
local repo="$1" scope="$2" # scope: local|remote
info "Applying retention on $scope repo: daily=$KEEP_DAILY weekly=$KEEP_WEEKLY monthly=$KEEP_MONTHLY yearly=$KEEP_YEARLY prune=$PRUNE_AFTER_FORGET"
export RESTIC_REPOSITORY="$repo"
export RESTIC_PASSWORD_FILE="$RESTIC_PASSWORD_FILE"
local args=( forget "--group-by" "host,tags" "--keep-daily" "$KEEP_DAILY" "--keep-weekly" "$KEEP_WEEKLY" "--keep-monthly" "$KEEP_MONTHLY" "--keep-yearly" "$KEEP_YEARLY" )
$PRUNE_AFTER_FORGET && args+=( "--prune" )
local rc=0
run_with_retry with_nice restic "${args[@]}" || rc=$?
metric_put "forget_last_exit_code" "$rc" "$scope"
if [[ $rc -ne 0 ]]; then
error "Retention (forget/prune) failed on $scope (rc=$rc)"
notify_webhook "failure" "Retention failed on $scope for $HOST_SHORT (rc=$rc)"
return $rc
fi
info "Retention completed on $scope"
return 0
}
repo_check() {
local repo="$1" scope="$2" subset="$3"
info "Running repo check on $scope (read-data-subset=$subset)"
export RESTIC_REPOSITORY="$repo"
export RESTIC_PASSWORD_FILE="$RESTIC_PASSWORD_FILE"
local args=( check "--with-cache" "--read-data-subset" "$subset" )
local rc=0
run_with_retry with_nice restic "${args[@]}" || rc=$?
metric_put "check_last_exit_code" "$rc" "$scope"
if [[ $rc -ne 0 ]]; then
error "Repo check failed on $scope (rc=$rc)"
notify_webhook "failure" "Repo check failed on $scope for $HOST_SHORT (rc=$rc)"
return $rc
fi
info "Repo check successful on $scope"
return 0
}
snapshot_count_metric() {
local repo="$1" scope="$2"
export RESTIC_REPOSITORY="$repo"
export RESTIC_PASSWORD_FILE="$RESTIC_PASSWORD_FILE"
local count
count="$(restic snapshots --json 2>/dev/null | jq 'length' 2>/dev/null || echo 0)"
metric_put "snapshot_count" "$count" "$scope"
}
usage() {
cat <<EOF
$SCRIPT_NAME v$VERSION - Hybrid filesystem backup with restic
Usage:
$SCRIPT_NAME [--config /path/to/config.env] [--dry-run] [--verbose] backup
$SCRIPT_NAME [--config ...] verify
$SCRIPT_NAME [--config ...] list
$SCRIPT_NAME --help
Commands:
backup Run local backup -> copy to remote -> retention -> check -> metrics
verify Run integrity check only on local and remote
list List snapshots on local and remote
Options:
--config PATH Specify config file path (default: $CONFIG_FILE)
--dry-run Dry-run backup (no data written)
--verbose More logs
EOF
}
main() {
log_setup
local cmd=""
CUSTOM_CONFIG_FILE=""
while [[ $# -gt 0 ]]; do
case "$1" in
backup|verify|list) cmd="$1"; shift ;;
--config) CUSTOM_CONFIG_FILE="$2"; shift 2 ;;
--dry-run) DRY_RUN=true; shift ;;
--verbose) VERBOSE=true; shift ;;
-h|--help) usage; exit 0 ;;
*) error "Unknown argument: $1"; usage; exit 2 ;;
esac
done
[[ -n "$cmd" ]] || { usage; exit 2; }
load_config
preflight_checks
ensure_repos
local rc=0
case "$cmd" in
list)
info "Listing snapshots (local and remote)"
restic_env_local; restic snapshots || rc=$?
restic_env_remote; restic snapshots || rc=$?
exit $rc
;;
verify)
metrics_begin
repo_check "$REPO_LOCAL" "local" "$CHECK_READ_SUBSET_LOCAL" || rc=$?
repo_check "$REPO_REMOTE" "remote" "$CHECK_READ_SUBSET_REMOTE" || rc=$?
snapshot_count_metric "$REPO_LOCAL" "local"
snapshot_count_metric "$REPO_REMOTE" "remote"
metrics_commit
if [[ $rc -ne 0 ]]; then
die "Verify failed (rc=$rc)"
fi
info "Verify completed successfully"
exit 0
;;
backup)
metrics_begin
backup_to_local || rc=$?
if [[ $rc -eq 0 ]]; then
copy_local_to_remote || rc=$?
fi
# Retention on both repos (best-effort, do not overwrite rc if already failed)
apply_retention "$REPO_LOCAL" "local" || true
apply_retention "$REPO_REMOTE" "remote" || true
# Checks (best-effort for RTO, but still reported)
repo_check "$REPO_LOCAL" "local" "$CHECK_READ_SUBSET_LOCAL" || true
repo_check "$REPO_REMOTE" "remote" "$CHECK_READ_SUBSET_REMOTE" || true
snapshot_count_metric "$REPO_LOCAL" "local"
snapshot_count_metric "$REPO_REMOTE" "remote"
metrics_commit
if [[ $rc -eq 0 ]]; then
local end_ts
end_ts="$(date +%s)"
info "Backup workflow completed successfully in $((end_ts-START_TS))s"
notify_webhook "success" "Backup workflow succeeded on $HOST_SHORT"
exit 0
else
die "Backup workflow failed (rc=$rc)"
fi
;;
esac
}
trap 'error "Interrupted"; exit 130' INT TERM
main "$@"
Example configuration file: /etc/backup.d/filesystem_backup.env (mode 640/600)
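The file body is not included on the page, so the following is a minimal sketch derived from the variables the script reads; values are placeholders to adjust for your environment:

# /etc/backup.d/filesystem_backup.env
REPO_LOCAL="/srv/backup/restic-local"
REPO_REMOTE="s3:https://s3.example.com/prod-fs-backup"
RESTIC_PASSWORD_FILE="/etc/backup.d/restic_password.txt"
INCLUDE_PATHS=("/etc" "/var/www" "/srv/app-data")
EXCLUDE_FILE="/etc/backup.d/backup_excludes.txt"
KEEP_DAILY=7
KEEP_WEEKLY=4
KEEP_MONTHLY=6
KEEP_YEARLY=1
LIMIT_UPLOAD_KBPS=10240
ALERT_WEBHOOK_URL="https://ops.example.com/backup-webhook"
TAG_ENV="prod"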
Example exclude file: /etc/backup.d/backup_excludes.txt
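Again the body is not included on the page; a typical restic exclude list might look like this (patterns are illustrative only):

/proc
/sys
/dev
/run
/tmp
/var/tmp
/var/cache
*.tmp
*.swp
node_modules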
Cloud credentials (never written into the script or config file): for S3-compatible backends, provide AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY (and AWS_DEFAULT_REGION) through the service environment, or use an IAM role / instance profile so that no static keys are needed.
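One possible way to supply them (an illustration, not part of the original scheme) is a root-only environment file referenced by the scheduling unit:

/etc/backup.d/restic_s3.env (chmod 600, root-owned):
AWS_ACCESS_KEY_ID=AKIAEXAMPLE
AWS_SECRET_ACCESS_KEY=EXAMPLESECRET
AWS_DEFAULT_REGION=eu-central-1
Referenced from the scheduling unit:
[Service]
EnvironmentFile=/etc/backup.d/restic_s3.env
ExecStart=/usr/local/bin/filesystem_hybrid_backup.sh backup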
Prometheus metrics: written through the node_exporter textfile collector to METRICS_FILE (default /var/lib/node_exporter/textfile_collector/backup_metrics.prom). Per repo (local/remote) and host, the script emits backup_last_success_timestamp_seconds, backup_last_exit_code, backup_last_bytes_processed, backup_last_duration_seconds, backup_copy_last_duration_seconds, backup_copy_last_exit_code, backup_forget_last_exit_code, backup_check_last_exit_code, and backup_snapshot_count; alert when backup_last_success_timestamp_seconds falls too far behind.
For further customization (multiple tenants and repositories, per-path batching strategies, cross-region replication, time-windowed bandwidth limits, finer-grained metrics, etc.), extend the corresponding modules on top of this script.
This script automates backups of very large MongoDB databases in production, targeting network storage (NAS/NFS/SMB/SAN). It takes a logical backup (mongodump) on a hidden or secondary member of the replica set and provides: consistent dumps with --oplog, parallel collection dumping, pigz/gzip compression, optional OpenSSL encryption, SHA-256 checksum verification, resilient rsync upload to the network share, retention cleanup, Prometheus textfile metrics, and Slack notifications.
It is suitable for replica-set deployments where the backup can run on a non-primary member and the network target is mounted as a POSIX path.
The following is a complete, executable Python backup script. Save it as mongodb_backup.py and make it executable.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import datetime
import hashlib
import json
import logging
import os
import shutil
import signal
import socket
import subprocess
import sys
import time
from pathlib import Path
# ---------------------------
# Utility functions
# ---------------------------
def run_cmd(cmd, env=None, timeout=None, retries=0, backoff_sec=5, cwd=None, log_output=True):
"""Run a shell command with retries and return (rc, stdout, stderr)."""
attempt = 0
last_stdout = ""
last_stderr = ""
while True:
try:
proc = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env,
cwd=cwd,
text=True
)
try:
stdout, stderr = proc.communicate(timeout=timeout)
except subprocess.TimeoutExpired:
proc.kill()
stdout, stderr = proc.communicate()
raise RuntimeError(f"Command timeout: {' '.join(cmd)}")
rc = proc.returncode
if log_output:
if stdout:
logging.debug(stdout)
if stderr:
logging.debug(stderr)
if rc == 0:
return rc, stdout, stderr
else:
last_stdout, last_stderr = stdout, stderr
attempt += 1
if attempt > retries:
return rc, last_stdout, last_stderr
sleep = backoff_sec * attempt
logging.warning(f"Command failed (rc={rc}), retry {attempt}/{retries} in {sleep}s: {' '.join(cmd)}")
time.sleep(sleep)
except Exception as e:
last_stderr = str(e)
attempt += 1
if attempt > retries:
return 1, "", last_stderr
sleep = backoff_sec * attempt
logging.warning(f"Command exception, retry {attempt}/{retries} in {sleep}s: {e}")
time.sleep(sleep)
def which(bin_name):
rc, out, _ = run_cmd(["bash", "-lc", f"command -v {bin_name}"], log_output=False)
return out.strip() if rc == 0 and out.strip() else None
def compute_sha256(file_path, buf_size=64 * 1024 * 1024):
h = hashlib.sha256()
with open(file_path, "rb") as f:
while True:
data = f.read(buf_size)
if not data:
break
h.update(data)
return h.hexdigest()
def bytes_human(n):
for unit in ["B", "KB", "MB", "GB", "TB", "PB"]:
if n < 1024.0:
return f"{n:.2f}{unit}"
n /= 1024.0
def ensure_dir(path: Path):
path.mkdir(parents=True, exist_ok=True)
def now_iso():
return datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat()
def cpu_count():
try:
import multiprocessing
return multiprocessing.cpu_count()
except:
return 4
def send_slack(webhook, text):
if not webhook:
return
payload = json.dumps({"text": text})
run_cmd(["curl", "-sS", "-X", "POST", "-H", "Content-type: application/json", "--data", payload, webhook], log_output=False)
def write_prometheus_metrics(textfile_dir, metrics):
"""metrics is dict: name -> value"""
if not textfile_dir:
return
ensure_dir(Path(textfile_dir))
fname = Path(textfile_dir) / "mongodb_backup.prom"
lines = []
for k, v in metrics.items():
lines.append(f"{k} {v}")
try:
tmp = fname.with_suffix(".tmp")
with open(tmp, "w") as f:
f.write("\n".join(lines) + "\n")
os.replace(tmp, fname)
except Exception as e:
logging.warning(f"Failed to write Prometheus textfile: {e}")
# ---------------------------
# Backup process
# ---------------------------
def preflight_checks(args):
# Tools
for bin in ["mongodump", "rsync"]:
if not which(bin):
raise RuntimeError(f"Required binary not found in PATH: {bin}")
# Check mongodump version
rc, out, err = run_cmd(["mongodump", "--version"], log_output=False)
if rc != 0:
raise RuntimeError(f"mongodump not working: {err}")
logging.info(out.strip())
# Compression tool check
pigz_bin = which("pigz")
gzip_bin = which("gzip")
if not pigz_bin and not gzip_bin:
raise RuntimeError("Neither pigz nor gzip found. Please install pigz for performance or gzip.")
if pigz_bin:
logging.info(f"Using pigz for compression: {pigz_bin}")
else:
logging.info(f"Using gzip for compression: {gzip_bin}")
# OpenSSL for encryption (optional but recommended)
if args.encrypt:
openssl_bin = which("openssl")
if not openssl_bin:
raise RuntimeError("Encryption enabled but openssl not found.")
        if not args.passphrase_file and not os.environ.get("BACKUP_PASSPHRASE"):
            raise RuntimeError("Encryption enabled but no passphrase provided. Set --passphrase-file or BACKUP_PASSPHRASE.")
logging.info(f"Encryption enabled via openssl: {openssl_bin}")
# Network storage path check
net_root = Path(args.network_target_root)
if not net_root.exists():
raise RuntimeError(f"Network target root does not exist: {net_root}")
if not os.access(net_root, os.W_OK):
raise RuntimeError(f"No write permission to network target root: {net_root}")
# Local staging path
ensure_dir(Path(args.local_staging_root))
def build_mongodump_cmd(args, archive_path: Path):
cmd = [
"mongodump",
"--uri", args.mongo_uri,
"--readPreference=secondary",
"--archive=" + str(archive_path),
"--numParallelCollections", str(args.num_parallel),
"--quiet"
]
if args.oplog:
cmd.append("--oplog")
if args.tls:
cmd.append("--tls")
if args.tlsCAFile:
cmd += ["--tlsCAFile", args.tlsCAFile]
if args.tlsCertificateKeyFile:
cmd += ["--tlsCertificateKeyFile", args.tlsCertificateKeyFile]
    # mongodump only accepts --excludeCollectionsWithPrefix together with --db
    if args.exclude_collections_with_prefix and args.db:
        for p in args.exclude_collections_with_prefix:
            cmd += ["--excludeCollectionsWithPrefix", p]
if args.db:
cmd += ["--db", args.db]
return cmd
def compress_file(src_path: Path, threads: int):
pigz_bin = which("pigz")
if pigz_bin:
cmd = [pigz_bin, "-f", f"-p{threads}", str(src_path)]
rc, _, err = run_cmd(cmd, retries=1)
if rc != 0:
raise RuntimeError(f"pigz compression failed: {err}")
return src_path.with_suffix(src_path.suffix + ".gz")
else:
gzip_bin = which("gzip")
cmd = [gzip_bin, "-f", str(src_path)]
rc, _, err = run_cmd(cmd, retries=1)
if rc != 0:
raise RuntimeError(f"gzip compression failed: {err}")
return src_path.with_suffix(src_path.suffix + ".gz")
def encrypt_file(src_path: Path, passphrase: str):
    # AES-256-CBC with PBKDF2; "openssl enc" does not support AEAD modes such as GCM,
    # so CBC is used here (integrity is covered separately by the SHA-256 checksum).
    openssl_bin = which("openssl")
    dst = src_path.with_suffix(src_path.suffix + ".enc")
    env = os.environ.copy()
    env["BACKUP_PASSPHRASE"] = passphrase
    cmd = [
        openssl_bin, "enc",
        "-aes-256-cbc",
        "-salt",
        "-pbkdf2",
        "-pass", "env:BACKUP_PASSPHRASE",
        "-in", str(src_path),
        "-out", str(dst)
    ]
    rc, _, err = run_cmd(cmd, env=env, retries=1)
    if rc != 0:
        raise RuntimeError(f"Encryption failed: {err}")
    return dst
def rsync_upload(src_path: Path, dst_path: Path):
ensure_dir(dst_path.parent)
cmd = [
"rsync",
"-a",
"--partial",
"--inplace",
"--checksum",
"--info=progress2",
str(src_path),
str(dst_path)
]
rc, _, err = run_cmd(cmd, retries=3, backoff_sec=10)
if rc != 0:
raise RuntimeError(f"rsync upload failed: {err}")
def enforce_retention(target_root: Path, retention_days: int, name_prefix: str):
    cutoff = time.time() - retention_days * 86400
    removed = []
    for item in target_root.glob(f"{name_prefix}-*.manifest.json"):
        try:
            # Age is judged by the manifest file's mtime
            if item.stat().st_mtime < cutoff:
                # Remove the manifest and every file produced by that backup run
                base = item.stem.replace(".manifest", "")
                for suffix in [".archive", ".archive.gz", ".archive.gz.enc",
                               ".archive.gz.sha256", ".archive.gz.enc.sha256"]:
                    f = target_root / f"{base}{suffix}"
                    if f.exists():
                        f.unlink(missing_ok=True)
                item.unlink(missing_ok=True)
                removed.append(item.name)
        except Exception:
            continue
    if removed:
        logging.info(f"Retention cleanup removed: {removed}")
def main():
parser = argparse.ArgumentParser(description="MongoDB automated backup script (logical backup with mongodump)")
parser.add_argument("--mongo-uri", default=os.environ.get("MONGO_URI"), required=True, help="MongoDB connection URI (use secondary/hidden node)")
parser.add_argument("--db", default=None, help="Backup specific database (default: all)")
parser.add_argument("--oplog", action="store_true", default=True, help="Include oplog for consistent snapshot (replica set only)")
parser.add_argument("--tls", action="store_true", default=False, help="Enable TLS")
parser.add_argument("--tlsCAFile", default=os.environ.get("TLS_CA_FILE"), help="TLS CA file")
parser.add_argument("--tlsCertificateKeyFile", default=os.environ.get("TLS_CERT_KEY_FILE"), help="TLS client cert+key")
parser.add_argument("--exclude-collections-with-prefix", nargs="*", default=["system.", "config.system.sessions"], help="Exclude collections with prefix")
parser.add_argument("--local-staging-root", default=os.environ.get("LOCAL_STAGING_ROOT", "/var/backups/mongodb"), help="Local staging directory")
parser.add_argument("--network-target-root", default=os.environ.get("NETWORK_TARGET_ROOT", "/mnt/backup/mongodb"), help="Network storage mount directory")
parser.add_argument("--name-prefix", default=os.environ.get("BACKUP_NAME_PREFIX", socket.gethostname() + "-mongodb"), help="Prefix for backup files")
parser.add_argument("--num-parallel", type=int, default=int(os.environ.get("NUM_PARALLEL", max(4, min(8, cpu_count())))), help="numParallelCollections for mongodump")
parser.add_argument("--threads", type=int, default=int(os.environ.get("COMPRESS_THREADS", max(4, cpu_count()))), help="Compression threads for pigz")
parser.add_argument("--encrypt", action="store_true", default=os.environ.get("ENCRYPT", "true").lower() in ("1", "true", "yes"), help="Enable encryption")
parser.add_argument("--passphrase-file", default=os.environ.get("BACKUP_PASSPHRASE_FILE"), help="File containing encryption passphrase")
parser.add_argument("--retention-days", type=int, default=int(os.environ.get("RETENTION_DAYS", "30")), help="Retention in days")
parser.add_argument("--slack-webhook", default=os.environ.get("SLACK_WEBHOOK_URL"), help="Slack webhook URL for notifications")
parser.add_argument("--prom-textfile-dir", default=os.environ.get("PROM_TEXTFILE_DIR"), help="Prometheus node_exporter textfile dir")
parser.add_argument("--log-file", default=os.environ.get("BACKUP_LOG_FILE"), help="Log file path")
parser.add_argument("--debug", action="store_true", help="Enable debug logging")
args = parser.parse_args()
# Logging setup
log_level = logging.DEBUG if args.debug else logging.INFO
logging.basicConfig(
level=log_level,
format="%(asctime)s %(levelname)s %(message)s",
handlers=[
logging.StreamHandler(sys.stdout),
]
)
if args.log_file:
fh = logging.FileHandler(args.log_file)
fh.setLevel(log_level)
fh.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
logging.getLogger().addHandler(fh)
# Signal handling for graceful shutdown
def handle_signal(signum, frame):
logging.error(f"Received signal {signum}, aborting.")
sys.exit(2)
signal.signal(signal.SIGINT, handle_signal)
signal.signal(signal.SIGTERM, handle_signal)
started_at = time.time()
manifest = {
"started_at": now_iso(),
"host": socket.gethostname(),
"mongo_uri_redacted": "<redacted>",
"name_prefix": args.name_prefix,
"oplog": args.oplog,
"tls": args.tls,
"num_parallel_collections": args.num_parallel,
"compress_threads": args.threads,
"encrypt": args.encrypt,
"retention_days": args.retention_days,
"status": "running"
}
# Preflight
try:
preflight_checks(args)
except Exception as e:
logging.error(f"Preflight checks failed: {e}")
send_slack(args.slack_webhook, f":x: MongoDB backup preflight failed on {socket.gethostname()}: {e}")
write_prometheus_metrics(args.prom_textfile_dir, {
"mongodb_backup_last_exit_status": 1,
"mongodb_backup_last_duration_seconds": 0,
"mongodb_backup_last_size_bytes": 0,
"mongodb_backup_last_success_timestamp": 0
})
sys.exit(1)
# Prepare paths
timestamp = datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
base_name = f"{args.name_prefix}-{timestamp}"
staging_root = Path(args.local_staging_root)
ensure_dir(staging_root)
archive_path = staging_root / f"{base_name}.archive"
compressed_path = None
encrypted_path = None
sha256_local = None
# Obtain passphrase if encryption enabled
passphrase = None
if args.encrypt:
if args.passphrase_file and Path(args.passphrase_file).exists():
passphrase = Path(args.passphrase_file).read_text().strip()
else:
passphrase = os.environ.get("BACKUP_PASSPHRASE")
if not passphrase:
logging.error("Encryption enabled but passphrase not provided.")
sys.exit(1)
# Run mongodump
try:
dump_cmd = build_mongodump_cmd(args, archive_path)
manifest["dump_cmd"] = " ".join(dump_cmd[:-2]) + " --archive=<redacted>"
logging.info(f"Running mongodump to archive: {archive_path}")
rc, stdout, stderr = run_cmd(dump_cmd, retries=1, backoff_sec=10)
if rc != 0:
raise RuntimeError(f"mongodump failed: {stderr}")
manifest["dump_stdout_tail"] = stdout[-2048:] if stdout else ""
# Size before compression
size_archive = archive_path.stat().st_size
manifest["size_archive_bytes"] = size_archive
logging.info(f"Dump archive size: {bytes_human(size_archive)}")
except Exception as e:
logging.error(f"Backup failed during mongodump: {e}")
send_slack(args.slack_webhook, f":x: MongoDB backup failed (mongodump) on {socket.gethostname()}: {e}")
write_prometheus_metrics(args.prom_textfile_dir, {
"mongodb_backup_last_exit_status": 2,
"mongodb_backup_last_duration_seconds": int(time.time() - started_at),
"mongodb_backup_last_size_bytes": 0,
"mongodb_backup_last_success_timestamp": 0
})
sys.exit(2)
# Compress
try:
logging.info("Compressing archive with pigz/gzip...")
compressed_path = compress_file(archive_path, args.threads)
size_compressed = compressed_path.stat().st_size
manifest["size_compressed_bytes"] = size_compressed
logging.info(f"Compressed size: {bytes_human(size_compressed)}")
except Exception as e:
logging.error(f"Compression failed: {e}")
send_slack(args.slack_webhook, f":x: MongoDB backup failed (compression) on {socket.gethostname()}: {e}")
write_prometheus_metrics(args.prom_textfile_dir, {
"mongodb_backup_last_exit_status": 3,
"mongodb_backup_last_duration_seconds": int(time.time() - started_at),
"mongodb_backup_last_size_bytes": 0,
"mongodb_backup_last_success_timestamp": 0
})
sys.exit(3)
# Encrypt (optional)
final_local_path = compressed_path
try:
if args.encrypt:
logging.info("Encrypting compressed archive with AES-256-GCM...")
encrypted_path = encrypt_file(compressed_path, passphrase)
# Remove plaintext compressed file after successful encryption
try:
compressed_path.unlink()
except Exception:
pass
final_local_path = encrypted_path
size_encrypted = final_local_path.stat().st_size
manifest["size_encrypted_bytes"] = size_encrypted
logging.info(f"Encrypted size: {bytes_human(size_encrypted)}")
except Exception as e:
logging.error(f"Encryption failed: {e}")
send_slack(args.slack_webhook, f":x: MongoDB backup failed (encryption) on {socket.gethostname()}: {e}")
write_prometheus_metrics(args.prom_textfile_dir, {
"mongodb_backup_last_exit_status": 4,
"mongodb_backup_last_duration_seconds": int(time.time() - started_at),
"mongodb_backup_last_size_bytes": 0,
"mongodb_backup_last_success_timestamp": 0
})
sys.exit(4)
# Checksum
try:
logging.info("Computing SHA256 checksum...")
sha256_local = compute_sha256(final_local_path)
sha_file = final_local_path.with_suffix(final_local_path.suffix + ".sha256")
sha_file.write_text(sha256_local + " " + final_local_path.name + "\n")
manifest["sha256"] = sha256_local
except Exception as e:
logging.error(f"Checksum computation failed: {e}")
send_slack(args.slack_webhook, f":warning: MongoDB backup checksum failed on {socket.gethostname()}: {e}")
# Upload to network storage
network_target_root = Path(args.network_target_root)
final_name = final_local_path.name
remote_path = network_target_root / final_name
remote_sha_path = network_target_root / (final_local_path.name + ".sha256")
try:
logging.info(f"Uploading to network storage: {remote_path}")
rsync_upload(final_local_path, remote_path)
rsync_upload(final_local_path.with_suffix(final_local_path.suffix + ".sha256"), remote_sha_path)
except Exception as e:
logging.error(f"Upload failed: {e}")
send_slack(args.slack_webhook, f":x: MongoDB backup upload failed on {socket.gethostname()}: {e}")
write_prometheus_metrics(args.prom_textfile_dir, {
"mongodb_backup_last_exit_status": 5,
"mongodb_backup_last_duration_seconds": int(time.time() - started_at),
"mongodb_backup_last_size_bytes": 0,
"mongodb_backup_last_success_timestamp": 0
})
sys.exit(5)
# Verify checksum on remote (mount must be POSIX-accessible)
try:
logging.info("Verifying remote checksum...")
sha_remote = compute_sha256(remote_path)
if sha256_local and sha_remote != sha256_local:
raise RuntimeError(f"Checksum mismatch: local {sha256_local} vs remote {sha_remote}")
logging.info("Checksum verified.")
except Exception as e:
logging.error(f"Remote checksum verification failed: {e}")
send_slack(args.slack_webhook, f":warning: MongoDB backup checksum verify failed on {socket.gethostname()}: {e}")
# Write manifest to network storage
manifest["finished_at"] = now_iso()
manifest["status"] = "success"
manifest["duration_seconds"] = int(time.time() - started_at)
manifest["final_file"] = remote_path.name
try:
manifest_path = network_target_root / f"{args.name_prefix}-{timestamp}.manifest.json"
manifest_path.write_text(json.dumps(manifest, indent=2))
logging.info(f"Manifest written: {manifest_path}")
except Exception as e:
logging.warning(f"Failed to write manifest: {e}")
# Retention
try:
enforce_retention(network_target_root, args.retention_days, args.name_prefix)
except Exception as e:
logging.warning(f"Retention cleanup failed: {e}")
# Metrics & notify
write_prometheus_metrics(args.prom_textfile_dir, {
"mongodb_backup_last_exit_status": 0,
"mongodb_backup_last_duration_seconds": manifest.get("duration_seconds", 0),
"mongodb_backup_last_size_bytes": Path(remote_path).stat().st_size if remote_path.exists() else 0,
"mongodb_backup_last_success_timestamp": int(time.time())
})
send_slack(args.slack_webhook, f":white_check_mark: MongoDB backup succeeded on {socket.gethostname()} "
f"({bytes_human(Path(remote_path).stat().st_size)} in {manifest.get('duration_seconds', 0)}s) -> {remote_path}")
# Cleanup local staging
try:
for p in [final_local_path, final_local_path.with_suffix(final_local_path.suffix + ".sha256")]:
if p.exists():
p.unlink(missing_ok=True)
logging.info("Local staging cleaned up.")
except Exception as e:
logging.warning(f"Local cleanup failed: {e}")
if __name__ == "__main__":
main()
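A typical invocation and a matching restore check (a sketch; hosts, file names, URIs, and the passphrase file are placeholders, and the encrypted file is produced with openssl enc -aes-256-cbc -pbkdf2 as in the script):

# Manual run against a hidden/secondary member
./mongodb_backup.py --mongo-uri "mongodb://backup_user:PASSWORD@hidden-member:27017/?replicaSet=rs0" \
  --network-target-root /mnt/backup/mongodb \
  --passphrase-file /etc/backup.d/mongo_passphrase \
  --prom-textfile-dir /var/lib/node_exporter/textfile_collector
# Restore verification on a test host: decrypt, decompress, then replay with mongorestore
openssl enc -d -aes-256-cbc -pbkdf2 -pass file:/etc/backup.d/mongo_passphrase \
  -in HOST-mongodb-20250101T030000Z.archive.gz.enc -out restore.archive.gz
gunzip restore.archive.gz
mongorestore --archive=restore.archive --oplogReplay --uri "mongodb://localhost:27017"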
If you need further optimization for a specific topology (for example multiple shards, very large collections, or hot/cold data tiering), provide the cluster details and a more fine-grained shard-consistent backup and tiered-storage strategy can be tailored for you.
Helps operations and data teams generate production-ready automated backup scripts and rollout plans within minutes. It covers the mainstream databases and file systems, automatically matches backup type, frequency, retention, and recovery objectives to the business scenario, and outputs the script together with deployment steps and caveats. Failure retry, log tracing, and exception alerting are built in, and compression and encryption are supported. It integrates into existing environments with minimal changes and without disturbing production, significantly reduces the risk of data loss, shortens recovery time, passes compliance audits, and enables standardized delivery across teams and environments. Configure once and reuse long term, so even newcomers can go live quickly by the standard, cutting overall script maintenance and troubleshooting costs.
Quickly establish a unified backup standard across databases and file systems and complete script rollout and drills within half a day; bring backups and alerts into the on-call process to markedly reduce recovery risk and delivery lead time.
Generate full/incremental plans and retention policies aligned with business hours, with encryption, compression, and verification configured automatically; regularly produce recovery-drill scripts and reports to support high availability and audit requirements.
Generate backup jobs with retries and logging in one step, hook them into existing monitoring and notification channels, and quickly locate anomalies and roll back, reducing night-time alert load and manual troubleshooting.