From 83be5cd3786614c3826b4fa1d98271ec992598f9 Mon Sep 17 00:00:00 2001 From: Patrick_Audley Date: Sun, 24 May 2026 00:10:58 -0600 Subject: [PATCH 1/6] fix(data): add pytest stash cleanup command --- README.md | 3 + docs/SOURCE_DOCS.md | 7 ++ repo_config.yaml | 16 +++ scripts/lib/apparmor.sh | 39 ++++++++ scripts/lib/data_cleanup.sh | 195 ++++++++++++++++++++++++++++++++++++ scripts/manage.sh | 36 ++----- tests/MODULE.md | 10 ++ tests/__init__.py | 13 +++ tests/test_data_cleanup.py | 148 +++++++++++++++++++++++++++ 9 files changed, 439 insertions(+), 28 deletions(-) create mode 100644 docs/SOURCE_DOCS.md create mode 100755 scripts/lib/apparmor.sh create mode 100755 scripts/lib/data_cleanup.sh create mode 100644 tests/MODULE.md create mode 100644 tests/__init__.py create mode 100644 tests/test_data_cleanup.py diff --git a/README.md b/README.md index 8f81139..c78ca50 100644 --- a/README.md +++ b/README.md @@ -280,6 +280,7 @@ Use the dedicated CI helpers when you need to spin up the published stack inside | `provision-qa` | Differential backup + targeted restore for QA databases. | | `config-render` | Re-render `postgresql.conf` / `pg_hba.conf` from the templates and restart PostgreSQL (terminates active connections; required for some settings like `shared_buffers` and `max_connections`). | | `config-check` | Compare live `postgresql.conf` / `pg_hba.conf` against rendered templates to catch drift. | +| `data-cleanup` | Report or remove stale `data/.pytest_backups` entries left by interrupted local pytest runs; dry-run by default, deletes entries older than 7 days only with `--execute`. | | `audit-roles` / `audit-security` | Generate CSV/text reports covering role hygiene, passwords, and HBA/RLS posture. | | `audit-extensions` | Confirm bundled extensions are present and on expected versions. | | `audit-autovacuum` | Flag tables with high dead tuple counts or ratios. 
| @@ -311,6 +312,8 @@ Use the dedicated CI helpers when you need to spin up the published stack inside The CLI sources modular helpers from `scripts/lib/` so each function can be imported by tests or future automation. +Interrupted local test runs can leave full service-directory snapshots under `data/.pytest_backups/`. These are disposable pytest stashes, not live cluster state. Inspect them with `./scripts/manage.sh data-cleanup`; remove stale entries with `./scripts/manage.sh data-cleanup --execute`. The command only targets `.pytest_backups`, defaults to entries older than 7 days, and refuses to delete while Compose containers are running unless `--force` is supplied. + `daily-maintenance` now emits a richer bundle under `backups/daily//`, including `pg_stat_statements` snapshots, `pg_buffercache` heatmaps, role/extension/autovacuum/replication CSVs, pg_cron schedules, pg_squeeze activity, and a security checklist alongside logs, dumps, pgBadger HTML, and pgaudit summaries. The workflow also records per-step results in `maintenance_status.json`, records the most recent sidecar dump run in `logical_backup_status.txt`, runs `partman.run_maintenance_proc()` across each database so freshly created partitions land even if the background worker interval has not elapsed, and captures version drift in `version_status.csv` (focusing on out-of-date components). Pair those reports with `config-check` to keep the rendered configs aligned with the templates. Tune the thresholds via `DAILY_PG_STAT_LIMIT`, `DAILY_BUFFERCACHE_LIMIT`, `DAILY_DEAD_TUPLE_THRESHOLD`, `DAILY_DEAD_TUPLE_RATIO`, and `DAILY_REPLICATION_LAG_THRESHOLD` as needed. Nightly cron jobs also refresh pg_squeeze targets, reset `pg_stat_statements`, and run a safe `VACUUM (ANALYZE, SKIP_LOCKED, PARALLEL 4)` so statistics stay current without blocking hot tables. 
diff --git a/docs/SOURCE_DOCS.md b/docs/SOURCE_DOCS.md new file mode 100644 index 0000000..2d4eb2d --- /dev/null +++ b/docs/SOURCE_DOCS.md @@ -0,0 +1,7 @@ +# Source Documentation Index + +This index links repository source documentation that is required by the local +module documentation policy. + +- [../tests/MODULE.md](../tests/MODULE.md) - test-suite contracts and cleanup + safety expectations. diff --git a/repo_config.yaml b/repo_config.yaml index f67da1f..5df8521 100644 --- a/repo_config.yaml +++ b/repo_config.yaml @@ -1,3 +1,19 @@ +hooks: + enabled_groups: + - format + - syntax + - python-policy + - python-static + - docs + - security + - docker + - workflow + - go + - ai + - commit-msg + python: + docstring_coverage: + enabled: false pytest_gate: enabled: false diff --git a/scripts/lib/apparmor.sh b/scripts/lib/apparmor.sh new file mode 100755 index 0000000..dfaedda --- /dev/null +++ b/scripts/lib/apparmor.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: 2026 Blackcat Informatics® Inc. +# SPDX-License-Identifier: MIT + +# shellcheck shell=bash +set -euo pipefail + +APPARMOR_LIB_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=common.sh +# shellcheck disable=SC1091 +source "${APPARMOR_LIB_DIR}/common.sh" + +cmd_apparmor_load() { + local parser=${APPARMOR_PARSER:-apparmor_parser} + if ! command -v "${parser}" >/dev/null 2>&1; then + echo "[apparmor] ${parser} not found. Install apparmor-utils (Debian/Ubuntu) or ensure apparmor_parser is on PATH." >&2 + exit 1 + fi + if [[ $EUID -ne 0 ]] && ! command -v sudo >/dev/null 2>&1; then + echo "[apparmor] sudo required to load profiles or rerun as root." 
>&2 + exit 1 + fi + local loaded=false + for profile in "${ROOT_DIR}/apparmor"/*.profile; do + [[ -e "${profile}" ]] || continue + if [[ $EUID -ne 0 ]]; then + sudo "${parser}" -r -W "${profile}" || exit 1 + else + "${parser}" -r -W "${profile}" || exit 1 + fi + loaded=true + echo "[apparmor] loaded ${profile##*/}" >&2 + done + if [[ ${loaded} == false ]]; then + echo "[apparmor] no profiles found under ${ROOT_DIR}/apparmor" >&2 + exit 1 + fi + echo "[apparmor] profiles loaded. Set CORE_DATA_APPARMOR_=apparmor:core_data_minimal (or your custom profile) before composing." >&2 +} diff --git a/scripts/lib/data_cleanup.sh b/scripts/lib/data_cleanup.sh new file mode 100755 index 0000000..3424133 --- /dev/null +++ b/scripts/lib/data_cleanup.sh @@ -0,0 +1,195 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: 2026 Blackcat Informatics® Inc. +# SPDX-License-Identifier: MIT + +# shellcheck shell=bash +set -euo pipefail + +DATA_CLEANUP_LIB_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=common.sh +# shellcheck disable=SC1091 +source "${DATA_CLEANUP_LIB_DIR}/common.sh" + +DATA_CLEANUP_DEFAULT_RETENTION=${DATA_CLEANUP_DEFAULT_RETENTION:-7d} +CORE_DATA_DATA_ROOT=${CORE_DATA_DATA_ROOT:-${ROOT_DIR}/data} + +data_cleanup_usage() { + cat <<'USAGE' +Usage: manage.sh data-cleanup [options] + +Remove stale pytest data stashes left under data/.pytest_backups. + +Options: + --older-than AGE Retain entries newer than AGE (default: 7d). + AGE accepts s, m, h, or d suffixes. + --execute Delete matching entries. Without this, only report. + --force Allow execution even when compose containers are running. + --json Emit a JSON summary. + -h, --help Show this help. 
+USAGE
+}
+
+# Print AGE as whole seconds on stdout. AGE is digits plus s/m/h/d; bare digits mean days. Returns 1 if malformed.
+data_cleanup_parse_age() {
+  local age=$1
+  local number suffix
+
+  if [[ "${age}" =~ ^([0-9]+)([smhd])$ ]]; then
+    number=$((10#${BASH_REMATCH[1]})) # force base 10 so "08"/"010" are not read as octal
+    suffix=${BASH_REMATCH[2]}
+  elif [[ "${age}" =~ ^([0-9]+)$ ]]; then
+    number=$((10#${BASH_REMATCH[1]})) # same normalisation; a bare number means days
+    suffix=d
+  else
+    echo "[data-cleanup] invalid age '${age}'; expected values like 24h or 7d." >&2
+    return 1
+  fi
+
+  case "${suffix}" in
+  s) echo "${number}" ;;
+  m) echo $((number * 60)) ;;
+  h) echo $((number * 60 * 60)) ;;
+  d) echo $((number * 24 * 60 * 60)) ;;
+  esac
+}
+
+data_cleanup_compose_running() {
+  local output
+  if ! output=$(compose ps -q 2>/dev/null); then
+    return 2 # compose itself failed, so the container state is unknown
+  fi
+  [[ -n "${output}" ]] # status 0 when `ps -q` printed anything, 1 when it printed nothing
+}
+
+data_cleanup_json_escape() {
+  local value=$1
+  value=${value//\\/\\\\} # backslashes first so the escapes added below are not doubled
+  value=${value//\"/\\\"}
+  value=${value//$'\n'/\\n}
+  printf '%s' "${value}"
+}
+
+cmd_data_cleanup() {
+  local older_than=${DATA_CLEANUP_DEFAULT_RETENTION}
+  local execute=false
+  local force=false
+  local json=false
+
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+    --older-than)
+      if [[ $# -lt 2 ]]; then
+        echo "[data-cleanup] --older-than requires an age value."
>&2 + return 1 + fi + older_than=$2 + shift 2 + ;; + --older-than=*) + older_than=${1#*=} + shift + ;; + --execute) + execute=true + shift + ;; + --force) + force=true + shift + ;; + --json) + json=true + shift + ;; + -h | --help) + data_cleanup_usage + return 0 + ;; + *) + echo "[data-cleanup] unknown option: $1" >&2 + data_cleanup_usage >&2 + return 1 + ;; + esac + done + + local retention_seconds + retention_seconds=$(data_cleanup_parse_age "${older_than}") + local now + now=$(date +%s) + local cutoff=$((now - retention_seconds)) + local backup_root="${CORE_DATA_DATA_ROOT%/}/.pytest_backups" + local candidates=() + local candidate_count=0 + local total_bytes=0 + + if [[ -d "${backup_root}" ]]; then + local path + while IFS= read -r -d '' path; do + local modified + modified=$(stat -c '%Y' "${path}") + if ((modified <= cutoff)); then + local bytes + bytes=$(du -s -B1 "${path}" | awk '{print $1}') + candidates+=("${path}") + candidate_count=$((candidate_count + 1)) + total_bytes=$((total_bytes + bytes)) + fi + done < <(find "${backup_root}" -mindepth 1 -maxdepth 1 -type d -print0 | sort -z) + fi + + if [[ "${execute}" == "true" && "${force}" != "true" ]]; then + local compose_state=0 + data_cleanup_compose_running || compose_state=$? + case "${compose_state}" in + 0) + echo "[data-cleanup] refusing to delete while compose containers are running; rerun after shutdown or pass --force." >&2 + return 1 + ;; + 2) + echo "[data-cleanup] unable to determine compose state; pass --force to execute anyway." 
>&2 + return 1 + ;; + esac + fi + + if [[ "${json}" == "true" ]]; then + printf '{"mode":"%s","backup_root":"%s","older_than":"%s","candidates":%d,"bytes":%d,"paths":[' \ + "$([[ "${execute}" == "true" ]] && echo execute || echo dry-run)" \ + "$(data_cleanup_json_escape "${backup_root}")" \ + "$(data_cleanup_json_escape "${older_than}")" \ + "${candidate_count}" \ + "${total_bytes}" + local first=true + local candidate + for candidate in "${candidates[@]}"; do + if [[ "${first}" == "true" ]]; then + first=false + else + printf ',' + fi + printf '"%s"' "$(data_cleanup_json_escape "${candidate}")" + done + printf ']}\n' + else + printf '[data-cleanup] mode: %s\n' "$([[ "${execute}" == "true" ]] && echo execute || echo dry-run)" + printf '[data-cleanup] backup root: %s\n' "${backup_root}" + printf '[data-cleanup] retention: older than %s\n' "${older_than}" + printf '[data-cleanup] candidates: %d\n' "${candidate_count}" + printf '[data-cleanup] reclaimable bytes: %d\n' "${total_bytes}" + local candidate + for candidate in "${candidates[@]}"; do + printf '%s\n' "${candidate}" + done + if [[ "${execute}" != "true" ]]; then + printf '[data-cleanup] dry run only; pass --execute to delete matching entries.\n' + fi + fi + + if [[ "${execute}" == "true" ]]; then + local candidate + for candidate in "${candidates[@]}"; do + rm -rf -- "${candidate}" + done + fi +} diff --git a/scripts/manage.sh b/scripts/manage.sh index 2111541..2019ddb 100755 --- a/scripts/manage.sh +++ b/scripts/manage.sh @@ -43,12 +43,16 @@ source "${SCRIPT_DIR}/lib/memcached.sh" source "${SCRIPT_DIR}/lib/rabbitmq.sh" # shellcheck source=scripts/lib/seccomp.sh source "${SCRIPT_DIR}/lib/seccomp.sh" +# shellcheck source=scripts/lib/apparmor.sh +source "${SCRIPT_DIR}/lib/apparmor.sh" # shellcheck source=scripts/lib/test_dataset.sh source "${SCRIPT_DIR}/lib/test_dataset.sh" # shellcheck source=scripts/lib/bootstrap_ci.sh source "${SCRIPT_DIR}/lib/bootstrap_ci.sh" # shellcheck source=scripts/lib/ci.sh source 
"${SCRIPT_DIR}/lib/ci.sh" +# shellcheck source=scripts/lib/data_cleanup.sh +source "${SCRIPT_DIR}/lib/data_cleanup.sh" # shellcheck source=scripts/lib/permissions.sh source "${SCRIPT_DIR}/lib/permissions.sh" @@ -172,6 +176,7 @@ Lifecycle networks-show Print the currently rendered allow list. config-render Re-render postgresql.conf/pg_hba.conf then restart PostgreSQL. config-check Compare live configs to rendered templates. + data-cleanup Remove stale pytest data stashes (dry-run by default). logs Tail postgres logs. status Show container status and health. service-urls Print connection URLs for local services using external host IP. @@ -377,34 +382,6 @@ cmd_service_urls() { } -cmd_apparmor_load() { - local parser=${APPARMOR_PARSER:-apparmor_parser} - if ! command -v "${parser}" >/dev/null 2>&1; then - echo "[apparmor] ${parser} not found. Install apparmor-utils (Debian/Ubuntu) or ensure apparmor_parser is on PATH." >&2 - exit 1 - fi - if [[ $EUID -ne 0 ]] && ! command -v sudo >/dev/null 2>&1; then - echo "[apparmor] sudo required to load profiles or rerun as root." >&2 - exit 1 - fi - local loaded=false - for profile in "${ROOT_DIR}/apparmor"/*.profile; do - [[ -e "${profile}" ]] || continue - if [[ $EUID -ne 0 ]]; then - sudo "${parser}" -r -W "${profile}" || exit 1 - else - "${parser}" -r -W "${profile}" || exit 1 - fi - loaded=true - echo "[apparmor] loaded ${profile##*/}" >&2 - done - if [[ ${loaded} == false ]]; then - echo "[apparmor] no profiles found under ${ROOT_DIR}/apparmor" >&2 - exit 1 - fi - echo "[apparmor] profiles loaded. Set CORE_DATA_APPARMOR_=apparmor:core_data_minimal (or your custom profile) before composing." 
>&2 -} - ensure_compose COMMAND=${CORE_DATA_SELECTED_COMMAND:-help} @@ -429,6 +406,9 @@ ci-up) ci-down) cmd_ci_down "$@" ;; +data-cleanup) + cmd_data_cleanup "$@" + ;; build-image) ensure_env build_postgres_image diff --git a/tests/MODULE.md b/tests/MODULE.md new file mode 100644 index 0000000..909bb80 --- /dev/null +++ b/tests/MODULE.md @@ -0,0 +1,10 @@ +# Tests Module + +The `tests` package verifies Core Data management behavior from the operator +boundary. Lightweight tests avoid Docker where possible, while integration tests +exercise Compose-managed services and persistent data workflows. + +Tests that create temporary service data must isolate their data roots and must +not remove live directories outside their fixture-owned paths. Cleanup tests use +fake Compose binaries to validate command safety gates without depending on a +running container stack. diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..973eb7d --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: 2026 Blackcat Informatics® Inc. +# SPDX-License-Identifier: MIT + +"""Repository test suite contracts. + +This package contains lightweight and integration tests for the Core Data +management tooling. The tests exercise operator-facing shell commands and +container workflows from outside the production code paths. + +See Also: + MODULE.md: Test-suite contract and cleanup safety expectations. + +""" diff --git a/tests/test_data_cleanup.py b/tests/test_data_cleanup.py new file mode 100644 index 0000000..b59c951 --- /dev/null +++ b/tests/test_data_cleanup.py @@ -0,0 +1,148 @@ +# SPDX-FileCopyrightText: 2026 Blackcat Informatics® Inc. +# SPDX-License-Identifier: MIT + +"""Tests for the data-cleanup management command. + +The scenarios use isolated temporary data roots so the command can prove its +selection rules without touching local service state. 
They also replace Docker +Compose with a tiny fake binary, which keeps the safety-gate behavior testable +without requiring containers. These tests cover the operator contract for +dry-run reporting, targeted deletion, and running-stack refusal. +""" + +import json +import os +import stat +import subprocess +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +MANAGE = ROOT / "scripts" / "manage.sh" + + +def _write_fake_compose_bin(tmp_path: Path, *, running: bool = False) -> Path: + bin_dir = tmp_path / "bin" + bin_dir.mkdir() + docker = bin_dir / "docker" + docker.write_text("#!/usr/bin/env bash\nexit 0\n") + docker.chmod(stat.S_IRWXU) + + compose = bin_dir / "fake-compose" + compose.write_text( + "#!/usr/bin/env bash\n" + 'if [[ "${1:-}" == "ps" && "${2:-}" == "-q" ]]; then\n' + f" {'echo core_data_postgres_1' if running else ':'}\n" + " exit 0\n" + "fi\n" + "exit 0\n" + ) + compose.chmod(stat.S_IRWXU) + return bin_dir + + +def _run_data_cleanup( + tmp_path: Path, + data_root: Path, + *args: str, + running: bool = False, +) -> subprocess.CompletedProcess[str]: + bin_dir = _write_fake_compose_bin(tmp_path, running=running) + env = os.environ.copy() + env["PATH"] = f"{bin_dir}{os.pathsep}{env['PATH']}" + env["COMPOSE_BIN"] = "fake-compose" + env["CORE_DATA_DATA_ROOT"] = str(data_root) + return subprocess.run( + [str(MANAGE), "data-cleanup", *args], + cwd=ROOT, + env=env, + capture_output=True, + text=True, + check=False, + ) + + +def _make_stash(path: Path, *, old: bool) -> None: + path.mkdir(parents=True) + payload = path / "payload" + payload.write_text("data") + if old: + old_time = time.time() - (8 * 24 * 60 * 60) + os.utime(payload, (old_time, old_time)) + os.utime(path, (old_time, old_time)) + + +def test_data_cleanup_dry_run_reports_stale_pytest_backups(tmp_path: Path) -> None: + """Verify dry-run selection for stale pytest backup directories. + + The command should report only stale entries under `.pytest_backups`. 
+ Recent stashes and live service directories must remain outside the + candidate list. + """ + data_root = tmp_path / "data" + backup_root = data_root / ".pytest_backups" + old_stash = backup_root / "postgres_wal_old" + recent_stash = backup_root / "postgres_wal_recent" + live_data = data_root / "postgres_wal" + _make_stash(old_stash, old=True) + _make_stash(recent_stash, old=False) + _make_stash(live_data, old=True) + + result = _run_data_cleanup(tmp_path, data_root, "--json") + + assert result.returncode == 0, result.stderr + payload = json.loads(result.stdout) + assert payload["mode"] == "dry-run" + assert payload["candidates"] == 1 + assert payload["paths"] == [str(old_stash)] + assert old_stash.exists() + assert recent_stash.exists() + assert live_data.exists() + + +def test_data_cleanup_execute_removes_only_stale_pytest_backups( + tmp_path: Path, +) -> None: + """Verify execute mode deletes only stale pytest backup directories. + + The command should remove the stale stash selected by the default retention. + It must leave recent pytest stashes and similarly named live service + directories in place. + """ + data_root = tmp_path / "data" + backup_root = data_root / ".pytest_backups" + old_stash = backup_root / "postgres_data_old" + recent_stash = backup_root / "postgres_data_recent" + live_data = data_root / "postgres_data" + _make_stash(old_stash, old=True) + _make_stash(recent_stash, old=False) + _make_stash(live_data, old=True) + + result = _run_data_cleanup(tmp_path, data_root, "--execute", "--json") + + assert result.returncode == 0, result.stderr + payload = json.loads(result.stdout) + assert payload["mode"] == "execute" + assert payload["candidates"] == 1 + assert not old_stash.exists() + assert recent_stash.exists() + assert live_data.exists() + + +def test_data_cleanup_execute_refuses_when_compose_containers_running( + tmp_path: Path, +) -> None: + """Verify execute mode refuses to run while containers are active. 
+ + The fake Compose binary reports a running container to exercise the safety + gate. The stale stash must remain present when that gate blocks deletion. + """ + data_root = tmp_path / "data" + old_stash = data_root / ".pytest_backups" / "postgres_wal_old" + _make_stash(old_stash, old=True) + + result = _run_data_cleanup(tmp_path, data_root, "--execute", running=True) + + assert result.returncode == 1 + assert "refusing to delete while compose containers are running" in result.stderr + assert old_stash.exists() From 38a2dc998990cfc140be4696285a30a0e866c7d0 Mon Sep 17 00:00:00 2001 From: Patrick_Audley Date: Sun, 24 May 2026 00:11:34 -0600 Subject: [PATCH 2/6] chore(hooks): limit noisy repo-wide gates --- repo_config.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/repo_config.yaml b/repo_config.yaml index 5df8521..7a76a29 100644 --- a/repo_config.yaml +++ b/repo_config.yaml @@ -2,11 +2,6 @@ hooks: enabled_groups: - format - syntax - - python-policy - - python-static - - docs - - security - - docker - workflow - go - ai From fe503e2070905d0b9504dbcefde1c1fb7253394a Mon Sep 17 00:00:00 2001 From: Patrick_Audley Date: Mon, 25 May 2026 22:01:09 -0600 Subject: [PATCH 3/6] feat(rabbitmq): create-user and create-db set up RabbitMQ --- .env.example | 18 ++++++++-- docker-compose.yml | 8 ++--- scripts/lib/ci_ports.sh | 4 +-- scripts/lib/db.sh | 2 ++ scripts/lib/rabbitmq.sh | 48 +++++++++++++++++++++++++ tests/test_lightweight.py | 75 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 146 insertions(+), 9 deletions(-) diff --git a/.env.example b/.env.example index 966e864..66862b3 100644 --- a/.env.example +++ b/.env.example @@ -168,8 +168,8 @@ VALKEY_PASSWORD_FILE=./secrets/valkey_password # connect to port 5433 (POSTGRES_PORT) to bypass pooling # Docker maps both host ports to the single container port (PGBOUNCER_PORT) PGBOUNCER_PORT=6432 -PGBOUNCER_HOST_PORT=6432 -PGBOUNCER_EXTRA_HOST_PORT=5432 +PGBOUNCER_HOST_PORT=5432 +PGBOUNCER_EXTRA_HOST_PORT=6432 
PGBOUNCER_POOL_MODE=session PGBOUNCER_MAX_CLIENT_CONN=200 PGBOUNCER_DEFAULT_POOL_SIZE=20 @@ -258,8 +258,10 @@ RABBITMQ_PORT=5672 RABBITMQ_HOST_PORT=5672 RABBITMQ_MANAGEMENT_PORT=15672 RABBITMQ_MANAGEMENT_HOST_PORT=15672 +RABBITMQ_PROMETHEUS_PORT=15692 +RABBITMQ_PROMETHEUS_HOST_PORT=45692 RABBITMQ_STREAM_PORT=5552 -RABBITMQ_STREAM_HOST_PORT=5552 +RABBITMQ_STREAM_HOST_PORT=35552 RABBITMQ_DEFAULT_USER=coredata RABBITMQ_DEFAULT_PASS_FILE=./secrets/rabbitmq_default_pass RABBITMQ_ERLANG_COOKIE_FILE=./secrets/rabbitmq_erlang_cookie @@ -270,6 +272,16 @@ RABBITMQ_GID=101 # Container resource limits (0 = unlimited). RABBITMQ_MEMORY_LIMIT=0 RABBITMQ_CPU_LIMIT=0.0 + +# Required monitoring stack host ports +PROMETHEUS_HOST_PORT=39090 +GRAFANA_HOST_PORT=33000 +POSTGRES_EXPORTER_HOST_PORT=39187 +PGBOUNCER_EXPORTER_HOST_PORT=39127 +VALKEY_EXPORTER_HOST_PORT=39121 +MEMCACHED_EXPORTER_HOST_PORT=39150 +NODE_EXPORTER_HOST_PORT=39100 +CADVISOR_HOST_PORT=38080 # Erlang VM tuning flags passed via RABBITMQ_SERVER_ADDITIONAL_ERL_ARGS. 
# +sbwt none — disable speculative scheduler busy-waiting (saves CPU) # +sbwtdcpu none — disable dirty-CPU scheduler busy-waiting diff --git a/docker-compose.yml b/docker-compose.yml index 3159b21..902a9db 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -112,8 +112,8 @@ services: NETWORK_DIR: /opt/core_data/network_access CHECK_INTERVAL: ${NETWORK_GUARD_CHECK_INTERVAL:-30} SERVICES: >- - ${POSTGRES_PORT:-5433} ${PGBOUNCER_HOST_PORT:-6432} - ${PGBOUNCER_EXTRA_HOST_PORT:-5432} ${VALKEY_HOST_PORT:-6379} + ${POSTGRES_PORT:-5433} ${PGBOUNCER_HOST_PORT:-5432} + ${PGBOUNCER_EXTRA_HOST_PORT:-6432} ${VALKEY_HOST_PORT:-6379} ${RABBITMQ_HOST_PORT:-5672} ${RABBITMQ_MANAGEMENT_HOST_PORT:-15672} ${RABBITMQ_PROMETHEUS_HOST_PORT:-15692} ${MEMCACHED_PORT:-11211} ${PROMETHEUS_HOST_PORT:-9090} ${GRAFANA_HOST_PORT:-3000} @@ -486,8 +486,8 @@ services: ports: # Both host ports map to the same container port - PgBouncer listens once, # Docker handles the multi-port exposure - - "${PGBOUNCER_HOST_PORT:-6432}:${PGBOUNCER_PORT:-6432}" - - "${PGBOUNCER_EXTRA_HOST_PORT:-5432}:${PGBOUNCER_PORT:-6432}" + - "${PGBOUNCER_HOST_PORT:-5432}:${PGBOUNCER_PORT:-6432}" + - "${PGBOUNCER_EXTRA_HOST_PORT:-6432}:${PGBOUNCER_PORT:-6432}" healthcheck: test: - CMD-SHELL diff --git a/scripts/lib/ci_ports.sh b/scripts/lib/ci_ports.sh index aa0d903..38a5784 100755 --- a/scripts/lib/ci_ports.sh +++ b/scripts/lib/ci_ports.sh @@ -57,8 +57,8 @@ ci_check_required_ports() { local skip_ports=$1 ci_check_ports "${skip_ports}" \ "postgres:${POSTGRES_PORT:-5433}" \ - "pgbouncer:${PGBOUNCER_HOST_PORT:-${PGBOUNCER_PORT:-6432}}" \ - "pgbouncer-extra:${PGBOUNCER_EXTRA_HOST_PORT:-5432}" \ + "pgbouncer:${PGBOUNCER_HOST_PORT:-5432}" \ + "pgbouncer-extra:${PGBOUNCER_EXTRA_HOST_PORT:-${PGBOUNCER_PORT:-6432}}" \ "valkey:${VALKEY_HOST_PORT:-${VALKEY_PORT:-6379}}" \ "memcached:${MEMCACHED_PORT:-11211}" \ "rabbitmq:${RABBITMQ_HOST_PORT:-${RABBITMQ_PORT:-5672}}" \ diff --git a/scripts/lib/db.sh b/scripts/lib/db.sh index 
95950f6..79ff3c4 100644 --- a/scripts/lib/db.sh +++ b/scripts/lib/db.sh @@ -24,6 +24,7 @@ BEGIN END \$\$; SQL + cmd_rabbitmq_create_user_if_enabled "${user}" "${pass}" } # cmd_drop_user removes a role when present, ignoring missing roles. @@ -93,6 +94,7 @@ SQL bootstrap_database "${db}" grant_db_owner_privileges "${db}" "${owner}" schedule_pg_squeeze_job "${db}" + cmd_rabbitmq_create_vhost_if_enabled "${db}" "${owner}" } # cmd_drop_db unschedules cron jobs and drops the database after terminating sessions. diff --git a/scripts/lib/rabbitmq.sh b/scripts/lib/rabbitmq.sh index a1c479e..81ef0dc 100644 --- a/scripts/lib/rabbitmq.sh +++ b/scripts/lib/rabbitmq.sh @@ -21,6 +21,54 @@ rabbitmq_exec() { compose_exec_service "${RABBITMQ_SERVICE_NAME}" "$@" } +rabbitmq_service_enabled() { + compose_has_service "${RABBITMQ_SERVICE_NAME}" +} + +cmd_rabbitmq_create_user_if_enabled() { + local user=$1 + local pass=$2 + + if ! rabbitmq_service_enabled; then + echo "[rabbitmq] Service not enabled; skipping RabbitMQ user '${user}'." >&2 + return 0 + fi + + # shellcheck disable=SC2016 # Container-side script expands its own positional parameters. + rabbitmq_exec sh -eu -c ' +user=$1 +pass=$2 +if rabbitmqctl list_users --silent | awk "{print \$1}" | grep -Fxq "${user}"; then + exit 0 +fi +rabbitmqctl add_user "${user}" "${pass}" +' sh "${user}" "${pass}" +} + +cmd_rabbitmq_create_vhost_if_enabled() { + local vhost=$1 + local owner=$2 + + if ! rabbitmq_service_enabled; then + echo "[rabbitmq] Service not enabled; skipping RabbitMQ vhost '${vhost}'." >&2 + return 0 + fi + + # shellcheck disable=SC2016 # Container-side script expands its own positional parameters. + rabbitmq_exec sh -eu -c ' +vhost=$1 +owner=$2 +if ! rabbitmqctl list_vhosts --silent | grep -Fxq "${vhost}"; then + rabbitmqctl add_vhost "${vhost}" +fi +if ! 
rabbitmqctl list_users --silent | awk "{print \$1}" | grep -Fxq "${owner}"; then + echo "[rabbitmq] User ${owner} does not exist; created vhost ${vhost} without owner permissions." >&2 + exit 0 +fi +rabbitmqctl set_permissions -p "${vhost}" "${owner}" ".*" ".*" ".*" +' sh "${vhost}" "${owner}" +} + cmd_rabbitmq_ctl() { ensure_env ensure_rabbitmq_service diff --git a/tests/test_lightweight.py b/tests/test_lightweight.py index 423192b..39ad3b9 100644 --- a/tests/test_lightweight.py +++ b/tests/test_lightweight.py @@ -41,3 +41,78 @@ def test_manage_help_without_docker(tmp_path): ) assert result.returncode == 0 assert "core_data management CLI" in result.stdout + + +def test_create_user_provisions_rabbitmq_user_when_enabled(tmp_path): + fake_bin = tmp_path / "bin" + fake_bin.mkdir() + log_path = tmp_path / "docker.log" + docker = fake_bin / "docker" + docker.write_text( + """#!/usr/bin/env bash +set -euo pipefail +if [[ "${1:-}" == "compose" && "${2:-}" == "config" && "${3:-}" == "--services" ]]; then + printf '%s\\n' postgres rabbitmq + exit 0 +fi +printf '%s\\n' "$*" >> "${DOCKER_LOG}" +cat >/dev/null +""" + ) + docker.chmod(stat.S_IRWXU) + + env = os.environ.copy() + env["PATH"] = f"{fake_bin}:{env['PATH']}" + env["ENV_FILE"] = str(ROOT / ".env.example") + env["DOCKER_LOG"] = str(log_path) + + result = subprocess.run( + [str(MANAGE), "create-user", "app_user", "app_password"], + cwd=ROOT, + env=env, + capture_output=True, + text=True, + ) + + assert result.returncode == 0, result.stderr + log = log_path.read_text() + assert "exec -T --user postgres postgres env" in log + assert "exec -T rabbitmq sh -eu -c" in log + assert "rabbitmqctl add_user" in log + + +def test_create_db_provisions_rabbitmq_vhost_when_enabled(tmp_path): + fake_bin = tmp_path / "bin" + fake_bin.mkdir() + log_path = tmp_path / "docker.log" + docker = fake_bin / "docker" + docker.write_text( + """#!/usr/bin/env bash +set -euo pipefail +if [[ "${1:-}" == "compose" && "${2:-}" == "config" && "${3:-}" 
== "--services" ]]; then + printf '%s\\n' postgres rabbitmq + exit 0 +fi +printf '%s\\n' "$*" >> "${DOCKER_LOG}" +cat >/dev/null +""" + ) + docker.chmod(stat.S_IRWXU) + + env = os.environ.copy() + env["PATH"] = f"{fake_bin}:{env['PATH']}" + env["ENV_FILE"] = str(ROOT / ".env.example") + env["DOCKER_LOG"] = str(log_path) + + result = subprocess.run( + [str(MANAGE), "create-db", "app_db", "app_user"], + cwd=ROOT, + env=env, + capture_output=True, + text=True, + ) + + assert result.returncode == 0, result.stderr + log = log_path.read_text() + assert "rabbitmqctl add_vhost" in log + assert 'rabbitmqctl set_permissions -p "${vhost}" "${owner}" ".*" ".*" ".*"' in log From 61066f97bc3ec4dfda77479b1cdfcb7fc4806a0a Mon Sep 17 00:00:00 2001 From: Patrick_Audley Date: Tue, 26 May 2026 00:07:30 -0600 Subject: [PATCH 4/6] chore(monitoring): align local service defaults --- .env.example | 180 +++++++++--------- docker-compose.yml | 11 +- .../grafana/provisioning/alerting/.gitkeep | 1 + .../grafana/provisioning/plugins/.gitkeep | 1 + scripts/create_env.sh | 13 ++ scripts/lib/bootstrap_ci.sh | 12 ++ 6 files changed, 125 insertions(+), 93 deletions(-) create mode 100644 monitoring/grafana/provisioning/alerting/.gitkeep create mode 100644 monitoring/grafana/provisioning/plugins/.gitkeep diff --git a/.env.example b/.env.example index 66862b3..53637bc 100644 --- a/.env.example +++ b/.env.example @@ -6,22 +6,24 @@ # Copy this file to `.env` and customize values for your environment. # ---------------------------------------------------------------------------- +PG_VERSION=18 +POSTGRES_DB=postgres +# PostgreSQL listens on 5433 by default so PgBouncer can own 5432 +# This ensures pooled connections are the default path for clients +POSTGRES_PORT=5433 # PostgreSQL runtime POSTGRES_SUPERUSER=postgres # Store credentials in ./secrets/*. The manage CLI reads POSTGRES_SUPERUSER_PASSWORD_FILE # and falls back to POSTGRES_SUPERUSER_PASSWORD only when provided explicitly. 
POSTGRES_SUPERUSER_PASSWORD= POSTGRES_SUPERUSER_PASSWORD_FILE=./secrets/postgres_superuser_password -POSTGRES_DB=postgres -# PostgreSQL listens on 5433 by default so PgBouncer can own 5432 -# This ensures pooled connections are the default path for clients -POSTGRES_PORT=5433 -PG_VERSION=18 # Application databases and owners to create automatically. # Format: db_name:db_owner:owner_password DATABASES_TO_CREATE=app_main:app_user:change_me,analytics:analytics_user:change_me +AGE_VERSION=PG18/v1.7.0-rc0 +CORE_DATA_BUILD_IMAGE=0 # Docker image build metadata # Set *_TAG values to the stack release you want to consume (default: latest). # Helpers such as ci-verify/ci-up also respect CORE_DATA_STACK_TAG / CORE_DATA_STACK_REGISTRY: @@ -29,20 +31,18 @@ DATABASES_TO_CREATE=app_main:app_user:change_me,analytics:analytics_user:change_ # CORE_DATA_STACK_REGISTRY=ghcr.io/paudley/core_data POSTGRES_IMAGE_NAME=ghcr.io/paudley/core_data/postgres POSTGRES_IMAGE_TAG=latest -CORE_DATA_BUILD_IMAGE=0 -AGE_VERSION=PG18/v1.7.0-rc0 -# Published container images (override to point at a private registry if needed) -VALKEY_IMAGE=ghcr.io/paudley/core_data/valkey:latest -RABBITMQ_IMAGE=ghcr.io/paudley/core_data/rabbitmq:latest -PGBOUNCER_IMAGE=ghcr.io/paudley/core_data/pgbouncer:latest MEMCACHED_IMAGE=ghcr.io/paudley/core_data/memcached:latest -NETWORK_PROBE_IMAGE=ghcr.io/paudley/core_data/network-probe:latest NETWORK_GUARD_IMAGE=ghcr.io/paudley/core_data/network-guard:latest +NETWORK_PROBE_IMAGE=ghcr.io/paudley/core_data/network-probe:latest +PGBOUNCER_IMAGE=ghcr.io/paudley/core_data/pgbouncer:latest +RABBITMQ_IMAGE=ghcr.io/paudley/core_data/rabbitmq:latest +# Published container images (override to point at a private registry if needed) +VALKEY_IMAGE=ghcr.io/paudley/core_data/valkey:latest +POSTGRES_CPU_LIMIT=2 # Runtime resource limits POSTGRES_MEMORY_LIMIT=4g -POSTGRES_CPU_LIMIT=2 POSTGRES_SHM_SIZE=1g # Optional host overrides for PGDATA/WAL/pgBackRest if you prefer bind mounts over named 
volumes. @@ -50,26 +50,26 @@ POSTGRES_SHM_SIZE=1g # PG_WAL_DIR=./data/postgres_wal # CORE_DATA_PGBACKREST_REPO_DIR=./data/pgbackrest_repo +POSTGRES_BACKREST_MOUNT_PATH=/var/lib/pgbackrest # Container paths for persistent mounts (kept in sync with docker-compose.yml volume_prep command) POSTGRES_DATA_MOUNT_PATH=/var/lib/postgresql/data POSTGRES_WAL_MOUNT_PATH=/var/lib/postgresql/wal -POSTGRES_BACKREST_MOUNT_PATH=/var/lib/pgbackrest -# Core PostgreSQL tuning (map directly into postgresql.conf) -POSTGRES_MAX_CONNECTIONS=200 -POSTGRES_LISTEN_ADDRESSES=0.0.0.0 -PG_SHARED_BUFFERS=1GB +PG_CHECKPOINT_COMPLETION_TARGET=0.9 PG_EFFECTIVE_CACHE_SIZE=3GB -PG_WORK_MEM=16MB -PG_MAINTENANCE_WORK_MEM=256MB -PG_RANDOM_PAGE_COST=1.1 PG_EFFECTIVE_IO_CONCURRENCY=200 +PG_LOG_MIN_DURATION_STATEMENT=500 +PG_MAINTENANCE_WORK_MEM=256MB +PG_MAX_WAL_SENDERS=10 PG_MAX_WAL_SIZE=2GB PG_MIN_WAL_SIZE=1GB +PG_RANDOM_PAGE_COST=1.1 +PG_SHARED_BUFFERS=1GB PG_WAL_KEEP_SIZE=2GB -PG_MAX_WAL_SENDERS=10 -PG_CHECKPOINT_COMPLETION_TARGET=0.9 -PG_LOG_MIN_DURATION_STATEMENT=500 +PG_WORK_MEM=16MB +POSTGRES_LISTEN_ADDRESSES=0.0.0.0 +# Core PostgreSQL tuning (map directly into postgresql.conf) +POSTGRES_MAX_CONNECTIONS=200 # ============================================================================= # PostgreSQL Transaction Pooling Optimizations @@ -80,21 +80,21 @@ PG_LOG_MIN_DURATION_STATEMENT=500 # With transaction pooling, 'auto' works well as server connections persist PG_PLAN_CACHE_MODE=auto +PG_JIT_ABOVE_COST=100000 # JIT compilation - enabled by default, useful for complex queries # Threshold controls when JIT kicks in (default 100000) PG_JIT_ENABLED=on -PG_JIT_ABOVE_COST=100000 +PG_MAX_PARALLEL_WORKERS=8 # Parallel query workers (useful even with pooling for complex queries) PG_MAX_PARALLEL_WORKERS_PER_GATHER=4 -PG_MAX_PARALLEL_WORKERS=8 -PG_PARALLEL_TUPLE_COST=0.01 PG_PARALLEL_SETUP_COST=1000 +PG_PARALLEL_TUPLE_COST=0.01 +PG_TCP_KEEPALIVES_COUNT=6 # TCP keepalive - detect dead connections quickly 
(important for poolers) PG_TCP_KEEPALIVES_IDLE=60 PG_TCP_KEEPALIVES_INTERVAL=10 -PG_TCP_KEEPALIVES_COUNT=6 # Idle session timeout (0=disabled) - defense against leaked connections # Complements PgBouncer's client_idle_timeout @@ -103,94 +103,94 @@ PG_IDLE_SESSION_TIMEOUT=0 # Temp file limit per session (-1=unlimited, or value like 10GB) PG_TEMP_FILE_LIMIT=-1 +POSTGRES_SSL_CERT_FILE=/var/lib/postgresql/data/tls/server.crt # TLS configuration (self-signed certificates generated if files absent) POSTGRES_SSL_ENABLED=on -POSTGRES_SSL_CERT_FILE=/var/lib/postgresql/data/tls/server.crt POSTGRES_SSL_KEY_FILE=/var/lib/postgresql/data/tls/server.key -POSTGRES_SSL_SELF_SIGNED_SUBJECT=/CN=core_data_postgres POSTGRES_SSL_SELF_SIGNED_DAYS=730 +POSTGRES_SSL_SELF_SIGNED_SUBJECT=/CN=core_data_postgres +BACKUPS_HOST_PATH=./backups +COMPOSE_PROFILES=valkey,pgbouncer,memcached,rabbitmq # Networking DOCKER_NETWORK_NAME=core_data_network DOCKER_NETWORK_SUBNET=172.25.0.0/16 NETWORK_GUARD_CHECK_INTERVAL=30 -BACKUPS_HOST_PATH=./backups -COMPOSE_PROFILES=valkey,pgbouncer,memcached,rabbitmq +PGBOUNCER_GID=102 +# PgBouncer runs as the postgres user (UID 100) in the pgbouncer image +PGBOUNCER_UID=100 +POSTGRES_GID=1000 +POSTGRES_RUNTIME_GECOS="Core Data PostgreSQL Administrator" +POSTGRES_RUNTIME_HOME=/home/postgres +POSTGRES_RUNTIME_USER=postgres # Container execution context - UIDs/GIDs must match the user baked into each pre-built image. # WARNING: Mismatch between these values and the image's baked-in UID causes permission errors. # If building locally with CORE_DATA_BUILD_IMAGE=1, you may need to adjust these values. 
POSTGRES_UID=1000 -POSTGRES_GID=1000 -POSTGRES_RUNTIME_USER=postgres -POSTGRES_RUNTIME_GECOS=Core\ Data\ PostgreSQL\ Administrator -POSTGRES_RUNTIME_HOME=/home/postgres -# PgBouncer runs as the postgres user (UID 100) in the pgbouncer image -PGBOUNCER_UID=100 -PGBOUNCER_GID=102 -# Valkey uses UID 999 in the valkey image -VALKEY_UID=999 -VALKEY_GID=1000 +RABBITMQ_GID=101 # RabbitMQ uses UID 100, GID 101 in the rabbitmq image RABBITMQ_UID=100 -RABBITMQ_GID=101 # Shared secrets group - all service containers are members of this group # Secrets files are owned by this group with mode 640 (owner+group readable) # Note: 65533 is nogroup in Alpine, so we use 65532 SECRETS_GID=65532 +VALKEY_GID=1000 +# Valkey uses UID 999 in the valkey image +VALKEY_UID=999 +POSTGRES_LOG_BUFFER=4m +POSTGRES_LOG_MAX_FILE=5 # Logging driver tuning POSTGRES_LOG_MAX_SIZE=100m -POSTGRES_LOG_MAX_FILE=5 POSTGRES_LOG_MODE=non-blocking -POSTGRES_LOG_BUFFER=4m +LOGICAL_BACKUP_EXCLUDE=postgres +LOGICAL_BACKUP_HOST_OUTPUT=./backups/logical # Logical backup sidecar LOGICAL_BACKUP_INTERVAL_SECONDS=86400 -LOGICAL_BACKUP_RETENTION_DAYS=7 LOGICAL_BACKUP_OUTPUT=/backups/logical -LOGICAL_BACKUP_HOST_OUTPUT=./backups/logical -LOGICAL_BACKUP_EXCLUDE=postgres +LOGICAL_BACKUP_RETENTION_DAYS=7 -# ValKey memory cache -VALKEY_PORT=6379 +VALKEY_APPENDONLY=yes +VALKEY_DATABASES=16 # Host port that Docker binds to; override if 6379 is unavailable locally. 
VALKEY_HOST_PORT=6379 -VALKEY_APPENDONLY=yes VALKEY_MAXMEMORY=256mb VALKEY_MAXMEMORY_POLICY=allkeys-lru -VALKEY_DATABASES=16 VALKEY_PASSWORD_FILE=./secrets/valkey_password +# ValKey memory cache +VALKEY_PORT=6379 +PGBOUNCER_ADMIN_USERS=postgres +PGBOUNCER_AUTH_PASSWORD_FILE=./secrets/pgbouncer_auth_password +PGBOUNCER_AUTH_USER=pgbouncer_auth +PGBOUNCER_DEFAULT_POOL_SIZE=20 +PGBOUNCER_EXTRA_HOST_PORT=6432 +PGBOUNCER_HOST_PORT=5432 +PGBOUNCER_MAX_CLIENT_CONN=200 +PGBOUNCER_MIN_POOL_SIZE=5 +PGBOUNCER_POOL_MODE=session # PgBouncer connection pooling # PgBouncer listens on both 5432 (default PostgreSQL port) and 6432 (legacy) # This makes pooled connections the default path - clients must explicitly # connect to port 5433 (POSTGRES_PORT) to bypass pooling # Docker maps both host ports to the single container port (PGBOUNCER_PORT) PGBOUNCER_PORT=6432 -PGBOUNCER_HOST_PORT=5432 -PGBOUNCER_EXTRA_HOST_PORT=6432 -PGBOUNCER_POOL_MODE=session -PGBOUNCER_MAX_CLIENT_CONN=200 -PGBOUNCER_DEFAULT_POOL_SIZE=20 PGBOUNCER_RESERVE_POOL_SIZE=5 PGBOUNCER_RESERVE_POOL_TIMEOUT=5 -PGBOUNCER_MIN_POOL_SIZE=5 -PGBOUNCER_AUTH_USER=pgbouncer_auth -PGBOUNCER_AUTH_PASSWORD_FILE=./secrets/pgbouncer_auth_password -PGBOUNCER_STATS_USER=pgbouncer_stats PGBOUNCER_STATS_PASSWORD_FILE=./secrets/pgbouncer_stats_password -PGBOUNCER_ADMIN_USERS=postgres +PGBOUNCER_STATS_USER=pgbouncer_stats PGBOUNCER_STATS_USERS=pgbouncer_stats +PGBOUNCER_CLIENT_TLS_CERT_FILE=/tmp/pgbouncer/tls/server.crt +PGBOUNCER_CLIENT_TLS_KEY_FILE=/tmp/pgbouncer/tls/server.key +PGBOUNCER_CLIENT_TLS_SELF_SIGNED_DAYS=730 +PGBOUNCER_CLIENT_TLS_SELF_SIGNED_SUBJECT=/CN=core_data_pgbouncer # PgBouncer TLS configuration (client-side SSL for connections to PgBouncer) # Self-signed certificates are generated automatically if files are absent # sslmode options: disable, allow, prefer, require, verify-ca, verify-full PGBOUNCER_CLIENT_TLS_SSLMODE=require -PGBOUNCER_CLIENT_TLS_CERT_FILE=/tmp/pgbouncer/tls/server.crt 
-PGBOUNCER_CLIENT_TLS_KEY_FILE=/tmp/pgbouncer/tls/server.key -PGBOUNCER_CLIENT_TLS_SELF_SIGNED_SUBJECT=/CN=core_data_pgbouncer -PGBOUNCER_CLIENT_TLS_SELF_SIGNED_DAYS=730 # ============================================================================= # PgBouncer Transaction Mode Compatibility (PgBouncer 1.21+) @@ -247,56 +247,54 @@ PGBOUNCER_DNS_MAX_TTL=30 # Negative DNS cache (quick recovery from DNS failures) PGBOUNCER_DNS_NXDOMAIN_TTL=5 +MEMCACHED_MAX_CONNECTIONS=1024 +MEMCACHED_MEMORY_MB=128 # Memcached hot object cache MEMCACHED_PORT=11211 -MEMCACHED_MEMORY_MB=128 -MEMCACHED_MAX_CONNECTIONS=1024 MEMCACHED_THREADS=4 -# RabbitMQ messaging -RABBITMQ_PORT=5672 +RABBITMQ_CPU_LIMIT=0.0 +RABBITMQ_DATA_MOUNT_PATH=/var/lib/rabbitmq +RABBITMQ_DEFAULT_PASS_FILE=./secrets/rabbitmq_default_pass +RABBITMQ_DEFAULT_USER=coredata +RABBITMQ_ERLANG_COOKIE_FILE=./secrets/rabbitmq_erlang_cookie RABBITMQ_HOST_PORT=5672 -RABBITMQ_MANAGEMENT_PORT=15672 RABBITMQ_MANAGEMENT_HOST_PORT=15672 +RABBITMQ_MANAGEMENT_PORT=15672 +# Container resource limits (0 = unlimited). +RABBITMQ_MEMORY_LIMIT=0 +# RabbitMQ messaging +RABBITMQ_PORT=5672 +RABBITMQ_PROMETHEUS_HOST_PORT=15692 RABBITMQ_PROMETHEUS_PORT=15692 -RABBITMQ_PROMETHEUS_HOST_PORT=45692 +RABBITMQ_STREAM_HOST_PORT=5552 RABBITMQ_STREAM_PORT=5552 -RABBITMQ_STREAM_HOST_PORT=35552 -RABBITMQ_DEFAULT_USER=coredata -RABBITMQ_DEFAULT_PASS_FILE=./secrets/rabbitmq_default_pass -RABBITMQ_ERLANG_COOKIE_FILE=./secrets/rabbitmq_erlang_cookie -RABBITMQ_DATA_MOUNT_PATH=/var/lib/rabbitmq # Pre-built RabbitMQ image uses UID 100, GID 101 (the rabbitmq user baked into the image). -RABBITMQ_UID=100 -RABBITMQ_GID=101 -# Container resource limits (0 = unlimited). 
-RABBITMQ_MEMORY_LIMIT=0 -RABBITMQ_CPU_LIMIT=0.0 +CADVISOR_HOST_PORT=8080 +GRAFANA_HOST_PORT=3000 +MEMCACHED_EXPORTER_HOST_PORT=9150 +NODE_EXPORTER_HOST_PORT=9100 +PGBOUNCER_EXPORTER_HOST_PORT=9127 +POSTGRES_EXPORTER_HOST_PORT=9187 # Required monitoring stack host ports -PROMETHEUS_HOST_PORT=39090 -GRAFANA_HOST_PORT=33000 -POSTGRES_EXPORTER_HOST_PORT=39187 -PGBOUNCER_EXPORTER_HOST_PORT=39127 -VALKEY_EXPORTER_HOST_PORT=39121 -MEMCACHED_EXPORTER_HOST_PORT=39150 -NODE_EXPORTER_HOST_PORT=39100 -CADVISOR_HOST_PORT=38080 +PROMETHEUS_HOST_PORT=9090 # Erlang VM tuning flags passed via RABBITMQ_SERVER_ADDITIONAL_ERL_ARGS. # +sbwt none — disable speculative scheduler busy-waiting (saves CPU) # +sbwtdcpu none — disable dirty-CPU scheduler busy-waiting # +sbwtdio none — disable dirty-IO scheduler busy-waiting # +stbt ts — bind scheduler threads to topology (reduces context switches) RABBITMQ_SERVER_ADDITIONAL_ERL_ARGS="+sbwt none +sbwtdcpu none +sbwtdio none +stbt ts" +VALKEY_EXPORTER_HOST_PORT=9121 # Time zone for containers TZ=UTC +CORE_DATA_ATTESTATION_REPO=paudley/core_data # CI workflow defaults CORE_DATA_CI_MIN_DISK_MB=4096 CORE_DATA_CI_OUTPUT_PATH=./backups/ci-output.json CORE_DATA_REQUIRE_ATTESTATION=0 -CORE_DATA_ATTESTATION_REPO=paudley/core_data # Daily maintenance tuning (optional) # DAILY_PG_STAT_LIMIT=100 diff --git a/docker-compose.yml b/docker-compose.yml index 902a9db..a62a781 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -369,6 +369,7 @@ services: SECRETS_GID: ${SECRETS_GID:-65532} image: ${RABBITMQ_IMAGE:-ghcr.io/paudley/core_data/rabbitmq:latest} container_name: ${COMPOSE_PROJECT_NAME:-core_data}_rabbitmq + hostname: rabbitmq restart: unless-stopped # Add the shared secrets group for read access. 
user: "${RABBITMQ_UID:-100}:${RABBITMQ_GID:-101}" @@ -381,6 +382,7 @@ services: RABBITMQ_DEFAULT_USER: ${RABBITMQ_DEFAULT_USER} RABBITMQ_DEFAULT_PASS_FILE: /run/secrets/rabbitmq_default_pass RABBITMQ_ERLANG_COOKIE_FILE: /run/secrets/rabbitmq_erlang_cookie + RABBITMQ_NODENAME: rabbit@rabbitmq RABBITMQ_SERVER_ADDITIONAL_ERL_ARGS: ${RABBITMQ_SERVER_ADDITIONAL_ERL_ARGS:-} TZ: ${TZ} entrypoint: ["/opt/core_data/bin/rabbitmq-entrypoint.sh"] @@ -613,9 +615,9 @@ services: condition: service_healthy command: - "--redis.addr=redis://valkey:${VALKEY_PORT:-6379}" - - "--redis.password-file=/run/secrets/valkey_password" + - "--redis.password-file=/run/secrets/valkey_exporter_passwords.json" volumes: - - ./secrets/valkey_password:/run/secrets/valkey_password:ro + - ./secrets/valkey_exporter_passwords.json:/run/secrets/valkey_exporter_passwords.json:ro networks: - core_data ports: @@ -737,6 +739,11 @@ services: environment: GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin} GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-admin} + GF_ANALYTICS_REPORTING_ENABLED: "false" + GF_ANALYTICS_CHECK_FOR_UPDATES: "false" + GF_ANALYTICS_CHECK_FOR_PLUGIN_UPDATES: "false" + GF_PLUGINS_PREINSTALL_DISABLED: "true" + GF_PLUGINS_PREINSTALL_AUTO_UPDATE: "false" GF_USERS_ALLOW_SIGN_UP: "false" volumes: - ./data/grafana:/var/lib/grafana diff --git a/monitoring/grafana/provisioning/alerting/.gitkeep b/monitoring/grafana/provisioning/alerting/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/monitoring/grafana/provisioning/alerting/.gitkeep @@ -0,0 +1 @@ + diff --git a/monitoring/grafana/provisioning/plugins/.gitkeep b/monitoring/grafana/provisioning/plugins/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/monitoring/grafana/provisioning/plugins/.gitkeep @@ -0,0 +1 @@ + diff --git a/scripts/create_env.sh b/scripts/create_env.sh index 6973cd6..38a0c98 100755 --- a/scripts/create_env.sh +++ b/scripts/create_env.sh @@ -4,6 +4,9 @@ set -euo pipefail +# 
Standalone bootstrap script: do not source common.sh because this command +# creates the .env file that common.sh normally consumes. +# shellcheck source=scripts/lib/common.sh SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) ROOT_DIR=$(cd "${SCRIPT_DIR}/.." && pwd) TEMPLATE="${ROOT_DIR}/.env.example" @@ -174,6 +177,16 @@ valkey_password="$(prompt_secret "ValKey password (written to secrets/valkey_pas printf '%s\n' "${valkey_password}" >"${valkey_secret_file}" chmod 0600 "${valkey_secret_file}" || true set_env_value VALKEY_PASSWORD_FILE "./secrets/valkey_password" +python3 - "${valkey_password}" "${secret_dir}/valkey_exporter_passwords.json" <<'PY' +import json +import sys + +password, path = sys.argv[1], sys.argv[2] +with open(path, "w", encoding="utf-8") as handle: + json.dump({"redis://valkey:6379": password}, handle) + handle.write("\n") +PY +chmod 0600 "${secret_dir}/valkey_exporter_passwords.json" || true pgbouncer_auth_secret="${secret_dir}/pgbouncer_auth_password" pgbouncer_auth_default="$(generate_password)" diff --git a/scripts/lib/bootstrap_ci.sh b/scripts/lib/bootstrap_ci.sh index 41f066e..991cdb6 100755 --- a/scripts/lib/bootstrap_ci.sh +++ b/scripts/lib/bootstrap_ci.sh @@ -229,6 +229,18 @@ cmd_bootstrap_ci() { chmod 0700 "${secrets_dir}" || true bootstrap_ci_write_secret POSTGRES_SUPERUSER_PASSWORD "${secrets_dir}/postgres_superuser_password" "${force}" base64 bootstrap_ci_write_secret VALKEY_PASSWORD "${secrets_dir}/valkey_password" "${force}" base64 + python3 - "${secrets_dir}/valkey_password" "${secrets_dir}/valkey_exporter_passwords.json" <<'PY' +import json +import sys + +password_path, output_path = sys.argv[1], sys.argv[2] +with open(password_path, encoding="utf-8") as handle: + password = handle.read().strip() +with open(output_path, "w", encoding="utf-8") as handle: + json.dump({"redis://valkey:6379": password}, handle) + handle.write("\n") +PY + chmod 0600 "${secrets_dir}/valkey_exporter_passwords.json" || true 
bootstrap_ci_write_secret PGBOUNCER_AUTH_PASSWORD "${secrets_dir}/pgbouncer_auth_password" "${force}" base64 bootstrap_ci_write_secret PGBOUNCER_STATS_PASSWORD "${secrets_dir}/pgbouncer_stats_password" "${force}" base64 bootstrap_ci_write_secret RABBITMQ_DEFAULT_PASS "${secrets_dir}/rabbitmq_default_pass" "${force}" base64 From 097e3ffb4c6ed615bbaf4a8cf94293a8ed0652bf Mon Sep 17 00:00:00 2001 From: Patrick_Audley Date: Sat, 6 Jun 2026 16:49:34 -0600 Subject: [PATCH 5/6] fix(retention): enforce seven day cleanup defaults Align pgBackRest, Prometheus, daily maintenance, and logical backup retention with the seven-day operational policy. Add source PostgreSQL log pruning and document the updated defaults so generated config, CLI help, and README guidance agree. Also mirror create-db RabbitMQ provisioning during drop-db by deleting the matching vhost when RabbitMQ is enabled. Cover the retention defaults and RabbitMQ cleanup path in lightweight tests. --- .env.example | 6 + .gitignore | 1 + README.md | 8 +- docker-compose.yml | 3 +- postgres/initdb/00-render-config.sh | 1 + scripts/daily_maintenance.sh | 11 +- scripts/lib/db.sh | 8 ++ scripts/lib/maintenance.sh | 2 +- scripts/lib/rabbitmq.sh | 22 ++++ scripts/logical_backup_runner.sh | 3 +- scripts/manage.sh | 2 +- tests/test_lightweight.py | 180 +++++++++++++++++++--------- 12 files changed, 178 insertions(+), 69 deletions(-) mode change 100644 => 100755 scripts/lib/db.sh mode change 100644 => 100755 scripts/lib/rabbitmq.sh diff --git a/.env.example b/.env.example index 53637bc..8b70654 100644 --- a/.env.example +++ b/.env.example @@ -151,6 +151,12 @@ LOGICAL_BACKUP_HOST_OUTPUT=./backups/logical LOGICAL_BACKUP_INTERVAL_SECONDS=86400 LOGICAL_BACKUP_OUTPUT=/backups/logical LOGICAL_BACKUP_RETENTION_DAYS=7 +PGBACKREST_RETENTION_ARCHIVE=7 +PGBACKREST_RETENTION_ARCHIVE_TYPE=diff +PGBACKREST_RETENTION_DIFF=7 +PGBACKREST_RETENTION_FULL=7 +PGBACKREST_RETENTION_FULL_TYPE=time +PROMETHEUS_RETENTION_TIME=7d VALKEY_APPENDONLY=yes 
VALKEY_DATABASES=16 diff --git a/.gitignore b/.gitignore index afd61c9..e98c199 100644 --- a/.gitignore +++ b/.gitignore @@ -54,3 +54,4 @@ node_modules/ .coding-ethos/prune-runs/ .coding-ethos/state/ .coding-ethos/code-intel.db +sandbox-tmp diff --git a/README.md b/README.md index c78ca50..a314cfb 100644 --- a/README.md +++ b/README.md @@ -223,8 +223,8 @@ See `docs/security_philosophy.md` for how capability hardening and related contr - **TLS everywhere.** PostgreSQL refuses non-SSL connections from the bridge network. Provide your own certificate/key via Docker secrets or rely on the init hook to mint a self-signed pair under `${PGDATA}/tls`. - **Bind-mounted persistent state.** PostgreSQL data, WAL, pgBackRest, ValKey, RabbitMQ, Prometheus, and Grafana state live under `./data/` by default so ownership and backups are explicit. - **Non-root from the start.** A one-shot `volume_prep` helper chowns the volumes before Postgres launches so the main service and sidecars run as your host user by default (UID/GID `${POSTGRES_UID}`), keeping file ownership consistent across deployments. Supply alternative IDs only when required. -- **Automated logical backups.** The `logical_backup` sidecar runs `pg_dump`/`pg_dumpall` on the cadence defined by `LOGICAL_BACKUP_INTERVAL_SECONDS`, writes into `${BACKUPS_HOST_PATH}/logical`, validates custom dumps with `pg_restore --list`, records manifests and `_SUCCESS` markers, exports Prometheus metrics on port `9188`, and skips any databases listed in `LOGICAL_BACKUP_EXCLUDE` (defaults to `postgres`). `daily-maintenance` captures the latest run in `logical_backup_status.txt` for auditing. -- **Required monitoring.** Prometheus scrapes PostgreSQL, PgBouncer, logical backups, RabbitMQ, ValKey, Memcached, host metrics, and container metrics. Grafana is provisioned with the Prometheus datasource and a Core Data overview dashboard. Configure ports, retention, and credentials with the `PROMETHEUS_*`, `GRAFANA_*`, and `*_EXPORTER_*` variables. 
+- **Automated logical backups.** The `logical_backup` sidecar runs `pg_dump`/`pg_dumpall` on the cadence defined by `LOGICAL_BACKUP_INTERVAL_SECONDS`, writes into `${BACKUPS_HOST_PATH}/logical`, validates custom dumps with `pg_restore --list`, records manifests and `_SUCCESS` markers, exports Prometheus metrics on port `9188`, prunes completed runs after `LOGICAL_BACKUP_RETENTION_DAYS` (default 7), and skips any databases listed in `LOGICAL_BACKUP_EXCLUDE` (defaults to `postgres`). `daily-maintenance` captures the latest run in `logical_backup_status.txt` for auditing. +- **Required monitoring.** Prometheus scrapes PostgreSQL, PgBouncer, logical backups, RabbitMQ, ValKey, Memcached, host metrics, and container metrics. Grafana is provisioned with the Prometheus datasource and a Core Data overview dashboard. Prometheus keeps 7 days of samples by default; configure ports, retention, and credentials with the `PROMETHEUS_*`, `GRAFANA_*`, and `*_EXPORTER_*` variables. - **Composable health check.** `scripts/healthcheck.sh` verifies readiness, executes `SELECT 1`, and optionally enforces replication lag ceilings before dependents start. - **Rotated container logs.** Docker's `local` driver with non-blocking delivery prevents runaway JSON files while retaining compressed history for incident response. - **Required service set.** Cache, pooling, messaging, and monitoring services are started by the normal compose flow. `COMPOSE_PROFILES` is intentionally empty by default. @@ -275,7 +275,7 @@ Use the dedicated CI helpers when you need to spin up the published stack inside | `psql` | Open psql inside the container (respects `PGHOST`, `PGUSER`, etc.). | | `dump` / `dump-sql` | Produce logical backups (custom or plain format) under `/backups`. | | `restore-dump` | Drop and recreate a database before restoring a `.dump.gz`. 
| -| `backup [--verify]` / `stanza-create` / `restore-snapshot` | Manage pgBackRest backups & optionally restore the latest backup into a throwaway data dir for checksum verification. | +| `backup [--verify]` / `stanza-create` / `restore-snapshot` | Manage pgBackRest backups & optionally restore the latest backup into a throwaway data dir for checksum verification. Defaults retain 7 days of full backups with matching archive cleanup. | | `daily-maintenance` | Run dumps, log capture, pgBadger analysis, and retention pruning. | | `provision-qa` | Differential backup + targeted restore for QA databases. | | `config-render` | Re-render `postgresql.conf` / `pg_hba.conf` from the templates and restart PostgreSQL (terminates active connections; required for some settings like `shared_buffers` and `max_connections`). | @@ -314,7 +314,7 @@ The CLI sources modular helpers from `scripts/lib/` so each function can be impo Interrupted local test runs can leave full service-directory snapshots under `data/.pytest_backups/`. These are disposable pytest stashes, not live cluster state. Inspect them with `./scripts/manage.sh data-cleanup`; remove stale entries with `./scripts/manage.sh data-cleanup --execute`. The command only targets `.pytest_backups`, defaults to entries older than 7 days, and refuses to delete while Compose containers are running unless `--force` is supplied. -`daily-maintenance` now emits a richer bundle under `backups/daily//`, including `pg_stat_statements` snapshots, `pg_buffercache` heatmaps, role/extension/autovacuum/replication CSVs, pg_cron schedules, pg_squeeze activity, and a security checklist alongside logs, dumps, pgBadger HTML, and pgaudit summaries. 
The workflow also records per-step results in `maintenance_status.json`, records the most recent sidecar dump run in `logical_backup_status.txt`, runs `partman.run_maintenance_proc()` across each database so freshly created partitions land even if the background worker interval has not elapsed, and captures version drift in `version_status.csv` (focusing on out-of-date components). Pair those reports with `config-check` to keep the rendered configs aligned with the templates. Tune the thresholds via `DAILY_PG_STAT_LIMIT`, `DAILY_BUFFERCACHE_LIMIT`, `DAILY_DEAD_TUPLE_THRESHOLD`, `DAILY_DEAD_TUPLE_RATIO`, and `DAILY_REPLICATION_LAG_THRESHOLD` as needed. +`daily-maintenance` now emits a richer bundle under `backups/daily//`, including `pg_stat_statements` snapshots, `pg_buffercache` heatmaps, role/extension/autovacuum/replication CSVs, pg_cron schedules, pg_squeeze activity, and a security checklist alongside logs, dumps, pgBadger HTML, and pgaudit summaries. The workflow also records per-step results in `maintenance_status.json`, records the most recent sidecar dump run in `logical_backup_status.txt`, prunes daily bundles and copied PostgreSQL source logs older than `DAILY_RETENTION_DAYS` (default 7), runs `partman.run_maintenance_proc()` across each database so freshly created partitions land even if the background worker interval has not elapsed, and captures version drift in `version_status.csv` (focusing on out-of-date components). Pair those reports with `config-check` to keep the rendered configs aligned with the templates. Tune the thresholds via `DAILY_PG_STAT_LIMIT`, `DAILY_BUFFERCACHE_LIMIT`, `DAILY_DEAD_TUPLE_THRESHOLD`, `DAILY_DEAD_TUPLE_RATIO`, and `DAILY_REPLICATION_LAG_THRESHOLD` as needed. Nightly cron jobs also refresh pg_squeeze targets, reset `pg_stat_statements`, and run a safe `VACUUM (ANALYZE, SKIP_LOCKED, PARALLEL 4)` so statistics stay current without blocking hot tables. 
diff --git a/docker-compose.yml b/docker-compose.yml index a62a781..a141c4d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -184,6 +184,7 @@ services: PG_PGAUDIT_LOG_CLIENT: ${PG_PGAUDIT_LOG_CLIENT:-on} PG_PGAUDIT_LOG_PARAMETER: ${PG_PGAUDIT_LOG_PARAMETER:-off} PGBACKREST_RETENTION_FULL: ${PGBACKREST_RETENTION_FULL:-7} + PGBACKREST_RETENTION_FULL_TYPE: ${PGBACKREST_RETENTION_FULL_TYPE:-time} PGBACKREST_RETENTION_DIFF: ${PGBACKREST_RETENTION_DIFF:-7} PGBACKREST_RETENTION_ARCHIVE: ${PGBACKREST_RETENTION_ARCHIVE:-7} PGBACKREST_RETENTION_ARCHIVE_TYPE: ${PGBACKREST_RETENTION_ARCHIVE_TYPE:-diff} @@ -710,7 +711,7 @@ services: command: - "--config.file=/etc/prometheus/prometheus.yml" - "--storage.tsdb.path=/prometheus" - - "--storage.tsdb.retention.time=${PROMETHEUS_RETENTION_TIME:-30d}" + - "--storage.tsdb.retention.time=${PROMETHEUS_RETENTION_TIME:-7d}" - "--web.enable-lifecycle" volumes: - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro diff --git a/postgres/initdb/00-render-config.sh b/postgres/initdb/00-render-config.sh index 18384be..9d9526a 100755 --- a/postgres/initdb/00-render-config.sh +++ b/postgres/initdb/00-render-config.sh @@ -223,6 +223,7 @@ cat > "${PGBACKREST_CONF_PATH}" << CONF [global] repo1-path=/var/lib/pgbackrest repo1-retention-full=${PGBACKREST_RETENTION_FULL:-7} +repo1-retention-full-type=${PGBACKREST_RETENTION_FULL_TYPE:-time} repo1-retention-diff=${PGBACKREST_RETENTION_DIFF:-7} repo1-retention-archive=${PGBACKREST_RETENTION_ARCHIVE:-7} repo1-retention-archive-type=${PGBACKREST_RETENTION_ARCHIVE_TYPE:-diff} diff --git a/scripts/daily_maintenance.sh b/scripts/daily_maintenance.sh index 778693d..de82964 100755 --- a/scripts/daily_maintenance.sh +++ b/scripts/daily_maintenance.sh @@ -36,7 +36,7 @@ echo "[daily] starting" HOST_BACKUP_ROOT=${DAILY_BACKUP_ROOT:-./backups/daily} CONTAINER_BACKUP_ROOT=${DAILY_CONTAINER_BACKUP_ROOT:-/backups/daily} -RETENTION_DAYS=${DAILY_RETENTION_DAYS:-30} 
+RETENTION_DAYS=${DAILY_RETENTION_DAYS:-7} SINCE=${DAILY_PGBADGER_SINCE:-} if [[ -n ${SINCE} ]]; then if echo "${SINCE}" | grep -Eq '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'; then @@ -46,6 +46,7 @@ if [[ -n ${SINCE} ]]; then fi fi REMOVE_SOURCE=${DAILY_REMOVE_SOURCE_LOGS:-false} +PRUNE_SOURCE_LOGS=${DAILY_PRUNE_SOURCE_LOGS:-true} PG_BADGER_JOBS=${PG_BADGER_JOBS:-2} PG_STAT_LIMIT=${DAILY_PG_STAT_LIMIT:-100} BUFFERCACHE_LIMIT=${DAILY_BUFFERCACHE_LIMIT:-50} @@ -235,6 +236,9 @@ compose_exec bash -lc "cp /var/lib/postgresql/data/log/postgresql-*.log '${CONTA compose_exec bash -lc "cp /var/lib/postgresql/data/log/postgresql-*.csv '${CONTAINER_TARGET_DIR}' 2>/dev/null || true" if [[ ${REMOVE_SOURCE} == true ]]; then compose_exec bash -lc "rm -f /var/lib/postgresql/data/log/postgresql-*.log /var/lib/postgresql/data/log/postgresql-*.csv" +elif [[ ${PRUNE_SOURCE_LOGS} == true && ${RETENTION_DAYS} =~ ^[0-9]+$ && ${RETENTION_DAYS} -gt 0 ]]; then + retention_mtime=$((RETENTION_DAYS - 1)) + compose_exec bash -lc "find /var/lib/postgresql/data/log -maxdepth 1 -type f \\( -name 'postgresql-*.log' -o -name 'postgresql-*.csv' \\) -mtime +${retention_mtime} -delete" fi echo "[daily] generating pgBadger report" @@ -370,5 +374,8 @@ if [[ ${EMAIL_REPORT} == true && -n ${REPORT_RECIPIENT} ]]; then fi echo "[daily] applying retention ${RETENTION_DAYS} days" -find "${HOST_BACKUP_ROOT}" -mindepth 1 -maxdepth 1 -type d | sort | head -n -"${RETENTION_DAYS}" | xargs -r rm -rf +if [[ ${RETENTION_DAYS} =~ ^[0-9]+$ && ${RETENTION_DAYS} -gt 0 ]]; then + retention_mtime=$((RETENTION_DAYS - 1)) + find "${HOST_BACKUP_ROOT}" -mindepth 1 -maxdepth 1 -type d -mtime +"${retention_mtime}" -exec rm -rf {} + +fi echo "[daily] complete" diff --git a/scripts/lib/db.sh b/scripts/lib/db.sh old mode 100644 new mode 100755 index 79ff3c4..e93813c --- a/scripts/lib/db.sh +++ b/scripts/lib/db.sh @@ -1,7 +1,14 @@ +#!/usr/bin/env bash # SPDX-FileCopyrightText: 2025 Blackcat Informatics® Inc. 
# SPDX-License-Identifier: MIT # shellcheck shell=bash +set -euo pipefail + +DB_LIB_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=common.sh +# shellcheck disable=SC1091 +source "${DB_LIB_DIR}/common.sh" # Database role and schema helpers used by manage.sh. # cmd_create_user creates a role with LOGIN privilege if it does not yet exist. @@ -113,4 +120,5 @@ cmd_drop_db() { SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '${db}' AND pid <> pg_backend_pid(); DROP DATABASE IF EXISTS "${db}"; SQL + cmd_rabbitmq_drop_vhost_if_enabled "${db}" } diff --git a/scripts/lib/maintenance.sh b/scripts/lib/maintenance.sh index 8c79f37..221b765 100755 --- a/scripts/lib/maintenance.sh +++ b/scripts/lib/maintenance.sh @@ -122,7 +122,7 @@ cmd_pgbadger_report() { cmd_daily_maintenance() { ensure_env local backup_root=${DAILY_BACKUP_ROOT:-./backups/daily} - local retention=${DAILY_RETENTION_DAYS:-30} + local retention=${DAILY_RETENTION_DAYS:-7} local since="" local remove_logs=false local container_root=${DAILY_CONTAINER_BACKUP_ROOT:-/backups/daily} diff --git a/scripts/lib/rabbitmq.sh b/scripts/lib/rabbitmq.sh old mode 100644 new mode 100755 index 81ef0dc..8ed7a6e --- a/scripts/lib/rabbitmq.sh +++ b/scripts/lib/rabbitmq.sh @@ -4,6 +4,11 @@ set -euo pipefail +RABBITMQ_LIB_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=common.sh +# shellcheck disable=SC1091 +source "${RABBITMQ_LIB_DIR}/common.sh" + RABBITMQ_SERVICE_NAME=${RABBITMQ_SERVICE_NAME:-rabbitmq} RABBITMQ_HOST=${RABBITMQ_HOST:-rabbitmq} RABBITMQ_PORT=${RABBITMQ_PORT:-5672} @@ -69,6 +74,23 @@ rabbitmqctl set_permissions -p "${vhost}" "${owner}" ".*" ".*" ".*" ' sh "${vhost}" "${owner}" } +cmd_rabbitmq_drop_vhost_if_enabled() { + local vhost=$1 + + if ! rabbitmq_service_enabled; then + echo "[rabbitmq] Service not enabled; skipping RabbitMQ vhost '${vhost}'." 
>&2 + return 0 + fi + + # shellcheck disable=SC2016 # Container-side script expands its own positional parameters. + rabbitmq_exec sh -eu -c ' +vhost=$1 +if rabbitmqctl list_vhosts --silent | grep -Fxq "${vhost}"; then + rabbitmqctl delete_vhost "${vhost}" +fi +' sh "${vhost}" +} + cmd_rabbitmq_ctl() { ensure_env ensure_rabbitmq_service diff --git a/scripts/logical_backup_runner.sh b/scripts/logical_backup_runner.sh index d501cdf..87e7230 100755 --- a/scripts/logical_backup_runner.sh +++ b/scripts/logical_backup_runner.sh @@ -275,7 +275,8 @@ PY write_metrics 1 "${LAST_SUCCESS_TIMESTAMP}" "${LAST_SUCCESS_DURATION}" "${LAST_SUCCESS_SIZE_BYTES}" "${LAST_SUCCESS_FILE_COUNT}" "${FAILURE_COUNT}" if ((LOGICAL_BACKUP_RETENTION_DAYS > 0)); then - find "${LOGICAL_BACKUP_OUTPUT}" -mindepth 1 -maxdepth 1 -type d ! -name '*.tmp' -mtime +"${LOGICAL_BACKUP_RETENTION_DAYS}" -print -exec rm -rf {} + 2> /dev/null || true + retention_mtime=$((LOGICAL_BACKUP_RETENTION_DAYS - 1)) + find "${LOGICAL_BACKUP_OUTPUT}" -mindepth 1 -maxdepth 1 -type d ! -name '*.tmp' -mtime +"${retention_mtime}" -print -exec rm -rf {} + 2> /dev/null || true fi log "completed backup at ${timestamp}" diff --git a/scripts/manage.sh b/scripts/manage.sh index 2019ddb..1e83570 100755 --- a/scripts/manage.sh +++ b/scripts/manage.sh @@ -278,7 +278,7 @@ Permissions --jobs Parallel workers for pgbadger (default 2) daily-maintenance options: --root Override host backup root (default ./backups/daily) - --retention Retention in days (default 30) + --retention Retention in days (default 7) --since