diff --git a/.env.example b/.env.example index 966e864..8b70654 100644 --- a/.env.example +++ b/.env.example @@ -6,22 +6,24 @@ # Copy this file to `.env` and customize values for your environment. # ---------------------------------------------------------------------------- +PG_VERSION=18 +POSTGRES_DB=postgres +# PostgreSQL listens on 5433 by default so PgBouncer can own 5432 +# This ensures pooled connections are the default path for clients +POSTGRES_PORT=5433 # PostgreSQL runtime POSTGRES_SUPERUSER=postgres # Store credentials in ./secrets/*. The manage CLI reads POSTGRES_SUPERUSER_PASSWORD_FILE # and falls back to POSTGRES_SUPERUSER_PASSWORD only when provided explicitly. POSTGRES_SUPERUSER_PASSWORD= POSTGRES_SUPERUSER_PASSWORD_FILE=./secrets/postgres_superuser_password -POSTGRES_DB=postgres -# PostgreSQL listens on 5433 by default so PgBouncer can own 5432 -# This ensures pooled connections are the default path for clients -POSTGRES_PORT=5433 -PG_VERSION=18 # Application databases and owners to create automatically. # Format: db_name:db_owner:owner_password DATABASES_TO_CREATE=app_main:app_user:change_me,analytics:analytics_user:change_me +AGE_VERSION=PG18/v1.7.0-rc0 +CORE_DATA_BUILD_IMAGE=0 # Docker image build metadata # Set *_TAG values to the stack release you want to consume (default: latest). 
# Helpers such as ci-verify/ci-up also respect CORE_DATA_STACK_TAG / CORE_DATA_STACK_REGISTRY: @@ -29,20 +31,18 @@ DATABASES_TO_CREATE=app_main:app_user:change_me,analytics:analytics_user:change_ # CORE_DATA_STACK_REGISTRY=ghcr.io/paudley/core_data POSTGRES_IMAGE_NAME=ghcr.io/paudley/core_data/postgres POSTGRES_IMAGE_TAG=latest -CORE_DATA_BUILD_IMAGE=0 -AGE_VERSION=PG18/v1.7.0-rc0 -# Published container images (override to point at a private registry if needed) -VALKEY_IMAGE=ghcr.io/paudley/core_data/valkey:latest -RABBITMQ_IMAGE=ghcr.io/paudley/core_data/rabbitmq:latest -PGBOUNCER_IMAGE=ghcr.io/paudley/core_data/pgbouncer:latest MEMCACHED_IMAGE=ghcr.io/paudley/core_data/memcached:latest -NETWORK_PROBE_IMAGE=ghcr.io/paudley/core_data/network-probe:latest NETWORK_GUARD_IMAGE=ghcr.io/paudley/core_data/network-guard:latest +NETWORK_PROBE_IMAGE=ghcr.io/paudley/core_data/network-probe:latest +PGBOUNCER_IMAGE=ghcr.io/paudley/core_data/pgbouncer:latest +RABBITMQ_IMAGE=ghcr.io/paudley/core_data/rabbitmq:latest +# Published container images (override to point at a private registry if needed) +VALKEY_IMAGE=ghcr.io/paudley/core_data/valkey:latest +POSTGRES_CPU_LIMIT=2 # Runtime resource limits POSTGRES_MEMORY_LIMIT=4g -POSTGRES_CPU_LIMIT=2 POSTGRES_SHM_SIZE=1g # Optional host overrides for PGDATA/WAL/pgBackRest if you prefer bind mounts over named volumes. 
@@ -50,26 +50,26 @@ POSTGRES_SHM_SIZE=1g # PG_WAL_DIR=./data/postgres_wal # CORE_DATA_PGBACKREST_REPO_DIR=./data/pgbackrest_repo +POSTGRES_BACKREST_MOUNT_PATH=/var/lib/pgbackrest # Container paths for persistent mounts (kept in sync with docker-compose.yml volume_prep command) POSTGRES_DATA_MOUNT_PATH=/var/lib/postgresql/data POSTGRES_WAL_MOUNT_PATH=/var/lib/postgresql/wal -POSTGRES_BACKREST_MOUNT_PATH=/var/lib/pgbackrest -# Core PostgreSQL tuning (map directly into postgresql.conf) -POSTGRES_MAX_CONNECTIONS=200 -POSTGRES_LISTEN_ADDRESSES=0.0.0.0 -PG_SHARED_BUFFERS=1GB +PG_CHECKPOINT_COMPLETION_TARGET=0.9 PG_EFFECTIVE_CACHE_SIZE=3GB -PG_WORK_MEM=16MB -PG_MAINTENANCE_WORK_MEM=256MB -PG_RANDOM_PAGE_COST=1.1 PG_EFFECTIVE_IO_CONCURRENCY=200 +PG_LOG_MIN_DURATION_STATEMENT=500 +PG_MAINTENANCE_WORK_MEM=256MB +PG_MAX_WAL_SENDERS=10 PG_MAX_WAL_SIZE=2GB PG_MIN_WAL_SIZE=1GB +PG_RANDOM_PAGE_COST=1.1 +PG_SHARED_BUFFERS=1GB PG_WAL_KEEP_SIZE=2GB -PG_MAX_WAL_SENDERS=10 -PG_CHECKPOINT_COMPLETION_TARGET=0.9 -PG_LOG_MIN_DURATION_STATEMENT=500 +PG_WORK_MEM=16MB +POSTGRES_LISTEN_ADDRESSES=0.0.0.0 +# Core PostgreSQL tuning (map directly into postgresql.conf) +POSTGRES_MAX_CONNECTIONS=200 # ============================================================================= # PostgreSQL Transaction Pooling Optimizations @@ -80,21 +80,21 @@ PG_LOG_MIN_DURATION_STATEMENT=500 # With transaction pooling, 'auto' works well as server connections persist PG_PLAN_CACHE_MODE=auto +PG_JIT_ABOVE_COST=100000 # JIT compilation - enabled by default, useful for complex queries # Threshold controls when JIT kicks in (default 100000) PG_JIT_ENABLED=on -PG_JIT_ABOVE_COST=100000 +PG_MAX_PARALLEL_WORKERS=8 # Parallel query workers (useful even with pooling for complex queries) PG_MAX_PARALLEL_WORKERS_PER_GATHER=4 -PG_MAX_PARALLEL_WORKERS=8 -PG_PARALLEL_TUPLE_COST=0.01 PG_PARALLEL_SETUP_COST=1000 +PG_PARALLEL_TUPLE_COST=0.01 +PG_TCP_KEEPALIVES_COUNT=6 # TCP keepalive - detect dead connections quickly (important for 
poolers) PG_TCP_KEEPALIVES_IDLE=60 PG_TCP_KEEPALIVES_INTERVAL=10 -PG_TCP_KEEPALIVES_COUNT=6 # Idle session timeout (0=disabled) - defense against leaked connections # Complements PgBouncer's client_idle_timeout @@ -103,94 +103,100 @@ PG_IDLE_SESSION_TIMEOUT=0 # Temp file limit per session (-1=unlimited, or value like 10GB) PG_TEMP_FILE_LIMIT=-1 +POSTGRES_SSL_CERT_FILE=/var/lib/postgresql/data/tls/server.crt # TLS configuration (self-signed certificates generated if files absent) POSTGRES_SSL_ENABLED=on -POSTGRES_SSL_CERT_FILE=/var/lib/postgresql/data/tls/server.crt POSTGRES_SSL_KEY_FILE=/var/lib/postgresql/data/tls/server.key -POSTGRES_SSL_SELF_SIGNED_SUBJECT=/CN=core_data_postgres POSTGRES_SSL_SELF_SIGNED_DAYS=730 +POSTGRES_SSL_SELF_SIGNED_SUBJECT=/CN=core_data_postgres +BACKUPS_HOST_PATH=./backups +COMPOSE_PROFILES=valkey,pgbouncer,memcached,rabbitmq # Networking DOCKER_NETWORK_NAME=core_data_network DOCKER_NETWORK_SUBNET=172.25.0.0/16 NETWORK_GUARD_CHECK_INTERVAL=30 -BACKUPS_HOST_PATH=./backups -COMPOSE_PROFILES=valkey,pgbouncer,memcached,rabbitmq +PGBOUNCER_GID=102 +# PgBouncer runs as the postgres user (UID 100) in the pgbouncer image +PGBOUNCER_UID=100 +POSTGRES_GID=1000 +POSTGRES_RUNTIME_GECOS="Core Data PostgreSQL Administrator" +POSTGRES_RUNTIME_HOME=/home/postgres +POSTGRES_RUNTIME_USER=postgres # Container execution context - UIDs/GIDs must match the user baked into each pre-built image. # WARNING: Mismatch between these values and the image's baked-in UID causes permission errors. # If building locally with CORE_DATA_BUILD_IMAGE=1, you may need to adjust these values. 
POSTGRES_UID=1000 -POSTGRES_GID=1000 -POSTGRES_RUNTIME_USER=postgres -POSTGRES_RUNTIME_GECOS=Core\ Data\ PostgreSQL\ Administrator -POSTGRES_RUNTIME_HOME=/home/postgres -# PgBouncer runs as the postgres user (UID 100) in the pgbouncer image -PGBOUNCER_UID=100 -PGBOUNCER_GID=102 -# Valkey uses UID 999 in the valkey image -VALKEY_UID=999 -VALKEY_GID=1000 +RABBITMQ_GID=101 # RabbitMQ uses UID 100, GID 101 in the rabbitmq image RABBITMQ_UID=100 -RABBITMQ_GID=101 # Shared secrets group - all service containers are members of this group # Secrets files are owned by this group with mode 640 (owner+group readable) # Note: 65533 is nogroup in Alpine, so we use 65532 SECRETS_GID=65532 +VALKEY_GID=1000 +# Valkey uses UID 999 in the valkey image +VALKEY_UID=999 +POSTGRES_LOG_BUFFER=4m +POSTGRES_LOG_MAX_FILE=5 # Logging driver tuning POSTGRES_LOG_MAX_SIZE=100m -POSTGRES_LOG_MAX_FILE=5 POSTGRES_LOG_MODE=non-blocking -POSTGRES_LOG_BUFFER=4m +LOGICAL_BACKUP_EXCLUDE=postgres +LOGICAL_BACKUP_HOST_OUTPUT=./backups/logical # Logical backup sidecar LOGICAL_BACKUP_INTERVAL_SECONDS=86400 -LOGICAL_BACKUP_RETENTION_DAYS=7 LOGICAL_BACKUP_OUTPUT=/backups/logical -LOGICAL_BACKUP_HOST_OUTPUT=./backups/logical -LOGICAL_BACKUP_EXCLUDE=postgres +LOGICAL_BACKUP_RETENTION_DAYS=7 +PGBACKREST_RETENTION_ARCHIVE=7 +PGBACKREST_RETENTION_ARCHIVE_TYPE=diff +PGBACKREST_RETENTION_DIFF=7 +PGBACKREST_RETENTION_FULL=7 +PGBACKREST_RETENTION_FULL_TYPE=time +PROMETHEUS_RETENTION_TIME=7d -# ValKey memory cache -VALKEY_PORT=6379 +VALKEY_APPENDONLY=yes +VALKEY_DATABASES=16 # Host port that Docker binds to; override if 6379 is unavailable locally. 
VALKEY_HOST_PORT=6379 -VALKEY_APPENDONLY=yes VALKEY_MAXMEMORY=256mb VALKEY_MAXMEMORY_POLICY=allkeys-lru -VALKEY_DATABASES=16 VALKEY_PASSWORD_FILE=./secrets/valkey_password +# ValKey memory cache +VALKEY_PORT=6379 +PGBOUNCER_ADMIN_USERS=postgres +PGBOUNCER_AUTH_PASSWORD_FILE=./secrets/pgbouncer_auth_password +PGBOUNCER_AUTH_USER=pgbouncer_auth +PGBOUNCER_DEFAULT_POOL_SIZE=20 +PGBOUNCER_EXTRA_HOST_PORT=6432 +PGBOUNCER_HOST_PORT=5432 +PGBOUNCER_MAX_CLIENT_CONN=200 +PGBOUNCER_MIN_POOL_SIZE=5 +PGBOUNCER_POOL_MODE=session # PgBouncer connection pooling # PgBouncer listens on both 5432 (default PostgreSQL port) and 6432 (legacy) # This makes pooled connections the default path - clients must explicitly # connect to port 5433 (POSTGRES_PORT) to bypass pooling # Docker maps both host ports to the single container port (PGBOUNCER_PORT) PGBOUNCER_PORT=6432 -PGBOUNCER_HOST_PORT=6432 -PGBOUNCER_EXTRA_HOST_PORT=5432 -PGBOUNCER_POOL_MODE=session -PGBOUNCER_MAX_CLIENT_CONN=200 -PGBOUNCER_DEFAULT_POOL_SIZE=20 PGBOUNCER_RESERVE_POOL_SIZE=5 PGBOUNCER_RESERVE_POOL_TIMEOUT=5 -PGBOUNCER_MIN_POOL_SIZE=5 -PGBOUNCER_AUTH_USER=pgbouncer_auth -PGBOUNCER_AUTH_PASSWORD_FILE=./secrets/pgbouncer_auth_password -PGBOUNCER_STATS_USER=pgbouncer_stats PGBOUNCER_STATS_PASSWORD_FILE=./secrets/pgbouncer_stats_password -PGBOUNCER_ADMIN_USERS=postgres +PGBOUNCER_STATS_USER=pgbouncer_stats PGBOUNCER_STATS_USERS=pgbouncer_stats +PGBOUNCER_CLIENT_TLS_CERT_FILE=/tmp/pgbouncer/tls/server.crt +PGBOUNCER_CLIENT_TLS_KEY_FILE=/tmp/pgbouncer/tls/server.key +PGBOUNCER_CLIENT_TLS_SELF_SIGNED_DAYS=730 +PGBOUNCER_CLIENT_TLS_SELF_SIGNED_SUBJECT=/CN=core_data_pgbouncer # PgBouncer TLS configuration (client-side SSL for connections to PgBouncer) # Self-signed certificates are generated automatically if files are absent # sslmode options: disable, allow, prefer, require, verify-ca, verify-full PGBOUNCER_CLIENT_TLS_SSLMODE=require -PGBOUNCER_CLIENT_TLS_CERT_FILE=/tmp/pgbouncer/tls/server.crt 
-PGBOUNCER_CLIENT_TLS_KEY_FILE=/tmp/pgbouncer/tls/server.key -PGBOUNCER_CLIENT_TLS_SELF_SIGNED_SUBJECT=/CN=core_data_pgbouncer -PGBOUNCER_CLIENT_TLS_SELF_SIGNED_DAYS=730 # ============================================================================= # PgBouncer Transaction Mode Compatibility (PgBouncer 1.21+) @@ -247,44 +253,54 @@ PGBOUNCER_DNS_MAX_TTL=30 # Negative DNS cache (quick recovery from DNS failures) PGBOUNCER_DNS_NXDOMAIN_TTL=5 +MEMCACHED_MAX_CONNECTIONS=1024 +MEMCACHED_MEMORY_MB=128 # Memcached hot object cache MEMCACHED_PORT=11211 -MEMCACHED_MEMORY_MB=128 -MEMCACHED_MAX_CONNECTIONS=1024 MEMCACHED_THREADS=4 -# RabbitMQ messaging -RABBITMQ_PORT=5672 -RABBITMQ_HOST_PORT=5672 -RABBITMQ_MANAGEMENT_PORT=15672 -RABBITMQ_MANAGEMENT_HOST_PORT=15672 -RABBITMQ_STREAM_PORT=5552 -RABBITMQ_STREAM_HOST_PORT=5552 -RABBITMQ_DEFAULT_USER=coredata +RABBITMQ_CPU_LIMIT=0.0 +RABBITMQ_DATA_MOUNT_PATH=/var/lib/rabbitmq RABBITMQ_DEFAULT_PASS_FILE=./secrets/rabbitmq_default_pass +RABBITMQ_DEFAULT_USER=coredata RABBITMQ_ERLANG_COOKIE_FILE=./secrets/rabbitmq_erlang_cookie -RABBITMQ_DATA_MOUNT_PATH=/var/lib/rabbitmq -# Pre-built RabbitMQ image uses UID 100, GID 101 (the rabbitmq user baked into the image). -RABBITMQ_UID=100 -RABBITMQ_GID=101 +RABBITMQ_HOST_PORT=5672 +RABBITMQ_MANAGEMENT_HOST_PORT=15672 +RABBITMQ_MANAGEMENT_PORT=15672 # Container resource limits (0 = unlimited). RABBITMQ_MEMORY_LIMIT=0 -RABBITMQ_CPU_LIMIT=0.0 +# RabbitMQ messaging +RABBITMQ_PORT=5672 +RABBITMQ_PROMETHEUS_HOST_PORT=15692 +RABBITMQ_PROMETHEUS_PORT=15692 +RABBITMQ_STREAM_HOST_PORT=5552 +RABBITMQ_STREAM_PORT=5552 +# Pre-built RabbitMQ image uses UID 100, GID 101 (the rabbitmq user baked into the image). 
+ +CADVISOR_HOST_PORT=8080 +GRAFANA_HOST_PORT=3000 +MEMCACHED_EXPORTER_HOST_PORT=9150 +NODE_EXPORTER_HOST_PORT=9100 +PGBOUNCER_EXPORTER_HOST_PORT=9127 +POSTGRES_EXPORTER_HOST_PORT=9187 +# Required monitoring stack host ports +PROMETHEUS_HOST_PORT=9090 # Erlang VM tuning flags passed via RABBITMQ_SERVER_ADDITIONAL_ERL_ARGS. # +sbwt none — disable speculative scheduler busy-waiting (saves CPU) # +sbwtdcpu none — disable dirty-CPU scheduler busy-waiting # +sbwtdio none — disable dirty-IO scheduler busy-waiting # +stbt ts — bind scheduler threads to topology (reduces context switches) RABBITMQ_SERVER_ADDITIONAL_ERL_ARGS="+sbwt none +sbwtdcpu none +sbwtdio none +stbt ts" +VALKEY_EXPORTER_HOST_PORT=9121 # Time zone for containers TZ=UTC +CORE_DATA_ATTESTATION_REPO=paudley/core_data # CI workflow defaults CORE_DATA_CI_MIN_DISK_MB=4096 CORE_DATA_CI_OUTPUT_PATH=./backups/ci-output.json CORE_DATA_REQUIRE_ATTESTATION=0 -CORE_DATA_ATTESTATION_REPO=paudley/core_data # Daily maintenance tuning (optional) # DAILY_PG_STAT_LIMIT=100 diff --git a/.gitignore b/.gitignore index afd61c9..6c0b16b 100644 --- a/.gitignore +++ b/.gitignore @@ -49,8 +49,10 @@ node_modules/ .code-ethos/cache/ .coding-ethos/cache/ .coding-ethos/code-intel.db +.coding-ethos/code-intel.db-shm +.coding-ethos/code-intel.db-wal .coding-ethos/hook-runs/ .coding-ethos/lint-runs/ .coding-ethos/prune-runs/ .coding-ethos/state/ -.coding-ethos/code-intel.db +sandbox-tmp diff --git a/README.md b/README.md index 8f81139..a314cfb 100644 --- a/README.md +++ b/README.md @@ -223,8 +223,8 @@ See `docs/security_philosophy.md` for how capability hardening and related contr - **TLS everywhere.** PostgreSQL refuses non-SSL connections from the bridge network. Provide your own certificate/key via Docker secrets or rely on the init hook to mint a self-signed pair under `${PGDATA}/tls`. 
- **Bind-mounted persistent state.** PostgreSQL data, WAL, pgBackRest, ValKey, RabbitMQ, Prometheus, and Grafana state live under `./data/` by default so ownership and backups are explicit. - **Non-root from the start.** A one-shot `volume_prep` helper chowns the volumes before Postgres launches so the main service and sidecars run as your host user by default (UID/GID `${POSTGRES_UID}`), keeping file ownership consistent across deployments. Supply alternative IDs only when required. -- **Automated logical backups.** The `logical_backup` sidecar runs `pg_dump`/`pg_dumpall` on the cadence defined by `LOGICAL_BACKUP_INTERVAL_SECONDS`, writes into `${BACKUPS_HOST_PATH}/logical`, validates custom dumps with `pg_restore --list`, records manifests and `_SUCCESS` markers, exports Prometheus metrics on port `9188`, and skips any databases listed in `LOGICAL_BACKUP_EXCLUDE` (defaults to `postgres`). `daily-maintenance` captures the latest run in `logical_backup_status.txt` for auditing. -- **Required monitoring.** Prometheus scrapes PostgreSQL, PgBouncer, logical backups, RabbitMQ, ValKey, Memcached, host metrics, and container metrics. Grafana is provisioned with the Prometheus datasource and a Core Data overview dashboard. Configure ports, retention, and credentials with the `PROMETHEUS_*`, `GRAFANA_*`, and `*_EXPORTER_*` variables. +- **Automated logical backups.** The `logical_backup` sidecar runs `pg_dump`/`pg_dumpall` on the cadence defined by `LOGICAL_BACKUP_INTERVAL_SECONDS`, writes into `${BACKUPS_HOST_PATH}/logical`, validates custom dumps with `pg_restore --list`, records manifests and `_SUCCESS` markers, exports Prometheus metrics on port `9188`, prunes completed runs after `LOGICAL_BACKUP_RETENTION_DAYS` (default 7), and skips any databases listed in `LOGICAL_BACKUP_EXCLUDE` (defaults to `postgres`). `daily-maintenance` captures the latest run in `logical_backup_status.txt` for auditing. 
+- **Required monitoring.** Prometheus scrapes PostgreSQL, PgBouncer, logical backups, RabbitMQ, ValKey, Memcached, host metrics, and container metrics. Grafana is provisioned with the Prometheus datasource and a Core Data overview dashboard. Prometheus keeps 7 days of samples by default; configure ports, retention, and credentials with the `PROMETHEUS_*`, `GRAFANA_*`, and `*_EXPORTER_*` variables. - **Composable health check.** `scripts/healthcheck.sh` verifies readiness, executes `SELECT 1`, and optionally enforces replication lag ceilings before dependents start. - **Rotated container logs.** Docker's `local` driver with non-blocking delivery prevents runaway JSON files while retaining compressed history for incident response. - **Required service set.** Cache, pooling, messaging, and monitoring services are started by the normal compose flow. `COMPOSE_PROFILES` is intentionally empty by default. @@ -275,11 +275,12 @@ Use the dedicated CI helpers when you need to spin up the published stack inside | `psql` | Open psql inside the container (respects `PGHOST`, `PGUSER`, etc.). | | `dump` / `dump-sql` | Produce logical backups (custom or plain format) under `/backups`. | | `restore-dump` | Drop and recreate a database before restoring a `.dump.gz`. | -| `backup [--verify]` / `stanza-create` / `restore-snapshot` | Manage pgBackRest backups & optionally restore the latest backup into a throwaway data dir for checksum verification. | +| `backup [--verify]` / `stanza-create` / `restore-snapshot` | Manage pgBackRest backups & optionally restore the latest backup into a throwaway data dir for checksum verification. Defaults retain 7 days of full backups with matching archive cleanup. | | `daily-maintenance` | Run dumps, log capture, pgBadger analysis, and retention pruning. | | `provision-qa` | Differential backup + targeted restore for QA databases. 
| | `config-render` | Re-render `postgresql.conf` / `pg_hba.conf` from the templates and restart PostgreSQL (terminates active connections; required for some settings like `shared_buffers` and `max_connections`). | | `config-check` | Compare live `postgresql.conf` / `pg_hba.conf` against rendered templates to catch drift. | +| `data-cleanup` | Report or remove stale `data/.pytest_backups` entries left by interrupted local pytest runs; dry-run by default, deletes entries older than 7 days only with `--execute`. | | `audit-roles` / `audit-security` | Generate CSV/text reports covering role hygiene, passwords, and HBA/RLS posture. | | `audit-extensions` | Confirm bundled extensions are present and on expected versions. | | `audit-autovacuum` | Flag tables with high dead tuple counts or ratios. | @@ -311,7 +312,9 @@ Use the dedicated CI helpers when you need to spin up the published stack inside The CLI sources modular helpers from `scripts/lib/` so each function can be imported by tests or future automation. -`daily-maintenance` now emits a richer bundle under `backups/daily//`, including `pg_stat_statements` snapshots, `pg_buffercache` heatmaps, role/extension/autovacuum/replication CSVs, pg_cron schedules, pg_squeeze activity, and a security checklist alongside logs, dumps, pgBadger HTML, and pgaudit summaries. The workflow also records per-step results in `maintenance_status.json`, records the most recent sidecar dump run in `logical_backup_status.txt`, runs `partman.run_maintenance_proc()` across each database so freshly created partitions land even if the background worker interval has not elapsed, and captures version drift in `version_status.csv` (focusing on out-of-date components). Pair those reports with `config-check` to keep the rendered configs aligned with the templates. Tune the thresholds via `DAILY_PG_STAT_LIMIT`, `DAILY_BUFFERCACHE_LIMIT`, `DAILY_DEAD_TUPLE_THRESHOLD`, `DAILY_DEAD_TUPLE_RATIO`, and `DAILY_REPLICATION_LAG_THRESHOLD` as needed. 
+Interrupted local test runs can leave full service-directory snapshots under `data/.pytest_backups/`. These are disposable pytest stashes, not live cluster state. Inspect them with `./scripts/manage.sh data-cleanup`; remove stale entries with `./scripts/manage.sh data-cleanup --execute`. The command only targets `.pytest_backups`, defaults to entries older than 7 days, and refuses to delete while Compose containers are running unless `--force` is supplied. + +`daily-maintenance` now emits a richer bundle under `backups/daily//`, including `pg_stat_statements` snapshots, `pg_buffercache` heatmaps, role/extension/autovacuum/replication CSVs, pg_cron schedules, pg_squeeze activity, and a security checklist alongside logs, dumps, pgBadger HTML, and pgaudit summaries. The workflow also records per-step results in `maintenance_status.json`, records the most recent sidecar dump run in `logical_backup_status.txt`, prunes daily bundles and copied PostgreSQL source logs older than `DAILY_RETENTION_DAYS` (default 7), runs `partman.run_maintenance_proc()` across each database so freshly created partitions land even if the background worker interval has not elapsed, and captures version drift in `version_status.csv` (focusing on out-of-date components). Pair those reports with `config-check` to keep the rendered configs aligned with the templates. Tune the thresholds via `DAILY_PG_STAT_LIMIT`, `DAILY_BUFFERCACHE_LIMIT`, `DAILY_DEAD_TUPLE_THRESHOLD`, `DAILY_DEAD_TUPLE_RATIO`, and `DAILY_REPLICATION_LAG_THRESHOLD` as needed. Nightly cron jobs also refresh pg_squeeze targets, reset `pg_stat_statements`, and run a safe `VACUUM (ANALYZE, SKIP_LOCKED, PARALLEL 4)` so statistics stay current without blocking hot tables. 
diff --git a/docker-compose.yml b/docker-compose.yml index 3159b21..a141c4d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -112,8 +112,8 @@ services: NETWORK_DIR: /opt/core_data/network_access CHECK_INTERVAL: ${NETWORK_GUARD_CHECK_INTERVAL:-30} SERVICES: >- - ${POSTGRES_PORT:-5433} ${PGBOUNCER_HOST_PORT:-6432} - ${PGBOUNCER_EXTRA_HOST_PORT:-5432} ${VALKEY_HOST_PORT:-6379} + ${POSTGRES_PORT:-5433} ${PGBOUNCER_HOST_PORT:-5432} + ${PGBOUNCER_EXTRA_HOST_PORT:-6432} ${VALKEY_HOST_PORT:-6379} ${RABBITMQ_HOST_PORT:-5672} ${RABBITMQ_MANAGEMENT_HOST_PORT:-15672} ${RABBITMQ_PROMETHEUS_HOST_PORT:-15692} ${MEMCACHED_PORT:-11211} ${PROMETHEUS_HOST_PORT:-9090} ${GRAFANA_HOST_PORT:-3000} @@ -184,6 +184,7 @@ services: PG_PGAUDIT_LOG_CLIENT: ${PG_PGAUDIT_LOG_CLIENT:-on} PG_PGAUDIT_LOG_PARAMETER: ${PG_PGAUDIT_LOG_PARAMETER:-off} PGBACKREST_RETENTION_FULL: ${PGBACKREST_RETENTION_FULL:-7} + PGBACKREST_RETENTION_FULL_TYPE: ${PGBACKREST_RETENTION_FULL_TYPE:-time} PGBACKREST_RETENTION_DIFF: ${PGBACKREST_RETENTION_DIFF:-7} PGBACKREST_RETENTION_ARCHIVE: ${PGBACKREST_RETENTION_ARCHIVE:-7} PGBACKREST_RETENTION_ARCHIVE_TYPE: ${PGBACKREST_RETENTION_ARCHIVE_TYPE:-diff} @@ -369,6 +370,7 @@ services: SECRETS_GID: ${SECRETS_GID:-65532} image: ${RABBITMQ_IMAGE:-ghcr.io/paudley/core_data/rabbitmq:latest} container_name: ${COMPOSE_PROJECT_NAME:-core_data}_rabbitmq + hostname: rabbitmq restart: unless-stopped # Add the shared secrets group for read access. 
user: "${RABBITMQ_UID:-100}:${RABBITMQ_GID:-101}" @@ -381,6 +383,7 @@ services: RABBITMQ_DEFAULT_USER: ${RABBITMQ_DEFAULT_USER} RABBITMQ_DEFAULT_PASS_FILE: /run/secrets/rabbitmq_default_pass RABBITMQ_ERLANG_COOKIE_FILE: /run/secrets/rabbitmq_erlang_cookie + RABBITMQ_NODENAME: rabbit@rabbitmq RABBITMQ_SERVER_ADDITIONAL_ERL_ARGS: ${RABBITMQ_SERVER_ADDITIONAL_ERL_ARGS:-} TZ: ${TZ} entrypoint: ["/opt/core_data/bin/rabbitmq-entrypoint.sh"] @@ -486,8 +489,8 @@ services: ports: # Both host ports map to the same container port - PgBouncer listens once, # Docker handles the multi-port exposure - - "${PGBOUNCER_HOST_PORT:-6432}:${PGBOUNCER_PORT:-6432}" - - "${PGBOUNCER_EXTRA_HOST_PORT:-5432}:${PGBOUNCER_PORT:-6432}" + - "${PGBOUNCER_HOST_PORT:-5432}:${PGBOUNCER_PORT:-6432}" + - "${PGBOUNCER_EXTRA_HOST_PORT:-6432}:${PGBOUNCER_PORT:-6432}" healthcheck: test: - CMD-SHELL @@ -613,9 +616,9 @@ services: condition: service_healthy command: - "--redis.addr=redis://valkey:${VALKEY_PORT:-6379}" - - "--redis.password-file=/run/secrets/valkey_password" + - "--redis.password-file=/run/secrets/valkey_exporter_passwords.json" volumes: - - ./secrets/valkey_password:/run/secrets/valkey_password:ro + - ./secrets/valkey_exporter_passwords.json:/run/secrets/valkey_exporter_passwords.json:ro networks: - core_data ports: @@ -708,7 +711,7 @@ services: command: - "--config.file=/etc/prometheus/prometheus.yml" - "--storage.tsdb.path=/prometheus" - - "--storage.tsdb.retention.time=${PROMETHEUS_RETENTION_TIME:-30d}" + - "--storage.tsdb.retention.time=${PROMETHEUS_RETENTION_TIME:-7d}" - "--web.enable-lifecycle" volumes: - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro @@ -737,6 +740,11 @@ services: environment: GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin} GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-admin} + GF_ANALYTICS_REPORTING_ENABLED: "false" + GF_ANALYTICS_CHECK_FOR_UPDATES: "false" + GF_ANALYTICS_CHECK_FOR_PLUGIN_UPDATES: "false" + 
GF_PLUGINS_PREINSTALL_DISABLED: "true" + GF_PLUGINS_PREINSTALL_AUTO_UPDATE: "false" GF_USERS_ALLOW_SIGN_UP: "false" volumes: - ./data/grafana:/var/lib/grafana diff --git a/docs/SOURCE_DOCS.md b/docs/SOURCE_DOCS.md new file mode 100644 index 0000000..2d4eb2d --- /dev/null +++ b/docs/SOURCE_DOCS.md @@ -0,0 +1,7 @@ +# Source Documentation Index + +This index links repository source documentation that is required by the local +module documentation policy. + +- [../tests/MODULE.md](../tests/MODULE.md) - test-suite contracts and cleanup + safety expectations. diff --git a/monitoring/grafana/provisioning/alerting/.gitkeep b/monitoring/grafana/provisioning/alerting/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/monitoring/grafana/provisioning/alerting/.gitkeep @@ -0,0 +1 @@ + diff --git a/monitoring/grafana/provisioning/plugins/.gitkeep b/monitoring/grafana/provisioning/plugins/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/monitoring/grafana/provisioning/plugins/.gitkeep @@ -0,0 +1 @@ + diff --git a/postgres/initdb/00-render-config.sh b/postgres/initdb/00-render-config.sh index 18384be..9d9526a 100755 --- a/postgres/initdb/00-render-config.sh +++ b/postgres/initdb/00-render-config.sh @@ -223,6 +223,7 @@ cat > "${PGBACKREST_CONF_PATH}" << CONF [global] repo1-path=/var/lib/pgbackrest repo1-retention-full=${PGBACKREST_RETENTION_FULL:-7} +repo1-retention-full-type=${PGBACKREST_RETENTION_FULL_TYPE:-time} repo1-retention-diff=${PGBACKREST_RETENTION_DIFF:-7} repo1-retention-archive=${PGBACKREST_RETENTION_ARCHIVE:-7} repo1-retention-archive-type=${PGBACKREST_RETENTION_ARCHIVE_TYPE:-diff} diff --git a/repo_config.yaml b/repo_config.yaml index f67da1f..7a76a29 100644 --- a/repo_config.yaml +++ b/repo_config.yaml @@ -1,3 +1,14 @@ +hooks: + enabled_groups: + - format + - syntax + - workflow + - go + - ai + - commit-msg + python: + docstring_coverage: + enabled: false pytest_gate: enabled: false diff --git a/scripts/create_env.sh 
b/scripts/create_env.sh index 6973cd6..38a0c98 100755 --- a/scripts/create_env.sh +++ b/scripts/create_env.sh @@ -4,6 +4,9 @@ set -euo pipefail +# Standalone bootstrap script: do not source common.sh because this command +# creates the .env file that common.sh normally consumes. +# shellcheck source=scripts/lib/common.sh SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) ROOT_DIR=$(cd "${SCRIPT_DIR}/.." && pwd) TEMPLATE="${ROOT_DIR}/.env.example" @@ -174,6 +177,16 @@ valkey_password="$(prompt_secret "ValKey password (written to secrets/valkey_pas printf '%s\n' "${valkey_password}" >"${valkey_secret_file}" chmod 0600 "${valkey_secret_file}" || true set_env_value VALKEY_PASSWORD_FILE "./secrets/valkey_password" +python3 - "${valkey_password}" "${secret_dir}/valkey_exporter_passwords.json" <<'PY' +import json +import sys + +password, path = sys.argv[1], sys.argv[2] +with open(path, "w", encoding="utf-8") as handle: + json.dump({"redis://valkey:6379": password}, handle) + handle.write("\n") +PY +chmod 0600 "${secret_dir}/valkey_exporter_passwords.json" || true pgbouncer_auth_secret="${secret_dir}/pgbouncer_auth_password" pgbouncer_auth_default="$(generate_password)" diff --git a/scripts/daily_maintenance.sh b/scripts/daily_maintenance.sh index 778693d..de82964 100755 --- a/scripts/daily_maintenance.sh +++ b/scripts/daily_maintenance.sh @@ -36,7 +36,7 @@ echo "[daily] starting" HOST_BACKUP_ROOT=${DAILY_BACKUP_ROOT:-./backups/daily} CONTAINER_BACKUP_ROOT=${DAILY_CONTAINER_BACKUP_ROOT:-/backups/daily} -RETENTION_DAYS=${DAILY_RETENTION_DAYS:-30} +RETENTION_DAYS=${DAILY_RETENTION_DAYS:-7} SINCE=${DAILY_PGBADGER_SINCE:-} if [[ -n ${SINCE} ]]; then if echo "${SINCE}" | grep -Eq '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'; then @@ -46,6 +46,7 @@ if [[ -n ${SINCE} ]]; then fi fi REMOVE_SOURCE=${DAILY_REMOVE_SOURCE_LOGS:-false} +PRUNE_SOURCE_LOGS=${DAILY_PRUNE_SOURCE_LOGS:-true} PG_BADGER_JOBS=${PG_BADGER_JOBS:-2} PG_STAT_LIMIT=${DAILY_PG_STAT_LIMIT:-100} 
BUFFERCACHE_LIMIT=${DAILY_BUFFERCACHE_LIMIT:-50} @@ -235,6 +236,9 @@ compose_exec bash -lc "cp /var/lib/postgresql/data/log/postgresql-*.log '${CONTA compose_exec bash -lc "cp /var/lib/postgresql/data/log/postgresql-*.csv '${CONTAINER_TARGET_DIR}' 2>/dev/null || true" if [[ ${REMOVE_SOURCE} == true ]]; then compose_exec bash -lc "rm -f /var/lib/postgresql/data/log/postgresql-*.log /var/lib/postgresql/data/log/postgresql-*.csv" +elif [[ ${PRUNE_SOURCE_LOGS} == true && ${RETENTION_DAYS} =~ ^[0-9]+$ && ${RETENTION_DAYS} -gt 0 ]]; then + retention_mtime=$((RETENTION_DAYS - 1)) + compose_exec bash -lc "find /var/lib/postgresql/data/log -maxdepth 1 -type f \\( -name 'postgresql-*.log' -o -name 'postgresql-*.csv' \\) -mtime +${retention_mtime} -delete" fi echo "[daily] generating pgBadger report" @@ -370,5 +374,8 @@ if [[ ${EMAIL_REPORT} == true && -n ${REPORT_RECIPIENT} ]]; then fi echo "[daily] applying retention ${RETENTION_DAYS} days" -find "${HOST_BACKUP_ROOT}" -mindepth 1 -maxdepth 1 -type d | sort | head -n -"${RETENTION_DAYS}" | xargs -r rm -rf +if [[ ${RETENTION_DAYS} =~ ^[0-9]+$ && ${RETENTION_DAYS} -gt 0 ]]; then + retention_mtime=$((RETENTION_DAYS - 1)) + find "${HOST_BACKUP_ROOT}" -mindepth 1 -maxdepth 1 -type d -mtime +"${retention_mtime}" -exec rm -rf {} + +fi echo "[daily] complete" diff --git a/scripts/lib/apparmor.sh b/scripts/lib/apparmor.sh new file mode 100755 index 0000000..dfaedda --- /dev/null +++ b/scripts/lib/apparmor.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: 2026 Blackcat Informatics® Inc. +# SPDX-License-Identifier: MIT + +# shellcheck shell=bash +set -euo pipefail + +APPARMOR_LIB_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=common.sh +# shellcheck disable=SC1091 +source "${APPARMOR_LIB_DIR}/common.sh" + +# Load every *.profile file under ${ROOT_DIR}/apparmor with the AppArmor parser (through sudo when not root). +# Globals: APPARMOR_PARSER (read; optional parser override), EUID (read), ROOT_DIR (read; not set in this file, presumably provided by common.sh). +# Outputs: progress and error messages on stderr. +# Exits 1 when the parser is missing, sudo is needed but unavailable, a profile fails to load, or no profile exists. +cmd_apparmor_load() { + local parser=${APPARMOR_PARSER:-apparmor_parser} + if ! command -v "${parser}" >/dev/null 2>&1; then + echo "[apparmor] ${parser} not found.
Install apparmor-utils (Debian/Ubuntu) or ensure apparmor_parser is on PATH." >&2 + exit 1 + fi + if [[ $EUID -ne 0 ]] && ! command -v sudo >/dev/null 2>&1; then + echo "[apparmor] sudo required to load profiles or rerun as root." >&2 + exit 1 + fi + local loaded=false + local profile + for profile in "${ROOT_DIR}/apparmor"/*.profile; do + # An unmatched glob stays literal; skip it so the no-profiles error below is reported. + [[ -e "${profile}" ]] || continue + if [[ $EUID -ne 0 ]]; then + sudo "${parser}" -r -W "${profile}" || exit 1 + else + "${parser}" -r -W "${profile}" || exit 1 + fi + loaded=true + echo "[apparmor] loaded ${profile##*/}" >&2 + done + if [[ ${loaded} == false ]]; then + echo "[apparmor] no profiles found under ${ROOT_DIR}/apparmor" >&2 + exit 1 + fi + echo "[apparmor] profiles loaded. Set CORE_DATA_APPARMOR_=apparmor:core_data_minimal (or your custom profile) before composing." >&2 +} diff --git a/scripts/lib/bootstrap_ci.sh b/scripts/lib/bootstrap_ci.sh index 41f066e..991cdb6 100755 --- a/scripts/lib/bootstrap_ci.sh +++ b/scripts/lib/bootstrap_ci.sh @@ -229,6 +229,18 @@ cmd_bootstrap_ci() { chmod 0700 "${secrets_dir}" || true bootstrap_ci_write_secret POSTGRES_SUPERUSER_PASSWORD "${secrets_dir}/postgres_superuser_password" "${force}" base64 bootstrap_ci_write_secret VALKEY_PASSWORD "${secrets_dir}/valkey_password" "${force}" base64 + python3 - "${secrets_dir}/valkey_password" "${secrets_dir}/valkey_exporter_passwords.json" <<'PY' +import json +import sys + +password_path, output_path = sys.argv[1], sys.argv[2] +with open(password_path, encoding="utf-8") as handle: + password = handle.read().strip() +with open(output_path, "w", encoding="utf-8") as handle: + json.dump({"redis://valkey:6379": password}, handle) + handle.write("\n") +PY + chmod 0600 "${secrets_dir}/valkey_exporter_passwords.json" || true bootstrap_ci_write_secret PGBOUNCER_AUTH_PASSWORD "${secrets_dir}/pgbouncer_auth_password" "${force}" base64 bootstrap_ci_write_secret PGBOUNCER_STATS_PASSWORD "${secrets_dir}/pgbouncer_stats_password" "${force}" base64 bootstrap_ci_write_secret
RABBITMQ_DEFAULT_PASS "${secrets_dir}/rabbitmq_default_pass" "${force}" base64 diff --git a/scripts/lib/ci_ports.sh b/scripts/lib/ci_ports.sh index aa0d903..38a5784 100755 --- a/scripts/lib/ci_ports.sh +++ b/scripts/lib/ci_ports.sh @@ -57,8 +57,8 @@ ci_check_required_ports() { local skip_ports=$1 ci_check_ports "${skip_ports}" \ "postgres:${POSTGRES_PORT:-5433}" \ - "pgbouncer:${PGBOUNCER_HOST_PORT:-${PGBOUNCER_PORT:-6432}}" \ - "pgbouncer-extra:${PGBOUNCER_EXTRA_HOST_PORT:-5432}" \ + "pgbouncer:${PGBOUNCER_HOST_PORT:-5432}" \ + "pgbouncer-extra:${PGBOUNCER_EXTRA_HOST_PORT:-${PGBOUNCER_PORT:-6432}}" \ "valkey:${VALKEY_HOST_PORT:-${VALKEY_PORT:-6379}}" \ "memcached:${MEMCACHED_PORT:-11211}" \ "rabbitmq:${RABBITMQ_HOST_PORT:-${RABBITMQ_PORT:-5672}}" \ diff --git a/scripts/lib/data_cleanup.sh b/scripts/lib/data_cleanup.sh new file mode 100755 index 0000000..3424133 --- /dev/null +++ b/scripts/lib/data_cleanup.sh @@ -0,0 +1,195 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: 2026 Blackcat Informatics® Inc. +# SPDX-License-Identifier: MIT + +# shellcheck shell=bash +set -euo pipefail + +DATA_CLEANUP_LIB_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=common.sh +# shellcheck disable=SC1091 +source "${DATA_CLEANUP_LIB_DIR}/common.sh" + +DATA_CLEANUP_DEFAULT_RETENTION=${DATA_CLEANUP_DEFAULT_RETENTION:-7d} +CORE_DATA_DATA_ROOT=${CORE_DATA_DATA_ROOT:-${ROOT_DIR}/data} + +data_cleanup_usage() { + cat <<'USAGE' +Usage: manage.sh data-cleanup [options] + +Remove stale pytest data stashes left under data/.pytest_backups. + +Options: + --older-than AGE Retain entries newer than AGE (default: 7d). + AGE accepts s, m, h, or d suffixes. + --execute Delete matching entries. Without this, only report. + --force Allow execution even when compose containers are running. + --json Emit a JSON summary. + -h, --help Show this help. 
+USAGE +} + +data_cleanup_parse_age() { + local age=$1 + local number + local suffix + + if [[ "${age}" =~ ^([0-9]+)([smhd])$ ]]; then + number=${BASH_REMATCH[1]} + suffix=${BASH_REMATCH[2]} + elif [[ "${age}" =~ ^([0-9]+)$ ]]; then + number=${BASH_REMATCH[1]} + suffix=d + else + echo "[data-cleanup] invalid age '${age}'; expected values like 24h or 7d." >&2 + return 1 + fi + + case "${suffix}" in + s) echo "${number}" ;; + m) echo $((number * 60)) ;; + h) echo $((number * 60 * 60)) ;; + d) echo $((number * 24 * 60 * 60)) ;; + esac +} + +data_cleanup_compose_running() { + local output + if ! output=$(compose ps -q 2>/dev/null); then + return 2 + fi + [[ -n "${output}" ]] +} + +data_cleanup_json_escape() { + local value=$1 + value=${value//\\/\\\\} + value=${value//\"/\\\"} + value=${value//$'\n'/\\n} + printf '%s' "${value}" +} + +cmd_data_cleanup() { + local older_than=${DATA_CLEANUP_DEFAULT_RETENTION} + local execute=false + local force=false + local json=false + + while [[ $# -gt 0 ]]; do + case "$1" in + --older-than) + if [[ $# -lt 2 ]]; then + echo "[data-cleanup] --older-than requires an age value." 
>&2 + return 1 + fi + older_than=$2 + shift 2 + ;; + --older-than=*) + older_than=${1#*=} + shift + ;; + --execute) + execute=true + shift + ;; + --force) + force=true + shift + ;; + --json) + json=true + shift + ;; + -h | --help) + data_cleanup_usage + return 0 + ;; + *) + echo "[data-cleanup] unknown option: $1" >&2 + data_cleanup_usage >&2 + return 1 + ;; + esac + done + + local retention_seconds + retention_seconds=$(data_cleanup_parse_age "${older_than}") + local now + now=$(date +%s) + local cutoff=$((now - retention_seconds)) + local backup_root="${CORE_DATA_DATA_ROOT%/}/.pytest_backups" + local candidates=() + local candidate_count=0 + local total_bytes=0 + + if [[ -d "${backup_root}" ]]; then + local path + while IFS= read -r -d '' path; do + local modified + modified=$(stat -c '%Y' "${path}") + if ((modified <= cutoff)); then + local bytes + bytes=$(du -s -B1 "${path}" | awk '{print $1}') + candidates+=("${path}") + candidate_count=$((candidate_count + 1)) + total_bytes=$((total_bytes + bytes)) + fi + done < <(find "${backup_root}" -mindepth 1 -maxdepth 1 -type d -print0 | sort -z) + fi + + if [[ "${execute}" == "true" && "${force}" != "true" ]]; then + local compose_state=0 + data_cleanup_compose_running || compose_state=$? + case "${compose_state}" in + 0) + echo "[data-cleanup] refusing to delete while compose containers are running; rerun after shutdown or pass --force." >&2 + return 1 + ;; + 2) + echo "[data-cleanup] unable to determine compose state; pass --force to execute anyway." 
>&2 + return 1 + ;; + esac + fi + + if [[ "${json}" == "true" ]]; then + printf '{"mode":"%s","backup_root":"%s","older_than":"%s","candidates":%d,"bytes":%d,"paths":[' \ + "$([[ "${execute}" == "true" ]] && echo execute || echo dry-run)" \ + "$(data_cleanup_json_escape "${backup_root}")" \ + "$(data_cleanup_json_escape "${older_than}")" \ + "${candidate_count}" \ + "${total_bytes}" + local first=true + local candidate + for candidate in "${candidates[@]}"; do + if [[ "${first}" == "true" ]]; then + first=false + else + printf ',' + fi + printf '"%s"' "$(data_cleanup_json_escape "${candidate}")" + done + printf ']}\n' + else + printf '[data-cleanup] mode: %s\n' "$([[ "${execute}" == "true" ]] && echo execute || echo dry-run)" + printf '[data-cleanup] backup root: %s\n' "${backup_root}" + printf '[data-cleanup] retention: older than %s\n' "${older_than}" + printf '[data-cleanup] candidates: %d\n' "${candidate_count}" + printf '[data-cleanup] reclaimable bytes: %d\n' "${total_bytes}" + local candidate + for candidate in "${candidates[@]}"; do + printf '%s\n' "${candidate}" + done + if [[ "${execute}" != "true" ]]; then + printf '[data-cleanup] dry run only; pass --execute to delete matching entries.\n' + fi + fi + + if [[ "${execute}" == "true" ]]; then + local candidate + for candidate in "${candidates[@]}"; do + rm -rf -- "${candidate}" + done + fi +} diff --git a/scripts/lib/db.sh b/scripts/lib/db.sh old mode 100644 new mode 100755 index 95950f6..e93813c --- a/scripts/lib/db.sh +++ b/scripts/lib/db.sh @@ -1,7 +1,14 @@ +#!/usr/bin/env bash # SPDX-FileCopyrightText: 2025 Blackcat Informatics® Inc. # SPDX-License-Identifier: MIT # shellcheck shell=bash +set -euo pipefail + +DB_LIB_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=common.sh +# shellcheck disable=SC1091 +source "${DB_LIB_DIR}/common.sh" # Database role and schema helpers used by manage.sh. # cmd_create_user creates a role with LOGIN privilege if it does not yet exist. 
@@ -24,6 +31,7 @@ BEGIN END \$\$; SQL + cmd_rabbitmq_create_user_if_enabled "${user}" "${pass}" } # cmd_drop_user removes a role when present, ignoring missing roles. @@ -93,6 +101,7 @@ SQL bootstrap_database "${db}" grant_db_owner_privileges "${db}" "${owner}" schedule_pg_squeeze_job "${db}" + cmd_rabbitmq_create_vhost_if_enabled "${db}" "${owner}" } # cmd_drop_db unschedules cron jobs and drops the database after terminating sessions. @@ -111,4 +120,5 @@ cmd_drop_db() { SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '${db}' AND pid <> pg_backend_pid(); DROP DATABASE IF EXISTS "${db}"; SQL + cmd_rabbitmq_drop_vhost_if_enabled "${db}" } diff --git a/scripts/lib/maintenance.sh b/scripts/lib/maintenance.sh index 8c79f37..221b765 100755 --- a/scripts/lib/maintenance.sh +++ b/scripts/lib/maintenance.sh @@ -122,7 +122,7 @@ cmd_pgbadger_report() { cmd_daily_maintenance() { ensure_env local backup_root=${DAILY_BACKUP_ROOT:-./backups/daily} - local retention=${DAILY_RETENTION_DAYS:-30} + local retention=${DAILY_RETENTION_DAYS:-7} local since="" local remove_logs=false local container_root=${DAILY_CONTAINER_BACKUP_ROOT:-/backups/daily} diff --git a/scripts/lib/rabbitmq.sh b/scripts/lib/rabbitmq.sh old mode 100644 new mode 100755 index a1c479e..8ed7a6e --- a/scripts/lib/rabbitmq.sh +++ b/scripts/lib/rabbitmq.sh @@ -4,6 +4,11 @@ set -euo pipefail +RABBITMQ_LIB_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=common.sh +# shellcheck disable=SC1091 +source "${RABBITMQ_LIB_DIR}/common.sh" + RABBITMQ_SERVICE_NAME=${RABBITMQ_SERVICE_NAME:-rabbitmq} RABBITMQ_HOST=${RABBITMQ_HOST:-rabbitmq} RABBITMQ_PORT=${RABBITMQ_PORT:-5672} @@ -21,6 +26,71 @@ rabbitmq_exec() { compose_exec_service "${RABBITMQ_SERVICE_NAME}" "$@" } +rabbitmq_service_enabled() { + compose_has_service "${RABBITMQ_SERVICE_NAME}" +} + +cmd_rabbitmq_create_user_if_enabled() { + local user=$1 + local pass=$2 + + if ! 
rabbitmq_service_enabled; then + echo "[rabbitmq] Service not enabled; skipping RabbitMQ user '${user}'." >&2 + return 0 + fi + + # shellcheck disable=SC2016 # Container-side script expands its own positional parameters. + rabbitmq_exec sh -eu -c ' +user=$1 +pass=$2 +if rabbitmqctl list_users --silent | awk "{print \$1}" | grep -Fxq "${user}"; then + exit 0 +fi +rabbitmqctl add_user "${user}" "${pass}" +' sh "${user}" "${pass}" +} + +cmd_rabbitmq_create_vhost_if_enabled() { + local vhost=$1 + local owner=$2 + + if ! rabbitmq_service_enabled; then + echo "[rabbitmq] Service not enabled; skipping RabbitMQ vhost '${vhost}'." >&2 + return 0 + fi + + # shellcheck disable=SC2016 # Container-side script expands its own positional parameters. + rabbitmq_exec sh -eu -c ' +vhost=$1 +owner=$2 +if ! rabbitmqctl list_vhosts --silent | grep -Fxq "${vhost}"; then + rabbitmqctl add_vhost "${vhost}" +fi +if ! rabbitmqctl list_users --silent | awk "{print \$1}" | grep -Fxq "${owner}"; then + echo "[rabbitmq] User ${owner} does not exist; created vhost ${vhost} without owner permissions." >&2 + exit 0 +fi +rabbitmqctl set_permissions -p "${vhost}" "${owner}" ".*" ".*" ".*" +' sh "${vhost}" "${owner}" +} + +cmd_rabbitmq_drop_vhost_if_enabled() { + local vhost=$1 + + if ! rabbitmq_service_enabled; then + echo "[rabbitmq] Service not enabled; skipping RabbitMQ vhost '${vhost}'." >&2 + return 0 + fi + + # shellcheck disable=SC2016 # Container-side script expands its own positional parameters. 
+ rabbitmq_exec sh -eu -c ' +vhost=$1 +if rabbitmqctl list_vhosts --silent | grep -Fxq "${vhost}"; then + rabbitmqctl delete_vhost "${vhost}" +fi +' sh "${vhost}" +} + cmd_rabbitmq_ctl() { ensure_env ensure_rabbitmq_service diff --git a/scripts/logical_backup_runner.sh b/scripts/logical_backup_runner.sh index d501cdf..87e7230 100755 --- a/scripts/logical_backup_runner.sh +++ b/scripts/logical_backup_runner.sh @@ -275,7 +275,8 @@ PY write_metrics 1 "${LAST_SUCCESS_TIMESTAMP}" "${LAST_SUCCESS_DURATION}" "${LAST_SUCCESS_SIZE_BYTES}" "${LAST_SUCCESS_FILE_COUNT}" "${FAILURE_COUNT}" if ((LOGICAL_BACKUP_RETENTION_DAYS > 0)); then - find "${LOGICAL_BACKUP_OUTPUT}" -mindepth 1 -maxdepth 1 -type d ! -name '*.tmp' -mtime +"${LOGICAL_BACKUP_RETENTION_DAYS}" -print -exec rm -rf {} + 2> /dev/null || true + retention_mtime=$((LOGICAL_BACKUP_RETENTION_DAYS - 1)) + find "${LOGICAL_BACKUP_OUTPUT}" -mindepth 1 -maxdepth 1 -type d ! -name '*.tmp' -mtime +"${retention_mtime}" -print -exec rm -rf {} + 2> /dev/null || true fi log "completed backup at ${timestamp}" diff --git a/scripts/manage.sh b/scripts/manage.sh index 2111541..1e83570 100755 --- a/scripts/manage.sh +++ b/scripts/manage.sh @@ -43,12 +43,16 @@ source "${SCRIPT_DIR}/lib/memcached.sh" source "${SCRIPT_DIR}/lib/rabbitmq.sh" # shellcheck source=scripts/lib/seccomp.sh source "${SCRIPT_DIR}/lib/seccomp.sh" +# shellcheck source=scripts/lib/apparmor.sh +source "${SCRIPT_DIR}/lib/apparmor.sh" # shellcheck source=scripts/lib/test_dataset.sh source "${SCRIPT_DIR}/lib/test_dataset.sh" # shellcheck source=scripts/lib/bootstrap_ci.sh source "${SCRIPT_DIR}/lib/bootstrap_ci.sh" # shellcheck source=scripts/lib/ci.sh source "${SCRIPT_DIR}/lib/ci.sh" +# shellcheck source=scripts/lib/data_cleanup.sh +source "${SCRIPT_DIR}/lib/data_cleanup.sh" # shellcheck source=scripts/lib/permissions.sh source "${SCRIPT_DIR}/lib/permissions.sh" @@ -172,6 +176,7 @@ Lifecycle networks-show Print the currently rendered allow list. 
config-render Re-render postgresql.conf/pg_hba.conf then restart PostgreSQL. config-check Compare live configs to rendered templates. + data-cleanup Remove stale pytest data stashes (dry-run by default). logs Tail postgres logs. status Show container status and health. service-urls Print connection URLs for local services using external host IP. @@ -273,7 +278,7 @@ Permissions --jobs Parallel workers for pgbadger (default 2) daily-maintenance options: --root Override host backup root (default ./backups/daily) - --retention Retention in days (default 30) + --retention Retention in days (default 7) --since