diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8dff1e916..a2d3abef7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -124,16 +124,19 @@ jobs: echo 'CONN_MAX_AGE=60' >> docker.prod.env echo 'CONN_HEALTH_CHECKS="true"' >> docker.prod.env echo 'SANDBOX_DISABLE_PROC="true"' >> docker.prod.env - echo 'ALLOWED_HOSTS="backend,localhost,127.0.0.1"' >> docker.prod.env + echo 'ALLOWED_HOSTS="backend,backend-green,backend-blue,localhost,127.0.0.1"' >> docker.prod.env echo 'USE_SANDBOX_JAIL="on"' >> docker.prod.env echo 'CI=true' >> docker.prod.env - name: Comment out SSL server configuration from nginx run: | - sed -i '/{{HTTPS_SERVER_BLOCK_START}}/,/{{HTTPS_SERVER_BLOCK_END}}/s/^/#/' nginx/production.conf + sed -i '/{{HTTPS_SERVER_BLOCK_START}}/,/{{HTTPS_SERVER_BLOCK_END}}/s/^/#/' nginx/production/default.conf - name: Bring up postgres and nginx containers run: | + cp ./nginx/production/runtime/geo.conf.example ./nginx/production/runtime/geo.conf + cp ./nginx/production/runtime/upstream.conf.example ./nginx/production/runtime/upstream.conf + docker compose -f docker-compose.prod.yaml build nginx docker compose -f docker-compose.prod.yaml up -d postgres nginx timeout 15s docker compose -f docker-compose.prod.yaml logs -f || true docker compose logs nginx | grep "ready for start up" @@ -141,20 +144,20 @@ jobs: - name: Build and bring up up backend container run: | - docker compose -f docker-compose.prod.yaml build backend - docker compose -f docker-compose.prod.yaml up -d backend + docker compose -f docker-compose.prod.yaml build backend-blue + docker compose -f docker-compose.prod.yaml up -d backend-blue timeout 15s docker compose -f docker-compose.prod.yaml logs -f || true - name: Build and bring up up frontend container run: | - docker compose -f docker-compose.prod.yaml build frontend - docker compose -f docker-compose.prod.yaml up -d frontend + docker compose -f docker-compose.prod.yaml build frontend-blue + docker compose -f 
docker-compose.prod.yaml up -d frontend-blue timeout 15s docker compose -f docker-compose.prod.yaml logs -f || true - name: Sanity check the endpoints run: | - curl --silent http://localhost:8080/ | head -c 256 - curl --silent http://localhost:8000/api/ | jq + curl --silent http://localhost/healthz | jq + curl --silent http://localhost/api/healthz | jq - name: Shut everything down run: | diff --git a/.gitignore b/.gitignore index e58eb5f31..bd15375ab 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,6 @@ docker.prod.env send_update.py update.sh .venv/ +nginx/production/runtime/*.conf +!nginx/production/runtime/*.conf.example +.deploy.env diff --git a/backend/coreapp/middleware.py b/backend/coreapp/middleware.py index 84c5edd20..734afd7a1 100644 --- a/backend/coreapp/middleware.py +++ b/backend/coreapp/middleware.py @@ -45,6 +45,7 @@ def middleware(request: HttpRequest) -> Response: def is_public_get_request(req: Request) -> bool: public_paths = [ "/api/compiler", + "/api/healthz$", "/api/library", "/api/platform", "/api/preset", diff --git a/backend/coreapp/tests/test_request.py b/backend/coreapp/tests/test_request.py index 64f29cb1d..cda3be32e 100644 --- a/backend/coreapp/tests/test_request.py +++ b/backend/coreapp/tests/test_request.py @@ -12,6 +12,14 @@ class RequestTests(APITestCase): + def test_health_check_is_stateless(self) -> None: + response = self.client.get(reverse("healthz"), HTTP_USER_AGENT="browser") + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.json(), {"ok": True}) + self.assertEqual(Profile.objects.count(), 0) + self.assertNotIn("sessionid", response.cookies) + def test_cookie_less_current_user_does_not_create_profile(self) -> None: """ Ensure that a passive current-user read does not create a session profile. 
diff --git a/backend/coreapp/urls.py b/backend/coreapp/urls.py index 0467ab6b8..dd81d34ab 100644 --- a/backend/coreapp/urls.py +++ b/backend/coreapp/urls.py @@ -3,6 +3,7 @@ from coreapp.views import ( compiler, + health, library, platform, preset, @@ -22,6 +23,7 @@ urlpatterns = [ *router.urls, path("compiler", compiler.CompilerDetail.as_view(), name="compiler"), + path("healthz", health.HealthCheck.as_view(), name="healthz"), path( "compiler//", compiler.SingleCompilerDetail.as_view(), diff --git a/backend/coreapp/views/health.py b/backend/coreapp/views/health.py new file mode 100644 index 000000000..d433a5f3a --- /dev/null +++ b/backend/coreapp/views/health.py @@ -0,0 +1,15 @@ +from django.db import connection +from rest_framework import status +from rest_framework.request import Request +from rest_framework.response import Response +from rest_framework.views import APIView + + +class HealthCheck(APIView): + def get(self, request: Request) -> Response: + try: + connection.ensure_connection() + except Exception: + return Response({"ok": False}, status=status.HTTP_503_SERVICE_UNAVAILABLE) + + return Response({"ok": True}) diff --git a/backend/docker_prod_entrypoint.sh b/backend/docker_prod_entrypoint.sh index 2eb4590a0..23be21800 100755 --- a/backend/docker_prod_entrypoint.sh +++ b/backend/docker_prod_entrypoint.sh @@ -13,8 +13,6 @@ until nc -z ${DB_HOST} ${DB_PORT} > /dev/null; do sleep 1 done -uv run /backend/manage.py migrate - if [ -z "$CI" ]; then uv run manage.py clearsessions uv run /backend/housekeeping.py diff --git a/deploy.py b/deploy.py new file mode 100644 index 000000000..16c8bf1d6 --- /dev/null +++ b/deploy.py @@ -0,0 +1,443 @@ +#!/usr/bin/env python3 +import argparse +import os +import re +import shlex +import subprocess +import sys +import time +from pathlib import Path + +DEPLOY_ENV = Path(".deploy.env") +UPSTREAM_CONF = Path("nginx/production/runtime/upstream.conf") + +DOCKER_COMPOSE = ["docker", "compose", "-f", "docker-compose.prod.yaml"] + 
# Names of the two deployment slots that traffic can be switched between.
SLOTS = {"blue", "green"}

# Keys used both in the .deploy.env state file and in the compose environment.
BLUE_TAG = "BLUE_TAG"
GREEN_TAG = "GREEN_TAG"
NGINX_TAG = "NGINX_TAG"
ACTIVE_SLOT = "ACTIVE_SLOT"

# ANSI colour escapes used when printing slot names to a terminal.
SLOT_COLORS = {
    "blue": "\033[34m",
    "green": "\033[32m",
}
RESET_COLOR = "\033[0m"


def run(cmd, *, env=None, check=True, capture=False, quiet=False):
    """Run *cmd* (a list of arguments) and return the CompletedProcess.

    The command is echoed shell-quoted with a leading ``+`` unless *quiet*
    is true. With *check* true a non-zero exit status raises
    ``subprocess.CalledProcessError``. With *capture* true stdout/stderr are
    captured as text on the returned object.
    """
    if not quiet:
        print("+", " ".join(shlex.quote(c) for c in cmd))
    return subprocess.run(
        cmd,
        env=env,
        check=check,
        text=True,
        capture_output=capture,
    )


def _unquote(value):
    """Strip one pair of matching surrounding quotes from *value*, if any."""
    if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
        return value[1:-1]
    return value


def read_env_file(path=None):
    """Parse the deploy state file into a dict of ``KEY -> value`` strings.

    *path* defaults to ``DEPLOY_ENV``. A missing file yields an empty dict.
    Blank lines, ``#`` comments and lines without ``=`` are ignored.
    Whitespace around keys and values is stripped, and one pair of matching
    surrounding quotes is removed from each value so that a hand-written
    ``NGINX_TAG="abc"`` produces ``abc`` rather than an invalid image tag.
    """
    env_path = DEPLOY_ENV if path is None else Path(path)
    try:
        text = env_path.read_text()
    except FileNotFoundError:
        return {}

    data = {}
    for raw in text.splitlines():
        line = raw.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        data[key.strip()] = _unquote(value.strip())
    return data


def write_env_file(data, path=None):
    """Write *data* to the deploy state file as ``KEY=value`` lines.

    *path* defaults to ``DEPLOY_ENV``. The well-known keys are written first
    in a fixed order, followed by any remaining keys sorted by name. The
    content is written to a sibling ``.tmp`` file and then renamed over the
    target so a crash cannot leave a half-written state file.
    """
    env_path = DEPLOY_ENV if path is None else Path(path)
    known = [ACTIVE_SLOT, BLUE_TAG, GREEN_TAG, NGINX_TAG]

    ordered = [key for key in known if key in data]
    ordered += [key for key in sorted(data) if key not in known]
    lines = [f"{key}={data[key]}" for key in ordered]

    tmp = env_path.with_suffix(".tmp")
    tmp.write_text("\n".join(lines) + "\n")
    tmp.replace(env_path)


def compose_env(state):
    """Return the environment for docker compose calls.

    Starts from a copy of ``os.environ``, overlays *state*, and fills in
    ``latest`` for any image tag key that is still unset.
    """
    env = os.environ.copy()
    env.update(state)
    for tag_key in (BLUE_TAG, GREEN_TAG, NGINX_TAG):
        env.setdefault(tag_key, "latest")
    return env


def other_slot(slot):
    """Return the slot opposite to *slot*.

    Raises ``SystemExit`` for anything that is not a known slot. Silently
    mapping an unknown value to ``blue`` would let a corrupted state file
    direct a deploy onto whichever slot happens to be serving traffic.
    """
    if slot not in SLOTS:
        raise SystemExit(f"Unknown slot: {slot}")
    return "green" if slot == "blue" else "blue"


def colour_slot(slot):
    """Return *slot* wrapped in its ANSI colour when stdout is a terminal.

    Values without a configured colour, and all values when stdout is not a
    TTY, are returned unchanged.
    """
    if not sys.stdout.isatty() or slot not in SLOT_COLORS:
        return slot
    return f"{SLOT_COLORS[slot]}{slot}{RESET_COLOR}"


def validate_tag(tag):
    """Exit with an error unless *tag* is an acceptable image tag.

    Accepts 6 to 128 characters drawn from letters, digits, ``_``, ``.`` and
    ``-``. The first character may not be ``.`` or ``-``, because Docker
    rejects tags that start with either.
    """
    if not re.fullmatch(r"[A-Za-z0-9_][A-Za-z0-9._-]{5,127}", tag):
        raise SystemExit(f"Invalid image tag: {tag}")


def write_upstream(slot):
    """Point the nginx upstream config at the backend/frontend of *slot*.

    Raises ``SystemExit`` for an unknown slot so a bad value can never be
    interpolated into the nginx configuration. The file is written to a
    temporary sibling and renamed into place so nginx never reads a
    partially written config.
    """
    if slot not in SLOTS:
        raise SystemExit(f"Unknown slot: {slot}")

    UPSTREAM_CONF.parent.mkdir(parents=True, exist_ok=True)
    tmp = UPSTREAM_CONF.with_suffix(".conf.tmp")
    tmp.write_text(
        f"""upstream backend_upstream {{
    server backend-{slot}:8000;
}}

upstream frontend_upstream {{
    server frontend-{slot}:8080;
}}
"""
    )
    tmp.replace(UPSTREAM_CONF)
+def switch_upstream(slot, env): + previous = UPSTREAM_CONF.read_text() if UPSTREAM_CONF.exists() else None + write_upstream(slot) + + try: + nginx_test_and_reload(env) + except Exception: + print("nginx reload failed; restoring previous upstream config...") + if previous is None: + UPSTREAM_CONF.unlink(missing_ok=True) + else: + UPSTREAM_CONF.write_text(previous) + + nginx_test_and_reload(env) + raise + + +def container_id(service, env): + result = run( + [*DOCKER_COMPOSE, "ps", "-q", service], + env=env, + capture=True, + ) + cid = result.stdout.strip() + if not cid: + raise SystemExit(f"No container found for service: {service}") + return cid + + +def health_status(service, env): + cid = container_id(service, env) + result = run( + ["docker", "inspect", "-f", "{{.State.Health.Status}}", cid], + env=env, + capture=True, + check=False, + ) + if result.returncode != 0: + return "unknown" + return result.stdout.strip() + + +def service_health_for_status(service, env): + result = run( + [*DOCKER_COMPOSE, "ps", "-q", service], + env=env, + capture=True, + check=False, + quiet=True, + ) + cid = result.stdout.strip() + if not cid: + return "missing" + + result = run( + [ + "docker", + "inspect", + "-f", + "{{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{else}}no-healthcheck{{end}}", + cid, + ], + env=env, + capture=True, + check=False, + quiet=True, + ) + if result.returncode != 0: + return "unknown" + return result.stdout.strip() + + +def wait_for_healthy(service, env, timeout=120): + print(f"Waiting for {service} to become healthy...") + deadline = time.time() + timeout + + while time.time() < deadline: + status = health_status(service, env) + print(f" {service}: {status}") + + if status == "healthy": + return + + if status == "unhealthy": + raise SystemExit(f"{service} became unhealthy") + + if status == "unknown": + # No healthcheck configured. Fall back to container running state. 
+ cid = container_id(service, env) + result = run( + ["docker", "inspect", "-f", "{{.State.Running}}", cid], + env=env, + capture=True, + ) + if result.stdout.strip() == "true": + print(f" {service}: no healthcheck configured, container is running") + return + + time.sleep(2) + + raise SystemExit(f"Timed out waiting for {service} to become healthy") + + +def nginx_fetch(url, env): + cmd = [ + *DOCKER_COMPOSE, + "exec", + "-T", + "nginx", + "wget", + "-q", + "--spider", + url, + ] + result = run(cmd, env=env, check=False) + if result.returncode == 0: + return + + raise SystemExit(f"nginx smoke test failed to reach {url}") + + +def smoke_test(slot, env): + print(f"Smoke testing {slot} from nginx...") + nginx_fetch(f"http://backend-{slot}:8000/api/healthz", env) + nginx_fetch(f"http://frontend-{slot}:8080/healthz", env) + + +def nginx_test_and_reload(env): + run([*DOCKER_COMPOSE, "exec", "-T", "nginx", "nginx", "-t"], env=env) + run([*DOCKER_COMPOSE, "exec", "-T", "nginx", "nginx", "-s", "reload"], env=env) + + +def print_status(state, env): + print("Deployment state:") + active = state.get(ACTIVE_SLOT, "unknown") + print(f" active slot: {colour_slot(active)}") + print(f" blue tag: {state.get(BLUE_TAG, 'unset')}") + print(f" green tag: {state.get(GREEN_TAG, 'unset')}") + print(f" nginx tag: {state.get(NGINX_TAG, 'latest')}") + print() + + print("Slot health:") + for slot in sorted(SLOTS): + coloured_slot = colour_slot(slot) + print( + f" backend-{coloured_slot}: " + f"{service_health_for_status(f'backend-{slot}', env)}" + ) + print( + f" frontend-{coloured_slot}: " + f"{service_health_for_status(f'frontend-{slot}', env)}" + ) + print() + + run([*DOCKER_COMPOSE, "ps"], env=env, check=False) + + +def cmd_status(args): + state = read_env_file() + env = compose_env(state) + print_status(state, env) + + +def cmd_deploy(args): + validate_tag(args.tag) + + state = read_env_file() + active = state.get(ACTIVE_SLOT, "blue") + + if args.slot == "auto": + slot = 
other_slot(active) + else: + slot = args.slot + + if slot not in SLOTS: + raise SystemExit("slot must be auto, blue, or green") + + if slot == active: + raise SystemExit(f"Refusing to deploy over active slot: {slot}") + + tag_key = f"{slot.upper()}_TAG" + state[tag_key] = args.tag + env = compose_env(state) + + print(f"Deploying tag {args.tag} to {slot}") + + run([*DOCKER_COMPOSE, "pull", f"backend-{slot}", f"frontend-{slot}"], env=env) + run( + [ + *DOCKER_COMPOSE, + "up", + "-d", + f"backend-{slot}", + f"frontend-{slot}", + ], + env=env, + ) + + wait_for_healthy(f"backend-{slot}", env) + wait_for_healthy(f"frontend-{slot}", env) + + smoke_test(slot, env) + + switch_upstream(slot, env) + + state[ACTIVE_SLOT] = slot + write_env_file(state) + + print() + print(f"Deploy complete: {slot} is active on {args.tag}") + print(f"Old slot left running for rollback/drain: {other_slot(slot)}") + print( + f"Stop old slot with: {' '.join(DOCKER_COMPOSE)} " + f"stop backend-{other_slot(slot)} frontend-{other_slot(slot)}" + ) + print() + print_status(state, env) + + +def cmd_rollback(args): + state = read_env_file() + active = state.get(ACTIVE_SLOT) + if active not in SLOTS: + raise SystemExit("Cannot rollback: ACTIVE_SLOT is missing or invalid") + + slot = other_slot(active) + env = compose_env(state) + + print(f"Rolling back from {active} to {slot}") + print("No images will be pulled; rollback uses the already-running previous slot.") + print() + + smoke_test(slot, env) + switch_upstream(slot, env) + + state[ACTIVE_SLOT] = slot + write_env_file(state) + + print() + print(f"Rollback complete: {slot} is active") + print() + print_status(state, env) + + +def cmd_migrate(args): + validate_tag(args.tag) + + slot = "blue" + state = read_env_file() + state[BLUE_TAG] = args.tag + state[ACTIVE_SLOT] = "blue" + env = compose_env(state) + + print("Maintenance migration deploy.") + print("This will stop app containers, run migrations, and restart on blue.") + print() + + 
run([*DOCKER_COMPOSE, "up", "-d", "postgres"], env=env) + + run( + [ + *DOCKER_COMPOSE, + "stop", + "backend-blue", + "frontend-blue", + "backend-green", + "frontend-green", + ], + env=env, + check=False, + ) + + run([*DOCKER_COMPOSE, "pull", "backend-blue", "frontend-blue"], env=env) + + run( + [ + *DOCKER_COMPOSE, + "run", + "--rm", + "--no-deps", + "--entrypoint", + "uv", + "backend-blue", + "run", + "manage.py", + "migrate", + "--noinput", + ], + env=env, + ) + + run([*DOCKER_COMPOSE, "up", "-d", "backend-blue", "frontend-blue"], env=env) + + wait_for_healthy("backend-blue", env) + wait_for_healthy("frontend-blue", env) + + smoke_test(slot, env) + + switch_upstream(slot, env) + + write_env_file(state) + + print() + print(f"Migration deploy complete: blue is active on {args.tag}") + print() + print_status(state, env) + + +def main(): + parser = argparse.ArgumentParser() + sub = parser.add_subparsers(dest="command", required=True) + + status = sub.add_parser("status") + status.set_defaults(func=cmd_status) + + rollback = sub.add_parser("rollback") + rollback.set_defaults(func=cmd_rollback) + + deploy = sub.add_parser("deploy") + deploy.add_argument("tag") + deploy.add_argument( + "slot", choices=["auto", "blue", "green"], nargs="?", default="auto" + ) + deploy.set_defaults(func=cmd_deploy) + + migrate = sub.add_parser("migrate") + migrate.add_argument("tag") + migrate.set_defaults(func=cmd_migrate) + + args = parser.parse_args() + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/docker-compose.prod.yaml b/docker-compose.prod.yaml index e8844870e..f63ab8c99 100644 --- a/docker-compose.prod.yaml +++ b/docker-compose.prod.yaml @@ -1,50 +1,74 @@ +x-backend-base: &backend-base + build: + context: backend + target: prod + cap_drop: + - all + cap_add: + - setuid + - setgid + - setfcap + env_file: + - docker.prod.env + restart: unless-stopped + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:8000/api/healthz"] + interval: 15s 
+ timeout: 3s + retries: 10 + start_period: 20s + security_opt: + - apparmor=unconfined + - seccomp=unconfined + volumes: + # persist compilers + libraries + - ./backend/compilers:/backend/compilers + - ./backend/libraries:/backend/libraries + # static files for django /admin control panel + - ./backend/static:/backend/static + tmpfs: + # Use a separate tmpfs to prevent a rogue jailed process + # from filling /tmp on the parent container + - /sandbox/tmp:exec,uid=1000,gid=1000,size=64M,mode=0700 + networks: + - decompme +x-frontend-base: &frontend-base + build: + context: frontend + target: prod + env_file: + - docker.prod.env + restart: unless-stopped + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:8080/healthz"] + interval: 15s + timeout: 3s + retries: 10 + start_period: 20s + networks: + - decompme + services: - backend: - build: - context: backend - target: prod - cap_drop: - - all - cap_add: - - setuid - - setgid - - setfcap - env_file: - - docker.prod.env - ports: - - "8000:8000" - restart: unless-stopped - security_opt: - - apparmor=unconfined - - seccomp=unconfined - volumes: - # persist compilers + libraries - - ./backend/compilers:/backend/compilers - - ./backend/libraries:/backend/libraries - # static files for django /admin control panel - - ./backend/static:/backend/static - tmpfs: - # Use a separate tmpfs to prevent a rogue jailed process - # from filling /tmp on the parent container - - /sandbox/tmp:exec,uid=1000,gid=1000,size=64M,mode=0700 - networks: - - decompme - # uncommment for local testing - # entrypoint: tail -f /dev/null + backend-blue: + <<: *backend-base + image: ghcr.io/decompme/decompme-backend:${BLUE_TAG:-latest} + frontend-blue: + <<: *frontend-base + image: ghcr.io/decompme/decompme-frontend:${BLUE_TAG:-latest} + environment: + INTERNAL_API_BASE: http://backend-blue:8000/api - frontend: - build: - context: frontend - target: prod - env_file: - - docker.prod.env - ports: - - "8080:8080" - restart: 
unless-stopped - networks: - - decompme + backend-green: + <<: *backend-base + image: ghcr.io/decompme/decompme-backend:${GREEN_TAG:-latest} + frontend-green: + <<: *frontend-base + image: ghcr.io/decompme/decompme-frontend:${GREEN_TAG:-latest} + environment: + INTERNAL_API_BASE: http://backend-green:8000/api nginx: + image: ghcr.io/decompme/decompme-nginx:${NGINX_TAG:-latest} build: context: nginx ports: @@ -53,8 +77,7 @@ services: restart: unless-stopped volumes: # repo files - - ./nginx/production.conf:/etc/nginx/conf.d/default.conf:ro - - ./nginx/geo.conf:/etc/nginx/conf.d/000_geo.conf:ro + - ./nginx/production/:/etc/nginx/conf.d/:ro - ./frontend/down.html:/var/www/decomp.me/down.html:ro # certbot - ./certbot:/var/www/certbot @@ -66,8 +89,8 @@ services: # TODO: mount static + public files from frontend # BOOTSTRAP: allow nginx to start before backend/frontend containers are up #extra_hosts: - # - "backend=172.17.0.1" # docker0 bridge - # - "frontend=172.17.0.1" # docker0 bridge + # - "backend-blue=172.17.0.1" # docker0 bridge + # - "frontend-blue=172.17.0.1" # docker0 bridge networks: - decompme diff --git a/docker-compose.yaml b/docker-compose.yaml index 00589bcbb..29a9dbc6a 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -35,6 +35,8 @@ services: # Use a separate tmpfs to prevent a rogue jailed process # from filling /tmp on the parent container - /sandbox/tmp:exec,uid=1000,gid=1000,size=64M,mode=0700 + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:8000/api/healthz"] frontend: build: context: frontend @@ -53,8 +55,7 @@ services: ports: - "80:80" volumes: - - ./nginx/development.conf:/etc/nginx/conf.d/default.conf:ro - - ./nginx/geo.conf:/etc/nginx/conf.d/geo.conf:ro + - ./nginx/development/:/etc/nginx/conf.d/:ro - ./frontend/down.html:/var/www/down.html:ro - ./backend/media:/media - ./backend/static:/var/www/static diff --git a/docs/DOCKER.md b/docs/DOCKER.md index 5c9185c3f..94fca1947 100644 --- a/docs/DOCKER.md +++ 
b/docs/DOCKER.md @@ -9,39 +9,37 @@ You will need [Docker](https://docs.docker.com/get-docker/) and [Docker Compose] ## Production -0. Create a `docker.prod.env` and set the necessary configuration options (see .env for inspiration). +Production uses `docker-compose.prod.yaml` with blue/green backend and frontend slots. See [PRODUCTION.md](PRODUCTION.md) for the deployment runbook. + +Create a `docker.prod.env` and set the necessary configuration options. ```bash nano docker.prod.env ``` -1. Bring up postgres & nginx containers +Bring up the shared production services. ```bash -docker compose -f docker-compose.prod.yaml up -d postgres nginx +docker compose -f docker-compose.prod.yaml up -d postgres nginx certbot ``` -2. Build and bring up backend +Deploy an app image tag with the blue/green deploy script. ```bash -docker compose -f docker-compose.prod.yaml build backend -docker compose -f docker-compose.prod.yaml up -d backend +python3 deploy.py deploy githash ``` -3. Build and bring up frontend (relies on backend for SSR) +Use the migration flow for deploys that require database maintenance. ```bash -# NOTE: this can be overridden if needed, i.e. --build-arg INTERNAL_API_BASE=https://decomp.me/api -docker compose -f docker-compose.prod.yaml build frontend -docker compose -f docker-compose.prod.yaml up -d frontend +python3 deploy.py migrate githash ``` - ### SSL Certificates Bootstrap In order to bring up nginx we need to have SSL certificates. In order to do that we need to get nginx to run only on port 80, then run certbot to fetch the certs. -1. Modify the `nginx/production.conf` to comment out the *whole* `server { listen 443 ssl http2; ... }` block. +1. Modify `nginx/production/default.conf` to comment out the whole HTTPS server block between `{{HTTPS_SERVER_BLOCK_START}}` and `{{HTTPS_SERVER_BLOCK_END}}`. 2. 
Bring up nginx diff --git a/docs/PRODUCTION.md b/docs/PRODUCTION.md new file mode 100644 index 000000000..05d10ca10 --- /dev/null +++ b/docs/PRODUCTION.md @@ -0,0 +1,64 @@ +# Production + +## Prerequisites + +Create `.deploy.env` with the desired `NGINX_TAG`. + +```bash +cat <<EOF > .deploy.env +NGINX_TAG=8ca8d5b59b50 +EOF +``` + +Create the runtime nginx config files. + +```bash +cp ./nginx/production/runtime/geo.conf.example ./nginx/production/runtime/geo.conf +cp ./nginx/production/runtime/upstream.conf.example ./nginx/production/runtime/upstream.conf +``` + +Start the shared production services. + +```bash +docker compose -f docker-compose.prod.yaml --env-file .deploy.env up -d postgres nginx certbot +``` + +## Blue/Green deployment + +We support blue/green deployments when running decomp.me in production. This allows us to release the majority of our changes with zero downtime. + +`deploy.py` deploys the requested image tag to the inactive slot, waits for the backend and frontend containers to become healthy, smoke-tests the inactive slot from nginx, then reloads nginx to switch traffic. + +### Standard deployments + +```bash +python3 deploy.py deploy githash +``` + +The old slot is left running after a successful deploy so rollback remains quick. + +### Rollback + +```bash +python3 deploy.py rollback +``` + +### Migrations + +Schema-changing deploys may require maintenance time. The migration flow stops both app slots, runs migrations using the new backend image, starts `blue`, then points nginx at `blue`. + +```bash +python3 deploy.py migrate githash +``` + +### Status + +```bash +python3 deploy.py status +``` + +## Health checks + +Production backend containers use `GET /api/healthz`. This verifies Django can connect to the database without doing user-facing work. + +Production frontend containers use `GET /healthz`. This verifies the Next.js server is responding without rendering the homepage. 
diff --git a/frontend/src/app/healthz/route.ts b/frontend/src/app/healthz/route.ts new file mode 100644 index 000000000..d6a81eb5e --- /dev/null +++ b/frontend/src/app/healthz/route.ts @@ -0,0 +1,12 @@ +export const dynamic = "force-dynamic"; + +export function GET() { + return Response.json( + { ok: true }, + { + headers: { + "Cache-Control": "no-store", + }, + }, + ); +} diff --git a/nginx/development.conf b/nginx/development/default.conf similarity index 95% rename from nginx/development.conf rename to nginx/development/default.conf index 46c7188b9..764e0db86 100644 --- a/nginx/development.conf +++ b/nginx/development/default.conf @@ -1,5 +1,3 @@ -include /etc/nginx/conf.d/geo.conf; - server { listen 80; listen [::]:80; @@ -10,10 +8,6 @@ server { server_name decomp.local www.decomp.local; - if ($is_denied) { - return 403; - } - location / { try_files $uri @proxy_frontend; } diff --git a/nginx/production.conf b/nginx/production/default.conf similarity index 95% rename from nginx/production.conf rename to nginx/production/default.conf index bc60b51a1..29734d83d 100644 --- a/nginx/production.conf +++ b/nginx/production/default.conf @@ -1,3 +1,6 @@ +include /etc/nginx/conf.d/runtime/geo.conf; +include /etc/nginx/conf.d/runtime/upstream.conf; + server { listen 80; server_name decomp.me www.decomp.me; @@ -113,7 +116,7 @@ server { proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; proxy_set_header Host $http_host; proxy_redirect off; - proxy_pass http://backend:8000; + proxy_pass http://backend_upstream; } location @proxy_frontend { @@ -127,7 +130,7 @@ server { proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; proxy_set_header Host $http_host; proxy_redirect off; - proxy_pass http://frontend:8080; + proxy_pass http://frontend_upstream; } # Avoid returning HTML from the /api endpoint if backend is unavailable diff --git a/nginx/geo.conf b/nginx/production/runtime/geo.conf.example similarity index 100% rename from nginx/geo.conf rename to 
nginx/production/runtime/geo.conf.example diff --git a/nginx/production/runtime/upstream.conf.example b/nginx/production/runtime/upstream.conf.example new file mode 100644 index 000000000..1952d9c2f --- /dev/null +++ b/nginx/production/runtime/upstream.conf.example @@ -0,0 +1,6 @@ +upstream backend_upstream { + server backend-blue:8000; +} +upstream frontend_upstream { + server frontend-blue:8080; +}