diff --git a/common b/common index decb0e19107..60c9241eec1 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit decb0e1910786dcb7ca2b07b7e83608819593239 +Subproject commit 60c9241eec1f0bdbc0a1f53c5dfc8a5b2852df6e diff --git a/readthedocs/builds/migrations/0073_build_task_arn.py b/readthedocs/builds/migrations/0073_build_task_arn.py new file mode 100644 index 00000000000..ba384f60f91 --- /dev/null +++ b/readthedocs/builds/migrations/0073_build_task_arn.py @@ -0,0 +1,27 @@ +# Generated by Django 5.2.13 on 2026-06-03 09:32 + +from django.db import migrations +from django.db import models +from django_safemigrate import Safe + + +class Migration(migrations.Migration): + safe = Safe.before_deploy() + + dependencies = [ + ("builds", "0072_remove_deprecated_build_fields"), + ] + + operations = [ + migrations.AddField( + model_name="build", + name="task_arn", + field=models.CharField( + blank=True, + help_text="ECS task ARN for builds dispatched via Fargate. Set by ``submit_build_to_ecs``; consumed by ``cancel_build`` to call ``ecs:StopTask``. Mutually exclusive with ``task_id`` (legacy Celery path).", + max_length=255, + null=True, + verbose_name="ECS task ARN", + ), + ), + ] diff --git a/readthedocs/builds/models.py b/readthedocs/builds/models.py index d5387617cb7..7aaf1e57103 100644 --- a/readthedocs/builds/models.py +++ b/readthedocs/builds/models.py @@ -819,6 +819,18 @@ class Build(models.Model): null=True, blank=True, ) + task_arn = models.CharField( + _("ECS task ARN"), + max_length=255, + null=True, + blank=True, + help_text=_( + "ECS task ARN for builds dispatched via Fargate. " + "Set by ``submit_build_to_ecs``; consumed by ``cancel_build`` " + "to call ``ecs:StopTask``. Mutually exclusive with ``task_id`` " + "(legacy Celery path)." + ), + ) task_executed_at = models.DateTimeField( _("Task executed at datetime"), null=True, diff --git a/readthedocs/core/utils/__init__.py b/readthedocs/core/utils/__init__.py index 2faf183107d..42c3c1c55d0 100644 --- a/readthedocs/core/utils/__init__.py +++ b/readthedocs/core/utils/__init__.py @@ -200,12 +200,22 @@ def trigger_build(project, version=None, commit=None, from_webhook=False): Helper that calls ``prepare_build`` and just effectively trigger the Celery task to be executed by a worker. + When the project has ``Feature.USE_FARGATE_BUILDER`` enabled, dispatches + to the new ``submit_build_to_ecs`` bootstrap task (which clones the + config, resolves ``build.os``, and runs the build inside a Fargate task) + instead of the legacy ``update_docs_task``. See + ``readthedocs-builder/docs/architecture.md`` for the broader design. + :param project: project's documentation to be built :param version: version of the project to be built. Default: ``latest`` :param commit: commit sha of the version required for sending build status reports :returns: Celery AsyncResult promise and Build instance :rtype: tuple """ + # Avoid circular import. + from readthedocs.projects.models import Feature + from readthedocs.projects.tasks.fargate import submit_build_to_ecs + structlog.contextvars.bind_contextvars( project_slug=project.slug, version_slug=version.slug if version else None, @@ -229,6 +239,16 @@ def trigger_build(project, version=None, commit=None, from_webhook=False): # Build was skipped return (None, None) + # Feature-flag dispatch: Fargate path vs legacy Celery path. + if project.has_feature(Feature.USE_FARGATE_BUILDER): + log.info("Dispatching build via submit_build_to_ecs (Fargate path).") + task = submit_build_to_ecs.delay(build_pk=build.pk) + # The Build's ECS task_arn is populated by submit_build_to_ecs once + # ``ecs:RunTask`` returns; we don't write task_id here (that's the + # legacy-Celery-path's cancellation handle). cancel_build branches + # on which one is set. + return task, build + task = update_docs_task.apply_async() # FIXME: I'm using `isinstance` here because I wasn't able to mock this @@ -255,6 +275,14 @@ def cancel_build(build): - Running: Communicate Celery to force the termination of the current build and rely on the worker to update the build's status. + + Routing during the Fargate rollout: branches on which task identifier + is set on the build. ``Build.task_arn`` (Fargate path) takes + precedence — if the build was dispatched to ECS, we call + ``ecs:StopTask``. Otherwise we fall back to the legacy Celery revoke. + The branch is on the build's *actual* state (which dispatcher ran), + not the project's current feature flag, so an in-flight build that + started before the flag was flipped still cancels correctly. """ # NOTE: `terminate=True` is required for the child to attend our call # immediately when it's running the build. Otherwise, it finishes the @@ -287,9 +315,66 @@ def cancel_build(build): version_slug=build.version.slug, build_id=build.pk, build_task_id=build.task_id, + build_task_arn=build.task_arn, terminate=terminate, ) - app.control.revoke(build.task_id, signal="SIGINT", terminate=terminate) + + if build.task_arn: + # Fargate / local-docker path. ``task_arn`` is a container id + # under docker-compose dev and a real ECS task ARN in + # production; we branch on ``settings.RTD_DOCKER_COMPOSE``. + if settings.RTD_DOCKER_COMPOSE: + import docker + + try: + # SIGTERM, not the default SIGKILL: the runner installs a + # SIGTERM handler that raises ``BuildCancelled`` so its + # lifecycle's ``try/except/finally`` runs (attaches the + # ``CANCELLED_BY_USER`` notification + PATCHes the build + # to ``finished`` with ``success=False``) before the + # process exits. SIGKILL would skip all of that and + # leave the build stuck mid-state. Matches ECS StopTask + # semantics in production. + docker.from_env().containers.get(build.task_arn).kill(signal="SIGTERM") + except Exception: + log.exception( + "docker kill failed.", + container_id=build.task_arn, + ) + return + + # The runner inside the ECS task catches SIGTERM via its + # lifecycle's try/finally and finalizes the build. If the task + # has already exited (race), StopTask returns a benign error + # which we log but don't propagate. + import boto3 + + try: + boto3.client("ecs", region_name=settings.RTD_ECS_REGION or None).stop_task( + cluster=settings.RTD_ECS_CLUSTER, + task=build.task_arn, + reason="cancelled by user", + ) + except Exception: + log.exception( + "ecs:StopTask failed.", + task_arn=build.task_arn, + ) + return + + if build.task_id: + # Legacy Celery path. + app.control.revoke(build.task_id, signal="SIGINT", terminate=terminate) + return + + # Neither path has a handle yet. This happens when the Fargate + # bootstrap task hasn't run / hasn't called ecs:RunTask. The + # state-update above is enough — submit_build_to_ecs checks the + # build state at startup and bails out when it's CANCELLED. + log.info( + "No task handle on the build; relying on submit_build_to_ecs " + "to bail out when it observes BUILD_STATE_CANCELLED.", + ) def send_email_from_object(email: EmailMultiAlternatives | EmailMessage): diff --git a/readthedocs/projects/apps.py b/readthedocs/projects/apps.py index 3ea05df8157..6b2c698403a 100644 --- a/readthedocs/projects/apps.py +++ b/readthedocs/projects/apps.py @@ -15,6 +15,7 @@ def ready(self): import readthedocs.projects.notifications # noqa import readthedocs.projects.signals # noqa import readthedocs.projects.tasks.builds # noqa + import readthedocs.projects.tasks.fargate # noqa import readthedocs.projects.tasks.search # noqa import readthedocs.projects.tasks.utils # noqa diff --git a/readthedocs/projects/migrations/0165_historicalproject_container_cpu_limit_and_more.py b/readthedocs/projects/migrations/0165_historicalproject_container_cpu_limit_and_more.py new file mode 100644 index 00000000000..b9c32c4d39f --- /dev/null +++ b/readthedocs/projects/migrations/0165_historicalproject_container_cpu_limit_and_more.py @@ -0,0 +1,36 @@ +# Generated by Django 5.2.13 on 2026-06-03 09:32 + +from django.db import migrations +from django.db import models +from django_safemigrate import Safe + + +class Migration(migrations.Migration): + safe = Safe.before_deploy() + + dependencies = [ + ("projects", "0164_show_build_overview_in_comment_default_true"), + ] + + operations = [ + migrations.AddField( + model_name="historicalproject", + name="container_cpu_limit", + field=models.PositiveIntegerField( + blank=True, + help_text="Fargate CPU units (1024 = 1 vCPU). Defaults to the system-wide default (2048 = 2 vCPU) when unset. Must pair with ``container_mem_limit`` to a supported Fargate CPU/memory combination.", + null=True, + verbose_name="Container CPU limit in Fargate CPU units", + ), + ), + migrations.AddField( + model_name="project", + name="container_cpu_limit", + field=models.PositiveIntegerField( + blank=True, + help_text="Fargate CPU units (1024 = 1 vCPU). Defaults to the system-wide default (2048 = 2 vCPU) when unset. Must pair with ``container_mem_limit`` to a supported Fargate CPU/memory combination.", + null=True, + verbose_name="Container CPU limit in Fargate CPU units", + ), + ), + ] diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index 229537cebee..1063c622d74 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -492,6 +492,17 @@ class Project(models.Model): null=True, blank=True, ) + container_cpu_limit = models.PositiveIntegerField( + _("Container CPU limit in Fargate CPU units"), + null=True, + blank=True, + help_text=_( + "Fargate CPU units (1024 = 1 vCPU). Defaults to the " + "system-wide default (2048 = 2 vCPU) when unset. Must " + "pair with ``container_mem_limit`` to a supported Fargate " + "CPU/memory combination." + ), + ) build_queue = models.CharField( _("Alternate build queue id"), max_length=32, @@ -2086,6 +2097,7 @@ def add_features(sender, **kwargs): BUILD_NO_ACKS_LATE = "build_no_acks_late" BUILD_IN_PARALLEL = "build_in_parallel" USE_GVISOR_RUNTIME = "use_gvisor_runtime" + USE_FARGATE_BUILDER = "use_fargate_builder" FEATURES = ( ( @@ -2155,6 +2167,14 @@ def add_features(sender, **kwargs): USE_GVISOR_RUNTIME, _("Build: Run build containers under the gVisor (runsc) runtime."), ), + ( + USE_FARGATE_BUILDER, + _( + "Build: Dispatch this project's builds to AWS Fargate via " + "``submit_build_to_ecs`` instead of the legacy ``update_docs_task`` " + "Celery worker pool." + ), + ), ) FEATURES = sorted(FEATURES, key=lambda x: x[1]) diff --git a/readthedocs/projects/tasks/fargate.py b/readthedocs/projects/tasks/fargate.py new file mode 100644 index 00000000000..aa88c9fe8bf --- /dev/null +++ b/readthedocs/projects/tasks/fargate.py @@ -0,0 +1,618 @@ +""" +Bootstrap task that dispatches a build to AWS Fargate. + +When a project has ``Feature.USE_FARGATE_BUILDER`` enabled, ``trigger_build`` +enqueues :func:`submit_build_to_ecs` here instead of the legacy +``update_docs_task``. This task does the minimal work needed *before* the +Fargate task can run: + +1. Sparse-clones just the ``.readthedocs.yaml`` to learn ``build.os``. +2. Resolves ``build.os`` to an ECS task definition name and snaps the + project's per-build resource limits to a Fargate-supported CPU/memory pair. +3. Mints a per-build API key. +4. Calls ``ecs:RunTask`` with the right image + command + env. +5. Stores the returned ECS task ARN on ``Build.task_arn`` so + ``cancel_build`` can later call ``ecs:StopTask``. + +The full Fargate build itself (clone, install, build, upload, finalize) runs +inside the ``readthedocs/builder:`` container — see the +``readthedocs-builder`` repository for the runner. + +See ``readthedocs-builder/docs/architecture.md`` for the broader design. +""" + +import os +import re +import shutil +import subprocess +import tempfile +from urllib.parse import urlparse + +import boto3 +import structlog +import yaml +from django.conf import settings + +from readthedocs.api.v2.models import BuildAPIKey +from readthedocs.builds.constants import BUILD_STATE_CANCELLED +from readthedocs.builds.constants import BUILD_STATE_FINISHED +from readthedocs.builds.models import Build +from readthedocs.doc_builder.exceptions import BuildAppError +from readthedocs.doc_builder.exceptions import BuildUserError +from readthedocs.notifications.models import Notification +from readthedocs.projects.models import Feature +from readthedocs.worker import app + + +log = structlog.get_logger(__name__) + + +# Candidate paths the runner accepts as the project's config file. +# Mirrors the four-pattern sparse-checkout regex elsewhere in the codebase. +_CONFIG_FILENAMES = ( + ".readthedocs.yaml", + ".readthedocs.yml", + "readthedocs.yaml", + "readthedocs.yml", +) + + +# Fargate's supported task-level CPU/memory matrix. Memory values are in MiB. +# Source: AWS docs — "Task CPU and memory" under Fargate task definitions. +_FARGATE_CPU_MEMORY_MATRIX = { + 256: [512, 1024, 2048], + 512: [1024, 2048, 3072, 4096], + 1024: [2048, 3072, 4096, 5120, 6144, 7168, 8192], + 2048: list(range(4096, 16384 + 1, 1024)), + 4096: list(range(8192, 30720 + 1, 1024)), + 8192: list(range(16384, 61440 + 1, 4096)), + 16384: list(range(32768, 122880 + 1, 8192)), +} +_FARGATE_CPU_VALUES = sorted(_FARGATE_CPU_MEMORY_MATRIX.keys()) + + +# ---- Helpers ---- + + +def _sparse_clone_yaml(repo_url, ref, clone_token, dest): + """ + Clone just the ``.readthedocs.yaml`` from a remote repo into ``dest``. + + Uses ``--filter=blob:none --no-checkout`` so only commit / tree metadata + is downloaded, then ``sparse-checkout`` to pull just the config file. + Returns the absolute path to the downloaded config file, or ``None`` if + none of the candidate filenames were present. + + HTTPS auth: ``clone_token`` is injected into the URL when non-empty. + SSH auth: not supported by this bootstrap path; SSH-hosted projects need + to use the legacy path until we surface the deploy key here. + """ + if not repo_url: + raise BuildUserError(message_id=BuildUserError.GENERIC) + + if repo_url.startswith("git@"): + # SSH clone needs a deploy key we don't have access to here. + raise BuildAppError( + BuildAppError.GENERIC_WITH_BUILD_ID, + exception_message=( + f"Fargate bootstrap doesn't support SSH clone URLs yet; project repo: {repo_url}" + ), + ) + + auth_url = repo_url + if clone_token and repo_url.startswith(("https://", "http://")): + parsed = urlparse(repo_url) + # x-access-token@host pattern is what GitHub/GitLab tokens expect. + auth_url = f"{parsed.scheme}://{clone_token}@{parsed.netloc}{parsed.path}" + + # TODO: consider if we want to log these commands here. + subprocess.run( + [ + "git", + "clone", + "--filter=blob:none", + "--no-checkout", + "--depth=1", + "-b", + ref, + auth_url, + dest, + ], + check=True, + capture_output=True, + ) + subprocess.run( + ["git", "-C", dest, "sparse-checkout", "init", "--no-cone"], + check=True, + capture_output=True, + ) + subprocess.run( + ["git", "-C", dest, "sparse-checkout", "set", *_CONFIG_FILENAMES], + check=True, + capture_output=True, + ) + subprocess.run( + ["git", "-C", dest, "checkout"], + check=True, + capture_output=True, + ) + + for name in _CONFIG_FILENAMES: + candidate = os.path.join(dest, name) + if os.path.isfile(candidate): + return candidate + return None + + +def _read_build_os(config_path): + """ + Parse ``.readthedocs.yaml`` and return the ``build.os`` value. + + Resolves the ``ubuntu-lts-latest`` alias via + ``settings.RTD_DOCKER_BUILD_SETTINGS`` so the rest of the pipeline only + ever sees a concrete OS tag. + """ + with open(config_path) as fh: + config = yaml.safe_load(fh) + + if not isinstance(config, dict): + raise BuildUserError(BuildUserError.NO_CONFIG_FILE_DEPRECATED) + + build_os = (config.get("build") or {}).get("os") + if not build_os: + raise BuildUserError(BuildUserError.BUILD_OS_REQUIRED) + + if build_os == "ubuntu-lts-latest": + alias = settings.RTD_DOCKER_BUILD_SETTINGS["os"].get("ubuntu-lts-latest", "") + if ":" in alias: + build_os = alias.split(":", 1)[1] + + return build_os + + +def _parse_mem_limit_mb(value): + """ + Coerce a project ``container_mem_limit`` value into MiB. + + Accepts the historical Docker formats (``"512m"``, ``"8g"``) for + compatibility with existing rows, plus plain integers / int-strings + (interpreted as MiB). + """ + if value is None or value == "": + return None + if isinstance(value, int): + return value + + match = re.fullmatch(r"\s*(\d+)\s*([mMgG]?)\s*", str(value)) + if not match: + return None + n = int(match.group(1)) + unit = match.group(2).lower() + if unit == "g": + return n * 1024 + # Default + 'm' suffix: already MiB. + return n + + +def _snap_to_fargate_pair(cpu, memory): + """ + Round ``(cpu, memory)`` up to the smallest Fargate-supported pair. + + Returns ``(cpu, memory)`` integers. Caps at the largest supported pair + so a misconfigured project can't accidentally request unbounded compute. + """ + cpu = next((c for c in _FARGATE_CPU_VALUES if c >= cpu), _FARGATE_CPU_VALUES[-1]) + allowed = _FARGATE_CPU_MEMORY_MATRIX[cpu] + memory = next((m for m in allowed if m >= memory), allowed[-1]) + return cpu, memory + + +def _resolve_fargate_resources(project): + """ + Resolve the per-build CPU / memory / time-limit for ``project``. + + Layers: + 1. Project field, or settings default. + 2. Capped at ``settings.RTD_BUILD_MAX_*``. + 3. CPU+memory snapped to a valid Fargate pair (CPU-first wins). + + Returns ``(cpu, memory_mib, time_limit_seconds)``. + """ + raw_cpu = project.container_cpu_limit or settings.RTD_BUILD_DEFAULT_CPU + raw_mem = _parse_mem_limit_mb(project.container_mem_limit) or settings.RTD_BUILD_DEFAULT_MEMORY + raw_time = project.container_time_limit or settings.RTD_BUILD_DEFAULT_TIME_LIMIT + + cpu = min(raw_cpu, settings.RTD_BUILD_MAX_CPU) + memory = min(raw_mem, settings.RTD_BUILD_MAX_MEMORY) + time_limit = min(raw_time, settings.RTD_BUILD_MAX_TIME_LIMIT) + + cpu, memory = _snap_to_fargate_pair(cpu, memory) + return cpu, memory, time_limit + + +def _dispatch_build_task(*, build_pk, build_os, cpu, memory, environment, command): + """ + Dispatch a build to either Fargate (prod) or local Docker (dev). + + Branches on ``settings.RTD_DOCKER_COMPOSE``: docker-compose dev runs + the build in a sibling container on the host's docker daemon via + docker-py; production hits ``ecs:RunTask``. Returns the task + identifier stored on ``Build.task_arn`` — a container id under + docker-compose, a real ECS task ARN in production. ``cancel_build`` + branches on the same setting to interpret it. + """ + if settings.RTD_DOCKER_COMPOSE: + return _docker_run_task( + build_pk=build_pk, + cpu=cpu, + memory=memory, + environment=environment, + command=command, + ) + return _ecs_run_task( + build_os=build_os, + cpu=cpu, + memory=memory, + environment=environment, + command=command, + ) + + +def _docker_run_task(*, build_pk, cpu, memory, environment, command): + """ + Spawn a builder container via the host's docker daemon (dev only). + + Returns the resulting container id (stored on ``Build.task_arn``; + ``cancel_build`` knows whether to interpret it as a container id or + an ECS ARN based on ``settings.RTD_DOCKER_COMPOSE``). + + The container shares the docker-compose network (so it can reach + ``web``, ``storage``, etc.) and gets the same resource constraints + Fargate would apply (cpus + memory). When + ``settings.RTD_PATH_BUILDER`` is set, the host-side + readthedocs-builder checkout is bind-mounted at ``/opt/builder`` so + the entrypoint skips the GitHub clone — matches the + ``dev-run.sh`` iteration loop. + + Requires the celery container to have ``/var/run/docker.sock`` + bind-mounted (docker-out-of-docker). + """ + # Import lazily so the prod settings don't need the ``docker`` package + # available at import time. (It already is, via the legacy + # DockerBuildEnvironment, but lazy keeps the dep graph clean.) + import docker + + client = docker.from_env() + # TODO: in production this is derived from build.os and points to a + # readthedocs/builder: image. For local dev we currently use a + # single ``builder-dev:latest`` image regardless of build.os because + # we don't have the OS image matrix yet. Match production once the + # matrix exists. + image = settings.RTD_LOCAL_BUILDER_IMAGE + + volumes = {} + if settings.RTD_PATH_BUILDER: + # The path here is resolved by the *host* docker daemon (we're + # talking to it via the bind-mounted socket), so + # ``RTD_PATH_BUILDER`` must be a host-side absolute path — + # not a path inside the celery container. + volumes[settings.RTD_PATH_BUILDER] = { + "bind": "/opt/builder", + "mode": "ro", + } + # ``entrypoint.sh`` is COPYed into the image at ``/opt/entrypoint.sh`` + # (see ``readthedocs-builder/Dockerfile``) — outside the + # ``/opt/builder`` bind-mount above, so edits on the host don't + # take effect without a full image rebuild. Bind-mount the host + # copy on top so dev iterations on the entrypoint (signal + # handling, watchdog, etc.) are live. + volumes[os.path.join(settings.RTD_PATH_BUILDER, "scripts/entrypoint.sh")] = { + "bind": "/opt/entrypoint.sh", + "mode": "ro", + } + + # Stable container name so the user can ``docker logs build-`` + # without having to look up the random id. If a stopped container + # from a previous run of the same pk is still around, remove it + # first so the name is free. + container_name = f"build-{build_pk}" + try: + existing = client.containers.get(container_name) + except docker.errors.NotFound: + pass + else: + log.info("Removing stale container.", container_name=container_name) + existing.remove(force=True) + + try: + container = client.containers.run( + image=image, + name=container_name, + command=list(command), + environment={k: str(v) for k, v in environment.items()}, + # Fargate CPU units (1024 = 1 vCPU) -> Docker nano_cpus (1e9 = 1 vCPU). + nano_cpus=int(cpu * 1_000_000_000 // 1024), + mem_limit=f"{memory}m", + network=settings.RTD_DOCKER_COMPOSE_NETWORK, + volumes=volumes, + detach=True, + # Keep the container around after exit in dev so its logs + # remain inspectable via ``docker logs build-``. Prune + # accumulated stopped containers periodically with + # ``docker container prune``. Production (``_ecs_run_task``) + # doesn't hit this path — CloudWatch handles log retention. + auto_remove=False, + ) + except Exception as exc: + raise BuildAppError( + BuildAppError.GENERIC_WITH_BUILD_ID, + exception_message=f"docker run failed: {exc}", + ) from exc + + log.info( + "Dispatched build to local Docker.", + image=image, + container_id=container.id, + nano_cpus=int(cpu * 1_000_000_000 // 1024), + mem_limit=f"{memory}m", + network=settings.RTD_DOCKER_COMPOSE_NETWORK, + bind_mount=bool(volumes), + ) + return container.id + + +def _ecs_run_task(*, build_os, cpu, memory, environment, command): + """ + Call ``ecs:RunTask`` and return the resulting task ARN. + + Uses Fargate Spot as the primary capacity (with Fargate on-demand as + fallback could be configured at the cluster level via a default + capacity provider strategy; here we always request Spot first). + + Raises :class:`BuildAppError` on any AWS error so the failure flows + through the caller's exception handling. + """ + client = boto3.client("ecs", region_name=settings.RTD_ECS_REGION or None) + task_definition = settings.RTD_ECS_TASK_DEFINITION_FORMAT.format(build_os=build_os) + + try: + response = client.run_task( + cluster=settings.RTD_ECS_CLUSTER, + taskDefinition=task_definition, + capacityProviderStrategy=[ + {"capacityProvider": "FARGATE_SPOT", "weight": 1}, + ], + count=1, + overrides={ + "cpu": str(cpu), + "memory": str(memory), + "containerOverrides": [ + { + "name": "builder", + "command": list(command), + "environment": [ + {"name": k, "value": str(v)} for k, v in environment.items() + ], + }, + ], + }, + networkConfiguration={ + "awsvpcConfiguration": { + "subnets": list(settings.RTD_ECS_SUBNETS), + "securityGroups": list(settings.RTD_ECS_SECURITY_GROUPS), + "assignPublicIp": settings.RTD_ECS_ASSIGN_PUBLIC_IP, + }, + }, + ) + except Exception as exc: + raise BuildAppError( + BuildAppError.GENERIC_WITH_BUILD_ID, + exception_message=f"ecs:RunTask failed: {exc}", + ) from exc + + tasks = response.get("tasks") or [] + failures = response.get("failures") or [] + if not tasks: + raise BuildAppError( + BuildAppError.GENERIC_WITH_BUILD_ID, + exception_message=f"ecs:RunTask returned no tasks; failures={failures}", + ) + + return tasks[0]["taskArn"] + + +# ---- The bootstrap task ---- + + +def _fail_build(build, exc): + """ + Finalize a build that failed *before* the runner container started. + + The runner's own try/except (``builder.runner.Runner.run``) only + catches exceptions raised inside the build container. Anything raised + by ``submit_build_to_ecs`` itself (missing config file, malformed + YAML, ECS RunTask failure, etc.) never reaches that handler, so the + build would be left stuck in ``triggered`` state with no + user-facing explanation. + + Mirrors what the runner does on failure: attach a notification + derived from the exception's ``message_id`` / ``format_values``, then + PATCH the build to ``finished`` with ``success=False``. + """ + fallback = ( + BuildUserError.GENERIC + if isinstance(exc, BuildUserError) + else BuildAppError.GENERIC_WITH_BUILD_ID + ) + message_id = getattr(exc, "message_id", None) or fallback + format_values = getattr(exc, "format_values", None) or {} + + log.error( + "Failing build at bootstrap.", + exception_type=type(exc).__name__, + message_id=message_id, + format_values=format_values, + ) + + Notification.objects.add( + message_id=message_id, + attached_to=build, + format_values=format_values, + dismissable=False, + ) + + build.state = BUILD_STATE_FINISHED + build.success = False + build.length = 0 + build.save(update_fields=["state", "success", "length"]) + + +@app.task(bind=True, max_retries=3, default_retry_delay=30, queue="web") +def submit_build_to_ecs(self, build_pk): + """ + Dispatch a build to AWS Fargate. + + Replaces ``update_docs_task.delay`` for projects with + ``Feature.USE_FARGATE_BUILDER`` enabled. See module docstring for the + full flow. + """ + build = Build.objects.select_related("version__project").get(pk=build_pk) + version = build.version + project = version.project + + # The build was cancelled (e.g. via ``cancel_build`` while we were + # waiting in the Celery queue) before we got a chance to dispatch. + # Bail out without minting an API key or hitting ECS — the Build + # already reflects ``state=cancelled``. + if build.state == BUILD_STATE_CANCELLED: + log.info( + "Build was cancelled before Fargate dispatch; skipping.", + build_id=build.pk, + project_slug=project.slug, + ) + return + + structlog.contextvars.bind_contextvars( + build_id=build.pk, + project_slug=project.slug, + version_slug=version.slug, + ) + + try: + _submit_build_to_ecs(build, version, project) + except (BuildUserError, BuildAppError) as exc: + # Failures *before* the build container starts never reach the + # runner's own try/except. Finalize the build at the API layer + # so the user sees a proper notification + ``finished`` state + # instead of a build stuck in ``triggered``. + _fail_build(build, exc) + log.exception("submit_build_to_ecs failed.") + + +def _submit_build_to_ecs(build, version, project): + """ + Inner body of :func:`submit_build_to_ecs` — split out so the caller + can wrap it in a single try/except that finalizes the build on any + user / app error. See :func:`_fail_build` for the failure path. + """ + if not project.has_feature(Feature.USE_FARGATE_BUILDER): + # Defensive: the dispatcher in ``trigger_build`` shouldn't route here + # without the flag. If it does, fail loudly rather than silently + # dispatching to Fargate. + raise BuildAppError( + BuildAppError.GENERIC_WITH_BUILD_ID, + exception_message=( + f"submit_build_to_ecs called for project '{project.slug}' " + "without Feature.USE_FARGATE_BUILDER set." + ), + ) + + # 1. Sparse-clone just the YAML to learn build.os. + tmp = tempfile.mkdtemp(prefix="rtd-bootstrap-") + try: + config_path = _sparse_clone_yaml( + repo_url=project.repo, + ref=version.identifier, + clone_token=project.clone_token, + dest=tmp, + ) + if config_path is None: + raise BuildUserError(message_id=BuildUserError.NO_CONFIG_FILE_DEPRECATED) + build_os = _read_build_os(config_path) + finally: + shutil.rmtree(tmp, ignore_errors=True) + + log.info("Resolved build.os.", build_os=build_os) + + # 2. Resolve per-build resource limits. + cpu, memory, time_limit = _resolve_fargate_resources(project) + log.info( + "Resolved Fargate resources.", + cpu=cpu, + memory=memory, + time_limit=time_limit, + ) + + # 3. Mint a per-build API key (24h-scoped). + _, build_api_key = BuildAPIKey.objects.create_key(project=project) + + # 4. Submit to ECS (prod) or local Docker (dev). + environment = { + "RTD_API_URL": getattr(settings, "RTD_API_URL", settings.PUBLIC_API_URL), + "RTD_PRODUCTION_DOMAIN": settings.PRODUCTION_DOMAIN, + "RTD_BUILD_API_KEY": build_api_key, + "RTD_BUILDER_REF": settings.RTD_BUILDER_REF, + "RTD_BUILDER_REPO": settings.RTD_BUILDER_REPO, + "RTD_BUILD_TIME_LIMIT_SECONDS": time_limit, + "RTD_BUILD_TIME_LIMIT_GRACE_SECONDS": settings.RTD_BUILD_TIME_LIMIT_GRACE_SECONDS, + "RTD_BUILD_TIME_LIMIT_KILL_SECONDS": settings.RTD_BUILD_TIME_LIMIT_KILL_SECONDS, + } + # Forward the readthedocs-builder clone token when configured. The + # entrypoint inside the container injects it into the clone URL at + # clone time, so it never appears in container logs. + if getattr(settings, "RTD_BUILDER_TOKEN", ""): + environment["RTD_BUILDER_TOKEN"] = settings.RTD_BUILDER_TOKEN + + if settings.RTD_DOCKER_COMPOSE: + # Local dev: the runner uses the API's STS endpoint for storage + # credentials (same as production), but boto3 needs to know where + # to point — the dev environment uses an S3-compatible service + # at ``http://storage:9000`` (rustfs) instead of real AWS. Forward + # only that URL; credentials + bucket names come from the API. + environment["AWS_S3_ENDPOINT_URL"] = settings.AWS_S3_ENDPOINT_URL or "" + # Skip the runuser privilege drop — the bind-mounted docroot in + # dev is owned by the host UID, which won't match the container's + # ``docs`` user (same trick ``dev-run.sh`` already uses). + environment["RTD_DOCKER_USER"] = "root" + # The build container joins ``RTD_DOCKER_COMPOSE_NETWORK`` so it + # can reach ``nginx`` (which fronts the API on port 80) by docker + # service-name DNS. ``HOSTIP`` doesn't work here: the compose + # bridge can't route to the host's LAN IP on the port-forwarded + # nginx port. ``dev-run.sh`` sidesteps this with ``--network=host``, + # but that defeats the compose-network plumbing we want for the + # rest of the runner's calls (storage, etc.). + # + # TODO: update ``RTD_API_URL`` in ``docker_compose.py`` once we are fully migrated + # and remove this override here. + environment["RTD_API_URL"] = "http://nginx" + + command = ["--build-pk", str(build.pk), "--run", "--record-commands"] + + task_arn = _dispatch_build_task( + build_pk=build.pk, + build_os=build_os, + cpu=cpu, + memory=memory, + environment=environment, + command=command, + ) + + log.info( + "Dispatched build.", + backend="docker" if settings.RTD_DOCKER_COMPOSE else "fargate", + task_arn=task_arn, + ) + + # 5. Store the (pseudo-)ARN so cancel_build can stop the task. + build.task_arn = task_arn + build.save(update_fields=["task_arn"]) diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py index 9cbd8c85e5a..2794cab8f0e 100644 --- a/readthedocs/settings/base.py +++ b/readthedocs/settings/base.py @@ -626,6 +626,37 @@ def TEMPLATES(self): BUILD_TIME_LIMIT = 900 # seconds + # Fargate builder defaults — see readthedocs-builder/docs/architecture.md. + # Per-build resources can be overridden by Project.container_cpu_limit / + # container_mem_limit / container_time_limit. The bootstrap task snaps + # the resolved values to a Fargate-supported CPU/memory pair before + # calling ecs:RunTask. + RTD_BUILD_DEFAULT_CPU = 2048 # Fargate CPU units (1024 = 1 vCPU) + RTD_BUILD_DEFAULT_MEMORY = 8192 # MiB + RTD_BUILD_DEFAULT_TIME_LIMIT = 1800 # seconds + RTD_BUILD_MAX_CPU = 4096 + RTD_BUILD_MAX_MEMORY = 30720 + RTD_BUILD_MAX_TIME_LIMIT = 3600 + # Two-layer timeout grace periods (see entrypoint.sh): + RTD_BUILD_TIME_LIMIT_GRACE_SECONDS = 30 + RTD_BUILD_TIME_LIMIT_KILL_SECONDS = 10 + + # Where the runner clones itself from at container startup. + RTD_BUILDER_REPO = "https://github.com/readthedocs/readthedocs-builder.git" + + # TODO: use RTD_BUILDER_REF = "rel" when we have a stable release of the builder + RTD_BUILDER_REF = "main" + + # ECS / Fargate config. All four MUST be set in production; left empty here + # so dev settings can override (or the bootstrap can short-circuit when not + # configured). + RTD_ECS_CLUSTER = "" + RTD_ECS_TASK_DEFINITION_FORMAT = "rtd-builder-{build_os}" + RTD_ECS_SUBNETS = [] + RTD_ECS_SECURITY_GROUPS = [] + RTD_ECS_ASSIGN_PUBLIC_IP = "ENABLED" + RTD_ECS_REGION = "" + @property def BUILD_MEMORY_LIMIT(self): """ diff --git a/readthedocs/settings/docker_compose.py b/readthedocs/settings/docker_compose.py index c5021b94242..d311d964f11 100644 --- a/readthedocs/settings/docker_compose.py +++ b/readthedocs/settings/docker_compose.py @@ -17,6 +17,34 @@ class DockerBaseSettings(CommunityBaseSettings): RTD_DOCKER_USER = f"{os.geteuid()}:{os.getegid()}" BUILD_MEMORY_LIMIT = "2g" + # Local Fargate emulation: submit_build_to_ecs falls back to ``docker run`` + # against the host's docker daemon (mounted via /var/run/docker.sock into + # this container) instead of ecs:RunTask. See + # readthedocs-builder/docs/architecture.md for the prod design. + # + # TODO: drop this setting once we have the readthedocs/builder: image + # matrix and can resolve the image from build.os exactly like production + # does. For now we use a single dev image regardless of build.os. + # Build it once via: + # cd ../readthedocs-builder && docker build -t builder-dev:latest . + RTD_LOCAL_BUILDER_IMAGE = os.environ.get( + "RTD_LOCAL_BUILDER_IMAGE", "builder-dev:latest" + ) + # Host-side path to the readthedocs-builder checkout. When set, the + # bootstrap bind-mounts it at /opt/builder so the entrypoint skips the + # GitHub clone (matches the dev-run.sh iteration loop). Comment the + # env var out / leave it empty to exercise the clone path. + # To clone the repository, you need a GH token with read access + # defined in RTD_BUILDER_TOKEN since it's private for now. + RTD_PATH_BUILDER = os.environ.get("RTD_PATH_BUILDER", "") + + # Personal access token used by the entrypoint to clone the + # readthedocs-builder repo when it's private. Forwarded into the + # builder container's env; the entrypoint injects it into the clone + # URL at clone time so it doesn't leak into ``docker logs``. Leave + # empty when the repo is public or you're using the bind-mount path. + RTD_BUILDER_TOKEN = os.environ.get("RTD_BUILDER_TOKEN", "") + PRODUCTION_DOMAIN = os.environ.get("RTD_PRODUCTION_DOMAIN", "devthedocs.org") PUBLIC_DOMAIN = os.environ.get("RTD_PUBLIC_DOMAIN", "devthedocs.org") PUBLIC_API_URL = f"http://{PRODUCTION_DOMAIN}"