Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions readthedocs/builds/migrations/0073_build_task_arn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Generated by Django 5.2.13 on 2026-06-03 09:32

from django.db import migrations
from django.db import models
from django_safemigrate import Safe


class Migration(migrations.Migration):
    # Schema-only migration: adds the ``task_arn`` column to the ``build``
    # model of the ``builds`` app. No data is migrated or backfilled.

    # django-safemigrate marker declaring this migration safe to apply before
    # the new application code is deployed. The added column is nullable
    # (``null=True`` below), so existing rows need no default value.
    safe = Safe.before_deploy()

    # Must run after the previous ``builds`` migration.
    dependencies = [
        ("builds", "0072_remove_deprecated_build_fields"),
    ]

    operations = [
        # Mirrors the ``task_arn`` field declared on ``Build`` in
        # ``readthedocs/builds/models.py`` (same max_length, nullability,
        # verbose name and help text).
        migrations.AddField(
            model_name="build",
            name="task_arn",
            field=models.CharField(
                blank=True,
                help_text="ECS task ARN for builds dispatched via Fargate. Set by ``submit_build_to_ecs``; consumed by ``cancel_build`` to call ``ecs:StopTask``. Mutually exclusive with ``task_id`` (legacy Celery path).",
                max_length=255,
                null=True,
                verbose_name="ECS task ARN",
            ),
        ),
    ]
12 changes: 12 additions & 0 deletions readthedocs/builds/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -819,6 +819,18 @@ class Build(models.Model):
null=True,
blank=True,
)
# Handle used to stop a build that was not dispatched through the legacy
# Celery path. ``cancel_build`` checks this field before ``task_id`` and, when
# it is set, never falls through to the Celery revoke.
# Despite the name, the value is not always an ARN: when
# ``settings.RTD_DOCKER_COMPOSE`` is enabled, ``cancel_build`` passes it to
# the Docker client as a container id; otherwise it is sent to ECS
# ``stop_task`` as the task identifier.
# Nullable: ``trigger_build`` does not write it; per the help text it is
# filled in later by ``submit_build_to_ecs``.
task_arn = models.CharField(
    _("ECS task ARN"),
    max_length=255,
    null=True,
    blank=True,
    help_text=_(
        "ECS task ARN for builds dispatched via Fargate. "
        "Set by ``submit_build_to_ecs``; consumed by ``cancel_build`` "
        "to call ``ecs:StopTask``. Mutually exclusive with ``task_id`` "
        "(legacy Celery path)."
    ),
)
task_executed_at = models.DateTimeField(
_("Task executed at datetime"),
null=True,
Expand Down
87 changes: 86 additions & 1 deletion readthedocs/core/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,12 +200,22 @@ def trigger_build(project, version=None, commit=None, from_webhook=False):
Helper that calls ``prepare_build`` and just effectively trigger the Celery
task to be executed by a worker.

When the project has ``Feature.USE_FARGATE_BUILDER`` enabled, dispatches
to the new ``submit_build_to_ecs`` bootstrap task (which clones the
config, resolves ``build.os``, and runs the build inside a Fargate task)
instead of the legacy ``update_docs_task``. See
``readthedocs-builder/docs/architecture.md`` for the broader design.

:param project: project's documentation to be built
:param version: version of the project to be built. Default: ``latest``
:param commit: commit sha of the version required for sending build status reports
:returns: Celery AsyncResult promise and Build instance
:rtype: tuple
"""
# Avoid circular import.
from readthedocs.projects.models import Feature
from readthedocs.projects.tasks.fargate import submit_build_to_ecs

structlog.contextvars.bind_contextvars(
project_slug=project.slug,
version_slug=version.slug if version else None,
Expand All @@ -229,6 +239,16 @@ def trigger_build(project, version=None, commit=None, from_webhook=False):
# Build was skipped
return (None, None)

# Feature-flag dispatch: Fargate path vs legacy Celery path.
if project.has_feature(Feature.USE_FARGATE_BUILDER):
log.info("Dispatching build via submit_build_to_ecs (Fargate path).")
task = submit_build_to_ecs.delay(build_pk=build.pk)
# The Build's ECS task_arn is populated by submit_build_to_ecs once
# ``ecs:RunTask`` returns; we don't write task_id here (that's the
# legacy-Celery-path's cancellation handle). cancel_build branches
# on which one is set.
return task, build

task = update_docs_task.apply_async()

# FIXME: I'm using `isinstance` here because I wasn't able to mock this
Expand All @@ -255,6 +275,14 @@ def cancel_build(build):
- Running:
Communicate Celery to force the termination of the current build
and rely on the worker to update the build's status.

Routing during the Fargate rollout: branches on which task identifier
is set on the build. ``Build.task_arn`` (Fargate path) takes
precedence — if the build was dispatched to ECS, we call
``ecs:StopTask``. Otherwise we fall back to the legacy Celery revoke.
The branch is on the build's *actual* state (which dispatcher ran),
not the project's current feature flag, so an in-flight build that
started before the flag was flipped still cancels correctly.
"""
# NOTE: `terminate=True` is required for the child to attend our call
# immediately when it's running the build. Otherwise, it finishes the
Expand Down Expand Up @@ -287,9 +315,66 @@ def cancel_build(build):
version_slug=build.version.slug,
build_id=build.pk,
build_task_id=build.task_id,
build_task_arn=build.task_arn,
terminate=terminate,
)
app.control.revoke(build.task_id, signal="SIGINT", terminate=terminate)

if build.task_arn:
# Fargate / local-docker path. ``task_arn`` is a container id
# under docker-compose dev and a real ECS task ARN in
# production; we branch on ``settings.RTD_DOCKER_COMPOSE``.
if settings.RTD_DOCKER_COMPOSE:
import docker

try:
# SIGTERM, not the default SIGKILL: the runner installs a
# SIGTERM handler that raises ``BuildCancelled`` so its
# lifecycle's ``try/except/finally`` runs (attaches the
# ``CANCELLED_BY_USER`` notification + PATCHes the build
# to ``finished`` with ``success=False``) before the
# process exits. SIGKILL would skip all of that and
# leave the build stuck mid-state. Matches ECS StopTask
# semantics in production.
docker.from_env().containers.get(build.task_arn).kill(signal="SIGTERM")
except Exception:
log.exception(
"docker kill failed.",
container_id=build.task_arn,
)
return

# The runner inside the ECS task catches SIGTERM via its
# lifecycle's try/finally and finalizes the build. If the task
# has already exited (race), StopTask returns a benign error
# which we log but don't propagate.
import boto3

try:
boto3.client("ecs", region_name=settings.RTD_ECS_REGION or None).stop_task(
cluster=settings.RTD_ECS_CLUSTER,
task=build.task_arn,
reason="cancelled by user",
)
except Exception:
log.exception(
"ecs:StopTask failed.",
task_arn=build.task_arn,
)
return

if build.task_id:
# Legacy Celery path.
app.control.revoke(build.task_id, signal="SIGINT", terminate=terminate)
return

# Neither path has a handle yet. This happens when the Fargate
# bootstrap task hasn't run / hasn't called ecs:RunTask. The
# state-update above is enough — submit_build_to_ecs checks the
# build state at startup and bails out when it's CANCELLED.
log.info(
"No task handle on the build; relying on submit_build_to_ecs "
"to bail out when it observes BUILD_STATE_CANCELLED.",
)


def send_email_from_object(email: EmailMultiAlternatives | EmailMessage):
Expand Down
1 change: 1 addition & 0 deletions readthedocs/projects/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def ready(self):
import readthedocs.projects.notifications # noqa
import readthedocs.projects.signals # noqa
import readthedocs.projects.tasks.builds # noqa
import readthedocs.projects.tasks.fargate # noqa
import readthedocs.projects.tasks.search # noqa
import readthedocs.projects.tasks.utils # noqa

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Generated by Django 5.2.13 on 2026-06-03 09:32

from django.db import migrations
from django.db import models
from django_safemigrate import Safe


class Migration(migrations.Migration):
    # Schema-only migration: adds the optional ``container_cpu_limit`` column
    # to both ``project`` and ``historicalproject`` in the ``projects`` app.
    # No data is migrated or backfilled.

    # django-safemigrate marker declaring this migration safe to apply before
    # the new application code is deployed. Both added columns are nullable
    # (``null=True`` below), so existing rows need no default value.
    safe = Safe.before_deploy()

    # Must run after the previous ``projects`` migration.
    dependencies = [
        ("projects", "0164_show_build_overview_in_comment_default_true"),
    ]

    operations = [
        # Same field definition as the ``project`` one below, applied to the
        # ``historicalproject`` model.
        # NOTE(review): presumably this is the history-tracking shadow table
        # of ``Project`` and must stay in step with it — confirm.
        migrations.AddField(
            model_name="historicalproject",
            name="container_cpu_limit",
            field=models.PositiveIntegerField(
                blank=True,
                help_text="Fargate CPU units (1024 = 1 vCPU). Defaults to the system-wide default (2048 = 2 vCPU) when unset. Must pair with ``container_mem_limit`` to a supported Fargate CPU/memory combination.",
                null=True,
                verbose_name="Container CPU limit in Fargate CPU units",
            ),
        ),
        # Mirrors the ``container_cpu_limit`` field declared on ``Project`` in
        # ``readthedocs/projects/models.py``.
        migrations.AddField(
            model_name="project",
            name="container_cpu_limit",
            field=models.PositiveIntegerField(
                blank=True,
                help_text="Fargate CPU units (1024 = 1 vCPU). Defaults to the system-wide default (2048 = 2 vCPU) when unset. Must pair with ``container_mem_limit`` to a supported Fargate CPU/memory combination.",
                null=True,
                verbose_name="Container CPU limit in Fargate CPU units",
            ),
        ),
    ]
20 changes: 20 additions & 0 deletions readthedocs/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,17 @@ class Project(models.Model):
null=True,
blank=True,
)
# Optional per-project CPU override expressed in Fargate CPU units
# (1024 units = 1 vCPU). NULL/blank means "no override"; the help text states
# that the system-wide default (2048) applies in that case.
# NOTE(review): this declaration only enforces a non-negative integer. Nothing
# here validates that the value forms a supported Fargate pair with
# ``container_mem_limit`` — confirm that check exists wherever the ECS task is
# submitted.
container_cpu_limit = models.PositiveIntegerField(
    _("Container CPU limit in Fargate CPU units"),
    null=True,
    blank=True,
    help_text=_(
        "Fargate CPU units (1024 = 1 vCPU). Defaults to the "
        "system-wide default (2048 = 2 vCPU) when unset. Must "
        "pair with ``container_mem_limit`` to a supported Fargate "
        "CPU/memory combination."
    ),
)
build_queue = models.CharField(
_("Alternate build queue id"),
max_length=32,
Expand Down Expand Up @@ -2086,6 +2097,7 @@ def add_features(sender, **kwargs):
BUILD_NO_ACKS_LATE = "build_no_acks_late"
BUILD_IN_PARALLEL = "build_in_parallel"
USE_GVISOR_RUNTIME = "use_gvisor_runtime"
USE_FARGATE_BUILDER = "use_fargate_builder"

FEATURES = (
(
Expand Down Expand Up @@ -2155,6 +2167,14 @@ def add_features(sender, **kwargs):
USE_GVISOR_RUNTIME,
_("Build: Run build containers under the gVisor (runsc) runtime."),
),
(
USE_FARGATE_BUILDER,
_(
"Build: Dispatch this project's builds to AWS Fargate via "
"``submit_build_to_ecs`` instead of the legacy ``update_docs_task`` "
"Celery worker pool."
),
),
)

FEATURES = sorted(FEATURES, key=lambda x: x[1])
Expand Down
Loading