From 992dd40bb19f7a3a50e2b5fbaff226e1fb6bcd9e Mon Sep 17 00:00:00 2001 From: WHOIM1205 Date: Tue, 10 Feb 2026 15:00:19 -0800 Subject: [PATCH 1/5] Fix orphaned submissions when SQS publish fails Signed-off-by: WHOIM1205 --- apps/jobs/views.py | 24 +++++- tests/unit/jobs/test_views.py | 137 ++++++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+), 3 deletions(-) diff --git a/apps/jobs/views.py b/apps/jobs/views.py index 0e7fe0ebf8..7767e09fda 100644 --- a/apps/jobs/views.py +++ b/apps/jobs/views.py @@ -409,11 +409,29 @@ def challenge_submission(request, challenge_id, challenge_phase_id): if serializer.is_valid(): serializer.save() - response_data = serializer.data submission = serializer.instance message["submission_pk"] = submission.id - # publish message in the queue - publish_submission_message(message) + + try: + publish_submission_message(message) + except Exception: + logger.exception( + "SQS publish failed for submission %s in challenge %s, " + "cancelling submission", + submission.pk, + challenge_id, + ) + submission.status = Submission.CANCELLED + submission.save() + response_data = { + "error": "Failed to process your submission. Please try again." + } + return Response( + response_data, + status=status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + + response_data = serializer.data return Response(response_data, status=status.HTTP_201_CREATED) return Response( serializer.errors, status=status.HTTP_406_NOT_ACCEPTABLE diff --git a/tests/unit/jobs/test_views.py b/tests/unit/jobs/test_views.py index aece48e870..abae0485f5 100644 --- a/tests/unit/jobs/test_views.py +++ b/tests/unit/jobs/test_views.py @@ -5,6 +5,7 @@ from datetime import timedelta import boto3 +import botocore import mock import requests from allauth.account.models import EmailAddress @@ -532,6 +533,142 @@ def test_challenge_submission_for_docker_based_challenges(self): self.assertEqual(response.status_code, status.HTTP_201_CREATED) + @mock.patch( + "jobs.views.publish_submission_message", + side_effect=Exception("SQS connection error"), + ) + def test_challenge_submission_cleans_up_on_publish_failure( + self, mock_publish + ): + self.url = reverse_lazy( + "jobs:challenge_submission", + kwargs={ + "challenge_id": self.challenge.pk, + "challenge_phase_id": self.challenge_phase.pk, + }, + ) + + self.challenge.participant_teams.add(self.participant_team) + self.challenge.save() + + submission_count_before = Submission.objects.count() + + response = self.client.post( + self.url, + {"status": "submitting", "input_file": self.input_file}, + format="multipart", + ) + self.assertEqual( + response.status_code, + status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + self.assertIn("error", response.data) + # Orphaned submission must be deleted + self.assertEqual(Submission.objects.count(), submission_count_before) + + @mock.patch( + "jobs.views.publish_submission_message", + side_effect=botocore.exceptions.EndpointConnectionError( + endpoint_url="https://sqs.us-east-1.amazonaws.com" + ), + ) + def test_challenge_submission_handles_sqs_endpoint_failure( + self, mock_publish + ): + self.url = reverse_lazy( + "jobs:challenge_submission", + kwargs={ + "challenge_id": self.challenge.pk, + "challenge_phase_id": self.challenge_phase.pk, + }, + ) + + self.challenge.participant_teams.add(self.participant_team) + self.challenge.save() + + submission_count_before = Submission.objects.count() + + response = self.client.post( + self.url, + {"status": "submitting", "input_file": self.input_file}, + format="multipart", + ) + self.assertEqual( + response.status_code, + status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + self.assertEqual(Submission.objects.count(), submission_count_before) + + @mock.patch( + "jobs.views.publish_submission_message", + side_effect=Exception("SQS send failed"), + ) + def test_challenge_submission_preserves_quota_on_publish_failure( + self, mock_publish + ): + self.url = reverse_lazy( + "jobs:challenge_submission", + kwargs={ + "challenge_id": self.challenge.pk, + "challenge_phase_id": self.challenge_phase.pk, + }, + ) + + self.challenge.participant_teams.add(self.participant_team) + self.challenge.save() + + # First attempt fails due to SQS + response = self.client.post( + self.url, + {"status": "submitting", "input_file": self.input_file}, + format="multipart", + ) + self.assertEqual( + response.status_code, + status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + + # Participant's quota should not be consumed — retry must still be allowed + mock_publish.side_effect = None + mock_publish.return_value = None + retry_input = SimpleUploadedFile( + "retry_input.txt", b"file_content", content_type="text/plain" + ) + response = self.client.post( + self.url, + {"status": "submitting", "input_file": retry_input}, + format="multipart", + ) + self.assertEqual(response.status_code, status.HTTP_201_CREATED) + + @mock.patch("jobs.views.publish_submission_message") + def test_challenge_submission_returns_201_when_publish_succeeds( + self, mock_publish + ): + self.url = reverse_lazy( + "jobs:challenge_submission", + kwargs={ + "challenge_id": self.challenge.pk, + "challenge_phase_id": self.challenge_phase.pk, + }, + ) + + self.challenge.participant_teams.add(self.participant_team) + self.challenge.save() + + submission_count_before = Submission.objects.count() + + response = self.client.post( + self.url, + {"status": "submitting", "input_file": self.input_file}, + format="multipart", + ) + self.assertEqual(response.status_code, status.HTTP_201_CREATED) + self.assertEqual( + Submission.objects.count(), submission_count_before + 1 + ) + mock_publish.assert_called_once() + def test_challenge_submission_when_file_url_is_none(self): self.url = reverse_lazy( "jobs:challenge_submission", From 7274188b9290e5cc5ee5bc527f2bd3ee3fba8d59 Mon Sep 17 00:00:00 2001 From: WHOIM1205 Date: Thu, 12 Feb 2026 13:21:21 -0800 Subject: [PATCH 2/5] fix: use Submission.FAILED instead of undefined Submission.CANCELLED Signed-off-by: WHOIM1205 --- apps/jobs/views.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/jobs/views.py b/apps/jobs/views.py index 7767e09fda..2c0c0fe4d7 100644 --- a/apps/jobs/views.py +++ b/apps/jobs/views.py @@ -421,8 +421,8 @@ def challenge_submission(request, challenge_id, challenge_phase_id): submission.pk, challenge_id, ) - submission.status = Submission.CANCELLED - submission.save() + submission.status = Submission.FAILED + submission.save(update_fields=["status"]) response_data = { "error": "Failed to process your submission. Please try again." } From be6732fbdc85c07362339aa068b6a3209e60a1c1 Mon Sep 17 00:00:00 2001 From: WHOIM1205 Date: Thu, 12 Feb 2026 15:17:10 -0800 Subject: [PATCH 3/5] fix: update tests to expect CANCELLED status on SQS publish failure Signed-off-by: WHOIM1205 --- tests/unit/jobs/test_views.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tests/unit/jobs/test_views.py b/tests/unit/jobs/test_views.py index abae0485f5..3ca22ab99e 100644 --- a/tests/unit/jobs/test_views.py +++ b/tests/unit/jobs/test_views.py @@ -563,8 +563,10 @@ def test_challenge_submission_cleans_up_on_publish_failure( status.HTTP_500_INTERNAL_SERVER_ERROR, ) self.assertIn("error", response.data) - # Orphaned submission must be deleted - self.assertEqual(Submission.objects.count(), submission_count_before) + # Submission must be kept but marked as CANCELLED + self.assertEqual(Submission.objects.count(), submission_count_before + 1) + submission = Submission.objects.latest("id") + self.assertEqual(submission.status, Submission.CANCELLED) @mock.patch( "jobs.views.publish_submission_message", @@ -597,7 +599,10 @@ def test_challenge_submission_handles_sqs_endpoint_failure( response.status_code, status.HTTP_500_INTERNAL_SERVER_ERROR, ) - self.assertEqual(Submission.objects.count(), submission_count_before) + # Submission must be kept but marked as CANCELLED + self.assertEqual(Submission.objects.count(), submission_count_before + 1) + submission = Submission.objects.latest("id") + self.assertEqual(submission.status, Submission.CANCELLED) @mock.patch( "jobs.views.publish_submission_message", @@ -618,6 +623,7 @@ def test_challenge_submission_preserves_quota_on_publish_failure( self.challenge.save() # First attempt fails due to SQS + submission_count_before = Submission.objects.count() response = self.client.post( self.url, {"status": "submitting", "input_file": self.input_file}, @@ -627,6 +633,10 @@ def test_challenge_submission_preserves_quota_on_publish_failure( response.status_code, status.HTTP_500_INTERNAL_SERVER_ERROR, ) + # Submission must be kept but marked as CANCELLED + self.assertEqual(Submission.objects.count(), submission_count_before + 1) + submission = Submission.objects.latest("id") + self.assertEqual(submission.status, Submission.CANCELLED) # Participant's quota should not be consumed — retry must still be allowed mock_publish.side_effect = None From 10cda954b1149c985850641589d3c3eeaa85f6d5 Mon Sep 17 00:00:00 2001 From: WHOIM1205 Date: Thu, 12 Feb 2026 15:46:11 -0800 Subject: [PATCH 4/5] fix: revert to deleting submission on SQS publish failure Signed-off-by: WHOIM1205 --- apps/jobs/views.py | 5 ++--- tests/unit/jobs/test_views.py | 16 +++------------- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/apps/jobs/views.py b/apps/jobs/views.py index 2c0c0fe4d7..bae14133a6 100644 --- a/apps/jobs/views.py +++ b/apps/jobs/views.py @@ -417,12 +417,11 @@ def challenge_submission(request, challenge_id, challenge_phase_id): except Exception: logger.exception( "SQS publish failed for submission %s in challenge %s, " - "cancelling submission", + "deleting submission", submission.pk, challenge_id, ) - submission.status = Submission.FAILED - submission.save(update_fields=["status"]) + submission.delete() response_data = { "error": "Failed to process your submission. Please try again." } diff --git a/tests/unit/jobs/test_views.py b/tests/unit/jobs/test_views.py index 3ca22ab99e..abae0485f5 100644 --- a/tests/unit/jobs/test_views.py +++ b/tests/unit/jobs/test_views.py @@ -563,10 +563,8 @@ def test_challenge_submission_cleans_up_on_publish_failure( status.HTTP_500_INTERNAL_SERVER_ERROR, ) self.assertIn("error", response.data) - # Submission must be kept but marked as CANCELLED - self.assertEqual(Submission.objects.count(), submission_count_before + 1) - submission = Submission.objects.latest("id") - self.assertEqual(submission.status, Submission.CANCELLED) + # Orphaned submission must be deleted + self.assertEqual(Submission.objects.count(), submission_count_before) @mock.patch( "jobs.views.publish_submission_message", @@ -599,10 +597,7 @@ def test_challenge_submission_handles_sqs_endpoint_failure( response.status_code, status.HTTP_500_INTERNAL_SERVER_ERROR, ) - # Submission must be kept but marked as CANCELLED - self.assertEqual(Submission.objects.count(), submission_count_before + 1) - submission = Submission.objects.latest("id") - self.assertEqual(submission.status, Submission.CANCELLED) + self.assertEqual(Submission.objects.count(), submission_count_before) @mock.patch( "jobs.views.publish_submission_message", @@ -623,7 +618,6 @@ def test_challenge_submission_preserves_quota_on_publish_failure( self.challenge.save() # First attempt fails due to SQS - submission_count_before = Submission.objects.count() response = self.client.post( self.url, {"status": "submitting", "input_file": self.input_file}, @@ -633,10 +627,6 @@ def test_challenge_submission_preserves_quota_on_publish_failure( response.status_code, status.HTTP_500_INTERNAL_SERVER_ERROR, ) - # Submission must be kept but marked as CANCELLED - self.assertEqual(Submission.objects.count(), submission_count_before + 1) - submission = Submission.objects.latest("id") - self.assertEqual(submission.status, Submission.CANCELLED) # Participant's quota should not be consumed — retry must still be allowed mock_publish.side_effect = None From 15c9203b27c5c14f5c9a78ce302b1ac7a6a265a6 Mon Sep 17 00:00:00 2001 From: WHOIM1205 Date: Mon, 16 Feb 2026 04:00:55 +0530 Subject: [PATCH 5/5] fix: harden Dockerfile apt-get against stale cache failures Signed-off-by: WHOIM1205 --- docker/dev/django/Dockerfile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docker/dev/django/Dockerfile b/docker/dev/django/Dockerfile index d5ba1aebcf..46b5f92b73 100644 --- a/docker/dev/django/Dockerfile +++ b/docker/dev/django/Dockerfile @@ -7,8 +7,8 @@ ENV PYTHONUNBUFFERED=1 \ PIP_DEFAULT_TIMEOUT=100 # Install build dependencies only -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ +RUN DEBIAN_FRONTEND=noninteractive apt-get update --fix-missing && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends --fix-missing \ build-essential \ libpq-dev \ libcurl4-openssl-dev \ @@ -20,7 +20,8 @@ RUN apt-get update && \ libfreetype6-dev \ liblcms2-dev \ libwebp-dev \ - && rm -rf /var/lib/apt/lists/* + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /var/cache/apt/* WORKDIR /code @@ -45,8 +46,8 @@ ENV PYTHONUNBUFFERED=1 \ PYTHONDONTWRITEBYTECODE=1 # Install runtime dependencies (apt-get handles multi-arch automatically) -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ +RUN DEBIAN_FRONTEND=noninteractive apt-get update --fix-missing && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends --fix-missing \ libpq5 \ libcurl4 \ libjpeg62-turbo \