-
-
Notifications
You must be signed in to change notification settings - Fork 984
Backend: Optimize leaderboard query with two-stage fetch to reduce payload #5087
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
cb2ddd8
421cc52
a0706d2
9e33e6c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -426,70 +426,27 @@ | |
| submission__is_public=True | ||
| ) | ||
|
|
||
| if challenge_phase_split.show_execution_time: | ||
| time_diff_expression = ExpressionWrapper( | ||
| F("submission__completed_at") - F("submission__started_at"), | ||
| output_field=fields.DurationField(), | ||
| ) | ||
| leaderboard_data = leaderboard_data.annotate( | ||
| filtering_score=RawSQL( | ||
| "result->>%s", (default_order_by,), output_field=FloatField() | ||
| ), | ||
| filtering_error=RawSQL( | ||
| "error->>%s", | ||
| ("error_{0}".format(default_order_by),), | ||
| output_field=FloatField(), | ||
| ), | ||
| submission__execution_time=time_diff_expression, | ||
| ).values( | ||
| "id", | ||
| "submission__participant_team", | ||
| "submission__participant_team__team_name", | ||
| "submission__participant_team__team_url", | ||
| "submission__is_baseline", | ||
| "submission__is_public", | ||
| "challenge_phase_split", | ||
| "result", | ||
| "error", | ||
| "filtering_score", | ||
| "filtering_error", | ||
| "leaderboard__schema", | ||
| "submission__submitted_at", | ||
| "submission__method_name", | ||
| "submission__id", | ||
| "submission__submission_metadata", | ||
| "submission__execution_time", | ||
| "submission__is_verified_by_host", | ||
| ) | ||
| else: | ||
| leaderboard_data = leaderboard_data.annotate( | ||
| filtering_score=RawSQL( | ||
| "result->>%s", (default_order_by,), output_field=FloatField() | ||
| ), | ||
| filtering_error=RawSQL( | ||
| "error->>%s", | ||
| ("error_{0}".format(default_order_by),), | ||
| output_field=FloatField(), | ||
| ), | ||
| ).values( | ||
| "id", | ||
| "submission__participant_team", | ||
| "submission__participant_team__team_name", | ||
| "submission__participant_team__team_url", | ||
| "submission__is_baseline", | ||
| "submission__is_public", | ||
| "challenge_phase_split", | ||
| "result", | ||
| "error", | ||
| "filtering_score", | ||
| "filtering_error", | ||
| "leaderboard__schema", | ||
| "submission__submitted_at", | ||
| "submission__method_name", | ||
| "submission__id", | ||
| "submission__submission_metadata", | ||
| "submission__is_verified_by_host", | ||
| ) | ||
| # Stage 1: fetch a lightweight projection (no heavy JSON columns) for | ||
| # sort + dedup. This avoids transferring result/error/submission_metadata | ||
| # for rows that will be discarded by per-team deduplication. | ||
| leaderboard_data_light = leaderboard_data.annotate( | ||
| filtering_score=RawSQL( | ||
| "result->>%s", (default_order_by,), output_field=FloatField() | ||
| ), | ||
| filtering_error=RawSQL( | ||
| "error->>%s", | ||
| ("error_{0}".format(default_order_by),), | ||
| output_field=FloatField(), | ||
| ), | ||
| ).values( | ||
| "id", | ||
| "submission__participant_team", | ||
| "submission__participant_team__team_name", | ||
| "submission__is_baseline", | ||
| "error", | ||
| "filtering_score", | ||
| "filtering_error", | ||
| ) | ||
|
|
||
| all_banned_participant_team = set() | ||
| all_banned_email_ids_set = ( | ||
|
|
@@ -498,15 +455,12 @@ | |
|
|
||
| # Apply query limit to prevent slow queries on popular challenges | ||
| max_limit = getattr(settings, "MAX_LEADERBOARD_QUERY_LIMIT", 10000) | ||
| leaderboard_data = leaderboard_data[:max_limit] | ||
|
|
||
| # Convert to list to allow multiple iterations | ||
| leaderboard_data = list(leaderboard_data) | ||
| leaderboard_data_light = list(leaderboard_data_light[:max_limit]) | ||
|
|
||
| # Prefetch all participant teams and their participants' emails in bulk | ||
| # (fixes N+1 query) | ||
| unique_team_ids = set( | ||
| item["submission__participant_team"] for item in leaderboard_data | ||
| item["submission__participant_team"] for item in leaderboard_data_light | ||
| ) | ||
| participant_teams = ParticipantTeam.objects.filter( | ||
| id__in=unique_team_ids | ||
|
|
@@ -517,7 +471,7 @@ | |
| for team in participant_teams | ||
| } | ||
|
|
||
| for leaderboard_item in leaderboard_data: | ||
| for leaderboard_item in leaderboard_data_light: | ||
| participant_team_id = leaderboard_item["submission__participant_team"] | ||
| all_participants_email_ids = team_emails_lookup.get( | ||
| participant_team_id, [] | ||
|
|
@@ -527,21 +481,23 @@ | |
| all_banned_participant_team.add(participant_team_id) | ||
| break | ||
| if leaderboard_item["error"] is None: | ||
| leaderboard_item.update(filtering_error=0) | ||
| leaderboard_item["filtering_error"] = 0 | ||
| if leaderboard_item["filtering_score"] is None: | ||
| leaderboard_item.update(filtering_score=0) | ||
| leaderboard_item["filtering_score"] = 0 | ||
|
|
||
| if challenge_phase_split.show_leaderboard_by_latest_submission: | ||
| sorted_leaderboard_data = leaderboard_data | ||
| sorted_leaderboard_data = leaderboard_data_light | ||
| else: | ||
| sorted_leaderboard_data = sorted( | ||
| leaderboard_data, | ||
| leaderboard_data_light, | ||
| key=lambda k: ( | ||
| float(k["filtering_score"]), | ||
| float(-k["filtering_error"]), | ||
| ), | ||
| reverse=True if is_leaderboard_order_descending else False, | ||
| ) | ||
| distinct_sorted_leaderboard_data = [] | ||
|
|
||
| retained_light = [] | ||
| team_list = set() | ||
| for data in sorted_leaderboard_data: | ||
| if ( | ||
|
|
@@ -551,11 +507,86 @@ | |
| ): | ||
| continue | ||
| elif data["submission__is_baseline"] is True: | ||
| distinct_sorted_leaderboard_data.append(data) | ||
| retained_light.append(data) | ||
| else: | ||
| distinct_sorted_leaderboard_data.append(data) | ||
| retained_light.append(data) | ||
| team_list.add(data["submission__participant_team__team_name"]) | ||
|
|
||
| # Stage 2: fetch full row data (with heavy JSON columns) only for the | ||
| # retained leaderboard rows, then reapply the order from stage 1. | ||
| retained_ids = [item["id"] for item in retained_light] | ||
| heavy_qs = LeaderboardData.objects.filter(id__in=retained_ids).annotate( | ||
| filtering_score=RawSQL( | ||
| "result->>%s", (default_order_by,), output_field=FloatField() | ||
| ), | ||
| filtering_error=RawSQL( | ||
| "error->>%s", | ||
| ("error_{0}".format(default_order_by),), | ||
| output_field=FloatField(), | ||
| ), | ||
| ) | ||
| if challenge_phase_split.show_execution_time: | ||
| time_diff_expression = ExpressionWrapper( | ||
| F("submission__completed_at") - F("submission__started_at"), | ||
| output_field=fields.DurationField(), | ||
| ) | ||
| heavy_qs = heavy_qs.annotate( | ||
| submission__execution_time=time_diff_expression, | ||
| ).values( | ||
| "id", | ||
| "submission__participant_team", | ||
| "submission__participant_team__team_name", | ||
| "submission__participant_team__team_url", | ||
| "submission__is_baseline", | ||
| "submission__is_public", | ||
| "challenge_phase_split", | ||
| "result", | ||
| "error", | ||
| "filtering_score", | ||
| "filtering_error", | ||
| "leaderboard__schema", | ||
| "submission__submitted_at", | ||
| "submission__method_name", | ||
| "submission__id", | ||
| "submission__submission_metadata", | ||
| "submission__execution_time", | ||
| "submission__is_verified_by_host", | ||
| ) | ||
| else: | ||
| heavy_qs = heavy_qs.values( | ||
| "id", | ||
| "submission__participant_team", | ||
| "submission__participant_team__team_name", | ||
| "submission__participant_team__team_url", | ||
| "submission__is_baseline", | ||
| "submission__is_public", | ||
| "challenge_phase_split", | ||
| "result", | ||
| "error", | ||
| "filtering_score", | ||
| "filtering_error", | ||
| "leaderboard__schema", | ||
| "submission__submitted_at", | ||
| "submission__method_name", | ||
| "submission__id", | ||
| "submission__submission_metadata", | ||
| "submission__is_verified_by_host", | ||
| ) | ||
|
Comment on lines
+517
to
+574
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed:
#!/bin/bash
set -euo pipefail
# Locate and show the relevant code block
FILE="apps/jobs/utils.py"
python - <<'PY'
import os, itertools
path="apps/jobs/utils.py"
start=480
end=640
with open(path,'r',encoding='utf-8') as f:
for i,line in enumerate(f, start=1):
if start <= i <= end:
print(f"{i:4d}:{line.rstrip()}")
if i > end:
break
PY
# Also capture where the stage-1 queryset is defined (look for likely variable names)
rg -n "leaderboard_data|retained_light|stage.?1|is_disabled|host.*exclude|public" apps/jobs/utils.py || true
# Show definition of any queryset variable used in the snippet (e.g., leaderboard_data)
rg -n "leaderboard_data\s*=" apps/jobs/utils.py || true
Repository: Cloud-CV/EvalAI
Length of output: 9444
🏁 Script executed:
#!/bin/bash
set -euo pipefail
FILE="apps/jobs/utils.py"
python - <<'PY'
path="apps/jobs/utils.py"
with open(path,'r',encoding='utf-8') as f:
for i,line in enumerate(f, start=1):
if 360 <= i <= 470:
print(f"{i:4d}:{line.rstrip()}")
PY
rg -n "only_public_entries|exclude\(|is_disabled|is_public|host|flag|banned|all_banned|participant" apps/jobs/utils.py
Repository: Cloud-CV/EvalAI
Length of output: 9110
Keep stage-2 fetch scoped to the stage-1 filtered queryset
Suggested change
-retained_ids = [item["id"] for item in retained_light]
-heavy_qs = LeaderboardData.objects.filter(id__in=retained_ids).annotate(
+retained_ids = [item["id"] for item in retained_light]
+heavy_qs = leaderboard_data.filter(id__in=retained_ids).annotate(
filtering_score=RawSQL(
"result->>%s", (default_order_by,), output_field=FloatField()
),🤖 Prompt for AI Agents |
||
|
|
||
| heavy_by_id = {row["id"]: row for row in heavy_qs} | ||
|
|
||
| distinct_sorted_leaderboard_data = [] | ||
| for light_item in retained_light: | ||
| full = heavy_by_id.get(light_item["id"]) | ||
| if full is None: | ||
| # Row removed between stage 1 and stage 2; skip. | ||
| continue | ||
| if full["error"] is None: | ||
| full["filtering_error"] = 0 | ||
| if full["filtering_score"] is None: | ||
| full["filtering_score"] = 0 | ||
| distinct_sorted_leaderboard_data.append(full) | ||
|
|
||
| leaderboard_labels = challenge_phase_split.leaderboard.schema["labels"] | ||
| show_scores = challenge_phase_split.show_scores_on_leaderboard | ||
| for item in distinct_sorted_leaderboard_data: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed:
Repository: Cloud-CV/EvalAI
Length of output: 169
🏁 Script executed:
Repository: Cloud-CV/EvalAI
Length of output: 41
🏁 Script executed:
Repository: Cloud-CV/EvalAI
Length of output: 19990
🏁 Script executed:
Repository: Cloud-CV/EvalAI
Length of output: 15273
🏁 Script executed:
Repository: Cloud-CV/EvalAI
Length of output: 5418
Fix stage-1 projection to not fetch full `error` JSON and ensure stage-2 reapplies stage-1 filters
- `calculate_distinct_sorted_leaderboard_data` stage 1 includes `"error"` in `.values(...)`, contradicting the “no heavy JSON columns” goal; this fetches the full `LeaderboardData.error` JSON for every candidate row.
- Stage 2 queries `LeaderboardData.objects.filter(id__in=retained_ids)` and does not reapply the stage-1 filters (including `submission__is_public`), so visibility/status constraints are only guaranteed via TOCTOU luck.
Suggested change
```diff
@@
 leaderboard_data_light = leaderboard_data.annotate(
     filtering_score=RawSQL(
         "result->>%s", (default_order_by,), output_field=FloatField()
     ),
     filtering_error=RawSQL(
         "error->>%s",
         ("error_{0}".format(default_order_by),),
         output_field=FloatField(),
     ),
 ).values(
     "id",
     "submission__participant_team",
     "submission__participant_team__team_name",
     "submission__is_baseline",
-    "error",
     "filtering_score",
     "filtering_error",
 )
@@
 for leaderboard_item in leaderboard_data_light:
@@
-    if leaderboard_item["error"] is None:
+    if leaderboard_item["filtering_error"] is None:
         leaderboard_item["filtering_error"] = 0
     if leaderboard_item["filtering_score"] is None:
         leaderboard_item["filtering_score"] = 0
@@
 # Stage 2: fetch full row data (with heavy JSON columns) only for the
 # retained leaderboard rows, then reapply the order from stage 1.
 retained_ids = [item["id"] for item in retained_light]
-heavy_qs = LeaderboardData.objects.filter(id__in=retained_ids).annotate(
+heavy_qs = leaderboard_data.filter(id__in=retained_ids).annotate(
     filtering_score=RawSQL(
         "result->>%s", (default_order_by,), output_field=FloatField()
     ),
     filtering_error=RawSQL(
         "error->>%s",
         ("error_{0}".format(default_order_by),),
         output_field=FloatField(),
     ),
 )
```
🤖 Prompt for AI Agents