From 91f3204961147e747464387fa875daf0cc6379bf Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 7 Apr 2026 16:33:36 -0700 Subject: [PATCH] add new 'rate-limited' state to indicate crawl has been rate limited, but is still running: backend: - crawl is in exponential backoff loop, set state to 'rate-limited' when corresponding redis key is set frontend: - show 'Rate Limited' state and warning message indicating the crawl is rate limited, though still running wip: need to decide on final ux, wording and state name fixes #3255 --- backend/btrixcloud/models.py | 2 +- backend/btrixcloud/operator/crawls.py | 5 +++ backend/btrixcloud/operator/models.py | 1 + .../features/archived-items/crawl-status.ts | 10 ++++++ frontend/src/pages/org/workflow-detail.ts | 35 +++++++++++++++++-- frontend/src/types/crawlState.ts | 1 + 6 files changed, 51 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index cb5ec49216..7ea4b7fdc3 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -270,7 +270,7 @@ class UserOrgInfoOut(BaseModel): # ============================================================================ TYPE_RUNNING_STATES = Literal[ - "running", "pending-wait", "generate-wacz", "uploading-wacz" + "running", "pending-wait", "generate-wacz", "uploading-wacz", "rate-limited" ] RUNNING_STATES = get_args(TYPE_RUNNING_STATES) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index a4fac6ee96..b21aadcf74 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1688,6 +1688,7 @@ async def get_redis_crawl_stats( pipe.hgetall(f"{crawl_id}:size") pipe.get(f"{crawl_id}:profileUploaded") pipe.smembers(f"{crawl_id}:reqCrawls") + pipe.get(f"{crawl_id}:rateLimited") results = await pipe.execute() @@ -1707,6 +1708,7 @@ async def get_redis_crawl_stats( profile_update = results[5] req_crawls = results[6] + rate_limited = results[7] == 
"1" stats = OpCrawlStats( found=pages_found, @@ -1714,6 +1716,7 @@ async def get_redis_crawl_stats( size=archive_size, profile_update=profile_update, req_crawls=req_crawls, + rate_limited=rate_limited, ) return stats, sizes @@ -1902,6 +1905,8 @@ async def update_crawl_state( else: new_status: TYPE_RUNNING_STATES = "running" + if stats.rate_limited: + new_status = "rate-limited" if status_count.get("generate-wacz"): new_status = "generate-wacz" elif status_count.get("uploading-wacz"): diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index 0bd5b903a8..458d94787b 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -219,6 +219,7 @@ class OpCrawlStats(CrawlStats): """crawl stats + internal profile update""" profile_update: Optional[str] = "" + rate_limited: Optional[bool] = False # ============================================================================ diff --git a/frontend/src/features/archived-items/crawl-status.ts b/frontend/src/features/archived-items/crawl-status.ts index 18a08319ca..fa4c889970 100644 --- a/frontend/src/features/archived-items/crawl-status.ts +++ b/frontend/src/features/archived-items/crawl-status.ts @@ -147,6 +147,16 @@ export class CrawlStatus extends TailwindElement { label = msg("Running"); break; + case "rate-limited": + color = "var(--warning)"; + icon = html``; + label = msg("Rate Limited"); + break; + case "stopping": color = "var(--sl-color-violet-600)"; icon = html` - ${this.renderPausedNotice()} ${this.renderLatestCrawl()} + ${this.renderRateLimitedNotice()} ${this.renderPausedNotice()} + ${this.renderLatestCrawl()} ${this.renderSettings()} @@ -1490,6 +1492,35 @@ export class WorkflowDetail extends BtrixElement { `; }; + private renderRateLimitedNotice() { + if (this.workflow?.lastCrawlState !== "rate-limited") { + return html``; + } + return html` + +
+ + + + ${msg("The site is blocking or rate limiting our crawling")} + + +
+
+

+ ${msg( + "The crawl has encountered error or CAPTCHA pages and is skipping them. See our guide for more info", + )} +

+
+
+ `; + } + private renderLatestCrawlAction() { if (!this.workflow || !this.lastCrawlId) return; diff --git a/frontend/src/types/crawlState.ts b/frontend/src/types/crawlState.ts index 20bc18f0df..c54b332151 100644 --- a/frontend/src/types/crawlState.ts +++ b/frontend/src/types/crawlState.ts @@ -4,6 +4,7 @@ export const RUNNING_STATES = [ "pending-wait", "generate-wacz", "uploading-wacz", + "rate-limited", ] as const; // Match backend TYPE_WAITING_NOT_PAUSED_STATES in models.py