From 865048689aa8d711eb3ffa70cc963e6e3ed32511 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 23 Jun 2026 16:23:41 -0400 Subject: [PATCH 1/4] Add ignoreScopeForBehaviorLinks to crawlconfig --- backend/btrixcloud/models.py | 2 ++ backend/test/test_crawlconfigs.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index cb5ec49216..29262b880f 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -441,6 +441,8 @@ class RawCrawlConfig(BaseModel): selectLinks: List[str] = ["a[href]->href"] clickSelector: str = "a" + ignoreScopeForBehaviorLinks: bool | None = False + saveStorage: Optional[bool] = False diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py index 532f835551..953e78a1ba 100644 --- a/backend/test/test_crawlconfigs.py +++ b/backend/test/test_crawlconfigs.py @@ -317,6 +317,17 @@ def test_verify_default_click_selector( assert r.json()["config"]["clickSelector"] == "a" +def test_verify_default_ignore_scope_behavior_links( + crawler_auth_headers, default_org_id, sample_crawl_data +): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json()["config"]["ignoreScopeForBehaviorLinks"] is False + + def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data): r = requests.patch( f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/", @@ -327,6 +338,7 @@ def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_d "scopeType": "domain", "selectLinks": ["a[href]->href", "script[src]->src"], "clickSelector": "button", + "ignoreScopeForBehaviorLinks": True, } }, ) @@ -344,6 +356,7 @@ def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_d assert config["scopeType"] == "domain" assert config["selectLinks"] == ["a[href]->href", "script[src]->src"] assert config["clickSelector"] == "button" + assert config["ignoreScopeForBehaviorLinks"] is True # Verify fields set in config originally are unchanged assert config["lang"] == "en" @@ -363,6 +376,7 @@ def test_update_config_no_changes( "scopeType": "domain", "selectLinks": ["a[href]->href", "script[src]->src"], "clickSelector": "button", + "ignoreScopeForBehaviorLinks": True, } }, ) From f55b772f9cc2c78f5349fdc431358d8a44b9a613 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 23 Jun 2026 17:35:32 -0400 Subject: [PATCH 2/4] Add min crawler image check when creating crawl configmap --- backend/btrixcloud/operator/crawls.py | 13 ++++++++++++- chart/templates/configmap.yaml | 2 ++ chart/values.yaml | 3 +++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index a4fac6ee96..dc15be9076 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -531,11 +531,22 @@ async def _load_crawl_configmap( self.crawl_config_ops.ensure_quota_page_limit(crawlconfig, crawl.org) + crawler_image = params["crawler_image"] + raw_config = crawlconfig.get_raw_config() raw_config["behaviors"] = self._filter_autoclick_behavior( - raw_config["behaviors"], params["crawler_image"] + raw_config["behaviors"], crawler_image ) + if raw_config.get("ignoreScopeForBehaviorLinks") is True: + min_behavior_links_image = os.environ.get( + "MIN_BEHAVIOR_LINKS_CRAWLER_IMAGE" + ) + if min_behavior_links_image and crawler_image_below_minimum( + crawler_image, min_behavior_links_image + ): + raw_config.pop("ignoreScopeForBehaviorLinks", None) + if crawl.seed_file_url: raw_config["seedFile"] = crawl.seed_file_url raw_config.pop("seedFileId", None) diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index 0e8370c420..588c96ecc9 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -84,6 +84,8 @@ data: MIN_SEED_FILE_CRAWLER_IMAGE: "{{ .Values.min_seed_file_crawler_image }}" + MIN_BEHAVIOR_LINKS_CRAWLER_IMAGE: "{{ .Values.min_behavior_links_crawler_image }}" + NUM_BROWSERS: "{{ .Values.crawler_browser_instances }}" MAX_CRAWLER_MEMORY: "{{ .Values.max_crawler_memory }}" diff --git a/chart/values.yaml b/chart/values.yaml index 46bd6d80fa..4f2d52a482 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -321,6 +321,9 @@ min_autoclick_crawler_image: "docker.io/webrecorder/browsertrix-crawler:1.5.0" # if set, will restrict seed files to image names that are >= this value min_seed_file_crawler_image: "docker.io/webrecorder/browsertrix-crawler:1.7.0" +# if set, will restrict ignoring scope for behaviors to image names that are >= this value +min_behavior_links_crawler_image: "docker.io/webrecorder/browsertrix-crawler:1.14.0" + # optional: enable to use a persist volume claim for all crawls # can be enabled to use a multi-write shared filesystem # crawler_pv_claim: "nfs-shared-crawls" From fc3f978c8866606fee386764b0ebfed74110d472 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 23 Jun 2026 17:48:51 -0400 Subject: [PATCH 3/4] Log warning when crawler image doesn't meet minimum --- backend/btrixcloud/operator/crawls.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index dc15be9076..6df1c24163 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -533,6 +533,8 @@ async def _load_crawl_configmap( crawler_image = params["crawler_image"] + configmap_logger = logger.bind(crawl_id=crawl.id) + raw_config = crawlconfig.get_raw_config() raw_config["behaviors"] = self._filter_autoclick_behavior( raw_config["behaviors"], crawler_image @@ -546,6 +548,11 @@ async def _load_crawl_configmap( crawler_image, min_behavior_links_image ): raw_config.pop("ignoreScopeForBehaviorLinks", None) + configmap_logger.warning( + "crawl_configmap_ignore_scope_behavior_links_ignored", + crawler_image=crawler_image, + min_behavior_links_image=min_behavior_links_image, + ) if crawl.seed_file_url: raw_config["seedFile"] = crawl.seed_file_url @@ -554,9 +561,8 @@ async def _load_crawl_configmap( params["config"] = json.dumps(raw_config) if config_update_needed: - logger.debug( + configmap_logger.debug( "crawl_configmap_updated", - crawl_id=crawl.id, unstructured_message=f"Updating config for {crawl.id}", ) From 92e2bd7e8cdda6e5209755c20fd1f504c27e37d7 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 29 Jun 2026 11:02:08 -0400 Subject: [PATCH 4/4] Rename option to alwaysAddBehaviorLinks, following crawler change --- backend/btrixcloud/models.py | 2 +- backend/btrixcloud/operator/crawls.py | 6 +++--- backend/test/test_crawlconfigs.py | 8 ++++---- chart/values.yaml | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 29262b880f..1759d4e1f1 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -441,7 +441,7 @@ class RawCrawlConfig(BaseModel): selectLinks: List[str] = ["a[href]->href"] clickSelector: str = "a" - ignoreScopeForBehaviorLinks: bool | None = False + alwaysAddBehaviorLinks: bool | None = False saveStorage: Optional[bool] = False diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 6df1c24163..3fce41d993 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -540,16 +540,16 @@ async def _load_crawl_configmap( raw_config["behaviors"], crawler_image ) - if raw_config.get("ignoreScopeForBehaviorLinks") is True: + if raw_config.get("alwaysAddBehaviorLinks") is True: min_behavior_links_image = os.environ.get( "MIN_BEHAVIOR_LINKS_CRAWLER_IMAGE" ) if min_behavior_links_image and crawler_image_below_minimum( crawler_image, min_behavior_links_image ): - raw_config.pop("ignoreScopeForBehaviorLinks", None) + raw_config.pop("alwaysAddBehaviorLinks", None) configmap_logger.warning( - "crawl_configmap_ignore_scope_behavior_links_ignored", + "crawl_configmap_always_add_behavior_links_ignored", crawler_image=crawler_image, min_behavior_links_image=min_behavior_links_image, ) diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py index 953e78a1ba..29362527d6 100644 --- a/backend/test/test_crawlconfigs.py +++ b/backend/test/test_crawlconfigs.py @@ -325,7 +325,7 @@ def test_verify_default_ignore_scope_behavior_links( headers=crawler_auth_headers, ) assert r.status_code == 200 - assert r.json()["config"]["ignoreScopeForBehaviorLinks"] is False + assert r.json()["config"]["alwaysAddBehaviorLinks"] is False def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data): @@ -338,7 +338,7 @@ def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_d "scopeType": "domain", "selectLinks": ["a[href]->href", "script[src]->src"], "clickSelector": "button", - "ignoreScopeForBehaviorLinks": True, + "alwaysAddBehaviorLinks": True, } }, ) @@ -356,7 +356,7 @@ def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_d assert config["scopeType"] == "domain" assert config["selectLinks"] == ["a[href]->href", "script[src]->src"] assert config["clickSelector"] == "button" - assert config["ignoreScopeForBehaviorLinks"] is True + assert config["alwaysAddBehaviorLinks"] is True # Verify fields set in config originally are unchanged assert config["lang"] == "en" @@ -376,7 +376,7 @@ def test_update_config_no_changes( "scopeType": "domain", "selectLinks": ["a[href]->href", "script[src]->src"], "clickSelector": "button", - "ignoreScopeForBehaviorLinks": True, + "alwaysAddBehaviorLinks": True, } }, ) diff --git a/chart/values.yaml b/chart/values.yaml index 4f2d52a482..e70b897811 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -321,7 +321,7 @@ min_autoclick_crawler_image: "docker.io/webrecorder/browsertrix-crawler:1.5.0" # if set, will restrict seed files to image names that are >= this value min_seed_file_crawler_image: "docker.io/webrecorder/browsertrix-crawler:1.7.0" -# if set, will restrict ignoring scope for behaviors to image names that are >= this value +# if set, will restrict always adding behavior links to image names that are >= this value min_behavior_links_crawler_image: "docker.io/webrecorder/browsertrix-crawler:1.14.0" # optional: enable to use a persist volume claim for all crawls