diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index cb5ec49216..29262b880f 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -441,6 +441,8 @@ class RawCrawlConfig(BaseModel): selectLinks: List[str] = ["a[href]->href"] clickSelector: str = "a" + ignoreScopeForBehaviorLinks: bool | None = False + saveStorage: Optional[bool] = False diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index a4fac6ee96..6df1c24163 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -531,11 +531,29 @@ async def _load_crawl_configmap( self.crawl_config_ops.ensure_quota_page_limit(crawlconfig, crawl.org) + crawler_image = params["crawler_image"] + + configmap_logger = logger.bind(crawl_id=crawl.id) + raw_config = crawlconfig.get_raw_config() raw_config["behaviors"] = self._filter_autoclick_behavior( - raw_config["behaviors"], params["crawler_image"] + raw_config["behaviors"], crawler_image ) + if raw_config.get("ignoreScopeForBehaviorLinks") is True: + min_behavior_links_image = os.environ.get( + "MIN_BEHAVIOR_LINKS_CRAWLER_IMAGE" + ) + if min_behavior_links_image and crawler_image_below_minimum( + crawler_image, min_behavior_links_image + ): + raw_config.pop("ignoreScopeForBehaviorLinks", None) + configmap_logger.warning( + "crawl_configmap_ignore_scope_behavior_links_ignored", + crawler_image=crawler_image, + min_behavior_links_image=min_behavior_links_image, + ) + if crawl.seed_file_url: raw_config["seedFile"] = crawl.seed_file_url raw_config.pop("seedFileId", None) @@ -543,9 +561,8 @@ async def _load_crawl_configmap( params["config"] = json.dumps(raw_config) if config_update_needed: - logger.debug( + configmap_logger.debug( "crawl_configmap_updated", - crawl_id=crawl.id, unstructured_message=f"Updating config for {crawl.id}", ) diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py index 532f835551..953e78a1ba 100644 
--- a/backend/test/test_crawlconfigs.py +++ b/backend/test/test_crawlconfigs.py @@ -317,6 +317,17 @@ def test_verify_default_click_selector( assert r.json()["config"]["clickSelector"] == "a" +def test_verify_default_ignore_scope_behavior_links( + crawler_auth_headers, default_org_id, sample_crawl_data +): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json()["config"]["ignoreScopeForBehaviorLinks"] is False + + def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data): r = requests.patch( f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/", @@ -327,6 +338,7 @@ def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_d "scopeType": "domain", "selectLinks": ["a[href]->href", "script[src]->src"], "clickSelector": "button", + "ignoreScopeForBehaviorLinks": True, } }, ) @@ -344,6 +356,7 @@ def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_d assert config["scopeType"] == "domain" assert config["selectLinks"] == ["a[href]->href", "script[src]->src"] assert config["clickSelector"] == "button" + assert config["ignoreScopeForBehaviorLinks"] is True # Verify fields set in config originally are unchanged assert config["lang"] == "en" @@ -363,6 +376,7 @@ def test_update_config_no_changes( "scopeType": "domain", "selectLinks": ["a[href]->href", "script[src]->src"], "clickSelector": "button", + "ignoreScopeForBehaviorLinks": True, } }, ) diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index 0e8370c420..588c96ecc9 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -84,6 +84,8 @@ data: MIN_SEED_FILE_CRAWLER_IMAGE: "{{ .Values.min_seed_file_crawler_image }}" + MIN_BEHAVIOR_LINKS_CRAWLER_IMAGE: "{{ .Values.min_behavior_links_crawler_image }}" + NUM_BROWSERS: "{{ .Values.crawler_browser_instances }}" MAX_CRAWLER_MEMORY: "{{ 
.Values.max_crawler_memory }}" diff --git a/chart/values.yaml b/chart/values.yaml index 46bd6d80fa..4f2d52a482 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -321,6 +321,9 @@ min_autoclick_crawler_image: "docker.io/webrecorder/browsertrix-crawler:1.5.0" # if set, will restrict seed files to image names that are >= this value min_seed_file_crawler_image: "docker.io/webrecorder/browsertrix-crawler:1.7.0" +# if set, will restrict ignoring scope for behavior links (ignoreScopeForBehaviorLinks) to image names that are >= this value +min_behavior_links_crawler_image: "docker.io/webrecorder/browsertrix-crawler:1.14.0" + # optional: enable to use a persist volume claim for all crawls # can be enabled to use a multi-write shared filesystem # crawler_pv_claim: "nfs-shared-crawls"