Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions backend/btrixcloud/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,8 @@ class RawCrawlConfig(BaseModel):
selectLinks: List[str] = ["a[href]->href"]
clickSelector: str = "a"

ignoreScopeForBehaviorLinks: bool | None = False

saveStorage: Optional[bool] = False


Expand Down
23 changes: 20 additions & 3 deletions backend/btrixcloud/operator/crawls.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,21 +531,38 @@ async def _load_crawl_configmap(

self.crawl_config_ops.ensure_quota_page_limit(crawlconfig, crawl.org)

crawler_image = params["crawler_image"]

configmap_logger = logger.bind(crawl_id=crawl.id)

raw_config = crawlconfig.get_raw_config()
raw_config["behaviors"] = self._filter_autoclick_behavior(
raw_config["behaviors"], params["crawler_image"]
raw_config["behaviors"], crawler_image
)

if raw_config.get("ignoreScopeForBehaviorLinks") is True:
min_behavior_links_image = os.environ.get(
"MIN_BEHAVIOR_LINKS_CRAWLER_IMAGE"
)
if min_behavior_links_image and crawler_image_below_minimum(
crawler_image, min_behavior_links_image
):
raw_config.pop("ignoreScopeForBehaviorLinks", None)
configmap_logger.warning(
"crawl_configmap_ignore_scope_behavior_links_ignored",
crawler_image=crawler_image,
min_behavior_links_image=min_behavior_links_image,
)

if crawl.seed_file_url:
raw_config["seedFile"] = crawl.seed_file_url
raw_config.pop("seedFileId", None)

params["config"] = json.dumps(raw_config)

if config_update_needed:
logger.debug(
configmap_logger.debug(
"crawl_configmap_updated",
crawl_id=crawl.id,
unstructured_message=f"Updating config for {crawl.id}",
)

Expand Down
14 changes: 14 additions & 0 deletions backend/test/test_crawlconfigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,17 @@ def test_verify_default_click_selector(
assert r.json()["config"]["clickSelector"] == "a"


def test_verify_default_ignore_scope_behavior_links(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    """Fetch the crawl config for `cid` and check ignoreScopeForBehaviorLinks is False."""
    # `cid` and `API_PREFIX` are resolved from the enclosing test module.
    config_url = f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/"
    response = requests.get(config_url, headers=crawler_auth_headers)
    assert response.status_code == 200

    config = response.json()["config"]
    assert config["ignoreScopeForBehaviorLinks"] is False


def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
Expand All @@ -327,6 +338,7 @@ def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_d
"scopeType": "domain",
"selectLinks": ["a[href]->href", "script[src]->src"],
"clickSelector": "button",
"ignoreScopeForBehaviorLinks": True,
}
},
)
Expand All @@ -344,6 +356,7 @@ def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_d
assert config["scopeType"] == "domain"
assert config["selectLinks"] == ["a[href]->href", "script[src]->src"]
assert config["clickSelector"] == "button"
assert config["ignoreScopeForBehaviorLinks"] is True

# Verify fields set in config originally are unchanged
assert config["lang"] == "en"
Expand All @@ -363,6 +376,7 @@ def test_update_config_no_changes(
"scopeType": "domain",
"selectLinks": ["a[href]->href", "script[src]->src"],
"clickSelector": "button",
"ignoreScopeForBehaviorLinks": True,
}
},
)
Expand Down
2 changes: 2 additions & 0 deletions chart/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ data:

MIN_SEED_FILE_CRAWLER_IMAGE: "{{ .Values.min_seed_file_crawler_image }}"

MIN_BEHAVIOR_LINKS_CRAWLER_IMAGE: "{{ .Values.min_behavior_links_crawler_image }}"

NUM_BROWSERS: "{{ .Values.crawler_browser_instances }}"

MAX_CRAWLER_MEMORY: "{{ .Values.max_crawler_memory }}"
Expand Down
3 changes: 3 additions & 0 deletions chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,9 @@ min_autoclick_crawler_image: "docker.io/webrecorder/browsertrix-crawler:1.5.0"
# if set, will restrict seed files to image names that are >= this value
min_seed_file_crawler_image: "docker.io/webrecorder/browsertrix-crawler:1.7.0"

# if set, the ignoreScopeForBehaviorLinks option is only passed to crawler images that are >= this value (it is dropped for older images)
min_behavior_links_crawler_image: "docker.io/webrecorder/browsertrix-crawler:1.14.0"

# optional: enable to use a persist volume claim for all crawls
# can be enabled to use a multi-write shared filesystem
# crawler_pv_claim: "nfs-shared-crawls"
Expand Down
Loading