Test
", None).unwrap(); + result.vertical_data = Some(VerticalData { + extractor: "github_repo".to_string(), + data: serde_json::json!({ "repo": "noxa" }), + }); + + let json = serde_json::to_value(&result).unwrap(); + assert_eq!(json["vertical_data"]["extractor"], "github_repo"); + assert_eq!(json["vertical_data"]["data"]["repo"], "noxa"); + } + #[test] fn youtube_extraction_produces_structured_markdown() { let html = r#" diff --git a/crates/noxa-core/src/llm/cleanup/css.rs b/crates/noxa-core/src/llm/cleanup/css.rs index 92f5eb6..d40cc98 100644 --- a/crates/noxa-core/src/llm/cleanup/css.rs +++ b/crates/noxa-core/src/llm/cleanup/css.rs @@ -28,10 +28,7 @@ fn strip_css_at_rules(line: &str) -> String { let mut result = line.to_string(); // Iteratively remove at-rule blocks with balanced brace handling - loop { - let Some(m) = CSS_AT_RE.find(&result) else { - break; - }; + while let Some(m) = CSS_AT_RE.find(&result) { let start = m.start(); // Find the matching closing brace after the at-rule header let after_header = m.end(); diff --git a/crates/noxa-core/src/llm/mod.rs b/crates/noxa-core/src/llm/mod.rs index ad7356c..e1a0b52 100644 --- a/crates/noxa-core/src/llm/mod.rs +++ b/crates/noxa-core/src/llm/mod.rs @@ -97,6 +97,7 @@ mod tests { raw_html: None, }, domain_data: None, + vertical_data: None, structured_data: vec![], } } @@ -405,6 +406,7 @@ mod tests { raw_html: None, }, domain_data: None, + vertical_data: None, structured_data: vec![], }; diff --git a/crates/noxa-core/src/structured_data.rs b/crates/noxa-core/src/structured_data.rs index 2ce41e8..ea3b394 100644 --- a/crates/noxa-core/src/structured_data.rs +++ b/crates/noxa-core/src/structured_data.rs @@ -53,7 +53,7 @@ pub fn extract_json_ld(html: &str) -> VecWe need to make sure you're not a robot.
+ + "#; + + let err = parse_reddit_json( + html, + "https://www.reddit.com/r/rust/comments/abc123/release_thread/", + ) + .expect_err("verification HTML should not be treated as generic JSON parse failure"); + + assert!(err.contains("verification"), "unexpected error: {err}"); + } + + #[test] + fn reddit_json_user_agent_identifies_bot_contact() { + let ua = json_api_user_agent(); + + assert!(ua.contains("noxa")); + assert!(ua.contains("bot")); + } } diff --git a/crates/noxa-fetch/src/sitemap.rs b/crates/noxa-fetch/src/sitemap.rs index 3a5a3bf..2ea09a3 100644 --- a/crates/noxa-fetch/src/sitemap.rs +++ b/crates/noxa-fetch/src/sitemap.rs @@ -155,19 +155,28 @@ async fn fetch_sitemaps( pub fn parse_robots_txt(text: &str) -> Vec
+
+
diff --git a/crates/noxa-fetch/tests/fixtures/extractors/instagram_profile.json b/crates/noxa-fetch/tests/fixtures/extractors/instagram_profile.json
new file mode 100644
index 0000000..d314f67
--- /dev/null
+++ b/crates/noxa-fetch/tests/fixtures/extractors/instagram_profile.json
@@ -0,0 +1,45 @@
+{
+ "data": {
+ "user": {
+ "id": "123",
+ "username": "jmagar",
+ "full_name": "Jacob Magar",
+ "biography": "Building Noxa",
+ "bio_links": [],
+ "external_url": "https://example.com",
+ "category_name": "Software",
+ "profile_pic_url": "https://example.com/pic.jpg",
+ "profile_pic_url_hd": "https://example.com/pic-hd.jpg",
+ "is_verified": false,
+ "is_private": false,
+ "is_business_account": false,
+ "is_professional_account": true,
+ "edge_followed_by": { "count": 100 },
+ "edge_follow": { "count": 50 },
+ "edge_owner_to_timeline_media": {
+ "count": 1,
+ "edges": [
+ {
+ "node": {
+ "__typename": "GraphImage",
+ "shortcode": "ABC123",
+ "is_video": false,
+ "video_view_count": null,
+ "display_url": "https://example.com/display.jpg",
+ "thumbnail_src": "https://example.com/thumb.jpg",
+ "accessibility_caption": "alt text",
+ "taken_at_timestamp": 1767225600,
+ "product_type": "feed",
+ "dimensions": { "width": 1080, "height": 1080 },
+ "edge_media_preview_like": { "count": 10 },
+ "edge_media_to_comment": { "count": 2 },
+ "edge_media_to_caption": {
+ "edges": [{ "node": { "text": "Fixture caption" } }]
+ }
+ }
+ }
+ ]
+ }
+ }
+ }
+}
diff --git a/crates/noxa-fetch/tests/fixtures/extractors/linkedin_post.html b/crates/noxa-fetch/tests/fixtures/extractors/linkedin_post.html
new file mode 100644
index 0000000..5c7bc4d
--- /dev/null
+++ b/crates/noxa-fetch/tests/fixtures/extractors/linkedin_post.html
@@ -0,0 +1,12 @@
+
+
+ Shipping extractors today
+ + diff --git a/crates/noxa-fetch/tests/fixtures/extractors/npm_downloads.json b/crates/noxa-fetch/tests/fixtures/extractors/npm_downloads.json new file mode 100644 index 0000000..173e381 --- /dev/null +++ b/crates/noxa-fetch/tests/fixtures/extractors/npm_downloads.json @@ -0,0 +1,3 @@ +{ + "downloads": 123456 +} diff --git a/crates/noxa-fetch/tests/fixtures/extractors/npm_registry.json b/crates/noxa-fetch/tests/fixtures/extractors/npm_registry.json new file mode 100644 index 0000000..680a900 --- /dev/null +++ b/crates/noxa-fetch/tests/fixtures/extractors/npm_registry.json @@ -0,0 +1,21 @@ +{ + "name": "@types/node", + "description": "TypeScript definitions for node", + "homepage": "https://github.com/DefinitelyTyped/DefinitelyTyped", + "repository": { "url": "git+https://github.com/DefinitelyTyped/DefinitelyTyped.git" }, + "keywords": ["node"], + "maintainers": [{ "name": "types", "email": "types@example.com" }], + "dist-tags": { "latest": "20.0.0" }, + "versions": { + "20.0.0": { + "license": "MIT", + "dependencies": { "undici-types": "~5.0.0" }, + "devDependencies": {}, + "peerDependencies": {}, + "deprecated": null + } + }, + "time": { + "20.0.0": "2026-01-01T00:00:00.000Z" + } +} diff --git a/crates/noxa-fetch/tests/fixtures/extractors/product_page.html b/crates/noxa-fetch/tests/fixtures/extractors/product_page.html new file mode 100644 index 0000000..1f3b338 --- /dev/null +++ b/crates/noxa-fetch/tests/fixtures/extractors/product_page.html @@ -0,0 +1,32 @@ + + +Use fixtures.
", + "creation_date": 1767229200, + "last_edit_date": null, + "owner": { "display_name": "answerer", "reputation": 20 } + } + ] +} diff --git a/crates/noxa-fetch/tests/fixtures/extractors/stackoverflow_question.json b/crates/noxa-fetch/tests/fixtures/extractors/stackoverflow_question.json new file mode 100644 index 0000000..944a253 --- /dev/null +++ b/crates/noxa-fetch/tests/fixtures/extractors/stackoverflow_question.json @@ -0,0 +1,19 @@ +{ + "items": [ + { + "question_id": 12345, + "title": "How to test Rust extractors?", + "body": "How?
", + "tags": ["rust", "testing"], + "score": 5, + "view_count": 100, + "answer_count": 1, + "is_answered": true, + "accepted_answer_id": 99, + "creation_date": 1767225600, + "last_activity_date": 1767229200, + "owner": { "display_name": "asker", "reputation": 10 }, + "link": "https://stackoverflow.com/questions/12345/how-to-test-rust" + } + ] +} diff --git a/crates/noxa-fetch/tests/fixtures/extractors/substack_post.html b/crates/noxa-fetch/tests/fixtures/extractors/substack_post.html new file mode 100644 index 0000000..28bed32 --- /dev/null +++ b/crates/noxa-fetch/tests/fixtures/extractors/substack_post.html @@ -0,0 +1,30 @@ + + + +Extractor parity needs explicit fixtures for broad content pages.
+Substack posts are intentionally explicit-only.
+