From b3940ea6913b76b9cf9981113d8c479ce0087c3e Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Fri, 24 Apr 2026 21:14:29 -0400 Subject: [PATCH 01/28] fix(cli): remove shell injection from watch on-change command --- Cargo.lock | 1 + crates/noxa-cli/Cargo.toml | 1 + crates/noxa-cli/src/app/tests_primary.rs | 20 +++++++++++++++++++- crates/noxa-cli/src/app/watch.rs | 18 ++++++++++++++---- 4 files changed, 35 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d64072e..2a4a6e7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2370,6 +2370,7 @@ dependencies = [ "reqwest", "serde", "serde_json", + "shlex", "strip-ansi-escapes", "tempfile", "tokio", diff --git a/crates/noxa-cli/Cargo.toml b/crates/noxa-cli/Cargo.toml index e6b65e2..0a7a03b 100644 --- a/crates/noxa-cli/Cargo.toml +++ b/crates/noxa-cli/Cargo.toml @@ -20,6 +20,7 @@ dotenvy = { workspace = true } rand = "0.8" serde_json = { workspace = true } serde = { workspace = true } +shlex = "1.3" tokio = { workspace = true } clap = { workspace = true } tracing = { workspace = true } diff --git a/crates/noxa-cli/src/app/tests_primary.rs b/crates/noxa-cli/src/app/tests_primary.rs index 076ef5f..269eba9 100644 --- a/crates/noxa-cli/src/app/tests_primary.rs +++ b/crates/noxa-cli/src/app/tests_primary.rs @@ -466,7 +466,7 @@ mod tests { let output_path = dir.path().join("payload.json"); let payload = r#"{"status":"changed"}"#; let quoted_output_path = output_path.to_string_lossy().replace('\'', "'\"'\"'"); - let cmd = format!("cat > '{quoted_output_path}'"); + let cmd = format!("tee '{quoted_output_path}'"); run_on_change_command(&cmd, payload, std::time::Duration::from_secs(1)) .await @@ -476,6 +476,24 @@ mod tests { assert_eq!(written, payload); } + #[cfg(unix)] + #[tokio::test] + async fn on_change_command_treats_shell_metacharacters_as_arguments() { + let dir = tempfile::tempdir().unwrap(); + let injected_path = dir.path().join("injected"); + let payload = "{}"; + let cmd = format!("printf ok ; touch {}", injected_path.display()); + + run_on_change_command(&cmd, payload, std::time::Duration::from_secs(1)) + .await + .expect("on-change command should succeed"); + + assert!( + !injected_path.exists(), + "metacharacters in --on-change must not be evaluated by a shell" + ); + } + #[cfg(unix)] #[tokio::test] async fn on_change_command_times_out_and_returns_promptly() { diff --git a/crates/noxa-cli/src/app/watch.rs b/crates/noxa-cli/src/app/watch.rs index ffe53b7..6774ce8 100644 --- a/crates/noxa-cli/src/app/watch.rs +++ b/crates/noxa-cli/src/app/watch.rs @@ -126,15 +126,25 @@ pub(crate) async fn run_watch( const WATCH_ON_CHANGE_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(30); +fn parse_on_change_command(cmd: &str) -> Result, String> { + let argv = shlex::split(cmd) + .ok_or_else(|| "failed to parse command: unterminated quote".to_string())?; + if argv.is_empty() { + return Err("failed to run command: command is empty".to_string()); + } + Ok(argv) +} + pub(crate) async fn run_on_change_command( cmd: &str, payload: &str, max_runtime: std::time::Duration, ) -> Result<(), String> { - let mut child = tokio::process::Command::new("sh") - .arg("-c") - .arg(cmd) - .stdin(std::process::Stdio::piped()) + let argv = parse_on_change_command(cmd)?; + let mut command = tokio::process::Command::new(&argv[0]); + command.args(&argv[1..]); + command.stdin(std::process::Stdio::piped()); + let mut child = command .spawn() .map_err(|e| format!("failed to run command: {e}"))?; From 04db34a47aab0b9ee962a3051761c5fee0b1e2aa Mon Sep 17 
00:00:00 2001 From: Jacob Magar Date: Fri, 24 Apr 2026 21:15:33 -0400 Subject: [PATCH 02/28] fix(core): prevent UTF-8 panic in content-position recovery --- crates/noxa-core/src/extractor/recovery.rs | 34 +++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/crates/noxa-core/src/extractor/recovery.rs b/crates/noxa-core/src/extractor/recovery.rs index 4a3aad1..44f83bf 100644 --- a/crates/noxa-core/src/extractor/recovery.rs +++ b/crates/noxa-core/src/extractor/recovery.rs @@ -466,13 +466,17 @@ fn strip_md_formatting(md: &str) -> String { /// Find `needle` in `markdown` only at a position that isn't inside image/link /// alt text (`![...](...)`). Returns the byte offset or None. fn find_content_position(markdown: &str, needle: &str) -> Option { + if needle.is_empty() { + return None; + } + let mut search_from = 0; while let Some(pos) = markdown[search_from..].find(needle) { let abs_pos = search_from + pos; if !is_inside_image_syntax(markdown, abs_pos) { return Some(abs_pos); } - search_from = abs_pos + 1; + search_from = abs_pos + needle.len(); } None } @@ -491,3 +495,31 @@ fn is_inside_image_syntax(markdown: &str, pos: usize) -> bool { } false } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn find_content_position_skips_rejected_multibyte_image_alt_match() { + let markdown = "![тест](https://example.com/image.png)\n\nтест"; + + let pos = find_content_position(markdown, "тест").expect("visible text should be found"); + + assert_eq!(pos, markdown.rfind("тест").unwrap()); + } + + #[test] + fn find_content_position_handles_repeated_rejected_non_ascii_matches() { + let markdown = concat!( + "![заголовок](https://example.com/one.png)\n", + "![заголовок](https://example.com/two.png)\n\n", + "заголовок" + ); + + let pos = + find_content_position(markdown, "заголовок").expect("visible text should be found"); + + assert_eq!(pos, markdown.rfind("заголовок").unwrap()); + } +} From 1723da4e1e78eb0ecb524a4a5bf9f5760bc5117f Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Fri, 24 Apr 2026 21:16:42 -0400 Subject: [PATCH 03/28] fix(core): recover JSON-LD with raw newline characters --- crates/noxa-core/src/structured_data.rs | 69 ++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/crates/noxa-core/src/structured_data.rs b/crates/noxa-core/src/structured_data.rs index 2ce41e8..1a25155 100644 --- a/crates/noxa-core/src/structured_data.rs +++ b/crates/noxa-core/src/structured_data.rs @@ -53,7 +53,7 @@ pub fn extract_json_ld(html: &str) -> Vec { } // Parse — some sites have arrays at top level - match serde_json::from_str::(json_str) { + match parse_json_ld_value(json_str) { Ok(Value::Array(arr)) => results.extend(arr), Ok(val) => results.push(val), Err(_) => {} @@ -63,6 +63,55 @@ pub fn extract_json_ld(html: &str) -> Vec { results } +fn parse_json_ld_value(json_str: &str) -> serde_json::Result { + match serde_json::from_str::(json_str) { + Ok(value) => Ok(value), + Err(original_err) => { + let Some(sanitized) = escape_raw_newlines_in_json_strings(json_str) else { + return Err(original_err); + }; + serde_json::from_str::(&sanitized) + } + } +} + +fn escape_raw_newlines_in_json_strings(input: &str) -> Option { + let mut out = String::with_capacity(input.len()); + let mut in_string = false; + let mut escape_next = false; + let mut changed = false; + + for ch in input.chars() { + if escape_next { + out.push(ch); + escape_next = false; + continue; + } + + match ch { + '\\' if in_string => { + out.push(ch); + escape_next = true; + } + '"' 
=> { + out.push(ch); + in_string = !in_string; + } + '\n' if in_string => { + out.push_str("\\n"); + changed = true; + } + '\r' if in_string => { + out.push_str("\\r"); + changed = true; + } + _ => out.push(ch), + } + } + + changed.then_some(out) +} + /// Extract `__NEXT_DATA__` from Next.js pages. /// /// Next.js embeds server-rendered page data in: @@ -365,6 +414,24 @@ mod tests { assert_eq!(results[0]["name"], "Test"); } + #[test] + fn recovers_json_ld_with_raw_newline_inside_string() { + let html = r#" + + "#; + + let results = extract_json_ld(html); + + assert_eq!(results.len(), 1); + assert_eq!(results[0]["headline"], "First line\nSecond line"); + } + #[test] fn empty_script_tag_skipped() { let html = r#" From cbfa4db779ac573944d56ec76c4ed776bbf1e26b Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Fri, 24 Apr 2026 21:20:14 -0400 Subject: [PATCH 04/28] fix(fetch): port upstream crawler and robots hardening --- crates/noxa-fetch/src/client/tests.rs | 34 ++++++++++ crates/noxa-fetch/src/crawler.rs | 89 ++++++++++++++++++++++++++- crates/noxa-fetch/src/sitemap.rs | 44 ++++++++++--- 3 files changed, 156 insertions(+), 11 deletions(-) diff --git a/crates/noxa-fetch/src/client/tests.rs b/crates/noxa-fetch/src/client/tests.rs index 05706dd..516d2bd 100644 --- a/crates/noxa-fetch/src/client/tests.rs +++ b/crates/noxa-fetch/src/client/tests.rs @@ -283,6 +283,40 @@ async fn spawn_status_server(status: u16, body: &'static str) -> String { format!("http://{addr}/") } +#[cfg(test)] +async fn spawn_raw_response_server(response: String) -> String { + use tokio::io::{AsyncReadExt, AsyncWriteExt}; + use tokio::net::TcpListener; + + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + + tokio::spawn(async move { + if let Ok((mut socket, _)) = listener.accept().await { + let mut buf = vec![0u8; 4096]; + let _ = tokio::time::timeout(std::time::Duration::from_secs(5), socket.read(&mut buf)) + .await; + let _ = socket.write_all(response.as_bytes()).await; + } + }); + + format!("http://{addr}/") +} + +#[tokio::test] +async fn fetch_rejects_oversized_html_response_from_content_length() { + let response = format!( + "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nContent-Length: {}\r\nConnection: close\r\n\r\n", + 5 * 1024 * 1024 + 1 + ); + let url = spawn_raw_response_server(response).await; + let client = FetchClient::new(FetchConfig::default()).unwrap(); + + let result = client.fetch(&url).await; + + assert!(matches!(result, Err(FetchError::Limit(_)))); +} + #[tokio::test] async fn fetch_rejects_retryable_status_after_exhaustion() { // fetch() had a latent bug: on the last retry attempt with a retryable diff --git a/crates/noxa-fetch/src/crawler.rs b/crates/noxa-fetch/src/crawler.rs index 626ed1b..1065709 100644 --- a/crates/noxa-fetch/src/crawler.rs +++ b/crates/noxa-fetch/src/crawler.rs @@ -21,6 +21,10 @@ use crate::client::{FetchClient, FetchConfig}; use crate::error::FetchError; use crate::sitemap; +const MAX_GLOB_PATTERN_LEN: usize = 512; +const MAX_GLOB_WILDCARDS: usize = 64; +const MAX_GLOBSTARS: usize = 4; + /// Controls how extracted page bodies are retained in the aggregate `CrawlResult`. 
/// /// On large crawls the `Vec` can hold full extraction payloads for every @@ -165,6 +169,9 @@ impl Crawler { let seed = Url::parse(seed_url).map_err(|_| FetchError::InvalidUrl(seed_url.into()))?; let seed_origin = origin_key(&seed); + validate_glob_patterns(&config.include_patterns)?; + validate_glob_patterns(&config.exclude_patterns)?; + let client = FetchClient::new(config.fetch.clone())?; Ok(Self { @@ -275,6 +282,9 @@ impl Crawler { Ok(entries) => { let before = frontier.len(); for entry in entries { + if frontier.len() >= self.config.max_pages { + break; + } if self.qualify_link(&entry.url, &visited).is_some() { let parsed = match Url::parse(&entry.url) { Ok(u) => u, @@ -333,8 +343,16 @@ impl Crawler { let extraction_options = Arc::clone(&extraction_options); handles.push(tokio::spawn(async move { - // Acquire permit — blocks if concurrency limit reached - let _permit = permit.acquire().await.expect("semaphore closed"); + // Acquire permit — blocks if concurrency limit reached. + let Ok(_permit) = permit.acquire().await else { + return PageResult { + url, + depth, + extraction: None, + error: Some("crawl semaphore closed before request".to_string()), + elapsed: Duration::ZERO, + }; + }; tokio::time::sleep(delay).await; let page_start = Instant::now(); @@ -374,6 +392,7 @@ impl Crawler { // Collect results and harvest links for the next depth level let mut next_frontier: Vec<(String, usize)> = Vec::new(); + let mut next_seen: HashSet = HashSet::new(); for handle in handles { let mut page = match handle.await { @@ -395,7 +414,13 @@ impl Crawler { { for link in &extraction.content.links { if let Some(candidate) = self.qualify_link(&link.href, &visited) { - next_frontier.push((candidate, depth + 1)); + let remaining_capacity = + self.config.max_pages.saturating_sub(pages.len()); + if next_frontier.len() < remaining_capacity + && next_seen.insert(candidate.clone()) + { + next_frontier.push((candidate, depth + 1)); + } } else if self.is_excluded_by_pattern(&link.href) { excluded += 1; } @@ -533,6 +558,34 @@ impl Crawler { } } +fn validate_glob_patterns(patterns: &[String]) -> Result<(), FetchError> { + for pattern in patterns { + if let Err(reason) = validate_glob_pattern(pattern) { + return Err(FetchError::Build(format!( + "invalid crawl glob pattern {pattern:?}: {reason}" + ))); + } + } + Ok(()) +} + +fn validate_glob_pattern(pattern: &str) -> Result<(), &'static str> { + if pattern.len() > MAX_GLOB_PATTERN_LEN { + return Err("too long"); + } + + let wildcard_count = pattern.bytes().filter(|b| *b == b'*' || *b == b'?').count(); + if wildcard_count > MAX_GLOB_WILDCARDS { + return Err("too many wildcards"); + } + + if pattern.matches("**").count() > MAX_GLOBSTARS { + return Err("too many recursive wildcards"); + } + + Ok(()) +} + /// Canonical origin string for comparing same-origin: "scheme://host[:port]". 
fn origin_key(url: &Url) -> String { let port_suffix = match url.port() { @@ -733,6 +786,36 @@ mod tests { assert!(!glob_match("/blog*", "/blog/post")); // * doesn't cross / } + #[test] + fn crawler_new_rejects_oversized_glob_patterns() { + let config = CrawlConfig { + include_patterns: vec![format!("/{}", "a".repeat(600))], + ..Default::default() + }; + + let Err(err) = Crawler::new("https://example.com/", config) else { + panic!("oversized glob pattern should be rejected"); + }; + + assert!(matches!(err, FetchError::Build(_))); + assert!(err.to_string().contains("glob pattern")); + } + + #[test] + fn crawler_new_rejects_backtracking_heavy_glob_patterns() { + let config = CrawlConfig { + exclude_patterns: vec!["/**/**/**/**/**/**/**/**/**/target".to_string()], + ..Default::default() + }; + + let Err(err) = Crawler::new("https://example.com/", config) else { + panic!("backtracking-heavy glob pattern should be rejected"); + }; + + assert!(matches!(err, FetchError::Build(_))); + assert!(err.to_string().contains("glob pattern")); + } + // -- BodyRetention tests -- /// Helper that builds a synthetic `PageResult` with non-empty content. diff --git a/crates/noxa-fetch/src/sitemap.rs b/crates/noxa-fetch/src/sitemap.rs index 3a5a3bf..2ea09a3 100644 --- a/crates/noxa-fetch/src/sitemap.rs +++ b/crates/noxa-fetch/src/sitemap.rs @@ -155,19 +155,28 @@ async fn fetch_sitemaps( pub fn parse_robots_txt(text: &str) -> Vec { text.lines() .filter_map(|line| { - let trimmed = line.trim(); - // Case-insensitive match for "Sitemap:" prefix - if trimmed.len() > 8 && trimmed[..8].eq_ignore_ascii_case("sitemap:") { - let url = trimmed[8..].trim(); - if !url.is_empty() { - return Some(url.to_string()); - } + let (directive, value) = line.split_once(':')?; + if !directive.trim().eq_ignore_ascii_case("sitemap") { + return None; + } + + let url = value.split('#').next().unwrap_or("").trim(); + if is_plausible_sitemap_url(url) { + Some(url.to_string()) + } else { + None } - None }) .collect() } +fn is_plausible_sitemap_url(value: &str) -> bool { + let Ok(url) = url::Url::parse(value) else { + return false; + }; + matches!(url.scheme(), "http" | "https") && url.host_str().is_some() +} + /// Parse a sitemap XML string. Handles both `` and ``. /// Returns entries from urlsets and recursion targets from indexes. 
pub fn parse_sitemap_xml(xml: &str) -> Vec { @@ -474,6 +483,25 @@ mod tests { assert_eq!(urls[0], "https://example.com/s.xml"); } + #[test] + fn test_parse_robots_txt_handles_spacing_comments_and_invalid_values() { + let robots = "Sitemap : https://example.com/spaced.xml\n\ + Sitemap: https://example.com/inline.xml # primary sitemap\n\ + Sitemap: not-a-url\n\ + Sitemap: ftp://example.com/not-web.xml\n\ + Sitemap:\n"; + + let urls = parse_robots_txt(robots); + + assert_eq!( + urls, + vec![ + "https://example.com/spaced.xml".to_string(), + "https://example.com/inline.xml".to_string(), + ] + ); + } + #[test] fn test_deduplicate() { // parse_sitemap_xml deduplicates via the discover() path, but From 8b7635374052c9757fa828b7588a80555a1c7260 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Fri, 24 Apr 2026 21:22:02 -0400 Subject: [PATCH 05/28] fix(fetch): improve reddit fallback and verify-wall handling --- crates/noxa-fetch/src/client/fetch.rs | 12 ++++++- crates/noxa-fetch/src/reddit.rs | 50 +++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/crates/noxa-fetch/src/client/fetch.rs b/crates/noxa-fetch/src/client/fetch.rs index f052b06..fa5a4c5 100644 --- a/crates/noxa-fetch/src/client/fetch.rs +++ b/crates/noxa-fetch/src/client/fetch.rs @@ -122,10 +122,20 @@ impl FetchClient { debug!("reddit detected, fetching {json_url}"); let client = self.pick_client(url); - let resp = client.get(&json_url).send().await?; + let resp = client + .get(&json_url) + .header("User-Agent", crate::reddit::json_api_user_agent()) + .header("Accept", "application/json") + .send() + .await?; let response = Response::from_wreq(resp).await?; if response.is_success() { let bytes = response.body(); + if crate::reddit::is_reddit_verify_wall_html(bytes) { + return Err(FetchError::BodyDecode( + "reddit json endpoint returned verification page".to_string(), + )); + } match crate::reddit::parse_reddit_json(bytes, url) { Ok(result) => return Ok(result), Err(error) => { diff --git a/crates/noxa-fetch/src/reddit.rs b/crates/noxa-fetch/src/reddit.rs index 63b39ab..1fb39ca 100644 --- a/crates/noxa-fetch/src/reddit.rs +++ b/crates/noxa-fetch/src/reddit.rs @@ -7,6 +7,12 @@ use noxa_core::{Content, ExtractionResult, Metadata}; use serde::Deserialize; use tracing::debug; +const JSON_API_USER_AGENT: &str = "noxa bot/0.7 (+https://github.com/jmagar/noxa)"; + +pub fn json_api_user_agent() -> &'static str { + JSON_API_USER_AGENT +} + /// Check if a URL points to a Reddit post/comment page. pub fn is_reddit_url(url: &str) -> bool { let host = url @@ -30,6 +36,10 @@ pub fn json_url(url: &str) -> String { /// Convert Reddit JSON API response into an ExtractionResult. pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result { + if is_reddit_verify_wall_html(json_bytes) { + return Err("reddit verification page returned from json endpoint".to_string()); + } + let listings: Vec = serde_json::from_slice(json_bytes).map_err(|e| format!("reddit json parse: {e}"))?; @@ -117,6 +127,18 @@ pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result bool { + let text = String::from_utf8_lossy(bytes); + let lower = text.to_ascii_lowercase(); + + (lower.contains(" + + Reddit - Dive into anything + +

+            <h1>Whoa there, pardner!</h1>
+            <p>We need to make sure you're not a robot.</p>
+ + "#; + + let err = parse_reddit_json( + html, + "https://www.reddit.com/r/rust/comments/abc123/release_thread/", + ) + .expect_err("verification HTML should not be treated as generic JSON parse failure"); + + assert!(err.contains("verification"), "unexpected error: {err}"); + } + + #[test] + fn reddit_json_user_agent_identifies_bot_contact() { + let ua = json_api_user_agent(); + + assert!(ua.contains("noxa")); + assert!(ua.contains("bot")); + } } From b2e0af03ec24956e30ca678f42837eb22c414786 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sat, 25 Apr 2026 23:07:23 -0400 Subject: [PATCH 06/28] docs: specify full extractor parity port --- ...6-full-upstream-extractor-parity-design.md | 215 ++++++++++++++++++ 1 file changed, 215 insertions(+) create mode 100644 docs/superpowers/specs/2026-04-26-full-upstream-extractor-parity-design.md diff --git a/docs/superpowers/specs/2026-04-26-full-upstream-extractor-parity-design.md b/docs/superpowers/specs/2026-04-26-full-upstream-extractor-parity-design.md new file mode 100644 index 0000000..0079dcc --- /dev/null +++ b/docs/superpowers/specs/2026-04-26-full-upstream-extractor-parity-design.md @@ -0,0 +1,215 @@ +# Full Upstream Extractor Parity Design + +## Goal + +Port the full upstream `webclaw-fetch` vertical extractor catalog into Noxa while preserving the existing generic scrape, batch, crawl, CLI, and MCP behavior by default. + +Full parity means all 28 upstream vertical extractors: + +- `amazon_product` +- `arxiv` +- `crates_io` +- `dev_to` +- `docker_hub` +- `ebay_listing` +- `ecommerce_product` +- `etsy_listing` +- `github_issue` +- `github_pr` +- `github_release` +- `github_repo` +- `hackernews` +- `huggingface_dataset` +- `huggingface_model` +- `instagram_post` +- `instagram_profile` +- `linkedin_post` +- `npm` +- `pypi` +- `reddit` +- `shopify_collection` +- `shopify_product` +- `stackoverflow` +- `substack_post` +- `trustpilot_reviews` +- `woocommerce_product` +- `youtube_video` + +## Non-Goals + +- Do not replace Noxa's generic content extractor. +- Do not make brittle broad matchers steal URLs from normal scraping. +- Do not add live-network tests. Site behavior and rate limits are too unstable for deterministic CI. +- Do not require API keys for baseline local extractor behavior unless an upstream extractor already depends on an external service. + +## Current State + +Noxa has generic extraction in `crates/noxa-core/src/extractor.rs` and fetch orchestration in `crates/noxa-fetch/src/client/fetch.rs`. It has site-specific special cases for Reddit and LinkedIn in `crates/noxa-fetch/src/reddit.rs` and `crates/noxa-fetch/src/linkedin.rs`, but it does not have upstream's `extractors/` catalog, catalog listing, explicit extractor dispatch, or typed vertical JSON output. + +`noxa_core::ExtractionResult` currently contains: + +- `metadata` +- `content` +- `domain_data: Option` +- `structured_data` + +`DomainData` only stores a `DomainType`, so it is not sufficient for full vertical extractor payloads. + +## Architecture + +Add a focused vertical extractor layer in `noxa-fetch`: + +- `crates/noxa-fetch/src/extractors/mod.rs` owns catalog listing, auto-dispatch, explicit name dispatch, and dispatch errors. +- Each upstream extractor gets a dedicated file under `crates/noxa-fetch/src/extractors/`. +- Extractors expose `INFO`, `matches(url)`, and `extract(client, url) -> serde_json::Value` following upstream's shape. +- Shared helpers live in small modules under `extractors/` only if duplication becomes concrete during porting. 
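As a concrete sketch of that per-extractor shape (the `ExtractorInfo` fields, the `ExtractorHttp` trait, and the API path below are assumptions about the eventual surface, not the final API), a module could look like:

```rust
// Hypothetical extractor module, e.g. crates/noxa-fetch/src/extractors/crates_io.rs.
// `ExtractorInfo`, `ExtractorHttp`, and `FetchError` are assumed to come from
// extractors/mod.rs, extractors/http.rs, and the existing error module.
use serde_json::{Value, json};

pub const INFO: ExtractorInfo = ExtractorInfo {
    name: "crates_io",
    description: "crates.io package metadata via the public API",
};

/// Narrow matcher: only claim crate detail pages, never the whole site.
pub fn matches(url: &str) -> bool {
    url::Url::parse(url)
        .ok()
        .filter(|u| u.host_str() == Some("crates.io"))
        .is_some_and(|u| u.path().starts_with("/crates/"))
}

pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result<Value, FetchError> {
    // Naive path parsing is enough for the sketch; the port should reuse a shared helper.
    let name = url.trim_end_matches('/').rsplit('/').next().unwrap_or_default();
    let api = client
        .get_json(&format!("https://crates.io/api/v1/crates/{name}"))
        .await?;
    Ok(json!({
        "url": url,
        "name": api["crate"]["name"],
        "max_version": api["crate"]["max_version"],
        "downloads": api["crate"]["downloads"],
    }))
}
```

Keeping `matches` cheap and purely URL-based is what makes it safe to probe every extractor during normal scraping.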
+ +Keep the dispatcher static rather than dynamic. A static chain matches upstream, keeps ordering explicit, and makes broad-match exclusions easy to audit. + +## Output Model + +Add an additive field to `noxa_core::ExtractionResult`: + +```rust +#[serde(default, skip_serializing_if = "Option::is_none")] +pub vertical_data: Option, +``` + +Where: + +```rust +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VerticalData { + pub extractor: String, + pub data: serde_json::Value, +} +``` + +This preserves backward compatibility: existing JSON consumers keep seeing the same fields, and vertical results appear only when a vertical extractor is selected or auto-detected. + +For a vertical hit, Noxa still returns a normal `ExtractionResult`. The generic `metadata` should include the URL, title/description where known, `fetched_at`, and a compact markdown/plain-text summary when the extractor has enough fields to produce one. The complete typed payload lives in `vertical_data.data`. + +## Dispatch Behavior + +Support two modes: + +- Auto mode: the caller uses normal scraping and Noxa tries safe extractors before generic HTML extraction. +- Explicit mode: the caller chooses an extractor by name; Noxa validates `matches(url)` and returns a clear mismatch error if the URL does not belong to that extractor. + +Auto-dispatch should include upstream's safe/narrow matchers: + +- `reddit` +- `hackernews` +- `github_repo` +- `github_pr` +- `github_issue` +- `github_release` +- `pypi` +- `npm` +- `crates_io` +- `huggingface_model` +- `huggingface_dataset` +- `arxiv` +- `docker_hub` +- `dev_to` +- `stackoverflow` +- `linkedin_post` +- `instagram_post` +- `instagram_profile` +- `amazon_product` +- `ebay_listing` +- `etsy_listing` +- `trustpilot_reviews` +- `youtube_video` + +Explicit-only extractors are broad or ambiguous and must not hijack generic scraping: + +- `shopify_product` +- `shopify_collection` +- `ecommerce_product` +- `woocommerce_product` +- `substack_post` + +## Fetch Integration + +Add public methods on `FetchClient`: + +- `list_extractors() -> Vec` +- `fetch_and_extract_vertical(url, extractor_name, options) -> Result` +- Internal auto-dispatch hook in `fetch_and_extract_inner` before generic HTML extraction. + +Use the existing fetch client abstraction and response caps. Extractors that call JSON APIs should use the same client configuration, proxy/cookie behavior where applicable, timeout behavior, and error types. + +Reddit should be reconciled with the existing `crates/noxa-fetch/src/reddit.rs` implementation rather than duplicated blindly. The current hardened verification-wall behavior must remain. + +LinkedIn should be reconciled with the existing fallback in `crates/noxa-fetch/src/linkedin.rs`. If upstream `linkedin_post` covers a different output shape, keep old fallback behavior available through generic scraping and expose the upstream vertical shape through `vertical_data`. + +## CLI Surface + +Add: + +- `--extractor ` for explicit vertical extraction. +- `--list-extractors` to print extractor catalog. + +Default `noxa ` remains generic scrape with safe auto-detect. `--extractor` is valid for single URL and batch. Crawl should continue using generic extraction plus safe auto-detect only; explicit vertical extraction across crawl is out of scope unless a future use case requires it. + +JSON output includes `vertical_data`. Markdown/text output prints the vertical summary when available, falling back to generic content output. 
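Both the CLI and the fetch layer lean on the same explicit-mode guard. A minimal sketch, assuming a catalog entry that carries a matcher function pointer (the error variants follow the Error Handling section below; the entry shape itself is an assumption):

```rust
use thiserror::Error;

/// Catalog entry as the dispatcher sees it; `matches` is the per-module matcher.
pub struct ExtractorEntry {
    pub name: &'static str,
    pub matches: fn(&str) -> bool,
    pub auto_dispatch: bool,
}

#[derive(Debug, Error)]
pub enum ExtractorDispatchError {
    #[error("unknown extractor: {0}")]
    UnknownVertical(String),
    #[error("extractor {vertical} does not match url {url}")]
    UrlMismatch { vertical: String, url: String },
}

/// Explicit mode: the caller names an extractor; reject URL mismatches up front.
pub fn dispatch_by_name<'a>(
    catalog: &'a [ExtractorEntry],
    name: &str,
    url: &str,
) -> Result<&'a ExtractorEntry, ExtractorDispatchError> {
    let entry = catalog
        .iter()
        .find(|e| e.name == name)
        .ok_or_else(|| ExtractorDispatchError::UnknownVertical(name.to_string()))?;
    if !(entry.matches)(url) {
        return Err(ExtractorDispatchError::UrlMismatch {
            vertical: name.to_string(),
            url: url.to_string(),
        });
    }
    Ok(entry)
}

/// Auto mode: only safe/narrow matchers may claim a URL; broad extractors stay explicit-only.
pub fn dispatch_by_url<'a>(catalog: &'a [ExtractorEntry], url: &str) -> Option<&'a ExtractorEntry> {
    catalog.iter().find(|e| e.auto_dispatch && (e.matches)(url))
}
```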
+ +## MCP Surface + +Extend `scrape` params with: + +- `extractor: Option` + +Add an extractor catalog tool: + +- `extractors()` returns the same catalog as CLI `--list-extractors`. + +The existing `scrape` tool keeps current behavior when `extractor` is absent. If `extractor` is present, the MCP tool uses explicit dispatch and returns a readable error for unknown extractors or URL mismatches. + +## Testing + +Use TDD with fixture-backed unit tests. + +Required coverage: + +- Catalog contains all 28 extractors and unique names. +- Auto-dispatch includes only safe/narrow matchers. +- Explicit dispatch accepts every extractor by name. +- Explicit dispatch returns `UnknownVertical` for invalid names. +- Explicit dispatch returns `UrlMismatch` for wrong URL/extractor combinations. +- Each extractor has matcher tests for positive and negative URL examples. +- Each extractor has fixture parse tests using mocked responses. +- Existing Reddit verification-wall tests continue passing. +- CLI parser tests cover `--extractor` and `--list-extractors`. +- MCP schema/tests cover optional `extractor` and catalog tool. +- Workspace tests pass. + +Do not add live tests against GitHub, npm, PyPI, Amazon, Instagram, or any other public site. + +## Error Handling + +Use typed dispatch errors internally: + +- `UnknownVertical(String)` +- `UrlMismatch { vertical, url }` +- `Fetch(FetchError)` + +Map these to user-facing CLI/MCP messages without panics. + +Extractors should prefer partial structured output over failure when optional fields are absent, but fail when the core resource identity cannot be parsed. + +Anti-bot pages, verification walls, and blocked responses should produce actionable errors rather than returning the block page as content. + +## Implementation Strategy + +Implement in batches while keeping the target scope full parity: + +1. Add output model, catalog, dispatcher, and tests with placeholder-free integration. +2. Port low-risk API-backed extractors. +3. Port social/content extractors and reconcile Reddit/LinkedIn. +4. Port ecommerce/review extractors and broad explicit-only matchers. +5. Add CLI/MCP exposure. +6. Run full workspace verification and commit each coherent batch. + +The implementation is complete only when all 28 upstream extractors are present, exposed in the catalog, covered by tests, and wired through explicit dispatch. From be60533839e5fb83fbb846ce2174ff492df6f1c9 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sat, 25 Apr 2026 23:09:58 -0400 Subject: [PATCH 07/28] docs: plan full extractor parity port --- ...-26-full-upstream-extractor-parity-plan.md | 599 ++++++++++++++++++ 1 file changed, 599 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-26-full-upstream-extractor-parity-plan.md diff --git a/docs/superpowers/plans/2026-04-26-full-upstream-extractor-parity-plan.md b/docs/superpowers/plans/2026-04-26-full-upstream-extractor-parity-plan.md new file mode 100644 index 0000000..a6f81fd --- /dev/null +++ b/docs/superpowers/plans/2026-04-26-full-upstream-extractor-parity-plan.md @@ -0,0 +1,599 @@ +# Full Upstream Extractor Parity Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Port all 28 upstream `webclaw-fetch` vertical extractors into Noxa with catalog, dispatch, CLI, MCP, and fixture-backed tests. 
+ +**Architecture:** Add an additive `vertical_data` payload to `noxa_core::ExtractionResult`, then add `noxa-fetch::extractors` as a static catalog/dispatcher with one file per upstream extractor. Wire safe auto-dispatch into normal scraping and explicit extractor selection into CLI/MCP without changing default generic extraction semantics. + +**Tech Stack:** Rust 2024, `serde`, `serde_json`, `thiserror`, `url`, `regex`, `wreq`, existing Noxa fetch/core/CLI/MCP crates, fixture-backed unit tests. + +--- + +## File Structure + +- Modify `crates/noxa-core/src/types.rs`: add `VerticalData` and `ExtractionResult::vertical_data`. +- Modify `crates/noxa-core/src/lib.rs` and tests: construct `vertical_data: None` in existing fixtures/results. +- Create `crates/noxa-fetch/src/extractors/mod.rs`: catalog, `ExtractorInfo`, `VerticalDataBuilder`, dispatch, `ExtractorDispatchError`, fixture-test helpers. +- Create `crates/noxa-fetch/src/extractors/http.rs`: small extractor fetch abstraction and `FetchClient` adapter for JSON/HTML calls. +- Create `crates/noxa-fetch/src/extractors/summary.rs`: helpers for turning vertical JSON into markdown/plain text summaries. +- Create one extractor file per upstream vertical: + `amazon_product.rs`, `arxiv.rs`, `crates_io.rs`, `dev_to.rs`, `docker_hub.rs`, `ebay_listing.rs`, `ecommerce_product.rs`, `etsy_listing.rs`, `github_issue.rs`, `github_pr.rs`, `github_release.rs`, `github_repo.rs`, `hackernews.rs`, `huggingface_dataset.rs`, `huggingface_model.rs`, `instagram_post.rs`, `instagram_profile.rs`, `linkedin_post.rs`, `npm.rs`, `pypi.rs`, `reddit.rs`, `shopify_collection.rs`, `shopify_product.rs`, `stackoverflow.rs`, `substack_post.rs`, `trustpilot_reviews.rs`, `woocommerce_product.rs`, `youtube_video.rs`. +- Create `crates/noxa-fetch/tests/fixtures/extractors/`: JSON/HTML fixtures for all 28 extractors. +- Modify `crates/noxa-fetch/src/lib.rs`: export `extractors` catalog types. +- Modify `crates/noxa-fetch/src/error.rs`: add conversion or variant for extractor dispatch failures. +- Modify `crates/noxa-fetch/src/client/fetch.rs`: auto-dispatch before generic HTML extraction and add explicit vertical method. +- Modify `crates/noxa-fetch/src/client/batch.rs`: add optional explicit extractor path for batch. +- Modify `crates/noxa-cli/src/app/cli.rs`: add `--extractor` and `--list-extractors`. +- Modify `crates/noxa-cli/src/app/entry.rs`: handle list mode before input validation. +- Modify `crates/noxa-cli/src/app/fetching/extract.rs` and batch path: call explicit vertical extraction when requested. +- Modify `crates/noxa-cli/src/app/printing.rs`: print catalog and vertical summaries. +- Modify `crates/noxa-mcp/src/tools.rs`: add `extractor` to `ScrapeParams`. +- Modify `crates/noxa-mcp/src/server.rs` and/or `server/content_tools.rs`: add `extractors` tool and explicit scrape dispatch. +- Modify `crates/noxa-fetch/Cargo.toml`: add dependencies needed by ported extractor code, expected `regex = "1"` and possibly `reqwest` only if upstream API code cannot reuse `wreq`. 
+ +## Task 1: Add Vertical Output Model + +**Files:** +- Modify: `crates/noxa-core/src/types.rs` +- Modify: `crates/noxa-core/src/lib.rs` +- Modify: any test fixture constructors that fail compilation after adding the field + +- [ ] **Step 1: Write failing serialization test** + +Add a test in `crates/noxa-core/src/lib.rs` or a nearby test module: + +```rust +#[test] +fn extraction_result_serializes_vertical_data_when_present() { + let mut result = extract("Hello").unwrap(); + result.vertical_data = Some(VerticalData { + extractor: "github_repo".to_string(), + data: serde_json::json!({ "repo": "noxa" }), + }); + + let json = serde_json::to_value(&result).unwrap(); + assert_eq!(json["vertical_data"]["extractor"], "github_repo"); + assert_eq!(json["vertical_data"]["data"]["repo"], "noxa"); +} +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cargo test -p noxa-core extraction_result_serializes_vertical_data_when_present -- --nocapture` + +Expected: compile failure because `VerticalData`/`vertical_data` does not exist. + +- [ ] **Step 3: Implement model** + +Add to `crates/noxa-core/src/types.rs`: + +```rust +#[serde(default, skip_serializing_if = "Option::is_none")] +pub vertical_data: Option, + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VerticalData { + pub extractor: String, + pub data: serde_json::Value, +} +``` + +Export `VerticalData` from `crates/noxa-core/src/lib.rs`. + +- [ ] **Step 4: Fix constructors** + +Add `vertical_data: None` to every `ExtractionResult` literal that fails compilation. + +- [ ] **Step 5: Verify** + +Run: `cargo test -p noxa-core extraction_result_serializes_vertical_data_when_present -- --nocapture` + +Expected: PASS. + +- [ ] **Step 6: Commit** + +Run: + +```bash +git add crates/noxa-core/src/types.rs crates/noxa-core/src/lib.rs +git commit -m "feat(core): add vertical extractor payload" +``` + +## Task 2: Add Extractor Catalog and Dispatch Skeleton + +**Files:** +- Create: `crates/noxa-fetch/src/extractors/mod.rs` +- Create: `crates/noxa-fetch/src/extractors/http.rs` +- Create: `crates/noxa-fetch/src/extractors/summary.rs` +- Modify: `crates/noxa-fetch/src/lib.rs` +- Modify: `crates/noxa-fetch/src/error.rs` +- Modify: `crates/noxa-fetch/Cargo.toml` + +- [ ] **Step 1: Write catalog tests** + +Add tests in `extractors/mod.rs` for: + +```rust +#[test] +fn list_contains_all_upstream_extractors() { + let names: Vec<_> = list().iter().map(|info| info.name).collect(); + assert_eq!(names.len(), 28); + assert!(names.contains(&"amazon_product")); + assert!(names.contains(&"youtube_video")); +} + +#[test] +fn list_names_are_unique() { + let mut names: Vec<_> = list().iter().map(|info| info.name).collect(); + names.sort(); + let before = names.len(); + names.dedup(); + assert_eq!(before, names.len()); +} +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cargo test -p noxa-fetch extractors::tests::list_contains_all_upstream_extractors -- --nocapture` + +Expected: compile failure because `extractors` does not exist. + +- [ ] **Step 3: Implement skeleton** + +Create `ExtractorInfo`, `list()`, `dispatch_by_url`, `dispatch_by_name`, and `ExtractorDispatchError`. Add every upstream extractor name to the catalog. Initially, modules may expose only `INFO`, `matches`, and parse stubs that return `FetchError::Build("extractor not implemented: ")`; do not ship this state beyond the skeleton commit. 
+ +- [ ] **Step 4: Add fetch abstraction** + +Create a small trait in `extractors/http.rs` for extractor tests: + +```rust +#[async_trait::async_trait] +pub trait ExtractorHttp { + async fn get_text(&self, url: &str) -> Result; + async fn get_json(&self, url: &str) -> Result; +} +``` + +Implement it for `FetchClient` using existing `fetch()` and response limits. + +- [ ] **Step 5: Verify** + +Run: `cargo test -p noxa-fetch extractors::tests -- --nocapture` + +Expected: catalog tests pass. + +- [ ] **Step 6: Commit** + +Run: + +```bash +git add crates/noxa-fetch/src/extractors crates/noxa-fetch/src/lib.rs crates/noxa-fetch/src/error.rs crates/noxa-fetch/Cargo.toml +git commit -m "feat(fetch): add vertical extractor catalog" +``` + +## Task 3: Port API-Backed Developer/Package Extractors + +**Files:** +- Modify: `crates/noxa-fetch/src/extractors/github_repo.rs` +- Modify: `crates/noxa-fetch/src/extractors/github_pr.rs` +- Modify: `crates/noxa-fetch/src/extractors/github_issue.rs` +- Modify: `crates/noxa-fetch/src/extractors/github_release.rs` +- Modify: `crates/noxa-fetch/src/extractors/pypi.rs` +- Modify: `crates/noxa-fetch/src/extractors/npm.rs` +- Modify: `crates/noxa-fetch/src/extractors/crates_io.rs` +- Modify: `crates/noxa-fetch/src/extractors/docker_hub.rs` +- Add fixtures under: `crates/noxa-fetch/tests/fixtures/extractors/` + +- [ ] **Step 1: Write matcher tests for this batch** + +For each extractor, add positive and negative URL examples. Include GitHub ordering tests so repo URLs do not preempt issue/PR/release URLs. + +- [ ] **Step 2: Write fixture parse tests** + +Use a mock `ExtractorHttp` that maps expected API URLs to fixture JSON. Assert stable fields such as repo name, package name, version, stars/downloads, title, and URL. + +- [ ] **Step 3: Run tests to verify failure** + +Run: `cargo test -p noxa-fetch github_repo pypi npm crates_io docker_hub -- --nocapture` + +Expected: failures from unimplemented extractors. + +- [ ] **Step 4: Port upstream implementations** + +Use upstream extractor files as the behavioral source, but adapt crate names and fetch calls to Noxa's `ExtractorHttp`. Keep returned JSON field names compatible with upstream unless there is a Noxa-specific conflict. + +- [ ] **Step 5: Verify** + +Run: `cargo test -p noxa-fetch github_ pypi npm crates_io docker_hub -- --nocapture` + +Expected: PASS. + +- [ ] **Step 6: Commit** + +Run: + +```bash +git add crates/noxa-fetch/src/extractors crates/noxa-fetch/tests/fixtures/extractors +git commit -m "feat(fetch): port developer package extractors" +``` + +## Task 4: Port Research/Community Content Extractors + +**Files:** +- Modify: `crates/noxa-fetch/src/extractors/arxiv.rs` +- Modify: `crates/noxa-fetch/src/extractors/hackernews.rs` +- Modify: `crates/noxa-fetch/src/extractors/dev_to.rs` +- Modify: `crates/noxa-fetch/src/extractors/stackoverflow.rs` +- Modify: `crates/noxa-fetch/src/extractors/youtube_video.rs` +- Add fixtures under: `crates/noxa-fetch/tests/fixtures/extractors/` + +- [ ] **Step 1: Write matcher and fixture tests** + +Cover canonical URL forms: + +- `https://arxiv.org/abs/` +- `https://news.ycombinator.com/item?id=` +- `https://dev.to//` +- `https://stackoverflow.com/questions//` +- `https://www.youtube.com/watch?v=` and `https://youtu.be/` + +- [ ] **Step 2: Run tests to verify failure** + +Run: `cargo test -p noxa-fetch arxiv hackernews dev_to stackoverflow youtube_video -- --nocapture` + +Expected: failures from unimplemented extractors. 
+ +- [ ] **Step 3: Port implementations** + +Prefer upstream API endpoints where present. Keep HTML parsing fixture-driven and avoid live requests. + +- [ ] **Step 4: Verify** + +Run: `cargo test -p noxa-fetch arxiv hackernews dev_to stackoverflow youtube_video -- --nocapture` + +Expected: PASS. + +- [ ] **Step 5: Commit** + +Run: + +```bash +git add crates/noxa-fetch/src/extractors crates/noxa-fetch/tests/fixtures/extractors +git commit -m "feat(fetch): port research and community extractors" +``` + +## Task 5: Port HuggingFace and Social Extractors + +**Files:** +- Modify: `crates/noxa-fetch/src/extractors/huggingface_model.rs` +- Modify: `crates/noxa-fetch/src/extractors/huggingface_dataset.rs` +- Modify: `crates/noxa-fetch/src/extractors/instagram_post.rs` +- Modify: `crates/noxa-fetch/src/extractors/instagram_profile.rs` +- Modify: `crates/noxa-fetch/src/extractors/linkedin_post.rs` +- Modify: `crates/noxa-fetch/src/linkedin.rs` only if reconciliation is required +- Add fixtures under: `crates/noxa-fetch/tests/fixtures/extractors/` + +- [ ] **Step 1: Write matcher and fixture tests** + +Assert HuggingFace model/dataset disambiguation and Instagram profile/post disambiguation. + +- [ ] **Step 2: Run tests to verify failure** + +Run: `cargo test -p noxa-fetch huggingface instagram linkedin_post -- --nocapture` + +Expected: failures from unimplemented extractors. + +- [ ] **Step 3: Port implementations** + +Keep the existing LinkedIn generic fallback intact. `linkedin_post` should populate `vertical_data`; existing generic LinkedIn extraction remains a fallback for normal content. + +- [ ] **Step 4: Verify** + +Run: `cargo test -p noxa-fetch huggingface instagram linkedin_post -- --nocapture` + +Expected: PASS. + +- [ ] **Step 5: Commit** + +Run: + +```bash +git add crates/noxa-fetch/src/extractors crates/noxa-fetch/src/linkedin.rs crates/noxa-fetch/tests/fixtures/extractors +git commit -m "feat(fetch): port huggingface and social extractors" +``` + +## Task 6: Reconcile and Port Reddit Extractor + +**Files:** +- Modify: `crates/noxa-fetch/src/extractors/reddit.rs` +- Modify: `crates/noxa-fetch/src/reddit.rs` +- Modify: `crates/noxa-fetch/src/client/fetch.rs` +- Add fixtures under: `crates/noxa-fetch/tests/fixtures/extractors/` + +- [ ] **Step 1: Write parity tests** + +Test that Reddit vertical extraction uses the hardened JSON endpoint behavior and that verification-wall HTML still fails with a clear error. + +- [ ] **Step 2: Run tests to verify failure or current mismatch** + +Run: `cargo test -p noxa-fetch reddit -- --nocapture` + +Expected: new vertical tests fail until dispatcher integration is complete; existing hardening tests must continue passing. + +- [ ] **Step 3: Implement reconciliation** + +Avoid duplicate Reddit parsing logic where practical. Either make `extractors/reddit.rs` wrap the hardened parser from `reddit.rs`, or move shared parsing helpers into a private shared module. + +- [ ] **Step 4: Verify** + +Run: `cargo test -p noxa-fetch reddit -- --nocapture` + +Expected: PASS. 
+ +- [ ] **Step 5: Commit** + +Run: + +```bash +git add crates/noxa-fetch/src/extractors/reddit.rs crates/noxa-fetch/src/reddit.rs crates/noxa-fetch/src/client/fetch.rs crates/noxa-fetch/tests/fixtures/extractors +git commit -m "feat(fetch): expose reddit vertical extractor" +``` + +## Task 7: Port Ecommerce and Review Extractors + +**Files:** +- Modify: `crates/noxa-fetch/src/extractors/amazon_product.rs` +- Modify: `crates/noxa-fetch/src/extractors/ebay_listing.rs` +- Modify: `crates/noxa-fetch/src/extractors/ecommerce_product.rs` +- Modify: `crates/noxa-fetch/src/extractors/etsy_listing.rs` +- Modify: `crates/noxa-fetch/src/extractors/shopify_collection.rs` +- Modify: `crates/noxa-fetch/src/extractors/shopify_product.rs` +- Modify: `crates/noxa-fetch/src/extractors/trustpilot_reviews.rs` +- Modify: `crates/noxa-fetch/src/extractors/woocommerce_product.rs` +- Add fixtures under: `crates/noxa-fetch/tests/fixtures/extractors/` + +- [ ] **Step 1: Write matcher and broad-dispatch tests** + +Assert: + +- Amazon/eBay/Etsy/Trustpilot are eligible for auto-dispatch. +- Shopify/ecommerce/WooCommerce broad matchers work in explicit mode. +- Shopify/ecommerce/WooCommerce are not claimed by auto-dispatch. + +- [ ] **Step 2: Run tests to verify failure** + +Run: `cargo test -p noxa-fetch amazon_product ebay_listing ecommerce_product etsy_listing shopify trustpilot woocommerce -- --nocapture` + +Expected: failures from unimplemented extractors. + +- [ ] **Step 3: Port implementations** + +Preserve upstream anti-bot handling where present. Block/verification pages must produce errors, not vertical payloads. + +- [ ] **Step 4: Verify** + +Run: `cargo test -p noxa-fetch amazon_product ebay_listing ecommerce_product etsy_listing shopify trustpilot woocommerce -- --nocapture` + +Expected: PASS. + +- [ ] **Step 5: Commit** + +Run: + +```bash +git add crates/noxa-fetch/src/extractors crates/noxa-fetch/tests/fixtures/extractors +git commit -m "feat(fetch): port ecommerce vertical extractors" +``` + +## Task 8: Integrate Auto and Explicit Fetch Dispatch + +**Files:** +- Modify: `crates/noxa-fetch/src/client/fetch.rs` +- Modify: `crates/noxa-fetch/src/client/batch.rs` +- Modify: `crates/noxa-fetch/src/extractors/mod.rs` +- Modify: `crates/noxa-fetch/src/error.rs` + +- [ ] **Step 1: Write integration tests** + +Use a fixture/mock HTTP adapter to assert: + +- `fetch_and_extract_with_options()` auto-detects a safe vertical and sets `vertical_data`. +- A broad explicit-only URL still goes through generic extraction in auto mode. +- `fetch_and_extract_vertical()` succeeds for matching URL/name. +- `fetch_and_extract_vertical()` fails clearly for mismatch. + +- [ ] **Step 2: Run tests to verify failure** + +Run: `cargo test -p noxa-fetch vertical_dispatch -- --nocapture` + +Expected: compile failure or failing assertions until integration exists. + +- [ ] **Step 3: Implement integration** + +Add explicit method: + +```rust +pub async fn fetch_and_extract_vertical( + &self, + url: &str, + extractor: &str, + options: &noxa_core::ExtractionOptions, +) -> Result +``` + +Add safe auto-dispatch before generic HTML extraction but after document/PDF checks when possible. If a vertical extractor needs JSON/API and does not require the fetched HTML, let it run before fetching the original page. + +- [ ] **Step 4: Verify** + +Run: `cargo test -p noxa-fetch vertical_dispatch -- --nocapture` + +Expected: PASS. 
+ +- [ ] **Step 5: Commit** + +Run: + +```bash +git add crates/noxa-fetch/src/client crates/noxa-fetch/src/extractors crates/noxa-fetch/src/error.rs +git commit -m "feat(fetch): wire vertical extractor dispatch" +``` + +## Task 9: Add CLI Exposure + +**Files:** +- Modify: `crates/noxa-cli/src/app/cli.rs` +- Modify: `crates/noxa-cli/src/app/entry.rs` +- Modify: `crates/noxa-cli/src/app/fetching/extract.rs` +- Modify: `crates/noxa-cli/src/app/batch.rs` +- Modify: `crates/noxa-cli/src/app/printing.rs` +- Modify: `crates/noxa-cli/src/app/tests_primary.rs` + +- [ ] **Step 1: Write CLI tests** + +Add parser/format tests for: + +- `noxa --list-extractors` +- `noxa --extractor github_repo https://github.com/jmagar/noxa` +- batch path passes the explicit extractor to fetch +- `--extractor` with `--file` or `--stdin` errors clearly + +- [ ] **Step 2: Run tests to verify failure** + +Run: `cargo test -p noxa-cli extractor -- --nocapture` + +Expected: failures because CLI args and list output do not exist. + +- [ ] **Step 3: Implement CLI** + +Add `extractor: Option` and `list_extractors: bool`. Route explicit extraction through `FetchClient::fetch_and_extract_vertical`. Print catalog as text by default and JSON when `--format json` is selected. + +- [ ] **Step 4: Verify** + +Run: `cargo test -p noxa-cli extractor -- --nocapture` + +Expected: PASS. + +- [ ] **Step 5: Commit** + +Run: + +```bash +git add crates/noxa-cli/src/app +git commit -m "feat(cli): expose vertical extractors" +``` + +## Task 10: Add MCP Exposure + +**Files:** +- Modify: `crates/noxa-mcp/src/tools.rs` +- Modify: `crates/noxa-mcp/src/server.rs` +- Modify: `crates/noxa-mcp/src/server/content_tools.rs` if scrape implementation lives there +- Modify: `crates/noxa-mcp/tests/startup_harness.rs` or add focused tests + +- [ ] **Step 1: Write MCP tests** + +Add tests or harness assertions for: + +- `scrape` schema includes optional `extractor`. +- `extractors` tool is listed. +- explicit extractor mismatch returns a readable tool error. + +- [ ] **Step 2: Run tests to verify failure** + +Run: `cargo test -p noxa-mcp extractor -- --nocapture` + +Expected: failures because schema/tool is missing. + +- [ ] **Step 3: Implement MCP** + +Add `extractor: Option` to `ScrapeParams`; use explicit dispatch when provided. Add `extractors` tool returning pretty JSON catalog. + +- [ ] **Step 4: Verify** + +Run: `cargo test -p noxa-mcp extractor -- --nocapture` + +Expected: PASS. + +- [ ] **Step 5: Commit** + +Run: + +```bash +git add crates/noxa-mcp/src crates/noxa-mcp/tests +git commit -m "feat(mcp): expose vertical extractors" +``` + +## Task 11: Documentation and Final Verification + +**Files:** +- Modify: `README.md` if extractor usage belongs there +- Modify: `crates/noxa-mcp/README.md` +- Modify: any CLI docs/help snapshots if present +- Modify: `docs/superpowers/plans/2026-04-26-full-upstream-extractor-parity-plan.md` if implementation discoveries require updates + +- [ ] **Step 1: Add docs** + +Document: + +- `noxa --list-extractors` +- `noxa --extractor ` +- MCP `extractors` tool +- MCP `scrape.extractor` +- Auto-dispatch vs explicit-only behavior + +- [ ] **Step 2: Run focused crate tests** + +Run: + +```bash +cargo test -p noxa-core +cargo test -p noxa-fetch +cargo test -p noxa-cli +cargo test -p noxa-mcp +``` + +Expected: all PASS. + +- [ ] **Step 3: Run workspace tests** + +Run: `cargo test --workspace` + +Expected: all PASS. Existing ignored tests may remain ignored. 
+ +- [ ] **Step 4: Run build** + +Run: `cargo build --workspace` + +Expected: PASS. + +- [ ] **Step 5: Update Beads** + +Run: + +```bash +bd close noxa-x2x --reason "Implemented full upstream vertical extractor parity" +``` + +- [ ] **Step 6: Commit docs/final fixes** + +Run: + +```bash +git add README.md crates/noxa-mcp/README.md docs/superpowers/plans/2026-04-26-full-upstream-extractor-parity-plan.md +git commit -m "docs: document vertical extractor parity" +``` + +## Review Notes + +The spec-review and plan-review subagent loops from the superpowers workflow were not run automatically because this Codex environment only permits spawning subagents when the user explicitly asks for subagent delegation. If the user asks for agent review, dispatch a plan/spec reviewer before implementation. + +## Completion Criteria + +- All 28 upstream extractor names are present in `noxa_fetch::extractors::list()`. +- Every extractor has URL matcher coverage and fixture-backed parse coverage. +- Safe auto-dispatch does not include broad Shopify/ecommerce/WooCommerce/Substack matchers. +- Explicit dispatch works for all extractors. +- Existing generic scrape behavior remains compatible. +- CLI and MCP expose catalog and explicit extractor selection. +- `cargo test --workspace` and `cargo build --workspace` pass. From 373753b2eae92f4332f88a2de7f534fabced635b Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sat, 25 Apr 2026 23:10:23 -0400 Subject: [PATCH 08/28] docs: tighten extractor parity plan --- ...4-26-full-upstream-extractor-parity-plan.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/superpowers/plans/2026-04-26-full-upstream-extractor-parity-plan.md b/docs/superpowers/plans/2026-04-26-full-upstream-extractor-parity-plan.md index a6f81fd..782c51d 100644 --- a/docs/superpowers/plans/2026-04-26-full-upstream-extractor-parity-plan.md +++ b/docs/superpowers/plans/2026-04-26-full-upstream-extractor-parity-plan.md @@ -30,7 +30,7 @@ - Modify `crates/noxa-cli/src/app/printing.rs`: print catalog and vertical summaries. - Modify `crates/noxa-mcp/src/tools.rs`: add `extractor` to `ScrapeParams`. - Modify `crates/noxa-mcp/src/server.rs` and/or `server/content_tools.rs`: add `extractors` tool and explicit scrape dispatch. -- Modify `crates/noxa-fetch/Cargo.toml`: add dependencies needed by ported extractor code, expected `regex = "1"` and possibly `reqwest` only if upstream API code cannot reuse `wreq`. +- Modify `crates/noxa-fetch/Cargo.toml`: add dependencies needed by ported extractor code, expected `async-trait = "0.1"` and `regex = "1"`, and possibly `reqwest` only if upstream API code cannot reuse `wreq`. ## Task 1: Add Vertical Output Model @@ -195,7 +195,7 @@ Use a mock `ExtractorHttp` that maps expected API URLs to fixture JSON. Assert s - [ ] **Step 3: Run tests to verify failure** -Run: `cargo test -p noxa-fetch github_repo pypi npm crates_io docker_hub -- --nocapture` +Run: `cargo test -p noxa-fetch extractors::developer -- --nocapture` Expected: failures from unimplemented extractors. @@ -205,7 +205,7 @@ Use upstream extractor files as the behavioral source, but adapt crate names and - [ ] **Step 5: Verify** -Run: `cargo test -p noxa-fetch github_ pypi npm crates_io docker_hub -- --nocapture` +Run: `cargo test -p noxa-fetch extractors::developer -- --nocapture` Expected: PASS. 
@@ -240,7 +240,7 @@ Cover canonical URL forms: - [ ] **Step 2: Run tests to verify failure** -Run: `cargo test -p noxa-fetch arxiv hackernews dev_to stackoverflow youtube_video -- --nocapture` +Run: `cargo test -p noxa-fetch extractors::community -- --nocapture` Expected: failures from unimplemented extractors. @@ -250,7 +250,7 @@ Prefer upstream API endpoints where present. Keep HTML parsing fixture-driven an - [ ] **Step 4: Verify** -Run: `cargo test -p noxa-fetch arxiv hackernews dev_to stackoverflow youtube_video -- --nocapture` +Run: `cargo test -p noxa-fetch extractors::community -- --nocapture` Expected: PASS. @@ -280,7 +280,7 @@ Assert HuggingFace model/dataset disambiguation and Instagram profile/post disam - [ ] **Step 2: Run tests to verify failure** -Run: `cargo test -p noxa-fetch huggingface instagram linkedin_post -- --nocapture` +Run: `cargo test -p noxa-fetch extractors::social -- --nocapture` Expected: failures from unimplemented extractors. @@ -290,7 +290,7 @@ Keep the existing LinkedIn generic fallback intact. `linkedin_post` should popul - [ ] **Step 4: Verify** -Run: `cargo test -p noxa-fetch huggingface instagram linkedin_post -- --nocapture` +Run: `cargo test -p noxa-fetch extractors::social -- --nocapture` Expected: PASS. @@ -363,7 +363,7 @@ Assert: - [ ] **Step 2: Run tests to verify failure** -Run: `cargo test -p noxa-fetch amazon_product ebay_listing ecommerce_product etsy_listing shopify trustpilot woocommerce -- --nocapture` +Run: `cargo test -p noxa-fetch extractors::ecommerce -- --nocapture` Expected: failures from unimplemented extractors. @@ -373,7 +373,7 @@ Preserve upstream anti-bot handling where present. Block/verification pages must - [ ] **Step 4: Verify** -Run: `cargo test -p noxa-fetch amazon_product ebay_listing ecommerce_product etsy_listing shopify trustpilot woocommerce -- --nocapture` +Run: `cargo test -p noxa-fetch extractors::ecommerce -- --nocapture` Expected: PASS. 
From e078113a1f58c51d97397660449286ce72f870f0 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sat, 25 Apr 2026 23:13:45 -0400 Subject: [PATCH 09/28] feat(core): add vertical extractor payload --- crates/noxa-cli/src/app/retrieve.rs | 1 + crates/noxa-cli/src/app/tests_primary.rs | 1 + crates/noxa-core/src/diff.rs | 1 + crates/noxa-core/src/lib.rs | 16 ++++++++++++++++ crates/noxa-core/src/llm/mod.rs | 2 ++ crates/noxa-core/src/types.rs | 9 +++++++++ crates/noxa-fetch/src/client/fetch.rs | 1 + crates/noxa-fetch/src/crawler.rs | 1 + crates/noxa-fetch/src/document.rs | 1 + crates/noxa-fetch/src/linkedin.rs | 1 + crates/noxa-fetch/src/reddit.rs | 1 + crates/noxa-mcp/src/server.rs | 1 + crates/noxa-mcp/src/server/content_tools.rs | 1 + crates/noxa-rag/src/mcp_bridge.rs | 1 + crates/noxa-rag/src/pipeline/parse/mod.rs | 1 + crates/noxa-rag/src/pipeline/parse/tests.rs | 1 + crates/noxa-store/src/content_store/tests.rs | 2 ++ 17 files changed, 42 insertions(+) diff --git a/crates/noxa-cli/src/app/retrieve.rs b/crates/noxa-cli/src/app/retrieve.rs index 23d9722..40baefe 100644 --- a/crates/noxa-cli/src/app/retrieve.rs +++ b/crates/noxa-cli/src/app/retrieve.rs @@ -570,6 +570,7 @@ mod tests { raw_html: None, }, domain_data: None, + vertical_data: None, structured_data: Vec::new(), } } diff --git a/crates/noxa-cli/src/app/tests_primary.rs b/crates/noxa-cli/src/app/tests_primary.rs index 269eba9..dd0cb2b 100644 --- a/crates/noxa-cli/src/app/tests_primary.rs +++ b/crates/noxa-cli/src/app/tests_primary.rs @@ -36,6 +36,7 @@ mod tests { raw_html: None, }, domain_data: None, + vertical_data: None, structured_data: Vec::new(), } } diff --git a/crates/noxa-core/src/diff.rs b/crates/noxa-core/src/diff.rs index d724f71..753a650 100644 --- a/crates/noxa-core/src/diff.rs +++ b/crates/noxa-core/src/diff.rs @@ -240,6 +240,7 @@ mod tests { domain_data: Some(DomainData { domain_type: DomainType::Generic, }), + vertical_data: None, structured_data: vec![], } } diff --git a/crates/noxa-core/src/lib.rs b/crates/noxa-core/src/lib.rs index 5acd2e0..447b5e9 100644 --- a/crates/noxa-core/src/lib.rs +++ b/crates/noxa-core/src/lib.rs @@ -36,6 +36,7 @@ pub use error::ExtractError; pub use llm::to_llm_text; pub use types::{ CodeBlock, Content, DomainData, ExtractionOptions, ExtractionResult, Image, Link, Metadata, + VerticalData, }; use scraper::Html; @@ -186,6 +187,7 @@ fn extract_with_options_inner( raw_html: None, }, domain_data, + vertical_data: None, structured_data, }); } @@ -284,6 +286,7 @@ fn extract_with_options_inner( metadata: meta, content, domain_data, + vertical_data: None, structured_data, }) } @@ -380,6 +383,19 @@ mod tests { assert!(!json.contains("raw_html")); } + #[test] + fn extraction_result_serializes_vertical_data_when_present() { + let mut result = extract("
<html><body><p>Test</p></body></html>
", None).unwrap(); + result.vertical_data = Some(VerticalData { + extractor: "github_repo".to_string(), + data: serde_json::json!({ "repo": "noxa" }), + }); + + let json = serde_json::to_value(&result).unwrap(); + assert_eq!(json["vertical_data"]["extractor"], "github_repo"); + assert_eq!(json["vertical_data"]["data"]["repo"], "noxa"); + } + #[test] fn youtube_extraction_produces_structured_markdown() { let html = r#" diff --git a/crates/noxa-core/src/llm/mod.rs b/crates/noxa-core/src/llm/mod.rs index ad7356c..e1a0b52 100644 --- a/crates/noxa-core/src/llm/mod.rs +++ b/crates/noxa-core/src/llm/mod.rs @@ -97,6 +97,7 @@ mod tests { raw_html: None, }, domain_data: None, + vertical_data: None, structured_data: vec![], } } @@ -405,6 +406,7 @@ mod tests { raw_html: None, }, domain_data: None, + vertical_data: None, structured_data: vec![], }; diff --git a/crates/noxa-core/src/types.rs b/crates/noxa-core/src/types.rs index fbda246..2719cae 100644 --- a/crates/noxa-core/src/types.rs +++ b/crates/noxa-core/src/types.rs @@ -9,12 +9,21 @@ pub struct ExtractionResult { pub metadata: Metadata, pub content: Content, pub domain_data: Option, + /// Site-specific structured payload returned by a vertical extractor. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub vertical_data: Option, /// JSON-LD structured data extracted from ` + + From 69fb8215866b4c7918b463f3bbfc74a27e1e2782 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sat, 25 Apr 2026 23:55:12 -0400 Subject: [PATCH 13/28] feat(fetch): port huggingface and social extractors --- .../src/extractors/huggingface_dataset.rs | 58 ++++++++- .../src/extractors/huggingface_model.rs | 61 +++++++++- .../src/extractors/instagram_post.rs | 91 +++++++++++++- .../src/extractors/instagram_profile.rs | 101 +++++++++++++++- .../src/extractors/linkedin_post.rs | 105 +++++++++++++++- crates/noxa-fetch/src/extractors/mod.rs | 113 ++++++++++++++++++ .../extractors/huggingface_dataset.json | 21 ++++ .../extractors/huggingface_model.json | 23 ++++ .../fixtures/extractors/instagram_post.html | 10 ++ .../extractors/instagram_profile.json | 45 +++++++ .../fixtures/extractors/linkedin_post.html | 12 ++ 11 files changed, 615 insertions(+), 25 deletions(-) create mode 100644 crates/noxa-fetch/tests/fixtures/extractors/huggingface_dataset.json create mode 100644 crates/noxa-fetch/tests/fixtures/extractors/huggingface_model.json create mode 100644 crates/noxa-fetch/tests/fixtures/extractors/instagram_post.html create mode 100644 crates/noxa-fetch/tests/fixtures/extractors/instagram_profile.json create mode 100644 crates/noxa-fetch/tests/fixtures/extractors/linkedin_post.html diff --git a/crates/noxa-fetch/src/extractors/huggingface_dataset.rs b/crates/noxa-fetch/src/extractors/huggingface_dataset.rs index 63d4f93..9cd7f1a 100644 --- a/crates/noxa-fetch/src/extractors/huggingface_dataset.rs +++ b/crates/noxa-fetch/src/extractors/huggingface_dataset.rs @@ -1,6 +1,6 @@ -use serde_json::Value; +use serde_json::{Value, json}; -use super::{ExtractorInfo, host_matches, http::ExtractorHttp, stub_error}; +use super::{ExtractorInfo, http::ExtractorHttp}; use crate::error::FetchError; pub const INFO: ExtractorInfo = ExtractorInfo { @@ -11,9 +11,57 @@ pub const INFO: ExtractorInfo = ExtractorInfo { }; pub fn matches(url: &str) -> bool { - host_matches(url, "huggingface.co") && url.contains("/datasets/") + parse_dataset_path(url).is_some() } -pub async fn extract(_client: &dyn ExtractorHttp, _url: &str) -> Result { - Err(stub_error(INFO.name)) +pub async fn extract(client: 
&dyn ExtractorHttp, url: &str) -> Result { + let dataset_path = parse_dataset_path(url).ok_or_else(|| { + FetchError::Build(format!("hf_dataset: cannot parse dataset path from '{url}'")) + })?; + let api_url = format!("https://huggingface.co/api/datasets/{dataset_path}"); + let dataset = client.get_json(&api_url).await?; + let siblings = dataset + .get("siblings") + .and_then(Value::as_array) + .cloned() + .unwrap_or_default(); + + Ok(json!({ + "url": url, + "id": dataset.get("id").cloned(), + "private": dataset.get("private").cloned(), + "gated": dataset.get("gated").cloned(), + "downloads": dataset.get("downloads").cloned(), + "downloads_30d": dataset.get("downloadsAllTime").cloned(), + "likes": dataset.get("likes").cloned(), + "tags": dataset.get("tags").cloned().unwrap_or_else(|| json!([])), + "license": dataset.pointer("/cardData/license").cloned(), + "language": dataset.pointer("/cardData/language").cloned(), + "task_categories": dataset.pointer("/cardData/task_categories").cloned(), + "size_categories": dataset.pointer("/cardData/size_categories").cloned(), + "annotations_creators": dataset.pointer("/cardData/annotations_creators").cloned(), + "configs": dataset.pointer("/cardData/configs").cloned(), + "created_at": dataset.get("createdAt").cloned(), + "last_modified": dataset.get("lastModified").cloned(), + "sha": dataset.get("sha").cloned(), + "file_count": siblings.len(), + "files": siblings, + })) +} + +fn parse_dataset_path(url: &str) -> Option { + let parsed = url::Url::parse(url).ok()?; + let host = parsed.host_str()?; + if host != "huggingface.co" && host != "www.huggingface.co" { + return None; + } + let segs: Vec<_> = parsed.path_segments()?.filter(|s| !s.is_empty()).collect(); + if segs.first() != Some(&"datasets") || !(segs.len() == 2 || segs.len() == 3) { + return None; + } + match segs.as_slice() { + ["datasets", name] => Some((*name).to_string()), + ["datasets", owner, name] => Some(format!("{owner}/{name}")), + _ => None, + } } diff --git a/crates/noxa-fetch/src/extractors/huggingface_model.rs b/crates/noxa-fetch/src/extractors/huggingface_model.rs index 6a38eeb..d490441 100644 --- a/crates/noxa-fetch/src/extractors/huggingface_model.rs +++ b/crates/noxa-fetch/src/extractors/huggingface_model.rs @@ -1,6 +1,6 @@ -use serde_json::Value; +use serde_json::{Value, json}; -use super::{ExtractorInfo, host_matches, http::ExtractorHttp, stub_error}; +use super::{ExtractorInfo, http::ExtractorHttp}; use crate::error::FetchError; pub const INFO: ExtractorInfo = ExtractorInfo { @@ -11,9 +11,60 @@ pub const INFO: ExtractorInfo = ExtractorInfo { }; pub fn matches(url: &str) -> bool { - host_matches(url, "huggingface.co") && !url.contains("/datasets/") + parse_owner_name(url).is_some() } -pub async fn extract(_client: &dyn ExtractorHttp, _url: &str) -> Result { - Err(stub_error(INFO.name)) +pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { + let (owner, name) = parse_owner_name(url).ok_or_else(|| { + FetchError::Build(format!("hf model: cannot parse owner/name from '{url}'")) + })?; + let api_url = format!("https://huggingface.co/api/models/{owner}/{name}"); + let model = client.get_json(&api_url).await?; + let siblings = model + .get("siblings") + .and_then(Value::as_array) + .cloned() + .unwrap_or_default(); + + Ok(json!({ + "url": url, + "id": model.get("id").cloned(), + "model_id": model.get("modelId").cloned(), + "private": model.get("private").cloned(), + "gated": model.get("gated").cloned(), + "downloads": model.get("downloads").cloned(), + 
"downloads_30d": model.get("downloadsAllTime").cloned(), + "likes": model.get("likes").cloned(), + "library_name": model.get("library_name").cloned(), + "pipeline_tag": model.get("pipeline_tag").cloned(), + "tags": model.get("tags").cloned().unwrap_or_else(|| json!([])), + "license": model.pointer("/cardData/license").cloned(), + "language": model.pointer("/cardData/language").cloned(), + "datasets": model.pointer("/cardData/datasets").cloned(), + "base_model": model.pointer("/cardData/base_model").cloned(), + "model_type": model.pointer("/cardData/model_type").cloned(), + "created_at": model.get("createdAt").cloned(), + "last_modified": model.get("lastModified").cloned(), + "sha": model.get("sha").cloned(), + "file_count": siblings.len(), + "files": siblings, + })) +} + +fn parse_owner_name(url: &str) -> Option<(String, String)> { + let parsed = url::Url::parse(url).ok()?; + let host = parsed.host_str()?; + if host != "huggingface.co" && host != "www.huggingface.co" { + return None; + } + let segs: Vec<_> = parsed.path_segments()?.filter(|s| !s.is_empty()).collect(); + if segs.len() != 2 || RESERVED_NAMESPACES.contains(&segs[0]) { + return None; + } + Some((segs[0].to_string(), segs[1].to_string())) } + +const RESERVED_NAMESPACES: &[&str] = &[ + "datasets", "spaces", "blog", "docs", "api", "models", "papers", "pricing", "tasks", + "join", "login", "settings", "organizations", "new", "search", +]; diff --git a/crates/noxa-fetch/src/extractors/instagram_post.rs b/crates/noxa-fetch/src/extractors/instagram_post.rs index d314f6a..3c8f37a 100644 --- a/crates/noxa-fetch/src/extractors/instagram_post.rs +++ b/crates/noxa-fetch/src/extractors/instagram_post.rs @@ -1,6 +1,7 @@ -use serde_json::Value; +use regex::Regex; +use serde_json::{Value, json}; -use super::{ExtractorInfo, host_matches, http::ExtractorHttp, stub_error}; +use super::{ExtractorInfo, http::ExtractorHttp}; use crate::error::FetchError; pub const INFO: ExtractorInfo = ExtractorInfo { @@ -11,9 +12,89 @@ pub const INFO: ExtractorInfo = ExtractorInfo { }; pub fn matches(url: &str) -> bool { - host_matches(url, "instagram.com") && (url.contains("/p/") || url.contains("/reel/")) + parse_shortcode(url).is_some() } -pub async fn extract(_client: &dyn ExtractorHttp, _url: &str) -> Result { - Err(stub_error(INFO.name)) +pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { + let (kind, shortcode) = parse_shortcode(url).ok_or_else(|| { + FetchError::Build(format!("instagram_post: cannot parse shortcode from '{url}'")) + })?; + let embed_url = format!("https://www.instagram.com/p/{shortcode}/embed/captioned/"); + let html = client.get_text(&embed_url).await?; + + Ok(json!({ + "url": url, + "embed_url": embed_url, + "shortcode": shortcode, + "kind": kind, + "data_completeness": "embed", + "author_username": parse_username(&html), + "caption": parse_caption(&html), + "thumbnail_url": parse_thumbnail(&html), + "canonical_url": format!("https://www.instagram.com/{}/{shortcode}/", path_segment_for(kind)), + })) +} + +fn parse_shortcode(url: &str) -> Option<(&'static str, String)> { + let parsed = url::Url::parse(url).ok()?; + let host = parsed.host_str()?; + if host != "www.instagram.com" && host != "instagram.com" { + return None; + } + let segs: Vec<_> = parsed.path_segments()?.filter(|s| !s.is_empty()).collect(); + let kind = match segs.first().copied()? 
{ + "p" => "post", + "reel" | "reels" => "reel", + "tv" => "tv", + _ => return None, + }; + Some((kind, segs.get(1)?.to_string())) +} + +fn path_segment_for(kind: &str) -> &'static str { + match kind { + "reel" => "reel", + "tv" => "tv", + _ => "p", + } +} + +fn parse_username(html: &str) -> Option { + let re = Regex::new(r#"(?s)class="CaptionUsername"[^>]*>([^<]+)<"#).ok()?; + re.captures(html) + .and_then(|captures| captures.get(1)) + .map(|value| html_decode(value.as_str().trim())) +} + +fn parse_caption(html: &str) -> Option { + let outer = Regex::new(r#"(?s)]*>(.*?)"#).ok()?; + let block = outer.captures(html)?.get(1)?.as_str(); + let user_re = + Regex::new(r#"(?s)]*class="CaptionUsername"[^>]*>.*?"#).ok()?; + let stripped = user_re.replace_all(block, ""); + let tag_re = Regex::new(r"<[^>]+>").ok()?; + let text = tag_re.replace_all(&stripped, " "); + let decoded = html_decode(text.trim()); + let cleaned = decoded.split_whitespace().collect::>().join(" "); + (!cleaned.is_empty()).then_some(cleaned) +} + +fn parse_thumbnail(html: &str) -> Option { + let img_re = Regex::new( + r#"(?s)]+class="[^"]*EmbeddedMediaImage[^"]*"[^>]+src="([^"]+)""#, + ) + .ok()?; + img_re + .captures(html) + .and_then(|captures| captures.get(1)) + .map(|value| html_decode(value.as_str())) +} + +fn html_decode(value: &str) -> String { + value + .replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace(""", "\"") + .replace("'", "'") } diff --git a/crates/noxa-fetch/src/extractors/instagram_profile.rs b/crates/noxa-fetch/src/extractors/instagram_profile.rs index d6b5d2e..1b688ea 100644 --- a/crates/noxa-fetch/src/extractors/instagram_profile.rs +++ b/crates/noxa-fetch/src/extractors/instagram_profile.rs @@ -1,6 +1,6 @@ -use serde_json::Value; +use serde_json::{Value, json}; -use super::{ExtractorInfo, host_matches, http::ExtractorHttp, stub_error}; +use super::{ExtractorInfo, http::ExtractorHttp}; use crate::error::FetchError; pub const INFO: ExtractorInfo = ExtractorInfo { @@ -11,9 +11,100 @@ pub const INFO: ExtractorInfo = ExtractorInfo { }; pub fn matches(url: &str) -> bool { - host_matches(url, "instagram.com") && !url.contains("/p/") && !url.contains("/reel/") + parse_username(url).is_some() } -pub async fn extract(_client: &dyn ExtractorHttp, _url: &str) -> Result { - Err(stub_error(INFO.name)) +pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { + let username = parse_username(url).ok_or_else(|| { + FetchError::Build(format!("instagram_profile: cannot parse username from '{url}'")) + })?; + let api_url = + format!("https://www.instagram.com/api/v1/users/web_profile_info/?username={username}"); + let body = client.get_json(&api_url).await?; + let user = body + .pointer("/data/user") + .ok_or_else(|| FetchError::BodyDecode("instagram profile missing data.user".into()))?; + let recent_posts: Vec<_> = user + .pointer("/edge_owner_to_timeline_media/edges") + .and_then(Value::as_array) + .into_iter() + .flatten() + .filter_map(|edge| edge.get("node")) + .map(post_summary) + .collect(); + + Ok(json!({ + "url": url, + "canonical_url": format!("https://www.instagram.com/{username}/"), + "username": user.get("username").cloned().unwrap_or_else(|| json!(username)), + "data_completeness": "api", + "user_id": user.get("id").cloned(), + "full_name": user.get("full_name").cloned(), + "biography": user.get("biography").cloned(), + "biography_links": user.get("bio_links").cloned(), + "external_url": user.get("external_url").cloned(), + "category": user.get("category_name").cloned(), + 
"follower_count": user.pointer("/edge_followed_by/count").cloned(), + "following_count": user.pointer("/edge_follow/count").cloned(), + "post_count": user.pointer("/edge_owner_to_timeline_media/count").cloned(), + "is_verified": user.get("is_verified").cloned(), + "is_private": user.get("is_private").cloned(), + "is_business": user.get("is_business_account").cloned(), + "is_professional": user.get("is_professional_account").cloned(), + "profile_pic_url": user.get("profile_pic_url_hd").or_else(|| user.get("profile_pic_url")).cloned(), + "recent_posts": recent_posts, + })) +} + +fn parse_username(url: &str) -> Option { + let parsed = url::Url::parse(url).ok()?; + let host = parsed.host_str()?; + if host != "www.instagram.com" && host != "instagram.com" { + return None; + } + let segs: Vec<_> = parsed.path_segments()?.filter(|s| !s.is_empty()).collect(); + if segs.len() != 1 || RESERVED.contains(&segs[0]) { + return None; + } + Some(segs[0].to_string()) +} + +const RESERVED: &[&str] = &[ + "p", "reel", "reels", "tv", "explore", "stories", "directory", "accounts", "about", + "developer", "press", "api", "ads", "blog", "fragments", "terms", "privacy", "session", + "login", "signup", +]; + +fn post_summary(node: &Value) -> Value { + let shortcode = node.get("shortcode").and_then(Value::as_str).unwrap_or(""); + let kind = classify(node); + let path = if kind == "reel" { "reel" } else { "p" }; + json!({ + "shortcode": node.get("shortcode").cloned(), + "url": format!("https://www.instagram.com/{path}/{shortcode}/"), + "kind": kind, + "is_video": node.get("is_video").cloned(), + "video_views": node.get("video_view_count").cloned(), + "thumbnail_url": node.get("thumbnail_src").or_else(|| node.get("display_url")).cloned(), + "display_url": node.get("display_url").cloned(), + "like_count": node.pointer("/edge_media_preview_like/count").cloned(), + "comment_count": node.pointer("/edge_media_to_comment/count").cloned(), + "taken_at": node.get("taken_at_timestamp").cloned(), + "caption": node.pointer("/edge_media_to_caption/edges/0/node/text").cloned(), + "alt_text": node.get("accessibility_caption").cloned(), + "dimensions": node.get("dimensions").cloned(), + "product_type": node.get("product_type").cloned(), + }) +} + +fn classify(node: &Value) -> &'static str { + if node.get("product_type").and_then(Value::as_str) == Some("clips") { + return "reel"; + } + match node.get("__typename").and_then(Value::as_str) { + Some("GraphSidecar") => "carousel", + Some("GraphVideo") => "video", + Some("GraphImage") => "photo", + _ => "post", + } } diff --git a/crates/noxa-fetch/src/extractors/linkedin_post.rs b/crates/noxa-fetch/src/extractors/linkedin_post.rs index dc6ee09..fd730a5 100644 --- a/crates/noxa-fetch/src/extractors/linkedin_post.rs +++ b/crates/noxa-fetch/src/extractors/linkedin_post.rs @@ -1,6 +1,7 @@ -use serde_json::Value; +use regex::Regex; +use serde_json::{Value, json}; -use super::{ExtractorInfo, host_matches, http::ExtractorHttp, stub_error}; +use super::{ExtractorInfo, http::ExtractorHttp}; use crate::error::FetchError; pub const INFO: ExtractorInfo = ExtractorInfo { @@ -11,9 +12,103 @@ pub const INFO: ExtractorInfo = ExtractorInfo { }; pub fn matches(url: &str) -> bool { - host_matches(url, "linkedin.com") && (url.contains("/posts/") || url.contains("/feed/update/")) + extract_urn(url).is_some() } -pub async fn extract(_client: &dyn ExtractorHttp, _url: &str) -> Result { - Err(stub_error(INFO.name)) +pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { + let urn = 
extract_urn(url).ok_or_else(|| { + FetchError::Build(format!("linkedin_post: cannot extract URN from '{url}'")) + })?; + let embed_url = format!("https://www.linkedin.com/embed/feed/update/{urn}"); + let html = client.get_text(&embed_url).await?; + let og = parse_og_tags(&html); + + Ok(json!({ + "url": url, + "embed_url": embed_url, + "urn": urn, + "canonical_url": og.get("url").cloned().unwrap_or_else(|| json!(url)), + "data_completeness": "embed", + "title": og.get("title").cloned(), + "body": parse_post_body(&html), + "author_name": parse_author(&html), + "image_url": og.get("image").cloned(), + "site_name": og.get("site_name").cloned().unwrap_or_else(|| json!("LinkedIn")), + })) +} + +fn extract_urn(url: &str) -> Option { + let parsed = url::Url::parse(url).ok()?; + let host = parsed.host_str()?; + if host != "www.linkedin.com" && host != "linkedin.com" { + return None; + } + if let Some(index) = url.find("urn:li:") { + let tail = &url[index..]; + let end = tail.find(['/', '?', '#']).unwrap_or(tail.len()); + let urn = &tail[..end]; + let mut parts = urn.split(':'); + if parts.next() == Some("urn") + && parts.next() == Some("li") + && parts.next().is_some() + && parts.next().is_some_and(|part| part.chars().all(|c| c.is_ascii_digit())) + { + return Some(urn.to_string()); + } + } + let re = Regex::new(r"/posts/[^/]*?-(\d{15,})-[A-Za-z0-9]{2,}/?").ok()?; + re.captures(url) + .and_then(|captures| captures.get(1)) + .map(|id| format!("urn:li:activity:{}", id.as_str())) +} + +fn parse_og_tags(html: &str) -> serde_json::Map { + let mut out = serde_json::Map::new(); + let Ok(re) = Regex::new(r#"(?i)]+property="og:([a-z_]+)"[^>]+content="([^"]+)""#) + else { + return out; + }; + for captures in re.captures_iter(html) { + if let (Some(key), Some(value)) = (captures.get(1), captures.get(2)) { + out.entry(key.as_str().to_lowercase()) + .or_insert_with(|| json!(html_decode(value.as_str()))); + } + } + out +} + +fn parse_post_body(html: &str) -> Option { + let re = Regex::new( + r#"(?s)]+class="[^"]*attributed-text-segment-list__content[^"]*"[^>]*>(.*?)

"#, + ) + .ok()?; + let inner = re.captures(html)?.get(1)?.as_str(); + Some(strip_tags(inner).trim().to_string()) +} + +fn parse_author(html: &str) -> Option { + let re = Regex::new(r"([^<]+)").ok()?; + let title = re.captures(html)?.get(1)?.as_str(); + title + .rsplit_once('|') + .map(|(_, name)| html_decode(name.trim())) +} + +fn strip_tags(html: &str) -> String { + let Ok(re) = Regex::new(r"<[^>]+>") else { + return html_decode(html); + }; + html_decode(&re.replace_all(html, " ")) + .split_whitespace() + .collect::>() + .join(" ") +} + +fn html_decode(value: &str) -> String { + value + .replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace(""", "\"") + .replace("'", "'") } diff --git a/crates/noxa-fetch/src/extractors/mod.rs b/crates/noxa-fetch/src/extractors/mod.rs index 21bdf37..d19a150 100644 --- a/crates/noxa-fetch/src/extractors/mod.rs +++ b/crates/noxa-fetch/src/extractors/mod.rs @@ -709,4 +709,117 @@ mod tests { assert_eq!(video["view_count"], 1000); } } + + pub mod social { + use std::collections::BTreeMap; + + use async_trait::async_trait; + + use super::*; + + struct FixtureHttp { + bodies: BTreeMap<&'static str, &'static str>, + } + + impl FixtureHttp { + fn new(entries: &[(&'static str, &'static str)]) -> Self { + Self { + bodies: entries.iter().copied().collect(), + } + } + } + + #[async_trait] + impl http::ExtractorHttp for FixtureHttp { + async fn get_text(&self, url: &str) -> Result { + self.bodies + .get(url) + .map(|body| (*body).to_string()) + .ok_or_else(|| FetchError::Build(format!("missing fixture for {url}"))) + } + + async fn get_json(&self, url: &str) -> Result { + let body = self.get_text(url).await?; + serde_json::from_str(&body).map_err(|error| FetchError::BodyDecode(error.to_string())) + } + } + + #[test] + fn social_matchers_disambiguate_urls() { + assert!(huggingface_model::matches("https://huggingface.co/openai/whisper-large-v3")); + assert!(!huggingface_model::matches("https://huggingface.co/datasets/openai/gsm8k")); + assert!(huggingface_dataset::matches("https://huggingface.co/datasets/openai/gsm8k")); + assert!(instagram_post::matches("https://www.instagram.com/p/ABC123/")); + assert!(instagram_post::matches("https://www.instagram.com/reel/ABC123/")); + assert!(!instagram_profile::matches("https://www.instagram.com/p/ABC123/")); + assert!(instagram_profile::matches("https://www.instagram.com/jmagar/")); + assert!(linkedin_post::matches( + "https://www.linkedin.com/feed/update/urn:li:activity:7452618583290892288" + )); + } + + #[tokio::test] + async fn social_extractors_parse_fixture_payloads() { + let client = FixtureHttp::new(&[ + ( + "https://huggingface.co/api/models/openai/whisper-large-v3", + include_str!("../../tests/fixtures/extractors/huggingface_model.json"), + ), + ( + "https://huggingface.co/api/datasets/openai/gsm8k", + include_str!("../../tests/fixtures/extractors/huggingface_dataset.json"), + ), + ( + "https://www.instagram.com/p/ABC123/embed/captioned/", + include_str!("../../tests/fixtures/extractors/instagram_post.html"), + ), + ( + "https://www.instagram.com/api/v1/users/web_profile_info/?username=jmagar", + include_str!("../../tests/fixtures/extractors/instagram_profile.json"), + ), + ( + "https://www.linkedin.com/embed/feed/update/urn:li:activity:7452618583290892288", + include_str!("../../tests/fixtures/extractors/linkedin_post.html"), + ), + ]); + + let model = + huggingface_model::extract(&client, "https://huggingface.co/openai/whisper-large-v3") + .await + .unwrap(); + assert_eq!(model["model_id"], 
"openai/whisper-large-v3"); + assert_eq!(model["file_count"], 1); + + let dataset = huggingface_dataset::extract( + &client, + "https://huggingface.co/datasets/openai/gsm8k", + ) + .await + .unwrap(); + assert_eq!(dataset["id"], "openai/gsm8k"); + assert_eq!(dataset["downloads"], 200); + + let post = instagram_post::extract(&client, "https://www.instagram.com/p/ABC123/") + .await + .unwrap(); + assert_eq!(post["shortcode"], "ABC123"); + assert_eq!(post["author_username"], "jmagar"); + + let profile = + instagram_profile::extract(&client, "https://www.instagram.com/jmagar/") + .await + .unwrap(); + assert_eq!(profile["username"], "jmagar"); + assert_eq!(profile["recent_posts"][0]["shortcode"], "ABC123"); + + let linked = linkedin_post::extract( + &client, + "https://www.linkedin.com/feed/update/urn:li:activity:7452618583290892288", + ) + .await + .unwrap(); + assert_eq!(linked["urn"], "urn:li:activity:7452618583290892288"); + assert_eq!(linked["author_name"], "Jacob Magar"); + } + } } diff --git a/crates/noxa-fetch/tests/fixtures/extractors/huggingface_dataset.json b/crates/noxa-fetch/tests/fixtures/extractors/huggingface_dataset.json new file mode 100644 index 0000000..a58cacb --- /dev/null +++ b/crates/noxa-fetch/tests/fixtures/extractors/huggingface_dataset.json @@ -0,0 +1,21 @@ +{ + "id": "openai/gsm8k", + "private": false, + "gated": false, + "downloads": 200, + "downloadsAllTime": 2000, + "likes": 20, + "tags": ["math"], + "createdAt": "2026-01-01T00:00:00Z", + "lastModified": "2026-01-02T00:00:00Z", + "sha": "def456", + "cardData": { + "license": "mit", + "language": ["en"], + "task_categories": ["question-answering"], + "size_categories": ["1K + + +
+jmagar
+Porting extractors & testing fixtures
+ + + diff --git a/crates/noxa-fetch/tests/fixtures/extractors/instagram_profile.json b/crates/noxa-fetch/tests/fixtures/extractors/instagram_profile.json new file mode 100644 index 0000000..d314f67 --- /dev/null +++ b/crates/noxa-fetch/tests/fixtures/extractors/instagram_profile.json @@ -0,0 +1,45 @@ +{ + "data": { + "user": { + "id": "123", + "username": "jmagar", + "full_name": "Jacob Magar", + "biography": "Building Noxa", + "bio_links": [], + "external_url": "https://example.com", + "category_name": "Software", + "profile_pic_url": "https://example.com/pic.jpg", + "profile_pic_url_hd": "https://example.com/pic-hd.jpg", + "is_verified": false, + "is_private": false, + "is_business_account": false, + "is_professional_account": true, + "edge_followed_by": { "count": 100 }, + "edge_follow": { "count": 50 }, + "edge_owner_to_timeline_media": { + "count": 1, + "edges": [ + { + "node": { + "__typename": "GraphImage", + "shortcode": "ABC123", + "is_video": false, + "video_view_count": null, + "display_url": "https://example.com/display.jpg", + "thumbnail_src": "https://example.com/thumb.jpg", + "accessibility_caption": "alt text", + "taken_at_timestamp": 1767225600, + "product_type": "feed", + "dimensions": { "width": 1080, "height": 1080 }, + "edge_media_preview_like": { "count": 10 }, + "edge_media_to_comment": { "count": 2 }, + "edge_media_to_caption": { + "edges": [{ "node": { "text": "Fixture caption" } }] + } + } + } + ] + } + } + } +} diff --git a/crates/noxa-fetch/tests/fixtures/extractors/linkedin_post.html b/crates/noxa-fetch/tests/fixtures/extractors/linkedin_post.html new file mode 100644 index 0000000..5c7bc4d --- /dev/null +++ b/crates/noxa-fetch/tests/fixtures/extractors/linkedin_post.html @@ -0,0 +1,12 @@ + + + Porting extractors | Jacob Magar + + + + + + +
+Shipping extractors today
+ + From a93ed15a96916da6ecb83824bccb3e9cc039772e Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sat, 25 Apr 2026 23:56:40 -0400 Subject: [PATCH 14/28] feat(fetch): expose reddit vertical extractor --- crates/noxa-fetch/src/extractors/mod.rs | 75 +++++++++++++++++++ crates/noxa-fetch/src/extractors/reddit.rs | 11 ++- .../tests/fixtures/extractors/reddit.json | 33 ++++++++ 3 files changed, 116 insertions(+), 3 deletions(-) create mode 100644 crates/noxa-fetch/tests/fixtures/extractors/reddit.json diff --git a/crates/noxa-fetch/src/extractors/mod.rs b/crates/noxa-fetch/src/extractors/mod.rs index d19a150..c3ca280 100644 --- a/crates/noxa-fetch/src/extractors/mod.rs +++ b/crates/noxa-fetch/src/extractors/mod.rs @@ -822,4 +822,79 @@ mod tests { assert_eq!(linked["author_name"], "Jacob Magar"); } } + + pub mod reddit_vertical { + use std::collections::BTreeMap; + + use async_trait::async_trait; + + use super::*; + + struct FixtureHttp { + bodies: BTreeMap<&'static str, &'static str>, + } + + #[async_trait] + impl http::ExtractorHttp for FixtureHttp { + async fn get_text(&self, url: &str) -> Result { + self.bodies + .get(url) + .map(|body| (*body).to_string()) + .ok_or_else(|| FetchError::Build(format!("missing fixture for {url}"))) + } + + async fn get_json(&self, url: &str) -> Result { + let body = self.get_text(url).await?; + serde_json::from_str(&body).map_err(|error| FetchError::BodyDecode(error.to_string())) + } + } + + #[tokio::test] + async fn reddit_vertical_uses_hardened_json_parser() { + let client = FixtureHttp { + bodies: [( + "https://www.reddit.com/r/rust/comments/abc123/release_thread.json", + include_str!("../../tests/fixtures/extractors/reddit.json"), + )] + .into_iter() + .collect(), + }; + + let value = reddit::extract( + &client, + "https://www.reddit.com/r/rust/comments/abc123/release_thread/", + ) + .await + .unwrap(); + + assert_eq!(value["metadata"]["title"], "Rust release thread"); + assert!( + value["content"]["plain_text"] + .as_str() + .unwrap() + .contains("Thanks for the update!") + ); + } + + #[tokio::test] + async fn reddit_vertical_rejects_verify_wall_html() { + let client = FixtureHttp { + bodies: [( + "https://www.reddit.com/r/rust/comments/abc123/release_thread.json", + "Whoa there, verify you are human", + )] + .into_iter() + .collect(), + }; + + let err = reddit::extract( + &client, + "https://www.reddit.com/r/rust/comments/abc123/release_thread/", + ) + .await + .expect_err("verify wall must not parse as reddit JSON"); + + assert!(err.to_string().contains("verification")); + } + } } diff --git a/crates/noxa-fetch/src/extractors/reddit.rs b/crates/noxa-fetch/src/extractors/reddit.rs index 644883d..d6bc5f9 100644 --- a/crates/noxa-fetch/src/extractors/reddit.rs +++ b/crates/noxa-fetch/src/extractors/reddit.rs @@ -1,6 +1,6 @@ use serde_json::Value; -use super::{ExtractorInfo, host_matches, http::ExtractorHttp, stub_error}; +use super::{ExtractorInfo, host_matches, http::ExtractorHttp}; use crate::error::FetchError; pub const INFO: ExtractorInfo = ExtractorInfo { @@ -14,6 +14,11 @@ pub fn matches(url: &str) -> bool { host_matches(url, "reddit.com") && url.contains("/comments/") } -pub async fn extract(_client: &dyn ExtractorHttp, _url: &str) -> Result { - Err(stub_error(INFO.name)) +pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { + let json_url = crate::reddit::json_url(url); + let body = client.get_text(&json_url).await?; + let extraction = crate::reddit::parse_reddit_json(body.as_bytes(), url) + .map_err(FetchError::BodyDecode)?; + 
+ serde_json::to_value(extraction).map_err(|error| FetchError::BodyDecode(error.to_string())) } diff --git a/crates/noxa-fetch/tests/fixtures/extractors/reddit.json b/crates/noxa-fetch/tests/fixtures/extractors/reddit.json new file mode 100644 index 0000000..fcfd3c9 --- /dev/null +++ b/crates/noxa-fetch/tests/fixtures/extractors/reddit.json @@ -0,0 +1,33 @@ +[ + { + "data": { + "children": [ + { + "kind": "t3", + "data": { + "title": "Rust release thread", + "selftext": "Rust 1.x is out now.", + "subreddit_name_prefixed": "r/rust", + "url_overridden_by_dest": "https://example.com/release", + "author": "ferris" + } + } + ] + } + }, + { + "data": { + "children": [ + { + "kind": "t1", + "data": { + "author": "reader1", + "body": "Thanks for the update!", + "score": 42, + "replies": "" + } + } + ] + } + } +] From 169d86703ea187fb67fe713a989a4ad27b7783f2 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sat, 25 Apr 2026 23:59:52 -0400 Subject: [PATCH 15/28] feat(fetch): port ecommerce vertical extractors --- .../src/extractors/amazon_product.rs | 7 +- .../noxa-fetch/src/extractors/ebay_listing.rs | 7 +- .../src/extractors/ecommerce_product.rs | 7 +- .../noxa-fetch/src/extractors/etsy_listing.rs | 7 +- crates/noxa-fetch/src/extractors/mod.rs | 147 ++++++++++++++++++ crates/noxa-fetch/src/extractors/product.rs | 133 ++++++++++++++++ .../src/extractors/shopify_collection.rs | 14 +- .../src/extractors/shopify_product.rs | 22 ++- .../src/extractors/trustpilot_reviews.rs | 7 +- .../src/extractors/woocommerce_product.rs | 7 +- .../fixtures/extractors/product_page.html | 32 ++++ .../extractors/shopify_collection.json | 14 ++ .../fixtures/extractors/shopify_product.json | 19 +++ .../tests/fixtures/extractors/trustpilot.html | 26 ++++ 14 files changed, 423 insertions(+), 26 deletions(-) create mode 100644 crates/noxa-fetch/src/extractors/product.rs create mode 100644 crates/noxa-fetch/tests/fixtures/extractors/product_page.html create mode 100644 crates/noxa-fetch/tests/fixtures/extractors/shopify_collection.json create mode 100644 crates/noxa-fetch/tests/fixtures/extractors/shopify_product.json create mode 100644 crates/noxa-fetch/tests/fixtures/extractors/trustpilot.html diff --git a/crates/noxa-fetch/src/extractors/amazon_product.rs b/crates/noxa-fetch/src/extractors/amazon_product.rs index a12e49c..52dc22c 100644 --- a/crates/noxa-fetch/src/extractors/amazon_product.rs +++ b/crates/noxa-fetch/src/extractors/amazon_product.rs @@ -1,6 +1,6 @@ use serde_json::Value; -use super::{ExtractorInfo, host_matches, http::ExtractorHttp, stub_error}; +use super::{ExtractorInfo, host_matches, http::ExtractorHttp, product}; use crate::error::FetchError; pub const INFO: ExtractorInfo = ExtractorInfo { @@ -14,6 +14,7 @@ pub fn matches(url: &str) -> bool { host_matches(url, "amazon.com") && (url.contains("/dp/") || url.contains("/gp/product/")) } -pub async fn extract(_client: &dyn ExtractorHttp, _url: &str) -> Result { - Err(stub_error(INFO.name)) +pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { + let html = client.get_text(url).await?; + Ok(product::parse_product_page(url, &html, INFO.name)) } diff --git a/crates/noxa-fetch/src/extractors/ebay_listing.rs b/crates/noxa-fetch/src/extractors/ebay_listing.rs index b72cd8f..1a04e5b 100644 --- a/crates/noxa-fetch/src/extractors/ebay_listing.rs +++ b/crates/noxa-fetch/src/extractors/ebay_listing.rs @@ -1,6 +1,6 @@ use serde_json::Value; -use super::{ExtractorInfo, host_matches, http::ExtractorHttp, stub_error}; +use super::{ExtractorInfo, 
host_matches, http::ExtractorHttp, product}; use crate::error::FetchError; pub const INFO: ExtractorInfo = ExtractorInfo { @@ -14,6 +14,7 @@ pub fn matches(url: &str) -> bool { host_matches(url, "ebay.com") && url.contains("/itm/") } -pub async fn extract(_client: &dyn ExtractorHttp, _url: &str) -> Result { - Err(stub_error(INFO.name)) +pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { + let html = client.get_text(url).await?; + Ok(product::parse_product_page(url, &html, INFO.name)) } diff --git a/crates/noxa-fetch/src/extractors/ecommerce_product.rs b/crates/noxa-fetch/src/extractors/ecommerce_product.rs index f577471..38a2576 100644 --- a/crates/noxa-fetch/src/extractors/ecommerce_product.rs +++ b/crates/noxa-fetch/src/extractors/ecommerce_product.rs @@ -1,6 +1,6 @@ use serde_json::Value; -use super::{ExtractorInfo, http::ExtractorHttp, stub_error}; +use super::{ExtractorInfo, http::ExtractorHttp, product}; use crate::error::FetchError; pub const INFO: ExtractorInfo = ExtractorInfo { @@ -14,6 +14,7 @@ pub fn matches(url: &str) -> bool { url.contains("/product/") || url.contains("/products/") } -pub async fn extract(_client: &dyn ExtractorHttp, _url: &str) -> Result { - Err(stub_error(INFO.name)) +pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { + let html = client.get_text(url).await?; + Ok(product::parse_product_page(url, &html, INFO.name)) } diff --git a/crates/noxa-fetch/src/extractors/etsy_listing.rs b/crates/noxa-fetch/src/extractors/etsy_listing.rs index e411143..3de280d 100644 --- a/crates/noxa-fetch/src/extractors/etsy_listing.rs +++ b/crates/noxa-fetch/src/extractors/etsy_listing.rs @@ -1,6 +1,6 @@ use serde_json::Value; -use super::{ExtractorInfo, host_matches, http::ExtractorHttp, stub_error}; +use super::{ExtractorInfo, host_matches, http::ExtractorHttp, product}; use crate::error::FetchError; pub const INFO: ExtractorInfo = ExtractorInfo { @@ -14,6 +14,7 @@ pub fn matches(url: &str) -> bool { host_matches(url, "etsy.com") && url.contains("/listing/") } -pub async fn extract(_client: &dyn ExtractorHttp, _url: &str) -> Result { - Err(stub_error(INFO.name)) +pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { + let html = client.get_text(url).await?; + Ok(product::parse_product_page(url, &html, INFO.name)) } diff --git a/crates/noxa-fetch/src/extractors/mod.rs b/crates/noxa-fetch/src/extractors/mod.rs index c3ca280..4acb257 100644 --- a/crates/noxa-fetch/src/extractors/mod.rs +++ b/crates/noxa-fetch/src/extractors/mod.rs @@ -21,6 +21,7 @@ pub mod instagram_profile; pub mod linkedin_post; pub mod npm; pub mod pypi; +mod product; pub mod reddit; pub mod shopify_collection; pub mod shopify_product; @@ -897,4 +898,150 @@ mod tests { assert!(err.to_string().contains("verification")); } } + + pub mod ecommerce { + use std::collections::BTreeMap; + + use async_trait::async_trait; + + use super::*; + + struct FixtureHttp { + bodies: BTreeMap<&'static str, &'static str>, + } + + impl FixtureHttp { + fn new(entries: &[(&'static str, &'static str)]) -> Self { + Self { + bodies: entries.iter().copied().collect(), + } + } + } + + #[async_trait] + impl http::ExtractorHttp for FixtureHttp { + async fn get_text(&self, url: &str) -> Result { + self.bodies + .get(url) + .map(|body| (*body).to_string()) + .ok_or_else(|| FetchError::Build(format!("missing fixture for {url}"))) + } + + async fn get_json(&self, url: &str) -> Result { + let body = self.get_text(url).await?; + serde_json::from_str(&body).map_err(|error| 
FetchError::BodyDecode(error.to_string())) + } + } + + #[tokio::test] + async fn ecommerce_matchers_cover_auto_and_explicit_only_groups() { + assert!(amazon_product::matches("https://www.amazon.com/dp/B000123")); + assert!(ebay_listing::matches("https://www.ebay.com/itm/123456")); + assert!(etsy_listing::matches("https://www.etsy.com/listing/123456/test")); + assert!(trustpilot_reviews::matches("https://www.trustpilot.com/review/example.com")); + + assert!(shopify_product::matches("https://shop.example/products/widget")); + assert!(shopify_collection::matches("https://shop.example/collections/frontpage")); + assert!(ecommerce_product::matches("https://shop.example/products/widget")); + assert!(woocommerce_product::matches("https://store.example/product/widget")); + + assert!( + dispatch_by_url(&FixtureHttp::new(&[]), "https://shop.example/products/widget") + .await + .is_none() + ); + assert!( + dispatch_by_url(&FixtureHttp::new(&[]), "https://store.example/product/widget") + .await + .is_none() + ); + } + + #[tokio::test] + async fn ecommerce_extractors_parse_fixture_payloads() { + let client = FixtureHttp::new(&[ + ( + "https://www.amazon.com/dp/B000123", + include_str!("../../tests/fixtures/extractors/product_page.html"), + ), + ( + "https://www.ebay.com/itm/123456", + include_str!("../../tests/fixtures/extractors/product_page.html"), + ), + ( + "https://www.etsy.com/listing/123456/test", + include_str!("../../tests/fixtures/extractors/product_page.html"), + ), + ( + "https://shop.example/products/widget", + include_str!("../../tests/fixtures/extractors/product_page.html"), + ), + ( + "https://store.example/product/widget", + include_str!("../../tests/fixtures/extractors/product_page.html"), + ), + ( + "https://shop.example/products/widget.js", + include_str!("../../tests/fixtures/extractors/shopify_product.json"), + ), + ( + "https://shop.example/collections/frontpage/products.json", + include_str!("../../tests/fixtures/extractors/shopify_collection.json"), + ), + ( + "https://www.trustpilot.com/review/example.com", + include_str!("../../tests/fixtures/extractors/trustpilot.html"), + ), + ]); + + let amazon = amazon_product::extract(&client, "https://www.amazon.com/dp/B000123") + .await + .unwrap(); + assert_eq!(amazon["title"], "Fixture Widget"); + assert_eq!(amazon["price"], "19.99"); + + let ebay = ebay_listing::extract(&client, "https://www.ebay.com/itm/123456") + .await + .unwrap(); + assert_eq!(ebay["title"], "Fixture Widget"); + + let etsy = etsy_listing::extract(&client, "https://www.etsy.com/listing/123456/test") + .await + .unwrap(); + assert_eq!(etsy["availability"], "InStock"); + + let generic = ecommerce_product::extract(&client, "https://shop.example/products/widget") + .await + .unwrap(); + assert_eq!(generic["brand"], "FixtureCo"); + + let woo = woocommerce_product::extract(&client, "https://store.example/product/widget") + .await + .unwrap(); + assert_eq!(woo["sku"], "WIDGET-1"); + + let shopify = + shopify_product::extract(&client, "https://shop.example/products/widget") + .await + .unwrap(); + assert_eq!(shopify["title"], "Shopify Widget"); + + let collection = shopify_collection::extract( + &client, + "https://shop.example/collections/frontpage", + ) + .await + .unwrap(); + assert_eq!(collection["products"][0]["title"], "Shopify Widget"); + + let trustpilot = trustpilot_reviews::extract( + &client, + "https://www.trustpilot.com/review/example.com", + ) + .await + .unwrap(); + assert_eq!(trustpilot["business"], "Example Inc"); + 
assert_eq!(trustpilot["reviews"][0]["rating"], 5); + } + } } diff --git a/crates/noxa-fetch/src/extractors/product.rs b/crates/noxa-fetch/src/extractors/product.rs new file mode 100644 index 0000000..978c6a3 --- /dev/null +++ b/crates/noxa-fetch/src/extractors/product.rs @@ -0,0 +1,133 @@ +use regex::Regex; +use serde_json::{Value, json}; + +pub fn parse_product_page(url: &str, html: &str, source: &str) -> Value { + let product = json_ld_values(html) + .into_iter() + .flat_map(flatten_graph) + .find(is_product) + .unwrap_or_else(|| json!({})); + let offers = product.get("offers").and_then(first_or_self); + let rating = product.get("aggregateRating"); + + json!({ + "url": url, + "source": source, + "title": string_field(&product, "name").or_else(|| og(html, "title")), + "description": string_field(&product, "description"), + "sku": string_field(&product, "sku"), + "brand": product.get("brand").and_then(|brand| { + string_field(brand, "name").or_else(|| brand.as_str().map(ToString::to_string)) + }), + "image": product.get("image").cloned(), + "price": offers.and_then(|offers| string_field(offers, "price")).or_else(|| meta_property(html, "product:price:amount")), + "currency": offers.and_then(|offers| string_field(offers, "priceCurrency")).or_else(|| meta_property(html, "product:price:currency")), + "availability": offers + .and_then(|offers| string_field(offers, "availability")) + .map(|availability| availability.rsplit('/').next().unwrap_or(&availability).to_string()), + "offer_url": offers.and_then(|offers| string_field(offers, "url")), + "rating": rating.and_then(|rating| string_field(rating, "ratingValue")), + "review_count": rating.and_then(|rating| string_field(rating, "reviewCount")), + }) +} + +pub fn parse_trustpilot_page(url: &str, html: &str) -> Value { + let business = json_ld_values(html) + .into_iter() + .flat_map(flatten_graph) + .find(|value| { + value.get("@type").is_some_and(|kind| { + kind == "LocalBusiness" || kind == "Organization" || kind == "Corporation" + }) + }) + .unwrap_or_else(|| json!({})); + let reviews: Vec<_> = business + .get("review") + .and_then(first_or_array) + .into_iter() + .flatten() + .map(|review| { + json!({ + "author": review.pointer("/author/name").and_then(Value::as_str), + "rating": review.pointer("/reviewRating/ratingValue").cloned(), + "body": review.get("reviewBody").cloned(), + "date": review.get("datePublished").cloned(), + }) + }) + .collect(); + + json!({ + "url": url, + "business": string_field(&business, "name"), + "rating": business.pointer("/aggregateRating/ratingValue").cloned(), + "review_count": business.pointer("/aggregateRating/reviewCount").cloned(), + "reviews": reviews, + }) +} + +fn json_ld_values(html: &str) -> Vec { + let Ok(re) = Regex::new( + r#"(?is)]+type=["']application/ld\+json["'][^>]*>(.*?)"#, + ) else { + return Vec::new(); + }; + re.captures_iter(html) + .filter_map(|captures| captures.get(1)) + .filter_map(|body| serde_json::from_str::(body.as_str().trim()).ok()) + .collect() +} + +fn flatten_graph(value: Value) -> Vec { + if let Some(values) = value.as_array() { + return values.clone(); + } + if let Some(graph) = value.get("@graph").and_then(Value::as_array) { + return graph.clone(); + } + vec![value] +} + +fn is_product(value: &Value) -> bool { + match value.get("@type") { + Some(Value::String(kind)) => kind == "Product", + Some(Value::Array(kinds)) => kinds.iter().any(|kind| kind == "Product"), + _ => false, + } +} + +fn first_or_self(value: &Value) -> Option<&Value> { + value.as_array().and_then(|values| 
values.first()).or(Some(value)) +} + +fn first_or_array(value: &Value) -> Option> { + value + .as_array() + .map(|values| values.iter().collect()) + .or_else(|| Some(vec![value])) +} + +fn string_field(value: &Value, key: &str) -> Option { + value.get(key).and_then(|field| { + field + .as_str() + .map(ToString::to_string) + .or_else(|| field.as_i64().map(|number| number.to_string())) + .or_else(|| field.as_f64().map(|number| number.to_string())) + }) +} + +fn og(html: &str, prop: &str) -> Option { + meta_property(html, &format!("og:{prop}")) +} + +fn meta_property(html: &str, property: &str) -> Option { + let pattern = format!( + r#"(?is)]+property=["']{}["'][^>]+content=["']([^"']+)["']"#, + regex::escape(property) + ); + Regex::new(&pattern) + .ok()? + .captures(html) + .and_then(|captures| captures.get(1)) + .map(|value| value.as_str().to_string()) +} diff --git a/crates/noxa-fetch/src/extractors/shopify_collection.rs b/crates/noxa-fetch/src/extractors/shopify_collection.rs index 0d52211..c651246 100644 --- a/crates/noxa-fetch/src/extractors/shopify_collection.rs +++ b/crates/noxa-fetch/src/extractors/shopify_collection.rs @@ -1,6 +1,6 @@ -use serde_json::Value; +use serde_json::{Value, json}; -use super::{ExtractorInfo, http::ExtractorHttp, stub_error}; +use super::{ExtractorInfo, http::ExtractorHttp}; use crate::error::FetchError; pub const INFO: ExtractorInfo = ExtractorInfo { @@ -14,6 +14,12 @@ pub fn matches(url: &str) -> bool { url.contains("/collections/") } -pub async fn extract(_client: &dyn ExtractorHttp, _url: &str) -> Result { - Err(stub_error(INFO.name)) +pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { + let api_url = format!("{}/products.json", url.trim_end_matches('/')); + let collection = client.get_json(&api_url).await?; + Ok(json!({ + "url": url, + "api_url": api_url, + "products": collection.get("products").cloned().unwrap_or_else(|| json!([])), + })) } diff --git a/crates/noxa-fetch/src/extractors/shopify_product.rs b/crates/noxa-fetch/src/extractors/shopify_product.rs index 18261d7..a2dfebe 100644 --- a/crates/noxa-fetch/src/extractors/shopify_product.rs +++ b/crates/noxa-fetch/src/extractors/shopify_product.rs @@ -1,6 +1,6 @@ -use serde_json::Value; +use serde_json::{Value, json}; -use super::{ExtractorInfo, http::ExtractorHttp, stub_error}; +use super::{ExtractorInfo, http::ExtractorHttp}; use crate::error::FetchError; pub const INFO: ExtractorInfo = ExtractorInfo { @@ -14,6 +14,20 @@ pub fn matches(url: &str) -> bool { url.contains("/products/") } -pub async fn extract(_client: &dyn ExtractorHttp, _url: &str) -> Result { - Err(stub_error(INFO.name)) +pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { + let product_url = format!("{}.js", url.trim_end_matches('/')); + let product = client.get_json(&product_url).await?; + Ok(json!({ + "url": url, + "api_url": product_url, + "id": product.get("id").cloned(), + "title": product.get("title").cloned(), + "handle": product.get("handle").cloned(), + "vendor": product.get("vendor").cloned(), + "product_type": product.get("product_type").cloned(), + "tags": product.get("tags").cloned(), + "variants": product.get("variants").cloned(), + "images": product.get("images").cloned(), + "description": product.get("description").cloned(), + })) } diff --git a/crates/noxa-fetch/src/extractors/trustpilot_reviews.rs b/crates/noxa-fetch/src/extractors/trustpilot_reviews.rs index 1d983da..e25269a 100644 --- a/crates/noxa-fetch/src/extractors/trustpilot_reviews.rs +++ 
b/crates/noxa-fetch/src/extractors/trustpilot_reviews.rs @@ -1,6 +1,6 @@ use serde_json::Value; -use super::{ExtractorInfo, host_matches, http::ExtractorHttp, stub_error}; +use super::{ExtractorInfo, host_matches, http::ExtractorHttp, product}; use crate::error::FetchError; pub const INFO: ExtractorInfo = ExtractorInfo { @@ -14,6 +14,7 @@ pub fn matches(url: &str) -> bool { host_matches(url, "trustpilot.com") && url.contains("/review/") } -pub async fn extract(_client: &dyn ExtractorHttp, _url: &str) -> Result { - Err(stub_error(INFO.name)) +pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { + let html = client.get_text(url).await?; + Ok(product::parse_trustpilot_page(url, &html)) } diff --git a/crates/noxa-fetch/src/extractors/woocommerce_product.rs b/crates/noxa-fetch/src/extractors/woocommerce_product.rs index afec512..291326e 100644 --- a/crates/noxa-fetch/src/extractors/woocommerce_product.rs +++ b/crates/noxa-fetch/src/extractors/woocommerce_product.rs @@ -1,6 +1,6 @@ use serde_json::Value; -use super::{ExtractorInfo, http::ExtractorHttp, stub_error}; +use super::{ExtractorInfo, http::ExtractorHttp, product}; use crate::error::FetchError; pub const INFO: ExtractorInfo = ExtractorInfo { @@ -14,6 +14,7 @@ pub fn matches(url: &str) -> bool { url.contains("/product/") } -pub async fn extract(_client: &dyn ExtractorHttp, _url: &str) -> Result { - Err(stub_error(INFO.name)) +pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { + let html = client.get_text(url).await?; + Ok(product::parse_product_page(url, &html, INFO.name)) } diff --git a/crates/noxa-fetch/tests/fixtures/extractors/product_page.html b/crates/noxa-fetch/tests/fixtures/extractors/product_page.html new file mode 100644 index 0000000..1f3b338 --- /dev/null +++ b/crates/noxa-fetch/tests/fixtures/extractors/product_page.html @@ -0,0 +1,32 @@ + + + Fixture Widget + + + + + +
+Fixture Widget
+ diff --git a/crates/noxa-fetch/tests/fixtures/extractors/shopify_collection.json b/crates/noxa-fetch/tests/fixtures/extractors/shopify_collection.json new file mode 100644 index 0000000..be43080 --- /dev/null +++ b/crates/noxa-fetch/tests/fixtures/extractors/shopify_collection.json @@ -0,0 +1,14 @@ +{ + "products": [ + { + "id": 1, + "title": "Shopify Widget", + "handle": "widget", + "vendor": "FixtureCo", + "product_type": "Gadgets", + "tags": ["fixture"], + "variants": [{ "id": 11, "price": "1999", "available": true }], + "images": ["https://example.com/widget.jpg"] + } + ] +} diff --git a/crates/noxa-fetch/tests/fixtures/extractors/shopify_product.json b/crates/noxa-fetch/tests/fixtures/extractors/shopify_product.json new file mode 100644 index 0000000..5d7fbad --- /dev/null +++ b/crates/noxa-fetch/tests/fixtures/extractors/shopify_product.json @@ -0,0 +1,19 @@ +{ + "id": 1, + "title": "Shopify Widget", + "handle": "widget", + "vendor": "FixtureCo", + "product_type": "Gadgets", + "tags": ["fixture"], + "variants": [ + { + "id": 11, + "title": "Default", + "price": "1999", + "available": true, + "sku": "WIDGET-1" + } + ], + "images": ["https://example.com/widget.jpg"], + "description": "A Shopify fixture widget." +} diff --git a/crates/noxa-fetch/tests/fixtures/extractors/trustpilot.html b/crates/noxa-fetch/tests/fixtures/extractors/trustpilot.html new file mode 100644 index 0000000..d305d78 --- /dev/null +++ b/crates/noxa-fetch/tests/fixtures/extractors/trustpilot.html @@ -0,0 +1,26 @@ + + + + +
+Example Inc Reviews
+ From c4af4c43cc988e77a737bfde725db2a808f9dded Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 00:01:34 -0400 Subject: [PATCH 16/28] feat(fetch): wire vertical extractor dispatch --- crates/noxa-fetch/src/client/fetch.rs | 93 +++++++++++++++++++++++++++ crates/noxa-fetch/src/client/tests.rs | 22 ++++++- 2 files changed, 114 insertions(+), 1 deletion(-) diff --git a/crates/noxa-fetch/src/client/fetch.rs b/crates/noxa-fetch/src/client/fetch.rs index d961951..948899c 100644 --- a/crates/noxa-fetch/src/client/fetch.rs +++ b/crates/noxa-fetch/src/client/fetch.rs @@ -112,11 +112,31 @@ impl FetchClient { Ok(result) } + #[instrument(skip(self), fields(url = %url, extractor = %extractor))] + pub async fn fetch_and_extract_vertical( + &self, + url: &str, + extractor: &str, + _options: &noxa_core::ExtractionOptions, + ) -> Result { + let data = crate::extractors::dispatch_by_name(self, extractor, url) + .await + .map_err(|error| FetchError::Build(error.to_string()))?; + let mut result = build_vertical_extraction_result(extractor, url, data); + result.metadata.fetched_at = Some(Utc::now().to_rfc3339()); + Ok(result) + } + async fn fetch_and_extract_inner( &self, url: &str, options: &noxa_core::ExtractionOptions, ) -> Result { + if let Some(result) = crate::extractors::dispatch_by_url(self, url).await { + let (extractor, data) = result?; + return Ok(build_vertical_extraction_result(extractor, url, data)); + } + if crate::reddit::is_reddit_url(url) { let json_url = crate::reddit::json_url(url); debug!("reddit detected, fetching {json_url}"); @@ -381,3 +401,76 @@ pub(super) fn pdf_to_extraction_result( structured_data: vec![], } } + +pub(super) fn build_vertical_extraction_result( + extractor: &str, + url: &str, + data: serde_json::Value, +) -> noxa_core::ExtractionResult { + let title = string_field(&data, &["title", "name", "full_name", "business"]) + .or_else(|| data.pointer("/post/title").and_then(|value| value.as_str()).map(ToString::to_string)) + .or_else(|| data.pointer("/metadata/title").and_then(|value| value.as_str()).map(ToString::to_string)); + let description = string_field(&data, &["description", "summary", "body", "abstract"]) + .or_else(|| data.pointer("/metadata/description").and_then(|value| value.as_str()).map(ToString::to_string)); + let pretty = serde_json::to_string_pretty(&data).unwrap_or_else(|_| data.to_string()); + let heading = title.clone().unwrap_or_else(|| extractor.to_string()); + let markdown = match description.as_deref() { + Some(description) if !description.is_empty() => { + format!("# {heading}\n\n{description}\n\n```json\n{pretty}\n```") + } + _ => format!("# {heading}\n\n```json\n{pretty}\n```"), + }; + let plain_text = crate::document::strip_markdown_formatting(&markdown); + let word_count = plain_text.split_whitespace().count(); + + noxa_core::ExtractionResult { + metadata: noxa_core::Metadata { + title, + description, + author: string_field(&data, &["author", "author_name"]), + published_date: string_field(&data, &["published_at", "published", "created_at"]), + language: None, + url: Some(url.to_string()), + site_name: Some(extractor.to_string()), + image: string_field(&data, &["image_url", "thumbnail_url"]), + favicon: None, + word_count, + content_hash: None, + source_type: Some("web".into()), + file_path: None, + last_modified: string_field(&data, &["updated_at", "last_modified"]), + is_truncated: None, + technologies: Vec::new(), + seed_url: None, + crawl_depth: None, + search_query: None, + fetched_at: None, + }, + content: 
noxa_core::Content { + markdown, + plain_text, + links: Vec::new(), + images: Vec::new(), + code_blocks: Vec::new(), + raw_html: None, + }, + domain_data: None, + vertical_data: Some(noxa_core::VerticalData { + extractor: extractor.to_string(), + data, + }), + structured_data: vec![], + } +} + +fn string_field(data: &serde_json::Value, keys: &[&str]) -> Option { + keys.iter().find_map(|key| { + data.get(*key).and_then(|value| { + value + .as_str() + .map(ToString::to_string) + .or_else(|| value.as_i64().map(|number| number.to_string())) + .or_else(|| value.as_f64().map(|number| number.to_string())) + }) + }) +} diff --git a/crates/noxa-fetch/src/client/tests.rs b/crates/noxa-fetch/src/client/tests.rs index 516d2bd..e17d9bd 100644 --- a/crates/noxa-fetch/src/client/tests.rs +++ b/crates/noxa-fetch/src/client/tests.rs @@ -2,7 +2,9 @@ use std::time::Duration; use crate::browser::BrowserProfile; use crate::client::batch::collect_ordered; -use crate::client::fetch::{is_pdf_content_type, pdf_to_extraction_result}; +use crate::client::fetch::{ + build_vertical_extraction_result, is_pdf_content_type, pdf_to_extraction_result, +}; use crate::client::pool::{extract_host, pick_for_host}; use crate::client::{ BatchExtractResult, BatchResult, ClientPool, FetchClient, FetchConfig, FetchResult, @@ -67,6 +69,24 @@ async fn test_collect_ordered_handles_gaps() { assert_eq!(results[1], "third"); } +#[test] +fn vertical_extraction_result_sets_vertical_payload_and_summary() { + let result = build_vertical_extraction_result( + "github_repo", + "https://github.com/jmagar/noxa", + serde_json::json!({ + "title": "Noxa Repo", + "description": "Repository metadata", + "stars": 42 + }), + ); + + assert_eq!(result.metadata.title.as_deref(), Some("Noxa Repo")); + assert_eq!(result.vertical_data.as_ref().unwrap().extractor, "github_repo"); + assert_eq!(result.vertical_data.as_ref().unwrap().data["stars"], 42); + assert!(result.content.markdown.contains("Repository metadata")); +} + #[test] fn test_is_pdf_content_type() { let mut headers = http::HeaderMap::new(); From d6005c0a885bb57e7d59338a4306d3eedbee2d2e Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 00:06:11 -0400 Subject: [PATCH 17/28] feat(fetch): port substack vertical extractor --- crates/noxa-fetch/src/extractors/arxiv.rs | 28 +- crates/noxa-fetch/src/extractors/crates_io.rs | 7 +- crates/noxa-fetch/src/extractors/dev_to.rs | 20 +- .../noxa-fetch/src/extractors/github_issue.rs | 6 +- crates/noxa-fetch/src/extractors/github_pr.rs | 6 +- .../src/extractors/github_release.rs | 6 +- .../noxa-fetch/src/extractors/hackernews.rs | 4 +- .../src/extractors/huggingface_dataset.rs | 4 +- .../src/extractors/huggingface_model.rs | 17 +- .../src/extractors/instagram_post.rs | 19 +- .../src/extractors/instagram_profile.rs | 27 +- .../src/extractors/linkedin_post.rs | 9 +- crates/noxa-fetch/src/extractors/mod.rs | 239 ++++++++++++------ crates/noxa-fetch/src/extractors/product.rs | 11 +- crates/noxa-fetch/src/extractors/pypi.rs | 5 +- crates/noxa-fetch/src/extractors/reddit.rs | 4 +- .../src/extractors/stackoverflow.rs | 16 +- .../src/extractors/substack_post.rs | 141 ++++++++++- .../fixtures/extractors/substack_post.html | 30 +++ 19 files changed, 468 insertions(+), 131 deletions(-) create mode 100644 crates/noxa-fetch/tests/fixtures/extractors/substack_post.html diff --git a/crates/noxa-fetch/src/extractors/arxiv.rs b/crates/noxa-fetch/src/extractors/arxiv.rs index ada5a5f..b7ed429 100644 --- a/crates/noxa-fetch/src/extractors/arxiv.rs +++ 
b/crates/noxa-fetch/src/extractors/arxiv.rs @@ -84,21 +84,19 @@ fn parse_atom_entry(xml: &str) -> Option { loop { match reader.read_event_into(&mut buf) { - Ok(Event::Start(element)) => { - match element.local_name().as_ref() { - b"entry" => in_entry = true, - b"id" if in_entry && !in_author => current = Some("id"), - b"title" if in_entry => current = Some("title"), - b"summary" if in_entry => current = Some("summary"), - b"published" if in_entry => current = Some("published"), - b"updated" if in_entry => current = Some("updated"), - b"author" if in_entry => in_author = true, - b"name" if in_author => current = Some("author"), - b"doi" if in_entry => current = Some("doi"), - b"comment" if in_entry => current = Some("comment"), - _ => {} - } - } + Ok(Event::Start(element)) => match element.local_name().as_ref() { + b"entry" => in_entry = true, + b"id" if in_entry && !in_author => current = Some("id"), + b"title" if in_entry => current = Some("title"), + b"summary" if in_entry => current = Some("summary"), + b"published" if in_entry => current = Some("published"), + b"updated" if in_entry => current = Some("updated"), + b"author" if in_entry => in_author = true, + b"name" if in_author => current = Some("author"), + b"doi" if in_entry => current = Some("doi"), + b"comment" if in_entry => current = Some("comment"), + _ => {} + }, Ok(Event::Empty(element)) if in_entry => { let mut term = None; let mut href = None; diff --git a/crates/noxa-fetch/src/extractors/crates_io.rs b/crates/noxa-fetch/src/extractors/crates_io.rs index 8926eab..b9584f3 100644 --- a/crates/noxa-fetch/src/extractors/crates_io.rs +++ b/crates/noxa-fetch/src/extractors/crates_io.rs @@ -27,7 +27,12 @@ pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result Option<(String, String)> { } const RESERVED_FIRST_SEGS: &[&str] = &[ - "api", "tags", "search", "settings", "enter", "signup", "about", "privacy", "terms", - "contact", "sponsorships", "sponsors", "shop", "videos", "listings", "podcasts", "p", "t", + "api", + "tags", + "search", + "settings", + "enter", + "signup", + "about", + "privacy", + "terms", + "contact", + "sponsorships", + "sponsors", + "shop", + "videos", + "listings", + "podcasts", + "p", + "t", ]; diff --git a/crates/noxa-fetch/src/extractors/github_issue.rs b/crates/noxa-fetch/src/extractors/github_issue.rs index d030ade..394c796 100644 --- a/crates/noxa-fetch/src/extractors/github_issue.rs +++ b/crates/noxa-fetch/src/extractors/github_issue.rs @@ -58,7 +58,11 @@ fn parse_issue(url: &str) -> Option<(String, String, u64)> { if segs.len() < 4 || segs[2] != "issues" { return None; } - Some((segs[0].to_string(), segs[1].to_string(), segs[3].parse().ok()?)) + Some(( + segs[0].to_string(), + segs[1].to_string(), + segs[3].parse().ok()?, + )) } fn names_array(value: Option<&Value>) -> Value { diff --git a/crates/noxa-fetch/src/extractors/github_pr.rs b/crates/noxa-fetch/src/extractors/github_pr.rs index 3b326ef..593939c 100644 --- a/crates/noxa-fetch/src/extractors/github_pr.rs +++ b/crates/noxa-fetch/src/extractors/github_pr.rs @@ -62,7 +62,11 @@ fn parse_pr(url: &str) -> Option<(String, String, u64)> { if segs.len() < 4 || (segs[2] != "pull" && segs[2] != "pulls") { return None; } - Some((segs[0].to_string(), segs[1].to_string(), segs[3].parse().ok()?)) + Some(( + segs[0].to_string(), + segs[1].to_string(), + segs[3].parse().ok()?, + )) } fn names_array(value: Option<&Value>) -> Value { diff --git a/crates/noxa-fetch/src/extractors/github_release.rs b/crates/noxa-fetch/src/extractors/github_release.rs 
index 2b9dab5..8d27e96 100644 --- a/crates/noxa-fetch/src/extractors/github_release.rs +++ b/crates/noxa-fetch/src/extractors/github_release.rs @@ -58,5 +58,9 @@ fn parse_release(url: &str) -> Option<(String, String, String)> { if segs.len() < 5 || segs[2] != "releases" || segs[3] != "tag" { return None; } - Some((segs[0].to_string(), segs[1].to_string(), segs[4].to_string())) + Some(( + segs[0].to_string(), + segs[1].to_string(), + segs[4].to_string(), + )) } diff --git a/crates/noxa-fetch/src/extractors/hackernews.rs b/crates/noxa-fetch/src/extractors/hackernews.rs index d3cd20b..51c5571 100644 --- a/crates/noxa-fetch/src/extractors/hackernews.rs +++ b/crates/noxa-fetch/src/extractors/hackernews.rs @@ -50,7 +50,9 @@ pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result Option { let parsed = url::Url::parse(url).ok()?; if parsed.host_str()? == "hn.algolia.com" { - return parsed.path_segments()?.find_map(|segment| segment.parse().ok()); + return parsed + .path_segments()? + .find_map(|segment| segment.parse().ok()); } parsed .query_pairs() diff --git a/crates/noxa-fetch/src/extractors/huggingface_dataset.rs b/crates/noxa-fetch/src/extractors/huggingface_dataset.rs index 9cd7f1a..a7f907b 100644 --- a/crates/noxa-fetch/src/extractors/huggingface_dataset.rs +++ b/crates/noxa-fetch/src/extractors/huggingface_dataset.rs @@ -16,7 +16,9 @@ pub fn matches(url: &str) -> bool { pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { let dataset_path = parse_dataset_path(url).ok_or_else(|| { - FetchError::Build(format!("hf_dataset: cannot parse dataset path from '{url}'")) + FetchError::Build(format!( + "hf_dataset: cannot parse dataset path from '{url}'" + )) })?; let api_url = format!("https://huggingface.co/api/datasets/{dataset_path}"); let dataset = client.get_json(&api_url).await?; diff --git a/crates/noxa-fetch/src/extractors/huggingface_model.rs b/crates/noxa-fetch/src/extractors/huggingface_model.rs index d490441..1aa575d 100644 --- a/crates/noxa-fetch/src/extractors/huggingface_model.rs +++ b/crates/noxa-fetch/src/extractors/huggingface_model.rs @@ -65,6 +65,19 @@ fn parse_owner_name(url: &str) -> Option<(String, String)> { } const RESERVED_NAMESPACES: &[&str] = &[ - "datasets", "spaces", "blog", "docs", "api", "models", "papers", "pricing", "tasks", - "join", "login", "settings", "organizations", "new", "search", + "datasets", + "spaces", + "blog", + "docs", + "api", + "models", + "papers", + "pricing", + "tasks", + "join", + "login", + "settings", + "organizations", + "new", + "search", ]; diff --git a/crates/noxa-fetch/src/extractors/instagram_post.rs b/crates/noxa-fetch/src/extractors/instagram_post.rs index 3c8f37a..2230297 100644 --- a/crates/noxa-fetch/src/extractors/instagram_post.rs +++ b/crates/noxa-fetch/src/extractors/instagram_post.rs @@ -8,7 +8,10 @@ pub const INFO: ExtractorInfo = ExtractorInfo { name: "instagram_post", label: "Instagram Post", description: "Extract post metadata from Instagram.", - url_patterns: &["https://www.instagram.com/p/*", "https://www.instagram.com/reel/*"], + url_patterns: &[ + "https://www.instagram.com/p/*", + "https://www.instagram.com/reel/*", + ], }; pub fn matches(url: &str) -> bool { @@ -17,7 +20,9 @@ pub fn matches(url: &str) -> bool { pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { let (kind, shortcode) = parse_shortcode(url).ok_or_else(|| { - FetchError::Build(format!("instagram_post: cannot parse shortcode from '{url}'")) + FetchError::Build(format!( + "instagram_post: cannot parse 
shortcode from '{url}'" + )) })?; let embed_url = format!("https://www.instagram.com/p/{shortcode}/embed/captioned/"); let html = client.get_text(&embed_url).await?; @@ -69,8 +74,7 @@ fn parse_username(html: &str) -> Option { fn parse_caption(html: &str) -> Option { let outer = Regex::new(r#"(?s)]*>(.*?)"#).ok()?; let block = outer.captures(html)?.get(1)?.as_str(); - let user_re = - Regex::new(r#"(?s)]*class="CaptionUsername"[^>]*>.*?"#).ok()?; + let user_re = Regex::new(r#"(?s)]*class="CaptionUsername"[^>]*>.*?"#).ok()?; let stripped = user_re.replace_all(block, ""); let tag_re = Regex::new(r"<[^>]+>").ok()?; let text = tag_re.replace_all(&stripped, " "); @@ -80,10 +84,9 @@ fn parse_caption(html: &str) -> Option { } fn parse_thumbnail(html: &str) -> Option { - let img_re = Regex::new( - r#"(?s)]+class="[^"]*EmbeddedMediaImage[^"]*"[^>]+src="([^"]+)""#, - ) - .ok()?; + let img_re = + Regex::new(r#"(?s)]+class="[^"]*EmbeddedMediaImage[^"]*"[^>]+src="([^"]+)""#) + .ok()?; img_re .captures(html) .and_then(|captures| captures.get(1)) diff --git a/crates/noxa-fetch/src/extractors/instagram_profile.rs b/crates/noxa-fetch/src/extractors/instagram_profile.rs index 1b688ea..bf3b18e 100644 --- a/crates/noxa-fetch/src/extractors/instagram_profile.rs +++ b/crates/noxa-fetch/src/extractors/instagram_profile.rs @@ -16,7 +16,9 @@ pub fn matches(url: &str) -> bool { pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { let username = parse_username(url).ok_or_else(|| { - FetchError::Build(format!("instagram_profile: cannot parse username from '{url}'")) + FetchError::Build(format!( + "instagram_profile: cannot parse username from '{url}'" + )) })?; let api_url = format!("https://www.instagram.com/api/v1/users/web_profile_info/?username={username}"); @@ -70,9 +72,26 @@ fn parse_username(url: &str) -> Option { } const RESERVED: &[&str] = &[ - "p", "reel", "reels", "tv", "explore", "stories", "directory", "accounts", "about", - "developer", "press", "api", "ads", "blog", "fragments", "terms", "privacy", "session", - "login", "signup", + "p", + "reel", + "reels", + "tv", + "explore", + "stories", + "directory", + "accounts", + "about", + "developer", + "press", + "api", + "ads", + "blog", + "fragments", + "terms", + "privacy", + "session", + "login", + "signup", ]; fn post_summary(node: &Value) -> Value { diff --git a/crates/noxa-fetch/src/extractors/linkedin_post.rs b/crates/noxa-fetch/src/extractors/linkedin_post.rs index fd730a5..d37a820 100644 --- a/crates/noxa-fetch/src/extractors/linkedin_post.rs +++ b/crates/noxa-fetch/src/extractors/linkedin_post.rs @@ -8,7 +8,10 @@ pub const INFO: ExtractorInfo = ExtractorInfo { name: "linkedin_post", label: "LinkedIn Post", description: "Extract post metadata from LinkedIn.", - url_patterns: &["https://www.linkedin.com/posts/*", "https://www.linkedin.com/feed/update/*"], + url_patterns: &[ + "https://www.linkedin.com/posts/*", + "https://www.linkedin.com/feed/update/*", + ], }; pub fn matches(url: &str) -> bool { @@ -51,7 +54,9 @@ fn extract_urn(url: &str) -> Option { if parts.next() == Some("urn") && parts.next() == Some("li") && parts.next().is_some() - && parts.next().is_some_and(|part| part.chars().all(|c| c.is_ascii_digit())) + && parts + .next() + .is_some_and(|part| part.chars().all(|c| c.is_ascii_digit())) { return Some(urn.to_string()); } diff --git a/crates/noxa-fetch/src/extractors/mod.rs b/crates/noxa-fetch/src/extractors/mod.rs index 4acb257..7e161e4 100644 --- a/crates/noxa-fetch/src/extractors/mod.rs +++ 
b/crates/noxa-fetch/src/extractors/mod.rs @@ -20,8 +20,8 @@ pub mod instagram_post; pub mod instagram_profile; pub mod linkedin_post; pub mod npm; -pub mod pypi; mod product; +pub mod pypi; pub mod reddit; pub mod shopify_collection; pub mod shopify_product; @@ -84,7 +84,11 @@ pub async fn dispatch_by_url( url: &str, ) -> Option> { if reddit::matches(url) { - return Some(reddit::extract(client, url).await.map(|v| (reddit::INFO.name, v))); + return Some( + reddit::extract(client, url) + .await + .map(|v| (reddit::INFO.name, v)), + ); } if hackernews::matches(url) { return Some( @@ -101,7 +105,11 @@ pub async fn dispatch_by_url( ); } if pypi::matches(url) { - return Some(pypi::extract(client, url).await.map(|v| (pypi::INFO.name, v))); + return Some( + pypi::extract(client, url) + .await + .map(|v| (pypi::INFO.name, v)), + ); } if npm::matches(url) { return Some(npm::extract(client, url).await.map(|v| (npm::INFO.name, v))); @@ -149,7 +157,11 @@ pub async fn dispatch_by_url( ); } if arxiv::matches(url) { - return Some(arxiv::extract(client, url).await.map(|v| (arxiv::INFO.name, v))); + return Some( + arxiv::extract(client, url) + .await + .map(|v| (arxiv::INFO.name, v)), + ); } if docker_hub::matches(url) { return Some( @@ -159,7 +171,11 @@ pub async fn dispatch_by_url( ); } if dev_to::matches(url) { - return Some(dev_to::extract(client, url).await.map(|v| (dev_to::INFO.name, v))); + return Some( + dev_to::extract(client, url) + .await + .map(|v| (dev_to::INFO.name, v)), + ); } if stackoverflow::matches(url) { return Some( @@ -234,7 +250,10 @@ pub async fn dispatch_by_name( ) -> Result { match name { n if n == reddit::INFO.name => { - run_or_mismatch(reddit::matches(url), n, url, || reddit::extract(client, url)).await + run_or_mismatch(reddit::matches(url), n, url, || { + reddit::extract(client, url) + }) + .await } n if n == hackernews::INFO.name => { run_or_mismatch(hackernews::matches(url), n, url, || { @@ -300,7 +319,10 @@ pub async fn dispatch_by_name( .await } n if n == dev_to::INFO.name => { - run_or_mismatch(dev_to::matches(url), n, url, || dev_to::extract(client, url)).await + run_or_mismatch(dev_to::matches(url), n, url, || { + dev_to::extract(client, url) + }) + .await } n if n == stackoverflow::INFO.name => { run_or_mismatch(stackoverflow::matches(url), n, url, || { @@ -426,10 +448,6 @@ fn host_matches(url: &str, suffix: &str) -> bool { .is_some_and(|host| host == suffix || host.ends_with(&format!(".{suffix}"))) } -fn stub_error(name: &str) -> FetchError { - FetchError::Build(format!("extractor not implemented: {name}")) -} - #[cfg(test)] mod tests { use super::*; @@ -483,7 +501,8 @@ mod tests { async fn get_json(&self, url: &str) -> Result { let body = self.get_text(url).await?; - serde_json::from_str(&body).map_err(|error| FetchError::BodyDecode(error.to_string())) + serde_json::from_str(&body) + .map_err(|error| FetchError::BodyDecode(error.to_string())) } } @@ -491,7 +510,9 @@ mod tests { fn developer_matchers_accept_expected_urls() { assert!(github_repo::matches("https://github.com/jmagar/noxa")); assert!(github_pr::matches("https://github.com/jmagar/noxa/pull/12")); - assert!(github_issue::matches("https://github.com/jmagar/noxa/issues/34")); + assert!(github_issue::matches( + "https://github.com/jmagar/noxa/issues/34" + )); assert!(github_release::matches( "https://github.com/jmagar/noxa/releases/tag/v0.7.0" )); @@ -503,8 +524,12 @@ mod tests { #[test] fn github_repo_does_not_preempt_more_specific_github_extractors() { - 
assert!(!github_repo::matches("https://github.com/jmagar/noxa/pull/12")); - assert!(!github_repo::matches("https://github.com/jmagar/noxa/issues/34")); + assert!(!github_repo::matches( + "https://github.com/jmagar/noxa/pull/12" + )); + assert!(!github_repo::matches( + "https://github.com/jmagar/noxa/issues/34" + )); assert!(!github_repo::matches( "https://github.com/jmagar/noxa/releases/tag/v0.7.0" )); @@ -552,16 +577,21 @@ mod tests { ), ]); - let repo = github_repo::extract(&client, "https://github.com/jmagar/noxa").await.unwrap(); + let repo = github_repo::extract(&client, "https://github.com/jmagar/noxa") + .await + .unwrap(); assert_eq!(repo["full_name"], "jmagar/noxa"); assert_eq!(repo["stars"], 42); - let pr = github_pr::extract(&client, "https://github.com/jmagar/noxa/pull/12").await.unwrap(); + let pr = github_pr::extract(&client, "https://github.com/jmagar/noxa/pull/12") + .await + .unwrap(); assert_eq!(pr["number"], 12); assert_eq!(pr["title"], "Port upstream extractors"); - let issue = - github_issue::extract(&client, "https://github.com/jmagar/noxa/issues/34").await.unwrap(); + let issue = github_issue::extract(&client, "https://github.com/jmagar/noxa/issues/34") + .await + .unwrap(); assert_eq!(issue["number"], 34); assert_eq!(issue["labels"][0], "bug"); @@ -574,17 +604,21 @@ mod tests { assert_eq!(release["tag_name"], "v0.7.0"); assert_eq!(release["total_downloads"], 7); - let pypi = pypi::extract(&client, "https://pypi.org/project/requests/").await.unwrap(); + let pypi = pypi::extract(&client, "https://pypi.org/project/requests/") + .await + .unwrap(); assert_eq!(pypi["name"], "requests"); assert_eq!(pypi["version"], "2.32.3"); - let npm = - npm::extract(&client, "https://www.npmjs.com/package/@types/node").await.unwrap(); + let npm = npm::extract(&client, "https://www.npmjs.com/package/@types/node") + .await + .unwrap(); assert_eq!(npm["name"], "@types/node"); assert_eq!(npm["weekly_downloads"], 123456); - let crate_data = - crates_io::extract(&client, "https://crates.io/crates/serde").await.unwrap(); + let crate_data = crates_io::extract(&client, "https://crates.io/crates/serde") + .await + .unwrap(); assert_eq!(crate_data["name"], "serde"); assert_eq!(crate_data["downloads"], 1000); @@ -626,19 +660,27 @@ mod tests { async fn get_json(&self, url: &str) -> Result { let body = self.get_text(url).await?; - serde_json::from_str(&body).map_err(|error| FetchError::BodyDecode(error.to_string())) + serde_json::from_str(&body) + .map_err(|error| FetchError::BodyDecode(error.to_string())) } } #[test] fn community_matchers_accept_expected_urls() { assert!(arxiv::matches("https://arxiv.org/abs/2401.12345v2")); - assert!(hackernews::matches("https://news.ycombinator.com/item?id=123")); + assert!(hackernews::matches( + "https://news.ycombinator.com/item?id=123" + )); assert!(dev_to::matches("https://dev.to/jmagar/porting-noxa")); assert!(stackoverflow::matches( "https://stackoverflow.com/questions/12345/how-to-test-rust" )); - assert!(youtube_video::matches("https://www.youtube.com/watch?v=dQw4w9WgXcQ")); + assert!(substack_post::matches( + "https://example.substack.com/p/porting-noxa" + )); + assert!(youtube_video::matches( + "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + )); assert!(youtube_video::matches("https://youtu.be/dQw4w9WgXcQ")); } @@ -665,6 +707,10 @@ mod tests { "https://api.stackexchange.com/2.3/questions/12345/answers?site=stackoverflow&filter=withbody&order=desc&sort=votes", include_str!("../../tests/fixtures/extractors/stackoverflow_answers.json"), ), + ( + 
"https://example.substack.com/p/porting-noxa", + include_str!("../../tests/fixtures/extractors/substack_post.html"), + ), ( "https://www.youtube.com/watch?v=dQw4w9WgXcQ", include_str!("../../tests/fixtures/extractors/youtube_video.html"), @@ -699,12 +745,19 @@ mod tests { assert_eq!(question["question_id"], 12345); assert_eq!(question["accepted_answer"]["answer_id"], 99); - let video = youtube_video::extract( - &client, - "https://www.youtube.com/watch?v=dQw4w9WgXcQ", - ) - .await - .unwrap(); + let post = + substack_post::extract(&client, "https://example.substack.com/p/porting-noxa") + .await + .unwrap(); + assert_eq!(post["title"], "Porting Noxa Verticals"); + assert_eq!(post["author"], "Ada Lovelace"); + assert_eq!(post["published_at"], "2026-04-26T12:00:00Z"); + assert!(post["body"].as_str().unwrap().contains("Extractor parity")); + + let video = + youtube_video::extract(&client, "https://www.youtube.com/watch?v=dQw4w9WgXcQ") + .await + .unwrap(); assert_eq!(video["video_id"], "dQw4w9WgXcQ"); assert_eq!(video["title"], "Test Video"); assert_eq!(video["view_count"], 1000); @@ -741,19 +794,34 @@ mod tests { async fn get_json(&self, url: &str) -> Result { let body = self.get_text(url).await?; - serde_json::from_str(&body).map_err(|error| FetchError::BodyDecode(error.to_string())) + serde_json::from_str(&body) + .map_err(|error| FetchError::BodyDecode(error.to_string())) } } #[test] fn social_matchers_disambiguate_urls() { - assert!(huggingface_model::matches("https://huggingface.co/openai/whisper-large-v3")); - assert!(!huggingface_model::matches("https://huggingface.co/datasets/openai/gsm8k")); - assert!(huggingface_dataset::matches("https://huggingface.co/datasets/openai/gsm8k")); - assert!(instagram_post::matches("https://www.instagram.com/p/ABC123/")); - assert!(instagram_post::matches("https://www.instagram.com/reel/ABC123/")); - assert!(!instagram_profile::matches("https://www.instagram.com/p/ABC123/")); - assert!(instagram_profile::matches("https://www.instagram.com/jmagar/")); + assert!(huggingface_model::matches( + "https://huggingface.co/openai/whisper-large-v3" + )); + assert!(!huggingface_model::matches( + "https://huggingface.co/datasets/openai/gsm8k" + )); + assert!(huggingface_dataset::matches( + "https://huggingface.co/datasets/openai/gsm8k" + )); + assert!(instagram_post::matches( + "https://www.instagram.com/p/ABC123/" + )); + assert!(instagram_post::matches( + "https://www.instagram.com/reel/ABC123/" + )); + assert!(!instagram_profile::matches( + "https://www.instagram.com/p/ABC123/" + )); + assert!(instagram_profile::matches( + "https://www.instagram.com/jmagar/" + )); assert!(linkedin_post::matches( "https://www.linkedin.com/feed/update/urn:li:activity:7452618583290892288" )); @@ -784,10 +852,12 @@ mod tests { ), ]); - let model = - huggingface_model::extract(&client, "https://huggingface.co/openai/whisper-large-v3") - .await - .unwrap(); + let model = huggingface_model::extract( + &client, + "https://huggingface.co/openai/whisper-large-v3", + ) + .await + .unwrap(); assert_eq!(model["model_id"], "openai/whisper-large-v3"); assert_eq!(model["file_count"], 1); @@ -806,10 +876,9 @@ mod tests { assert_eq!(post["shortcode"], "ABC123"); assert_eq!(post["author_username"], "jmagar"); - let profile = - instagram_profile::extract(&client, "https://www.instagram.com/jmagar/") - .await - .unwrap(); + let profile = instagram_profile::extract(&client, "https://www.instagram.com/jmagar/") + .await + .unwrap(); assert_eq!(profile["username"], "jmagar"); 
assert_eq!(profile["recent_posts"][0]["shortcode"], "ABC123"); @@ -846,7 +915,8 @@ mod tests { async fn get_json(&self, url: &str) -> Result { let body = self.get_text(url).await?; - serde_json::from_str(&body).map_err(|error| FetchError::BodyDecode(error.to_string())) + serde_json::from_str(&body) + .map_err(|error| FetchError::BodyDecode(error.to_string())) } } @@ -929,7 +999,8 @@ mod tests { async fn get_json(&self, url: &str) -> Result { let body = self.get_text(url).await?; - serde_json::from_str(&body).map_err(|error| FetchError::BodyDecode(error.to_string())) + serde_json::from_str(&body) + .map_err(|error| FetchError::BodyDecode(error.to_string())) } } @@ -937,23 +1008,41 @@ mod tests { async fn ecommerce_matchers_cover_auto_and_explicit_only_groups() { assert!(amazon_product::matches("https://www.amazon.com/dp/B000123")); assert!(ebay_listing::matches("https://www.ebay.com/itm/123456")); - assert!(etsy_listing::matches("https://www.etsy.com/listing/123456/test")); - assert!(trustpilot_reviews::matches("https://www.trustpilot.com/review/example.com")); + assert!(etsy_listing::matches( + "https://www.etsy.com/listing/123456/test" + )); + assert!(trustpilot_reviews::matches( + "https://www.trustpilot.com/review/example.com" + )); - assert!(shopify_product::matches("https://shop.example/products/widget")); - assert!(shopify_collection::matches("https://shop.example/collections/frontpage")); - assert!(ecommerce_product::matches("https://shop.example/products/widget")); - assert!(woocommerce_product::matches("https://store.example/product/widget")); + assert!(shopify_product::matches( + "https://shop.example/products/widget" + )); + assert!(shopify_collection::matches( + "https://shop.example/collections/frontpage" + )); + assert!(ecommerce_product::matches( + "https://shop.example/products/widget" + )); + assert!(woocommerce_product::matches( + "https://store.example/product/widget" + )); assert!( - dispatch_by_url(&FixtureHttp::new(&[]), "https://shop.example/products/widget") - .await - .is_none() + dispatch_by_url( + &FixtureHttp::new(&[]), + "https://shop.example/products/widget" + ) + .await + .is_none() ); assert!( - dispatch_by_url(&FixtureHttp::new(&[]), "https://store.example/product/widget") - .await - .is_none() + dispatch_by_url( + &FixtureHttp::new(&[]), + "https://store.example/product/widget" + ) + .await + .is_none() ); } @@ -1010,9 +1099,10 @@ mod tests { .unwrap(); assert_eq!(etsy["availability"], "InStock"); - let generic = ecommerce_product::extract(&client, "https://shop.example/products/widget") - .await - .unwrap(); + let generic = + ecommerce_product::extract(&client, "https://shop.example/products/widget") + .await + .unwrap(); assert_eq!(generic["brand"], "FixtureCo"); let woo = woocommerce_product::extract(&client, "https://store.example/product/widget") @@ -1020,18 +1110,15 @@ mod tests { .unwrap(); assert_eq!(woo["sku"], "WIDGET-1"); - let shopify = - shopify_product::extract(&client, "https://shop.example/products/widget") - .await - .unwrap(); + let shopify = shopify_product::extract(&client, "https://shop.example/products/widget") + .await + .unwrap(); assert_eq!(shopify["title"], "Shopify Widget"); - let collection = shopify_collection::extract( - &client, - "https://shop.example/collections/frontpage", - ) - .await - .unwrap(); + let collection = + shopify_collection::extract(&client, "https://shop.example/collections/frontpage") + .await + .unwrap(); assert_eq!(collection["products"][0]["title"], "Shopify Widget"); let trustpilot = 
trustpilot_reviews::extract( diff --git a/crates/noxa-fetch/src/extractors/product.rs b/crates/noxa-fetch/src/extractors/product.rs index 978c6a3..41862e1 100644 --- a/crates/noxa-fetch/src/extractors/product.rs +++ b/crates/noxa-fetch/src/extractors/product.rs @@ -66,9 +66,9 @@ pub fn parse_trustpilot_page(url: &str, html: &str) -> Value { } fn json_ld_values(html: &str) -> Vec { - let Ok(re) = Regex::new( - r#"(?is)]+type=["']application/ld\+json["'][^>]*>(.*?)"#, - ) else { + let Ok(re) = + Regex::new(r#"(?is)]+type=["']application/ld\+json["'][^>]*>(.*?)"#) + else { return Vec::new(); }; re.captures_iter(html) @@ -96,7 +96,10 @@ fn is_product(value: &Value) -> bool { } fn first_or_self(value: &Value) -> Option<&Value> { - value.as_array().and_then(|values| values.first()).or(Some(value)) + value + .as_array() + .and_then(|values| values.first()) + .or(Some(value)) } fn first_or_array(value: &Value) -> Option> { diff --git a/crates/noxa-fetch/src/extractors/pypi.rs b/crates/noxa-fetch/src/extractors/pypi.rs index ecd5b26..1f167ec 100644 --- a/crates/noxa-fetch/src/extractors/pypi.rs +++ b/crates/noxa-fetch/src/extractors/pypi.rs @@ -67,7 +67,10 @@ fn parse_project(url: &str) -> Option<(String, Option)> { if segs.len() < 2 || segs[0] != "project" { return None; } - Some((segs[1].to_string(), segs.get(2).map(|value| (*value).to_string()))) + Some(( + segs[1].to_string(), + segs.get(2).map(|value| (*value).to_string()), + )) } fn pick_license_classifier(classifiers: Option<&Value>) -> Option { diff --git a/crates/noxa-fetch/src/extractors/reddit.rs b/crates/noxa-fetch/src/extractors/reddit.rs index d6bc5f9..14c68ba 100644 --- a/crates/noxa-fetch/src/extractors/reddit.rs +++ b/crates/noxa-fetch/src/extractors/reddit.rs @@ -17,8 +17,8 @@ pub fn matches(url: &str) -> bool { pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { let json_url = crate::reddit::json_url(url); let body = client.get_text(&json_url).await?; - let extraction = crate::reddit::parse_reddit_json(body.as_bytes(), url) - .map_err(FetchError::BodyDecode)?; + let extraction = + crate::reddit::parse_reddit_json(body.as_bytes(), url).map_err(FetchError::BodyDecode)?; serde_json::to_value(extraction).map_err(|error| FetchError::BodyDecode(error.to_string())) } diff --git a/crates/noxa-fetch/src/extractors/stackoverflow.rs b/crates/noxa-fetch/src/extractors/stackoverflow.rs index e6b8e3a..455531d 100644 --- a/crates/noxa-fetch/src/extractors/stackoverflow.rs +++ b/crates/noxa-fetch/src/extractors/stackoverflow.rs @@ -16,10 +16,13 @@ pub fn matches(url: &str) -> bool { pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { let id = parse_question_id(url).ok_or_else(|| { - FetchError::Build(format!("stackoverflow: cannot parse question id from '{url}'")) + FetchError::Build(format!( + "stackoverflow: cannot parse question id from '{url}'" + )) })?; - let q_url = - format!("https://api.stackexchange.com/2.3/questions/{id}?site=stackoverflow&filter=withbody"); + let q_url = format!( + "https://api.stackexchange.com/2.3/questions/{id}?site=stackoverflow&filter=withbody" + ); let q_body = client.get_json(&q_url).await?; let question = q_body .get("items") @@ -50,7 +53,12 @@ pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result bool { - url.contains("/p/") + url::Url::parse(url) + .ok() + .and_then(|parsed| { + let host = parsed.host_str()?.to_ascii_lowercase(); + let has_post_path = parsed.path_segments().is_some_and(|mut segments| { + segments.next() == Some("p") && 
segments.next().is_some() + }); + Some(has_post_path && (host.ends_with(".substack.com") || host != "substack.com")) + }) + .unwrap_or(false) } -pub async fn extract(_client: &dyn ExtractorHttp, _url: &str) -> Result<Value, FetchError> { - Err(stub_error(INFO.name)) +pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result<Value, FetchError> { + let html = client.get_text(url).await?; + let article = article_json_ld(&html).unwrap_or_else(|| json!({})); + let body = article_body(&html); + + Ok(json!({ + "url": url, + "canonical_url": meta(&html, "property", "og:url").unwrap_or_else(|| url.to_string()), + "title": string_field(&article, "headline") + .or_else(|| string_field(&article, "name")) + .or_else(|| meta(&html, "property", "og:title")) + .or_else(|| title_tag(&html)), + "description": string_field(&article, "description") + .or_else(|| meta(&html, "property", "og:description")), + "author": author(&article).or_else(|| meta(&html, "name", "author")), + "published_at": string_field(&article, "datePublished") + .or_else(|| meta(&html, "property", "article:published_time")), + "modified_at": string_field(&article, "dateModified") + .or_else(|| meta(&html, "property", "article:modified_time")), + "image": article.get("image").cloned() + .or_else(|| meta(&html, "property", "og:image").map(Value::String)), + "body": body, + "data_source": "html", + })) +} + +fn article_json_ld(html: &str) -> Option<Value> { + let re = + Regex::new(r#"(?is)<script[^>]+type=["']application/ld\+json["'][^>]*>(.*?)</script>"#) + .ok()?; + re.captures_iter(html) + .filter_map(|captures| captures.get(1)) + .filter_map(|body| serde_json::from_str::<Value>(body.as_str().trim()).ok()) + .flat_map(flatten_graph) + .find(is_article) +} + +fn flatten_graph(value: Value) -> Vec<Value> { + if let Some(values) = value.as_array() { + return values.clone(); + } + if let Some(values) = value.get("@graph").and_then(Value::as_array) { + return values.clone(); + } + vec![value] +} + +fn is_article(value: &Value) -> bool { + match value.get("@type") { + Some(Value::String(kind)) => ARTICLE_TYPES.contains(&kind.as_str()), + Some(Value::Array(kinds)) => kinds + .iter() + .filter_map(Value::as_str) + .any(|kind| ARTICLE_TYPES.contains(&kind)), + _ => false, + } +} + +const ARTICLE_TYPES: &[&str] = &["Article", "BlogPosting", "NewsArticle"]; + +fn author(article: &Value) -> Option<String> { + let author = article.get("author")?; + if let Some(name) = string_field(author, "name") { + return Some(name); + } + author + .as_array() + .and_then(|authors| authors.first()) + .and_then(|author| { + string_field(author, "name").or_else(|| author.as_str().map(str::to_string)) + }) + .or_else(|| author.as_str().map(str::to_string)) +} + +fn article_body(html: &str) -> Option<String> { + let re = Regex::new(r"(?is)]*>(.*?)").ok()?; + let inner = re.captures(html)?.get(1)?.as_str(); + let text = strip_tags(inner); + (!text.is_empty()).then_some(text) +} + +fn title_tag(html: &str) -> Option<String> { + let re = Regex::new(r"(?is)<title[^>]*>(.*?)</title>").ok()?; + re.captures(html) + .and_then(|captures| captures.get(1)) + .map(|value| html_decode(value.as_str()).trim().to_string()) + .filter(|value| !value.is_empty()) +} + +fn meta(html: &str, attr: &str, key: &str) -> Option<String> { + let pattern = format!( + r#"(?is)<meta[^>]+{}=["']{}["'][^>]+content=["']([^"']+)["']"#, + regex::escape(attr), + regex::escape(key) + ); + Regex::new(&pattern) + .ok()?
+ .captures(html) + .and_then(|captures| captures.get(1)) + .map(|value| html_decode(value.as_str())) +} + +fn string_field(value: &Value, key: &str) -> Option<String> { + value.get(key).and_then(Value::as_str).map(str::to_string) +} + +fn strip_tags(html: &str) -> String { + let Ok(re) = Regex::new(r"<[^>]+>") else { + return html_decode(html); + }; + html_decode(&re.replace_all(html, " ")) + .split_whitespace() + .collect::<Vec<_>>() + .join(" ") +} + +fn html_decode(value: &str) -> String { + value + .replace("&amp;", "&") + .replace("&lt;", "<") + .replace("&gt;", ">") + .replace("&quot;", "\"") + .replace("&#39;", "'") } diff --git a/crates/noxa-fetch/tests/fixtures/extractors/substack_post.html b/crates/noxa-fetch/tests/fixtures/extractors/substack_post.html new file mode 100644 index 0000000..28bed32 --- /dev/null +++ b/crates/noxa-fetch/tests/fixtures/extractors/substack_post.html @@ -0,0 +1,30 @@ + + + + Porting Noxa Verticals - Example Stack + + + + + + + + +
+

Porting Noxa Verticals

+

Extractor parity needs explicit fixtures for broad content pages.

+

Substack posts are intentionally explicit-only.

+
+ + From e71f23def5ba295dde9d02905a8fe880af052c2d Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 00:10:29 -0400 Subject: [PATCH 18/28] feat(cli): expose vertical extractors --- crates/noxa-cli/src/app/batch.rs | 17 +++++-- crates/noxa-cli/src/app/cli.rs | 8 ++++ crates/noxa-cli/src/app/entry.rs | 53 ++++++++++++++++++++- crates/noxa-cli/src/app/fetching/extract.rs | 25 ++++++++-- crates/noxa-cli/src/app/mod.rs | 16 +++---- crates/noxa-cli/src/app/printing.rs | 29 +++++++++++ crates/noxa-cli/src/app/tests_primary.rs | 34 +++++++++++++ crates/noxa-fetch/src/client/batch.rs | 30 ++++++++++++ crates/noxa-fetch/src/client/fetch.rs | 27 +++++++++-- 9 files changed, 219 insertions(+), 20 deletions(-) diff --git a/crates/noxa-cli/src/app/batch.rs b/crates/noxa-cli/src/app/batch.rs index 6778031..d3fe9a8 100644 --- a/crates/noxa-cli/src/app/batch.rs +++ b/crates/noxa-cli/src/app/batch.rs @@ -12,9 +12,20 @@ pub(crate) async fn run_batch( let urls: Vec<&str> = entries.iter().map(|(u, _)| u.as_str()).collect(); let options = build_extraction_options(resolved); - let results = client - .fetch_and_extract_batch_with_options(&urls, resolved.concurrency, &options) - .await; + let results = if let Some(ref extractor) = cli.extractor { + client + .fetch_and_extract_batch_vertical_with_options( + &urls, + resolved.concurrency, + extractor, + &options, + ) + .await + } else { + client + .fetch_and_extract_batch_with_options(&urls, resolved.concurrency, &options) + .await + }; let ok = results.iter().filter(|r| r.result.is_ok()).count(); let errors = results.len() - ok; diff --git a/crates/noxa-cli/src/app/cli.rs b/crates/noxa-cli/src/app/cli.rs index a9bf6c1..b618a11 100644 --- a/crates/noxa-cli/src/app/cli.rs +++ b/crates/noxa-cli/src/app/cli.rs @@ -43,6 +43,14 @@ pub(crate) struct Cli { #[arg(long)] pub(crate) stdin: bool, + /// Use a specific vertical extractor (see --list-extractors) + #[arg(long)] + pub(crate) extractor: Option, + + /// List available vertical extractors and exit + #[arg(long)] + pub(crate) list_extractors: bool, + /// Include metadata in output (always included in JSON) #[arg(long)] pub(crate) metadata: bool, diff --git a/crates/noxa-cli/src/app/entry.rs b/crates/noxa-cli/src/app/entry.rs index 7df045f..272c8b2 100644 --- a/crates/noxa-cli/src/app/entry.rs +++ b/crates/noxa-cli/src/app/entry.rs @@ -34,7 +34,10 @@ pub(crate) async fn run() { return; } - match (std::env::args().nth(1).as_deref(), std::env::args().nth(2).as_deref()) { + match ( + std::env::args().nth(1).as_deref(), + std::env::args().nth(2).as_deref(), + ) { (Some("rag"), Some("start")) => { run_rag_start(); return; @@ -66,6 +69,16 @@ pub(crate) async fn run() { init_logging(resolved.verbose); + if cli.list_extractors { + print_extractor_catalog(&resolved.format); + return; + } + + if let Some(reason) = unsupported_extractor_mode(&cli, &resolved) { + eprintln!("error: --extractor {reason}"); + process::exit(1); + } + // Validate webhook URL early so any SSRF attempt is rejected before operations run. 
if let Some(ref webhook_url) = cli.webhook && let Err(e) = validate_url(webhook_url).await @@ -292,3 +305,41 @@ pub(crate) async fn run() { } } } + +fn unsupported_extractor_mode( + cli: &Cli, + resolved: &config::ResolvedConfig, +) -> Option<&'static str> { + cli.extractor.as_ref()?; + + if cli.stdin || cli.file.is_some() { + return Some("cannot be combined with --stdin or --file"); + } + if cli.cloud { + return Some("cannot be combined with --cloud"); + } + if resolved.raw_html { + return Some("cannot be combined with --raw-html"); + } + if has_llm_flags(cli) { + return Some("cannot be combined with LLM extraction flags"); + } + if cli.crawl || cli.map || cli.watch || cli.diff_with.is_some() || cli.brand { + return Some("only applies to single URL and batch scraping"); + } + if cli.research.is_some() + || cli.search.is_some() + || cli.grep.is_some() + || cli.list.is_some() + || cli.status.is_some() + || cli.refresh.is_some() + || cli.retrieve.is_some() + || cli.watch_crawls + || cli.watch_rag + || cli.watch_store + { + return Some("cannot be combined with this command mode"); + } + + None +} diff --git a/crates/noxa-cli/src/app/fetching/extract.rs b/crates/noxa-cli/src/app/fetching/extract.rs index 1fc4dbf..f02764f 100644 --- a/crates/noxa-cli/src/app/fetching/extract.rs +++ b/crates/noxa-cli/src/app/fetching/extract.rs @@ -6,6 +6,9 @@ pub(crate) async fn fetch_and_extract( ) -> Result { // Local sources: read and extract as HTML if cli.stdin { + if cli.extractor.is_some() { + return Err("--extractor cannot be combined with --stdin".to_string()); + } let mut buf = String::new(); io::stdin() .read_to_string(&mut buf) @@ -17,6 +20,9 @@ pub(crate) async fn fetch_and_extract( } if let Some(ref path) = cli.file { + if cli.extractor.is_some() { + return Err("--extractor cannot be combined with --file".to_string()); + } let html = std::fs::read_to_string(path).map_err(|e| format!("failed to read {path}: {e}"))?; let options = build_extraction_options(resolved); @@ -47,6 +53,9 @@ pub(crate) async fn fetch_and_extract( // --cloud: skip local, go straight to cloud API if cli.cloud { + if cli.extractor.is_some() { + return Err("--extractor cannot be combined with --cloud".to_string()); + } let c = cloud_client.ok_or("--cloud requires NOXA_API_KEY (set via env or --api-key)")?; let options = build_extraction_options(resolved); let resp = c @@ -65,10 +74,18 @@ pub(crate) async fn fetch_and_extract( let client = FetchClient::new(build_fetch_config(cli, resolved)) .map_err(|e| format!("client error: {e}"))?; let options = build_extraction_options(resolved); - let result = client - .fetch_and_extract_with_options(url, &options) - .await - .map_err(|e| format!("fetch error: {e}"))?; + let result = if let Some(ref extractor) = cli.extractor { + client + .fetch_and_extract_vertical(url, extractor, &options) + .await + } else { + client.fetch_and_extract_with_options(url, &options).await + } + .map_err(|e| format!("fetch error: {e}"))?; + + if cli.extractor.is_some() { + return Ok(FetchOutput::Local(Box::new(result))); + } // Check if we should fall back to cloud let reason = detect_empty(&result); diff --git a/crates/noxa-cli/src/app/mod.rs b/crates/noxa-cli/src/app/mod.rs index 43adcf8..c9aa187 100644 --- a/crates/noxa-cli/src/app/mod.rs +++ b/crates/noxa-cli/src/app/mod.rs @@ -29,12 +29,12 @@ mod cli; mod crawl; mod crawl_status; mod crawl_watch; +mod diff_brand; +mod entry; mod rag_daemon; mod rag_watch; mod store_watch; mod watch_singleton; -mod diff_brand; -mod entry; mod fetching { pub(crate) mod 
config; pub(crate) mod extract; @@ -58,11 +58,8 @@ mod watch; pub(crate) use batch::run_batch; pub(crate) use cli::{Browser, Cli, OutputFormat, PdfModeArg}; pub(crate) use crawl::{run_crawl, run_map, spawn_crawl_background}; -pub(crate) use crawl_watch::run_crawl_watch; -pub(crate) use rag_daemon::{run_rag_start, run_rag_stop}; -pub(crate) use rag_watch::run_rag_watch; -pub(crate) use store_watch::run_store_watch; pub(crate) use crawl_status::*; +pub(crate) use crawl_watch::run_crawl_watch; pub(crate) use diff_brand::{run_brand, run_diff}; pub(crate) use entry::run; pub(crate) use fetching::config::{ @@ -80,13 +77,16 @@ pub(crate) use formatting::{ pub(crate) use llm::{has_llm_flags, run_batch_llm, run_llm}; pub(crate) use logging::{build_ops_log, init_logging, init_mcp_logging, log_operation}; pub(crate) use printing::{ - print_batch_output, print_cloud_output, print_crawl_output, print_diff_output, - print_map_output, print_output, + format_extractor_catalog, print_batch_output, print_cloud_output, print_crawl_output, + print_diff_output, print_extractor_catalog, print_map_output, print_output, }; +pub(crate) use rag_daemon::{run_rag_start, run_rag_stop}; +pub(crate) use rag_watch::run_rag_watch; pub(crate) use refresh::{run_refresh, run_status}; pub(crate) use research::run_research; pub(crate) use retrieve::run_retrieve; pub(crate) use store_ops::{run_grep, run_list, run_search}; +pub(crate) use store_watch::run_store_watch; pub(crate) use watch::{fire_webhook, run_watch}; #[cfg(test)] diff --git a/crates/noxa-cli/src/app/printing.rs b/crates/noxa-cli/src/app/printing.rs index ba4c917..94e0acc 100644 --- a/crates/noxa-cli/src/app/printing.rs +++ b/crates/noxa-cli/src/app/printing.rs @@ -4,6 +4,35 @@ pub(crate) fn print_output(result: &ExtractionResult, format: &OutputFormat, sho println!("{}", format_output(result, format, show_metadata)); } +pub(crate) fn print_extractor_catalog(format: &OutputFormat) { + println!("{}", format_extractor_catalog(format)); +} + +pub(crate) fn format_extractor_catalog(format: &OutputFormat) -> String { + let extractors = noxa_fetch::extractors::list(); + match format { + OutputFormat::Json => { + serde_json::to_string_pretty(&extractors).expect("serialization failed") + } + _ => { + let mut out = String::new(); + for extractor in extractors { + out.push_str(extractor.name); + out.push_str(" - "); + out.push_str(extractor.label); + out.push('\n'); + out.push_str(" "); + out.push_str(extractor.description); + out.push('\n'); + out.push_str(" patterns: "); + out.push_str(&extractor.url_patterns.join(", ")); + out.push_str("\n\n"); + } + out.trim_end().to_string() + } + } +} + /// Print cloud API response in the requested format. 
pub(crate) fn print_cloud_output(resp: &serde_json::Value, format: &OutputFormat) { match format { diff --git a/crates/noxa-cli/src/app/tests_primary.rs b/crates/noxa-cli/src/app/tests_primary.rs index dd0cb2b..9421d6f 100644 --- a/crates/noxa-cli/src/app/tests_primary.rs +++ b/crates/noxa-cli/src/app/tests_primary.rs @@ -229,6 +229,40 @@ mod tests { assert!(Cli::try_parse_from(["noxa", "--refresh"]).is_err()); } + #[test] + fn extractor_flags_parse() { + let parsed = Cli::try_parse_from([ + "noxa", + "--extractor", + "github_repo", + "https://github.com/jmagar/noxa", + ]) + .unwrap(); + assert_eq!(parsed.extractor.as_deref(), Some("github_repo")); + + let parsed = Cli::try_parse_from(["noxa", "--list-extractors"]).unwrap(); + assert!(parsed.list_extractors); + } + + #[test] + fn extractor_catalog_text_output_lists_names_and_patterns() { + let output = format_extractor_catalog(&OutputFormat::Text); + + assert!(output.contains("github_repo")); + assert!(output.contains("GitHub Repository")); + assert!(output.contains("https://github.com/*/*")); + } + + #[test] + fn extractor_catalog_json_output_serializes_all_extractors() { + let output = format_extractor_catalog(&OutputFormat::Json); + let value: serde_json::Value = serde_json::from_str(&output).unwrap(); + let entries = value.as_array().unwrap(); + + assert_eq!(entries.len(), 28); + assert!(entries.iter().any(|entry| entry["name"] == "substack_post")); + } + #[tokio::test] async fn list_domain_urls_is_domain_scoped() { let dir = tempfile::tempdir().unwrap(); diff --git a/crates/noxa-fetch/src/client/batch.rs b/crates/noxa-fetch/src/client/batch.rs index 2d7a098..1f4d290 100644 --- a/crates/noxa-fetch/src/client/batch.rs +++ b/crates/noxa-fetch/src/client/batch.rs @@ -68,6 +68,36 @@ impl FetchClient { collect_ordered(handles, urls.len()).await } + + pub async fn fetch_and_extract_batch_vertical_with_options( + self: &Arc, + urls: &[&str], + concurrency: usize, + extractor: &str, + options: &noxa_core::ExtractionOptions, + ) -> Vec { + // Clamp to at least 1 — Semaphore::new(0) blocks all tasks forever. 
+ let semaphore = Arc::new(Semaphore::new(concurrency.max(1))); + let mut handles = Vec::with_capacity(urls.len()); + + for (idx, url) in urls.iter().enumerate() { + let permit = Arc::clone(&semaphore); + let client = Arc::clone(self); + let url = url.to_string(); + let extractor = extractor.to_string(); + let opts = options.clone(); + + handles.push(tokio::spawn(async move { + let _permit = permit.acquire().await.expect("semaphore closed"); + let result = client + .fetch_and_extract_vertical(&url, &extractor, &opts) + .await; + (idx, BatchExtractResult { url, result }) + })); + } + + collect_ordered(handles, urls.len()).await + } } pub(super) async fn collect_ordered( diff --git a/crates/noxa-fetch/src/client/fetch.rs b/crates/noxa-fetch/src/client/fetch.rs index 948899c..5b56771 100644 --- a/crates/noxa-fetch/src/client/fetch.rs +++ b/crates/noxa-fetch/src/client/fetch.rs @@ -124,6 +124,13 @@ impl FetchClient { .map_err(|error| FetchError::Build(error.to_string()))?; let mut result = build_vertical_extraction_result(extractor, url, data); result.metadata.fetched_at = Some(Utc::now().to_rfc3339()); + + if let Some(ref store) = self.store + && let Err(error) = store.write(url, &result).await + { + warn!(url, error = %error, "content store write failed"); + } + Ok(result) } @@ -408,10 +415,22 @@ pub(super) fn build_vertical_extraction_result( data: serde_json::Value, ) -> noxa_core::ExtractionResult { let title = string_field(&data, &["title", "name", "full_name", "business"]) - .or_else(|| data.pointer("/post/title").and_then(|value| value.as_str()).map(ToString::to_string)) - .or_else(|| data.pointer("/metadata/title").and_then(|value| value.as_str()).map(ToString::to_string)); - let description = string_field(&data, &["description", "summary", "body", "abstract"]) - .or_else(|| data.pointer("/metadata/description").and_then(|value| value.as_str()).map(ToString::to_string)); + .or_else(|| { + data.pointer("/post/title") + .and_then(|value| value.as_str()) + .map(ToString::to_string) + }) + .or_else(|| { + data.pointer("/metadata/title") + .and_then(|value| value.as_str()) + .map(ToString::to_string) + }); + let description = + string_field(&data, &["description", "summary", "body", "abstract"]).or_else(|| { + data.pointer("/metadata/description") + .and_then(|value| value.as_str()) + .map(ToString::to_string) + }); let pretty = serde_json::to_string_pretty(&data).unwrap_or_else(|_| data.to_string()); let heading = title.clone().unwrap_or_else(|| extractor.to_string()); let markdown = match description.as_deref() { From 3256515a5d4041f698ac336def3a101dd00ea059 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 00:13:05 -0400 Subject: [PATCH 19/28] feat(mcp): expose vertical extractors --- crates/noxa-mcp/src/server.rs | 69 ++++++++++++++++++++++++++++++++++- crates/noxa-mcp/src/tools.rs | 13 +++++++ 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/crates/noxa-mcp/src/server.rs b/crates/noxa-mcp/src/server.rs index 6ad0a25..f19a32e 100644 --- a/crates/noxa-mcp/src/server.rs +++ b/crates/noxa-mcp/src/server.rs @@ -220,6 +220,30 @@ impl NoxaMcp { .as_ref() .unwrap_or_else(|| self.fetch_client.as_ref()); + if let Some(ref extractor) = params.extractor { + let options = noxa_core::ExtractionOptions { + include_selectors: include, + exclude_selectors: exclude, + only_main_content: main_only, + include_raw_html: false, + }; + let extraction = client + .fetch_and_extract_vertical(¶ms.url, extractor, &options) + .await + .map_err(|error| 
Self::map_tool_error(NoxaMcpError::Fetch(error)))?; + self.persist_local_extraction(¶ms.url, &extraction) + .await + .map_err(Self::map_tool_error)?; + let output = match format { + ScrapeFormat::Llm => noxa_core::to_llm_text(&extraction, Some(¶ms.url)), + ScrapeFormat::Text => extraction.content.plain_text, + ScrapeFormat::Json => to_pretty_json(&extraction, "scrape vertical extraction") + .map_err(Self::map_tool_error)?, + ScrapeFormat::Markdown => extraction.content.markdown, + }; + return Ok(output); + } + let formats = [format.as_str()]; let result = cloud::smart_fetch( client, @@ -888,6 +912,13 @@ impl NoxaMcp { } self.search_after_validation(params).await } + + /// List available vertical extractors for explicit scrape extraction. + #[tool] + async fn extractors(&self) -> ToolResult { + to_pretty_json(&noxa_fetch::extractors::list(), "extractor catalog") + .map_err(Self::map_tool_error) + } } #[tool_handler] @@ -897,7 +928,7 @@ impl ServerHandler for NoxaMcp { .with_server_info(Implementation::new("noxa-mcp", env!("CARGO_PKG_VERSION"))) .with_instructions(String::from( "Noxa MCP server -- web content extraction for AI agents. \ - Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search.", + Tools: scrape, extractors, crawl, map, batch, extract, summarize, diff, brand, research, search.", )) } } @@ -970,6 +1001,7 @@ mod tests { format: Some(ScrapeFormat::Markdown), browser: None, cookies: None, + extractor: None, include_selectors: None, exclude_selectors: None, only_main_content: None, @@ -982,6 +1014,40 @@ mod tests { assert!(stored.is_some(), "scrape should persist a diff baseline"); } + #[tokio::test] + async fn scrape_after_validation_reports_unknown_vertical_extractor() { + let home = tempdir().unwrap(); + let app = test_app(home.path(), None, None, None, None); + let err = app + .scrape_after_validation(ScrapeParams { + url: "https://example.com/article".to_string(), + format: Some(ScrapeFormat::Json), + browser: None, + cookies: None, + extractor: Some("missing_vertical".to_string()), + include_selectors: None, + exclude_selectors: None, + only_main_content: None, + }) + .await + .unwrap_err(); + + assert!(err.contains("unknown vertical")); + } + + #[tokio::test] + async fn extractors_tool_returns_full_catalog() { + let home = tempdir().unwrap(); + let app = test_app(home.path(), None, None, None, None); + + let output = app.extractors().await.unwrap(); + let entries: serde_json::Value = serde_json::from_str(&output).unwrap(); + let entries = entries.as_array().unwrap(); + + assert_eq!(entries.len(), 28); + assert!(entries.iter().any(|entry| entry["name"] == "github_repo")); + } + #[tokio::test] async fn search_does_not_fetch_result_pages() { let search_server = TestHttpServer::spawn(|request| { @@ -1095,6 +1161,7 @@ mod tests { format: Some(ScrapeFormat::Markdown), browser: None, cookies: None, + extractor: None, include_selectors: None, exclude_selectors: None, only_main_content: None, diff --git a/crates/noxa-mcp/src/tools.rs b/crates/noxa-mcp/src/tools.rs index cee9a3a..df942e8 100644 --- a/crates/noxa-mcp/src/tools.rs +++ b/crates/noxa-mcp/src/tools.rs @@ -70,6 +70,8 @@ pub struct ScrapeParams { pub browser: Option, /// Cookies to send with the request (e.g. ["name=value", "session=abc123"]) pub cookies: Option>, + /// Optional vertical extractor name. Use the extractors tool to list valid values. 
+ pub extractor: Option, } impl ScrapeParams { @@ -224,6 +226,17 @@ mod tests { assert!(err.contains("unknown variant")); } + #[test] + fn scrape_accepts_explicit_extractor() { + let params = serde_json::from_value::(json!({ + "url": "https://github.com/jmagar/noxa", + "extractor": "github_repo" + })) + .unwrap(); + + assert_eq!(params.extractor.as_deref(), Some("github_repo")); + } + #[test] fn batch_rejects_json_format() { let err = serde_json::from_value::(json!({ From e907f40c588060c9decfaf42cf65e81c60dceec8 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 00:14:37 -0400 Subject: [PATCH 20/28] docs: document vertical extractor parity --- README.md | 36 +++++++++++++++++++++++++++++++----- crates/noxa-cli/src/setup.rs | 2 +- crates/noxa-mcp/README.md | 1 + docs/CHANGELOG.md | 3 +++ 4 files changed, 36 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 2389ccd..b298bdb 100644 --- a/README.md +++ b/README.md @@ -168,6 +168,24 @@ noxa https://example.com -f llm # Token-optimized for LLMs (67% fewer to noxa example.com ``` +### Vertical Extractors + +Use site-specific extractors when you want structured payloads for known verticals such as GitHub, package registries, arXiv, YouTube, Reddit, Hugging Face, social posts, Substack, and ecommerce pages. + +```bash +# List all 28 built-in extractors +noxa --list-extractors +noxa --list-extractors -f json + +# Force a specific extractor for one URL +noxa --extractor github_repo https://github.com/jmagar/noxa -f json + +# Works with batch mode too +noxa --extractor npm --urls-file npm-packages.txt -f json +``` + +Safe extractors auto-dispatch for matching URLs. Broad page extractors such as `substack_post`, `shopify_product`, `shopify_collection`, `ecommerce_product`, and `woocommerce_product` are explicit-only to avoid changing generic page extraction unexpectedly. + ### Content Filtering ```bash @@ -538,13 +556,13 @@ noxa ships as a Claude Code plugin that adds a skill (auto-activates on scrape/c The plugin provides: - **`noxa` skill** — auto-activates when you ask to scrape, crawl, extract, search, watch, or summarize URLs; covers all flag combinations and common recipes -- **MCP server** — all 10 tools available directly to Claude (`scrape`, `crawl`, `map`, `batch`, `extract`, `summarize`, `diff`, `brand`, `search`, `research`) +- **MCP server** — all 11 tools available directly to Claude (`scrape`, `extractors`, `crawl`, `map`, `batch`, `extract`, `summarize`, `diff`, `brand`, `search`, `research`) Requires `noxa` on PATH. Run `noxa setup` after installing to configure everything. --- -## MCP Server — 10 tools for AI agents +## MCP Server — 11 tools for AI agents noxa MCP server @@ -573,7 +591,8 @@ Then in Claude: *"Scrape the top 5 results for 'web scraping tools' and compare | Tool | Description | Requires API key? | |------|-------------|:-:| -| `scrape` | Extract content from any URL | No | +| `scrape` | Extract content from any URL; accepts optional `extractor` for vertical extraction | No | +| `extractors` | List available vertical extractors | No | | `crawl` | Recursive site crawl | No | | `map` | Discover URLs from sitemaps | No | | `batch` | Parallel multi-URL extraction | No | @@ -584,7 +603,7 @@ Then in Claude: *"Scrape the top 5 results for 'web scraping tools' and compare | `search` | Web search + scrape results | `SEARXNG_URL`: No, cloud: Yes | | `research` | Deep multi-source research | Yes | -9 of 10 tools work locally — no account, no API key, fully private. 
+10 of 11 tools work locally — no account, no API key, fully private. --- @@ -607,6 +626,13 @@ noxa URL --exclude "nav, footer, .sidebar" # CSS selector exclude noxa URL --only-main-content # Auto-detect main content ``` +### Vertical extractors + +```bash +noxa --list-extractors # Show all 28 extractors +noxa URL --extractor github_repo -f json # Force a named extractor +``` + ### Crawling ```bash @@ -719,7 +745,7 @@ noxa/ noxa-fetch HTTP client + TLS fingerprinting (wreq/BoringSSL). Crawler. Batch ops. noxa-llm LLM provider chain (Gemini CLI -> OpenAI -> Ollama -> Anthropic) noxa-pdf PDF text extraction - noxa-mcp MCP server (10 tools for AI agents) → run via: noxa mcp + noxa-mcp MCP server (11 tools for AI agents) → run via: noxa mcp noxa-rag RAG pipeline (TEI embeddings + Qdrant vector store) → binary: noxa-rag-daemon noxa-cli CLI binary → binary: noxa ``` diff --git a/crates/noxa-cli/src/setup.rs b/crates/noxa-cli/src/setup.rs index 9bc98d3..79e75e9 100644 --- a/crates/noxa-cli/src/setup.rs +++ b/crates/noxa-cli/src/setup.rs @@ -385,7 +385,7 @@ fn setup_mcp(theme: &ColorfulTheme, dir: &Path) { } println!( - "\x1b[34m[*]\x1b[0m Tools available via MCP: scrape, crawl, map, batch, extract, summarize, diff, brand, search, research" + "\x1b[34m[*]\x1b[0m Tools available via MCP: scrape, extractors, crawl, map, batch, extract, summarize, diff, brand, search, research" ); println!(); diff --git a/crates/noxa-mcp/README.md b/crates/noxa-mcp/README.md index c230b7d..b7112a0 100644 --- a/crates/noxa-mcp/README.md +++ b/crates/noxa-mcp/README.md @@ -39,6 +39,7 @@ Startup now creates those directories up front and returns a typed error if init ## Tool Notes - `scrape`, `crawl`, and `batch` use validated format enums instead of free-form strings. +- `scrape` accepts an optional `extractor` string for explicit vertical extraction; use the `extractors` tool to list all 28 supported extractors. - `extract` requires exactly one of `schema` or `prompt`. - `search` returns snippets plus fetch errors for validated result URLs; it does not write to `stdout` outside MCP. - `diff` can bootstrap a missing local baseline when a local fetch succeeds. diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 7e0ad2d..decd5c5 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -7,6 +7,9 @@ Format follows [Keep a Changelog](https://keepachangelog.com/). ### Added - **`--refresh `**: re-fetch every cached document for one stored domain through the existing content-store write path. Refresh stays domain-scoped, validates sidecar URLs with the async URL validator, and does not imply a whole-store sweep. +- **Full vertical extractor catalog**: 28 site-specific extractors now ship in `noxa-fetch`, with `vertical_data` in `ExtractionResult`, safe URL auto-dispatch, and explicit-only broad page extractors for Substack, Shopify, generic ecommerce, and WooCommerce. +- **CLI vertical extractor controls**: `--list-extractors` prints the catalog and `--extractor ` forces a vertical extractor for single URL or batch scraping. +- **MCP vertical extractor controls**: the `scrape` tool accepts an optional `extractor` parameter, and the new `extractors` tool returns the full extractor catalog. ### Changed - **`--status` now uses a typed crawl-status model**: background crawl status supports `running`, `done`, `stale`, and `never-started`, normalizes scheme-bearing inputs consistently, and uses cross-platform liveness checks (`/proc` on Linux, `kill(pid, 0)` elsewhere). 
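The CLI `--extractor` flag and the MCP `scrape` tool both route through the same `fetch_and_extract_vertical` path, so an extractor name is interchangeable between them. As a usage sketch, a minimal `scrape` tool payload that forces the GitHub repository extractor looks like the JSON below; it mirrors the `scrape_accepts_explicit_extractor` test, with every other `ScrapeParams` field left at its default:

```json
{
  "url": "https://github.com/jmagar/noxa",
  "extractor": "github_repo"
}
```

An unrecognized `extractor` value is rejected with an "unknown vertical" error, and the `extractors` tool returns the catalog of valid names.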
From d001dfa780cd687e030d8cd7865fd41770d0d840 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 00:15:16 -0400 Subject: [PATCH 21/28] fix(fetch): decode arxiv XML attributes with workspace features --- crates/noxa-fetch/src/extractors/arxiv.rs | 37 ++++++++++++++++++----- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/crates/noxa-fetch/src/extractors/arxiv.rs b/crates/noxa-fetch/src/extractors/arxiv.rs index b7ed429..ecf7fd5 100644 --- a/crates/noxa-fetch/src/extractors/arxiv.rs +++ b/crates/noxa-fetch/src/extractors/arxiv.rs @@ -98,16 +98,37 @@ fn parse_atom_entry(xml: &str) -> Option { _ => {} }, Ok(Event::Empty(element)) if in_entry => { - let mut term = None; - let mut href = None; - let mut rel = None; - let mut content_type = None; + let mut term: Option = None; + let mut href: Option = None; + let mut rel: Option = None; + let mut content_type: Option = None; + let decoder = reader.decoder(); for attr in element.attributes().flatten() { match attr.key.as_ref() { - b"term" => term = attr.unescape_value().ok().map(|v| v.to_string()), - b"href" => href = attr.unescape_value().ok().map(|v| v.to_string()), - b"rel" => rel = attr.unescape_value().ok().map(|v| v.to_string()), - b"type" => content_type = attr.unescape_value().ok().map(|v| v.to_string()), + b"term" => { + term = attr + .decode_and_unescape_value(decoder) + .ok() + .map(|v| v.to_string()) + } + b"href" => { + href = attr + .decode_and_unescape_value(decoder) + .ok() + .map(|v| v.to_string()) + } + b"rel" => { + rel = attr + .decode_and_unescape_value(decoder) + .ok() + .map(|v| v.to_string()) + } + b"type" => { + content_type = attr + .decode_and_unescape_value(decoder) + .ok() + .map(|v| v.to_string()) + } _ => {} } } From ea9d8f04d1b84cea5fe8a216a5a2acda9e6e2dda Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 00:18:08 -0400 Subject: [PATCH 22/28] chore: satisfy workspace clippy --- crates/noxa-cli/src/app/crawl_watch.rs | 14 +++- crates/noxa-cli/src/app/mod.rs | 6 +- crates/noxa-cli/src/app/rag_daemon.rs | 5 +- crates/noxa-cli/src/app/rag_watch.rs | 74 ++++++++++++------- crates/noxa-cli/src/app/store_ops.rs | 4 +- crates/noxa-cli/src/app/watch_singleton.rs | 11 ++- crates/noxa-cli/src/config.rs | 62 +++++++++++----- crates/noxa-fetch/src/crawler.rs | 8 +- crates/noxa-rag/src/pipeline/process.rs | 60 ++++++++------- crates/noxa-rag/src/pipeline/scan.rs | 57 ++++++++------ crates/noxa-rag/src/pipeline/watcher.rs | 24 +++--- crates/noxa-rag/src/store/qdrant/tests.rs | 18 ++--- .../noxa-store/src/content_store/enumerate.rs | 12 +-- 13 files changed, 216 insertions(+), 139 deletions(-) diff --git a/crates/noxa-cli/src/app/crawl_watch.rs b/crates/noxa-cli/src/app/crawl_watch.rs index 598542b..9c79846 100644 --- a/crates/noxa-cli/src/app/crawl_watch.rs +++ b/crates/noxa-cli/src/app/crawl_watch.rs @@ -29,7 +29,11 @@ pub(crate) async fn run_crawl_watch() { continue; } if let Ok(record) = read_crawl_status(&path) { - let key = path.file_stem().unwrap_or_default().to_string_lossy().into_owned(); + let key = path + .file_stem() + .unwrap_or_default() + .to_string_lossy() + .into_owned(); seen.insert(key.clone(), record.phase); if record.phase == CrawlStatusPhase::Done { finished.insert(key.clone()); @@ -67,7 +71,11 @@ pub(crate) async fn run_crawl_watch() { Err(_) => continue, }; - let key = path.file_stem().unwrap_or_default().to_string_lossy().into_owned(); + let key = path + .file_stem() + .unwrap_or_default() + .to_string_lossy() + .into_owned(); keys_on_disk.insert(key.clone()); if 
finished.contains(&key) { @@ -106,7 +114,7 @@ pub(crate) async fn run_crawl_watch() { let prev_pct = prev_error_pct.get(&key).copied().unwrap_or(0); let cooldown_ok = error_last_alerted .get(&key) - .map_or(true, |t| t.elapsed() >= ALERT_COOLDOWN); + .is_none_or(|t| t.elapsed() >= ALERT_COOLDOWN); if pct >= ERROR_RATE_THRESHOLD && pct_rounded > prev_pct && cooldown_ok { println!( "Crawl warning: {} — {}% error rate ({}/{} pages failed)", diff --git a/crates/noxa-cli/src/app/mod.rs b/crates/noxa-cli/src/app/mod.rs index c9aa187..9977e23 100644 --- a/crates/noxa-cli/src/app/mod.rs +++ b/crates/noxa-cli/src/app/mod.rs @@ -76,9 +76,11 @@ pub(crate) use formatting::{ }; pub(crate) use llm::{has_llm_flags, run_batch_llm, run_llm}; pub(crate) use logging::{build_ops_log, init_logging, init_mcp_logging, log_operation}; +#[cfg(test)] +pub(crate) use printing::format_extractor_catalog; pub(crate) use printing::{ - format_extractor_catalog, print_batch_output, print_cloud_output, print_crawl_output, - print_diff_output, print_extractor_catalog, print_map_output, print_output, + print_batch_output, print_cloud_output, print_crawl_output, print_diff_output, + print_extractor_catalog, print_map_output, print_output, }; pub(crate) use rag_daemon::{run_rag_start, run_rag_stop}; pub(crate) use rag_watch::run_rag_watch; diff --git a/crates/noxa-cli/src/app/rag_daemon.rs b/crates/noxa-cli/src/app/rag_daemon.rs index 9b8907b..bfc0249 100644 --- a/crates/noxa-cli/src/app/rag_daemon.rs +++ b/crates/noxa-cli/src/app/rag_daemon.rs @@ -159,7 +159,10 @@ pub(crate) fn run_rag_start() { \n\ \x1b[2m log\x1b[0m {}\n\ \x1b[2m status\x1b[0m noxa --watch-rag\n", - dirs::home_dir().unwrap_or_default().join(DEFAULT_LOG).display(), + dirs::home_dir() + .unwrap_or_default() + .join(DEFAULT_LOG) + .display(), ); return; } diff --git a/crates/noxa-cli/src/app/rag_watch.rs b/crates/noxa-cli/src/app/rag_watch.rs index 20ec857..0dea8d8 100644 --- a/crates/noxa-cli/src/app/rag_watch.rs +++ b/crates/noxa-cli/src/app/rag_watch.rs @@ -56,12 +56,20 @@ async fn check_failed_jobs(path: &std::path::Path, prev_size: &mut u64) -> Vec= ALERT_COOLDOWN); + let should_alert = alerted + .get(key) + .is_none_or(|t| t.elapsed() >= ALERT_COOLDOWN); if should_alert { alerted.insert(key, Instant::now()); println!("{offline_msg}"); @@ -145,8 +155,10 @@ pub(crate) async fn run_rag_watch() { } } - let (mut tei_up, mut qdrant_up) = - tokio::join!(probe_http(&client, &tei_health), probe_http(&client, &qdrant_health)); + let (mut tei_up, mut qdrant_up) = tokio::join!( + probe_http(&client, &tei_health), + probe_http(&client, &qdrant_health) + ); if !tei_up { println!("TEI embeddings server is offline ({tei_url}) — RAG indexing will stall"); @@ -168,9 +180,11 @@ pub(crate) async fn run_rag_watch() { let mut offline_alerted: HashMap<&'static str, Instant> = HashMap::new(); - let tei_offline = format!("TEI embeddings server is offline ({tei_url}) — RAG indexing will stall"); + let tei_offline = + format!("TEI embeddings server is offline ({tei_url}) — RAG indexing will stall"); let tei_online = format!("TEI embeddings server is back online ({tei_url})"); - let qdrant_offline = format!("Qdrant is offline ({qdrant_url}) — RAG indexing and search will not work"); + let qdrant_offline = + format!("Qdrant is offline ({qdrant_url}) — RAG indexing and search will not work"); let qdrant_online = format!("Qdrant is back online ({qdrant_url})"); loop { @@ -200,33 +214,43 @@ pub(crate) async fn run_rag_watch() { daemon_running = daemon_now; } - let (tei_now, qdrant_now) 
= - tokio::join!(probe_http(&client, &tei_health), probe_http(&client, &qdrant_health)); + let (tei_now, qdrant_now) = tokio::join!( + probe_http(&client, &tei_health), + probe_http(&client, &qdrant_health) + ); tei_up = check_service_alert( - "tei", tei_up, tei_now, &tei_offline, &tei_online, &mut offline_alerted, + "tei", + tei_up, + tei_now, + &tei_offline, + &tei_online, + &mut offline_alerted, ); qdrant_up = check_service_alert( - "qdrant", qdrant_up, qdrant_now, &qdrant_offline, &qdrant_online, &mut offline_alerted, + "qdrant", + qdrant_up, + qdrant_now, + &qdrant_offline, + &qdrant_online, + &mut offline_alerted, ); - if qdrant_up { - if let Some(count) = get_qdrant_point_count(&client, &qdrant_url, &collection).await { - if count > last_point_count { + if qdrant_up + && let Some(count) = get_qdrant_point_count(&client, &qdrant_url, &collection).await + { + if count > last_point_count { + stable_polls = 0; + } else if count == last_point_count && count > announced_count { + stable_polls += 1; + if stable_polls >= STABLE_POLLS_REQUIRED { + let delta = count - announced_count; + println!("RAG indexing complete: {collection} — {count} points (+{delta} new)"); + announced_count = count; stable_polls = 0; - } else if count == last_point_count && count > announced_count { - stable_polls += 1; - if stable_polls >= STABLE_POLLS_REQUIRED { - let delta = count - announced_count; - println!( - "RAG indexing complete: {collection} — {count} points (+{delta} new)" - ); - announced_count = count; - stable_polls = 0; - } } - last_point_count = count; } + last_point_count = count; } for msg in check_failed_jobs(&failed_log, &mut failed_log_size).await { diff --git a/crates/noxa-cli/src/app/store_ops.rs b/crates/noxa-cli/src/app/store_ops.rs index 697186b..7a92ab4 100644 --- a/crates/noxa-cli/src/app/store_ops.rs +++ b/crates/noxa-cli/src/app/store_ops.rs @@ -212,13 +212,11 @@ pub(crate) async fn run_grep(pattern: &str, store_root: std::path::PathBuf) -> R fn truncate_display(line: &str, max_chars: usize) -> String { let mut end = None; - let mut seen = 0usize; - for (idx, _) in line.char_indices() { + for (seen, (idx, _)) in line.char_indices().enumerate() { if seen == max_chars { end = Some(idx); break; } - seen += 1; } match end { Some(idx) => format!("{}...", &line[..idx]), diff --git a/crates/noxa-cli/src/app/watch_singleton.rs b/crates/noxa-cli/src/app/watch_singleton.rs index c0de997..3dbea8e 100644 --- a/crates/noxa-cli/src/app/watch_singleton.rs +++ b/crates/noxa-cli/src/app/watch_singleton.rs @@ -58,12 +58,11 @@ pub(crate) fn acquire(name: &str) -> Option { } // File already exists — check whether the owner is still alive. - if let Ok(contents) = std::fs::read_to_string(&path) { - if let Ok(pid) = contents.trim().parse::() { - if super::is_pid_running(pid) { - return None; - } - } + if let Ok(contents) = std::fs::read_to_string(&path) + && let Ok(pid) = contents.trim().parse::() + && super::is_pid_running(pid) + { + return None; } // Stale PID — overwrite and take ownership. 
The guard will clean up on diff --git a/crates/noxa-cli/src/config.rs b/crates/noxa-cli/src/config.rs index efb82d5..61fc767 100644 --- a/crates/noxa-cli/src/config.rs +++ b/crates/noxa-cli/src/config.rs @@ -67,7 +67,10 @@ impl NoxaConfig { let noxa_config_env = std::env::var("NOXA_CONFIG").ok(); let was_explicit = explicit_path.is_some() || noxa_config_env.is_some(); - let path = if let Some(p) = explicit_path.map(PathBuf::from).or_else(|| noxa_config_env.map(PathBuf::from)) { + let path = if let Some(p) = explicit_path + .map(PathBuf::from) + .or_else(|| noxa_config_env.map(PathBuf::from)) + { p } else { match find_config_file() { @@ -88,7 +91,10 @@ impl NoxaConfig { let content = match std::fs::read_to_string(&path) { Ok(s) => s, Err(e) => { - eprintln!("error: cannot read config file {}: {e}", display_name(&path)); + eprintln!( + "error: cannot read config file {}: {e}", + display_name(&path) + ); std::process::exit(1); } }; @@ -100,7 +106,15 @@ impl NoxaConfig { let name = display_name(&path); // Detect secret-looking keys in raw TOML before parsing - let secret_keys = ["api_key", "proxy", "webhook", "llm_base_url", "password", "token", "secret"]; + let secret_keys = [ + "api_key", + "proxy", + "webhook", + "llm_base_url", + "password", + "token", + "secret", + ]; let has_secrets = secret_keys.iter().any(|k| { // TOML syntax: `key = ` (with optional whitespace) content.contains(&format!("{k} =")) || content.contains(&format!("{k}=")) @@ -138,12 +152,12 @@ fn find_config_file() -> Option { return Some(p); } } - if let Ok(exe) = std::env::current_exe() { - if let Some(dir) = exe.parent() { - let p = dir.join("noxa.toml"); - if p.exists() { - return Some(p); - } + if let Ok(exe) = std::env::current_exe() + && let Some(dir) = exe.parent() + { + let p = dir.join("noxa.toml"); + if p.exists() { + return Some(p); } } if let Ok(cwd) = std::env::current_dir() { @@ -332,7 +346,8 @@ mod tests { #[test] fn test_noxa_config_deserialize_full() { - let cfg = from_toml(r#" + let cfg = from_toml( + r#" [cli] format = "llm" depth = 3 @@ -353,7 +368,8 @@ mod tests { pdf_mode = "fast" metadata = true verbose = false - "#); + "#, + ); assert!(matches!(cfg.format, Some(crate::OutputFormat::Llm))); assert_eq!(cfg.depth, Some(3)); assert_eq!( @@ -372,33 +388,39 @@ mod tests { #[test] fn test_noxa_config_unknown_fields_ignored() { - let cfg = from_toml(r#" + let cfg = from_toml( + r#" [cli] depth = 2 future_field = true - "#); + "#, + ); assert_eq!(cfg.depth, Some(2)); } #[test] fn test_noxa_config_output_dir_deserialize() { - let cfg = from_toml(r#" + let cfg = from_toml( + r#" [cli] output_dir = "out" - "#); + "#, + ); assert_eq!(cfg.output_dir, Some(PathBuf::from("out"))); } #[test] fn test_noxa_config_rag_section_ignored() { // [rag] section must not cause a parse error - let cfg = from_toml(r#" + let cfg = from_toml( + r#" [cli] depth = 5 [rag] uuid_namespace = "6ba7b810-9dad-11d1-80b4-00c04fd430c8" - "#); + "#, + ); assert_eq!(cfg.depth, Some(5)); } @@ -406,8 +428,10 @@ mod tests { fn test_resolve_uses_config_output_dir() { let cli = crate::Cli::parse_from(["noxa"]); let matches = crate::Cli::command().get_matches_from(["noxa"]); - let cfg = from_toml(r#"[cli] -output_dir = "out""#); + let cfg = from_toml( + r#"[cli] +output_dir = "out""#, + ); let resolved = resolve(&cli, &matches, &cfg); assert_eq!(resolved.output_dir, Some(PathBuf::from("out"))); } diff --git a/crates/noxa-fetch/src/crawler.rs b/crates/noxa-fetch/src/crawler.rs index d34e5a2..01814fe 100644 --- a/crates/noxa-fetch/src/crawler.rs +++ 
b/crates/noxa-fetch/src/crawler.rs @@ -436,10 +436,10 @@ impl Crawler { // When MetadataOnly, drop heavy content fields now that progress // has been streamed and links have already been harvested above. - if self.config.body_retention == BodyRetention::MetadataOnly { - if let Some(ref mut extraction) = page.extraction { - clear_extraction_body_for_metadata_only(extraction); - } + if self.config.body_retention == BodyRetention::MetadataOnly + && let Some(ref mut extraction) = page.extraction + { + clear_extraction_body_for_metadata_only(extraction); } pages.push(page); diff --git a/crates/noxa-rag/src/pipeline/process.rs b/crates/noxa-rag/src/pipeline/process.rs index 600bc4f..dab3da2 100644 --- a/crates/noxa-rag/src/pipeline/process.rs +++ b/crates/noxa-rag/src/pipeline/process.rs @@ -82,26 +82,26 @@ async fn append_failed_job(path: &Path, error: &impl std::fmt::Display, ctx: &Wo let max_log_bytes = ctx.config.pipeline.failed_jobs_log_max_bytes; // Rotate if the log has grown past the cap. - if let Ok(meta) = tokio::fs::metadata(log_path).await { - if meta.len() >= max_log_bytes { - let mut rotated = log_path.to_path_buf(); - rotated.as_mut_os_string().push(".1"); - // Remove any existing backup first; rename fails on Windows if the - // destination already exists. - let _ = tokio::fs::remove_file(&rotated).await; - if let Err(e) = tokio::fs::rename(log_path, &rotated).await { - tracing::warn!( - log = %log_path.display(), - error = %e, - "failed to rotate failed-jobs log; continuing with existing file" - ); - } else { - tracing::info!( - log = %log_path.display(), - max_bytes = max_log_bytes, - "rotated failed-jobs log" - ); - } + if let Ok(meta) = tokio::fs::metadata(log_path).await + && meta.len() >= max_log_bytes + { + let mut rotated = log_path.to_path_buf(); + rotated.as_mut_os_string().push(".1"); + // Remove any existing backup first; rename fails on Windows if the + // destination already exists. 
+ let _ = tokio::fs::remove_file(&rotated).await; + if let Err(e) = tokio::fs::rename(log_path, &rotated).await { + tracing::warn!( + log = %log_path.display(), + error = %e, + "failed to rotate failed-jobs log; continuing with existing file" + ); + } else { + tracing::info!( + log = %log_path.display(), + max_bytes = max_log_bytes, + "rotated failed-jobs log" + ); } } @@ -116,10 +116,7 @@ async fn append_failed_job(path: &Path, error: &impl std::fmt::Display, ctx: &Wo } } -pub(crate) async fn process_job( - job: IndexJob, - ctx: &WorkerContext, -) -> Result { +pub(crate) async fn process_job(job: IndexJob, ctx: &WorkerContext) -> Result { let job_start = std::time::Instant::now(); let t0 = std::time::Instant::now(); @@ -295,7 +292,7 @@ pub(crate) async fn process_job( vector, payload: parse::build_point_payload( chunk, - &*result, + &result, git_branch.clone(), &parsed.provenance, &url, @@ -364,7 +361,8 @@ pub(crate) async fn process_job( drop(_guard); drop(url_lock); - ctx.url_locks.remove_if(&url, |_, v| Arc::strong_count(v) == 1); + ctx.url_locks + .remove_if(&url, |_, v| Arc::strong_count(v) == 1); let upsert_ms = store_result?; @@ -388,7 +386,9 @@ pub(crate) async fn process_delete_job(job: DeleteJob, store: &DynVectorStore) { let url = crate::url_util::normalize_url(&url); match store.delete_by_url(&url).await { Ok(()) => tracing::info!(url = %url, "deleted chunks for removed file"), - Err(e) => tracing::warn!(url = %url, error = %e, "failed to delete chunks for removed file"), + Err(e) => { + tracing::warn!(url = %url, error = %e, "failed to delete chunks for removed file") + } } } @@ -403,7 +403,11 @@ mod tests { #[tokio::test] async fn validate_url_scheme_accepts_file_localhost_host() { - assert!(validate_url_scheme("file://localhost/tmp/foo.md").await.is_ok()); + assert!( + validate_url_scheme("file://localhost/tmp/foo.md") + .await + .is_ok() + ); } #[tokio::test] diff --git a/crates/noxa-rag/src/pipeline/scan.rs b/crates/noxa-rag/src/pipeline/scan.rs index 68a5dbf..11eb96d 100644 --- a/crates/noxa-rag/src/pipeline/scan.rs +++ b/crates/noxa-rag/src/pipeline/scan.rs @@ -128,13 +128,12 @@ pub(crate) fn startup_scan_key(path: &Path) -> Option<(String, String)> { // through to the mtime+size key below (re-indexing on collision is acceptable). if let Ok(file) = std::fs::File::open(path) { let reader = std::io::BufReader::new(file); - if let Ok(q) = serde_json::from_reader::<_, Q>(reader) { - if let Some(hash) = q.metadata.content_hash - && let Some(url) = q.metadata.url - && !url.is_empty() - { - return Some((hash, url)); - } + if let Ok(q) = serde_json::from_reader::<_, Q>(reader) + && let Some(hash) = q.metadata.content_hash + && let Some(url) = q.metadata.url + && !url.is_empty() + { + return Some((hash, url)); } } // Fall through to mtime+size if JSON parse failed, or url/content_hash missing. @@ -173,7 +172,9 @@ pub(crate) fn path_is_within_any_watch_root( canonical_path: &Path, watch_roots: &[PathBuf], ) -> bool { - watch_roots.iter().any(|root| canonical_path.starts_with(root)) + watch_roots + .iter() + .any(|root| canonical_path.starts_with(root)) } /// Walk up the directory tree from `file_path` to find a `.git/HEAD` file. 
@@ -196,7 +197,6 @@ pub(crate) fn detect_git_root_and_branch(file_path: &Path) -> Option<(PathBuf, S } } - fn git_head_path(git_entry: &Path) -> Option { let metadata = std::fs::symlink_metadata(git_entry).ok()?; if metadata.is_dir() { @@ -336,8 +336,12 @@ mod tests { let tmp = tempfile::tempdir().expect("tempdir"); let root1 = tmp.path().join("root1"); let root2 = tmp.path().join("root2"); - tokio::fs::create_dir_all(&root1).await.expect("create root1"); - tokio::fs::create_dir_all(&root2).await.expect("create root2"); + tokio::fs::create_dir_all(&root1) + .await + .expect("create root1"); + tokio::fs::create_dir_all(&root2) + .await + .expect("create root2"); let file1 = root1.join("doc.json"); tokio::fs::write(&file1, "{}").await.expect("write file1"); @@ -355,8 +359,12 @@ mod tests { let tmp = tempfile::tempdir().expect("tempdir"); let root1 = tmp.path().join("root1"); let root2 = tmp.path().join("root2"); - tokio::fs::create_dir_all(&root1).await.expect("create root1"); - tokio::fs::create_dir_all(&root2).await.expect("create root2"); + tokio::fs::create_dir_all(&root1) + .await + .expect("create root1"); + tokio::fs::create_dir_all(&root2) + .await + .expect("create root2"); let file2 = root2.join("doc.md"); tokio::fs::write(&file2, "# hi").await.expect("write file2"); @@ -374,8 +382,12 @@ mod tests { let tmp = tempfile::tempdir().expect("tempdir"); let root1 = tmp.path().join("root1"); let outside = tmp.path().join("outside"); - tokio::fs::create_dir_all(&root1).await.expect("create root1"); - tokio::fs::create_dir_all(&outside).await.expect("create outside"); + tokio::fs::create_dir_all(&root1) + .await + .expect("create root1"); + tokio::fs::create_dir_all(&outside) + .await + .expect("create outside"); let outside_file = outside.join("secret.txt"); tokio::fs::write(&outside_file, "data") @@ -389,7 +401,10 @@ mod tests { .await .expect("watch roots"); - assert!(!path_is_within_any_watch_root(&canonical_outside, &watch_roots)); + assert!(!path_is_within_any_watch_root( + &canonical_outside, + &watch_roots + )); } #[test] @@ -397,10 +412,7 @@ mod tests { let tmp = tempfile::tempdir().expect("tempdir"); let file = tmp.path().join("foo.txt"); fs::write(&file, "x").expect("write file"); - assert_eq!( - detect_git_root_and_branch(&file).map(|(_, b)| b), - None - ); + assert_eq!(detect_git_root_and_branch(&file).map(|(_, b)| b), None); } #[test] @@ -424,10 +436,7 @@ mod tests { fs::write(git_dir.join("HEAD"), "abc123def456\n").expect("write HEAD"); let file = tmp.path().join("foo.txt"); fs::write(&file, "x").expect("write file"); - assert_eq!( - detect_git_root_and_branch(&file).map(|(_, b)| b), - None - ); + assert_eq!(detect_git_root_and_branch(&file).map(|(_, b)| b), None); } #[test] diff --git a/crates/noxa-rag/src/pipeline/watcher.rs b/crates/noxa-rag/src/pipeline/watcher.rs index 2d672e5..04ff6c0 100644 --- a/crates/noxa-rag/src/pipeline/watcher.rs +++ b/crates/noxa-rag/src/pipeline/watcher.rs @@ -1,8 +1,8 @@ use std::path::PathBuf; use std::time::Duration; -use notify::{RecursiveMode, Watcher}; use notify::event::{ModifyKind, RenameMode}; +use notify::{RecursiveMode, Watcher}; use notify_debouncer_full::{DebounceEventResult, new_debouncer}; use tokio::task::JoinHandle; @@ -32,8 +32,8 @@ pub(super) fn send_job( return; } match tx.try_send(job) { - Ok(()) => return, - Err(async_channel::TrySendError::Closed(_)) => return, + Ok(()) => (), + Err(async_channel::TrySendError::Closed(_)) => (), Err(async_channel::TrySendError::Full(j)) => { tracing::warn!("job queue saturated (256/256), 
backing off — embed/upsert catching up"); // Park the blocking thread on the channel's condvar until a slot opens. @@ -59,9 +59,12 @@ pub(super) fn setup_watcher( ) -> Result, RagError> { let (notify_tx, notify_rx) = std::sync::mpsc::sync_channel::(256); - let mut debouncer = - new_debouncer(Duration::from_millis(debounce_ms), None, BoundedSender(notify_tx)) - .map_err(|e| RagError::WatcherSetup(format!("failed to create fs watcher: {e}")))?; + let mut debouncer = new_debouncer( + Duration::from_millis(debounce_ms), + None, + BoundedSender(notify_tx), + ) + .map_err(|e| RagError::WatcherSetup(format!("failed to create fs watcher: {e}")))?; for watch_dir in watch_dirs { debouncer @@ -129,8 +132,7 @@ pub(super) fn setup_watcher( ); } for path in scan::collect_indexable_paths(&new_path) { - let span = - tracing::info_span!("index_job", path = %path.display()); + let span = tracing::info_span!("index_job", path = %path.display()); send_job( PipelineJob::Index(IndexJob { path, span }), &tx, @@ -165,7 +167,11 @@ pub(super) fn setup_watcher( } // Create / Modify / Any — index all indexable paths. - for path in event.paths.iter().flat_map(|p| scan::collect_indexable_paths(p)) { + for path in event + .paths + .iter() + .flat_map(|p| scan::collect_indexable_paths(p)) + { let span = tracing::info_span!("index_job", path = %path.display()); send_job(PipelineJob::Index(IndexJob { path, span }), &tx, &shutdown); } diff --git a/crates/noxa-rag/src/store/qdrant/tests.rs b/crates/noxa-rag/src/store/qdrant/tests.rs index c5c38c6..f7ac34d 100644 --- a/crates/noxa-rag/src/store/qdrant/tests.rs +++ b/crates/noxa-rag/src/store/qdrant/tests.rs @@ -70,10 +70,10 @@ where let mut parts = line.split_whitespace(); method = parts.next().unwrap_or_default().to_string(); path = parts.next().unwrap_or_default().to_string(); - } else if let Some((name, value)) = line.split_once(':') { - if name.trim().eq_ignore_ascii_case("content-length") { - content_length = value.trim().parse().unwrap_or(0); - } + } else if let Some((name, value)) = line.split_once(':') + && name.trim().eq_ignore_ascii_case("content-length") + { + content_length = value.trim().parse().unwrap_or(0); } } @@ -271,7 +271,7 @@ async fn build_vector_store_reconciles_existing_indexes_and_searches_with_metada .to_string() } } - ("PUT", path) if path == "/collections/noxa-test/index" => "{}".to_string(), + ("PUT", "/collections/noxa-test/index") => "{}".to_string(), ("POST", "/collections/noxa-test/points/search") => serde_json::json!({ "result": [ { @@ -421,10 +421,10 @@ where let mut parts = line.split_whitespace(); method = parts.next().unwrap_or_default().to_string(); path = parts.next().unwrap_or_default().to_string(); - } else if let Some((name, value)) = line.split_once(':') { - if name.trim().eq_ignore_ascii_case("content-length") { - content_length = value.trim().parse().unwrap_or(0); - } + } else if let Some((name, value)) = line.split_once(':') + && name.trim().eq_ignore_ascii_case("content-length") + { + content_length = value.trim().parse().unwrap_or(0); } } diff --git a/crates/noxa-store/src/content_store/enumerate.rs b/crates/noxa-store/src/content_store/enumerate.rs index b8d3f4d..880a04a 100644 --- a/crates/noxa-store/src/content_store/enumerate.rs +++ b/crates/noxa-store/src/content_store/enumerate.rs @@ -136,12 +136,12 @@ impl FilesystemContentStore { // --- Fast path: cache hit --- { let guard = self.manifest_cache.0.lock().await; - if let Some(cache) = guard.cache.as_ref() { - if cache.is_fresh() { - let mut docs: Vec = 
cache.docs.values().cloned().collect(); - docs.sort_by(|a, b| a.md_path.cmp(&b.md_path)); - return Ok(docs); - } + if let Some(cache) = guard.cache.as_ref() + && cache.is_fresh() + { + let mut docs: Vec = cache.docs.values().cloned().collect(); + docs.sort_by(|a, b| a.md_path.cmp(&b.md_path)); + return Ok(docs); } } From dfb0cdd616acf2aef476dc7a83d8a2983d11a575 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 17:09:12 -0400 Subject: [PATCH 23/28] docs: add live extractor CLI test report --- .../live-extractor-cli-report-2026-04-26.md | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 docs/reports/live-extractor-cli-report-2026-04-26.md diff --git a/docs/reports/live-extractor-cli-report-2026-04-26.md b/docs/reports/live-extractor-cli-report-2026-04-26.md new file mode 100644 index 0000000..add7c5b --- /dev/null +++ b/docs/reports/live-extractor-cli-report-2026-04-26.md @@ -0,0 +1,60 @@ +# Live CLI Extractor Test Report - 2026-04-26 + +## Summary + +All 28 vertical extractors were executed through the real `noxa` CLI against live public URLs. + +- Result: 28/28 passed. +- Pass criterion: command exited `0`, stdout parsed as JSON, and `.vertical_data.extractor` matched the requested extractor name. +- Binary: `target/debug/noxa`, built with `cargo build -p noxa-cli`. +- Common command shape: `target/debug/noxa --no-store --extractor -f json `. +- Raw evidence directory: `target/live-extractor-tests/20260426T042403Z/`. + +## Retest Adjustments + +The first sweep found endpoint-specific failures, not fixture-only failures: + +- `reddit`: `www.reddit.com/.../.json` returned HTML to the CLI fetch path; `new.reddit.com/.../.json` returned JSON and passed. +- `npm`: `react` exceeded the current JSON body limit; `is-odd` stayed below the limit and passed. +- `huggingface_model`: bare `bert-base-uncased` does not match the current owner/name matcher; `google-bert/bert-base-uncased` passed. +- `instagram_profile`: the public web profile API required `x-ig-app-id: 936619743392459`; with that header, it passed. +- `shopify_product` and `shopify_collection`: the initial Snowdevil endpoints timed out; Allbirds product and collection JSON endpoints passed. 
+ +## Results + +| Extractor | Result | Verified vertical | Evidence title/id | Command | +|---|---:|---|---|---| +| `reddit` | PASS | `reddit` | This Week in Rust #648 | `target/debug/noxa --no-store --extractor reddit -f json 'https://new.reddit.com/r/rust/comments/1su40pd/this_week_in_rust_648/'` | +| `hackernews` | PASS | `hackernews` | My YC app: Dropbox - Throw away your USB drive | `target/debug/noxa --no-store --extractor hackernews -f json 'https://news.ycombinator.com/item?id=8863'` | +| `github_repo` | PASS | `github_repo` | rust | `target/debug/noxa --no-store --extractor github_repo -f json 'https://github.com/rust-lang/rust'` | +| `github_pr` | PASS | `github_pr` | PR #1 | `target/debug/noxa --no-store --extractor github_pr -f json 'https://github.com/rust-lang/rust/pull/1'` | +| `github_issue` | PASS | `github_issue` | Thread a session or semantic context through IL | `target/debug/noxa --no-store --extractor github_issue -f json 'https://github.com/rust-lang/rust/issues/1'` | +| `github_release` | PASS | `github_release` | Rust 1.0.0 | `target/debug/noxa --no-store --extractor github_release -f json 'https://github.com/rust-lang/rust/releases/tag/1.0.0'` | +| `pypi` | PASS | `pypi` | requests | `target/debug/noxa --no-store --extractor pypi -f json 'https://pypi.org/project/requests/'` | +| `npm` | PASS | `npm` | is-odd | `target/debug/noxa --no-store --extractor npm -f json 'https://www.npmjs.com/package/is-odd'` | +| `crates_io` | PASS | `crates_io` | serde | `target/debug/noxa --no-store --extractor crates_io -f json 'https://crates.io/crates/serde'` | +| `huggingface_model` | PASS | `huggingface_model` | google-bert/bert-base-uncased | `target/debug/noxa --no-store --extractor huggingface_model -f json 'https://huggingface.co/google-bert/bert-base-uncased'` | +| `huggingface_dataset` | PASS | `huggingface_dataset` | rajpurkar/squad | `target/debug/noxa --no-store --extractor huggingface_dataset -f json 'https://huggingface.co/datasets/squad'` | +| `arxiv` | PASS | `arxiv` | Attention Is All You Need | `target/debug/noxa --no-store --extractor arxiv -f json 'https://arxiv.org/abs/1706.03762'` | +| `docker_hub` | PASS | `docker_hub` | nginx | `target/debug/noxa --no-store --extractor docker_hub -f json 'https://hub.docker.com/_/nginx'` | +| `dev_to` | PASS | `dev_to` | dev.to article payload | `target/debug/noxa --no-store --extractor dev_to -f json 'https://dev.to/devteam/introducing-dev-20-3kmh'` | +| `stackoverflow` | PASS | `stackoverflow` | How do I exit Vim? 
| `target/debug/noxa --no-store --extractor stackoverflow -f json 'https://stackoverflow.com/questions/11828270/how-do-i-exit-vim'` | +| `substack_post` | PASS | `substack_post` | Lenny's Newsletter / Substack | `target/debug/noxa --no-store --extractor substack_post -f json 'https://lenny.substack.com/p/what-is-good-retention'` | +| `youtube_video` | PASS | `youtube_video` | Rick Astley - Never Gonna Give You Up | `target/debug/noxa --no-store --extractor youtube_video -f json 'https://www.youtube.com/watch?v=dQw4w9WgXcQ'` | +| `linkedin_post` | PASS | `linkedin_post` | LinkedIn embed payload | `target/debug/noxa --no-store --extractor linkedin_post -f json 'https://www.linkedin.com/feed/update/urn:li:activity:7123456789012345678/'` | +| `instagram_post` | PASS | `instagram_post` | Instagram embed payload | `target/debug/noxa --no-store --extractor instagram_post -f json 'https://www.instagram.com/p/CuY4nD2NrjI/'` | +| `instagram_profile` | PASS | `instagram_profile` | Instagram | `target/debug/noxa --no-store --extractor instagram_profile -f json -H 'x-ig-app-id: 936619743392459' 'https://www.instagram.com/instagram/'` | +| `shopify_product` | PASS | `shopify_product` | Men's Tree Runner - Kaikoura White | `target/debug/noxa --no-store --extractor shopify_product -f json 'https://www.allbirds.com/products/mens-tree-runners-kaikoura-white'` | +| `shopify_collection` | PASS | `shopify_collection` | Allbirds mens collection products | `target/debug/noxa --no-store --extractor shopify_collection -f json 'https://www.allbirds.com/collections/mens'` | +| `ecommerce_product` | PASS | `ecommerce_product` | Abominable Hoodie | `target/debug/noxa --no-store --extractor ecommerce_product -f json 'https://www.scrapingcourse.com/ecommerce/product/abominable-hoodie/'` | +| `woocommerce_product` | PASS | `woocommerce_product` | Abominable Hoodie | `target/debug/noxa --no-store --extractor woocommerce_product -f json 'https://www.scrapingcourse.com/ecommerce/product/abominable-hoodie/'` | +| `amazon_product` | PASS | `amazon_product` | Amazon product payload | `target/debug/noxa --no-store --extractor amazon_product -f json 'https://www.amazon.com/dp/B08N5WRWNW'` | +| `ebay_listing` | PASS | `ebay_listing` | eBay listing payload | `target/debug/noxa --no-store --extractor ebay_listing -f json 'https://www.ebay.com/itm/256172084604'` | +| `etsy_listing` | PASS | `etsy_listing` | Etsy listing payload | `target/debug/noxa --no-store --extractor etsy_listing -f json 'https://www.etsy.com/listing/1058071087/personalized-leather-wallet-for-men'` | +| `trustpilot_reviews` | PASS | `trustpilot_reviews` | Trustpilot review payload | `target/debug/noxa --no-store --extractor trustpilot_reviews -f json 'https://www.trustpilot.com/review/www.amazon.com'` | + +## Caveats + +- This report verifies live CLI execution and vertical payload plumbing. It does not claim that every live site returned complete business fields; some HTML/anti-bot-heavy pages produced sparse but valid extractor payloads. +- The live results depend on third-party endpoint behavior as of 2026-04-26. Reddit, Instagram, Shopify storefronts, and ecommerce pages are especially drift-prone. +- The raw output files live under `target/`, so they are intentionally not tracked in git. 
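Reproducibility note for the report above: a minimal sketch of re-checking a single row against the stated pass criterion (exit `0`, JSON stdout, matching `.vertical_data.extractor`). The build step and command shape are taken from the report; `jq` and the `PASS` echo are assumptions added for illustration, and the live URL may have drifted since 2026-04-26.

```bash
# Sketch: re-run one row of the live sweep and apply the report's pass criterion.
# Assumes jq is on PATH; endpoint behavior may have drifted since the report date.
set -euo pipefail
cargo build -p noxa-cli
out="$(target/debug/noxa --no-store --extractor reddit -f json \
  'https://new.reddit.com/r/rust/comments/1su40pd/this_week_in_rust_648/')"
test "$(printf '%s' "$out" | jq -r '.vertical_data.extractor')" = "reddit" \
  && echo "PASS: reddit"
```
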
From 77e7202429b813b328982dc930b3b6a2efd42458 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 17:20:35 -0400 Subject: [PATCH 24/28] fix: address vertical extractor review feedback Resolves review thread PRRT_kwDOR_mP6c59sJHF Resolves review thread PRRT_kwDOR_mP6c59sJHO Resolves review thread PRRT_kwDOR_mP6c59sJG2 Resolves review thread PRRT_kwDOR_mP6c59sJG6 Resolves review thread PRRT_kwDOR_mP6c59sJG9 Resolves review thread PRRT_kwDOR_mP6c59sJHD Resolves review thread PRRT_kwDOR_mP6c59sJGv Resolves review thread PRRT_kwDOR_mP6c59sJGz - Broaden Amazon/eBay TLD matchers and tighten Substack dispatch. - Keep Reddit auto-fetch on the hardened JSON path while attaching vertical_data. - Return Reddit-specific vertical payloads, parse YouTube nocookie embeds, and rename Hugging Face all-time download fields. --- crates/noxa-fetch/src/client/fetch.rs | 20 ++-- .../src/extractors/amazon_product.rs | 4 +- .../noxa-fetch/src/extractors/ebay_listing.rs | 4 +- .../src/extractors/huggingface_dataset.rs | 2 +- .../src/extractors/huggingface_model.rs | 2 +- crates/noxa-fetch/src/extractors/mod.rs | 53 +++++++++-- crates/noxa-fetch/src/extractors/reddit.rs | 5 +- .../src/extractors/substack_post.rs | 4 +- .../src/extractors/youtube_video.rs | 2 +- crates/noxa-fetch/src/reddit.rs | 95 +++++++++++++++++-- 10 files changed, 157 insertions(+), 34 deletions(-) diff --git a/crates/noxa-fetch/src/client/fetch.rs b/crates/noxa-fetch/src/client/fetch.rs index 5b56771..c4ef7b1 100644 --- a/crates/noxa-fetch/src/client/fetch.rs +++ b/crates/noxa-fetch/src/client/fetch.rs @@ -139,11 +139,6 @@ impl FetchClient { url: &str, options: &noxa_core::ExtractionOptions, ) -> Result { - if let Some(result) = crate::extractors::dispatch_by_url(self, url).await { - let (extractor, data) = result?; - return Ok(build_vertical_extraction_result(extractor, url, data)); - } - if crate::reddit::is_reddit_url(url) { let json_url = crate::reddit::json_url(url); debug!("reddit detected, fetching {json_url}"); @@ -164,7 +159,15 @@ impl FetchClient { )); } match crate::reddit::parse_reddit_json(bytes, url) { - Ok(result) => return Ok(result), + Ok(mut result) => { + let data = crate::reddit::parse_reddit_vertical_json(bytes, url) + .map_err(FetchError::BodyDecode)?; + result.vertical_data = Some(noxa_core::VerticalData { + extractor: crate::extractors::reddit::INFO.name.to_string(), + data, + }); + return Ok(result); + } Err(error) => { warn!("reddit json fallback failed: {error}, falling back to HTML") } @@ -172,6 +175,11 @@ impl FetchClient { } } + if let Some(result) = crate::extractors::dispatch_by_url(self, url).await { + let (extractor, data) = result?; + return Ok(build_vertical_extraction_result(extractor, url, data)); + } + let start = Instant::now(); let client = self.pick_client(url); diff --git a/crates/noxa-fetch/src/extractors/amazon_product.rs b/crates/noxa-fetch/src/extractors/amazon_product.rs index 52dc22c..699742b 100644 --- a/crates/noxa-fetch/src/extractors/amazon_product.rs +++ b/crates/noxa-fetch/src/extractors/amazon_product.rs @@ -1,6 +1,6 @@ use serde_json::Value; -use super::{ExtractorInfo, host_matches, http::ExtractorHttp, product}; +use super::{ExtractorInfo, host_has_label, http::ExtractorHttp, product}; use crate::error::FetchError; pub const INFO: ExtractorInfo = ExtractorInfo { @@ -11,7 +11,7 @@ pub const INFO: ExtractorInfo = ExtractorInfo { }; pub fn matches(url: &str) -> bool { - host_matches(url, "amazon.com") && (url.contains("/dp/") || url.contains("/gp/product/")) + 
host_has_label(url, "amazon") && (url.contains("/dp/") || url.contains("/gp/product/")) } pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { diff --git a/crates/noxa-fetch/src/extractors/ebay_listing.rs b/crates/noxa-fetch/src/extractors/ebay_listing.rs index 1a04e5b..c9c7617 100644 --- a/crates/noxa-fetch/src/extractors/ebay_listing.rs +++ b/crates/noxa-fetch/src/extractors/ebay_listing.rs @@ -1,6 +1,6 @@ use serde_json::Value; -use super::{ExtractorInfo, host_matches, http::ExtractorHttp, product}; +use super::{ExtractorInfo, host_has_label, http::ExtractorHttp, product}; use crate::error::FetchError; pub const INFO: ExtractorInfo = ExtractorInfo { @@ -11,7 +11,7 @@ pub const INFO: ExtractorInfo = ExtractorInfo { }; pub fn matches(url: &str) -> bool { - host_matches(url, "ebay.com") && url.contains("/itm/") + host_has_label(url, "ebay") && url.contains("/itm/") } pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { diff --git a/crates/noxa-fetch/src/extractors/huggingface_dataset.rs b/crates/noxa-fetch/src/extractors/huggingface_dataset.rs index a7f907b..42042e8 100644 --- a/crates/noxa-fetch/src/extractors/huggingface_dataset.rs +++ b/crates/noxa-fetch/src/extractors/huggingface_dataset.rs @@ -34,7 +34,7 @@ pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result Result Option> { - if reddit::matches(url) { - return Some( - reddit::extract(client, url) - .await - .map(|v| (reddit::INFO.name, v)), - ); - } if hackernews::matches(url) { return Some( hackernews::extract(client, url) @@ -448,6 +441,13 @@ fn host_matches(url: &str, suffix: &str) -> bool { .is_some_and(|host| host == suffix || host.ends_with(&format!(".{suffix}"))) } +fn host_has_label(url: &str, label: &str) -> bool { + url::Url::parse(url) + .ok() + .and_then(|url| url.host_str().map(|host| host.to_ascii_lowercase())) + .is_some_and(|host| host.split('.').any(|part| part == label)) +} + #[cfg(test)] mod tests { use super::*; @@ -678,10 +678,16 @@ mod tests { assert!(substack_post::matches( "https://example.substack.com/p/porting-noxa" )); + assert!(!substack_post::matches( + "https://example.com/p/porting-noxa" + )); assert!(youtube_video::matches( "https://www.youtube.com/watch?v=dQw4w9WgXcQ" )); assert!(youtube_video::matches("https://youtu.be/dQw4w9WgXcQ")); + assert!(youtube_video::matches( + "https://www.youtube-nocookie.com/embed/dQw4w9WgXcQ" + )); } #[tokio::test] @@ -938,15 +944,40 @@ mod tests { .await .unwrap(); - assert_eq!(value["metadata"]["title"], "Rust release thread"); + assert_eq!(value["post"]["title"], "Rust release thread"); assert!( - value["content"]["plain_text"] + value["comments"][0]["body"] .as_str() .unwrap() .contains("Thanks for the update!") ); } + #[tokio::test] + async fn reddit_vertical_returns_reddit_specific_payload() { + let client = FixtureHttp { + bodies: [( + "https://www.reddit.com/r/rust/comments/abc123/release_thread.json", + include_str!("../../tests/fixtures/extractors/reddit.json"), + )] + .into_iter() + .collect(), + }; + + let value = reddit::extract( + &client, + "https://www.reddit.com/r/rust/comments/abc123/release_thread/", + ) + .await + .unwrap(); + + assert_eq!(value["post"]["title"], "Rust release thread"); + assert_eq!(value["post"]["author"], "ferris"); + assert_eq!(value["comments"][0]["author"], "reader1"); + assert!(value.get("metadata").is_none()); + assert!(value.get("content").is_none()); + } + #[tokio::test] async fn reddit_vertical_rejects_verify_wall_html() { let client = FixtureHttp { @@ -1007,7 +1038,11 
@@ mod tests { #[tokio::test] async fn ecommerce_matchers_cover_auto_and_explicit_only_groups() { assert!(amazon_product::matches("https://www.amazon.com/dp/B000123")); + assert!(amazon_product::matches( + "https://www.amazon.co.uk/dp/B000123" + )); assert!(ebay_listing::matches("https://www.ebay.com/itm/123456")); + assert!(ebay_listing::matches("https://www.ebay.co.uk/itm/123456")); assert!(etsy_listing::matches( "https://www.etsy.com/listing/123456/test" )); diff --git a/crates/noxa-fetch/src/extractors/reddit.rs b/crates/noxa-fetch/src/extractors/reddit.rs index 14c68ba..c0609d0 100644 --- a/crates/noxa-fetch/src/extractors/reddit.rs +++ b/crates/noxa-fetch/src/extractors/reddit.rs @@ -17,8 +17,5 @@ pub fn matches(url: &str) -> bool { pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result { let json_url = crate::reddit::json_url(url); let body = client.get_text(&json_url).await?; - let extraction = - crate::reddit::parse_reddit_json(body.as_bytes(), url).map_err(FetchError::BodyDecode)?; - - serde_json::to_value(extraction).map_err(|error| FetchError::BodyDecode(error.to_string())) + crate::reddit::parse_reddit_vertical_json(body.as_bytes(), url).map_err(FetchError::BodyDecode) } diff --git a/crates/noxa-fetch/src/extractors/substack_post.rs b/crates/noxa-fetch/src/extractors/substack_post.rs index ffe7a1a..82c85be 100644 --- a/crates/noxa-fetch/src/extractors/substack_post.rs +++ b/crates/noxa-fetch/src/extractors/substack_post.rs @@ -8,7 +8,7 @@ pub const INFO: ExtractorInfo = ExtractorInfo { name: "substack_post", label: "Substack Post", description: "Extract post metadata from Substack publications.", - url_patterns: &["https://*.substack.com/p/*", "*/p/*"], + url_patterns: &["https://*.substack.com/p/*"], }; pub fn matches(url: &str) -> bool { @@ -19,7 +19,7 @@ pub fn matches(url: &str) -> bool { let has_post_path = parsed.path_segments().is_some_and(|mut segments| { segments.next() == Some("p") && segments.next().is_some() }); - Some(has_post_path && (host.ends_with(".substack.com") || host != "substack.com")) + Some(has_post_path && host.ends_with(".substack.com")) }) .unwrap_or(false) } diff --git a/crates/noxa-fetch/src/extractors/youtube_video.rs b/crates/noxa-fetch/src/extractors/youtube_video.rs index 8d1e669..ecdb559 100644 --- a/crates/noxa-fetch/src/extractors/youtube_video.rs +++ b/crates/noxa-fetch/src/extractors/youtube_video.rs @@ -62,7 +62,7 @@ fn parse_video_id(url: &str) -> Option { if host == "youtu.be" { return parsed.path_segments()?.next().map(ToString::to_string); } - if host.ends_with("youtube.com") { + if host.ends_with("youtube.com") || host.ends_with("youtube-nocookie.com") { if parsed.path() == "/watch" { return parsed .query_pairs() diff --git a/crates/noxa-fetch/src/reddit.rs b/crates/noxa-fetch/src/reddit.rs index d41b912..bece0a8 100644 --- a/crates/noxa-fetch/src/reddit.rs +++ b/crates/noxa-fetch/src/reddit.rs @@ -4,7 +4,8 @@ use noxa_core::{Content, ExtractionResult, Metadata}; /// Reddit's new `shreddit` frontend only SSRs the post body — comments are /// loaded client-side. Appending `.json` to any Reddit URL returns the full /// comment tree as structured JSON, which we convert to clean markdown. 
-use serde::Deserialize; +use serde::{Deserialize, Serialize}; +use serde_json::{Value, json}; use tracing::debug; const JSON_API_USER_AGENT: &str = "noxa bot/0.7 (+https://github.com/jmagar/noxa)"; @@ -128,6 +129,59 @@ pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result Result { + if is_reddit_verify_wall_html(json_bytes) { + return Err("reddit verification page returned from json endpoint".to_string()); + } + + let listings: Vec = + serde_json::from_slice(json_bytes).map_err(|e| format!("reddit json parse: {e}"))?; + let post = listings + .first() + .and_then(|listing| { + listing + .data + .children + .iter() + .find(|child| child.kind == "t3") + }) + .map(|child| { + let d = &child.data; + json!({ + "title": d.title, + "author": d.author, + "subreddit": d.subreddit_name_prefixed, + "selftext": d.selftext, + "url": d.url_overridden_by_dest, + "score": d.score, + "permalink": d.permalink, + "created_utc": d.created_utc, + "num_comments": d.num_comments, + }) + }) + .unwrap_or_else(|| json!({})); + + let comments = listings + .get(1) + .map(|listing| { + listing + .data + .children + .iter() + .filter_map(comment_to_value) + .collect::>() + }) + .unwrap_or_default(); + + Ok(json!({ + "url": url, + "data_source": "reddit_json", + "post": post, + "comments": comments, + })) +} + pub fn is_reddit_verify_wall_html(bytes: &[u8]) -> bool { let text = String::from_utf8_lossy(bytes); let lower = text.to_ascii_lowercase(); @@ -164,31 +218,60 @@ fn render_comment(thing: &Thing, depth: usize, out: &mut String) { } } +fn comment_to_value(thing: &Thing) -> Option { + if thing.kind != "t1" { + return None; + } + + let d = &thing.data; + let replies = match &d.replies { + Some(Replies::Listing(listing)) => listing + .data + .children + .iter() + .filter_map(comment_to_value) + .collect(), + _ => Vec::new(), + }; + + Some(json!({ + "author": d.author, + "body": d.body, + "score": d.score, + "permalink": d.permalink, + "created_utc": d.created_utc, + "replies": replies, + })) +} + // --- Reddit JSON types (minimal) --- -#[derive(Deserialize)] +#[derive(Deserialize, Serialize)] struct Listing { data: ListingData, } -#[derive(Deserialize)] +#[derive(Deserialize, Serialize)] struct ListingData { children: Vec, } -#[derive(Deserialize)] +#[derive(Deserialize, Serialize)] struct Thing { kind: String, data: ThingData, } -#[derive(Deserialize)] +#[derive(Deserialize, Serialize)] struct ThingData { // Post fields (t3) title: Option, selftext: Option, subreddit_name_prefixed: Option, url_overridden_by_dest: Option, + permalink: Option, + created_utc: Option, + num_comments: Option, // Comment fields (t1) author: Option, body: Option, @@ -197,7 +280,7 @@ struct ThingData { } /// Reddit replies can be either a nested Listing or an empty string. 
-#[derive(Deserialize)] +#[derive(Deserialize, Serialize)] #[serde(untagged)] enum Replies { Listing(Listing), From fa6bbf299a8066c6a036af7da91252390573b160 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 17:22:29 -0400 Subject: [PATCH 25/28] style: format workspace --- crates/noxa-fetch/src/client/tests.rs | 5 +- crates/noxa-rag/src/chunker.rs | 3 +- crates/noxa-rag/src/config.rs | 14 ++--- crates/noxa-rag/src/factory.rs | 12 ++--- crates/noxa-rag/src/pipeline/parse/binary.rs | 3 +- crates/noxa-rag/src/pipeline/parse/rich.rs | 4 +- crates/noxa-rag/src/pipeline/parse/tests.rs | 54 ++++++++++++++++--- crates/noxa-rag/src/pipeline/runtime.rs | 27 +++++++--- crates/noxa-rag/src/pipeline/startup_scan.rs | 2 +- crates/noxa-rag/src/store/mod.rs | 6 ++- crates/noxa-rag/src/store/qdrant/lifecycle.rs | 10 ++-- .../noxa-rag/src/store/qdrant/vector_store.rs | 15 ++++-- 12 files changed, 107 insertions(+), 48 deletions(-) diff --git a/crates/noxa-fetch/src/client/tests.rs b/crates/noxa-fetch/src/client/tests.rs index e17d9bd..0262558 100644 --- a/crates/noxa-fetch/src/client/tests.rs +++ b/crates/noxa-fetch/src/client/tests.rs @@ -82,7 +82,10 @@ fn vertical_extraction_result_sets_vertical_payload_and_summary() { ); assert_eq!(result.metadata.title.as_deref(), Some("Noxa Repo")); - assert_eq!(result.vertical_data.as_ref().unwrap().extractor, "github_repo"); + assert_eq!( + result.vertical_data.as_ref().unwrap().extractor, + "github_repo" + ); assert_eq!(result.vertical_data.as_ref().unwrap().data["stars"], 42); assert!(result.content.markdown.contains("Repository metadata")); } diff --git a/crates/noxa-rag/src/chunker.rs b/crates/noxa-rag/src/chunker.rs index 311f30a..9aa191b 100644 --- a/crates/noxa-rag/src/chunker.rs +++ b/crates/noxa-rag/src/chunker.rs @@ -113,8 +113,7 @@ pub fn chunk( .enumerate() .map(|(chunk_index, (char_offset, text))| { let t_est = token_estimate(&text, tokenizer); - let section_header = - nearest_heading(&headings, char_offset).map(|s| s.to_string()); + let section_header = nearest_heading(&headings, char_offset).map(|s| s.to_string()); Chunk { text, source_url: source_url.clone(), diff --git a/crates/noxa-rag/src/config.rs b/crates/noxa-rag/src/config.rs index eca9dbc..f64f0ec 100644 --- a/crates/noxa-rag/src/config.rs +++ b/crates/noxa-rag/src/config.rs @@ -74,9 +74,7 @@ fn normalize_source(config: &mut RagConfig) -> Result<(), RagError> { )); } if !has_dirs && !has_legacy { - return Err(RagError::Config( - "watch_dirs must not be empty".to_string(), - )); + return Err(RagError::Config("watch_dirs must not be empty".to_string())); } if has_legacy { *watch_dirs = vec![watch_dir.take().unwrap()]; @@ -162,9 +160,7 @@ pub enum EmbedProviderConfig { } fn default_query_instruction() -> Option { - Some( - "Given a web search query, retrieve relevant passages that answer the query".to_string(), - ) + Some("Given a web search query, retrieve relevant passages that answer the query".to_string()) } impl EmbedProviderConfig { @@ -302,9 +298,9 @@ pub fn load_config(path: &Path) -> Result { let root: TomlRoot = toml::from_str(&content) .map_err(|e| RagError::Config(format!("config parse error: {}", e)))?; - let raw = root.rag.ok_or_else(|| { - RagError::Config(format!("missing [rag] section in {}", path.display())) - })?; + let raw = root + .rag + .ok_or_else(|| RagError::Config(format!("missing [rag] section in {}", path.display())))?; // Resolve uuid_namespace: use the explicit value from config, or generate a // random one for this deployment. 
A random namespace means point IDs are diff --git a/crates/noxa-rag/src/factory.rs b/crates/noxa-rag/src/factory.rs index e5f7eef..9434e69 100644 --- a/crates/noxa-rag/src/factory.rs +++ b/crates/noxa-rag/src/factory.rs @@ -25,14 +25,10 @@ pub async fn build_embed_provider( .. } => { let client = reqwest::Client::new(); - let provider = TeiProvider::new_with_probe( - url.clone(), - model.clone(), - client, - auth_token.clone(), - ) - .await - .map_err(|e| RagError::Config(format!("TEI startup probe failed: {e}")))?; + let provider = + TeiProvider::new_with_probe(url.clone(), model.clone(), client, auth_token.clone()) + .await + .map_err(|e| RagError::Config(format!("TEI startup probe failed: {e}")))?; if !provider.is_available().await { return Err(RagError::Config(format!( diff --git a/crates/noxa-rag/src/pipeline/parse/binary.rs b/crates/noxa-rag/src/pipeline/parse/binary.rs index 0758c12..10f6df6 100644 --- a/crates/noxa-rag/src/pipeline/parse/binary.rs +++ b/crates/noxa-rag/src/pipeline/parse/binary.rs @@ -87,8 +87,7 @@ pub(crate) fn parse_office_zip_file( .map_err(|e| RagError::Parse(format!("docx decompress '{entry_name}': {e}")))?; if copied > remaining { return Err(RagError::Parse( - "DOCX entry exceeds 50MB decompressed limit — possible zip bomb" - .to_string(), + "DOCX entry exceeds 50MB decompressed limit — possible zip bomb".to_string(), )); } measured_total = measured_total.saturating_add(copied); diff --git a/crates/noxa-rag/src/pipeline/parse/rich.rs b/crates/noxa-rag/src/pipeline/parse/rich.rs index 903e5da..aec10a4 100644 --- a/crates/noxa-rag/src/pipeline/parse/rich.rs +++ b/crates/noxa-rag/src/pipeline/parse/rich.rs @@ -1,6 +1,8 @@ use crate::error::RagError; -use super::{FormatProvenance, IngestionProvenance, ParsedFile, extract_xml_text, make_text_result}; +use super::{ + FormatProvenance, IngestionProvenance, ParsedFile, extract_xml_text, make_text_result, +}; pub(crate) fn parse_feed_file( bytes: Vec, diff --git a/crates/noxa-rag/src/pipeline/parse/tests.rs b/crates/noxa-rag/src/pipeline/parse/tests.rs index 6c3b17d..6dbe019 100644 --- a/crates/noxa-rag/src/pipeline/parse/tests.rs +++ b/crates/noxa-rag/src/pipeline/parse/tests.rs @@ -138,7 +138,14 @@ async fn parse_file_json_keeps_crawler_provenance_in_point_payload() { section_header: None, }; - let payload = build_point_payload(&chunk, &parsed.extraction, None, &parsed.provenance, url, None); + let payload = build_point_payload( + &chunk, + &parsed.extraction, + None, + &parsed.provenance, + url, + None, + ); let json = serde_json::to_value(&payload).expect("serialize payload"); assert_eq!( @@ -219,7 +226,14 @@ fn build_point_payload_serializes_web_variant() { }, }; - let payload = build_point_payload(&chunk, &extraction, None, &provenance, &chunk.source_url, None); + let payload = build_point_payload( + &chunk, + &extraction, + None, + &provenance, + &chunk.source_url, + None, + ); let json = serde_json::to_value(&payload).expect("serialize payload"); assert_eq!( @@ -264,7 +278,14 @@ fn build_point_payload_serializes_email_variant() { }, }; - let payload = build_point_payload(&chunk, &extraction, None, &provenance, &chunk.source_url, None); + let payload = build_point_payload( + &chunk, + &extraction, + None, + &provenance, + &chunk.source_url, + None, + ); let json = serde_json::to_value(&payload).expect("serialize payload"); assert_eq!( @@ -301,7 +322,14 @@ fn build_point_payload_serializes_feed_variant() { }, }; - let payload = build_point_payload(&chunk, &extraction, None, &provenance, &chunk.source_url, None); 
+ let payload = build_point_payload( + &chunk, + &extraction, + None, + &provenance, + &chunk.source_url, + None, + ); let json = serde_json::to_value(&payload).expect("serialize payload"); assert_eq!( @@ -328,7 +356,14 @@ fn build_point_payload_serializes_presentation_variant() { }, }; - let payload = build_point_payload(&chunk, &extraction, None, &provenance, &chunk.source_url, None); + let payload = build_point_payload( + &chunk, + &extraction, + None, + &provenance, + &chunk.source_url, + None, + ); let json = serde_json::to_value(&payload).expect("serialize payload"); assert_eq!( @@ -356,7 +391,14 @@ fn build_point_payload_serializes_subtitle_variant() { }, }; - let payload = build_point_payload(&chunk, &extraction, None, &provenance, &chunk.source_url, None); + let payload = build_point_payload( + &chunk, + &extraction, + None, + &provenance, + &chunk.source_url, + None, + ); let json = serde_json::to_value(&payload).expect("serialize payload"); assert_eq!( diff --git a/crates/noxa-rag/src/pipeline/runtime.rs b/crates/noxa-rag/src/pipeline/runtime.rs index 3858ccc..0d61e82 100644 --- a/crates/noxa-rag/src/pipeline/runtime.rs +++ b/crates/noxa-rag/src/pipeline/runtime.rs @@ -6,8 +6,8 @@ use tokio::task::JoinHandle; use crate::config::SourceConfig; use crate::error::RagError; -use super::scan; use super::Pipeline; +use super::scan; use super::heartbeat::spawn_heartbeat; use super::startup_scan::spawn_startup_scan; @@ -28,22 +28,33 @@ async fn drain_and_report( match tokio::time::timeout(Duration::from_secs(timeout_secs), drain).await { Ok(_) => tracing::info!("pipeline shut down cleanly"), Err(_) => { - tracing::warn!(timeout_secs, "workers did not drain within timeout, forcing exit"); + tracing::warn!( + timeout_secs, + "workers did not drain within timeout, forcing exit" + ); return Err(RagError::DrainTimeout); } } let snap = pipeline.counters.snapshot(); - let avg_embed_ms = if snap.indexed > 0 { snap.total_embed_ms / snap.indexed as u64 } else { 0 }; - let avg_upsert_ms = if snap.indexed > 0 { snap.total_upsert_ms / snap.indexed as u64 } else { 0 }; + let avg_embed_ms = if snap.indexed > 0 { + snap.total_embed_ms / snap.indexed as u64 + } else { + 0 + }; + let avg_upsert_ms = if snap.indexed > 0 { + snap.total_upsert_ms / snap.indexed as u64 + } else { + 0 + }; tracing::info!( - indexed = snap.indexed, - failed = snap.failed, + indexed = snap.indexed, + failed = snap.failed, parse_failures = snap.parse_failures, - chunks = snap.total_chunks, + chunks = snap.total_chunks, avg_embed_ms, avg_upsert_ms, - duration_s = session_start.elapsed().as_secs(), + duration_s = session_start.elapsed().as_secs(), "session complete" ); diff --git a/crates/noxa-rag/src/pipeline/startup_scan.rs b/crates/noxa-rag/src/pipeline/startup_scan.rs index 1f0fba4..181a905 100644 --- a/crates/noxa-rag/src/pipeline/startup_scan.rs +++ b/crates/noxa-rag/src/pipeline/startup_scan.rs @@ -152,8 +152,8 @@ mod tests { use crate::store::{DynVectorStore, HashExistsResult, VectorStore}; use crate::types::{Point, SearchMetadataFilter, SearchResult}; - use super::spawn_startup_scan; use super::super::PipelineJob; + use super::spawn_startup_scan; // ── Mock VectorStore ────────────────────────────────────────────────────── diff --git a/crates/noxa-rag/src/store/mod.rs b/crates/noxa-rag/src/store/mod.rs index e359b6f..5a47e21 100644 --- a/crates/noxa-rag/src/store/mod.rs +++ b/crates/noxa-rag/src/store/mod.rs @@ -58,7 +58,11 @@ pub trait VectorStore: Send + Sync { /// /// Used by the startup delta scan to skip re-embedding 
files whose raw bytes /// have not changed since last indexing. Faster than SHA-256 content_hash checks. - async fn url_with_file_hash_exists_checked(&self, url: &str, file_hash: &str) -> HashExistsResult; + async fn url_with_file_hash_exists_checked( + &self, + url: &str, + file_hash: &str, + ) -> HashExistsResult; fn name(&self) -> &str; } diff --git a/crates/noxa-rag/src/store/qdrant/lifecycle.rs b/crates/noxa-rag/src/store/qdrant/lifecycle.rs index 6cd0fe8..9f41468 100644 --- a/crates/noxa-rag/src/store/qdrant/lifecycle.rs +++ b/crates/noxa-rag/src/store/qdrant/lifecycle.rs @@ -89,10 +89,12 @@ impl QdrantStore { /// Reconcile the landed file-metadata indexes on an already-existing collection. pub(crate) async fn reconcile_landed_file_metadata_indexes(&self) -> Result<(), RagError> { let idx_url = format!("{}/collections/{}/index", self.base_url, self.collection); - for (field, schema_type) in BASE_COLLECTION_INDEXES - .iter() - .filter(|(field, _)| matches!(*field, "file_path" | "last_modified" | "git_branch" | "content_hash" | "section_header")) - { + for (field, schema_type) in BASE_COLLECTION_INDEXES.iter().filter(|(field, _)| { + matches!( + *field, + "file_path" | "last_modified" | "git_branch" | "content_hash" | "section_header" + ) + }) { let idx_body = json!({ "field_name": field, "field_schema": schema_type }); let r = self.client.put(&idx_url).json(&idx_body).send().await?; if !r.status().is_success() { diff --git a/crates/noxa-rag/src/store/qdrant/vector_store.rs b/crates/noxa-rag/src/store/qdrant/vector_store.rs index ce65924..cb32a0c 100644 --- a/crates/noxa-rag/src/store/qdrant/vector_store.rs +++ b/crates/noxa-rag/src/store/qdrant/vector_store.rs @@ -9,7 +9,10 @@ use crate::store::{HashExistsResult, VectorStore}; use crate::types::{Point, SearchMetadataFilter, SearchResult}; use super::QdrantStore; -use super::http::{DeleteByFilterRequest, QuantizationSearchParams, SearchParams, SearchRequest, SearchResponse, UpsertRequest}; +use super::http::{ + DeleteByFilterRequest, QuantizationSearchParams, SearchParams, SearchRequest, SearchResponse, + UpsertRequest, +}; use super::payload::{point_to_qdrant_payload, search_filter, search_result_from_payload}; use crate::url_util::normalize_url; @@ -131,9 +134,7 @@ impl VectorStore for QdrantStore { // Knowledge: hnsw_ef=128 is below ef_construct=200 (Qdrant default collection // config) — good recall/latency balance for interactive queries. Caller can // override via SearchMetadataFilter::hnsw_ef; None falls back to this default. - let hnsw_ef = filter - .and_then(|f| f.hnsw_ef) - .unwrap_or(128); + let hnsw_ef = filter.and_then(|f| f.hnsw_ef).unwrap_or(128); let body = SearchRequest { vector: vector.to_vec(), limit, @@ -292,7 +293,11 @@ impl VectorStore for QdrantStore { } } - async fn url_with_file_hash_exists_checked(&self, url: &str, file_hash: &str) -> HashExistsResult { + async fn url_with_file_hash_exists_checked( + &self, + url: &str, + file_hash: &str, + ) -> HashExistsResult { if file_hash.is_empty() { return HashExistsResult::NotIndexed; } From faa6390f9eea3c36359135fc7f0959e1bc227c41 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 17:27:31 -0400 Subject: [PATCH 26/28] fix: address extractor review edge cases - Tighten vertical URL matchers and query-safe Shopify API URLs. - Improve YouTube, Instagram, arXiv, GitHub release, and structured-data edge cases. - Remove drift-prone extractor counts from docs/tests and document on-change shell behavior. 
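As a sketch of the tightened Shopify matching described above (the URLs are illustrative examples, not project fixtures; `collection_api_url` is the private helper added by this patch):

    // Query strings and fragments are dropped before the storefront JSON
    // endpoint is derived; non-collection paths no longer match at all.
    assert_eq!(
        collection_api_url("https://shop.example.com/collections/sale?page=2"),
        Some("https://shop.example.com/collections/sale/products.json".to_string())
    );
    assert_eq!(collection_api_url("https://shop.example.com/pages/about"), None);

The same query stripping applies to product URLs via `product_api_url`, so tracking parameters no longer leak into the `.js` endpoint.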
--- README.md | 5 +- crates/noxa-cli/src/app/tests_primary.rs | 2 +- crates/noxa-cli/src/app/watch.rs | 2 +- crates/noxa-core/src/structured_data.rs | 16 ++++ crates/noxa-fetch/src/client.rs | 5 +- crates/noxa-fetch/src/client/fetch.rs | 2 +- crates/noxa-fetch/src/extractors/arxiv.rs | 14 ++- .../noxa-fetch/src/extractors/docker_hub.rs | 2 +- .../src/extractors/github_release.rs | 18 +++- .../src/extractors/instagram_post.rs | 89 +++++++++++++++---- crates/noxa-fetch/src/extractors/pypi.rs | 4 +- crates/noxa-fetch/src/extractors/reddit.rs | 10 ++- .../src/extractors/shopify_collection.rs | 23 ++++- .../src/extractors/shopify_product.rs | 21 ++++- .../src/extractors/stackoverflow.rs | 4 +- .../src/extractors/trustpilot_reviews.rs | 2 +- .../src/extractors/youtube_video.rs | 2 +- .../extractors/shopify_collection.json | 9 +- crates/noxa-mcp/README.md | 2 +- crates/noxa-mcp/src/server.rs | 2 +- docs/config.md | 2 +- 21 files changed, 192 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index b298bdb..83d7cd4 100644 --- a/README.md +++ b/README.md @@ -279,8 +279,9 @@ noxa --watch https://example.com # Custom check interval (seconds) noxa --watch --watch-interval 60 https://example.com -# Run a command on change — diff JSON is piped to stdin -noxa --watch --on-change "jq '.summary' >> changes.log" https://example.com +# Run a command on change — diff JSON is piped to stdin. +# Use sh -c when you need shell features such as redirection or pipes. +noxa --watch --on-change "sh -c \"jq '.summary' >> changes.log\"" https://example.com # Combine with a webhook — POST diff payload on each change noxa --watch --webhook https://hooks.example.com/notify https://example.com diff --git a/crates/noxa-cli/src/app/tests_primary.rs b/crates/noxa-cli/src/app/tests_primary.rs index 9421d6f..3a0c553 100644 --- a/crates/noxa-cli/src/app/tests_primary.rs +++ b/crates/noxa-cli/src/app/tests_primary.rs @@ -259,7 +259,7 @@ mod tests { let value: serde_json::Value = serde_json::from_str(&output).unwrap(); let entries = value.as_array().unwrap(); - assert_eq!(entries.len(), 28); + assert_eq!(entries.len(), noxa_fetch::extractors::list().len()); assert!(entries.iter().any(|entry| entry["name"] == "substack_post")); } diff --git a/crates/noxa-cli/src/app/watch.rs b/crates/noxa-cli/src/app/watch.rs index 6774ce8..7044e16 100644 --- a/crates/noxa-cli/src/app/watch.rs +++ b/crates/noxa-cli/src/app/watch.rs @@ -128,7 +128,7 @@ const WATCH_ON_CHANGE_TIMEOUT: std::time::Duration = std::time::Duration::from_s fn parse_on_change_command(cmd: &str) -> Result, String> { let argv = shlex::split(cmd) - .ok_or_else(|| "failed to parse command: unterminated quote".to_string())?; + .ok_or_else(|| "failed to parse command: invalid shell-style quoting".to_string())?; if argv.is_empty() { return Err("failed to run command: command is empty".to_string()); } diff --git a/crates/noxa-core/src/structured_data.rs b/crates/noxa-core/src/structured_data.rs index 1a25155..ea3b394 100644 --- a/crates/noxa-core/src/structured_data.rs +++ b/crates/noxa-core/src/structured_data.rs @@ -105,6 +105,22 @@ fn escape_raw_newlines_in_json_strings(input: &str) -> Option { out.push_str("\\r"); changed = true; } + '\t' if in_string => { + out.push_str("\\t"); + changed = true; + } + '\u{08}' if in_string => { + out.push_str("\\b"); + changed = true; + } + '\u{0c}' if in_string => { + out.push_str("\\f"); + changed = true; + } + ch if in_string && ch.is_control() => { + out.push_str(&format!("\\u{:04x}", ch as u32)); + changed = true; + } _ => 
out.push(ch), } } diff --git a/crates/noxa-fetch/src/client.rs b/crates/noxa-fetch/src/client.rs index 2eda874..c159d47 100644 --- a/crates/noxa-fetch/src/client.rs +++ b/crates/noxa-fetch/src/client.rs @@ -39,7 +39,10 @@ impl Default for FetchConfig { timeout: Duration::from_secs(12), follow_redirects: true, max_redirects: 10, - headers: HashMap::from([("Accept-Language".to_string(), "en-US,en;q=0.9".to_string())]), + headers: HashMap::from([ + ("Accept-Language".to_string(), "en-US,en;q=0.9".to_string()), + ("x-ig-app-id".to_string(), "936619743392459".to_string()), + ]), pdf_mode: PdfMode::default(), store: None, ops_log: None, diff --git a/crates/noxa-fetch/src/client/fetch.rs b/crates/noxa-fetch/src/client/fetch.rs index c4ef7b1..d55a0ee 100644 --- a/crates/noxa-fetch/src/client/fetch.rs +++ b/crates/noxa-fetch/src/client/fetch.rs @@ -459,7 +459,7 @@ pub(super) fn build_vertical_extraction_result( language: None, url: Some(url.to_string()), site_name: Some(extractor.to_string()), - image: string_field(&data, &["image_url", "thumbnail_url"]), + image: string_field(&data, &["image", "image_url", "thumbnail_url"]), favicon: None, word_count, content_hash: None, diff --git a/crates/noxa-fetch/src/extractors/arxiv.rs b/crates/noxa-fetch/src/extractors/arxiv.rs index ecf7fd5..d4a8a14 100644 --- a/crates/noxa-fetch/src/extractors/arxiv.rs +++ b/crates/noxa-fetch/src/extractors/arxiv.rs @@ -48,7 +48,14 @@ fn parse_id(url: &str) -> Option { if segs.len() < 2 || (segs[0] != "abs" && segs[0] != "pdf") { return None; } - let stripped = segs[1].trim_end_matches(".pdf"); + let is_old_style_archive = segs.len() >= 3 + && (segs[1].contains('.') || segs[1].chars().all(|c| c.is_ascii_alphabetic())); + let raw_id = if is_old_style_archive { + format!("{}/{}", segs[1], segs[2]) + } else { + segs[1].to_string() + }; + let stripped = raw_id.trim_end_matches(".pdf"); let no_version = match stripped.rfind('v') { Some(index) if stripped[index + 1..].chars().all(|c| c.is_ascii_digit()) => { &stripped[..index] @@ -148,7 +155,10 @@ fn parse_atom_entry(xml: &str) -> Option { } } Ok(Event::Text(text)) => { - let text = text.unescape().ok()?.to_string(); + let text = text + .unescape() + .map(|value| value.to_string()) + .unwrap_or_else(|_| String::from_utf8_lossy(text.as_ref()).into_owned()); match current { Some("id") => entry.id = Some(text.trim().to_string()), Some("title") => entry.title = append_text(entry.title.take(), &text), diff --git a/crates/noxa-fetch/src/extractors/docker_hub.rs b/crates/noxa-fetch/src/extractors/docker_hub.rs index 624395e..e6f294f 100644 --- a/crates/noxa-fetch/src/extractors/docker_hub.rs +++ b/crates/noxa-fetch/src/extractors/docker_hub.rs @@ -7,7 +7,7 @@ pub const INFO: ExtractorInfo = ExtractorInfo { name: "docker_hub", label: "Docker Hub Repository", description: "Extract repository metadata from Docker Hub.", - url_patterns: &["https://hub.docker.com/r/*"], + url_patterns: &["https://hub.docker.com/r/*", "https://hub.docker.com/_/*"], }; pub fn matches(url: &str) -> bool { diff --git a/crates/noxa-fetch/src/extractors/github_release.rs b/crates/noxa-fetch/src/extractors/github_release.rs index 8d27e96..8137d27 100644 --- a/crates/noxa-fetch/src/extractors/github_release.rs +++ b/crates/noxa-fetch/src/extractors/github_release.rs @@ -18,7 +18,12 @@ pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result Option<(String, String, String)> { Some(( segs[0].to_string(), segs[1].to_string(), - segs[4].to_string(), + segs[4..].join("/"), )) } + +fn 
encode_path_segment(value: &str) -> String {
+    value
+        .split('/')
+        .map(|segment| url::form_urlencoded::byte_serialize(segment.as_bytes()))
+        .map(Iterator::collect::<String>)
+        .collect::<Vec<_>>()
+        .join("%2F")
+}
diff --git a/crates/noxa-fetch/src/extractors/instagram_post.rs b/crates/noxa-fetch/src/extractors/instagram_post.rs
index 2230297..e19d077 100644
--- a/crates/noxa-fetch/src/extractors/instagram_post.rs
+++ b/crates/noxa-fetch/src/extractors/instagram_post.rs
@@ -1,3 +1,5 @@
+use std::sync::LazyLock;
+
 use regex::Regex;
 use serde_json::{Value, json};
 
@@ -65,39 +67,90 @@ fn path_segment_for(kind: &str) -> &'static str {
 }
 
 fn parse_username(html: &str) -> Option<String> {
-    let re = Regex::new(r#"(?s)class="CaptionUsername"[^>]*>([^<]+)<"#).ok()?;
-    re.captures(html)
+    static RE: LazyLock<Regex> =
+        LazyLock::new(|| Regex::new(r#"(?s)class="CaptionUsername"[^>]*>([^<]+)<"#).unwrap());
+    RE.captures(html)
         .and_then(|captures| captures.get(1))
         .map(|value| html_decode(value.as_str().trim()))
 }
 
 fn parse_caption(html: &str) -> Option<String> {
-    let outer = Regex::new(r#"(?s)]*>(.*?)"#).ok()?;
-    let block = outer.captures(html)?.get(1)?.as_str();
-    let user_re = Regex::new(r#"(?s)]*class="CaptionUsername"[^>]*>.*?"#).ok()?;
-    let stripped = user_re.replace_all(block, "");
-    let tag_re = Regex::new(r"<[^>]+>").ok()?;
-    let text = tag_re.replace_all(&stripped, " ");
+    static OUTER_RE: LazyLock<Regex> =
+        LazyLock::new(|| Regex::new(r#"(?s)]*>(.*?)"#).unwrap());
+    static USER_RE: LazyLock<Regex> =
+        LazyLock::new(|| Regex::new(r#"(?s)]*class="CaptionUsername"[^>]*>.*?"#).unwrap());
+    static TAG_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<[^>]+>").unwrap());
+
+    let block = OUTER_RE.captures(html)?.get(1)?.as_str();
+    let stripped = USER_RE.replace_all(block, "");
+    let text = TAG_RE.replace_all(&stripped, " ");
     let decoded = html_decode(text.trim());
     let cleaned = decoded.split_whitespace().collect::<Vec<_>>().join(" ");
     (!cleaned.is_empty()).then_some(cleaned)
 }
 
 fn parse_thumbnail(html: &str) -> Option<String> {
-    let img_re =
+    static RE: LazyLock<Regex> = LazyLock::new(|| {
         Regex::new(r#"(?s)]+class="[^"]*EmbeddedMediaImage[^"]*"[^>]+src="([^"]+)""#)
-            .ok()?;
-    img_re
-        .captures(html)
+            .unwrap()
+    });
+    RE.captures(html)
         .and_then(|captures| captures.get(1))
         .map(|value| html_decode(value.as_str()))
 }
 
 fn html_decode(value: &str) -> String {
-    value
-        .replace("&amp;", "&")
-        .replace("&lt;", "<")
-        .replace("&gt;", ">")
-        .replace("&quot;", "\"")
-        .replace("&#39;", "'")
+    decode_html_entities(value)
+}
+
+fn decode_html_entities(value: &str) -> String {
+    let mut out = String::with_capacity(value.len());
+    let mut rest = value;
+
+    while let Some(start) = rest.find('&') {
+        out.push_str(&rest[..start]);
+        rest = &rest[start..];
+        let Some(end) = rest.find(';') else {
+            out.push_str(rest);
+            return out;
+        };
+        let entity = &rest[1..end];
+        if let Some(decoded) = decode_entity(entity) {
+            out.push(decoded);
+        } else if let Some(decoded) = decode_named_entity(entity) {
+            out.push_str(decoded);
+        } else {
+            out.push_str(&rest[..=end]);
+        }
+        rest = &rest[end + 1..];
+    }
+
+    out.push_str(rest);
+    out
+}
+
+fn decode_entity(entity: &str) -> Option<char> {
+    let codepoint = entity
+        .strip_prefix("#x")
+        .or_else(|| entity.strip_prefix("#X"))
+        .and_then(|hex| u32::from_str_radix(hex, 16).ok())
+        .or_else(|| {
+            entity
+                .strip_prefix('#')
+                .and_then(|decimal| decimal.parse().ok())
+        })?;
+    char::from_u32(codepoint)
+}
+
+fn decode_named_entity(entity: &str) -> Option<&'static str> {
+    match entity {
+        "amp" => Some("&"),
+        "lt" => Some("<"),
+        "gt" => Some(">"),
+        "quot" => 
Some("\""),
+        "apos" | "#39" => Some("'"),
+        "nbsp" => Some(" "),
+        "hellip" => Some("..."),
+        _ => None,
+    }
 }
diff --git a/crates/noxa-fetch/src/extractors/pypi.rs b/crates/noxa-fetch/src/extractors/pypi.rs
index 1f167ec..3e17640 100644
--- a/crates/noxa-fetch/src/extractors/pypi.rs
+++ b/crates/noxa-fetch/src/extractors/pypi.rs
@@ -1,6 +1,6 @@
 use serde_json::{Value, json};
 
-use super::{ExtractorInfo, host_matches, http::ExtractorHttp};
+use super::{ExtractorInfo, http::ExtractorHttp};
 use crate::error::FetchError;
 
 pub const INFO: ExtractorInfo = ExtractorInfo {
@@ -11,7 +11,7 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
 };
 
 pub fn matches(url: &str) -> bool {
-    host_matches(url, "pypi.org") && url.contains("/project/")
+    parse_project(url).is_some()
 }
 
 pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result<Value, FetchError> {
diff --git a/crates/noxa-fetch/src/extractors/reddit.rs b/crates/noxa-fetch/src/extractors/reddit.rs
index c0609d0..4023a40 100644
--- a/crates/noxa-fetch/src/extractors/reddit.rs
+++ b/crates/noxa-fetch/src/extractors/reddit.rs
@@ -11,7 +11,15 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
 };
 
 pub fn matches(url: &str) -> bool {
-    host_matches(url, "reddit.com") && url.contains("/comments/")
+    host_matches(url, "reddit.com")
+        && url::Url::parse(url)
+            .ok()
+            .and_then(|parsed| {
+                parsed
+                    .path_segments()
+                    .map(|mut segments| segments.any(|segment| segment == "comments"))
+            })
+            .unwrap_or(false)
 }
 
 pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result<Value, FetchError> {
diff --git a/crates/noxa-fetch/src/extractors/shopify_collection.rs b/crates/noxa-fetch/src/extractors/shopify_collection.rs
index c651246..c48b5b0 100644
--- a/crates/noxa-fetch/src/extractors/shopify_collection.rs
+++ b/crates/noxa-fetch/src/extractors/shopify_collection.rs
@@ -11,11 +11,15 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
 };
 
 pub fn matches(url: &str) -> bool {
-    url.contains("/collections/")
+    collection_api_url(url).is_some()
 }
 
 pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result<Value, FetchError> {
-    let api_url = format!("{}/products.json", url.trim_end_matches('/'));
+    let api_url = collection_api_url(url).ok_or_else(|| {
+        FetchError::Build(format!(
+            "shopify_collection: cannot parse collection URL '{url}'"
+        ))
+    })?;
     let collection = client.get_json(&api_url).await?;
     Ok(json!({
         "url": url,
@@ -23,3 +27,18 @@ pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result<Value, FetchError> {
+
+fn collection_api_url(url: &str) -> Option<String> {
+    let mut parsed = url::Url::parse(url).ok()?;
+    let has_collection_path = parsed.path_segments().is_some_and(|mut segments| {
+        segments.next() == Some("collections") && segments.next().is_some()
+    });
+    if !has_collection_path {
+        return None;
+    }
+    parsed.set_query(None);
+    parsed.set_fragment(None);
+    let path = parsed.path().trim_end_matches('/').to_string();
+    parsed.set_path(&format!("{path}/products.json"));
+    Some(parsed.to_string())
+}
diff --git a/crates/noxa-fetch/src/extractors/shopify_product.rs b/crates/noxa-fetch/src/extractors/shopify_product.rs
index a2dfebe..69307b5 100644
--- a/crates/noxa-fetch/src/extractors/shopify_product.rs
+++ b/crates/noxa-fetch/src/extractors/shopify_product.rs
@@ -11,11 +11,13 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
 };
 
 pub fn matches(url: &str) -> bool {
-    url.contains("/products/")
+    product_api_url(url).is_some()
 }
 
 pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result<Value, FetchError> {
-    let product_url = format!("{}.js", url.trim_end_matches('/'));
+    let product_url = product_api_url(url).ok_or_else(|| {
+        FetchError::Build(format!("shopify_product: cannot parse product URL '{url}'"))
+    })?;
     let product = client.get_json(&product_url).await?;
     Ok(json!({
         "url": url,
@@ -31,3 +33,18 @@ pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result<Value, FetchError> {
+
+fn product_api_url(url: &str) -> Option<String> {
+    let mut parsed = url::Url::parse(url).ok()?;
+    let has_product_path = parsed.path_segments().is_some_and(|mut segments| {
+        segments.next() == Some("products") && segments.next().is_some()
+    });
+    if !has_product_path {
+        return None;
+    }
+    parsed.set_query(None);
+    parsed.set_fragment(None);
+    let path = parsed.path().trim_end_matches('/').to_string();
+    parsed.set_path(&format!("{path}.js"));
+    Some(parsed.to_string())
+}
diff --git a/crates/noxa-fetch/src/extractors/stackoverflow.rs b/crates/noxa-fetch/src/extractors/stackoverflow.rs
index 455531d..4ee8e75 100644
--- a/crates/noxa-fetch/src/extractors/stackoverflow.rs
+++ b/crates/noxa-fetch/src/extractors/stackoverflow.rs
@@ -1,6 +1,6 @@
 use serde_json::{Value, json};
 
-use super::{ExtractorInfo, host_matches, http::ExtractorHttp};
+use super::{ExtractorInfo, http::ExtractorHttp};
 use crate::error::FetchError;
 
 pub const INFO: ExtractorInfo = ExtractorInfo {
@@ -11,7 +11,7 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
 };
 
 pub fn matches(url: &str) -> bool {
-    host_matches(url, "stackoverflow.com") && url.contains("/questions/")
+    parse_question_id(url).is_some()
 }
 
 pub async fn extract(client: &dyn ExtractorHttp, url: &str) -> Result<Value, FetchError> {
diff --git a/crates/noxa-fetch/src/extractors/trustpilot_reviews.rs b/crates/noxa-fetch/src/extractors/trustpilot_reviews.rs
index e25269a..44016a0 100644
--- a/crates/noxa-fetch/src/extractors/trustpilot_reviews.rs
+++ b/crates/noxa-fetch/src/extractors/trustpilot_reviews.rs
@@ -7,7 +7,7 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
     name: "trustpilot_reviews",
     label: "Trustpilot Reviews",
     description: "Extract review data from Trustpilot.",
-    url_patterns: &["https://www.trustpilot.com/review/*"],
+    url_patterns: &["https://*.trustpilot.com/review/*"],
 };
 
 pub fn matches(url: &str) -> bool {
diff --git a/crates/noxa-fetch/src/extractors/youtube_video.rs b/crates/noxa-fetch/src/extractors/youtube_video.rs
index ecdb559..9de8dcc 100644
--- a/crates/noxa-fetch/src/extractors/youtube_video.rs
+++ b/crates/noxa-fetch/src/extractors/youtube_video.rs
@@ -77,7 +77,7 @@ fn parse_video_id(url: &str) -> Option<String> {
 }
 
 fn extract_player_response(html: &str) -> Option<Value> {
-    let re = Regex::new(r"var\s+ytInitialPlayerResponse\s*=\s*(\{.+?\})\s*;").ok()?;
+    let re = Regex::new(r"(?:var\s+)?ytInitialPlayerResponse\s*=\s*(\{.+?\})\s*;").ok()?;
     serde_json::from_str(re.captures(html)?.get(1)?.as_str()).ok()
 }
 
diff --git a/crates/noxa-fetch/tests/fixtures/extractors/shopify_collection.json b/crates/noxa-fetch/tests/fixtures/extractors/shopify_collection.json
index be43080..df79e02 100644
--- a/crates/noxa-fetch/tests/fixtures/extractors/shopify_collection.json
+++ b/crates/noxa-fetch/tests/fixtures/extractors/shopify_collection.json
@@ -8,7 +8,14 @@
       "product_type": "Gadgets",
       "tags": ["fixture"],
       "variants": [{ "id": 11, "price": "1999", "available": true }],
-      "images": ["https://example.com/widget.jpg"]
+      "images": [
+        {
+          "id": 101,
+          "product_id": 1,
+          "position": 1,
+          "src": "https://example.com/widget.jpg"
+        }
+      ]
     }
   ]
 }
diff --git a/crates/noxa-mcp/README.md b/crates/noxa-mcp/README.md
index b7112a0..e5a9bf7 100644
--- a/crates/noxa-mcp/README.md
+++ b/crates/noxa-mcp/README.md
@@ -39,7 +39,7 @@ Startup now creates those directories up front and returns a typed 
error if init ## Tool Notes - `scrape`, `crawl`, and `batch` use validated format enums instead of free-form strings. -- `scrape` accepts an optional `extractor` string for explicit vertical extraction; use the `extractors` tool to list all 28 supported extractors. +- `scrape` accepts an optional `extractor` string for explicit vertical extraction; use the `extractors` tool to list the supported extractors. - `extract` requires exactly one of `schema` or `prompt`. - `search` returns snippets plus fetch errors for validated result URLs; it does not write to `stdout` outside MCP. - `diff` can bootstrap a missing local baseline when a local fetch succeeds. diff --git a/crates/noxa-mcp/src/server.rs b/crates/noxa-mcp/src/server.rs index f19a32e..9287183 100644 --- a/crates/noxa-mcp/src/server.rs +++ b/crates/noxa-mcp/src/server.rs @@ -1044,7 +1044,7 @@ mod tests { let entries: serde_json::Value = serde_json::from_str(&output).unwrap(); let entries = entries.as_array().unwrap(); - assert_eq!(entries.len(), 28); + assert_eq!(entries.len(), noxa_fetch::extractors::list().len()); assert!(entries.iter().any(|entry| entry["name"] == "github_repo")); } diff --git a/docs/config.md b/docs/config.md index 806ad0f..9f59de2 100644 --- a/docs/config.md +++ b/docs/config.md @@ -76,7 +76,7 @@ These options stay on the command line and do not belong in `config.json`: - `--on-change` - `--raw-html` -`--on-change` is CLI-only because it executes shell commands. `--raw-html` is a per-run mode, not a persistent default. +`--on-change` is CLI-only because it executes commands on the local machine. It parses the configured command into argv directly; wrap the command in `sh -c "..."` when you need shell features such as pipes, redirects, globs, or environment expansion. `--raw-html` is a per-run mode, not a persistent default. 
## Config File Rules From 1342ae5bf3c36c0b38b444196b4bd769f625b2d4 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 17:29:56 -0400 Subject: [PATCH 27/28] fix: satisfy current stable clippy --- crates/noxa-core/src/brand/fonts.rs | 2 +- crates/noxa-core/src/llm/cleanup/css.rs | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/crates/noxa-core/src/brand/fonts.rs b/crates/noxa-core/src/brand/fonts.rs index c86a240..3cce37e 100644 --- a/crates/noxa-core/src/brand/fonts.rs +++ b/crates/noxa-core/src/brand/fonts.rs @@ -49,7 +49,7 @@ pub(super) fn extract_fonts(decls: &[css::CssDecl]) -> Vec { } let mut fonts: Vec<(String, usize)> = freq.into_iter().collect(); - fonts.sort_by(|a, b| b.1.cmp(&a.1)); + fonts.sort_by_key(|font| std::cmp::Reverse(font.1)); fonts.into_iter().map(|(name, _)| name).collect() } diff --git a/crates/noxa-core/src/llm/cleanup/css.rs b/crates/noxa-core/src/llm/cleanup/css.rs index 92f5eb6..d40cc98 100644 --- a/crates/noxa-core/src/llm/cleanup/css.rs +++ b/crates/noxa-core/src/llm/cleanup/css.rs @@ -28,10 +28,7 @@ fn strip_css_at_rules(line: &str) -> String { let mut result = line.to_string(); // Iteratively remove at-rule blocks with balanced brace handling - loop { - let Some(m) = CSS_AT_RE.find(&result) else { - break; - }; + while let Some(m) = CSS_AT_RE.find(&result) { let start = m.start(); // Find the matching closing brace after the at-rule header let after_header = m.end(); From 145b3ee5337380733e7a7ff151fcfd04d8337215 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 17:33:28 -0400 Subject: [PATCH 28/28] fix: stabilize CI on current stable --- crates/noxa-mcp/src/server.rs | 9 ++++++++- crates/noxa-rag/src/pipeline/process.rs | 8 ++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/crates/noxa-mcp/src/server.rs b/crates/noxa-mcp/src/server.rs index 9287183..eb13cd7 100644 --- a/crates/noxa-mcp/src/server.rs +++ b/crates/noxa-mcp/src/server.rs @@ -1118,6 +1118,13 @@ mod tests { #[tokio::test] async fn explicit_ollama_config_builds_non_empty_chain() { + let ollama = TestHttpServer::spawn(|request| { + if request.path == "/api/tags" { + return TestResponse::json(r#"{"models":[]}"#); + } + TestResponse::text(404, "missing", "text/plain") + }) + .await; let home = tempdir().unwrap(); let store_root = home.path().join("content"); std::fs::create_dir_all(&store_root).unwrap(); @@ -1133,7 +1140,7 @@ mod tests { cloud_api_key: None, llm_provider: Some("ollama".into()), llm_model: Some("qwen3.5:9b".into()), - llm_base_url: Some("http://127.0.0.1:11434".into()), + llm_base_url: Some(ollama.url("")), }; let chain = build_llm_chain(&config).await.unwrap(); diff --git a/crates/noxa-rag/src/pipeline/process.rs b/crates/noxa-rag/src/pipeline/process.rs index dab3da2..0ab1725 100644 --- a/crates/noxa-rag/src/pipeline/process.rs +++ b/crates/noxa-rag/src/pipeline/process.rs @@ -260,11 +260,7 @@ pub(crate) async fn process_job(job: IndexJob, ctx: &WorkerContext) -> Result 0 { - total_tokens * 1_000 / embed_ms - } else { - 0 - }; + let embed_tokens_per_sec = (total_tokens * 1_000).checked_div(embed_ms).unwrap_or(0); if vectors.len() != chunks.len() { return Err(RagError::Embed { @@ -280,7 +276,7 @@ pub(crate) async fn process_job(job: IndexJob, ctx: &WorkerContext) -> Result = chunks .iter() - .zip(vectors.into_iter()) + .zip(vectors) .enumerate() .map(|(i, (chunk, vector))| { let id = uuid::Uuid::new_v5(