Merged

25 commits
62554b8 feat(noxa-68r.1): crate scaffold, traits, and core types (jmagar, Apr 12, 2026)
20e880e feat(noxa-68r.2,noxa-68r.3,noxa-68r.4): chunker, TEI provider, Qdrant… (jmagar, Apr 12, 2026)
d66522b feat(noxa-68r.6): factory and TOML config (jmagar, Apr 12, 2026)
9cde0d6 feat(noxa-68r.7): filesystem watcher pipeline (jmagar, Apr 12, 2026)
5217b99 feat(noxa-68r.8): daemon binary + README (jmagar, Apr 12, 2026)
2e43439 fix(noxa-68r.4): replace qdrant-client with plain reqwest REST calls (jmagar, Apr 12, 2026)
23f3922 fix(noxa-rag): address P0/P1 review comments from Copilot (jmagar, Apr 12, 2026)
31cb9d8 fix(noxa-rag): address remaining PR review comments (jmagar, Apr 12, 2026)
4ceb218 fix(noxa-rag): address PR #4 review comments (jmagar, Apr 12, 2026)
26347ab fix(noxa-rag): address new coderabbitai review comments (jmagar, Apr 12, 2026)
b398294 Update Cargo.lock (jmagar, Apr 12, 2026)
b6a8d68 fix(noxa-rag): harden TEI and Qdrant error handling (jmagar, Apr 12, 2026)
0ed3c03 fix(noxa-rag): address remaining PR review feedback (jmagar, Apr 12, 2026)
9bd72fe feat(noxa-ut9): expand Metadata + add IngestionContext, extend Search… (jmagar, Apr 13, 2026)
ad298d5 feat(noxa-cz5): expand PointPayload with full metadata, update upsert… (jmagar, Apr 13, 2026)
65073e5 feat(noxa-xn0): add 4 keyword indexes to create_collection; fix Metad… (jmagar, Apr 13, 2026)
d8f0334 feat(observability): per-step timing, session summary, heartbeat, del… (jmagar, Apr 13, 2026)
9c37510 feat(noxa-bvr): add TEI batch/retry/413 structured logging with error… (jmagar, Apr 13, 2026)
afa4d16 feat(noxa-bzn): add embed_tokens_per_sec to indexed/reindexed log line (jmagar, Apr 13, 2026)
09a8f29 feat(noxa-s5r): add noxa-pdf, zip, quick-xml, strip-ansi-escapes deps… (jmagar, Apr 13, 2026)
2f62198 feat(noxa-cl1,noxa-9ww): expand indexable formats, file:// support, p… (jmagar, Apr 13, 2026)
8a2b6fe feat(noxa-rag): startup scan delta detection, multi-format parse_file… (jmagar, Apr 13, 2026)
a4d415f test(noxa-7wn): add 20 unit tests covering all ingested format paths (jmagar, Apr 13, 2026)
5e66173 feat(noxa-7bi): replace bare JSON retry with schema-error correction … (jmagar, Apr 13, 2026)
d8f5378 fix(noxa-rag): address PR #4 review comments (rounds 3) (jmagar, Apr 13, 2026)
1,068 changes: 1,029 additions & 39 deletions Cargo.lock

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions crates/noxa-core/src/diff.rs
@@ -148,6 +148,16 @@ mod tests {
image: None,
favicon: None,
word_count,
content_hash: None,
source_type: None,
file_path: None,
last_modified: None,
is_truncated: None,
technologies: Vec::new(),
seed_url: None,
crawl_depth: None,
search_query: None,
fetched_at: None,
},
content: Content {
markdown: markdown.to_string(),
20 changes: 20 additions & 0 deletions crates/noxa-core/src/llm/mod.rs
@@ -77,6 +77,16 @@ mod tests {
image: None,
favicon: None,
word_count: 42,
content_hash: None,
source_type: None,
file_path: None,
last_modified: None,
is_truncated: None,
technologies: Vec::new(),
seed_url: None,
crawl_depth: None,
search_query: None,
fetched_at: None,
},
content: Content {
markdown: markdown.into(),
@@ -375,6 +385,16 @@ mod tests {
image: None,
favicon: None,
word_count: 0,
content_hash: None,
source_type: None,
file_path: None,
last_modified: None,
is_truncated: None,
technologies: Vec::new(),
seed_url: None,
crawl_depth: None,
search_query: None,
fetched_at: None,
},
content: Content {
markdown: "Just content".into(),
10 changes: 10 additions & 0 deletions crates/noxa-core/src/metadata.rs
@@ -52,6 +52,16 @@ pub fn extract(doc: &Html, url: Option<&str>) -> Metadata {
image,
favicon,
word_count: 0, // filled later by the extractor
content_hash: None,
source_type: None,
file_path: None,
last_modified: None,
is_truncated: None,
technologies: Vec::new(),
seed_url: None,
crawl_depth: None,
search_query: None,
fetched_at: None,
}
}

31 changes: 31 additions & 0 deletions crates/noxa-core/src/types.rs
@@ -27,6 +27,37 @@ pub struct Metadata {
pub image: Option<String>,
pub favicon: Option<String>,
pub word_count: usize,
// RAG-pipeline fields (all Option<T> for backward compat with existing web extraction callers)
/// SHA-256 hex digest of the raw source bytes. Used as a dedup key by noxa-rag.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub content_hash: Option<String>,
/// Source classification: 'web' | 'file' | 'mcp' | 'notebook' | 'email'
#[serde(default, skip_serializing_if = "Option::is_none")]
pub source_type: Option<String>,
/// Absolute filesystem path — populated for file:// sources only.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub file_path: Option<String>,
/// ISO 8601 timestamp: fs mtime for files, published_at for web content.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub last_modified: Option<String>,
/// True when the document hit the max_chunks_per_page limit and was cut short.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub is_truncated: Option<bool>,
/// Detected tech stack (e.g. ["React", "TypeScript", "Tailwind"]).
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub technologies: Vec<String>,
/// The root URL a crawl started from (populated by noxa-fetch crawler).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub seed_url: Option<String>,
/// Number of hops from seed_url (0 = seed page itself).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub crawl_depth: Option<u32>,
/// Query string if this page was fetched via a search operation.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub search_query: Option<String>,
/// ISO 8601 UTC timestamp of when this page was fetched.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub fetched_at: Option<String>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
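For reference, a minimal sketch of how an ingester might compute the values these new fields expect. The helper below is illustrative only: the PR pulls in sha2 and chrono, but this exact function is not part of it.

use chrono::Utc;
use sha2::{Digest, Sha256};

// Illustrative helper, not from this PR: derive the dedup and provenance
// values described in the field docs above.
fn stamp_provenance(raw: &[u8]) -> (String, String) {
    // SHA-256 hex digest of the raw source bytes (the `content_hash` dedup key).
    let content_hash = format!("{:x}", Sha256::digest(raw));
    // ISO 8601 UTC timestamp (the `fetched_at` contract).
    let fetched_at = Utc::now().to_rfc3339();
    (content_hash, fetched_at)
}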
1 change: 1 addition & 0 deletions crates/noxa-fetch/Cargo.toml
@@ -17,6 +17,7 @@ http = "1"
bytes = "1"
url = "2"
rand = "0.8"
chrono = { version = "0.4", features = ["serde"] }
quick-xml = { version = "0.37", features = ["serde"] }
serde_json.workspace = true
calamine = "0.34"
23 changes: 23 additions & 0 deletions crates/noxa-fetch/src/client.rs
@@ -12,6 +12,7 @@ use std::hash::{Hash, Hasher};
use std::sync::Arc;
use std::time::{Duration, Instant};

use chrono::Utc;
use noxa_pdf::PdfMode;
use rand::seq::SliceRandom;
use tokio::sync::Semaphore;
@@ -279,6 +280,18 @@ impl FetchClient {
&self,
url: &str,
options: &noxa_core::ExtractionOptions,
) -> Result<noxa_core::ExtractionResult, FetchError> {
let mut result = self.fetch_and_extract_inner(url, options).await?;
result.metadata.fetched_at = Some(Utc::now().to_rfc3339());
Ok(result)
}

/// Inner implementation — callers should use [`fetch_and_extract_with_options`] which
/// stamps `fetched_at` on the returned metadata.
async fn fetch_and_extract_inner(
&self,
url: &str,
options: &noxa_core::ExtractionOptions,
) -> Result<noxa_core::ExtractionResult, FetchError> {
// Reddit fallback: use their JSON API to get post + full comment tree.
if crate::reddit::is_reddit_url(url) {
@@ -589,6 +602,16 @@ fn pdf_to_extraction_result(pdf: &noxa_pdf::PdfResult, url: &str) -> noxa_core::
image: None,
favicon: None,
word_count,
content_hash: None,
source_type: Some("web".into()),
file_path: None,
last_modified: None,
is_truncated: None,
technologies: Vec::new(),
seed_url: None,
crawl_depth: None,
search_query: None,
fetched_at: None,
},
content: noxa_core::Content {
markdown,
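Because the wrapper stamps `fetched_at` as RFC 3339, downstream consumers can round-trip it with chrono. A tiny sketch, not part of the PR:

use chrono::{DateTime, FixedOffset};

// Parse the RFC 3339 `fetched_at` value stamped by the wrapper above.
// Illustrative only; the PR itself just writes `Utc::now().to_rfc3339()`.
fn parse_fetched_at(s: &str) -> Option<DateTime<FixedOffset>> {
    DateTime::parse_from_rfc3339(s).ok()
}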
7 changes: 6 additions & 1 deletion crates/noxa-fetch/src/crawler.rs
@@ -319,13 +319,18 @@ impl Crawler {
let mut next_frontier: Vec<(String, usize)> = Vec::new();

for handle in handles {
let page = match handle.await {
let mut page = match handle.await {
Ok(page) => page,
Err(e) => {
warn!(error = %e, "crawl task panicked");
continue;
}
};
// Stamp provenance fields on each successfully extracted page.
if let Some(ref mut extraction) = page.extraction {
extraction.metadata.seed_url = Some(start_url.to_string());
extraction.metadata.crawl_depth = Some(page.depth as u32);
}
let depth = page.depth;

if depth < self.config.max_depth
10 changes: 10 additions & 0 deletions crates/noxa-fetch/src/document.rs
@@ -110,6 +110,16 @@ pub fn extract_document(
image: None,
favicon: None,
word_count,
content_hash: None,
source_type: Some("file".into()),
file_path: None,
last_modified: None,
is_truncated: None,
technologies: Vec::new(),
seed_url: None,
crawl_depth: None,
search_query: None,
fetched_at: None,
},
content: noxa_core::Content {
markdown,
10 changes: 10 additions & 0 deletions crates/noxa-fetch/src/linkedin.rs
@@ -216,6 +216,16 @@ pub fn extract_linkedin_post(html: &str, url: &str) -> Option<ExtractionResult>
image: None,
favicon: None,
word_count,
content_hash: None,
source_type: Some("web".into()),
file_path: None,
last_modified: None,
is_truncated: None,
technologies: Vec::new(),
seed_url: None,
crawl_depth: None,
search_query: None,
fetched_at: None,
},
content: Content {
markdown,
10 changes: 10 additions & 0 deletions crates/noxa-fetch/src/reddit.rs
@@ -92,6 +92,16 @@ pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResul
image: None,
favicon: None,
word_count,
content_hash: None,
source_type: Some("web".into()),
file_path: None,
last_modified: None,
is_truncated: None,
technologies: Vec::new(),
seed_url: None,
crawl_depth: None,
search_query: None,
fetched_at: None,
},
content: Content {
markdown,
10 changes: 10 additions & 0 deletions crates/noxa-mcp/src/server.rs
@@ -474,6 +474,16 @@ impl NoxaMcp {
image: None,
favicon: None,
word_count: markdown.split_whitespace().count(),
content_hash: None,
source_type: Some("web".into()),
file_path: None,
last_modified: None,
is_truncated: None,
technologies: Vec::new(),
seed_url: None,
crawl_depth: None,
search_query: None,
fetched_at: None,
},
domain_data: None,
structured_data: Vec::new(),
77 changes: 77 additions & 0 deletions crates/noxa-rag/Cargo.toml
@@ -0,0 +1,77 @@
[package]
name = "noxa-rag"
description = "RAG pipeline for noxa — TEI embeddings + Qdrant vector store"
version.workspace = true
edition.workspace = true
license.workspace = true

[[bin]]
name = "noxa-rag-daemon"
path = "src/bin/noxa-rag-daemon.rs"

[dependencies]
noxa-core = { workspace = true }
noxa-pdf = { path = "../noxa-pdf" }
# noxa-fetch provides extract_document() for DOCX/XLSX/CSV — reused rather than re-implemented.
noxa-fetch = { workspace = true }

# Multi-format ingestion
zip = "2" # DOCX, ODT, PPTX (ZIP archives) — matches noxa-fetch version
quick-xml = "0.37" # XML/OPML/RSS and DOCX word/document.xml — matches noxa-fetch version
strip-ansi-escapes = "0.2" # .log file preprocessing

# Async runtime
tokio = { workspace = true }

# Serialization
serde = { workspace = true }
serde_json = { workspace = true }
toml = "0.8"

# Error handling
thiserror = { workspace = true }

# Tracing
tracing = { workspace = true }
tracing-subscriber = { workspace = true }

# Async traits
async-trait = "0.1"

# HTTP client (plain reqwest — no primp patches needed for TEI/Qdrant)
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }

# No qdrant-client crate — REST calls via plain reqwest (no protoc/gRPC dependency)

# Chunking
text-splitter = { version = "0.25", features = ["markdown", "tokenizers"] }
tokenizers = "0.21"

# UUID v5 for deterministic point IDs
uuid = { version = "1", features = ["v5", "serde"] }

# SHA-256 for startup scan delta detection (file content hashing)
sha2 = "0.10"

# Filesystem watcher
notify = "6"
notify-debouncer-mini = "0.4"

# Concurrent data structures
dashmap = "6"

# URL parsing
url = "2"

# CLI args
clap = { workspace = true }

# Date/time for failed-jobs log
chrono = { version = "0.4", features = ["serde"] }

# CancellationToken for coordinated shutdown
tokio-util = { version = "0.7", features = ["io"] }

[dev-dependencies]
tokio = { workspace = true }
tempfile = "3"
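The uuid (v5) and sha2 entries above hint at the indexing scheme: deterministic point IDs mean re-ingesting an unchanged chunk overwrites the same Qdrant point instead of duplicating it. A minimal sketch, assuming a name built from source URL plus chunk index; the name scheme and helper are hypothetical, not the crate's actual ones.

use uuid::Uuid;

// Deterministic point ID via UUID v5 over a stable name, so repeated
// ingestion of the same chunk maps to the same point. The
// `{url}#{index}` name scheme here is a hypothetical example.
fn point_id(source_url: &str, chunk_index: usize) -> Uuid {
    let name = format!("{source_url}#{chunk_index}");
    Uuid::new_v5(&Uuid::NAMESPACE_URL, name.as_bytes())
}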