Merged

25 commits
62554b8 feat(noxa-68r.1): crate scaffold, traits, and core types (jmagar, Apr 12, 2026)
20e880e feat(noxa-68r.2,noxa-68r.3,noxa-68r.4): chunker, TEI provider, Qdrant… (jmagar, Apr 12, 2026)
d66522b feat(noxa-68r.6): factory and TOML config (jmagar, Apr 12, 2026)
9cde0d6 feat(noxa-68r.7): filesystem watcher pipeline (jmagar, Apr 12, 2026)
5217b99 feat(noxa-68r.8): daemon binary + README (jmagar, Apr 12, 2026)
2e43439 fix(noxa-68r.4): replace qdrant-client with plain reqwest REST calls (jmagar, Apr 12, 2026)
23f3922 fix(noxa-rag): address P0/P1 review comments from Copilot (jmagar, Apr 12, 2026)
31cb9d8 fix(noxa-rag): address remaining PR review comments (jmagar, Apr 12, 2026)
4ceb218 fix(noxa-rag): address PR #4 review comments (jmagar, Apr 12, 2026)
26347ab fix(noxa-rag): address new coderabbitai review comments (jmagar, Apr 12, 2026)
b398294 Update Cargo.lock (jmagar, Apr 12, 2026)
b6a8d68 fix(noxa-rag): harden TEI and Qdrant error handling (jmagar, Apr 12, 2026)
0ed3c03 fix(noxa-rag): address remaining PR review feedback (jmagar, Apr 12, 2026)
9bd72fe feat(noxa-ut9): expand Metadata + add IngestionContext, extend Search… (jmagar, Apr 13, 2026)
ad298d5 feat(noxa-cz5): expand PointPayload with full metadata, update upsert… (jmagar, Apr 13, 2026)
65073e5 feat(noxa-xn0): add 4 keyword indexes to create_collection; fix Metad… (jmagar, Apr 13, 2026)
d8f0334 feat(observability): per-step timing, session summary, heartbeat, del… (jmagar, Apr 13, 2026)
9c37510 feat(noxa-bvr): add TEI batch/retry/413 structured logging with error… (jmagar, Apr 13, 2026)
afa4d16 feat(noxa-bzn): add embed_tokens_per_sec to indexed/reindexed log line (jmagar, Apr 13, 2026)
09a8f29 feat(noxa-s5r): add noxa-pdf, zip, quick-xml, strip-ansi-escapes deps… (jmagar, Apr 13, 2026)
2f62198 feat(noxa-cl1,noxa-9ww): expand indexable formats, file:// support, p… (jmagar, Apr 13, 2026)
8a2b6fe feat(noxa-rag): startup scan delta detection, multi-format parse_file… (jmagar, Apr 13, 2026)
a4d415f test(noxa-7wn): add 20 unit tests covering all ingested format paths (jmagar, Apr 13, 2026)
5e66173 feat(noxa-7bi): replace bare JSON retry with schema-error correction … (jmagar, Apr 13, 2026)
d8f5378 fix(noxa-rag): address PR #4 review comments (rounds 3) (jmagar, Apr 13, 2026)
1,068 changes: 1,029 additions & 39 deletions Cargo.lock

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions crates/noxa-core/src/diff.rs
@@ -148,6 +148,16 @@ mod tests {
image: None,
favicon: None,
word_count,
content_hash: None,
source_type: None,
file_path: None,
last_modified: None,
is_truncated: None,
technologies: Vec::new(),
seed_url: None,
crawl_depth: None,
search_query: None,
fetched_at: None,
},
content: Content {
markdown: markdown.to_string(),
20 changes: 20 additions & 0 deletions crates/noxa-core/src/llm/mod.rs
@@ -77,6 +77,16 @@ mod tests {
image: None,
favicon: None,
word_count: 42,
content_hash: None,
source_type: None,
file_path: None,
last_modified: None,
is_truncated: None,
technologies: Vec::new(),
seed_url: None,
crawl_depth: None,
search_query: None,
fetched_at: None,
},
content: Content {
markdown: markdown.into(),
@@ -375,6 +385,16 @@ mod tests {
image: None,
favicon: None,
word_count: 0,
content_hash: None,
source_type: None,
file_path: None,
last_modified: None,
is_truncated: None,
technologies: Vec::new(),
seed_url: None,
crawl_depth: None,
search_query: None,
fetched_at: None,
},
content: Content {
markdown: "Just content".into(),
10 changes: 10 additions & 0 deletions crates/noxa-core/src/metadata.rs
@@ -52,6 +52,16 @@ pub fn extract(doc: &Html, url: Option<&str>) -> Metadata {
image,
favicon,
word_count: 0, // filled later by the extractor
content_hash: None,
source_type: None,
file_path: None,
last_modified: None,
is_truncated: None,
technologies: Vec::new(),
seed_url: None,
crawl_depth: None,
search_query: None,
fetched_at: None,
}
}

31 changes: 31 additions & 0 deletions crates/noxa-core/src/types.rs
@@ -27,6 +27,37 @@ pub struct Metadata {
pub image: Option<String>,
pub favicon: Option<String>,
pub word_count: usize,
// RAG-pipeline fields (all Option<T> for backward compat with existing web extraction callers)
/// SHA-256 hex digest of the raw source bytes. Used as a dedup key by noxa-rag.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub content_hash: Option<String>,
/// Source classification: 'web' | 'file' | 'mcp' | 'notebook' | 'email'
#[serde(default, skip_serializing_if = "Option::is_none")]
pub source_type: Option<String>,
/// Absolute filesystem path — populated for file:// sources only.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub file_path: Option<String>,
/// ISO 8601 timestamp: fs mtime for files, published_at for web content.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub last_modified: Option<String>,
/// True when the document hit the max_chunks_per_page limit and was cut short.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub is_truncated: Option<bool>,
/// Detected tech stack (e.g. ["React", "TypeScript", "Tailwind"]).
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub technologies: Vec<String>,
/// The root URL a crawl started from (populated by noxa-fetch crawler).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub seed_url: Option<String>,
/// Number of hops from seed_url (0 = seed page itself).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub crawl_depth: Option<u32>,
/// Query string if this page was fetched via a search operation.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub search_query: Option<String>,
/// ISO 8601 UTC timestamp of when this page was fetched.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub fetched_at: Option<String>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
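For reference, a minimal sketch of how an ingester might compute the values these new fields expect. The helper below is illustrative only: the PR pulls in sha2 and chrono, but this exact function is not part of it.

use chrono::Utc;
use sha2::{Digest, Sha256};

// Illustrative helper, not from this PR: derive the dedup and provenance
// values described in the field docs above.
fn stamp_provenance(raw: &[u8]) -> (String, String) {
    // SHA-256 hex digest of the raw source bytes (the `content_hash` dedup key).
    let content_hash = format!("{:x}", Sha256::digest(raw));
    // ISO 8601 UTC timestamp (the `fetched_at` contract).
    let fetched_at = Utc::now().to_rfc3339();
    (content_hash, fetched_at)
}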
1 change: 1 addition & 0 deletions crates/noxa-fetch/Cargo.toml
@@ -17,6 +17,7 @@ http = "1"
bytes = "1"
url = "2"
rand = "0.8"
chrono = { version = "0.4", features = ["serde"] }
quick-xml = { version = "0.37", features = ["serde"] }
serde_json.workspace = true
calamine = "0.34"
23 changes: 23 additions & 0 deletions crates/noxa-fetch/src/client.rs
@@ -12,6 +12,7 @@ use std::hash::{Hash, Hasher};
use std::sync::Arc;
use std::time::{Duration, Instant};

use chrono::Utc;
use noxa_pdf::PdfMode;
use rand::seq::SliceRandom;
use tokio::sync::Semaphore;
@@ -279,6 +280,18 @@ impl FetchClient {
&self,
url: &str,
options: &noxa_core::ExtractionOptions,
) -> Result<noxa_core::ExtractionResult, FetchError> {
let mut result = self.fetch_and_extract_inner(url, options).await?;
result.metadata.fetched_at = Some(Utc::now().to_rfc3339());
Ok(result)
}

/// Inner implementation — callers should use [`fetch_and_extract_with_options`] which
/// stamps `fetched_at` on the returned metadata.
async fn fetch_and_extract_inner(
&self,
url: &str,
options: &noxa_core::ExtractionOptions,
) -> Result<noxa_core::ExtractionResult, FetchError> {
// Reddit fallback: use their JSON API to get post + full comment tree.
if crate::reddit::is_reddit_url(url) {
@@ -589,6 +602,16 @@ fn pdf_to_extraction_result(pdf: &noxa_pdf::PdfResult, url: &str) -> noxa_core::
image: None,
favicon: None,
word_count,
content_hash: None,
source_type: Some("web".into()),
file_path: None,
last_modified: None,
is_truncated: None,
technologies: Vec::new(),
seed_url: None,
crawl_depth: None,
search_query: None,
fetched_at: None,
},
content: noxa_core::Content {
markdown,
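Because the wrapper stamps `fetched_at` as RFC 3339, downstream consumers can round-trip it with chrono. A tiny sketch, not part of the PR:

use chrono::{DateTime, FixedOffset};

// Parse the RFC 3339 `fetched_at` value stamped by the wrapper above.
// Illustrative only; the PR itself just writes `Utc::now().to_rfc3339()`.
fn parse_fetched_at(s: &str) -> Option<DateTime<FixedOffset>> {
    DateTime::parse_from_rfc3339(s).ok()
}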
7 changes: 6 additions & 1 deletion crates/noxa-fetch/src/crawler.rs
@@ -319,13 +319,18 @@ impl Crawler {
let mut next_frontier: Vec<(String, usize)> = Vec::new();

for handle in handles {
let page = match handle.await {
let mut page = match handle.await {
Ok(page) => page,
Err(e) => {
warn!(error = %e, "crawl task panicked");
continue;
}
};
// Stamp provenance fields on each successfully extracted page.
if let Some(ref mut extraction) = page.extraction {
extraction.metadata.seed_url = Some(start_url.to_string());
extraction.metadata.crawl_depth = Some(page.depth as u32);
}
let depth = page.depth;

if depth < self.config.max_depth
10 changes: 10 additions & 0 deletions crates/noxa-fetch/src/document.rs
@@ -110,6 +110,16 @@ pub fn extract_document(
image: None,
favicon: None,
word_count,
content_hash: None,
source_type: Some("file".into()),
file_path: None,
last_modified: None,
is_truncated: None,
technologies: Vec::new(),
seed_url: None,
crawl_depth: None,
search_query: None,
fetched_at: None,
},
content: noxa_core::Content {
markdown,
10 changes: 10 additions & 0 deletions crates/noxa-fetch/src/linkedin.rs
@@ -216,6 +216,16 @@ pub fn extract_linkedin_post(html: &str, url: &str) -> Option<ExtractionResult>
image: None,
favicon: None,
word_count,
content_hash: None,
source_type: Some("web".into()),
file_path: None,
last_modified: None,
is_truncated: None,
technologies: Vec::new(),
seed_url: None,
crawl_depth: None,
search_query: None,
fetched_at: None,
},
content: Content {
markdown,
10 changes: 10 additions & 0 deletions crates/noxa-fetch/src/reddit.rs
@@ -92,6 +92,16 @@ pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result<ExtractionResul
image: None,
favicon: None,
word_count,
content_hash: None,
source_type: Some("web".into()),
file_path: None,
last_modified: None,
is_truncated: None,
technologies: Vec::new(),
seed_url: None,
crawl_depth: None,
search_query: None,
fetched_at: None,
},
content: Content {
markdown,
10 changes: 10 additions & 0 deletions crates/noxa-mcp/src/server.rs
@@ -474,6 +474,16 @@ impl NoxaMcp {
image: None,
favicon: None,
word_count: markdown.split_whitespace().count(),
content_hash: None,
source_type: Some("web".into()),
file_path: None,
last_modified: None,
is_truncated: None,
technologies: Vec::new(),
seed_url: None,
crawl_depth: None,
search_query: None,
fetched_at: None,
},
domain_data: None,
structured_data: Vec::new(),
77 changes: 77 additions & 0 deletions crates/noxa-rag/Cargo.toml
@@ -0,0 +1,77 @@
[package]
name = "noxa-rag"
description = "RAG pipeline for noxa — TEI embeddings + Qdrant vector store"
version.workspace = true
edition.workspace = true
license.workspace = true

[[bin]]
name = "noxa-rag-daemon"
path = "src/bin/noxa-rag-daemon.rs"

[dependencies]
noxa-core = { workspace = true }
noxa-pdf = { path = "../noxa-pdf" }
# noxa-fetch provides extract_document() for DOCX/XLSX/CSV — reused rather than re-implemented.
noxa-fetch = { workspace = true }

# Multi-format ingestion
zip = "2" # DOCX, ODT, PPTX (ZIP archives) — matches noxa-fetch version
quick-xml = "0.37" # XML/OPML/RSS and DOCX word/document.xml — matches noxa-fetch version
strip-ansi-escapes = "0.2" # .log file preprocessing

# Async runtime
tokio = { workspace = true }

# Serialization
serde = { workspace = true }
serde_json = { workspace = true }
toml = "0.8"

# Error handling
thiserror = { workspace = true }

# Tracing
tracing = { workspace = true }
tracing-subscriber = { workspace = true }

# Async traits
async-trait = "0.1"

# HTTP client (plain reqwest — no primp patches needed for TEI/Qdrant)
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }

# No qdrant-client crate — REST calls via plain reqwest (no protoc/gRPC dependency)

# Chunking
text-splitter = { version = "0.25", features = ["markdown", "tokenizers"] }
tokenizers = "0.21"

# UUID v5 for deterministic point IDs
uuid = { version = "1", features = ["v5", "serde"] }

# SHA-256 for startup scan delta detection (file content hashing)
sha2 = "0.10"

# Filesystem watcher
notify = "6"
notify-debouncer-mini = "0.4"

# Concurrent data structures
dashmap = "6"

# URL parsing
url = "2"

# CLI args
clap = { workspace = true }

# Date/time for failed-jobs log
chrono = { version = "0.4", features = ["serde"] }

# CancellationToken for coordinated shutdown
tokio-util = { version = "0.7", features = ["io"] }

[dev-dependencies]
tokio = { workspace = true }
tempfile = "3"
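The uuid (v5) and sha2 entries above hint at the indexing scheme: deterministic point IDs mean re-ingesting an unchanged chunk overwrites the same Qdrant point instead of duplicating it. A minimal sketch, assuming a name built from source URL plus chunk index; the name scheme and helper are hypothetical, not the crate's actual ones.

use uuid::Uuid;

// Deterministic point ID via UUID v5 over a stable name, so repeated
// ingestion of the same chunk maps to the same point. The
// `{url}#{index}` name scheme here is a hypothetical example.
fn point_id(source_url: &str, chunk_index: usize) -> Uuid {
    let name = format!("{source_url}#{chunk_index}");
    Uuid::new_v5(&Uuid::NAMESPACE_URL, name.as_bytes())
}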