diff --git a/Cargo.lock b/Cargo.lock index 16bbe05..2c3fa05 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -184,12 +184,30 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" +[[package]] +name = "auto_enums" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65398a2893f41bce5c9259f6e1a4f03fbae40637c1bdc755b4f387f48c613b03" +dependencies = [ + "derive_utils", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "autocfg" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.22.1" @@ -202,10 +220,10 @@ version = "0.72.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" dependencies = [ - "bitflags", + "bitflags 2.11.0", "cexpr", "clang-sys", - "itertools", + "itertools 0.13.0", "proc-macro2", "quote", "regex", @@ -229,6 +247,12 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.11.0" @@ -262,7 +286,7 @@ version = "5.0.0-alpha.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "183ccc3854411c035410dcdbffafca62084f3a6c33f013c77e83c025d2a08a28" dependencies = [ - "bitflags", + "bitflags 2.11.0", "boring-sys2", "foreign-types", "libc", @@ -356,6 +380,15 @@ dependencies = [ "zip 7.2.0", ] +[[package]] +name = "castaway" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dec551ab6e7578819132c713a93c022a05d60159dc86e7a7050223577484c55a" +dependencies = [ + "rustversion", +] + [[package]] name = "cc" version = "1.2.58" @@ -488,6 +521,21 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" +[[package]] +name = "compact_str" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb1325a1cece981e8a296ab8f0f9b63ae357bd0784a9faaf548cc7b480707a" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "serde", + "static_assertions", +] + [[package]] name = "compression-codecs" version = "0.4.37" @@ -508,6 +556,19 @@ version = "0.4.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d" +[[package]] +name = "console" +version = "0.15.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width", + "windows-sys 0.59.0", +] + [[package]] name = "constant_time_eq" version = "0.3.1" @@ -530,6 +591,15 @@ version = "0.8.7" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "core_maths" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77745e017f5edba1a9c1d854f6f3a52dac8a12dd5af5d2f54aecf61e43d80d30" +dependencies = [ + "libm", +] + [[package]] name = "cpufeatures" version = "0.2.17" @@ -563,6 +633,34 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" @@ -602,14 +700,38 @@ dependencies = [ "syn", ] +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core 0.20.11", + "darling_macro 0.20.11", +] + [[package]] name = "darling" version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" dependencies = [ - "darling_core", - "darling_macro", + "darling_core 0.23.0", + "darling_macro 0.23.0", +] + +[[package]] +name = "darling_core" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", ] [[package]] @@ -625,17 +747,51 @@ dependencies = [ "syn", ] +[[package]] +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core 0.20.11", + "quote", + "syn", +] + [[package]] name = "darling_macro" version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ - "darling_core", + "darling_core 0.23.0", "quote", "syn", ] +[[package]] +name = "dary_heap" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06d2e3287df1c007e74221c49ca10a95d557349e54b3a75dc2fb14712c751f04" +dependencies = [ + "serde", +] + +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + [[package]] name = "data-encoding" version = "2.10.0" @@ -674,6 +830,37 @@ dependencies = [ "syn", ] +[[package]] +name = "derive_builder" +version = "0.20.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling 0.20.11", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn", +] + [[package]] name = "derive_more" version = "0.99.20" @@ -685,6 +872,17 @@ dependencies = [ "syn", ] +[[package]] +name = "derive_utils" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "362f47930db19fe7735f527e6595e4900316b893ebf6d48ad3d31be928d57dd6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "digest" version = "0.10.7" @@ -776,6 +974,12 @@ dependencies = [ "serde", ] +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + [[package]] name = "encoding_rs" version = "0.8.35" @@ -801,6 +1005,15 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "esaxx-rs" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" +dependencies = [ + "cc", +] + [[package]] name = "euclid" version = "0.20.14" @@ -833,6 +1046,17 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "filetime" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" +dependencies = [ + "cfg-if", + "libc", + "libredox", +] + [[package]] name = "find-msvc-tools" version = "0.1.9" @@ -931,6 +1155,15 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" +[[package]] +name = "fsevent-sys" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76ee7a02da4d231650c7cea31349b889be2f45ddb3ef3032d2ec8185f6313fd2" +dependencies = [ + "libc", +] + [[package]] name = "fslock" version = "0.2.1" @@ -1119,6 +1352,12 @@ version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + [[package]] name = "hashbrown" version = "0.15.5" @@ -1268,7 +1507,7 @@ version = "0.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" dependencies = [ - "base64", + "base64 0.22.1", "bytes", "futures-channel", "futures-util", @@ -1309,6 +1548,18 @@ dependencies = [ "cc", ] +[[package]] +name = "icu_collections" +version = 
"1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +dependencies = [ + "displaydoc", + "yoke 0.7.5", + "zerofrom", + "zerovec 0.10.4", +] + [[package]] name = "icu_collections" version = "2.1.1" @@ -1317,9 +1568,9 @@ checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" dependencies = [ "displaydoc", "potential_utf", - "yoke", + "yoke 0.8.1", "zerofrom", - "zerovec", + "zerovec 0.11.5", ] [[package]] @@ -1329,10 +1580,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" dependencies = [ "displaydoc", - "litemap", - "tinystr", - "writeable", - "zerovec", + "litemap 0.8.1", + "tinystr 0.8.2", + "writeable 0.6.2", + "zerovec 0.11.5", +] + +[[package]] +name = "icu_locid" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +dependencies = [ + "displaydoc", + "litemap 0.7.5", + "tinystr 0.7.6", + "writeable 0.5.5", ] [[package]] @@ -1341,12 +1604,12 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" dependencies = [ - "icu_collections", + "icu_collections 2.1.1", "icu_normalizer_data", "icu_properties", - "icu_provider", + "icu_provider 2.1.1", "smallvec", - "zerovec", + "zerovec 0.11.5", ] [[package]] @@ -1361,12 +1624,12 @@ version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" dependencies = [ - "icu_collections", + "icu_collections 2.1.1", "icu_locale_core", "icu_properties_data", - "icu_provider", + "icu_provider 2.1.1", "zerotrie", - "zerovec", + "zerovec 0.11.5", ] [[package]] @@ -1375,6 +1638,23 @@ version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" +[[package]] +name = "icu_provider" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_provider_macros", + "stable_deref_trait", + "tinystr 0.7.6", + "writeable 0.5.5", + "yoke 0.7.5", + "zerofrom", + "zerovec 0.10.4", +] + [[package]] name = "icu_provider" version = "2.1.1" @@ -1383,13 +1663,46 @@ checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" dependencies = [ "displaydoc", "icu_locale_core", - "writeable", - "yoke", + "writeable 0.6.2", + "yoke 0.8.1", "zerofrom", "zerotrie", - "zerovec", + "zerovec 0.11.5", ] +[[package]] +name = "icu_provider_macros" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "icu_segmenter" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a717725612346ffc2d7b42c94b820db6908048f39434504cb130e8b46256b0de" +dependencies = [ + "core_maths", + "displaydoc", + "icu_collections 1.5.0", + "icu_locid", + "icu_provider 1.5.0", + "icu_segmenter_data", + "utf8_iter", + "zerovec 0.10.4", +] + +[[package]] +name = "icu_segmenter_data" 
+version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1e52775179941363cc594e49ce99284d13d6948928d8e72c755f55e98caa1eb" + [[package]] name = "id-arena" version = "2.3.0" @@ -1435,6 +1748,39 @@ dependencies = [ "serde_core", ] +[[package]] +name = "indicatif" +version = "0.17.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" +dependencies = [ + "console", + "number_prefix", + "portable-atomic", + "unicode-width", + "web-time", +] + +[[package]] +name = "inotify" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff" +dependencies = [ + "bitflags 1.3.2", + "inotify-sys", + "libc", +] + +[[package]] +name = "inotify-sys" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb" +dependencies = [ + "libc", +] + [[package]] name = "inout" version = "0.1.4" @@ -1475,6 +1821,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.18" @@ -1530,6 +1885,26 @@ dependencies = [ "uuid-simd", ] +[[package]] +name = "kqueue" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac30106d7dce88daf4a3fcb4879ea939476d5074a9b7ddd0fb97fa4bed5596a" +dependencies = [ + "kqueue-sys", + "libc", +] + +[[package]] +name = "kqueue-sys" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed9625ffda8729b85e45cf04090035ac368927b8cebc34898e7c120f52e4838b" +dependencies = [ + "bitflags 1.3.2", + "libc", +] + [[package]] name = "lazy_static" version = "1.5.0" @@ -1558,13 +1933,22 @@ dependencies = [ "windows-link", ] +[[package]] +name = "libm" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" + [[package]] name = "libredox" version = "0.1.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ddbf48fd451246b1f8c2610bd3b4ac0cc6e149d89832867093ab69a17194f08" dependencies = [ + "bitflags 2.11.0", "libc", + "plain", + "redox_syscall 0.7.4", ] [[package]] @@ -1573,6 +1957,12 @@ version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" +[[package]] +name = "litemap" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23fb14cb19457329c82206317a5663005a4d404783dc74f4252769b0d5f42856" + [[package]] name = "litemap" version = "0.8.1" @@ -1645,6 +2035,22 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" +[[package]] +name = "macro_rules_attribute" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65049d7923698040cd0b1ddcced9b0eb14dd22c5f86ae59c3740eab64a676520" +dependencies = [ + "macro_rules_attribute-proc_macro", + "paste", +] + +[[package]] +name = "macro_rules_attribute-proc_macro" +version = 
"0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30" + [[package]] name = "markup5ever" version = "0.14.1" @@ -1717,6 +2123,18 @@ dependencies = [ "simd-adler32", ] +[[package]] +name = "mio" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +dependencies = [ + "libc", + "log", + "wasi", + "windows-sys 0.48.0", +] + [[package]] name = "mio" version = "1.2.0" @@ -1728,6 +2146,28 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "monostate" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3341a273f6c9d5bef1908f17b7267bbab0e95c9bf69a0d4dcf8e9e1b2c76ef67" +dependencies = [ + "monostate-impl", + "serde", + "serde_core", +] + +[[package]] +name = "monostate-impl" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4db6d5580af57bf992f59068d4ea26fd518574ff48d7639b255a36f9de6e7e9" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "new_debug_unreachable" version = "1.0.6" @@ -1744,6 +2184,36 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "notify" +version = "6.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d" +dependencies = [ + "bitflags 2.11.0", + "crossbeam-channel", + "filetime", + "fsevent-sys", + "inotify", + "kqueue", + "libc", + "log", + "mio 0.8.11", + "walkdir", + "windows-sys 0.48.0", +] + +[[package]] +name = "notify-debouncer-mini" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d40b221972a1fc5ef4d858a2f671fb34c75983eb385463dff3780eeff6a9d43" +dependencies = [ + "crossbeam-channel", + "log", + "notify", +] + [[package]] name = "noxa-cli" version = "0.4.0" @@ -1790,6 +2260,7 @@ version = "0.4.0" dependencies = [ "bytes", "calamine", + "chrono", "http", "noxa-core", "noxa-pdf", @@ -1850,6 +2321,39 @@ dependencies = [ "tracing", ] +[[package]] +name = "noxa-rag" +version = "0.4.0" +dependencies = [ + "async-trait", + "chrono", + "clap", + "dashmap", + "notify", + "notify-debouncer-mini", + "noxa-core", + "noxa-fetch", + "noxa-pdf", + "quick-xml 0.37.5", + "reqwest", + "serde", + "serde_json", + "sha2", + "strip-ansi-escapes", + "tempfile", + "text-splitter", + "thiserror", + "tokenizers", + "tokio", + "tokio-util", + "toml", + "tracing", + "tracing-subscriber", + "url", + "uuid", + "zip 2.4.2", +] + [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -1944,6 +2448,12 @@ dependencies = [ "autocfg", ] +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + [[package]] name = "once_cell" version = "1.21.4" @@ -1956,6 +2466,28 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" +[[package]] +name = "onig" +version = "6.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "336b9c63443aceef14bea841b899035ae3abe89b7c486aaf4c5bd8aafedac3f0" +dependencies = [ + "bitflags 2.11.0", + "libc", + "once_cell", + "onig_sys", +] + +[[package]] +name = "onig_sys" +version = "69.9.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7f86c6eef3d6df15f23bcfb6af487cbd2fed4e5581d58d5bf1f5f8b7f6727dc" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "openssl-macros" version = "0.1.1" @@ -1997,11 +2529,17 @@ checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.5.18", "smallvec", "windows-link", ] +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + [[package]] name = "pastey" version = "0.2.1" @@ -2103,12 +2641,24 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "plain" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" + [[package]] name = "pom" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6" +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + [[package]] name = "postscript" version = "0.14.1" @@ -2121,7 +2671,7 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" dependencies = [ - "zerovec", + "zerovec 0.11.5", ] [[package]] @@ -2164,6 +2714,17 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "pulldown-cmark" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c3a14896dfa883796f1cb410461aef38810ea05f2b2c33c5aded3649095fdad" +dependencies = [ + "bitflags 2.11.0", + "memchr", + "unicase", +] + [[package]] name = "quick-xml" version = "0.37.5" @@ -2325,13 +2886,53 @@ version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "973443cf09a9c8656b574a866ab68dfa19f0867d0340648c7d2f6a71b8a8ea68" +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-cond" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2964d0cf57a3e7a06e8183d14a8b527195c706b7983549cd5462d5aa3747438f" +dependencies = [ + "either", + "itertools 0.14.0", + "rayon", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "redox_syscall" version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags", + "bitflags 2.11.0", +] + +[[package]] +name = "redox_syscall" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f450ad9c3b1da563fb6948a8e0fb0fb9269711c9c73d9ea1de5058c79c8d643a" +dependencies = [ + "bitflags 2.11.0", ] [[package]] @@ -2417,7 +3018,7 @@ version = "0.12.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" dependencies = [ - "base64", + "base64 0.22.1", "bytes", "futures-core", "http", @@ -2470,7 +3071,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2231b2c085b371c01bc90c0e6c1cab8834711b6394533375bdbf870b0166d419" dependencies = [ "async-trait", - "base64", + "base64 0.22.1", "chrono", "futures", "pastey", @@ -2491,7 +3092,7 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "36ea0e100fadf81be85d7ff70f86cd805c7572601d4ab2946207f36540854b43" dependencies = [ - "darling", + "darling 0.23.0", "proc-macro2", "quote", "serde_json", @@ -2537,7 +3138,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ - "bitflags", + "bitflags 2.11.0", "errno", "libc", "linux-raw-sys", @@ -2591,6 +3192,15 @@ version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "schemars" version = "1.2.1" @@ -2655,7 +3265,7 @@ version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd568a4c9bb598e291a08244a5c1f5a8a6650bee243b5b0f8dbb3d9cc1d87fe8" dependencies = [ - "bitflags", + "bitflags 2.11.0", "cssparser", "derive_more", "fxhash", @@ -2728,6 +3338,15 @@ dependencies = [ "zmij", ] +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -2760,6 +3379,23 @@ dependencies = [ "digest", ] +[[package]] +name = "sha1_smol" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbfa15b3dddfee50a0fff136974b3e1bde555604ba463834a7eb7deb6417705d" + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -2825,12 +3461,30 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "spm_precompiled" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" +dependencies = [ + "base64 0.13.1", + "nom", + "serde", + "unicode-segmentation", +] + [[package]] name = "stable_deref_trait" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "string_cache" version = "0.8.9" @@ -2856,12 +3510,42 @@ dependencies = [ "quote", ] +[[package]] +name = "strip-ansi-escapes" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a8f8038e7e7969abb3f1b7c2a811225e9296da208539e0f79c5251d6cac0025" +dependencies = [ + "vte", +] + [[package]] name = "strsim" version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "strum" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "subtle" version = "2.6.1" @@ -2923,6 +3607,25 @@ dependencies = [ "utf-8", ] +[[package]] +name = "text-splitter" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8130aecc3b7938ce3ea387d7615eca92bd4f702a5adc0548ba930a9c039dda4" +dependencies = [ + "ahash", + "auto_enums", + "either", + "icu_provider 1.5.0", + "icu_segmenter", + "itertools 0.14.0", + "memchr", + "pulldown-cmark", + "strum", + "thiserror", + "tokenizers", +] + [[package]] name = "thiserror" version = "2.0.18" @@ -2983,6 +3686,15 @@ dependencies = [ "time-core", ] +[[package]] +name = "tinystr" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +dependencies = [ + "displaydoc", +] + [[package]] name = "tinystr" version = "0.8.2" @@ -2990,7 +3702,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" dependencies = [ "displaydoc", - "zerovec", + "zerovec 0.11.5", ] [[package]] @@ -3008,6 +3720,40 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tokenizers" +version = "0.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a620b996116a59e184c2fa2dfd8251ea34a36d0a514758c6f966386bd2e03476" +dependencies = [ + "ahash", + "aho-corasick", + "compact_str", + "dary_heap", + "derive_builder", + "esaxx-rs", + "getrandom 0.3.4", + "indicatif", + "itertools 0.14.0", + "log", + "macro_rules_attribute", + "monostate", + "onig", + "paste", + "rand 0.9.2", + "rayon", + "rayon-cond", + "regex", + "regex-syntax", + "serde", + "serde_json", + "spm_precompiled", + "thiserror", + "unicode-normalization-alignments", + "unicode-segmentation", + "unicode_categories", +] + [[package]] name = "tokio" version = "1.50.0" @@ -3016,7 +3762,7 @@ checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" dependencies = [ "bytes", "libc", - "mio", + "mio 1.2.0", "parking_lot", "pin-project-lite", "signal-hook-registry", @@ -3069,6 +3815,47 @@ dependencies = [ "tokio", ] +[[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", + "toml_write", + "winnow", +] + +[[package]] +name = "toml_write" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" + [[package]] name = "tower" version = "0.5.3" @@ -3091,7 +3878,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" dependencies = [ "async-compression", - "bitflags", + "bitflags 2.11.0", "bytes", "futures-core", "futures-util", @@ -3207,6 +3994,12 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" +[[package]] +name = "unicase" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" + [[package]] name = "unicode-general-category" version = "1.1.0" @@ -3228,6 +4021,21 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-normalization-alignments" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de" +dependencies = [ + "smallvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c" + [[package]] name = "unicode-width" version = "0.2.2" @@ -3240,6 +4048,12 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + [[package]] name = "untrusted" version = "0.9.0" @@ -3277,6 +4091,18 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "uuid" +version = "1.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ac8b6f42ead25368cf5b098aeb3dc8a1a2c05a3eee8a9a1a68c640edbfc79d9" +dependencies = [ + "js-sys", + "serde_core", + "sha1_smol", + "wasm-bindgen", +] + [[package]] name = "uuid-simd" version = "0.8.0" @@ -3305,6 +4131,25 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" +[[package]] +name = "vte" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"231fdcd7ef3037e8330d8e17e61011a2c244126acc0a982f4040ac3f9f0bc077" +dependencies = [ + "memchr", +] + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "want" version = "0.3.1" @@ -3421,7 +4266,7 @@ version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ - "bitflags", + "bitflags 2.11.0", "hashbrown 0.15.5", "indexmap", "semver", @@ -3487,6 +4332,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" @@ -3552,6 +4406,15 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + [[package]] name = "windows-sys" version = "0.52.0" @@ -3561,6 +4424,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.60.2" @@ -3579,6 +4451,21 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + [[package]] name = "windows-targets" version = "0.52.6" @@ -3612,6 +4499,12 @@ dependencies = [ "windows_x86_64_msvc 0.53.1", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" @@ -3624,6 +4517,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" @@ -3636,6 +4535,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -3660,6 +4565,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + [[package]] name = "windows_i686_msvc" version = "0.52.6" @@ -3672,6 +4583,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" @@ -3684,6 +4601,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" @@ -3696,6 +4619,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" @@ -3708,6 +4637,15 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" +[[package]] +name = "winnow" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" +dependencies = [ + "memchr", +] + [[package]] name = "wit-bindgen" version = "0.51.0" @@ -3766,7 +4704,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" dependencies = [ "anyhow", - "bitflags", + "bitflags 2.11.0", "indexmap", "log", "serde", @@ -3832,6 +4770,12 @@ dependencies = [ "zstd", ] +[[package]] +name = "writeable" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" + [[package]] name = "writeable" version = "0.6.2" @@ -3847,6 +4791,18 @@ dependencies = [ "lzma-sys", ] +[[package]] +name = "yoke" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive 0.7.5", + "zerofrom", +] + [[package]] name = "yoke" version = "0.8.1" @@ -3854,10 +4810,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" dependencies = [ "stable_deref_trait", - "yoke-derive", + "yoke-derive 0.8.1", "zerofrom", ] +[[package]] +name = "yoke-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + [[package]] name = "yoke-derive" version = "0.8.1" @@ -3938,8 +4906,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" dependencies = [ "displaydoc", - "yoke", + "yoke 0.8.1", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" +dependencies = [ + "yoke 0.7.5", "zerofrom", + "zerovec-derive 0.10.3", ] [[package]] @@ -3948,9 +4927,20 @@ version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" dependencies = [ - "yoke", + "yoke 0.8.1", "zerofrom", - "zerovec-derive", + "zerovec-derive 0.11.2", +] + +[[package]] +name = "zerovec-derive" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] [[package]] diff --git a/crates/noxa-core/src/diff.rs b/crates/noxa-core/src/diff.rs index c6a5d71..b2c27eb 100644 --- a/crates/noxa-core/src/diff.rs +++ b/crates/noxa-core/src/diff.rs @@ -148,6 +148,16 @@ mod tests { image: None, favicon: None, word_count, + content_hash: None, + source_type: None, + file_path: None, + last_modified: None, + is_truncated: None, + technologies: Vec::new(), + seed_url: None, + crawl_depth: None, + search_query: None, + fetched_at: None, }, content: Content { markdown: markdown.to_string(), diff --git a/crates/noxa-core/src/llm/mod.rs b/crates/noxa-core/src/llm/mod.rs index edbd993..ad7356c 100644 --- a/crates/noxa-core/src/llm/mod.rs +++ b/crates/noxa-core/src/llm/mod.rs @@ -77,6 +77,16 @@ mod tests { image: None, favicon: None, word_count: 42, + content_hash: None, + source_type: None, + file_path: None, + last_modified: None, + is_truncated: None, + technologies: Vec::new(), + seed_url: None, + crawl_depth: None, + search_query: None, + fetched_at: None, }, content: Content { markdown: markdown.into(), @@ -375,6 +385,16 @@ mod tests { image: None, favicon: None, word_count: 0, + content_hash: None, + source_type: None, + file_path: None, + last_modified: None, + is_truncated: None, + technologies: Vec::new(), + seed_url: None, + crawl_depth: None, + search_query: None, + fetched_at: None, }, content: Content { markdown: "Just content".into(), diff --git a/crates/noxa-core/src/metadata.rs b/crates/noxa-core/src/metadata.rs index c7f142b..b939742 100644 --- a/crates/noxa-core/src/metadata.rs +++ b/crates/noxa-core/src/metadata.rs @@ -52,6 +52,16 @@ pub fn extract(doc: &Html, url: Option<&str>) -> Metadata { image, favicon, word_count: 0, // filled later by the extractor + content_hash: None, + source_type: None, + file_path: None, + last_modified: None, + is_truncated: None, + technologies: Vec::new(), + seed_url: None, + crawl_depth: None, + search_query: None, + fetched_at: None, } } diff --git 
a/crates/noxa-core/src/types.rs b/crates/noxa-core/src/types.rs index ebe7a92..fbda246 100644 --- a/crates/noxa-core/src/types.rs +++ b/crates/noxa-core/src/types.rs @@ -27,6 +27,37 @@ pub struct Metadata { pub image: Option<String>, pub favicon: Option<String>, pub word_count: usize, + // RAG-pipeline fields (all Option for backward compat with existing web extraction callers) + /// SHA-256 hex digest of the raw source bytes. Used as a dedup key by noxa-rag. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub content_hash: Option<String>, + /// Source classification: 'web' | 'file' | 'mcp' | 'notebook' | 'email' + #[serde(default, skip_serializing_if = "Option::is_none")] + pub source_type: Option<String>, + /// Absolute filesystem path — populated for file:// sources only. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub file_path: Option<String>, + /// ISO 8601 timestamp: fs mtime for files, published_at for web content. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub last_modified: Option<String>, + /// True when the document hit the max_chunks_per_page limit and was cut short. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub is_truncated: Option<bool>, + /// Detected tech stack (e.g. ["React", "TypeScript", "Tailwind"]). + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub technologies: Vec<String>, + /// The root URL a crawl started from (populated by noxa-fetch crawler). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub seed_url: Option<String>, + /// Number of hops from seed_url (0 = seed page itself). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub crawl_depth: Option<u32>, + /// Query string if this page was fetched via a search operation. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub search_query: Option<String>, + /// ISO 8601 UTC timestamp of when this page was fetched. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub fetched_at: Option<String>, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/crates/noxa-fetch/Cargo.toml b/crates/noxa-fetch/Cargo.toml index 85bb58a..b108824 100644 --- a/crates/noxa-fetch/Cargo.toml +++ b/crates/noxa-fetch/Cargo.toml @@ -17,6 +17,7 @@ http = "1" bytes = "1" url = "2" rand = "0.8" +chrono = { version = "0.4", features = ["serde"] } quick-xml = { version = "0.37", features = ["serde"] } serde_json.workspace = true calamine = "0.34" diff --git a/crates/noxa-fetch/src/client.rs b/crates/noxa-fetch/src/client.rs index e20066a..1f55d55 100644 --- a/crates/noxa-fetch/src/client.rs +++ b/crates/noxa-fetch/src/client.rs @@ -12,6 +12,7 @@ use std::hash::{Hash, Hasher}; use std::sync::Arc; use std::time::{Duration, Instant}; +use chrono::Utc; use noxa_pdf::PdfMode; use rand::seq::SliceRandom; use tokio::sync::Semaphore; @@ -279,6 +280,18 @@ impl FetchClient { &self, url: &str, options: &noxa_core::ExtractionOptions, + ) -> Result { + let mut result = self.fetch_and_extract_inner(url, options).await?; + result.metadata.fetched_at = Some(Utc::now().to_rfc3339()); + Ok(result) + } + + /// Inner implementation — callers should use [`fetch_and_extract_with_options`] which + /// stamps `fetched_at` on the returned metadata. + async fn fetch_and_extract_inner( + &self, + url: &str, + options: &noxa_core::ExtractionOptions, ) -> Result { // Reddit fallback: use their JSON API to get post + full comment tree.
if crate::reddit::is_reddit_url(url) { @@ -589,6 +602,16 @@ fn pdf_to_extraction_result(pdf: &noxa_pdf::PdfResult, url: &str) -> noxa_core:: image: None, favicon: None, word_count, + content_hash: None, + source_type: Some("web".into()), + file_path: None, + last_modified: None, + is_truncated: None, + technologies: Vec::new(), + seed_url: None, + crawl_depth: None, + search_query: None, + fetched_at: None, }, content: noxa_core::Content { markdown, diff --git a/crates/noxa-fetch/src/crawler.rs b/crates/noxa-fetch/src/crawler.rs index aa6b14c..381a23c 100644 --- a/crates/noxa-fetch/src/crawler.rs +++ b/crates/noxa-fetch/src/crawler.rs @@ -319,13 +319,18 @@ impl Crawler { let mut next_frontier: Vec<(String, usize)> = Vec::new(); for handle in handles { - let page = match handle.await { + let mut page = match handle.await { Ok(page) => page, Err(e) => { warn!(error = %e, "crawl task panicked"); continue; } }; + // Stamp provenance fields on each successfully extracted page. + if let Some(ref mut extraction) = page.extraction { + extraction.metadata.seed_url = Some(start_url.to_string()); + extraction.metadata.crawl_depth = Some(page.depth as u32); + } let depth = page.depth; if depth < self.config.max_depth diff --git a/crates/noxa-fetch/src/document.rs b/crates/noxa-fetch/src/document.rs index 2131a3e..0a30d0d 100644 --- a/crates/noxa-fetch/src/document.rs +++ b/crates/noxa-fetch/src/document.rs @@ -110,6 +110,16 @@ pub fn extract_document( image: None, favicon: None, word_count, + content_hash: None, + source_type: Some("web".into()), + file_path: None, + last_modified: None, + is_truncated: None, + technologies: Vec::new(), + seed_url: None, + crawl_depth: None, + search_query: None, + fetched_at: None, }, content: noxa_core::Content { markdown, diff --git a/crates/noxa-fetch/src/linkedin.rs b/crates/noxa-fetch/src/linkedin.rs index 1c0bb69..0e1519c 100644 --- a/crates/noxa-fetch/src/linkedin.rs +++ b/crates/noxa-fetch/src/linkedin.rs @@ -216,6 +216,16 @@ pub fn extract_linkedin_post(html: &str, url: &str) -> Option image: None, favicon: None, word_count, + content_hash: None, + source_type: Some("web".into()), + file_path: None, + last_modified: None, + is_truncated: None, + technologies: Vec::new(), + seed_url: None, + crawl_depth: None, + search_query: None, + fetched_at: None, }, content: Content { markdown, diff --git a/crates/noxa-fetch/src/reddit.rs b/crates/noxa-fetch/src/reddit.rs index 4d11c0f..ab52c7f 100644 --- a/crates/noxa-fetch/src/reddit.rs +++ b/crates/noxa-fetch/src/reddit.rs @@ -92,6 +92,16 @@ pub fn parse_reddit_json(json_bytes: &[u8], url: &str) -> Result Result<(), LlmError .map_err(|e| LlmError::InvalidJson(format!("invalid schema: {e}"))) } +/// Build a targeted correction prompt from a schema validation failure. +/// +/// Extracts the instance path and the schema keyword that failed (e.g. "type", +/// "required") and formats them into a short instruction under 200 chars. +/// Raw model output and web content are intentionally excluded — the caller +/// must NOT pass them here. +fn build_schema_correction_prompt(value: &serde_json::Value, schema: &serde_json::Value) -> String { + let Ok(compiled) = jsonschema::validator_for(schema) else { + return "Return ONLY corrected JSON matching the schema.".to_string(); + }; + + let correction = compiled.iter_errors(value).next().map(|e| { + let path = e.instance_path().to_string(); + let keyword = e.kind().keyword(); + if path.is_empty() || path == "/" { + format!("Field failed '{}' check. 
Return ONLY corrected JSON.", keyword) + } else { + format!("Field '{}' failed '{}' check. Return ONLY corrected JSON.", path, keyword) + } + }).unwrap_or_else(|| "Return ONLY corrected JSON matching the schema.".to_string()); + + // Hard cap at 200 chars — schema errors should never need more than this. + if correction.len() > 200 { + correction[..200].to_string() + } else { + correction + } +} + /// Extract structured JSON from content using a JSON schema. /// The schema tells the LLM exactly what fields to extract and their types. /// /// Retry policy: -/// - If the response cannot be parsed as JSON: retry once with a correction prompt. +/// - If the response cannot be parsed as JSON: retry once with a terse correction prompt. /// - If the response is valid JSON but fails schema validation: retry once with -/// a tighter correction prompt that includes the specific validation error. -/// - Both retry attempts add the previous failed response as an 'assistant' message -/// and the correction instructions as a 'user' message to improve success. +/// a correction prompt containing only the field path and keyword that failed. +/// - The correction prompt is capped at 200 chars and never embeds raw model +/// output or web content, preventing token overflow and schema leakage. pub async fn extract_json( content: &str, schema: &serde_json::Value, @@ -79,30 +108,27 @@ pub async fn extract_json( match parse_and_validate(&response, schema) { Ok(value) => Ok(value), - Err(e) => { - // First attempt failed — retry once with a correction prompt. - // Construct a concise correction prompt based on the error type. - let correction_prompt = match &e { - LlmError::InvalidJson(msg) if msg.contains("schema validation failed") => { - let error_msg = msg.replace("schema validation failed: ", ""); - format!("Correction required: {}. Return ONLY the corrected JSON.", error_msg) + Err(_) => { + // First attempt failed — retry once with a targeted correction prompt. + // + // IMPORTANT: Do NOT embed raw model output or web content here. + // For schema mismatches, extract path + keyword from the parsed value + // so the correction is precise. For parse failures, use a terse generic + // message. Both stay under 200 chars. + let correction_prompt = match parse_json_response(&response) { + Ok(parsed_value) => { + // Valid JSON but schema mismatch — extract specific field info. + build_schema_correction_prompt(&parsed_value, schema) } - _ => { - "Your response was not valid JSON. Please return ONLY valid JSON matching the schema.".to_string() + Err(_) => { + // Unparseable JSON — terse generic correction. + "Your response was not valid JSON. Return ONLY valid JSON matching the schema." + .to_string() } }; - // Limit correction context to prevent token blowup on large hallucinated outputs. - let capped_response = if response.len() > 2000 { - format!("{}... [truncated]", &response[..2000]) - } else { - response.clone() - }; - - messages.push(Message { - role: "assistant".into(), - content: capped_response, - }); + // Push only the correction message — raw model output is excluded + // to prevent token overflow and avoid reinforcing wrong patterns. messages.push(Message { role: "user".into(), content: correction_prompt, @@ -296,12 +322,13 @@ mod tests { } }); // Model returns valid JSON but wrong type ("string" instead of number). - // Should NOT retry (schema mismatch ≠ parse failure) — returns InvalidJson immediately. 
+ // Retry fires with a schema-aware correction prompt, but MockProvider returns + // the same bad JSON again — both attempts fail, so the result is InvalidJson. let mock = MockProvider::ok(r#"{"price": "not-a-number"}"#); let result = extract_json("content", &schema, &mock, None).await; assert!( matches!(result, Err(LlmError::InvalidJson(_))), - "expected InvalidJson for schema mismatch, got {result:?}" + "expected InvalidJson after both attempts return wrong type, got {result:?}" ); } @@ -387,4 +414,138 @@ mod tests { let result = extract_json("content", &schema, &mock, None).await.unwrap(); assert_eq!(result["price"], 9.99); } + + // ── Correction prompt unit tests ─────────────────────────────────────────── + + /// Correction prompt for a type mismatch must include the field path and + /// the failing keyword, and must stay under 200 chars. + #[test] + fn correction_prompt_includes_field_path_and_keyword() { + let schema = serde_json::json!({ + "type": "object", + "properties": { + "price": { "type": "integer" } + } + }); + // Provide a string where integer is expected. + let value = serde_json::json!({"price": "wrong"}); + let prompt = build_schema_correction_prompt(&value, &schema); + + // Must mention the failing field path. + assert!( + prompt.contains("price"), + "expected field path in correction prompt, got: {prompt:?}" + ); + // Must mention the schema keyword. + assert!( + prompt.contains("type"), + "expected schema keyword in correction prompt, got: {prompt:?}" + ); + // Must stay under 200 chars — hard cap enforced by the function. + assert!( + prompt.len() <= 200, + "correction prompt exceeded 200 chars: {} chars", + prompt.len() + ); + // Must NOT contain raw model output or web content markers. + assert!( + !prompt.contains("wrong"), + "correction prompt must not embed the invalid value, got: {prompt:?}" + ); + } + + /// Correction prompt for a missing required field must mention the + /// 'required' keyword and stay under 200 chars. + #[test] + fn correction_prompt_for_missing_required_field() { + let schema = serde_json::json!({ + "type": "object", + "required": ["title"], + "properties": { + "title": { "type": "string" } + } + }); + let value = serde_json::json!({"other": "data"}); + let prompt = build_schema_correction_prompt(&value, &schema); + + assert!( + prompt.len() <= 200, + "correction prompt exceeded 200 chars: {} chars", + prompt.len() + ); + // 'required' keyword surfaced for missing required properties. + assert!( + prompt.contains("required"), + "expected 'required' keyword in prompt, got: {prompt:?}" + ); + } + + /// The retry message must not embed the raw model response. + /// We verify this by checking that a very long/distinctive model output + /// does not appear in any message sent during the retry call. + #[tokio::test] + async fn retry_prompt_does_not_embed_raw_model_output() { + use std::sync::{Arc, Mutex}; + use async_trait::async_trait; + use crate::provider::{CompletionRequest, LlmProvider}; + + /// A mock that records every request it receives. 
+    struct RecordingProvider {
+        responses: Vec<String>,
+        call_count: Arc<Mutex<usize>>,
+        recorded_messages: Arc<Mutex<Vec<Vec<Message>>>>,
+    }
+
+    #[async_trait]
+    impl LlmProvider for RecordingProvider {
+        async fn complete(&self, request: &CompletionRequest) -> Result<String, LlmError> {
+            let mut count = self.call_count.lock().unwrap();
+            let idx = (*count).min(self.responses.len() - 1);
+            *count += 1;
+            self.recorded_messages
+                .lock()
+                .unwrap()
+                .push(request.messages.clone());
+            Ok(self.responses[idx].clone())
+        }
+        async fn is_available(&self) -> bool { true }
+        fn name(&self) -> &str { "recording-mock" }
+    }
+
+    // A distinctive raw model output that must NOT appear in the retry prompt.
+    let raw_model_output = r#"{"price": "DISTINCTIVE_BAD_VALUE_DO_NOT_RELAY"}"#;
+
+    let recorded = Arc::new(Mutex::new(Vec::<Vec<Message>>::new()));
+    let mock = RecordingProvider {
+        responses: vec![
+            raw_model_output.to_string(),
+            r#"{"price": 9.99}"#.to_string(),
+        ],
+        call_count: Arc::new(Mutex::new(0)),
+        recorded_messages: recorded.clone(),
+    };
+
+    let schema = serde_json::json!({
+        "type": "object",
+        "required": ["price"],
+        "properties": { "price": { "type": "number" } }
+    });
+
+    let result = extract_json("some content", &schema, &mock, None).await.unwrap();
+    assert_eq!(result["price"], 9.99);
+
+    // Inspect the messages sent on the second (retry) call.
+    let all_calls = recorded.lock().unwrap();
+    assert_eq!(all_calls.len(), 2, "expected exactly 2 provider calls");
+
+    let retry_messages = &all_calls[1];
+    for msg in retry_messages {
+        assert!(
+            !msg.content.contains("DISTINCTIVE_BAD_VALUE_DO_NOT_RELAY"),
+            "raw model output leaked into retry message role={}: {:?}",
+            msg.role,
+            msg.content
+        );
+    }
+    }
+}
diff --git a/crates/noxa-mcp/src/server.rs b/crates/noxa-mcp/src/server.rs
index db926e7..4973e3e 100644
--- a/crates/noxa-mcp/src/server.rs
+++ b/crates/noxa-mcp/src/server.rs
@@ -474,6 +474,16 @@ impl NoxaMcp {
                     image: None,
                     favicon: None,
                     word_count: markdown.split_whitespace().count(),
+                    content_hash: None,
+                    source_type: Some("web".into()),
+                    file_path: None,
+                    last_modified: None,
+                    is_truncated: None,
+                    technologies: Vec::new(),
+                    seed_url: None,
+                    crawl_depth: None,
+                    search_query: None,
+                    fetched_at: None,
                 },
                 domain_data: None,
                 structured_data: Vec::new(),
diff --git a/crates/noxa-rag/Cargo.toml b/crates/noxa-rag/Cargo.toml
new file mode 100644
index 0000000..1e423ed
--- /dev/null
+++ b/crates/noxa-rag/Cargo.toml
@@ -0,0 +1,77 @@
+[package]
+name = "noxa-rag"
+description = "RAG pipeline for noxa — TEI embeddings + Qdrant vector store"
+version.workspace = true
+edition.workspace = true
+license.workspace = true
+
+[[bin]]
+name = "noxa-rag-daemon"
+path = "src/bin/noxa-rag-daemon.rs"
+
+[dependencies]
+noxa-core = { workspace = true }
+noxa-pdf = { path = "../noxa-pdf" }
+# noxa-fetch provides extract_document() for DOCX/XLSX/CSV — reused rather than re-implemented.
+noxa-fetch = { workspace = true } + +# Multi-format ingestion +zip = "2" # DOCX, ODT, PPTX (ZIP archives) — matches noxa-fetch version +quick-xml = "0.37" # XML/OPML/RSS and DOCX word/document.xml — matches noxa-fetch version +strip-ansi-escapes = "0.2" # .log file preprocessing + +# Async runtime +tokio = { workspace = true } + +# Serialization +serde = { workspace = true } +serde_json = { workspace = true } +toml = "0.8" + +# Error handling +thiserror = { workspace = true } + +# Tracing +tracing = { workspace = true } +tracing-subscriber = { workspace = true } + +# Async traits +async-trait = "0.1" + +# HTTP client (plain reqwest — no primp patches needed for TEI/Qdrant) +reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] } + +# No qdrant-client crate — REST calls via plain reqwest (no protoc/gRPC dependency) + +# Chunking +text-splitter = { version = "0.25", features = ["markdown", "tokenizers"] } +tokenizers = "0.21" + +# UUID v5 for deterministic point IDs +uuid = { version = "1", features = ["v5", "serde"] } + +# SHA-256 for startup scan delta detection (file content hashing) +sha2 = "0.10" + +# Filesystem watcher +notify = "6" +notify-debouncer-mini = "0.4" + +# Concurrent data structures +dashmap = "6" + +# URL parsing +url = "2" + +# CLI args +clap = { workspace = true } + +# Date/time for failed-jobs log +chrono = { version = "0.4", features = ["serde"] } + +# CancellationToken for coordinated shutdown +tokio-util = { version = "0.7", features = ["io"] } + +[dev-dependencies] +tokio = { workspace = true } +tempfile = "3" diff --git a/crates/noxa-rag/README.md b/crates/noxa-rag/README.md new file mode 100644 index 0000000..17b2125 --- /dev/null +++ b/crates/noxa-rag/README.md @@ -0,0 +1,162 @@ +# noxa-rag + +RAG pipeline for [noxa](https://github.com/jmagar/noxa) — watches noxa's output directory for `ExtractionResult` JSON files, chunks them, embeds via [HF TEI](https://github.com/huggingface/text-embeddings-inference), and upserts to [Qdrant](https://qdrant.tech/). + +## System Requirements + +- **Qdrant** running locally (REST port 6333) +- **HF TEI** with GPU (tested on RTX 4070) +- **CUDA** for TEI inference (CPU mode is possible but slow) +- **Rust 1.82+** +- **huggingface-cli** to download the tokenizer + +## CRITICAL: TEI Launch Command + +```bash +# CRITICAL: --pooling last-token is REQUIRED for Qwen3-0.6B +# Qwen3 is a decoder-only model. Mean pooling (TEI default) produces +# semantically incorrect embeddings. This flag is NOT optional. +docker run --gpus all -p 8080:80 \ + ghcr.io/huggingface/text-embeddings-inference:latest \ + --model-id Qwen/Qwen3-Embedding-0.6B \ + --pooling last-token \ + --max-batch-tokens 32768 \ + --max-client-batch-size 128 \ + --dtype float16 +``` + +### Verify TEI is working + +```bash +curl http://localhost:8080/health +# {"status":"ok"} + +# Check embedding dimensions (must be 1024 for Qwen3-0.6B) +curl -s http://localhost:8080/embed \ + -H "Content-Type: application/json" \ + -d '{"inputs": ["test"], "normalize": true}' | python3 -c "import sys,json; v=json.load(sys.stdin)[0]; print(f'{len(v)} dims')" +# 1024 dims +``` + +## Quickstart + +### 1. Download the tokenizer + +The Rust `tokenizers` crate cannot download from HF Hub at runtime. Download once: + +```bash +pip install huggingface_hub +huggingface-cli download Qwen/Qwen3-Embedding-0.6B tokenizer.json --local-dir ~/.cache/noxa-rag/tokenizer +``` + +### 2. 
Create config file + +```toml +# noxa-rag.toml + +[source] +type = "fs_watcher" +watch_dir = "/home/user/.noxa/output" +debounce_ms = 500 + +[embed_provider] +type = "tei" +url = "http://localhost:8080" +model = "Qwen/Qwen3-Embedding-0.6B" +# REQUIRED: path to directory containing tokenizer.json +local_path = "/home/user/.cache/noxa-rag/tokenizer" + +[vector_store] +type = "qdrant" +# REST port 6333 +url = "http://localhost:6333" +collection = "noxa_rag" +# api_key = "..." # or set NOXA_RAG_QDRANT_API_KEY env var + +[chunker] +target_tokens = 512 +overlap_tokens = 64 +min_words = 50 +max_chunks_per_page = 100 + +[pipeline] +embed_concurrency = 4 +# Must be an absolute path (daemon may run with CWD = /) +failed_jobs_log = "/home/user/.noxa/noxa-rag-failed.jsonl" +``` + +### 3. Start Qdrant + +```bash +docker run -p 6333:6333 -p 6334:6334 \ + -v ~/.noxa/qdrant:/qdrant/storage \ + qdrant/qdrant +``` + +### 4. Run the daemon + +```bash +cargo build --release -p noxa-rag +./target/release/noxa-rag-daemon --config noxa-rag.toml +``` + +### 5. Index content with noxa + +```bash +# Extract a page — the daemon will pick up the output file automatically +noxa https://docs.example.com --output ~/.noxa/output/ +``` + +The daemon watches `watch_dir` for `.json` files. When noxa writes an `ExtractionResult` to that directory, the daemon detects it (within `debounce_ms` ms), chunks it, embeds it, and upserts to Qdrant. + +## Configuration Reference + +| Field | Default | Description | +|-------|---------|-------------| +| `source.watch_dir` | — | Directory to watch for `.json` files | +| `source.debounce_ms` | `500` | Debounce window for filesystem events (ms) | +| `embed_provider.url` | — | TEI server URL | +| `embed_provider.model` | — | Model name (used in logs) | +| `embed_provider.local_path` | **required** | Directory containing `tokenizer.json` | +| `vector_store.url` | — | Qdrant REST URL (port 6333) | +| `vector_store.collection` | — | Qdrant collection name | +| `vector_store.api_key` | `null` | Qdrant API key (or `NOXA_RAG_QDRANT_API_KEY` env var) | +| `chunker.target_tokens` | `512` | Target chunk size in tokens | +| `chunker.overlap_tokens` | `64` | Sliding window overlap tokens | +| `chunker.min_words` | `50` | Skip chunks shorter than this | +| `chunker.max_chunks_per_page` | `100` | Cap chunks per document | +| `pipeline.embed_concurrency` | `4` | Concurrent embed workers (must be > 0) | +| `pipeline.failed_jobs_log` | `null` | Absolute path for NDJSON error log | + +## Architecture + +```text +noxa-cli (writes .json) → watch_dir + ↓ + notify-debouncer-mini (500ms debounce) + ↓ + bounded mpsc channel (256 capacity) + ↓ + embed_concurrency worker tasks (default: 4) + ↓ + ┌─────────────────────────────────────┐ + │ process_job() │ + │ 1. Read file (TOCTOU-safe) │ + │ 2. Parse ExtractionResult JSON │ + │ 3. Validate URL scheme (http/https) │ + │ 4. chunk() → Vec │ + │ 5. embed() → Vec> │ + │ 6. UUID v5 point IDs │ + │ 7. Per-URL mutex: delete + upsert │ + └─────────────────────────────────────┘ + ↓ + Qdrant (REST) +``` + +## Notes + +- **Recursive watch**: The daemon watches `watch_dir` recursively, so crawl output saved under nested path-based directories is indexed automatically. +- **Vim/Emacs compatibility**: The daemon watches all filesystem events (not just Create/Modify). Atomic saves via rename are detected correctly. +- **Idempotent indexing**: Re-indexing the same URL deletes old chunks first (delete-before-upsert), so chunk count changes are handled correctly. 
+- **Point IDs**: UUID v5 deterministic — same URL + chunk index always produces the same Qdrant point ID. +- **Failed jobs**: Parse failures and oversized files (>50MB) are logged to `failed_jobs_log` as NDJSON and skipped (the daemon keeps running). diff --git a/crates/noxa-rag/src/bin/noxa-rag-daemon.rs b/crates/noxa-rag/src/bin/noxa-rag-daemon.rs new file mode 100644 index 0000000..47404e1 --- /dev/null +++ b/crates/noxa-rag/src/bin/noxa-rag-daemon.rs @@ -0,0 +1,190 @@ +use clap::Parser; +/// noxa-rag-daemon — watches an output directory for ExtractionResult JSON files +/// and indexes them via TEI + Qdrant. +/// +/// Usage: +/// noxa-rag-daemon [--config ] [--log-level ] [--version] +use std::path::PathBuf; +use std::sync::Arc; +use tokio_util::sync::CancellationToken; +use tracing_subscriber::EnvFilter; + +use noxa_rag::{ + build_embed_provider, build_vector_store, + config::{EmbedProviderConfig, SourceConfig}, + load_config, + pipeline::Pipeline, +}; + +#[derive(Parser)] +#[command(name = "noxa-rag-daemon", about = "noxa RAG indexing daemon")] +struct Args { + /// Config file path + #[arg(long, default_value = "noxa-rag.toml")] + config: PathBuf, + + /// Log level (overrides RUST_LOG) + #[arg(long, default_value = "info")] + log_level: String, + + /// Print version and exit + #[arg(long)] + version: bool, +} + +#[tokio::main] +async fn main() { + let args = Args::parse(); + + if args.version { + println!("noxa-rag-daemon {}", env!("CARGO_PKG_VERSION")); + std::process::exit(0); + } + + // Init tracing to stderr (stdout may be piped). + tracing_subscriber::fmt() + .with_env_filter( + EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(&args.log_level)), + ) + .with_writer(std::io::stderr) + .init(); + + if let Err(e) = run(args).await { + eprintln!("[noxa-rag] fatal: {e}"); + std::process::exit(1); + } +} + +async fn run(args: Args) -> Result<(), Box> { + let config_path = &args.config; + + // Warn if config file is world-readable (may contain api_key). + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + if let Ok(meta) = std::fs::metadata(config_path) { + let mode = meta.permissions().mode(); + if mode & 0o004 != 0 { + eprintln!( + "[noxa-rag] WARNING: config file is world-readable (mode {:o}). \ + Consider: chmod 600 {}", + mode, + config_path.display() + ); + } + } + } + + // Load config — fail fast with clear error. + let config = load_config(config_path) + .map_err(|e| format!("failed to load config from {}: {e}", config_path.display()))?; + + // Ensure watch_dir exists (create if missing — convenience for first-run). + let watch_dir = match &config.source { + SourceConfig::FsWatcher { watch_dir, .. } => watch_dir.clone(), + }; + + if !watch_dir.exists() { + std::fs::create_dir_all(&watch_dir).map_err(|e| { + format!( + "watch_dir does not exist and could not be created ({}): {e}", + watch_dir.display() + ) + })?; + eprintln!("[noxa-rag] created watch_dir: {}", watch_dir.display()); + } + + // Build embed provider — startup probe (exits 1 if TEI unavailable). + // Returns (provider, dims) so no redundant second probe is needed. + let (embed, embed_dims) = build_embed_provider(&config) + .await + .map_err(|e| format!("embed provider startup failed: {e}"))?; + + // Build vector store — collection create/validate. + let store = build_vector_store(&config, embed_dims) + .await + .map_err(|e| format!("vector store startup failed: {e}"))?; + + // Log collection stats so we know if starting fresh or resuming. 
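+    // (A non-zero point count means an existing index is being resumed; the
+    // pipeline's startup scan will then skip files whose URL + content hash are
+    // already present, rather than re-indexing the whole watch_dir.)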
+ match store.collection_point_count().await { + Ok(n) => tracing::info!(points = n, "collection ready"), + Err(e) => tracing::warn!(error = %e, "could not query collection point count"), + } + + // Load tokenizer. + let tokenizer_model = match &config.embed_provider { + EmbedProviderConfig::Tei { + model, local_path, .. + } => (model.clone(), local_path.clone()), + _ => { + return Err( + "only the TEI embed provider is supported; set [embed_provider] type = \"tei\"" + .into(), + ); + } + }; + + // Rust tokenizers crate has no from_pretrained — local_path is required. + // Download tokenizer.json from HF Hub before running: + // huggingface-cli download Qwen/Qwen3-Embedding-0.6B tokenizer.json --local-dir ./ + let tokenizer = { + let path = tokenizer_model.1.ok_or_else(|| { + format!( + "embed_provider.local_path is required — the Rust tokenizers crate cannot \ + download from HF Hub. Set local_path to the directory containing tokenizer.json.\n\ + Download: huggingface-cli download {} tokenizer.json --local-dir ", + tokenizer_model.0 + ) + })?; + // If given a directory, look for tokenizer.json inside it. + let tokenizer_file = if path.is_dir() { + path.join("tokenizer.json") + } else { + path.clone() + }; + tokenizers::Tokenizer::from_file(&tokenizer_file).map_err(|e| { + format!( + "failed to load tokenizer from {}: {e}", + tokenizer_file.display() + ) + })? + }; + + eprintln!("[noxa-rag] tokenizer: {} — loaded", tokenizer_model.0); + + let shutdown = CancellationToken::new(); + let pipeline = Pipeline::new(config, embed, store, Arc::new(tokenizer), shutdown.clone()); + + // Signal handling: Ctrl-C + SIGTERM -> cancel. + let shutdown_signal = shutdown.clone(); + tokio::spawn(async move { + #[cfg(unix)] + { + use tokio::signal::unix::{SignalKind, signal}; + let mut sigterm = + signal(SignalKind::terminate()).expect("failed to register SIGTERM handler"); + tokio::select! { + _ = tokio::signal::ctrl_c() => {} + _ = sigterm.recv() => {} + } + } + #[cfg(not(unix))] + { + let _ = tokio::signal::ctrl_c().await; + } + eprintln!("[noxa-rag] shutdown signal received"); + shutdown_signal.cancel(); + }); + + eprintln!("[noxa-rag] daemon started"); + + // Run until a shutdown signal is received; the pipeline drains workers + // internally with a 10s timeout before returning. + pipeline + .run() + .await + .map_err(|e| format!("pipeline error: {e}"))?; + + eprintln!("[noxa-rag] daemon stopped"); + Ok(()) +} diff --git a/crates/noxa-rag/src/chunker.rs b/crates/noxa-rag/src/chunker.rs new file mode 100644 index 0000000..c584298 --- /dev/null +++ b/crates/noxa-rag/src/chunker.rs @@ -0,0 +1,175 @@ +use noxa_core::types::ExtractionResult; +use text_splitter::{ChunkConfig, MarkdownSplitter}; +use tokenizers::Tokenizer; + +use crate::config::ChunkerConfig; +use crate::types::Chunk; + +/// Count whitespace-separated words in a string. +fn word_count(s: &str) -> usize { + s.split_whitespace().count() +} + +/// Extract the domain/host from a URL string. +fn extract_domain(url: &str) -> String { + url::Url::parse(url) + .ok() + .and_then(|u| u.host_str().map(|h| h.to_string())) + .unwrap_or_default() +} + +/// Approximate token count — use the tokenizer when possible, fall back to word count. +fn token_estimate(text: &str, tokenizer: &Tokenizer) -> usize { + tokenizer + .encode(text, false) + .map(|enc| enc.len()) + .unwrap_or_else(|_| text.split_whitespace().count()) +} + +/// Build an overlap prefix from the end of `prev_text`, capped at `overlap_tokens` tokens. 
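+///
+/// For illustration (assuming each word costs exactly one token): with
+/// `prev_text = "alpha beta gamma delta"` and `overlap_tokens = 2`, the returned
+/// prefix is `"gamma delta"`.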
+/// +/// Scans backwards through whitespace-separated words, checking the budget before +/// adding each word (so we never exceed `overlap_tokens`). O(n) via a reversed +/// accumulator that is flipped at the end. +fn overlap_prefix(prev_text: &str, overlap_tokens: usize, tokenizer: &Tokenizer) -> String { + if overlap_tokens == 0 || prev_text.is_empty() { + return String::new(); + } + + let words: Vec<&str> = prev_text.split_whitespace().collect(); + if words.is_empty() { + return String::new(); + } + + let mut selected_rev: Vec<&str> = Vec::new(); + let mut token_count = 0usize; + + for &word in words.iter().rev() { + let word_tokens = token_estimate(word, tokenizer); + if token_count + word_tokens > overlap_tokens { + break; + } + token_count += word_tokens; + selected_rev.push(word); + } + + selected_rev.reverse(); + selected_rev.join(" ") +} + +/// Chunk an `ExtractionResult` into a `Vec`. +/// +/// - Uses `content.markdown` if non-empty, otherwise `content.plain_text`. +/// - Empty content (both empty) → `Vec::new()`. +/// - Implements manual sliding-window overlap (text-splitter has no built-in overlap). +/// - Filters chunks below `config.min_words`. +/// - Caps output at `config.max_chunks_per_page`. +pub fn chunk( + result: &ExtractionResult, + config: &ChunkerConfig, + tokenizer: &Tokenizer, +) -> Vec { + // Pick input text: markdown preferred, plain_text fallback. + let text: &str = if !result.content.markdown.is_empty() { + &result.content.markdown + } else if !result.content.plain_text.is_empty() { + &result.content.plain_text + } else { + return Vec::new(); + }; + + // Source URL and domain. + let source_url: String = result.metadata.url.as_deref().unwrap_or("").to_string(); + let domain = extract_domain(&source_url); + + // Build the splitter with a token-range chunk config. + // Use (target - 112)..target as the range; handle pathological configs safely. + let upper = config.target_tokens.max(2); + let lower = upper.saturating_sub(112).max(1); + // Ensure lower < upper so the range is valid. + let lower = lower.min(upper - 1); + + let splitter = + MarkdownSplitter::new(ChunkConfig::new(lower..upper).with_sizer(tokenizer.clone())); + + // Split and collect (char_offset, chunk_text) pairs via chunk_char_indices. + let raw_chunks: Vec<(usize, String)> = splitter + .chunk_char_indices(text) + .map(|ci| (ci.char_offset, ci.chunk.to_string())) + .collect(); + + if raw_chunks.is_empty() { + return Vec::new(); + } + + // Apply sliding-window overlap: each chunk (except the first) gets a prefix + // consisting of the last `overlap_tokens` tokens of the previous raw chunk text. + let mut chunks_with_overlap: Vec<(usize, String)> = Vec::with_capacity(raw_chunks.len()); + + for (i, (offset, chunk_text)) in raw_chunks.iter().enumerate() { + let text_with_overlap: String = if i == 0 || config.overlap_tokens == 0 { + chunk_text.clone() + } else { + let prev_text = &raw_chunks[i - 1].1; + let prefix = overlap_prefix(prev_text, config.overlap_tokens, tokenizer); + if prefix.is_empty() { + chunk_text.clone() + } else { + format!("{}\n\n{}", prefix, chunk_text) + } + }; + chunks_with_overlap.push((*offset, text_with_overlap)); + } + + // Filter by min_words, then cap at max_chunks_per_page. 
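+    // (min_words is measured on the overlapped text, so overlap words count
+    // toward the threshold.)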
+ let filtered: Vec<(usize, String)> = chunks_with_overlap + .into_iter() + .filter(|(_, t)| word_count(t) >= config.min_words) + .take(config.max_chunks_per_page) + .collect(); + + if filtered.is_empty() { + return Vec::new(); + } + + let total_chunks = filtered.len(); + + filtered + .into_iter() + .enumerate() + .map(|(chunk_index, (char_offset, text))| { + let t_est = token_estimate(&text, tokenizer); + Chunk { + text, + source_url: source_url.clone(), + domain: domain.clone(), + chunk_index, + total_chunks, + char_offset, + token_estimate: t_est, + } + }) + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn domain_extraction() { + assert_eq!( + extract_domain("https://docs.example.com/foo"), + "docs.example.com" + ); + assert_eq!(extract_domain(""), ""); + assert_eq!(extract_domain("not-a-url"), ""); + } + + #[test] + fn word_count_basic() { + assert_eq!(word_count("hello world foo"), 3); + assert_eq!(word_count(" "), 0); + assert_eq!(word_count(""), 0); + } +} diff --git a/crates/noxa-rag/src/config.rs b/crates/noxa-rag/src/config.rs new file mode 100644 index 0000000..22422db --- /dev/null +++ b/crates/noxa-rag/src/config.rs @@ -0,0 +1,156 @@ +use serde::Deserialize; +use std::path::{Path, PathBuf}; + +use crate::error::RagError; + +/// Top-level configuration deserialized from noxa-rag.toml. +#[derive(Debug, Clone, Deserialize)] +pub struct RagConfig { + pub source: SourceConfig, + pub embed_provider: EmbedProviderConfig, + pub vector_store: VectorStoreConfig, + pub chunker: ChunkerConfig, + pub pipeline: PipelineConfig, + /// UUID namespace for deterministic point IDs. + /// Default: 6ba7b810-9dad-11d1-80b4-00c04fd430c8 + #[serde(default = "default_uuid_namespace")] + pub uuid_namespace: uuid::Uuid, +} + +fn default_uuid_namespace() -> uuid::Uuid { + uuid::Uuid::parse_str("6ba7b810-9dad-11d1-80b4-00c04fd430c8").unwrap() +} + +#[derive(Debug, Clone, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum SourceConfig { + FsWatcher { + watch_dir: PathBuf, + #[serde(default = "default_debounce_ms")] + debounce_ms: u64, + }, +} + +fn default_debounce_ms() -> u64 { + 500 +} + +#[derive(Debug, Clone, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum EmbedProviderConfig { + Tei { + url: String, + model: String, + /// Optional: load tokenizer from local path (avoids HF Hub at startup). + local_path: Option, + }, + OpenAi { + api_key: String, + model: String, + }, + VoyageAi { + api_key: String, + model: String, + }, +} + +#[derive(Debug, Clone, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum VectorStoreConfig { + Qdrant { + /// REST URL — port 6333 (e.g. http://127.0.0.1:53333 if port-mapped). + url: String, + collection: String, + /// Optional API key. Override with NOXA_RAG_QDRANT_API_KEY env var. + api_key: Option, + }, + /// Dev/test only — factory returns RagError::Config("not implemented"). 
+ InMemory, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct ChunkerConfig { + #[serde(default = "default_target_tokens")] + pub target_tokens: usize, + #[serde(default = "default_overlap_tokens")] + pub overlap_tokens: usize, + #[serde(default = "default_min_words")] + pub min_words: usize, + #[serde(default = "default_max_chunks_per_page")] + pub max_chunks_per_page: usize, +} + +impl Default for ChunkerConfig { + fn default() -> Self { + Self { + target_tokens: default_target_tokens(), + overlap_tokens: default_overlap_tokens(), + min_words: default_min_words(), + max_chunks_per_page: default_max_chunks_per_page(), + } + } +} + +fn default_target_tokens() -> usize { + 512 +} +fn default_overlap_tokens() -> usize { + 64 +} +fn default_min_words() -> usize { + 50 +} +fn default_max_chunks_per_page() -> usize { + 100 +} + +#[derive(Debug, Clone, Deserialize)] +pub struct PipelineConfig { + #[serde(default = "default_embed_concurrency")] + pub embed_concurrency: usize, + /// MUST be an absolute path — systemd daemon runs with CWD = /. + pub failed_jobs_log: Option, +} + +impl Default for PipelineConfig { + fn default() -> Self { + Self { + embed_concurrency: default_embed_concurrency(), + failed_jobs_log: None, + } + } +} + +fn default_embed_concurrency() -> usize { + 4 +} + +/// Load and validate config from a TOML file. +pub fn load_config(path: &Path) -> Result { + let content = std::fs::read_to_string(path).map_err(|e| { + RagError::Config(format!("cannot read config file {}: {}", path.display(), e)) + })?; + + let config: RagConfig = toml::from_str(&content) + .map_err(|e| RagError::Config(format!("config parse error: {}", e)))?; + + // Validate embed_concurrency > 0 + if config.pipeline.embed_concurrency == 0 { + return Err(RagError::Config( + "pipeline.embed_concurrency must be > 0 or no workers will run".to_string(), + )); + } + + // Validate failed_jobs_log is absolute if set + if let Some(ref log_path) = config.pipeline.failed_jobs_log { + if !log_path.is_absolute() { + return Err(RagError::Config(format!( + "pipeline.failed_jobs_log must be an absolute path (got: {}). \ + systemd daemon runs with CWD = / and relative paths resolve there.", + log_path.display() + ))); + } + } + + Ok(config) +} diff --git a/crates/noxa-rag/src/embed/mod.rs b/crates/noxa-rag/src/embed/mod.rs new file mode 100644 index 0000000..b275add --- /dev/null +++ b/crates/noxa-rag/src/embed/mod.rs @@ -0,0 +1,19 @@ +use async_trait::async_trait; +use std::sync::Arc; + +use crate::error::RagError; + +/// Pluggable embedding provider. +/// +/// Trait surface is minimal by design — only what ALL impls share. +/// `is_available()` and `dimensions()` are concrete methods on each provider struct, +/// called during factory startup probes (not via dyn dispatch). +#[async_trait] +pub trait EmbedProvider: Send + Sync { + async fn embed(&self, texts: &[String]) -> Result>, RagError>; +} + +pub type DynEmbedProvider = Arc; + +pub mod tei; +pub use tei::TeiProvider; diff --git a/crates/noxa-rag/src/embed/tei.rs b/crates/noxa-rag/src/embed/tei.rs new file mode 100644 index 0000000..081778a --- /dev/null +++ b/crates/noxa-rag/src/embed/tei.rs @@ -0,0 +1,258 @@ +// TeiProvider — TEI (Text Embeddings Inference) embed provider +// Targets Qwen3-0.6B (1024-dim) served via Hugging Face TEI. +use crate::embed::EmbedProvider; +use crate::error::RagError; +use async_trait::async_trait; + +/// Batch size tuned for RTX 4070 (~3x throughput vs default 32). +const BATCH_SIZE: usize = 96; +/// Reduced batch size on HTTP 413. 
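+/// (Half of BATCH_SIZE: after a 413, `embed` re-sends the failed batch in
+/// sub-batches of this size.)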
+const BATCH_SIZE_REDUCED: usize = 48; +/// Default embedding dimensions for Qwen3-0.6B. +const DEFAULT_DIMENSIONS: usize = 1024; +/// Per-batch request timeout. +const BATCH_TIMEOUT_SECS: u64 = 60; +/// Max retries on 429/503. +const MAX_RETRIES: u32 = 3; + +fn should_retry(status: u16, attempt: u32) -> bool { + (status == 429 || status == 503) && attempt < MAX_RETRIES +} + +#[derive(serde::Serialize)] +struct EmbedRequest<'a> { + inputs: &'a [String], + truncate: bool, + normalize: bool, +} + +pub struct TeiProvider { + pub(crate) client: reqwest::Client, + pub(crate) url: String, + pub(crate) model: String, + pub(crate) dimensions: usize, +} + +impl TeiProvider { + /// Construct with hardcoded dimensions (1024 for Qwen3-0.6B). + pub fn new(url: String, model: String) -> Self { + Self { + client: reqwest::Client::new(), + url, + model, + dimensions: DEFAULT_DIMENSIONS, + } + } + + /// Construct by probing /embed with a single dummy text to discover dimensions. + pub async fn new_with_probe( + url: String, + model: String, + client: reqwest::Client, + ) -> Result { + let dummy = vec!["probe".to_string()]; + let req = EmbedRequest { + inputs: &dummy, + truncate: true, + normalize: true, + }; + let resp = client + .post(format!("{}/embed", url)) + .timeout(std::time::Duration::from_secs(10)) + .json(&req) + .send() + .await?; + + if !resp.status().is_success() { + return Err(RagError::Embed { + message: format!("TEI probe failed with status {}", resp.status()), + status: Some(resp.status().as_u16()), + }); + } + + let vecs: Vec> = resp.json().await?; + let dimensions = + vecs.into_iter() + .next() + .map(|v| v.len()) + .ok_or_else(|| RagError::Embed { + message: "TEI probe returned empty embedding response".to_string(), + status: None, + })?; + + Ok(Self { + client, + url, + model, + dimensions, + }) + } + + /// GET /health — must return 200 within 2 s. + pub async fn is_available(&self) -> bool { + self.client + .get(format!("{}/health", self.url)) + .timeout(std::time::Duration::from_secs(2)) + .send() + .await + .map(|r| r.status().is_success()) + .unwrap_or(false) + } + + pub fn dimensions(&self) -> usize { + self.dimensions + } + + pub fn name(&self) -> &str { + "tei" + } + + /// Send one batch to POST /embed. Handles 429/503 with exponential back-off. + /// Returns Err(RagError::Embed { status: Some(413) }) — caller should halve the batch. + /// + /// `batch_idx` and `total_batches` are passed in from the caller for structured log context. + async fn embed_batch( + &self, + batch: &[String], + batch_idx: usize, + total_batches: usize, + ) -> Result>, RagError> { + let url = format!("{}/embed", self.url); + let req_body = EmbedRequest { + inputs: batch, + truncate: true, + normalize: true, + }; + + let mut delay_ms: u64 = 200; + for attempt in 0..=MAX_RETRIES { + tracing::debug!( + batch = batch_idx + 1, + total_batches, + chunks = batch.len(), + attempt = attempt + 1, + "embedding batch" + ); + + let resp = self + .client + .post(&url) + .timeout(std::time::Duration::from_secs(BATCH_TIMEOUT_SECS)) + .json(&req_body) + .send() + .await?; + + let status = resp.status(); + let status_u16 = status.as_u16(); + + if status.is_success() { + let vecs: Vec> = resp.json().await?; + return Ok(vecs); + } + + if status_u16 == 413 { + // Caller must halve the batch; no point retrying at this size. 
+ tracing::warn!( + batch = batch_idx + 1, + chunks = batch.len(), + reduced_to = batch.len() / 2, + "TEI 413: payload too large, halving batch" + ); + return Err(RagError::Embed { + message: format!( + "TEI returned 413 (payload too large) for batch of {}", + batch.len() + ), + status: Some(status_u16), + }); + } + + if should_retry(status_u16, attempt) { + let body = resp.text().await.unwrap_or_default(); + let preview: String = body.chars().take(512).collect(); + tracing::warn!( + batch = batch_idx + 1, + attempt = attempt + 1, + max_attempts = MAX_RETRIES + 1, + status = status_u16, + delay_ms, + body = preview, + "TEI retry" + ); + tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await; + delay_ms = (delay_ms * 2).min(2_000); + continue; + } + + if status_u16 == 429 || status_u16 == 503 { + break; + } + + let body = resp.text().await.unwrap_or_default(); + let preview: String = body.chars().take(512).collect(); + return Err(RagError::Embed { + message: format!("TEI /embed returned HTTP {status_u16}: {preview}"), + status: Some(status_u16), + }); + } + + Err(RagError::Embed { + message: "TEI /embed: max retries exceeded".to_string(), + status: None, + }) + } +} + +#[async_trait] +impl EmbedProvider for TeiProvider { + async fn embed(&self, texts: &[String]) -> Result>, RagError> { + if texts.is_empty() { + return Ok(vec![]); + } + + let total_batches = (texts.len() + BATCH_SIZE - 1) / BATCH_SIZE; + let mut results: Vec> = Vec::with_capacity(texts.len()); + + for (batch_idx, chunk) in texts.chunks(BATCH_SIZE).enumerate() { + match self.embed_batch(chunk, batch_idx, total_batches).await { + Ok(vecs) => results.extend(vecs), + Err(RagError::Embed { + status: Some(413), .. + }) => { + // Halve batch size and retry. Propagate real errors directly. 
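+                    // A second 413 at the reduced size is not halved again: the
+                    // `?` below propagates it and the whole embed() call fails.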
+ let sub_total = (chunk.len() + BATCH_SIZE_REDUCED - 1) / BATCH_SIZE_REDUCED; + let mut chunk_results: Vec> = Vec::with_capacity(chunk.len()); + for (sub_idx, sub_chunk) in chunk.chunks(BATCH_SIZE_REDUCED).enumerate() { + tracing::debug!( + sub_batch = sub_idx + 1, + sub_total, + chunks = sub_chunk.len(), + "embedding reduced sub-batch after 413" + ); + let vecs = self + .embed_batch(sub_chunk, batch_idx, total_batches) + .await?; + chunk_results.extend(vecs); + } + results.extend(chunk_results); + } + Err(e) => return Err(e), + } + } + + Ok(results) + } +} + +#[cfg(test)] +mod tests { + use super::{MAX_RETRIES, should_retry}; + + #[test] + fn retry_limit_counts_retries_not_total_attempts() { + assert!(should_retry(429, 0)); + assert!(should_retry(503, MAX_RETRIES - 1)); + assert!(!should_retry(429, MAX_RETRIES)); + assert!(!should_retry(500, 0)); + } +} diff --git a/crates/noxa-rag/src/error.rs b/crates/noxa-rag/src/error.rs new file mode 100644 index 0000000..a04132d --- /dev/null +++ b/crates/noxa-rag/src/error.rs @@ -0,0 +1,47 @@ +use thiserror::Error; + +#[non_exhaustive] +#[derive(Debug, Error)] +pub enum RagError { + #[error("embed error: {message}")] + Embed { + message: String, + status: Option, + }, + #[error("store error: {0}")] + Store(String), + #[error("chunk error: {0}")] + Chunk(String), + #[error("config error: {0}")] + Config(String), + #[error("io error: {0}")] + Io(#[from] std::io::Error), + #[error("http error: {0}")] + Http(#[from] reqwest::Error), + #[error("json error: {0}")] + Json(#[from] serde_json::Error), + #[error("parse error: {0}")] + Parse(String), + #[error("error: {0}")] + Generic(String), +} + +#[cfg(test)] +mod tests { + use super::RagError; + + #[test] + fn embed_error_exposes_status() { + let err = RagError::Embed { + message: "payload too large".to_string(), + status: Some(413), + }; + + match err { + RagError::Embed { + status: Some(413), .. + } => {} + other => panic!("expected structured 413 embed error, got {other:?}"), + } + } +} diff --git a/crates/noxa-rag/src/factory.rs b/crates/noxa-rag/src/factory.rs new file mode 100644 index 0000000..8b2b9d6 --- /dev/null +++ b/crates/noxa-rag/src/factory.rs @@ -0,0 +1,122 @@ +use std::sync::Arc; + +use crate::config::{EmbedProviderConfig, RagConfig, VectorStoreConfig}; +use crate::embed::{DynEmbedProvider, TeiProvider}; +use crate::error::RagError; +use crate::store::{DynVectorStore, QdrantStore, VectorStore}; + +/// Build the embed provider from config, running a startup probe. +/// +/// Returns `(provider, dims)` so callers can use the probed dimensions directly +/// without a redundant second probe. +/// +/// Fails fast at startup if the provider is unavailable or returns wrong dimensions. +/// `is_available()` and `dimensions()` are concrete methods on the provider struct, +/// called here directly (not via dyn dispatch). +pub async fn build_embed_provider( + config: &RagConfig, +) -> Result<(DynEmbedProvider, usize), RagError> { + match &config.embed_provider { + EmbedProviderConfig::Tei { url, model, .. } => { + let client = reqwest::Client::new(); + let provider = TeiProvider::new_with_probe(url.clone(), model.clone(), client) + .await + .map_err(|e| RagError::Config(format!("TEI startup probe failed: {e}")))?; + + if !provider.is_available().await { + return Err(RagError::Config(format!( + "TEI provider at {} is not available (GET /health failed). 
\ + Ensure TEI is running with --pooling last-token for Qwen3-0.6B.", + url + ))); + } + + let dims = provider.dimensions(); + if dims == 0 { + return Err(RagError::Config( + "TEI provider returned 0 dimensions — probe failed silently".to_string(), + )); + } + + tracing::info!( + provider = provider.name(), + dims, + url = %url, + "embed provider ready" + ); + + Ok((Arc::new(provider), dims)) + } + EmbedProviderConfig::OpenAi { .. } => Err(RagError::Config( + "OpenAI embed provider not implemented — use tei for phase 1".to_string(), + )), + EmbedProviderConfig::VoyageAi { .. } => Err(RagError::Config( + "VoyageAI embed provider not implemented — use tei for phase 1".to_string(), + )), + } +} + +/// Build the vector store from config, running collection lifecycle checks. +/// +/// Creates the collection if missing; fails if existing collection has wrong dimensions. +/// `collection_exists()` and `create_collection()` are concrete methods on QdrantStore, +/// called here directly (not via dyn dispatch). +pub async fn build_vector_store( + config: &RagConfig, + embed_dims: usize, +) -> Result { + match &config.vector_store { + VectorStoreConfig::Qdrant { + url, + collection, + api_key, + } => { + // Resolve api_key: config value takes precedence, env var as fallback. + let resolved_api_key = api_key + .clone() + .or_else(|| std::env::var("NOXA_RAG_QDRANT_API_KEY").ok()); + + let store = QdrantStore::new( + url, + collection.clone(), + resolved_api_key, + config.uuid_namespace, + )?; + + // Collection lifecycle: create if missing, validate dims if exists. + if store.collection_exists().await? { + // Validate that the existing collection's vector size matches embed dims. + // Fail fast if there is a mismatch rather than letting upsert fail later + // with a confusing Qdrant error. + let existing_dims = store.collection_vector_size().await?; + if existing_dims != embed_dims { + return Err(RagError::Config(format!( + "existing Qdrant collection {collection:?} has {existing_dims}-dim vectors \ + but embed provider outputs {embed_dims} dims — delete the collection or \ + switch to a matching embed model" + ))); + } + tracing::info!( + collection = %collection, + dims = existing_dims, + "collection already exists with matching dimensions" + ); + } else { + tracing::info!(collection = %collection, dims = embed_dims, "creating collection"); + store.create_collection(embed_dims).await?; + } + + tracing::info!( + store = store.name(), + collection = %collection, + url = %url, + "vector store ready" + ); + + Ok(Arc::new(store)) + } + VectorStoreConfig::InMemory => Err(RagError::Config( + "InMemory vector store not implemented — use testcontainers-rs for tests".to_string(), + )), + } +} diff --git a/crates/noxa-rag/src/lib.rs b/crates/noxa-rag/src/lib.rs new file mode 100644 index 0000000..0eae453 --- /dev/null +++ b/crates/noxa-rag/src/lib.rs @@ -0,0 +1,40 @@ +/// noxa-rag — RAG pipeline crate. +/// +/// Watches noxa output directory for ExtractionResult JSON files, +/// chunks them, embeds via TEI, and upserts to Qdrant. +/// +/// # Crate structure +/// - `embed` — EmbedProvider trait + TeiProvider impl +/// - `store` — VectorStore trait + QdrantStore impl +/// - `chunker` — ExtractionResult → Vec +/// - `config` — RagConfig (TOML deserialization) +/// - `factory` — build_embed_provider / build_vector_store +/// - `pipeline` — filesystem watcher orchestration +/// - `error` — RagError enum + +// Tokenizer Sync compile-time assertion. 
+// tokenizers::Tokenizer must be Sync to be used across tokio workers. +// If this fails to compile, workers cannot safely share the tokenizer. +const _: () = { + fn assert_sync() {} + fn _check() { + assert_sync::(); + } +}; + +pub mod chunker; +pub mod config; +pub mod embed; +pub mod error; +pub mod factory; +pub mod pipeline; +pub mod store; +pub mod types; + +// Re-export most-used types at crate root +pub use config::{RagConfig, load_config}; +pub use embed::{DynEmbedProvider, EmbedProvider}; +pub use error::RagError; +pub use factory::{build_embed_provider, build_vector_store}; +pub use store::{DynVectorStore, VectorStore}; +pub use types::{Chunk, Point, PointPayload, SearchResult}; diff --git a/crates/noxa-rag/src/pipeline.rs b/crates/noxa-rag/src/pipeline.rs new file mode 100644 index 0000000..83de743 --- /dev/null +++ b/crates/noxa-rag/src/pipeline.rs @@ -0,0 +1,1868 @@ +// Pipeline — filesystem watcher → chunk → embed → upsert +// +// Architecture: +// notify-debouncer-mini (sync mpsc) → spawn_blocking bridge → tokio mpsc IndexJob queue +// → embed_concurrency worker tasks → process_job() +// +// Key design decisions: +// - Carry tracing::Span in IndexJob; tokio::spawn would drop it otherwise. +// - Per-URL mutex (DashMap>>) prevents concurrent delete+upsert races. +// - Workers bounded to embed_concurrency provide natural backpressure without a separate semaphore. +// - notify-debouncer-mini 0.4.x uses a callback/sender API, not a receiver() method. +// We use std::sync::mpsc::Sender as the handler and bridge via spawn_blocking. + +use std::fs; +use std::net::IpAddr; +use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use dashmap::DashMap; +use notify::RecursiveMode; +use notify_debouncer_mini::{DebounceEventResult, new_debouncer}; +use tokio::io::AsyncReadExt; +use tokio_util::sync::CancellationToken; +use tracing::Instrument; + +use noxa_core::types::ExtractionResult; +use tokenizers::Tokenizer; + +use crate::chunker; +use crate::config::{RagConfig, SourceConfig}; +use crate::embed::DynEmbedProvider; +use crate::error::RagError; +use crate::store::DynVectorStore; +use crate::types::{Point, PointPayload}; + +// ─── Session counters ───────────────────────────────────────────────────────── + +/// Shared session metrics updated by workers and read by the heartbeat/shutdown tasks. +#[derive(Default)] +struct SessionCounters { + files_indexed: AtomicUsize, + files_failed: AtomicUsize, + total_chunks: AtomicUsize, + total_embed_ms: AtomicU64, + total_upsert_ms: AtomicU64, +} + +// ─── IndexJob ──────────────────────────────────────────────────────────────── + +/// A unit of work: index the .json file at `path`. +/// The tracing `span` is carried explicitly because tokio::spawn does NOT +/// automatically propagate the current span into the new task. +struct IndexJob { + path: PathBuf, + span: tracing::Span, +} + +// ─── Pipeline ──────────────────────────────────────────────────────────────── + +pub struct Pipeline { + pub config: RagConfig, + pub embed: DynEmbedProvider, + pub store: DynVectorStore, + pub tokenizer: Arc, + pub shutdown: CancellationToken, + /// Per-URL mutex: prevents concurrent delete-then-upsert races for the same URL. + url_locks: Arc>>>, + /// Session-level metrics shared between workers, heartbeat, and shutdown tasks. 
+ counters: Arc, +} + +impl Pipeline { + pub fn new( + config: RagConfig, + embed: DynEmbedProvider, + store: DynVectorStore, + tokenizer: Arc, + shutdown: CancellationToken, + ) -> Self { + Self { + config, + embed, + store, + tokenizer, + shutdown, + url_locks: Arc::new(DashMap::new()), + counters: Arc::new(SessionCounters::default()), + } + } + + /// Run the filesystem watcher pipeline. + /// + /// Returns when the CancellationToken is cancelled. + pub async fn run(&self) -> Result<(), RagError> { + // Extract watch config. + let (watch_dir, debounce_ms) = match &self.config.source { + SourceConfig::FsWatcher { + watch_dir, + debounce_ms, + } => (watch_dir.clone(), *debounce_ms), + }; + + if self.config.pipeline.embed_concurrency == 0 { + return Err(RagError::Config( + "pipeline.embed_concurrency must be > 0 or no workers will run".to_string(), + )); + } + + tracing::info!( + watch_dir = %watch_dir.display(), + debounce_ms, + embed_concurrency = self.config.pipeline.embed_concurrency, + "pipeline starting" + ); + + // Bounded job queue: backpressure at 256 queued jobs. + let (tx, rx) = tokio::sync::mpsc::channel::(256); + + // Spawn worker pool — each worker owns a cloned rx. + // We share a single receiver via Arc> so all workers + // compete fairly for jobs. + let rx = Arc::new(tokio::sync::Mutex::new(rx)); + let mut worker_handles = Vec::with_capacity(self.config.pipeline.embed_concurrency); + + for worker_id in 0..self.config.pipeline.embed_concurrency { + let rx = rx.clone(); + let embed = self.embed.clone(); + let store = self.store.clone(); + let tokenizer = self.tokenizer.clone(); + let config = self.config.clone(); + let url_locks = self.url_locks.clone(); + let counters = self.counters.clone(); + + let handle = tokio::spawn(async move { + tracing::debug!(worker_id, "index worker started"); + loop { + let job = { + let mut guard = rx.lock().await; + guard.recv().await + }; + match job { + Some(job) => { + let span = job.span.clone(); + async { + match process_job( + job, &embed, &store, &tokenizer, &config, &url_locks, + ) + .await + { + Ok(stats) => { + if stats.chunks > 0 { + counters.files_indexed.fetch_add(1, Ordering::Relaxed); + } + counters + .total_chunks + .fetch_add(stats.chunks, Ordering::Relaxed); + counters + .total_embed_ms + .fetch_add(stats.embed_ms, Ordering::Relaxed); + counters + .total_upsert_ms + .fetch_add(stats.upsert_ms, Ordering::Relaxed); + } + Err(e) => { + tracing::error!(error = %e, "index job failed"); + counters.files_failed.fetch_add(1, Ordering::Relaxed); + } + } + } + .instrument(span) + .await; + } + None => { + // Sender dropped — workers drain and exit. + tracing::debug!(worker_id, "index worker shutting down"); + break; + } + } + } + }); + + worker_handles.push(handle); + } + + // Build notify debouncer with a *bounded* sync channel as the event handler. + // notify-debouncer-mini 0.4.x implements DebounceEventHandler for + // std::sync::mpsc::Sender (unbounded) but not SyncSender, so we wrap + // SyncSender in a small newtype. When the bridge is blocked on + // blocking_send (Tokio queue full) the sync_channel fills and the + // debouncer's send() call blocks too — closing the backpressure loop. + struct BoundedSender(std::sync::mpsc::SyncSender); + impl notify_debouncer_mini::DebounceEventHandler for BoundedSender { + fn handle_event(&mut self, event: DebounceEventResult) { + // Blocks when the channel is full, propagating backpressure. 
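+                // A send error can only mean the bridge's receiver has been
+                // dropped (shutdown in progress), so the event is discarded deliberately.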
+ let _ = self.0.send(event); + } + } + + let (notify_tx, notify_rx) = std::sync::mpsc::sync_channel::(256); + + let mut debouncer = + new_debouncer(Duration::from_millis(debounce_ms), BoundedSender(notify_tx)) + .map_err(|e| RagError::Generic(format!("failed to create fs watcher: {e}")))?; + + debouncer + .watcher() + .watch(&watch_dir, RecursiveMode::Recursive) + .map_err(|e| { + RagError::Generic(format!( + "failed to watch directory {}: {e}", + watch_dir.display() + )) + })?; + + tracing::info!(path = %watch_dir.display(), "watching directory recursively"); + + // Bridge: wrap the blocking notify_rx.recv() in spawn_blocking so it + // doesn't block the tokio reactor. Send jobs to the tokio job queue. + let shutdown_clone = self.shutdown.clone(); + let tx_clone = tx.clone(); + + let bridge_handle = tokio::task::spawn_blocking(move || { + // Keep `debouncer` alive for the duration of this thread. + let _debouncer = debouncer; + + loop { + // recv_timeout lets us periodically check whether we should stop. + // We check every 250 ms regardless of debounce setting. + match notify_rx.recv_timeout(Duration::from_millis(250)) { + Ok(Ok(events)) => { + if shutdown_clone.is_cancelled() { + break; + } + for event in events { + for path in collect_indexable_paths(&event.path) { + let span = tracing::info_span!( + "index_job", + path = %path.display(), + ); + let job = IndexJob { path, span }; + // Retry with a short sleep so shutdown can interrupt a full queue. + let mut pending_job = job; + let mut saturated_logged = false; + loop { + match tx_clone.try_send(pending_job) { + Ok(()) => break, + Err(tokio::sync::mpsc::error::TrySendError::Full(job)) => { + if shutdown_clone.is_cancelled() { + break; + } + if !saturated_logged { + tracing::warn!( + "job queue saturated (256/256), \ + backing off — embed/upsert catching up" + ); + saturated_logged = true; + } + pending_job = job; + std::thread::sleep(Duration::from_millis(10)); + } + Err(tokio::sync::mpsc::error::TrySendError::Closed(_)) => { + // Receiver dropped — workers are done; exit. + return; + } + } + } + } + } + } + Ok(Err(e)) => { + tracing::warn!(error = ?e, "fs watcher error"); + } + Err(std::sync::mpsc::RecvTimeoutError::Timeout) => { + // Check if we should stop. + if shutdown_clone.is_cancelled() { + break; + } + } + Err(std::sync::mpsc::RecvTimeoutError::Disconnected) => { + break; + } + } + } + + tracing::info!("fs watcher bridge exiting"); + }); + + // Startup scan: index files already present in watch_dir when the daemon starts. + // + // Runs concurrently with the watcher so new events are not missed during the scan. + // collect_indexable_paths uses std::fs (sync) — MUST run in spawn_blocking to avoid + // stalling the tokio executor on NFS/CIFS with thousands of files. + // + // Delta detection: before enqueuing a path, compute SHA-256 of its bytes and check + // Qdrant. If a point with the same URL + content_hash already exists, the file has + // not changed and is skipped. This prevents re-indexing the entire watch_dir on + // every daemon restart. 
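+    //
+    // A minimal sketch of the content-hash half of that key, assuming a
+    // hex-encoded SHA-256 over the raw file bytes (the sha2 dependency exists for
+    // this; the actual computation lives in `startup_scan_key`):
+    //
+    //     use sha2::{Digest, Sha256};
+    //     let hash: String = Sha256::digest(&bytes)
+    //         .iter()
+    //         .map(|b| format!("{b:02x}"))
+    //         .collect();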
+ let scan_tx = tx.clone(); + let scan_store = self.store.clone(); + let scan_shutdown = self.shutdown.clone(); + let scan_watch_dir = watch_dir.clone(); + + let startup_handle = tokio::spawn(async move { + let paths = match tokio::task::spawn_blocking({ + let dir = scan_watch_dir.clone(); + move || collect_indexable_paths(&dir) + }) + .await + { + Ok(p) => p, + Err(e) => { + tracing::error!(error = %e, "startup scan: collect_indexable_paths panicked"); + return; + } + }; + + let total = paths.len(); + tracing::info!(count = total, "startup scan: checking files for delta"); + + let mut queued = 0usize; + let mut skipped = 0usize; + + for path in paths { + if scan_shutdown.is_cancelled() { + break; + } + + // Read file + compute URL+hash in spawn_blocking (sync file I/O). + let path2 = path.clone(); + let hash_and_url = tokio::task::spawn_blocking(move || { + startup_scan_key(&path2) + }) + .await + .ok() + .flatten(); + + let (hash, url) = match hash_and_url { + Some(t) => t, + None => { + // Cannot determine URL/hash — enqueue conservatively. + tracing::debug!(path = %path.display(), "startup scan: no url/hash, queuing"); + let span = tracing::info_span!("index_job", path = %path.display()); + tokio::select! { + _ = scan_tx.send(IndexJob { path, span }) => {} + _ = scan_shutdown.cancelled() => { break; } + } + queued += 1; + continue; + } + }; + + // Delta check — skip files already indexed with the same content. + // On Qdrant error: conservative (assume not indexed, re-enqueue). + match scan_store.url_with_hash_exists(&url, &hash).await { + Ok(true) => { + skipped += 1; + tracing::debug!( + path = %path.display(), + url = %url, + "startup scan: already indexed, skipping" + ); + } + Ok(false) => { + let span = tracing::info_span!("index_job", path = %path.display()); + tokio::select! { + _ = scan_tx.send(IndexJob { path, span }) => {} + _ = scan_shutdown.cancelled() => { break; } + } + queued += 1; + } + Err(e) => { + tracing::warn!( + path = %path.display(), + error = %e, + "startup scan: delta check failed, re-enqueueing conservatively" + ); + let span = tracing::info_span!("index_job", path = %path.display()); + tokio::select! { + _ = scan_tx.send(IndexJob { path, span }) => {} + _ = scan_shutdown.cancelled() => { break; } + } + queued += 1; + } + } + } + + tracing::info!(total, queued, skipped, "startup scan complete"); + }); + + // Heartbeat: log pipeline health every 60s. + let heartbeat_counters = self.counters.clone(); + let heartbeat_shutdown = self.shutdown.clone(); + let session_start = Instant::now(); + let heartbeat_handle = tokio::spawn(async move { + let mut interval = tokio::time::interval(Duration::from_secs(60)); + interval.tick().await; // consume immediate first tick + loop { + tokio::select! { + _ = interval.tick() => { + let uptime_m = session_start.elapsed().as_secs() / 60; + tracing::info!( + indexed = heartbeat_counters.files_indexed.load(Ordering::Relaxed), + failed = heartbeat_counters.files_failed.load(Ordering::Relaxed), + uptime_m, + "pipeline alive" + ); + } + _ = heartbeat_shutdown.cancelled() => break, + } + } + }); + + // Wait for cancellation signal. + self.shutdown.cancelled().await; + tracing::info!("shutdown signal received, draining pipeline"); + + // Drop tx so workers drain their queues and exit. + drop(tx); + + // Wait for bridge, heartbeat, and startup scan to finish. 
+ let _ = bridge_handle.await; + let _ = heartbeat_handle.await; + let _ = startup_handle.await; + + // Wait for all workers to drain — 10s hard limit to prevent a stuck + // job from blocking indefinite shutdown. + let drain = async { + for handle in worker_handles { + let _ = handle.await; + } + }; + match tokio::time::timeout(Duration::from_secs(10), drain).await { + Ok(_) => tracing::info!("pipeline shut down cleanly"), + Err(_) => { + tracing::warn!("workers did not drain within 10s, forcing exit"); + return Err(RagError::Generic( + "workers did not drain within 10s".to_string(), + )); + } + } + + // Shutdown session summary. + let indexed = self.counters.files_indexed.load(Ordering::Relaxed); + let failed = self.counters.files_failed.load(Ordering::Relaxed); + let chunks = self.counters.total_chunks.load(Ordering::Relaxed); + let embed_ms = self.counters.total_embed_ms.load(Ordering::Relaxed); + let upsert_ms = self.counters.total_upsert_ms.load(Ordering::Relaxed); + let avg_embed_ms = if indexed > 0 { embed_ms / indexed as u64 } else { 0 }; + let avg_upsert_ms = if indexed > 0 { upsert_ms / indexed as u64 } else { 0 }; + tracing::info!( + indexed, + failed, + chunks, + avg_embed_ms, + avg_upsert_ms, + duration_s = session_start.elapsed().as_secs(), + "session complete" + ); + + Ok(()) + } +} + +// ─── Helpers ───────────────────────────────────────────────────────────────── + +/// Returns true iff the path has a supported extension AND exists on disk. +/// +/// We check existence because rename events (vim/emacs atomic saves) may fire for +/// temp files that are gone by the time we process them. +/// +/// Deferred (no confirmed use case, would add new crate deps): .epub, .eml, .mbox +fn is_indexable(path: &Path) -> bool { + let Some(ext) = path.extension().and_then(|e| e.to_str()) else { + return false; + }; + matches!( + ext, + // ExtractionResult JSON (primary watch-dir format) + "json" + // Plain text + | "md" | "txt" | "log" | "rst" | "org" | "yaml" | "yml" | "toml" + // HTML + | "html" | "htm" + // Notebook + | "ipynb" + // Binary document (via noxa-pdf / zip unpack) + | "pdf" | "docx" | "odt" | "pptx" + // Structured data + | "jsonl" | "xml" | "opml" + // Subtitle / transcript + | "vtt" | "srt" + // RSS / Atom + | "rss" | "atom" + ) && path.exists() +} + +fn collect_indexable_paths(path: &Path) -> Vec { + if is_indexable(path) { + return vec![path.to_path_buf()]; + } + + if !path.is_dir() { + return Vec::new(); + } + + let mut found = Vec::new(); + collect_indexable_paths_recursive(path, &mut found); + found.sort(); + found +} + +fn collect_indexable_paths_recursive(path: &Path, found: &mut Vec) { + let Ok(entries) = fs::read_dir(path) else { + return; + }; + + for entry in entries.flatten() { + let entry_path = entry.path(); + // Never follow symlinks — prevents watch_dir/root -> / traversal attacks. + if entry_path.is_symlink() { + tracing::debug!(path = %entry_path.display(), "skipping symlink"); + continue; + } + if is_indexable(&entry_path) { + found.push(entry_path); + } else if entry_path.is_dir() { + collect_indexable_paths_recursive(&entry_path, found); + } + } +} + +/// Returns true iff `host` resolves to a private/loopback/link-local address. 
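+///
+/// For example: `"127.0.0.1"`, `"10.1.2.3"`, and `"fe80::1"` are treated as
+/// private, while `"93.184.216.34"` is not. Non-literal hostnames return `false`
+/// here; `localhost` is rejected separately by `validate_url_scheme`.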
+fn is_private_ip(host: &str) -> bool { + if let Ok(addr) = host.parse::() { + return match addr { + IpAddr::V4(ip) => ip.is_private() || ip.is_loopback() || ip.is_link_local(), + IpAddr::V6(ip) => { + ip.is_loopback() || ip.is_unique_local() || ip.is_unicast_link_local() + } + }; + } + false +} + +/// Validate that `url` uses http or https and does not point to a private IP. +fn validate_url_scheme(url: &str) -> Result<(), RagError> { + if url.is_empty() { + return Err(RagError::Generic( + "extraction result has no URL".to_string(), + )); + } + let parsed = + url::Url::parse(url).map_err(|e| RagError::Generic(format!("invalid URL {url:?}: {e}")))?; + + match parsed.scheme() { + "http" | "https" => { + // Block private/loopback IP literals and localhost for remote schemes. + if let Some(host) = parsed.host_str() { + if is_private_ip(host) { + return Err(RagError::Generic(format!( + "URL {url:?} uses a private/loopback IP literal as its host — indexing blocked" + ))); + } + if host.eq_ignore_ascii_case("localhost") { + return Err(RagError::Generic( + "URL points to localhost — indexing blocked".to_string(), + )); + } + } + } + "file" => { + // Local file:// only — no remote file://server/path references. + // RFC 8089 allows `file://localhost/path` as equivalent to `file:///path`. + match parsed.host_str() { + None | Some("") | Some("localhost") => {} + Some(host) => { + return Err(RagError::Generic(format!( + "file:// URL with remote host {host:?} is not allowed (only local paths)" + ))); + } + } + } + other => { + return Err(RagError::Generic(format!( + "URL scheme {other:?} is not allowed (only http/https/file)" + ))); + } + } + + Ok(()) +} + +// ─── Format dispatch ───────────────────────────────────────────────────────── + +/// Parse a local file into a normalised `ExtractionResult` for the RAG pipeline. +/// +/// Dispatches to the right extractor based on file extension. Heavy / CPU-bound +/// formats (PDF, DOCX, ipynb) run inside `spawn_blocking` so the tokio executor +/// is never stalled. All formats set: +/// - `metadata.url` = file:// URI (percent-encoded, via url crate) +/// - `metadata.domain` = NOT set here — "local" sentinel set in process_job +/// - `metadata.source_type` = "file" +/// - `metadata.title` = filename stem (unless the format provides a better one) +/// +/// Returns `Err(RagError::Parse(...))` on unrecoverable format errors. +async fn parse_file(path: &Path, bytes: Vec) -> Result { + let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("json"); + let file_url = url::Url::from_file_path(path) + .map(|u| u.to_string()) + .unwrap_or_else(|_| path.to_string_lossy().into_owned()); + let title = path + .file_stem() + .map(|s| s.to_string_lossy().into_owned()) + .unwrap_or_default(); + + // Helper: bytes → UTF-8 String with replacement for invalid sequences. 
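+    // (Invalid UTF-8 sequences become U+FFFD replacement characters instead of
+    // failing the whole file.)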
+ let as_text = |b: &[u8]| String::from_utf8_lossy(b).into_owned(); + + match ext { + // ── JSON ExtractionResult ────────────────────────────────────────────── + "json" => serde_json::from_slice::(&bytes) + .map_err(|e| RagError::Parse(format!("JSON parse failed: {e}"))), + + // ── Plain text group (.md .txt .log .rst .org .yaml .yml .toml) ─────── + "md" | "rst" | "org" => { + let content = as_text(&bytes); + let word_count = content.split_whitespace().count(); + Ok(make_text_result(content, String::new(), file_url, Some(title), "file", word_count)) + } + "txt" | "yaml" | "yml" | "toml" => { + let content = as_text(&bytes); + let word_count = content.split_whitespace().count(); + Ok(make_text_result( + content.clone(), + content, + file_url, + Some(title), + "file", + word_count, + )) + } + "log" => { + let raw = as_text(&bytes); + let stripped = strip_ansi_escapes::strip_str(&raw); + let word_count = stripped.split_whitespace().count(); + Ok(make_text_result( + stripped.clone(), + stripped, + file_url, + Some(title), + "file", + word_count, + )) + } + + // ── HTML ─────────────────────────────────────────────────────────────── + "html" | "htm" => { + let html = as_text(&bytes); + let url_for_extract = file_url.clone(); + tokio::task::spawn_blocking(move || -> Result { + let mut r = noxa_core::extract(&html, Some(&url_for_extract)) + .map_err(|e| RagError::Parse(format!("HTML extract: {e}")))?; + r.metadata.url = Some(url_for_extract); + r.metadata.source_type = Some("file".to_string()); + Ok(r) + }) + .await + .map_err(|e| RagError::Parse(format!("HTML spawn_blocking: {e}")))? + } + + // ── Jupyter Notebook ────────────────────────────────────────────────── + "ipynb" => { + tokio::task::spawn_blocking(move || parse_ipynb(&bytes, file_url, title)) + .await + .map_err(|e| RagError::Parse(format!("ipynb spawn_blocking: {e}")))? + } + + // ── PDF ──────────────────────────────────────────────────────────────── + "pdf" => { + tokio::task::spawn_blocking(move || parse_pdf(&bytes, file_url, title)) + .await + .map_err(|e| RagError::Parse(format!("PDF spawn_blocking: {e}")))? + } + + // ── Office binary formats (ZIP-based) ───────────────────────────────── + "docx" => { + tokio::task::spawn_blocking(move || parse_office_zip(&bytes, file_url, title, "docx")) + .await + .map_err(|e| RagError::Parse(format!("DOCX spawn_blocking: {e}")))? + } + "odt" => { + tokio::task::spawn_blocking(move || parse_office_zip(&bytes, file_url, title, "odt")) + .await + .map_err(|e| RagError::Parse(format!("ODT spawn_blocking: {e}")))? + } + "pptx" => { + tokio::task::spawn_blocking(move || parse_office_zip(&bytes, file_url, title, "pptx")) + .await + .map_err(|e| RagError::Parse(format!("PPTX spawn_blocking: {e}")))? 
+ } + + // ── Structured text (.jsonl .xml .opml .rss .atom) ──────────────────── + "jsonl" => { + let content = as_text(&bytes); + let text = content + .lines() + .filter_map(|line| { + let v: serde_json::Value = serde_json::from_str(line).ok()?; + ["text", "content", "body", "message", "value"] + .iter() + .find_map(|k| v[k].as_str().map(str::to_string)) + }) + .collect::>() + .join("\n\n"); + let word_count = text.split_whitespace().count(); + Ok(make_text_result(text.clone(), text, file_url, Some(title), "file", word_count)) + } + "xml" | "opml" | "rss" | "atom" => { + let content = as_text(&bytes); + let text = extract_xml_text(&content); + let word_count = text.split_whitespace().count(); + Ok(make_text_result(text.clone(), text, file_url, Some(title), "file", word_count)) + } + + // ── Subtitle / transcript (.vtt .srt) ───────────────────────────────── + "vtt" | "srt" => { + let content = as_text(&bytes); + let text = strip_subtitle_timestamps(&content); + let word_count = text.split_whitespace().count(); + Ok(make_text_result(text.clone(), text, file_url, Some(title), "file", word_count)) + } + + // ── Unknown / unsupported ────────────────────────────────────────────── + other => Err(RagError::Parse(format!("unsupported file extension: .{other}"))), + } +} + +/// Build a minimal ExtractionResult from pre-extracted text. +fn make_text_result( + markdown: String, + plain_text: String, + url: String, + title: Option, + source_type: &str, + word_count: usize, +) -> ExtractionResult { + ExtractionResult { + metadata: noxa_core::Metadata { + title, + description: None, + author: None, + published_date: None, + language: None, + url: Some(url), + site_name: None, + image: None, + favicon: None, + word_count, + content_hash: None, // filled by process_job if needed + source_type: Some(source_type.to_string()), + file_path: None, // filled by process_job + last_modified: None, // filled by process_job + is_truncated: None, + technologies: Vec::new(), + seed_url: None, + crawl_depth: None, + search_query: None, + fetched_at: None, + }, + content: noxa_core::Content { + markdown, + plain_text, + links: Vec::new(), + images: Vec::new(), + code_blocks: Vec::new(), + raw_html: None, + }, + domain_data: None, + structured_data: Vec::new(), + } +} + +/// Parse a Jupyter Notebook (.ipynb) — must run in spawn_blocking. +/// +/// Extracts source from code + markdown cells only. +/// **Strips cell outputs** to prevent indexing of stack traces, env dumps, or PII. +fn parse_ipynb(bytes: &[u8], url: String, title: String) -> Result { + let v: serde_json::Value = serde_json::from_slice(bytes) + .map_err(|e| RagError::Parse(format!("ipynb JSON parse: {e}")))?; + + let cells = v["cells"] + .as_array() + .ok_or_else(|| RagError::Parse("ipynb: missing 'cells' array".to_string()))?; + + let mut parts: Vec = Vec::new(); + for cell in cells { + let cell_type = cell["cell_type"].as_str().unwrap_or(""); + if !matches!(cell_type, "markdown" | "code") { + continue; + } + // source is either a string or an array of strings. + let source = match &cell["source"] { + serde_json::Value::String(s) => s.clone(), + serde_json::Value::Array(lines) => lines + .iter() + .filter_map(|l| l.as_str()) + .collect::(), + _ => continue, + }; + // Skip empty cells. + let trimmed = source.trim(); + if !trimmed.is_empty() { + parts.push(trimmed.to_string()); + } + // Outputs are intentionally NOT indexed (may contain PII/env dumps). 
+ } + + let text = parts.join("\n\n"); + let word_count = text.split_whitespace().count(); + Ok(make_text_result(text.clone(), text, url, Some(title), "notebook", word_count)) +} + +/// Extract text from a PDF — must run in spawn_blocking. +fn parse_pdf(bytes: &[u8], url: String, title: String) -> Result { + let result = noxa_pdf::extract_pdf( + bytes, + noxa_pdf::PdfMode::Auto, + ) + .map_err(|e| RagError::Parse(format!("PDF extract: {e}")))?; + let text = noxa_pdf::to_markdown(&result); + let word_count = text.split_whitespace().count(); + Ok(make_text_result(text.clone(), text, url, Some(title), "file", word_count)) +} + +/// Shared ZIP-based office parser for DOCX, ODT, PPTX — must run in spawn_blocking. +/// +/// Uses noxa-fetch's tested DOCX extractor for .docx. +/// ODT and PPTX are extracted via ZIP text-node scan (sufficient for indexing). +/// +/// **Decompressed-size guard**: entries > 100 MiB or archives > 1 000 entries +/// are rejected to prevent zip-bomb DoS. +fn parse_office_zip( + bytes: &[u8], + url: String, + title: String, + ext: &str, +) -> Result { + use std::io::Read; + + const MAX_ENTRY_SIZE: u64 = 100 * 1024 * 1024; // 100 MiB decompressed + const MAX_ENTRIES: usize = 1_000; + + let cursor = std::io::Cursor::new(bytes); + let mut archive = zip::ZipArchive::new(cursor) + .map_err(|e| RagError::Parse(format!("{ext} ZIP open: {e}")))?; + + if archive.len() > MAX_ENTRIES { + return Err(RagError::Parse(format!( + "{ext}: archive has {} entries (max {MAX_ENTRIES}) — possible zip bomb", + archive.len() + ))); + } + + // For DOCX, delegate to the tested noxa-fetch extractor. + // Check each entry's decompressed size first — MAX_ENTRIES ran above but + // the per-entry size guard is inside the ODT/PPTX loop which is skipped + // for DOCX. Guard here so a crafted DOCX zip bomb cannot cause OOM. + if ext == "docx" { + for i in 0..archive.len() { + if let Ok(entry) = archive.by_index(i) { + if entry.size() > MAX_ENTRY_SIZE { + return Err(RagError::Parse(format!( + "docx: entry '{}' decompresses to {} bytes (max 100 MiB) — possible zip bomb", + entry.name(), + entry.size() + ))); + } + } + } + let result = noxa_fetch::document::extract_document(bytes, noxa_fetch::document::DocType::Docx) + .map_err(|e| RagError::Parse(format!("DOCX extract: {e}")))?; + let mut r = result; + r.metadata.url = Some(url); + r.metadata.source_type = Some("file".to_string()); + if r.metadata.title.is_none() { + r.metadata.title = Some(title); + } + return Ok(r); + } + + // ODT and PPTX: scan all XML entries for text nodes. + // ODT: content.xml; PPTX: ppt/slides/slide*.xml + let target_prefix = match ext { + "odt" => "content", + "pptx" => "ppt/slides/slide", + _ => "", + }; + + let mut text_parts: Vec = Vec::new(); + for i in 0..archive.len() { + let mut entry = archive + .by_index(i) + .map_err(|e| RagError::Parse(format!("{ext} entry {i}: {e}")))?; + + if entry.size() > MAX_ENTRY_SIZE { + return Err(RagError::Parse(format!( + "{ext}: entry '{}' decompresses to {} bytes (max 100 MiB) — possible zip bomb", + entry.name(), + entry.size() + ))); + } + + let name = entry.name().to_string(); + if !name.ends_with(".xml") { + continue; + } + if !target_prefix.is_empty() && !name.contains(target_prefix) { + continue; + } + + let mut xml_buf = String::new(); + entry + .read_to_string(&mut xml_buf) + .map_err(|e| RagError::Parse(format!("{ext} read '{name}': {e}")))?; + + // Simple text-node extraction via quick-xml. 
+ let fragment = extract_xml_text(&xml_buf); + if !fragment.trim().is_empty() { + text_parts.push(fragment); + } + } + + let text = text_parts.join("\n\n"); + let word_count = text.split_whitespace().count(); + Ok(make_text_result( + text.clone(), + text, + url, + Some(title), + "file", + word_count, + )) +} + +/// Extract plain text from XML/OPML/RSS/Atom by collecting all text nodes. +/// Strips all tags; trims and deduplicates blank lines. +fn extract_xml_text(xml: &str) -> String { + use quick_xml::Reader; + use quick_xml::events::Event; + + let mut reader = Reader::from_str(xml); + let mut parts: Vec = Vec::new(); + + loop { + match reader.read_event() { + Ok(Event::Text(e)) => { + if let Ok(text) = e.unescape() { + let t = text.trim().to_string(); + if !t.is_empty() { + parts.push(t); + } + } + } + Ok(Event::Eof) | Err(_) => break, + _ => {} + } + } + + parts.join("\n") +} + +/// Strip timestamp / cue header lines from WebVTT and SRT subtitles. +/// Keeps only the spoken text lines. +fn strip_subtitle_timestamps(content: &str) -> String { + let mut lines: Vec<&str> = Vec::new(); + for line in content.lines() { + let trimmed = line.trim(); + // Skip WEBVTT header, blank lines as separators, cue timecodes, + // numeric cue identifiers (SRT), and NOTE/STYLE/REGION blocks. + if trimmed.is_empty() + || trimmed.starts_with("WEBVTT") + || trimmed.starts_with("NOTE") + || trimmed.starts_with("STYLE") + || trimmed.starts_with("REGION") + || trimmed.contains("-->") + || trimmed.chars().all(|c| c.is_ascii_digit()) + { + continue; + } + lines.push(trimmed); + } + lines.join(" ") +} + +/// Compute the (content_hash, url) key used by the startup delta scan. +/// +/// For `.json` ExtractionResult files: peeks at `metadata.url` and `metadata.content_hash` +/// from inside the JSON (fast, avoids full deserialisation of large markdown content). +/// Falls back to file:// URL + SHA-256 of file bytes if the JSON lacks a URL. +/// +/// For all other formats: returns file:// URL + SHA-256 of file bytes. +/// +/// Returns `None` when the file cannot be read or a file:// URL cannot be constructed. +/// +/// **Must be called inside `spawn_blocking`** — this function reads from disk synchronously. +fn startup_scan_key(path: &std::path::Path) -> Option<(String, String)> { + use sha2::Digest; + + let bytes = std::fs::read(path).ok()?; + + if path.extension().and_then(|e| e.to_str()) == Some("json") { + // Partial deserialisation: only decode the metadata header, not the full content. + #[derive(serde::Deserialize)] + struct Q { + metadata: QM, + } + #[derive(serde::Deserialize)] + struct QM { + url: Option, + content_hash: Option, + } + if let Ok(q) = serde_json::from_slice::(&bytes) { + let hash = q + .metadata + .content_hash + .unwrap_or_else(|| format!("{:x}", sha2::Sha256::digest(&bytes))); + if let Some(url) = q.metadata.url { + if !url.is_empty() { + return Some((hash, url)); + } + } + } + } + + // Non-JSON or JSON without a stored URL: use file:// + SHA-256 of file bytes. + let hash = format!("{:x}", sha2::Sha256::digest(&bytes)); + let url = url::Url::from_file_path(path).ok()?.to_string(); + Some((hash, url)) +} + +/// Walk up the directory tree from `file_path` to find a `.git/HEAD` file. +/// +/// Reads the HEAD ref to extract the branch name: `ref: refs/heads/`. +/// Returns `None` when not in a git repo, on detached HEAD, or on any I/O error. +/// Uses only file reads — no subprocess, no git binary required. 
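+///
+/// Example: with `.git/HEAD` containing `ref: refs/heads/main`, any file under the
+/// repository yields `Some("main")`; a detached HEAD (a bare commit SHA) yields `None`.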
+fn detect_git_branch(file_path: &Path) -> Option { + let mut dir = file_path.parent()?; + loop { + let head = dir.join(".git").join("HEAD"); + if head.exists() { + let content = std::fs::read_to_string(&head).ok()?; + // `ref: refs/heads/main\n` → `main` + return content.trim().strip_prefix("ref: refs/heads/").map(str::to_string); + } + dir = dir.parent()?; + } +} + +/// Append a failed-job record to the configured log file (NDJSON format). +/// Silently ignores if no log path is configured. +async fn append_failed_job(path: &Path, error: &impl std::fmt::Display, config: &RagConfig) { + let Some(ref log_path) = config.pipeline.failed_jobs_log else { + return; + }; + let entry = serde_json::json!({ + "path": path.to_string_lossy(), + "error": error.to_string(), + "ts": chrono::Utc::now().to_rfc3339(), + }); + if let Ok(mut file) = tokio::fs::OpenOptions::new() + .create(true) + .append(true) + .open(log_path) + .await + { + use tokio::io::AsyncWriteExt; + let _ = file.write_all(format!("{}\n", entry).as_bytes()).await; + } +} + +// ─── Core processing ───────────────────────────────────────────────────────── + +/// Per-job timing and volume stats reported back to the worker loop. +struct JobStats { + chunks: usize, + embed_ms: u64, + upsert_ms: u64, +} + +async fn process_job( + job: IndexJob, + embed: &DynEmbedProvider, + store: &DynVectorStore, + tokenizer: &Arc, + config: &RagConfig, + url_locks: &Arc>>>, +) -> Result { + let job_start = Instant::now(); + + // ── 1. Open file and check size from the same FD (TOCTOU fix) ──────────── + let t0 = Instant::now(); + let mut file = tokio::fs::File::open(&job.path).await?; + let file_meta = file.metadata().await?; + let size = file_meta.len(); + + // Path confinement check — guard against TOCTOU rename/hardlink attacks. + // Canonicalize resolves any symlink components in the path itself. + let canonical = tokio::fs::canonicalize(&job.path).await.map_err(|e| { + RagError::Generic(format!( + "canonicalize failed for {}: {e}", + job.path.display() + )) + })?; + let watch_dir = match &config.source { + SourceConfig::FsWatcher { watch_dir, .. } => watch_dir.clone(), + }; + let watch_canonical = tokio::fs::canonicalize(&watch_dir).await.map_err(|e| { + RagError::Generic(format!("canonicalize watch_dir failed: {e}")) + })?; + if !canonical.starts_with(&watch_canonical) { + tracing::warn!( + path = %job.path.display(), + "path outside watch_dir — skipping (potential TOCTOU attack)" + ); + return Ok(JobStats { chunks: 0, embed_ms: 0, upsert_ms: 0 }); + } + + const MAX_FILE_SIZE_BYTES: u64 = 50 * 1024 * 1024; // 50 MiB + if size > MAX_FILE_SIZE_BYTES { + tracing::warn!( + path = ?job.path, + size, + "file too large (>50MB), skipping" + ); + return Ok(JobStats { chunks: 0, embed_ms: 0, upsert_ms: 0 }); + } + + // Read as bytes so binary formats (PDF, DOCX, PPTX, ODT) are handled correctly. + // Text formats convert bytes → String inside parse_file with UTF-8 replacement. + let mut file_bytes: Vec = Vec::with_capacity(size as usize); + file.read_to_end(&mut file_bytes).await?; + let parse_ms = t0.elapsed().as_millis() as u64; + + // ── 2. Parse / ingest by file format ───────────────────────────────────── + // parse_file() dispatches to the right extractor for each format and returns + // a normalized ExtractionResult. Non-JSON formats run in spawn_blocking. 
+ let mut result: ExtractionResult = match parse_file(&job.path, file_bytes).await { + Ok(r) => r, + Err(e) => { + tracing::warn!(path = ?job.path, error = %e, "parse failed, skipping"); + append_failed_job(&job.path, &e, config).await; + return Ok(JobStats { chunks: 0, embed_ms: 0, upsert_ms: 0 }); + } + }; + + // ── 3a. Populate filesystem provenance (noxa-9ww) ───────────────────────── + // Set file_path and last_modified from job.path if not already populated + // by the source tool or ingester. git_branch is read from .git/HEAD walk-up. + if result.metadata.file_path.is_none() { + result.metadata.file_path = Some(job.path.to_string_lossy().into_owned()); + } + if result.metadata.last_modified.is_none() { + if let Ok(mtime) = file_meta.modified() { + result.metadata.last_modified = + Some(chrono::DateTime::::from(mtime).to_rfc3339()); + } + } + let git_branch = detect_git_branch(&job.path); + + // ── 3b. URL validation ──────────────────────────────────────────────────── + let raw_url = result.metadata.url.as_deref().unwrap_or("").to_string(); + if let Err(e) = validate_url_scheme(&raw_url) { + tracing::warn!(path = ?job.path, error = %e, "url validation failed, skipping"); + return Ok(JobStats { chunks: 0, embed_ms: 0, upsert_ms: 0 }); + } + // Normalize so the mutex key and stored payload match what delete_by_url queries. + let url = crate::store::qdrant::normalize_url(&raw_url); + + // ── 4. Chunk ───────────────────────────────────────────────────────────── + let t1 = Instant::now(); + let chunks = chunker::chunk(&result, &config.chunker, tokenizer); + if chunks.is_empty() { + tracing::info!(url = %url, "no indexable content after chunking"); + return Ok(JobStats { chunks: 0, embed_ms: 0, upsert_ms: 0 }); + } + let chunk_ms = t1.elapsed().as_millis() as u64; + + // ── 5. Embed ────────────────────────────────────────────────────────────── + let texts: Vec = chunks.iter().map(|c| c.text.clone()).collect(); + let total_tokens: u64 = chunks.iter().map(|c| c.token_estimate as u64).sum(); + let t2 = Instant::now(); + let vectors = embed.embed(&texts).await?; + let embed_ms = t2.elapsed().as_millis() as u64; + let embed_tokens_per_sec = if embed_ms > 0 { + total_tokens * 1_000 / embed_ms + } else { + 0 + }; + + if vectors.len() != chunks.len() { + return Err(RagError::Embed { + message: format!( + "embed returned {} vectors for {} chunks", + vectors.len(), + chunks.len() + ), + status: None, + }); + } + + // ── 6. Build points with deterministic UUID v5 ──────────────────────────── + // Use the normalized URL for both the UUID seed and payload.url so that + // delete_by_url (which also normalizes) matches the stored value for any + // equivalent URL form (trailing slash, fragment, etc.). 
+ let n_chunks = chunks.len(); + let points: Vec = chunks + .iter() + .zip(vectors.iter()) + .enumerate() + .map(|(i, (chunk, vector))| { + let id = uuid::Uuid::new_v5( + &config.uuid_namespace, + format!("{}#chunk{}", url, i).as_bytes(), + ); + Point { + id, + vector: vector.clone(), + payload: PointPayload { + text: chunk.text.clone(), + url: url.clone(), + domain: chunk.domain.clone(), + chunk_index: chunk.chunk_index, + total_chunks: chunk.total_chunks, + token_estimate: chunk.token_estimate, + title: result.metadata.title.clone(), + author: result.metadata.author.clone(), + published_date: result.metadata.published_date.clone(), + language: result.metadata.language.clone(), + source_type: result.metadata.source_type.clone(), + content_hash: result.metadata.content_hash.clone(), + technologies: result.metadata.technologies.clone(), + is_truncated: result.metadata.is_truncated, + file_path: result.metadata.file_path.clone(), + last_modified: result.metadata.last_modified.clone(), + git_branch: git_branch.clone(), + // IngestionContext provenance fields — populated in Wave 3 by MCP sources. + external_id: None, + platform_url: None, + seed_url: None, + search_query: None, + crawl_depth: None, + }, + } + }) + .collect(); + + // ── 7. Per-URL mutex: delete-then-upsert under lock ─────────────────────── + let url_lock = url_locks + .entry(url.clone()) + .or_insert_with(|| Arc::new(tokio::sync::Mutex::new(()))) + .clone(); + let _guard = url_lock.lock().await; + + // Two-phase replace: upsert new points first, then delete stale ones. + // + // This avoids the data-loss window of delete-before-upsert: if the upsert + // succeeds but the stale cleanup fails, the old points remain alongside the + // new ones (harmless duplicate chunks) until the next file event. The + // reverse was dangerous — a transient store blip after delete but before + // upsert left the document completely unindexed. + // + // UUIDs are v5 deterministic (url + chunk_index), so re-indexing is always + // idempotent and duplicate chunks are deduplicated on the next pass. + // + // Capture the result instead of returning immediately so we can always run + // the eviction logic below, even on error paths. 
+ let new_ids: Vec = points.iter().map(|p| p.id).collect(); + let t3 = Instant::now(); + let store_result: Result = async { + let t4 = Instant::now(); + let upserted = store.upsert(points).await.map_err(|e| { + tracing::error!(url = %url, error = %e, "upsert failed"); + e + })?; + let upsert_ms = t4.elapsed().as_millis() as u64; + + let stale = store.delete_stale_by_url(&url, &new_ids).await.map_err(|e| { + tracing::warn!( + url = %url, + error = %e, + "stale cleanup failed after upsert — duplicate chunks until next file event" + ); + e + })?; + let delete_ms = t3.elapsed().as_millis() as u64 - upsert_ms; + + if stale > 0 { + tracing::info!( + url = %url, + format = "json", + chunks = upserted, + stale_deleted = stale, + embed_tokens = total_tokens, + embed_tokens_per_sec, + parse_ms, + chunk_ms, + embed_ms, + delete_ms, + upsert_ms, + total_ms = job_start.elapsed().as_millis() as u64, + "reindexed" + ); + } else { + tracing::info!( + url = %url, + format = "json", + chunks = upserted, + embed_tokens = total_tokens, + embed_tokens_per_sec, + parse_ms, + chunk_ms, + embed_ms, + delete_ms, + upsert_ms, + total_ms = job_start.elapsed().as_millis() as u64, + "indexed" + ); + } + + Ok(upsert_ms) + } + .await; + + // Always evict the lock entry — including on error paths — to prevent + // unbounded DashMap growth during store outages. + drop(_guard); + // Drop the local Arc clone before eviction check so strong_count reaches 1. + drop(url_lock); + url_locks.remove_if(&url, |_, v| Arc::strong_count(v) == 1); + + let upsert_ms = store_result?; + + Ok(JobStats { chunks: n_chunks, embed_ms, upsert_ms }) +} + +#[cfg(test)] +mod tests { + use super::{collect_indexable_paths, detect_git_branch, is_indexable, parse_file, validate_url_scheme}; + use std::fs; + use std::io::Write; + + #[test] + fn collect_indexable_paths_finds_nested_supported_files() { + let tmp = tempfile::tempdir().expect("tempdir"); + let root = tmp.path(); + let nested = root.join("docs/get-started"); + fs::create_dir_all(&nested).expect("create nested dirs"); + fs::write(root.join("top.json"), "{}").expect("write top-level json"); + fs::write(nested.join("guide.json"), "{}").expect("write nested json"); + // .epub is explicitly deferred — should NOT be returned. 
+ fs::write(nested.join("ignore.epub"), "nope").expect("write deferred extension"); + + let paths = collect_indexable_paths(root); + let rendered: Vec = paths + .into_iter() + .map(|p| p.strip_prefix(root).unwrap().display().to_string()) + .collect(); + + assert_eq!(rendered, vec!["docs/get-started/guide.json", "top.json"]); + } + + #[test] + fn is_indexable_accepts_all_supported_extensions() { + let tmp = tempfile::tempdir().expect("tempdir"); + let root = tmp.path(); + for ext in &[ + "json", "md", "txt", "log", "rst", "org", "yaml", "yml", "toml", "html", "htm", + "ipynb", "pdf", "docx", "odt", "pptx", "jsonl", "xml", "opml", "vtt", "srt", "rss", + "atom", + ] { + let path = root.join(format!("file.{ext}")); + fs::write(&path, "x").expect("write file"); + assert!(is_indexable(&path), ".{ext} should be indexable"); + } + } + + #[test] + fn is_indexable_rejects_deferred_extensions() { + let tmp = tempfile::tempdir().expect("tempdir"); + let root = tmp.path(); + for ext in &["epub", "eml", "mbox"] { + let path = root.join(format!("file.{ext}")); + fs::write(&path, "x").expect("write file"); + assert!(!is_indexable(&path), ".{ext} should NOT be indexable (deferred)"); + } + } + + #[test] + fn detect_git_branch_returns_none_outside_repo() { + let tmp = tempfile::tempdir().expect("tempdir"); + let file = tmp.path().join("foo.txt"); + fs::write(&file, "x").expect("write file"); + assert_eq!(detect_git_branch(&file), None); + } + + #[test] + fn detect_git_branch_reads_head_file() { + let tmp = tempfile::tempdir().expect("tempdir"); + let git_dir = tmp.path().join(".git"); + fs::create_dir_all(&git_dir).expect("create .git"); + fs::write(git_dir.join("HEAD"), "ref: refs/heads/feature/noxa-rag\n") + .expect("write HEAD"); + let nested = tmp.path().join("src/foo.rs"); + fs::create_dir_all(nested.parent().unwrap()).expect("create src"); + fs::write(&nested, "x").expect("write file"); + assert_eq!( + detect_git_branch(&nested), + Some("feature/noxa-rag".to_string()) + ); + } + + #[test] + fn detect_git_branch_returns_none_on_detached_head() { + let tmp = tempfile::tempdir().expect("tempdir"); + let git_dir = tmp.path().join(".git"); + fs::create_dir_all(&git_dir).expect("create .git"); + // Detached HEAD: just a commit SHA, no "ref: refs/heads/" prefix. + fs::write(git_dir.join("HEAD"), "abc123def456\n").expect("write HEAD"); + let file = tmp.path().join("foo.txt"); + fs::write(&file, "x").expect("write file"); + assert_eq!(detect_git_branch(&file), None); + } + + // ─── validate_url_scheme ──────────────────────────────────────────────────── + + #[test] + fn validate_url_scheme_accepts_file_local_path() { + // file:///path/to/file — no host component — must be accepted. + assert!( + validate_url_scheme("file:///tmp/foo.md").is_ok(), + "file:/// should be accepted for local file ingestion" + ); + } + + #[test] + fn validate_url_scheme_accepts_file_localhost_host() { + // RFC 8089: file://localhost/path is equivalent to file:///path. 
+ assert!( + validate_url_scheme("file://localhost/tmp/foo.md").is_ok(), + "file://localhost/ should be accepted per RFC 8089" + ); + } + + #[test] + fn validate_url_scheme_rejects_file_with_remote_host() { + let result = validate_url_scheme("file://remoteserver/share/doc.txt"); + assert!( + result.is_err(), + "file:// with a non-localhost host should be rejected" + ); + let msg = result.unwrap_err().to_string(); + assert!( + msg.contains("remote host") || msg.contains("not allowed"), + "error message should mention remote host, got: {msg}" + ); + } + + #[test] + fn validate_url_scheme_accepts_https() { + assert!(validate_url_scheme("https://example.com/page").is_ok()); + } + + #[test] + fn validate_url_scheme_rejects_ftp() { + let result = validate_url_scheme("ftp://example.com/file.txt"); + assert!(result.is_err()); + } + + #[test] + fn validate_url_scheme_rejects_empty_url() { + assert!(validate_url_scheme("").is_err()); + } + + // ─── is_indexable additional coverage ────────────────────────────────────── + + #[test] + fn is_indexable_rejects_binary_and_unknown_extensions() { + let tmp = tempfile::tempdir().expect("tempdir"); + let root = tmp.path(); + for ext in &["exe", "png", "jpg", "gif", "zip", "unknown", "dll"] { + let path = root.join(format!("file.{ext}")); + fs::write(&path, "x").expect("write file"); + assert!(!is_indexable(&path), ".{ext} should NOT be indexable"); + } + } + + #[test] + fn is_indexable_returns_false_for_nonexistent_file() { + // Even a supported extension must fail if the file doesn't exist. + let path = std::path::Path::new("/nonexistent/path/file.md"); + assert!(!is_indexable(path)); + } + + // ─── collect_indexable_paths: broader extension coverage ─────────────────── + + #[test] + fn collect_indexable_paths_finds_md_html_ipynb_and_json() { + let tmp = tempfile::tempdir().expect("tempdir"); + let root = tmp.path(); + fs::write(root.join("readme.md"), "# Hello").expect("write md"); + fs::write(root.join("page.html"), "").expect("write html"); + fs::write(root.join("notebook.ipynb"), r#"{"cells":[],"metadata":{},"nbformat":4,"nbformat_minor":4}"#).expect("write ipynb"); + fs::write(root.join("result.json"), "{}").expect("write json"); + // Binary extensions should be ignored. + fs::write(root.join("photo.png"), "data").expect("write png"); + + let paths = collect_indexable_paths(root); + let names: Vec = paths + .into_iter() + .map(|p| p.file_name().unwrap().to_string_lossy().into_owned()) + .collect(); + + assert!(names.contains(&"readme.md".to_string()), "should collect .md"); + assert!(names.contains(&"page.html".to_string()), "should collect .html"); + assert!(names.contains(&"notebook.ipynb".to_string()), "should collect .ipynb"); + assert!(names.contains(&"result.json".to_string()), "should collect .json"); + assert!(!names.contains(&"photo.png".to_string()), "should NOT collect .png"); + } + + // ─── parse_file: plain text formats ──────────────────────────────────────── + + async fn run_parse_file( + dir: &std::path::Path, + filename: &str, + content: &[u8], + ) -> Result { + let path = dir.join(filename); + fs::write(&path, content).expect("write temp file"); + parse_file(&path, content.to_vec()).await + } + + #[tokio::test] + async fn parse_file_md_sets_url_title_and_markdown() { + let tmp = tempfile::tempdir().expect("tempdir"); + let content = b"# My Document\n\nSome content here."; + let result = run_parse_file(tmp.path(), "my-doc.md", content) + .await + .expect("parse .md"); + + // URL must be a file:// URI pointing at the file. 
+ let url = result.metadata.url.as_deref().expect("url must be set"); + assert!(url.starts_with("file://"), "url should be file://, got: {url}"); + assert!(url.contains("my-doc"), "url should contain filename stem, got: {url}"); + + // Title should be the filename stem. + let title = result.metadata.title.as_deref().expect("title must be set"); + assert_eq!(title, "my-doc"); + + // Markdown content must be present. + assert!( + !result.content.markdown.is_empty(), + "markdown should not be empty" + ); + assert!( + result.content.markdown.contains("My Document"), + "markdown should contain heading text" + ); + + // source_type should be "file". + assert_eq!(result.metadata.source_type.as_deref(), Some("file")); + } + + #[tokio::test] + async fn parse_file_txt_populates_plain_text() { + let tmp = tempfile::tempdir().expect("tempdir"); + let content = b"Hello plain text world."; + let result = run_parse_file(tmp.path(), "notes.txt", content) + .await + .expect("parse .txt"); + + // .txt uses make_text_result with both markdown and plain_text set to the content. + assert!( + result.content.plain_text.contains("Hello plain text world"), + "plain_text should contain file content, got: {:?}", + result.content.plain_text + ); + assert_eq!(result.metadata.title.as_deref(), Some("notes")); + } + + #[tokio::test] + async fn parse_file_rst_org_yaml_toml_group_returns_content() { + let tmp = tempfile::tempdir().expect("tempdir"); + let cases = [ + ("doc.rst", b"Section\n=======\n\nRST content." as &[u8]), + ("notes.org", b"* Heading\n\nOrg content."), + ("config.yaml", b"key: value\nother: 42"), + ("settings.toml", b"[section]\nkey = \"value\""), + ]; + for (filename, content) in cases { + let result = run_parse_file(tmp.path(), filename, content) + .await + .unwrap_or_else(|e| panic!("parse {filename} failed: {e}")); + assert!( + !result.content.markdown.is_empty(), + "{filename}: markdown should not be empty" + ); + let url = result.metadata.url.as_deref().expect("url set"); + assert!(url.starts_with("file://"), "{filename}: url should be file://"); + } + } + + #[tokio::test] + async fn parse_file_log_strips_ansi_escapes() { + let tmp = tempfile::tempdir().expect("tempdir"); + // ESC[32m = green colour; ESC[0m = reset. + let content = b"\x1b[32mINFO\x1b[0m server started on port 8080"; + let result = run_parse_file(tmp.path(), "server.log", content) + .await + .expect("parse .log"); + + let text = &result.content.markdown; + assert!( + !text.contains('\x1b'), + "ANSI escape sequences should be stripped, got: {text:?}" + ); + assert!( + text.contains("INFO"), + "visible text should remain after stripping, got: {text:?}" + ); + assert!( + text.contains("server started"), + "full message should be present, got: {text:?}" + ); + } + + #[tokio::test] + async fn parse_file_html_populates_extraction_result() { + let tmp = tempfile::tempdir().expect("tempdir"); + let html = b"

+<html><head><title>Hello</title></head>
+<body>
+<p>World content paragraph.</p>
+</body></html>
"; + let result = run_parse_file(tmp.path(), "page.html", html) + .await + .expect("parse .html"); + + // URL must be set to a file:// URI. + let url = result.metadata.url.as_deref().expect("url must be set for html"); + assert!(url.starts_with("file://"), "html url should be file://, got: {url}"); + + // source_type must be "file". + assert_eq!(result.metadata.source_type.as_deref(), Some("file")); + + // Markdown should contain extracted text. + assert!( + !result.content.markdown.is_empty(), + "html markdown should not be empty" + ); + } + + #[tokio::test] + async fn parse_file_ipynb_concatenates_cell_sources_and_strips_outputs() { + let tmp = tempfile::tempdir().expect("tempdir"); + // Minimal notebook: one markdown cell, one code cell with outputs. + let notebook = b"\ +{\"cells\": [\ +{\"cell_type\": \"markdown\", \"source\": [\"# Introduction\", \"This is the intro.\"]},\ +{\"cell_type\": \"code\", \"source\": [\"print(x)\"], \"outputs\": [{\"output_type\": \"stream\", \"text\": [\"result\"]}]},\ +{\"cell_type\": \"raw\", \"source\": [\"raw cell should be ignored\"]}\ +], \"metadata\": {}, \"nbformat\": 4, \"nbformat_minor\": 4}"; + + let result = run_parse_file(tmp.path(), "analysis.ipynb", notebook) + .await + .expect("parse .ipynb"); + + let text = &result.content.markdown; + + // Markdown and code cell sources must be present. + assert!( + text.contains("Introduction"), + "markdown cell heading should appear, got: {text:?}" + ); + assert!( + text.contains("print"), + "code cell source should appear, got: {text:?}" + ); + + // Outputs must NOT appear (we strip them to avoid PII/env dumps). + assert!( + !text.contains("output_type"), + "cell outputs should be stripped, got: {text:?}" + ); + + // Raw cells must NOT appear. + assert!( + !text.contains("raw cell"), + "raw cells should be ignored, got: {text:?}" + ); + } + + // ─── Minimal ZIP builder helpers ────────────────────────────────────────── + + /// Build a minimal valid DOCX in-memory: a ZIP containing word/document.xml + /// with one paragraph of text. + fn build_minimal_docx(paragraph_text: &str) -> Vec { + let xml = format!( + r#" + + + + + {} + + + +"#, + paragraph_text + ); + + let buf = std::io::Cursor::new(Vec::new()); + let mut zip = zip::ZipWriter::new(buf); + let options: zip::write::SimpleFileOptions = zip::write::SimpleFileOptions::default() + .compression_method(zip::CompressionMethod::Stored); + zip.start_file("word/document.xml", options).expect("start_file"); + zip.write_all(xml.as_bytes()).expect("write xml"); + let cursor = zip.finish().expect("finish zip"); + cursor.into_inner() + } + + /// Build a minimal valid ODT in-memory: a ZIP containing content.xml. 
+ fn build_minimal_odt(paragraph_text: &str) -> Vec { + let xml = format!( + r#" + + + + {} + + +"#, + paragraph_text + ); + + let buf = std::io::Cursor::new(Vec::new()); + let mut zip = zip::ZipWriter::new(buf); + let options: zip::write::SimpleFileOptions = zip::write::SimpleFileOptions::default() + .compression_method(zip::CompressionMethod::Stored); + zip.start_file("content.xml", options).expect("start_file odt"); + zip.write_all(xml.as_bytes()).expect("write odt xml"); + let cursor = zip.finish().expect("finish odt zip"); + cursor.into_inner() + } + + #[tokio::test] + async fn parse_file_docx_produces_non_empty_content() { + let tmp = tempfile::tempdir().expect("tempdir"); + let docx_bytes = build_minimal_docx("This is a test document paragraph."); + let path = tmp.path().join("report.docx"); + fs::write(&path, &docx_bytes).expect("write docx"); + + let result = parse_file(&path, docx_bytes) + .await + .expect("parse .docx should succeed"); + + let text = &result.content.markdown; + assert!( + !text.is_empty(), + "DOCX markdown should not be empty" + ); + assert!( + text.contains("test document paragraph"), + "DOCX text should contain paragraph content, got: {text:?}" + ); + // URL must be a file:// reference. + let url = result.metadata.url.as_deref().expect("docx url set"); + assert!(url.starts_with("file://"), "docx url should be file://, got: {url}"); + } + + #[tokio::test] + async fn parse_file_odt_produces_non_empty_content() { + let tmp = tempfile::tempdir().expect("tempdir"); + let odt_bytes = build_minimal_odt("Open document text paragraph content."); + let path = tmp.path().join("document.odt"); + fs::write(&path, &odt_bytes).expect("write odt"); + + let result = parse_file(&path, odt_bytes) + .await + .expect("parse .odt should succeed"); + + let text = &result.content.markdown; + assert!( + !text.is_empty(), + "ODT markdown should not be empty" + ); + assert!( + text.contains("Open document text paragraph"), + "ODT text should contain paragraph content, got: {text:?}" + ); + } + + // ─── PDF test ────────────────────────────────────────────────────────────── + // NOTE: EPUB is explicitly deferred in is_indexable() — no .epub arm in parse_file(). + // Skipping .epub test per bead instructions. + + #[tokio::test] + async fn parse_file_pdf_produces_non_empty_content_from_valid_fixture() { + // Minimal syntactically-valid PDF with one text object. + // This is the smallest PDF that pdf-extract can successfully decode. + let pdf_bytes: &[u8] = b"%PDF-1.4\n\ + 1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n\ + 2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n\ + 3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792]\n\ + /Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>\nendobj\n\ + 4 0 obj\n<< /Length 44 >>\nstream\n\ + BT /F1 12 Tf 100 700 Td (Hello PDF) Tj ET\n\ + endstream\nendobj\n\ + 5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n\ + xref\n0 6\n\ + 0000000000 65535 f \n\ + 0000000009 00000 n \n\ + 0000000058 00000 n \n\ + 0000000115 00000 n \n\ + 0000000266 00000 n \n\ + 0000000360 00000 n \n\ + trailer\n<< /Size 6 /Root 1 0 R >>\n\ + startxref\n441\n%%EOF\n"; + + let tmp = tempfile::tempdir().expect("tempdir"); + let path = tmp.path().join("sample.pdf"); + fs::write(&path, pdf_bytes).expect("write pdf"); + + let result = parse_file(&path, pdf_bytes.to_vec()).await; + + // PDF extraction either succeeds with content or fails cleanly with Parse error. 
+        // We do NOT require the lopdf-based extractor to decode this minimal PDF perfectly,
+        // but we do require that it returns either Ok or a well-formed RagError — never a panic.
+        match result {
+            Ok(r) => {
+                // If it parsed successfully, the result must have a file:// URL set.
+                let url = r.metadata.url.as_deref().expect("pdf url should be set on Ok");
+                assert!(url.starts_with("file://"), "pdf url should be file://");
+                // Content may be empty for this trivial fixture depending on the extractor.
+                // At minimum verify we got a valid ExtractionResult structure back.
+                let _ = r.content.markdown; // no panic
+            }
+            Err(crate::error::RagError::Parse(_)) => {
+                // Acceptable: the minimal fixture may not have enough structure for pdf-extract.
+                // The important thing is it returns a typed error, not a panic.
+            }
+            Err(other) => {
+                panic!("PDF parse returned unexpected error variant: {other:?}");
+            }
+        }
+    }
+
+    #[tokio::test]
+    async fn parse_file_rejects_unknown_extension() {
+        let tmp = tempfile::tempdir().expect("tempdir");
+        let path = tmp.path().join("file.xyz");
+        fs::write(&path, b"data").expect("write file");
+        let result = parse_file(&path, b"data".to_vec()).await;
+        assert!(
+            matches!(result, Err(crate::error::RagError::Parse(_))),
+            "unsupported extension should return Parse error, got: {result:?}"
+        );
+    }
+}
diff --git a/crates/noxa-rag/src/store/mod.rs b/crates/noxa-rag/src/store/mod.rs
new file mode 100644
index 0000000..7964fbb
--- /dev/null
+++ b/crates/noxa-rag/src/store/mod.rs
@@ -0,0 +1,41 @@
+use async_trait::async_trait;
+use std::sync::Arc;
+
+use crate::error::RagError;
+use crate::types::{Point, SearchResult};
+
+/// Pluggable vector store backend.
+///
+/// The trait surface is minimal — only what ALL impls share.
+/// Collection lifecycle (create_collection, collection_exists) lives as concrete
+/// methods on each store struct and is driven from factory.rs during startup probes.
+#[async_trait]
+pub trait VectorStore: Send + Sync {
+    /// Upsert points into the store. Returns the number of points written.
+    async fn upsert(&self, points: Vec<Point>) -> Result<usize, RagError>;
+    /// Delete all points for a given URL. Returns the number of points deleted.
+    async fn delete_by_url(&self, url: &str) -> Result<u64, RagError>;
+    /// Delete all points for a given URL whose IDs are NOT in `keep_ids`.
+    ///
+    /// Used for two-phase replace: upsert new points first, then call this to
+    /// evict only the stale points, so a transient upsert failure never leaves
+    /// the collection empty.
+    async fn delete_stale_by_url(
+        &self,
+        url: &str,
+        keep_ids: &[uuid::Uuid],
+    ) -> Result<u64, RagError>;
+    async fn search(&self, vector: &[f32], limit: usize) -> Result<Vec<SearchResult>, RagError>;
+    /// Return the total number of indexed points in the collection.
+    async fn collection_point_count(&self) -> Result<u64, RagError>;
+    /// Return true iff there is at least one point with both `url` and `content_hash`
+    /// matching the given values. Used by the startup delta scan to skip already-indexed
+    /// files whose content has not changed.
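+    ///
+    /// For example, if a point was stored with url `https://example.com/post` and a given
+    /// SHA-256 `content_hash`, this returns `Ok(true)` only for that exact (url, hash)
+    /// pair; a changed hash yields `Ok(false)` and the file is re-indexed.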
+    async fn url_with_hash_exists(&self, url: &str, hash: &str) -> Result<bool, RagError>;
+    fn name(&self) -> &str;
+}
+
+pub type DynVectorStore = Arc<dyn VectorStore>;
+
+pub mod qdrant;
+pub use qdrant::QdrantStore;
diff --git a/crates/noxa-rag/src/store/qdrant.rs b/crates/noxa-rag/src/store/qdrant.rs
new file mode 100644
index 0000000..278d352
--- /dev/null
+++ b/crates/noxa-rag/src/store/qdrant.rs
@@ -0,0 +1,677 @@
+use async_trait::async_trait;
+use serde::{Deserialize, Serialize};
+use serde_json::json;
+use std::collections::HashMap;
+
+use crate::error::RagError;
+use crate::store::VectorStore;
+use crate::types::{Point, SearchResult};
+
+// ── REST request/response shapes ─────────────────────────────────────────────
+
+#[derive(Deserialize)]
+struct CollectionInfoResponse {
+    result: Option<CollectionResult>,
+}
+
+#[derive(Deserialize)]
+struct CollectionResult {
+    config: CollectionConfig,
+}
+
+#[derive(Deserialize)]
+struct CollectionConfig {
+    params: CollectionParams,
+}
+
+#[derive(Deserialize)]
+struct CollectionParams {
+    vectors: serde_json::Value,
+}
+
+#[derive(Deserialize)]
+struct CollectionVectors {
+    size: usize,
+}
+
+#[derive(Deserialize)]
+struct CollectionNamedVectors {
+    vectors: HashMap<String, CollectionVectors>,
+}
+
+#[derive(Serialize)]
+struct UpsertRequest {
+    points: Vec<QdrantPoint>,
+}
+
+#[derive(Serialize)]
+struct QdrantPoint {
+    id: String, // UUID string
+    vector: Vec<f32>,
+    payload: std::collections::HashMap<String, serde_json::Value>,
+}
+
+#[derive(Serialize)]
+struct DeleteByFilterRequest {
+    filter: serde_json::Value,
+}
+
+#[derive(Serialize)]
+struct SearchRequest {
+    vector: Vec<f32>,
+    limit: usize,
+    with_payload: bool,
+    score_threshold: Option<f32>,
+}
+
+#[derive(Deserialize)]
+struct SearchResponse {
+    result: Vec<SearchHit>,
+}
+
+#[derive(Deserialize)]
+struct SearchHit {
+    score: f32,
+    payload: Option<HashMap<String, serde_json::Value>>,
+}
+
+// ── QdrantStore ───────────────────────────────────────────────────────────────
+
+pub struct QdrantStore {
+    client: reqwest::Client,
+    base_url: String, // e.g. "http://127.0.0.1:53333"
+    collection: String,
+    uuid_namespace: uuid::Uuid,
+}
+
+impl QdrantStore {
+    pub fn new(
+        url: &str,
+        collection: String,
+        api_key: Option<String>,
+        uuid_namespace: uuid::Uuid,
+    ) -> Result<Self, RagError> {
+        let mut headers = reqwest::header::HeaderMap::new();
+        if let Some(key) = api_key {
+            headers.insert(
+                "api-key",
+                key.parse()
+                    .map_err(|_| RagError::Config("invalid Qdrant api-key".into()))?,
+            );
+        }
+        let client = reqwest::Client::builder()
+            .default_headers(headers)
+            .connect_timeout(std::time::Duration::from_secs(5))
+            .timeout(std::time::Duration::from_secs(30))
+            .build()
+            .map_err(|e| RagError::Config(format!("failed to build HTTP client: {e}")))?;
+
+        Ok(Self {
+            client,
+            base_url: url.trim_end_matches('/').to_string(),
+            collection,
+            uuid_namespace,
+        })
+    }
+
+    /// GET /collections/{name} → true if 200, false if 404.
+    pub async fn collection_exists(&self) -> Result<bool, RagError> {
+        let url = format!("{}/collections/{}", self.base_url, self.collection);
+        let resp = self.client.get(&url).send().await?;
+        match resp.status().as_u16() {
+            200 => Ok(true),
+            404 => Ok(false),
+            s => Err(RagError::Store(format!(
+                "collection_exists: unexpected HTTP {s}"
+            ))),
+        }
+    }
+
+    /// PUT /collections/{name} — create with Cosine/HNSW + payload indexes.
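+    ///
+    /// For illustration, with `dims = 1024` the collection body sent below is:
+    ///
+    /// ```json
+    /// {
+    ///   "vectors": {
+    ///     "size": 1024,
+    ///     "distance": "Cosine",
+    ///     "on_disk": true,
+    ///     "hnsw_config": { "m": 16, "ef_construct": 200 }
+    ///   },
+    ///   "on_disk_payload": true
+    /// }
+    /// ```
+    ///
+    /// followed by one PUT per payload index (url, domain, source_type, language).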
+ pub async fn create_collection(&self, dims: usize) -> Result<(), RagError> { + let url = format!("{}/collections/{}", self.base_url, self.collection); + let body = json!({ + "vectors": { + "size": dims, + "distance": "Cosine", + "on_disk": true, + "hnsw_config": { "m": 16, "ef_construct": 200 } + }, + "on_disk_payload": true + }); + + let resp = self.client.put(&url).json(&body).send().await?; + if !resp.status().is_success() { + let text = resp.text().await.unwrap_or_default(); + let preview: String = text.chars().take(512).collect(); + return Err(RagError::Store(format!( + "create_collection failed: {preview}" + ))); + } + + // Payload indexes for fast filtering. + // + // Only index fields with real query callers today — speculative indexes waste + // Qdrant disk and add index creation time on every startup. + // + // WARNING: Adding indexes to a populated collection is expensive (full + // sequential scan, 30-120s per index for 100k points). For production + // collections, prefer the shadow-collection migration strategy: + // 1. Create 'noxa-v2' with all desired indexes + // 2. Bulk-copy all points from old collection to noxa-v2 + // 3. Verify point counts match + // 4. Update config to point at noxa-v2 + // 5. Delete old collection + // For development / small collections (<10k points), direct creation is fine. + // + // PUT to /index is idempotent — Qdrant returns 200 if the index already exists, + // so this loop is safe to run on every startup against an existing collection. + let indexes: &[(&str, &str)] = &[ + ("url", "keyword"), + ("domain", "keyword"), + ("source_type", "keyword"), + ("language", "keyword"), + ]; + let idx_url = format!("{}/collections/{}/index", self.base_url, self.collection); + for (field, schema_type) in indexes { + let idx_body = json!({ "field_name": field, "field_schema": schema_type }); + let r = self.client.put(&idx_url).json(&idx_body).send().await?; + if !r.status().is_success() { + let text = r.text().await.unwrap_or_default(); + let preview: String = text.chars().take(512).collect(); + return Err(RagError::Store(format!( + "create_field_index({field}) failed: {preview}" + ))); + } + } + + Ok(()) + } + + /// GET /collections/{name} and return the configured vector size. + /// + /// Used by `factory::build_vector_store` to validate that an existing + /// collection's dimensions match the embed provider's output dimensions. + pub(crate) async fn collection_vector_size(&self) -> Result { + let endpoint = format!("{}/collections/{}", self.base_url, self.collection); + let resp = self.client.get(&endpoint).send().await?; + if !resp.status().is_success() { + let text = resp.text().await.unwrap_or_default(); + let preview: String = text.chars().take(512).collect(); + return Err(RagError::Store(format!( + "collection_info failed: {preview}" + ))); + } + let info: CollectionInfoResponse = resp + .json() + .await + .map_err(|e| RagError::Store(format!("collection_info parse failed: {e}")))?; + info.result + .map(|r| parse_collection_vector_size(r.config.params.vectors)) + .transpose()? 
+ .ok_or_else(|| RagError::Store("collection_info missing result".to_string())) + } +} + +fn parse_collection_vector_size(vectors: serde_json::Value) -> Result { + if let Ok(config) = serde_json::from_value::(vectors.clone()) { + return Ok(config.size); + } + + let named: CollectionNamedVectors = serde_json::from_value(json!({ "vectors": vectors })) + .map_err(|e| RagError::Store(format!("collection_info parse failed: {e}")))?; + + let mut sizes = named.vectors.into_iter().map(|(_, config)| config.size); + let first = sizes + .next() + .ok_or_else(|| RagError::Store("collection_info missing vectors".to_string()))?; + + if sizes.all(|size| size == first) { + Ok(first) + } else { + Err(RagError::Store( + "collection_info has named vectors with mismatched sizes".to_string(), + )) + } +} + +#[cfg(test)] +mod tests { + use super::parse_collection_vector_size; + + #[test] + fn parses_named_vector_collection_size() { + let payload = serde_json::json!({ + "default": { "size": 1024 }, + "title": { "size": 1024 } + }); + + let size = parse_collection_vector_size(payload).expect("named vectors should parse"); + assert_eq!(size, 1024); + } + + #[test] + fn rejects_mixed_named_vector_sizes() { + let payload = serde_json::json!({ + "default": { "size": 1024 }, + "title": { "size": 768 } + }); + + let err = parse_collection_vector_size(payload).expect_err("mixed sizes should fail"); + assert!( + err.to_string().contains("mismatched sizes"), + "unexpected error: {err}" + ); + } +} + +#[async_trait] +impl VectorStore for QdrantStore { + /// PUT /collections/{name}/points?wait=true. Returns the number of points written. + async fn upsert(&self, points: Vec) -> Result { + let n = points.len(); + let url = format!( + "{}/collections/{}/points?wait=true", + self.base_url, self.collection + ); + + let qdrant_points: Vec = points + .iter() + .map(|p| { + let mut payload = std::collections::HashMap::new(); + payload.insert("text".into(), json!(p.payload.text)); + payload.insert("url".into(), json!(p.payload.url)); + payload.insert("domain".into(), json!(p.payload.domain)); + payload.insert("chunk_index".into(), json!(p.payload.chunk_index)); + payload.insert("total_chunks".into(), json!(p.payload.total_chunks)); + payload.insert("token_estimate".into(), json!(p.payload.token_estimate)); + // Extended metadata — only insert when present so payload stays compact. 
+ if let Some(v) = &p.payload.title { + payload.insert("title".into(), json!(v)); + } + if let Some(v) = &p.payload.author { + payload.insert("author".into(), json!(v)); + } + if let Some(v) = &p.payload.published_date { + payload.insert("published_date".into(), json!(v)); + } + if let Some(v) = &p.payload.language { + payload.insert("language".into(), json!(v)); + } + if let Some(v) = &p.payload.source_type { + payload.insert("source_type".into(), json!(v)); + } + if let Some(v) = &p.payload.content_hash { + payload.insert("content_hash".into(), json!(v)); + } + if !p.payload.technologies.is_empty() { + payload.insert("technologies".into(), json!(p.payload.technologies)); + } + if let Some(v) = p.payload.is_truncated { + payload.insert("is_truncated".into(), json!(v)); + } + if let Some(v) = &p.payload.file_path { + payload.insert("file_path".into(), json!(v)); + } + if let Some(v) = &p.payload.last_modified { + payload.insert("last_modified".into(), json!(v)); + } + if let Some(v) = &p.payload.external_id { + payload.insert("external_id".into(), json!(v)); + } + if let Some(v) = &p.payload.platform_url { + payload.insert("platform_url".into(), json!(v)); + } + if let Some(v) = &p.payload.seed_url { + payload.insert("seed_url".into(), json!(v)); + } + if let Some(v) = &p.payload.search_query { + payload.insert("search_query".into(), json!(v)); + } + if let Some(v) = p.payload.crawl_depth { + payload.insert("crawl_depth".into(), json!(v)); + } + QdrantPoint { + id: p.id.to_string(), + vector: p.vector.clone(), + payload, + } + }) + .collect(); + + let resp = self + .client + .put(&url) + .json(&UpsertRequest { + points: qdrant_points, + }) + .send() + .await?; + + if !resp.status().is_success() { + let text = resp.text().await.unwrap_or_default(); + let preview: String = text.chars().take(512).collect(); + return Err(RagError::Store(format!("upsert failed: {preview}"))); + } + + Ok(n) + } + + /// POST /collections/{name}/points/delete?wait=true filtered by url payload. + /// + /// Queries the stale point count before deleting and returns it. + /// Qdrant's delete response does not include a deleted count, so we count first. + async fn delete_by_url(&self, url: &str) -> Result { + let normalized = normalize_url(url); + + // Count stale points before delete so callers can log reindex vs first-index. + let count_endpoint = format!( + "{}/collections/{}/points/count", + self.base_url, self.collection + ); + let count_body = json!({ + "filter": { + "must": [{ "key": "url", "match": { "value": normalized } }] + }, + "exact": true + }); + let stale_count: u64 = match self + .client + .post(&count_endpoint) + .json(&count_body) + .send() + .await + { + Ok(r) if r.status().is_success() => r + .json::() + .await + .ok() + .and_then(|v| v["result"]["count"].as_u64()) + .unwrap_or(0), + _ => 0, // non-fatal: best-effort count + }; + + let endpoint = format!( + "{}/collections/{}/points/delete?wait=true", + self.base_url, self.collection + ); + let body = DeleteByFilterRequest { + filter: json!({ + "must": [{ "key": "url", "match": { "value": normalized } }] + }), + }; + + let resp = self.client.post(&endpoint).json(&body).send().await?; + if !resp.status().is_success() { + let text = resp.text().await.unwrap_or_default(); + let preview: String = text.chars().take(512).collect(); + return Err(RagError::Store(format!("delete_by_url failed: {preview}"))); + } + + Ok(stale_count) + } + + /// POST /collections/{name}/points/delete?wait=true — delete points for a URL + /// whose IDs are NOT in `keep_ids`. 
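+    ///
+    /// The delete filter sent to Qdrant is, in sketch form:
+    /// `{ "must": [{ "key": "url", "match": { "value": <normalized url> } }],
+    ///    "must_not": [{ "has_id": [<keep_ids as strings>] }] }`
+    /// (the `must_not` clause is omitted when `keep_ids` is empty).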
+ /// + /// Used for two-phase replace so that a transient upsert failure never empties + /// the collection: new points are upserted first, then only stale points are + /// removed. If `keep_ids` is empty all points for the URL are deleted (same as + /// `delete_by_url`). + async fn delete_stale_by_url( + &self, + url: &str, + keep_ids: &[uuid::Uuid], + ) -> Result { + let normalized = normalize_url(url); + + // Build filter: url == normalized AND id NOT IN keep_ids. + let filter = if keep_ids.is_empty() { + json!({ + "must": [{ "key": "url", "match": { "value": normalized } }] + }) + } else { + let id_strs: Vec = keep_ids.iter().map(|id| id.to_string()).collect(); + json!({ + "must": [{ "key": "url", "match": { "value": normalized } }], + "must_not": [{ "has_id": id_strs }] + }) + }; + + // Count stale points before delete for logging. + let count_endpoint = format!( + "{}/collections/{}/points/count", + self.base_url, self.collection + ); + let stale_count: u64 = match self + .client + .post(&count_endpoint) + .json(&json!({ "filter": filter, "exact": true })) + .send() + .await + { + Ok(r) if r.status().is_success() => r + .json::() + .await + .ok() + .and_then(|v| v["result"]["count"].as_u64()) + .unwrap_or(0), + _ => 0, + }; + + if stale_count == 0 { + return Ok(0); + } + + let endpoint = format!( + "{}/collections/{}/points/delete?wait=true", + self.base_url, self.collection + ); + let resp = self + .client + .post(&endpoint) + .json(&DeleteByFilterRequest { filter }) + .send() + .await?; + if !resp.status().is_success() { + let text = resp.text().await.unwrap_or_default(); + let preview: String = text.chars().take(512).collect(); + return Err(RagError::Store(format!( + "delete_stale_by_url failed: {preview}" + ))); + } + + Ok(stale_count) + } + + /// POST /collections/{name}/points/search + async fn search(&self, vector: &[f32], limit: usize) -> Result, RagError> { + let url = format!( + "{}/collections/{}/points/search", + self.base_url, self.collection + ); + let body = SearchRequest { + vector: vector.to_vec(), + limit, + with_payload: true, + score_threshold: None, + }; + + let resp = self.client.post(&url).json(&body).send().await?; + if !resp.status().is_success() { + let text = resp.text().await.unwrap_or_default(); + let preview: String = text.chars().take(512).collect(); + return Err(RagError::Store(format!("search failed: {preview}"))); + } + + let response: SearchResponse = resp.json().await?; + + let results = response + .result + .into_iter() + .filter_map(|hit| { + let payload = hit.payload?; + let text = payload + .get("text") + .and_then(|v| v.as_str()) + .map(str::to_string); + let url = payload + .get("url") + .and_then(|v| v.as_str()) + .map(str::to_string); + match (text, url) { + (Some(text), Some(url)) => { + let chunk_index = payload + .get("chunk_index") + .and_then(|v| v.as_u64()) + .unwrap_or(0) as usize; + let token_estimate = payload + .get("token_estimate") + .and_then(|v| v.as_u64()) + .unwrap_or(0) as usize; + let title = payload + .get("title") + .and_then(|v| v.as_str()) + .map(String::from); + let author = payload + .get("author") + .and_then(|v| v.as_str()) + .map(String::from); + let published_date = payload + .get("published_date") + .and_then(|v| v.as_str()) + .map(String::from); + let language = payload + .get("language") + .and_then(|v| v.as_str()) + .map(String::from); + let source_type = payload + .get("source_type") + .and_then(|v| v.as_str()) + .map(String::from); + let content_hash = payload + .get("content_hash") + .and_then(|v| 
+                            .and_then(|v| v.as_str())
+                            .map(String::from);
+                        let technologies = payload
+                            .get("technologies")
+                            .and_then(|v| v.as_array())
+                            .map(|arr| {
+                                arr.iter()
+                                    .filter_map(|t| t.as_str().map(String::from))
+                                    .collect()
+                            })
+                            .unwrap_or_default();
+                        Some(SearchResult {
+                            text,
+                            url,
+                            score: hit.score,
+                            chunk_index,
+                            token_estimate,
+                            title,
+                            author,
+                            published_date,
+                            language,
+                            source_type,
+                            content_hash,
+                            technologies,
+                        })
+                    }
+                    _ => {
+                        tracing::warn!(
+                            "search hit dropped: missing required payload field (text or url) \
+                             — possible schema mismatch or data corruption"
+                        );
+                        None
+                    }
+                }
+            })
+            .collect();
+
+        Ok(results)
+    }
+
+    /// GET /collections/{name} → total vectors_count.
+    async fn collection_point_count(&self) -> Result<u64, RagError> {
+        let endpoint = format!("{}/collections/{}", self.base_url, self.collection);
+        let resp = self.client.get(&endpoint).send().await?;
+        if !resp.status().is_success() {
+            let text = resp.text().await.unwrap_or_default();
+            let preview: String = text.chars().take(512).collect();
+            return Err(RagError::Store(format!(
+                "collection_point_count failed: {preview}"
+            )));
+        }
+        let body: serde_json::Value = resp
+            .json()
+            .await
+            .map_err(|e| RagError::Store(format!("collection_point_count parse failed: {e}")))?;
+        Ok(body["result"]["vectors_count"]
+            .as_u64()
+            .unwrap_or(0))
+    }
+
+    /// Check whether any point exists with both `url` == `url` AND `content_hash` == `hash`.
+    ///
+    /// Used by the startup delta scan so the daemon can skip re-indexing files whose
+    /// content has not changed since the last run. Returns `false` when `hash` is empty
+    /// (no stored hash means we cannot skip).
+    async fn url_with_hash_exists(&self, url: &str, hash: &str) -> Result<bool, RagError> {
+        if hash.is_empty() {
+            return Ok(false);
+        }
+        let normalized = normalize_url(url);
+        let endpoint = format!(
+            "{}/collections/{}/points/count",
+            self.base_url, self.collection
+        );
+        let body = serde_json::json!({
+            "filter": {
+                "must": [
+                    { "key": "url", "match": { "value": normalized } },
+                    { "key": "content_hash", "match": { "value": hash } }
+                ]
+            }
+        });
+
+        let resp = self
+            .client
+            .post(&endpoint)
+            .timeout(std::time::Duration::from_secs(5))
+            .json(&body)
+            .send()
+            .await?;
+
+        if !resp.status().is_success() {
+            let status = resp.status().as_u16();
+            let text = resp.text().await.unwrap_or_default();
+            let preview: String = text.chars().take(512).collect();
+            tracing::warn!(
+                status,
+                url = %normalized,
+                body = preview,
+                "url_with_hash_exists count request failed — assuming not indexed"
+            );
+            return Ok(false);
+        }
+
+        let json: serde_json::Value = resp.json().await?;
+        Ok(json["result"]["count"].as_u64().unwrap_or(0) > 0)
+    }
+
+    fn name(&self) -> &str {
+        "qdrant"
+    }
+}
+
+/// Strip fragment, trailing path slash, lowercase scheme+host (url crate already does the latter).
+pub(crate) fn normalize_url(url: &str) -> String {
+    let Ok(mut parsed) = url::Url::parse(url) else {
+        return url.to_string();
+    };
+    parsed.set_fragment(None);
+    let path = parsed.path().trim_end_matches('/').to_string();
+    parsed.set_path(&path);
+    parsed.to_string()
+}
diff --git a/crates/noxa-rag/src/types.rs b/crates/noxa-rag/src/types.rs
new file mode 100644
index 0000000..fa4ccaf
--- /dev/null
+++ b/crates/noxa-rag/src/types.rs
@@ -0,0 +1,143 @@
+use serde::{Deserialize, Serialize};
+use uuid::Uuid;
+
+/// A chunk produced from an ExtractionResult.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Chunk {
+    pub text: String,
+    pub source_url: String,
+    pub domain: String,
+    pub chunk_index: usize,
+    pub total_chunks: usize,
+    pub char_offset: usize,
+    pub token_estimate: usize,
+}
+
+/// A point ready for upsert into the vector store.
+#[derive(Debug, Clone)]
+pub struct Point {
+    /// UUID v5 deterministic ID: url#chunkN
+    pub id: Uuid,
+    pub vector: Vec<f32>,
+    pub payload: PointPayload,
+}
+
+/// Payload stored alongside each vector in the store.
+///
+/// All optional fields use `skip_serializing_if = "Option::is_none"` so existing
+/// Qdrant points (stored by older pipeline versions) simply return null for the new
+/// keys; new nullable payload fields can be added without a migration.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PointPayload {
+    pub text: String,
+    /// Normalized URL (strip fragment, trailing slash, lowercase scheme+host).
+    pub url: String,
+    pub domain: String,
+    pub chunk_index: usize,
+    pub total_chunks: usize,
+    pub token_estimate: usize,
+    // ── Metadata fields from noxa-core ─────────────────────────────────────
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub title: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub author: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub published_date: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub language: Option<String>,
+    /// 'web' | 'file' | 'mcp' | 'notebook' | 'email'
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub source_type: Option<String>,
+    /// SHA-256 hex digest of raw source bytes.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub content_hash: Option<String>,
+    #[serde(default, skip_serializing_if = "Vec::is_empty")]
+    pub technologies: Vec<String>,
+    /// True when the document was cut at max_chunks_per_page.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub is_truncated: Option<bool>,
+    // ── File-source fields ──────────────────────────────────────────────────
+    /// Absolute filesystem path (file:// sources only).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub file_path: Option<String>,
+    /// ISO 8601 mtime for files, published_at for web content.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub last_modified: Option<String>,
+    /// Git branch detected from .git/HEAD walk-up (file:// sources only).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub git_branch: Option<String>,
+    // ── Ingestion-provenance fields from IngestionContext ───────────────────
+    /// Opaque platform id: 'linkding:42', 'memos:7' (Wave 3+).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub external_id: Option<String>,
+    /// Native platform UI URL (Wave 3+).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub platform_url: Option<String>,
+    // ── Web-provenance fields from IngestionContext ─────────────────────────
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub seed_url: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub search_query: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub crawl_depth: Option<u32>,
+}
+
+/// A search result returned by VectorStore::search().
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SearchResult {
+    pub text: String,
+    pub url: String,
+    pub score: f32,
+    pub chunk_index: usize,
+    pub token_estimate: usize,
+    // Extended metadata fields (None when stored by older pipeline versions)
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub title: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub author: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub published_date: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub language: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub source_type: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub content_hash: Option<String>,
+    #[serde(default, skip_serializing_if = "Vec::is_empty")]
+    pub technologies: Vec<String>,
+}
+
+/// RAG-pipeline provenance carried alongside ExtractionResult through ingestion.
+///
+/// These fields have no meaning to noxa-fetch, noxa-mcp, or WASM consumers — they
+/// live here in noxa-rag, not in noxa-core. At upsert time both Metadata and
+/// IngestionContext are serialized into PointPayload.
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+pub struct IngestionContext {
+    /// Matches Metadata.source_type: 'web' | 'file' | 'mcp' | 'notebook' | 'email'
+    pub source_type: String,
+    /// SHA-256 hex digest — duplicated from Metadata.content_hash for fast access.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub content_hash: Option<String>,
+    // Platform fields — populated when MCP sources land in Wave 3.
+    /// Opaque platform identifier: 'linkding:42', 'memos:7', 'paperless:15'.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub external_id: Option<String>,
+    /// Native UI URL (not the canonical content URL).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub platform_url: Option<String>,
+    // AI session fields — populated when AI session sources land.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub session_tool: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub conversation_id: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub model_id: Option<String>,
+    // Web provenance — populated by noxa-fetch.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub seed_url: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub search_query: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub crawl_depth: Option<u32>,
+}
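
Reviewer note (not part of the patch): the two-phase replace contract documented on `delete_stale_by_url`, and the hash-based skip in `url_with_hash_exists`, are easiest to see from the caller's side. The sketch below is illustrative only; `reindex_url` and the `store.upsert(...)` entry point are assumed names that do not appear in this diff, while `QdrantStore`, `Point`, `RagError`, `url_with_hash_exists`, and `delete_stale_by_url` come from the code above.

// Illustrative sketch only; `reindex_url` and `upsert` are assumed names.
async fn reindex_url(
    store: &QdrantStore,
    url: &str,
    points: Vec<Point>,
) -> Result<(), RagError> {
    // Delta check: skip re-indexing when this URL is already stored with the same content hash.
    if let Some(hash) = points.first().and_then(|p| p.payload.content_hash.as_deref()) {
        if store.url_with_hash_exists(url, hash).await? {
            return Ok(());
        }
    }
    // Phase 1: write the new points first; deterministic UUID v5 ids mean unchanged
    // chunks overwrite themselves in place.
    let keep_ids: Vec<uuid::Uuid> = points.iter().map(|p| p.id).collect();
    store.upsert(&points).await?; // assumed upsert entry point
    // Phase 2: only after the upsert succeeded, drop points for this URL whose ids are
    // not among the freshly written ones; a transient failure above therefore leaves
    // the old points untouched instead of emptying the collection.
    let stale = store.delete_stale_by_url(url, &keep_ids).await?;
    tracing::debug!(url, stale, "reindex complete");
    Ok(())
}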