diff --git a/README.md b/README.md index e3e8126..c5ca49c 100644 --- a/README.md +++ b/README.md @@ -186,6 +186,24 @@ artifact's *summary* down to that artifact's best *chunk*: hits = ir.traverse(query, corpus, policy=ir.collapsed_tree_policy()) ``` +The *summary* a query routes on can be an artifact's own short field — or an +**LLM-authored synopsis**. `ir.with_synopsis` wraps any indexing strategy to add +one `synopsis` surface per artifact at build time (the document-summary-index +pattern: build-time cost, ≈free at query time), and that synopsis becomes the +collapsed-tree router: + +```python +strat = ir.with_synopsis(ir.Chunked(), synthesize=my_summarizer) # or default (lazy oa) +corpus = ir.build(ir.CorpusSource.from_mapping(docs, name="d", strategy=strat)) +hits = ir.traverse(q, corpus, policy=ir.collapsed_tree_policy()) # routes via the synopsis +``` + +`synthesize` is injectable (a test double or your own summarizer); omitted, it is +built lazily on [`oa`](https://github.com/thorwhalen/oa) so `import ir` stays +offline. Synopses are derived state with a stamped synthesizer identity, so a +prompt/model change re-synthesizes only the affected artifacts on the next +incremental `build` — no silent staleness. + **Flat top-k stays the default** — `traverse` is opt-in, and a policy earns its keep only by beating flat+rerank on your eval set (a strong flat retriever wins simple lookup; graph methods cost far more). 
Results are ordinary `SearchHit`s diff --git a/ir/__init__.py b/ir/__init__.py index 6abe7e2..f9a43c1 100644 --- a/ir/__init__.py +++ b/ir/__init__.py @@ -59,6 +59,7 @@ from .sources import CorpusSource from .store import CorpusStore from .strategy import Chunked, IndexingStrategy, Package, Skill, WholeText +from .synopsis import Synthesizer, make_llm_synthesizer, with_synopsis from .traverse import WalkPolicy, WalkState, collapsed_tree_policy, traverse __all__ = [ @@ -72,6 +73,9 @@ "Chunked", "Skill", "Package", + "with_synopsis", + "make_llm_synthesizer", + "Synthesizer", "CorpusSource", "CorpusStore", "Corpus", diff --git a/ir/index.py b/ir/index.py index c495b90..bc27b82 100644 --- a/ir/index.py +++ b/ir/index.py @@ -35,12 +35,23 @@ def _strategy_id(strategy) -> str: Changing the strategy (or its parameters) changes this id, so an unchanged corpus rebuilt under a different strategy is correctly re-decomposed rather than skipped. + + Scalar parameters are taken verbatim; a parameter that is *itself* a + strategy (an attribute with a ``decompose`` method — e.g. the inner + strategy a :func:`ir.with_synopsis` wrapper holds) folds in its own + ``_strategy_id`` recursively. So a wrapper's identity tracks both the inner + strategy's parameters and the wrapper's own scalar stamps (e.g. a + synthesizer id), and a change to either re-decomposes through the normal + incremental path. Non-scalar, non-strategy attributes (callables, + embedders) are deliberately excluded — identity for those rides on an + explicit scalar stamp the wrapper exposes, not on a volatile ``repr``. 
""" - params = { - k: v - for k, v in vars(strategy).items() - if isinstance(v, (str, int, float, bool, type(None))) - } + params: dict[str, Any] = {} + for k, v in vars(strategy).items(): + if isinstance(v, (str, int, float, bool, type(None))): + params[k] = v + elif hasattr(v, "decompose"): # a nested strategy (wrapper) + params[k] = _strategy_id(v) return f"{type(strategy).__name__}:{json.dumps(params, sort_keys=True)}" diff --git a/ir/strategy.py b/ir/strategy.py index f0ad94b..c6bcdcc 100644 --- a/ir/strategy.py +++ b/ir/strategy.py @@ -40,8 +40,14 @@ def decompose( ) -> IndexPlan: ... -def _text_of(raw: Any, text_key: str | None = None) -> str: - """Best-effort text extraction from a raw artifact payload.""" +def text_of(raw: Any, text_key: str | None = None) -> str: + """Best-effort text extraction from a raw artifact payload. + + The SSOT for turning an opaque ``raw`` (a ``str``, a ``Mapping`` with a + ``text`` field or a ``text_key``, or anything else) into embeddable text — + reused by the shipped strategies *and* by :func:`ir.synopsis.make_llm_synthesizer` + so an injected-free synopsis summarizes the same text a strategy would index. + """ if isinstance(raw, str): return raw if isinstance(raw, Mapping): @@ -54,6 +60,11 @@ def _text_of(raw: Any, text_key: str | None = None) -> str: return str(raw) +#: Backward-compatible private alias (the helper was module-private before it +#: became a cross-module SSOT). Internal call sites may use either name. +_text_of = text_of + + def _split(text: str, *, chunk_size: int, overlap: int) -> list[str]: """Paragraph-packing chunker: greedily fill ~``chunk_size`` chunks. diff --git a/ir/synopsis.py b/ir/synopsis.py new file mode 100644 index 0000000..c325b18 --- /dev/null +++ b/ir/synopsis.py @@ -0,0 +1,249 @@ +"""Synopsis surfaces — LLM-derived summaries as an indexed surface (report 12). 
+ +The document-summary-index / collapsed-tree *fuel* (ADR #43): run a summarizer +over each artifact at **build time** to produce a short *synopsis*, index it as a +``"synopsis"`` surface, and let the collapsed-tree policy (:func:`ir.traverse`) +route a synopsis match down to that artifact's chunks. Build-time cost, ≈free at +query time — and incremental, so only new / changed artifacts are re-synthesized. + +:func:`with_synopsis` wraps *any* :class:`~ir.strategy.IndexingStrategy` and adds +one synopsis surface per artifact:: + + strat = ir.with_synopsis(ir.Chunked(), synthesize=my_summarizer) + corpus = ir.build(ir.CorpusSource.from_mapping(docs, name="d", strategy=strat)) + hits = ir.traverse(q, corpus, policy=ir.collapsed_tree_policy()) # routes via synopsis + +The synopsis is **prepended** (plan position 0) so it is the *first* summary +surface — hence the collapsed-tree *router* (on ``with_synopsis(Package())`` the +synopsis, not the terse ``description``, routes). An empty synopsis is dropped, so +a synth that returns ``""`` simply leaves the artifact with its other surfaces. + +``synthesize: Callable[[Artifact], str]`` is **injectable** (a test double, or +your own summarizer); omitted, it is built lazily on :mod:`oa` via +:func:`make_llm_synthesizer` (the ``make_llm_*`` idiom — ``import ir`` stays +offline, ``oa`` is imported only on the first synthesis). + +**Staleness.** The wrapper exposes its identity as scalar attributes +(``synthesizer_id``, ``synopsis_kind``) and holds the inner strategy, so +:func:`ir.index._strategy_id` (which recurses into nested strategies) folds both +the inner strategy's parameters *and* the synthesizer identity into the corpus's +``strategy_id``. A prompt / model change — or an inner-strategy change — therefore +re-synthesizes exactly the affected artifacts, the same way an ``embedder_id`` +change does; no silent staleness. 
(An injected *unnamed* synthesizer — a lambda +or a local closure — has no stable identity to fold in, so ``with_synopsis`` +**warns** and disables its staleness tracking unless you pass ``synthesizer_id=``.) + +**Routing needs no edges.** collapsed-tree descends synopsis→chunks *within* an +artifact via :func:`ir.retrieve.records_for_artifact` (shared ``artifact_id``), so +synopsis routing works with no entries in the :mod:`ir.graph` ``links`` view — +that view models *cross-artifact* edges (REF / PARENT between artifacts), a +different grain than surface→surface within one artifact. + +**Caveat — ``edge_extractor``.** Eager edge ingest (``build(edge_extractor=...)``) +calls ``decompose`` for *every* artifact each build, and synthesis lives in +``decompose``; so combining :func:`with_synopsis` with an ``edge_extractor`` +re-runs synthesis on every rebuild. The common path — synopsis routing with no +``edge_extractor`` — stays fully incremental. +""" + +from __future__ import annotations + +import hashlib +import warnings +from typing import Any, Callable + +from .base import Artifact, IndexPlan, Surface +from .strategy import IndexingStrategy, text_of + +#: A synthesizer: an :class:`~ir.base.Artifact` → its synopsis text (``""`` to +#: skip — no synopsis surface is added for that artifact). +Synthesizer = Callable[[Artifact], str] + +#: The surface kind a synopsis is indexed under — a member of +#: :data:`ir.traverse.DFLT_SUMMARY_KINDS`, so collapsed-tree routes from it. +SYNOPSIS_KIND = "synopsis" + +#: Default prompt for :func:`make_llm_synthesizer` (a routing-oriented summary). +SYNOPSIS_PROMPT = ( + "Write a concise synopsis (2-4 sentences) of the document below: what it is " + "about and what questions it answers, so that a search over synopses can route " + "to it. 
Output only the synopsis, no preamble.\n\nDocument:\n{text}" +) + + +def _prompt_hash(prompt: str) -> str: + """Short stable hash of a prompt, for the default synthesizer identity.""" + return hashlib.sha256(prompt.encode("utf-8")).hexdigest()[:12] + + +def _default_llm_summarizer( + prompt: str, model: str | None, **prompt_function_kwargs: Any +): + """Build the default text→synopsis summarizer on :mod:`oa` (lazy import).""" + import oa + + kwargs = dict(prompt_function_kwargs) + if model is not None: + kwargs.setdefault("model", model) + fn = oa.prompt_function(prompt, name="synthesize_synopsis", **kwargs) + + def summarize(text: str) -> str: + return str(fn(text=text) or "").strip() + + return summarize + + +def make_llm_synthesizer( + *, + summarize: Callable[[str], str] | None = None, + prompt: str = SYNOPSIS_PROMPT, + model: str | None = None, + synthesizer_id: str | None = None, + text_key: str | None = None, + **prompt_function_kwargs: Any, +) -> Synthesizer: + """An LLM-backed :data:`Synthesizer` (:class:`~ir.base.Artifact` → synopsis). + + ``summarize`` is an injectable ``text -> str`` callable (a test double, or + your own summarizer); when omitted it is built lazily on :mod:`oa` + (``oa.prompt_function``) on the **first** synthesis and reused — so importing + this module, and even constructing the synthesizer, stays offline. The + artifact's text is extracted with :func:`ir.strategy.text_of` using + ``text_key`` — which :func:`with_synopsis` threads from the inner strategy, so + the synopsis summarizes the *same* field the strategy indexes. An empty text, + or any synthesis error, yields ``""`` (the surface is then skipped, never a + fabricated summary). + + The returned callable carries a ``synthesizer_id`` attribute (default + ``"oa:{model}:{sha(prompt)[:12]}"``) that :func:`with_synopsis` reads into the + corpus's ``strategy_id`` for staleness — a prompt or model change re-synthesizes. 
+ """ + cache: dict[str, Callable[[str], str]] = {} + + def _summarizer() -> Callable[[str], str]: + if summarize is not None: + return summarize + if "fn" not in cache: + cache["fn"] = _default_llm_summarizer( + prompt, model, **prompt_function_kwargs + ) + return cache["fn"] + + def synthesize(artifact: Artifact) -> str: + text = text_of(artifact.raw, text_key).strip() + if not text: + return "" + try: + out = _summarizer()(text) + except Exception: + out = "" + return out.strip() if isinstance(out, str) else "" + + synthesize.synthesizer_id = ( + synthesizer_id or f"oa:{model or 'default'}:{_prompt_hash(prompt)}" + ) + return synthesize + + +class _SynopsisStrategy: + """An :class:`~ir.strategy.IndexingStrategy` that prepends a synopsis surface. + + Delegates decomposition to the wrapped ``strategy``, then prepends one + ``synopsis`` surface (skipped if the synthesizer returns empty). Exposes + ``synthesizer_id`` / ``synopsis_kind`` as scalar attributes and holds the + inner strategy, so :func:`ir.index._strategy_id` captures the full identity. + """ + + def __init__( + self, + strategy: IndexingStrategy, + *, + synthesize: Synthesizer | None = None, + synthesizer_id: str | None = None, + synopsis_kind: str = SYNOPSIS_KIND, + ): + self.strategy = strategy + # The default synthesizer summarizes the same field the inner strategy + # indexes (thread its text_key, if any), so synopsis routing matches. + self.synthesize: Synthesizer = ( + synthesize + if synthesize is not None + else make_llm_synthesizer(text_key=getattr(strategy, "text_key", None)) + ) + self.synthesizer_id = self._resolve_synthesizer_id(synthesizer_id) + self.synopsis_kind = synopsis_kind + + def _resolve_synthesizer_id(self, explicit: str | None) -> str: + """Identity for staleness: explicit id, else a stamp, else a stable name. + + An explicit ``synthesizer_id`` wins; else a ``synthesizer_id`` the + synthesizer carries (e.g. :func:`make_llm_synthesizer`'s). 
Failing both, + a *named* callable's ``__qualname__`` is a stable identity — but an + unnamed lambda (``"<lambda>"``) or a local closure (``"<locals>"``) is + **not**: distinct such callables share one qualname, so swapping them + would silently *not* re-synthesize. In that case we warn and fall back to + a sentinel, surfacing the lost guarantee at construction time. + """ + stamped = explicit or getattr(self.synthesize, "synthesizer_id", None) + if stamped: + return stamped + qualname = getattr(self.synthesize, "__qualname__", "") or "" + if qualname and "<lambda>" not in qualname and "<locals>" not in qualname: + return qualname + warnings.warn( + "with_synopsis: could not derive a stable identity for the injected " + "synthesizer (an unnamed lambda or a local closure), so synopsis " + "staleness tracking is disabled — pass synthesizer_id=... so that " + "swapping the synthesizer re-synthesizes the corpus on the next build.", + stacklevel=3, + ) + return "custom" + + def decompose(self, artifact_id, raw, metadata=None) -> IndexPlan: + plan = self.strategy.decompose(artifact_id, raw, metadata) + artifact = Artifact(id=artifact_id, raw=raw, metadata=dict(metadata or {})) + text = self.synthesize(artifact) + text = text.strip() if isinstance(text, str) else "" + if text: + synopsis = Surface( + artifact_id, + self.synopsis_kind, + text, + granularity="document", + metadata={"synthesizer_id": self.synthesizer_id}, + ) + plan.surfaces = [synopsis, *plan.surfaces] + return plan + + +def with_synopsis( + strategy: IndexingStrategy, + *, + synthesize: Synthesizer | None = None, + synthesizer_id: str | None = None, + synopsis_kind: str = SYNOPSIS_KIND, +) -> IndexingStrategy: + """Wrap *strategy* to add one LLM-derived ``synopsis`` surface per artifact. + + Args: + strategy: the inner :class:`~ir.strategy.IndexingStrategy` (``Chunked``, + ``Package``, ...). Its surfaces are kept; the synopsis is prepended. + synthesize: an injectable ``Artifact -> str`` (test double / custom + summarizer). 
Omitted → :func:`make_llm_synthesizer` (lazy ``oa``). + synthesizer_id: explicit identity stamp for staleness (recommended when + injecting an unnamed callable / lambda). Omitted → the synthesizer's + own ``synthesizer_id`` / ``__qualname__``. + synopsis_kind: the surface kind (default ``"synopsis"``, a summary kind). + + Returns: + an :class:`~ir.strategy.IndexingStrategy` usable anywhere a strategy is — + ``ir.CorpusSource.from_mapping(docs, name=..., strategy=with_synopsis(...))``. + + >>> strat = with_synopsis(Chunked(), synthesize=lambda a: "a summary") # doctest: +SKIP + """ + return _SynopsisStrategy( + strategy, + synthesize=synthesize, + synthesizer_id=synthesizer_id, + synopsis_kind=synopsis_kind, + ) diff --git a/tests/test_synopsis.py b/tests/test_synopsis.py new file mode 100644 index 0000000..192ad5f --- /dev/null +++ b/tests/test_synopsis.py @@ -0,0 +1,394 @@ +"""Tests for synopsis surfaces (#48) — LLM-derived summaries as indexed surfaces. + +Pins the #48 acceptance: + +- ``with_synopsis`` adds one ``synopsis`` surface per artifact, *prepended* (so it + is the collapsed-tree router); an empty synopsis is dropped. +- search restricted to synopsis surfaces + ``traverse`` routes to the right chunks + end-to-end (the synopsis is the routing signal; a trap whose synopsis does not + match is excluded though its chunk matches the query). +- incremental rebuild re-synthesizes only changed artifacts, and a + synthesizer-identity (or inner-strategy) change re-synthesizes — staleness via + the ledger ``strategy_id`` and the recursive ``_strategy_id``. +- offline import preserved: ``oa`` is lazy, an injected synthesizer never needs it. + +Hermetic: light embedder + memory store + injected synthesizers. 
+""" + +import pytest + +import ir +from ir.base import Artifact +from ir.index import _strategy_id +from ir.retrieve import records_for_artifact +from ir.store import CorpusStore +from ir.synopsis import make_llm_synthesizer, with_synopsis + + +def _synopsis_from_field(artifact): + """An injected synthesizer double: read the artifact's ``synopsis`` field.""" + raw = artifact.raw + return raw.get("synopsis", "") if isinstance(raw, dict) else "" + + +# --------------------------------------------------------------------------- # +# Surface: one prepended synopsis surface per artifact +# --------------------------------------------------------------------------- # + + +def test_with_synopsis_indexes_one_prepended_synopsis_surface(): + docs = {"a": {"text": "body alpha here", "synopsis": "a short routing summary"}} + strat = with_synopsis( + ir.Chunked(), synthesize=_synopsis_from_field, synthesizer_id="v1" + ) + src = ir.CorpusSource.from_mapping(docs, name="s", strategy=strat) + corpus = ir.build(src, store=CorpusStore.memory(), embedder="light") + + recs = records_for_artifact(corpus, "a") + assert recs[0].surface_kind == "synopsis" # prepended -> plan position 0 + assert recs[0].surface_index == 0 + assert recs[0].text == "a short routing summary" + assert recs[0].metadata["synthesizer_id"] == "v1" # provenance stamp + assert any(r.surface_kind == "chunk" for r in recs) # inner surfaces kept + + syn = ir.search(corpus, "short routing summary", surfaces=("synopsis",)) + assert syn and syn[0].artifact_id == "a" and syn[0].surface_kind == "synopsis" + + +def test_empty_synopsis_is_dropped_artifact_keeps_other_surfaces(): + docs = {"a": {"text": "body alpha here", "synopsis": ""}} + strat = with_synopsis( + ir.Chunked(), synthesize=_synopsis_from_field, synthesizer_id="v1" + ) + corpus = ir.build( + ir.CorpusSource.from_mapping(docs, name="s", strategy=strat), + store=CorpusStore.memory(), + embedder="light", + ) + recs = records_for_artifact(corpus, "a") + assert 
all(r.surface_kind != "synopsis" for r in recs) + assert any(r.surface_kind == "chunk" for r in recs) + + +# --------------------------------------------------------------------------- # +# Routing: search-on-synopsis + traverse routes to the artifact's chunks +# --------------------------------------------------------------------------- # + +# Gold A: synopsis matches the routing tokens, the answer is in its chunk. +# Trap B: synopsis does NOT match, but its chunk matches the routing tokens. +# Fillers C, D: non-matching synopses, present so B falls below seed_k. +ROUTING_DOCS = { + "A": { + "text": "ANSTOK answer payload here. filler one two three.", + "synopsis": "rtok1 rtok2 rtok3 rtok4 alpha beta", + }, + "B": { + "text": "rtok1 rtok2 rtok3 rtok4 trap distractor strong body.", + "synopsis": "omega phi chi psi unrelated", + }, + "C": {"text": "neutral cccc content.", "synopsis": "gamma delta epsilon zeta"}, + "D": {"text": "neutral dddd content.", "synopsis": "eta theta iota kappa"}, +} +ROUTING_QUERY = "rtok1 rtok2 rtok3 rtok4 ANSTOK" + + +def _routing_corpus(): + strat = with_synopsis( + ir.Chunked(chunk_size=80, overlap=10), + synthesize=_synopsis_from_field, + synthesizer_id="v1", + ) + return ir.build( + ir.CorpusSource.from_mapping(ROUTING_DOCS, name="rt", strategy=strat), + store=CorpusStore.memory(), + embedder="light", + ) + + +def test_synopsis_routes_traverse_to_chunks_end_to_end(): + corpus = _routing_corpus() + trav = ir.traverse( + ROUTING_QUERY, corpus, policy=ir.collapsed_tree_policy(seed_k=2), k=10 + ) + assert trav + # routed via the synopsis (summary) down to chunk leaves — synopses themselves + # are routers, never emitted. + assert all(h.surface_kind == "chunk" for h in trav) + # gold A (synopsis matched) surfaces its answer chunk... + assert trav[0].artifact_id == "A" + assert "ANSTOK" in trav[0].text + # ...and trap B (synopsis didn't match -> not seeded) is excluded, though its + # chunk matches the query terms. 
+ assert all(h.artifact_id != "B" for h in trav) + # walk provenance: a routed leaf at depth 1, seeded by A's synopsis. + assert trav[0].metadata["walk_depth"] == 1 + assert trav[0].metadata["seed"] == "A" + + +def test_flat_search_buries_the_answer_that_synopsis_routing_surfaces(): + # The discriminator: among chunks, flat ranks B's trap chunk first; synopsis + # routing excludes B entirely — proof the synopsis route does real work. + corpus = _routing_corpus() + flat = ir.search(corpus, ROUTING_QUERY, k=10, per_artifact=False) + flat_chunks = [h for h in flat if h.surface_kind == "chunk"] + assert flat_chunks and flat_chunks[0].artifact_id == "B" + + +# --------------------------------------------------------------------------- # +# Staleness: incremental re-synthesis keyed on synthesizer + strategy identity +# --------------------------------------------------------------------------- # + + +def test_incremental_resynthesizes_only_changed_and_on_identity_change(): + calls = [] + + def synth(artifact): + calls.append(artifact.id) + return f"summary of {artifact.id}" + + def source(docs, sid): + strat = with_synopsis(ir.Chunked(), synthesize=synth, synthesizer_id=sid) + return ir.CorpusSource.from_mapping(docs, name="inc", strategy=strat) + + store = CorpusStore.memory() + docs = {"A": {"text": "a body"}, "B": {"text": "b body"}} + + ir.build(source(docs, "v1"), store=store, embedder="light") + assert sorted(calls) == ["A", "B"] # first build synthesizes both + calls.clear() + + ir.build(source(docs, "v1"), store=store, embedder="light") + assert calls == [] # unchanged rebuild -> no synthesis + + docs2 = {"A": {"text": "a body CHANGED"}, "B": {"text": "b body"}} + ir.build(source(docs2, "v1"), store=store, embedder="light") + assert calls == ["A"] # only the changed artifact re-synthesized + calls.clear() + + ir.build(source(docs2, "v2"), store=store, embedder="light") + assert sorted(calls) == ["A", "B"] # synthesizer identity changed -> all + + +def 
test_strategy_id_recurses_into_inner_strategy_and_synthesizer(): + base = with_synopsis( + ir.Chunked(chunk_size=500), synthesize=_synopsis_from_field, synthesizer_id="v1" + ) + inner_changed = with_synopsis( + ir.Chunked(chunk_size=900), synthesize=_synopsis_from_field, synthesizer_id="v1" + ) + synth_changed = with_synopsis( + ir.Chunked(chunk_size=500), synthesize=_synopsis_from_field, synthesizer_id="v2" + ) + # an inner-strategy param change AND a synthesizer-id change each shift the id + assert _strategy_id(base) != _strategy_id(inner_changed) + assert _strategy_id(base) != _strategy_id(synth_changed) + # the inner strategy's own identity is embedded (recursion), not the callable + assert "Chunked:" in _strategy_id(base) + + +# --------------------------------------------------------------------------- # +# Offline: oa is lazy; an injected synthesizer never needs it +# --------------------------------------------------------------------------- # + + +def test_make_llm_synthesizer_uses_injected_summarize_without_oa(): + synth = make_llm_synthesizer(summarize=lambda text: "S:" + text[:5]) + assert synth(Artifact("x", "hello world")) == "S:hello" + + +def test_make_llm_synthesizer_empty_text_returns_empty(): + synth = make_llm_synthesizer(summarize=lambda text: "never reached") + assert synth(Artifact("x", " ")) == "" + + +def test_make_llm_synthesizer_swallows_summarizer_errors(): + def boom(text): + raise RuntimeError("no LLM available") + + assert make_llm_synthesizer(summarize=boom)(Artifact("x", "body text")) == "" + + +def test_default_synthesizer_constructs_offline_and_stamps_identity(): + # No synthesize injected -> the lazy-oa default; constructing must not need oa, + # and an empty-text artifact short-circuits before the oa path is reached. 
+ strat = with_synopsis(ir.Chunked()) + assert strat.synthesizer_id.startswith("oa:") + assert strat.synthesize(Artifact("x", "")) == "" + # a prompt change shifts the default identity (re-synthesis trigger) + a = make_llm_synthesizer() + b = make_llm_synthesizer(prompt="a very different prompt {text}") + assert a.synthesizer_id.startswith("oa:") and a.synthesizer_id != b.synthesizer_id + + +# --------------------------------------------------------------------------- # +# Review-driven hardening (#48 adversarial review) +# --------------------------------------------------------------------------- # + + +def test_default_synthesizer_threads_inner_strategy_text_key(monkeypatch): + # The default synthesizer must summarize the SAME field the inner strategy + # indexes: with_synopsis(Chunked(text_key="body")) summarizes "body", not a + # competing "text" field. Capture the summarized text via a patched default. + seen = [] + import ir.synopsis as _syn + + monkeypatch.setattr( + _syn, + "_default_llm_summarizer", + lambda prompt, model, **kw: (lambda t: (seen.append(t) or "S")), + ) + strat = with_synopsis(ir.Chunked(text_key="body")) # default synth, no id arg + docs = {"a": {"body": "REAL body content here", "text": "WRONG text field"}} + ir.build( + ir.CorpusSource.from_mapping(docs, name="tk", strategy=strat), + store=CorpusStore.memory(), + embedder="light", + ) + assert seen == ["REAL body content here"] + + +def test_unnamed_synthesizer_warns_and_disables_staleness_tracking(): + # A lambda / local closure has no stable identity -> swapping it would be + # silently stale; with_synopsis warns and uses a sentinel id instead. + with pytest.warns(UserWarning, match="stable identity"): + s = with_synopsis(ir.Chunked(), synthesize=lambda a: "x") + assert s.synthesizer_id == "custom" + # a named top-level function has a stable qualname -> tracks, no warning. 
+ import warnings + + with warnings.catch_warnings(): + warnings.simplefilter("error") + s2 = with_synopsis(ir.Chunked(), synthesize=_synopsis_from_field) + assert s2.synthesizer_id == "_synopsis_from_field" + + +def test_with_synopsis_package_prepends_synopsis_before_description_and_routes(): + # On Package (which already has a "description" summary surface), the synopsis + # is prepended ahead of it -> the synopsis is the router. An artifact whose + # synopsis matches (description does not) routes to its readme chunk. + docs = { + "A": { + "name": "alpha", + "description": "terse unrelated blurb", + "readme": "ANSTOK the answer lives in the body here. filler one two.", + "synopsis": "rtok1 rtok2 rtok3 rtok4 routing match", + }, + "B": { + "name": "beta", + "description": "gamma delta", + "readme": "neutral bbbb body.", + "synopsis": "omega phi unrelated", + }, + "C": { + "name": "c", + "description": "x", + "readme": "neutral cccc body.", + "synopsis": "eta theta iota", + }, + } + strat = with_synopsis( + ir.Package(chunk_size=80, overlap=10), + synthesize=_synopsis_from_field, + synthesizer_id="v1", + ) + corpus = ir.build( + ir.CorpusSource.from_mapping(docs, name="pkg", strategy=strat), + store=CorpusStore.memory(), + embedder="light", + ) + kinds = [r.surface_kind for r in records_for_artifact(corpus, "A")] + assert kinds[0] == "synopsis" # prepended ahead of... 
+ assert "description" in kinds # ...the Package description surface + assert "readme_chunk" in kinds + # routes via A's matching synopsis to A's answer chunk (description doesn't match) + trav = ir.traverse( + "rtok1 rtok2 rtok3 rtok4 ANSTOK", + corpus, + policy=ir.collapsed_tree_policy(seed_k=1), + k=5, + ) + assert trav and trav[0].artifact_id == "A" + assert trav[0].surface_kind == "readme_chunk" + assert "ANSTOK" in trav[0].text + + +def test_default_synthesizer_construction_is_lazy_and_offline(monkeypatch): + # Mutation-resistant offline guarantee: poison oa so any use raises, then + # confirm constructing the default synthesizer (and the wrapper) and the + # injected path never reach it. An eager-import regression would fail here. + import sys + import types + + poison = types.ModuleType("oa") + + def _boom(*a, **k): + raise AssertionError("oa.prompt_function reached on an offline path") + + poison.prompt_function = _boom + monkeypatch.setitem(sys.modules, "oa", poison) + + synth = make_llm_synthesizer() # lazy: builds nothing yet + strat = with_synopsis(ir.Chunked()) # default synth: still no oa + assert synth(Artifact("x", "")) == "" # empty text short-circuits the oa path + assert strat.synthesize(Artifact("y", "")) == "" + # an injected summarizer is wholly oa-free even on non-empty text + assert make_llm_synthesizer(summarize=lambda t: "S")(Artifact("z", "body")) == "S" + + +def test_default_synthesizer_id_is_content_stable(): + # The default id is content-derived (prompt/model), so two constructions with + # the same prompt agree — staleness keys on content, not object identity. 
+ a = make_llm_synthesizer() + b = make_llm_synthesizer() + assert a.synthesizer_id == b.synthesizer_id + c = make_llm_synthesizer(prompt="a different prompt {text}") + assert c.synthesizer_id != a.synthesizer_id + + +def test_non_str_synthesize_yields_no_synopsis_surface(): + # The decompose-level guard (independent of make_llm_synthesizer): a synth + # returning a non-str adds no synopsis surface, never crashes the build. + strat = with_synopsis( + ir.Chunked(), synthesize=lambda a: 123, synthesizer_id="nonstr" + ) + corpus = ir.build( + ir.CorpusSource.from_mapping( + {"a": {"text": "body alpha"}}, name="ns", strategy=strat + ), + store=CorpusStore.memory(), + embedder="light", + ) + recs = records_for_artifact(corpus, "a") + assert all(r.surface_kind != "synopsis" for r in recs) + assert any(r.surface_kind == "chunk" for r in recs) + + +def test_file_backed_incremental_round_trip(tmp_path, monkeypatch): + # Staleness through a real file store + reopen: the recursive strategy_id + # (nested inner id + synthesizer id) survives JSON round-trip in the ledger. 
+ monkeypatch.setenv("IR_DATA_DIR", str(tmp_path / "data")) + monkeypatch.setenv("IR_CONFIG_DIR", str(tmp_path / "config")) + monkeypatch.setenv("IR_CACHE_DIR", str(tmp_path / "cache")) + calls = [] + + def synth(artifact): + calls.append(artifact.id) + return f"synopsis of {artifact.id}" + + docs = {"A": {"text": "a body"}, "B": {"text": "b body"}} + + def source(sid): + strat = with_synopsis(ir.Chunked(), synthesize=synth, synthesizer_id=sid) + return ir.CorpusSource.from_mapping(docs, name="syn_fb", strategy=strat) + + ir.build(source("v1"), store=CorpusStore.local("syn_fb"), embedder="light") + assert sorted(calls) == ["A", "B"] + # the synopsis surface persisted to disk and reopens + recs = records_for_artifact(CorpusStore.local("syn_fb"), "A") + assert any(r.surface_kind == "synopsis" for r in recs) + calls.clear() + + ir.build(source("v1"), store=CorpusStore.local("syn_fb"), embedder="light") + assert calls == [] # unchanged rebuild through a fresh handle -> no synthesis + ir.build(source("v2"), store=CorpusStore.local("syn_fb"), embedder="light") + assert sorted(calls) == ["A", "B"] # synthesizer identity changed -> all