From e460b69d9d4d3838eb78a643eadaa0635829f027 Mon Sep 17 00:00:00 2001 From: Roman Pushkin Date: Thu, 11 Jun 2026 09:20:57 -0700 Subject: [PATCH 1/2] fix(weave): exclude op methods from LLMAsAJudgeScorer publish payload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Programmatically created LLMAsAJudgeScorer objects published to an online-scoring monitor carried op refs (score, the inherited summarize, and the nested model's predict) that serialize to CustomWeaveType(Op) and trip the scoring worker's safety guard, so the monitor silently never scored. Building the same scorer in the UI worked. Set _weave_exclude_ops_from_record on LLMAsAJudgeScorer and LLMStructuredCompletionModel — the same opt-out RemoteScorer uses (#7036) — so the published shape matches what the UI already persists. The ops still run and trace at runtime; only the unused stored ref is dropped. SDK-only: a ClassVar is not a pydantic field, so there is no schema change. --- .../test_cases/library_cases.py | 37 +--------- tests/trace/test_llm_as_a_judge_scorer.py | 74 +++++++++++++++++++ weave/scorers/llm_as_a_judge_scorer.py | 8 +- .../llm_structured_model.py | 8 +- 4 files changed, 92 insertions(+), 35 deletions(-) diff --git a/tests/trace/data_serialization/test_cases/library_cases.py b/tests/trace/data_serialization/test_cases/library_cases.py index 302c8508e417..cedbbc1d9ad8 100644 --- a/tests/trace/data_serialization/test_cases/library_cases.py +++ b/tests/trace/data_serialization/test_cases/library_cases.py @@ -79,10 +79,10 @@ def evaluation_equality_check(a, b): # When doing this, replace "llm_as_a_judge_scorer_digest" with the current value of llm_as_a_judge_scorer_digest_for_current_non_legacy_test_on_old_python # Do this, rather than creating a new variable, because each new version of legacy test case will need a different value. llm_as_a_judge_scorer_digest_for_current_non_legacy_test_on_current_python = ( - "2KxvtdkmqyR5ZvsUpuSnH5n7gNu26DzohresdH4sgr4" + "4U7vV5XKCkJ0uOdkflK1O2b6jU3vUY17ZJtoItgc0iA" ) llm_as_a_judge_scorer_digest_for_current_non_legacy_test_on_old_python = ( - "2KxvtdkmqyR5ZvsUpuSnH5n7gNu26DzohresdH4sgr4" + "4U7vV5XKCkJ0uOdkflK1O2b6jU3vUY17ZJtoItgc0iA" ) llm_as_a_judge_scorer_digest = ( llm_as_a_judge_scorer_digest_for_current_non_legacy_test_on_current_python @@ -170,7 +170,7 @@ def evaluation_equality_check(a, b): "name": None, "description": None, "column_map": None, - "model": "weave:///shawn/test-project/object/LLMStructuredCompletionModel:gsLyIHy6h9PE8KVMoKpXcYykXOMQamcLBTvzPU7vNN4", + "model": "weave:///shawn/test-project/object/LLMStructuredCompletionModel:pzXf4DUrjqEMPKQTP4mZnjUp2G7lEGocXS8J1Jk8dqg", "enable_image_input_scoring": True, "enable_audio_input_scoring": True, "enable_video_input_scoring": True, @@ -178,24 +178,13 @@ def evaluation_equality_check(a, b): "$.messages[0].content[1].input_audio" ], "scoring_prompt": "Here are the inputs: {inputs}. Here is the output: {output}. Is the output correct?", - "score": "weave:///shawn/test-project/op/LLMAsAJudgeScorer.score:6xWBXgbLjYI67G1Uvms2dCWP2izbVABBwvqmx00CUT4", - "summarize": "weave:///shawn/test-project/op/Scorer.summarize:LYcmOkxmx4hRYtJ65hnd4uy7jhdYJCnhYXys5aakzfo", "_class_name": "LLMAsAJudgeScorer", "_bases": ["Scorer", "Object", "BaseModel"], }, }, - { - "object_id": "LLMAsAJudgeScorer.score", - "digest": "6xWBXgbLjYI67G1Uvms2dCWP2izbVABBwvqmx00CUT4", - "exp_val": { - "_type": "CustomWeaveType", - "weave_type": {"type": "Op"}, - "files": {"obj.py": "fqXqYs4C4l0HpQOaRfbVXwsvwYUZhMYyn4cvK0wnCMU"}, - }, - }, { "object_id": "LLMStructuredCompletionModel", - "digest": "gsLyIHy6h9PE8KVMoKpXcYykXOMQamcLBTvzPU7vNN4", + "digest": "pzXf4DUrjqEMPKQTP4mZnjUp2G7lEGocXS8J1Jk8dqg", "exp_val": { "_type": "LLMStructuredCompletionModel", "name": None, @@ -228,20 +217,10 @@ def evaluation_equality_check(a, b): "_class_name": "LLMStructuredCompletionModelDefaultParams", "_bases": ["BaseModel"], }, - "predict": "weave:///shawn/test-project/op/LLMStructuredCompletionModel.predict:M6uEk3KmOzZagYl3tJBeoiOHX7opfOQyuqnSguDXjPI", "_class_name": "LLMStructuredCompletionModel", "_bases": ["Model", "Object", "BaseModel"], }, }, - { - "object_id": "LLMStructuredCompletionModel.predict", - "digest": "M6uEk3KmOzZagYl3tJBeoiOHX7opfOQyuqnSguDXjPI", - "exp_val": { - "_type": "CustomWeaveType", - "weave_type": {"type": "Op"}, - "files": {"obj.py": "1GtS3cAyf0xckKcss0LQesVtm44iEG49EsX1xuzTmvc"}, - }, - }, { "object_id": "Evaluation.summarize", "digest": "Y0s05NYTuqlmXieehHPogfq2JXKl4Y1Xgy8CKumdmjI", @@ -291,14 +270,6 @@ def evaluation_equality_check(a, b): "digest": "vY6VtT9xBAKNfqhozgQdWEGuijncPtmZLYKrXexUERY", "exp_content": b'import weave\nfrom weave.object.obj import Object\nfrom weave.trace.table import Table\nfrom weave.flow.util import transpose\nfrom weave.flow.scorer import get_scorer_attributes\nfrom weave.flow.scorer import auto_summarize\nfrom weave.trace.op import op\n\nclass EvaluationResults(Object):\n rows: Table\n\n@weave.op\n@op\nasync def summarize(self, eval_table: EvaluationResults) -> dict:\n eval_table_rows = list(eval_table.rows)\n cols = transpose(eval_table_rows)\n summary = {}\n\n for name, vals in cols.items():\n if name == "scores":\n if scorers := self.scorers:\n for scorer in scorers:\n scorer_attributes = get_scorer_attributes(scorer)\n scorer_name = scorer_attributes.scorer_name\n summarize_fn = scorer_attributes.summarize_fn\n scorer_stats = transpose(vals)\n score_table = scorer_stats[scorer_name]\n scored = summarize_fn(score_table)\n summary[scorer_name] = scored\n else:\n model_output_summary = auto_summarize(vals)\n if model_output_summary:\n summary[name] = model_output_summary\n return summary\n', }, - { - "digest": "fqXqYs4C4l0HpQOaRfbVXwsvwYUZhMYyn4cvK0wnCMU", - "exp_content": b'import weave\nfrom typing import Any\nfrom weave.prompt.prompt import MessagesPrompt\nfrom weave.trace.op import op\n\n@weave.op\n@op\ndef score(self, *, output: str, **kwargs: Any) -> Any:\n """Score the output using the scoring_prompt."""\n if isinstance(self.scoring_prompt, MessagesPrompt):\n model_input = self.scoring_prompt.format(output=output, **kwargs)\n else:\n scoring_prompt = self.scoring_prompt.format(output=output, **kwargs)\n model_input = [{"role": "user", "content": scoring_prompt}]\n return self.model.predict(model_input)\n', - }, - { - "digest": "1GtS3cAyf0xckKcss0LQesVtm44iEG49EsX1xuzTmvc", - "exp_content": b'import weave\nfrom typing import Annotated as MessageListLike\nfrom typing import Annotated as LLMStructuredModelParamsLike\nfrom typing import Any\nfrom weave.trace.context.weave_client_context import get_weave_client\nfrom weave.trace.context.weave_client_context import WeaveInitError\nfrom weave.utils.project_id import to_project_id\nfrom typing import Literal as ResponseFormat\nimport json\nfrom pydantic.main import BaseModel\nfrom weave.trace.op import op\n\nclass Message(BaseModel):\n """A message in a conversation with an LLM.\n\n Attributes:\n role: The role of the message\'s author. Can be: system, user, assistant, function or tool.\n content: The contents of the message. Required for all messages, but may be null for assistant messages with function calls.\n name: The name of the author of the message. Required if role is "function". Must match the name of the function represented in content.\n Can contain characters (a-z, A-Z, 0-9), and underscores, with a maximum length of 64 characters.\n function_call: The name and arguments of a function that should be called, as generated by the model.\n tool_call_id: Tool call that this message is responding to.\n """\n\n role: str\n content: str | list[dict] | None = None\n name: str | None = None\n function_call: dict | None = None\n tool_call_id: str | None = None\n\ndef parse_response(\n response_payload: dict, response_format: ResponseFormat | None\n) -> Message | str | dict[str, Any]:\n """Extract the model output from an LLM completion response payload.\n\n Raises:\n RuntimeError: the provider returned a top-level `error` field.\n ValueError: the payload is malformed (missing choices/message), the\n content is None/empty, or json_object parsing failed.\n """\n if response_payload.get("error"):\n raise RuntimeError(f"LLM API returned an error: {response_payload[\'error\']}")\n\n choices = response_payload.get("choices")\n if not choices:\n raise ValueError(\n "LLM response is missing \'choices\' -> the upstream call likely failed "\n "(invalid API key, content filtering, or provider error). "\n f"Response keys: {sorted(response_payload.keys())}"\n )\n\n message = choices[0].get("message") if isinstance(choices[0], dict) else None\n if not isinstance(message, dict):\n raise TypeError(\n f"LLM response choice did not contain a message dict: {choices[0]!r}"\n )\n content = message.get("content")\n\n if response_format == "text":\n if content is None:\n raise ValueError(\n "LLM response content is None -> the model returned no text. "\n "Check your API key, model config, and content filtering settings."\n )\n return content\n elif response_format == "json_object":\n if content is None or (isinstance(content, str) and not content.strip()):\n raise ValueError(\n "LLM response content was empty when JSON output was requested. "\n "Check your API key and that the model supports JSON mode."\n )\n try:\n return json.loads(content)\n except json.JSONDecodeError as e:\n snippet = content if len(content) <= 200 else content[:200] + "..."\n raise ValueError(\n f"LLM response was not valid JSON (response_format=json_object). "\n f"Content snippet: {snippet!r}"\n ) from e\n else:\n raise ValueError(f"Invalid response_format: {response_format}")\n\n@weave.op\n@op\ndef predict(\n self,\n user_input: MessageListLike | None = None,\n config: LLMStructuredModelParamsLike | None = None,\n **template_vars: Any,\n) -> Message | str | dict[str, Any]:\n """Generates a prediction by preparing messages (template + user_input)\n and calling the LLM completions endpoint with overridden config, using the provided client.\n\n Messages are prepared in one of two ways:\n 1. If default_params.prompt is set, the referenced MessagesPrompt object is\n loaded and its format() method is called with template_vars to generate messages.\n 2. If default_params.messages_template is set (and prompt is not), the template\n messages are used with template variable substitution.\n\n Note: If both prompt and messages_template are provided, prompt takes precedence.\n\n Args:\n user_input: The user input messages to append after template messages\n config: Optional configuration to override default parameters\n **template_vars: Variables to substitute in the messages template using {variable_name} syntax\n """\n if user_input is None:\n user_input = []\n\n current_client = get_weave_client()\n if current_client is None:\n raise WeaveInitError(\n "You must call `weave.init()` first, to predict with a LLMStructuredCompletionModel"\n )\n\n req = self.prepare_completion_request(\n project_id=to_project_id(current_client.entity, current_client.project),\n user_input=user_input,\n config=config,\n **template_vars,\n )\n\n # 5. Call the LLM API\n try:\n api_response = current_client.server.completions_create(req=req)\n except Exception as e:\n raise RuntimeError("Failed to call LLM completions endpoint.") from e\n\n # 6. Extract the message from the API response\n try:\n # The \'response\' attribute of CompletionsCreateRes is a dict\n response_payload = api_response.response\n response_format = (\n req.inputs.response_format.get("type")\n if req.inputs.response_format is not None\n else None\n )\n return parse_response(response_payload, response_format)\n except (\n KeyError,\n IndexError,\n TypeError,\n AttributeError,\n ValueError,\n json.JSONDecodeError,\n ) as e:\n raise RuntimeError(\n f"Failed to extract message from LLM response payload. Response: {api_response.response}"\n ) from e\n', - }, ], # Sad ... equality is really a pain to assert here (and is broken) # TODO: Write a good equality check and make it work diff --git a/tests/trace/test_llm_as_a_judge_scorer.py b/tests/trace/test_llm_as_a_judge_scorer.py index 59f4ef06001e..eadaf45c0081 100644 --- a/tests/trace/test_llm_as_a_judge_scorer.py +++ b/tests/trace/test_llm_as_a_judge_scorer.py @@ -1,8 +1,12 @@ from unittest.mock import patch import weave +from weave.flow.scorer import Scorer from weave.prompt.prompt import MessagesPrompt from weave.scorers import LLMAsAJudgeScorer +from weave.trace.object_record import pydantic_object_record +from weave.trace.refs import ObjectRef +from weave.trace_server import trace_server_interface as tsi from weave.trace_server.interface.builtin_object_classes.builtin_object_registry import ( LLMStructuredCompletionModel, ) @@ -105,3 +109,73 @@ def test_score_with_messages_prompt(): assert len(messages) == 2 assert messages[0]["content"] == "You are a math judge." assert messages[1]["content"] == "Expected: 4, Got: 4" + + +def _make_judge_scorer() -> LLMAsAJudgeScorer: + return LLMAsAJudgeScorer( + model=LLMStructuredCompletionModel( + llm_model_id="gpt-4o-mini", + default_params=LLMStructuredCompletionModelDefaultParams( + response_format="json_object", + ), + ), + scoring_prompt="Output: {output}", + ) + + +def test_llm_as_a_judge_scorer_record_excludes_op_methods(): + """WB-35184: the scorer and its nested model must not record their @op methods. + + Publishing those embeds CustomWeaveType(Op) payloads that the scoring worker + rejects (``_assert_safe_scorer_payload``), so a programmatically created judge + monitor silently never scores. Both classes opt out via + ``_weave_exclude_ops_from_record``; a plain Scorer subclass still records its ops. + """ + scorer = _make_judge_scorer() + + scorer_record = pydantic_object_record(scorer) + assert "score" not in scorer_record.__dict__ + assert "summarize" not in scorer_record.__dict__ + assert scorer_record._class_name == "LLMAsAJudgeScorer" + + model_record = pydantic_object_record(scorer.model) + assert "predict" not in model_record.__dict__ + assert model_record._class_name == "LLMStructuredCompletionModel" + + class _PlainScorer(Scorer): + pass + + plain_record = pydantic_object_record(_PlainScorer(name="plain")) + assert "score" in plain_record.__dict__ + assert "summarize" in plain_record.__dict__ + + +def test_llm_as_a_judge_scorer_publish_has_no_op_refs(client): + """The published payload must carry no op refs, so the scoring worker accepts it. + + The worker walks the payload, follows refs, and fails closed on any nested + CustomWeaveType(Op). Previously the scorer's score/summarize and the nested + model's predict serialized as op refs and tripped that guard (WB-35184). + """ + scorer = _make_judge_scorer() + ref = weave.publish(scorer) + + def stored_val(name: str, digest: str) -> dict: + res = client.server.obj_read( + tsi.ObjReadReq(project_id=client.project_id, object_id=name, digest=digest) + ) + return res.obj.val + + scorer_val = stored_val(ref.name, ref.digest) + assert "score" not in scorer_val + assert "summarize" not in scorer_val + + # The nested model is published as its own ref; resolve and check it too. + model_ref = ObjectRef.parse_uri(scorer_val["model"]) + model_val = stored_val(model_ref.name, model_ref.digest) + assert "predict" not in model_val + + # The scorer still round-trips back to a usable object. + loaded = weave.get(ref.uri) + assert isinstance(loaded, LLMAsAJudgeScorer) + assert isinstance(loaded.model, LLMStructuredCompletionModel) diff --git a/weave/scorers/llm_as_a_judge_scorer.py b/weave/scorers/llm_as_a_judge_scorer.py index 38b943c6de45..d13540718833 100644 --- a/weave/scorers/llm_as_a_judge_scorer.py +++ b/weave/scorers/llm_as_a_judge_scorer.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, ClassVar from pydantic import AliasChoices, ConfigDict, Field, field_validator @@ -29,6 +29,12 @@ class LLMAsAJudgeScorer(Scorer): model_config = ConfigDict(populate_by_name=True) + # Don't serialize score()/summarize() as op refs on publish: the resulting + # CustomWeaveType(Op) payloads are what the scoring worker's safety guard + # rejects, and nothing reads them (the @op still wraps the live method). This + # matches the op-free shape the Weave UI already persists. See WB-35184. + _weave_exclude_ops_from_record: ClassVar[bool] = True + model: LLMStructuredCompletionModel scoring_prompt: str | MessagesPrompt enable_image_input_scoring: bool = False diff --git a/weave/trace_server/interface/builtin_object_classes/llm_structured_model.py b/weave/trace_server/interface/builtin_object_classes/llm_structured_model.py index 0024ee4c3174..ca73404643b7 100644 --- a/weave/trace_server/interface/builtin_object_classes/llm_structured_model.py +++ b/weave/trace_server/interface/builtin_object_classes/llm_structured_model.py @@ -1,5 +1,5 @@ import json -from typing import Annotated, Any, Literal +from typing import Annotated, Any, ClassVar, Literal from pydantic import BaseModel, BeforeValidator, Field @@ -121,6 +121,12 @@ def cast_to_llm_structured_model_params( class LLMStructuredCompletionModel(Model): + # Don't serialize predict() as an op ref on publish: nested inside a + # published LLMAsAJudgeScorer it embeds a CustomWeaveType(Op) payload the + # scoring worker's safety guard rejects, and nothing reads the ref (the @op + # still wraps the live method). See WB-35184. + _weave_exclude_ops_from_record: ClassVar[bool] = True + # / or ref to a provider model llm_model_id: str | base_object_def.RefStr From 7f4d56a178db6f56d3f4ff799331434fd0aea1a8 Mon Sep 17 00:00:00 2001 From: Roman Pushkin Date: Fri, 12 Jun 2026 12:33:03 -0700 Subject: [PATCH 2/2] test(weave): snapshot pre-change LLMAsAJudge serialization as legacy case The serialization tests require copying a case to a new is_legacy=True case before modifying the live one, so the prior wire format stays covered as a deserialization contract. The op-excluding change modified the live case but skipped that snapshot step; add it as legacy v6 (the with-ops shape). It passes, confirming data written by older clients still round-trips under the new code. --- .../test_cases/library_cases.py | 213 ++++++++++++++++++ 1 file changed, 213 insertions(+) diff --git a/tests/trace/data_serialization/test_cases/library_cases.py b/tests/trace/data_serialization/test_cases/library_cases.py index 44e109f724d4..b13a778ecd68 100644 --- a/tests/trace/data_serialization/test_cases/library_cases.py +++ b/tests/trace/data_serialization/test_cases/library_cases.py @@ -276,6 +276,219 @@ def evaluation_equality_check(a, b): equality_check=lambda a, b: True, python_version_code_capture=(3, 13), ), + SerializationTestCase( + id="Library Objects - Scorer, Evaluation, Dataset, LLMAsAJudgeScorer, LLMStructuredCompletionModel (legacy v6)", + runtime_object_factory=make_evaluation, + inline_call_param=False, + is_legacy=True, + exp_json={ + "_type": "Evaluation", + "name": None, + "description": None, + "dataset": "weave:///shawn/test-project/object/Dataset:YLYVrBqCtlMOa770T1oPssqYnf9rgqdnY5hVCwRcrm8", + "scorers": [ + "weave:///shawn/test-project/object/MyScorer:erh2OhYuvmiYF5MAHd2iNbtJkiAfvkHYn5l78CV6XrU", + "weave:///shawn/test-project/object/LLMAsAJudgeScorer:usU9eU7is5YeNlwmYcSOHYfjJB8xHGCXXUVpm6dBbfc", + ], + "preprocess_model_input": None, + "trials": 1, + "metadata": None, + "evaluation_name": None, + "evaluate": "weave:///shawn/test-project/op/Evaluation.evaluate:vvs7uu17cnFTlOPIrYnneU8AVfSihXwDk0kMHf6w6cU", + "predict_and_score": "weave:///shawn/test-project/op/Evaluation.predict_and_score:jd4m1EJuNnrGmHeiGY1T2CUngsk9x7knOgRJ2sYpU2g", + "summarize": "weave:///shawn/test-project/op/Evaluation.summarize:Y0s05NYTuqlmXieehHPogfq2JXKl4Y1Xgy8CKumdmjI", + "_class_name": "Evaluation", + "_bases": ["Object", "BaseModel"], + }, + exp_objects=[ + { + "object_id": "Dataset", + "digest": "YLYVrBqCtlMOa770T1oPssqYnf9rgqdnY5hVCwRcrm8", + "exp_val": { + "_type": "Dataset", + "name": None, + "description": None, + "rows": "weave:///shawn/test-project/table/97126095885a61df726e0d1d6197db7c55784b083b33a2c10e6ca8e0a1d4889e", + "_class_name": "Dataset", + "_bases": ["Object", "BaseModel"], + }, + }, + { + "object_id": "Evaluation.evaluate", + "digest": "vvs7uu17cnFTlOPIrYnneU8AVfSihXwDk0kMHf6w6cU", + "exp_val": { + "_type": "CustomWeaveType", + "weave_type": {"type": "Op"}, + # Updated with G004 lint fix (f-string -> %s formatting in logging) + "files": {"obj.py": "qmkYo6tZ2imoZbCzmOJlpsZq0T3mTOQAm5NJVYnXYHQ"}, + }, + }, + { + "object_id": "Evaluation.predict_and_score", + "digest": "jd4m1EJuNnrGmHeiGY1T2CUngsk9x7knOgRJ2sYpU2g", + "exp_val": { + "_type": "CustomWeaveType", + "weave_type": {"type": "Op"}, + "files": {"obj.py": "ZHD4K7uUDPT93NdVQO3I6F9Xah9AEceWYBSQXg1bZPM"}, + }, + }, + { + "object_id": "MyScorer", + "digest": "erh2OhYuvmiYF5MAHd2iNbtJkiAfvkHYn5l78CV6XrU", + "exp_val": { + "_type": "MyScorer", + "name": None, + "description": None, + "column_map": None, + "score": "weave:///shawn/test-project/op/MyScorer.score:lwLZn8tYQ025uYUv8SPwa1TlVfWSbzVSyw4aDynz1yQ", + "summarize": "weave:///shawn/test-project/op/Scorer.summarize:R9dPVXqD4IgSlmmGS5RA8uQPehMlvQ0CHRJMEMf1AMQ", + "_class_name": "MyScorer", + "_bases": ["Scorer", "Object", "BaseModel"], + }, + }, + { + "object_id": "LLMAsAJudgeScorer", + "digest": "usU9eU7is5YeNlwmYcSOHYfjJB8xHGCXXUVpm6dBbfc", + "exp_val": { + "_type": "LLMAsAJudgeScorer", + "name": None, + "description": None, + "column_map": None, + "model": "weave:///shawn/test-project/object/LLMStructuredCompletionModel:gsLyIHy6h9PE8KVMoKpXcYykXOMQamcLBTvzPU7vNN4", + "enable_image_input_scoring": True, + "enable_audio_input_scoring": True, + "enable_video_input_scoring": True, + "media_scoring_json_paths": [ + "$.messages[0].content[1].input_audio" + ], + "scoring_prompt": "Here are the inputs: {inputs}. Here is the output: {output}. Is the output correct?", + "score": "weave:///shawn/test-project/op/LLMAsAJudgeScorer.score:6xWBXgbLjYI67G1Uvms2dCWP2izbVABBwvqmx00CUT4", + "summarize": "weave:///shawn/test-project/op/Scorer.summarize:R9dPVXqD4IgSlmmGS5RA8uQPehMlvQ0CHRJMEMf1AMQ", + "_class_name": "LLMAsAJudgeScorer", + "_bases": ["Scorer", "Object", "BaseModel"], + }, + }, + { + "object_id": "LLMAsAJudgeScorer.score", + "digest": "6xWBXgbLjYI67G1Uvms2dCWP2izbVABBwvqmx00CUT4", + "exp_val": { + "_type": "CustomWeaveType", + "weave_type": {"type": "Op"}, + "files": {"obj.py": "fqXqYs4C4l0HpQOaRfbVXwsvwYUZhMYyn4cvK0wnCMU"}, + }, + }, + { + "object_id": "LLMStructuredCompletionModel", + "digest": "gsLyIHy6h9PE8KVMoKpXcYykXOMQamcLBTvzPU7vNN4", + "exp_val": { + "_type": "LLMStructuredCompletionModel", + "name": None, + "description": None, + "llm_model_id": "gpt-4o-mini", + "default_params": { + "_type": "LLMStructuredCompletionModelDefaultParams", + "messages_template": [ + { + "_type": "Message", + "role": "system", + "content": "You are a judge, respond with json. 'score' (0-1), 'reasoning' (string)", + "name": None, + "function_call": None, + "tool_call_id": None, + "_class_name": "Message", + "_bases": ["BaseModel"], + } + ], + "prompt": None, + "temperature": None, + "top_p": None, + "max_tokens": None, + "presence_penalty": None, + "frequency_penalty": None, + "stop": None, + "n_times": None, + "functions": None, + "response_format": "json_object", + "_class_name": "LLMStructuredCompletionModelDefaultParams", + "_bases": ["BaseModel"], + }, + "predict": "weave:///shawn/test-project/op/LLMStructuredCompletionModel.predict:M6uEk3KmOzZagYl3tJBeoiOHX7opfOQyuqnSguDXjPI", + "_class_name": "LLMStructuredCompletionModel", + "_bases": ["Model", "Object", "BaseModel"], + }, + }, + { + "object_id": "LLMStructuredCompletionModel.predict", + "digest": "M6uEk3KmOzZagYl3tJBeoiOHX7opfOQyuqnSguDXjPI", + "exp_val": { + "_type": "CustomWeaveType", + "weave_type": {"type": "Op"}, + "files": {"obj.py": "1GtS3cAyf0xckKcss0LQesVtm44iEG49EsX1xuzTmvc"}, + }, + }, + { + "object_id": "Evaluation.summarize", + "digest": "Y0s05NYTuqlmXieehHPogfq2JXKl4Y1Xgy8CKumdmjI", + "exp_val": { + "_type": "CustomWeaveType", + "weave_type": {"type": "Op"}, + "files": {"obj.py": "vY6VtT9xBAKNfqhozgQdWEGuijncPtmZLYKrXexUERY"}, + }, + }, + { + "object_id": "MyScorer.score", + "digest": "lwLZn8tYQ025uYUv8SPwa1TlVfWSbzVSyw4aDynz1yQ", + "exp_val": { + "_type": "CustomWeaveType", + "weave_type": {"type": "Op"}, + "files": {"obj.py": "Y7lSNR7UXFYVtxWyD8GOE3CFXRWfdLX2n1mcYfbSErs"}, + }, + }, + { + "object_id": "Scorer.summarize", + "digest": "R9dPVXqD4IgSlmmGS5RA8uQPehMlvQ0CHRJMEMf1AMQ", + "exp_val": { + "_type": "CustomWeaveType", + "weave_type": {"type": "Op"}, + "files": {"obj.py": "kxYDFAafHpBRX2O9hujrELhbFm3pGR6sAhzTfE1JdwA"}, + }, + }, + ], + exp_files=[ + { + "digest": "qmkYo6tZ2imoZbCzmOJlpsZq0T3mTOQAm5NJVYnXYHQ", + "exp_content": b'import weave\nfrom weave.trace.op_protocol import Op\nfrom weave.flow.model import Model\nimport json\nfrom weave.trace.op import op\nfrom weave.trace.call import Call\nfrom datetime import datetime\nfrom weave.flow.util import make_memorable_name\n\ndef _safe_summarize_to_str(summary: dict) -> str:\n summary_str = ""\n try:\n summary_str = json.dumps(summary, indent=2)\n except Exception:\n try:\n summary_str = str(summary)\n except Exception:\n pass\n return summary_str\n\nlogger = ""\n\ndef default_evaluation_display_name(call: Call) -> str:\n date = datetime.now().strftime("%Y-%m-%d")\n unique_name = make_memorable_name()\n return f"eval-{date}-{unique_name}"\n\n@weave.op\n@op(call_display_name=default_evaluation_display_name, eager_call_start=True)\nasync def evaluate(self, model: Op | Model) -> dict:\n eval_results = await self.get_eval_results(model)\n summary = await self.summarize(eval_results)\n\n summary_str = _safe_summarize_to_str(summary)\n if summary_str:\n logger.info("Evaluation summary %s", summary_str)\n\n return summary\n', + }, + { + "digest": "Y7lSNR7UXFYVtxWyD8GOE3CFXRWfdLX2n1mcYfbSErs", + "exp_content": b"import weave\n\n@weave.op\ndef score(self, user_input: str, output: str) -> str:\n return user_input in output\n", + }, + { + "digest": "kxYDFAafHpBRX2O9hujrELhbFm3pGR6sAhzTfE1JdwA", + "exp_content": b'import weave\nfrom numbers import Number\nfrom typing import Any\nfrom pydantic.main import BaseModel\nfrom weave.trace.op import op\n\ndef _import_numpy() -> Any | None:\n try:\n import numpy\n except ImportError:\n return None\n return numpy\n\ndef auto_summarize(data: list) -> dict[str, Any] | None:\n """Automatically summarize a list of (potentially nested) dicts.\n\n Computes:\n - avg for numeric cols\n - count and fraction for boolean cols\n - other col types are ignored\n\n If col is all None, result is None\n\n Returns:\n dict of summary stats, with structure matching input dict structure.\n """\n if not data:\n return {}\n data = [x for x in data if x is not None]\n\n if not data:\n return None\n\n val = data[0]\n\n if isinstance(val, bool):\n return {\n "true_count": (true_count := sum(1 for x in data if x)),\n "true_fraction": true_count / len(data),\n }\n elif isinstance(val, Number):\n if np := _import_numpy():\n return {"mean": np.mean(data).item()}\n else:\n return {"mean": sum(data) / len(data)}\n elif isinstance(val, dict):\n result = {}\n all_keys = list(\n dict.fromkeys([k for d in data if isinstance(d, dict) for k in d.keys()])\n )\n for k in all_keys:\n if (\n summary := auto_summarize(\n [x.get(k) for x in data if isinstance(x, dict)]\n )\n ) is not None:\n if k in summary:\n result.update(summary)\n else:\n result[k] = summary\n if not result:\n return None\n return result\n elif isinstance(val, BaseModel):\n return auto_summarize(\n [x.model_dump() if isinstance(x, BaseModel) else x for x in data]\n )\n return None\n\n@weave.op\n@op\ndef summarize(self, score_rows: list) -> dict | None:\n return auto_summarize(score_rows)\n', + }, + { + "digest": "ZHD4K7uUDPT93NdVQO3I6F9Xah9AEceWYBSQXg1bZPM", + "exp_content": b'import weave\nfrom weave.trace.op_protocol import Op\nfrom weave.flow.model import Model\nfrom weave.flow.model import apply_model_async\nfrom weave.flow.model import ApplyModelError\nimport asyncio\nfrom weave.flow.scorer import get_scorer_attributes\nfrom weave.trace.op import op\n\n@weave.op\n@op\nasync def predict_and_score(self, model: Op | Model, example: dict) -> dict:\n apply_model_result = await apply_model_async(\n model, example, self.preprocess_model_input\n )\n\n if isinstance(apply_model_result, ApplyModelError):\n return {\n self._output_key: None,\n "scores": {},\n "model_latency": apply_model_result.model_latency,\n }\n\n model_output = apply_model_result.model_output\n model_call = apply_model_result.model_call\n model_latency = apply_model_result.model_latency\n\n scores = {}\n if scorers := self.scorers:\n # Run all scorer calls in parallel\n scorer_tasks = [\n model_call.apply_scorer(scorer, example) for scorer in scorers\n ]\n apply_scorer_results = await asyncio.gather(*scorer_tasks)\n\n # Process results and build scores dict\n for scorer, apply_scorer_result in zip(\n scorers, apply_scorer_results, strict=False\n ):\n result = apply_scorer_result.result\n scorer_attributes = get_scorer_attributes(scorer)\n scorer_name = scorer_attributes.scorer_name\n scores[scorer_name] = result\n\n return {\n self._output_key: model_output,\n "scores": scores,\n "model_latency": model_latency,\n }\n', + }, + { + "digest": "vY6VtT9xBAKNfqhozgQdWEGuijncPtmZLYKrXexUERY", + "exp_content": b'import weave\nfrom weave.object.obj import Object\nfrom weave.trace.table import Table\nfrom weave.flow.util import transpose\nfrom weave.flow.scorer import get_scorer_attributes\nfrom weave.flow.scorer import auto_summarize\nfrom weave.trace.op import op\n\nclass EvaluationResults(Object):\n rows: Table\n\n@weave.op\n@op\nasync def summarize(self, eval_table: EvaluationResults) -> dict:\n eval_table_rows = list(eval_table.rows)\n cols = transpose(eval_table_rows)\n summary = {}\n\n for name, vals in cols.items():\n if name == "scores":\n if scorers := self.scorers:\n for scorer in scorers:\n scorer_attributes = get_scorer_attributes(scorer)\n scorer_name = scorer_attributes.scorer_name\n summarize_fn = scorer_attributes.summarize_fn\n scorer_stats = transpose(vals)\n score_table = scorer_stats[scorer_name]\n scored = summarize_fn(score_table)\n summary[scorer_name] = scored\n else:\n model_output_summary = auto_summarize(vals)\n if model_output_summary:\n summary[name] = model_output_summary\n return summary\n', + }, + { + "digest": "fqXqYs4C4l0HpQOaRfbVXwsvwYUZhMYyn4cvK0wnCMU", + "exp_content": b'import weave\nfrom typing import Any\nfrom weave.prompt.prompt import MessagesPrompt\nfrom weave.trace.op import op\n\n@weave.op\n@op\ndef score(self, *, output: str, **kwargs: Any) -> Any:\n """Score the output using the scoring_prompt."""\n if isinstance(self.scoring_prompt, MessagesPrompt):\n model_input = self.scoring_prompt.format(output=output, **kwargs)\n else:\n scoring_prompt = self.scoring_prompt.format(output=output, **kwargs)\n model_input = [{"role": "user", "content": scoring_prompt}]\n return self.model.predict(model_input)\n', + }, + { + "digest": "1GtS3cAyf0xckKcss0LQesVtm44iEG49EsX1xuzTmvc", + "exp_content": b'import weave\nfrom typing import Annotated as MessageListLike\nfrom typing import Annotated as LLMStructuredModelParamsLike\nfrom typing import Any\nfrom weave.trace.context.weave_client_context import get_weave_client\nfrom weave.trace.context.weave_client_context import WeaveInitError\nfrom weave.utils.project_id import to_project_id\nfrom typing import Literal as ResponseFormat\nimport json\nfrom pydantic.main import BaseModel\nfrom weave.trace.op import op\n\nclass Message(BaseModel):\n """A message in a conversation with an LLM.\n\n Attributes:\n role: The role of the message\'s author. Can be: system, user, assistant, function or tool.\n content: The contents of the message. Required for all messages, but may be null for assistant messages with function calls.\n name: The name of the author of the message. Required if role is "function". Must match the name of the function represented in content.\n Can contain characters (a-z, A-Z, 0-9), and underscores, with a maximum length of 64 characters.\n function_call: The name and arguments of a function that should be called, as generated by the model.\n tool_call_id: Tool call that this message is responding to.\n """\n\n role: str\n content: str | list[dict] | None = None\n name: str | None = None\n function_call: dict | None = None\n tool_call_id: str | None = None\n\ndef parse_response(\n response_payload: dict, response_format: ResponseFormat | None\n) -> Message | str | dict[str, Any]:\n """Extract the model output from an LLM completion response payload.\n\n Raises:\n RuntimeError: the provider returned a top-level `error` field.\n ValueError: the payload is malformed (missing choices/message), the\n content is None/empty, or json_object parsing failed.\n """\n if response_payload.get("error"):\n raise RuntimeError(f"LLM API returned an error: {response_payload[\'error\']}")\n\n choices = response_payload.get("choices")\n if not choices:\n raise ValueError(\n "LLM response is missing \'choices\' -> the upstream call likely failed "\n "(invalid API key, content filtering, or provider error). "\n f"Response keys: {sorted(response_payload.keys())}"\n )\n\n message = choices[0].get("message") if isinstance(choices[0], dict) else None\n if not isinstance(message, dict):\n raise TypeError(\n f"LLM response choice did not contain a message dict: {choices[0]!r}"\n )\n content = message.get("content")\n\n if response_format == "text":\n if content is None:\n raise ValueError(\n "LLM response content is None -> the model returned no text. "\n "Check your API key, model config, and content filtering settings."\n )\n return content\n elif response_format == "json_object":\n if content is None or (isinstance(content, str) and not content.strip()):\n raise ValueError(\n "LLM response content was empty when JSON output was requested. "\n "Check your API key and that the model supports JSON mode."\n )\n try:\n return json.loads(content)\n except json.JSONDecodeError as e:\n snippet = content if len(content) <= 200 else content[:200] + "..."\n raise ValueError(\n f"LLM response was not valid JSON (response_format=json_object). "\n f"Content snippet: {snippet!r}"\n ) from e\n else:\n raise ValueError(f"Invalid response_format: {response_format}")\n\n@weave.op\n@op\ndef predict(\n self,\n user_input: MessageListLike | None = None,\n config: LLMStructuredModelParamsLike | None = None,\n **template_vars: Any,\n) -> Message | str | dict[str, Any]:\n """Generates a prediction by preparing messages (template + user_input)\n and calling the LLM completions endpoint with overridden config, using the provided client.\n\n Messages are prepared in one of two ways:\n 1. If default_params.prompt is set, the referenced MessagesPrompt object is\n loaded and its format() method is called with template_vars to generate messages.\n 2. If default_params.messages_template is set (and prompt is not), the template\n messages are used with template variable substitution.\n\n Note: If both prompt and messages_template are provided, prompt takes precedence.\n\n Args:\n user_input: The user input messages to append after template messages\n config: Optional configuration to override default parameters\n **template_vars: Variables to substitute in the messages template using {variable_name} syntax\n """\n if user_input is None:\n user_input = []\n\n current_client = get_weave_client()\n if current_client is None:\n raise WeaveInitError(\n "You must call `weave.init()` first, to predict with a LLMStructuredCompletionModel"\n )\n\n req = self.prepare_completion_request(\n project_id=to_project_id(current_client.entity, current_client.project),\n user_input=user_input,\n config=config,\n **template_vars,\n )\n\n # 5. Call the LLM API\n try:\n api_response = current_client.server.completions_create(req=req)\n except Exception as e:\n raise RuntimeError("Failed to call LLM completions endpoint.") from e\n\n # 6. Extract the message from the API response\n try:\n # The \'response\' attribute of CompletionsCreateRes is a dict\n response_payload = api_response.response\n response_format = (\n req.inputs.response_format.get("type")\n if req.inputs.response_format is not None\n else None\n )\n return parse_response(response_payload, response_format)\n except (\n KeyError,\n IndexError,\n TypeError,\n AttributeError,\n ValueError,\n json.JSONDecodeError,\n ) as e:\n raise RuntimeError(\n f"Failed to extract message from LLM response payload. Response: {api_response.response}"\n ) from e\n', + }, + ], + # Sad ... equality is really a pain to assert here (and is broken) + # TODO: Write a good equality check and make it work + equality_check=lambda a, b: True, + python_version_code_capture=(3, 13), + ), SerializationTestCase( id="Library Objects - Scorer, Evaluation, Dataset, LLMAsAJudgeScorer, LLMStructuredCompletionModel (legacy v5)", runtime_object_factory=make_evaluation,