From aeba9422084b1c951abe8afec19750154232bc5a Mon Sep 17 00:00:00 2001 From: Mike Scavezze <244614145+mscavezze-cw@users.noreply.github.com> Date: Thu, 11 Jun 2026 15:37:49 -0700 Subject: [PATCH 1/5] Update samples to use the new feedback schema --- examples/remote_scorer/README.md | 22 ++++++++++++++++++---- examples/remote_scorer/scoring_logic.py | 6 +++++- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/examples/remote_scorer/README.md b/examples/remote_scorer/README.md index da98ec92d4c2..f94c6c7a9f92 100644 --- a/examples/remote_scorer/README.md +++ b/examples/remote_scorer/README.md @@ -40,7 +40,16 @@ The endpoint must return HTTP 200 with a JSON object. The required top-level contract fields are: - `schema_version`: required integer; must be `1`. -- `result`: required; can be any JSON-serializable shape you choose. +- `result`: required structured scorer output. By default, Weave requires this + to match the scorer result schema so feedback can be stored in typed scorer + columns. + +The simplest structured result is one score object: + +- `value`: required; either a tag string, max 36 characters, or a numeric rating + from `0.0` to `1.0`. +- `reason`: optional string explaining the score. +- `confidence`: optional numeric confidence from `0.0` to `1.0`. For example: @@ -48,13 +57,17 @@ For example: { "schema_version": 1, "result": { - "message_length": 32 + "value": 0.32, + "reason": "Message length is 32 characters.", + "confidence": 1.0 } } ``` Non-200 responses are treated as scorer failures by Weave. The `result` value is -the scorer output that Weave records as feedback. +the scorer output that Weave records as feedback. Weave also accepts multiple +structured score objects as either a bare list, for example +`[{"value": "short"}, {"value": 0.32}]`, or under a `scores` key. ## Auth @@ -191,5 +204,6 @@ adopting this exact FastAPI app. The important production requirements are: - HTTPS endpoint reachable from Weave. - Host allowlist configured if the Weave deployment requires it. - Bearer-token validation implemented with your identity/security standards. -- HTTP 200 response body shaped as `{"schema_version": 1, "result": ...}`. +- HTTP 200 response body shaped as + `{"schema_version": 1, "result": {"value": 0.9, "reason": "...", "confidence": 1.0}}`. - Optional dedupe uses `Idempotency-Key`. diff --git a/examples/remote_scorer/scoring_logic.py b/examples/remote_scorer/scoring_logic.py index 38f911ac8f74..b76001064467 100644 --- a/examples/remote_scorer/scoring_logic.py +++ b/examples/remote_scorer/scoring_logic.py @@ -25,7 +25,11 @@ def score_remote_call(request_body: dict[str, Any]) -> dict[str, Any]: message = inputs.get("message", "") if isinstance(inputs, dict) else "" if not isinstance(message, str): message = "" + message_length = len(message) + rating = min(message_length / 100, 1.0) return { - "message_length": len(message), + "value": round(rating, 2), + "reason": f"Message length is {message_length} characters.", + "confidence": 1.0, } From 55aba8983c7bdcf96cb600d90cf91b6526b3d265 Mon Sep 17 00:00:00 2001 From: Mike Scavezze <244614145+mscavezze-cw@users.noreply.github.com> Date: Fri, 12 Jun 2026 13:41:35 -0700 Subject: [PATCH 2/5] Remove details about requirements, implementation details, and excess options. Add example of multiple scores. --- examples/remote_scorer/README.md | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/examples/remote_scorer/README.md b/examples/remote_scorer/README.md index f94c6c7a9f92..f1313f9c6b71 100644 --- a/examples/remote_scorer/README.md +++ b/examples/remote_scorer/README.md @@ -40,9 +40,7 @@ The endpoint must return HTTP 200 with a JSON object. The required top-level contract fields are: - `schema_version`: required integer; must be `1`. -- `result`: required structured scorer output. By default, Weave requires this - to match the scorer result schema so feedback can be stored in typed scorer - columns. +- `result`: required structured scorer output. The simplest structured result is one score object: @@ -65,9 +63,26 @@ For example: ``` Non-200 responses are treated as scorer failures by Weave. The `result` value is -the scorer output that Weave records as feedback. Weave also accepts multiple -structured score objects as either a bare list, for example -`[{"value": "short"}, {"value": 0.32}]`, or under a `scores` key. +the scorer output that Weave records as feedback. If your scorer returns multiple +scores, provide a list of score objects: + +```json +{ + "schema_version": 1, + "result": [ + { + "value": "concise", + "reason": "The response is brief and focused.", + "confidence": 0.9 + }, + { + "value": 0.82, + "reason": "The response directly answers the request.", + "confidence": 0.8 + } + ] +} +``` ## Auth From a118ae5e68b89cf3bb285ac48e083f78545de3d3 Mon Sep 17 00:00:00 2001 From: Mike Scavezze <244614145+mscavezze-cw@users.noreply.github.com> Date: Fri, 12 Jun 2026 13:44:41 -0700 Subject: [PATCH 3/5] conciseness score --- examples/remote_scorer/README.md | 4 ++-- examples/remote_scorer/scoring_logic.py | 16 ++++++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/examples/remote_scorer/README.md b/examples/remote_scorer/README.md index f1313f9c6b71..03b462e39f91 100644 --- a/examples/remote_scorer/README.md +++ b/examples/remote_scorer/README.md @@ -55,8 +55,8 @@ For example: { "schema_version": 1, "result": { - "value": 0.32, - "reason": "Message length is 32 characters.", + "value": 1.0, + "reason": "Message is 32 characters; concise messages score best.", "confidence": 1.0 } } diff --git a/examples/remote_scorer/scoring_logic.py b/examples/remote_scorer/scoring_logic.py index b76001064467..cb65fc8ebf0b 100644 --- a/examples/remote_scorer/scoring_logic.py +++ b/examples/remote_scorer/scoring_logic.py @@ -5,6 +5,8 @@ from typing import Any REMOTE_SCORER_SCHEMA_VERSION = 1 +CONCISE_MESSAGE_LENGTH = 120 +TOO_LONG_MESSAGE_LENGTH = 500 def score_remote_call(request_body: dict[str, Any]) -> dict[str, Any]: @@ -26,10 +28,20 @@ def score_remote_call(request_body: dict[str, Any]) -> dict[str, Any]: if not isinstance(message, str): message = "" message_length = len(message) - rating = min(message_length / 100, 1.0) + if message_length <= CONCISE_MESSAGE_LENGTH: + rating = 1.0 + elif message_length >= TOO_LONG_MESSAGE_LENGTH: + rating = 0.0 + else: + rating = 1 - ( + (message_length - CONCISE_MESSAGE_LENGTH) + / (TOO_LONG_MESSAGE_LENGTH - CONCISE_MESSAGE_LENGTH) + ) return { "value": round(rating, 2), - "reason": f"Message length is {message_length} characters.", + "reason": ( + f"Message is {message_length} characters; concise messages score best." + ), "confidence": 1.0, } From 2786cf6070b1f8778d955f9d17660f79083f5a62 Mon Sep 17 00:00:00 2001 From: Mike Scavezze <244614145+mscavezze-cw@users.noreply.github.com> Date: Fri, 12 Jun 2026 14:10:14 -0700 Subject: [PATCH 4/5] multiple scores --- examples/remote_scorer/README.md | 28 ++++++++------ examples/remote_scorer/scoring_logic.py | 51 +++++++++++++++++++------ 2 files changed, 56 insertions(+), 23 deletions(-) diff --git a/examples/remote_scorer/README.md b/examples/remote_scorer/README.md index 03b462e39f91..8833b7c421a4 100644 --- a/examples/remote_scorer/README.md +++ b/examples/remote_scorer/README.md @@ -49,41 +49,47 @@ The simplest structured result is one score object: - `reason`: optional string explaining the score. - `confidence`: optional numeric confidence from `0.0` to `1.0`. -For example: +For a single-score response: ```json { "schema_version": 1, "result": { "value": 1.0, - "reason": "Message is 32 characters; concise messages score best.", - "confidence": 1.0 + "reason": "The response is clear and concise.", + "confidence": 0.9 } } ``` -Non-200 responses are treated as scorer failures by Weave. The `result` value is -the scorer output that Weave records as feedback. If your scorer returns multiple -scores, provide a list of score objects: +This sample returns two numeric ratings and one tag: ```json { "schema_version": 1, "result": [ { - "value": "concise", - "reason": "The response is brief and focused.", + "value": 1.0, + "reason": "Message is 32 characters; concise messages score best.", + "confidence": 1.0 + }, + { + "value": 0.4, + "reason": "Minimum useful detail is 80 characters; message is 32 characters.", "confidence": 0.9 }, { - "value": 0.82, - "reason": "The response directly answers the request.", - "confidence": 0.8 + "value": "too-short", + "reason": "Message length category is too-short.", + "confidence": 0.9 } ] } ``` +Non-200 responses are treated as scorer failures by Weave. The `result` value is +the scorer output that Weave records as feedback. + ## Auth Use explicit per-scorer auth in production. This sample supports two Weave diff --git a/examples/remote_scorer/scoring_logic.py b/examples/remote_scorer/scoring_logic.py index cb65fc8ebf0b..b14db436791b 100644 --- a/examples/remote_scorer/scoring_logic.py +++ b/examples/remote_scorer/scoring_logic.py @@ -6,10 +6,11 @@ REMOTE_SCORER_SCHEMA_VERSION = 1 CONCISE_MESSAGE_LENGTH = 120 +MIN_USEFUL_DETAIL_LENGTH = 80 TOO_LONG_MESSAGE_LENGTH = 500 -def score_remote_call(request_body: dict[str, Any]) -> dict[str, Any]: +def score_remote_call(request_body: dict[str, Any]) -> list[dict[str, Any]]: """Score one Weave remote scorer request. Replace this function with your real policy, model, or business logic. It is @@ -28,20 +29,46 @@ def score_remote_call(request_body: dict[str, Any]) -> dict[str, Any]: if not isinstance(message, str): message = "" message_length = len(message) - if message_length <= CONCISE_MESSAGE_LENGTH: - rating = 1.0 + + if message_length < MIN_USEFUL_DETAIL_LENGTH: + conciseness_rating = 1.0 + detail_rating = message_length / MIN_USEFUL_DETAIL_LENGTH + length_tag = "too-short" + elif message_length <= CONCISE_MESSAGE_LENGTH: + conciseness_rating = 1.0 + detail_rating = 1.0 + length_tag = "concise" elif message_length >= TOO_LONG_MESSAGE_LENGTH: - rating = 0.0 + conciseness_rating = 0.0 + detail_rating = 1.0 + length_tag = "too-long" else: - rating = 1 - ( + conciseness_rating = 1 - ( (message_length - CONCISE_MESSAGE_LENGTH) / (TOO_LONG_MESSAGE_LENGTH - CONCISE_MESSAGE_LENGTH) ) + detail_rating = 1.0 + length_tag = "verbose" - return { - "value": round(rating, 2), - "reason": ( - f"Message is {message_length} characters; concise messages score best." - ), - "confidence": 1.0, - } + return [ + { + "value": round(conciseness_rating, 2), + "reason": ( + f"Message is {message_length} characters; concise messages score best." + ), + "confidence": 1.0, + }, + { + "value": round(detail_rating, 2), + "reason": ( + f"Minimum useful detail is {MIN_USEFUL_DETAIL_LENGTH} characters; " + f"message is {message_length} characters." + ), + "confidence": 0.9, + }, + { + "value": length_tag, + "reason": f"Message length category is {length_tag}.", + "confidence": 0.9, + }, + ] From ead7cf5fa4c79108a96bf5e17376bb995156de2a Mon Sep 17 00:00:00 2001 From: Mike Scavezze <244614145+mscavezze-cw@users.noreply.github.com> Date: Fri, 12 Jun 2026 14:37:51 -0700 Subject: [PATCH 5/5] single numeric rating --- examples/remote_scorer/README.md | 11 +++-------- examples/remote_scorer/scoring_logic.py | 18 +----------------- 2 files changed, 4 insertions(+), 25 deletions(-) diff --git a/examples/remote_scorer/README.md b/examples/remote_scorer/README.md index 8833b7c421a4..66cb6b21fdc3 100644 --- a/examples/remote_scorer/README.md +++ b/examples/remote_scorer/README.md @@ -62,7 +62,7 @@ For a single-score response: } ``` -This sample returns two numeric ratings and one tag: +This sample returns one numeric rating and one tag: ```json { @@ -74,13 +74,8 @@ This sample returns two numeric ratings and one tag: "confidence": 1.0 }, { - "value": 0.4, - "reason": "Minimum useful detail is 80 characters; message is 32 characters.", - "confidence": 0.9 - }, - { - "value": "too-short", - "reason": "Message length category is too-short.", + "value": "concise", + "reason": "Message length category is concise.", "confidence": 0.9 } ] diff --git a/examples/remote_scorer/scoring_logic.py b/examples/remote_scorer/scoring_logic.py index b14db436791b..eaf48a3ac7b5 100644 --- a/examples/remote_scorer/scoring_logic.py +++ b/examples/remote_scorer/scoring_logic.py @@ -6,7 +6,6 @@ REMOTE_SCORER_SCHEMA_VERSION = 1 CONCISE_MESSAGE_LENGTH = 120 -MIN_USEFUL_DETAIL_LENGTH = 80 TOO_LONG_MESSAGE_LENGTH = 500 @@ -30,24 +29,17 @@ def score_remote_call(request_body: dict[str, Any]) -> list[dict[str, Any]]: message = "" message_length = len(message) - if message_length < MIN_USEFUL_DETAIL_LENGTH: + if message_length <= CONCISE_MESSAGE_LENGTH: conciseness_rating = 1.0 - detail_rating = message_length / MIN_USEFUL_DETAIL_LENGTH - length_tag = "too-short" - elif message_length <= CONCISE_MESSAGE_LENGTH: - conciseness_rating = 1.0 - detail_rating = 1.0 length_tag = "concise" elif message_length >= TOO_LONG_MESSAGE_LENGTH: conciseness_rating = 0.0 - detail_rating = 1.0 length_tag = "too-long" else: conciseness_rating = 1 - ( (message_length - CONCISE_MESSAGE_LENGTH) / (TOO_LONG_MESSAGE_LENGTH - CONCISE_MESSAGE_LENGTH) ) - detail_rating = 1.0 length_tag = "verbose" return [ @@ -58,14 +50,6 @@ def score_remote_call(request_body: dict[str, Any]) -> list[dict[str, Any]]: ), "confidence": 1.0, }, - { - "value": round(detail_rating, 2), - "reason": ( - f"Minimum useful detail is {MIN_USEFUL_DETAIL_LENGTH} characters; " - f"message is {message_length} characters." - ), - "confidence": 0.9, - }, { "value": length_tag, "reason": f"Message length category is {length_tag}.",