From aeba9422084b1c951abe8afec19750154232bc5a Mon Sep 17 00:00:00 2001
From: Mike Scavezze <244614145+mscavezze-cw@users.noreply.github.com>
Date: Thu, 11 Jun 2026 15:37:49 -0700
Subject: [PATCH 1/5] Update samples to use the new feedback schema

---
 examples/remote_scorer/README.md        | 22 ++++++++++++++++++----
 examples/remote_scorer/scoring_logic.py |  6 +++++-
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/examples/remote_scorer/README.md b/examples/remote_scorer/README.md
index da98ec92d4c2..f94c6c7a9f92 100644
--- a/examples/remote_scorer/README.md
+++ b/examples/remote_scorer/README.md
@@ -40,7 +40,16 @@ The endpoint must return HTTP 200 with a JSON object. The required top-level
 contract fields are:
 
 - `schema_version`: required integer; must be `1`.
-- `result`: required; can be any JSON-serializable shape you choose.
+- `result`: required structured scorer output. By default, Weave requires this
+  to match the scorer result schema so feedback can be stored in typed scorer
+  columns.
+
+The simplest structured result is one score object:
+
+- `value`: required; either a tag string, max 36 characters, or a numeric rating
+  from `0.0` to `1.0`.
+- `reason`: optional string explaining the score.
+- `confidence`: optional numeric confidence from `0.0` to `1.0`.
 
 For example:
 
@@ -48,13 +57,17 @@ For example:
 {
   "schema_version": 1,
   "result": {
-    "message_length": 32
+    "value": 0.32,
+    "reason": "Message length is 32 characters.",
+    "confidence": 1.0
   }
 }
 ```
 
 Non-200 responses are treated as scorer failures by Weave. The `result` value is
-the scorer output that Weave records as feedback.
+the scorer output that Weave records as feedback. Weave also accepts multiple
+structured score objects as either a bare list, for example
+`[{"value": "short"}, {"value": 0.32}]`, or under a `scores` key.
 
 ## Auth
 
@@ -191,5 +204,6 @@ adopting this exact FastAPI app. The important production requirements are:
 - HTTPS endpoint reachable from Weave.
 - Host allowlist configured if the Weave deployment requires it.
 - Bearer-token validation implemented with your identity/security standards.
-- HTTP 200 response body shaped as `{"schema_version": 1, "result": ...}`.
+- HTTP 200 response body shaped as
+  `{"schema_version": 1, "result": {"value": 0.9, "reason": "...", "confidence": 1.0}}`.
 - Optional dedupe uses `Idempotency-Key`.
diff --git a/examples/remote_scorer/scoring_logic.py b/examples/remote_scorer/scoring_logic.py
index 38f911ac8f74..b76001064467 100644
--- a/examples/remote_scorer/scoring_logic.py
+++ b/examples/remote_scorer/scoring_logic.py
@@ -25,7 +25,11 @@ def score_remote_call(request_body: dict[str, Any]) -> dict[str, Any]:
     message = inputs.get("message", "") if isinstance(inputs, dict) else ""
     if not isinstance(message, str):
         message = ""
+    message_length = len(message)
+    rating = min(message_length / 100, 1.0)
 
     return {
-        "message_length": len(message),
+        "value": round(rating, 2),
+        "reason": f"Message length is {message_length} characters.",
+        "confidence": 1.0,
     }

From 55aba8983c7bdcf96cb600d90cf91b6526b3d265 Mon Sep 17 00:00:00 2001
From: Mike Scavezze <244614145+mscavezze-cw@users.noreply.github.com>
Date: Fri, 12 Jun 2026 13:41:35 -0700
Subject: [PATCH 2/5] Remove requirement details, implementation details, and
 excess options. Add example of multiple scores.

---
 examples/remote_scorer/README.md | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/examples/remote_scorer/README.md b/examples/remote_scorer/README.md
index f94c6c7a9f92..f1313f9c6b71 100644
--- a/examples/remote_scorer/README.md
+++ b/examples/remote_scorer/README.md
@@ -40,9 +40,7 @@ The endpoint must return HTTP 200 with a JSON object. The required top-level
 contract fields are:
 
 - `schema_version`: required integer; must be `1`.
-- `result`: required structured scorer output. By default, Weave requires this
-  to match the scorer result schema so feedback can be stored in typed scorer
-  columns.
+- `result`: required structured scorer output.
 
 The simplest structured result is one score object:
 
@@ -65,9 +63,26 @@ For example:
 ```
 
 Non-200 responses are treated as scorer failures by Weave. The `result` value is
-the scorer output that Weave records as feedback. Weave also accepts multiple
-structured score objects as either a bare list, for example
-`[{"value": "short"}, {"value": 0.32}]`, or under a `scores` key.
+the scorer output that Weave records as feedback. If your scorer returns multiple
+scores, provide a list of score objects:
+
+```json
+{
+  "schema_version": 1,
+  "result": [
+    {
+      "value": "concise",
+      "reason": "The response is brief and focused.",
+      "confidence": 0.9
+    },
+    {
+      "value": 0.82,
+      "reason": "The response directly answers the request.",
+      "confidence": 0.8
+    }
+  ]
+}
+```
 
 ## Auth
 

From a118ae5e68b89cf3bb285ac48e083f78545de3d3 Mon Sep 17 00:00:00 2001
From: Mike Scavezze <244614145+mscavezze-cw@users.noreply.github.com>
Date: Fri, 12 Jun 2026 13:44:41 -0700
Subject: [PATCH 3/5] Score the remote scorer sample on message conciseness

---
 examples/remote_scorer/README.md        |  4 ++--
 examples/remote_scorer/scoring_logic.py | 16 ++++++++++++++--
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/examples/remote_scorer/README.md b/examples/remote_scorer/README.md
index f1313f9c6b71..03b462e39f91 100644
--- a/examples/remote_scorer/README.md
+++ b/examples/remote_scorer/README.md
@@ -55,8 +55,8 @@ For example:
 {
   "schema_version": 1,
   "result": {
-    "value": 0.32,
-    "reason": "Message length is 32 characters.",
+    "value": 1.0,
+    "reason": "Message is 32 characters; concise messages score best.",
     "confidence": 1.0
   }
 }
diff --git a/examples/remote_scorer/scoring_logic.py b/examples/remote_scorer/scoring_logic.py
index b76001064467..cb65fc8ebf0b 100644
--- a/examples/remote_scorer/scoring_logic.py
+++ b/examples/remote_scorer/scoring_logic.py
@@ -5,6 +5,8 @@
 from typing import Any
 
 REMOTE_SCORER_SCHEMA_VERSION = 1
+CONCISE_MESSAGE_LENGTH = 120
+TOO_LONG_MESSAGE_LENGTH = 500
 
 
 def score_remote_call(request_body: dict[str, Any]) -> dict[str, Any]:
@@ -26,10 +28,20 @@ def score_remote_call(request_body: dict[str, Any]) -> dict[str, Any]:
     if not isinstance(message, str):
         message = ""
     message_length = len(message)
-    rating = min(message_length / 100, 1.0)
+    if message_length <= CONCISE_MESSAGE_LENGTH:
+        rating = 1.0
+    elif message_length >= TOO_LONG_MESSAGE_LENGTH:
+        rating = 0.0
+    else:
+        rating = 1 - (
+            (message_length - CONCISE_MESSAGE_LENGTH)
+            / (TOO_LONG_MESSAGE_LENGTH - CONCISE_MESSAGE_LENGTH)
+        )
 
     return {
         "value": round(rating, 2),
-        "reason": f"Message length is {message_length} characters.",
+        "reason": (
+            f"Message is {message_length} characters; concise messages score best."
+        ),
         "confidence": 1.0,
     }

From 2786cf6070b1f8778d955f9d17660f79083f5a62 Mon Sep 17 00:00:00 2001
From: Mike Scavezze <244614145+mscavezze-cw@users.noreply.github.com>
Date: Fri, 12 Jun 2026 14:10:14 -0700
Subject: [PATCH 4/5] Return multiple scores from the remote scorer sample

---
 examples/remote_scorer/README.md        | 28 ++++++++------
 examples/remote_scorer/scoring_logic.py | 51 +++++++++++++++++++------
 2 files changed, 56 insertions(+), 23 deletions(-)

diff --git a/examples/remote_scorer/README.md b/examples/remote_scorer/README.md
index 03b462e39f91..8833b7c421a4 100644
--- a/examples/remote_scorer/README.md
+++ b/examples/remote_scorer/README.md
@@ -49,41 +49,47 @@ The simplest structured result is one score object:
 - `reason`: optional string explaining the score.
 - `confidence`: optional numeric confidence from `0.0` to `1.0`.
 
-For example:
+For a single-score response:
 
 ```json
 {
   "schema_version": 1,
   "result": {
     "value": 1.0,
-    "reason": "Message is 32 characters; concise messages score best.",
-    "confidence": 1.0
+    "reason": "The response is clear and concise.",
+    "confidence": 0.9
   }
 }
 ```
 
-Non-200 responses are treated as scorer failures by Weave. The `result` value is
-the scorer output that Weave records as feedback. If your scorer returns multiple
-scores, provide a list of score objects:
+This sample returns two numeric ratings and one tag:
 
 ```json
 {
   "schema_version": 1,
   "result": [
     {
-      "value": "concise",
-      "reason": "The response is brief and focused.",
+      "value": 1.0,
+      "reason": "Message is 32 characters; concise messages score best.",
+      "confidence": 1.0
+    },
+    {
+      "value": 0.4,
+      "reason": "Minimum useful detail is 80 characters; message is 32 characters.",
       "confidence": 0.9
     },
     {
-      "value": 0.82,
-      "reason": "The response directly answers the request.",
-      "confidence": 0.8
+      "value": "too-short",
+      "reason": "Message length category is too-short.",
+      "confidence": 0.9
     }
   ]
 }
 ```
 
+Non-200 responses are treated as scorer failures by Weave. The `result` value is
+the scorer output that Weave records as feedback.
+
 ## Auth
 
 Use explicit per-scorer auth in production. This sample supports two Weave
diff --git a/examples/remote_scorer/scoring_logic.py b/examples/remote_scorer/scoring_logic.py
index cb65fc8ebf0b..b14db436791b 100644
--- a/examples/remote_scorer/scoring_logic.py
+++ b/examples/remote_scorer/scoring_logic.py
@@ -6,10 +6,11 @@
 
 REMOTE_SCORER_SCHEMA_VERSION = 1
 CONCISE_MESSAGE_LENGTH = 120
+MIN_USEFUL_DETAIL_LENGTH = 80
 TOO_LONG_MESSAGE_LENGTH = 500
 
 
-def score_remote_call(request_body: dict[str, Any]) -> dict[str, Any]:
+def score_remote_call(request_body: dict[str, Any]) -> list[dict[str, Any]]:
     """Score one Weave remote scorer request.
 
     Replace this function with your real policy, model, or business logic. It is
@@ -28,20 +29,46 @@ def score_remote_call(request_body: dict[str, Any]) -> dict[str, Any]:
     if not isinstance(message, str):
         message = ""
     message_length = len(message)
-    if message_length <= CONCISE_MESSAGE_LENGTH:
-        rating = 1.0
+
+    if message_length < MIN_USEFUL_DETAIL_LENGTH:
+        conciseness_rating = 1.0
+        detail_rating = message_length / MIN_USEFUL_DETAIL_LENGTH
+        length_tag = "too-short"
+    elif message_length <= CONCISE_MESSAGE_LENGTH:
+        conciseness_rating = 1.0
+        detail_rating = 1.0
+        length_tag = "concise"
     elif message_length >= TOO_LONG_MESSAGE_LENGTH:
-        rating = 0.0
+        conciseness_rating = 0.0
+        detail_rating = 1.0
+        length_tag = "too-long"
     else:
-        rating = 1 - (
+        conciseness_rating = 1 - (
             (message_length - CONCISE_MESSAGE_LENGTH)
             / (TOO_LONG_MESSAGE_LENGTH - CONCISE_MESSAGE_LENGTH)
         )
+        detail_rating = 1.0
+        length_tag = "verbose"
 
-    return {
-        "value": round(rating, 2),
-        "reason": (
-            f"Message is {message_length} characters; concise messages score best."
-        ),
-        "confidence": 1.0,
-    }
+    return [
+        {
+            "value": round(conciseness_rating, 2),
+            "reason": (
+                f"Message is {message_length} characters; concise messages score best."
+            ),
+            "confidence": 1.0,
+        },
+        {
+            "value": round(detail_rating, 2),
+            "reason": (
+                f"Minimum useful detail is {MIN_USEFUL_DETAIL_LENGTH} characters; "
+                f"message is {message_length} characters."
+            ),
+            "confidence": 0.9,
+        },
+        {
+            "value": length_tag,
+            "reason": f"Message length category is {length_tag}.",
+            "confidence": 0.9,
+        },
+    ]

From ead7cf5fa4c79108a96bf5e17376bb995156de2a Mon Sep 17 00:00:00 2001
From: Mike Scavezze <244614145+mscavezze-cw@users.noreply.github.com>
Date: Fri, 12 Jun 2026 14:37:51 -0700
Subject: [PATCH 5/5] Reduce sample to one numeric rating and one tag

---
 examples/remote_scorer/README.md        | 11 +++--------
 examples/remote_scorer/scoring_logic.py | 18 +-----------------
 2 files changed, 4 insertions(+), 25 deletions(-)

diff --git a/examples/remote_scorer/README.md b/examples/remote_scorer/README.md
index 8833b7c421a4..66cb6b21fdc3 100644
--- a/examples/remote_scorer/README.md
+++ b/examples/remote_scorer/README.md
@@ -62,7 +62,7 @@ For a single-score response:
 }
 ```
 
-This sample returns two numeric ratings and one tag:
+This sample returns one numeric rating and one tag:
 
 ```json
 {
@@ -74,13 +74,8 @@ This sample returns two numeric ratings and one tag:
       "confidence": 1.0
     },
     {
-      "value": 0.4,
-      "reason": "Minimum useful detail is 80 characters; message is 32 characters.",
-      "confidence": 0.9
-    },
-    {
-      "value": "too-short",
-      "reason": "Message length category is too-short.",
+      "value": "concise",
+      "reason": "Message length category is concise.",
       "confidence": 0.9
     }
   ]
diff --git a/examples/remote_scorer/scoring_logic.py b/examples/remote_scorer/scoring_logic.py
index b14db436791b..eaf48a3ac7b5 100644
--- a/examples/remote_scorer/scoring_logic.py
+++ b/examples/remote_scorer/scoring_logic.py
@@ -6,7 +6,6 @@
 
 REMOTE_SCORER_SCHEMA_VERSION = 1
 CONCISE_MESSAGE_LENGTH = 120
-MIN_USEFUL_DETAIL_LENGTH = 80
 TOO_LONG_MESSAGE_LENGTH = 500
 
 
@@ -30,24 +29,17 @@ def score_remote_call(request_body: dict[str, Any]) -> list[dict[str, Any]]:
         message = ""
     message_length = len(message)
 
-    if message_length < MIN_USEFUL_DETAIL_LENGTH:
+    if message_length <= CONCISE_MESSAGE_LENGTH:
         conciseness_rating = 1.0
-        detail_rating = message_length / MIN_USEFUL_DETAIL_LENGTH
-        length_tag = "too-short"
-    elif message_length <= CONCISE_MESSAGE_LENGTH:
-        conciseness_rating = 1.0
-        detail_rating = 1.0
         length_tag = "concise"
     elif message_length >= TOO_LONG_MESSAGE_LENGTH:
         conciseness_rating = 0.0
-        detail_rating = 1.0
         length_tag = "too-long"
     else:
         conciseness_rating = 1 - (
             (message_length - CONCISE_MESSAGE_LENGTH)
             / (TOO_LONG_MESSAGE_LENGTH - CONCISE_MESSAGE_LENGTH)
         )
-        detail_rating = 1.0
         length_tag = "verbose"
 
     return [
@@ -58,14 +50,6 @@ def score_remote_call(request_body: dict[str, Any]) -> list[dict[str, Any]]:
             ),
             "confidence": 1.0,
         },
-        {
-            "value": round(detail_rating, 2),
-            "reason": (
-                f"Minimum useful detail is {MIN_USEFUL_DETAIL_LENGTH} characters; "
-                f"message is {message_length} characters."
-            ),
-            "confidence": 0.9,
-        },
         {
             "value": length_tag,
             "reason": f"Message length category is {length_tag}.",