diff --git a/examples/remote_scorer/README.md b/examples/remote_scorer/README.md index da98ec92d4c2..66cb6b21fdc3 100644 --- a/examples/remote_scorer/README.md +++ b/examples/remote_scorer/README.md @@ -40,19 +40,48 @@ The endpoint must return HTTP 200 with a JSON object. The required top-level contract fields are: - `schema_version`: required integer; must be `1`. -- `result`: required; can be any JSON-serializable shape you choose. +- `result`: required structured scorer output. -For example: +The simplest structured result is one score object: + +- `value`: required; either a tag string, max 36 characters, or a numeric rating + from `0.0` to `1.0`. +- `reason`: optional string explaining the score. +- `confidence`: optional numeric confidence from `0.0` to `1.0`. + +For a single-score response: ```json { "schema_version": 1, "result": { - "message_length": 32 + "value": 1.0, + "reason": "The response is clear and concise.", + "confidence": 0.9 } } ``` +This sample returns a list of two score objects, one numeric rating and one tag: + +```json +{ + "schema_version": 1, + "result": [ + { + "value": 1.0, + "reason": "Message is 32 characters; concise messages score best.", + "confidence": 1.0 + }, + { + "value": "concise", + "reason": "Message length category is concise.", + "confidence": 0.9 + } + ] +} +``` + Non-200 responses are treated as scorer failures by Weave. The `result` value is the scorer output that Weave records as feedback. @@ -191,5 +220,6 @@ adopting this exact FastAPI app. The important production requirements are: - HTTPS endpoint reachable from Weave. - Host allowlist configured if the Weave deployment requires it. - Bearer-token validation implemented with your identity/security standards. -- HTTP 200 response body shaped as `{"schema_version": 1, "result": ...}`. +- HTTP 200 response body shaped as + `{"schema_version": 1, "result": {"value": 0.9, "reason": "...", "confidence": 1.0}}`. - Optional dedupe uses `Idempotency-Key`. 
diff --git a/examples/remote_scorer/scoring_logic.py b/examples/remote_scorer/scoring_logic.py index 38f911ac8f74..eaf48a3ac7b5 100644 --- a/examples/remote_scorer/scoring_logic.py +++ b/examples/remote_scorer/scoring_logic.py @@ -5,9 +5,11 @@ from typing import Any REMOTE_SCORER_SCHEMA_VERSION = 1 +CONCISE_MESSAGE_LENGTH = 120 +TOO_LONG_MESSAGE_LENGTH = 500 -def score_remote_call(request_body: dict[str, Any]) -> dict[str, Any]: +def score_remote_call(request_body: dict[str, Any]) -> list[dict[str, Any]]: """Score one Weave remote scorer request. Replace this function with your real policy, model, or business logic. It is @@ -25,7 +27,32 @@ def score_remote_call(request_body: dict[str, Any]) -> dict[str, Any]: message = inputs.get("message", "") if isinstance(inputs, dict) else "" if not isinstance(message, str): message = "" - - return { - "message_length": len(message), - } + message_length = len(message) + + if message_length <= CONCISE_MESSAGE_LENGTH: + conciseness_rating = 1.0 + length_tag = "concise" + elif message_length >= TOO_LONG_MESSAGE_LENGTH: + conciseness_rating = 0.0 + length_tag = "too-long" + else: + conciseness_rating = 1 - ( + (message_length - CONCISE_MESSAGE_LENGTH) + / (TOO_LONG_MESSAGE_LENGTH - CONCISE_MESSAGE_LENGTH) + ) + length_tag = "verbose" + + return [ + { + "value": round(conciseness_rating, 2), + "reason": ( + f"Message is {message_length} characters; concise messages score best." + ), + "confidence": 1.0, + }, + { + "value": length_tag, + "reason": f"Message length category is {length_tag}.", + "confidence": 0.9, + }, + ]