From 3a13d0fcd1a8fb54bbdbf3b0963793f37ccf8fbb Mon Sep 17 00:00:00 2001 From: Mikhail Nevskiy <139659391+trudenboy@users.noreply.github.com> Date: Thu, 7 May 2026 22:24:05 +0300 Subject: [PATCH 1/8] feat(nlu): consume more of the platform request envelope (Phase 0) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires four free-tier Yandex Dialogs platform features into the webhook without adding any external dependency: - request.markup.dangerous_context: graceful refusal with end_session=true before NLU/search engages, so flagged content never lands in mass.music.search. - meta.interfaces.screen: buttons in the disambiguation prompt are emitted only on screened surfaces; voice-only devices (Mini/Pro) get the same ordinal prompt without button payload. - request.original_utterance: logged alongside the normalised command for misclassification post-mortems. - request.nlu.entities[YANDEX.NUMBER]: new ParsedControl action "volume_relative" handles "прибавь на 20" / "убавь 5" / "на 15 громче" with regex-captured digits or entity fallback. Executor reads current player.volume_level, applies signed delta, clamps [0, 100], dispatches cmd_volume_set. Bare "прибавь" / "убавь" without a number still resolve to volume_up / volume_down via the existing _CONTROL_PATTERNS rules. Player resolution and music search remain in-house (domain logic, not NLU-shaped). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- provider/dialogs.py | 108 +++++++++++++++++++++--- provider/dialogs_control.py | 128 ++++++++++++++++++++++++++-- tests/test_dialogs.py | 34 +++++++- tests/test_dialogs_control.py | 151 ++++++++++++++++++++++++++++++++++ 4 files changed, 402 insertions(+), 19 deletions(-) diff --git a/provider/dialogs.py b/provider/dialogs.py index f0eea5c..b0ed474 100644 --- a/provider/dialogs.py +++ b/provider/dialogs.py @@ -130,6 +130,24 @@ def _safe_dict(value: Any) -> dict[str, Any]: return value if isinstance(value, dict) else {} +def _has_screen(meta: Any) -> bool: + """Return True if the calling surface has a display. + + Yandex sets ``meta.interfaces.screen = {}`` (empty dict, present-as-key) + on devices that can render visual elements: mobile Alice, station-max, + station-2, navigator, smart-screen, tv-app. Audio-only surfaces + (station-mini, station-pro, dumb speakers) omit the key entirely. + Used to gate ``buttons`` / ``card`` emission so we don't ship UI + bits to surfaces that ignore them. + """ + if not isinstance(meta, dict): + return False + interfaces = meta.get("interfaces") + if not isinstance(interfaces, dict): + return False + return "screen" in interfaces + + def _without_pending(state: dict[str, Any]) -> dict[str, Any]: """Return a copy of `state` with disambiguation/elicitation keys removed. @@ -389,6 +407,10 @@ async def _handle_webhook(self, request: web.Request) -> web.Response: # noqa: req = body.get("request") or {} if not isinstance(req, dict): req = {} + meta = body.get("meta") or {} + if not isinstance(meta, dict): + meta = {} + has_screen = _has_screen(meta) # skill_id sanity check — reject if absent or mismatched. 
incoming_skill_id = str(session.get("skill_id") or "") @@ -431,6 +453,17 @@ async def _handle_webhook(self, request: web.Request) -> web.Response: # noqa: is_new = bool(session.get("new")) command = str(req.get("command") or "").strip() + original_utterance = str(req.get("original_utterance") or "").strip() + + # request.markup.dangerous_context — Yandex flags suicide/violence/ + # hate content before passing the phrase through. If raised, refuse + # gracefully without engaging music search; passing flagged content + # to mass.music.search is bad PR (and may surface a result keyed off + # the flagged words). + markup = req.get("markup") or {} + if not isinstance(markup, dict): + markup = {} + dangerous_context = bool(markup.get("dangerous_context")) # Pending-command / awaiting-query lookups follow the same # three-tier order as default_id: session → application → @@ -453,11 +486,21 @@ async def _handle_webhook(self, request: web.Request) -> web.Response: # noqa: # bits we route on. Sensitive fields (skill_id, webhook_secret, # raw payload IDs) are excluded; user/session IDs are opaque # tokens and DEBUG is opt-in, so they're included as-is. + # `original_utterance` is logged when it differs from the + # normalised `command` (Yandex strips punctuation and converts + # spelled-out numbers; the raw form helps misclassification + # post-mortems). 
+ raw_suffix = ( + f" raw={original_utterance!r}" + if original_utterance and original_utterance != command + else "" + ) self._logger.debug( - "Webhook recv: cmd=%r req_type=%s is_new=%s pending=%s " + "Webhook recv: cmd=%r%s req_type=%s is_new=%s pending=%s " "(session=%s app=%s cache=%s) awaiting=%s default_player=%s " - "session_id=%s", + "dangerous=%s session_id=%s", command, + raw_suffix, req.get("type", "SimpleUtterance"), is_new, bool(pending_in), @@ -466,6 +509,7 @@ async def _handle_webhook(self, request: web.Request) -> web.Response: # noqa: bool(cached_state.get("pending_command")), awaiting_in, default_id, + dangerous_context, session.get("session_id", ""), ) @@ -489,6 +533,24 @@ async def _handle_webhook(self, request: web.Request) -> web.Response: # noqa: session_state=session_state_in, ) + if dangerous_context: + # Refuse gracefully and end session. Don't engage NLU or search + # so a flagged phrase never lands in mass.music.search results + # or in our logs as an "intent". Drop pending/awaiting state + # so the next conversation starts clean. + self._logger.info( + "Dropping flagged-content request (dangerous_context=true); session_id=%s", + session.get("session_id", ""), + ) + text = "Не понял команду." + return self._yandex_response( + incoming_session=session, + text=text, + tts=_tts_for(text), + end_session=True, + session_state=_without_pending(session_state_in), + ) + # P0.6 — try control commands (pause/next/volume/...) FIRST, on # the raw command. Doing this before the awaiting-query synthesis # lets the user pivot from a slot-elicit prompt straight into a @@ -496,7 +558,14 @@ async def _handle_webhook(self, request: web.Request) -> web.Response: # noqa: # without the prefix-prepend turning it into "включи пауза…". # If control matches, drop any pending/awaiting state — the user # is no longer in either of those flows. 
- if control := parse_control(command): + # `entities` (request.nlu.entities) feeds parse_control's + # YANDEX.NUMBER fallback for relative-volume phrasings where + # the regex didn't anchor on a digit. + nlu = req.get("nlu") or {} + if not isinstance(nlu, dict): + nlu = {} + nlu_entities = nlu.get("entities") if isinstance(nlu.get("entities"), list) else None + if control := parse_control(command, entities=nlu_entities): self._logger.debug("Parsed dialog control %r → %r", command, control) return self._handle_control( session=session, @@ -555,6 +624,7 @@ async def _handle_webhook(self, request: web.Request) -> web.Response: # noqa: pending=pending, session_state_in=session_state_in, app_state_in=app_state_in, + has_screen=has_screen, ) if replay_response is not None: return replay_response @@ -570,6 +640,7 @@ async def _handle_webhook(self, request: web.Request) -> web.Response: # noqa: default_id=default_id, session_state_in=session_state_in, app_state_in=app_state_in, + has_screen=has_screen, ) # ------------------------------------------------------------------- @@ -584,6 +655,7 @@ async def _dispatch_play( default_id: str | None, session_state_in: dict[str, Any], app_state_in: dict[str, Any], + has_screen: bool = True, ) -> web.Response: """Slot-elicit / resolve player / disambiguate / play (or fail).""" # P0.4 — slot elicitation: bare verb with no actionable content. 
@@ -650,6 +722,7 @@ async def _dispatch_play( candidates=all_exposed, session_state_in=session_state_in, app_state_in=app_state_in, + has_screen=has_screen, ) hint = parsed.player_hint or "(не указано)" self._logger.info( @@ -677,6 +750,7 @@ async def _dispatch_play( candidates=candidates, session_state_in=session_state_in, app_state_in=app_state_in, + has_screen=has_screen, ) self._logger.debug( @@ -1037,6 +1111,7 @@ def _build_disambiguation_response( candidates: list[Any], session_state_in: dict[str, Any], app_state_in: dict[str, Any] | None = None, + has_screen: bool = True, ) -> web.Response: """Ask the user which player to use — voice-first, with optional buttons. @@ -1044,8 +1119,8 @@ def _build_disambiguation_response( has to make voice answer obvious. We enumerate candidates with Russian ordinals (`первая` / `вторая` / …) so a user can say either the player name (free-text fallback) or the position. - Buttons are kept on the response for screen surfaces, but voice - is the primary channel. + Buttons are emitted only on screened surfaces; voice-only devices + get the same prompt without the button payload. """ # Yandex caps ItemsList at 5 anyway; cap our buttons to the same. capped = candidates[:5] @@ -1057,14 +1132,18 @@ def _build_disambiguation_response( # маленькая. Скажи название или номер." labelled = [f"{_ORDINAL_LABELS[i]} — {name}" for i, name in enumerate(names)] text = "На какой колонке? " + ", ".join(labelled) + ". Скажи название или номер." - buttons = [ - { - "title": (p.name or p.player_id)[:64], - "payload": {"player_id": p.player_id}, - "hide": True, - } - for p in capped - ] + buttons: list[dict[str, Any]] | None = ( + [ + { + "title": (p.name or p.player_id)[:64], + "payload": {"player_id": p.player_id}, + "hide": True, + } + for p in capped + ] + if has_screen + else None + ) # Clear any prior `awaiting_query` / `pending_command` before # writing the new one, and include the saved `pending_command`. 
# The same pending entry is mirrored to BOTH `session_state` and @@ -1119,6 +1198,7 @@ async def _try_resume_pending( pending: dict[str, Any], session_state_in: dict[str, Any], app_state_in: dict[str, Any], + has_screen: bool = True, ) -> web.Response | None: """Attempt to resume a saved pending_command using button payload or text. @@ -1199,6 +1279,7 @@ async def _try_resume_pending( candidates=candidates, session_state_in=session_state_in, app_state_in=app_state_in, + has_screen=has_screen, ) # Step 3 — voice ordinal ("первая", "выбираю первую", "номер @@ -1248,6 +1329,7 @@ async def _try_resume_pending( candidates=still_available, session_state_in=session_state_in, app_state_in=app_state_in, + has_screen=has_screen, ) # else: no candidates remain at all — fall through. diff --git a/provider/dialogs_control.py b/provider/dialogs_control.py index f3d7c0e..2e86480 100644 --- a/provider/dialogs_control.py +++ b/provider/dialogs_control.py @@ -34,6 +34,7 @@ "volume_up", "volume_down", "volume_set", + "volume_relative", # value = signed delta (+20, -5); executor reads current vol + clamps "mute", "unmute", "list_players", @@ -179,6 +180,57 @@ class ParsedControl: re.IGNORECASE, ) +# Relative-volume phrasings without the keyword "громкость". The verb is +# matched even when no digit is captured — the digit slot is filled from +# the regex group OR from `request.nlu.entities[YANDEX.NUMBER]` (passed +# in via `parse_control(text, entities=...)`). When neither yields a +# number, these patterns intentionally fall through to the bare +# "прибавь"/"убавь" → volume_up/volume_down rules in `_CONTROL_PATTERNS`. +# Yandex normalises spelled-out numbers in `request.command` (тридцать → 30) +# so the regex covers most phrasings; the entity is the defensive fallback. 
+_VOLUME_INC_RE = re.compile( + r"^(?:сделай\s+)?(?:прибавь(?:те)?|прибавить)" + r"(?:\s+(?:на\s+)?(?P<n>\d{1,3})(?:\s+процентов)?)?$", + re.IGNORECASE, +) +_VOLUME_DEC_RE = re.compile( + r"^(?:сделай\s+)?(?:убавь(?:те)?|убавить)" + r"(?:\s+(?:на\s+)?(?P<n>\d{1,3})(?:\s+процентов)?)?$", + re.IGNORECASE, +) +_VOLUME_NUM_INC_RE = re.compile( + r"^на\s+(?P<n>\d{1,3})\s+(?:громче|погромче)$", + re.IGNORECASE, +) +_VOLUME_NUM_DEC_RE = re.compile( + r"^на\s+(?P<n>\d{1,3})\s+(?:тише|потише)$", + re.IGNORECASE, +) + + +def _yandex_number(entities: list[Any] | None) -> int | None: + """Return the first integer YANDEX.NUMBER value from request entities, or None. + + Yandex's normalised `request.command` already converts most spelled-out + Russian numbers to digits, but the entity is the authoritative fallback + for phrasings the regex didn't anchor on a digit position. The list + type is `list[Any]` because the values come from network JSON and we + defend against mixed-type elements. + """ + if not entities: + return None + for ent in entities: + if not isinstance(ent, dict): + continue + if ent.get("type") != "YANDEX.NUMBER": + continue + value = ent.get("value") + if isinstance(value, bool): # bool is a subclass of int — exclude it + continue + if isinstance(value, (int, float)): + return int(value) + return None + # Seek forward / backward with numeric amount + optional unit. Unit defaults # to seconds when missing. "Минут[уы]" multiplies by 60. _SEEK_FORWARD_RE = re.compile( @@ -214,8 +266,17 @@ def _seek_seconds(match: re.Match[str]) -> int | None: return n -def _try_match(cleaned: str, player_hint: str | None) -> ParsedControl | None: - """Match `cleaned` against control patterns; return ParsedControl or None.""" +def _try_match( + cleaned: str, + player_hint: str | None, + entities: list[Any] | None = None, +) -> ParsedControl | None: + """Match `cleaned` against control patterns; return ParsedControl or None. + + `entities` is `request.nlu.entities` from the Yandex envelope. 
When a + relative-volume verb matches without a captured digit, we fall back to + `YANDEX.NUMBER` from there before deciding to surface `volume_relative`. + """ if not cleaned: return None if vmatch := _VOLUME_SET_RE.match(cleaned): @@ -228,6 +289,38 @@ def _try_match(cleaned: str, player_hint: str | None) -> ParsedControl | None: value=max(0, min(100, value)), player_hint=player_hint, ) + # Relative-volume — try INCREASE forms ("прибавь N", "на N громче"), + # then DECREASE ("убавь N", "на N тише"). When the verb matches but + # the digit slot is empty, fall back to YANDEX.NUMBER from the + # request envelope. If neither yields a number, return None so the + # bare-verb fallthrough in `_CONTROL_PATTERNS` handles "прибавь" / + # "убавь" as volume_up / volume_down. + for pattern, sign in ( + (_VOLUME_INC_RE, +1), + (_VOLUME_NUM_INC_RE, +1), + (_VOLUME_DEC_RE, -1), + (_VOLUME_NUM_DEC_RE, -1), + ): + if rel_match := pattern.match(cleaned): + n: int | None = None + try: + raw = rel_match.group("n") + if raw is not None: + n = int(raw) + except (IndexError, TypeError, ValueError): + n = None + if n is None: + n = _yandex_number(entities) + if n is not None: + # Clamp the magnitude so an absurd "прибавь на 999" doesn't + # underflow/overflow downstream arithmetic. Sign is applied + # to the (clamped) magnitude. + magnitude = max(1, min(100, abs(n))) + return ParsedControl( + action="volume_relative", + value=sign * magnitude, + player_hint=player_hint, + ) if smatch := _SEEK_FORWARD_RE.match(cleaned): seconds = _seek_seconds(smatch) if seconds is not None: @@ -254,7 +347,10 @@ def _try_match(cleaned: str, player_hint: str | None) -> ParsedControl | None: _NA_BOUNDARY_RE = re.compile(r"\s+на\s+", re.IGNORECASE) -def parse_control(text: str) -> ParsedControl | None: +def parse_control( + text: str, + entities: list[Any] | None = None, +) -> ParsedControl | None: """Classify a voice utterance as a control command, or None to fall through. 
Tries each `на`-boundary in the cleaned text as a possible @@ -262,6 +358,10 @@ def parse_control(text: str) -> ParsedControl | None: (cleaned, None) for the whole-phrase case so that "поставь на паузу" still matches `pause` with no hint, even when the phrase contains "на" inside the action keywords. + + `entities` is the (optional) `request.nlu.entities` array from the + Yandex Dialogs envelope, used as a fallback source for `YANDEX.NUMBER` + when a relative-volume verb matched without a captured digit. """ if not text: return None @@ -272,7 +372,7 @@ def parse_control(text: str) -> ParsedControl | None: return None # Whole-phrase first (no hint). - if direct := _try_match(cleaned, player_hint=None): + if direct := _try_match(cleaned, player_hint=None, entities=entities): return direct # Then try each "на " split from right to left, so e.g. @@ -283,7 +383,7 @@ def parse_control(text: str) -> ParsedControl | None: hint = cleaned[m.end() :].strip().lower() if not rest or not hint: continue - if matched := _try_match(rest, player_hint=hint): + if matched := _try_match(rest, player_hint=hint, entities=entities): return matched return None @@ -348,6 +448,13 @@ def control_confirmation(control: ParsedControl) -> str: # noqa: PLR0911 return "Тише." if action == "volume_set": return f"Громкость {control.value}." + if action == "volume_relative": + delta = control.value or 0 + if delta > 0: + return f"Громче на {delta}." + if delta < 0: + return f"Тише на {-delta}." + return "Готово." if action == "mute": return "Звук выключен." if action == "unmute": @@ -412,6 +519,17 @@ async def execute_control( # noqa: PLR0915 elif action == "volume_set": value = max(0, min(100, control.value or 0)) await mass.players.cmd_volume_set(pid, value) + elif action == "volume_relative": + # Read current volume, apply signed delta, clamp [0, 100]. 
+ # Falls back to 50 when the player exposes no numeric volume_level + # (some virtual players don't), so the signed delta is applied to + # that assumed baseline instead of an unknown current level. + delta = control.value or 0 + current = getattr(player, "volume_level", None) + if not isinstance(current, (int, float)): + current = 50 + new_value = max(0, min(100, int(current) + int(delta))) + await mass.players.cmd_volume_set(pid, new_value) elif action == "mute": await mass.players.cmd_volume_mute(pid, True) elif action == "unmute": diff --git a/tests/test_dialogs.py b/tests/test_dialogs.py index bdc3132..8054568 100644 --- a/tests/test_dialogs.py +++ b/tests/test_dialogs.py @@ -874,7 +874,7 @@ class TestDisambiguation: """End-to-end tests for the disambiguation prompt + pending-command replay.""" async def test_multiple_matches_returns_disambiguation_prompt(self) -> None: - """Two candidates → response carries buttons + pending_command, end_session=False.""" + """Two candidates on a screened surface → response carries buttons + pending_command.""" track = MagicMock(uri="library://track/1", spec_set=["uri"]) mass = _make_mass( [ @@ -885,6 +885,7 @@ async def test_multiple_matches_returns_disambiguation_prompt(self) -> None: ) handler = DialogsWebhookHandler(mass, skill_id="skill-uuid-1", webhook_secret=_TEST_SECRET) body = { + "meta": {"interfaces": {"screen": {}}}, "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, "request": {"command": "включи Metallica на кухне"}, } @@ -905,6 +906,33 @@ async def test_multiple_matches_returns_disambiguation_prompt(self) -> None: # Nothing is played yet. 
mass.player_queues.play_media.assert_not_awaited() + async def test_disambiguation_voice_only_omits_buttons(self) -> None: + """Voice-only surface (no meta.interfaces.screen) → prompt without buttons.""" + track = MagicMock(uri="library://track/1", spec_set=["uri"]) + mass = _make_mass( + [ + MockPlayer(player_id="p1", name="Кухня большая"), + MockPlayer(player_id="p2", name="Кухня маленькая"), + ], + search_track=track, + ) + handler = DialogsWebhookHandler(mass, skill_id="skill-uuid-1", webhook_secret=_TEST_SECRET) + body = { + # No meta.interfaces — defaults to voice-only (Yandex Mini etc.) + "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, + "request": {"command": "включи Metallica на кухне"}, + } + resp = await handler._handle_webhook(_build_request(body)) + assert resp.status == 200 + body_out = _response_body(resp) + assert body_out["response"]["end_session"] is False + # Voice prompt with ordinals is still present, just without buttons. + assert "buttons" not in body_out["response"] + assert "первая" in body_out["response"]["text"].lower() + # Pending command still saved for voice-ordinal resolution. 
+ pending = body_out["session_state"]["pending_command"] + assert pending["candidate_ids"] == ["p1", "p2"] + async def test_button_press_resolves_pending(self) -> None: """ButtonPressed payload.player_id triggers a play of the saved pending_command.""" track = MagicMock(uri="library://track/1", spec_set=["uri"]) @@ -1079,6 +1107,7 @@ async def test_play_no_hint_no_default_offers_disambiguation(self) -> None: ) handler = DialogsWebhookHandler(mass, skill_id="skill-uuid-1", webhook_secret=_TEST_SECRET) body = { + "meta": {"interfaces": {"screen": {}}}, "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, "request": {"command": "включи Metallica"}, } @@ -1152,6 +1181,7 @@ async def test_disambiguation_clears_awaiting_query(self) -> None: handler = DialogsWebhookHandler(mass, skill_id="skill-uuid-1", webhook_secret=_TEST_SECRET) # Simulate the awaiting_query → ambiguous-resolution turn. body = { + "meta": {"interfaces": {"screen": {}}}, "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, "request": {"command": "Metallica на кухне"}, "state": {"session": {"awaiting_query": True}}, @@ -1243,6 +1273,7 @@ async def test_ordinal_out_of_range_reasks_does_not_fall_through(self) -> None: ) handler = DialogsWebhookHandler(mass, skill_id="skill-uuid-1", webhook_secret=_TEST_SECRET) body = { + "meta": {"interfaces": {"screen": {}}}, "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, "request": {"command": "третья"}, "state": { @@ -1452,6 +1483,7 @@ async def test_disambiguation_writes_pending_to_application_state(self) -> None: ) handler = DialogsWebhookHandler(mass, skill_id="skill-uuid-1", webhook_secret=_TEST_SECRET) body = { + "meta": {"interfaces": {"screen": {}}}, "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, "request": {"command": "включи джаз"}, } diff --git a/tests/test_dialogs_control.py b/tests/test_dialogs_control.py index 25a9307..9fe3beb 100644 --- 
a/tests/test_dialogs_control.py +++ b/tests/test_dialogs_control.py @@ -190,6 +190,106 @@ def test_play_phrases_return_none(self, phrase: str) -> None: assert parse_control(phrase) is None +class TestParseControlVolumeRelative: + """volume_relative — phrasings with a number, fed by regex or YANDEX.NUMBER.""" + + @pytest.mark.parametrize( + ("phrase", "expected_value", "expected_hint"), + [ + # increase forms — regex captures the digit (Yandex normalises + # spelled-out numbers in `request.command`). + ("прибавь 20", 20, None), + ("прибавь на 20", 20, None), + ("прибавь на 20 процентов", 20, None), + ("прибавьте на 5", 5, None), + ("сделай прибавь 10", 10, None), + ("на 15 громче", 15, None), + ("на 5 погромче", 5, None), + # decrease forms (regex) + ("убавь 10", -10, None), + ("убавь на 25", -25, None), + ("убавьте 5", -5, None), + ("на 20 тише", -20, None), + ("на 30 потише", -30, None), + # with player hint + ("прибавь на 10 на кухне", 10, "кухне"), + ("убавь 5 на спальне", -5, "спальне"), + # clamping (magnitude > 100 caps at 100) + ("прибавь на 999", 100, None), + ], + ) + def test_relative_with_digit( + self, phrase: str, expected_value: int, expected_hint: str | None + ) -> None: + """Relative-volume phrasings with a captured digit produce signed deltas.""" + result = parse_control(phrase) + assert result is not None, f"phrase={phrase!r}" + assert result.action == "volume_relative", f"phrase={phrase!r}" + assert result.value == expected_value, f"phrase={phrase!r}" + assert result.player_hint == expected_hint, f"phrase={phrase!r}" + + @pytest.mark.parametrize( + ("phrase", "sign"), + [ + ("прибавь", +1), + ("убавь", -1), + ("сделай прибавь", +1), + ], + ) + def test_relative_uses_yandex_number_when_regex_misses_digit( + self, phrase: str, sign: int + ) -> None: + """Verb matched without a digit → fall back to YANDEX.NUMBER entity.""" + entities = [{"type": "YANDEX.NUMBER", "value": 12, "tokens": {"start": 0, "end": 1}}] + result = parse_control(phrase, 
entities=entities) + assert result is not None + assert result.action == "volume_relative" + assert result.value == sign * 12 + + def test_bare_verb_without_entity_falls_through_to_volume_up(self) -> None: + """Without a digit and without YANDEX.NUMBER, "прибавь" stays on volume_up.""" + # No entities provided → relative pattern matches but yields no number, + # so the fallthrough lands on the bare-verb _CONTROL_PATTERNS rule. + result = parse_control("прибавь") + assert result is not None + assert result.action == "volume_up" + assert result.value is None + + def test_bare_decrease_falls_through_to_volume_down(self) -> None: + """Bare "убавь" without number stays on volume_down (existing behaviour).""" + result = parse_control("убавь") + assert result is not None + assert result.action == "volume_down" + + def test_entity_fallback_skips_non_number_entities(self) -> None: + """Only YANDEX.NUMBER counts; other entity types are ignored.""" + entities = [ + {"type": "YANDEX.GEO", "value": {"city": "Москва"}}, + {"type": "YANDEX.FIO", "value": {"first_name": "Иван"}}, + ] + result = parse_control("прибавь", entities=entities) + assert result is not None + assert result.action == "volume_up" # no number found → fallthrough + + def test_entity_fallback_first_number_wins(self) -> None: + """When several YANDEX.NUMBER entities are present, the first wins.""" + entities = [ + {"type": "YANDEX.NUMBER", "value": 7}, + {"type": "YANDEX.NUMBER", "value": 99}, + ] + result = parse_control("прибавь", entities=entities) + assert result is not None + assert result.action == "volume_relative" + assert result.value == 7 + + def test_volume_set_still_wins_with_keyword(self) -> None: + """volume_set with the explicit "громкость" keyword still matches first.""" + result = parse_control("громкость 30") + assert result is not None + assert result.action == "volume_set" + assert result.value == 30 + + class TestPluralRu: """Tests for the Russian quantitative-form picker.""" @@ -368,6 
+468,57 @@ async def test_volume_set_none_falls_back_to_zero(self) -> None: await execute_control(mass, ParsedControl(action="volume_set", value=None), self._player()) mass.players.cmd_volume_set.assert_awaited_once_with("p1", 0) + async def test_volume_relative_increase(self) -> None: + """volume_relative reads current volume and bumps by signed delta.""" + mass = self._make_mass() + player = self._player() + player.volume_level = 40 + await execute_control( + mass, ParsedControl(action="volume_relative", value=20), player + ) + mass.players.cmd_volume_set.assert_awaited_once_with("p1", 60) + + async def test_volume_relative_decrease(self) -> None: + """volume_relative with negative delta lowers the current level.""" + mass = self._make_mass() + player = self._player() + player.volume_level = 70 + await execute_control( + mass, ParsedControl(action="volume_relative", value=-15), player + ) + mass.players.cmd_volume_set.assert_awaited_once_with("p1", 55) + + async def test_volume_relative_clamps_high(self) -> None: + """Resulting volume above 100 is clamped to 100.""" + mass = self._make_mass() + player = self._player() + player.volume_level = 90 + await execute_control( + mass, ParsedControl(action="volume_relative", value=50), player + ) + mass.players.cmd_volume_set.assert_awaited_once_with("p1", 100) + + async def test_volume_relative_clamps_low(self) -> None: + """Resulting volume below 0 is clamped to 0.""" + mass = self._make_mass() + player = self._player() + player.volume_level = 5 + await execute_control( + mass, ParsedControl(action="volume_relative", value=-30), player + ) + mass.players.cmd_volume_set.assert_awaited_once_with("p1", 0) + + async def test_volume_relative_missing_volume_level_uses_default(self) -> None: + """Player without a volume_level (virtual / unsupported) defaults to 50.""" + mass = self._make_mass() + player = self._player() + # Explicitly drop volume_level so getattr returns None. 
+ del player.volume_level + await execute_control( + mass, ParsedControl(action="volume_relative", value=10), player + ) + mass.players.cmd_volume_set.assert_awaited_once_with("p1", 60) + async def test_mute(self) -> None: """action=mute invokes cmd_volume_mute(True).""" mass = self._make_mass() From f052e6fbd03e54ab3e729ed525d8e53960e79a2f Mon Sep 17 00:00:00 2001 From: Mikhail Nevskiy <139659391+trudenboy@users.noreply.github.com> Date: Thu, 7 May 2026 22:36:22 +0300 Subject: [PATCH 2/8] feat(ux): screen-aware response polish (Phase 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three platform-aware response improvements landing without new dependencies, all gated on meta.interfaces.screen so voice-only surfaces (Mini, Pro) are unaffected: - card parameter plumbed through _yandex_response (BigImage / ItemsList / ImageGallery shapes documented). Actual emission deferred to Phase 1.5 — BigImage requires image_id of a pre-uploaded asset and per-track album art can't be uploaded inside the 3-second webhook budget. - Suggestion buttons (Следующая / Пауза / Громче / Тише) appended to play- and control-success responses on screened surfaces. Lets the user follow up by tap without re-saying the activation phrase. - TTS dictionary moved to provider/tts_dictionary.py with two tables: WORD_REPLACEMENTS (Russian stress hints + ~26 foreign single-word artist transliterations) and PHRASE_REPLACEMENTS (16 multi-word bands, applied before the per-word regex). _tts_for now matches both Latin and Cyrillic so "Включаю Metallica" emits tts="Включ+аю Мет+аллика". text stays clean. - voice_continuation toggle (CONF_DIALOG_VOICE_CONTINUATION, default off): when enabled, play- and control-success keep end_session=False for natural follow-ups. "стоп / выключи / выключи музыку" always closes the session regardless of the toggle. No UI surface yet — power-user knob for now. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- provider/constants.py | 9 +++ provider/dialogs.py | 111 ++++++++++++++++++++------ provider/plugin.py | 7 ++ provider/tts_dictionary.py | 98 +++++++++++++++++++++++ tests/test_dialogs.py | 158 ++++++++++++++++++++++++++++++++++++- 5 files changed, 354 insertions(+), 29 deletions(-) create mode 100644 provider/tts_dictionary.py diff --git a/provider/constants.py b/provider/constants.py index dab2858..0f87b8a 100644 --- a/provider/constants.py +++ b/provider/constants.py @@ -111,6 +111,15 @@ # CONF_INSTANCE_NAME field. CONF_USE_DIFFERENT_INSTANCE_NAME = "use_different_instance_name" +# Toggle: keep the conversation open after a play / control success (P1.4). +# Default OFF — historical voice-UX where the skill ends the session and +# the user re-says "Алиса, попроси " for the next command. ON keeps +# `end_session=false` after success so follow-ups skip the activation +# preamble at the cost of a "skill is listening" indicator on screened +# surfaces. Explicit "стоп / выключи / спасибо" still ends the session +# (those phrases parse as the existing `stop` control intent). +CONF_DIALOG_VOICE_CONTINUATION = "dialog_voice_continuation" + # Yandex Dialogs catalog voice options (TTS), passed to draft payload. # Wire values + display names extracted live from the dev console # (https://dialogs.yandex.ru/developer → skill → Голос dropdown) on diff --git a/provider/dialogs.py b/provider/dialogs.py index b0ed474..e8a8182 100644 --- a/provider/dialogs.py +++ b/provider/dialogs.py @@ -73,6 +73,7 @@ resolve_player_candidates, ) from .dialogs_player import play_for_alice, resolve_query +from .tts_dictionary import PHRASE_REPLACEMENTS, WORD_REPLACEMENTS if TYPE_CHECKING: from music_assistant.mass import MusicAssistant @@ -81,41 +82,51 @@ _LOGGER = logging.getLogger(__name__) -# Static stress-mark dictionary for common response words (P0.2). 
-# Keys are case-insensitive whole-word matches; the marker is `+` placed -# directly before the stressed vowel — Yandex Alice TTS supports this -# inline syntax. Keep small and high-confidence; band/track names are -# left as-is (those need a separate phoneme dict — P2.3). -_TTS_STRESS_MARKS: dict[str, str] = { - "включаю": "включ+аю", - "ставлю": "ст+авлю", - "пауза": "п+ауза", - "продолжаю": "продолж+аю", - "следующая": "сл+едующая", - "предыдущая": "пред+ыдущая", - "громче": "гр+омче", - "тише": "т+ише", - "громкость": "гр+омкость", - "колонке": "кол+онке", - "колонку": "кол+онку", -} - -_TTS_WORD_RE = re.compile(r"[А-Яа-яЁё]+") +# P0.2 — TTS pronunciation hints. The `_tts_for` helper rewrites known +# words to add `+` stress markers (Russian) or Cyrillic transliterations +# (foreign artist names) so Alice's TTS reads them naturally. Tables +# live in `tts_dictionary.py` for easier PR contributions; the regex +# matches BOTH Latin and Cyrillic words because foreign artist names +# arrive in Latin (e.g., the user said "Metallica" → command keeps it +# Latin → response text says "Metallica" → tts says "мет+аллика"). +_TTS_WORD_RE = re.compile(r"[A-Za-zА-Яа-яЁё]+") def _tts_for(text: str) -> str: - """Add `+` stress markers to known words for cleaner Alice TTS. - - Pure substitution — unknown words pass through unchanged. The map is - intentionally small (high-confidence Russian response words only); - expand via PRs as patterns emerge. + """Add `+` stress markers and foreign-name transliterations for Alice TTS. + + Two passes: + 1. Multi-word phrase replacement (longest first via the table's + declared order). Required for "Iron Maiden", "Pink Floyd" etc. + which the per-word regex cannot match across whitespace. + 2. Per-word substitution against ``WORD_REPLACEMENTS`` — covers + Russian response stresses and single-word foreign names. + + Unknown words pass through unchanged. 
The map is intentionally small + and curated — every entry is maintenance debt; add via PR when a + real-user log shows Alice mispronouncing a specific word. """ if not text: return text + # Phrase pass — case-insensitive whole-substring replacement. Walks + # the table in declared order so longer phrases (e.g. "red hot chili + # peppers") match before any sub-string entries. Result drops the + # original casing on the matched span — for TTS-only output that's + # acceptable (Alice doesn't render visual casing on voice surfaces; + # screen surfaces read `text`, not `tts`). + lowered = text.lower() + if any(phrase in lowered for phrase, _ in PHRASE_REPLACEMENTS): + for phrase, replacement in PHRASE_REPLACEMENTS: + idx = lowered.find(phrase) + while idx != -1: + text = text[:idx] + replacement + text[idx + len(phrase):] + lowered = text.lower() + idx = lowered.find(phrase, idx + len(replacement)) + def _sub(match: re.Match[str]) -> str: word = match.group(0) - replacement = _TTS_STRESS_MARKS.get(word.lower()) + replacement = WORD_REPLACEMENTS.get(word.lower()) if replacement is None: return word if word[:1].isupper(): @@ -130,6 +141,20 @@ def _safe_dict(value: Any) -> dict[str, Any]: return value if isinstance(value, dict) else {} +# Suggestion buttons appended to play-/control-success responses on +# screened surfaces (mobile Alice, station-max, navigator, smart-screen). +# Lets the user tap a follow-up without saying the activation phrase +# again. `hide=False` keeps them on screen until tapped or the next +# response replaces them. Voice-only surfaces (Mini, Pro, dumb speakers) +# don't render buttons — so we omit the field entirely on those. +_PLAYBACK_SUGGESTION_BUTTONS: list[dict[str, Any]] = [ + {"title": "Следующая", "hide": False}, + {"title": "Пауза", "hide": False}, + {"title": "Громче", "hide": False}, + {"title": "Тише", "hide": False}, +] + + def _has_screen(meta: Any) -> bool: """Return True if the calling surface has a display. 
@@ -240,6 +265,7 @@ def __init__( skill_id: str, webhook_secret: str, exposed_player_ids: set[str] | None = None, + voice_continuation: bool = False, logger: logging.Logger | None = None, ) -> None: """Initialize the handler. @@ -251,12 +277,19 @@ def __init__( webhook_secret: Random secret embedded in the webhook URL. exposed_player_ids: Optional restriction set; only these players are addressable by voice (passed to the player resolver). + voice_continuation: When True, play- and control-success + responses keep the conversation open (``end_session=False``) + so the user can issue follow-ups without re-saying the + activation phrase. Default False preserves today's + voice-UX. Stop / pause-with-no-resume utterances still + close the session via the existing control path. logger: Optional logger override. """ self._mass = mass self._skill_id = skill_id self._webhook_secret = webhook_secret self._exposed_player_ids = exposed_player_ids + self._voice_continuation = voice_continuation self._logger = logger or _LOGGER self._unregister_callbacks: list[Callable[[], None]] = [] # In-process state cache; see _STATE_CACHE_TTL_SEC / _MAX. @@ -573,6 +606,7 @@ async def _handle_webhook(self, request: web.Request) -> web.Response: # noqa: default_id=default_id, session_state_in=_without_pending(session_state_in), app_state_in=app_state_in, + has_screen=has_screen, ) # P0.4 — awaiting-query re-entry. If the previous turn asked "Что @@ -764,6 +798,7 @@ async def _dispatch_play( player=candidates[0], base_session_state=session_state_in, base_app_state=app_state_in, + has_screen=has_screen, ) # ------------------------------------------------------------------- @@ -778,6 +813,7 @@ def _handle_control( # noqa: PLR0915 default_id: str | None, session_state_in: dict[str, Any], app_state_in: dict[str, Any], + has_screen: bool = True, ) -> web.Response: """Resolve player + dispatch a control action; build response.""" # list_players is informational — no player resolution / dispatch. 
@@ -1001,13 +1037,21 @@ def _handle_control( # noqa: PLR0915 if isinstance(user_obj, dict) and user_obj.get("user_id"): user_state_update = {"preferred_player_id": player.player_id} text = control_confirmation(control) + # Stop is the natural session-end signal — even with voice + # continuation enabled, "стоп / выключи" should hand the mic + # back to the user instead of staying in the skill listening loop. + end_session = ( + True if control.action == "stop" else not self._voice_continuation + ) return self._yandex_response( incoming_session=session, text=text, tts=_tts_for(text), + end_session=end_session, session_state=new_session_state, application_state=new_app_state, user_state_update=user_state_update, + buttons=_PLAYBACK_SUGGESTION_BUTTONS if has_screen else None, ) # ------------------------------------------------------------------- @@ -1022,6 +1066,7 @@ async def _play_with_player( player: Any, base_session_state: dict[str, Any], base_app_state: dict[str, Any], + has_screen: bool = True, ) -> web.Response: """Search media, fire-and-forget play, build response with persisted state.""" try: @@ -1094,9 +1139,11 @@ async def _play_with_player( incoming_session=session, text=text, tts=_tts_for(text), + end_session=not self._voice_continuation, session_state=new_session_state, application_state=new_app_state, user_state_update=user_state_update, + buttons=_PLAYBACK_SUGGESTION_BUTTONS if has_screen else None, ) # ------------------------------------------------------------------- @@ -1348,6 +1395,7 @@ async def _try_resume_pending( player=chosen_player, base_session_state=session_state_in, base_app_state=app_state_in, + has_screen=has_screen, ) # ------------------------------------------------------------------- @@ -1365,6 +1413,7 @@ def _yandex_response( application_state: dict[str, Any] | None = None, user_state_update: dict[str, Any] | None = None, buttons: list[dict[str, Any]] | None = None, + card: dict[str, Any] | None = None, ) -> web.Response: 
"""Build a Yandex Dialogs response envelope. @@ -1373,6 +1422,14 @@ def _yandex_response( user-scoped state (set keys to None to clear). Omit a parameter to leave that bucket unchanged on Yandex's side. + ``card`` accepts one of the three Yandex card shapes: + ``BigImage`` (single image + title + description), + ``ItemsList`` (1-5 items, each with image + title), or + ``ImageGallery`` (1-7 images). Yandex silently drops the field + on voice-only surfaces, so callers must still gate emission on + ``meta.interfaces.screen`` to avoid wasted bandwidth and to + honour the buttons-on-screen-only contract. + Side effect: any time we set ``session_state`` or ``application_state``, the merged value is also written to the in-process state cache as a third-tier fallback (see @@ -1401,6 +1458,8 @@ def _yandex_response( } if buttons: response_body["buttons"] = buttons + if card: + response_body["card"] = card payload: dict[str, Any] = { "version": "1.0", "session": echoed, diff --git a/provider/plugin.py b/provider/plugin.py index a2a9d28..0189419 100644 --- a/provider/plugin.py +++ b/provider/plugin.py @@ -22,6 +22,7 @@ from .constants import ( CONF_DIALOG_SKILL_ID, + CONF_DIALOG_VOICE_CONTINUATION, CONF_DIALOG_WEBHOOK_SECRET, CONF_EXPOSED_PLAYERS, CONF_INSTANCE_NAME, @@ -44,6 +45,11 @@ async def handle_async_init(self) -> None: self._exposed_player_ids: set[str] | None = {str(item) for item in exposed_raw} else: self._exposed_player_ids = None + # Voice continuation (P1.4) — power-user toggle, no UI surface yet. + # Read directly from the config bag; absent → False (today's behaviour). + self._voice_continuation = bool( + self.config.get_value(CONF_DIALOG_VOICE_CONTINUATION) or False + ) async def loaded_in_mass(self) -> None: """Register the Dialogs webhook route once the webserver is up. 
@@ -60,6 +66,7 @@ async def loaded_in_mass(self) -> None: skill_id=self._dialog_skill_id, webhook_secret=self._dialog_webhook_secret, exposed_player_ids=self._exposed_player_ids, + voice_continuation=self._voice_continuation, ) self._dialogs_handler.register_routes() diff --git a/provider/tts_dictionary.py b/provider/tts_dictionary.py new file mode 100644 index 0000000..b2f8f4c --- /dev/null +++ b/provider/tts_dictionary.py @@ -0,0 +1,98 @@ +# ruff: noqa: RUF001 +"""TTS pronunciation hints for Alice's text-to-speech. + +Yandex Alice's TTS gives passable Russian pronunciation out of the box but +often mangles foreign artist / band names — "Metallica" becomes +"мэ-та-ли-ка" (Latin-letters → English-phonemes path), "Coldplay" becomes +"чол-дплай", etc. We intercept by emitting a Cyrillic transliteration with +a `+` stress marker into ``response.tts`` while keeping ``response.text`` +clean (the user reads the original; Alice speaks the cleaned-up form). + +Two tables: + +* ``WORD_REPLACEMENTS`` — single-word, applied via the per-word regex in + ``dialogs._tts_for``. Includes both Russian response words (stress + hints) and foreign single-word artist names (Cyrillic transliteration). +* ``PHRASE_REPLACEMENTS`` — multi-word, applied via whole-string + substitution before the word regex. Required because the per-word + regex can't match phrases like "Iron Maiden" or "Pink Floyd". + +Entries are LOWERCASE keys; the word pass re-capitalises, the phrase pass does not. +Add via PR — opportunistically, when a real-user log shows Alice +mangling a name. Don't bulk-import — every entry is config debt. 
+""" + +from __future__ import annotations + +# --------------------------------------------------------------------------- +# Single-word replacements +# --------------------------------------------------------------------------- + +WORD_REPLACEMENTS: dict[str, str] = { + # ----- Russian response words (P0.2 — stress hints) ----- + "включаю": "включ+аю", + "ставлю": "ст+авлю", + "пауза": "п+ауза", + "продолжаю": "продолж+аю", + "следующая": "сл+едующая", + "предыдущая": "пред+ыдущая", + "громче": "гр+омче", + "тише": "т+ише", + "громкость": "гр+омкость", + "колонке": "кол+онке", + "колонку": "кол+онку", + # ----- Foreign artists (single word) ----- + # Add here when a real log shows mispronunciation. Order: alpha by key. + "abba": "+абба", + "adele": "ад+эль", + "aerosmith": "+аэросмит", + "beatles": "б+итлз", + "beyonce": "бэй+онсэ", + "blur": "блёр", + "coldplay": "к+олдплей", + "depeche": "деп+еш", + "drake": "дрейк", + "eminem": "эмин+эм", + "evanescence": "иванэсс+энс", + "gorillaz": "горил+ас", + "imagine": "имадж+ин", + "kiss": "кисс", + "madonna": "мад+онна", + "metallica": "мет+аллика", + "muse": "мьюз", + "nirvana": "нирв+ана", + "oasis": "о+азис", + "queen": "квин", + "radiohead": "р+адиохед", + "rammstein": "р+амштайн", + "rihanna": "рих+анна", + "scorpions": "ск+орпионс", + "skillet": "ск+иллет", + "sting": "стинг", +} + + +# --------------------------------------------------------------------------- +# Multi-word phrase replacements (applied before per-word substitution) +# --------------------------------------------------------------------------- + +PHRASE_REPLACEMENTS: tuple[tuple[str, str], ...] = ( + # Order: longer phrases first to avoid sub-string clashes + # (e.g. "red hot chili peppers" must beat any "red"-prefixed entry). 
+ ("red hot chili peppers", "ред хот ч+или п+эпперс"), + ("imagine dragons", "имадж+ин др+агонс"), + ("arctic monkeys", "+арктик м+анкис"), + ("billie eilish", "б+илли +айлиш"), + ("black sabbath", "блэк с+аббат"), + ("foo fighters", "фу ф+айтерс"), + ("guns n roses", "ганз эн р+оузес"), + ("iron maiden", "+айрон м+эйден"), + ("lady gaga", "л+эди г+ага"), + ("led zeppelin", "лед цеппел+ин"), + ("linkin park", "л+инкин парк"), + ("pink floyd", "пинк фл+ойд"), + ("bruno mars", "бр+уно марс"), + ("daft punk", "дафт панк"), + ("ed sheeran", "эд ш+иран"), + ("taylor swift", "т+эйлор свифт"), +) diff --git a/tests/test_dialogs.py b/tests/test_dialogs.py index 8054568..41f5256 100644 --- a/tests/test_dialogs.py +++ b/tests/test_dialogs.py @@ -241,6 +241,138 @@ async def test_full_happy_path_starts_play_media(self) -> None: assert call_kwargs["media"] is track +@pytest.mark.asyncio +class TestSuggestionButtons: + """Phase 1 / P1.3: play- and control-success responses surface follow-up buttons on screen.""" + + def _make_handler(self, mass: MagicMock) -> DialogsWebhookHandler: + return DialogsWebhookHandler(mass, skill_id="skill-uuid-1", webhook_secret=_TEST_SECRET) + + async def test_play_success_emits_buttons_on_screen(self) -> None: + """Play-success on screened surface includes Следующая/Пауза/Громче/Тише buttons.""" + track = MagicMock(uri="library://track/1", spec_set=["uri"]) + mass = _make_mass([MockPlayer(player_id="p1", name="Кухня")], search_track=track) + handler = self._make_handler(mass) + body = { + "meta": {"interfaces": {"screen": {}}}, + "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, + "request": {"command": "включи Metallica на кухне"}, + } + resp = await handler._handle_webhook(_build_request(body)) + await asyncio.sleep(0) + assert resp.status == 200 + body_out = _response_body(resp) + button_titles = [b["title"] for b in body_out["response"]["buttons"]] + assert button_titles == ["Следующая", "Пауза", "Громче", "Тише"] + + 
async def test_play_success_no_buttons_voice_only(self) -> None: + """Play-success on a voice-only surface omits buttons entirely.""" + track = MagicMock(uri="library://track/1", spec_set=["uri"]) + mass = _make_mass([MockPlayer(player_id="p1", name="Кухня")], search_track=track) + handler = self._make_handler(mass) + body = { + # No meta.interfaces — voice-only (Yandex Mini etc.) + "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, + "request": {"command": "включи Metallica на кухне"}, + } + resp = await handler._handle_webhook(_build_request(body)) + await asyncio.sleep(0) + body_out = _response_body(resp) + assert "buttons" not in body_out["response"] + + async def test_control_success_emits_buttons_on_screen(self) -> None: + """Control-success (e.g. pause) on screened surface includes the same buttons.""" + mass = _make_mass([MockPlayer(player_id="p1", name="Кухня")]) + mass.player_queues.pause = AsyncMock() + handler = self._make_handler(mass) + body = { + "meta": {"interfaces": {"screen": {}}}, + "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, + "request": {"command": "пауза на кухне"}, + } + resp = await handler._handle_webhook(_build_request(body)) + await asyncio.sleep(0) + body_out = _response_body(resp) + button_titles = [b["title"] for b in body_out["response"]["buttons"]] + assert button_titles == ["Следующая", "Пауза", "Громче", "Тише"] + + +@pytest.mark.asyncio +class TestVoiceContinuation: + """Phase 1 / P1.4: opt-in `end_session=false` after play / control success.""" + + async def test_play_success_ends_session_by_default(self) -> None: + """Without the toggle, play-success closes the session (today's UX).""" + track = MagicMock(uri="library://track/1", spec_set=["uri"]) + mass = _make_mass([MockPlayer(player_id="p1", name="Кухня")], search_track=track) + handler = DialogsWebhookHandler(mass, skill_id="skill-uuid-1", webhook_secret=_TEST_SECRET) + body = { + "session": {"skill_id": "skill-uuid-1", 
"session_id": "s1", "new": False}, + "request": {"command": "включи Metallica на кухне"}, + } + resp = await handler._handle_webhook(_build_request(body)) + await asyncio.sleep(0) + body_out = _response_body(resp) + assert body_out["response"]["end_session"] is True + + async def test_play_success_keeps_session_open_when_continuation_on(self) -> None: + """With continuation on, play-success keeps the conversation alive.""" + track = MagicMock(uri="library://track/1", spec_set=["uri"]) + mass = _make_mass([MockPlayer(player_id="p1", name="Кухня")], search_track=track) + handler = DialogsWebhookHandler( + mass, + skill_id="skill-uuid-1", + webhook_secret=_TEST_SECRET, + voice_continuation=True, + ) + body = { + "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, + "request": {"command": "включи Metallica на кухне"}, + } + resp = await handler._handle_webhook(_build_request(body)) + await asyncio.sleep(0) + body_out = _response_body(resp) + assert body_out["response"]["end_session"] is False + + async def test_control_success_keeps_session_open_when_continuation_on(self) -> None: + """Continuation also applies to control-success (pause / volume / etc.).""" + mass = _make_mass([MockPlayer(player_id="p1", name="Кухня")]) + mass.player_queues.pause = AsyncMock() + handler = DialogsWebhookHandler( + mass, + skill_id="skill-uuid-1", + webhook_secret=_TEST_SECRET, + voice_continuation=True, + ) + body = { + "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, + "request": {"command": "пауза на кухне"}, + } + resp = await handler._handle_webhook(_build_request(body)) + await asyncio.sleep(0) + body_out = _response_body(resp) + assert body_out["response"]["end_session"] is False + + async def test_stop_action_ends_session_even_with_continuation_on(self) -> None: + """`стоп / выключи` always closes the session regardless of the toggle.""" + mass = _make_mass([MockPlayer(player_id="p1", name="Кухня")]) + mass.player_queues.stop = 
AsyncMock() + handler = DialogsWebhookHandler( + mass, + skill_id="skill-uuid-1", + webhook_secret=_TEST_SECRET, + voice_continuation=True, + ) + body = { + "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, + "request": {"command": "стоп на кухне"}, + } + resp = await handler._handle_webhook(_build_request(body)) + await asyncio.sleep(0) + body_out = _response_body(resp) + assert body_out["response"]["end_session"] is True + + # --------------------------------------------------------------------------- # Yandex state envelope (P0.1) + tts split (P0.2) # --------------------------------------------------------------------------- @@ -384,9 +516,9 @@ async def test_session_state_preserved_on_player_not_found(self) -> None: class TestTtsHelper: """Tests for _tts_for stress-mark substitution.""" - def test_known_word_gets_stress_mark(self) -> None: - """A known word from the dict has `+` injected before the stressed vowel.""" - assert _tts_for("Включаю Metallica") == "Включ+аю Metallica" + def test_known_russian_word_gets_stress_mark(self) -> None: + """A known Russian word has `+` injected before the stressed vowel.""" + assert _tts_for("Включаю джаз") == "Включ+аю джаз" def test_unknown_word_passes_through(self) -> None: """A word not in the dict is unchanged.""" @@ -403,6 +535,26 @@ def test_capitalisation_preserved(self) -> None: # Capitalised original. assert _tts_for("Включаю джаз") == "Включ+аю джаз" + def test_foreign_band_transliterated(self) -> None: + """Latin band names are transliterated to Cyrillic with stress marks.""" + # Single-word foreign band (regex pass). + assert _tts_for("Включаю Metallica") == "Включ+аю Мет+аллика" + # Lowercase form preserved. 
+ assert _tts_for("включаю metallica") == "включ+аю мет+аллика" + + def test_foreign_phrase_transliterated(self) -> None: + """Multi-word foreign band names are matched via the phrase pass.""" + result = _tts_for("Включаю Iron Maiden на кухне") + assert "+айрон м+эйден" in result.lower() + # Russian response words still get their stress mark in the same call. + assert "Включ+аю" in result + + def test_phrase_pass_handles_overlap(self) -> None: + """Longer phrases match before shorter sub-phrases (declared order).""" + # "imagine dragons" must win over the single-word "imagine" entry. + result = _tts_for("Imagine Dragons") + assert "имадж+ин др+агонс" in result.lower() + @pytest.mark.asyncio class TestTtsResponseField: From fdb24a8a895dd4647bb72933d00774645577982d Mon Sep 17 00:00:00 2001 From: Mikhail Nevskiy <139659391+trudenboy@users.noreply.github.com> Date: Thu, 7 May 2026 23:18:59 +0300 Subject: [PATCH 3/8] feat(nlu): declare custom intents on Yandex side, dispatch via request.nlu.intents (Phase 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Delegates intent classification to Yandex's grammar engine for the closed set of control commands and the my_wave play intent. The platform's synchronously-classified `request.nlu.intents.<form_name>` block now takes precedence in the webhook handler; the existing regex parsers (parse_command / parse_control) remain as fallback for phrases that don't match any declared grammar — so this is purely additive coverage, no regression risk. Eleven grammars ship in provider/dialogs_grammar.py: control.{pause,resume,next,previous,stop,volume_up,volume_down, shuffle_on,shuffle_off,now_playing} play.my_wave Each carries `positiveTests` for the dev-console "Протестировать" button and uses %lemma where multi-word morphology matters. 
All grammar bodies are conservative — Yandex's server-side validator catches malformed sources synchronously and surfaces them as DialogsIntentValidationError, so set_intents() will fail loud rather than silently deploying broken NLU. The intents pipeline runs between draft update and request_deploy, so they land in the same moderation cycle as the rest of the draft (no two-phase publish needed). Endpoints + payload shape were derived from a Playwright probe of the live dev console on 2026-05-07; the ya-dialogs-api>=2.1.0 dependency wraps the five new REST endpoints. Bumps ya-dialogs-api>=2.1.0 in pyproject.toml + manifest.json. Co-Authored-By: Claude Opus 4.7 (1M context) --- provider/auto_create.py | 2 + provider/auto_update.py | 2 + provider/dialogs.py | 36 +++++ provider/dialogs_grammar.py | 266 ++++++++++++++++++++++++++++++++++++ provider/manifest.json | 2 +- pyproject.toml | 2 +- tests/test_dialogs.py | 83 +++++++++++ 7 files changed, 391 insertions(+), 2 deletions(-) create mode 100644 provider/dialogs_grammar.py diff --git a/provider/auto_create.py b/provider/auto_create.py index 168ba56..231c25d 100644 --- a/provider/auto_create.py +++ b/provider/auto_create.py @@ -50,6 +50,7 @@ from .auth_session import cached_authenticated_session, make_cached_authenticator from .constants import DIALOG_CHANNEL +from .dialogs_grammar import build_grammar from .skill_logo import load_skill_logo_bytes if TYPE_CHECKING: @@ -258,6 +259,7 @@ async def _run_pipeline( description=description, structured_examples=structured_examples, activation_phrases=activation_phrases, + intents=build_grammar(), logo_bytes=load_skill_logo_bytes(), creator_factory=_make_logging_creator_factory(), ) diff --git a/provider/auto_update.py b/provider/auto_update.py index 77b0f6f..a592f15 100644 --- a/provider/auto_update.py +++ b/provider/auto_update.py @@ -27,6 +27,7 @@ from .auth_session import make_cached_authenticator from .constants import DIALOG_CHANNEL +from .dialogs_grammar import 
build_grammar _LOGGER = logging.getLogger(__name__) @@ -110,6 +111,7 @@ async def run_auto_update( description=description, structured_examples=structured_examples, activation_phrases=activation_phrases, + intents=build_grammar(), voice=voice, ) except InvalidCredentialsError as exc: diff --git a/provider/dialogs.py b/provider/dialogs.py index e8a8182..446edd5 100644 --- a/provider/dialogs.py +++ b/provider/dialogs.py @@ -59,11 +59,13 @@ DIALOG_WEBHOOK_BASE_PATH, ) from .dialogs_control import ( + ParsedControl, control_confirmation, execute_control, format_list_players, parse_control, ) +from .dialogs_grammar import parse_platform_intent from .dialogs_nlu import ( _VERB_RE, ParsedCommand, @@ -598,6 +600,40 @@ async def _handle_webhook(self, request: web.Request) -> web.Response: # noqa: if not isinstance(nlu, dict): nlu = {} nlu_entities = nlu.get("entities") if isinstance(nlu.get("entities"), list) else None + nlu_intents = nlu.get("intents") if isinstance(nlu.get("intents"), dict) else None + + # Phase 2 — platform-pre-classified intents take precedence over + # the regex parsers. When grammar matched the phrase upstream, + # the result lands in `request.nlu.intents.<form_name>`; map it + # back to our existing ParsedControl / ParsedCommand and skip the + # regex pass. Falls through to the regex parsers when the block + # is empty (no grammar declared or no match). 
+ platform = parse_platform_intent(nlu_intents) + if isinstance(platform, ParsedControl): + self._logger.debug( + "Platform intent → control %r (skipping regex parser)", platform + ) + return self._handle_control( + session=session, + control=platform, + default_id=default_id, + session_state_in=_without_pending(session_state_in), + app_state_in=app_state_in, + has_screen=has_screen, + ) + if isinstance(platform, ParsedCommand): + self._logger.debug( + "Platform intent → play %r (skipping regex parser)", platform + ) + return await self._dispatch_play( + session=session, + parsed=platform, + default_id=default_id, + session_state_in=session_state_in, + app_state_in=app_state_in, + has_screen=has_screen, + ) + if control := parse_control(command, entities=nlu_entities): self._logger.debug("Parsed dialog control %r → %r", command, control) return self._handle_control( diff --git a/provider/dialogs_grammar.py b/provider/dialogs_grammar.py new file mode 100644 index 0000000..44bdb7c --- /dev/null +++ b/provider/dialogs_grammar.py @@ -0,0 +1,266 @@ +# ruff: noqa: RUF001 +"""Yandex Dialogs custom-intent grammars for the Music Assistant skill. + +Each intent here is delivered to Yandex via `ya_dialogs_api.IntentDraft` + +`set_intents()` during skill provisioning. At runtime, when Yandex matches +a user's phrase against one of these grammars, it pre-classifies the +intent and surfaces it in `request.nlu.intents.<form_name>` — the +webhook handler reads that block first and only falls back to the +in-house regex parsers (`parse_command` / `parse_control`) when the +platform produced no match. + +Design notes: + +* **Conservative baseline.** This module ships a *subset* of our + regex-covered intents. The aim is platform-side coverage of the most + common phrasings, not 1:1 parity. Phrases that don't match here fall + through to the regex parsers (which remain authoritative for the + long tail). +* **`%lemma` directive** matches all morphological forms of the lemma + (e.g. 
`%lemma включить` covers «включи / включите / включай / + включить / включим»). Applied conservatively to verbs that have + multiple commonly-used forms. +* **Grammar source is server-validated synchronously.** Bad syntax + surfaces as `DialogsIntentValidationError` from `set_intents()`, so + the test suite for this module asserts the fixtures load without + raising once contributed. +* **Positive tests** double as documentation and a self-check — + Yandex's "Протестировать" button in the dev console can run them + individually for visual regression. + +Adding a new intent: append a new entry, regenerate the skill via the +plugin's "Apply skill changes" form action, observe the moderation +cycle complete (minutes to hours for private skills), then exercise the +phrase against a live device. +""" + +from __future__ import annotations + +from typing import Any + +from ya_dialogs_api import IntentDraft + +from .dialogs_control import ParsedControl +from .dialogs_nlu import ParsedCommand + +# --------------------------------------------------------------------------- +# Grammar fragments — control intents +# --------------------------------------------------------------------------- + +_PAUSE_GRAMMAR = """\ +root: + %lemma пауза + поставь на паузу + %lemma останови музыку + на паузу +""" + +_RESUME_GRAMMAR = """\ +root: + %lemma продолжить + %lemma возобновить + включи снова +""" + +_NEXT_GRAMMAR = """\ +root: + %lemma следующая + %lemma следующий трек + %lemma дальше + %lemma переключи +""" + +_PREVIOUS_GRAMMAR = """\ +root: + %lemma предыдущая + %lemma предыдущий трек + %lemma назад + %lemma вернись +""" + +_STOP_GRAMMAR = """\ +root: + %lemma стоп + %lemma останови + %lemma выключи + выключи музыку +""" + +_VOLUME_UP_GRAMMAR = """\ +root: + %lemma громче + сделай громче + %lemma прибавь +""" + +_VOLUME_DOWN_GRAMMAR = """\ +root: + %lemma тише + сделай тише + %lemma убавь +""" + +_SHUFFLE_ON_GRAMMAR = """\ +root: + %lemma перемешай + включи перемешивание + случайный 
порядок + в случайном порядке +""" + +_SHUFFLE_OFF_GRAMMAR = """\ +root: + выключи перемешивание + не перемешивай + по порядку +""" + +_NOW_PLAYING_GRAMMAR = """\ +root: + что играет + что сейчас играет + что мы слушаем + что за песня + что за трек +""" + + +# --------------------------------------------------------------------------- +# Grammar fragments — play intents +# --------------------------------------------------------------------------- + +_MY_WAVE_GRAMMAR = """\ +root: + %lemma включи мою волну + %lemma включи моё радио + %lemma поставь мою волну + моя волна +""" + + +# --------------------------------------------------------------------------- +# Builder +# --------------------------------------------------------------------------- + + +def build_grammar() -> list[IntentDraft]: + """Return the full list of custom intents to declare on the skill.""" + return [ + IntentDraft( + form_name="control.pause", + human_readable_name="Пауза", + source_text=_PAUSE_GRAMMAR, + positive_tests="пауза\nпоставь на паузу\nостанови музыку\nна паузу", + negative_tests="включи\nследующая", + ), + IntentDraft( + form_name="control.resume", + human_readable_name="Продолжить", + source_text=_RESUME_GRAMMAR, + positive_tests="продолжи\nпродолжить\nвозобнови\nвключи снова", + ), + IntentDraft( + form_name="control.next", + human_readable_name="Следующий трек", + source_text=_NEXT_GRAMMAR, + positive_tests="следующая\nследующий трек\nдальше\nпереключи", + ), + IntentDraft( + form_name="control.previous", + human_readable_name="Предыдущий трек", + source_text=_PREVIOUS_GRAMMAR, + positive_tests="предыдущая\nпредыдущий трек\nназад\nвернись", + ), + IntentDraft( + form_name="control.stop", + human_readable_name="Стоп", + source_text=_STOP_GRAMMAR, + positive_tests="стоп\nостанови\nвыключи\nвыключи музыку", + ), + IntentDraft( + form_name="control.volume_up", + human_readable_name="Громче", + source_text=_VOLUME_UP_GRAMMAR, + positive_tests="громче\nсделай громче\nприбавь", + ), + 
IntentDraft( + form_name="control.volume_down", + human_readable_name="Тише", + source_text=_VOLUME_DOWN_GRAMMAR, + positive_tests="тише\nсделай тише\nубавь", + ), + IntentDraft( + form_name="control.shuffle_on", + human_readable_name="Включить перемешивание", + source_text=_SHUFFLE_ON_GRAMMAR, + positive_tests="перемешай\nвключи перемешивание\nслучайный порядок", + ), + IntentDraft( + form_name="control.shuffle_off", + human_readable_name="Выключить перемешивание", + source_text=_SHUFFLE_OFF_GRAMMAR, + positive_tests="выключи перемешивание\nне перемешивай\nпо порядку", + ), + IntentDraft( + form_name="control.now_playing", + human_readable_name="Что играет", + source_text=_NOW_PLAYING_GRAMMAR, + positive_tests="что играет\nчто сейчас играет\nчто мы слушаем", + ), + IntentDraft( + form_name="play.my_wave", + human_readable_name="Моя волна", + source_text=_MY_WAVE_GRAMMAR, + positive_tests="включи мою волну\nпоставь мою волну\nвключи моё радио", + ), + ] + + +# --------------------------------------------------------------------------- +# Runtime: map platform-classified intents back to our internal dataclasses +# --------------------------------------------------------------------------- + +# Each grammar above declares an intent under a stable ``form_name``. When +# Yandex matches a user phrase against one of them, the webhook receives +# the intent name in ``request.nlu.intents.<form_name>`` (with any slot +# values, though our control intents declare none yet). This map keeps the +# runtime mapping in lockstep with the grammars — adding a new intent +# requires touching this dict so misclassification can't sneak through +# unnoticed. 
+_CONTROL_INTENT_MAP: dict[str, str] = { + "control.pause": "pause", + "control.resume": "resume", + "control.next": "next", + "control.previous": "previous", + "control.stop": "stop", + "control.volume_up": "volume_up", + "control.volume_down": "volume_down", + "control.shuffle_on": "shuffle_on", + "control.shuffle_off": "shuffle_off", + "control.now_playing": "now_playing", +} + + +def parse_platform_intent( + nlu_intents: dict[str, Any] | None, +) -> ParsedControl | ParsedCommand | None: + """Map a ``request.nlu.intents`` block to our dispatcher's dataclass. + + Returns ``None`` when: + * The block is missing / empty (no grammar declared, or no match). + * The matched intent name isn't one we ship a runtime handler for. + + Returns the FIRST recognised intent in iteration order. Yandex doesn't + guarantee a single match — when grammars overlap the platform may + surface several — but for our conservative grammar set the overlap + is engineered out (each phrase pattern lives in exactly one intent). 
+ """ + if not isinstance(nlu_intents, dict) or not nlu_intents: + return None + for form_name in nlu_intents: + action = _CONTROL_INTENT_MAP.get(form_name) + if action is not None: + return ParsedControl(action=action) # type: ignore[arg-type] + if form_name == "play.my_wave": + return ParsedCommand(kind="my_wave", query="", radio_mode=True) + return None diff --git a/provider/manifest.json b/provider/manifest.json index b552a4b..4cc2eb9 100644 --- a/provider/manifest.json +++ b/provider/manifest.json @@ -9,7 +9,7 @@ ], "requirements": [ "ya-passport-auth==1.3.0", - "ya-dialogs-api>=2.0.0" + "ya-dialogs-api>=2.1.0" ], "documentation": "https://github.com/trudenboy/ma-provider-yandex-alice", "stage": "beta", diff --git a/pyproject.toml b/pyproject.toml index 283df33..cec23ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires-python = ">=3.12" dynamic = ["version"] dependencies = [ "ya-passport-auth>=1.3.0", - "ya-dialogs-api>=2.0.0", + "ya-dialogs-api>=2.1.0", ] [project.optional-dependencies] diff --git a/tests/test_dialogs.py b/tests/test_dialogs.py index 41f5256..4663337 100644 --- a/tests/test_dialogs.py +++ b/tests/test_dialogs.py @@ -297,6 +297,89 @@ async def test_control_success_emits_buttons_on_screen(self) -> None: assert button_titles == ["Следующая", "Пауза", "Громче", "Тише"] +@pytest.mark.asyncio +class TestPlatformIntentDispatch: + """Phase 2: request.nlu.intents pre-classification takes precedence over regex.""" + + def _handler(self, mass: MagicMock) -> DialogsWebhookHandler: + return DialogsWebhookHandler(mass, skill_id="skill-uuid-1", webhook_secret=_TEST_SECRET) + + async def test_control_pause_via_platform_intent(self) -> None: + """`request.nlu.intents['control.pause']` → ParsedControl(action='pause').""" + mass = _make_mass([MockPlayer(player_id="p1", name="Кухня")]) + mass.player_queues.pause = AsyncMock() + handler = self._handler(mass) + body = { + "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": 
False}, + "request": { + "command": "пауза", + "nlu": {"intents": {"control.pause": {}}}, + }, + } + resp = await handler._handle_webhook(_build_request(body)) + await asyncio.sleep(0) + assert resp.status == 200 + # Pause was dispatched even though command="пауза" would also match regex. + mass.player_queues.pause.assert_awaited_once_with("p1") + + async def test_play_my_wave_via_platform_intent(self) -> None: + """`request.nlu.intents['play.my_wave']` → ParsedCommand(kind='my_wave').""" + mass = _make_mass([MockPlayer(player_id="p1", name="Кухня")]) + # _resolve_my_wave returns None when yandex_music provider absent → handler + # surfaces "не нашёл такую музыку" but still went through the my_wave path. + handler = self._handler(mass) + body = { + "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, + "request": { + # `command` is the noisy raw — wouldn't normally classify as my_wave, + # but the platform intent overrides it. + "command": "что-то совсем другое", + "nlu": {"intents": {"play.my_wave": {}}}, + }, + } + resp = await handler._handle_webhook(_build_request(body)) + await asyncio.sleep(0) + body_out = _response_body(resp) + # Platform path responded — it didn't fall through to "не понял". + # When yandex_music isn't available, the response is a graceful + # "не нашёл такую музыку" rather than "не понял команду". + assert "не понял" not in body_out["response"]["text"].lower() + + async def test_unrecognised_intent_falls_back_to_regex(self) -> None: + """Unknown form_name in intents → falls through to parse_control / parse_command.""" + mass = _make_mass([MockPlayer(player_id="p1", name="Кухня")]) + mass.player_queues.pause = AsyncMock() + handler = self._handler(mass) + body = { + "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, + "request": { + "command": "пауза", + # Unknown intent form_name — regex should pick it up instead. 
+ "nlu": {"intents": {"unknown.intent": {}}}, + }, + } + await handler._handle_webhook(_build_request(body)) + await asyncio.sleep(0) + # Regex parse_control caught "пауза" and dispatched. + mass.player_queues.pause.assert_awaited_once_with("p1") + + async def test_empty_intents_block_falls_back_to_regex(self) -> None: + """Empty `intents={}` (no grammar match) → regex parser still runs.""" + mass = _make_mass([MockPlayer(player_id="p1", name="Кухня")]) + mass.player_queues.next = AsyncMock() + handler = self._handler(mass) + body = { + "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, + "request": { + "command": "следующая", + "nlu": {"intents": {}}, + }, + } + await handler._handle_webhook(_build_request(body)) + await asyncio.sleep(0) + mass.player_queues.next.assert_awaited_once_with("p1") + + @pytest.mark.asyncio class TestVoiceContinuation: """Phase 1 / P1.4: opt-in `end_session=false` after play / control success.""" From 745291fb6a554f9a278dcab49df27438bc4c9191 Mon Sep 17 00:00:00 2001 From: Mikhail Nevskiy <139659391+trudenboy@users.noreply.github.com> Date: Thu, 7 May 2026 23:25:08 +0300 Subject: [PATCH 4/8] feat(nlu): handle built-in YANDEX.REJECT / YANDEX.HELP in pending flows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Yandex automatically classifies four built-in intents (CONFIRM / REJECT / HELP / REPEAT) once any custom grammar is declared on the skill — and we declared eleven in the previous Phase 2 commit. This wires up the two that have unambiguous behaviour in our flows: - YANDEX.REJECT during a pending disambiguation prompt or slot-elicit ("На какой колонке?" / "Что включить?") → respond "Хорошо, отменил.", clear pending state, end session. Outside of those prompts the intent falls through to normal command parsing — "отмена" without context is not a free-standing app-cancel signal. 
- YANDEX.HELP → contextual hint matching the current prompt: re-explain how to answer the disambiguation, suggest example queries during slot elicit, or surface a generic "включи рок на кухне" example otherwise. State is preserved so the user can answer the original question next. CONFIRM and REPEAT are deferred: - CONFIRM is ambiguous in our flows (which player is the user agreeing to? — we have no canonical "yes" target). - REPEAT requires caching the last response on session_state. Co-Authored-By: Claude Opus 4.7 (1M context) --- provider/dialogs.py | 44 ++++++++++++++ tests/test_dialogs.py | 133 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+) diff --git a/provider/dialogs.py b/provider/dialogs.py index 446edd5..6827a25 100644 --- a/provider/dialogs.py +++ b/provider/dialogs.py @@ -602,6 +602,50 @@ async def _handle_webhook(self, request: web.Request) -> web.Response: # noqa: nlu_entities = nlu.get("entities") if isinstance(nlu.get("entities"), list) else None nlu_intents = nlu.get("intents") if isinstance(nlu.get("intents"), dict) else None + # Built-in YANDEX.* intents — emitted automatically by Yandex once + # any custom grammar is declared. Two we care about today: + # + # * YANDEX.REJECT ("отмена / нет / неважно / отстань") — back out + # of any in-flight prompt. Clears pending_command / awaiting_query + # and ends the session so the user can speak again from scratch. + # * YANDEX.HELP ("помоги / что я могу / помощь") — surface a + # contextual hint depending on the current prompt; keeps state + # so the user can answer the original question afterwards. + # + # YANDEX.CONFIRM and YANDEX.REPEAT aren't wired today: confirm is + # ambiguous in our flows (which player are you confirming?) and + # repeat would require caching the last response on session_state + # — both deferred to a later session. 
+ if isinstance(nlu_intents, dict): + if "YANDEX.REJECT" in nlu_intents and (pending_in or awaiting_in): + self._logger.debug("YANDEX.REJECT in pending/awaiting state → cancel") + text = "Хорошо, отменил." + return self._yandex_response( + incoming_session=session, + text=text, + tts=_tts_for(text), + end_session=True, + session_state=_without_pending(session_state_in), + application_state=_without_pending(app_state_in), + ) + if "YANDEX.HELP" in nlu_intents: + if pending_in: + text = "Скажи имя колонки или её номер из списка." + elif awaiting_in: + text = "Скажи имя артиста, песни, альбома или плейлиста." + else: + text = "Скажи, например: включи рок на кухне." + self._logger.debug("YANDEX.HELP → contextual hint") + return self._yandex_response( + incoming_session=session, + text=text, + tts=_tts_for(text), + end_session=False, + # Preserve pending/awaiting state so the user can answer the + # original prompt right after this hint. + session_state=session_state_in, + ) + # Phase 2 — platform-pre-classified intents take precedence over # the regex parsers. 
When grammar matched the phrase upstream, # the result lands in `request.nlu.intents.`; map it diff --git a/tests/test_dialogs.py b/tests/test_dialogs.py index 4663337..8c9146f 100644 --- a/tests/test_dialogs.py +++ b/tests/test_dialogs.py @@ -380,6 +380,139 @@ async def test_empty_intents_block_falls_back_to_regex(self) -> None: mass.player_queues.next.assert_awaited_once_with("p1") +@pytest.mark.asyncio +class TestBuiltInIntents: + """Phase 2 follow-up: YANDEX.REJECT / YANDEX.HELP handling in pending flows.""" + + def _handler(self, mass: MagicMock) -> DialogsWebhookHandler: + return DialogsWebhookHandler(mass, skill_id="skill-uuid-1", webhook_secret=_TEST_SECRET) + + async def test_reject_in_pending_disambiguation_cancels(self) -> None: + """YANDEX.REJECT clears pending_command and ends session with confirmation.""" + mass = _make_mass([MockPlayer(player_id="p1", name="Кухня")]) + handler = self._handler(mass) + body = { + "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, + "request": { + "command": "отмена", + "nlu": {"intents": {"YANDEX.REJECT": {}}}, + }, + "state": { + "session": { + "pending_command": { + "kind": "search", + "query": "metallica", + "radio_mode": True, + "candidate_ids": ["p1", "p2"], + } + } + }, + } + resp = await handler._handle_webhook(_build_request(body)) + body_out = _response_body(resp) + assert body_out["response"]["end_session"] is True + assert "отменил" in body_out["response"]["text"].lower() + # pending_command cleared from session_state on response. 
+ assert "pending_command" not in body_out["session_state"] + mass.player_queues.play_media.assert_not_awaited() + + async def test_reject_in_awaiting_query_cancels(self) -> None: + """YANDEX.REJECT in slot-elicit ('Что включить?') also exits cleanly.""" + mass = _make_mass([MockPlayer(player_id="p1", name="Кухня")]) + handler = self._handler(mass) + body = { + "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, + "request": { + "command": "неважно", + "nlu": {"intents": {"YANDEX.REJECT": {}}}, + }, + "state": {"session": {"awaiting_query": True}}, + } + resp = await handler._handle_webhook(_build_request(body)) + body_out = _response_body(resp) + assert body_out["response"]["end_session"] is True + assert "awaiting_query" not in body_out["session_state"] + + async def test_reject_with_no_pending_falls_through(self) -> None: + """YANDEX.REJECT outside of any prompt context → falls through to normal flow. + + The intent isn't a free-standing 'cancel app' signal — the user + could just be talking. If parse_command also can't make sense of + 'отмена', it lands as a normal "не нашёл" search response. + """ + mass = _make_mass([MockPlayer(player_id="p1", name="Кухня")]) + handler = self._handler(mass) + body = { + "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, + "request": { + "command": "отмена", + "nlu": {"intents": {"YANDEX.REJECT": {}}}, + }, + } + resp = await handler._handle_webhook(_build_request(body)) + body_out = _response_body(resp) + # NOT the cancel response — handler fell through to play-search. 
+ assert "отменил" not in body_out["response"]["text"].lower() + + async def test_help_in_pending_emits_disambiguation_hint(self) -> None: + """YANDEX.HELP during disambiguation tells the user how to answer.""" + mass = _make_mass([MockPlayer(player_id="p1", name="Кухня")]) + handler = self._handler(mass) + body = { + "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, + "request": { + "command": "помоги", + "nlu": {"intents": {"YANDEX.HELP": {}}}, + }, + "state": { + "session": { + "pending_command": { + "kind": "search", + "query": "metallica", + "radio_mode": True, + "candidate_ids": ["p1", "p2"], + } + } + }, + } + resp = await handler._handle_webhook(_build_request(body)) + body_out = _response_body(resp) + assert body_out["response"]["end_session"] is False + assert "колонки" in body_out["response"]["text"].lower() + + async def test_help_in_awaiting_emits_query_hint(self) -> None: + """YANDEX.HELP during slot-elicit suggests example queries.""" + mass = _make_mass([MockPlayer(player_id="p1", name="Кухня")]) + handler = self._handler(mass) + body = { + "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, + "request": { + "command": "что я могу", + "nlu": {"intents": {"YANDEX.HELP": {}}}, + }, + "state": {"session": {"awaiting_query": True}}, + } + resp = await handler._handle_webhook(_build_request(body)) + body_out = _response_body(resp) + assert body_out["response"]["end_session"] is False + assert "артиста" in body_out["response"]["text"].lower() + + async def test_help_clean_state_emits_generic_hint(self) -> None: + """YANDEX.HELP with no in-flight prompt → generic example.""" + mass = _make_mass([MockPlayer(player_id="p1", name="Кухня")]) + handler = self._handler(mass) + body = { + "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, + "request": { + "command": "помощь", + "nlu": {"intents": {"YANDEX.HELP": {}}}, + }, + } + resp = await handler._handle_webhook(_build_request(body)) + 
body_out = _response_body(resp) + assert "включи рок" in body_out["response"]["text"].lower() + + @pytest.mark.asyncio class TestVoiceContinuation: """Phase 1 / P1.4: opt-in `end_session=false` after play / control success.""" From a2e7e12554d68d59b13c2741adb9acca72e43e11 Mon Sep 17 00:00:00 2001 From: Mikhail Nevskiy <139659391+trudenboy@users.noreply.github.com> Date: Thu, 7 May 2026 23:30:26 +0300 Subject: [PATCH 5/8] docs: add CLAUDE.md aligned with upstream + convert docstrings to Sphinx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upstream Music Assistant CLAUDE.md mandates Sphinx-style docstrings (`:param:` syntax) and explicitly bans Google-style (`Args:`) and bullet-style (`- param:`) — flagged in PR #3843 review by @chrisuthe. This commit: 1. Adds a CLAUDE.md at the repo root that mirrors upstream's relevant sections (Behaviour, Code Style, Branching) and adapts the rest to this provider repo's specifics (sync workflow, provider/ layout, pre-commit gate, debugging via $HOME/.musicassistant). Cross-refs the Copilot review findings (is_public_https_url for any new network input) so future contributors don't re-introduce the same bug. 2. Converts the six Google-style docstring sections that had crept in (auth_page.py, auth_session.py, dialog_skill_meta.py, dialogs.py, dialogs_control.py, dialogs_nlu.py) to Sphinx-style. No behaviour change. The webhook-handler error-handling concern from the same review thread is mentioned in CLAUDE.md as a known follow-up but not addressed here — that's a separate code change. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 127 ++++++++++++++++++++++++++++++++++ provider/auth_page.py | 5 +- provider/auth_session.py | 3 +- provider/dialog_skill_meta.py | 5 +- provider/dialogs.py | 27 ++++---- provider/dialogs_control.py | 5 +- provider/dialogs_nlu.py | 6 +- 7 files changed, 150 insertions(+), 28 deletions(-) create mode 100644 CLAUDE.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..326e4d5 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,127 @@ +# CLAUDE.md + +Yandex Alice voice-skill provider for Music Assistant. Source repo for the +`yandex_alice` plugin provider that lives at +`music_assistant/providers/yandex_alice/` upstream — code is authored here +and synced to `music-assistant/server` via `ma-provider-tools`. + +This file aligns with the upstream Music Assistant `CLAUDE.md` so that +provider code authored locally is shaped exactly like provider code in +the upstream tree (Sphinx docstrings, Behaviour rules, branching). + +## Behaviour + +- NEVER automatically reply on GitHub (PRs, issues, discussions) without + explicit consent from the developer. 
+ +## Layout + +- `provider/` — plugin source (mirrored to `music_assistant/providers/yandex_alice/` on sync) +- `tests/` — pytest suite (mirrored to `tests/providers/yandex_alice/`) +- `docs/` — research notes (`NLU_RESEARCH.md`, `VOICE_UX_RESEARCH.md`, `VOICE_COMMANDS.md`); not synced +- `provider/manifest.json` — provider metadata + runtime requirements +- `pyproject.toml` — dev-time deps + lint config; not synced + +## Development Commands + +- `.venv/bin/python -m pytest tests/` — run all tests +- `.venv/bin/python -m pytest tests/test_dialogs.py -k <pattern>` — single file / pattern +- `.venv/bin/python -m ruff check provider/ tests/` — lint +- `.venv/bin/python -m ruff format provider/ tests/` — auto-format +- `.venv/bin/python -m mypy provider/` — type check (strict mode) +- `pre-commit run --all-files` — full pre-commit gate + +Always run lint + tests + mypy before committing. Pre-commit hooks +mirror these checks plus gitleaks. CI runs `ruff format --check`, so +pushing without `ruff format` is the most common red build. + +## Code Style + +### Comments + +Only use comments to explain complex, multi-line blocks of code. Do not +comment obvious operations. + +### Docstring Format + +Use Sphinx-style docstrings with `:param:` / `:returns:` / `:raises:` +syntax. For simple functions, a single-line docstring is fine. + +Don't explain inner workings of the code in the docstrings (use inline +comments for that if/when needed). The docstring should provide clarity +to the **caller** of the function/method, not explain how it works +technically/internally. + +```python +def my_function(param1: str, param2: int, param3: bool = False) -> str: + """ + Brief one-line description of the function. + + :param param1: Description of what param1 is used for. + :param param2: Description of what param2 is used for. + :param param3: Description of what param3 is used for. + """ +``` + +Do **not** use Google-style (`Args:`) or bullet-style (`- param:`) +docstrings.
AI assistants tend to generate Google-style by default — +explicitly steer them to Sphinx, and rewrite anything that slips +through. + +### Provider style + +- Match the layering of `provider/dialogs*.py`: webhook handler → + parsers (`dialogs_nlu.py`, `dialogs_control.py`, `dialogs_grammar.py`) + → resolvers (`dialogs_player.py`). Keep network I/O in the handler; + parsers and resolvers are pure or take `mass: MusicAssistant` as a + dependency. +- Public-network inputs (URLs, hostnames, host headers) MUST go through + `is_public_https_url` from `provider/url_helpers.py` — both + `build_backend_uri` and the webhook probe rejected this in code + review (PR #3843, v1.2.2 fix). Never gate on scheme alone. +- `from __future__ import annotations` at the top of every Python file. + +## Branching and PRs + +- Default branch: `dev`. All work-in-progress PRs target `dev`. +- Long-lived feature branches: `feat/<name>` (e.g. `feat/platform-integration`). + Merge to `dev` once the feature lands. +- Versioned bugfixes go through `dev` too; tags / releases happen on + `dev` after sync to upstream completes. + +## Sync to upstream + +`ma-provider-tools` runs the sync workflow that propagates `provider/` +and `tests/` from this repo into `music-assistant/server` under their +canonical paths. Do not edit files inside `music-assistant/server/` +directly — changes there are overwritten on the next sync. + +CI in upstream `music-assistant/server` is the moderation gate; review +threads (e.g. PR #3843) drive bug fixes here, then a re-sync clears +them upstream. The CHANGELOG entries in this repo are the source of +truth for what landed. + +## Debugging + +- Music Assistant data: `$HOME/.musicassistant/` +- MA logs: `$HOME/.musicassistant/musicassistant.log` (current), + `musicassistant.log.1` etc. for older rotated logs +- MA database: `$HOME/.musicassistant/library.db` — query via `sqlite3`. + **Only execute SELECT queries** — never write to a live database.
+- Webhook traffic during local testing: tail the MA log filtered by + `Webhook recv:` (the structured DEBUG line emitted on every Yandex + request). Bumping the dialog logger to DEBUG via + `python -m music_assistant --log-level debug` is enough. + +## Other notes + +- The plugin reuses Yandex Passport cookies via `ya-passport-auth` and + the `app-store-api` REST surface via `ya-dialogs-api`. Both packages + are owned by this same author; bump versions in `pyproject.toml` + + `provider/manifest.json` together. +- Tests never make live Yandex calls. Mock `aiohttp.ClientSession` per + the pattern in `tests/test_auto_create.py` if a new test needs HTTP. +- Webhook handler error handling (PR #3843 review thread): wrap + post-validation body in `try / except` so a parse / dispatch error + surfaces as a Russian "что-то пошло не так" reply instead of HTTP 500 + → Alice silence. diff --git a/provider/auth_page.py b/provider/auth_page.py index 2ef12eb..886e4f5 100644 --- a/provider/auth_page.py +++ b/provider/auth_page.py @@ -346,9 +346,8 @@ async def perform_device_auth( any other channel results in a popup the frontend isn't listening for, so it never appears. - Raises: - LoginFailed: the Device Flow timed out, was rejected by - Yandex, or another Passport-level error escaped. + :raises LoginFailed: the Device Flow timed out, was rejected by + Yandex, or another Passport-level error escaped. """ if not session_id: raise LoginFailed( diff --git a/provider/auth_session.py b/provider/auth_session.py index 83be0f5..a1ddd99 100644 --- a/provider/auth_session.py +++ b/provider/auth_session.py @@ -48,8 +48,7 @@ async def cached_authenticated_session(x_token: str) -> AsyncIterator[aiohttp.Cl ``refresh_passport_cookies`` propagates so the caller can clear the cached token and start a fresh Device Flow on the next click. - Raises: - ValueError: ``x_token`` is empty. + :raises ValueError: ``x_token`` is empty. 
""" if not x_token: msg = "x_token is empty — cached authenticator requires an existing token" diff --git a/provider/dialog_skill_meta.py b/provider/dialog_skill_meta.py index b038e27..3226edb 100644 --- a/provider/dialog_skill_meta.py +++ b/provider/dialog_skill_meta.py @@ -66,9 +66,8 @@ def build_backend_uri(base_url: str, webhook_secret: str) -> str: can't reach (e.g. ``https://192.168.1.10`` or ``https://localhost``) and the user would only discover the failure once moderation finishes. - Raises: - ValueError: ``base_url`` is empty / not a public HTTPS URL, or - ``webhook_secret`` is empty. + :raises ValueError: ``base_url`` is empty / not a public HTTPS URL, + or ``webhook_secret`` is empty. """ base = (base_url or "").strip().rstrip("/") if not base: diff --git a/provider/dialogs.py b/provider/dialogs.py index 6827a25..a0e5fcd 100644 --- a/provider/dialogs.py +++ b/provider/dialogs.py @@ -272,20 +272,19 @@ def __init__( ) -> None: """Initialize the handler. - Args: - mass: MusicAssistant instance. - skill_id: Configured ``CONF_DIALOG_SKILL_ID``; payloads with a - different ``session.skill_id`` are rejected. - webhook_secret: Random secret embedded in the webhook URL. - exposed_player_ids: Optional restriction set; only these players - are addressable by voice (passed to the player resolver). - voice_continuation: When True, play- and control-success - responses keep the conversation open (``end_session=False``) - so the user can issue follow-ups without re-saying the - activation phrase. Default False preserves today's - voice-UX. Stop / pause-with-no-resume utterances still - close the session via the existing control path. - logger: Optional logger override. + :param mass: MusicAssistant instance. + :param skill_id: Configured ``CONF_DIALOG_SKILL_ID``; payloads + with a different ``session.skill_id`` are rejected. + :param webhook_secret: Random secret embedded in the webhook URL. 
+ :param exposed_player_ids: Optional restriction set; only these + players are addressable by voice (passed to the player resolver). + :param voice_continuation: When True, play- and control-success + responses keep the conversation open (``end_session=False``) + so the user can issue follow-ups without re-saying the + activation phrase. Default False preserves today's voice-UX. + Stop / pause-with-no-resume utterances still close the session + via the existing control path. + :param logger: Optional logger override. """ self._mass = mass self._skill_id = skill_id diff --git a/provider/dialogs_control.py b/provider/dialogs_control.py index 2e86480..517e427 100644 --- a/provider/dialogs_control.py +++ b/provider/dialogs_control.py @@ -396,9 +396,8 @@ def parse_control( def _plural_ru(n: int, forms: tuple[str, str, str]) -> str: """Pick the correct Russian quantitative form for `n`. - Args: - n: The number. - forms: ``(form_for_1, form_for_2_to_4, form_for_5_plus)``. + :param n: The number. + :param forms: ``(form_for_1, form_for_2_to_4, form_for_5_plus)``. Russian quantitative agreement: 1, 21, 31, … → form_for_1 (e.g. "колонку") diff --git a/provider/dialogs_nlu.py b/provider/dialogs_nlu.py index 76d9804..6955aef 100644 --- a/provider/dialogs_nlu.py +++ b/provider/dialogs_nlu.py @@ -349,9 +349,9 @@ def resolve_player_candidates( decision: chosen tier, candidate count, and the names of the candidates returned. - Returns: - A list with all players in the best non-empty tier. ``[]`` if - nothing matched. ``[player]`` for an unambiguous resolution. + :returns: A list with all players in the best non-empty tier. + ``[]`` if nothing matched. ``[player]`` for an unambiguous + resolution. 
""" candidates = list_exposed_players(mass, exposed_ids=exposed_ids) From d2ce60a6353478132089ff83840b480ffc067835 Mon Sep 17 00:00:00 2001 From: Mikhail Nevskiy <139659391+trudenboy@users.noreply.github.com> Date: Thu, 7 May 2026 23:34:25 +0300 Subject: [PATCH 6/8] fix(webhook): graceful fallback when inner dispatch raises (#3843) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per @chrisuthe's review of upstream PR #3843: only `request.json()` was guarded; an unexpected raise from a parser, the resolver, or MA dispatch bubbled to aiohttp → HTTP 500 → Alice silence on the user's device. Refactors `_handle_webhook` so the post-auth body lives in a new `_handle_authenticated_request` method, called inside a `try / except` that catches any non-CancelledError exception, logs it, and returns a generic Russian fallback ("Что-то пошло не так. Попробуй ещё раз.") with `end_session=False` so the conversation can continue. The original exception is still emitted via `_logger.exception` so operators can debug from `$HOME/.musicassistant/musicassistant.log`. Adds a regression test that injects a RuntimeError into `mass.players.all_players` (deep inside the play-resolve path) and verifies the response is HTTP 200 with the Russian fallback text and `end_session=false` — not HTTP 500. CLAUDE.md updated to call out the contract so future branches don't regress it. Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 11 ++++++---- provider/dialogs.py | 47 ++++++++++++++++++++++++++++++++++++++++++- tests/test_dialogs.py | 27 +++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 5 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 326e4d5..d3c637c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -121,7 +121,10 @@ truth for what landed. `provider/manifest.json` together. - Tests never make live Yandex calls. Mock `aiohttp.ClientSession` per the pattern in `tests/test_auto_create.py` if a new test needs HTTP. 
-- Webhook handler error handling (PR #3843 review thread): wrap - post-validation body in `try / except` so a parse / dispatch error - surfaces as a Russian "что-то пошло не так" reply instead of HTTP 500 - → Alice silence. +- Webhook handler error handling (PR #3843 review thread): the + post-auth dispatch is wrapped in `try / except` (`_handle_webhook` → + `_handle_authenticated_request`) so a parse / dispatch error surfaces + as a Russian "что-то пошло не так" reply instead of HTTP 500 → Alice + silence. Keep this guarantee intact when modifying the handler — any + new branch should also satisfy the + `test_unexpected_inner_exception_returns_graceful_fallback` test. diff --git a/provider/dialogs.py b/provider/dialogs.py index a0e5fcd..18c1497 100644 --- a/provider/dialogs.py +++ b/provider/dialogs.py @@ -406,7 +406,7 @@ def unregister_routes(self) -> None: # Webhook entry point # ------------------------------------------------------------------- - async def _handle_webhook(self, request: web.Request) -> web.Response: # noqa: PLR0915 + async def _handle_webhook(self, request: web.Request) -> web.Response: # Path secret already enforced by the route URL — getting here means # the secret matches. Still constant-time-compare it via the captured # path arg in case aiohttp routing ever changes. @@ -462,6 +462,51 @@ async def _handle_webhook(self, request: web.Request) -> web.Response: # noqa: # that resolve into a player action. Health signal only. self._authenticated_call_count += 1 + # Wrap post-auth dispatch so any unexpected exception (parser, + # search, MA dispatch, response builder) surfaces as a graceful + # Russian fallback instead of an aiohttp HTTP 500 → Alice silence. + # Logs the original exception for the operator to debug from + # `$HOME/.musicassistant/musicassistant.log`. Flagged in the + # upstream PR review (#3843, @chrisuthe) as a regression risk. 
+ try: + return await self._handle_authenticated_request( + body=body, session=session, req=req, has_screen=has_screen + ) + except asyncio.CancelledError: + raise + except Exception: + self._logger.exception( + "Unhandled error in dialog webhook handler — " + "responding with generic fallback (session_id=%s)", + session.get("session_id", ""), + ) + text = "Что-то пошло не так. Попробуй ещё раз." + return self._yandex_response( + incoming_session=session, + text=text, + tts=_tts_for(text), + end_session=False, + ) + + async def _handle_authenticated_request( # noqa: PLR0915 + self, + *, + body: dict[str, Any], + session: dict[str, Any], + req: dict[str, Any], + has_screen: bool, + ) -> web.Response: + """Dispatch the request body once authentication has cleared. + + Wrapped in ``try / except`` by the caller so any unexpected raise + from a parser, the resolver, or MA dispatch lands as a graceful + fallback response rather than HTTP 500. Returns ``web.Response``. + + :param body: Parsed JSON envelope. + :param session: ``body["session"]`` already coerced to dict. + :param req: ``body["request"]`` already coerced to dict. + :param has_screen: Result of :func:`_has_screen` on the request. + """ # State buckets. Three-tier read priority: # 1. ``state.session`` — per-conversation, set by us last turn. # 2. ``state.application`` — per-device, mirrored fallback. diff --git a/tests/test_dialogs.py b/tests/test_dialogs.py index 8c9146f..40574bf 100644 --- a/tests/test_dialogs.py +++ b/tests/test_dialogs.py @@ -240,6 +240,33 @@ async def test_full_happy_path_starts_play_media(self) -> None: assert call_kwargs["queue_id"] == "p1" assert call_kwargs["media"] is track + async def test_unexpected_inner_exception_returns_graceful_fallback(self) -> None: + """An unexpected raise from inner dispatch surfaces as a Russian fallback, not HTTP 500. 
+ + Flagged in the upstream PR review (#3843, @chrisuthe): only the + ``request.json()`` parse was guarded; everything afterwards + (parsers, search, dispatch) bubbled to aiohttp → HTTP 500 → + Alice silence. The handler now wraps the post-auth body in + ``try / except`` to keep the user-facing response intact. + """ + # Make `mass.players.all_players` raise — this triggers inside the + # play-resolve path so the exception happens DEEP in dispatch, + # well past the auth gate and parser pass. + mass = _make_mass([MockPlayer(player_id="p1", name="Кухня")]) + mass.players.all_players = MagicMock(side_effect=RuntimeError("boom")) + handler = self._make_handler(mass) + body = { + "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, + "request": {"command": "включи Metallica на кухне"}, + } + resp = await handler._handle_webhook(_build_request(body)) + # Critical: 200 OK with a Russian fallback, NOT HTTP 500. + assert resp.status == 200 + body_out = _response_body(resp) + assert "что-то пошло не так" in body_out["response"]["text"].lower() + # Session continues so the user can re-issue a command. + assert body_out["response"]["end_session"] is False + @pytest.mark.asyncio class TestSuggestionButtons: From 7f33a49e87203713cde8ba1f3f3763b543d33de5 Mon Sep 17 00:00:00 2001 From: Mikhail Nevskiy <139659391+trudenboy@users.noreply.github.com> Date: Thu, 7 May 2026 23:37:48 +0300 Subject: [PATCH 7/8] chore: add 'sting' to codespell ignore list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'sting' entry in tts_dictionary.py is the artist Стинг, not a typo of 'string'. Codespell flagged it on the PR #18 CI run. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index cec23ce..7847d6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,9 @@ filterwarnings = [ [tool.codespell] skip = "*.min.js,*.svg,docs-site/package-lock.json" -ignore-words-list = "hass," +# `sting` — the artist (Стинг), not a typo of "string". Lives in +# tts_dictionary.py. +ignore-words-list = "hass,sting" [tool.mypy] python_version = "3.12" From 77850cb3e2dfeb66ffa8a13aa7227cb44b3d007b Mon Sep 17 00:00:00 2001 From: Mikhail Nevskiy <139659391+trudenboy@users.noreply.github.com> Date: Thu, 7 May 2026 23:47:47 +0300 Subject: [PATCH 8/8] fix: address Copilot review on PR #18 + bump to 1.3.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three review threads from Copilot's pass on PR #18: - **dialogs.py:580** — DEBUG "Webhook recv" line was emitting `cmd` and `original_utterance` *before* the dangerous_context refusal branch, leaking flagged content into $HOME/.musicassistant/musicassistant.log. Both fields are now replaced with `` when the flag is set; the rest of the structured fields stay intact so operators still see traffic shape. Regression test injects a flagged phrase and asserts it's absent from caplog records. - **dialogs_control.py:318** — `volume_relative` clamped magnitude with `max(1, …)`, silently promoting "прибавь на 0" to a +1 bump. Clamp is now `max(0, …)` so the parsed delta matches the spoken number; zero is a valid no-op. Parametrised test covers all four phrasings. - **constants.py:119** — comment promised "спасибо" closes the session via the `stop` control intent, but parse_control does not match it. Corrected to the actually-matched phrases (стоп / останови / выключи / выключи музыку). Pure doc fix. 
Plus: pin `ya-dialogs-api==2.1.0` in provider/manifest.json (== rather than >=) so MA installs the exact version the provider was tested against. Bumps VERSION to 1.3.0 with a comprehensive CHANGELOG entry covering the full Phase 0–2 work landing in PR #18 plus these three Copilot fixes. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 106 ++++++++++++++++++++++++++++++++++ VERSION | 2 +- provider/constants.py | 5 +- provider/dialogs.py | 21 +++++-- provider/dialogs_control.py | 8 ++- provider/manifest.json | 2 +- tests/test_dialogs.py | 31 ++++++++++ tests/test_dialogs_control.py | 18 ++++++ 8 files changed, 180 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fff6b13..177c833 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,112 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [1.3.0] — 2026-05-07 + +Maximum-integration release for Yandex Dialogs platform features (Phases +0–2 of `docs/NLU_RESEARCH.md`). Six commits delivered on +`feat/platform-integration` and merged via PR +[#18](https://github.com/trudenboy/ma-provider-yandex-alice/pull/18). + +### Added + +- **Platform NLU consumption (Phase 0).** Read the rest of the Yandex + Dialogs request envelope: + - `meta.interfaces.screen` gates `buttons` emission so voice-only + surfaces (Mini, Pro) get the same ordinal-based prompt without + button payload. + - `request.markup.dangerous_context` short-circuits with a generic + "Не понял команду" + `end_session=true`; flagged content never + lands in `mass.music.search`. + - `request.nlu.entities[YANDEX.NUMBER]` feeds a new + `volume_relative` `ParsedControl` action: «прибавь на 20» / «убавь + 5» / «на 15 громче» reads current volume, applies signed delta, + clamps `[0, 100]`, dispatches `cmd_volume_set`. + - `request.original_utterance` logged alongside the normalised + `command` for misclassification post-mortems (DEBUG only). 
+ +- **Response polish for screened surfaces (Phase 1).** + - `card` parameter plumbed through `_yandex_response` (BigImage / + ItemsList / ImageGallery shapes documented; emission deferred to + Phase 1.5 — needs separate image-upload infrastructure). + - Suggestion buttons (Следующая / Пауза / Громче / Тише) appended + to play- and control-success responses on screened surfaces. + - `provider/tts_dictionary.py` carries ~26 single-word foreign + artist transliterations (Metallica → мет+аллика, Coldplay → + к+олдплей, …) plus 16 multi-word phrases (Iron Maiden, Pink + Floyd, …); `_tts_for` now matches both Latin and Cyrillic words + so foreign band names get pronounced correctly while `text` + stays clean. + - `voice_continuation` opt-in toggle (`CONF_DIALOG_VOICE_CONTINUATION`, + default off): when enabled, play- and control-success responses + keep the conversation open. `стоп / останови / выключи` always + close the session. + +- **Custom-intent grammar (Phase 2).** Eleven grammars declared on the + skill and dispatched at runtime via `request.nlu.intents`: + - `control.{pause, resume, next, previous, stop, volume_up, + volume_down, shuffle_on, shuffle_off, now_playing}` + - `play.my_wave` + - Each carries `positiveTests` for the dev-console "Протестировать" + button and uses `%lemma` directives to absorb morphology. + - Yandex's built-in `YANDEX.REJECT` (cancel pending prompt) and + `YANDEX.HELP` (contextual hint) are unlocked automatically once + any custom grammar is declared and now have runtime handlers. + - Regex parsers (`parse_command` / `parse_control`) remain as the + fallback when `request.nlu.intents` is empty — purely additive + coverage, no regression risk. + - Bumps `ya-dialogs-api==2.1.0` for the new `IntentDraft` API and + `set_intents` diff-based sync. + +- **Root `CLAUDE.md`** aligned with upstream Music Assistant + `CLAUDE.md` — Sphinx-style docstrings, sync workflow, network-input + validation contract, debugging notes. 
+ +### Fixed + +- **Webhook handler error handling**: post-auth dispatch is now wrapped + in `try / except` so a parser / resolver / MA-dispatch raise surfaces + as a Russian fallback ("Что-то пошло не так. Попробуй ещё раз.") + instead of HTTP 500 → Alice silence. Flagged in upstream + [music-assistant/server#3843](https://github.com/music-assistant/server/pull/3843) + by [@chrisuthe](https://github.com/chrisuthe). +- **Docstring style**: six existing Google-style docstrings (`Args:` / + `Raises:` / `Returns:`) converted to Sphinx-style (`:param:` / + `:raises:` / `:returns:`) per the upstream `CLAUDE.md` convention. + Flagged in the same upstream review. + +### Fixed (review on PR [#18](https://github.com/trudenboy/ma-provider-yandex-alice/pull/18)) + +- **Logs no longer leak flagged content.** When + `request.markup.dangerous_context=true`, the structured "Webhook recv" + DEBUG log was still emitting the `command` and `original_utterance` + fields *before* the refusal branch ran. Both are now redacted to + `<redacted: dangerous_context>` so flagged phrases never reach + `$HOME/.musicassistant/musicassistant.log`. Found by Copilot + ([#18 thread](https://github.com/trudenboy/ma-provider-yandex-alice/pull/18#discussion_r3204562269)). +- **`volume_relative` magnitude clamp accepts zero.** Previously + `max(1, …)` silently promoted "прибавь на 0" to a +1 bump. The + clamp is now `max(0, …)` so the parsed delta matches the spoken + number — `0` becomes a no-op rather than an unwanted volume change. + Found by Copilot + ([#18 thread](https://github.com/trudenboy/ma-provider-yandex-alice/pull/18#discussion_r3204562328)). +- **`CONF_DIALOG_VOICE_CONTINUATION` comment accuracy.** The doc-comment + promised that "спасибо" closes the session via the `stop` control + intent, but `parse_control` does not match it. Comment corrected to + the actual matched phrases: «стоп / останови / выключи / выключи + музыку».
Found by Copilot + ([#18 thread](https://github.com/trudenboy/ma-provider-yandex-alice/pull/18#discussion_r3204562358)). + +### Internal + +- 466 unit tests (was 411). Coverage spans every new code path + including the dangerous-content log redaction, zero-magnitude + volume parse, suggestion-button gating, voice-continuation toggle, + platform-intent dispatch, REJECT / HELP handlers, and the + webhook-error-recovery fallback. +- `pyproject.toml`: `codespell` ignores `sting` (the artist Стинг in + `tts_dictionary.py`, not a typo of `string`). + ## [1.2.3] — 2026-05-07 ### Fixed diff --git a/VERSION b/VERSION index 0495c4a..f0bb29e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.3 +1.3.0 diff --git a/provider/constants.py b/provider/constants.py index 0f87b8a..7f738a1 100644 --- a/provider/constants.py +++ b/provider/constants.py @@ -116,8 +116,9 @@ # the user re-says "Алиса, попроси " for the next command. ON keeps # `end_session=false` after success so follow-ups skip the activation # preamble at the cost of a "skill is listening" indicator on screened -# surfaces. Explicit "стоп / выключи / спасибо" still ends the session -# (those phrases parse as the existing `stop` control intent). +# surfaces. Explicit "стоп / останови / выключи / выключи музыку" still +# end the session via the existing `stop` control intent (matched by +# `parse_control` patterns in `dialogs_control.py`). CONF_DIALOG_VOICE_CONTINUATION = "dialog_voice_continuation" # Yandex Dialogs catalog voice options (TTS), passed to draft payload. diff --git a/provider/dialogs.py b/provider/dialogs.py index 18c1497..ad33650 100644 --- a/provider/dialogs.py +++ b/provider/dialogs.py @@ -569,16 +569,25 @@ async def _handle_authenticated_request( # noqa: PLR0915 # normalised `command` (Yandex strips punctuation and converts # spelled-out numbers; the raw form helps misclassification # post-mortems). 
- raw_suffix = ( - f" raw={original_utterance!r}" - if original_utterance and original_utterance != command - else "" - ) + # Flagged content (`dangerous_context=true`) is redacted from + # both `cmd` and the raw suffix — Yandex flags suicide / hate / + # violence phrasings and we don't want to persist any of that + # in DEBUG logs even at the operator's request. + if dangerous_context: + cmd_for_log: str | None = "<redacted: dangerous_context>" + raw_suffix = "" + else: + cmd_for_log = command + raw_suffix = ( + f" raw={original_utterance!r}" + if original_utterance and original_utterance != command + else "" + ) self._logger.debug( "Webhook recv: cmd=%r%s req_type=%s is_new=%s pending=%s " "(session=%s app=%s cache=%s) awaiting=%s default_player=%s " "dangerous=%s session_id=%s", - command, + cmd_for_log, raw_suffix, req.get("type", "SimpleUtterance"), is_new, diff --git a/provider/dialogs_control.py b/provider/dialogs_control.py index 517e427..00ce84d 100644 --- a/provider/dialogs_control.py +++ b/provider/dialogs_control.py @@ -313,9 +313,11 @@ def _try_match( n = _yandex_number(entities) if n is not None: # Clamp the magnitude so an absurd "прибавь на 999" doesn't - # underflow/overflow downstream arithmetic. Sign is applied - # to the (clamped) magnitude. - magnitude = max(1, min(100, abs(n))) + # underflow/overflow downstream arithmetic. ``0`` stays + # ``0`` — "прибавь на 0" is a valid (if pointless) no-op + # rather than the user's spoken zero being silently + # promoted to one.
+ magnitude = max(0, min(100, abs(n))) return ParsedControl( action="volume_relative", value=sign * magnitude, diff --git a/provider/manifest.json b/provider/manifest.json index 4cc2eb9..8f7d879 100644 --- a/provider/manifest.json +++ b/provider/manifest.json @@ -9,7 +9,7 @@ ], "requirements": [ "ya-passport-auth==1.3.0", - "ya-dialogs-api>=2.1.0" + "ya-dialogs-api==2.1.0" ], "documentation": "https://github.com/trudenboy/ma-provider-yandex-alice", "stage": "beta", diff --git a/tests/test_dialogs.py b/tests/test_dialogs.py index 40574bf..f35882e 100644 --- a/tests/test_dialogs.py +++ b/tests/test_dialogs.py @@ -240,6 +240,37 @@ async def test_full_happy_path_starts_play_media(self) -> None: assert call_kwargs["queue_id"] == "p1" assert call_kwargs["media"] is track + async def test_dangerous_context_log_redacts_command( + self, caplog: pytest.LogCaptureFixture + ) -> None: + """Flagged content must NOT leak into DEBUG logs even at operator's request. + + Copilot review on PR #18: the structured "Webhook recv" line was + emitting `cmd=...` and `raw=...` *before* the dangerous_context + refusal branch, so flagged phrases ended up in + $HOME/.musicassistant/musicassistant.log when DEBUG was on. + """ + mass = _make_mass([MockPlayer(player_id="p1", name="Кухня")]) + handler = self._make_handler(mass) + sensitive = "очень плохая фраза которую яндекс пометил" + body = { + "session": {"skill_id": "skill-uuid-1", "session_id": "s1", "new": False}, + "request": { + "command": sensitive, + "original_utterance": sensitive, + "markup": {"dangerous_context": True}, + }, + } + with caplog.at_level("DEBUG", logger="music_assistant.providers.yandex_alice.dialogs"): + await handler._handle_webhook(_build_request(body)) + # Flagged content must not be present in any log record. 
+ for record in caplog.records: + assert sensitive not in record.getMessage() + # Confirm we DID emit the structured log line (with the redaction marker) + # — silent skip would also satisfy the negative assertion above and is + # not what we want. + assert any("redacted: dangerous_context" in r.getMessage() for r in caplog.records) + async def test_unexpected_inner_exception_returns_graceful_fallback(self) -> None: """An unexpected raise from inner dispatch surfaces as a Russian fallback, not HTTP 500. diff --git a/tests/test_dialogs_control.py b/tests/test_dialogs_control.py index 9fe3beb..484bad1 100644 --- a/tests/test_dialogs_control.py +++ b/tests/test_dialogs_control.py @@ -289,6 +289,24 @@ def test_volume_set_still_wins_with_keyword(self) -> None: assert result.action == "volume_set" assert result.value == 30 + @pytest.mark.parametrize( + ("phrase", "expected_value"), + [ + ("прибавь на 0", 0), + ("убавь 0", 0), + ("на 0 громче", 0), + ("на 0 тише", 0), + ], + ) + def test_zero_magnitude_passes_through_as_zero( + self, phrase: str, expected_value: int + ) -> None: + """Zero magnitude is preserved (not promoted to ±1) — Copilot review on PR #18.""" + result = parse_control(phrase) + assert result is not None + assert result.action == "volume_relative" + assert result.value == expected_value + class TestPluralRu: """Tests for the Russian quantitative-form picker."""