From 49d8896c929b9760a3b438008b4952dd3d0e425d Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Braun
Date: Sat, 23 May 2026 11:59:48 +0200
Subject: [PATCH] Repair surrogates in encode_with_unstable (Fixes #541)

encode() and encode_ordinary() already wrap their Rust call in
try/except UnicodeEncodeError and retry against the UTF-16
surrogatepass-repaired text. encode_with_unstable() did not, so it
surfaced a raw UnicodeEncodeError on unmatched surrogate pairs and lone
surrogates that the other two methods accept.

This change mirrors the existing fallback in core.py:128-136 for
encode_with_unstable, and adds a regression test next to
test_encode_surrogate_pairs that exercises both code paths.
---
 tests/test_encoding.py | 26 ++++++++++++++++++++++++++
 tiktoken/core.py       | 10 +++++++++-
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/tests/test_encoding.py b/tests/test_encoding.py
index b77ca135..b44d68f7 100644
--- a/tests/test_encoding.py
+++ b/tests/test_encoding.py
@@ -110,6 +110,32 @@ def test_encode_surrogate_pairs():
     assert enc.encode("\ud83d") == enc.encode("�")
 
 
+def test_encode_with_unstable_surrogate_pairs():
+    # Regression for #541: `encode_with_unstable` historically surfaced a raw
+    # UnicodeEncodeError from the Rust boundary on unmatched surrogate pairs
+    # or lone surrogates, while `encode` / `encode_ordinary` already repaired
+    # such input via UTF-16 surrogatepass. The three methods should accept
+    # the same inputs.
+    enc = tiktoken.get_encoding("cl100k_base")
+
+    # Split surrogate pair → must not raise. We compare against the repaired
+    # form (same path `encode` takes) rather than asserting an exact token
+    # layout, since `encode_with_unstable` is documented as itself unstable.
+    # The completions list comes back from a Rust HashSet, so ordering is
+    # not guaranteed; compare as a set of tuples.
+    pair_stable, pair_completions = enc.encode_with_unstable("\ud83d\udc4d")
+    emoji_stable, emoji_completions = enc.encode_with_unstable("👍")
+    assert pair_stable == emoji_stable
+    assert {tuple(c) for c in pair_completions} == {tuple(c) for c in emoji_completions}
+
+    # Lone surrogate → matches the replacement-character encoding, mirroring
+    # the contract `test_encode_surrogate_pairs` asserts above for `encode`.
+    lone_stable, lone_completions = enc.encode_with_unstable("\ud83d")
+    repl_stable, repl_completions = enc.encode_with_unstable("�")
+    assert lone_stable == repl_stable
+    assert {tuple(c) for c in lone_completions} == {tuple(c) for c in repl_completions}
+
+
 @pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
 def test_catastrophically_repetitive(make_enc: Callable[[], tiktoken.Encoding]):
     enc = make_enc()
diff --git a/tiktoken/core.py b/tiktoken/core.py
index 530f8f59..8ecf233d 100644
--- a/tiktoken/core.py
+++ b/tiktoken/core.py
@@ -240,7 +240,15 @@ def encode_with_unstable(
             if match := _special_token_regex(disallowed_special).search(text):
                 raise_disallowed_special_token(match.group())
 
-        return self._core_bpe.encode_with_unstable(text, allowed_special)
+        try:
+            return self._core_bpe.encode_with_unstable(text, allowed_special)
+        except UnicodeEncodeError:
+            # Mirror the surrogate-repair fallback used in `encode` / `encode_ordinary`
+            # (see the comment in `encode`): lone surrogates or unmatched surrogate pairs
+            # can't round-trip through UTF-8, so we repair via UTF-16 surrogatepass and
+            # retry rather than surface the raw codec error from the Rust boundary.
+            text = text.encode("utf-16", "surrogatepass").decode("utf-16", "replace")
+            return self._core_bpe.encode_with_unstable(text, allowed_special)
 
     def encode_single_token(self, text_or_bytes: str | bytes) -> int:
         """Encodes text corresponding to a single token to its token value.