From 49d8896c929b9760a3b438008b4952dd3d0e425d Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Braun <jbaptiste.braun@gmail.com>
Date: Sat, 23 May 2026 11:59:48 +0200
Subject: [PATCH] Repair surrogates in encode_with_unstable (Fixes #541)

encode() and encode_ordinary() already wrap their Rust call in
try/except UnicodeEncodeError and retry against the UTF-16
surrogatepass-repaired text. encode_with_unstable() did not, so it
surfaced a raw UnicodeEncodeError on un-joined surrogate pairs (e.g.
"\ud83d\udc4d") and lone surrogates that the other two methods accept.

This change mirrors the existing fallback in core.py:128-136 for
encode_with_unstable, and adds a regression test next to
test_encode_surrogate_pairs that exercises both code paths.
---
 tests/test_encoding.py | 26 ++++++++++++++++++++++++++
 tiktoken/core.py       | 10 +++++++++-
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/tests/test_encoding.py b/tests/test_encoding.py
index b77ca135..b44d68f7 100644
--- a/tests/test_encoding.py
+++ b/tests/test_encoding.py
@@ -110,6 +110,32 @@ def test_encode_surrogate_pairs():
     assert enc.encode("\ud83d") == enc.encode("�")
 
 
+def test_encode_with_unstable_surrogate_pairs():
+    # Regression for #541: `encode_with_unstable` historically surfaced a raw
+    # UnicodeEncodeError from the Rust boundary on un-joined surrogate pairs
+    # or lone surrogates, while `encode` / `encode_ordinary` already repaired
+    # such input via UTF-16 surrogatepass. The three methods should accept
+    # the same inputs.
+    enc = tiktoken.get_encoding("cl100k_base")
+
+    # Split surrogate pair, spelled as two \u escapes so it is NOT the same
+    # str as the literal emoji → must not raise. Compare against the repaired
+    # form (same path `encode` takes) rather than an exact token layout, as
+    # `encode_with_unstable` is documented as itself unstable. Completions
+    # come back from a Rust HashSet (unordered); compare as a set of tuples.
+    pair_stable, pair_completions = enc.encode_with_unstable("\ud83d\udc4d")
+    emoji_stable, emoji_completions = enc.encode_with_unstable("👍")
+    assert pair_stable == emoji_stable
+    assert {tuple(c) for c in pair_completions} == {tuple(c) for c in emoji_completions}
+
+    # Lone surrogate → matches the replacement-character encoding, mirroring
+    # the contract `test_encode_surrogate_pairs` asserts above for `encode`.
+    lone_stable, lone_completions = enc.encode_with_unstable("\ud83d")
+    repl_stable, repl_completions = enc.encode_with_unstable("�")
+    assert lone_stable == repl_stable
+    assert {tuple(c) for c in lone_completions} == {tuple(c) for c in repl_completions}
+
+
 @pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
 def test_catastrophically_repetitive(make_enc: Callable[[], tiktoken.Encoding]):
     enc = make_enc()
diff --git a/tiktoken/core.py b/tiktoken/core.py
index 530f8f59..8ecf233d 100644
--- a/tiktoken/core.py
+++ b/tiktoken/core.py
@@ -240,7 +240,15 @@ def encode_with_unstable(
             if match := _special_token_regex(disallowed_special).search(text):
                 raise_disallowed_special_token(match.group())
 
-        return self._core_bpe.encode_with_unstable(text, allowed_special)
+        try:
+            return self._core_bpe.encode_with_unstable(text, allowed_special)
+        except UnicodeEncodeError:
+            # Mirror the surrogate-repair fallback used in `encode` / `encode_ordinary`
+            # (see the comment in `encode`): lone surrogates or unmatched surrogate pairs
+            # can't round-trip through UTF-8, so we repair via UTF-16 surrogatepass and
+            # retry rather than surface the raw codec error from the Rust boundary.
+            text = text.encode("utf-16", "surrogatepass").decode("utf-16", "replace")
+            return self._core_bpe.encode_with_unstable(text, allowed_special)
 
     def encode_single_token(self, text_or_bytes: str | bytes) -> int:
         """Encodes text corresponding to a single token to its token value.