diff --git a/tests/test_encoding.py b/tests/test_encoding.py
index b77ca135..53185095 100644
--- a/tests/test_encoding.py
+++ b/tests/test_encoding.py
@@ -110,6 +110,22 @@ def test_encode_surrogate_pairs():
     assert enc.encode("\ud83d") == enc.encode("�")
 
 
+def test_encode_with_unstable_surrogate_pairs():
+    enc = tiktoken.get_encoding("cl100k_base")
+
+    # Would raise UnicodeEncodeError before the fix in core.py. A surrogate pair
+    # must encode like the character it represents and a lone surrogate like
+    # U+FFFD. Completions are sorted so the check does not depend on their order.
+    for text, expected in (("\ud83d\udc4d", "👍"), ("\ud83d", "�")):
+        stable, completions = enc.encode_with_unstable(text)
+        expected_stable, expected_completions = enc.encode_with_unstable(expected)
+        assert stable == expected_stable
+        assert sorted(completions) == sorted(expected_completions)
+
+    assert enc.encode("\ud83d\udc4d") == enc.encode("👍")
+    assert enc.encode("\ud83d") == enc.encode("�")
+
+
 @pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
 def test_catastrophically_repetitive(make_enc: Callable[[], tiktoken.Encoding]):
     enc = make_enc()
diff --git a/tiktoken/core.py b/tiktoken/core.py
index 530f8f59..f4fe87a4 100644
--- a/tiktoken/core.py
+++ b/tiktoken/core.py
@@ -240,7 +240,15 @@ def encode_with_unstable(
             if match := _special_token_regex(disallowed_special).search(text):
                 raise_disallowed_special_token(match.group())
 
-        return self._core_bpe.encode_with_unstable(text, allowed_special)
+        try:
+            return self._core_bpe.encode_with_unstable(text, allowed_special)
+        except UnicodeEncodeError:
+            # The text contains surrogate code points that the underlying call
+            # rejects. Round-trip through UTF-16 so that paired surrogates are
+            # merged into the character they encode and unpaired ones are
+            # replaced with U+FFFD, then retry with the repaired text.
+            text = text.encode("utf-16", "surrogatepass").decode("utf-16", "replace")
+            return self._core_bpe.encode_with_unstable(text, allowed_special)
 
     def encode_single_token(self, text_or_bytes: str | bytes) -> int:
         """Encodes text corresponding to a single token to its token value.