Fix punctuation processing: use Unicode categories, not hard-coded sets

CrazySteve0605 · CrazySteve0605 · commit cf6ccb76545a · 2025-12-30T16:36:37.000+08:00
diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py
@@ -18,6 +18,7 @@
 from collections.abc import Callable
 from typing import Any
 import re
+import unicodedata
 
 import textUtils
 from logHandler import log
@@ -289,8 +290,6 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) ->
 		if len(self.wordEnds) <= 1:
 			return self.text
 
-		from .wordSegUtils import NO_SEP_BEFORE, NO_SEP_AFTER
-
 		result = ""
 		for sepIndex in range(len(self.wordEnds) - 1):
 			preIndex = 0 if sepIndex == 0 else self.wordEnds[sepIndex - 1]
@@ -305,15 +304,13 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) ->
 				# separator already present at either side -> skip adding
 				continue
 
-			# slice to check the next token (text between curIndex and postIndex)
-			nextSlice = self.text[curIndex:postIndex]
-
-			# Determine whether any punctuation forbids a separator BEFORE the next token
-			noSepBefore = any(nextSlice.startswith(s) for s in NO_SEP_BEFORE)
-			# Determine whether any punctuation forbids a separator AFTER the current result
-			noSepAfter = any(result.endswith(s) for s in NO_SEP_AFTER)
+			# Suppress the separator when the character on either side of the boundary is punctuation (Unicode category P*)
+			noSep = (
+				unicodedata.category(self.text[curIndex - 1])[0] in "pP"
+				or unicodedata.category(self.text[curIndex])[0] in "pP"
+			)  # Punctuation categories
 
-			if not (noSepBefore or noSepAfter):
+			if not noSep:
 				# If neither side forbids the separator, add it
 				result += sep
 				if newSepIndex is not None:
diff --git a/source/textUtils/wordSeg/wordSegUtils.py b/source/textUtils/wordSeg/wordSegUtils.py
@@ -117,77 +117,3 @@ def encodedToStrOffsets(
 		else:
 			resultEnd = self.computedEncodedToStrOffsets[encodedEnd]
 			return (resultStart, resultEnd)
-
-
-# Punctuation that should NOT have a separator BEFORE it (no space before these marks)
-NO_SEP_BEFORE = {
-	# Common Chinese fullwidth punctuation
-	"。",
-	"，",
-	"、",
-	"；",
-	"：",
-	"？",
-	"！",
-	"…",
-	"...",
-	"—",
-	"–",
-	"——",
-	"）",
-	"】",
-	"》",
-	"〉",
-	"」",
-	"』",
-	"”",
-	"’",
-	"％",
-	"‰",
-	"￥",
-	# Common ASCII / halfwidth punctuation
-	".",
-	",",
-	";",
-	":",
-	"?",
-	"!",
-	"%",
-	".",
-	")",
-	"]",
-	"}",
-	">",
-	'"',
-	"'",
-}
-
-# Punctuation that should NOT have a separator AFTER it (no space after these marks)
-NO_SEP_AFTER = {
-	# Common Chinese fullwidth opening/leading punctuation
-	"（",
-	"【",
-	"《",
-	"〈",
-	"「",
-	"『",
-	"“",
-	"‘",
-	# Common ASCII / halfwidth opening/leading punctuation
-	"(",
-	"[",
-	"{",
-	"<",
-	'"',
-	"'",
-	# Currency and prefix-like symbols that typically bind to the following token
-	"$",
-	"€",
-	"£",
-	"¥",
-	"₹",
-	# Social/identifier prefixes
-	"@",
-	"#",
-	"&",
-}