Skip to content

Commit cf6ccb7

Browse files
fix punctuation processing
1 parent 134cd1c commit cf6ccb7

File tree

2 files changed

+7
-84
lines changed

2 files changed

+7
-84
lines changed

source/textUtils/wordSeg/wordSegStrategy.py

Lines changed: 7 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
from collections.abc import Callable
1919
from typing import Any
2020
import re
21+
import unicodedata
2122

2223
import textUtils
2324
from logHandler import log
@@ -289,8 +290,6 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) ->
289290
if len(self.wordEnds) <= 1:
290291
return self.text
291292

292-
from .wordSegUtils import NO_SEP_BEFORE, NO_SEP_AFTER
293-
294293
result = ""
295294
for sepIndex in range(len(self.wordEnds) - 1):
296295
preIndex = 0 if sepIndex == 0 else self.wordEnds[sepIndex - 1]
@@ -305,15 +304,13 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) ->
305304
# separator already present at either side -> skip adding
306305
continue
307306

308-
# slice to check the next token (text between curIndex and postIndex)
309-
nextSlice = self.text[curIndex:postIndex]
310-
311-
# Determine whether any punctuation forbids a separator BEFORE the next token
312-
noSepBefore = any(nextSlice.startswith(s) for s in NO_SEP_BEFORE)
313-
# Determine whether any punctuation forbids a separator AFTER the current result
314-
noSepAfter = any(result.endswith(s) for s in NO_SEP_AFTER)
307+
# Determine whether any punctuation forbids a separator
308+
noSep = (
309+
unicodedata.category(self.text[curIndex - 1])[0] in "pP"
310+
or unicodedata.category(self.text[curIndex])[0] in "pP"
311+
) # Punctuation categories
315312

316-
if not (noSepBefore or noSepAfter):
313+
if not noSep:
317314
# If neither side forbids the separator, add it
318315
result += sep
319316
if newSepIndex is not None:

source/textUtils/wordSeg/wordSegUtils.py

Lines changed: 0 additions & 74 deletions
Original file line number | Diff line number | Diff line change
@@ -117,77 +117,3 @@ def encodedToStrOffsets(
117117
else:
118118
resultEnd = self.computedEncodedToStrOffsets[encodedEnd]
119119
return (resultStart, resultEnd)
120-
121-
122-
# Punctuation that should NOT have a separator BEFORE it (no space before these marks)
123-
NO_SEP_BEFORE = {
124-
# Common Chinese fullwidth punctuation
125-
"。",
126-
",",
127-
"、",
128-
";",
129-
":",
130-
"?",
131-
"!",
132-
"…",
133-
"...",
134-
"—",
135-
"–",
136-
"——",
137-
")",
138-
"】",
139-
"》",
140-
"〉",
141-
"」",
142-
"』",
143-
"”",
144-
"’",
145-
"%",
146-
"‰",
147-
"¥",
148-
# Common ASCII / halfwidth punctuation
149-
".",
150-
",",
151-
";",
152-
":",
153-
"?",
154-
"!",
155-
"%",
156-
".",
157-
")",
158-
"]",
159-
"}",
160-
">",
161-
'"',
162-
"'",
163-
}
164-
165-
# Punctuation that should NOT have a separator AFTER it (no space after these marks)
166-
NO_SEP_AFTER = {
167-
# Common Chinese fullwidth opening/leading punctuation
168-
"(",
169-
"【",
170-
"《",
171-
"〈",
172-
"「",
173-
"『",
174-
"“",
175-
"‘",
176-
# Common ASCII / halfwidth opening/leading punctuation
177-
"(",
178-
"[",
179-
"{",
180-
"<",
181-
'"',
182-
"'",
183-
# Currency and prefix-like symbols that typically bind to the following token
184-
"$",
185-
"€",
186-
"£",
187-
"¥",
188-
"₹",
189-
# Social/identifier prefixes
190-
"@",
191-
"#",
192-
"&",
193-
}

0 commit comments

Comments
 (0)