diff --git a/nvdaHelper/cppjieba/sconscript b/nvdaHelper/cppjieba/sconscript index d59fd0e3431..714c99330a9 100644 --- a/nvdaHelper/cppjieba/sconscript +++ b/nvdaHelper/cppjieba/sconscript @@ -42,7 +42,7 @@ env.AppendUnique( cppjiebaLib = env.SharedLibrary(target="cppjieba", source=sourceFiles) -if not os.path.exists(outDir.Dir("dicts").get_abspath()) or not os.listdir(outDir.Dir("dicts").get_abspath()): # insure dicts installation happens only once and avoid a scons' warning +if not os.path.exists(outDir.Dir("dicts").get_abspath()) or not os.listdir(outDir.Dir("dicts").get_abspath()): # ensure dicts installation happens only once and avoid a scons' warning env.Install( outDir.Dir("dicts"), [ diff --git a/source/textUtils/wordSeg/wordSegStrategy.py b/source/textUtils/wordSeg/wordSegStrategy.py index f59f57aca0c..ef919e99cb7 100644 --- a/source/textUtils/wordSeg/wordSegStrategy.py +++ b/source/textUtils/wordSeg/wordSegStrategy.py @@ -18,6 +18,7 @@ from collections.abc import Callable from typing import Any import re +import unicodedata import textUtils from logHandler import log @@ -289,8 +290,6 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> if len(self.wordEnds) <= 1: return self.text - from .wordSegUtils import NO_SEP_BEFORE, NO_SEP_AFTER - result = "" for sepIndex in range(len(self.wordEnds) - 1): preIndex = 0 if sepIndex == 0 else self.wordEnds[sepIndex - 1] @@ -305,15 +304,15 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> # separator already present at either side -> skip adding continue - # slice to check the next token (text between curIndex and postIndex) - nextSlice = self.text[curIndex:postIndex] - - # Determine whether any punctuation forbids a separator BEFORE the next token - noSepBefore = any(nextSlice.startswith(s) for s in NO_SEP_BEFORE) - # Determine whether any punctuation forbids a separator AFTER the current result - noSepAfter = any(result.endswith(s) for s in NO_SEP_AFTER) + # Unicode categories for punctuation + PUNCTUATION_CATEGORIES: str = "pP" + # Determine whether any punctuation forbids a separator + noSep = ( + unicodedata.category(self.text[curIndex - 1])[0] in PUNCTUATION_CATEGORIES + or unicodedata.category(self.text[curIndex])[0] in PUNCTUATION_CATEGORIES + ) - if not (noSepBefore or noSepAfter): + if not noSep: # If neither side forbids the separator, add it result += sep if newSepIndex is not None: diff --git a/source/textUtils/wordSeg/wordSegUtils.py b/source/textUtils/wordSeg/wordSegUtils.py index dff909e2945..d26a26cd9ba 100644 --- a/source/textUtils/wordSeg/wordSegUtils.py +++ b/source/textUtils/wordSeg/wordSegUtils.py @@ -19,7 +19,14 @@ def __init__(self, text: str): self.newSepIndex: list[int] = [] self.encoded = WordSegmenter(text).segmentedText(sep=self.sep, newSepIndex=self.newSepIndex) - @property + @cached_property + def _separatorFlag(self) -> list[bool]: + isSep = [False] * self.encodedStringLength + for pos in self.newSepIndex: + isSep[pos] = True + return isSep + + @cached_property def computedStrToEncodedOffsets(self) -> list[int]: """ Compute a list of offsets so that: @@ -32,23 +39,12 @@ def computedStrToEncodedOffsets(self) -> list[int]: original index. """ strLen = self.strLength - encodedLen = self.encodedStringLength - - # validate separator positions (optional but makes bugs obvious) - for pos in self.newSepIndex: - if pos < 0 or pos >= encodedLen: - raise ValueError(f"separator position {pos} out of range for encoded length {encodedLen}") - - # mark which encoded positions are separators - isSep = [False] * encodedLen - for pos in self.newSepIndex: - isSep[pos] = True # build explicit str -> encoded mapping strToEncoded: list[int] = [0] * strLen nextStrIndex = 0 - for encodedIndex in range(encodedLen): - if not isSep[encodedIndex]: + for encodedIndex in range(self.encodedStringLength): + if not self._separatorFlag[encodedIndex]: # assign the current original-char index to this encoded slot # then advance to the next original index if nextStrIndex >= strLen: @@ -60,27 +56,15 @@ def computedStrToEncodedOffsets(self) -> list[int]: return strToEncoded - @property + @cached_property def computedEncodedToStrOffsets(self) -> list[int]: - encodedLen = self.encodedStringLength - - # validate separator positions - for pos in self.newSepIndex: - if pos < 0 or pos >= encodedLen: - raise ValueError(f"separator position {pos} out of range for encoded length {encodedLen}") - - # mark which encoded positions are separators - isSep = [False] * encodedLen - for pos in self.newSepIndex: - isSep[pos] = True - # build explicit encoded -> str mapping # semantics: separator positions and the following encoded character # both map to the same upcoming original str index (insertion point semantics). - encodedToStr: list[int] = [0] * encodedLen + encodedToStr: list[int] = [0] * self.encodedStringLength nextStrIndex = 0 - for encodedIndex in range(encodedLen): - if isSep[encodedIndex]: + for encodedIndex in range(self.encodedStringLength): + if self._separatorFlag[encodedIndex]: # map separator to the next original character index (insertion point) encodedToStr[encodedIndex] = nextStrIndex else: @@ -133,77 +117,3 @@ def encodedToStrOffsets( else: resultEnd = self.computedEncodedToStrOffsets[encodedEnd] return (resultStart, resultEnd) - - -# Punctuation that should NOT have a separator BEFORE it (no space before these marks) -NO_SEP_BEFORE = { - # Common Chinese fullwidth punctuation - "。", - ",", - "、", - ";", - ":", - "?", - "!", - "…", - "...", - "—", - "–", - "——", - ")", - "】", - "》", - "〉", - "」", - "』", - "”", - "’", - "%", - "‰", - "¥", - # Common ASCII / halfwidth punctuation - ".", - ",", - ";", - ":", - "?", - "!", - "%", - ".", - ")", - "]", - "}", - ">", - '"', - "'", -} - -# Punctuation that should NOT have a separator AFTER it (no space after these marks) -NO_SEP_AFTER = { - # Common Chinese fullwidth opening/leading punctuation - "(", - "【", - "《", - "〈", - "「", - "『", - "“", - "‘", - # Common ASCII / halfwidth opening/leading punctuation - "(", - "[", - "{", - "<", - '"', - "'", - # Currency and prefix-like symbols that typically bind to the following token - "$", - "€", - "£", - "¥", - "₹", - # Social/identifier prefixes - "@", - "#", - "&", -} diff --git a/user_docs/en/changes.md b/user_docs/en/changes.md index e337a9e9e5a..6e3740b52bf 100644 --- a/user_docs/en/changes.md +++ b/user_docs/en/changes.md @@ -33,7 +33,7 @@ Windows 10 on ARM is also no longer supported. * In the Add-on Store, a new action has been added to see the latest changes for the current version of add-ons. (#14041, @josephsl, @nvdaes) * Chinese text can be navigated by word via build-in input gestures. Several GUI elements are added for its configuration in `Document Navigation` panel. (#18735, @CrazySteve0605) -* Braille output for Chinese contains spaces as word separaters. (#18865, @CrazySteve0605) +* Braille output for Chinese contains spaces as word separators. (#18865, @CrazySteve0605) * In browse mode, the number of items in a list is now reported in braille. (#7455, @nvdaes) ### Changes @@ -47,10 +47,6 @@ Windows 10 (Version 1507) is the minimum Windows version supported. We recommend using Windows 11, or if that is not possible, the latest Windows 10 release (Version 22H2). (#18684, @josephsl) * NVDA no longer supports 32bit Windows or Windows 10 on ARM. - - ->>>>>>> try-chineseWordSegmentation-staging - * Added a button to the About dialog to copy the NVDA version number to the clipboard. (#18667) * When entering a secure desktop, an installed copy of NVDA will automatically disable Braille temporarily, so that the secure desktop copy can access the braille display. (#2315, @LeonarddeR) * The length of beeps used when "Line indentation reporting" is set to "Tones" or "Both Speech and Tones" has been reduced. (#18898) @@ -117,17 +113,11 @@ Please open a GitHub issue if your add-on has an issue with updating to the new * the `rgpszUsageIdentifier` member of the `updateCheck.CERT_USAGE_MATCH` struct is now of type `POINTER(LPSTR)` rather than `c_void_p` to correctly align with Microsoft documentation. * The `UpdatableAddonsDialog.addonsList` is an instance of `gui.addonStoreGui.controls.addonList.AddonVirtualList`. (#18816, @nvdaes) * `visionEnhancementProviders.screenCurtain.Magnification` has been removed. -<<<<<<< HEAD - All public symbols defined on this class are now accessible from `winBindings.magnification`. (#18958) -======= All public symbols defined on this class are now accessible from `winBindings.magnification`. (#18958) * `gui.nvdaControls.TabbableScrolledPanel` has been removed. - Use `wx.lib.scrolledpanel.ScrolledPanel` directly instead. (#17751) * The following Windows 8.x Start screen support symbols have been removed from `appModules.explorer` (File Explorer) app module with no replacement: `SuggestionListItem`, `SearchBoxClient`, `GridTileElement`, `GridListTileElement`, `GridGroup`, `ImmersiveLauncher`. (#18757, @josephsl) ->>>>>>> try-chineseWordSegmentation-staging - #### Deprecations * `winVersion.WIN81` constant has been deprecated from the `winVersion` module. (#18684, @josephsl):