Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion nvdaHelper/cppjieba/sconscript
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ env.AppendUnique(

cppjiebaLib = env.SharedLibrary(target="cppjieba", source=sourceFiles)

if not os.path.exists(outDir.Dir("dicts").get_abspath()) or not os.listdir(outDir.Dir("dicts").get_abspath()): # insure dicts installation happens only once and avoid a scons' warning
if not os.path.exists(outDir.Dir("dicts").get_abspath()) or not os.listdir(outDir.Dir("dicts").get_abspath()): # ensure dicts installation happens only once and avoid a scons' warning
env.Install(
outDir.Dir("dicts"),
[
Expand Down
18 changes: 8 additions & 10 deletions source/textUtils/wordSeg/wordSegStrategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from collections.abc import Callable
from typing import Any
import re
import unicodedata

import textUtils
from logHandler import log
Expand Down Expand Up @@ -289,8 +290,6 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) ->
if len(self.wordEnds) <= 1:
return self.text

from .wordSegUtils import NO_SEP_BEFORE, NO_SEP_AFTER

result = ""
for sepIndex in range(len(self.wordEnds) - 1):
preIndex = 0 if sepIndex == 0 else self.wordEnds[sepIndex - 1]
Expand All @@ -305,15 +304,14 @@ def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) ->
# separator already present at either side -> skip adding
continue

# slice to check the next token (text between curIndex and postIndex)
nextSlice = self.text[curIndex:postIndex]

# Determine whether any punctuation forbids a separator BEFORE the next token
noSepBefore = any(nextSlice.startswith(s) for s in NO_SEP_BEFORE)
# Determine whether any punctuation forbids a separator AFTER the current result
noSepAfter = any(result.endswith(s) for s in NO_SEP_AFTER)
# Determine whether any punctuation forbids a separator
PUNCTUATION_CATEGORIES: str = "pP" # Unicode categories for punctuation
noSep = (
unicodedata.category(self.text[curIndex - 1])[0] in PUNCTUATION_CATEGORIES
or unicodedata.category(self.text[curIndex])[0] in PUNCTUATION_CATEGORIES
)
Comment thread
seanbudd marked this conversation as resolved.

if not (noSepBefore or noSepAfter):
if not noSep:
# If neither side forbids the separator, add it
result += sep
if newSepIndex is not None:
Expand Down
118 changes: 14 additions & 104 deletions source/textUtils/wordSeg/wordSegUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,14 @@ def __init__(self, text: str):
self.newSepIndex: list[int] = []
self.encoded = WordSegmenter(text).segmentedText(sep=self.sep, newSepIndex=self.newSepIndex)

@property
@cached_property
def _separatorFlag(self) -> list[bool]:
isSep = [False] * self.encodedStringLength
for pos in self.newSepIndex:
isSep[pos] = True
return isSep

@cached_property
def computedStrToEncodedOffsets(self) -> list[int]:
"""
Compute a list of offsets so that:
Expand All @@ -32,23 +39,12 @@ def computedStrToEncodedOffsets(self) -> list[int]:
original index.
"""
strLen = self.strLength
encodedLen = self.encodedStringLength

# validate separator positions (optional but makes bugs obvious)
for pos in self.newSepIndex:
if pos < 0 or pos >= encodedLen:
raise ValueError(f"separator position {pos} out of range for encoded length {encodedLen}")

# mark which encoded positions are separators
isSep = [False] * encodedLen
for pos in self.newSepIndex:
isSep[pos] = True

# build explicit str -> encoded mapping
strToEncoded: list[int] = [0] * strLen
nextStrIndex = 0
for encodedIndex in range(encodedLen):
if not isSep[encodedIndex]:
for encodedIndex in range(self.encodedStringLength):
if not self._separatorFlag[encodedIndex]:
# assign the current original-char index to this encoded slot
# then advance to the next original index
if nextStrIndex >= strLen:
Expand All @@ -60,27 +56,15 @@ def computedStrToEncodedOffsets(self) -> list[int]:

return strToEncoded

@property
@cached_property
def computedEncodedToStrOffsets(self) -> list[int]:
encodedLen = self.encodedStringLength

# validate separator positions
for pos in self.newSepIndex:
if pos < 0 or pos >= encodedLen:
raise ValueError(f"separator position {pos} out of range for encoded length {encodedLen}")

# mark which encoded positions are separators
isSep = [False] * encodedLen
for pos in self.newSepIndex:
isSep[pos] = True

# build explicit encoded -> str mapping
# semantics: separator positions and the following encoded character
# both map to the same upcoming original str index (insertion point semantics).
encodedToStr: list[int] = [0] * encodedLen
encodedToStr: list[int] = [0] * self.encodedStringLength
nextStrIndex = 0
for encodedIndex in range(encodedLen):
if isSep[encodedIndex]:
for encodedIndex in range(self.encodedStringLength):
if self._separatorFlag[encodedIndex]:
# map separator to the next original character index (insertion point)
encodedToStr[encodedIndex] = nextStrIndex
else:
Expand Down Expand Up @@ -133,77 +117,3 @@ def encodedToStrOffsets(
else:
resultEnd = self.computedEncodedToStrOffsets[encodedEnd]
return (resultStart, resultEnd)


# Punctuation that should NOT have a separator BEFORE it (no space before these marks)
NO_SEP_BEFORE = {
# Common Chinese fullwidth punctuation
"。",
",",
"、",
";",
":",
"?",
"!",
"…",
"...",
"—",
"–",
"——",
")",
"】",
"》",
"〉",
"」",
"』",
"”",
"’",
"%",
"‰",
"¥",
# Common ASCII / halfwidth punctuation
".",
",",
";",
":",
"?",
"!",
"%",
".",
")",
"]",
"}",
">",
'"',
"'",
}

# Punctuation that should NOT have a separator AFTER it (no space after these marks)
NO_SEP_AFTER = {
# Common Chinese fullwidth opening/leading punctuation
"(",
"【",
"《",
"〈",
"「",
"『",
"“",
"‘",
# Common ASCII / halfwidth opening/leading punctuation
"(",
"[",
"{",
"<",
'"',
"'",
# Currency and prefix-like symbols that typically bind to the following token
"$",
"€",
"£",
"¥",
"₹",
# Social/identifier prefixes
"@",
"#",
"&",
}
12 changes: 1 addition & 11 deletions user_docs/en/changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ Windows 10 on ARM is also no longer supported.
* In the Add-on Store, a new action has been added to see the latest changes for the current version of add-ons. (#14041, @josephsl, @nvdaes)
* Chinese text can be navigated by word via built-in input gestures.
Several GUI elements have been added for its configuration in the `Document Navigation` panel. (#18735, @CrazySteve0605)
* Braille output for Chinese contains spaces as word separaters. (#18865, @CrazySteve0605)
* Braille output for Chinese contains spaces as word separators. (#18865, @CrazySteve0605)
* In browse mode, the number of items in a list is now reported in braille. (#7455, @nvdaes)

### Changes
Expand All @@ -47,10 +47,6 @@ Windows 10 (Version 1507) is the minimum Windows version supported.
We recommend using Windows 11, or if that is not possible, the latest Windows 10 release (Version 22H2). (#18684, @josephsl)
* NVDA no longer supports 32bit Windows or Windows 10 on ARM.



>>>>>>> try-chineseWordSegmentation-staging

* Added a button to the About dialog to copy the NVDA version number to the clipboard. (#18667)
* When entering a secure desktop, an installed copy of NVDA will automatically disable Braille temporarily, so that the secure desktop copy can access the braille display. (#2315, @LeonarddeR)
* The length of beeps used when "Line indentation reporting" is set to "Tones" or "Both Speech and Tones" has been reduced. (#18898)
Expand Down Expand Up @@ -117,17 +113,11 @@ Please open a GitHub issue if your add-on has an issue with updating to the new
* the `rgpszUsageIdentifier` member of the `updateCheck.CERT_USAGE_MATCH` struct is now of type `POINTER(LPSTR)` rather than `c_void_p` to correctly align with Microsoft documentation.
* The `UpdatableAddonsDialog.addonsList` is an instance of `gui.addonStoreGui.controls.addonList.AddonVirtualList`. (#18816, @nvdaes)
* `visionEnhancementProviders.screenCurtain.Magnification` has been removed.
<<<<<<< HEAD
All public symbols defined on this class are now accessible from `winBindings.magnification`. (#18958)
=======
All public symbols defined on this class are now accessible from `winBindings.magnification`. (#18958)
* `gui.nvdaControls.TabbableScrolledPanel` has been removed.

Use `wx.lib.scrolledpanel.ScrolledPanel` directly instead. (#17751)
* The following Windows 8.x Start screen support symbols have been removed from the `appModules.explorer` (File Explorer) app module with no replacement: `SuggestionListItem`, `SearchBoxClient`, `GridTileElement`, `GridListTileElement`, `GridGroup`, `ImmersiveLauncher`. (#18757, @josephsl)

>>>>>>> try-chineseWordSegmentation-staging

#### Deprecations

* `winVersion.WIN81` constant has been deprecated from the `winVersion` module. (#18684, @josephsl):
Expand Down
Loading