Skip to content

Commit 4ca2629

Browse files
Merge pull request #523 from SixLabors/js/fix-522
Fix stale LB25 state causing spurious breaks after colon-separated text
2 parents 7396060 + 75f3567 commit 4ca2629

File tree

3 files changed

+196
-53
lines changed

3 files changed

+196
-53
lines changed

src/SixLabors.Fonts/Unicode/LineBreakEnumerator.cs

Lines changed: 112 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,22 +6,45 @@
66
namespace SixLabors.Fonts.Unicode;
77

88
/// <summary>
9-
/// Supports a simple iteration over a linebreak collection.
10-
/// Implementation of the Unicode Line Break Algorithm. UAX:14
11-
/// <see href="https://www.unicode.org/reports/tr14/tr14-37.html"/>
9+
/// Enumerates potential line break opportunities for a span of text.
10+
/// This is the engine behind the Unicode Line Breaking Algorithm as defined by
11+
/// Unicode Standard Annex #14 (UAX #14):
12+
/// <see href="https://www.unicode.org/reports/tr14/"/>.
13+
///
14+
/// The Unicode rules are the source of truth for the logic below. Comments intentionally
15+
/// call out the UAX rule numbers so readers can cross-check the implementation against the
16+
/// specification instead of treating the local state flags as standalone behavior.
17+
///
18+
/// The implementation walks one code point at a time, keeps a two-code-point window
19+
/// (<see cref="currentClass"/> on the left and <see cref="nextClass"/> on the right), and
20+
/// carries a small amount of extra state for rules that depend on more than the current pair.
1221
/// Methods are pattern-matched by compiler to allow using foreach pattern.
1322
/// </summary>
1423
internal ref struct LineBreakEnumerator
1524
{
25+
// Iteration state:
26+
// - charPosition is the UTF-16 offset into the source span.
27+
// - position is the code point index after the most recently consumed code point.
28+
// - lastPosition is the candidate break boundary between currentClass and nextClass.
1629
private readonly ReadOnlySpan<char> source;
1730
private int charPosition;
1831
private readonly int pointsLength;
1932
private int position;
2033
private int lastPosition;
34+
35+
// The active pair under consideration. currentClass is the left side of the boundary and
36+
// nextClass is the right side. Most of UAX #14 reduces to deciding whether that pair breaks.
2137
private LineBreakClass currentClass;
2238
private LineBreakClass nextClass;
2339
private bool first;
40+
41+
// Tracks whether we are inside an AL/HL/NU run, including trailing combining marks.
42+
// This is used by LB30, which needs more context than the pair table alone can express.
2443
private int alphaNumericCount;
44+
45+
// Stateful rule flags for the UAX #14 rules that cannot be represented by a simple
46+
// currentClass/nextClass lookup. The field names intentionally mirror the rule numbers
47+
// so the implementation can be read side by side with the spec.
2548
private bool lb8a;
2649
private bool lb21a;
2750
private bool lb22ex;
@@ -31,6 +54,10 @@ internal ref struct LineBreakEnumerator
3154
private int lb30a;
3255
private bool lb31;
3356

57+
/// <summary>
58+
/// Initializes a new instance of the <see cref="LineBreakEnumerator"/> struct.
59+
/// </summary>
60+
/// <param name="source">The source text to inspect for UAX #14 break opportunities.</param>
3461
public LineBreakEnumerator(ReadOnlySpan<char> source)
3562
: this()
3663
{
@@ -53,6 +80,9 @@ public LineBreakEnumerator(ReadOnlySpan<char> source)
5380
this.lb30a = 0;
5481
}
5582

83+
/// <summary>
84+
/// Gets the most recently discovered line break opportunity.
85+
/// </summary>
5686
public LineBreak Current { get; private set; }
5787

5888
/// <summary>
@@ -70,7 +100,8 @@ public LineBreakEnumerator(ReadOnlySpan<char> source)
70100
/// </returns>
71101
public bool MoveNext()
72102
{
73-
// Get the first char if we're at the beginning of the string.
103+
// Prime the left side of the pair window. After this block the loop below always
104+
// decides the boundary between currentClass (left) and nextClass (right).
74105
if (this.first)
75106
{
76107
LineBreakClass firstClass = this.NextCharClass();
@@ -87,7 +118,8 @@ public bool MoveNext()
87118
LineBreakClass lastClass = this.nextClass;
88119
this.nextClass = this.NextCharClass();
89120

90-
// Explicit newline
121+
// Required breaks from BK/CR/LF are emitted before any pair-table logic.
122+
// This matches the UAX handling for explicit line terminators.
91123
switch (this.currentClass)
92124
{
93125
case LineBreakClass.BK:
@@ -97,9 +129,12 @@ public bool MoveNext()
97129
return true;
98130
}
99131

132+
// Handle the classes that have bespoke UAX behavior first, then fall back to the
133+
// pair table plus the stateful exceptions tracked on this enumerator.
100134
bool? shouldBreak = this.GetSimpleBreak() ?? (bool?)this.GetPairTableBreak(lastClass);
101135

102-
// Rule LB8a
136+
// LB8a suppresses breaks after a ZWJ. We record it after processing the current
137+
// boundary so it applies to the next boundary instead of the one we just decided.
103138
this.lb8a = this.nextClass == LineBreakClass.ZWJ;
104139

105140
if (shouldBreak.Value)
@@ -121,6 +156,8 @@ public bool MoveNext()
121156
break;
122157
}
123158

159+
// UAX also exposes an end-of-text boundary. Callers use that final boundary to
160+
// finalize the trailing line even when no earlier break opportunity was taken.
124161
this.Current = new LineBreak(this.FindPriorNonWhitespace(this.pointsLength), this.lastPosition, required);
125162
return true;
126163
}
@@ -129,6 +166,9 @@ public bool MoveNext()
129166
return false;
130167
}
131168

169+
/// <summary>
170+
/// Applies the LB1 class remapping required before any pair-table decisions are made.
171+
/// </summary>
132172
private static LineBreakClass MapClass(CodePoint cp, LineBreakClass c)
133173
{
134174
// LB 1
@@ -160,6 +200,9 @@ private static LineBreakClass MapClass(CodePoint cp, LineBreakClass c)
160200
}
161201
}
162202

203+
/// <summary>
204+
/// Applies the start-of-text normalization required for the first class in the stream.
205+
/// </summary>
163206
private static LineBreakClass MapFirst(LineBreakClass c)
164207
=> c switch
165208
{
@@ -168,33 +211,47 @@ private static LineBreakClass MapFirst(LineBreakClass c)
168211
_ => c,
169212
};
170213

214+
/// <summary>
215+
/// Returns <see langword="true"/> for the classes treated as alphanumeric by the
216+
/// stateful LB30 handling.
217+
/// </summary>
171218
private static bool IsAlphaNumeric(LineBreakClass cls)
172219
=> cls is LineBreakClass.AL
173220
or LineBreakClass.HL
174221
or LineBreakClass.NU;
175222

223+
/// <summary>
224+
/// Reads the next class without advancing the enumerator. This is only used by rules such as
225+
/// LB25 that need one-code-point lookahead to confirm a numeric punctuation sequence.
226+
/// </summary>
176227
private readonly LineBreakClass PeekNextCharClass()
177228
{
178229
CodePoint cp = CodePoint.DecodeFromUtf16At(this.source, this.charPosition);
179230
return MapClass(cp, CodePoint.GetLineBreakClass(cp));
180231
}
181232

182-
// Get the next character class
233+
/// <summary>
234+
/// Consumes the next code point, applies LB1 class resolution, and updates any stateful
235+
/// rule flags that depend on more than the current pair.
236+
/// </summary>
183237
private LineBreakClass NextCharClass()
184238
{
185239
CodePoint cp = CodePoint.DecodeFromUtf16At(this.source, this.charPosition, out int count);
186240
LineBreakClass cls = MapClass(cp, CodePoint.GetLineBreakClass(cp));
187241
this.charPosition += count;
188242
this.position++;
189243

190-
// Keep track of alphanumeric + any combining marks.
191-
// This is used for LB22 and LB30.
244+
// Track an alphanumeric run together with any trailing combining marks. LB30 needs to
245+
// know whether an opening punctuation follows such a run, but once currentClass advances
246+
// we would otherwise lose that earlier context.
192247
if (IsAlphaNumeric(this.currentClass) || (this.alphaNumericCount > 0 && cls == LineBreakClass.CM))
193248
{
194249
this.alphaNumericCount++;
195250
}
196251

197-
// Track combining mark exceptions. LB22
252+
// LB22 distinguishes between "CM after one of the explicitly allowed classes" and
253+
// "CM after anything else" when the next class is IN. Record that now before currentClass
254+
// collapses to CM on the next iteration.
198255
if (cls == LineBreakClass.CM)
199256
{
200257
switch (this.currentClass)
@@ -212,7 +269,8 @@ private LineBreakClass NextCharClass()
212269
}
213270
}
214271

215-
// Track combining mark exceptions. LB31
272+
// LB31 is another context-sensitive rule. We record the contexts that permit a break
273+
// before an opening punctuation and defer the final decision until OP is nextClass.
216274
if (this.first && cls == LineBreakClass.CM)
217275
{
218276
this.lb31 = true;
@@ -257,19 +315,21 @@ private LineBreakClass NextCharClass()
257315
this.lb31 = false;
258316
}
259317

260-
// Rule LB24
318+
// Seed the multi-code-point context used later by LB24.
261319
if (this.first && (cls == LineBreakClass.CL || cls == LineBreakClass.CP))
262320
{
263321
this.lb24ex = true;
264322
}
265323

266-
// Rule LB25
324+
// Seed the multi-code-point context used later by LB25.
267325
if (this.first
268326
&& (cls == LineBreakClass.CL || cls == LineBreakClass.IS || cls == LineBreakClass.SY))
269327
{
270328
this.lb25ex = true;
271329
}
272330

331+
// LB25 spans punctuation, spaces, and surrounding numeric runs. Look ahead one code point
332+
// so punctuation after a space or letter can still participate in the same numeric context.
273333
if (cls is LineBreakClass.SP or LineBreakClass.WJ or LineBreakClass.AL)
274334
{
275335
LineBreakClass next = this.PeekNextCharClass();
@@ -294,9 +354,15 @@ private LineBreakClass NextCharClass()
294354
return cls;
295355
}
296356

357+
/// <summary>
358+
/// Handles the UAX rules for spaces and explicit line terminators before falling back to the
359+
/// pair table for the ordinary pair-based decisions.
360+
/// </summary>
297361
private bool? GetSimpleBreak()
298362
{
299-
// handle classes not handled by the pair table
363+
// These classes are easier to express directly than through the pair table:
364+
// - spaces never break immediately before themselves,
365+
// - hard line terminators update currentClass so the next iteration emits a required break.
300366
switch (this.nextClass)
301367
{
302368
case LineBreakClass.SP:
@@ -316,9 +382,14 @@ private LineBreakClass NextCharClass()
316382
return null;
317383
}
318384

385+
/// <summary>
386+
/// Applies the pair-table result and then layers on the stateful UAX exceptions that require
387+
/// additional context beyond the current pair.
388+
/// </summary>
319389
private bool GetPairTableBreak(LineBreakClass lastClass)
320390
{
321-
// If not handled already, use the pair table
391+
// The pair table is the baseline answer from UAX #14. The rule-specific flags below
392+
// then tighten or loosen that answer where the spec requires extra context.
322393
bool shouldBreak = false;
323394
switch (LineBreakPairTable.Table[(int)this.currentClass][(int)this.nextClass])
324395
{
@@ -387,6 +458,20 @@ private bool GetPairTableBreak(LineBreakClass lastClass)
387458
break;
388459
}
389460

461+
if (lastClass == LineBreakClass.SP
462+
&& this.nextClass is LineBreakClass.AL or LineBreakClass.HL)
463+
{
464+
// Once we have already broken at the punctuation-adjacent space, keep the
465+
// LB25 state alive only if the following letters immediately continue into
466+
// another CL/IS/SY sequence. Otherwise the punctuation context has ended
467+
// and must not leak forward to a later AL|NU pair.
468+
LineBreakClass ahead = this.PeekNextCharClass();
469+
if (ahead is not LineBreakClass.CL and not LineBreakClass.IS and not LineBreakClass.SY)
470+
{
471+
this.lb25ex = false;
472+
}
473+
}
474+
390475
if (this.nextClass is LineBreakClass.PR or LineBreakClass.NU)
391476
{
392477
shouldBreak = true;
@@ -428,7 +513,9 @@ private bool GetPairTableBreak(LineBreakClass lastClass)
428513
break;
429514
}
430515

431-
// Rule LB22
516+
// Apply the remaining non-pair-table rules in the same place every time so the
517+
// interaction between pair-table output and rule-specific overrides stays obvious.
518+
// LB22
432519
if (this.nextClass == LineBreakClass.IN)
433520
{
434521
switch (lastClass)
@@ -459,6 +546,7 @@ private bool GetPairTableBreak(LineBreakClass lastClass)
459546
}
460547
}
461548

549+
// LB8a suppresses a break after ZWJ regardless of the pair-table answer.
462550
if (this.lb8a)
463551
{
464552
shouldBreak = false;
@@ -490,7 +578,9 @@ private bool GetPairTableBreak(LineBreakClass lastClass)
490578
this.lb30a = 0;
491579
}
492580

493-
// Rule LB30b
581+
// LB30b depends on the Extended_Pictographic property, but Mahjong tiles are a special case:
582+
// their Line_Break class is ID even though they live in an Extended_Pictographic-adjacent
583+
// block, so we need an explicit guard here to mirror the Unicode test data.
494584
if (this.nextClass == LineBreakClass.EM && this.lastPosition > 0)
495585
{
496586
// Mahjong Tiles (Unicode block) are extended pictographics but have a class of ID
@@ -508,6 +598,11 @@ private bool GetPairTableBreak(LineBreakClass lastClass)
508598
return shouldBreak;
509599
}
510600

601+
/// <summary>
602+
/// Walks backward from a wrap position to the nearest non-breaking trailing content so that
603+
/// measurement excludes trailing spaces and hard line terminators while wrapping still occurs
604+
/// at the original boundary.
605+
/// </summary>
511606
private readonly int FindPriorNonWhitespace(int from)
512607
{
513608
if (from > 0)

0 commit comments

Comments
 (0)