66namespace SixLabors . Fonts . Unicode ;
77
88/// <summary>
9- /// Supports a simple iteration over a linebreak collection.
10- /// Implementation of the Unicode Line Break Algorithm. UAX:14
11- /// <see href="https://www.unicode.org/reports/tr14/tr14-37.html"/>
9+ /// Enumerates potential line break opportunities for a span of text.
10+ /// This is the engine behind the Unicode Line Breaking Algorithm as defined by
11+ /// Unicode Standard Annex #14 (UAX #14):
12+ /// <see href="https://www.unicode.org/reports/tr14/"/>.
13+ ///
14+ /// The Unicode rules are the source of truth for the logic below. Comments intentionally
15+ /// call out the UAX rule numbers so readers can cross-check the implementation against the
16+ /// specification instead of treating the local state flags as standalone behavior.
17+ ///
18+ /// The implementation walks one code point at a time, keeps a two-code-point window
19+ /// (<see cref="currentClass"/> on the left and <see cref="nextClass"/> on the right), and
20+ /// carries a small amount of extra state for rules that depend on more than the current pair.
1221/// Methods are pattern-matched by compiler to allow using foreach pattern.
1322/// </summary>
1423internal ref struct LineBreakEnumerator
1524{
25+ // Iteration state:
26+ // - charPosition is the UTF-16 offset into the source span.
27+ // - position is the code point index after the most recently consumed code point.
28+ // - lastPosition is the candidate break boundary between currentClass and nextClass.
1629 private readonly ReadOnlySpan < char > source ;
1730 private int charPosition ;
1831 private readonly int pointsLength ;
1932 private int position ;
2033 private int lastPosition ;
34+
35+ // The active pair under consideration. currentClass is the left side of the boundary and
36+ // nextClass is the right side. Most of UAX #14 reduces to deciding whether that pair breaks.
2137 private LineBreakClass currentClass ;
2238 private LineBreakClass nextClass ;
2339 private bool first ;
40+
41+ // Tracks whether we are inside an AL/HL/NU run, including trailing combining marks.
42+ // This is used by LB30, which needs more context than the pair table alone can express.
2443 private int alphaNumericCount ;
44+
45+ // Stateful rule flags for the UAX #14 rules that cannot be represented by a simple
46+ // currentClass/nextClass lookup. The field names intentionally mirror the rule numbers
47+ // so the implementation can be read side by side with the spec.
2548 private bool lb8a ;
2649 private bool lb21a ;
2750 private bool lb22ex ;
@@ -31,6 +54,10 @@ internal ref struct LineBreakEnumerator
3154 private int lb30a ;
3255 private bool lb31 ;
3356
57+ /// <summary>
58+ /// Initializes a new instance of the <see cref="LineBreakEnumerator"/> struct.
59+ /// </summary>
60+ /// <param name="source">The source text to inspect for UAX #14 break opportunities.</param>
3461 public LineBreakEnumerator ( ReadOnlySpan < char > source )
3562 : this ( )
3663 {
@@ -53,6 +80,9 @@ public LineBreakEnumerator(ReadOnlySpan<char> source)
5380 this . lb30a = 0 ;
5481 }
5582
83+ /// <summary>
84+ /// Gets the most recently discovered line break opportunity.
85+ /// </summary>
5686 public LineBreak Current { get ; private set ; }
5787
5888 /// <summary>
@@ -70,7 +100,8 @@ public LineBreakEnumerator(ReadOnlySpan<char> source)
70100 /// </returns>
71101 public bool MoveNext ( )
72102 {
73- // Get the first char if we're at the beginning of the string.
103+ // Prime the left side of the pair window. After this block the loop below always
104+ // decides the boundary between currentClass (left) and nextClass (right).
74105 if ( this . first )
75106 {
76107 LineBreakClass firstClass = this . NextCharClass ( ) ;
@@ -87,7 +118,8 @@ public bool MoveNext()
87118 LineBreakClass lastClass = this . nextClass ;
88119 this . nextClass = this . NextCharClass ( ) ;
89120
90- // Explicit newline
121+ // Required breaks from BK/CR/LF are emitted before any pair-table logic.
122+ // This matches the UAX handling for explicit line terminators.
91123 switch ( this . currentClass )
92124 {
93125 case LineBreakClass . BK :
@@ -97,9 +129,12 @@ public bool MoveNext()
97129 return true ;
98130 }
99131
132+ // Handle the classes that have bespoke UAX behavior first, then fall back to the
133+ // pair table plus the stateful exceptions tracked on this enumerator.
100134 bool ? shouldBreak = this . GetSimpleBreak ( ) ?? ( bool ? ) this . GetPairTableBreak ( lastClass ) ;
101135
102- // Rule LB8a
136+ // LB8a suppresses breaks after a ZWJ. We record it after processing the current
137+ // boundary so it applies to the next boundary instead of the one we just decided.
103138 this . lb8a = this . nextClass == LineBreakClass . ZWJ ;
104139
105140 if ( shouldBreak . Value )
@@ -121,6 +156,8 @@ public bool MoveNext()
121156 break ;
122157 }
123158
159+ // UAX also exposes an end-of-text boundary. Callers use that final boundary to
160+ // finalize the trailing line even when no earlier break opportunity was taken.
124161 this . Current = new LineBreak ( this . FindPriorNonWhitespace ( this . pointsLength ) , this . lastPosition , required ) ;
125162 return true ;
126163 }
@@ -129,6 +166,9 @@ public bool MoveNext()
129166 return false ;
130167 }
131168
169+ /// <summary>
170+ /// Applies the LB1 class remapping required before any pair-table decisions are made.
171+ /// </summary>
132172 private static LineBreakClass MapClass ( CodePoint cp , LineBreakClass c )
133173 {
134174 // LB 1
@@ -160,6 +200,9 @@ private static LineBreakClass MapClass(CodePoint cp, LineBreakClass c)
160200 }
161201 }
162202
203+ /// <summary>
204+ /// Applies the start-of-text normalization required for the first class in the stream.
205+ /// </summary>
163206 private static LineBreakClass MapFirst ( LineBreakClass c )
164207 => c switch
165208 {
@@ -168,33 +211,47 @@ private static LineBreakClass MapFirst(LineBreakClass c)
168211 _ => c ,
169212 } ;
170213
214+ /// <summary>
215+ /// Returns <see langword="true"/> for the classes treated as alphanumeric by the
216+ /// stateful LB30 handling.
217+ /// </summary>
171218 private static bool IsAlphaNumeric ( LineBreakClass cls )
172219 => cls is LineBreakClass . AL
173220 or LineBreakClass . HL
174221 or LineBreakClass . NU ;
175222
223+ /// <summary>
224+ /// Reads the next class without advancing the enumerator. This is only used by rules such as
225+ /// LB25 that need one-code-point lookahead to confirm a numeric punctuation sequence.
226+ /// </summary>
176227 private readonly LineBreakClass PeekNextCharClass ( )
177228 {
178229 CodePoint cp = CodePoint . DecodeFromUtf16At ( this . source , this . charPosition ) ;
179230 return MapClass ( cp , CodePoint . GetLineBreakClass ( cp ) ) ;
180231 }
181232
182- // Get the next character class
233+ /// <summary>
234+ /// Consumes the next code point, applies LB1 class resolution, and updates any stateful
235+ /// rule flags that depend on more than the current pair.
236+ /// </summary>
183237 private LineBreakClass NextCharClass ( )
184238 {
185239 CodePoint cp = CodePoint . DecodeFromUtf16At ( this . source , this . charPosition , out int count ) ;
186240 LineBreakClass cls = MapClass ( cp , CodePoint . GetLineBreakClass ( cp ) ) ;
187241 this . charPosition += count ;
188242 this . position ++ ;
189243
190- // Keep track of alphanumeric + any combining marks.
191- // This is used for LB22 and LB30.
244+ // Track an alphanumeric run together with any trailing combining marks. LB30 needs to
245+ // know whether an opening punctuation follows such a run, but once currentClass advances
246+ // we would otherwise lose that earlier context.
192247 if ( IsAlphaNumeric ( this . currentClass ) || ( this . alphaNumericCount > 0 && cls == LineBreakClass . CM ) )
193248 {
194249 this . alphaNumericCount ++ ;
195250 }
196251
197- // Track combining mark exceptions. LB22
252+ // LB22 distinguishes between "CM after one of the explicitly allowed classes" and
253+ // "CM after anything else" when the next class is IN. Record that now before currentClass
254+ // collapses to CM on the next iteration.
198255 if ( cls == LineBreakClass . CM )
199256 {
200257 switch ( this . currentClass )
@@ -212,7 +269,8 @@ private LineBreakClass NextCharClass()
212269 }
213270 }
214271
215- // Track combining mark exceptions. LB31
272+ // LB31 is another context-sensitive rule. We record the contexts that permit a break
273+ // before an opening punctuation and defer the final decision until OP is nextClass.
216274 if ( this . first && cls == LineBreakClass . CM )
217275 {
218276 this . lb31 = true ;
@@ -257,19 +315,21 @@ private LineBreakClass NextCharClass()
257315 this . lb31 = false ;
258316 }
259317
260- // Rule LB24
318+ // Seed the multi-code-point context used later by LB24.
261319 if ( this . first && ( cls == LineBreakClass . CL || cls == LineBreakClass . CP ) )
262320 {
263321 this . lb24ex = true ;
264322 }
265323
266- // Rule LB25
324+ // Seed the multi-code-point context used later by LB25.
267325 if ( this . first
268326 && ( cls == LineBreakClass . CL || cls == LineBreakClass . IS || cls == LineBreakClass . SY ) )
269327 {
270328 this . lb25ex = true ;
271329 }
272330
331+ // LB25 spans punctuation, spaces, and surrounding numeric runs. Look ahead one code point
332+ // so punctuation after a space or letter can still participate in the same numeric context.
273333 if ( cls is LineBreakClass . SP or LineBreakClass . WJ or LineBreakClass . AL )
274334 {
275335 LineBreakClass next = this . PeekNextCharClass ( ) ;
@@ -294,9 +354,15 @@ private LineBreakClass NextCharClass()
294354 return cls ;
295355 }
296356
357+ /// <summary>
358+ /// Handles the UAX rules for spaces and explicit line terminators before falling back to the
359+ /// pair table for the ordinary pair-based decisions.
360+ /// </summary>
297361 private bool ? GetSimpleBreak ( )
298362 {
299- // handle classes not handled by the pair table
363+ // These classes are easier to express directly than through the pair table:
364+ // - spaces never break immediately before themselves,
365+ // - hard line terminators update currentClass so the next iteration emits a required break.
300366 switch ( this . nextClass )
301367 {
302368 case LineBreakClass . SP :
@@ -316,9 +382,14 @@ private LineBreakClass NextCharClass()
316382 return null ;
317383 }
318384
385+ /// <summary>
386+ /// Applies the pair-table result and then layers on the stateful UAX exceptions that require
387+ /// additional context beyond the current pair.
388+ /// </summary>
319389 private bool GetPairTableBreak ( LineBreakClass lastClass )
320390 {
321- // If not handled already, use the pair table
391+ // The pair table is the baseline answer from UAX #14. The rule-specific flags below
392+ // then tighten or loosen that answer where the spec requires extra context.
322393 bool shouldBreak = false ;
323394 switch ( LineBreakPairTable . Table [ ( int ) this . currentClass ] [ ( int ) this . nextClass ] )
324395 {
@@ -387,6 +458,20 @@ private bool GetPairTableBreak(LineBreakClass lastClass)
387458 break ;
388459 }
389460
461+ if ( lastClass == LineBreakClass . SP
462+ && this . nextClass is LineBreakClass . AL or LineBreakClass . HL )
463+ {
464+ // Once we have already broken at the punctuation-adjacent space, keep the
465+ // LB25 state alive only if the following letters immediately continue into
466+ // another CL/IS/SY sequence. Otherwise the punctuation context has ended
467+ // and must not leak forward to a later AL|NU pair.
468+ LineBreakClass ahead = this . PeekNextCharClass ( ) ;
469+ if ( ahead is not LineBreakClass . CL and not LineBreakClass . IS and not LineBreakClass . SY )
470+ {
471+ this . lb25ex = false ;
472+ }
473+ }
474+
390475 if ( this . nextClass is LineBreakClass . PR or LineBreakClass . NU )
391476 {
392477 shouldBreak = true ;
@@ -428,7 +513,9 @@ private bool GetPairTableBreak(LineBreakClass lastClass)
428513 break ;
429514 }
430515
431- // Rule LB22
516+ // Apply the remaining non-pair-table rules in the same place every time so the
517+ // interaction between pair-table output and rule-specific overrides stays obvious.
518+ // LB22
432519 if ( this . nextClass == LineBreakClass . IN )
433520 {
434521 switch ( lastClass )
@@ -459,6 +546,7 @@ private bool GetPairTableBreak(LineBreakClass lastClass)
459546 }
460547 }
461548
549+ // LB8a suppresses a break after ZWJ regardless of the pair-table answer.
462550 if ( this . lb8a )
463551 {
464552 shouldBreak = false ;
@@ -490,7 +578,9 @@ private bool GetPairTableBreak(LineBreakClass lastClass)
490578 this . lb30a = 0 ;
491579 }
492580
493- // Rule LB30b
581+ // LB30b depends on the Extended_Pictographic property, but Mahjong tiles are a special case:
582+ // their Line_Break class is ID even though they live in an Extended_Pictographic-adjacent
583+ // block, so we need an explicit guard here to mirror the Unicode test data.
494584 if ( this . nextClass == LineBreakClass . EM && this . lastPosition > 0 )
495585 {
496586 // Mahjong Tiles (Unicode block) are extended pictographics but have a class of ID
@@ -508,6 +598,11 @@ private bool GetPairTableBreak(LineBreakClass lastClass)
508598 return shouldBreak ;
509599 }
510600
601+ /// <summary>
602+ /// Walks backward from a wrap position to the nearest non-breaking trailing content so that
603+ /// measurement excludes trailing spaces and hard line terminators while wrapping still occurs
604+ /// at the original boundary.
605+ /// </summary>
511606 private readonly int FindPriorNonWhitespace ( int from )
512607 {
513608 if ( from > 0 )
0 commit comments