diff --git a/diffmatchpatch/patch.go b/diffmatchpatch/patch.go index 0dbe3bd..651b290 100644 --- a/diffmatchpatch/patch.go +++ b/diffmatchpatch/patch.go @@ -16,8 +16,34 @@ import ( "regexp" "strconv" "strings" + "unicode/utf8" ) +// alignRuneStart backs up byte index idx to the nearest UTF-8 rune start byte. +// idx <= 0 or idx >= len(s) is returned unchanged. +// +// Motivation: Google's reference JS diff-match-patch uses string.substring +// (UTF-16 code unit) for all slicing — surrogate-pair-middle splits are +// well-defined. This Go port slices by UTF-8 byte index (text[a:b]). When +// MatchMain's Bitap returns a byte position that lands inside a multi-byte +// character (CJK 3B / em dash 3B / emoji 4B), the resulting slice is invalid +// UTF-8 — subsequent MatchMain calls miss anchors, and downstream byte slices +// can panic with "slice bounds out of range" (issue #132). +// +// Fix: align every byte slice index back to the nearest valid rune boundary +// before slicing. The operation costs at most 1-3 bytes of "shrinkage" per +// slice point but guarantees the resulting string is always valid UTF-8 and +// no MatchMain / DiffMain pass sees a continuation-byte-prefixed string. +func alignRuneStart(s string, idx int) int { + if idx <= 0 || idx >= len(s) { + return idx + } + for idx > 0 && !utf8.RuneStart(s[idx]) { + idx-- + } + return idx +} + // Patch represents one patch operation. type Patch struct { diffs []Diff @@ -91,12 +117,18 @@ func (dmp *DiffMatchPatch) PatchAddContext(patch Patch, text string) Patch { padding += dmp.PatchMargin // Add the prefix. - prefix := text[max(0, patch.Start2-padding):patch.Start2] + // rune-safety: align prefix start to nearest rune boundary so the slice + // never produces an invalid-UTF-8 string when the surrounding text contains + // multi-byte characters. + prefixStart := alignRuneStart(text, max(0, patch.Start2-padding)) + prefix := text[prefixStart:patch.Start2] if len(prefix) != 0 { patch.diffs = append([]Diff{Diff{DiffEqual, prefix}}, patch.diffs...) } // Add the suffix. - suffix := text[patch.Start2+patch.Length1 : min(len(text), patch.Start2+patch.Length1+padding)] + // rune-safety: align suffix end to nearest rune boundary. + suffixEnd := alignRuneStart(text, min(len(text), patch.Start2+patch.Length1+padding)) + suffix := text[patch.Start2+patch.Length1 : suffixEnd] if len(suffix) != 0 { patch.diffs = append(patch.diffs, Diff{DiffEqual, suffix}) } @@ -264,6 +296,13 @@ func (dmp *DiffMatchPatch) PatchApply(patches []Patch, text string) (string, []b } else { startLoc = dmp.MatchMain(text, text1, expectedLoc) } + // rune-safety: Bitap-returned byte position may land in the middle of a + // multi-byte character. Back it up to the nearest rune start so the + // subsequent text[startLoc:...] / text[:startLoc] slices never produce + // invalid UTF-8 (root cause of issue #132 panic). + if startLoc != -1 { + startLoc = alignRuneStart(text, startLoc) + } if startLoc == -1 { // No match found. :( results[x] = false @@ -274,14 +313,18 @@ func (dmp *DiffMatchPatch) PatchApply(patches []Patch, text string) (string, []b results[x] = true delta = startLoc - expectedLoc var text2 string + var text2End int if endLoc == -1 { - text2 = text[startLoc:int(math.Min(float64(startLoc+len(text1)), float64(len(text))))] + text2End = alignRuneStart(text, int(math.Min(float64(startLoc+len(text1)), float64(len(text))))) } else { - text2 = text[startLoc:int(math.Min(float64(endLoc+dmp.MatchMaxBits), float64(len(text))))] + endLoc = alignRuneStart(text, endLoc) + text2End = alignRuneStart(text, int(math.Min(float64(endLoc+dmp.MatchMaxBits), float64(len(text))))) } + text2 = text[startLoc:text2End] if text1 == text2 { // Perfect match, just shove the Replacement text in. - text = text[:startLoc] + dmp.DiffText2(aPatch.diffs) + text[startLoc+len(text1):] + replaceEnd := alignRuneStart(text, startLoc+len(text1)) + text = text[:startLoc] + dmp.DiffText2(aPatch.diffs) + text[replaceEnd:] } else { // Imperfect match. Run a diff to get a framework of equivalent indices. diffs := dmp.DiffMain(text1, text2, false) @@ -296,12 +339,16 @@ func (dmp *DiffMatchPatch) PatchApply(patches []Patch, text string) (string, []b index2 := dmp.DiffXIndex(diffs, index1) if aDiff.Type == DiffInsert { // Insertion - text = text[:startLoc+index2] + aDiff.Text + text[startLoc+index2:] + insertAt := alignRuneStart(text, startLoc+index2) + text = text[:insertAt] + aDiff.Text + text[insertAt:] } else if aDiff.Type == DiffDelete { // Deletion - startIndex := startLoc + index2 - text = text[:startIndex] + - text[startIndex+dmp.DiffXIndex(diffs, index1+len(aDiff.Text))-index2:] + startIndex := alignRuneStart(text, startLoc+index2) + endIndex := alignRuneStart(text, startLoc+dmp.DiffXIndex(diffs, index1+len(aDiff.Text))) + if endIndex < startIndex { + endIndex = startIndex + } + text = text[:startIndex] + text[endIndex:] } } if aDiff.Type != DiffDelete { @@ -416,7 +463,19 @@ func (dmp *DiffMatchPatch) PatchSplitMax(patches []Patch) []Patch { bigpatch.diffs = bigpatch.diffs[1:] } else { // Deletion or equality. Only take as much as we can stomach. - diffText = diffText[:min(len(diffText), patchSize-patch.Length1-dmp.PatchMargin)] + // rune-safety: align cutAt to nearest rune boundary so we never + // slice diffText through a multi-byte character (the resulting + // invalid UTF-8 is the root cause of issue #132 panic). + cutAt := min(len(diffText), patchSize-patch.Length1-dmp.PatchMargin) + cutAt = alignRuneStart(diffText, cutAt) + // Loop-progress guard: when alignment collapses cutAt to 0 but + // diffText is non-empty, advance by exactly one rune so the + // outer for-loop is guaranteed to consume input each iteration. + if cutAt == 0 && len(diffText) > 0 { + _, size := utf8.DecodeRuneInString(diffText) + cutAt = size + } + diffText = diffText[:cutAt] patch.Length1 += len(diffText) Start1 += len(diffText) @@ -430,21 +489,26 @@ func (dmp *DiffMatchPatch) PatchSplitMax(patches []Patch) []Patch { if diffText == bigpatch.diffs[0].Text { bigpatch.diffs = bigpatch.diffs[1:] } else { - bigpatch.diffs[0].Text = - bigpatch.diffs[0].Text[len(diffText):] + // rune-safety: len(diffText) was already rune-aligned above, + // but align once more defensively for cutAt == 0 etc. + remStart := alignRuneStart(bigpatch.diffs[0].Text, len(diffText)) + bigpatch.diffs[0].Text = bigpatch.diffs[0].Text[remStart:] } } } // Compute the head context for the next patch. precontext = dmp.DiffText2(patch.diffs) - precontext = precontext[max(0, len(precontext)-dmp.PatchMargin):] + // rune-safety: align precontext start to nearest rune boundary. + precontext = precontext[alignRuneStart(precontext, max(0, len(precontext)-dmp.PatchMargin)):] postcontext := "" // Append the end context for this patch. - if len(dmp.DiffText1(bigpatch.diffs)) > dmp.PatchMargin { - postcontext = dmp.DiffText1(bigpatch.diffs)[:dmp.PatchMargin] + rawPost := dmp.DiffText1(bigpatch.diffs) + if len(rawPost) > dmp.PatchMargin { + // rune-safety: align before slicing to avoid mid-rune cut. + postcontext = rawPost[:alignRuneStart(rawPost, dmp.PatchMargin)] } else { - postcontext = dmp.DiffText1(bigpatch.diffs) + postcontext = rawPost } if len(postcontext) != 0 { diff --git a/diffmatchpatch/patch_rune_safety_test.go b/diffmatchpatch/patch_rune_safety_test.go new file mode 100644 index 0000000..11dbb4d --- /dev/null +++ b/diffmatchpatch/patch_rune_safety_test.go @@ -0,0 +1,134 @@ +package diffmatchpatch + +import ( + "strings" + "testing" +) + +// alignRuneStart should back any byte index inside a multi-byte UTF-8 +// character back to that character's first byte. +func TestAlignRuneStart(t *testing.T) { + // "abc中文def" — UTF-8: a=1 b=1 c=1 中=3(byte 3..5) 文=3(6..8) d=1 e=1 f=1 + s := "abc中文def" + cases := []struct { + in, want int + desc string + }{ + {0, 0, "0 unchanged"}, + {1, 1, "after 'a' already aligned"}, + {3, 3, "start of 中 aligned"}, + {4, 3, "middle of 中 backs to start"}, + {5, 3, "end-middle of 中 backs to start"}, + {6, 6, "start of 文 aligned"}, + {7, 6, "middle of 文 backs to start"}, + {9, 9, "after 文 = start of d, aligned"}, + {12, 12, "end-of-string"}, + {-1, -1, "negative unchanged"}, + {999, 999, "past end unchanged"}, + } + for _, c := range cases { + got := alignRuneStart(s, c.in) + if got != c.want { + t.Errorf("alignRuneStart(%q, %d) = %d, want %d (%s)", s, c.in, got, c.want, c.desc) + } + } +} + +// Issue #132 minimal repro: applying a patch containing a multi-byte +// character with a short anchor used to panic with "slice bounds out of range". +// After the rune-safe alignment, PatchApply should return cleanly (apply may +// fail with results[i]=false, but must never panic). +func TestPatchApply_Issue132_NoPanic(t *testing.T) { + dmp := New() + patches, err := dmp.PatchFromText("@@ -1,2 +1,3 @@\n %E2%98%9E \n+r\n") + if err != nil { + t.Fatalf("PatchFromText: %v", err) + } + defer func() { + if r := recover(); r != nil { + t.Fatalf("panic leaked: %v", r) + } + }() + _, results := dmp.PatchApply(patches, "☞ 𝗢𝗥𝗗𝗘𝗥 ") + if results == nil { + t.Fatalf("results nil") + } +} + +// PatchSplitMax on a CJK-heavy base must produce diff.Text slices that are +// all valid UTF-8 (prior to this fix, byte-level slicing of multi-byte +// characters could leave continuation-byte prefixes in the output). +func TestPatchSplitMax_NoInvalidUTF8(t *testing.T) { + dmp := New() + // CJK base larger than MatchMaxBits (32) forces PatchSplitMax to engage. + old := "中文段落一" + strings.Repeat("汉字", 30) + "结束" + newText := "中文段落一" + strings.Repeat("汉字", 30) + "结尾" + patches := dmp.PatchMake(old, dmp.DiffMain(old, newText, false)) + if len(patches) == 0 { + t.Skip("no patches generated") + } + // PatchMake itself must produce valid-UTF-8 diff text (PatchAddContext's + // prefix/suffix slice now aligns to rune boundaries). + for i, p := range patches { + for j, d := range p.diffs { + if !isValidUTF8(d.Text) { + t.Errorf("PatchMake patch %d diff %d invalid UTF-8: %q", i, j, d.Text) + } + } + } + defer func() { + if r := recover(); r != nil { + t.Fatalf("PatchSplitMax panic: %v", r) + } + }() + // PatchSplitMax must preserve valid UTF-8 (precontext / postcontext / + // diffText cut points all rune-aligned). + for i, p := range dmp.PatchSplitMax(patches) { + for j, d := range p.diffs { + if !isValidUTF8(d.Text) { + t.Errorf("PatchSplitMax patch %d diff %d invalid UTF-8: %q type=%d", i, j, d.Text, d.Type) + } + } + } +} + +// minimal UTF-8 validity check inlined to avoid adding a stdlib import vs. +// the existing import block in patch.go (utf8 is already there for the fix). +func isValidUTF8(s string) bool { + for i := 0; i < len(s); { + r, sz := utf8DecodeFirst(s[i:]) + if r == 0xFFFD && sz == 1 { + return false + } + i += sz + } + return true +} + +func utf8DecodeFirst(s string) (rune, int) { + if len(s) == 0 { + return 0, 0 + } + b := s[0] + switch { + case b < 0x80: + return rune(b), 1 + case b < 0xc0: + return 0xFFFD, 1 + case b < 0xe0: + if len(s) < 2 { + return 0xFFFD, 1 + } + return rune(b&0x1f)<<6 | rune(s[1]&0x3f), 2 + case b < 0xf0: + if len(s) < 3 { + return 0xFFFD, 1 + } + return rune(b&0x0f)<<12 | rune(s[1]&0x3f)<<6 | rune(s[2]&0x3f), 3 + default: + if len(s) < 4 { + return 0xFFFD, 1 + } + return rune(b&0x07)<<18 | rune(s[1]&0x3f)<<12 | rune(s[2]&0x3f)<<6 | rune(s[3]&0x3f), 4 + } +}