Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 80 additions & 16 deletions diffmatchpatch/patch.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,34 @@ import (
"regexp"
"strconv"
"strings"
"unicode/utf8"
)

// alignRuneStart backs up byte index idx to the nearest UTF-8 rune start byte.
// idx <= 0 or idx >= len(s) is returned unchanged.
//
// Motivation: Google's reference JS diff-match-patch uses string.substring
// (UTF-16 code unit) for all slicing — surrogate-pair-middle splits are
// well-defined. This Go port slices by UTF-8 byte index (text[a:b]). When
// MatchMain's Bitap returns a byte position that lands inside a multi-byte
// character (CJK 3B / em dash 3B / emoji 4B), the resulting slice is invalid
// UTF-8 — subsequent MatchMain calls miss anchors, and downstream byte slices
// can panic with "slice bounds out of range" (issue #132).
//
// Fix: align every byte slice index back to the nearest valid rune boundary
// before slicing. The operation costs at most 1-3 bytes of "shrinkage" per
// slice point but guarantees the resulting string is always valid UTF-8 and
// no MatchMain / DiffMain pass sees a continuation-byte-prefixed string.
func alignRuneStart(s string, idx int) int {
if idx <= 0 || idx >= len(s) {
return idx
}
for idx > 0 && !utf8.RuneStart(s[idx]) {
idx--
}
return idx
}

// Patch represents one patch operation.
type Patch struct {
diffs []Diff
Expand Down Expand Up @@ -91,12 +117,18 @@ func (dmp *DiffMatchPatch) PatchAddContext(patch Patch, text string) Patch {
padding += dmp.PatchMargin

// Add the prefix.
prefix := text[max(0, patch.Start2-padding):patch.Start2]
// rune-safety: align prefix start to nearest rune boundary so the slice
// never produces an invalid-UTF-8 string when the surrounding text contains
// multi-byte characters.
prefixStart := alignRuneStart(text, max(0, patch.Start2-padding))
prefix := text[prefixStart:patch.Start2]
if len(prefix) != 0 {
patch.diffs = append([]Diff{Diff{DiffEqual, prefix}}, patch.diffs...)
}
// Add the suffix.
suffix := text[patch.Start2+patch.Length1 : min(len(text), patch.Start2+patch.Length1+padding)]
// rune-safety: align suffix end to nearest rune boundary.
suffixEnd := alignRuneStart(text, min(len(text), patch.Start2+patch.Length1+padding))
suffix := text[patch.Start2+patch.Length1 : suffixEnd]
if len(suffix) != 0 {
patch.diffs = append(patch.diffs, Diff{DiffEqual, suffix})
}
Expand Down Expand Up @@ -264,6 +296,13 @@ func (dmp *DiffMatchPatch) PatchApply(patches []Patch, text string) (string, []b
} else {
startLoc = dmp.MatchMain(text, text1, expectedLoc)
}
// rune-safety: Bitap-returned byte position may land in the middle of a
// multi-byte character. Back it up to the nearest rune start so the
// subsequent text[startLoc:...] / text[:startLoc] slices never produce
// invalid UTF-8 (root cause of issue #132 panic).
if startLoc != -1 {
startLoc = alignRuneStart(text, startLoc)
}
if startLoc == -1 {
// No match found. :(
results[x] = false
Expand All @@ -274,14 +313,18 @@ func (dmp *DiffMatchPatch) PatchApply(patches []Patch, text string) (string, []b
results[x] = true
delta = startLoc - expectedLoc
var text2 string
var text2End int
if endLoc == -1 {
text2 = text[startLoc:int(math.Min(float64(startLoc+len(text1)), float64(len(text))))]
text2End = alignRuneStart(text, int(math.Min(float64(startLoc+len(text1)), float64(len(text)))))
} else {
text2 = text[startLoc:int(math.Min(float64(endLoc+dmp.MatchMaxBits), float64(len(text))))]
endLoc = alignRuneStart(text, endLoc)
text2End = alignRuneStart(text, int(math.Min(float64(endLoc+dmp.MatchMaxBits), float64(len(text)))))
}
text2 = text[startLoc:text2End]
if text1 == text2 {
// Perfect match, just shove the Replacement text in.
text = text[:startLoc] + dmp.DiffText2(aPatch.diffs) + text[startLoc+len(text1):]
replaceEnd := alignRuneStart(text, startLoc+len(text1))
text = text[:startLoc] + dmp.DiffText2(aPatch.diffs) + text[replaceEnd:]
} else {
// Imperfect match. Run a diff to get a framework of equivalent indices.
diffs := dmp.DiffMain(text1, text2, false)
Expand All @@ -296,12 +339,16 @@ func (dmp *DiffMatchPatch) PatchApply(patches []Patch, text string) (string, []b
index2 := dmp.DiffXIndex(diffs, index1)
if aDiff.Type == DiffInsert {
// Insertion
text = text[:startLoc+index2] + aDiff.Text + text[startLoc+index2:]
insertAt := alignRuneStart(text, startLoc+index2)
text = text[:insertAt] + aDiff.Text + text[insertAt:]
} else if aDiff.Type == DiffDelete {
// Deletion
startIndex := startLoc + index2
text = text[:startIndex] +
text[startIndex+dmp.DiffXIndex(diffs, index1+len(aDiff.Text))-index2:]
startIndex := alignRuneStart(text, startLoc+index2)
endIndex := alignRuneStart(text, startLoc+dmp.DiffXIndex(diffs, index1+len(aDiff.Text)))
if endIndex < startIndex {
endIndex = startIndex
}
text = text[:startIndex] + text[endIndex:]
}
}
if aDiff.Type != DiffDelete {
Expand Down Expand Up @@ -416,7 +463,19 @@ func (dmp *DiffMatchPatch) PatchSplitMax(patches []Patch) []Patch {
bigpatch.diffs = bigpatch.diffs[1:]
} else {
// Deletion or equality. Only take as much as we can stomach.
diffText = diffText[:min(len(diffText), patchSize-patch.Length1-dmp.PatchMargin)]
// rune-safety: align cutAt to nearest rune boundary so we never
// slice diffText through a multi-byte character (the resulting
// invalid UTF-8 is the root cause of issue #132 panic).
cutAt := min(len(diffText), patchSize-patch.Length1-dmp.PatchMargin)
cutAt = alignRuneStart(diffText, cutAt)
// Loop-progress guard: when alignment collapses cutAt to 0 but
// diffText is non-empty, advance by exactly one rune so the
// outer for-loop is guaranteed to consume input each iteration.
if cutAt == 0 && len(diffText) > 0 {
_, size := utf8.DecodeRuneInString(diffText)
cutAt = size
}
diffText = diffText[:cutAt]

patch.Length1 += len(diffText)
Start1 += len(diffText)
Expand All @@ -430,21 +489,26 @@ func (dmp *DiffMatchPatch) PatchSplitMax(patches []Patch) []Patch {
if diffText == bigpatch.diffs[0].Text {
bigpatch.diffs = bigpatch.diffs[1:]
} else {
bigpatch.diffs[0].Text =
bigpatch.diffs[0].Text[len(diffText):]
// rune-safety: len(diffText) was already rune-aligned above,
// but align once more defensively for cutAt == 0 etc.
remStart := alignRuneStart(bigpatch.diffs[0].Text, len(diffText))
bigpatch.diffs[0].Text = bigpatch.diffs[0].Text[remStart:]
}
}
}
// Compute the head context for the next patch.
precontext = dmp.DiffText2(patch.diffs)
precontext = precontext[max(0, len(precontext)-dmp.PatchMargin):]
// rune-safety: align precontext start to nearest rune boundary.
precontext = precontext[alignRuneStart(precontext, max(0, len(precontext)-dmp.PatchMargin)):]

postcontext := ""
// Append the end context for this patch.
if len(dmp.DiffText1(bigpatch.diffs)) > dmp.PatchMargin {
postcontext = dmp.DiffText1(bigpatch.diffs)[:dmp.PatchMargin]
rawPost := dmp.DiffText1(bigpatch.diffs)
if len(rawPost) > dmp.PatchMargin {
// rune-safety: align before slicing to avoid mid-rune cut.
postcontext = rawPost[:alignRuneStart(rawPost, dmp.PatchMargin)]
} else {
postcontext = dmp.DiffText1(bigpatch.diffs)
postcontext = rawPost
}

if len(postcontext) != 0 {
Expand Down
134 changes: 134 additions & 0 deletions diffmatchpatch/patch_rune_safety_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
package diffmatchpatch

import (
"strings"
"testing"
)

// alignRuneStart should back any byte index inside a multi-byte UTF-8
// character back to that character's first byte.
func TestAlignRuneStart(t *testing.T) {
// "abc中文def" — UTF-8: a=1 b=1 c=1 中=3(byte 3..5) 文=3(6..8) d=1 e=1 f=1
s := "abc中文def"
cases := []struct {
in, want int
desc string
}{
{0, 0, "0 unchanged"},
{1, 1, "after 'a' already aligned"},
{3, 3, "start of 中 aligned"},
{4, 3, "middle of 中 backs to start"},
{5, 3, "end-middle of 中 backs to start"},
{6, 6, "start of 文 aligned"},
{7, 6, "middle of 文 backs to start"},
{9, 9, "after 文 = start of d, aligned"},
{12, 12, "end-of-string"},
{-1, -1, "negative unchanged"},
{999, 999, "past end unchanged"},
}
for _, c := range cases {
got := alignRuneStart(s, c.in)
if got != c.want {
t.Errorf("alignRuneStart(%q, %d) = %d, want %d (%s)", s, c.in, got, c.want, c.desc)
}
}
}

// Issue #132 minimal repro: applying a patch containing a multi-byte
// character with a short anchor used to panic with "slice bounds out of range".
// After the rune-safe alignment, PatchApply should return cleanly (apply may
// fail with results[i]=false, but must never panic).
func TestPatchApply_Issue132_NoPanic(t *testing.T) {
dmp := New()
patches, err := dmp.PatchFromText("@@ -1,2 +1,3 @@\n %E2%98%9E \n+r\n")
if err != nil {
t.Fatalf("PatchFromText: %v", err)
}
defer func() {
if r := recover(); r != nil {
t.Fatalf("panic leaked: %v", r)
}
}()
_, results := dmp.PatchApply(patches, "☞ 𝗢𝗥𝗗𝗘𝗥 ")
if results == nil {
t.Fatalf("results nil")
}
}

// PatchSplitMax on a CJK-heavy base must produce diff.Text slices that are
// all valid UTF-8 (prior to this fix, byte-level slicing of multi-byte
// characters could leave continuation-byte prefixes in the output).
func TestPatchSplitMax_NoInvalidUTF8(t *testing.T) {
dmp := New()
// CJK base larger than MatchMaxBits (32) forces PatchSplitMax to engage.
old := "中文段落一" + strings.Repeat("汉字", 30) + "结束"
newText := "中文段落一" + strings.Repeat("汉字", 30) + "结尾"
patches := dmp.PatchMake(old, dmp.DiffMain(old, newText, false))
if len(patches) == 0 {
t.Skip("no patches generated")
}
// PatchMake itself must produce valid-UTF-8 diff text (PatchAddContext's
// prefix/suffix slice now aligns to rune boundaries).
for i, p := range patches {
for j, d := range p.diffs {
if !isValidUTF8(d.Text) {
t.Errorf("PatchMake patch %d diff %d invalid UTF-8: %q", i, j, d.Text)
}
}
}
defer func() {
if r := recover(); r != nil {
t.Fatalf("PatchSplitMax panic: %v", r)
}
}()
// PatchSplitMax must preserve valid UTF-8 (precontext / postcontext /
// diffText cut points all rune-aligned).
for i, p := range dmp.PatchSplitMax(patches) {
for j, d := range p.diffs {
if !isValidUTF8(d.Text) {
t.Errorf("PatchSplitMax patch %d diff %d invalid UTF-8: %q type=%d", i, j, d.Text, d.Type)
}
}
}
}

// minimal UTF-8 validity check inlined to avoid adding a stdlib import vs.
// the existing import block in patch.go (utf8 is already there for the fix).
func isValidUTF8(s string) bool {
for i := 0; i < len(s); {
r, sz := utf8DecodeFirst(s[i:])
if r == 0xFFFD && sz == 1 {
return false
}
i += sz
}
return true
}

func utf8DecodeFirst(s string) (rune, int) {
if len(s) == 0 {
return 0, 0
}
b := s[0]
switch {
case b < 0x80:
return rune(b), 1
case b < 0xc0:
return 0xFFFD, 1
case b < 0xe0:
if len(s) < 2 {
return 0xFFFD, 1
}
return rune(b&0x1f)<<6 | rune(s[1]&0x3f), 2
case b < 0xf0:
if len(s) < 3 {
return 0xFFFD, 1
}
return rune(b&0x0f)<<12 | rune(s[1]&0x3f)<<6 | rune(s[2]&0x3f), 3
default:
if len(s) < 4 {
return 0xFFFD, 1
}
return rune(b&0x07)<<18 | rune(s[1]&0x3f)<<12 | rune(s[2]&0x3f)<<6 | rune(s[3]&0x3f), 4
}
}