sergi · SnowWarri0r · May 21, 2026
diff --git a/diffmatchpatch/patch.go b/diffmatchpatch/patch.go
@@ -16,8 +16,34 @@ import (
 	"regexp"
 	"strconv"
 	"strings"
+	"unicode/utf8"
 )
 
+// alignRuneStart backs up byte index idx to the nearest UTF-8 rune start byte.
+// idx <= 0 or idx >= len(s) is returned unchanged.
+//
+// Motivation: Google's reference JS diff-match-patch uses string.substring
+// (UTF-16 code unit) for all slicing — surrogate-pair-middle splits are
+// well-defined. This Go port slices by UTF-8 byte index (text[a:b]). When
+// MatchMain's Bitap returns a byte position that lands inside a multi-byte
+// character (CJK 3B / em dash 3B / emoji 4B), the resulting slice is invalid
+// UTF-8 — subsequent MatchMain calls miss anchors, and downstream byte slices
+// can panic with "slice bounds out of range" (issue #132).
+//
+// Fix: align every byte slice index back to the nearest valid rune boundary
+// before slicing. The operation costs at most 1-3 bytes of "shrinkage" per
+// slice point but guarantees the resulting string is always valid UTF-8 and
+// no MatchMain / DiffMain pass sees a continuation-byte-prefixed string.
+func alignRuneStart(s string, idx int) int {
+	if idx <= 0 || idx >= len(s) {
+		return idx
+	}
+	for idx > 0 && !utf8.RuneStart(s[idx]) {
+		idx--
+	}
+	return idx
+}
+
 // Patch represents one patch operation.
 type Patch struct {
 	diffs   []Diff
@@ -91,12 +117,18 @@ func (dmp *DiffMatchPatch) PatchAddContext(patch Patch, text string) Patch {
 	padding += dmp.PatchMargin
 
 	// Add the prefix.
-	prefix := text[max(0, patch.Start2-padding):patch.Start2]
+	// rune-safety: align prefix start to nearest rune boundary so the slice
+	// never produces an invalid-UTF-8 string when the surrounding text contains
+	// multi-byte characters.
+	prefixStart := alignRuneStart(text, max(0, patch.Start2-padding))
+	prefix := text[prefixStart:patch.Start2]
 	if len(prefix) != 0 {
 		patch.diffs = append([]Diff{Diff{DiffEqual, prefix}}, patch.diffs...)
 	}
 	// Add the suffix.
-	suffix := text[patch.Start2+patch.Length1 : min(len(text), patch.Start2+patch.Length1+padding)]
+	// rune-safety: align suffix end to nearest rune boundary.
+	suffixEnd := alignRuneStart(text, min(len(text), patch.Start2+patch.Length1+padding))
+	suffix := text[patch.Start2+patch.Length1 : suffixEnd]
 	if len(suffix) != 0 {
 		patch.diffs = append(patch.diffs, Diff{DiffEqual, suffix})
 	}
@@ -264,6 +296,13 @@ func (dmp *DiffMatchPatch) PatchApply(patches []Patch, text string) (string, []b
 		} else {
 			startLoc = dmp.MatchMain(text, text1, expectedLoc)
 		}
+		// rune-safety: Bitap-returned byte position may land in the middle of a
+		// multi-byte character. Back it up to the nearest rune start so the
+		// subsequent text[startLoc:...] / text[:startLoc] slices never produce
+		// invalid UTF-8 (root cause of issue #132 panic).
+		if startLoc != -1 {
+			startLoc = alignRuneStart(text, startLoc)
+		}
 		if startLoc == -1 {
 			// No match found.  :(
 			results[x] = false
@@ -274,14 +313,18 @@ func (dmp *DiffMatchPatch) PatchApply(patches []Patch, text string) (string, []b
 			results[x] = true
 			delta = startLoc - expectedLoc
 			var text2 string
+			var text2End int
 			if endLoc == -1 {
-				text2 = text[startLoc:int(math.Min(float64(startLoc+len(text1)), float64(len(text))))]
+				text2End = alignRuneStart(text, int(math.Min(float64(startLoc+len(text1)), float64(len(text)))))
 			} else {
-				text2 = text[startLoc:int(math.Min(float64(endLoc+dmp.MatchMaxBits), float64(len(text))))]
+				endLoc = alignRuneStart(text, endLoc)
+				text2End = alignRuneStart(text, int(math.Min(float64(endLoc+dmp.MatchMaxBits), float64(len(text)))))
 			}
+			text2 = text[startLoc:text2End]
 			if text1 == text2 {
 				// Perfect match, just shove the Replacement text in.
-				text = text[:startLoc] + dmp.DiffText2(aPatch.diffs) + text[startLoc+len(text1):]
+				replaceEnd := alignRuneStart(text, startLoc+len(text1))
+				text = text[:startLoc] + dmp.DiffText2(aPatch.diffs) + text[replaceEnd:]
 			} else {
 				// Imperfect match.  Run a diff to get a framework of equivalent indices.
 				diffs := dmp.DiffMain(text1, text2, false)
@@ -296,12 +339,16 @@ func (dmp *DiffMatchPatch) PatchApply(patches []Patch, text string) (string, []b
 							index2 := dmp.DiffXIndex(diffs, index1)
 							if aDiff.Type == DiffInsert {
 								// Insertion
-								text = text[:startLoc+index2] + aDiff.Text + text[startLoc+index2:]
+								insertAt := alignRuneStart(text, startLoc+index2)
+								text = text[:insertAt] + aDiff.Text + text[insertAt:]
 							} else if aDiff.Type == DiffDelete {
 								// Deletion
-								startIndex := startLoc + index2
-								text = text[:startIndex] +
-									text[startIndex+dmp.DiffXIndex(diffs, index1+len(aDiff.Text))-index2:]
+								startIndex := alignRuneStart(text, startLoc+index2)
+								endIndex := alignRuneStart(text, startLoc+dmp.DiffXIndex(diffs, index1+len(aDiff.Text)))
+								if endIndex < startIndex {
+									endIndex = startIndex
+								}
+								text = text[:startIndex] + text[endIndex:]
 							}
 						}
 						if aDiff.Type != DiffDelete {
@@ -416,7 +463,19 @@ func (dmp *DiffMatchPatch) PatchSplitMax(patches []Patch) []Patch {
 					bigpatch.diffs = bigpatch.diffs[1:]
 				} else {
 					// Deletion or equality.  Only take as much as we can stomach.
-					diffText = diffText[:min(len(diffText), patchSize-patch.Length1-dmp.PatchMargin)]
+					// rune-safety: align cutAt to nearest rune boundary so we never
+					// slice diffText through a multi-byte character (the resulting
+					// invalid UTF-8 is the root cause of issue #132 panic).
+					cutAt := min(len(diffText), patchSize-patch.Length1-dmp.PatchMargin)
+					cutAt = alignRuneStart(diffText, cutAt)
+					// Loop-progress guard: when alignment collapses cutAt to 0 but
+					// diffText is non-empty, advance by exactly one rune so the
+					// outer for-loop is guaranteed to consume input each iteration.
+					if cutAt == 0 && len(diffText) > 0 {
+						_, size := utf8.DecodeRuneInString(diffText)
+						cutAt = size
+					}
+					diffText = diffText[:cutAt]
 
 					patch.Length1 += len(diffText)
 					Start1 += len(diffText)
@@ -430,21 +489,26 @@ func (dmp *DiffMatchPatch) PatchSplitMax(patches []Patch) []Patch {
 					if diffText == bigpatch.diffs[0].Text {
 						bigpatch.diffs = bigpatch.diffs[1:]
 					} else {
-						bigpatch.diffs[0].Text =
-							bigpatch.diffs[0].Text[len(diffText):]
+						// rune-safety: len(diffText) was already rune-aligned above,
+						// but align once more defensively for cutAt == 0 etc.
+						remStart := alignRuneStart(bigpatch.diffs[0].Text, len(diffText))
+						bigpatch.diffs[0].Text = bigpatch.diffs[0].Text[remStart:]
 					}
 				}
 			}
 			// Compute the head context for the next patch.
 			precontext = dmp.DiffText2(patch.diffs)
-			precontext = precontext[max(0, len(precontext)-dmp.PatchMargin):]
+			// rune-safety: align precontext start to nearest rune boundary.
+			precontext = precontext[alignRuneStart(precontext, max(0, len(precontext)-dmp.PatchMargin)):]
 
 			postcontext := ""
 			// Append the end context for this patch.
-			if len(dmp.DiffText1(bigpatch.diffs)) > dmp.PatchMargin {
-				postcontext = dmp.DiffText1(bigpatch.diffs)[:dmp.PatchMargin]
+			rawPost := dmp.DiffText1(bigpatch.diffs)
+			if len(rawPost) > dmp.PatchMargin {
+				// rune-safety: align before slicing to avoid mid-rune cut.
+				postcontext = rawPost[:alignRuneStart(rawPost, dmp.PatchMargin)]
 			} else {
-				postcontext = dmp.DiffText1(bigpatch.diffs)
+				postcontext = rawPost
 			}
 
 			if len(postcontext) != 0 {

diff --git a/diffmatchpatch/patch_rune_safety_test.go b/diffmatchpatch/patch_rune_safety_test.go
@@ -0,0 +1,134 @@
+package diffmatchpatch
+
+import (
+	"strings"
+	"testing"
+)
+
+// alignRuneStart should back any byte index inside a multi-byte UTF-8
+// character back to that character's first byte.
+func TestAlignRuneStart(t *testing.T) {
+	// "abc中文def" — UTF-8: a=1 b=1 c=1 中=3(byte 3..5) 文=3(6..8) d=1 e=1 f=1
+	s := "abc中文def"
+	cases := []struct {
+		in, want int
+		desc     string
+	}{
+		{0, 0, "0 unchanged"},
+		{1, 1, "after 'a' already aligned"},
+		{3, 3, "start of 中 aligned"},
+		{4, 3, "middle of 中 backs to start"},
+		{5, 3, "end-middle of 中 backs to start"},
+		{6, 6, "start of 文 aligned"},
+		{7, 6, "middle of 文 backs to start"},
+		{9, 9, "after 文 = start of d, aligned"},
+		{12, 12, "end-of-string"},
+		{-1, -1, "negative unchanged"},
+		{999, 999, "past end unchanged"},
+	}
+	for _, c := range cases {
+		got := alignRuneStart(s, c.in)
+		if got != c.want {
+			t.Errorf("alignRuneStart(%q, %d) = %d, want %d  (%s)", s, c.in, got, c.want, c.desc)
+		}
+	}
+}
+
+// Issue #132 minimal repro: applying a patch containing a multi-byte
+// character with a short anchor used to panic with "slice bounds out of range".
+// After the rune-safe alignment, PatchApply should return cleanly (apply may
+// fail with results[i]=false, but must never panic).
+func TestPatchApply_Issue132_NoPanic(t *testing.T) {
+	dmp := New()
+	patches, err := dmp.PatchFromText("@@ -1,2 +1,3 @@\n %E2%98%9E \n+r\n")
+	if err != nil {
+		t.Fatalf("PatchFromText: %v", err)
+	}
+	defer func() {
+		if r := recover(); r != nil {
+			t.Fatalf("panic leaked: %v", r)
+		}
+	}()
+	_, results := dmp.PatchApply(patches, "☞ 𝗢𝗥𝗗𝗘𝗥 ")
+	if results == nil {
+		t.Fatalf("results nil")
+	}
+}
+
+// PatchSplitMax on a CJK-heavy base must produce diff.Text slices that are
+// all valid UTF-8 (prior to this fix, byte-level slicing of multi-byte
+// characters could leave continuation-byte prefixes in the output).
+func TestPatchSplitMax_NoInvalidUTF8(t *testing.T) {
+	dmp := New()
+	// CJK base larger than MatchMaxBits (32) forces PatchSplitMax to engage.
+	old := "中文段落一" + strings.Repeat("汉字", 30) + "结束"
+	newText := "中文段落一" + strings.Repeat("汉字", 30) + "结尾"
+	patches := dmp.PatchMake(old, dmp.DiffMain(old, newText, false))
+	if len(patches) == 0 {
+		t.Skip("no patches generated")
+	}
+	// PatchMake itself must produce valid-UTF-8 diff text (PatchAddContext's
+	// prefix/suffix slice now aligns to rune boundaries).
+	for i, p := range patches {
+		for j, d := range p.diffs {
+			if !isValidUTF8(d.Text) {
+				t.Errorf("PatchMake patch %d diff %d invalid UTF-8: %q", i, j, d.Text)
+			}
+		}
+	}
+	defer func() {
+		if r := recover(); r != nil {
+			t.Fatalf("PatchSplitMax panic: %v", r)
+		}
+	}()
+	// PatchSplitMax must preserve valid UTF-8 (precontext / postcontext /
+	// diffText cut points all rune-aligned).
+	for i, p := range dmp.PatchSplitMax(patches) {
+		for j, d := range p.diffs {
+			if !isValidUTF8(d.Text) {
+				t.Errorf("PatchSplitMax patch %d diff %d invalid UTF-8: %q  type=%d", i, j, d.Text, d.Type)
+			}
+		}
+	}
+}
+
+// minimal UTF-8 validity check inlined to avoid adding a stdlib import vs.
+// the existing import block in patch.go (utf8 is already there for the fix).
+func isValidUTF8(s string) bool {
+	for i := 0; i < len(s); {
+		r, sz := utf8DecodeFirst(s[i:])
+		if r == 0xFFFD && sz == 1 {
+			return false
+		}
+		i += sz
+	}
+	return true
+}
+
+func utf8DecodeFirst(s string) (rune, int) {
+	if len(s) == 0 {
+		return 0, 0
+	}
+	b := s[0]
+	switch {
+	case b < 0x80:
+		return rune(b), 1
+	case b < 0xc0:
+		return 0xFFFD, 1
+	case b < 0xe0:
+		if len(s) < 2 {
+			return 0xFFFD, 1
+		}
+		return rune(b&0x1f)<<6 | rune(s[1]&0x3f), 2
+	case b < 0xf0:
+		if len(s) < 3 {
+			return 0xFFFD, 1
+		}
+		return rune(b&0x0f)<<12 | rune(s[1]&0x3f)<<6 | rune(s[2]&0x3f), 3
+	default:
+		if len(s) < 4 {
+			return 0xFFFD, 1
+		}
+		return rune(b&0x07)<<18 | rune(s[1]&0x3f)<<12 | rune(s[2]&0x3f)<<6 | rune(s[3]&0x3f), 4
+	}
+}