sergi · SnowWarri0r · May 21, 2026 · May 21, 2026
diff --git a/diffmatchpatch/match_runes.go b/diffmatchpatch/match_runes.go
@@ -0,0 +1,167 @@
+// Copyright (c) 2012-2016 The go-diff authors. All rights reserved.
+//
+// Rune-based Match helpers — mirror MatchMain / MatchBitap / MatchAlphabet but
+// operate on []rune instead of UTF-8 bytes. Lengths and positions are counted
+// in runes (one Unicode code point each), which matches the reference JS
+// diff-match-patch's UTF-16 code unit semantics for all BMP characters.
+//
+// Motivation: the existing byte-based Bitap on multi-byte text (CJK, em dash,
+// emoji) uses len(text) (byte count), patches[].Length1 (byte count), and
+// substring slicing by byte index. The reference JS implementation uses
+// UTF-16 unit counts everywhere. When Go callers receive patch text emitted
+// by a JS counterpart (whose Length/Start numbers are UTF-16 units), the
+// byte-based Go path produces drift:
+//   - PatchSplitMax cuts CJK segments ~3x more finely (90 bytes vs 28 units),
+//     shortening Bitap patterns and weakening fuzzy match accuracy.
+//   - expected_loc = Start2 + delta drifts by N bytes for N multi-byte chars
+//     preceding the patch — fuzzy match's MatchDistance window may no longer
+//     cover the real anchor location.
+//   - MatchAlphabet keyed by byte conflates the three bytes of each CJK rune,
+//     scrambling the Bitap alphabet on dense CJK text.
+//
+// These functions provide a parallel rune-based path. Algorithm structure is
+// preserved 1:1; only the length / indexing model differs.
+
+package diffmatchpatch
+
+import "math"
+
+// MatchMainRunes is the rune mirror of MatchMain: it looks for pattern in text
+// near rune index loc and returns the rune index of the match, or -1 if none.
+func (dmp *DiffMatchPatch) MatchMainRunes(text, pattern []rune, loc int) int {
+	loc = int(math.Max(0, math.Min(float64(loc), float64(len(text))))) // Clamp loc into [0, len(text)].
+	if runesEqual(text, pattern) {
+		return 0 // Shortcut: runesEqual reports text and pattern as the same.
+	} else if len(text) == 0 {
+		return -1 // Nothing to match against.
+	} else if loc+len(pattern) <= len(text) && runesEqual(text[loc:loc+len(pattern)], pattern) {
+		return loc // Shortcut: the pattern-sized window starting at loc already matches.
+	}
+	return dmp.MatchBitapRunes(text, pattern, loc) // Otherwise run the fuzzy Bitap search.
+}
+
+// MatchBitapRunes is the rune mirror of MatchBitap: it fuzzy-searches text for pattern near rune index loc and returns the best rune index, or -1.
+func (dmp *DiffMatchPatch) MatchBitapRunes(text, pattern []rune, loc int) int {
+	s := dmp.MatchAlphabetRunes(pattern) // Per-rune bitmasks of the positions each rune occupies in pattern.
+
+	scoreThreshold := dmp.MatchThreshold // Highest score still accepted; lowered as better matches are found.
+	bestLoc := runesIndexOf(text, pattern, loc)
+	if bestLoc != -1 { // An exact occurrence was found: let its score cap the threshold.
+		scoreThreshold = math.Min(dmp.matchBitapScoreRunes(0, bestLoc, loc, pattern), scoreThreshold)
+		bestLoc = runesLastIndexBefore(text, pattern, loc+len(pattern)) // Also score the last exact occurrence ending by loc+len(pattern).
+		if bestLoc != -1 {
+			scoreThreshold = math.Min(dmp.matchBitapScoreRunes(0, bestLoc, loc, pattern), scoreThreshold)
+		}
+	}
+
+	matchmask := 1 << uint(len(pattern)-1) // Top pattern bit. NOTE(review): nothing here checks that len(pattern) fits in an int — confirm callers cap it.
+	bestLoc = -1
+
+	var binMin, binMid int
+	binMax := len(pattern) + len(text) // Initial ceiling for the binary search on distance from loc.
+	lastRd := []int{}
+	for d := 0; d < len(pattern); d++ { // Pass d feeds d to the score as the error count, so each pass tolerates one more error.
+		binMin = 0
+		binMid = binMax
+		for binMin < binMid { // Binary search: how far from loc may a d-error match sit and still score within the threshold?
+			if dmp.matchBitapScoreRunes(d, loc+binMid, loc, pattern) <= scoreThreshold {
+				binMin = binMid
+			} else {
+				binMax = binMid
+			}
+			binMid = (binMax-binMin)/2 + binMin
+		}
+		binMax = binMid // This pass's reach becomes the ceiling for the next pass.
+		start := int(math.Max(1, float64(loc-binMid+1)))
+		finish := int(math.Min(float64(loc+binMid), float64(len(text))) + float64(len(pattern))) // The scan window is [start, finish] in 1-based positions (text[j-1]).
+
+		rd := make([]int, finish+2) // Match state per 1-based position j; rd[finish+1] seeds the recurrence.
+		rd[finish+1] = (1 << uint(d)) - 1
+
+		for j := finish; j >= start; j-- { // Walk the window from right to left.
+			var charMatch int
+			if len(text) <= j-1 { // Position lies past the end of text: no match bits.
+				charMatch = 0
+			} else if _, ok := s[text[j-1]]; !ok { // Rune does not occur in pattern.
+				charMatch = 0
+			} else {
+				charMatch = s[text[j-1]]
+			}
+
+			if d == 0 { // First pass: exact matching only.
+				rd[j] = ((rd[j+1] << 1) | 1) & charMatch
+			} else { // Later passes also fold in the previous pass's state (lastRd).
+				rd[j] = ((rd[j+1]<<1)|1)&charMatch | (((lastRd[j+1] | lastRd[j]) << 1) | 1) | lastRd[j+1]
+			}
+			if (rd[j] & matchmask) != 0 { // Top bit set: candidate match at rune index j-1.
+				score := dmp.matchBitapScoreRunes(d, j-1, loc, pattern)
+				if score <= scoreThreshold { // At least as good as anything so far: record it and tighten the threshold.
+					scoreThreshold = score
+					bestLoc = j - 1
+					if bestLoc > loc { // Still right of loc: limit the leftward scan to the mirrored distance.
+						start = int(math.Max(1, float64(2*loc-bestLoc)))
+					} else { // At or left of loc: stop scanning this pass (break leaves the j loop).
+						break
+					}
+				}
+			}
+		}
+		if dmp.matchBitapScoreRunes(d+1, loc, loc, pattern) > scoreThreshold { // Even a d+1-error match exactly at loc would score above the threshold: give up.
+			break
+		}
+		lastRd = rd
+	}
+	return bestLoc
+}
+
+func (dmp *DiffMatchPatch) matchBitapScoreRunes(e, x, loc int, pattern []rune) float64 { // Scores a match with e errors at rune index x against expected index loc; lower is better.
+	accuracy := float64(e) / float64(len(pattern)) // NOTE(review): a zero-length pattern makes this 0/0 (NaN) when e is 0 — confirm callers never pass one.
+	proximity := math.Abs(float64(loc - x))
+	if dmp.MatchDistance == 0 {
+		if proximity == 0 { // MatchDistance 0 avoids the division below: only a match exactly at loc keeps its accuracy score.
+			return accuracy
+		}
+		return 1.0 // Any offset from loc scores a flat 1.0.
+	}
+	return accuracy + (proximity / float64(dmp.MatchDistance)) // Error ratio plus distance scaled by MatchDistance.
+}
+
+// MatchAlphabetRunes is the rune mirror of MatchAlphabet: it builds the Bitap
+// alphabet for pattern keyed by rune instead of byte, so a multi-byte
+// character gets one entry rather than one per UTF-8 byte.
+//
+// The result maps each distinct rune of pattern to a bitmask in which the
+// rune at index i sets bit len(pattern)-i-1 (the first rune owns the highest
+// bit). Runes that do not occur in pattern have no entry.
+func (dmp *DiffMatchPatch) MatchAlphabetRunes(pattern []rune) map[rune]int {
+	s := make(map[rune]int, len(pattern))
+	for i, c := range pattern {
+		// Indexing a map with a missing key yields the zero value, so the
+		// first |= on a rune starts from 0; no separate pass is needed to
+		// pre-populate the entries.
+		s[c] |= int(uint(1) << uint(len(pattern)-i-1))
+	}
+	return s
+}
+
+// runesLastIndexBefore returns the start index of the last occurrence of
+// pattern in target that ends at or before rune index beforeLoc (capped at
+// len(target)), or -1 if none. An empty pattern matches at the capped index.
+func runesLastIndexBefore(target, pattern []rune, beforeLoc int) int {
+	if beforeLoc < 0 {
+		return -1 // Nothing ends before target starts; never hand back a negative index.
+	}
+	limit := beforeLoc
+	if limit > len(target) {
+		limit = len(target)
+	}
+	if len(pattern) == 0 {
+		return limit
+	}
+	for i := limit - len(pattern); i >= 0; i-- {
+		if runesEqual(target[i:i+len(pattern)], pattern) {
+			return i
+		}
+	}
+	return -1
+}
diff --git a/diffmatchpatch/patch.go b/diffmatchpatch/patch.go
@@ -16,8 +16,34 @@ import (
 	"regexp"
 	"strconv"
 	"strings"
+	"unicode/utf8"
 )
 
+// alignRuneStart moves byte index idx backwards until it sits on the first
+// byte of a UTF-8 encoded rune in s, and returns the adjusted index.
+//
+// An idx <= 0 or idx >= len(s) is returned unchanged: those values are never
+// used to index s, so an out-of-range idx is passed through to the caller
+// rather than clamped.
+//
+// Background: this Go port slices strings by UTF-8 byte index (text[a:b]),
+// whereas Google's reference JS diff-match-patch slices by UTF-16 code unit.
+// A byte position that lands inside a multi-byte character (3-byte CJK or em
+// dash, 4-byte emoji) splits that character and yields invalid UTF-8; later
+// MatchMain calls then miss their anchors and downstream byte slices can
+// panic with "slice bounds out of range" (issue #132).
+//
+// On valid UTF-8 the index moves back by at most three bytes.
+func alignRuneStart(s string, idx int) int {
+	if idx <= 0 || idx >= len(s) {
+		return idx
+	}
+	for idx > 0 && !utf8.RuneStart(s[idx]) { // Step left over continuation bytes.
+		idx--
+	}
+	return idx
+}
+
 // Patch represents one patch operation.
 type Patch struct {
 	diffs   []Diff
@@ -91,12 +117,18 @@ func (dmp *DiffMatchPatch) PatchAddContext(patch Patch, text string) Patch {
 	padding += dmp.PatchMargin
 
 	// Add the prefix.
-	prefix := text[max(0, patch.Start2-padding):patch.Start2]
+	// rune-safety: align prefix start to nearest rune boundary so the slice
+	// never produces an invalid-UTF-8 string when the surrounding text contains
+	// multi-byte characters.
+	prefixStart := alignRuneStart(text, max(0, patch.Start2-padding))
+	prefix := text[prefixStart:patch.Start2]
 	if len(prefix) != 0 {
 		patch.diffs = append([]Diff{Diff{DiffEqual, prefix}}, patch.diffs...)
 	}
 	// Add the suffix.
-	suffix := text[patch.Start2+patch.Length1 : min(len(text), patch.Start2+patch.Length1+padding)]
+	// rune-safety: align suffix end to nearest rune boundary.
+	suffixEnd := alignRuneStart(text, min(len(text), patch.Start2+patch.Length1+padding))
+	suffix := text[patch.Start2+patch.Length1 : suffixEnd]
 	if len(suffix) != 0 {
 		patch.diffs = append(patch.diffs, Diff{DiffEqual, suffix})
 	}
@@ -264,6 +296,13 @@ func (dmp *DiffMatchPatch) PatchApply(patches []Patch, text string) (string, []b
 		} else {
 			startLoc = dmp.MatchMain(text, text1, expectedLoc)
 		}
+		// rune-safety: Bitap-returned byte position may land in the middle of a
+		// multi-byte character. Back it up to the nearest rune start so the
+		// subsequent text[startLoc:...] / text[:startLoc] slices never produce
+		// invalid UTF-8 (root cause of issue #132 panic).
+		if startLoc != -1 {
+			startLoc = alignRuneStart(text, startLoc)
+		}
 		if startLoc == -1 {
 			// No match found.  :(
 			results[x] = false
@@ -274,14 +313,18 @@ func (dmp *DiffMatchPatch) PatchApply(patches []Patch, text string) (string, []b
 			results[x] = true
 			delta = startLoc - expectedLoc
 			var text2 string
+			var text2End int
 			if endLoc == -1 {
-				text2 = text[startLoc:int(math.Min(float64(startLoc+len(text1)), float64(len(text))))]
+				text2End = alignRuneStart(text, int(math.Min(float64(startLoc+len(text1)), float64(len(text)))))
 			} else {
-				text2 = text[startLoc:int(math.Min(float64(endLoc+dmp.MatchMaxBits), float64(len(text))))]
+				endLoc = alignRuneStart(text, endLoc)
+				text2End = alignRuneStart(text, int(math.Min(float64(endLoc+dmp.MatchMaxBits), float64(len(text)))))
 			}
+			text2 = text[startLoc:text2End]
 			if text1 == text2 {
 				// Perfect match, just shove the Replacement text in.
-				text = text[:startLoc] + dmp.DiffText2(aPatch.diffs) + text[startLoc+len(text1):]
+				replaceEnd := alignRuneStart(text, startLoc+len(text1))
+				text = text[:startLoc] + dmp.DiffText2(aPatch.diffs) + text[replaceEnd:]
 			} else {
 				// Imperfect match.  Run a diff to get a framework of equivalent indices.
 				diffs := dmp.DiffMain(text1, text2, false)
@@ -296,12 +339,16 @@ func (dmp *DiffMatchPatch) PatchApply(patches []Patch, text string) (string, []b
 							index2 := dmp.DiffXIndex(diffs, index1)
 							if aDiff.Type == DiffInsert {
 								// Insertion
-								text = text[:startLoc+index2] + aDiff.Text + text[startLoc+index2:]
+								insertAt := alignRuneStart(text, startLoc+index2)
+								text = text[:insertAt] + aDiff.Text + text[insertAt:]
 							} else if aDiff.Type == DiffDelete {
 								// Deletion
-								startIndex := startLoc + index2
-								text = text[:startIndex] +
-									text[startIndex+dmp.DiffXIndex(diffs, index1+len(aDiff.Text))-index2:]
+								startIndex := alignRuneStart(text, startLoc+index2)
+								endIndex := alignRuneStart(text, startLoc+dmp.DiffXIndex(diffs, index1+len(aDiff.Text)))
+								if endIndex < startIndex {
+									endIndex = startIndex
+								}
+								text = text[:startIndex] + text[endIndex:]
 							}
 						}
 						if aDiff.Type != DiffDelete {
@@ -416,7 +463,19 @@ func (dmp *DiffMatchPatch) PatchSplitMax(patches []Patch) []Patch {
 					bigpatch.diffs = bigpatch.diffs[1:]
 				} else {
 					// Deletion or equality.  Only take as much as we can stomach.
-					diffText = diffText[:min(len(diffText), patchSize-patch.Length1-dmp.PatchMargin)]
+					// rune-safety: align cutAt to nearest rune boundary so we never
+					// slice diffText through a multi-byte character (the resulting
+					// invalid UTF-8 is the root cause of issue #132 panic).
+					cutAt := min(len(diffText), patchSize-patch.Length1-dmp.PatchMargin)
+					cutAt = alignRuneStart(diffText, cutAt)
+					// Loop-progress guard: when alignment collapses cutAt to 0 but
+					// diffText is non-empty, advance by exactly one rune so the
+					// outer for-loop is guaranteed to consume input each iteration.
+					if cutAt == 0 && len(diffText) > 0 {
+						_, size := utf8.DecodeRuneInString(diffText)
+						cutAt = size
+					}
+					diffText = diffText[:cutAt]
 
 					patch.Length1 += len(diffText)
 					Start1 += len(diffText)
@@ -430,21 +489,26 @@ func (dmp *DiffMatchPatch) PatchSplitMax(patches []Patch) []Patch {
 					if diffText == bigpatch.diffs[0].Text {
 						bigpatch.diffs = bigpatch.diffs[1:]
 					} else {
-						bigpatch.diffs[0].Text =
-							bigpatch.diffs[0].Text[len(diffText):]
+						// rune-safety: len(diffText) was already rune-aligned above,
+						// but align once more defensively for cutAt == 0 etc.
+						remStart := alignRuneStart(bigpatch.diffs[0].Text, len(diffText))
+						bigpatch.diffs[0].Text = bigpatch.diffs[0].Text[remStart:]
 					}
 				}
 			}
 			// Compute the head context for the next patch.
 			precontext = dmp.DiffText2(patch.diffs)
-			precontext = precontext[max(0, len(precontext)-dmp.PatchMargin):]
+			// rune-safety: align precontext start to nearest rune boundary.
+			precontext = precontext[alignRuneStart(precontext, max(0, len(precontext)-dmp.PatchMargin)):]
 
 			postcontext := ""
 			// Append the end context for this patch.
-			if len(dmp.DiffText1(bigpatch.diffs)) > dmp.PatchMargin {
-				postcontext = dmp.DiffText1(bigpatch.diffs)[:dmp.PatchMargin]
+			rawPost := dmp.DiffText1(bigpatch.diffs)
+			if len(rawPost) > dmp.PatchMargin {
+				// rune-safety: align before slicing to avoid mid-rune cut.
+				postcontext = rawPost[:alignRuneStart(rawPost, dmp.PatchMargin)]
 			} else {
-				postcontext = dmp.DiffText1(bigpatch.diffs)
+				postcontext = rawPost
 			}
 
 			if len(postcontext) != 0 {