Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 167 additions & 0 deletions diffmatchpatch/match_runes.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
// Copyright (c) 2012-2016 The go-diff authors. All rights reserved.
//
// Rune-based Match helpers — mirror MatchMain / MatchBitap / MatchAlphabet but
// operate on []rune instead of UTF-8 bytes. Lengths and positions are counted
// in runes (one Unicode code point each), which matches the reference JS
// diff-match-patch's UTF-16 code unit semantics for all BMP characters.
//
// Motivation: the existing byte-based Bitap on multi-byte text (CJK, em dash,
// emoji) uses len(text) (byte count), patches[].Length1 (byte count), and
// substring slicing by byte index. The reference JS implementation uses
// UTF-16 unit counts everywhere. When Go callers receive patch text emitted
// by a JS counterpart (whose Length/Start numbers are UTF-16 units), the
// byte-based Go path produces drift:
// - PatchSplitMax cuts CJK segments ~3x more finely (90 bytes vs 28 units),
// shortening Bitap patterns and weakening fuzzy match accuracy.
// - expected_loc = Start2 + delta drifts by N bytes for N multi-byte chars
// preceding the patch — fuzzy match's MatchDistance window may no longer
// cover the real anchor location.
// - MatchAlphabet keyed by byte conflates the three bytes of each CJK rune,
// scrambling the Bitap alphabet on dense CJK text.
//
// These functions provide a parallel rune-based path. Algorithm structure is
// preserved 1:1; only the length / indexing model differs.

package diffmatchpatch

import "math"

// MatchMainRunes is the rune-indexed counterpart of MatchMain: it locates the
// best instance of pattern in text near loc. Both loc and the returned
// position are rune offsets into text, not byte offsets.
//
// loc is first clamped to the range [0, len(text)]. Three shortcuts are then
// tried in order before any fuzzy search: text equal to pattern yields 0, an
// empty text yields -1, and an exact occurrence of pattern starting at loc
// yields loc (this also covers an empty pattern). Every other input is handed
// to MatchBitapRunes, whose result is returned unchanged.
func (dmp *DiffMatchPatch) MatchMainRunes(text, pattern []rune, loc int) int {
	// Pull loc into the valid range before it is used as a slice bound.
	upper := math.Min(float64(loc), float64(len(text)))
	loc = int(math.Max(0, upper))

	switch end := loc + len(pattern); {
	case runesEqual(text, pattern):
		// Shortcut: the whole text is the pattern.
		return 0
	case len(text) == 0:
		// Nothing to search in.
		return -1
	case end <= len(text) && runesEqual(text[loc:end], pattern):
		// Exact hit at the expected location.
		return loc
	}

	// No shortcut applied; fall back to the fuzzy search.
	return dmp.MatchBitapRunes(text, pattern, loc)
}

// MatchBitapRunes is the rune-indexed counterpart of MatchBitap: it searches
// text for the best approximate occurrence of pattern near loc using the Bitap
// algorithm. loc and the returned position are rune offsets into text.
//
// A candidate is accepted only if its score (see matchBitapScoreRunes) does
// not exceed the current threshold, which starts at dmp.MatchThreshold and is
// tightened every time a better candidate is found. The function returns the
// start of the best accepted candidate, or -1 if none was accepted.
//
// Each pattern position occupies one bit of an int, so the pattern length is
// limited by the width of int.
func (dmp *DiffMatchPatch) MatchBitapRunes(text, pattern []rune, loc int) int {
	// One bitmask per distinct rune of the pattern.
	alphabet := dmp.MatchAlphabetRunes(pattern)

	// Highest score that is still acceptable. An exact occurrence at or after
	// loc, and then the closest one at or before loc, can lower it up front so
	// the fuzzy scan below has less work to do.
	threshold := dmp.MatchThreshold
	if ahead := runesIndexOf(text, pattern, loc); ahead != -1 {
		threshold = math.Min(dmp.matchBitapScoreRunes(0, ahead, loc, pattern), threshold)
		if behind := runesLastIndexBefore(text, pattern, loc+len(pattern)); behind != -1 {
			threshold = math.Min(dmp.matchBitapScoreRunes(0, behind, loc, pattern), threshold)
		}
	}

	// Bit that is set in a row entry once the whole pattern has been matched.
	matchMask := 1 << uint(len(pattern)-1)
	best := -1

	// reach is the widest distance from loc worth scanning; it only shrinks
	// from one error level to the next.
	reach := len(pattern) + len(text)
	var prevRow []int
	for errs := 0; errs < len(pattern); errs++ {
		// Binary search for the largest distance from loc at which a match
		// with this many errors would still score within the threshold.
		lo, mid := 0, reach
		for lo < mid {
			if dmp.matchBitapScoreRunes(errs, loc+mid, loc, pattern) <= threshold {
				lo = mid
			} else {
				reach = mid
			}
			mid = (reach-lo)/2 + lo
		}
		// Use the result as the upper bound for the next error level.
		reach = mid

		// Scan window, expressed in 1-based row indices.
		start := loc - mid + 1
		if start < 1 {
			start = 1
		}
		finish := loc + mid
		if finish > len(text) {
			finish = len(text)
		}
		finish += len(pattern)

		row := make([]int, finish+2)
		row[finish+1] = (1 << uint(errs)) - 1

		for j := finish; j >= start; j-- {
			// Positions past the end of text, and runes that do not occur in
			// the pattern, contribute an empty mask.
			charMatch := 0
			if j-1 < len(text) {
				charMatch = alphabet[text[j-1]]
			}

			exact := ((row[j+1] << 1) | 1) & charMatch
			if errs == 0 {
				// First pass: exact matching only.
				row[j] = exact
			} else {
				// Later passes also admit substitutions, insertions and
				// deletions carried over from the previous error level.
				row[j] = exact | (((prevRow[j+1] | prevRow[j]) << 1) | 1) | prevRow[j+1]
			}

			if (row[j] & matchMask) == 0 {
				continue
			}
			score := dmp.matchBitapScoreRunes(errs, j-1, loc, pattern)
			if score > threshold {
				continue
			}

			// Best candidate so far; remember it and tighten the threshold.
			threshold = score
			best = j - 1
			if best <= loc {
				// Already at or before loc: scanning further back cannot
				// bring the match any closer.
				break
			}
			// The candidate lies after loc, so only positions at most the
			// same distance before loc remain interesting.
			start = 2*loc - best
			if start < 1 {
				start = 1
			}
		}

		// Stop once even a perfectly placed match with one more error could
		// not beat the current threshold.
		if dmp.matchBitapScoreRunes(errs+1, loc, loc, pattern) > threshold {
			break
		}
		prevRow = row
	}
	return best
}

// matchBitapScoreRunes scores a candidate match for MatchBitapRunes; lower is
// better and 0 is a perfect match at the expected location.
//
// e is the number of errors in the candidate, x the rune position where it
// was found and loc the rune position where it was expected. The score is
// the error ratio e/len(pattern) plus the distance |loc-x| divided by
// dmp.MatchDistance. When dmp.MatchDistance is 0 no drift is tolerated: a
// candidate at loc scores its error ratio and any other position scores 1.0.
func (dmp *DiffMatchPatch) matchBitapScoreRunes(e, x, loc int, pattern []rune) float64 {
	errorRatio := float64(e) / float64(len(pattern))
	drift := math.Abs(float64(loc - x))

	if dmp.MatchDistance != 0 {
		return errorRatio + drift/float64(dmp.MatchDistance)
	}

	// MatchDistance of zero: avoid the division and reject any drift outright.
	if drift != 0 {
		return 1.0
	}
	return errorRatio
}

// MatchAlphabetRunes is the rune-keyed counterpart of MatchAlphabet: it builds
// the Bitap alphabet for pattern.
//
// The result maps every distinct rune of pattern to a bitmask with one bit
// set per position at which that rune occurs; the first rune of the pattern
// owns the highest bit (1 << (len(pattern)-1)) and the last rune owns bit 0.
// Because the key is a whole rune, a multi-byte character gets a single
// entry of its own instead of one entry per UTF-8 byte. An empty pattern
// yields an empty, non-nil map.
func (dmp *DiffMatchPatch) MatchAlphabetRunes(pattern []rune) map[rune]int {
	masks := make(map[rune]int, len(pattern))
	topBit := len(pattern) - 1
	for pos, r := range pattern {
		// A rune seen for the first time starts from the map's zero value.
		masks[r] |= int(uint(1) << uint(topBit-pos))
	}
	return masks
}

// runesLastIndexBefore returns the start index of the last occurrence of
// pattern in target that ends at or before rune position beforeLoc, or -1 if
// there is no such occurrence. It is the backwards-searching rune counterpart
// of a string lastIndexOf.
//
// A beforeLoc beyond len(target) is treated as len(target). A negative
// beforeLoc always yields -1, including for an empty pattern: nothing can end
// before the start of target, and callers rely on -1 as the only "not found"
// value. An empty pattern otherwise matches at the clamped beforeLoc itself.
func runesLastIndexBefore(target, pattern []rune, beforeLoc int) int {
	if beforeLoc < 0 {
		return -1
	}
	// Clamp so that no candidate can run past the end of target.
	limit := beforeLoc
	if limit > len(target) {
		limit = len(target)
	}
	if len(pattern) == 0 {
		return limit
	}
	// Walk candidate start positions from the right-most one that still ends
	// at or before limit; the loop is skipped when the pattern cannot fit.
	for i := limit - len(pattern); i >= 0; i-- {
		if runesEqual(target[i:i+len(pattern)], pattern) {
			return i
		}
	}
	return -1
}
96 changes: 80 additions & 16 deletions diffmatchpatch/patch.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,34 @@ import (
"regexp"
"strconv"
"strings"
"unicode/utf8"
)

// alignRuneStart moves the byte index idx backwards until it points at a byte
// that can start a UTF-8 encoded rune in s, and returns the adjusted index.
//
// An idx that is <= 0 or >= len(s) is returned unchanged, so len(s) can be
// passed as an exclusive upper bound without being altered. If no start byte
// is found while backing up, the scan stops at 0.
//
// Why it exists: Google's reference JS diff-match-patch slices with
// string.substring on UTF-16 code units, whereas this Go port slices strings
// by UTF-8 byte index (text[a:b]). A byte position that lands inside a
// multi-byte character (3-byte CJK or em dash, 4-byte emoji) produces an
// invalid UTF-8 slice; later MatchMain calls then miss their anchors and
// downstream slices can panic with "slice bounds out of range" (issue #132).
// Snapping each slice index back to a rune boundary gives up at most a few
// bytes per slice point on well-formed text and keeps every slice valid.
func alignRuneStart(s string, idx int) int {
	if idx <= 0 || idx >= len(s) {
		return idx
	}
	pos := idx
	for ; pos > 0; pos-- {
		if utf8.RuneStart(s[pos]) {
			break
		}
	}
	return pos
}

// Patch represents one patch operation.
type Patch struct {
diffs []Diff
Expand Down Expand Up @@ -91,12 +117,18 @@ func (dmp *DiffMatchPatch) PatchAddContext(patch Patch, text string) Patch {
padding += dmp.PatchMargin

// Add the prefix.
prefix := text[max(0, patch.Start2-padding):patch.Start2]
// rune-safety: align prefix start to nearest rune boundary so the slice
// never produces an invalid-UTF-8 string when the surrounding text contains
// multi-byte characters.
prefixStart := alignRuneStart(text, max(0, patch.Start2-padding))
prefix := text[prefixStart:patch.Start2]
if len(prefix) != 0 {
patch.diffs = append([]Diff{Diff{DiffEqual, prefix}}, patch.diffs...)
}
// Add the suffix.
suffix := text[patch.Start2+patch.Length1 : min(len(text), patch.Start2+patch.Length1+padding)]
// rune-safety: align suffix end to nearest rune boundary.
suffixEnd := alignRuneStart(text, min(len(text), patch.Start2+patch.Length1+padding))
suffix := text[patch.Start2+patch.Length1 : suffixEnd]
if len(suffix) != 0 {
patch.diffs = append(patch.diffs, Diff{DiffEqual, suffix})
}
Expand Down Expand Up @@ -264,6 +296,13 @@ func (dmp *DiffMatchPatch) PatchApply(patches []Patch, text string) (string, []b
} else {
startLoc = dmp.MatchMain(text, text1, expectedLoc)
}
// rune-safety: Bitap-returned byte position may land in the middle of a
// multi-byte character. Back it up to the nearest rune start so the
// subsequent text[startLoc:...] / text[:startLoc] slices never produce
// invalid UTF-8 (root cause of issue #132 panic).
if startLoc != -1 {
startLoc = alignRuneStart(text, startLoc)
}
if startLoc == -1 {
// No match found. :(
results[x] = false
Expand All @@ -274,14 +313,18 @@ func (dmp *DiffMatchPatch) PatchApply(patches []Patch, text string) (string, []b
results[x] = true
delta = startLoc - expectedLoc
var text2 string
var text2End int
if endLoc == -1 {
text2 = text[startLoc:int(math.Min(float64(startLoc+len(text1)), float64(len(text))))]
text2End = alignRuneStart(text, int(math.Min(float64(startLoc+len(text1)), float64(len(text)))))
} else {
text2 = text[startLoc:int(math.Min(float64(endLoc+dmp.MatchMaxBits), float64(len(text))))]
endLoc = alignRuneStart(text, endLoc)
text2End = alignRuneStart(text, int(math.Min(float64(endLoc+dmp.MatchMaxBits), float64(len(text)))))
}
text2 = text[startLoc:text2End]
if text1 == text2 {
// Perfect match, just shove the Replacement text in.
text = text[:startLoc] + dmp.DiffText2(aPatch.diffs) + text[startLoc+len(text1):]
replaceEnd := alignRuneStart(text, startLoc+len(text1))
text = text[:startLoc] + dmp.DiffText2(aPatch.diffs) + text[replaceEnd:]
} else {
// Imperfect match. Run a diff to get a framework of equivalent indices.
diffs := dmp.DiffMain(text1, text2, false)
Expand All @@ -296,12 +339,16 @@ func (dmp *DiffMatchPatch) PatchApply(patches []Patch, text string) (string, []b
index2 := dmp.DiffXIndex(diffs, index1)
if aDiff.Type == DiffInsert {
// Insertion
text = text[:startLoc+index2] + aDiff.Text + text[startLoc+index2:]
insertAt := alignRuneStart(text, startLoc+index2)
text = text[:insertAt] + aDiff.Text + text[insertAt:]
} else if aDiff.Type == DiffDelete {
// Deletion
startIndex := startLoc + index2
text = text[:startIndex] +
text[startIndex+dmp.DiffXIndex(diffs, index1+len(aDiff.Text))-index2:]
startIndex := alignRuneStart(text, startLoc+index2)
endIndex := alignRuneStart(text, startLoc+dmp.DiffXIndex(diffs, index1+len(aDiff.Text)))
if endIndex < startIndex {
endIndex = startIndex
}
text = text[:startIndex] + text[endIndex:]
}
}
if aDiff.Type != DiffDelete {
Expand Down Expand Up @@ -416,7 +463,19 @@ func (dmp *DiffMatchPatch) PatchSplitMax(patches []Patch) []Patch {
bigpatch.diffs = bigpatch.diffs[1:]
} else {
// Deletion or equality. Only take as much as we can stomach.
diffText = diffText[:min(len(diffText), patchSize-patch.Length1-dmp.PatchMargin)]
// rune-safety: align cutAt to nearest rune boundary so we never
// slice diffText through a multi-byte character (the resulting
// invalid UTF-8 is the root cause of issue #132 panic).
cutAt := min(len(diffText), patchSize-patch.Length1-dmp.PatchMargin)
cutAt = alignRuneStart(diffText, cutAt)
// Loop-progress guard: when alignment collapses cutAt to 0 but
// diffText is non-empty, advance by exactly one rune so the
// outer for-loop is guaranteed to consume input each iteration.
if cutAt == 0 && len(diffText) > 0 {
_, size := utf8.DecodeRuneInString(diffText)
cutAt = size
}
diffText = diffText[:cutAt]

patch.Length1 += len(diffText)
Start1 += len(diffText)
Expand All @@ -430,21 +489,26 @@ func (dmp *DiffMatchPatch) PatchSplitMax(patches []Patch) []Patch {
if diffText == bigpatch.diffs[0].Text {
bigpatch.diffs = bigpatch.diffs[1:]
} else {
bigpatch.diffs[0].Text =
bigpatch.diffs[0].Text[len(diffText):]
// rune-safety: len(diffText) was already rune-aligned above,
// but align once more defensively for cutAt == 0 etc.
remStart := alignRuneStart(bigpatch.diffs[0].Text, len(diffText))
bigpatch.diffs[0].Text = bigpatch.diffs[0].Text[remStart:]
}
}
}
// Compute the head context for the next patch.
precontext = dmp.DiffText2(patch.diffs)
precontext = precontext[max(0, len(precontext)-dmp.PatchMargin):]
// rune-safety: align precontext start to nearest rune boundary.
precontext = precontext[alignRuneStart(precontext, max(0, len(precontext)-dmp.PatchMargin)):]

postcontext := ""
// Append the end context for this patch.
if len(dmp.DiffText1(bigpatch.diffs)) > dmp.PatchMargin {
postcontext = dmp.DiffText1(bigpatch.diffs)[:dmp.PatchMargin]
rawPost := dmp.DiffText1(bigpatch.diffs)
if len(rawPost) > dmp.PatchMargin {
// rune-safety: align before slicing to avoid mid-rune cut.
postcontext = rawPost[:alignRuneStart(rawPost, dmp.PatchMargin)]
} else {
postcontext = dmp.DiffText1(bigpatch.diffs)
postcontext = rawPost
}

if len(postcontext) != 0 {
Expand Down
Loading