diff --git a/diffmatchpatch/match_runes.go b/diffmatchpatch/match_runes.go new file mode 100644 index 0000000..4fe756d --- /dev/null +++ b/diffmatchpatch/match_runes.go @@ -0,0 +1,167 @@ +// Copyright (c) 2012-2016 The go-diff authors. All rights reserved. +// +// Rune-based Match helpers — mirror MatchMain / MatchBitap / MatchAlphabet but +// operate on []rune instead of UTF-8 bytes. Lengths and positions are counted +// in runes (one Unicode code point each), which matches the reference JS +// diff-match-patch's UTF-16 code unit semantics for all BMP characters. +// +// Motivation: the existing byte-based Bitap on multi-byte text (CJK, em dash, +// emoji) uses len(text) (byte count), patches[].Length1 (byte count), and +// substring slicing by byte index. The reference JS implementation uses +// UTF-16 unit counts everywhere. When Go callers receive patch text emitted +// by a JS counterpart (whose Length/Start numbers are UTF-16 units), the +// byte-based Go path produces drift: +// - PatchSplitMax cuts CJK segments ~3x more finely (90 bytes vs 28 units), +// shortening Bitap patterns and weakening fuzzy match accuracy. +// - expected_loc = Start2 + delta drifts by N bytes for N multi-byte chars +// preceding the patch — fuzzy match's MatchDistance window may no longer +// cover the real anchor location. +// - MatchAlphabet keyed by byte conflates the three bytes of each CJK rune, +// scrambling the Bitap alphabet on dense CJK text. +// +// These functions provide a parallel rune-based path. Algorithm structure is +// preserved 1:1; only the length / indexing model differs. + +package diffmatchpatch + +import "math" + +// MatchMainRunes — rune-mirror of MatchMain. text and pattern are []rune. +// loc / return value are rune indices into text. 
func (dmp *DiffMatchPatch) MatchMainRunes(text, pattern []rune, loc int) int {
	// Clamp loc into [0, len(text)] so it can be used as a slice bound below.
	loc = int(math.Max(0, math.Min(float64(loc), float64(len(text)))))
	if runesEqual(text, pattern) {
		// Shortcut: reported as a match at index 0.
		// (runesEqual is defined elsewhere; presumably element-wise equality.)
		return 0
	} else if len(text) == 0 {
		// Nothing to search in.
		return -1
	} else if loc+len(pattern) <= len(text) && runesEqual(text[loc:loc+len(pattern)], pattern) {
		// The pattern sits exactly at the expected location; no fuzzy search
		// is needed. An empty pattern also lands here because text[loc:loc]
		// is empty.
		return loc
	}
	// Fall back to the fuzzy search.
	return dmp.MatchBitapRunes(text, pattern, loc)
}

// MatchBitapRunes — rune-mirror of MatchBitap.
//
// It searches text for the location closest to loc at which pattern matches
// with a score (see matchBitapScoreRunes) no greater than dmp.MatchThreshold.
// It returns the rune index of the best candidate found, or -1 when no
// candidate scores within the threshold.
//
// NOTE(review): the match state is kept in the bits of an int, so the pattern
// presumably has to fit within dmp.MatchMaxBits — confirm that callers
// guarantee this.
func (dmp *DiffMatchPatch) MatchBitapRunes(text, pattern []rune, loc int) int {
	// Per-rune bitmasks describing where each rune occurs in the pattern.
	s := dmp.MatchAlphabetRunes(pattern)

	// Highest score that is still acceptable; it only ever decreases.
	scoreThreshold := dmp.MatchThreshold
	// Tighten the threshold using exact occurrences, if any.
	// (runesIndexOf is defined elsewhere; presumably the first exact
	// occurrence at or after loc — confirm against its definition.)
	bestLoc := runesIndexOf(text, pattern, loc)
	if bestLoc != -1 {
		scoreThreshold = math.Min(dmp.matchBitapScoreRunes(0, bestLoc, loc, pattern), scoreThreshold)
		// Also look for an exact occurrence in the other direction.
		bestLoc = runesLastIndexBefore(text, pattern, loc+len(pattern))
		if bestLoc != -1 {
			scoreThreshold = math.Min(dmp.matchBitapScoreRunes(0, bestLoc, loc, pattern), scoreThreshold)
		}
	}

	// Bit len(pattern)-1: the bit MatchAlphabetRunes assigns to the first
	// rune of the pattern. The scan below reports a candidate whenever this
	// bit is set in rd[j].
	matchmask := 1 << uint(len(pattern)-1)
	// The exact-occurrence results above were only used to lower the
	// threshold; the search proper starts with no candidate.
	bestLoc = -1

	var binMin, binMid int
	binMax := len(pattern) + len(text)
	// Bit array produced by the previous error level.
	lastRd := []int{}
	// Each iteration of d allows one more error than the previous one.
	for d := 0; d < len(pattern); d++ {
		// Binary search for the largest distance from loc at which a match
		// with d errors would still score within scoreThreshold.
		binMin = 0
		binMid = binMax
		for binMin < binMid {
			if dmp.matchBitapScoreRunes(d, loc+binMid, loc, pattern) <= scoreThreshold {
				binMin = binMid
			} else {
				binMax = binMid
			}
			binMid = (binMax-binMin)/2 + binMin
		}
		// The distance found here becomes the upper bound for the next level.
		binMax = binMid
		// Window of positions scanned at this error level. Positions are
		// 1-based: position j corresponds to text[j-1].
		start := int(math.Max(1, float64(loc-binMid+1)))
		finish := int(math.Min(float64(loc+binMid), float64(len(text))) + float64(len(pattern)))

		rd := make([]int, finish+2)
		// Seed value just past the end of the window: the low d bits set.
		rd[finish+1] = (1 << uint(d)) - 1

		// Scan the window from its end back towards its start.
		for j := finish; j >= start; j-- {
			var charMatch int
			if len(text) <= j-1 {
				// Position lies beyond the end of text.
				charMatch = 0
			} else if _, ok := s[text[j-1]]; !ok {
				// This rune does not occur in the pattern.
				charMatch = 0
			} else {
				charMatch = s[text[j-1]]
			}

			if d == 0 {
				// First pass: only the exact-match transition.
				rd[j] = ((rd[j+1] << 1) | 1) & charMatch
			} else {
				// Later passes: the exact-match transition OR-ed with terms
				// taken from the previous error level (lastRd).
				rd[j] = ((rd[j+1]<<1)|1)&charMatch | (((lastRd[j+1] | lastRd[j]) << 1) | 1) | lastRd[j+1]
			}
			if (rd[j] & matchmask) != 0 {
				// Candidate match at text index j-1; keep it only if it
				// scores at least as well as the current threshold.
				score := dmp.matchBitapScoreRunes(d, j-1, loc, pattern)
				if score <= scoreThreshold {
					scoreThreshold = score
					bestLoc = j - 1
					if bestLoc > loc {
						// Still past loc: keep scanning, but no further on
						// the far side of loc than the distance just found.
						start = int(math.Max(1, float64(2*loc-bestLoc)))
					} else {
						// At or before loc: stop scanning this error level.
						break
					}
				}
			}
		}
		// Stop if even a match exactly at loc with one more error would
		// exceed the current threshold.
		if dmp.matchBitapScoreRunes(d+1, loc, loc, pattern) > scoreThreshold {
			break
		}
		lastRd = rd
	}
	return bestLoc
}

// matchBitapScoreRunes scores a candidate match with e errors at rune index x
// when the expected location is loc; lower is better. The score is the error
// ratio e/len(pattern) plus the distance |loc-x| divided by
// dmp.MatchDistance. When dmp.MatchDistance is 0, a candidate exactly at loc
// scores just its error ratio and any other candidate scores 1.0.
func (dmp *DiffMatchPatch) matchBitapScoreRunes(e, x, loc int, pattern []rune) float64 {
	accuracy := float64(e) / float64(len(pattern))
	proximity := math.Abs(float64(loc - x))
	if dmp.MatchDistance == 0 {
		// Avoid dividing by zero below.
		if proximity == 0 {
			return accuracy
		}
		return 1.0
	}
	return accuracy + (proximity / float64(dmp.MatchDistance))
}

// MatchAlphabetRunes — rune-mirror of MatchAlphabet. key is rune (not byte),
// so multi-byte chars get their own alphabet entry instead of conflating with
// their continuation bytes (the latter making sergi byte-Alphabet broken on CJK).
//
// For every rune in pattern the returned map holds a bitmask with bit
// len(pattern)-i-1 set for each index i at which that rune occurs.
func (dmp *DiffMatchPatch) MatchAlphabetRunes(pattern []rune) map[rune]int {
	s := map[rune]int{}
	// Make sure every rune of the pattern has an entry.
	for _, c := range pattern {
		if _, ok := s[c]; !ok {
			s[c] = 0
		}
	}
	// i tracks the index of c within pattern.
	i := 0
	for _, c := range pattern {
		s[c] |= int(uint(1) << uint(len(pattern)-i-1))
		i++
	}
	return s
}

// runesLastIndexBefore — rune-version of lastIndexOf (last occurrence of pattern
// in target ending at or before beforeLoc). sergi has runesIndex/runesIndexOf
// already; this completes the trio.
+func runesLastIndexBefore(target, pattern []rune, beforeLoc int) int { + if len(pattern) == 0 { + if beforeLoc > len(target) { + return len(target) + } + return beforeLoc + } + maxStart := beforeLoc - len(pattern) + if maxStart > len(target)-len(pattern) { + maxStart = len(target) - len(pattern) + } + for i := maxStart; i >= 0; i-- { + if runesEqual(target[i:i+len(pattern)], pattern) { + return i + } + } + return -1 +} diff --git a/diffmatchpatch/patch.go b/diffmatchpatch/patch.go index 0dbe3bd..651b290 100644 --- a/diffmatchpatch/patch.go +++ b/diffmatchpatch/patch.go @@ -16,8 +16,34 @@ import ( "regexp" "strconv" "strings" + "unicode/utf8" ) +// alignRuneStart backs up byte index idx to the nearest UTF-8 rune start byte. +// idx <= 0 or idx >= len(s) is returned unchanged. +// +// Motivation: Google's reference JS diff-match-patch uses string.substring +// (UTF-16 code unit) for all slicing — surrogate-pair-middle splits are +// well-defined. This Go port slices by UTF-8 byte index (text[a:b]). When +// MatchMain's Bitap returns a byte position that lands inside a multi-byte +// character (CJK 3B / em dash 3B / emoji 4B), the resulting slice is invalid +// UTF-8 — subsequent MatchMain calls miss anchors, and downstream byte slices +// can panic with "slice bounds out of range" (issue #132). +// +// Fix: align every byte slice index back to the nearest valid rune boundary +// before slicing. The operation costs at most 1-3 bytes of "shrinkage" per +// slice point but guarantees the resulting string is always valid UTF-8 and +// no MatchMain / DiffMain pass sees a continuation-byte-prefixed string. +func alignRuneStart(s string, idx int) int { + if idx <= 0 || idx >= len(s) { + return idx + } + for idx > 0 && !utf8.RuneStart(s[idx]) { + idx-- + } + return idx +} + // Patch represents one patch operation. 
type Patch struct { diffs []Diff @@ -91,12 +117,18 @@ func (dmp *DiffMatchPatch) PatchAddContext(patch Patch, text string) Patch { padding += dmp.PatchMargin // Add the prefix. - prefix := text[max(0, patch.Start2-padding):patch.Start2] + // rune-safety: align prefix start to nearest rune boundary so the slice + // never produces an invalid-UTF-8 string when the surrounding text contains + // multi-byte characters. + prefixStart := alignRuneStart(text, max(0, patch.Start2-padding)) + prefix := text[prefixStart:patch.Start2] if len(prefix) != 0 { patch.diffs = append([]Diff{Diff{DiffEqual, prefix}}, patch.diffs...) } // Add the suffix. - suffix := text[patch.Start2+patch.Length1 : min(len(text), patch.Start2+patch.Length1+padding)] + // rune-safety: align suffix end to nearest rune boundary. + suffixEnd := alignRuneStart(text, min(len(text), patch.Start2+patch.Length1+padding)) + suffix := text[patch.Start2+patch.Length1 : suffixEnd] if len(suffix) != 0 { patch.diffs = append(patch.diffs, Diff{DiffEqual, suffix}) } @@ -264,6 +296,13 @@ func (dmp *DiffMatchPatch) PatchApply(patches []Patch, text string) (string, []b } else { startLoc = dmp.MatchMain(text, text1, expectedLoc) } + // rune-safety: Bitap-returned byte position may land in the middle of a + // multi-byte character. Back it up to the nearest rune start so the + // subsequent text[startLoc:...] / text[:startLoc] slices never produce + // invalid UTF-8 (root cause of issue #132 panic). + if startLoc != -1 { + startLoc = alignRuneStart(text, startLoc) + } if startLoc == -1 { // No match found. 
:( results[x] = false @@ -274,14 +313,18 @@ func (dmp *DiffMatchPatch) PatchApply(patches []Patch, text string) (string, []b results[x] = true delta = startLoc - expectedLoc var text2 string + var text2End int if endLoc == -1 { - text2 = text[startLoc:int(math.Min(float64(startLoc+len(text1)), float64(len(text))))] + text2End = alignRuneStart(text, int(math.Min(float64(startLoc+len(text1)), float64(len(text))))) } else { - text2 = text[startLoc:int(math.Min(float64(endLoc+dmp.MatchMaxBits), float64(len(text))))] + endLoc = alignRuneStart(text, endLoc) + text2End = alignRuneStart(text, int(math.Min(float64(endLoc+dmp.MatchMaxBits), float64(len(text))))) } + text2 = text[startLoc:text2End] if text1 == text2 { // Perfect match, just shove the Replacement text in. - text = text[:startLoc] + dmp.DiffText2(aPatch.diffs) + text[startLoc+len(text1):] + replaceEnd := alignRuneStart(text, startLoc+len(text1)) + text = text[:startLoc] + dmp.DiffText2(aPatch.diffs) + text[replaceEnd:] } else { // Imperfect match. Run a diff to get a framework of equivalent indices. 
diffs := dmp.DiffMain(text1, text2, false) @@ -296,12 +339,16 @@ func (dmp *DiffMatchPatch) PatchApply(patches []Patch, text string) (string, []b index2 := dmp.DiffXIndex(diffs, index1) if aDiff.Type == DiffInsert { // Insertion - text = text[:startLoc+index2] + aDiff.Text + text[startLoc+index2:] + insertAt := alignRuneStart(text, startLoc+index2) + text = text[:insertAt] + aDiff.Text + text[insertAt:] } else if aDiff.Type == DiffDelete { // Deletion - startIndex := startLoc + index2 - text = text[:startIndex] + - text[startIndex+dmp.DiffXIndex(diffs, index1+len(aDiff.Text))-index2:] + startIndex := alignRuneStart(text, startLoc+index2) + endIndex := alignRuneStart(text, startLoc+dmp.DiffXIndex(diffs, index1+len(aDiff.Text))) + if endIndex < startIndex { + endIndex = startIndex + } + text = text[:startIndex] + text[endIndex:] } } if aDiff.Type != DiffDelete { @@ -416,7 +463,19 @@ func (dmp *DiffMatchPatch) PatchSplitMax(patches []Patch) []Patch { bigpatch.diffs = bigpatch.diffs[1:] } else { // Deletion or equality. Only take as much as we can stomach. - diffText = diffText[:min(len(diffText), patchSize-patch.Length1-dmp.PatchMargin)] + // rune-safety: align cutAt to nearest rune boundary so we never + // slice diffText through a multi-byte character (the resulting + // invalid UTF-8 is the root cause of issue #132 panic). + cutAt := min(len(diffText), patchSize-patch.Length1-dmp.PatchMargin) + cutAt = alignRuneStart(diffText, cutAt) + // Loop-progress guard: when alignment collapses cutAt to 0 but + // diffText is non-empty, advance by exactly one rune so the + // outer for-loop is guaranteed to consume input each iteration. 
+ if cutAt == 0 && len(diffText) > 0 { + _, size := utf8.DecodeRuneInString(diffText) + cutAt = size + } + diffText = diffText[:cutAt] patch.Length1 += len(diffText) Start1 += len(diffText) @@ -430,21 +489,26 @@ func (dmp *DiffMatchPatch) PatchSplitMax(patches []Patch) []Patch { if diffText == bigpatch.diffs[0].Text { bigpatch.diffs = bigpatch.diffs[1:] } else { - bigpatch.diffs[0].Text = - bigpatch.diffs[0].Text[len(diffText):] + // rune-safety: len(diffText) was already rune-aligned above, + // but align once more defensively for cutAt == 0 etc. + remStart := alignRuneStart(bigpatch.diffs[0].Text, len(diffText)) + bigpatch.diffs[0].Text = bigpatch.diffs[0].Text[remStart:] } } } // Compute the head context for the next patch. precontext = dmp.DiffText2(patch.diffs) - precontext = precontext[max(0, len(precontext)-dmp.PatchMargin):] + // rune-safety: align precontext start to nearest rune boundary. + precontext = precontext[alignRuneStart(precontext, max(0, len(precontext)-dmp.PatchMargin)):] postcontext := "" // Append the end context for this patch. - if len(dmp.DiffText1(bigpatch.diffs)) > dmp.PatchMargin { - postcontext = dmp.DiffText1(bigpatch.diffs)[:dmp.PatchMargin] + rawPost := dmp.DiffText1(bigpatch.diffs) + if len(rawPost) > dmp.PatchMargin { + // rune-safety: align before slicing to avoid mid-rune cut. + postcontext = rawPost[:alignRuneStart(rawPost, dmp.PatchMargin)] } else { - postcontext = dmp.DiffText1(bigpatch.diffs) + postcontext = rawPost } if len(postcontext) != 0 { diff --git a/diffmatchpatch/patch_rune_safety_test.go b/diffmatchpatch/patch_rune_safety_test.go new file mode 100644 index 0000000..11dbb4d --- /dev/null +++ b/diffmatchpatch/patch_rune_safety_test.go @@ -0,0 +1,134 @@ +package diffmatchpatch + +import ( + "strings" + "testing" +) + +// alignRuneStart should back any byte index inside a multi-byte UTF-8 +// character back to that character's first byte. 
+func TestAlignRuneStart(t *testing.T) { + // "abc中文def" — UTF-8: a=1 b=1 c=1 中=3(byte 3..5) 文=3(6..8) d=1 e=1 f=1 + s := "abc中文def" + cases := []struct { + in, want int + desc string + }{ + {0, 0, "0 unchanged"}, + {1, 1, "after 'a' already aligned"}, + {3, 3, "start of 中 aligned"}, + {4, 3, "middle of 中 backs to start"}, + {5, 3, "end-middle of 中 backs to start"}, + {6, 6, "start of 文 aligned"}, + {7, 6, "middle of 文 backs to start"}, + {9, 9, "after 文 = start of d, aligned"}, + {12, 12, "end-of-string"}, + {-1, -1, "negative unchanged"}, + {999, 999, "past end unchanged"}, + } + for _, c := range cases { + got := alignRuneStart(s, c.in) + if got != c.want { + t.Errorf("alignRuneStart(%q, %d) = %d, want %d (%s)", s, c.in, got, c.want, c.desc) + } + } +} + +// Issue #132 minimal repro: applying a patch containing a multi-byte +// character with a short anchor used to panic with "slice bounds out of range". +// After the rune-safe alignment, PatchApply should return cleanly (apply may +// fail with results[i]=false, but must never panic). +func TestPatchApply_Issue132_NoPanic(t *testing.T) { + dmp := New() + patches, err := dmp.PatchFromText("@@ -1,2 +1,3 @@\n %E2%98%9E \n+r\n") + if err != nil { + t.Fatalf("PatchFromText: %v", err) + } + defer func() { + if r := recover(); r != nil { + t.Fatalf("panic leaked: %v", r) + } + }() + _, results := dmp.PatchApply(patches, "☞ 𝗢𝗥𝗗𝗘𝗥 ") + if results == nil { + t.Fatalf("results nil") + } +} + +// PatchSplitMax on a CJK-heavy base must produce diff.Text slices that are +// all valid UTF-8 (prior to this fix, byte-level slicing of multi-byte +// characters could leave continuation-byte prefixes in the output). +func TestPatchSplitMax_NoInvalidUTF8(t *testing.T) { + dmp := New() + // CJK base larger than MatchMaxBits (32) forces PatchSplitMax to engage. 
+ old := "中文段落一" + strings.Repeat("汉字", 30) + "结束" + newText := "中文段落一" + strings.Repeat("汉字", 30) + "结尾" + patches := dmp.PatchMake(old, dmp.DiffMain(old, newText, false)) + if len(patches) == 0 { + t.Skip("no patches generated") + } + // PatchMake itself must produce valid-UTF-8 diff text (PatchAddContext's + // prefix/suffix slice now aligns to rune boundaries). + for i, p := range patches { + for j, d := range p.diffs { + if !isValidUTF8(d.Text) { + t.Errorf("PatchMake patch %d diff %d invalid UTF-8: %q", i, j, d.Text) + } + } + } + defer func() { + if r := recover(); r != nil { + t.Fatalf("PatchSplitMax panic: %v", r) + } + }() + // PatchSplitMax must preserve valid UTF-8 (precontext / postcontext / + // diffText cut points all rune-aligned). + for i, p := range dmp.PatchSplitMax(patches) { + for j, d := range p.diffs { + if !isValidUTF8(d.Text) { + t.Errorf("PatchSplitMax patch %d diff %d invalid UTF-8: %q type=%d", i, j, d.Text, d.Type) + } + } + } +} + +// minimal UTF-8 validity check inlined to avoid adding a stdlib import vs. +// the existing import block in patch.go (utf8 is already there for the fix). 
+func isValidUTF8(s string) bool { + for i := 0; i < len(s); { + r, sz := utf8DecodeFirst(s[i:]) + if r == 0xFFFD && sz == 1 { + return false + } + i += sz + } + return true +} + +func utf8DecodeFirst(s string) (rune, int) { + if len(s) == 0 { + return 0, 0 + } + b := s[0] + switch { + case b < 0x80: + return rune(b), 1 + case b < 0xc0: + return 0xFFFD, 1 + case b < 0xe0: + if len(s) < 2 { + return 0xFFFD, 1 + } + return rune(b&0x1f)<<6 | rune(s[1]&0x3f), 2 + case b < 0xf0: + if len(s) < 3 { + return 0xFFFD, 1 + } + return rune(b&0x0f)<<12 | rune(s[1]&0x3f)<<6 | rune(s[2]&0x3f), 3 + default: + if len(s) < 4 { + return 0xFFFD, 1 + } + return rune(b&0x07)<<18 | rune(s[1]&0x3f)<<12 | rune(s[2]&0x3f)<<6 | rune(s[3]&0x3f), 4 + } +} diff --git a/diffmatchpatch/patch_runes.go b/diffmatchpatch/patch_runes.go new file mode 100644 index 0000000..a1ebef7 --- /dev/null +++ b/diffmatchpatch/patch_runes.go @@ -0,0 +1,372 @@ +// Copyright (c) 2012-2016 The go-diff authors. All rights reserved. +// +// Rune-based Patch helpers — mirror Patch{Apply,SplitMax,AddPadding} and +// supporting Diff helpers but count length / slice by rune index instead of +// UTF-8 byte. See match_runes.go for motivation. +// +// API surface: +// - PatchApplyRunes(patches, text) ([]rune, []bool) — new public entry +// point. Callers convert their string to []rune before the call and +// string() the returned []rune. Patch.Start1/Start2/Length1/Length2 are +// interpreted as rune counts (matching the reference JS implementation's +// UTF-16 unit semantics for BMP characters). +// - The original PatchApply (byte-based) is unchanged. + +package diffmatchpatch + +import ( + "math" + "unicode/utf8" +) + +// runesEqualString — runes vs string equality, avoids materialising one side. +// Used in perfect-match shortcut in PatchApplyRunes (compare patch-source-text +// with extracted base segment). 
+func runesEqualString(rs []rune, s string) bool { + if len(s) != utf8.RuneCountInString(s) && len(rs) != utf8.RuneCountInString(s) { + // short-circuit length mismatch via rune count + } + i := 0 + for _, r := range s { + if i >= len(rs) || rs[i] != r { + return false + } + i++ + } + return i == len(rs) +} + +// DiffText1Runes returns the source text of diffs as a rune slice. +// (sergi's DiffText1 returns string concatenation by byte; we keep rune slices +// to avoid byte/rune conversions in hot loops.) +func (dmp *DiffMatchPatch) DiffText1Runes(diffs []Diff) []rune { + out := []rune{} + for _, d := range diffs { + if d.Type != DiffInsert { + out = append(out, []rune(d.Text)...) + } + } + return out +} + +func (dmp *DiffMatchPatch) DiffText2Runes(diffs []Diff) []rune { + out := []rune{} + for _, d := range diffs { + if d.Type != DiffDelete { + out = append(out, []rune(d.Text)...) + } + } + return out +} + +// DiffXIndexRunes — rune-mirror of DiffXIndex. loc is rune index in text1 +// (concatenation of equality+deletion diffs), returns rune index in text2. +func (dmp *DiffMatchPatch) DiffXIndexRunes(diffs []Diff, loc int) int { + chars1 := 0 + chars2 := 0 + lastChars1 := 0 + lastChars2 := 0 + var lastDiff Diff + for _, aDiff := range diffs { + runeLen := utf8.RuneCountInString(aDiff.Text) + if aDiff.Type != DiffInsert { + chars1 += runeLen + } + if aDiff.Type != DiffDelete { + chars2 += runeLen + } + if chars1 > loc { + lastDiff = aDiff + break + } + lastChars1 = chars1 + lastChars2 = chars2 + } + if lastDiff.Type == DiffDelete { + return lastChars2 + } + return lastChars2 + (loc - lastChars1) +} + +// PatchAddPaddingRunes — rune-mirror of PatchAddPadding. Returns padding as +// []rune (each ASCII control char 0x01..0x04 is 1 rune == 1 UTF-16 unit). +// All patches' Start1/Start2 and length fields are bumped in rune units. 
+func (dmp *DiffMatchPatch) PatchAddPaddingRunes(patches []Patch) []rune { + paddingLength := dmp.PatchMargin + nullPadding := make([]rune, 0, paddingLength) + for x := 1; x <= paddingLength; x++ { + nullPadding = append(nullPadding, rune(x)) + } + + for i := range patches { + patches[i].Start1 += paddingLength + patches[i].Start2 += paddingLength + } + + // Add some padding on start of first diff. + if len(patches[0].diffs) == 0 || patches[0].diffs[0].Type != DiffEqual { + patches[0].diffs = append([]Diff{{DiffEqual, string(nullPadding)}}, patches[0].diffs...) + patches[0].Start1 -= paddingLength + patches[0].Start2 -= paddingLength + patches[0].Length1 += paddingLength + patches[0].Length2 += paddingLength + } else { + firstRunes := utf8.RuneCountInString(patches[0].diffs[0].Text) + if paddingLength > firstRunes { + extra := paddingLength - firstRunes + patches[0].diffs[0].Text = string(nullPadding[firstRunes:]) + patches[0].diffs[0].Text + patches[0].Start1 -= extra + patches[0].Start2 -= extra + patches[0].Length1 += extra + patches[0].Length2 += extra + } + } + + last := len(patches) - 1 + if len(patches[last].diffs) == 0 || patches[last].diffs[len(patches[last].diffs)-1].Type != DiffEqual { + patches[last].diffs = append(patches[last].diffs, Diff{DiffEqual, string(nullPadding)}) + patches[last].Length1 += paddingLength + patches[last].Length2 += paddingLength + } else { + lastDiff := patches[last].diffs[len(patches[last].diffs)-1] + lastRunes := utf8.RuneCountInString(lastDiff.Text) + if paddingLength > lastRunes { + extra := paddingLength - lastRunes + patches[last].diffs[len(patches[last].diffs)-1].Text += string(nullPadding[:extra]) + patches[last].Length1 += extra + patches[last].Length2 += extra + } + } + + return nullPadding +} + +// PatchSplitMaxRunes — rune-mirror of PatchSplitMax. patch_size / +// Patch_Margin compared against rune count, diffText sliced by rune index. 
+// This is the key fix vs sergi byte-mode: a 30-CJK-char patch is no longer +// chopped into ~9-char tiny segments (which weakens Bitap pattern strength). +func (dmp *DiffMatchPatch) PatchSplitMaxRunes(patches []Patch) []Patch { + patchSize := dmp.MatchMaxBits + for x := 0; x < len(patches); x++ { + if patches[x].Length1 <= patchSize { + continue + } + bigpatch := patches[x] + patches = append(patches[:x], patches[x+1:]...) + x-- + + Start1 := bigpatch.Start1 + Start2 := bigpatch.Start2 + precontext := []rune{} + for len(bigpatch.diffs) != 0 { + patch := Patch{} + empty := true + patch.Start1 = Start1 - len(precontext) + patch.Start2 = Start2 - len(precontext) + if len(precontext) != 0 { + patch.Length1 = len(precontext) + patch.Length2 = len(precontext) + patch.diffs = append(patch.diffs, Diff{DiffEqual, string(precontext)}) + } + for len(bigpatch.diffs) != 0 && patch.Length1 < patchSize-dmp.PatchMargin { + diffType := bigpatch.diffs[0].Type + diffRunes := []rune(bigpatch.diffs[0].Text) + diffRuneLen := len(diffRunes) + if diffType == DiffInsert { + patch.Length2 += diffRuneLen + Start2 += diffRuneLen + patch.diffs = append(patch.diffs, bigpatch.diffs[0]) + bigpatch.diffs = bigpatch.diffs[1:] + empty = false + } else if diffType == DiffDelete && len(patch.diffs) == 1 && patch.diffs[0].Type == DiffEqual && diffRuneLen > 2*patchSize { + patch.Length1 += diffRuneLen + Start1 += diffRuneLen + empty = false + patch.diffs = append(patch.diffs, Diff{diffType, bigpatch.diffs[0].Text}) + bigpatch.diffs = bigpatch.diffs[1:] + } else { + cutAt := patchSize - patch.Length1 - dmp.PatchMargin + if cutAt > diffRuneLen { + cutAt = diffRuneLen + } + if cutAt < 0 { + cutAt = 0 + } + // loop-progress invariant: cutAt == 0 时强制吃 1 rune + if cutAt == 0 && diffRuneLen > 0 { + cutAt = 1 + } + taken := diffRunes[:cutAt] + takenStr := string(taken) + takenRuneLen := len(taken) + + patch.Length1 += takenRuneLen + Start1 += takenRuneLen + if diffType == DiffEqual { + patch.Length2 += 
takenRuneLen + Start2 += takenRuneLen + } else { + empty = false + } + patch.diffs = append(patch.diffs, Diff{diffType, takenStr}) + if takenRuneLen == diffRuneLen { + bigpatch.diffs = bigpatch.diffs[1:] + } else { + bigpatch.diffs[0].Text = string(diffRunes[cutAt:]) + } + } + } + + precontext = dmp.DiffText2Runes(patch.diffs) + if len(precontext) > dmp.PatchMargin { + precontext = precontext[len(precontext)-dmp.PatchMargin:] + } + + rawPost := dmp.DiffText1Runes(bigpatch.diffs) + var postcontext []rune + if len(rawPost) > dmp.PatchMargin { + postcontext = rawPost[:dmp.PatchMargin] + } else { + postcontext = rawPost + } + if len(postcontext) != 0 { + patch.Length1 += len(postcontext) + patch.Length2 += len(postcontext) + if len(patch.diffs) != 0 && patch.diffs[len(patch.diffs)-1].Type == DiffEqual { + patch.diffs[len(patch.diffs)-1].Text += string(postcontext) + } else { + patch.diffs = append(patch.diffs, Diff{DiffEqual, string(postcontext)}) + } + } + if !empty { + x++ + patches = append(patches[:x], append([]Patch{patch}, patches[x:]...)...) + } + } + } + return patches +} + +// PatchApplyRunes — rune-mirror of PatchApply. text is rune slice; +// Patch.Start1/Start2/Length1/Length2 treated as rune counts (== UTF-16 unit +// for BMP chars, matching Google JS DMP wire format). +// +// Behavior parity with PatchApply preserved at structure level. The +// difference is that all length comparisons, slice indices, MatchMain calls, +// and substring extractions operate in rune space, eliminating the byte / +// UTF-16-unit mismatch that causes silent fuzzy drift on multi-byte text. +func (dmp *DiffMatchPatch) PatchApplyRunes(patches []Patch, text []rune) ([]rune, []bool) { + if len(patches) == 0 { + return text, []bool{} + } + + patches = dmp.PatchDeepCopy(patches) + + nullPadding := dmp.PatchAddPaddingRunes(patches) + padded := make([]rune, 0, len(text)+2*len(nullPadding)) + padded = append(padded, nullPadding...) + padded = append(padded, text...) 
+ padded = append(padded, nullPadding...) + text = padded + + patches = dmp.PatchSplitMaxRunes(patches) + + x := 0 + delta := 0 + results := make([]bool, len(patches)) + for _, aPatch := range patches { + expectedLoc := aPatch.Start2 + delta + text1 := dmp.DiffText1Runes(aPatch.diffs) + var startLoc int + endLoc := -1 + if len(text1) > dmp.MatchMaxBits { + startLoc = dmp.MatchMainRunes(text, text1[:dmp.MatchMaxBits], expectedLoc) + if startLoc != -1 { + endLoc = dmp.MatchMainRunes(text, + text1[len(text1)-dmp.MatchMaxBits:], expectedLoc+len(text1)-dmp.MatchMaxBits) + if endLoc == -1 || startLoc >= endLoc { + startLoc = -1 + } + } + } else { + startLoc = dmp.MatchMainRunes(text, text1, expectedLoc) + } + if startLoc == -1 { + results[x] = false + delta -= aPatch.Length2 - aPatch.Length1 + } else { + results[x] = true + delta = startLoc - expectedLoc + var text2 []rune + if endLoc == -1 { + end := int(math.Min(float64(startLoc+len(text1)), float64(len(text)))) + text2 = text[startLoc:end] + } else { + end := int(math.Min(float64(endLoc+dmp.MatchMaxBits), float64(len(text)))) + text2 = text[startLoc:end] + } + if runesEqual(text1, text2) { + replacement := dmp.DiffText2Runes(aPatch.diffs) + newText := make([]rune, 0, len(text)+len(replacement)-len(text1)) + newText = append(newText, text[:startLoc]...) + newText = append(newText, replacement...) + newText = append(newText, text[startLoc+len(text1):]...) 
+ text = newText + } else { + diffs := dmp.DiffMain(string(text1), string(text2), false) + if len(text1) > dmp.MatchMaxBits && float64(dmp.DiffLevenshtein(diffs))/float64(len(text1)) > dmp.PatchDeleteThreshold { + results[x] = false + } else { + diffs = dmp.DiffCleanupSemanticLossless(diffs) + index1 := 0 + for _, aDiff := range aPatch.diffs { + if aDiff.Type != DiffEqual { + index2 := dmp.DiffXIndexRunes(diffs, index1) + if aDiff.Type == DiffInsert { + insertRunes := []rune(aDiff.Text) + insertAt := startLoc + index2 + if insertAt < 0 { + insertAt = 0 + } + if insertAt > len(text) { + insertAt = len(text) + } + newText := make([]rune, 0, len(text)+len(insertRunes)) + newText = append(newText, text[:insertAt]...) + newText = append(newText, insertRunes...) + newText = append(newText, text[insertAt:]...) + text = newText + } else if aDiff.Type == DiffDelete { + delLen := utf8.RuneCountInString(aDiff.Text) + startIdx := startLoc + index2 + endIdx := startLoc + dmp.DiffXIndexRunes(diffs, index1+delLen) + if startIdx < 0 { + startIdx = 0 + } + if endIdx > len(text) { + endIdx = len(text) + } + if endIdx < startIdx { + endIdx = startIdx + } + newText := make([]rune, 0, len(text)-(endIdx-startIdx)) + newText = append(newText, text[:startIdx]...) + newText = append(newText, text[endIdx:]...) 
+ text = newText + } + } + if aDiff.Type != DiffDelete { + index1 += utf8.RuneCountInString(aDiff.Text) + } + } + } + } + } + x++ + } + // strip padding + if len(text) >= 2*len(nullPadding) { + text = text[len(nullPadding) : len(text)-len(nullPadding)] + } + return text, results +} diff --git a/diffmatchpatch/patch_runes_test.go b/diffmatchpatch/patch_runes_test.go new file mode 100644 index 0000000..9dd13cc --- /dev/null +++ b/diffmatchpatch/patch_runes_test.go @@ -0,0 +1,90 @@ +package diffmatchpatch + +import ( + "strings" + "testing" +) + +// PatchApplyRunes 在含中文 base 上的端到端验证: PatchMake 生成的 patch 经 PatchToText +// → PatchFromText round-trip 后,通过 PatchApplyRunes 应字节级还原 target。 +// 这条路径在 byte-based PatchApply 上由于 PatchSplitMax 把 CJK 段切碎 + Bitap +// alphabet 误把 UTF-8 continuation byte 当独立 char,fuzzy 容错弱,极端 case 422。 +func TestPatchApplyRunes_RoundTripCJK(t *testing.T) { + dmp := New() + old := "中文段落" + strings.Repeat("汉字", 30) + "结束" + new := "中文段落" + strings.Repeat("汉字", 30) + "新结尾添加更多内容" + + patches := dmp.PatchMake(old, dmp.DiffMain(old, new, false)) + patchText := dmp.PatchToText(patches) + parsed, err := dmp.PatchFromText(patchText) + if err != nil { + t.Fatalf("PatchFromText: %v", err) + } + + gotRunes, applied := dmp.PatchApplyRunes(parsed, []rune(old)) + for i, ok := range applied { + if !ok { + t.Errorf("patch %d apply failed", i) + } + } + got := string(gotRunes) + if got != new { + t.Errorf("rune-apply mismatch:\n got: %q\n want: %q", got, new) + } +} + +// 纯 ASCII 场景: PatchApplyRunes 应该跟 PatchApply (byte-mode) 字节级等价。 +func TestPatchApplyRunes_ASCII_EquivToByteApply(t *testing.T) { + dmp := New() + old := strings.Repeat("the quick brown fox jumps over the lazy dog.\n", 5) + new := strings.Repeat("the SLOW brown fox JUMPS over the lazy dog.\n", 5) + patches := dmp.PatchMake(old, dmp.DiffMain(old, new, false)) + + gotByte, _ := dmp.PatchApply(patches, old) + gotRunes, _ := dmp.PatchApplyRunes(patches, []rune(old)) + if string(gotRunes) != gotByte { + 
t.Errorf("ASCII path divergence:\n byte: %q\n rune: %q", gotByte, string(gotRunes)) + } + if string(gotRunes) != new { + t.Errorf("rune-apply mismatch:\n got: %q\n want: %q", string(gotRunes), new) + } +} + +// MatchMainRunes 单元: rune index 命中 (跟 MatchMain byte index 命中对照)。 +func TestMatchMainRunes_BasicCJK(t *testing.T) { + dmp := New() + text := []rune("前缀文本 abc 中文锚点 def 后缀") + pattern := []rune("中文锚点") + loc := 10 // rune index, near where the pattern is + got := dmp.MatchMainRunes(text, pattern, loc) + // 期望命中 "中文锚点" 在 text 中的 rune index + want := -1 + for i := 0; i+len(pattern) <= len(text); i++ { + match := true + for j := range pattern { + if text[i+j] != pattern[j] { + match = false + break + } + } + if match { + want = i + break + } + } + if got != want { + t.Errorf("MatchMainRunes = %d, want %d", got, want) + } +} + +// DiffXIndexRunes 单元: 计 rune index 不是 byte index。 +func TestDiffXIndexRunes_CountsRunes(t *testing.T) { + dmp := New() + // 用 DiffMain 构造一个 diffs 列表 (含中文) + diffs := dmp.DiffMain("中文 hello", "中文 world", false) + // loc=3 (rune index past "中文 ") 应映射到 target 同位置 (相同 prefix) + got := dmp.DiffXIndexRunes(diffs, 3) + if got != 3 { + t.Errorf("DiffXIndexRunes(loc=3) = %d, want 3 (rune-aligned prefix length)", got) + } +}