difflib.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423
  1. /*Package difflib is a partial port of Python difflib module.
  2. Original source: https://github.com/pmezard/go-difflib
  3. This file is trimmed to only the parts used by this repository.
  4. */
  5. package difflib // import "gotest.tools/internal/difflib"
  6. func min(a, b int) int {
  7. if a < b {
  8. return a
  9. }
  10. return b
  11. }
  12. func max(a, b int) int {
  13. if a > b {
  14. return a
  15. }
  16. return b
  17. }
  18. // Match stores line numbers of size of match
  19. type Match struct {
  20. A int
  21. B int
  22. Size int
  23. }
  24. // OpCode identifies the type of diff
  25. type OpCode struct {
  26. Tag byte
  27. I1 int
  28. I2 int
  29. J1 int
  30. J2 int
  31. }
  32. // SequenceMatcher compares sequence of strings. The basic
  33. // algorithm predates, and is a little fancier than, an algorithm
  34. // published in the late 1980's by Ratcliff and Obershelp under the
  35. // hyperbolic name "gestalt pattern matching". The basic idea is to find
  36. // the longest contiguous matching subsequence that contains no "junk"
  37. // elements (R-O doesn't address junk). The same idea is then applied
  38. // recursively to the pieces of the sequences to the left and to the right
  39. // of the matching subsequence. This does not yield minimal edit
  40. // sequences, but does tend to yield matches that "look right" to people.
  41. //
  42. // SequenceMatcher tries to compute a "human-friendly diff" between two
  43. // sequences. Unlike e.g. UNIX(tm) diff, the fundamental notion is the
  44. // longest *contiguous* & junk-free matching subsequence. That's what
  45. // catches peoples' eyes. The Windows(tm) windiff has another interesting
  46. // notion, pairing up elements that appear uniquely in each sequence.
  47. // That, and the method here, appear to yield more intuitive difference
  48. // reports than does diff. This method appears to be the least vulnerable
  49. // to synching up on blocks of "junk lines", though (like blank lines in
  50. // ordinary text files, or maybe "<P>" lines in HTML files). That may be
  51. // because this is the only method of the 3 that has a *concept* of
  52. // "junk" <wink>.
  53. //
  54. // Timing: Basic R-O is cubic time worst case and quadratic time expected
  55. // case. SequenceMatcher is quadratic time for the worst case and has
  56. // expected-case behavior dependent in a complicated way on how many
  57. // elements the sequences have in common; best case time is linear.
  58. type SequenceMatcher struct {
  59. a []string
  60. b []string
  61. b2j map[string][]int
  62. IsJunk func(string) bool
  63. autoJunk bool
  64. bJunk map[string]struct{}
  65. matchingBlocks []Match
  66. fullBCount map[string]int
  67. bPopular map[string]struct{}
  68. opCodes []OpCode
  69. }
  70. // NewMatcher returns a new SequenceMatcher
  71. func NewMatcher(a, b []string) *SequenceMatcher {
  72. m := SequenceMatcher{autoJunk: true}
  73. m.SetSeqs(a, b)
  74. return &m
  75. }
  76. // SetSeqs sets two sequences to be compared.
  77. func (m *SequenceMatcher) SetSeqs(a, b []string) {
  78. m.SetSeq1(a)
  79. m.SetSeq2(b)
  80. }
  81. // SetSeq1 sets the first sequence to be compared. The second sequence to be compared is
  82. // not changed.
  83. //
  84. // SequenceMatcher computes and caches detailed information about the second
  85. // sequence, so if you want to compare one sequence S against many sequences,
  86. // use .SetSeq2(s) once and call .SetSeq1(x) repeatedly for each of the other
  87. // sequences.
  88. //
  89. // See also SetSeqs() and SetSeq2().
  90. func (m *SequenceMatcher) SetSeq1(a []string) {
  91. if &a == &m.a {
  92. return
  93. }
  94. m.a = a
  95. m.matchingBlocks = nil
  96. m.opCodes = nil
  97. }
  98. // SetSeq2 sets the second sequence to be compared. The first sequence to be compared is
  99. // not changed.
  100. func (m *SequenceMatcher) SetSeq2(b []string) {
  101. if &b == &m.b {
  102. return
  103. }
  104. m.b = b
  105. m.matchingBlocks = nil
  106. m.opCodes = nil
  107. m.fullBCount = nil
  108. m.chainB()
  109. }
  110. func (m *SequenceMatcher) chainB() {
  111. // Populate line -> index mapping
  112. b2j := map[string][]int{}
  113. for i, s := range m.b {
  114. indices := b2j[s]
  115. indices = append(indices, i)
  116. b2j[s] = indices
  117. }
  118. // Purge junk elements
  119. m.bJunk = map[string]struct{}{}
  120. if m.IsJunk != nil {
  121. junk := m.bJunk
  122. for s := range b2j {
  123. if m.IsJunk(s) {
  124. junk[s] = struct{}{}
  125. }
  126. }
  127. for s := range junk {
  128. delete(b2j, s)
  129. }
  130. }
  131. // Purge remaining popular elements
  132. popular := map[string]struct{}{}
  133. n := len(m.b)
  134. if m.autoJunk && n >= 200 {
  135. ntest := n/100 + 1
  136. for s, indices := range b2j {
  137. if len(indices) > ntest {
  138. popular[s] = struct{}{}
  139. }
  140. }
  141. for s := range popular {
  142. delete(b2j, s)
  143. }
  144. }
  145. m.bPopular = popular
  146. m.b2j = b2j
  147. }
  148. func (m *SequenceMatcher) isBJunk(s string) bool {
  149. _, ok := m.bJunk[s]
  150. return ok
  151. }
  152. // Find longest matching block in a[alo:ahi] and b[blo:bhi].
  153. //
  154. // If IsJunk is not defined:
  155. //
  156. // Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
  157. // alo <= i <= i+k <= ahi
  158. // blo <= j <= j+k <= bhi
  159. // and for all (i',j',k') meeting those conditions,
  160. // k >= k'
  161. // i <= i'
  162. // and if i == i', j <= j'
  163. //
  164. // In other words, of all maximal matching blocks, return one that
  165. // starts earliest in a, and of all those maximal matching blocks that
  166. // start earliest in a, return the one that starts earliest in b.
  167. //
  168. // If IsJunk is defined, first the longest matching block is
  169. // determined as above, but with the additional restriction that no
  170. // junk element appears in the block. Then that block is extended as
  171. // far as possible by matching (only) junk elements on both sides. So
  172. // the resulting block never matches on junk except as identical junk
  173. // happens to be adjacent to an "interesting" match.
  174. //
  175. // If no blocks match, return (alo, blo, 0).
  176. func (m *SequenceMatcher) findLongestMatch(alo, ahi, blo, bhi int) Match {
  177. // CAUTION: stripping common prefix or suffix would be incorrect.
  178. // E.g.,
  179. // ab
  180. // acab
  181. // Longest matching block is "ab", but if common prefix is
  182. // stripped, it's "a" (tied with "b"). UNIX(tm) diff does so
  183. // strip, so ends up claiming that ab is changed to acab by
  184. // inserting "ca" in the middle. That's minimal but unintuitive:
  185. // "it's obvious" that someone inserted "ac" at the front.
  186. // Windiff ends up at the same place as diff, but by pairing up
  187. // the unique 'b's and then matching the first two 'a's.
  188. besti, bestj, bestsize := alo, blo, 0
  189. // find longest junk-free match
  190. // during an iteration of the loop, j2len[j] = length of longest
  191. // junk-free match ending with a[i-1] and b[j]
  192. j2len := map[int]int{}
  193. for i := alo; i != ahi; i++ {
  194. // look at all instances of a[i] in b; note that because
  195. // b2j has no junk keys, the loop is skipped if a[i] is junk
  196. newj2len := map[int]int{}
  197. for _, j := range m.b2j[m.a[i]] {
  198. // a[i] matches b[j]
  199. if j < blo {
  200. continue
  201. }
  202. if j >= bhi {
  203. break
  204. }
  205. k := j2len[j-1] + 1
  206. newj2len[j] = k
  207. if k > bestsize {
  208. besti, bestj, bestsize = i-k+1, j-k+1, k
  209. }
  210. }
  211. j2len = newj2len
  212. }
  213. // Extend the best by non-junk elements on each end. In particular,
  214. // "popular" non-junk elements aren't in b2j, which greatly speeds
  215. // the inner loop above, but also means "the best" match so far
  216. // doesn't contain any junk *or* popular non-junk elements.
  217. for besti > alo && bestj > blo && !m.isBJunk(m.b[bestj-1]) &&
  218. m.a[besti-1] == m.b[bestj-1] {
  219. besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
  220. }
  221. for besti+bestsize < ahi && bestj+bestsize < bhi &&
  222. !m.isBJunk(m.b[bestj+bestsize]) &&
  223. m.a[besti+bestsize] == m.b[bestj+bestsize] {
  224. bestsize += 1
  225. }
  226. // Now that we have a wholly interesting match (albeit possibly
  227. // empty!), we may as well suck up the matching junk on each
  228. // side of it too. Can't think of a good reason not to, and it
  229. // saves post-processing the (possibly considerable) expense of
  230. // figuring out what to do with it. In the case of an empty
  231. // interesting match, this is clearly the right thing to do,
  232. // because no other kind of match is possible in the regions.
  233. for besti > alo && bestj > blo && m.isBJunk(m.b[bestj-1]) &&
  234. m.a[besti-1] == m.b[bestj-1] {
  235. besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
  236. }
  237. for besti+bestsize < ahi && bestj+bestsize < bhi &&
  238. m.isBJunk(m.b[bestj+bestsize]) &&
  239. m.a[besti+bestsize] == m.b[bestj+bestsize] {
  240. bestsize += 1
  241. }
  242. return Match{A: besti, B: bestj, Size: bestsize}
  243. }
  244. // GetMatchingBlocks returns a list of triples describing matching subsequences.
  245. //
  246. // Each triple is of the form (i, j, n), and means that
  247. // a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in
  248. // i and in j. It's also guaranteed that if (i, j, n) and (i', j', n') are
  249. // adjacent triples in the list, and the second is not the last triple in the
  250. // list, then i+n != i' or j+n != j'. IOW, adjacent triples never describe
  251. // adjacent equal blocks.
  252. //
  253. // The last triple is a dummy, (len(a), len(b), 0), and is the only
  254. // triple with n==0.
  255. func (m *SequenceMatcher) GetMatchingBlocks() []Match {
  256. if m.matchingBlocks != nil {
  257. return m.matchingBlocks
  258. }
  259. var matchBlocks func(alo, ahi, blo, bhi int, matched []Match) []Match
  260. matchBlocks = func(alo, ahi, blo, bhi int, matched []Match) []Match {
  261. match := m.findLongestMatch(alo, ahi, blo, bhi)
  262. i, j, k := match.A, match.B, match.Size
  263. if match.Size > 0 {
  264. if alo < i && blo < j {
  265. matched = matchBlocks(alo, i, blo, j, matched)
  266. }
  267. matched = append(matched, match)
  268. if i+k < ahi && j+k < bhi {
  269. matched = matchBlocks(i+k, ahi, j+k, bhi, matched)
  270. }
  271. }
  272. return matched
  273. }
  274. matched := matchBlocks(0, len(m.a), 0, len(m.b), nil)
  275. // It's possible that we have adjacent equal blocks in the
  276. // matching_blocks list now.
  277. nonAdjacent := []Match{}
  278. i1, j1, k1 := 0, 0, 0
  279. for _, b := range matched {
  280. // Is this block adjacent to i1, j1, k1?
  281. i2, j2, k2 := b.A, b.B, b.Size
  282. if i1+k1 == i2 && j1+k1 == j2 {
  283. // Yes, so collapse them -- this just increases the length of
  284. // the first block by the length of the second, and the first
  285. // block so lengthened remains the block to compare against.
  286. k1 += k2
  287. } else {
  288. // Not adjacent. Remember the first block (k1==0 means it's
  289. // the dummy we started with), and make the second block the
  290. // new block to compare against.
  291. if k1 > 0 {
  292. nonAdjacent = append(nonAdjacent, Match{i1, j1, k1})
  293. }
  294. i1, j1, k1 = i2, j2, k2
  295. }
  296. }
  297. if k1 > 0 {
  298. nonAdjacent = append(nonAdjacent, Match{i1, j1, k1})
  299. }
  300. nonAdjacent = append(nonAdjacent, Match{len(m.a), len(m.b), 0})
  301. m.matchingBlocks = nonAdjacent
  302. return m.matchingBlocks
  303. }
  304. // GetOpCodes returns a list of 5-tuples describing how to turn a into b.
  305. //
  306. // Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple
  307. // has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
  308. // tuple preceding it, and likewise for j1 == the previous j2.
  309. //
  310. // The tags are characters, with these meanings:
  311. //
  312. // 'r' (replace): a[i1:i2] should be replaced by b[j1:j2]
  313. //
  314. // 'd' (delete): a[i1:i2] should be deleted, j1==j2 in this case.
  315. //
  316. // 'i' (insert): b[j1:j2] should be inserted at a[i1:i1], i1==i2 in this case.
  317. //
  318. // 'e' (equal): a[i1:i2] == b[j1:j2]
  319. func (m *SequenceMatcher) GetOpCodes() []OpCode {
  320. if m.opCodes != nil {
  321. return m.opCodes
  322. }
  323. i, j := 0, 0
  324. matching := m.GetMatchingBlocks()
  325. opCodes := make([]OpCode, 0, len(matching))
  326. for _, m := range matching {
  327. // invariant: we've pumped out correct diffs to change
  328. // a[:i] into b[:j], and the next matching block is
  329. // a[ai:ai+size] == b[bj:bj+size]. So we need to pump
  330. // out a diff to change a[i:ai] into b[j:bj], pump out
  331. // the matching block, and move (i,j) beyond the match
  332. ai, bj, size := m.A, m.B, m.Size
  333. tag := byte(0)
  334. if i < ai && j < bj {
  335. tag = 'r'
  336. } else if i < ai {
  337. tag = 'd'
  338. } else if j < bj {
  339. tag = 'i'
  340. }
  341. if tag > 0 {
  342. opCodes = append(opCodes, OpCode{tag, i, ai, j, bj})
  343. }
  344. i, j = ai+size, bj+size
  345. // the list of matching blocks is terminated by a
  346. // sentinel with size 0
  347. if size > 0 {
  348. opCodes = append(opCodes, OpCode{'e', ai, i, bj, j})
  349. }
  350. }
  351. m.opCodes = opCodes
  352. return m.opCodes
  353. }
  354. // GetGroupedOpCodes isolates change clusters by eliminating ranges with no changes.
  355. //
  356. // Return a generator of groups with up to n lines of context.
  357. // Each group is in the same format as returned by GetOpCodes().
  358. func (m *SequenceMatcher) GetGroupedOpCodes(n int) [][]OpCode {
  359. if n < 0 {
  360. n = 3
  361. }
  362. codes := m.GetOpCodes()
  363. if len(codes) == 0 {
  364. codes = []OpCode{{'e', 0, 1, 0, 1}}
  365. }
  366. // Fixup leading and trailing groups if they show no changes.
  367. if codes[0].Tag == 'e' {
  368. c := codes[0]
  369. i1, i2, j1, j2 := c.I1, c.I2, c.J1, c.J2
  370. codes[0] = OpCode{c.Tag, max(i1, i2-n), i2, max(j1, j2-n), j2}
  371. }
  372. if codes[len(codes)-1].Tag == 'e' {
  373. c := codes[len(codes)-1]
  374. i1, i2, j1, j2 := c.I1, c.I2, c.J1, c.J2
  375. codes[len(codes)-1] = OpCode{c.Tag, i1, min(i2, i1+n), j1, min(j2, j1+n)}
  376. }
  377. nn := n + n
  378. groups := [][]OpCode{}
  379. group := []OpCode{}
  380. for _, c := range codes {
  381. i1, i2, j1, j2 := c.I1, c.I2, c.J1, c.J2
  382. // End the current group and start a new one whenever
  383. // there is a large range with no changes.
  384. if c.Tag == 'e' && i2-i1 > nn {
  385. group = append(group, OpCode{c.Tag, i1, min(i2, i1+n),
  386. j1, min(j2, j1+n)})
  387. groups = append(groups, group)
  388. group = []OpCode{}
  389. i1, j1 = max(i1, i2-n), max(j1, j2-n)
  390. }
  391. group = append(group, OpCode{c.Tag, i1, i2, j1, j2})
  392. }
  393. if len(group) > 0 && !(len(group) == 1 && group[0].Tag == 'e') {
  394. groups = append(groups, group)
  395. }
  396. return groups
  397. }