Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Choose `diff-sequences` for diff algorithm
Summary: Before this change, there are 3 Myers diff algorithms used in the dependency tree: - diff-match-patch (1.0.5) - diff (4.0.1) - diff-sequences (via jest -> jest-diff -> diff-sequences) We'd like to simplify the dependency tree. The short answer is: - Use `diff-sequences`, or `jest-diff` which uses `diff-sequences` internally. For best performance, do: - Strip common prefix and suffix. - Make line comparison O(1), avoid `line1 === line2` which can be O(line length). - Consider skipping "cleanup" in `jest-diff` for long input. ---- Long answer of picking a diff library: I wrote a benchmark script to get some idea about their performance: const fs = require('fs') const dmp = new (require('diff-match-patch').diff_match_patch)(); const diff = require('diff'); const ds = require('diff-sequences').default; const jd = require('jest-diff'); dmp.Diff_Timeout = 120; // Diff functions. Output format: Chunk[] // Chunk is one of: // [0, n]: n common lines (same on both side) // [-1, n]: n left-side-only lines // [1, n]: n right-side-only lines function diff1(chars1, chars2) { return dmp.diff_main(chars1, chars2).map(v => [v[0], v[1].length]); } function diff1a(chars1, chars2) { return dmp.diff_main(chars1, chars2, false).map(v => [v[0], v[1].length]); } function diff2(chars1, chars2) { return diff.diffChars(chars1, chars2).map(v => { const d = v.added ? 1 : (v.removed ? 
-1 : 0); return [d, v.count]; }); } function diff3(chars1, chars2) { function isCommon(ai, bi) { return chars1[ai] == chars2[bi]; } const r = []; let lastA = 0, lastB = 0; function foundSequence(n, na, nb) { if (na > lastA) { r.push([-1, na - lastA]); lastA = na; } if (nb > lastB) { r.push([1, nb - lastB]); lastB = nb; } if (n > 0) { r.push([0, n]); lastA += n; lastB += n; } } ds(chars1.length, chars2.length, isCommon, foundSequence); foundSequence(0, chars1.length, chars2.length); return r; } function diff3a(chars1, chars2) { return jd.diffStringsRaw(chars1, chars2, false).map((d) => [d[0], d[1].length]); } function diff3b(chars1, chars2) { return jd.diffStringsRaw(chars1, chars2, true).map((d) => [d[0], d[1].length]); } function bench(a, b) { const {chars1, chars2} = dmp.diff_linesToChars_(a, b); function stringify(obj) { if (obj.length > 20) { return `${obj.length} items`; } else { return JSON.stringify(obj); } } [ ['diff-match-patch', diff1], ['diff-match-patch (checklines=false)', diff1a], ['diff-sequences', diff3], ['jest-diff (diff-sequences), no cleanup', diff3a], ['jest-diff (diff-sequences), with cleanup', diff3b], ['jsdiff', diff2], ].forEach(([name, diffFunc]) => { // node --expose_gc if (global.gc) { gc(); } const label = ` ${name}`; console.time(label); console.log(' ', stringify(diffFunc(chars1, chars2))); console.timeEnd(label); }); } let a, b; console.log('\nwith common prefix and suffix 1'); a = 'aaaaaaa\n'.repeat(50000) + 'bbbb\n' + 'dddd\n'.repeat(50000); b = 'aaaaaaa\n'.repeat(50000) + 'cccc\n' + 'dddd\n'.repeat(50000); bench(a, b); console.log('\nwith common prefix and suffix 2'); a = 'aaaaaaa\n'.repeat(50000) + 'bbbbbbb\n' + 'dddd\n'.repeat(50000); b = 'aaaaaaa\n'.repeat(50100) + 'cccc\n' + 'dddd\n'.repeat(49900); bench(a, b); console.log('\nwithout common prefix or suffix 1'); a = 'c\n' + 'aaaaaaa\n'.repeat(50000) + 'dddd\n'.repeat(50000); b = 'aaaaaaa\n'.repeat(50000) + 'dddd\n'.repeat(50100) + 'z\n'; bench(a, b); console.log('\nwithout 
common prefix or suffix 2'); a = 'cccc\n' + 'aaaaaaa\n'.repeat(50000) + 'bbbbbbb\n' + 'dddd\n'.repeat(50000) + 'z\n'; b = 'aaaaaaa\n'.repeat(50100) + 'cccc\n' + 'dddd\n'.repeat(49900) + 'z\ny\n'; bench(a, b); // Hearthstone cards.json in different languages. // This is somewhat challenging since many lines are changed. // wget https://api.hearthstonejson.com/v1/168129/enUS/cards.json -O 1 // wget https://api.hearthstonejson.com/v1/168129/zhCN/cards.json -O 2 // python3 -m json.tool < 1 > 1.json // python3 -m json.tool < 2 > 2.json console.log('\ncards.json with different languages'); a = fs.readFileSync('1.json', {encoding: 'utf-8'}); b = fs.readFileSync('2.json', {encoding: 'utf-8'}); bench(a, b); The output looks like: with common prefix and suffix 1 [[0,50000],[-1,1],[1,1],[0,50000]] diff-match-patch: 5.073ms [[0,50000],[-1,1],[1,1],[0,50000]] diff-match-patch (checklines=false): 0.481ms [[0,50000],[-1,1],[1,1],[0,50000]] diff-sequences: 7.589ms [[0,50000],[-1,1],[1,1],[0,50000]] jest-diff (diff-sequences), no cleanup: 10.915ms [[0,50000],[-1,1],[1,1],[0,50000]] jest-diff (diff-sequences), with cleanup: 10.588ms [[0,50000],[-1,1],[1,1],[0,50000]] jsdiff: 22.664ms with common prefix and suffix 2 [[0,50000],[-1,101],[1,101],[0,49900]] diff-match-patch: 10.688ms [[0,50000],[-1,101],[1,101],[0,49900]] diff-match-patch (checklines=false): 2.619ms [[0,50000],[-1,101],[1,101],[0,49900]] diff-sequences: 12.687ms [[0,50000],[-1,101],[1,101],[0,49900]] jest-diff (diff-sequences), no cleanup: 11.055ms [[0,50000],[-1,101],[1,101],[0,49900]] jest-diff (diff-sequences), with cleanup: 4.356ms [[0,50000],[-1,1],[1,101],[0,49900],[-1,100]] jsdiff: 59.359ms without common prefix or suffix 1 [[-1,1],[0,100000],[1,101]] diff-match-patch: 632.863ms [[-1,1],[0,100000],[1,101]] diff-match-patch (checklines=false): 607.796ms [[-1,1],[0,50000],[1,51],[0,50000],[1,50]] diff-sequences: 12.366ms [[-1,1],[0,50000],[1,51],[0,50000],[1,50]] jest-diff (diff-sequences), no cleanup: 11.096ms 
[[-1,1],[0,100000],[1,51],[1,50]] jest-diff (diff-sequences), with cleanup: 1.029s [[-1,1],[0,100000],[1,101]] jsdiff: 13.163ms without common prefix or suffix 2 [[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]] diff-match-patch: 2.773s [[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]] diff-match-patch (checklines=false): 1.402s [[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]] diff-sequences: 22.216ms [[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]] jest-diff (diff-sequences), no cleanup: 20.546ms [[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]] jest-diff (diff-sequences), with cleanup: 19.222ms [[-1,1],[0,50000],[-1,1],[1,101],[0,49900],[-1,100],[0,1],[1,1]] jsdiff: 33.82ms cards.json with different languages 67781 items diff-match-patch: 1:04.122 (m:ss.mmm) 57514 items diff-match-patch (checklines=false): 2:00.283 (m:ss.mmm) 67781 items diff-sequences: 1:09.486 (m:ss.mmm) 67781 items jest-diff (diff-sequences), no cleanup: 1:06.452 (m:ss.mmm) 52937 items jest-diff (diff-sequences), with cleanup: 1:09.118 (m:ss.mmm) ... (jsdiff cannot complete this test case in 20+ minutes) Observations: - In the last test case, `jsdiff` does not implement O(D^2) -> O(D) space optimization so it is practically unusable (reported as kpdecker/jsdiff#396). `diff-match-patch` and `jest-diff` both implement the linear space optimization, and have similar performance. - `diff-match-patch` strips common prefix and suffix, which makes it faster than `jest-diff` in "common prefix and suffix" test cases. - Both `diff-match-patch` and `jest-diff` can take a long time on "cleanup". See the "without common prefix or suffix 1" test case. We probably want to only enable cleanup for smaller input. - `diff-match-patch` performs visibly worse on the "without common prefix or suffix 2" test case. From the code it looks like `diff-match-patch` uses some kind of heuristics that tries to speed up things but ends up slowing it down. 
- Without cleanup, `jest-diff` might output `[1,51],[1,50]`, which can "obviously" be merged into `[1,101]`. We might use lightweight cleanup logic for that. - Reading the code, `diff-match-patch` turns lines into char codes. It cannot handle 65536 unique lines. (https://github.com/google/diff-match-patch/blob/62f2e689f498f9c92dbc588c58750addec9b1654/javascript/diff_match_patch_uncompressed.js#L503) Conclusions: - `jest-diff` (and `diff-sequences` under the hood) is overall the best choice. It has the expected time and space complexity, provides the flexibility to skip the potentially slow "cleanup", and can support >65k unique lines. - `jest-diff` lacks the "skip common prefix / suffix" optimization that `diff-match-patch` has, which seems practically important (when editing a line in the editor, all lines are common prefix or suffix except for the line being edited). The optimization is not hard to implement. This diff implements it. - For certain use cases (e.g. linelog) where the diff content is not needed (at least for the left / "a" side), we should use `diff-sequences` directly to avoid the overhead of preparing the diff content. - `jest-diff`'s `diffLines` outputs one line per `Diff`, but we want one chunk per `Diff`. - `jest-diff`'s `diffStringsRaw` produces one `Diff` per chunk, and because [`string.slice` is O(1) in V8](https://stackoverflow.com/a/72545403), it has acceptable performance. But mapping lines to chars would undesirably introduce the 65535-unique-line limit. Reviewed By: evangrayk Differential Revision: D43857949 fbshipit-source-id: 9a3d85ebf10c9b82da8ab5cba4e14e519bbf264d
- Loading branch information