1*4882a593Smuzhiyun''' 2*4882a593SmuzhiyunSimple Diff for Python version 1.0 3*4882a593Smuzhiyun 4*4882a593SmuzhiyunAnnotate two versions of a list with the values that have been 5*4882a593Smuzhiyunchanged between the versions, similar to unix's `diff` but with 6*4882a593Smuzhiyuna dead-simple Python interface. 7*4882a593Smuzhiyun 8*4882a593Smuzhiyun(C) Paul Butler 2008-2012 <http://www.paulbutler.org/> 9*4882a593SmuzhiyunMay be used and distributed under the zlib/libpng license 10*4882a593Smuzhiyun<http://www.opensource.org/licenses/zlib-license.php> 11*4882a593Smuzhiyun''' 12*4882a593Smuzhiyun 13*4882a593Smuzhiyun__all__ = ['diff', 'string_diff', 'html_diff'] 14*4882a593Smuzhiyun__version__ = '1.0' 15*4882a593Smuzhiyun 16*4882a593Smuzhiyun 17*4882a593Smuzhiyundef diff(old, new): 18*4882a593Smuzhiyun ''' 19*4882a593Smuzhiyun Find the differences between two lists. Returns a list of pairs, where the 20*4882a593Smuzhiyun first value is in ['+','-','='] and represents an insertion, deletion, or 21*4882a593Smuzhiyun no change for that list. The second value of the pair is the list 22*4882a593Smuzhiyun of elements. 23*4882a593Smuzhiyun 24*4882a593Smuzhiyun Params: 25*4882a593Smuzhiyun old the old list of immutable, comparable values (ie. a list 26*4882a593Smuzhiyun of strings) 27*4882a593Smuzhiyun new the new list of immutable, comparable values 28*4882a593Smuzhiyun 29*4882a593Smuzhiyun Returns: 30*4882a593Smuzhiyun A list of pairs, with the first part of the pair being one of three 31*4882a593Smuzhiyun strings ('-', '+', '=') and the second part being a list of values from 32*4882a593Smuzhiyun the original old and/or new lists. The first part of the pair 33*4882a593Smuzhiyun corresponds to whether the list of values is a deletion, insertion, or 34*4882a593Smuzhiyun unchanged, respectively. 35*4882a593Smuzhiyun 36*4882a593Smuzhiyun Examples: 37*4882a593Smuzhiyun >>> diff([1,2,3,4],[1,3,4]) 38*4882a593Smuzhiyun [('=', [1]), ('-', [2]), ('=', [3, 4])] 39*4882a593Smuzhiyun 40*4882a593Smuzhiyun >>> diff([1,2,3,4],[2,3,4,1]) 41*4882a593Smuzhiyun [('-', [1]), ('=', [2, 3, 4]), ('+', [1])] 42*4882a593Smuzhiyun 43*4882a593Smuzhiyun >>> diff('The quick brown fox jumps over the lazy dog'.split(), 44*4882a593Smuzhiyun ... 'The slow blue cheese drips over the lazy carrot'.split()) 45*4882a593Smuzhiyun ... # doctest: +NORMALIZE_WHITESPACE 46*4882a593Smuzhiyun [('=', ['The']), 47*4882a593Smuzhiyun ('-', ['quick', 'brown', 'fox', 'jumps']), 48*4882a593Smuzhiyun ('+', ['slow', 'blue', 'cheese', 'drips']), 49*4882a593Smuzhiyun ('=', ['over', 'the', 'lazy']), 50*4882a593Smuzhiyun ('-', ['dog']), 51*4882a593Smuzhiyun ('+', ['carrot'])] 52*4882a593Smuzhiyun 53*4882a593Smuzhiyun ''' 54*4882a593Smuzhiyun 55*4882a593Smuzhiyun # Create a map from old values to their indices 56*4882a593Smuzhiyun old_index_map = dict() 57*4882a593Smuzhiyun for i, val in enumerate(old): 58*4882a593Smuzhiyun old_index_map.setdefault(val,list()).append(i) 59*4882a593Smuzhiyun 60*4882a593Smuzhiyun # Find the largest substring common to old and new. 61*4882a593Smuzhiyun # We use a dynamic programming approach here. 62*4882a593Smuzhiyun # 63*4882a593Smuzhiyun # We iterate over each value in the `new` list, calling the 64*4882a593Smuzhiyun # index `inew`. At each iteration, `overlap[i]` is the 65*4882a593Smuzhiyun # length of the largest suffix of `old[:i]` equal to a suffix 66*4882a593Smuzhiyun # of `new[:inew]` (or unset when `old[i]` != `new[inew]`). 67*4882a593Smuzhiyun # 68*4882a593Smuzhiyun # At each stage of iteration, the new `overlap` (called 69*4882a593Smuzhiyun # `_overlap` until the original `overlap` is no longer needed) 70*4882a593Smuzhiyun # is built from the old one. 71*4882a593Smuzhiyun # 72*4882a593Smuzhiyun # If the length of overlap exceeds the largest substring 73*4882a593Smuzhiyun # seen so far (`sub_length`), we update the largest substring 74*4882a593Smuzhiyun # to the overlapping strings. 75*4882a593Smuzhiyun 76*4882a593Smuzhiyun overlap = dict() 77*4882a593Smuzhiyun # `sub_start_old` is the index of the beginning of the largest overlapping 78*4882a593Smuzhiyun # substring in the old list. `sub_start_new` is the index of the beginning 79*4882a593Smuzhiyun # of the same substring in the new list. `sub_length` is the length that 80*4882a593Smuzhiyun # overlaps in both. 81*4882a593Smuzhiyun # These track the largest overlapping substring seen so far, so naturally 82*4882a593Smuzhiyun # we start with a 0-length substring. 83*4882a593Smuzhiyun sub_start_old = 0 84*4882a593Smuzhiyun sub_start_new = 0 85*4882a593Smuzhiyun sub_length = 0 86*4882a593Smuzhiyun 87*4882a593Smuzhiyun for inew, val in enumerate(new): 88*4882a593Smuzhiyun _overlap = dict() 89*4882a593Smuzhiyun for iold in old_index_map.get(val,list()): 90*4882a593Smuzhiyun # now we are considering all values of iold such that 91*4882a593Smuzhiyun # `old[iold] == new[inew]`. 92*4882a593Smuzhiyun _overlap[iold] = (iold and overlap.get(iold - 1, 0)) + 1 93*4882a593Smuzhiyun if(_overlap[iold] > sub_length): 94*4882a593Smuzhiyun # this is the largest substring seen so far, so store its 95*4882a593Smuzhiyun # indices 96*4882a593Smuzhiyun sub_length = _overlap[iold] 97*4882a593Smuzhiyun sub_start_old = iold - sub_length + 1 98*4882a593Smuzhiyun sub_start_new = inew - sub_length + 1 99*4882a593Smuzhiyun overlap = _overlap 100*4882a593Smuzhiyun 101*4882a593Smuzhiyun if sub_length == 0: 102*4882a593Smuzhiyun # If no common substring is found, we return an insert and delete... 103*4882a593Smuzhiyun return (old and [('-', old)] or []) + (new and [('+', new)] or []) 104*4882a593Smuzhiyun else: 105*4882a593Smuzhiyun # ...otherwise, the common substring is unchanged and we recursively 106*4882a593Smuzhiyun # diff the text before and after that substring 107*4882a593Smuzhiyun return diff(old[ : sub_start_old], new[ : sub_start_new]) + \ 108*4882a593Smuzhiyun [('=', new[sub_start_new : sub_start_new + sub_length])] + \ 109*4882a593Smuzhiyun diff(old[sub_start_old + sub_length : ], 110*4882a593Smuzhiyun new[sub_start_new + sub_length : ]) 111*4882a593Smuzhiyun 112*4882a593Smuzhiyun 113*4882a593Smuzhiyundef string_diff(old, new): 114*4882a593Smuzhiyun ''' 115*4882a593Smuzhiyun Returns the difference between the old and new strings when split on 116*4882a593Smuzhiyun whitespace. Considers punctuation a part of the word 117*4882a593Smuzhiyun 118*4882a593Smuzhiyun This function is intended as an example; you'll probably want 119*4882a593Smuzhiyun a more sophisticated wrapper in practice. 120*4882a593Smuzhiyun 121*4882a593Smuzhiyun Params: 122*4882a593Smuzhiyun old the old string 123*4882a593Smuzhiyun new the new string 124*4882a593Smuzhiyun 125*4882a593Smuzhiyun Returns: 126*4882a593Smuzhiyun the output of `diff` on the two strings after splitting them 127*4882a593Smuzhiyun on whitespace (a list of change instructions; see the docstring 128*4882a593Smuzhiyun of `diff`) 129*4882a593Smuzhiyun 130*4882a593Smuzhiyun Examples: 131*4882a593Smuzhiyun >>> string_diff('The quick brown fox', 'The fast blue fox') 132*4882a593Smuzhiyun ... # doctest: +NORMALIZE_WHITESPACE 133*4882a593Smuzhiyun [('=', ['The']), 134*4882a593Smuzhiyun ('-', ['quick', 'brown']), 135*4882a593Smuzhiyun ('+', ['fast', 'blue']), 136*4882a593Smuzhiyun ('=', ['fox'])] 137*4882a593Smuzhiyun 138*4882a593Smuzhiyun ''' 139*4882a593Smuzhiyun return diff(old.split(), new.split()) 140*4882a593Smuzhiyun 141*4882a593Smuzhiyun 142*4882a593Smuzhiyundef html_diff(old, new): 143*4882a593Smuzhiyun ''' 144*4882a593Smuzhiyun Returns the difference between two strings (as in stringDiff) in 145*4882a593Smuzhiyun HTML format. HTML code in the strings is NOT escaped, so you 146*4882a593Smuzhiyun will get weird results if the strings contain HTML. 147*4882a593Smuzhiyun 148*4882a593Smuzhiyun This function is intended as an example; you'll probably want 149*4882a593Smuzhiyun a more sophisticated wrapper in practice. 150*4882a593Smuzhiyun 151*4882a593Smuzhiyun Params: 152*4882a593Smuzhiyun old the old string 153*4882a593Smuzhiyun new the new string 154*4882a593Smuzhiyun 155*4882a593Smuzhiyun Returns: 156*4882a593Smuzhiyun the output of the diff expressed with HTML <ins> and <del> 157*4882a593Smuzhiyun tags. 158*4882a593Smuzhiyun 159*4882a593Smuzhiyun Examples: 160*4882a593Smuzhiyun >>> html_diff('The quick brown fox', 'The fast blue fox') 161*4882a593Smuzhiyun 'The <del>quick brown</del> <ins>fast blue</ins> fox' 162*4882a593Smuzhiyun ''' 163*4882a593Smuzhiyun con = {'=': (lambda x: x), 164*4882a593Smuzhiyun '+': (lambda x: "<ins>" + x + "</ins>"), 165*4882a593Smuzhiyun '-': (lambda x: "<del>" + x + "</del>")} 166*4882a593Smuzhiyun return " ".join([(con[a])(" ".join(b)) for a, b in string_diff(old, new)]) 167*4882a593Smuzhiyun 168*4882a593Smuzhiyun 169*4882a593Smuzhiyundef check_diff(old, new): 170*4882a593Smuzhiyun ''' 171*4882a593Smuzhiyun This tests that diffs returned by `diff` are valid. You probably won't 172*4882a593Smuzhiyun want to use this function, but it's provided for documentation and 173*4882a593Smuzhiyun testing. 174*4882a593Smuzhiyun 175*4882a593Smuzhiyun A diff should satisfy the property that the old input is equal to the 176*4882a593Smuzhiyun elements of the result annotated with '-' or '=' concatenated together. 177*4882a593Smuzhiyun Likewise, the new input is equal to the elements of the result annotated 178*4882a593Smuzhiyun with '+' or '=' concatenated together. This function compares `old`, 179*4882a593Smuzhiyun `new`, and the results of `diff(old, new)` to ensure this is true. 180*4882a593Smuzhiyun 181*4882a593Smuzhiyun Tests: 182*4882a593Smuzhiyun >>> check_diff('ABCBA', 'CBABA') 183*4882a593Smuzhiyun >>> check_diff('Foobarbaz', 'Foobarbaz') 184*4882a593Smuzhiyun >>> check_diff('Foobarbaz', 'Boobazbam') 185*4882a593Smuzhiyun >>> check_diff('The quick brown fox', 'Some quick brown car') 186*4882a593Smuzhiyun >>> check_diff('A thick red book', 'A quick blue book') 187*4882a593Smuzhiyun >>> check_diff('dafhjkdashfkhasfjsdafdasfsda', 'asdfaskjfhksahkfjsdha') 188*4882a593Smuzhiyun >>> check_diff('88288822828828288282828', '88288882882828282882828') 189*4882a593Smuzhiyun >>> check_diff('1234567890', '24689') 190*4882a593Smuzhiyun ''' 191*4882a593Smuzhiyun old = list(old) 192*4882a593Smuzhiyun new = list(new) 193*4882a593Smuzhiyun result = diff(old, new) 194*4882a593Smuzhiyun _old = [val for (a, vals) in result if (a in '=-') for val in vals] 195*4882a593Smuzhiyun assert old == _old, 'Expected %s, got %s' % (old, _old) 196*4882a593Smuzhiyun _new = [val for (a, vals) in result if (a in '=+') for val in vals] 197*4882a593Smuzhiyun assert new == _new, 'Expected %s, got %s' % (new, _new) 198*4882a593Smuzhiyun 199