xref: /OK3568_Linux_fs/yocto/bitbake/lib/simplediff/__init__.py (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun'''
2*4882a593SmuzhiyunSimple Diff for Python version 1.0
3*4882a593Smuzhiyun
4*4882a593SmuzhiyunAnnotate two versions of a list with the values that have been
5*4882a593Smuzhiyunchanged between the versions, similar to unix's `diff` but with
6*4882a593Smuzhiyuna dead-simple Python interface.
7*4882a593Smuzhiyun
8*4882a593Smuzhiyun(C) Paul Butler 2008-2012 <http://www.paulbutler.org/>
9*4882a593SmuzhiyunMay be used and distributed under the zlib/libpng license
10*4882a593Smuzhiyun<http://www.opensource.org/licenses/zlib-license.php>
11*4882a593Smuzhiyun'''
12*4882a593Smuzhiyun
13*4882a593Smuzhiyun__all__ = ['diff', 'string_diff', 'html_diff']
14*4882a593Smuzhiyun__version__ = '1.0'
15*4882a593Smuzhiyun
16*4882a593Smuzhiyun
17*4882a593Smuzhiyundef diff(old, new):
18*4882a593Smuzhiyun    '''
19*4882a593Smuzhiyun    Find the differences between two lists. Returns a list of pairs, where the
20*4882a593Smuzhiyun    first value is in ['+','-','='] and represents an insertion, deletion, or
21*4882a593Smuzhiyun    no change for that list. The second value of the pair is the list
22*4882a593Smuzhiyun    of elements.
23*4882a593Smuzhiyun
24*4882a593Smuzhiyun    Params:
25*4882a593Smuzhiyun        old     the old list of immutable, comparable values (ie. a list
26*4882a593Smuzhiyun                of strings)
27*4882a593Smuzhiyun        new     the new list of immutable, comparable values
28*4882a593Smuzhiyun
29*4882a593Smuzhiyun    Returns:
30*4882a593Smuzhiyun        A list of pairs, with the first part of the pair being one of three
31*4882a593Smuzhiyun        strings ('-', '+', '=') and the second part being a list of values from
32*4882a593Smuzhiyun        the original old and/or new lists. The first part of the pair
33*4882a593Smuzhiyun        corresponds to whether the list of values is a deletion, insertion, or
34*4882a593Smuzhiyun        unchanged, respectively.
35*4882a593Smuzhiyun
36*4882a593Smuzhiyun    Examples:
37*4882a593Smuzhiyun        >>> diff([1,2,3,4],[1,3,4])
38*4882a593Smuzhiyun        [('=', [1]), ('-', [2]), ('=', [3, 4])]
39*4882a593Smuzhiyun
40*4882a593Smuzhiyun        >>> diff([1,2,3,4],[2,3,4,1])
41*4882a593Smuzhiyun        [('-', [1]), ('=', [2, 3, 4]), ('+', [1])]
42*4882a593Smuzhiyun
43*4882a593Smuzhiyun        >>> diff('The quick brown fox jumps over the lazy dog'.split(),
44*4882a593Smuzhiyun        ...      'The slow blue cheese drips over the lazy carrot'.split())
45*4882a593Smuzhiyun        ... # doctest: +NORMALIZE_WHITESPACE
46*4882a593Smuzhiyun        [('=', ['The']),
47*4882a593Smuzhiyun         ('-', ['quick', 'brown', 'fox', 'jumps']),
48*4882a593Smuzhiyun         ('+', ['slow', 'blue', 'cheese', 'drips']),
49*4882a593Smuzhiyun         ('=', ['over', 'the', 'lazy']),
50*4882a593Smuzhiyun         ('-', ['dog']),
51*4882a593Smuzhiyun         ('+', ['carrot'])]
52*4882a593Smuzhiyun
53*4882a593Smuzhiyun    '''
54*4882a593Smuzhiyun
55*4882a593Smuzhiyun    # Create a map from old values to their indices
56*4882a593Smuzhiyun    old_index_map = dict()
57*4882a593Smuzhiyun    for i, val in enumerate(old):
58*4882a593Smuzhiyun        old_index_map.setdefault(val,list()).append(i)
59*4882a593Smuzhiyun
60*4882a593Smuzhiyun    # Find the largest substring common to old and new.
61*4882a593Smuzhiyun    # We use a dynamic programming approach here.
62*4882a593Smuzhiyun    #
63*4882a593Smuzhiyun    # We iterate over each value in the `new` list, calling the
64*4882a593Smuzhiyun    # index `inew`. At each iteration, `overlap[i]` is the
65*4882a593Smuzhiyun    # length of the largest suffix of `old[:i]` equal to a suffix
66*4882a593Smuzhiyun    # of `new[:inew]` (or unset when `old[i]` != `new[inew]`).
67*4882a593Smuzhiyun    #
68*4882a593Smuzhiyun    # At each stage of iteration, the new `overlap` (called
69*4882a593Smuzhiyun    # `_overlap` until the original `overlap` is no longer needed)
70*4882a593Smuzhiyun    # is built from the old one.
71*4882a593Smuzhiyun    #
72*4882a593Smuzhiyun    # If the length of overlap exceeds the largest substring
73*4882a593Smuzhiyun    # seen so far (`sub_length`), we update the largest substring
74*4882a593Smuzhiyun    # to the overlapping strings.
75*4882a593Smuzhiyun
76*4882a593Smuzhiyun    overlap = dict()
77*4882a593Smuzhiyun    # `sub_start_old` is the index of the beginning of the largest overlapping
78*4882a593Smuzhiyun    # substring in the old list. `sub_start_new` is the index of the beginning
79*4882a593Smuzhiyun    # of the same substring in the new list. `sub_length` is the length that
80*4882a593Smuzhiyun    # overlaps in both.
81*4882a593Smuzhiyun    # These track the largest overlapping substring seen so far, so naturally
82*4882a593Smuzhiyun    # we start with a 0-length substring.
83*4882a593Smuzhiyun    sub_start_old = 0
84*4882a593Smuzhiyun    sub_start_new = 0
85*4882a593Smuzhiyun    sub_length = 0
86*4882a593Smuzhiyun
87*4882a593Smuzhiyun    for inew, val in enumerate(new):
88*4882a593Smuzhiyun        _overlap = dict()
89*4882a593Smuzhiyun        for iold in old_index_map.get(val,list()):
90*4882a593Smuzhiyun            # now we are considering all values of iold such that
91*4882a593Smuzhiyun            # `old[iold] == new[inew]`.
92*4882a593Smuzhiyun            _overlap[iold] = (iold and overlap.get(iold - 1, 0)) + 1
93*4882a593Smuzhiyun            if(_overlap[iold] > sub_length):
94*4882a593Smuzhiyun                # this is the largest substring seen so far, so store its
95*4882a593Smuzhiyun                # indices
96*4882a593Smuzhiyun                sub_length = _overlap[iold]
97*4882a593Smuzhiyun                sub_start_old = iold - sub_length + 1
98*4882a593Smuzhiyun                sub_start_new = inew - sub_length + 1
99*4882a593Smuzhiyun        overlap = _overlap
100*4882a593Smuzhiyun
101*4882a593Smuzhiyun    if sub_length == 0:
102*4882a593Smuzhiyun        # If no common substring is found, we return an insert and delete...
103*4882a593Smuzhiyun        return (old and [('-', old)] or []) + (new and [('+', new)] or [])
104*4882a593Smuzhiyun    else:
105*4882a593Smuzhiyun        # ...otherwise, the common substring is unchanged and we recursively
106*4882a593Smuzhiyun        # diff the text before and after that substring
107*4882a593Smuzhiyun        return diff(old[ : sub_start_old], new[ : sub_start_new]) + \
108*4882a593Smuzhiyun               [('=', new[sub_start_new : sub_start_new + sub_length])] + \
109*4882a593Smuzhiyun               diff(old[sub_start_old + sub_length : ],
110*4882a593Smuzhiyun                       new[sub_start_new + sub_length : ])
111*4882a593Smuzhiyun
112*4882a593Smuzhiyun
113*4882a593Smuzhiyundef string_diff(old, new):
114*4882a593Smuzhiyun    '''
115*4882a593Smuzhiyun    Returns the difference between the old and new strings when split on
116*4882a593Smuzhiyun    whitespace. Considers punctuation a part of the word
117*4882a593Smuzhiyun
118*4882a593Smuzhiyun    This function is intended as an example; you'll probably want
119*4882a593Smuzhiyun    a more sophisticated wrapper in practice.
120*4882a593Smuzhiyun
121*4882a593Smuzhiyun    Params:
122*4882a593Smuzhiyun        old     the old string
123*4882a593Smuzhiyun        new     the new string
124*4882a593Smuzhiyun
125*4882a593Smuzhiyun    Returns:
126*4882a593Smuzhiyun        the output of `diff` on the two strings after splitting them
127*4882a593Smuzhiyun        on whitespace (a list of change instructions; see the docstring
128*4882a593Smuzhiyun        of `diff`)
129*4882a593Smuzhiyun
130*4882a593Smuzhiyun    Examples:
131*4882a593Smuzhiyun        >>> string_diff('The quick brown fox', 'The fast blue fox')
132*4882a593Smuzhiyun        ... # doctest: +NORMALIZE_WHITESPACE
133*4882a593Smuzhiyun        [('=', ['The']),
134*4882a593Smuzhiyun         ('-', ['quick', 'brown']),
135*4882a593Smuzhiyun         ('+', ['fast', 'blue']),
136*4882a593Smuzhiyun         ('=', ['fox'])]
137*4882a593Smuzhiyun
138*4882a593Smuzhiyun    '''
139*4882a593Smuzhiyun    return diff(old.split(), new.split())
140*4882a593Smuzhiyun
141*4882a593Smuzhiyun
142*4882a593Smuzhiyundef html_diff(old, new):
143*4882a593Smuzhiyun    '''
144*4882a593Smuzhiyun    Returns the difference between two strings (as in stringDiff) in
145*4882a593Smuzhiyun    HTML format. HTML code in the strings is NOT escaped, so you
146*4882a593Smuzhiyun    will get weird results if the strings contain HTML.
147*4882a593Smuzhiyun
148*4882a593Smuzhiyun    This function is intended as an example; you'll probably want
149*4882a593Smuzhiyun    a more sophisticated wrapper in practice.
150*4882a593Smuzhiyun
151*4882a593Smuzhiyun    Params:
152*4882a593Smuzhiyun        old     the old string
153*4882a593Smuzhiyun        new     the new string
154*4882a593Smuzhiyun
155*4882a593Smuzhiyun    Returns:
156*4882a593Smuzhiyun        the output of the diff expressed with HTML <ins> and <del>
157*4882a593Smuzhiyun        tags.
158*4882a593Smuzhiyun
159*4882a593Smuzhiyun    Examples:
160*4882a593Smuzhiyun        >>> html_diff('The quick brown fox', 'The fast blue fox')
161*4882a593Smuzhiyun        'The <del>quick brown</del> <ins>fast blue</ins> fox'
162*4882a593Smuzhiyun    '''
163*4882a593Smuzhiyun    con = {'=': (lambda x: x),
164*4882a593Smuzhiyun           '+': (lambda x: "<ins>" + x + "</ins>"),
165*4882a593Smuzhiyun           '-': (lambda x: "<del>" + x + "</del>")}
166*4882a593Smuzhiyun    return " ".join([(con[a])(" ".join(b)) for a, b in string_diff(old, new)])
167*4882a593Smuzhiyun
168*4882a593Smuzhiyun
169*4882a593Smuzhiyundef check_diff(old, new):
170*4882a593Smuzhiyun    '''
171*4882a593Smuzhiyun    This tests that diffs returned by `diff` are valid. You probably won't
172*4882a593Smuzhiyun    want to use this function, but it's provided for documentation and
173*4882a593Smuzhiyun    testing.
174*4882a593Smuzhiyun
175*4882a593Smuzhiyun    A diff should satisfy the property that the old input is equal to the
176*4882a593Smuzhiyun    elements of the result annotated with '-' or '=' concatenated together.
177*4882a593Smuzhiyun    Likewise, the new input is equal to the elements of the result annotated
178*4882a593Smuzhiyun    with '+' or '=' concatenated together. This function compares `old`,
179*4882a593Smuzhiyun    `new`, and the results of `diff(old, new)` to ensure this is true.
180*4882a593Smuzhiyun
181*4882a593Smuzhiyun    Tests:
182*4882a593Smuzhiyun        >>> check_diff('ABCBA', 'CBABA')
183*4882a593Smuzhiyun        >>> check_diff('Foobarbaz', 'Foobarbaz')
184*4882a593Smuzhiyun        >>> check_diff('Foobarbaz', 'Boobazbam')
185*4882a593Smuzhiyun        >>> check_diff('The quick brown fox', 'Some quick brown car')
186*4882a593Smuzhiyun        >>> check_diff('A thick red book', 'A quick blue book')
187*4882a593Smuzhiyun        >>> check_diff('dafhjkdashfkhasfjsdafdasfsda', 'asdfaskjfhksahkfjsdha')
188*4882a593Smuzhiyun        >>> check_diff('88288822828828288282828', '88288882882828282882828')
189*4882a593Smuzhiyun        >>> check_diff('1234567890', '24689')
190*4882a593Smuzhiyun    '''
191*4882a593Smuzhiyun    old = list(old)
192*4882a593Smuzhiyun    new = list(new)
193*4882a593Smuzhiyun    result = diff(old, new)
194*4882a593Smuzhiyun    _old = [val for (a, vals) in result if (a in '=-') for val in vals]
195*4882a593Smuzhiyun    assert old == _old, 'Expected %s, got %s' % (old, _old)
196*4882a593Smuzhiyun    _new = [val for (a, vals) in result if (a in '=+') for val in vals]
197*4882a593Smuzhiyun    assert new == _new, 'Expected %s, got %s' % (new, _new)
198*4882a593Smuzhiyun
199