forked from brl/citadel
199 lines
7.3 KiB
Python
199 lines
7.3 KiB
Python
|
'''
|
||
|
Simple Diff for Python version 1.0
|
||
|
|
||
|
Annotate two versions of a list with the values that have been
|
||
|
changed between the versions, similar to unix's `diff` but with
|
||
|
a dead-simple Python interface.
|
||
|
|
||
|
(C) Paul Butler 2008-2012 <http://www.paulbutler.org/>
|
||
|
May be used and distributed under the zlib/libpng license
|
||
|
<http://www.opensource.org/licenses/zlib-license.php>
|
||
|
'''
|
||
|
|
||
|
__all__ = ['diff', 'string_diff', 'html_diff']
|
||
|
__version__ = '1.0'
|
||
|
|
||
|
|
||
|
def diff(old, new):
|
||
|
'''
|
||
|
Find the differences between two lists. Returns a list of pairs, where the
|
||
|
first value is in ['+','-','='] and represents an insertion, deletion, or
|
||
|
no change for that list. The second value of the pair is the list
|
||
|
of elements.
|
||
|
|
||
|
Params:
|
||
|
old the old list of immutable, comparable values (ie. a list
|
||
|
of strings)
|
||
|
new the new list of immutable, comparable values
|
||
|
|
||
|
Returns:
|
||
|
A list of pairs, with the first part of the pair being one of three
|
||
|
strings ('-', '+', '=') and the second part being a list of values from
|
||
|
the original old and/or new lists. The first part of the pair
|
||
|
corresponds to whether the list of values is a deletion, insertion, or
|
||
|
unchanged, respectively.
|
||
|
|
||
|
Examples:
|
||
|
>>> diff([1,2,3,4],[1,3,4])
|
||
|
[('=', [1]), ('-', [2]), ('=', [3, 4])]
|
||
|
|
||
|
>>> diff([1,2,3,4],[2,3,4,1])
|
||
|
[('-', [1]), ('=', [2, 3, 4]), ('+', [1])]
|
||
|
|
||
|
>>> diff('The quick brown fox jumps over the lazy dog'.split(),
|
||
|
... 'The slow blue cheese drips over the lazy carrot'.split())
|
||
|
... # doctest: +NORMALIZE_WHITESPACE
|
||
|
[('=', ['The']),
|
||
|
('-', ['quick', 'brown', 'fox', 'jumps']),
|
||
|
('+', ['slow', 'blue', 'cheese', 'drips']),
|
||
|
('=', ['over', 'the', 'lazy']),
|
||
|
('-', ['dog']),
|
||
|
('+', ['carrot'])]
|
||
|
|
||
|
'''
|
||
|
|
||
|
# Create a map from old values to their indices
|
||
|
old_index_map = dict()
|
||
|
for i, val in enumerate(old):
|
||
|
old_index_map.setdefault(val,list()).append(i)
|
||
|
|
||
|
# Find the largest substring common to old and new.
|
||
|
# We use a dynamic programming approach here.
|
||
|
#
|
||
|
# We iterate over each value in the `new` list, calling the
|
||
|
# index `inew`. At each iteration, `overlap[i]` is the
|
||
|
# length of the largest suffix of `old[:i]` equal to a suffix
|
||
|
# of `new[:inew]` (or unset when `old[i]` != `new[inew]`).
|
||
|
#
|
||
|
# At each stage of iteration, the new `overlap` (called
|
||
|
# `_overlap` until the original `overlap` is no longer needed)
|
||
|
# is built from the old one.
|
||
|
#
|
||
|
# If the length of overlap exceeds the largest substring
|
||
|
# seen so far (`sub_length`), we update the largest substring
|
||
|
# to the overlapping strings.
|
||
|
|
||
|
overlap = dict()
|
||
|
# `sub_start_old` is the index of the beginning of the largest overlapping
|
||
|
# substring in the old list. `sub_start_new` is the index of the beginning
|
||
|
# of the same substring in the new list. `sub_length` is the length that
|
||
|
# overlaps in both.
|
||
|
# These track the largest overlapping substring seen so far, so naturally
|
||
|
# we start with a 0-length substring.
|
||
|
sub_start_old = 0
|
||
|
sub_start_new = 0
|
||
|
sub_length = 0
|
||
|
|
||
|
for inew, val in enumerate(new):
|
||
|
_overlap = dict()
|
||
|
for iold in old_index_map.get(val,list()):
|
||
|
# now we are considering all values of iold such that
|
||
|
# `old[iold] == new[inew]`.
|
||
|
_overlap[iold] = (iold and overlap.get(iold - 1, 0)) + 1
|
||
|
if(_overlap[iold] > sub_length):
|
||
|
# this is the largest substring seen so far, so store its
|
||
|
# indices
|
||
|
sub_length = _overlap[iold]
|
||
|
sub_start_old = iold - sub_length + 1
|
||
|
sub_start_new = inew - sub_length + 1
|
||
|
overlap = _overlap
|
||
|
|
||
|
if sub_length == 0:
|
||
|
# If no common substring is found, we return an insert and delete...
|
||
|
return (old and [('-', old)] or []) + (new and [('+', new)] or [])
|
||
|
else:
|
||
|
# ...otherwise, the common substring is unchanged and we recursively
|
||
|
# diff the text before and after that substring
|
||
|
return diff(old[ : sub_start_old], new[ : sub_start_new]) + \
|
||
|
[('=', new[sub_start_new : sub_start_new + sub_length])] + \
|
||
|
diff(old[sub_start_old + sub_length : ],
|
||
|
new[sub_start_new + sub_length : ])
|
||
|
|
||
|
|
||
|
def string_diff(old, new):
|
||
|
'''
|
||
|
Returns the difference between the old and new strings when split on
|
||
|
whitespace. Considers punctuation a part of the word
|
||
|
|
||
|
This function is intended as an example; you'll probably want
|
||
|
a more sophisticated wrapper in practice.
|
||
|
|
||
|
Params:
|
||
|
old the old string
|
||
|
new the new string
|
||
|
|
||
|
Returns:
|
||
|
the output of `diff` on the two strings after splitting them
|
||
|
on whitespace (a list of change instructions; see the docstring
|
||
|
of `diff`)
|
||
|
|
||
|
Examples:
|
||
|
>>> string_diff('The quick brown fox', 'The fast blue fox')
|
||
|
... # doctest: +NORMALIZE_WHITESPACE
|
||
|
[('=', ['The']),
|
||
|
('-', ['quick', 'brown']),
|
||
|
('+', ['fast', 'blue']),
|
||
|
('=', ['fox'])]
|
||
|
|
||
|
'''
|
||
|
return diff(old.split(), new.split())
|
||
|
|
||
|
|
||
|
def html_diff(old, new):
|
||
|
'''
|
||
|
Returns the difference between two strings (as in stringDiff) in
|
||
|
HTML format. HTML code in the strings is NOT escaped, so you
|
||
|
will get weird results if the strings contain HTML.
|
||
|
|
||
|
This function is intended as an example; you'll probably want
|
||
|
a more sophisticated wrapper in practice.
|
||
|
|
||
|
Params:
|
||
|
old the old string
|
||
|
new the new string
|
||
|
|
||
|
Returns:
|
||
|
the output of the diff expressed with HTML <ins> and <del>
|
||
|
tags.
|
||
|
|
||
|
Examples:
|
||
|
>>> html_diff('The quick brown fox', 'The fast blue fox')
|
||
|
'The <del>quick brown</del> <ins>fast blue</ins> fox'
|
||
|
'''
|
||
|
con = {'=': (lambda x: x),
|
||
|
'+': (lambda x: "<ins>" + x + "</ins>"),
|
||
|
'-': (lambda x: "<del>" + x + "</del>")}
|
||
|
return " ".join([(con[a])(" ".join(b)) for a, b in string_diff(old, new)])
|
||
|
|
||
|
|
||
|
def check_diff(old, new):
|
||
|
'''
|
||
|
This tests that diffs returned by `diff` are valid. You probably won't
|
||
|
want to use this function, but it's provided for documentation and
|
||
|
testing.
|
||
|
|
||
|
A diff should satisfy the property that the old input is equal to the
|
||
|
elements of the result annotated with '-' or '=' concatenated together.
|
||
|
Likewise, the new input is equal to the elements of the result annotated
|
||
|
with '+' or '=' concatenated together. This function compares `old`,
|
||
|
`new`, and the results of `diff(old, new)` to ensure this is true.
|
||
|
|
||
|
Tests:
|
||
|
>>> check_diff('ABCBA', 'CBABA')
|
||
|
>>> check_diff('Foobarbaz', 'Foobarbaz')
|
||
|
>>> check_diff('Foobarbaz', 'Boobazbam')
|
||
|
>>> check_diff('The quick brown fox', 'Some quick brown car')
|
||
|
>>> check_diff('A thick red book', 'A quick blue book')
|
||
|
>>> check_diff('dafhjkdashfkhasfjsdafdasfsda', 'asdfaskjfhksahkfjsdha')
|
||
|
>>> check_diff('88288822828828288282828', '88288882882828282882828')
|
||
|
>>> check_diff('1234567890', '24689')
|
||
|
'''
|
||
|
old = list(old)
|
||
|
new = list(new)
|
||
|
result = diff(old, new)
|
||
|
_old = [val for (a, vals) in result if (a in '=-') for val in vals]
|
||
|
assert old == _old, 'Expected %s, got %s' % (old, _old)
|
||
|
_new = [val for (a, vals) in result if (a in '=+') for val in vals]
|
||
|
assert new == _new, 'Expected %s, got %s' % (new, _new)
|
||
|
|