This file is indexed.

/usr/lib/python3/dist-packages/lxml/html/diff.py is in python3-lxml 4.2.1-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re

__all__ = ['html_annotate', 'htmldiff']

try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape
try:
    _unicode = unicode
except NameError:
    # Python 3
    _unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = str

############################################################
## Annotation
############################################################

def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(_unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents.

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # The basic strategy we have is to split the documents up into
    # logical tokens (which are words with attached markup).  We then
    # do diffs of each of the versions to track when a token first
    # appeared in the document; the annotation attached to the token
    # is the version where it first appeared.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After we've tracked all the tokens, we can combine spans of text
    # that are adjacent and have the same annotation
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add markup
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()
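
def _example_custom_markup():
    # Minimal usage sketch of the markup hook (an illustrative helper, never
    # called by this module).  A markup callable receives the text of a span
    # of words and the version label that span was attributed to, and returns
    # the HTML to emit for that span.
    def markup_as_class(text, version):
        return '<span class="rev-%s">%s</span>' % (
            html_escape(_unicode(version), 1), text)
    return html_annotate([('Hello World', 1), ('Goodbye World', 2)],
                         markup=markup_as_class)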

def tokenize_annotated(doc, annotation): 
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens: 
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new): 
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal': 
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest): 
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest): 
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens, 
    and they share an annotation
    """
    result = [tokens[0]] 
    for tok in tokens[1:]: 
        if (not result[-1].post_tags and 
            not tok.pre_tags and 
            result[-1].annotation == tok.annotation): 
            compress_merge_back(result, tok)
        else: 
            result.append(tok)
    return result

def compress_merge_back(tokens, tok): 
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token: 
        tokens.append(tok)
    else:
        text = _unicode(last)
        if last.trailing_whitespace:
            text += last.trailing_whitespace
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged
    
def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += token.trailing_whitespace
        yield html
        for post in token.post_tags:
            yield post


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.  

    Markup itself is not diffed; only the words in the HTML are compared.
    The markup from new_html is preserved, and possibly some markup from
    old_html (though it is considered acceptable to lose some of the old
    markup).  The exceptions are <img> tags, which are treated like words,
    and the href attribute of <a> tags, which is noted inside the tag
    itself when it changes.
    """ 
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)
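
def _example_htmldiff_usage():
    # Minimal usage sketch (an illustrative helper, never called by this
    # module).  Added words come back wrapped in <ins> and removed words in
    # <del>; the exact whitespace and tag placement in the output can vary.
    old = '<p>Here is some text.</p>'
    new = '<p>Here is some <b>new</b> text.</p>'
    return htmldiff(old, new)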

def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences.  The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.  
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document.  We generally prefer to take
    # markup from the new document, and only do a best effort attempt
    # to keep markup from the old document; anything that we can't
    # resolve we throw away.  Also we try to put the deletes as close
    # to the location where we think they would have been -- because
    # we are only keeping the markup from the new document, it can be
    # fuzzy where in the new document the old text would have gone.
    # Again we just do a best effort attempt.
    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)
    # If deletes were inserted directly as <del> then we'd have an
    # invalid document at this point.  Instead we put in special
    # markers, and when the complete diffed document has been created
    # we try to move the deletes around and resolve any problems.
    result = cleanup_delete(result)

    return result

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        if not equal or not token.hide_when_equal:
            if token.trailing_whitespace:
                yield token.html() + token.trailing_whitespace
            else:
                yield token.html()
        for post in token.post_tags:
            yield post

def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # Though we don't throw away unbalanced_start or unbalanced_end
    # (we assume there is accompanying markup later or earlier in the
    # document), we only put <ins> around the balanced portion.
    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
    doc.extend(unbalanced_start)
    if doc and not doc[-1].endswith(' '):
        # Fix up the case where the word before the insert didn't end with 
        # a space
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # We move space outside of </ins>
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(unbalanced_end)

# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    pass
class DEL_END:
    pass

class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with marker to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location where it was originally located (e.g., moving a
    delete into preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END)"""
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing found, we've cleaned up the entire doc
            break
        # The stuff-inside-DEL_START/END may not be well balanced
        # markup.  First we figure out what unbalanced portions there are:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then we move the span forward and/or backward based on these
        # unbalanced portions:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up case where the word before us didn't have a trailing space
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # We move space outside of </del>
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks

def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end
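
def _example_split_unbalanced():
    # Illustrative sketch (never called by this module): the stray </div>
    # below is closed but never opened, so it ends up in the unbalanced_end
    # list, while the matched <p>...</p> span stays in balanced, e.g.
    #   ([], ['<p>', 'Hello', '</p>'], ['</div>'])
    return split_unbalanced(['<p>', 'Hello', '</p>', '</div>'])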

def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found. """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]

def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word, we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag, can we go further?  Maybe not...
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # Found a tag that doesn't match
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # Found a tag that doesn't match
            break

class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, even though we do not want to treat the word as
    equivalent to a similar word that does not have a trailing
    space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = _unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
                                          self.post_tags, self.trailing_whitespace)

    def html(self):
        return _unicode(self)

class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None, 
                post_tags=None, trailing_whitespace=""):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags, 
                            post_tags=post_tags, 
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, pre_tags=%r, post_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self

def tokenize(html, include_hrefs=True):
    """
    Parses the given HTML and returns token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Then we split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Finally re-joining them into token objects:
    return fixup_chunks(chunks)
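
def _example_tokenize():
    # Illustrative sketch (never called by this module): tokenize() turns a
    # fragment into word tokens with the surrounding markup attached as
    # pre_tags/post_tags, roughly
    #   [token('Hello', pre_tags=['<p>'], ...), token('there', post_tags=['</p>'], ...)]
    return tokenize('<p>Hello there</p>')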

def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)

_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html
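
def _example_cleanup_html():
    # Illustrative sketch (never called by this module): everything outside
    # <body> is dropped and any <ins>/<del> tags are stripped, e.g. roughly
    #   '<p>Hi there</p>'
    return cleanup_html('<html><body><p>Hi <ins>there</ins></p></body></html>')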
    

end_whitespace_re = re.compile(r'[ \t\n\r]$')

def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]


def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)

            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
                tag_accum = []
                result.append(cur_word)
            continue

        if is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)

        elif is_start_tag(chunk):
            tag_accum.append(chunk)

        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False, "Unexpected chunk: %r" % chunk

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result


# All the tags in HTML that don't require end tags:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )


def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)
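
def _example_flatten_el():
    # Illustrative sketch (never called by this module): the element is
    # flattened into a stream of start tags, escaped words (with trailing
    # whitespace attached) and end tags, roughly
    #   ['<p>', 'Hello ', 'there', '</p>']
    el = parse_html('<p>Hello there</p>', cleanup=False)
    return list(flatten_el(el, include_hrefs=False, skip_tag=True))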

split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)

def split_words(text):
    """ Splits some text into words. Includes trailing whitespace
    on each word when appropriate.  """
    if not text or not text.strip():
        return []

    words = split_words_re.findall(text)
    return words
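
def _example_split_words():
    # Illustrative sketch (never called by this module): each word keeps the
    # whitespace that followed it, e.g. ['some  ', 'text ', 'here'].
    return split_words('some  text here')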

start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
                         for name, value in el.attrib.items()]))
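
def _example_start_tag():
    # Illustrative sketch (never called by this module): rebuilds the literal
    # start-tag text for an lxml element, escaping attribute values, giving
    # something like '<a href="...">' for an anchor element.
    el = fragment_fromstring('<a href="/page?x=1&amp;y=2">link</a>')
    return start_tag(el)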

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html
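
def _example_fixup_ins_del_tags():
    # Illustrative sketch (never called by this module): an <ins> wrapping a
    # block-level element is pushed inside it, so '<ins><p>word</p></ins>'
    # becomes roughly '<p><ins>word</ins></p>'.
    return fixup_ins_del_tags('<ins><p>word</p></ins>')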

def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html
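
def _example_serialize_html_fragment():
    # Illustrative sketch (never called by this module): with skip_outer=True
    # the wrapper <div> added by parse_html is dropped from the result,
    # leaving only its serialized contents.
    el = parse_html('<p>Hi <b>there</b></p>', cleanup=False)
    return serialize_html_fragment(el, skip_outer=True)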

def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for tag in ['ins', 'del']:
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if not _contains_block_level_tag(el):
                continue
            _move_el_inside_block(el, tag=tag)
            el.drop_tag()
            #_merge_element_contents(el)

def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like <p>, <td>, etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    for child in el:
        if _contains_block_level_tag(child):
            return True
    return False

def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
    and moves them inside any block-level tags.  """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level tags in any child
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                el.insert(el.index(child)+1, tail_tag)
        else:
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)
            
def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()

class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2
    
    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]
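
def _example_insensitive_matcher():
    # Illustrative sketch (never called by this module): with the default
    # threshold of 2, matching runs of only one or two tokens that sit inside
    # otherwise changed regions are filtered out, so the diff is not chopped
    # up around incidental single-word matches.
    a = 'the quick brown fox jumps over the lazy dog'.split()
    b = 'a fast red fox leaps over a sleepy cat'.split()
    return InsensitiveSequenceMatcher(a=a, b=b).get_matching_blocks()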

if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()