Initial commit
venv/lib/python3.10/site-packages/lxml/html/diff.py (972 lines, new file)
@@ -0,0 +1,972 @@
# cython: language_level=3

try:
    import cython
except ImportError:
    class fake_cython:
        compiled = False
        def cfunc(self, func): return func
        def cclass(self, func): return func
        def declare(self, _, value): return value
        def __getattr__(self, type_name): return "object"

    cython = fake_cython()

try:
    from . import _difflib as difflib
    import inspect
    if inspect.isfunction(difflib.get_close_matches):
        raise ImportError(
            "Embedded difflib is not compiled to a fast binary, using the stdlib instead.")
    from cython.cimports.lxml.html._difflib import SequenceMatcher
except ImportError:
    import difflib
    if not cython.compiled:
        from difflib import SequenceMatcher

import itertools
import functools
import operator
import re

from lxml import etree
from lxml.html import fragment_fromstring
from . import defs

__all__ = ['html_annotate', 'htmldiff']

group_by_first_item = functools.partial(itertools.groupby, key=operator.itemgetter(0))


############################################################
## Annotation
############################################################

@cython.cfunc
def html_escape(text: str, _escapes: tuple = ('&amp;', '&lt;', '&gt;', '&quot;', '&#x27;')) -> str:
    # Not so slow compiled version of 'html.escape()'.
    # Most of the time, we replace little to nothing, so use a fast decision what needs to be done.
    ch: cython.Py_UCS4
    replace: cython.char[5] = [False] * 5
    for ch in text:
        replace[0] |= ch == '&'
        replace[1] |= ch == '<'
        replace[2] |= ch == '>'
        replace[3] |= ch == '"'
        replace[4] |= ch == "'"

    for i in range(5):
        if replace[i]:
            text = text.replace('&<>"\''[i], _escapes[i])

    return text


if not cython.compiled:
    from html import escape as html_escape


def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(version), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # The basic strategy we have is to split the documents up into
    # logical tokens (which are words with attached markup).  We then
    # do diffs of each of the versions to track when a token first
    # appeared in the document; the annotation attached to the token
    # is the version where it first appeared.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After we've tracked all the tokens, we can combine spans of text
    # that are adjacent and have the same annotation
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add markup
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()

def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not tok.pre_tags and
                not result[-1].post_tags and
                result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result

@cython.cfunc
def compress_merge_back(tokens: list, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = last + last.trailing_whitespace + tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged
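
# Illustrative note (not part of the library source): compress_tokens() merges
# adjacent tokens that carry the same annotation and have no markup between
# them.  A minimal sketch, assuming two plain word tokens (output indicative):
#
#     >>> t1 = token('Hello', trailing_whitespace=' ')
#     >>> t2 = token('World')
#     >>> t1.annotation = t2.annotation = 'v1'
#     >>> compress_tokens([t1, t2])
#     [token('Hello World', [], [], '')]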

def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        yield from token.pre_tags
        html = token.html()
        html = markup_func(html, token.annotation) + token.trailing_whitespace
        yield html
        yield from token.post_tags


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    try:
        result = ''.join(result).strip()
    except (ValueError, TypeError) as exc:
        print(exc)
        result = ''
    return fixup_ins_del_tags(result)
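
# Illustrative usage (not part of the library source; output shown is
# indicative of the word-level diff this module produces):
#
#     >>> from lxml.html.diff import htmldiff
#     >>> htmldiff('<p>Hello World</p>', '<p>Goodbye World</p>')
#     '<p><ins>Goodbye</ins> <del>Hello</del> World</p>'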


def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences.  The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document.  We generally prefer to take
    # markup from the new document, and only do a best effort attempt
    # to keep markup from the old document; anything that we can't
    # resolve we throw away.  Also we try to put the deletes as close
    # to the location where we think they would have been -- because
    # we are only keeping the markup from the new document, it can be
    # fuzzy where in the new document the old text would have gone.
    # Again we just do a best effort attempt.
    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)

    # If deletes were inserted directly as <del> then we'd have an
    # invalid document at this point.  Instead we put in special
    # markers, and when the complete diffed document has been created
    # we try to move the deletes around and resolve any problems.
    cleanup_delete(result)

    return result


def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        yield from token.pre_tags
        if not equal or not token.hide_when_equal:
            yield token.html() + token.trailing_whitespace
        yield from token.post_tags


def merge_insert(ins_chunks, doc: list):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # Though we don't throw away unbalanced start/end tags
    # (we assume there is accompanying markup later or earlier in the
    # document), we only put <ins> around the balanced portion.

    # Legacy note: We make a choice here. Originally, we merged all sequences of
    # unbalanced tags together into separate start and end tag groups. Now, we look at
    # each sequence separately, leading to more fine-grained diffs but different
    # tag structure than before.

    item: tuple
    for balanced, marked_chunks in group_by_first_item(mark_unbalanced(ins_chunks)):
        chunks = [item[1] for item in marked_chunks]
        if balanced == 'b':
            if doc and not doc[-1].endswith(' '):
                # Fix up the case where the word before the insert didn't end with a space.
                doc[-1] += ' '
            doc.append('<ins>')
            doc.extend(chunks)
            if doc[-1].endswith(' '):
                # We move space outside of </ins>.
                doc[-1] = doc[-1][:-1]
            doc.append('</ins> ')
        else:
            # unmatched start or end
            doc.extend(chunks)


@cython.cfunc
def tag_name_of_chunk(chunk: str) -> str:
    i: cython.Py_ssize_t
    ch: cython.Py_UCS4

    if chunk[0] != '<':
        return ""

    start_pos = 1
    for i, ch in enumerate(chunk):
        if ch == '/':
            start_pos = 2
        elif ch == '>':
            return chunk[start_pos:i]
        elif ch.isspace():
            return chunk[start_pos:i]

    return chunk[start_pos:]

if not cython.compiled:
    # Avoid performance regression in Python due to string iteration.
    def tag_name_of_chunk(chunk: str) -> str:
        return chunk.split(None, 1)[0].strip('<>/')
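
# Illustrative behaviour (not part of the library source; both the compiled
# version and the pure-Python fallback agree on tag chunks):
#
#     >>> tag_name_of_chunk('<a href="x">')
#     'a'
#     >>> tag_name_of_chunk('</p>')
#     'p'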


# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    pass
class DEL_END:
    pass


def merge_delete(del_chunks, doc: list):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with markers to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""

    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)


def cleanup_delete(chunks: list):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location where it was originally located (e.g., moving a
    delete into a preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END)).
    """
    chunk_count = len(chunks)

    i: cython.Py_ssize_t
    del_start: cython.Py_ssize_t
    del_end: cython.Py_ssize_t
    shift_start_right: cython.Py_ssize_t
    shift_end_left: cython.Py_ssize_t
    unbalanced_start: cython.Py_ssize_t
    unbalanced_end: cython.Py_ssize_t
    pos: cython.Py_ssize_t
    start_pos: cython.Py_ssize_t
    chunk: str

    start_pos = 0
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END
        try:
            del_start = chunks.index(DEL_START, start_pos)
        except ValueError:
            # Nothing found, we've cleaned up the entire doc
            break
        else:
            del_end = chunks.index(DEL_END, del_start + 1)

        shift_end_left = shift_start_right = 0
        unbalanced_start = unbalanced_end = 0
        deleted_chunks = mark_unbalanced(chunks[del_start+1:del_end])

        # For unbalanced start tags at the beginning, find matching (non-deleted)
        # end tags after the current DEL_END and move the start tag outside.
        for balanced, del_chunk in deleted_chunks:
            if balanced != 'us':
                break
            unbalanced_start += 1
            unbalanced_start_name = tag_name_of_chunk(del_chunk)
            for i in range(del_end+1, chunk_count):
                if chunks[i] is DEL_START:
                    break
                chunk = chunks[i]
                if chunk[0] != '<' or chunk[1] == '/':
                    # Reached a word or closing tag.
                    break
                name = tag_name_of_chunk(chunk)
                if name == 'ins':
                    # Cannot move into an insert.
                    break
                assert name != 'del', f"Unexpected delete tag: {chunk!r}"
                if name != unbalanced_start_name:
                    # Avoid mixing in other start tags.
                    break
                # Exclude start tag to balance the end tag.
                shift_start_right += 1

        # For unbalanced end tags at the end, find matching (non-deleted)
        # start tags before the current DEL_START and move the end tag outside.
        for balanced, del_chunk in reversed(deleted_chunks):
            if balanced != 'ue':
                break
            unbalanced_end += 1
            unbalanced_end_name = tag_name_of_chunk(del_chunk)
            for i in range(del_start - 1, -1, -1):
                if chunks[i] is DEL_END:
                    break
                chunk = chunks[i]
                if chunk[0] == '<' and chunk[1] != '/':
                    # Reached an opening tag, can we go further?  Maybe not...
                    break
                name = tag_name_of_chunk(chunk)
                if name == 'ins' or name == 'del':
                    # Cannot move into an insert or delete.
                    break
                if name != unbalanced_end_name:
                    # Avoid mixing in other start tags.
                    break
                # Exclude end tag to balance the start tag.
                shift_end_left += 1

        """
        # This is what we do below in loops, spelled out using slicing and list copying:

        chunks[del_start - shift_end_left : del_end + shift_start_right + 1] = [
            *chunks[del_start + 1: del_start + shift_start_right + 1],
            '<del>',
            *chunks[del_start + unbalanced_start + 1 : del_end - unbalanced_end],
            '</del> ',
            *chunks[del_end - shift_end_left: del_end],
        ]

        new_del_end = del_end - 2 * shift_end_left
        assert chunks[new_del_end] == '</del> '
        del_end = new_del_end

        if new_del_start > 0 and not chunks[new_del_start - 1].endswith(' '):
            # Fix up case where the word before us didn't have a trailing space.
            chunks[new_del_start - 1] += ' '
        if new_del_end > 0 and chunks[new_del_end - 1].endswith(' '):
            # Move space outside of </del>.
            chunks[new_del_end - 1] = chunks[new_del_end - 1][:-1]
        """
        pos = del_start - shift_end_left
        # Move re-balanced start tags before the '<del>'.
        for i in range(del_start + 1, del_start + shift_start_right + 1):
            chunks[pos] = chunks[i]
            pos += 1
        if pos and not chunks[pos - 1].endswith(' '):
            # Fix up the case where the word before '<del>' didn't have a trailing space.
            chunks[pos - 1] += ' '
        chunks[pos] = '<del>'
        pos += 1
        # Copy only the balanced deleted content between '<del>' and '</del>'.
        for i in range(del_start + unbalanced_start + 1, del_end - unbalanced_end):
            chunks[pos] = chunks[i]
            pos += 1
        if chunks[pos - 1].endswith(' '):
            # Move trailing space outside of </del>.
            chunks[pos - 1] = chunks[pos - 1][:-1]
        chunks[pos] = '</del> '
        pos += 1
        # Move re-balanced end tags after the '</del>'.
        for i in range(del_end - shift_end_left, del_end):
            chunks[pos] = chunks[i]
            pos += 1
        # Adjust the length of the processed part in 'chunks'.
        del chunks[pos : del_end + shift_start_right + 1]
        start_pos = pos
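
# Illustrative behaviour (not part of the library source): the DEL_START /
# DEL_END sentinels inserted by merge_delete() are rewritten in place into
# <del> ... </del> markup, e.g. (output indicative):
#
#     >>> chunks = ['<p>', 'Hello ', DEL_START, 'old ', DEL_END, 'world', '</p>']
#     >>> cleanup_delete(chunks)
#     >>> chunks
#     ['<p>', 'Hello ', '<del>', 'old', '</del> ', 'world', '</p>']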


@cython.cfunc
def mark_unbalanced(chunks) -> list:
    tag_stack = []
    marked = []

    chunk: str
    parents: list

    for chunk in chunks:
        if not chunk.startswith('<'):
            marked.append(('b', chunk))
            continue

        name = tag_name_of_chunk(chunk)
        if name in empty_tags:
            marked.append(('b', chunk))
            continue

        if chunk[1] == '/':
            # closing tag found, unwind tag stack
            while tag_stack:
                start_name, start_chunk, parents = tag_stack.pop()
                if start_name == name:
                    # balanced tag closing, keep rest of stack intact
                    parents.append(('b', start_chunk))
                    parents.extend(marked)
                    parents.append(('b', chunk))
                    marked = parents
                    chunk = None
                    break
                else:
                    # unmatched start tag
                    parents.append(('us', start_chunk))
                    parents.extend(marked)
                    marked = parents

            if chunk is not None:
                # unmatched end tag left after clearing the stack
                marked.append(('ue', chunk))
        else:
            # new start tag found
            tag_stack.append((name, chunk, marked))
            marked = []

    # add any unbalanced start tags
    while tag_stack:
        _, start_chunk, parents = tag_stack.pop()
        parents.append(('us', start_chunk))
        parents.extend(marked)
        marked = parents

    return marked
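
# Illustrative behaviour (not part of the library source): each chunk is
# labelled 'b' (balanced), 'us' (unmatched start tag) or 'ue' (unmatched
# end tag), e.g. (output indicative):
#
#     >>> mark_unbalanced(['<b>', 'word ', '</b>', '</div>'])
#     [('b', '<b>'), ('b', 'word '), ('b', '</b>'), ('ue', '</div>')]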


class token(str):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, even though we do not want to treat the word as
    equivalent to a similar word that does not have a trailing
    space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = str.__new__(cls, text)

        obj.pre_tags = pre_tags if pre_tags is not None else []
        obj.post_tags = post_tags if post_tags is not None else []
        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r, %r)' % (
            str.__repr__(self), self.pre_tags, self.post_tags, self.trailing_whitespace)

    def html(self):
        return str(self)

class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=""):
        obj = token.__new__(cls, f"{type}: {data}",
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self


def tokenize(html, include_hrefs=True):
    """
    Parses the given HTML and returns token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Then we split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Finally re-joining them into token objects:
    return fixup_chunks(chunks)
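
# Illustrative behaviour (not part of the library source): words become
# token objects with the surrounding markup attached as pre_tags/post_tags,
# e.g. (output indicative):
#
#     >>> tokenize('<p>Hello World</p>')
#     [token('Hello', ['<p>'], [], ' '), token('World', [], ['</p>'], '')]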


def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)


_search_body = re.compile(r'<body.*?>', re.I|re.S).search
_search_end_body = re.compile(r'</body.*?>', re.I|re.S).search
_replace_ins_del = re.compile(r'</?(ins|del).*?>', re.I|re.S).sub

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _search_body(html)
    if match:
        html = html[match.end():]
    match = _search_end_body(html)
    if match:
        html = html[:match.start()]
    html = _replace_ins_del('', html)
    return html


def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]


def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)

            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
                tag_accum = []
                result.append(cur_word)
            continue

        if is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)

        elif is_start_tag(chunk):
            tag_accum.append(chunk)

        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result


# All the tags in HTML that don't require end tags:
empty_tags = cython.declare(frozenset, defs.empty_tags)

block_level_tags = cython.declare(frozenset, frozenset([
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    ]))

block_level_container_tags = cython.declare(frozenset, frozenset([
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    ]))

any_block_level_tag = cython.declare(tuple, tuple(sorted(
    block_level_tags | block_level_container_tags))
)


def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        yield from flatten_el(child, include_hrefs=include_hrefs)
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)
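
# Illustrative behaviour (not part of the library source): the element tree
# is flattened into start-tag, word and end-tag chunks, e.g. (output indicative):
#
#     >>> div = parse_html('<p>Hello <b>World</b></p>', cleanup=False)
#     >>> list(flatten_el(div, include_hrefs=True, skip_tag=True))
#     ['<p>', 'Hello ', '<b>', 'World', '</b>', '</p>']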

_find_words = re.compile(r'\S+(?:\s+|$)', re.U).findall

def split_words(text):
    """ Splits some text into words.  Includes trailing whitespace
    on each word when appropriate.  """
    if not text or not text.strip():
        return []

    words = _find_words(text)
    return words

_has_start_whitespace = re.compile(r'^[ \t\n\r]').match

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    attributes = ''.join([
        f' {name}="{html_escape(value)}"'
        for name, value in el.attrib.items()
    ])
    return f'<{el.tag}{attributes}>'

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    tail = el.tail
    extra = ' ' if tail and _has_start_whitespace(tail) else ''
    return f'</{el.tag}>{extra}'

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html

def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, str), (
        f"You should pass in an element, not a string like {el!r}")
    html = etree.tostring(el, method="html", encoding='unicode')
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html


@cython.cfunc
def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for el in list(doc.iter('ins', 'del')):
        if not _contains_block_level_tag(el):
            continue
        _move_el_inside_block(el, tag=el.tag)
        el.drop_tag()
        #_merge_element_contents(el)


@cython.cfunc
def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like <p>, <td>, etc.
    """
    for el in el.iter(*any_block_level_tag):
        return True
    return False


@cython.cfunc
def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
    and moves them inside any block-level tags.  """
    makeelement = el.makeelement
    for block_level_el in el.iter(*any_block_level_tag):
        if block_level_el is not el:
            break
    else:
        # No block-level tags in any child
        children_tag = makeelement(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(iter(el))
        el[:] = [children_tag]
        return

    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                tail_tag = makeelement(tag)
                tail_tag.text = child.tail
                child.tail = None
                child.addnext(tail_tag)
        else:
            child_tag = makeelement(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = makeelement(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)


def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text
    tail = el.tail
    if tail:
        if not len(el):
            text = (text or '') + tail
        else:
            el[-1].tail = (el[-1].tail or '') + tail
    index = parent.index(el)
    if text:
        previous = el.getprevious()
        if previous is None:
            parent.text = (parent.text or '') + text
        else:
            previous.tail = (previous.tail or '') + text
    parent[index:index+1] = el.getchildren()


@cython.final
@cython.cclass
class InsensitiveSequenceMatcher(SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    @cython.cfunc
    def get_matching_blocks(self) -> list:
        size: cython.Py_ssize_t = min(len(self.b), len(self.b))
        threshold: cython.Py_ssize_t = self.threshold
        threshold = min(threshold, size // 4)
        actual = SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]
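
# Illustrative note (not from the library source): equal runs no longer than
# min(threshold, size // 4) are filtered out, so tiny coincidental matches
# inside large changed regions do not fragment the <ins>/<del> output; the
# zero-length terminator block that SequenceMatcher always appends is kept.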


if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()