Initial commit

kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions


@@ -0,0 +1,10 @@
__doc__ = """Legacy interface to the BeautifulSoup HTML parser.
"""
__all__ = ["parse", "convert_tree"]
from .soupparser import convert_tree, parse as _parse
def parse(file, beautifulsoup=None, makeelement=None):
root = _parse(file, beautifulsoup=beautifulsoup, makeelement=makeelement)
return root.getroot()
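
A minimal usage sketch for the wrapper above (the file name is hypothetical; BeautifulSoup must be installed). Unlike soupparser.parse(), which returns an ElementTree, this legacy shim returns the root element directly:

    with open('page.html') as f:   # hypothetical local file
        root = parse(f)
    print(root.tag)                # typically 'html'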

File diff suppressed because it is too large


@@ -0,0 +1,86 @@
import optparse
import sys
import re
import os
from .diff import htmldiff
description = """\
"""
parser = optparse.OptionParser(
usage="%prog [OPTIONS] FILE1 FILE2\n"
"%prog --annotate [OPTIONS] INFO1 FILE1 INFO2 FILE2 ...",
description=description,
)
parser.add_option(
'-o', '--output',
metavar="FILE",
dest="output",
default="-",
help="File to write the difference to",
)
parser.add_option(
'-a', '--annotation',
action="store_true",
dest="annotation",
help="Do an annotation")
def main(args=None):
if args is None:
args = sys.argv[1:]
options, args = parser.parse_args(args)
if options.annotation:
return annotate(options, args)
if len(args) != 2:
print('Error: you must give two files')
parser.print_help()
sys.exit(1)
file1, file2 = args
input1 = read_file(file1)
input2 = read_file(file2)
body1 = split_body(input1)[1]
pre, body2, post = split_body(input2)
result = htmldiff(body1, body2)
result = pre + result + post
if options.output == '-':
if not result.endswith('\n'):
result += '\n'
sys.stdout.write(result)
else:
        # 'result' is a str at this point, so write the output in text mode:
        with open(options.output, 'w') as f:
            f.write(result)
def read_file(filename):
if filename == '-':
c = sys.stdin.read()
elif not os.path.exists(filename):
raise OSError(
"Input file %s does not exist" % filename)
    else:
        # Read in text mode: the body-splitting regexes below use str patterns.
        with open(filename, 'r') as f:
            c = f.read()
return c
body_start_re = re.compile(
r"<body.*?>", re.I|re.S)
body_end_re = re.compile(
r"</body.*?>", re.I|re.S)
def split_body(html):
pre = post = ''
match = body_start_re.search(html)
if match:
pre = html[:match.end()]
html = html[match.end():]
match = body_end_re.search(html)
if match:
post = html[match.start():]
html = html[:match.start()]
return pre, html, post
def annotate(options, args):
print("Not yet implemented")
sys.exit(1)
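
Since main() accepts an explicit argv list, the command can be driven from Python as well as from the shell; a sketch with hypothetical file names:

    # Diff two saved pages and write the combined result to diff.html.
    main(['old.html', 'new.html', '-o', 'diff.html'])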

File diff suppressed because it is too large


@@ -0,0 +1,100 @@
"""
Legacy module - don't use in new code!
html5lib now has its own proper implementation.
This module implements a tree builder for html5lib that generates lxml
html element trees. This module uses camelCase as it follows the
html5lib style guide.
"""
from html5lib.treebuilders import _base, etree as etree_builders
from lxml import html, etree
class DocumentType:
def __init__(self, name, publicId, systemId):
self.name = name
self.publicId = publicId
self.systemId = systemId
class Document:
def __init__(self):
self._elementTree = None
self.childNodes = []
def appendChild(self, element):
self._elementTree.getroot().addnext(element._element)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = None
commentClass = None
fragmentClass = Document
def __init__(self, *args, **kwargs):
html_builder = etree_builders.getETreeModule(html, fullTree=False)
etree_builder = etree_builders.getETreeModule(etree, fullTree=False)
self.elementClass = html_builder.Element
self.commentClass = etree_builder.Comment
_base.TreeBuilder.__init__(self, *args, **kwargs)
def reset(self):
_base.TreeBuilder.reset(self)
self.rootInserted = False
self.initialComments = []
self.doctype = None
def getDocument(self):
return self.document._elementTree
def getFragment(self):
fragment = []
element = self.openElements[0]._element
if element.text:
fragment.append(element.text)
fragment.extend(element.getchildren())
if element.tail:
fragment.append(element.tail)
return fragment
def insertDoctype(self, name, publicId, systemId):
doctype = self.doctypeClass(name, publicId, systemId)
self.doctype = doctype
def insertComment(self, data, parent=None):
if not self.rootInserted:
self.initialComments.append(data)
else:
_base.TreeBuilder.insertComment(self, data, parent)
def insertRoot(self, name):
buf = []
if self.doctype and self.doctype.name:
buf.append('<!DOCTYPE %s' % self.doctype.name)
if self.doctype.publicId is not None or self.doctype.systemId is not None:
buf.append(' PUBLIC "%s" "%s"' % (self.doctype.publicId,
self.doctype.systemId))
buf.append('>')
buf.append('<html></html>')
root = html.fromstring(''.join(buf))
# Append the initial comments:
for comment in self.initialComments:
root.addprevious(etree.Comment(comment))
# Create the root document and add the ElementTree to it
self.document = self.documentClass()
self.document._elementTree = root.getroottree()
# Add the root element to the internal child/open data structures
root_element = self.elementClass(name)
root_element._element = root
self.document.childNodes.append(root_element)
self.openElements.append(root_element)
self.rootInserted = True
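
A sketch of how this builder was historically plugged into html5lib (assuming an old html5lib release that still ships treebuilders._base; new code should use lxml.html.html5parser instead):

    import html5lib
    parser = html5lib.HTMLParser(tree=TreeBuilder)
    tree = parser.parse('<!DOCTYPE html><p>Hi</p>')  # returns the lxml ElementTree
    root = tree.getroot()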


@@ -0,0 +1,56 @@
try:
    from collections.abc import MutableSet
except ImportError:
    # Fallback for very old Pythons where MutableSet lived in 'collections'.
    from collections import MutableSet
class SetMixin(MutableSet):
"""
Mix-in for sets. You must define __iter__, add, remove
"""
def __len__(self):
length = 0
for item in self:
length += 1
return length
def __contains__(self, item):
for has_item in self:
if item == has_item:
return True
return False
issubset = MutableSet.__le__
issuperset = MutableSet.__ge__
union = MutableSet.__or__
intersection = MutableSet.__and__
difference = MutableSet.__sub__
symmetric_difference = MutableSet.__xor__
def copy(self):
return set(self)
def update(self, other):
self |= other
def intersection_update(self, other):
self &= other
def difference_update(self, other):
self -= other
def symmetric_difference_update(self, other):
self ^= other
def discard(self, item):
try:
self.remove(item)
except KeyError:
pass
@classmethod
def _from_iterable(cls, it):
return set(it)
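
A concrete illustration of the contract (ListSet is hypothetical): a subclass supplies __iter__, add, and remove, and the mix-in derives everything else:

    class ListSet(SetMixin):
        """A small list-backed set."""
        def __init__(self, items=()):
            self._items = []
            for item in items:
                self.add(item)
        def __iter__(self):
            return iter(self._items)
        def add(self, item):
            if item not in self._items:
                self._items.append(item)
        def remove(self, item):
            try:
                self._items.remove(item)
            except ValueError:
                raise KeyError(item)

    s = ListSet(['a', 'b'])
    s.update(['b', 'c'])         # provided by SetMixin
    sorted(s)                    # ['a', 'b', 'c']
    s.issubset({'a', 'b', 'c'})  # True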


@@ -0,0 +1,173 @@
# --------------------------------------------------------------------
# The ElementTree toolkit is
# Copyright (c) 1999-2004 by Fredrik Lundh
# --------------------------------------------------------------------
"""
A set of HTML generator tags for building HTML documents.
Usage::
>>> from lxml.html.builder import *
>>> html = HTML(
... HEAD( TITLE("Hello World") ),
... BODY( CLASS("main"),
... H1("Hello World !")
... )
... )
>>> import lxml.etree
>>> print(lxml.etree.tostring(html, pretty_print=True).decode())
<html>
<head>
<title>Hello World</title>
</head>
<body class="main">
<h1>Hello World !</h1>
</body>
</html>
"""
from lxml.builder import ElementMaker
from lxml.html import html_parser
E = ElementMaker(makeelement=html_parser.makeelement)
# elements
A = E.a #: anchor
ABBR = E.abbr #: abbreviated form (e.g., WWW, HTTP, etc.)
ACRONYM = E.acronym #: acronym
ADDRESS = E.address #: information on author
APPLET = E.applet #: Java applet (DEPRECATED)
AREA = E.area #: client-side image map area
ARTICLE = E.article #: self-contained article
ASIDE = E.aside #: indirectly-related content
AUDIO = E.audio #: embedded audio file
B = E.b #: bold text style
BASE = E.base #: document base URI
BASEFONT = E.basefont #: base font size (DEPRECATED)
BDI = E.bdi #: isolate bidirectional text
BDO = E.bdo #: I18N BiDi over-ride
BIG = E.big #: large text style
BLOCKQUOTE = E.blockquote #: long quotation
BODY = E.body #: document body
BR = E.br #: forced line break
BUTTON = E.button #: push button
CANVAS = E.canvas #: scriptable graphics container
CAPTION = E.caption #: table caption
CENTER = E.center #: shorthand for DIV align=center (DEPRECATED)
CITE = E.cite #: citation
CODE = E.code #: computer code fragment
COL = E.col #: table column
COLGROUP = E.colgroup #: table column group
DATA = E.data #: machine-readable translation
DATALIST = E.datalist #: list of options for an input
DD = E.dd #: definition description
DEL = getattr(E, 'del') #: deleted text
DETAILS = E.details #: expandable section
DFN = E.dfn #: instance definition
DIALOG = E.dialog #: dialog box
DIR = E.dir #: directory list (DEPRECATED)
DIV = E.div #: generic language/style container
DL = E.dl #: definition list
DT = E.dt #: definition term
EM = E.em #: emphasis
EMBED = E.embed #: embedded external content
FIELDSET = E.fieldset #: form control group
FIGCAPTION = E.figcaption #: figure caption
FIGURE = E.figure #: self-contained, possibly-captioned content
FONT = E.font #: local change to font (DEPRECATED)
FOOTER = E.footer #: footer for nearest ancestor
FORM = E.form #: interactive form
FRAME = E.frame #: subwindow
FRAMESET = E.frameset #: window subdivision
H1 = E.h1 #: heading
H2 = E.h2 #: heading
H3 = E.h3 #: heading
H4 = E.h4 #: heading
H5 = E.h5 #: heading
H6 = E.h6 #: heading
HEAD = E.head #: document head
HEADER = E.header #: heading content
HGROUP = E.hgroup #: heading group
HR = E.hr #: horizontal rule
HTML = E.html #: document root element
I = E.i #: italic text style
IFRAME = E.iframe #: inline subwindow
IMG = E.img #: Embedded image
INPUT = E.input #: form control
INS = E.ins #: inserted text
ISINDEX = E.isindex #: single line prompt (DEPRECATED)
KBD = E.kbd #: text to be entered by the user
LABEL = E.label #: form field label text
LEGEND = E.legend #: fieldset legend
LI = E.li #: list item
LINK = E.link #: a media-independent link
MAIN = E.main #: main content
MAP = E.map #: client-side image map
MARK = E.mark #: marked/highlighted text
MARQUEE = E.marquee #: scrolling text
MENU = E.menu #: menu list (DEPRECATED)
META = E.meta #: generic metainformation
METER = E.meter #: numerical value display
NAV = E.nav #: navigation section
NOBR = E.nobr #: prevent wrapping
NOFRAMES = E.noframes #: alternate content container for non frame-based rendering
NOSCRIPT = E.noscript #: alternate content container for non script-based rendering
OBJECT = E.object #: generic embedded object
OL = E.ol #: ordered list
OPTGROUP = E.optgroup #: option group
OPTION = E.option #: selectable choice
OUTPUT = E.output #: result of a calculation
P = E.p #: paragraph
PARAM = E.param #: named property value
PICTURE = E.picture #: picture with multiple sources
PORTAL = E.portal #: embedded preview
PRE = E.pre #: preformatted text
PROGRESS = E.progress #: progress bar
Q = E.q #: short inline quotation
RB = E.rb #: ruby base text
RP = E.rp #: ruby parentheses
RT = E.rt #: ruby text component
RTC = E.rtc #: ruby semantic annotation
RUBY = E.ruby #: ruby annotations
S = E.s #: strike-through text style (DEPRECATED)
SAMP = E.samp #: sample program output, scripts, etc.
SCRIPT = E.script #: script statements
SEARCH = E.search #: set of form controls for a search
SECTION = E.section #: generic standalone section
SELECT = E.select #: option selector
SLOT = E.slot #: placeholder for JS use
SMALL = E.small #: small text style
SOURCE = E.source #: source for picture/audio/video element
SPAN = E.span #: generic language/style container
STRIKE = E.strike #: strike-through text (DEPRECATED)
STRONG = E.strong #: strong emphasis
STYLE = E.style #: style info
SUB = E.sub #: subscript
SUMMARY = E.summary #: summary for <details>
SUP = E.sup #: superscript
TABLE = E.table #: table
TBODY = E.tbody #: table body
TD = E.td #: table data cell
TEMPLATE = E.template #: fragment for JS use
TEXTAREA = E.textarea #: multi-line text field
TFOOT = E.tfoot #: table footer
TH = E.th #: table header cell
THEAD = E.thead #: table header
TIME = E.time #: date/time
TITLE = E.title #: document title
TR = E.tr #: table row
TRACK = E.track #: audio/video track
TT = E.tt #: teletype or monospaced text style
U = E.u #: underlined text style (DEPRECATED)
UL = E.ul #: unordered list
VAR = E.var #: instance of a variable or program argument
VIDEO = E.video #: embedded video file
WBR = E.wbr #: word break
# attributes (only reserved words are included here)
ATTR = dict
def CLASS(v): return {'class': v}
def FOR(v): return {'for': v}
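
CLASS and FOR exist because "class" and "for" are reserved words in Python; attribute dicts can be mixed freely with children and keyword arguments. A short sketch (names hypothetical):

    label = LABEL(FOR('email'), 'Email')
    field = INPUT(ATTR(type='email', name='email'), CLASS('required'))
    form = FORM(label, field, action='/subscribe', method='post')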


@@ -0,0 +1,21 @@
# cython: language_level=3str
"""Backward-compatibility module for lxml_html_clean"""
try:
from lxml_html_clean import *
__all__ = [
"clean_html",
"clean",
"Cleaner",
"autolink",
"autolink_html",
"word_break",
"word_break_html",
]
except ImportError:
raise ImportError(
"lxml.html.clean module is now a separate project lxml_html_clean.\n"
"Install lxml[html_clean] or lxml_html_clean directly."
) from None


@@ -0,0 +1,135 @@
# FIXME: this should all be confirmed against what a DTD says
# (probably in a test; this may not match the DTD exactly, but we
# should document just how it differs).
"""
Data taken from https://www.w3.org/TR/html401/index/elements.html
and https://html.spec.whatwg.org/multipage/syntax.html#elements-2
for html5_tags.
"""
empty_tags = frozenset([
'area', 'base', 'basefont', 'br', 'col', 'embed', 'frame', 'hr',
'img', 'input', 'isindex', 'link', 'meta', 'param', 'source', 'track', 'wbr'])
deprecated_tags = frozenset([
'applet', 'basefont', 'center', 'dir', 'font', 'isindex',
'menu', 's', 'strike', 'u'])
# archive actually takes a space-separated list of URIs
link_attrs = frozenset([
'action', 'archive', 'background', 'cite', 'classid',
'codebase', 'data', 'href', 'longdesc', 'profile', 'src',
'usemap',
# Not standard:
'dynsrc', 'lowsrc',
# HTML5 formaction
'formaction'
])
# Not in the HTML 4 spec:
# onerror, onresize
event_attrs = frozenset([
'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror',
'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload',
'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover',
'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit',
'onunload',
])
safe_attrs = frozenset([
'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff',
'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan',
'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id',
'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
# From http://htmlhelp.com/reference/html40/olist.html
top_level_tags = frozenset([
'html', 'head', 'body', 'frameset',
])
head_tags = frozenset([
'base', 'isindex', 'link', 'meta', 'script', 'style', 'title',
])
general_block_tags = frozenset([
'address',
'blockquote',
'center',
'del',
'div',
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'hr',
'ins',
'isindex',
'noscript',
'p',
'pre',
])
list_tags = frozenset([
'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul',
])
table_tags = frozenset([
'table', 'caption', 'colgroup', 'col',
'thead', 'tfoot', 'tbody', 'tr', 'td', 'th',
])
# just this one from
# http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm
block_tags = general_block_tags | list_tags | table_tags | frozenset([
# Partial form tags
'fieldset', 'form', 'legend', 'optgroup', 'option',
])
form_tags = frozenset([
'form', 'button', 'fieldset', 'legend', 'input', 'label',
'select', 'optgroup', 'option', 'textarea',
])
special_inline_tags = frozenset([
'a', 'applet', 'basefont', 'bdo', 'br', 'embed', 'font', 'iframe',
'img', 'map', 'area', 'object', 'param', 'q', 'script',
'span', 'sub', 'sup',
])
phrase_tags = frozenset([
'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em',
'ins', 'kbd', 'samp', 'strong', 'var',
])
font_style_tags = frozenset([
'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u',
])
frame_tags = frozenset([
'frameset', 'frame', 'noframes',
])
html5_tags = frozenset([
'article', 'aside', 'audio', 'canvas', 'command', 'datalist',
'details', 'embed', 'figcaption', 'figure', 'footer', 'header',
'hgroup', 'keygen', 'mark', 'math', 'meter', 'nav', 'output',
'progress', 'rp', 'rt', 'ruby', 'section', 'source', 'summary',
'svg', 'time', 'track', 'video', 'wbr'
])
# These tags aren't standard
nonstandard_tags = frozenset(['blink', 'marquee'])
tags = (top_level_tags | head_tags | general_block_tags | list_tags
| table_tags | form_tags | special_inline_tags | phrase_tags
| font_style_tags | nonstandard_tags | html5_tags)
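
These frozensets are intended for fast membership tests in serializers and cleaners; a sketch of typical use elsewhere (is_void is hypothetical):

    from lxml.html import defs

    def is_void(tag):
        # Void elements such as <br> and <img> never take a closing tag.
        return tag in defs.empty_tags

    is_void('br')           # True
    is_void('div')          # False
    'blink' in defs.tags    # True -- nonstandard tags are still listed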


@@ -0,0 +1,972 @@
# cython: language_level=3
try:
import cython
except ImportError:
    class fake_cython:
        compiled = False
        def cfunc(self, func): return func
        def cclass(self, cls): return cls
        # 'final' is used as a class decorator below; provide a no-op fallback.
        def final(self, cls): return cls
        def declare(self, _, value): return value
        def __getattr__(self, type_name): return "object"
cython = fake_cython()
try:
from . import _difflib as difflib
import inspect
if inspect.isfunction(difflib.get_close_matches):
raise ImportError(
"Embedded difflib is not compiled to a fast binary, using the stdlib instead.")
from cython.cimports.lxml.html._difflib import SequenceMatcher
except ImportError:
import difflib
if not cython.compiled:
from difflib import SequenceMatcher
import itertools
import functools
import operator
import re
from lxml import etree
from lxml.html import fragment_fromstring
from . import defs
__all__ = ['html_annotate', 'htmldiff']
group_by_first_item = functools.partial(itertools.groupby, key=operator.itemgetter(0))
############################################################
## Annotation
############################################################
@cython.cfunc
def html_escape(text: str, _escapes: tuple = ('&amp;', '&lt;', '&gt;', '&quot;', '&#x27;')) -> str:
# Not so slow compiled version of 'html.escape()'.
# Most of the time, we replace little to nothing, so use a fast decision what needs to be done.
ch: cython.Py_UCS4
replace: cython.char[5] = [False] * 5
for ch in text:
replace[0] |= ch == '&'
replace[1] |= ch == '<'
replace[2] |= ch == '>'
replace[3] |= ch == '"'
replace[4] |= ch == "'"
for i in range(5):
if replace[i]:
text = text.replace('&<>"\''[i], _escapes[i])
return text
if not cython.compiled:
from html import escape as html_escape
def default_markup(text, version):
return '<span title="%s">%s</span>' % (
html_escape(version), text)
def html_annotate(doclist, markup=default_markup):
"""
doclist should be ordered from oldest to newest, like::
>>> version1 = 'Hello World'
>>> version2 = 'Goodbye World'
>>> print(html_annotate([(version1, 'version 1'),
... (version2, 'version 2')]))
<span title="version 2">Goodbye</span> <span title="version 1">World</span>
The documents must be *fragments* (str/UTF8 or unicode), not
complete documents
The markup argument is a function to markup the spans of words.
This function is called like markup('Hello', 'version 2'), and
returns HTML. The first argument is text and never includes any
markup. The default uses a span with a title:
>>> print(default_markup('Some Text', 'by Joe'))
<span title="by Joe">Some Text</span>
"""
# The basic strategy we have is to split the documents up into
# logical tokens (which are words with attached markup). We then
# do diffs of each of the versions to track when a token first
# appeared in the document; the annotation attached to the token
# is the version where it first appeared.
tokenlist = [tokenize_annotated(doc, version)
for doc, version in doclist]
cur_tokens = tokenlist[0]
for tokens in tokenlist[1:]:
html_annotate_merge_annotations(cur_tokens, tokens)
cur_tokens = tokens
# After we've tracked all the tokens, we can combine spans of text
# that are adjacent and have the same annotation
cur_tokens = compress_tokens(cur_tokens)
# And finally add markup
result = markup_serialize_tokens(cur_tokens, markup)
return ''.join(result).strip()
def tokenize_annotated(doc, annotation):
"""Tokenize a document and add an annotation attribute to each token
"""
tokens = tokenize(doc, include_hrefs=False)
for tok in tokens:
tok.annotation = annotation
return tokens
def html_annotate_merge_annotations(tokens_old, tokens_new):
"""Merge the annotations from tokens_old into tokens_new, when the
tokens in the new document already existed in the old document.
"""
s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
commands = s.get_opcodes()
for command, i1, i2, j1, j2 in commands:
if command == 'equal':
eq_old = tokens_old[i1:i2]
eq_new = tokens_new[j1:j2]
copy_annotations(eq_old, eq_new)
def copy_annotations(src, dest):
"""
Copy annotations from the tokens listed in src to the tokens in dest
"""
assert len(src) == len(dest)
for src_tok, dest_tok in zip(src, dest):
dest_tok.annotation = src_tok.annotation
def compress_tokens(tokens):
"""
Combine adjacent tokens when there is no HTML between the tokens,
and they share an annotation
"""
result = [tokens[0]]
for tok in tokens[1:]:
if (not tok.pre_tags and
not result[-1].post_tags and
result[-1].annotation == tok.annotation):
compress_merge_back(result, tok)
else:
result.append(tok)
return result
@cython.cfunc
def compress_merge_back(tokens: list, tok):
""" Merge tok into the last element of tokens (modifying the list of
tokens in-place). """
last = tokens[-1]
if type(last) is not token or type(tok) is not token:
tokens.append(tok)
else:
text = last + last.trailing_whitespace + tok
merged = token(text,
pre_tags=last.pre_tags,
post_tags=tok.post_tags,
trailing_whitespace=tok.trailing_whitespace)
merged.annotation = last.annotation
tokens[-1] = merged
def markup_serialize_tokens(tokens, markup_func):
"""
Serialize the list of tokens into a list of text chunks, calling
markup_func around text to add annotations.
"""
for token in tokens:
yield from token.pre_tags
html = token.html()
html = markup_func(html, token.annotation) + token.trailing_whitespace
yield html
yield from token.post_tags
############################################################
## HTML Diffs
############################################################
def htmldiff(old_html, new_html):
## FIXME: this should take parsed documents too, and use their body
## or other content.
""" Do a diff of the old and new document. The documents are HTML
*fragments* (str/UTF8 or unicode), they are not complete documents
(i.e., no <html> tag).
Returns HTML with <ins> and <del> tags added around the
appropriate text.
Markup is generally ignored, with the markup from new_html
preserved, and possibly some markup from old_html (though it is
considered acceptable to lose some of the old markup). Only the
words in the HTML are diffed. The exception is <img> tags, which
are treated like words, and the href attribute of <a> tags, which
are noted inside the tag itself when there are changes.
"""
old_html_tokens = tokenize(old_html)
new_html_tokens = tokenize(new_html)
result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
return fixup_ins_del_tags(result)
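# A rough usage sketch of htmldiff() (exact whitespace may vary):
#
#   >>> htmldiff('<p>Hello world</p>', '<p>Hello there, world</p>')
#   '<p>Hello <ins>there,</ins> world</p>'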
def htmldiff_tokens(html1_tokens, html2_tokens):
""" Does a diff on the tokens themselves, returning a list of text
chunks (not tokens).
"""
# There are several passes as we do the differences. The tokens
# isolate the portion of the content we care to diff; difflib does
# all the actual hard work at that point.
#
# Then we must create a valid document from pieces of both the old
# document and the new document. We generally prefer to take
# markup from the new document, and only do a best effort attempt
# to keep markup from the old document; anything that we can't
# resolve we throw away. Also we try to put the deletes as close
# to the location where we think they would have been -- because
# we are only keeping the markup from the new document, it can be
# fuzzy where in the new document the old text would have gone.
# Again we just do a best effort attempt.
s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
commands = s.get_opcodes()
result = []
for command, i1, i2, j1, j2 in commands:
if command == 'equal':
result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
continue
if command == 'insert' or command == 'replace':
ins_tokens = expand_tokens(html2_tokens[j1:j2])
merge_insert(ins_tokens, result)
if command == 'delete' or command == 'replace':
del_tokens = expand_tokens(html1_tokens[i1:i2])
merge_delete(del_tokens, result)
# If deletes were inserted directly as <del> then we'd have an
# invalid document at this point. Instead we put in special
# markers, and when the complete diffed document has been created
# we try to move the deletes around and resolve any problems.
cleanup_delete(result)
return result
def expand_tokens(tokens, equal=False):
"""Given a list of tokens, return a generator of the chunks of
text for the data in the tokens.
"""
for token in tokens:
yield from token.pre_tags
if not equal or not token.hide_when_equal:
yield token.html() + token.trailing_whitespace
yield from token.post_tags
def merge_insert(ins_chunks, doc: list):
""" doc is the already-handled document (as a list of text chunks);
here we add <ins>ins_chunks</ins> to the end of that. """
# Though we don't throw away unbalanced start/end tags
# (we assume there is accompanying markup later or earlier in the
# document), we only put <ins> around the balanced portion.
# Legacy note: We make a choice here. Originally, we merged all sequences of
# unbalanced tags together into separate start and end tag groups. Now, we look at
# each sequence separately, leading to more fine-grained diffs but different
# tag structure than before.
item: tuple
for balanced, marked_chunks in group_by_first_item(mark_unbalanced(ins_chunks)):
chunks = [item[1] for item in marked_chunks]
if balanced == 'b':
if doc and not doc[-1].endswith(' '):
# Fix up the case where the word before the insert didn't end with a space.
doc[-1] += ' '
doc.append('<ins>')
doc.extend(chunks)
if doc[-1].endswith(' '):
# We move space outside of </ins>.
doc[-1] = doc[-1][:-1]
doc.append('</ins> ')
else:
# unmatched start or end
doc.extend(chunks)
@cython.cfunc
def tag_name_of_chunk(chunk: str) -> str:
i: cython.Py_ssize_t
ch: cython.Py_UCS4
if chunk[0] != '<':
return ""
start_pos = 1
for i, ch in enumerate(chunk):
if ch == '/':
start_pos = 2
elif ch == '>':
return chunk[start_pos:i]
elif ch.isspace():
return chunk[start_pos:i]
return chunk[start_pos:]
if not cython.compiled:
# Avoid performance regression in Python due to string iteration.
def tag_name_of_chunk(chunk: str) -> str:
return chunk.split(None, 1)[0].strip('<>/')
# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
pass
class DEL_END:
pass
def merge_delete(del_chunks, doc: list):
""" Adds the text chunks in del_chunks to the document doc (another
list of text chunks) with marker to show it is a delete.
cleanup_delete later resolves these markers into <del> tags."""
doc.append(DEL_START)
doc.extend(del_chunks)
doc.append(DEL_END)
def cleanup_delete(chunks: list):
""" Cleans up any DEL_START/DEL_END markers in the document, replacing
them with <del></del>. To do this while keeping the document
valid, it may need to drop some tags (either start or end tags).
It may also move the del into adjacent tags to try to move it to a
similar location where it was originally located (e.g., moving a
delete into preceding <div> tag, if the del looks like (DEL_START,
'Text</div>', DEL_END)
"""
chunk_count = len(chunks)
i: cython.Py_ssize_t
del_start: cython.Py_ssize_t
del_end: cython.Py_ssize_t
shift_start_right: cython.Py_ssize_t
shift_end_left: cython.Py_ssize_t
unbalanced_start: cython.Py_ssize_t
unbalanced_end: cython.Py_ssize_t
pos: cython.Py_ssize_t
start_pos: cython.Py_ssize_t
chunk: str
start_pos = 0
while 1:
# Find a pending DEL_START/DEL_END, splitting the document
# into stuff-preceding-DEL_START, stuff-inside, and
# stuff-following-DEL_END
try:
del_start = chunks.index(DEL_START, start_pos)
except ValueError:
# Nothing found, we've cleaned up the entire doc
break
else:
del_end = chunks.index(DEL_END, del_start + 1)
shift_end_left = shift_start_right = 0
unbalanced_start = unbalanced_end = 0
deleted_chunks = mark_unbalanced(chunks[del_start+1:del_end])
# For unbalanced start tags at the beginning, find matching (non-deleted)
# end tags after the current DEL_END and move the start tag outside.
for balanced, del_chunk in deleted_chunks:
if balanced != 'us':
break
unbalanced_start += 1
unbalanced_start_name = tag_name_of_chunk(del_chunk)
for i in range(del_end+1, chunk_count):
if chunks[i] is DEL_START:
break
chunk = chunks[i]
if chunk[0] != '<' or chunk[1] == '/':
# Reached a word or closing tag.
break
name = tag_name_of_chunk(chunk)
if name == 'ins':
# Cannot move into an insert.
break
assert name != 'del', f"Unexpected delete tag: {chunk!r}"
if name != unbalanced_start_name:
# Avoid mixing in other start tags.
break
# Exclude start tag to balance the end tag.
shift_start_right += 1
# For unbalanced end tags at the end, find matching (non-deleted)
# start tags before the current DEL_START and move the end tag outside.
for balanced, del_chunk in reversed(deleted_chunks):
if balanced != 'ue':
break
unbalanced_end += 1
unbalanced_end_name = tag_name_of_chunk(del_chunk)
for i in range(del_start - 1, -1, -1):
if chunks[i] is DEL_END:
break
chunk = chunks[i]
if chunk[0] == '<' and chunk[1] != '/':
# Reached an opening tag, can we go further? Maybe not...
break
name = tag_name_of_chunk(chunk)
if name == 'ins' or name == 'del':
# Cannot move into an insert or delete.
break
if name != unbalanced_end_name:
# Avoid mixing in other start tags.
break
# Exclude end tag to balance the start tag.
shift_end_left += 1
"""
# This is what we do below in loops, spelled out using slicing and list copying:
chunks[del_start - shift_end_left : del_end + shift_start_right + 1] = [
*chunks[del_start + 1: del_start + shift_start_right + 1],
'<del>',
*chunks[del_start + unbalanced_start + 1 : del_end - unbalanced_end],
'</del> ',
*chunks[del_end - shift_end_left: del_end],
]
new_del_end = del_end - 2 * shift_end_left
assert chunks[new_del_end] == '</del> '
del_end = new_del_end
if new_del_start > 0 and not chunks[new_del_start - 1].endswith(' '):
# Fix up case where the word before us didn't have a trailing space.
chunks[new_del_start - 1] += ' '
if new_del_end > 0 and chunks[new_del_end - 1].endswith(' '):
# Move space outside of </del>.
chunks[new_del_end - 1] = chunks[new_del_end - 1][:-1]
"""
pos = del_start - shift_end_left
# Move re-balanced start tags before the '<del>'.
for i in range(del_start + 1, del_start + shift_start_right + 1):
chunks[pos] = chunks[i]
pos += 1
if pos and not chunks[pos - 1].endswith(' '):
# Fix up the case where the word before '<del>' didn't have a trailing space.
chunks[pos - 1] += ' '
chunks[pos] = '<del>'
pos += 1
# Copy only the balanced deleted content between '<del>' and '</del>'.
for i in range(del_start + unbalanced_start + 1, del_end - unbalanced_end):
chunks[pos] = chunks[i]
pos += 1
if chunks[pos - 1].endswith(' '):
# Move trailing space outside of </del>.
chunks[pos - 1] = chunks[pos - 1][:-1]
chunks[pos] = '</del> '
pos += 1
# Move re-balanced end tags after the '</del>'.
for i in range(del_end - shift_end_left, del_end):
chunks[pos] = chunks[i]
pos += 1
# Adjust the length of the processed part in 'chunks'.
del chunks[pos : del_end + shift_start_right + 1]
start_pos = pos
@cython.cfunc
def mark_unbalanced(chunks) -> list:
tag_stack = []
marked = []
chunk: str
parents: list
for chunk in chunks:
if not chunk.startswith('<'):
marked.append(('b', chunk))
continue
name = tag_name_of_chunk(chunk)
if name in empty_tags:
marked.append(('b', chunk))
continue
if chunk[1] == '/':
# closing tag found, unwind tag stack
while tag_stack:
start_name, start_chunk, parents = tag_stack.pop()
if start_name == name:
# balanced tag closing, keep rest of stack intact
parents.append(('b', start_chunk))
parents.extend(marked)
parents.append(('b', chunk))
marked = parents
chunk = None
break
else:
# unmatched start tag
parents.append(('us', start_chunk))
parents.extend(marked)
marked = parents
if chunk is not None:
# unmatched end tag left after clearing the stack
marked.append(('ue', chunk))
else:
# new start tag found
tag_stack.append((name, chunk, marked))
marked = []
# add any unbalanced start tags
while tag_stack:
_, start_chunk, parents = tag_stack.pop()
parents.append(('us', start_chunk))
parents.extend(marked)
marked = parents
return marked
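# Example: mark_unbalanced(['<b>', 'Hi', '</b>', '</i>']) yields
# [('b', '<b>'), ('b', 'Hi'), ('b', '</b>'), ('ue', '</i>')] --
# the <b>...</b> pair is balanced ('b'), the stray </i> is an unmatched
# end tag ('ue'), and unmatched start tags would be marked 'us'.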
class token(str):
""" Represents a diffable token, generally a word that is displayed to
the user. Opening tags are attached to this token when they are
adjacent (pre_tags) and closing tags that follow the word
(post_tags). Some exceptions occur when there are empty tags
adjacent to a word, so there may be close tags in pre_tags, or
open tags in post_tags.
We also keep track of whether the word was originally followed by
whitespace, even though we do not want to treat the word as
equivalent to a similar word that does not have a trailing
space."""
# When this is true, the token will be eliminated from the
# displayed diff if no change has occurred:
hide_when_equal = False
def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
obj = str.__new__(cls, text)
obj.pre_tags = pre_tags if pre_tags is not None else []
obj.post_tags = post_tags if post_tags is not None else []
obj.trailing_whitespace = trailing_whitespace
return obj
def __repr__(self):
return 'token(%s, %r, %r, %r)' % (
str.__repr__(self), self.pre_tags, self.post_tags, self.trailing_whitespace)
def html(self):
return str(self)
class tag_token(token):
""" Represents a token that is actually a tag. Currently this is just
the <img> tag, which takes up visible space just like a word but
is only represented in a document by a tag. """
def __new__(cls, tag, data, html_repr, pre_tags=None,
post_tags=None, trailing_whitespace=""):
obj = token.__new__(cls, f"{type}: {data}",
pre_tags=pre_tags,
post_tags=post_tags,
trailing_whitespace=trailing_whitespace)
obj.tag = tag
obj.data = data
obj.html_repr = html_repr
return obj
def __repr__(self):
return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
self.tag,
self.data,
self.html_repr,
self.pre_tags,
self.post_tags,
self.trailing_whitespace)
def html(self):
return self.html_repr
class href_token(token):
""" Represents the href in an anchor tag. Unlike other words, we only
show the href when it changes. """
hide_when_equal = True
def html(self):
return ' Link: %s' % self
def tokenize(html, include_hrefs=True):
"""
Parse the given HTML and returns token objects (words with attached tags).
This parses only the content of a page; anything in the head is
ignored, and the <head> and <body> elements are themselves
optional. The content is then parsed by lxml, which ensures the
validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).
<ins> and <del> tags are also eliminated from the document, as
that gets confusing.
If include_hrefs is true, then the href attribute of <a> tags is
included as a special kind of diffable token."""
if etree.iselement(html):
body_el = html
else:
body_el = parse_html(html, cleanup=True)
# Then we split the document into text chunks for each tag, word, and end tag:
chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
# Finally re-joining them into token objects:
return fixup_chunks(chunks)
def parse_html(html, cleanup=True):
"""
Parses an HTML fragment, returning an lxml element. Note that the HTML will be
wrapped in a <div> tag that was not in the original document.
If cleanup is true, make sure there's no <head> or <body>, and get
rid of any <ins> and <del> tags.
"""
if cleanup:
# This removes any extra markup or structure like <head>:
html = cleanup_html(html)
return fragment_fromstring(html, create_parent=True)
_search_body = re.compile(r'<body.*?>', re.I|re.S).search
_search_end_body = re.compile(r'</body.*?>', re.I|re.S).search
_replace_ins_del = re.compile(r'</?(ins|del).*?>', re.I|re.S).sub
def cleanup_html(html):
""" This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
Also <ins> and <del> tags are removed. """
match = _search_body(html)
if match:
html = html[match.end():]
match = _search_end_body(html)
if match:
html = html[:match.start()]
html = _replace_ins_del('', html)
return html
def split_trailing_whitespace(word):
"""
This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
"""
stripped_length = len(word.rstrip())
return word[0:stripped_length], word[stripped_length:]
def fixup_chunks(chunks):
"""
This function takes a list of chunks and produces a list of tokens.
"""
tag_accum = []
cur_word = None
result = []
for chunk in chunks:
if isinstance(chunk, tuple):
if chunk[0] == 'img':
src = chunk[1]
tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
cur_word = tag_token('img', src, html_repr=tag,
pre_tags=tag_accum,
trailing_whitespace=trailing_whitespace)
tag_accum = []
result.append(cur_word)
elif chunk[0] == 'href':
href = chunk[1]
cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
tag_accum = []
result.append(cur_word)
continue
if is_word(chunk):
chunk, trailing_whitespace = split_trailing_whitespace(chunk)
cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
tag_accum = []
result.append(cur_word)
elif is_start_tag(chunk):
tag_accum.append(chunk)
elif is_end_tag(chunk):
if tag_accum:
tag_accum.append(chunk)
else:
assert cur_word, (
"Weird state, cur_word=%r, result=%r, chunks=%r of %r"
% (cur_word, result, chunk, chunks))
cur_word.post_tags.append(chunk)
else:
assert False
if not result:
return [token('', pre_tags=tag_accum)]
else:
result[-1].post_tags.extend(tag_accum)
return result
# All the tags in HTML that don't require end tags:
empty_tags = cython.declare(frozenset, defs.empty_tags)
block_level_tags = cython.declare(frozenset, frozenset([
'address',
'blockquote',
'center',
'dir',
'div',
'dl',
'fieldset',
'form',
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'hr',
'isindex',
'menu',
'noframes',
'noscript',
'ol',
'p',
'pre',
'table',
'ul',
]))
block_level_container_tags = cython.declare(frozenset, frozenset([
'dd',
'dt',
'frameset',
'li',
'tbody',
'td',
'tfoot',
'th',
'thead',
'tr',
]))
any_block_level_tag = cython.declare(tuple, tuple(sorted(
block_level_tags | block_level_container_tags))
)
def flatten_el(el, include_hrefs, skip_tag=False):
""" Takes an lxml element el, and generates all the text chunks for
that tag. Each start tag is a chunk, each word is a chunk, and each
end tag is a chunk.
If skip_tag is true, then the outermost container tag is
not returned (just its contents)."""
if not skip_tag:
if el.tag == 'img':
yield ('img', el.get('src'), start_tag(el))
else:
yield start_tag(el)
if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
return
start_words = split_words(el.text)
for word in start_words:
yield html_escape(word)
for child in el:
yield from flatten_el(child, include_hrefs=include_hrefs)
if el.tag == 'a' and el.get('href') and include_hrefs:
yield ('href', el.get('href'))
if not skip_tag:
yield end_tag(el)
end_words = split_words(el.tail)
for word in end_words:
yield html_escape(word)
_find_words = re.compile(r'\S+(?:\s+|$)', re.U).findall
def split_words(text):
""" Splits some text into words. Includes trailing whitespace
on each word when appropriate. """
if not text or not text.strip():
return []
words = _find_words(text)
return words
_has_start_whitespace = re.compile(r'^[ \t\n\r]').match
def start_tag(el):
"""
The text representation of the start tag for a tag.
"""
attributes = ''.join([
f' {name}="{html_escape(value)}"'
for name, value in el.attrib.items()
])
return f'<{el.tag}{attributes}>'
def end_tag(el):
""" The text representation of an end tag for a tag. Includes
trailing whitespace when appropriate. """
tail = el.tail
extra = ' ' if tail and _has_start_whitespace(tail) else ''
return f'</{el.tag}>{extra}'
def is_word(tok):
return not tok.startswith('<')
def is_end_tag(tok):
return tok.startswith('</')
def is_start_tag(tok):
return tok.startswith('<') and not tok.startswith('</')
def fixup_ins_del_tags(html):
""" Given an html string, move any <ins> or <del> tags inside of any
block-level elements, e.g. transform <ins><p>word</p></ins> to
<p><ins>word</ins></p> """
doc = parse_html(html, cleanup=False)
_fixup_ins_del_tags(doc)
html = serialize_html_fragment(doc, skip_outer=True)
return html
def serialize_html_fragment(el, skip_outer=False):
""" Serialize a single lxml element as HTML. The serialized form
includes the elements tail.
If skip_outer is true, then don't serialize the outermost tag
"""
assert not isinstance(el, str), (
f"You should pass in an element, not a string like {el!r}")
html = etree.tostring(el, method="html", encoding='unicode')
if skip_outer:
# Get rid of the extra starting tag:
html = html[html.find('>')+1:]
# Get rid of the extra end tag:
html = html[:html.rfind('<')]
return html.strip()
else:
return html
@cython.cfunc
def _fixup_ins_del_tags(doc):
"""fixup_ins_del_tags that works on an lxml document in-place
"""
for el in list(doc.iter('ins', 'del')):
if not _contains_block_level_tag(el):
continue
_move_el_inside_block(el, tag=el.tag)
el.drop_tag()
#_merge_element_contents(el)
@cython.cfunc
def _contains_block_level_tag(el):
"""True if the element contains any block-level elements, like <p>, <td>, etc.
"""
for el in el.iter(*any_block_level_tag):
return True
return False
@cython.cfunc
def _move_el_inside_block(el, tag):
""" helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
and moves them inside any block-level tags. """
makeelement = el.makeelement
for block_level_el in el.iter(*any_block_level_tag):
if block_level_el is not el:
break
else:
# No block-level tags in any child
children_tag = makeelement(tag)
children_tag.text = el.text
el.text = None
children_tag.extend(iter(el))
el[:] = [children_tag]
return
for child in list(el):
if _contains_block_level_tag(child):
_move_el_inside_block(child, tag)
if child.tail:
tail_tag = makeelement(tag)
tail_tag.text = child.tail
child.tail = None
child.addnext(tail_tag)
else:
child_tag = makeelement(tag)
el.replace(child, child_tag)
child_tag.append(child)
if el.text:
text_tag = makeelement(tag)
text_tag.text = el.text
el.text = None
el.insert(0, text_tag)
def _merge_element_contents(el):
"""
Removes an element, but merges its contents into its place, e.g.,
given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
<p>Hi there!</p>
"""
parent = el.getparent()
text = el.text
tail = el.tail
if tail:
if not len(el):
text = (text or '') + tail
else:
el[-1].tail = (el[-1].tail or '') + tail
index = parent.index(el)
if text:
previous = el.getprevious()
if previous is None:
parent.text = (parent.text or '') + text
else:
previous.tail = (previous.tail or '') + text
parent[index:index+1] = el.getchildren()
@cython.final
@cython.cclass
class InsensitiveSequenceMatcher(SequenceMatcher):
"""
Acts like SequenceMatcher, but tries not to find very small equal
blocks amidst large spans of changes
"""
threshold = 2
@cython.cfunc
def get_matching_blocks(self) -> list:
        size: cython.Py_ssize_t = min(len(self.a), len(self.b))
threshold: cython.Py_ssize_t = self.threshold
threshold = min(threshold, size // 4)
actual = SequenceMatcher.get_matching_blocks(self)
return [item for item in actual
if item[2] > threshold
or not item[2]]
if __name__ == '__main__':
from lxml.html import _diffcommand
_diffcommand.main()


@@ -0,0 +1,299 @@
from lxml.etree import XPath, ElementBase
from lxml.html import fromstring, XHTML_NAMESPACE
from lxml.html import _forms_xpath, _options_xpath, _nons, _transform_result
from lxml.html import defs
import copy
try:
basestring
except NameError:
# Python 3
basestring = str
__all__ = ['FormNotFound', 'fill_form', 'fill_form_html',
'insert_errors', 'insert_errors_html',
'DefaultErrorCreator']
class FormNotFound(LookupError):
"""
Raised when no form can be found
"""
_form_name_xpath = XPath('descendant-or-self::form[@name=$name]|descendant-or-self::x:form[@name=$name]', namespaces={'x':XHTML_NAMESPACE})
_input_xpath = XPath('|'.join(['descendant-or-self::'+_tag for _tag in ('input','select','textarea','x:input','x:select','x:textarea')]),
namespaces={'x':XHTML_NAMESPACE})
_label_for_xpath = XPath('//label[@for=$for_id]|//x:label[@for=$for_id]',
namespaces={'x':XHTML_NAMESPACE})
_name_xpath = XPath('descendant-or-self::*[@name=$name]')
def fill_form(
el,
values,
form_id=None,
form_index=None,
):
el = _find_form(el, form_id=form_id, form_index=form_index)
_fill_form(el, values)
def fill_form_html(html, values, form_id=None, form_index=None):
result_type = type(html)
if isinstance(html, basestring):
doc = fromstring(html)
else:
doc = copy.deepcopy(html)
fill_form(doc, values, form_id=form_id, form_index=form_index)
return _transform_result(result_type, doc)
def _fill_form(el, values):
counts = {}
if hasattr(values, 'mixed'):
# For Paste request parameters
values = values.mixed()
inputs = _input_xpath(el)
for input in inputs:
name = input.get('name')
if not name:
continue
if _takes_multiple(input):
value = values.get(name, [])
if not isinstance(value, (list, tuple)):
value = [value]
_fill_multiple(input, value)
elif name not in values:
continue
else:
index = counts.get(name, 0)
counts[name] = index + 1
value = values[name]
if isinstance(value, (list, tuple)):
try:
value = value[index]
except IndexError:
continue
elif index > 0:
continue
_fill_single(input, value)
def _takes_multiple(input):
if _nons(input.tag) == 'select' and input.get('multiple'):
# FIXME: multiple="0"?
return True
type = input.get('type', '').lower()
if type in ('radio', 'checkbox'):
return True
return False
def _fill_multiple(input, value):
type = input.get('type', '').lower()
if type == 'checkbox':
v = input.get('value')
if v is None:
if not value:
result = False
else:
result = value[0]
if isinstance(value, basestring):
# The only valid "on" value for an unnamed checkbox is 'on'
result = result == 'on'
_check(input, result)
else:
_check(input, v in value)
elif type == 'radio':
v = input.get('value')
_check(input, v in value)
else:
assert _nons(input.tag) == 'select'
for option in _options_xpath(input):
v = option.get('value')
if v is None:
# This seems to be the default, at least on IE
# FIXME: but I'm not sure
v = option.text_content()
_select(option, v in value)
def _check(el, check):
if check:
el.set('checked', '')
else:
if 'checked' in el.attrib:
del el.attrib['checked']
def _select(el, select):
if select:
el.set('selected', '')
else:
if 'selected' in el.attrib:
del el.attrib['selected']
def _fill_single(input, value):
if _nons(input.tag) == 'textarea':
input.text = value
else:
input.set('value', value)
def _find_form(el, form_id=None, form_index=None):
if form_id is None and form_index is None:
forms = _forms_xpath(el)
for form in forms:
return form
raise FormNotFound(
"No forms in page")
if form_id is not None:
form = el.get_element_by_id(form_id)
if form is not None:
return form
forms = _form_name_xpath(el, name=form_id)
if forms:
return forms[0]
else:
raise FormNotFound(
"No form with the name or id of %r (forms: %s)"
                % (form_id, ', '.join(_find_form_ids(el))))
if form_index is not None:
forms = _forms_xpath(el)
try:
return forms[form_index]
except IndexError:
raise FormNotFound(
"There is no form with the index %r (%i forms found)"
% (form_index, len(forms)))
def _find_form_ids(el):
forms = _forms_xpath(el)
if not forms:
yield '(no forms)'
return
for index, form in enumerate(forms):
if form.get('id'):
if form.get('name'):
yield '%s or %s' % (form.get('id'),
form.get('name'))
else:
yield form.get('id')
elif form.get('name'):
yield form.get('name')
else:
yield '(unnamed form %s)' % index
############################################################
## Error filling
############################################################
class DefaultErrorCreator:
insert_before = True
block_inside = True
error_container_tag = 'div'
error_message_class = 'error-message'
error_block_class = 'error-block'
default_message = "Invalid"
def __init__(self, **kw):
for name, value in kw.items():
if not hasattr(self, name):
raise TypeError(
"Unexpected keyword argument: %s" % name)
setattr(self, name, value)
def __call__(self, el, is_block, message):
error_el = el.makeelement(self.error_container_tag)
if self.error_message_class:
error_el.set('class', self.error_message_class)
if is_block and self.error_block_class:
error_el.set('class', error_el.get('class', '')+' '+self.error_block_class)
if message is None or message == '':
message = self.default_message
if isinstance(message, ElementBase):
error_el.append(message)
else:
assert isinstance(message, basestring), (
"Bad message; should be a string or element: %r" % message)
error_el.text = message or self.default_message
if is_block and self.block_inside:
if self.insert_before:
error_el.tail = el.text
el.text = None
el.insert(0, error_el)
else:
el.append(error_el)
else:
parent = el.getparent()
pos = parent.index(el)
if self.insert_before:
parent.insert(pos, error_el)
else:
error_el.tail = el.tail
el.tail = None
parent.insert(pos+1, error_el)
default_error_creator = DefaultErrorCreator()
def insert_errors(
el,
errors,
form_id=None,
form_index=None,
error_class="error",
error_creator=default_error_creator,
):
el = _find_form(el, form_id=form_id, form_index=form_index)
for name, error in errors.items():
if error is None:
continue
for error_el, message in _find_elements_for_name(el, name, error):
assert isinstance(message, (basestring, type(None), ElementBase)), (
"Bad message: %r" % message)
_insert_error(error_el, message, error_class, error_creator)
def insert_errors_html(html, values, **kw):
result_type = type(html)
if isinstance(html, basestring):
doc = fromstring(html)
else:
doc = copy.deepcopy(html)
insert_errors(doc, values, **kw)
return _transform_result(result_type, doc)
def _insert_error(el, error, error_class, error_creator):
if _nons(el.tag) in defs.empty_tags or _nons(el.tag) == 'textarea':
is_block = False
else:
is_block = True
if _nons(el.tag) != 'form' and error_class:
_add_class(el, error_class)
if el.get('id'):
labels = _label_for_xpath(el, for_id=el.get('id'))
if labels:
for label in labels:
_add_class(label, error_class)
error_creator(el, is_block, error)
def _add_class(el, class_name):
if el.get('class'):
el.set('class', el.get('class')+' '+class_name)
else:
el.set('class', class_name)
def _find_elements_for_name(form, name, error):
if name is None:
# An error for the entire form
yield form, error
return
if name.startswith('#'):
# By id
el = form.get_element_by_id(name[1:])
if el is not None:
yield el, error
return
els = _name_xpath(form, name=name)
if not els:
# FIXME: should this raise an exception?
return
if not isinstance(error, (list, tuple)):
yield els[0], error
return
# FIXME: if error is longer than els, should it raise an error?
for el, err in zip(els, error):
if err is None:
continue
yield el, err
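
A short sketch of the form-filling round trip (markup and values hypothetical):

    html = ('<form>'
            '<input type="text" name="email">'
            '<input type="checkbox" name="subscribe" value="yes">'
            '</form>')
    filled = fill_form_html(html, {'email': 'user@example.com',
                                   'subscribe': ['yes']})
    # 'filled' is again a string: the text input gains
    # value="user@example.com" and the checkbox gains checked="".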


@@ -0,0 +1,260 @@
"""
An interface to html5lib that mimics the lxml.html interface.
"""
import sys
import string
from html5lib import HTMLParser as _HTMLParser
from html5lib.treebuilders.etree_lxml import TreeBuilder
from lxml import etree
from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
# python3 compatibility
try:
_strings = basestring
except NameError:
_strings = (bytes, str)
try:
from urllib2 import urlopen
except ImportError:
from urllib.request import urlopen
try:
from urlparse import urlparse
except ImportError:
from urllib.parse import urlparse
class HTMLParser(_HTMLParser):
"""An html5lib HTML parser with lxml as tree."""
def __init__(self, strict=False, **kwargs):
_HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
try:
from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
pass
else:
class XHTMLParser(_XHTMLParser):
"""An html5lib XHTML Parser with lxml as tree."""
def __init__(self, strict=False, **kwargs):
_XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
xhtml_parser = XHTMLParser()
def _find_tag(tree, tag):
elem = tree.find(tag)
if elem is not None:
return elem
return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
def document_fromstring(html, guess_charset=None, parser=None):
"""
    Parse a whole HTML document from a string, returning the root element.
If `guess_charset` is true, or if the input is not Unicode but a
byte string, the `chardet` library will perform charset guessing
on the string.
"""
if not isinstance(html, _strings):
raise TypeError('string required')
if parser is None:
parser = html_parser
options = {}
if guess_charset is None and isinstance(html, bytes):
# html5lib does not accept useChardet as an argument, if it
# detected the html argument would produce unicode objects.
guess_charset = True
if guess_charset is not None:
options['useChardet'] = guess_charset
return parser.parse(html, **options).getroot()
def fragments_fromstring(html, no_leading_text=False,
guess_charset=None, parser=None):
"""Parses several HTML elements, returning a list of elements.
The first item in the list may be a string. If no_leading_text is true,
then it will be an error if there is leading text, and it will always be
a list of only elements.
If `guess_charset` is true, the `chardet` library will perform charset
guessing on the string.
"""
if not isinstance(html, _strings):
raise TypeError('string required')
if parser is None:
parser = html_parser
options = {}
if guess_charset is None and isinstance(html, bytes):
# html5lib does not accept useChardet as an argument, if it
# detected the html argument would produce unicode objects.
guess_charset = False
if guess_charset is not None:
options['useChardet'] = guess_charset
children = parser.parseFragment(html, 'div', **options)
if children and isinstance(children[0], _strings):
if no_leading_text:
if children[0].strip():
raise etree.ParserError('There is leading text: %r' %
children[0])
del children[0]
return children
def fragment_fromstring(html, create_parent=False,
guess_charset=None, parser=None):
"""Parses a single HTML element; it is an error if there is more than
one element, or if anything but whitespace precedes or follows the
element.
If 'create_parent' is true (or is a tag name) then a parent node
will be created to encapsulate the HTML in a single element. In
this case, leading or trailing text is allowed.
If `guess_charset` is true, the `chardet` library will perform charset
guessing on the string.
"""
if not isinstance(html, _strings):
raise TypeError('string required')
accept_leading_text = bool(create_parent)
elements = fragments_fromstring(
html, guess_charset=guess_charset, parser=parser,
no_leading_text=not accept_leading_text)
if create_parent:
if not isinstance(create_parent, _strings):
create_parent = 'div'
new_root = Element(create_parent)
if elements:
if isinstance(elements[0], _strings):
new_root.text = elements[0]
del elements[0]
new_root.extend(elements)
return new_root
if not elements:
raise etree.ParserError('No elements found')
if len(elements) > 1:
raise etree.ParserError('Multiple elements found')
result = elements[0]
if result.tail and result.tail.strip():
raise etree.ParserError('Element followed by text: %r' % result.tail)
result.tail = None
return result
def fromstring(html, guess_charset=None, parser=None):
"""Parse the html, returning a single element/document.
This tries to minimally parse the chunk of text, without knowing if it
is a fragment or a document.
If `guess_charset` is true, or if the input is not Unicode but a
byte string, the `chardet` library will perform charset guessing
on the string.
"""
if not isinstance(html, _strings):
raise TypeError('string required')
doc = document_fromstring(html, parser=parser,
guess_charset=guess_charset)
# document starts with doctype or <html>, full document!
start = html[:50]
if isinstance(start, bytes):
# Allow text comparison in python3.
# Decode as ascii, that also covers latin-1 and utf-8 for the
# characters we need.
start = start.decode('ascii', 'replace')
start = start.lstrip().lower()
if start.startswith('<html') or start.startswith('<!doctype'):
return doc
head = _find_tag(doc, 'head')
# if the head is not empty we have a full document
if len(head):
return doc
body = _find_tag(doc, 'body')
# The body has just one element, so it was probably a single
# element passed in
if (len(body) == 1 and (not body.text or not body.text.strip())
and (not body[-1].tail or not body[-1].tail.strip())):
return body[0]
# Now we have a body which represents a bunch of tags which have the
# content that was passed in. We will create a fake container, which
# is the body tag, except <body> implies too much structure.
if _contains_block_level_tag(body):
body.tag = 'div'
else:
body.tag = 'span'
return body


def parse(filename_url_or_file, guess_charset=None, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).
    """
    if parser is None:
        parser = html_parser
    if not isinstance(filename_url_or_file, _strings):
        fp = filename_url_or_file
        if guess_charset is None:
            # assume that file-like objects return Unicode more often than bytes
            guess_charset = False
    elif _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
        if guess_charset is None:
            # assume that URLs return bytes
            guess_charset = True
    else:
        fp = open(filename_url_or_file, 'rb')
        if guess_charset is None:
            guess_charset = True

    options = {}
    # html5lib does not accept useChardet as an argument if it detects
    # that the input would produce unicode objects; only pass it when
    # charset guessing is actually wanted.
    if guess_charset:
        options['useChardet'] = guess_charset
    return parser.parse(fp, **options)
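
# Typical use (editor's sketch; 'page.html' is a hypothetical local file):
#
#   >>> tree = parse('page.html')
#   >>> root = tree.getroot()      # the <html> element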


def _looks_like_url(str):
    scheme = urlparse(str)[0]
    if not scheme:
        return False
    elif (sys.platform == 'win32' and
          scheme in string.ascii_letters
          and len(scheme) == 1):
        # looks like a 'normal' absolute path
        return False
    else:
        return True
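
# Why the one-letter special case: on Windows, a path like 'c:\\page.html'
# parses with scheme 'c', which would otherwise be mistaken for a URL
# (editor's note).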


html_parser = HTMLParser()

View File

@@ -0,0 +1,314 @@
"""External interface to the BeautifulSoup HTML parser.
"""
__all__ = ["fromstring", "parse", "convert_tree"]
import re
from lxml import etree, html
try:
from bs4 import (
BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
Declaration, Doctype)
_DECLARATION_OR_DOCTYPE = (Declaration, Doctype)
except ImportError:
from BeautifulSoup import (
BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
Declaration)
_DECLARATION_OR_DOCTYPE = Declaration


def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a string of HTML data into an Element tree using the
    BeautifulSoup parser.

    Returns the root ``<html>`` Element of the tree.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.
    """
    return _parse(data, beautifulsoup, makeelement, **bsargs)
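
# Example (editor's sketch, assuming BeautifulSoup 4 is installed):
#
#   >>> root = fromstring('<p>Hello</p>')
#   >>> root.tag
#   'html'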


def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a file into an ElementTree using the BeautifulSoup parser.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.
    """
    if not hasattr(file, 'read'):
        file = open(file)
    root = _parse(file, beautifulsoup, makeelement, **bsargs)
    return etree.ElementTree(root)


def convert_tree(beautiful_soup_tree, makeelement=None):
    """Convert a BeautifulSoup tree to a list of Element trees.

    Returns a list instead of a single root Element to support
    HTML-like soup with more than one root element.

    You can pass a different Element factory through the `makeelement`
    keyword.
    """
    root = _convert_tree(beautiful_soup_tree, makeelement)
    children = root.getchildren()
    for child in children:
        root.remove(child)
    return children
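
# Example (editor's sketch, bs4 assumed):
#
#   >>> from bs4 import BeautifulSoup
#   >>> soup = BeautifulSoup('<meta><title>Hi</title>', 'html.parser')
#   >>> [el.tag for el in convert_tree(soup)]
#   ['meta', 'title']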


# helpers

def _parse(source, beautifulsoup, makeelement, **bsargs):
    if beautifulsoup is None:
        beautifulsoup = BeautifulSoup
    if hasattr(beautifulsoup, "HTML_ENTITIES"):  # bs3
        if 'convertEntities' not in bsargs:
            bsargs['convertEntities'] = 'html'
    if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"):  # bs4
        if 'features' not in bsargs:
            bsargs['features'] = 'html.parser'  # use Python html parser
    tree = beautifulsoup(source, **bsargs)
    root = _convert_tree(tree, makeelement)
    # from ET: wrap the document in a html root element, if necessary
    if len(root) == 1 and root[0].tag == "html":
        return root[0]
    root.tag = "html"
    return root


_parse_doctype_declaration = re.compile(
    r'(?:\s|[<!])*DOCTYPE\s*HTML'
    r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?'
    r'(?:\s+(\'[^\']*\'|"[^"]*"))?',
    re.IGNORECASE).match
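
# What the two groups capture, sketched on a common declaration
# (editor's note):
#
#   <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
#                         "http://www.w3.org/TR/html4/strict.dtd">
#
#   group(1) -> '"-//W3C//DTD HTML 4.01//EN"'               (still quoted)
#   group(2) -> '"http://www.w3.org/TR/html4/strict.dtd"'   (still quoted)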


class _PseudoTag:
    # Minimal imitation of BeautifulSoup.Tag
    def __init__(self, contents):
        self.name = 'html'
        self.attrs = []
        self.contents = contents

    def __iter__(self):
        return self.contents.__iter__()


def _convert_tree(beautiful_soup_tree, makeelement):
    if makeelement is None:
        makeelement = html.html_parser.makeelement

    # Split the tree into three parts:
    #  i) everything before the root element: document type
    #     declaration, comments, processing instructions, whitespace
    # ii) the root(s),
    # iii) everything after the root: comments, processing
    #      instructions, whitespace
    first_element_idx = last_element_idx = None
    html_root = declaration = None
    for i, e in enumerate(beautiful_soup_tree):
        if isinstance(e, Tag):
            if first_element_idx is None:
                first_element_idx = i
            last_element_idx = i
            if html_root is None and e.name and e.name.lower() == 'html':
                html_root = e
        elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE):
            declaration = e

    # For a nice, well-formatted document, the variable roots below is
    # a list consisting of a single <html> element. However, the document
    # may be a soup like '<meta><head><title>Hello</head><body>Hi
    # all<\p>'. In this example roots is a list containing meta, head
    # and body elements.
    if first_element_idx is None:
        pre_root = post_root = []
        roots = beautiful_soup_tree.contents
    else:
        pre_root = beautiful_soup_tree.contents[:first_element_idx]
        roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1]
        post_root = beautiful_soup_tree.contents[last_element_idx+1:]

    # Reorganize so that there is one <html> root...
    if html_root is not None:
        # ... use existing one if possible, ...
        i = roots.index(html_root)
        html_root.contents = roots[:i] + html_root.contents + roots[i+1:]
    else:
        # ... otherwise create a new one.
        html_root = _PseudoTag(roots)

    convert_node = _init_node_converters(makeelement)

    # Process pre_root
    res_root = convert_node(html_root)
    prev = res_root
    for e in reversed(pre_root):
        converted = convert_node(e)
        if converted is not None:
            prev.addprevious(converted)
            prev = converted

    # ditto for post_root
    prev = res_root
    for e in post_root:
        converted = convert_node(e)
        if converted is not None:
            prev.addnext(converted)
            prev = converted

    if declaration is not None:
        try:
            # bs4 provides full Doctype string
            doctype_string = declaration.output_ready()
        except AttributeError:
            doctype_string = declaration.string

        match = _parse_doctype_declaration(doctype_string)
        if not match:
            # Something is wrong if we end up in here. Since soupparser should
            # tolerate errors, do not raise Exception, just let it pass.
            pass
        else:
            external_id, sys_uri = match.groups()
            docinfo = res_root.getroottree().docinfo
            # strip quotes and update DOCTYPE values (any of None, '', '...')
            docinfo.public_id = external_id and external_id[1:-1]
            docinfo.system_url = sys_uri and sys_uri[1:-1]

    return res_root


def _init_node_converters(makeelement):
    converters = {}
    ordered_node_types = []

    def converter(*types):
        def add(handler):
            for t in types:
                converters[t] = handler
                ordered_node_types.append(t)
            return handler
        return add

    def find_best_converter(node):
        for t in ordered_node_types:
            if isinstance(node, t):
                return converters[t]
        return None

    def convert_node(bs_node, parent=None):
        # duplicated in convert_tag() below
        try:
            handler = converters[type(bs_node)]
        except KeyError:
            handler = converters[type(bs_node)] = find_best_converter(bs_node)
        if handler is None:
            return None
        return handler(bs_node, parent)

    def map_attrs(bs_attrs):
        if isinstance(bs_attrs, dict):  # bs4
            attribs = {}
            for k, v in bs_attrs.items():
                if isinstance(v, list):
                    v = " ".join(v)
                attribs[k] = unescape(v)
        else:
            attribs = {k: unescape(v) for k, v in bs_attrs}
        return attribs

    def append_text(parent, text):
        if len(parent) == 0:
            parent.text = (parent.text or '') + text
        else:
            parent[-1].tail = (parent[-1].tail or '') + text

    # converters are tried in order of their definition

    @converter(Tag, _PseudoTag)
    def convert_tag(bs_node, parent):
        attrs = bs_node.attrs
        if parent is not None:
            attribs = map_attrs(attrs) if attrs else None
            res = etree.SubElement(parent, bs_node.name, attrib=attribs)
        else:
            attribs = map_attrs(attrs) if attrs else {}
            res = makeelement(bs_node.name, attrib=attribs)

        for child in bs_node:
            # avoid double recursion by inlining convert_node(), see above
            try:
                handler = converters[type(child)]
            except KeyError:
                pass
            else:
                if handler is not None:
                    handler(child, res)
                continue
            convert_node(child, res)

        return res

    @converter(Comment)
    def convert_comment(bs_node, parent):
        res = html.HtmlComment(bs_node)
        if parent is not None:
            parent.append(res)
        return res

    @converter(ProcessingInstruction)
    def convert_pi(bs_node, parent):
        if bs_node.endswith('?'):
            # The PI is of XML style (<?as df?>) but BeautifulSoup
            # interpreted it as being SGML style (<?as df>). Fix.
            bs_node = bs_node[:-1]
        res = etree.ProcessingInstruction(*bs_node.split(' ', 1))
        if parent is not None:
            parent.append(res)
        return res

    @converter(NavigableString)
    def convert_text(bs_node, parent):
        if parent is not None:
            append_text(parent, unescape(bs_node))
        return None

    return convert_node


# copied from ET's ElementSoup

try:
    from html.entities import name2codepoint  # Python 3
except ImportError:
    from htmlentitydefs import name2codepoint

handle_entities = re.compile(r"&(\w+);").sub

try:
    unichr
except NameError:
    # Python 3
    unichr = chr


def unescape(string):
    if not string:
        return ''
    # work around oddities in BeautifulSoup's entity handling
    def unescape_entity(m):
        try:
            return unichr(name2codepoint[m.group(1)])
        except KeyError:
            return m.group(0)  # use as is
    return handle_entities(unescape_entity, string)
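
# e.g. (editor's sketch):  unescape('a &lt; b &amp; c')  ->  'a < b & c'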

View File

@@ -0,0 +1,13 @@
"""Doctest module for HTML comparison.
Usage::
>>> import lxml.html.usedoctest
>>> # now do your HTML doctests ...
See `lxml.doctestcompare`.
"""
from lxml import doctestcompare
doctestcompare.temp_install(html=True, del_module=__name__)