added completely new version for haslach 2025
10
.venv/lib/python3.7/site-packages/lxml/html/ElementSoup.py
Normal file
@@ -0,0 +1,10 @@
__doc__ = """Legacy interface to the BeautifulSoup HTML parser.
"""

__all__ = ["parse", "convert_tree"]

from .soupparser import convert_tree, parse as _parse

def parse(file, beautifulsoup=None, makeelement=None):
    root = _parse(file, beautifulsoup=beautifulsoup, makeelement=makeelement)
    return root.getroot()
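A minimal usage sketch for this legacy interface (assumes BeautifulSoup is installed; the file name is illustrative):

from lxml.html.ElementSoup import parse

# Feed a file object through the BeautifulSoup-backed parser and get
# back the root lxml element rather than an ElementTree.
with open('page.html') as f:
    root = parse(f)
print(root.tag)  # usually 'html'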
1946
.venv/lib/python3.7/site-packages/lxml/html/__init__.py
Normal file
File diff suppressed because it is too large
Binary file not shown.
88
.venv/lib/python3.7/site-packages/lxml/html/_diffcommand.py
Normal file
@@ -0,0 +1,88 @@
from __future__ import absolute_import

import optparse
import sys
import re
import os
from .diff import htmldiff

description = """\
"""

parser = optparse.OptionParser(
    usage="%prog [OPTIONS] FILE1 FILE2\n"
    "%prog --annotate [OPTIONS] INFO1 FILE1 INFO2 FILE2 ...",
    description=description,
    )

parser.add_option(
    '-o', '--output',
    metavar="FILE",
    dest="output",
    default="-",
    help="File to write the difference to",
    )

parser.add_option(
    '-a', '--annotation',
    action="store_true",
    dest="annotation",
    help="Do an annotation")

def main(args=None):
    if args is None:
        args = sys.argv[1:]
    options, args = parser.parse_args(args)
    if options.annotation:
        return annotate(options, args)
    if len(args) != 2:
        print('Error: you must give two files')
        parser.print_help()
        sys.exit(1)
    file1, file2 = args
    input1 = read_file(file1)
    input2 = read_file(file2)
    body1 = split_body(input1)[1]
    pre, body2, post = split_body(input2)
    result = htmldiff(body1, body2)
    result = pre + result + post
    if options.output == '-':
        if not result.endswith('\n'):
            result += '\n'
        sys.stdout.write(result)
    else:
        with open(options.output, 'wb') as f:
            f.write(result)

def read_file(filename):
    if filename == '-':
        c = sys.stdin.read()
    elif not os.path.exists(filename):
        raise OSError(
            "Input file %s does not exist" % filename)
    else:
        with open(filename, 'rb') as f:
            c = f.read()
    return c

body_start_re = re.compile(
    r"<body.*?>", re.I|re.S)
body_end_re = re.compile(
    r"</body.*?>", re.I|re.S)

def split_body(html):
    pre = post = ''
    match = body_start_re.search(html)
    if match:
        pre = html[:match.end()]
        html = html[match.end():]
    match = body_end_re.search(html)
    if match:
        post = html[match.start():]
        html = html[:match.start()]
    return pre, html, post

def annotate(options, args):
    print("Not yet implemented")
    sys.exit(1)
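The module is driven through main(); a minimal sketch (file names are illustrative):

from lxml.html._diffcommand import main

# Diffs the <body> content of the two files and writes the combined
# result to stdout ('-' is the default for --output).
main(['old_page.html', 'new_page.html'])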
100
.venv/lib/python3.7/site-packages/lxml/html/_html5builder.py
Normal file
@@ -0,0 +1,100 @@
"""
Legacy module - don't use in new code!

html5lib now has its own proper implementation.

This module implements a tree builder for html5lib that generates lxml
html element trees.  This module uses camelCase as it follows the
html5lib style guide.
"""

from html5lib.treebuilders import _base, etree as etree_builders
from lxml import html, etree


class DocumentType(object):

    def __init__(self, name, publicId, systemId):
        self.name = name
        self.publicId = publicId
        self.systemId = systemId


class Document(object):

    def __init__(self):
        self._elementTree = None
        self.childNodes = []

    def appendChild(self, element):
        self._elementTree.getroot().addnext(element._element)


class TreeBuilder(_base.TreeBuilder):
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = None
    commentClass = None
    fragmentClass = Document

    def __init__(self, *args, **kwargs):
        html_builder = etree_builders.getETreeModule(html, fullTree=False)
        etree_builder = etree_builders.getETreeModule(etree, fullTree=False)
        self.elementClass = html_builder.Element
        self.commentClass = etree_builder.Comment
        _base.TreeBuilder.__init__(self, *args, **kwargs)

    def reset(self):
        _base.TreeBuilder.reset(self)
        self.rootInserted = False
        self.initialComments = []
        self.doctype = None

    def getDocument(self):
        return self.document._elementTree

    def getFragment(self):
        fragment = []
        element = self.openElements[0]._element
        if element.text:
            fragment.append(element.text)
        fragment.extend(element.getchildren())
        if element.tail:
            fragment.append(element.tail)
        return fragment

    def insertDoctype(self, name, publicId, systemId):
        doctype = self.doctypeClass(name, publicId, systemId)
        self.doctype = doctype

    def insertComment(self, data, parent=None):
        if not self.rootInserted:
            self.initialComments.append(data)
        else:
            _base.TreeBuilder.insertComment(self, data, parent)

    def insertRoot(self, name):
        buf = []
        if self.doctype and self.doctype.name:
            buf.append('<!DOCTYPE %s' % self.doctype.name)
            if self.doctype.publicId is not None or self.doctype.systemId is not None:
                buf.append(' PUBLIC "%s" "%s"' % (self.doctype.publicId,
                                                  self.doctype.systemId))
            buf.append('>')
        buf.append('<html></html>')
        root = html.fromstring(''.join(buf))

        # Append the initial comments:
        for comment in self.initialComments:
            root.addprevious(etree.Comment(comment))

        # Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()

        # Add the root element to the internal child/open data structures
        root_element = self.elementClass(name)
        root_element._element = root
        self.document.childNodes.append(root_element)
        self.openElements.append(root_element)

        self.rootInserted = True
56
.venv/lib/python3.7/site-packages/lxml/html/_setmixin.py
Normal file
@@ -0,0 +1,56 @@
try:
    from collections.abc import MutableSet
except ImportError:
    from collections import MutableSet


class SetMixin(MutableSet):

    """
    Mix-in for sets.  You must define __iter__, add, remove
    """

    def __len__(self):
        length = 0
        for item in self:
            length += 1
        return length

    def __contains__(self, item):
        for has_item in self:
            if item == has_item:
                return True
        return False

    issubset = MutableSet.__le__
    issuperset = MutableSet.__ge__

    union = MutableSet.__or__
    intersection = MutableSet.__and__
    difference = MutableSet.__sub__
    symmetric_difference = MutableSet.__xor__

    def copy(self):
        return set(self)

    def update(self, other):
        self |= other

    def intersection_update(self, other):
        self &= other

    def difference_update(self, other):
        self -= other

    def symmetric_difference_update(self, other):
        self ^= other

    def discard(self, item):
        try:
            self.remove(item)
        except KeyError:
            pass

    @classmethod
    def _from_iterable(cls, it):
        return set(it)
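A minimal sketch of a SetMixin subclass: per the docstring, only __iter__, add and remove are required; the list-backed storage and class name here are illustrative, not part of lxml:

from lxml.html._setmixin import SetMixin

class ListSet(SetMixin):
    def __init__(self, items=()):
        self._items = []
        for item in items:
            self.add(item)

    def __iter__(self):
        return iter(self._items)

    def add(self, item):
        if item not in self._items:
            self._items.append(item)

    def remove(self, item):
        try:
            self._items.remove(item)
        except ValueError:
            # discard() in the mixin expects KeyError, like set.remove()
            raise KeyError(item)

s = ListSet(['a', 'b'])
s.update(['b', 'c'])       # provided by the mixin via |=
assert sorted(s) == ['a', 'b', 'c']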
133
.venv/lib/python3.7/site-packages/lxml/html/builder.py
Normal file
@@ -0,0 +1,133 @@
# --------------------------------------------------------------------
# The ElementTree toolkit is
# Copyright (c) 1999-2004 by Fredrik Lundh
# --------------------------------------------------------------------

"""
A set of HTML generator tags for building HTML documents.

Usage::

    >>> from lxml.html.builder import *
    >>> html = HTML(
    ...            HEAD( TITLE("Hello World") ),
    ...            BODY( CLASS("main"),
    ...                  H1("Hello World !")
    ...            )
    ...        )

    >>> import lxml.etree
    >>> print lxml.etree.tostring(html, pretty_print=True)
    <html>
      <head>
        <title>Hello World</title>
      </head>
      <body class="main">
        <h1>Hello World !</h1>
      </body>
    </html>

"""

from lxml.builder import ElementMaker
from lxml.html import html_parser

E = ElementMaker(makeelement=html_parser.makeelement)

# elements
A = E.a  #: anchor
ABBR = E.abbr  #: abbreviated form (e.g., WWW, HTTP, etc.)
ACRONYM = E.acronym  #:
ADDRESS = E.address  #: information on author
APPLET = E.applet  #: Java applet (DEPRECATED)
AREA = E.area  #: client-side image map area
B = E.b  #: bold text style
BASE = E.base  #: document base URI
BASEFONT = E.basefont  #: base font size (DEPRECATED)
BDO = E.bdo  #: I18N BiDi over-ride
BIG = E.big  #: large text style
BLOCKQUOTE = E.blockquote  #: long quotation
BODY = E.body  #: document body
BR = E.br  #: forced line break
BUTTON = E.button  #: push button
CAPTION = E.caption  #: table caption
CENTER = E.center  #: shorthand for DIV align=center (DEPRECATED)
CITE = E.cite  #: citation
CODE = E.code  #: computer code fragment
COL = E.col  #: table column
COLGROUP = E.colgroup  #: table column group
DD = E.dd  #: definition description
DEL = getattr(E, 'del')  #: deleted text
DFN = E.dfn  #: instance definition
DIR = E.dir  #: directory list (DEPRECATED)
DIV = E.div  #: generic language/style container
DL = E.dl  #: definition list
DT = E.dt  #: definition term
EM = E.em  #: emphasis
FIELDSET = E.fieldset  #: form control group
FONT = E.font  #: local change to font (DEPRECATED)
FORM = E.form  #: interactive form
FRAME = E.frame  #: subwindow
FRAMESET = E.frameset  #: window subdivision
H1 = E.h1  #: heading
H2 = E.h2  #: heading
H3 = E.h3  #: heading
H4 = E.h4  #: heading
H5 = E.h5  #: heading
H6 = E.h6  #: heading
HEAD = E.head  #: document head
HR = E.hr  #: horizontal rule
HTML = E.html  #: document root element
I = E.i  #: italic text style
IFRAME = E.iframe  #: inline subwindow
IMG = E.img  #: Embedded image
INPUT = E.input  #: form control
INS = E.ins  #: inserted text
ISINDEX = E.isindex  #: single line prompt (DEPRECATED)
KBD = E.kbd  #: text to be entered by the user
LABEL = E.label  #: form field label text
LEGEND = E.legend  #: fieldset legend
LI = E.li  #: list item
LINK = E.link  #: a media-independent link
MAP = E.map  #: client-side image map
MENU = E.menu  #: menu list (DEPRECATED)
META = E.meta  #: generic metainformation
NOFRAMES = E.noframes  #: alternate content container for non frame-based rendering
NOSCRIPT = E.noscript  #: alternate content container for non script-based rendering
OBJECT = E.object  #: generic embedded object
OL = E.ol  #: ordered list
OPTGROUP = E.optgroup  #: option group
OPTION = E.option  #: selectable choice
P = E.p  #: paragraph
PARAM = E.param  #: named property value
PRE = E.pre  #: preformatted text
Q = E.q  #: short inline quotation
S = E.s  #: strike-through text style (DEPRECATED)
SAMP = E.samp  #: sample program output, scripts, etc.
SCRIPT = E.script  #: script statements
SELECT = E.select  #: option selector
SMALL = E.small  #: small text style
SPAN = E.span  #: generic language/style container
STRIKE = E.strike  #: strike-through text (DEPRECATED)
STRONG = E.strong  #: strong emphasis
STYLE = E.style  #: style info
SUB = E.sub  #: subscript
SUP = E.sup  #: superscript
TABLE = E.table  #:
TBODY = E.tbody  #: table body
TD = E.td  #: table data cell
TEXTAREA = E.textarea  #: multi-line text field
TFOOT = E.tfoot  #: table footer
TH = E.th  #: table header cell
THEAD = E.thead  #: table header
TITLE = E.title  #: document title
TR = E.tr  #: table row
TT = E.tt  #: teletype or monospaced text style
U = E.u  #: underlined text style (DEPRECATED)
UL = E.ul  #: unordered list
VAR = E.var  #: instance of a variable or program argument

# attributes (only reserved words are included here)
ATTR = dict
def CLASS(v): return {'class': v}
def FOR(v): return {'for': v}
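The docstring example above uses the Python 2 print statement; under the Python 3.7 environment this package is installed in, the equivalent is:

from lxml.html.builder import HTML, HEAD, TITLE, BODY, CLASS, H1
import lxml.etree

page = HTML(
    HEAD(TITLE("Hello World")),
    BODY(CLASS("main"), H1("Hello World !")),
)
# tostring() returns bytes in Python 3; decode before printing.
print(lxml.etree.tostring(page, pretty_print=True).decode('ascii'))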
Binary file not shown.
785
.venv/lib/python3.7/site-packages/lxml/html/clean.py
Normal file
@@ -0,0 +1,785 @@
# cython: language_level=3str

"""A cleanup tool for HTML.

Removes unwanted tags and content.  See the `Cleaner` class for
details.
"""

from __future__ import absolute_import

import copy
import re
import sys
try:
    from urlparse import urlsplit
    from urllib import unquote_plus
except ImportError:
    # Python 3
    from urllib.parse import urlsplit, unquote_plus
from lxml import etree
from lxml.html import defs
from lxml.html import fromstring, XHTML_NAMESPACE
from lxml.html import xhtml_to_html, _transform_result

try:
    unichr
except NameError:
    # Python 3
    unichr = chr
try:
    unicode
except NameError:
    # Python 3
    unicode = str
try:
    basestring
except NameError:
    basestring = (str, bytes)


__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
           'word_break', 'word_break_html']

# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
# Particularly the CSS cleaning; most of the tag cleaning is integrated now
# I have multiple kinds of schemes searched; but should schemes be
# whitelisted instead?
# max height?
# remove images?  Also in CSS?  background attribute?
# Some way to whitelist object, iframe, etc (e.g., if you want to
# allow *just* embedded YouTube movies)
# Log what was deleted and why?
# style="behavior: ..." might be bad in IE?
# Should we have something for just <meta http-equiv>?  That's the worst of the
# metas.
# UTF-7 detections?  Example:
#     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
#     you don't always have to have the charset set, if the page has no charset
#     and there's UTF7-like code in it.
# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php


# This is an IE-specific construct you can have in a stylesheet to
# run some Javascript:
_replace_css_javascript = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I).sub

# Do I have to worry about @\nimport?
_replace_css_import = re.compile(
    r'@\s*import', re.I).sub

_looks_like_tag_content = re.compile(
    r'</?[a-zA-Z]+|\son[a-zA-Z]+\s*=',
    *((re.ASCII,) if sys.version_info[0] >= 3 else ())).search

# All kinds of schemes besides just javascript: that can cause
# execution:
_find_image_dataurls = re.compile(
    r'data:image/(.+);base64,', re.I).findall
_possibly_malicious_schemes = re.compile(
    r'(javascript|jscript|livescript|vbscript|data|about|mocha):',
    re.I).findall
# SVG images can contain script content
_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).search

def _has_javascript_scheme(s):
    safe_image_urls = 0
    for image_type in _find_image_dataurls(s):
        if _is_unsafe_image_type(image_type):
            return True
        safe_image_urls += 1
    return len(_possibly_malicious_schemes(s)) > safe_image_urls

_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub

# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

_find_external_links = etree.XPath(
    ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x': XHTML_NAMESPACE})


class Cleaner(object):
    """
    Instances clean the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
        as they could contain Javascript.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags.

    ``inline_style``
        Removes any style attributes.  Defaults to the value of the ``style`` option.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``

    ``remove_tags``:
        A list of tags to remove.  Only the tags will be removed,
        their content will get pulled up into the parent tag.

    ``kill_tags``:
        A list of tags to kill.  Killing also removes the tag's content,
        i.e. the whole subtree, not just the tag itself.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from the feedparser HTML sanitisation web site).

    ``safe_attrs``:
        A set of attribute names to override the default list of attributes
        considered 'safe' (when safe_attrs_only=True).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

        Note that you may also need to set ``whitelist_tags``.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control.  Set to None to
        include all tags.

    This modifies the document *in place*.
    """

    scripts = True
    javascript = True
    comments = True
    style = False
    inline_style = None
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    kill_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    safe_attrs = defs.safe_attrs
    add_nofollow = False
    host_whitelist = ()
    whitelist_tags = {'iframe', 'embed'}

    def __init__(self, **kw):
        not_an_attribute = object()
        for name, value in kw.items():
            default = getattr(self, name, not_an_attribute)
            if (default is not None and default is not True and default is not False
                    and not isinstance(default, (frozenset, set, tuple, list))):
                raise TypeError(
                    "Unknown parameter: %s=%r" % (name, value))
            setattr(self, name, value)
        if self.inline_style is None and 'inline_style' not in kw:
            self.inline_style = self.style

        if kw.get("allow_tags"):
            if kw.get("remove_unknown_tags"):
                raise ValueError("It does not make sense to pass in both "
                                 "allow_tags and remove_unknown_tags")
            self.remove_unknown_tags = False

    # Used to lookup the primary URL for a given tag that is up for
    # removal:
    _tag_link_attrs = dict(
        script='src',
        link='href',
        # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
        # From what I can tell, both attributes can contain a link:
        applet=['code', 'object'],
        iframe='src',
        embed='src',
        layer='src',
        # FIXME: there doesn't really seem like a general way to figure out what
        # links an <object> tag uses; links often go in <param> tags with values
        # that we don't really know.  You'd have to have knowledge about specific
        # kinds of plugins (probably keyed off classid), and match against those.
        ##object=?,
        # FIXME: not looking at the action currently, because it is more complex
        # than that -- if you keep the form, you should keep the form controls.
        ##form='action',
        a='href',
    )

    def __call__(self, doc):
        """
        Cleans the document.
        """
        try:
            getroot = doc.getroot
        except AttributeError:
            pass  # Element instance
        else:
            doc = getroot()  # ElementTree instance, instead of an element
        # convert XHTML to HTML
        xhtml_to_html(doc)
        # Normalize a case that IE treats <image> like <img>, and that
        # can confuse either this step or later steps.
        for el in doc.iter('image'):
            el.tag = 'img'
        if not self.comments:
            # Of course, if we were going to kill comments anyway, we don't
            # need to worry about this
            self.kill_conditional_comments(doc)

        kill_tags = set(self.kill_tags or ())
        remove_tags = set(self.remove_tags or ())
        allow_tags = set(self.allow_tags or ())

        if self.scripts:
            kill_tags.add('script')
        if self.safe_attrs_only:
            safe_attrs = set(self.safe_attrs)
            for el in doc.iter(etree.Element):
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not (self.safe_attrs_only and
                    self.safe_attrs == defs.safe_attrs):
                # safe_attrs handles events attributes itself
                for el in doc.iter(etree.Element):
                    attrib = el.attrib
                    for aname in attrib.keys():
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            if not self.inline_style:
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _replace_css_javascript('', old)
                    new = _replace_css_import('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
            if not self.style:
                for el in list(doc.iter('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _replace_css_javascript('', old)
                    # The imported CSS can do anything; we just can't allow:
                    new = _replace_css_import('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments:
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
        if self.inline_style:
            etree.strip_attributes(doc, 'style')
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # Note this kills alternate stylesheets as well
                    if not self.allow_element(el):
                        el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            # We should get rid of any <param> tags not inside <applet>;
            # These are not really valid anyway.
            for el in list(doc.iter('param')):
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet', 'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet',))
            # The alternate contents that are in an iframe are a good fallback:
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
        if self.frames:
            # FIXME: ideally we should look at the frame links, but
            # generally frames don't mix properly with an HTML
            # fragment anyway.
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # We have to drop the parent-most tag, which we can't
            # do.  Instead we'll rewrite it:
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # We have to drop the parent-most element, which we can't
            # do.  Instead we'll clear it:
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        _kill.reverse()  # start with innermost tags
        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    "It does not make sense to pass in both allow_tags and remove_unknown_tags")
            allow_tags = set(defs.tags)
        if allow_tags:
            # make sure we do not remove comments/PIs if users want them (which is rare enough)
            if not self.comments:
                allow_tags.add(etree.Comment)
            if not self.processing_instructions:
                allow_tags.add(etree.ProcessingInstruction)

            bad = []
            for el in doc.iter():
                if el.tag not in allow_tags:
                    bad.append(el)
            if bad:
                if bad[0] is doc:
                    el = bad.pop(0)
                    el.tag = 'div'
                    el.attrib.clear()
                for el in bad:
                    el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    rel = el.get('rel')
                    if rel:
                        if ('nofollow' in rel
                                and ' nofollow ' in (' %s ' % rel)):
                            continue
                        rel = '%s nofollow' % rel
                    else:
                        rel = 'nofollow'
                    el.set('rel', rel)

    def allow_follow(self, anchor):
        """
        Override to suppress rel="nofollow" on some anchors.
        """
        return False

    def allow_element(self, el):
        """
        Decide whether an element is configured to be accepted or rejected.

        :param el: an element.
        :return: true to accept the element or false to reject/discard it.
        """
        if el.tag not in self._tag_link_attrs:
            return False
        attr = self._tag_link_attrs[el.tag]
        if isinstance(attr, (list, tuple)):
            for one_attr in attr:
                url = el.get(one_attr)
                if not url:
                    return False
                if not self.allow_embedded_url(el, url):
                    return False
            return True
        else:
            url = el.get(attr)
            if not url:
                return False
            return self.allow_embedded_url(el, url)

    def allow_embedded_url(self, el, url):
        """
        Decide whether a URL that was found in an element's attributes or text
        is configured to be accepted or rejected.

        :param el: an element.
        :param url: a URL found on the element.
        :return: true to accept the URL and false to reject it.
        """
        if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
            return False
        parts = urlsplit(url)
        if parts.scheme not in ('http', 'https'):
            return False
        if parts.hostname in self.host_whitelist:
            return True
        return False

    def kill_conditional_comments(self, doc):
        """
        IE conditional comments basically embed HTML that the parser
        doesn't normally see.  We can't allow anything like that, so
        we'll kill any comments that could be conditional.
        """
        has_conditional_comment = _conditional_comment_re.search
        self._kill_elements(
            doc, lambda el: has_conditional_comment(el.text),
            etree.Comment)

    def _kill_elements(self, doc, condition, iterate=None):
        bad = []
        for el in doc.iter(iterate):
            if condition(el):
                bad.append(el)
        for el in bad:
            el.drop_tree()

    def _remove_javascript_link(self, link):
        # links like "j a v a s c r i p t:" might be interpreted in IE
        new = _substitute_whitespace('', unquote_plus(link))
        if _has_javascript_scheme(new):
            # FIXME: should this be None to delete?
            return ''
        return link

    _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub

    def _has_sneaky_javascript(self, style):
        """
        Depending on the browser, stuff like ``e x p r e s s i o n(...)``
        can get interpreted, or ``expre/* stuff */ssion(...)``.  This
        checks for attempts to do stuff like this.

        Typically the response will be to kill the entire style; if you
        have just a bit of Javascript in the style another rule will catch
        that and remove only the Javascript from the style; this catches
        more sneaky attempts.
        """
        style = self._substitute_comments('', style)
        style = style.replace('\\', '')
        style = _substitute_whitespace('', style)
        style = style.lower()
        if _has_javascript_scheme(style):
            return True
        if 'expression(' in style:
            return True
        if '@import' in style:
            return True
        if '</noscript' in style:
            # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
            return True
        if _looks_like_tag_content(style):
            # e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
            return True
        return False

    def clean_html(self, html):
        result_type = type(html)
        if isinstance(html, basestring):
            doc = fromstring(html)
        else:
            doc = copy.deepcopy(html)
        self(doc)
        return _transform_result(result_type, doc)

clean = Cleaner()
clean_html = clean.clean_html
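A short usage sketch (the HTML strings are illustrative): the module-level clean_html applies the default settings, while a Cleaner instance takes the documented options as keyword arguments:

from lxml.html.clean import Cleaner, clean_html

# Default cleaner: scripts, event handlers and comments are removed.
print(clean_html('<p onclick="evil()">Hi<script>evil()</script></p>'))

# Custom cleaner: also strip style and <link> tags, and mark
# external anchors with rel="nofollow".
cleaner = Cleaner(style=True, links=True, add_nofollow=True)
print(cleaner.clean_html('<div><a href="http://lxml.de/">lxml</a></div>'))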
############################################################
## Autolinking
############################################################

_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    # This is conservative, but autolinking can be a bit conservative:
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
]

_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
]

_avoid_classes = ['nolink']

def autolink(el, link_regexes=_link_regexes,
             avoid_elements=_avoid_elements,
             avoid_hosts=_avoid_hosts,
             avoid_classes=_avoid_classes):
    """
    Turn any URLs into links.

    It will search for links identified by the given regular
    expressions (by default mailto and http(s) links).

    It won't link text in an element in avoid_elements, or an element
    with a class in avoid_classes.  It won't link to anything with a
    host that matches one of the regular expressions in avoid_hosts
    (default localhost and 127.0.0.1).

    If you pass in an element, the element's tail will not be
    substituted, only the contents of the element.
    """
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        class_name = class_name.split()
        for match_class in avoid_classes:
            if match_class in class_name:
                return
    for child in list(el):
        autolink(child, link_regexes=link_regexes,
                 avoid_elements=avoid_elements,
                 avoid_hosts=avoid_hosts,
                 avoid_classes=avoid_classes)
        if child.tail:
            text, tail_children = _link_text(
                child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
            if tail_children:
                child.tail = text
                index = el.index(child)
                el[index+1:index+1] = tail_children
    if el.text:
        text, pre_children = _link_text(
            el.text, link_regexes, avoid_hosts, factory=el.makeelement)
        if pre_children:
            el.text = text
            el[:0] = pre_children

def _link_text(text, link_regexes, avoid_hosts, factory):
    leading_text = ''
    links = []
    last_pos = 0
    while 1:
        best_match, best_pos = None, None
        for regex in link_regexes:
            regex_pos = last_pos
            while 1:
                match = regex.search(text, pos=regex_pos)
                if match is None:
                    break
                host = match.group('host')
                for host_regex in avoid_hosts:
                    if host_regex.search(host):
                        regex_pos = match.end()
                        break
                else:
                    break
            if match is None:
                continue
            if best_pos is None or match.start() < best_pos:
                best_match = match
                best_pos = match.start()
        if best_match is None:
            # No more matches
            if links:
                assert not links[-1].tail
                links[-1].tail = text
            else:
                assert not leading_text
                leading_text = text
            break
        link = best_match.group(0)
        end = best_match.end()
        if link.endswith('.') or link.endswith(','):
            # These punctuation marks shouldn't end a link
            end -= 1
            link = link[:-1]
        prev_text = text[:best_match.start()]
        if links:
            assert not links[-1].tail
            links[-1].tail = prev_text
        else:
            assert not leading_text
            leading_text = prev_text
        anchor = factory('a')
        anchor.set('href', link)
        body = best_match.group('body')
        if not body:
            body = link
        if body.endswith('.') or body.endswith(','):
            body = body[:-1]
        anchor.text = body
        links.append(anchor)
        text = text[end:]
    return leading_text, links

def autolink_html(html, *args, **kw):
    result_type = type(html)
    if isinstance(html, basestring):
        doc = fromstring(html)
    else:
        doc = copy.deepcopy(html)
    autolink(doc, *args, **kw)
    return _transform_result(result_type, doc)

autolink_html.__doc__ = autolink.__doc__
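A usage sketch for autolink_html (the text is illustrative; note that example.com hosts are skipped by the default _avoid_hosts patterns):

from lxml.html.clean import autolink_html

# The bare URL becomes <a href="https://lxml.de/api.html">...</a>.
print(autolink_html('<p>See https://lxml.de/api.html for the API docs</p>'))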
############################################################
## Word wrapping
############################################################

_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']

def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in avoid_elements, by default
    ``<textarea>`` and ``<pre>``

    Breaks words by inserting &#8203;, the Unicode zero-width space
    character.  This generally takes up no space in rendering, but does
    copy as a space, and in monospace contexts usually takes up space.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203; comes from:
    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
    if el.tag in _avoid_word_break_elements:
        return
    class_name = el.get('class')
    if class_name:
        dont_break = False
        class_name = class_name.split()
        for avoid in avoid_classes:
            if avoid in class_name:
                dont_break = True
                break
        if dont_break:
            return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)

def word_break_html(html, *args, **kw):
    result_type = type(html)
    doc = fromstring(html)
    word_break(doc, *args, **kw)
    return _transform_result(result_type, doc)

def _break_text(text, max_width, break_character):
    words = text.split()
    for word in words:
        if len(word) > max_width:
            replacement = _insert_break(word, max_width, break_character)
            text = text.replace(word, replacement)
    return text

_break_prefer_re = re.compile(r'[^a-z]', re.I)

def _insert_break(word, width, break_character):
    orig_word = word
    result = ''
    while len(word) > width:
        start = word[:width]
        breaks = list(_break_prefer_re.finditer(start))
        if breaks:
            last_break = breaks[-1]
            # Only walk back up to 10 characters to find a nice break:
            if last_break.end() > width-10:
                # FIXME: should the break character be at the end of the
                # chunk, or the beginning of the next chunk?
                start = word[:last_break.end()]
        result += start + break_character
        word = word[len(start):]
    result += word
    return result
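A usage sketch for word_break_html (the input string is illustrative):

from lxml.html.clean import word_break_html

# Words longer than max_width get U+200B zero-width spaces inserted.
broken = word_break_html('<p>pneumonoultramicroscopicsilicovolcanoconiosis</p>',
                         max_width=20)
print(broken.count('\u200b'))  # how many break points were inserted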
135
.venv/lib/python3.7/site-packages/lxml/html/defs.py
Normal file
@@ -0,0 +1,135 @@
# FIXME: this should all be confirmed against what a DTD says
# (probably in a test; this may not match the DTD exactly, but we
# should document just how it differs).

"""
Data taken from https://www.w3.org/TR/html401/index/elements.html
and https://www.w3.org/community/webed/wiki/HTML/New_HTML5_Elements
for html5_tags.
"""

empty_tags = frozenset([
    'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
    'img', 'input', 'isindex', 'link', 'meta', 'param', 'source', 'track'])

deprecated_tags = frozenset([
    'applet', 'basefont', 'center', 'dir', 'font', 'isindex',
    'menu', 's', 'strike', 'u'])

# archive actually takes a space-separated list of URIs
link_attrs = frozenset([
    'action', 'archive', 'background', 'cite', 'classid',
    'codebase', 'data', 'href', 'longdesc', 'profile', 'src',
    'usemap',
    # Not standard:
    'dynsrc', 'lowsrc',
    # HTML5 formaction
    'formaction'
    ])

# Not in the HTML 4 spec:
# onerror, onresize
event_attrs = frozenset([
    'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror',
    'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload',
    'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover',
    'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit',
    'onunload',
    ])

safe_attrs = frozenset([
    'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
    'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff',
    'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan',
    'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
    'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id',
    'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
    'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
    'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
    'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
    'type', 'usemap', 'valign', 'value', 'vspace', 'width'])

# From http://htmlhelp.com/reference/html40/olist.html
top_level_tags = frozenset([
    'html', 'head', 'body', 'frameset',
    ])

head_tags = frozenset([
    'base', 'isindex', 'link', 'meta', 'script', 'style', 'title',
    ])

general_block_tags = frozenset([
    'address',
    'blockquote',
    'center',
    'del',
    'div',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'ins',
    'isindex',
    'noscript',
    'p',
    'pre',
    ])

list_tags = frozenset([
    'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul',
    ])

table_tags = frozenset([
    'table', 'caption', 'colgroup', 'col',
    'thead', 'tfoot', 'tbody', 'tr', 'td', 'th',
    ])

# just this one from
# http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm
block_tags = general_block_tags | list_tags | table_tags | frozenset([
    # Partial form tags
    'fieldset', 'form', 'legend', 'optgroup', 'option',
    ])

form_tags = frozenset([
    'form', 'button', 'fieldset', 'legend', 'input', 'label',
    'select', 'optgroup', 'option', 'textarea',
    ])

special_inline_tags = frozenset([
    'a', 'applet', 'basefont', 'bdo', 'br', 'embed', 'font', 'iframe',
    'img', 'map', 'area', 'object', 'param', 'q', 'script',
    'span', 'sub', 'sup',
    ])

phrase_tags = frozenset([
    'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em',
    'ins', 'kbd', 'samp', 'strong', 'var',
    ])

font_style_tags = frozenset([
    'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u',
    ])

frame_tags = frozenset([
    'frameset', 'frame', 'noframes',
    ])

html5_tags = frozenset([
    'article', 'aside', 'audio', 'canvas', 'command', 'datalist',
    'details', 'embed', 'figcaption', 'figure', 'footer', 'header',
    'hgroup', 'keygen', 'mark', 'math', 'meter', 'nav', 'output',
    'progress', 'rp', 'rt', 'ruby', 'section', 'source', 'summary',
    'svg', 'time', 'track', 'video', 'wbr'
    ])

# These tags aren't standard
nonstandard_tags = frozenset(['blink', 'marquee'])


tags = (top_level_tags | head_tags | general_block_tags | list_tags
        | table_tags | form_tags | special_inline_tags | phrase_tags
        | font_style_tags | nonstandard_tags | html5_tags)
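These frozensets compose into the master tags set; a quick sketch of how they are typically consulted:

from lxml.html import defs

assert 'br' in defs.empty_tags          # void elements never get a closing tag
assert 'blink' in defs.nonstandard_tags
assert 'article' in defs.html5_tags
assert 'div' in defs.tags               # union of all the groups above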
Binary file not shown.
884
.venv/lib/python3.7/site-packages/lxml/html/diff.py
Normal file
@@ -0,0 +1,884 @@
# cython: language_level=3

from __future__ import absolute_import

import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re

__all__ = ['html_annotate', 'htmldiff']

try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape
try:
    _unicode = unicode
except NameError:
    # Python 3
    _unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = str

############################################################
## Annotation
############################################################

def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(_unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # The basic strategy we have is to split the documents up into
    # logical tokens (which are words with attached markup).  We then
    # do diffs of each of the versions to track when a token first
    # appeared in the document; the annotation attached to the token
    # is the version where it first appeared.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After we've tracked all the tokens, we can combine spans of text
    # that are adjacent and have the same annotation
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add markup
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()

def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not result[-1].post_tags and
                not tok.pre_tags and
                result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result

def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = _unicode(last)
        if last.trailing_whitespace:
            text += last.trailing_whitespace
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += token.trailing_whitespace
        yield html
        for post in token.post_tags:
            yield post


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)
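A usage sketch for htmldiff (the fragments are illustrative):

from lxml.html.diff import htmldiff

old = '<p>The quick brown fox</p>'
new = '<p>The quick red fox jumps</p>'
# Removed words come back wrapped in <del>, added words in <ins>.
print(htmldiff(old, new))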
|
||||
def htmldiff_tokens(html1_tokens, html2_tokens):
|
||||
""" Does a diff on the tokens themselves, returning a list of text
|
||||
chunks (not tokens).
|
||||
"""
|
||||
# There are several passes as we do the differences. The tokens
|
||||
# isolate the portion of the content we care to diff; difflib does
|
||||
# all the actual hard work at that point.
|
||||
#
|
||||
# Then we must create a valid document from pieces of both the old
|
||||
# document and the new document. We generally prefer to take
|
||||
# markup from the new document, and only do a best effort attempt
|
||||
# to keep markup from the old document; anything that we can't
|
||||
# resolve we throw away. Also we try to put the deletes as close
|
||||
# to the location where we think they would have been -- because
|
||||
# we are only keeping the markup from the new document, it can be
|
||||
# fuzzy where in the new document the old text would have gone.
|
||||
# Again we just do a best effort attempt.
|
||||
s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
|
||||
commands = s.get_opcodes()
|
||||
result = []
|
||||
for command, i1, i2, j1, j2 in commands:
|
||||
if command == 'equal':
|
||||
result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
|
||||
continue
|
||||
if command == 'insert' or command == 'replace':
|
||||
ins_tokens = expand_tokens(html2_tokens[j1:j2])
|
||||
merge_insert(ins_tokens, result)
|
||||
if command == 'delete' or command == 'replace':
|
||||
del_tokens = expand_tokens(html1_tokens[i1:i2])
|
||||
merge_delete(del_tokens, result)
|
||||
# If deletes were inserted directly as <del> then we'd have an
|
||||
# invalid document at this point. Instead we put in special
|
||||
# markers, and when the complete diffed document has been created
|
||||
# we try to move the deletes around and resolve any problems.
|
||||
result = cleanup_delete(result)
|
||||
|
||||
return result
|
||||
|
||||
def expand_tokens(tokens, equal=False):
|
||||
"""Given a list of tokens, return a generator of the chunks of
|
||||
text for the data in the tokens.
|
||||
"""
|
||||
for token in tokens:
|
||||
for pre in token.pre_tags:
|
||||
yield pre
|
||||
if not equal or not token.hide_when_equal:
|
||||
if token.trailing_whitespace:
|
||||
yield token.html() + token.trailing_whitespace
|
||||
else:
|
||||
yield token.html()
|
||||
for post in token.post_tags:
|
||||
yield post
|
||||
|
||||
def merge_insert(ins_chunks, doc):
|
||||
""" doc is the already-handled document (as a list of text chunks);
|
||||
here we add <ins>ins_chunks</ins> to the end of that. """
|
||||
# Though we don't throw away unbalanced_start or unbalanced_end
|
||||
# (we assume there is accompanying markup later or earlier in the
|
||||
# document), we only put <ins> around the balanced portion.
|
||||
unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
|
||||
doc.extend(unbalanced_start)
|
||||
if doc and not doc[-1].endswith(' '):
|
||||
# Fix up the case where the word before the insert didn't end with
|
||||
# a space
|
||||
doc[-1] += ' '
|
||||
doc.append('<ins>')
|
||||
if balanced and balanced[-1].endswith(' '):
|
||||
# We move space outside of </ins>
|
||||
balanced[-1] = balanced[-1][:-1]
|
||||
doc.extend(balanced)
|
||||
doc.append('</ins> ')
|
||||
doc.extend(unbalanced_end)
|
||||
|
||||
# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    pass
class DEL_END:
    pass

class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with markers to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location to where it was originally located (e.g., moving a
    delete into a preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END)"""
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing found, we've cleaned up the entire doc
            break
        # The stuff-inside-DEL_START/END may not be well balanced
        # markup.  First we figure out what unbalanced portions there are:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then we move the span forward and/or backward based on these
        # unbalanced portions:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up case where the word before us didn't have a trailing space
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # We move space outside of </del>
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks

def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end

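# Example (editor's illustration, not part of the upstream lxml source): a
# span that opens <b> but closes an unmatched </i> splits as
#     split_unbalanced(['<b>', 'hello ', '</i>'])
#     -> (['<b>'], ['hello '], ['</i>'])
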
def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found. """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]

def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word, we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag, can we go further?  Maybe not...
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # Found a tag that doesn't match
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document. """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # Found a tag that doesn't match
            break

class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, even though we do not want to treat the word as
    equivalent to a similar word that does not have a trailing
    space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = _unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
                                          self.post_tags, self.trailing_whitespace)

    def html(self):
        return _unicode(self)

class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag. """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=""):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes. """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self

def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and return token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Then we split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Finally re-joining them into token objects:
    return fixup_chunks(chunks)

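# Example (editor's illustration, not part of the upstream lxml source):
# tokenize('<p>Hello world</p>') yields roughly two word tokens, 'Hello'
# and 'world', with the surrounding <p>/</p> markup attached to them as
# pre_tags/post_tags.
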
def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)

_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed. """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html


end_whitespace_re = re.compile(r'[ \t\n\r]$')

def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]


def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)

            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
                tag_accum = []
                result.append(cur_word)
            continue

        if is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)

        elif is_start_tag(chunk):
            tag_accum.append(chunk)

        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result


# All the tags in HTML that don't require end tags:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )


def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)

split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)

def split_words(text):
    """ Splits some text into words.  Includes trailing whitespace
    on each word when appropriate. """
    if not text or not text.strip():
        return []

    words = split_words_re.findall(text)
    return words

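# Example (editor's illustration, not part of the upstream lxml source):
#     >>> split_words('Hello  world\n')
#     ['Hello  ', 'world\n']
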
start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
                         for name, value in el.attrib.items()]))

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate. """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html

def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html

def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for tag in ['ins', 'del']:
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if not _contains_block_level_tag(el):
                continue
            _move_el_inside_block(el, tag=tag)
            el.drop_tag()
            #_merge_element_contents(el)

def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like <p>, <td>, etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    for child in el:
        if _contains_block_level_tag(child):
            return True
    return False

def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
    and moves them inside any block-level tags. """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level tags in any child
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                el.insert(el.index(child)+1, tail_tag)
        else:
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()

class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]

if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()
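
# Example (editor's illustration, not part of the upstream lxml source; the
# exact whitespace in the output may differ slightly):
#     >>> from lxml.html.diff import htmldiff
#     >>> htmldiff('<p>Here is some text.</p>', '<p>Here is some new text.</p>')
#     '<p>Here is some <ins>new</ins> text.</p>'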
299
.venv/lib/python3.7/site-packages/lxml/html/formfill.py
Normal file
@@ -0,0 +1,299 @@
from lxml.etree import XPath, ElementBase
from lxml.html import fromstring, XHTML_NAMESPACE
from lxml.html import _forms_xpath, _options_xpath, _nons, _transform_result
from lxml.html import defs
import copy

try:
    basestring
except NameError:
    # Python 3
    basestring = str

__all__ = ['FormNotFound', 'fill_form', 'fill_form_html',
           'insert_errors', 'insert_errors_html',
           'DefaultErrorCreator']

class FormNotFound(LookupError):
    """
    Raised when no form can be found
    """

_form_name_xpath = XPath('descendant-or-self::form[name=$name]|descendant-or-self::x:form[name=$name]', namespaces={'x':XHTML_NAMESPACE})
_input_xpath = XPath('|'.join(['descendant-or-self::'+_tag for _tag in ('input','select','textarea','x:input','x:select','x:textarea')]),
                     namespaces={'x':XHTML_NAMESPACE})
_label_for_xpath = XPath('//label[@for=$for_id]|//x:label[@for=$for_id]',
                         namespaces={'x':XHTML_NAMESPACE})
_name_xpath = XPath('descendant-or-self::*[@name=$name]')

def fill_form(
    el,
    values,
    form_id=None,
    form_index=None,
    ):
    el = _find_form(el, form_id=form_id, form_index=form_index)
    _fill_form(el, values)

def fill_form_html(html, values, form_id=None, form_index=None):
    result_type = type(html)
    if isinstance(html, basestring):
        doc = fromstring(html)
    else:
        doc = copy.deepcopy(html)
    fill_form(doc, values, form_id=form_id, form_index=form_index)
    return _transform_result(result_type, doc)

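# Example (editor's illustration, not part of the upstream lxml source; the
# exact serialization may differ slightly):
#     >>> from lxml.html.formfill import fill_form_html
#     >>> fill_form_html('<form><input type="text" name="email"></form>',
#     ...                {'email': 'user@example.com'})
#     '<form><input type="text" name="email" value="user@example.com"></form>'
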
def _fill_form(el, values):
    counts = {}
    if hasattr(values, 'mixed'):
        # For Paste request parameters
        values = values.mixed()
    inputs = _input_xpath(el)
    for input in inputs:
        name = input.get('name')
        if not name:
            continue
        if _takes_multiple(input):
            value = values.get(name, [])
            if not isinstance(value, (list, tuple)):
                value = [value]
            _fill_multiple(input, value)
        elif name not in values:
            continue
        else:
            index = counts.get(name, 0)
            counts[name] = index + 1
            value = values[name]
            if isinstance(value, (list, tuple)):
                try:
                    value = value[index]
                except IndexError:
                    continue
            elif index > 0:
                continue
            _fill_single(input, value)

def _takes_multiple(input):
    if _nons(input.tag) == 'select' and input.get('multiple'):
        # FIXME: multiple="0"?
        return True
    type = input.get('type', '').lower()
    if type in ('radio', 'checkbox'):
        return True
    return False

def _fill_multiple(input, value):
    type = input.get('type', '').lower()
    if type == 'checkbox':
        v = input.get('value')
        if v is None:
            if not value:
                result = False
            else:
                result = value[0]
            if isinstance(value, basestring):
                # The only valid "on" value for an unnamed checkbox is 'on'
                result = result == 'on'
            _check(input, result)
        else:
            _check(input, v in value)
    elif type == 'radio':
        v = input.get('value')
        _check(input, v in value)
    else:
        assert _nons(input.tag) == 'select'
        for option in _options_xpath(input):
            v = option.get('value')
            if v is None:
                # This seems to be the default, at least on IE
                # FIXME: but I'm not sure
                v = option.text_content()
            _select(option, v in value)

def _check(el, check):
    if check:
        el.set('checked', '')
    else:
        if 'checked' in el.attrib:
            del el.attrib['checked']

def _select(el, select):
    if select:
        el.set('selected', '')
    else:
        if 'selected' in el.attrib:
            del el.attrib['selected']

def _fill_single(input, value):
    if _nons(input.tag) == 'textarea':
        input.text = value
    else:
        input.set('value', value)

def _find_form(el, form_id=None, form_index=None):
    if form_id is None and form_index is None:
        forms = _forms_xpath(el)
        for form in forms:
            return form
        raise FormNotFound(
            "No forms in page")
    if form_id is not None:
        form = el.get_element_by_id(form_id)
        if form is not None:
            return form
        forms = _form_name_xpath(el, name=form_id)
        if forms:
            return forms[0]
        else:
            raise FormNotFound(
                "No form with the name or id of %r (forms: %s)"
                % (form_id, ', '.join(_find_form_ids(el))))
    if form_index is not None:
        forms = _forms_xpath(el)
        try:
            return forms[form_index]
        except IndexError:
            raise FormNotFound(
                "There is no form with the index %r (%i forms found)"
                % (form_index, len(forms)))

def _find_form_ids(el):
    forms = _forms_xpath(el)
    if not forms:
        yield '(no forms)'
        return
    for index, form in enumerate(forms):
        if form.get('id'):
            if form.get('name'):
                yield '%s or %s' % (form.get('id'),
                                    form.get('name'))
            else:
                yield form.get('id')
        elif form.get('name'):
            yield form.get('name')
        else:
            yield '(unnamed form %s)' % index

############################################################
## Error filling
############################################################

class DefaultErrorCreator(object):
    insert_before = True
    block_inside = True
    error_container_tag = 'div'
    error_message_class = 'error-message'
    error_block_class = 'error-block'
    default_message = "Invalid"

    def __init__(self, **kw):
        for name, value in kw.items():
            if not hasattr(self, name):
                raise TypeError(
                    "Unexpected keyword argument: %s" % name)
            setattr(self, name, value)

    def __call__(self, el, is_block, message):
        error_el = el.makeelement(self.error_container_tag)
        if self.error_message_class:
            error_el.set('class', self.error_message_class)
        if is_block and self.error_block_class:
            error_el.set('class', error_el.get('class', '')+' '+self.error_block_class)
        if message is None or message == '':
            message = self.default_message
        if isinstance(message, ElementBase):
            error_el.append(message)
        else:
            assert isinstance(message, basestring), (
                "Bad message; should be a string or element: %r" % message)
            error_el.text = message or self.default_message
        if is_block and self.block_inside:
            if self.insert_before:
                error_el.tail = el.text
                el.text = None
                el.insert(0, error_el)
            else:
                el.append(error_el)
        else:
            parent = el.getparent()
            pos = parent.index(el)
            if self.insert_before:
                parent.insert(pos, error_el)
            else:
                error_el.tail = el.tail
                el.tail = None
                parent.insert(pos+1, error_el)

default_error_creator = DefaultErrorCreator()


def insert_errors(
    el,
    errors,
    form_id=None,
    form_index=None,
    error_class="error",
    error_creator=default_error_creator,
    ):
    el = _find_form(el, form_id=form_id, form_index=form_index)
    for name, error in errors.items():
        if error is None:
            continue
        for error_el, message in _find_elements_for_name(el, name, error):
            assert isinstance(message, (basestring, type(None), ElementBase)), (
                "Bad message: %r" % message)
            _insert_error(error_el, message, error_class, error_creator)

def insert_errors_html(html, values, **kw):
    result_type = type(html)
    if isinstance(html, basestring):
        doc = fromstring(html)
    else:
        doc = copy.deepcopy(html)
    insert_errors(doc, values, **kw)
    return _transform_result(result_type, doc)

def _insert_error(el, error, error_class, error_creator):
    if _nons(el.tag) in defs.empty_tags or _nons(el.tag) == 'textarea':
        is_block = False
    else:
        is_block = True
    if _nons(el.tag) != 'form' and error_class:
        _add_class(el, error_class)
    if el.get('id'):
        labels = _label_for_xpath(el, for_id=el.get('id'))
        if labels:
            for label in labels:
                _add_class(label, error_class)
    error_creator(el, is_block, error)

def _add_class(el, class_name):
    if el.get('class'):
        el.set('class', el.get('class')+' '+class_name)
    else:
        el.set('class', class_name)

def _find_elements_for_name(form, name, error):
    if name is None:
        # An error for the entire form
        yield form, error
        return
    if name.startswith('#'):
        # By id
        el = form.get_element_by_id(name[1:])
        if el is not None:
            yield el, error
        return
    els = _name_xpath(form, name=name)
    if not els:
        # FIXME: should this raise an exception?
        return
    if not isinstance(error, (list, tuple)):
        yield els[0], error
        return
    # FIXME: if error is longer than els, should it raise an error?
    for el, err in zip(els, error):
        if err is None:
            continue
        yield el, err
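
# Example (editor's illustration, not part of the upstream lxml source; the
# exact serialization may differ):
#     >>> from lxml.html.formfill import insert_errors_html
#     >>> insert_errors_html('<form><input name="email"></form>', {'email': 'Required'})
#     '<form><div class="error-message">Required</div><input name="email" class="error"></form>'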
260
.venv/lib/python3.7/site-packages/lxml/html/html5parser.py
Normal file
@@ -0,0 +1,260 @@
"""
An interface to html5lib that mimics the lxml.html interface.
"""
import sys
import string

from html5lib import HTMLParser as _HTMLParser
from html5lib.treebuilders.etree_lxml import TreeBuilder
from lxml import etree
from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag

# python3 compatibility
try:
    _strings = basestring
except NameError:
    _strings = (bytes, str)
try:
    from urllib2 import urlopen
except ImportError:
    from urllib.request import urlopen
try:
    from urlparse import urlparse
except ImportError:
    from urllib.parse import urlparse


class HTMLParser(_HTMLParser):
    """An html5lib HTML parser with lxml as tree."""

    def __init__(self, strict=False, **kwargs):
        _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)


try:
    from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
    pass
else:
    class XHTMLParser(_XHTMLParser):
        """An html5lib XHTML Parser with lxml as tree."""

        def __init__(self, strict=False, **kwargs):
            _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)

    xhtml_parser = XHTMLParser()


def _find_tag(tree, tag):
    elem = tree.find(tag)
    if elem is not None:
        return elem
    return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))


def document_fromstring(html, guess_charset=None, parser=None):
    """
    Parse a whole document into an element tree.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    options = {}
    if guess_charset is None and isinstance(html, bytes):
        # html5lib does not accept useChardet as an argument if it
        # detects that the html argument would produce unicode objects.
        guess_charset = True
    if guess_charset is not None:
        options['useChardet'] = guess_charset
    return parser.parse(html, **options).getroot()


def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=None, parser=None):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.  If no_leading_text is true,
    then it will be an error if there is leading text, and it will always be
    a list of only elements.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    options = {}
    if guess_charset is None and isinstance(html, bytes):
        # html5lib does not accept useChardet as an argument if it
        # detects that the html argument would produce unicode objects.
        guess_charset = False
    if guess_charset is not None:
        options['useChardet'] = guess_charset
    children = parser.parseFragment(html, 'div', **options)
    if children and isinstance(children[0], _strings):
        if no_leading_text:
            if children[0].strip():
                raise etree.ParserError('There is leading text: %r' %
                                        children[0])
            del children[0]
    return children


def fragment_fromstring(html, create_parent=False,
                        guess_charset=None, parser=None):
    """Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If 'create_parent' is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In
    this case, leading or trailing text is allowed.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    accept_leading_text = bool(create_parent)

    elements = fragments_fromstring(
        html, guess_charset=guess_charset, parser=parser,
        no_leading_text=not accept_leading_text)

    if create_parent:
        if not isinstance(create_parent, _strings):
            create_parent = 'div'
        new_root = Element(create_parent)
        if elements:
            if isinstance(elements[0], _strings):
                new_root.text = elements[0]
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        raise etree.ParserError('Multiple elements found')
    result = elements[0]
    if result.tail and result.tail.strip():
        raise etree.ParserError('Element followed by text: %r' % result.tail)
    result.tail = None
    return result


def fromstring(html, guess_charset=None, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # document starts with doctype or <html>, full document!
    start = html[:50]
    if isinstance(start, bytes):
        # Allow text comparison in python3.
        # Decode as ascii, that also covers latin-1 and utf-8 for the
        # characters we need.
        start = start.decode('ascii', 'replace')

    start = start.lstrip().lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        return doc

    head = _find_tag(doc, 'head')

    # if the head is not empty we have a full document
    if len(head):
        return doc

    body = _find_tag(doc, 'body')

    # The body has just one element, so it was probably a single
    # element passed in
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body


def parse(filename_url_or_file, guess_charset=None, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).
    """
    if parser is None:
        parser = html_parser
    if not isinstance(filename_url_or_file, _strings):
        fp = filename_url_or_file
        if guess_charset is None:
            # assume that file-like objects return Unicode more often than bytes
            guess_charset = False
    elif _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
        if guess_charset is None:
            # assume that URLs return bytes
            guess_charset = True
    else:
        fp = open(filename_url_or_file, 'rb')
        if guess_charset is None:
            guess_charset = True

    options = {}
    # html5lib does not accept useChardet as an argument if it
    # detects that the html argument would produce unicode objects.
    if guess_charset:
        options['useChardet'] = guess_charset
    return parser.parse(fp, **options)


def _looks_like_url(str):
    scheme = urlparse(str)[0]
    if not scheme:
        return False
    elif (sys.platform == 'win32' and
          scheme in string.ascii_letters
          and len(scheme) == 1):
        # looks like a 'normal' absolute path
        return False
    else:
        return True


html_parser = HTMLParser()
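
# Example (editor's illustration, not part of the upstream lxml source):
# html5lib puts parsed elements into the XHTML namespace by default:
#     >>> from lxml.html import html5parser
#     >>> el = html5parser.fromstring('<p>Hello</p>')
#     >>> el.tag
#     '{http://www.w3.org/1999/xhtml}p'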
314
.venv/lib/python3.7/site-packages/lxml/html/soupparser.py
Normal file
@@ -0,0 +1,314 @@
"""External interface to the BeautifulSoup HTML parser.
"""

__all__ = ["fromstring", "parse", "convert_tree"]

import re
from lxml import etree, html

try:
    from bs4 import (
        BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
        Declaration, Doctype)
    _DECLARATION_OR_DOCTYPE = (Declaration, Doctype)
except ImportError:
    from BeautifulSoup import (
        BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
        Declaration)
    _DECLARATION_OR_DOCTYPE = Declaration


def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a string of HTML data into an Element tree using the
    BeautifulSoup parser.

    Returns the root ``<html>`` Element of the tree.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.
    """
    return _parse(data, beautifulsoup, makeelement, **bsargs)


def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a file into an ElementTree using the BeautifulSoup parser.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.
    """
    if not hasattr(file, 'read'):
        file = open(file)
    root = _parse(file, beautifulsoup, makeelement, **bsargs)
    return etree.ElementTree(root)


def convert_tree(beautiful_soup_tree, makeelement=None):
    """Convert a BeautifulSoup tree to a list of Element trees.

    Returns a list instead of a single root Element to support
    HTML-like soup with more than one root element.

    You can pass a different Element factory through the `makeelement`
    keyword.
    """
    root = _convert_tree(beautiful_soup_tree, makeelement)
    children = root.getchildren()
    for child in children:
        root.remove(child)
    return children


# helpers

def _parse(source, beautifulsoup, makeelement, **bsargs):
    if beautifulsoup is None:
        beautifulsoup = BeautifulSoup
    if hasattr(beautifulsoup, "HTML_ENTITIES"):  # bs3
        if 'convertEntities' not in bsargs:
            bsargs['convertEntities'] = 'html'
    if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"):  # bs4
        if 'features' not in bsargs:
            bsargs['features'] = 'html.parser'  # use Python html parser
    tree = beautifulsoup(source, **bsargs)
    root = _convert_tree(tree, makeelement)
    # from ET: wrap the document in a html root element, if necessary
    if len(root) == 1 and root[0].tag == "html":
        return root[0]
    root.tag = "html"
    return root


_parse_doctype_declaration = re.compile(
    r'(?:\s|[<!])*DOCTYPE\s*HTML'
    r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?'
    r'(?:\s+(\'[^\']*\'|"[^"]*"))?',
    re.IGNORECASE).match


class _PseudoTag:
    # Minimal imitation of BeautifulSoup.Tag
    def __init__(self, contents):
        self.name = 'html'
        self.attrs = []
        self.contents = contents

    def __iter__(self):
        return self.contents.__iter__()


def _convert_tree(beautiful_soup_tree, makeelement):
    if makeelement is None:
        makeelement = html.html_parser.makeelement

    # Split the tree into three parts:
    # i) everything before the root element: document type
    # declaration, comments, processing instructions, whitespace
    # ii) the root(s),
    # iii) everything after the root: comments, processing
    # instructions, whitespace
    first_element_idx = last_element_idx = None
    html_root = declaration = None
    for i, e in enumerate(beautiful_soup_tree):
        if isinstance(e, Tag):
            if first_element_idx is None:
                first_element_idx = i
            last_element_idx = i
            if html_root is None and e.name and e.name.lower() == 'html':
                html_root = e
        elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE):
            declaration = e

    # For a nice, well-formatted document, the variable roots below is
    # a list consisting of a single <html> element.  However, the document
    # may be a soup like '<meta><head><title>Hello</head><body>Hi
    # all<\p>'.  In this example roots is a list containing meta, head
    # and body elements.
    if first_element_idx is None:
        pre_root = post_root = []
        roots = beautiful_soup_tree.contents
    else:
        pre_root = beautiful_soup_tree.contents[:first_element_idx]
        roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1]
        post_root = beautiful_soup_tree.contents[last_element_idx+1:]

    # Reorganize so that there is one <html> root...
    if html_root is not None:
        # ... use existing one if possible, ...
        i = roots.index(html_root)
        html_root.contents = roots[:i] + html_root.contents + roots[i+1:]
    else:
        # ... otherwise create a new one.
        html_root = _PseudoTag(roots)

    convert_node = _init_node_converters(makeelement)

    # Process pre_root
    res_root = convert_node(html_root)
    prev = res_root
    for e in reversed(pre_root):
        converted = convert_node(e)
        if converted is not None:
            prev.addprevious(converted)
            prev = converted

    # ditto for post_root
    prev = res_root
    for e in post_root:
        converted = convert_node(e)
        if converted is not None:
            prev.addnext(converted)
            prev = converted

    if declaration is not None:
        try:
            # bs4 provides full Doctype string
            doctype_string = declaration.output_ready()
        except AttributeError:
            doctype_string = declaration.string

        match = _parse_doctype_declaration(doctype_string)
        if not match:
            # Something is wrong if we end up in here.  Since soupparser should
            # tolerate errors, do not raise Exception, just let it pass.
            pass
        else:
            external_id, sys_uri = match.groups()
            docinfo = res_root.getroottree().docinfo
            # strip quotes and update DOCTYPE values (any of None, '', '...')
            docinfo.public_id = external_id and external_id[1:-1]
            docinfo.system_url = sys_uri and sys_uri[1:-1]

    return res_root


def _init_node_converters(makeelement):
    converters = {}
    ordered_node_types = []

    def converter(*types):
        def add(handler):
            for t in types:
                converters[t] = handler
                ordered_node_types.append(t)
            return handler
        return add

    def find_best_converter(node):
        for t in ordered_node_types:
            if isinstance(node, t):
                return converters[t]
        return None

    def convert_node(bs_node, parent=None):
        # duplicated in convert_tag() below
        try:
            handler = converters[type(bs_node)]
        except KeyError:
            handler = converters[type(bs_node)] = find_best_converter(bs_node)
        if handler is None:
            return None
        return handler(bs_node, parent)

    def map_attrs(bs_attrs):
        if isinstance(bs_attrs, dict):  # bs4
            attribs = {}
            for k, v in bs_attrs.items():
                if isinstance(v, list):
                    v = " ".join(v)
                attribs[k] = unescape(v)
        else:
            attribs = dict((k, unescape(v)) for k, v in bs_attrs)
        return attribs

    def append_text(parent, text):
        if len(parent) == 0:
            parent.text = (parent.text or '') + text
        else:
            parent[-1].tail = (parent[-1].tail or '') + text

    # converters are tried in order of their definition

    @converter(Tag, _PseudoTag)
    def convert_tag(bs_node, parent):
        attrs = bs_node.attrs
        if parent is not None:
            attribs = map_attrs(attrs) if attrs else None
            res = etree.SubElement(parent, bs_node.name, attrib=attribs)
        else:
            attribs = map_attrs(attrs) if attrs else {}
            res = makeelement(bs_node.name, attrib=attribs)

        for child in bs_node:
            # avoid double recursion by inlining convert_node(), see above
            try:
                handler = converters[type(child)]
            except KeyError:
                pass
            else:
                if handler is not None:
                    handler(child, res)
                continue
            convert_node(child, res)
        return res

    @converter(Comment)
    def convert_comment(bs_node, parent):
        res = html.HtmlComment(bs_node)
        if parent is not None:
            parent.append(res)
        return res

    @converter(ProcessingInstruction)
    def convert_pi(bs_node, parent):
        if bs_node.endswith('?'):
            # The PI is of XML style (<?as df?>) but BeautifulSoup
            # interpreted it as being SGML style (<?as df>).  Fix.
            bs_node = bs_node[:-1]
        res = etree.ProcessingInstruction(*bs_node.split(' ', 1))
        if parent is not None:
            parent.append(res)
        return res

    @converter(NavigableString)
    def convert_text(bs_node, parent):
        if parent is not None:
            append_text(parent, unescape(bs_node))
        return None

    return convert_node


# copied from ET's ElementSoup

try:
    from html.entities import name2codepoint  # Python 3
except ImportError:
    from htmlentitydefs import name2codepoint


handle_entities = re.compile(r"&(\w+);").sub


try:
    unichr
except NameError:
    # Python 3
    unichr = chr


def unescape(string):
    if not string:
        return ''
    # work around oddities in BeautifulSoup's entity handling
    def unescape_entity(m):
        try:
            return unichr(name2codepoint[m.group(1)])
        except KeyError:
            return m.group(0)  # use as is
    return handle_entities(unescape_entity, string)
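
# Example (editor's illustration, not part of the upstream lxml source): even
# badly malformed soup comes back wrapped in a single <html> root:
#     >>> from lxml.html import soupparser
#     >>> root = soupparser.fromstring('<meta><head><title>Hi</head><body>text')
#     >>> root.tag
#     'html'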
13
.venv/lib/python3.7/site-packages/lxml/html/usedoctest.py
Normal file
@@ -0,0 +1,13 @@
"""Doctest module for HTML comparison.

Usage::

   >>> import lxml.html.usedoctest
   >>> # now do your HTML doctests ...

See `lxml.doctestcompare`.
"""

from lxml import doctestcompare

doctestcompare.temp_install(html=True, del_module=__name__)