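# NOTE(review): the status text below appears to be page-header residue captured
# together with this file; it is not part of the module.
#   Spaces: Runtime error / Runtime error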
from __future__ import absolute_import, division, unicode_literals

import re
from codecs import register_error, xmlcharrefreplace_errors
from xml.sax.saxutils import escape

from pip._vendor.six import text_type

from . import treewalkers, _utils
from .constants import voidElements, booleanAttributes, spaceCharacters
from .constants import rcdataElements, entities, xmlEntities
# Characters that force an attribute value to be quoted in "spec" mode:
# the space characters plus the quote, equals, angle-bracket and backtick
# characters.
_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")

# "legacy" mode quotes on everything "spec" mode does, plus the control
# characters, "/", "`" and the additional Unicode characters listed below.
_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
                                   "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
                                   "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
                                   "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
                                   "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
                                   "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
                                   "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
                                   "\u3000]")
# Reverse lookup used by ``htmlentityreplace_errors``: code point -> entity name.
_encode_entity_map = {}

# True when a non-BMP character is a single code unit in this interpreter
# (otherwise it is stored as a two-unit surrogate pair).
_is_ucs4 = len("\U0010FFFF") == 1

for k, v in entities.items():
    # skip multi-character entities
    if ((_is_ucs4 and len(v) > 1) or
            (not _is_ucs4 and len(v) > 2)):
        continue
    if v != "&":
        if len(v) == 2:
            v = _utils.surrogatePairToCodepoint(v)
        else:
            v = ord(v)

        if v not in _encode_entity_map or k.islower():
            # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
            _encode_entity_map[v] = k
def htmlentityreplace_errors(exc):
    """Codec error handler that replaces unencodable text with HTML entities.

    For each code point in the failing range ``exc.object[exc.start:exc.end]``
    a named reference from ``_encode_entity_map`` is emitted when one exists,
    otherwise a hexadecimal numeric character reference.

    :arg exc: the exception handed to the error handler by the codec machinery

    :returns: a ``(replacement, resume_position)`` tuple for
        ``UnicodeEncodeError`` / ``UnicodeTranslateError``; any other exception
        is delegated to ``codecs.xmlcharrefreplace_errors``

    """
    if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        return xmlcharrefreplace_errors(exc)

    res = []
    skip = False
    for i, c in enumerate(exc.object[exc.start:exc.end]):
        if skip:
            # Second half of a surrogate pair already consumed below.
            skip = False
            continue
        index = i + exc.start
        # Never look past the failing range when probing for a pair.
        if _utils.isSurrogatePair(exc.object[index:min(exc.end, index + 2)]):
            codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
            skip = True
        else:
            codepoint = ord(c)

        e = _encode_entity_map.get(codepoint)
        if e:
            res.append("&")
            res.append(e)
            # Some entity names are stored without the trailing semicolon.
            if not e.endswith(";"):
                res.append(";")
        else:
            res.append("&#x%x;" % codepoint)
    return ("".join(res), exc.end)


register_error("htmlentityreplace", htmlentityreplace_errors)
def serialize(input, tree="etree", encoding=None, **serializer_opts):
    """Serializes the input token stream using the specified treewalker

    :arg input: the token stream to serialize

    :arg tree: the treewalker to use

    :arg encoding: the encoding to use

    :arg serializer_opts: any options to pass to the
        :py:class:`html5lib.serializer.HTMLSerializer` that gets created

    :returns: the tree serialized as a string

    Example:

    >>> from html5lib.html5parser import parse
    >>> from html5lib.serializer import serialize
    >>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
    >>> serialize(token_stream, omit_optional_tags=False)
    '<html><head></head><body><p>Hi!</p></body></html>'

    """
    # XXX: Should we cache this?
    walker = treewalkers.getTreeWalker(tree)
    s = HTMLSerializer(**serializer_opts)
    return s.render(walker(input), encoding)
class HTMLSerializer(object):
    """Turns a treewalker token stream into HTML text (or encoded bytes).

    Every name listed in ``options`` is a class-level default that can be
    overridden per instance through a keyword argument to ``__init__``.
    """

    # attribute quoting options
    quote_attr_values = "legacy"  # be secure by default
    quote_char = '"'
    use_best_quote_char = True

    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True

    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True

    # miscellaneous options
    alphabetical_attributes = False
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False

    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
               "omit_optional_tags", "minimize_boolean_attributes",
               "use_trailing_solidus", "space_before_trailing_solidus",
               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
               "alphabetical_attributes", "inject_meta_charset",
               "strip_whitespace", "sanitize")

    def __init__(self, **kwargs):
        """Initialize HTMLSerializer

        :arg inject_meta_charset: Whether or not to inject the meta charset.

            Defaults to ``True``.

        :arg quote_attr_values: Whether to quote attribute values that don't
            require quoting per legacy browser behavior (``"legacy"``), when
            required by the standard (``"spec"``), or always (``"always"``).

            Defaults to ``"legacy"``.

        :arg quote_char: Use given quote character for attribute quoting.

            Defaults to ``"`` which will use double quotes unless attribute
            value contains a double quote, in which case single quotes are
            used.

        :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
            values.

            Defaults to ``False``.

        :arg escape_rcdata: Whether to escape characters that need to be
            escaped within normal elements within rcdata elements such as
            style.

            Defaults to ``False``.

        :arg resolve_entities: Whether to resolve named character entities that
            appear in the source tree. The XML predefined entities &lt; &gt;
            &amp; &quot; &apos; are unaffected by this setting.

            Defaults to ``True``.

        :arg strip_whitespace: Whether to remove semantically meaningless
            whitespace. (This compresses all whitespace to a single space
            except within ``pre``.)

            Defaults to ``False``.

        :arg minimize_boolean_attributes: Shortens boolean attributes to give
            just the attribute value, for example::

              <input disabled="disabled">

            becomes::

              <input disabled>

            Defaults to ``True``.

        :arg use_trailing_solidus: Includes a close-tag slash at the end of the
            start tag of void elements (empty elements whose end tag is
            forbidden). E.g. ``<hr/>``.

            Defaults to ``False``.

        :arg space_before_trailing_solidus: Places a space immediately before
            the closing slash in a tag using a trailing solidus. E.g.
            ``<hr />``. Requires ``use_trailing_solidus=True``.

            Defaults to ``True``.

        :arg sanitize: Strip all unsafe or unknown constructs from output.
            See :py:class:`html5lib.filters.sanitizer.Filter`.

            Defaults to ``False``.

        :arg omit_optional_tags: Omit start/end tags that are optional.

            Defaults to ``True``.

        :arg alphabetical_attributes: Reorder attributes to be in alphabetical order.

            Defaults to ``False``.

        :raises TypeError: if a keyword argument is not one of ``options``

        """
        unexpected_args = frozenset(kwargs) - frozenset(self.options)
        if unexpected_args:
            raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
        # An explicit quote_char switches off automatic quote selection unless
        # the caller also passes use_best_quote_char (applied in the loop below).
        if 'quote_char' in kwargs:
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False

    def encode(self, string):
        """Encode ``string`` with ``self.encoding``, replacing unencodable
        characters through the ``htmlentityreplace`` error handler.

        Returns ``string`` unchanged when no encoding is set.
        """
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "htmlentityreplace")
        else:
            return string

    def encodeStrict(self, string):
        """Encode ``string`` with ``self.encoding`` using the ``strict`` error
        handler, so unencodable characters raise instead of being replaced.

        Returns ``string`` unchanged when no encoding is set.
        """
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "strict")
        else:
            return string

    def serialize(self, treewalker, encoding=None):
        """Yield the serialized form of ``treewalker`` piece by piece.

        :arg treewalker: iterable of token dicts to serialize

        :arg encoding: when given, every yielded piece is encoded with it;
            otherwise text is yielded

        :raises ValueError: if ``quote_attr_values`` is not ``"always"``,
            ``"spec"`` or ``"legacy"``

        Problems found while serializing are reported via ``serializeError``.
        """
        # pylint:disable=too-many-nested-blocks
        self.encoding = encoding
        in_cdata = False
        self.errors = []

        if encoding and self.inject_meta_charset:
            from .filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # Alphabetical attributes is here under the assumption that none of
        # the later filters add or change order of attributes; it needs to be
        # before the sanitizer so escaped elements come out correctly
        if self.alphabetical_attributes:
            from .filters.alphabeticalattributes import Filter
            treewalker = Filter(treewalker)
        # WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
            from .filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from .filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from .filters.optionaltags import Filter
            treewalker = Filter(treewalker)

        for token in treewalker:
            token_type = token["type"]
            if token_type == "Doctype":
                doctype = "<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += ' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += " SYSTEM"
                if token["systemId"]:
                    # Use whichever quote the identifier does not contain.
                    if token["systemId"].find('"') >= 0:
                        if token["systemId"].find("'") >= 0:
                            self.serializeError("System identifier contains both single and double quote characters")
                        quote_char = "'"
                    else:
                        quote_char = '"'
                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)

                doctype += ">"
                yield self.encodeStrict(doctype)

            elif token_type in ("Characters", "SpaceCharacters"):
                if token_type == "SpaceCharacters" or in_cdata:
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError("Unexpected </ in CDATA")
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))

            elif token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict("<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                for (_, attr_name), attr_value in token["data"].items():
                    # TODO: Add namespace support here
                    k = attr_name
                    v = attr_value
                    yield self.encodeStrict(' ')

                    yield self.encodeStrict(k)
                    # A minimized boolean attribute is written as its bare name.
                    if not self.minimize_boolean_attributes or \
                        (k not in booleanAttributes.get(name, tuple()) and
                         k not in booleanAttributes.get("", tuple())):
                        yield self.encodeStrict("=")
                        if self.quote_attr_values == "always" or len(v) == 0:
                            quote_attr = True
                        elif self.quote_attr_values == "spec":
                            quote_attr = _quoteAttributeSpec.search(v) is not None
                        elif self.quote_attr_values == "legacy":
                            quote_attr = _quoteAttributeLegacy.search(v) is not None
                        else:
                            raise ValueError("quote_attr_values must be one of: "
                                             "'always', 'spec', or 'legacy'")
                        # Ampersands are escaped first so the entities added
                        # below are not escaped a second time.
                        v = v.replace("&", "&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace("<", "&lt;")
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                if "'" in v and '"' not in v:
                                    quote_char = '"'
                                elif '"' in v and "'" not in v:
                                    quote_char = "'"
                            # Escape only the quote character actually used.
                            if quote_char == "'":
                                v = v.replace("'", "&#39;")
                            else:
                                v = v.replace('"', "&quot;")
                            yield self.encodeStrict(quote_char)
                            yield self.encode(v)
                            yield self.encodeStrict(quote_char)
                        else:
                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(" /")
                    else:
                        yield self.encodeStrict("/")
                yield self.encode(">")

            elif token_type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                yield self.encodeStrict("</%s>" % name)

            elif token_type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError("Comment contains --")
                yield self.encodeStrict("<!--%s-->" % token["data"])

            elif token_type == "Entity":
                name = token["name"]
                key = name + ";"
                known = key in entities
                if not known:
                    self.serializeError("Entity %s not recognized" % name)
                # An unrecognized entity has nothing to resolve to; once the
                # error is recorded (non-strict mode) it is emitted as a
                # reference rather than failing on the table lookup.
                if self.resolve_entities and known and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = "&%s;" % name
                yield self.encodeStrict(data)

            else:
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        """Serializes the stream from the treewalker into a string

        :arg treewalker: the treewalker to serialize

        :arg encoding: the string encoding to use

        :returns: the serialized tree (bytes when ``encoding`` is given,
            text otherwise)

        Example:

        >>> from html5lib import parse, getTreeWalker
        >>> from html5lib.serializer import HTMLSerializer
        >>> token_stream = parse('<html><body>Hi!</body></html>')
        >>> walker = getTreeWalker('etree')
        >>> serializer = HTMLSerializer(omit_optional_tags=False)
        >>> serializer.render(walker(token_stream))
        '<html><head></head><body>Hi!</body></html>'

        """
        if encoding:
            return b"".join(self.serialize(treewalker, encoding))
        else:
            return "".join(self.serialize(treewalker))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record ``data`` in ``self.errors``; raise ``SerializeError`` when
        ``self.strict`` is set.
        """
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError
class SerializeError(Exception):
    """Error in serialized tree"""