"""Shim module exporting the same ElementTree API for lxml and |
xml.etree backends.

When lxml is installed, it is automatically preferred over the built-in
xml.etree module.
On Python 2.7, the cElementTree module is preferred over the pure-python
ElementTree module.

Besides exporting a unified interface, this also defines extra functions
or subclasses built-in ElementTree classes to add features that are
only available in lxml, like OrderedDict for attributes, pretty_print and
iterwalk.
""" |
from fontTools.misc.textTools import tostr |
# Template for the XML declaration line; the single %s placeholder receives
# the encoding name.
XML_DECLARATION = """<?xml version='1.0' encoding='%s'?>"""

# Names that make up the public API of this module.
__all__ = [
    "Comment", "dump", "Element", "ElementTree",
    "fromstring", "fromstringlist",
    "iselement", "iterparse",
    "parse", "ParseError",
    "PI", "ProcessingInstruction",
    "QName", "SubElement",
    "tostring", "tostringlist",
    "TreeBuilder", "XML", "XMLParser",
    "register_namespace",
]
# Backend selection: lxml is tried first; when it cannot be imported the
# standard library is used instead (cElementTree if it is importable,
# otherwise the plain ElementTree module).
# _have_lxml records which of the two families ended up being imported.
try:
    from lxml.etree import *
except ImportError:
    try:
        from xml.etree.cElementTree import *
        # XML is re-imported from ElementTree, replacing the cElementTree one.
        from xml.etree.ElementTree import XML
    except ImportError:
        from xml.etree.ElementTree import *
    _have_lxml = False
else:
    _have_lxml = True
import sys |
# Decide whether the built-in dict can be used to hold attributes in
# insertion order: true on Python >= 3.6 or whenever the __pypy__ module
# can be imported.
try:
    import __pypy__ as _pypy_module
except ImportError:
    _pypy_module = None
_dict_is_ordered = sys.version_info >= (3, 6) or bool(_pypy_module)
# The probe is only needed for the decision above; do not leave it behind.
del _pypy_module

# _Attrib is the mapping type used to store element attributes.
if _dict_is_ordered:
    _Attrib = dict
else:
    from collections import OrderedDict as _Attrib
# Pick a base class that can actually be subclassed: use the imported
# Element when it is a class, otherwise fall back to the pure-python one.
if isinstance(Element, type):
    _Element = Element
else:
    from xml.etree.ElementTree import Element as _Element


class Element(_Element):
    """Element subclass whose attributes live in an ``_Attrib`` mapping.

    The attributes given positionally (``attrib``) are stored first,
    followed by the ones given as keyword arguments (``extra``).
    """

    def __init__(self, tag, attrib=_Attrib(), **extra):
        super().__init__(tag)
        ordered = _Attrib()
        # Falsy sources (None, empty mappings) contribute nothing.
        for source in (attrib, extra):
            if source:
                ordered.update(source)
        self.attrib = ordered
def SubElement(parent, tag, attrib=_Attrib(), **extra):
    """Create a child of ``parent`` and return it.

    The child is built by calling ``parent.__class__`` so that it has the
    same type as its parent, then appended to ``parent``.  This override is
    needed because ``_elementtree.SubElement`` fails when ``parent`` is an
    instance of an Element subclass.
    """
    child = parent.__class__(tag, attrib, **extra)
    parent.append(child)
    return child
def _iterwalk(element, events, tag):
    """Recursively yield ``(event, element)`` pairs for ``element``'s subtree.

    ``events`` is a container of event names; only "start" and "end" are
    produced, and each one only when it is listed in ``events``.  ``tag``
    restricts the reported elements to those whose tag equals it; ``None``
    reports every element.  Descendants are always visited, even below an
    element that is itself filtered out.
    """
    include = tag is None or element.tag == tag
    if include and "start" in events:
        yield ("start", element)
    for child in element:
        yield from _iterwalk(child, events, tag)
    # The "end" event used to be emitted unconditionally, so asking for
    # events=("start",) still produced end events.
    if include and "end" in events:
        yield ("end", element)
def iterwalk(element_or_tree, events=("end",), tag=None):
    """Walk an existing tree and generate events as if it was being parsed
    with iterparse().

    Accepts either an element or an object with a ``getroot()`` method.
    A ``tag`` of "*" is treated like ``None`` (no tag filtering).

    Drop-in replacement for lxml.etree.iterwalk.
    """
    if iselement(element_or_tree):
        root = element_or_tree
    else:
        root = element_or_tree.getroot()
    wanted_tag = None if tag == "*" else tag
    yield from _iterwalk(root, events, wanted_tag)
# Keep a handle on the imported class before its name is rebound below.
_ElementTree = ElementTree


class ElementTree(_ElementTree):
    """ElementTree subclass that adds 'pretty_print' and 'doctype'
    arguments to the 'write' method.

    Currently these are only supported for the default XML serialization
    'method', and not also for "html" or "text", for these are delegated
    to the base class.
    """

    def write(
        self,
        file_or_filename,
        encoding=None,
        xml_declaration=False,
        method=None,
        doctype=None,
        pretty_print=False,
    ):
        """Serialize the tree to a file name or a file-like object.

        ``encoding`` defaults to "ASCII"; the special value "unicode" (any
        letter case) writes text instead of encoded bytes.
        ``xml_declaration`` may be True, False, or None; None writes a
        declaration only for encodings other than ASCII/US-ASCII/UTF-8.
        ``doctype``, when given, is written after the declaration.
        ``pretty_print`` re-indents the tree *in place* before writing.

        Raises ValueError when an XML declaration is requested together
        with the "unicode" encoding.
        """
        if method and method != "xml":
            # 'doctype' and 'pretty_print' are not forwarded: the base class
            # handles every non-XML method on its own.
            super().write(
                file_or_filename,
                encoding=encoding,
                xml_declaration=xml_declaration,
                method=method,
            )
            return

        if encoding is not None and encoding.lower() == "unicode":
            if xml_declaration:
                raise ValueError(
                    "Serialisation to unicode must not request an XML declaration"
                )
            write_declaration = False
            encoding = "unicode"
        elif xml_declaration is None:
            # By default only non-standard encodings get a declaration.
            # "US-ASCII" is the same encoding as "ASCII" and must not
            # trigger one either.
            write_declaration = encoding is not None and encoding.upper() not in (
                "ASCII",
                "US-ASCII",
                "UTF-8",
                "UTF8",
            )
        else:
            write_declaration = xml_declaration

        if encoding is None:
            encoding = "ASCII"

        if pretty_print:
            # NOTE: this rewrites text/tail whitespace of the tree itself.
            _indent(self._root)

        with _get_writer(file_or_filename, encoding) as write:
            if write_declaration:
                write(XML_DECLARATION % encoding.upper())
                if pretty_print:
                    write("\n")
            if doctype:
                write(_tounicode(doctype))
                if pretty_print:
                    write("\n")

            qnames, namespaces = _namespaces(self._root)
            _serialize_xml(write, self._root, qnames, namespaces)
import io |
def tostring(
    element,
    encoding=None,
    xml_declaration=None,
    method=None,
    doctype=None,
    pretty_print=False,
):
    """Custom 'tostring' function that uses our ElementTree subclass, with
    pretty_print support.

    All arguments are forwarded to ``ElementTree.write``.  Returns a text
    string when ``encoding`` is "unicode" (in any letter case) and a bytes
    string otherwise.
    """
    # ElementTree.write() matches "unicode" case-insensitively; the in-memory
    # stream has to be picked with the same rule, otherwise e.g. "Unicode"
    # would make write() send text into a BytesIO.
    as_text = encoding is not None and encoding.lower() == "unicode"
    stream = io.StringIO() if as_text else io.BytesIO()
    ElementTree(element).write(
        stream,
        encoding=encoding,
        xml_declaration=xml_declaration,
        method=method,
        doctype=doctype,
        pretty_print=pretty_print,
    )
    return stream.getvalue()
import re |
# True when the interpreter cannot represent the full Unicode range in a
# single code unit.
UCS2 = sys.maxunicode < 0x10FFFF

# Matches any character that is rejected in serialized strings: control
# characters other than tab, line feed and carriage return, plus U+FFFE and
# U+FFFF.  The surrogate block U+D800-U+DFFF is rejected as well, except
# when UCS2 is true.
# NOTE(review): presumably surrogates are let through on UCS-2 builds
# because non-BMP characters are stored as surrogate pairs there - confirm.
_invalid_xml_string = re.compile(
    "[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uFFFE-\uFFFF]"
    if UCS2
    else "[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uD800-\uDFFF\uFFFE-\uFFFF]"
)
def _tounicode(s):
    """Validate ``s`` and return it as a unicode string.

    Bytes input is decoded with the ASCII codec.

    Raises ValueError for bytes containing non-ASCII characters and for any
    input containing characters matched by ``_invalid_xml_string``.
    Raises TypeError (through ``_raise_serialization_error``) when the
    conversion fails with an AttributeError.
    """
    try:
        text = tostr(s, encoding="ascii", errors="strict")
    except UnicodeDecodeError:
        raise ValueError(
            "Bytes strings can only contain ASCII characters. "
            "Use unicode strings for non-ASCII characters."
        )
    except AttributeError:
        _raise_serialization_error(s)

    # Empty strings are trivially valid.
    if not text:
        return text
    if _invalid_xml_string.search(text):
        raise ValueError(
            "All strings must be XML compatible: Unicode or ASCII, "
            "no NULL bytes or control characters"
        )
    return text
import contextlib |
@contextlib.contextmanager
def _get_writer(file_or_filename, encoding):
    """Context manager yielding a callable that writes text to the target.

    ``file_or_filename`` is either an object with a ``write`` attribute or
    something accepted by ``open()``.  With encoding "unicode" a file-like
    target is written to directly; with any other encoding it is treated as
    a binary writer and wrapped so that text is encoded on the way out.
    Characters the encoding cannot represent become XML character
    references.  A file opened here is closed on exit; a file-like object
    passed in is left open.
    """
    try:
        write = file_or_filename.write
    except AttributeError:
        # No write attribute: treat the argument as a file name.
        file_encoding = "utf-8" if encoding == "unicode" else encoding
        with open(
            file_or_filename,
            "w",
            encoding=file_encoding,
            errors="xmlcharrefreplace",
        ) as handle:
            yield handle.write
        return

    if encoding == "unicode":
        # Text writer: use it as is.
        yield write
        return

    # Binary writer: find or build a buffered object for TextIOWrapper.
    owns_buffer = False
    if isinstance(file_or_filename, io.BufferedIOBase):
        buffered = file_or_filename
    elif isinstance(file_or_filename, io.RawIOBase):
        buffered = io.BufferedWriter(file_or_filename)
        owns_buffer = True
    else:
        # Object outside the io hierarchy that only offers write().
        buffered = io.BufferedIOBase()
        buffered.writable = lambda: True
        buffered.write = write
        try:
            buffered.seekable = file_or_filename.seekable
            buffered.tell = file_or_filename.tell
        except AttributeError:
            pass

    text_layer = io.TextIOWrapper(
        buffered,
        encoding=encoding,
        errors="xmlcharrefreplace",
        newline="\n",
    )
    try:
        yield text_layer.write
    finally:
        # Detach instead of closing so the caller's file stays open.
        text_layer.detach()
        if owns_buffer:
            buffered.detach()
from xml.etree.ElementTree import _namespace_map |
def _namespaces(elem):
    """Collect the qualified names and namespaces used in a tree.

    Returns a ``(qnames, namespaces)`` pair: ``qnames`` maps each tag,
    attribute name or QName text found in the tree to its serialized
    ``prefix:local`` form (``None`` maps to ``None``), and ``namespaces``
    maps namespace URIs to the prefixes chosen for them.

    Raises TypeError (through ``_raise_serialization_error``) for tags that
    are neither strings, QNames, None, Comment nor PI.
    """
    qnames = {None: None}
    namespaces = {}

    def add_qname(qname):
        # Work out the serialized form of one name and record it.
        try:
            qname = _tounicode(qname)
            if qname[:1] != "{":
                # Not namespace-qualified: serialized unchanged.
                qnames[qname] = qname
                return
            uri, local = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    # Unknown URI: invent a numbered prefix.
                    prefix = "ns%d" % len(namespaces)
                else:
                    prefix = _tounicode(prefix)
                # The "xml" prefix is never recorded as a namespace.
                if prefix != "xml":
                    namespaces[uri] = prefix
            qnames[qname] = "%s:%s" % (prefix, local) if prefix else local
        except TypeError:
            _raise_serialization_error(qname)

    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, str):
            if tag not in qnames:
                add_qname(tag)
        elif not (tag is None or tag is Comment or tag is PI):
            _raise_serialization_error(tag)

        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)

        text = node.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)

    return qnames, namespaces
def _serialize_xml(write, elem, qnames, namespaces, **kwargs):
    """Write ``elem`` and its subtree as XML through the ``write`` callable.

    ``qnames`` maps tags and attribute names to their serialized form.
    ``namespaces`` maps URIs to prefixes; when non-empty, the matching
    xmlns declarations are written on ``elem``.  Children are serialized
    with ``namespaces=None``, so declarations only appear on the element
    this function is first called with.  Extra keyword arguments are
    accepted and ignored.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _tounicode(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _tounicode(text))
    else:
        name = qnames[None if tag is None else _tounicode(tag)]
        if name is None:
            # Element without a tag: only its content is written.
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_xml(write, child, qnames, None)
        else:
            write("<" + name)
            if namespaces:
                # Declarations are emitted sorted by prefix.
                declarations = sorted(namespaces.items(), key=lambda x: x[1])
                for uri, prefix in declarations:
                    suffix = ":" + prefix if prefix else prefix
                    write(' xmlns%s="%s"' % (suffix, _escape_attrib(uri)))
            attrs = elem.attrib
            if attrs:
                # Keep the stored order for _Attrib mappings (and trivially
                # for a single attribute); otherwise sort the items.
                keep_order = len(attrs) <= 1 or type(attrs) is _Attrib
                items = attrs.items() if keep_order else sorted(attrs.items())
                for k, v in items:
                    k = _tounicode(k.text if isinstance(k, QName) else k)
                    if isinstance(v, QName):
                        v = qnames[_tounicode(v.text)]
                    else:
                        v = _escape_attrib(v)
                    write(' %s="%s"' % (qnames[k], v))
            if text is None and not len(elem):
                # No text and no children: self-closing form.
                write("/>")
            else:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for child in elem:
                    _serialize_xml(write, child, qnames, None)
                write("</" + name + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))
def _raise_serialization_error(text):
    """Raise a TypeError reporting that ``text`` cannot be serialized.

    The message contains the repr of ``text`` and the name of its type.
    This function never returns.
    """
    type_name = type(text).__name__
    raise TypeError("cannot serialize %r (type %s)" % (text, type_name))
def _escape_cdata(text):
    """Return ``text`` escaped for use as XML character data.

    The input is validated and converted with ``_tounicode`` first; then
    "&", "<" and ">" are replaced by their predefined entity references.

    Raises TypeError (through ``_raise_serialization_error``) when the
    value cannot be processed as a string.
    """
    try:
        text = _tounicode(text)
        # "&" has to be escaped first, otherwise the ampersands introduced
        # by the other replacements would be escaped a second time.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
def _escape_attrib(text):
    """Return ``text`` escaped for use inside a double-quoted XML attribute.

    The input is validated and converted with ``_tounicode`` first; then
    "&", "<", ">" and the double quote are replaced by their predefined
    entity references, and line feeds by a numeric character reference.

    Raises TypeError (through ``_raise_serialization_error``) when the
    value cannot be processed as a string.
    """
    try:
        text = _tounicode(text)
        # "&" has to be escaped first, otherwise the ampersands introduced
        # by the other replacements would be escaped a second time.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if '"' in text:
            text = text.replace('"', "&quot;")
        # A literal line feed in an attribute value is normalized to a space
        # by XML parsers; a character reference survives the round trip.
        if "\n" in text:
            text = text.replace("\n", "&#10;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
def _indent(elem, level=0, space="  "):
    """Re-indent the subtree rooted at ``elem`` in place.

    Whitespace-only (or missing) ``text`` and ``tail`` values are replaced
    by a newline followed by the indentation for the relevant depth;
    values containing anything other than whitespace are left untouched.

    ``level`` is the nesting depth of ``elem``; ``space`` is the string
    used for one indentation step (two spaces by default, the same default
    as the standard library's ``xml.etree.ElementTree.indent``).
    """

    def is_blank(value):
        # None, "" and whitespace-only strings may all be overwritten.
        return not value or not value.strip()

    own_indent = "\n" + level * space
    if len(elem):
        if is_blank(elem.text):
            # The first child sits one level deeper than its parent.
            elem.text = own_indent + space
        if is_blank(elem.tail):
            elem.tail = own_indent
        child = None
        for child in elem:
            _indent(child, level + 1, space)
        # The last child is followed by the parent's closing tag, so its
        # tail goes back to the parent's own indentation.
        if is_blank(child.tail):
            child.tail = own_indent
    elif level and is_blank(elem.tail):
        elem.tail = own_indent