Spaces:
Running
Running
File size: 6,430 Bytes
47b2311 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
from __future__ import annotations
import codecs
import re
import typing as t
import urllib.parse
from urllib.parse import quote
from urllib.parse import unquote
from urllib.parse import urlencode
from urllib.parse import urlsplit
from urllib.parse import urlunsplit
from .datastructures import iter_multi_items
def _codec_error_url_quote(e: UnicodeError) -> tuple[str, int]:
    """Codec error handler that percent-quotes the bytes that failed to decode.

    Used in :func:`uri_to_iri` after unquoting, so that invalid bytes end
    up quoted again.

    :param e: The decoding error raised by the codec.
    :return: The replacement text and the position at which decoding resumes.
    """
    # UnicodeError is documented to carry ``object``, ``start`` and ``end``,
    # but mypy does not pick those attributes up, hence the ignores.
    resume_at = e.end  # type: ignore
    invalid = e.object[e.start : resume_at]  # type: ignore
    return quote(invalid, safe=""), resume_at


# Make the handler available to ``bytes.decode`` / ``unquote`` by name.
codecs.register_error("werkzeug.url_quote", _codec_error_url_quote)
def _make_unquote_part(name: str, chars: str) -> t.Callable[[str], str]:
    """Build an unquote function for one part of a URL.

    The returned function decodes every percent-encoded sequence except the
    encodings of the characters in ``chars``, which stay quoted so that the
    meaning of that URL part does not change.

    :param name: Suffix used to name the returned function (``_unquote_<name>``).
    :param chars: Characters whose percent-encoded form must be left as is.
    """
    hex_codes = [f"{ord(char):02X}" for char in sorted(chars)]
    choices = "|".join(hex_codes)
    # One capture group around a run of protected escapes, case-insensitive
    # so that both ``%2f`` and ``%2F`` are recognised.
    protected = re.compile(f"((?:%(?:{choices}))+)", re.I)

    def _unquote_partial(value: str) -> str:
        # Splitting on a pattern with a single capture group yields an
        # alternating list: even positions hold free text that can be
        # unquoted, odd positions hold the protected runs to keep verbatim.
        pieces = protected.split(value)
        return "".join(
            piece if index % 2 else unquote(piece, "utf-8", "werkzeug.url_quote")
            for index, piece in enumerate(pieces)
        )

    _unquote_partial.__name__ = f"_unquote_{name}"
    return _unquote_partial
# Characters that must stay percent-encoded, per URL part.
# Based on https://url.spec.whatwg.org/#percent-encoded-bytes
# Every control character, the space, and "%" itself stay quoted in all parts.
_always_unsafe = "".join(map(chr, [*range(0x21), 0x25, 0x7F]))

_unquote_fragment = _make_unquote_part("fragment", _always_unsafe)
_unquote_query = _make_unquote_part("query", f"{_always_unsafe}&=+#")
_unquote_path = _make_unquote_part("path", f"{_always_unsafe}/?#")
_unquote_user = _make_unquote_part("user", f"{_always_unsafe}:@/?#")
def uri_to_iri(uri: str) -> str:
    """Convert a URI to an IRI. All valid UTF-8 characters are unquoted,
    leaving all reserved and invalid characters quoted. If the URL has
    a domain, it is decoded from Punycode.

    An explicit port of ``0`` and a password given without a username are
    preserved in the result.

    >>> uri_to_iri("http://xn--n3h.net/p%C3%A5th?q=%C3%A8ry%DF")
    'http://\\u2603.net/p\\xe5th?q=\\xe8ry%DF'

    :param uri: The URI to convert.
    :return: The IRI as a string.
    :raises ValueError: If :func:`urllib.parse.urlsplit` rejects the URL or
        its port is not a valid port number.

    .. versionchanged:: 3.0
        Passing a tuple or bytes, and the ``charset`` and ``errors`` parameters,
        are removed.

    .. versionchanged:: 2.3
        Which characters remain quoted is specific to each part of the URL.

    .. versionchanged:: 0.15
        All reserved and invalid characters remain quoted. Previously,
        only some reserved characters were preserved, and invalid bytes
        were replaced instead of left quoted.

    .. versionadded:: 0.6
    """
    parts = urlsplit(uri)
    path = _unquote_path(parts.path)
    query = _unquote_query(parts.query)
    fragment = _unquote_fragment(parts.fragment)

    if parts.hostname:
        netloc = _decode_idna(parts.hostname)
    else:
        netloc = ""

    # A colon in the host means it is an IPv6 literal, which needs brackets.
    if ":" in netloc:
        netloc = f"[{netloc}]"

    # Compare against None rather than relying on truthiness: an explicit
    # port 0 is falsy but is still part of the URL and must not be dropped.
    port = parts.port

    if port is not None:
        netloc = f"{netloc}:{port}"

    # Credentials are present if either component is non-empty. Testing only
    # the username would silently discard a password in "//:password@host".
    username = parts.username or ""
    password = parts.password or ""

    if username or password:
        auth = _unquote_user(username)

        if password:
            auth = f"{auth}:{_unquote_user(password)}"

        netloc = f"{auth}@{netloc}"

    return urlunsplit((parts.scheme, netloc, path, query, fragment))
def iri_to_uri(iri: str) -> str:
    """Convert an IRI to a URI. All non-ASCII and unsafe characters are
    quoted. If the URL has a domain, it is encoded to Punycode.

    An explicit port of ``0`` and a password given without a username are
    preserved in the result.

    >>> iri_to_uri('http://\\u2603.net/p\\xe5th?q=\\xe8ry%DF')
    'http://xn--n3h.net/p%C3%A5th?q=%C3%A8ry%DF'

    :param iri: The IRI to convert.
    :return: The URI as a string.
    :raises ValueError: If :func:`urllib.parse.urlsplit` rejects the URL or
        its port is not a valid port number.
    :raises UnicodeError: If the host name cannot be encoded with IDNA.

    .. versionchanged:: 3.0
        Passing a tuple or bytes, the ``charset`` and ``errors`` parameters,
        and the ``safe_conversion`` parameter, are removed.

    .. versionchanged:: 2.3
        Which characters remain unquoted is specific to each part of the URL.

    .. versionchanged:: 0.15
        All reserved characters remain unquoted. Previously, only some reserved
        characters were left unquoted.

    .. versionchanged:: 0.9.6
        The ``safe_conversion`` parameter was added.

    .. versionadded:: 0.6
    """
    parts = urlsplit(iri)
    # safe = https://url.spec.whatwg.org/#url-path-segment-string
    # as well as percent for things that are already quoted
    path = quote(parts.path, safe="%!$&'()*+,/:;=@")
    query = quote(parts.query, safe="%!$&'()*+,/:;=?@")
    fragment = quote(parts.fragment, safe="%!#$&'()*+,/:;=?@")

    if parts.hostname:
        netloc = parts.hostname.encode("idna").decode("ascii")
    else:
        netloc = ""

    # A colon in the host means it is an IPv6 literal, which needs brackets.
    if ":" in netloc:
        netloc = f"[{netloc}]"

    # Compare against None rather than relying on truthiness: an explicit
    # port 0 is falsy but is still part of the URL and must not be dropped.
    port = parts.port

    if port is not None:
        netloc = f"{netloc}:{port}"

    # Credentials are present if either component is non-empty. Testing only
    # the username would silently discard a password in "//:password@host".
    username = parts.username or ""
    password = parts.password or ""

    if username or password:
        auth = quote(username, safe="%!$&'()*+,;=")

        if password:
            quoted_password = quote(password, safe="%!$&'()*+,;=")
            auth = f"{auth}:{quoted_password}"

        netloc = f"{auth}@{netloc}"

    return urlunsplit((parts.scheme, netloc, path, query, fragment))
# Only needed on Python < 3.12.
# Earlier iri_to_uri implementations carried a workaround for itms-services;
# registering the scheme as one that uses a netloc lets Python preserve the
# "//" by itself. Skip the append if the scheme is already listed.
if not urllib.parse.uses_netloc.count("itms-services"):
    urllib.parse.uses_netloc.append("itms-services")
def _decode_idna(domain: str) -> str:
    """Decode an IDNA (Punycode) domain name to its Unicode form.

    Labels that cannot be decoded are left in their original ASCII form
    instead of making the whole conversion fail.

    :param domain: The host name to decode.
    :return: The decoded host name. A name that already contains non-ASCII
        characters is returned unchanged.
    """
    try:
        data = domain.encode("ascii")
    except UnicodeEncodeError:
        # If the domain is not ASCII, it's decoded already.
        return domain

    # The idna and punycode codecs can signal a malformed label with a plain
    # UnicodeError rather than UnicodeDecodeError, so catch the base class;
    # otherwise the per-label fallback below would never be reached.
    try:
        # Try decoding in one shot.
        return data.decode("idna")
    except UnicodeError:
        pass

    # Decode each part separately, leaving invalid parts as punycode.
    parts = []

    for part in data.split(b"."):
        try:
            parts.append(part.decode("idna"))
        except UnicodeError:
            parts.append(part.decode("ascii"))

    return ".".join(parts)
def _urlencode(query: t.Mapping[str, str] | t.Iterable[tuple[str, str]]) -> str:
    """Encode ``query`` as a URL query string, skipping ``None`` values.

    :param query: A mapping or an iterable of ``(key, value)`` pairs; it is
        flattened with ``iter_multi_items`` before encoding.
    :return: The encoded query string.
    """
    pairs = []

    for item in iter_multi_items(query):
        # Pairs whose value is None are left out of the result entirely.
        if item[1] is None:
            continue

        pairs.append(item)

    # safe = https://url.spec.whatwg.org/#percent-encoded-bytes
    return urlencode(pairs, safe="!$'()*,/:;?@")
|