"""
Simple formatting on strings. Further string formatting code is in trans.py.
"""
import re | |
import sys | |
from functools import lru_cache | |
from typing import Final, List, Match, Pattern | |
from black._width_table import WIDTH_TABLE | |
from blib2to3.pytree import Leaf | |
# Every character that may legally appear in a string literal's prefix, in
# both cases (f-string, unicode, raw, bytes).
STRING_PREFIX_CHARS: Final = "furbFURB"  # All possible string prefix characters.

# Splits a string literal into (prefix, rest); DOTALL lets the body span lines.
STRING_PREFIX_RE: Final = re.compile(
    r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
)

# Matches leading whitespace that contains at least one tab, capturing the
# first non-whitespace character that follows it.
FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")

# Matches Unicode escape sequences together with the run of backslashes in
# front of them, so callers can distinguish a real escape (odd backslash
# count) from an escaped backslash followed by a literal (even count).
UNICODE_ESCAPE_RE: Final = re.compile(
    r"(?P<backslashes>\\+)(?P<body>"
    r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
    r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
    r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character named name in the Unicode database
    r")",
    re.VERBOSE,
)
def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
    """Apply `regex.sub(replacement, ...)` to `original` two times in a row.

    String normalization uses this to catch matches that overlap: a second
    pass picks up occurrences the first pass could not see.
    """
    once = regex.sub(replacement, original)
    return regex.sub(replacement, once)
def has_triple_quotes(string: str) -> bool:
    """
    Returns:
        True iff @string opens with a triple quote once its prefix characters
        (if any) are stripped.
    """
    body = string.lstrip(STRING_PREFIX_CHARS)
    return body.startswith(('"""', "'''"))
def lines_with_leading_tabs_expanded(s: str) -> List[str]:
    """
    Split `s` into lines, expanding tabs only within each line's leading
    whitespace (following the normal Python rules).
    """
    expanded: List[str] = []
    for raw_line in s.splitlines():
        # Locate the first non-whitespace character that follows leading
        # whitespace containing at least one tab.
        m = FIRST_NON_WHITESPACE_RE.match(raw_line)
        if m is None:
            # Leading whitespace has no tab (or the line is all whitespace):
            # keep the line untouched.
            expanded.append(raw_line)
        else:
            cut = m.start(1)
            expanded.append(raw_line[:cut].expandtabs() + raw_line[cut:])
    return expanded
def fix_docstring(docstring: str, prefix: str) -> str:
    """Re-indent a docstring body to `prefix`, trimming common indentation.

    Follows the PEP 257 indentation-handling algorithm:
    https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
    """
    if not docstring:
        return ""
    lines = lines_with_leading_tabs_expanded(docstring)
    # Smallest indentation among the continuation lines (the first line is
    # excluded on purpose, per PEP 257).
    common_indent = sys.maxsize
    for line in lines[1:]:
        content = line.lstrip()
        if content:
            common_indent = min(common_indent, len(line) - len(content))
    # Rebuild: the first line is stripped outright; the rest are dedented by
    # the common indent and re-indented with `prefix`.
    result = [lines[0].strip()]
    if common_indent < sys.maxsize:
        final_idx = len(lines) - 2
        for offset, line in enumerate(lines[1:]):
            dedented = line[common_indent:].rstrip()
            if dedented or offset == final_idx:
                # Keep the last line even when blank, so the closing quotes
                # retain their indentation.
                result.append(prefix + dedented)
            else:
                result.append("")
    return "\n".join(result)
def get_string_prefix(string: str) -> str:
    """
    Pre-conditions:
        * assert_is_leaf_string(@string)

    Returns:
        @string's prefix (e.g. '', 'r', 'f', or 'rf').
    """
    assert_is_leaf_string(string)

    # The prefix is the maximal run of prefix characters at the start.
    end = 0
    while string[end] in STRING_PREFIX_CHARS:
        end += 1
    return string[:end]
def assert_is_leaf_string(string: str) -> None:
    """
    Checks the pre-condition that @string has the format that you would expect
    of `leaf.value` where `leaf` is some Leaf such that `leaf.type ==
    token.STRING`. A more precise description of the pre-conditions that are
    checked are listed below.

    Pre-conditions:
        * @string starts with either ', ", <prefix>', or <prefix>" where
          `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
        * @string ends with a quote character (' or ").

    Raises:
        AssertionError(...) if the pre-conditions listed above are not
        satisfied.
    """
    double_pos = string.find('"')
    single_pos = string.find("'")
    if double_pos == -1 or single_pos == -1:
        # At most one quote style occurs; max() picks whichever was found
        # (or -1 when neither was).
        quote_idx = max(double_pos, single_pos)
    else:
        # Both occur; the earlier one is the opening quote.
        quote_idx = min(single_pos, double_pos)

    assert (
        0 <= quote_idx < len(string) - 1
    ), f"{string!r} is missing a starting quote character (' or \")."
    assert string[-1] in (
        "'",
        '"',
    ), f"{string!r} is missing an ending quote character (' or \")."
    assert set(string[:quote_idx]).issubset(
        set(STRING_PREFIX_CHARS)
    ), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
def normalize_string_prefix(s: str) -> str:
    """Make all string prefixes lowercase."""
    match = STRING_PREFIX_RE.match(s)
    assert match is not None, f"failed to match string {s!r}"
    old_prefix = match.group(1)

    # Lowercase "F" and "B", drop "u"/"U" entirely; the casing of "r"/"R" is
    # deliberately left as-is.
    new_prefix = "".join(
        "f" if char == "F" else "b" if char == "B" else char
        for char in old_prefix
        if char not in "uU"
    )

    # Python syntax guarantees max 2 prefixes and that one of them is "r"
    if len(new_prefix) == 2 and new_prefix[0].lower() != "r":
        new_prefix = new_prefix[::-1]
    return new_prefix + match.group(2)
# Re(gex) does actually cache patterns internally but this still improves | |
# performance on a long list literal of strings by 5-9% since lru_cache's | |
# caching overhead is much lower. | |
def _cached_compile(pattern: str) -> Pattern[str]: | |
return re.compile(pattern) | |
def normalize_string_quotes(s: str) -> str:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate. Doesn't parse and fix
    strings nested in f-strings.
    """
    # Inspect the quote characters with the prefix stripped off.
    value = s.lstrip(STRING_PREFIX_CHARS)
    if value[:3] == '"""':
        # Already a double-quoted multiline string: nothing to do.
        return s
    elif value[:3] == "'''":
        orig_quote = "'''"
        new_quote = '"""'
    elif value[0] == '"':
        orig_quote = '"'
        new_quote = "'"
    else:
        orig_quote = "'"
        new_quote = '"'
    first_quote_pos = s.find(orig_quote)
    if first_quote_pos == -1:
        return s  # There's an internal error

    prefix = s[:first_quote_pos]
    # new_quote preceded by an even number of backslashes, i.e. not escaped.
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    # new_quote preceded by an odd number of backslashes, i.e. escaped.
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    # orig_quote preceded by an odd number of backslashes, i.e. escaped.
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
    body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)]
    if "r" in prefix.casefold():
        if unescaped_new_quote.search(body):
            # There's at least one unescaped new_quote in this raw string
            # so converting is impossible
            return s

        # Do not introduce or remove backslashes in raw strings
        new_body = body
    else:
        # remove unnecessary escapes
        new_body = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", body)
        if body != new_body:
            # Consider the string without unnecessary escapes as the original
            body = new_body
            s = f"{prefix}{orig_quote}{body}{orig_quote}"
        # Unescape the original quote (no longer the delimiter) and escape
        # occurrences of the new quote (about to become the delimiter).
        new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
        new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)
    if "f" in prefix.casefold():
        # Collect the {...} interpolations of the f-string; backslashes must
        # not be introduced inside them.
        matches = re.findall(
            r"""
            (?:(?<!\{)|^)\{  # start of the string or a non-{ followed by a single {
                ([^{].*?)  # contents of the brackets except if begins with {{
            \}(?:(?!\})|$)  # A } followed by end of the string or a non-}
            """,
            new_body,
            re.VERBOSE,
        )
        for m in matches:
            if "\\" in str(m):
                # Do not introduce backslashes in interpolated expressions
                return s

    if new_quote == '"""' and new_body[-1:] == '"':
        # edge case: a trailing quote would run into the closing triple
        # quote, so it has to be escaped.
        new_body = new_body[:-1] + '\\"'
    orig_escape_count = body.count("\\")
    new_escape_count = new_body.count("\\")
    if new_escape_count > orig_escape_count:
        return s  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and orig_quote == '"':
        return s  # Prefer double quotes

    return f"{prefix}{new_quote}{new_body}{new_quote}"
def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
    """Replace hex codes in Unicode escape sequences with lowercase representation."""
    text = leaf.value
    prefix = get_string_prefix(text)
    if "r" in prefix.lower():
        # Raw strings have no escape sequences to normalize.
        return

    def replace(m: Match[str]) -> str:
        groups = m.groupdict()
        back_slashes = groups["backslashes"]

        if len(back_slashes) % 2 == 0:
            # An even backslash count means the "escape" is itself escaped:
            # leave the body untouched.
            return back_slashes + groups["body"]

        if groups["u"]:
            # \u
            return back_slashes + "u" + groups["u"].lower()
        if groups["U"]:
            # \U
            return back_slashes + "U" + groups["U"].lower()
        if groups["x"]:
            # \x
            return back_slashes + "x" + groups["x"].lower()
        assert groups["N"], f"Unexpected match: {m}"
        # \N{}: character names are uppercased, not lowered.
        return back_slashes + "N{" + groups["N"].upper() + "}"

    leaf.value = UNICODE_ESCAPE_RE.sub(replace, text)
def char_width(char: str) -> int:
    """Return the width of a single character as it would be displayed in a
    terminal or editor (which respects Unicode East Asian Width).

    Full width characters are counted as 2, half width characters as 1, and
    control characters as 0.
    """
    codepoint = ord(char)
    # Binary search WIDTH_TABLE, whose entries are
    # (start_codepoint, end_codepoint, width) ranges in ascending order.
    lo = 0
    hi = len(WIDTH_TABLE) - 1
    while lo <= hi:
        mid = (lo + hi) // 2
        start, end, width = WIDTH_TABLE[mid]
        if codepoint < start:
            hi = mid - 1
        elif codepoint > end:
            lo = mid + 1
        else:
            # Negative widths in the table mark zero-width characters.
            return max(width, 0)
    # Codepoints absent from the table occupy a single column.
    return 1
def str_width(line_str: str) -> int:
    """Return the width of `line_str` as it would be displayed in a terminal
    or editor (which respects Unicode East Asian Width).

    Useful, for example, to decide whether a string is too wide to display.
    """
    if line_str.isascii():
        # ASCII characters are always exactly one column wide.
        return len(line_str)
    return sum(char_width(c) for c in line_str)
def count_chars_in_width(line_str: str, max_width: int) -> int:
    """Count how many leading characters of `line_str` fit in a terminal or
    editor of `max_width` columns (which respects Unicode East Asian Width).
    """
    used = 0
    for idx, ch in enumerate(line_str):
        width = char_width(ch)
        if used + width > max_width:
            # `ch` would overflow the budget: everything before it fits.
            return idx
        used += width
    # The entire string fits.
    return len(line_str)