Elron committed on
Commit
37115ae
·
verified ·
1 Parent(s): 9c6fadc

Upload type_utils.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. type_utils.py +137 -0
type_utils.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import collections.abc
2
  import io
3
  import itertools
@@ -7,6 +8,135 @@ import typing
7
  from .utils import safe_eval
8
 
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  def parse_type_string(type_string: str) -> typing.Any:
11
  """Parses a string representing a Python type hint and evaluates it to return the corresponding type object.
12
 
@@ -24,6 +154,11 @@ def parse_type_string(type_string: str) -> typing.Any:
24
  ValueError: If the type string contains elements not allowed in the safe context
25
  or tokens list.
26
 
 
 
 
 
 
27
  The function uses a predefined safe context with common types from the `typing` module
28
  and basic Python data types. It also defines a list of safe tokens that are allowed
29
  in the type string.
@@ -41,6 +176,8 @@ def parse_type_string(type_string: str) -> typing.Any:
41
  "Optional": typing.Optional,
42
  }
43
 
 
 
44
  safe_tokens = ["[", "]", ",", " "]
45
  return safe_eval(type_string, safe_context, safe_tokens)
46
 
 
1
+ import ast
2
  import collections.abc
3
  import io
4
  import itertools
 
8
  from .utils import safe_eval
9
 
10
 
11
def convert_union_type(type_string: str) -> str:
    """Converts Python 3.10 union type hints into a form compatible with Python 3.9.

    Every run of types joined by the 'bitwise or operator' (``A|B|C``) is rewritten as
    ``Union[A,B,C]``, at any nesting depth.

    Args:
        type_string (str): A string representation of a Python type hint, for example
            'List[int|float]' or 'str|float|bool'.

    Formally, the function depends on the input string adhering to the following rules.
    Assuming that the input is a valid type hint, the function does not check that 'word' is
    'str', 'bool', 'List' etc. It just depends on the following general structure:
        type -> word OR type( | type)* OR word[type( , type)*]
        word is a sequence of (0 or more) chars, each being any char but: [ ] , |
    Whitespace around tokens is not removed by this function; a caller that wants a
    whitespace-free result must strip it from the input beforehand.

    The argument list of 'Literal' is handled as a special case: it is copied verbatim, so
    occurrences of [ ] , | inside its string constants are not treated as meta chars and
    are never rewritten.

    Returns:
        str: A type string with converted union types, which is compatible with typing module.

    Raises:
        ValueError: If a 'Literal' argument list cannot be delimited, or if the input ends
            while a new type is still expected (e.g. a trailing ',' or '[').

    Examples:
        convert_union_type('List[int|float]') -> 'List[Union[int,float]]'
        convert_union_type('Optional[int|float|bool]') -> 'Optional[Union[int,float,bool]]'
        convert_union_type("Literal['a|b']|int") -> "Union[Literal['a|b'],int]"
    """

    def consume_literal(string: str) -> str:
        # `string` starts with the '[' that follows 'Literal'. Return its shortest prefix
        # ending in ']' that parses as a Python literal list, i.e. the full argument list
        # of the Literal, including constants that themselves contain [ ] , or |.
        candidate_end = string.find("]")
        while candidate_end != -1:
            try:
                ast.literal_eval(string[: candidate_end + 1])
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                # this ']' is inside a constant (or the prefix is not a literal yet);
                # try the next one
                candidate_end = string.find("]", candidate_end + 1)
            else:
                break

        if candidate_end == -1:
            raise ValueError("invalid Literal in input type_string")
        return string[: candidate_end + 1]

    def render(operands: list) -> str:
        # A single operand is a plain type; several operands were separated by '|'.
        if len(operands) > 1:
            return "Union[" + ",".join(operands) + "]"
        return operands[0]

    # One frame per nesting level. A frame holds the '|'-separated operands of the type
    # currently being built at that level; the last operand is the one in progress.
    # Keeping operands apart (instead of one string with embedded '|') guarantees that a
    # '|' copied verbatim from a Literal payload is never mistaken for a union operator.
    frames = [[""]]
    remaining = type_string.strip()
    next_word = re.compile(r"([^\[\],|]*)([\[\],|]|$)")
    while remaining:
        token = next_word.match(remaining)
        word, delimiter = token.group(1), token.group(2)
        remaining = remaining[token.end() :].strip()
        frames[-1][-1] += word

        if delimiter in ("]", ",", ""):  # "" stands for end of input
            # the type at the current level is complete
            completed = render(frames.pop()) + delimiter
            if frames:
                frames[-1][-1] += completed
            else:
                # a closing delimiter at the top level: keep the text as is
                frames.append([completed])
            if delimiter == ",":
                frames.append([""])  # to start the expected next type

        elif delimiter == "|":
            # the current operand is done and another one is expected
            frames[-1].append("")

        elif word == "Literal":  # delimiter is "["
            literal_ops = consume_literal("[" + remaining)
            frames[-1][-1] += literal_ops
            # literal_ops includes the '[' that is no longer part of `remaining`
            remaining = remaining[len(literal_ops) - 1 :]

        else:  # delimiter is "["
            frames[-1][-1] += "["
            frames.append([""])  # start type (,type)* inside the []

    if len(frames) != 1:
        # the input ended right after ',' or '[', where another type was expected
        raise ValueError(f"malformed type string: {type_string!r}")
    return render(frames[0])
101
+
102
+
103
def format_type_string(type_string: str) -> str:
    """Formats a string representing a valid Python type hint so that it is compatible with Python 3.9 notation.

    Args:
        type_string (str): A string representation of a Python type hint. This can be any
            valid type, which does not contain strings (e.g. 'Literal').
            Examples include 'List[int]', 'Dict[str, Any]', 'Optional[List[str]]', etc.

    Returns:
        str: A formatted type string.

    Examples:
        format_type_string('list[int | float]') -> 'List[Union[int,float]]'
        format_type_string('dict[str, Optional[str]]') -> 'Dict[str,Optional[str]]'

    The function formats a valid type string (either after or before Python 3.10) into a
    form compatible with 3.9. This is done by capitalizing the first letter of a lower-cased
    type name and transferring the 'bitwise or operator' into 'Union' notation. The function
    also removes spaces and the redundant module name in type names imported from the
    'typing' module, e.g. 'typing.Tuple' -> 'Tuple'.

    The capitalization is applied only to the names 'list', 'dict' and 'tuple', and only
    when they appear as whole words, so that longer identifiers which merely contain these
    letters (e.g. 'Playlist') are left untouched. Moreover, the function expects the input
    to not contain types which contain strings, for example 'Literal', because spaces are
    removed everywhere, including inside string constants.
    """
    # Whole-word match only: a plain substring replace would also rewrite the inside of
    # unrelated identifiers.
    type_string = re.sub(
        r"\b(list|tuple|dict)\b",
        lambda name: name.group(1).capitalize(),
        type_string,
    )
    type_string = type_string.replace("typing.", "").replace(" ", "")
    return convert_union_type(type_string)
138
+
139
+
140
  def parse_type_string(type_string: str) -> typing.Any:
141
  """Parses a string representing a Python type hint and evaluates it to return the corresponding type object.
142
 
 
154
  ValueError: If the type string contains elements not allowed in the safe context
155
  or tokens list.
156
 
157
+ The function formats the string first if it represents a new Python type hint
158
+ (i.e. valid since Python 3.10), which uses lowercased names for some types and
159
+ 'bitwise or operator' instead of 'Union', for example: 'list[int|float]' instead
160
+ of 'List[Union[int,float]]' etc.
161
+
162
  The function uses a predefined safe context with common types from the `typing` module
163
  and basic Python data types. It also defines a list of safe tokens that are allowed
164
  in the type string.
 
176
  "Optional": typing.Optional,
177
  }
178
 
179
+ type_string = format_type_string(type_string)
180
+
181
  safe_tokens = ["[", "]", ",", " "]
182
  return safe_eval(type_string, safe_context, safe_tokens)
183