# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""This module defines the data structures used to represent a grammar.

These are a bit arcane because they are derived from the data
structures used by Python's 'pgen' parser generator.

There's also a table here mapping operators to their names in the
token module; the Python tokenize module reports all operators as the
fallback token code OP, but the parser needs the actual token code.
"""

# Python imports
import os
import pickle
import tempfile
from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union

# Local imports
from . import token
# TypeVar bound to Grammar so Grammar.copy() returns the caller's (sub)class type.
_P = TypeVar("_P", bound="Grammar")
# A label: (token or symbol number, keyword string or None).
Label = Tuple[int, Optional[str]]
# A DFA: list of states, where each state is a list of (label, target-state) arcs.
DFA = List[List[Tuple[int, int]]]
# A (dfa, first) pair: per the Grammar docstring, `first` is the rule's
# first-set, represented as a dict whose values are always 1.
DFAS = Tuple[DFA, Dict[int, int]]
# Anything acceptable as a filesystem path argument.
Path = Union[str, "os.PathLike[str]"]
class Grammar:
    """Conversion class for pgen parsing tables.

    Once initialized, an instance supplies the grammar tables that the
    parsing engine in parse.py consumes; the engine reads the instance
    variables directly.  This class does not build the tables itself —
    subclasses (see the conv and pgen modules) fill them in.

    load() restores the tables from a pickle file, which is much faster
    than regenerating them; the pickle is produced by dump() after a
    subclass has populated the instance.  report() pretty-prints the
    tables to stdout for debugging.

    Instance variables:

    symbol2number -- maps symbol names to numbers.  Symbol numbers are
                     always 256 or higher, distinguishing them from
                     token numbers, which lie in 0..255 inclusive.

    number2symbol -- maps numbers back to symbol names; the inverse of
                     symbol2number.

    states        -- a list of DFAs; each DFA is a list of states, each
                     state a list of arcs, each arc an (i, j) pair with
                     i a label and j a state number.  The DFA number is
                     the index into this list.  (The name is slightly
                     confusing.)  A final state carries a special arc
                     (0, j) where j is its own state number.

    dfas          -- maps symbol numbers to (DFA, first) pairs, where
                     DFA is an item from the states list above and
                     first is the set of tokens that can begin the rule
                     (a dict whose values are always 1).

    labels        -- a list of (x, y) pairs: x is a token or symbol
                     number, y is None or a keyword string.  The label
                     number is the index in this list; labels mark the
                     state transitions (arcs) in the DFAs.

    start         -- the number of the grammar's start symbol.

    keywords      -- maps keyword strings to arc labels.

    tokens        -- maps token numbers to arc labels.
    """

    def __init__(self) -> None:
        self.symbol2number: Dict[str, int] = {}
        self.number2symbol: Dict[int, str] = {}
        self.states: List[DFA] = []
        self.dfas: Dict[int, DFAS] = {}
        self.labels: List[Label] = [(0, "EMPTY")]
        self.keywords: Dict[str, int] = {}
        self.soft_keywords: Dict[str, int] = {}
        self.tokens: Dict[int, int] = {}
        self.symbol2label: Dict[str, int] = {}
        self.version: Tuple[int, int] = (0, 0)
        self.start = 256
        # Python 3.7+ parses async as a keyword, not an identifier
        self.async_keywords = False

    def dump(self, filename: Path) -> None:
        """Dump the grammar tables to a pickle file.

        The pickle is written to a temporary file in the destination
        directory and then renamed into place with os.replace, so a
        concurrent reader never observes a half-written file.
        """
        # mypyc-compiled objects lack a __dict__ but expose __getstate__,
        # which returns an equivalent dictionary of the state to pickle.
        if hasattr(self, "__dict__"):
            state = self.__dict__
        else:
            state = self.__getstate__()  # type: ignore
        with tempfile.NamedTemporaryFile(
            dir=os.path.dirname(filename), delete=False
        ) as f:
            pickle.dump(state, f, pickle.HIGHEST_PROTOCOL)
        os.replace(f.name, filename)

    def _update(self, attrs: Dict[str, Any]) -> None:
        # Install every entry of *attrs* as an instance attribute.
        for name in attrs:
            setattr(self, name, attrs[name])

    def load(self, filename: Path) -> None:
        """Load the grammar tables from a pickle file."""
        with open(filename, "rb") as f:
            self._update(pickle.load(f))

    def loads(self, pkl: bytes) -> None:
        """Load the grammar tables from a pickle bytes object."""
        self._update(pickle.loads(pkl))

    def copy(self: _P) -> _P:
        """Return a copy of the grammar.

        Dict and list attributes are copied one level deep; scalar
        attributes are shared.
        """
        clone = self.__class__()
        dict_attrs = (
            "symbol2number",
            "number2symbol",
            "dfas",
            "keywords",
            "soft_keywords",
            "tokens",
            "symbol2label",
        )
        for name in dict_attrs:
            setattr(clone, name, getattr(self, name).copy())
        clone.labels = list(self.labels)
        clone.states = list(self.states)
        clone.start = self.start
        clone.version = self.version
        clone.async_keywords = self.async_keywords
        return clone

    def report(self) -> None:
        """Dump the grammar tables to standard output, for debugging."""
        from pprint import pprint

        # Each table is printed under the same short heading the original
        # debugging output used.
        for heading, table in (
            ("s2n", self.symbol2number),
            ("n2s", self.number2symbol),
            ("states", self.states),
            ("dfas", self.dfas),
            ("labels", self.labels),
        ):
            print(heading)
            pprint(table)
        print("start", self.start)
# Map from operator to number (since tokenize doesn't do this)
opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
@= ATEQUAL
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
:= COLONEQUAL
"""

# Parse the table above into {operator string: token number}, resolving each
# symbolic name against the local token module.
opmap = {
    op: getattr(token, name)
    for op, name in (entry.split() for entry in opmap_raw.splitlines() if entry)
}