# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Convert graminit.[ch] spit out by pgen to Python code.

Pgen is the Python parser generator.  It is useful to quickly create a
parser from a grammar file in Python's grammar notation.  But I don't
want my parsers to be written in C (yet), so I'm translating the
parsing tables to Python data structures and writing a Python parse
engine.

Note that the token numbers are constants determined by the standard
Python tokenizer.  The standard token module defines these numbers and
their names (the names are not used much).  The token numbers are
hardcoded into the Python tokenizer and into pgen.  A Python
implementation of the Python tokenizer is also available, in the
standard tokenize module.

On the other hand, symbol numbers (representing the grammar's
non-terminals) are assigned by pgen based on the actual grammar
input.

Note: this module is pretty much obsolete; the pgen module generates
equivalent grammar tables directly from the Grammar.txt input file
without having to invoke the Python pgen C program.
"""
# Python imports
import re

# Local imports
from pgen2 import grammar, token
class Converter(grammar.Grammar):
    """Grammar subclass that reads classic pgen output files.

    The run() method reads the tables as produced by the pgen parser
    generator, typically contained in two C files, graminit.h and
    graminit.c.  The other methods are for internal use only.

    See the base class for more documentation.
    """

    def run(self, graminit_h, graminit_c):
        """Load the grammar tables from the text files written by pgen."""
        self.parse_graminit_h(graminit_h)
        self.parse_graminit_c(graminit_c)
        self.finish_off()

    def parse_graminit_h(self, filename):
        """Parse the .h file written by pgen.  (Internal)

        This file is a sequence of #define statements defining the
        nonterminals of the grammar as numbers.  We build two tables
        mapping the numbers to names and back.

        Returns True on success, False if the file could not be opened.
        """
        try:
            f = open(filename)
        except OSError as err:
            print("Can't open %s: %s" % (filename, err))
            return False
        self.symbol2number = {}
        self.number2symbol = {}
        lineno = 0
        # Use a context manager so the file handle is always closed
        # (the original leaked it).
        with f:
            for line in f:
                lineno += 1
                mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line)
                if not mo:
                    # Blank lines are silently skipped; anything else
                    # that doesn't match is reported.  (The original
                    # fell through to mo.groups() on a blank line and
                    # raised AttributeError on None.)
                    if line.strip():
                        print("%s(%s): can't parse %s" % (filename, lineno,
                                                          line.strip()))
                    continue
                symbol, number = mo.groups()
                number = int(number)
                # Each symbol/number pair must be unique in both directions.
                assert symbol not in self.symbol2number
                assert number not in self.number2symbol
                self.symbol2number[symbol] = number
                self.number2symbol[number] = symbol
        return True

    def parse_graminit_c(self, filename):
        """Parse the .c file written by pgen.  (Internal)

        The file looks as follows.  The first two lines are always this:

        #include "pgenheaders.h"
        #include "grammar.h"

        After that come four blocks:

        1) one or more state definitions
        2) a table defining dfas
        3) a table defining labels
        4) a struct defining the grammar

        A state definition has the following form:
        - one or more arc arrays, each of the form:
          static arc arcs_<n>_<m>[<k>] = {
                  {<i>, <j>},
                  ...
          };
        - followed by a state array, of the form:
          static state states_<s>[<t>] = {
                  {<k>, arcs_<n>_<m>},
                  ...
          };

        Returns True on success, False if the file could not be opened.
        """
        try:
            f = open(filename)
        except OSError as err:
            print("Can't open %s: %s" % (filename, err))
            return False
        # The code below essentially uses f's iterator-ness!
        lineno = 0
        # Close the handle deterministically (the original leaked it).
        with f:
            # Expect the two #include lines.
            lineno, line = lineno+1, next(f)
            assert line == '#include "pgenheaders.h"\n', (lineno, line)
            lineno, line = lineno+1, next(f)
            assert line == '#include "grammar.h"\n', (lineno, line)

            # Parse the state definitions.
            lineno, line = lineno+1, next(f)
            allarcs = {}    # maps (n, m) -> list of (i, j) arc pairs
            states = []
            while line.startswith("static arc "):
                # One or more arc arrays belonging to the next state array.
                while line.startswith("static arc "):
                    mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$",
                                  line)
                    assert mo, (lineno, line)
                    n, m, k = map(int, mo.groups())
                    arcs = []
                    for _ in range(k):
                        lineno, line = lineno+1, next(f)
                        mo = re.match(r"\s+{(\d+), (\d+)},$", line)
                        assert mo, (lineno, line)
                        i, j = map(int, mo.groups())
                        arcs.append((i, j))
                    lineno, line = lineno+1, next(f)
                    assert line == "};\n", (lineno, line)
                    allarcs[(n, m)] = arcs
                    lineno, line = lineno+1, next(f)
                # Then the state array referencing those arc arrays.
                mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line)
                assert mo, (lineno, line)
                s, t = map(int, mo.groups())
                # State arrays must appear in order.
                assert s == len(states), (lineno, line)
                state = []
                for _ in range(t):
                    lineno, line = lineno+1, next(f)
                    mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line)
                    assert mo, (lineno, line)
                    k, n, m = map(int, mo.groups())
                    arcs = allarcs[n, m]
                    # Declared arc count must match the referenced array.
                    assert k == len(arcs), (lineno, line)
                    state.append(arcs)
                states.append(state)
                lineno, line = lineno+1, next(f)
                assert line == "};\n", (lineno, line)
                lineno, line = lineno+1, next(f)
            self.states = states

            # Parse the dfas.
            dfas = {}
            mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line)
            assert mo, (lineno, line)
            ndfas = int(mo.group(1))
            for _ in range(ndfas):
                lineno, line = lineno+1, next(f)
                mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$',
                              line)
                assert mo, (lineno, line)
                symbol = mo.group(2)
                number, x, y, z = map(int, mo.group(1, 3, 4, 5))
                # Cross-check against the tables built from graminit.h.
                assert self.symbol2number[symbol] == number, (lineno, line)
                assert self.number2symbol[number] == symbol, (lineno, line)
                assert x == 0, (lineno, line)
                state = states[z]
                assert y == len(state), (lineno, line)
                lineno, line = lineno+1, next(f)
                mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line)
                assert mo, (lineno, line)
                first = {}
                # The regex above restricts the literal to octal escapes
                # inside double quotes, so this eval() is safe.
                rawbitset = eval(mo.group(1))
                # Expand the C bitset into a dict of set bit positions.
                for byte_index, c in enumerate(rawbitset):
                    byte = ord(c)
                    for bit in range(8):
                        if byte & (1 << bit):
                            first[byte_index*8 + bit] = 1
                dfas[number] = (state, first)
            lineno, line = lineno+1, next(f)
            assert line == "};\n", (lineno, line)
            self.dfas = dfas

            # Parse the labels.
            labels = []
            lineno, line = lineno+1, next(f)
            mo = re.match(r"static label labels\[(\d+)\] = {$", line)
            assert mo, (lineno, line)
            nlabels = int(mo.group(1))
            for _ in range(nlabels):
                lineno, line = lineno+1, next(f)
                mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line)
                assert mo, (lineno, line)
                x, y = mo.groups()
                x = int(x)
                if y == "0":
                    y = None
                else:
                    # y is a C string literal matching "\w+"; eval is safe.
                    y = eval(y)
                labels.append((x, y))
            lineno, line = lineno+1, next(f)
            assert line == "};\n", (lineno, line)
            self.labels = labels

            # Parse the grammar struct.
            lineno, line = lineno+1, next(f)
            assert line == "grammar _PyParser_Grammar = {\n", (lineno, line)
            lineno, line = lineno+1, next(f)
            mo = re.match(r"\s+(\d+),$", line)
            assert mo, (lineno, line)
            ndfas = int(mo.group(1))
            assert ndfas == len(self.dfas)
            lineno, line = lineno+1, next(f)
            assert line == "\tdfas,\n", (lineno, line)
            lineno, line = lineno+1, next(f)
            mo = re.match(r"\s+{(\d+), labels},$", line)
            assert mo, (lineno, line)
            nlabels = int(mo.group(1))
            assert nlabels == len(self.labels), (lineno, line)
            lineno, line = lineno+1, next(f)
            mo = re.match(r"\s+(\d+)$", line)
            assert mo, (lineno, line)
            start = int(mo.group(1))
            assert start in self.number2symbol, (lineno, line)
            self.start = start
            lineno, line = lineno+1, next(f)
            assert line == "};\n", (lineno, line)
            # There should be nothing after the closing brace.
            try:
                lineno, line = lineno+1, next(f)
            except StopIteration:
                pass
            else:
                assert 0, (lineno, line)
        # Return True for symmetry with parse_graminit_h.
        return True

    def finish_off(self):
        """Create additional useful structures.  (Internal)."""
        self.keywords = {}  # map from keyword strings to arc labels
        self.tokens = {}    # map from numeric token values to arc labels
        for ilabel, (tok_type, value) in enumerate(self.labels):
            if tok_type == token.NAME and value is not None:
                # A label with type NAME and a value is a keyword.
                self.keywords[value] = ilabel
            elif value is None:
                # A label with no value stands for a plain token type.
                self.tokens[tok_type] = ilabel