# -*- Mode: Python -*-
import charset
import regular
import automata
from pprint import pprint as pp
import sys
W = sys.stdout.write
# two approaches for generating a lexer:
# 1) generate tables and dispatch through them at run-time.
# 2) generate range-test code from the dfa.
# #2 should be faster, especially if we try a little harder.
# idea: feed a bunch of sample data through the lexer, collecting
# probabilities at each transition, use this to reorder them.
# [a branch predictor will do the same thing, is it worth it?]
# I think tables are better once the lexer hits a certain size.
class lexer:
def __init__ (self, lexicon):
# combine all the regexps into one, with each sub-expression wrapped with TOKEN
tokens = regular.n_or ([ (regular.TOKEN, regular.parse (x), y) for (x, y) in lexicon ])
#pp (tokens)
c0 = automata.regexp_to_nfa_converter()
nfa, start, end = c0.go (tokens)
#pp (nfa)
c1 = automata.nfa_dfa_converter (nfa, start, end)
dfa, finals, actions = c1.go (c0.tokens)
flat = c1.coalesce()
#print 'dfa='
#pp (flat)
#print 'finals=', finals
self.dfa = flat
#print 'actions=', actions
# XXX finals + actions == redundant
self.finals = finals
self.actions = {}
for state, action in actions:
if not self.actions.has_key (state):
# give preference to tokens defined earlier in the lexicon
self.actions[state] = action
#print 'actions=', self.actions
def sort_transitions (self, trans):
# sort transitions in a way that generates the most efficient set of
# range tests.
tests = [ (sym.as_ranges()[1], sym, ts) for (sym, ts) in trans ]
tests.sort (lambda a,b: cmp (len(a[0]), len(b[0])))
return [ (sym, ts) for (ranges, sym, ts) in tests ]
def read (self, file):
# this is useful for debugging your lexicon. feed it data you'd
# like to lex. when it works the way you like, then call
# gen_irken/c/etc() on it.
s = file.read()
state = 0
current = []
good = None
slen = len(s)
i = 0
for ch in s:
while 1:
entry = self.dfa[state]
ostate = state
for sym, ts in entry:
if sym.has (ch):
state = ts
break
print '%d %r => %d' % (ostate, ch, state)
#W ('[%d|%d]' % (ord(ch),state,))
action = self.actions.get (state, None)
if not good and action:
# transition in
good = action
break
elif good and not action:
# transition out
print good, repr(''.join (current))
current = []
state = 0
good = None
else:
# update final state if one...
good = action
break
current.append (ch)
i += 1
def generate (self):
print " def step (self, ch):"
for i in range (len (self.dfa)):
if i == 0:
print " if self.state == %d:" % i
else:
print " elif self.state == %d:" % i
state_len = len (self.dfa[i])
trans = self.sort_transitions (self.dfa[i])
for j in range (state_len):
sym, ts = trans[j]
if state_len == 1:
print " self.state = %d" % (ts,)
elif j == state_len - 1:
# last range is always a catch-all
print " else:"
print " self.state = %d" % (ts,)
else:
invert, ranges = sym.as_ranges()
test = []
for start, end in ranges:
if start == end:
test.append ("ch == %d" % (start,))
else:
test.append ("%d <= ch <= %d" % (start, end))
test = ' or '.join (test)
if invert:
test = 'not (%s)' % (test,)
if j == 0:
print " if %s:" % (test,)
else:
print " elif %s:" % (test,)
print " self.state = %d" % (ts,)
if self.actions.has_key (i):
print " # TOKEN %s" % (self.actions[i],)
def gen_c (self):
print "int step (int state, char ch)"
print "{"
for i in range (len (self.dfa)):
if i == 0:
print " if (state == %d) {" % i
else:
print " } else if (state == %d) {" % i
state_len = len (self.dfa[i])
trans = self.sort_transitions (self.dfa[i])
for j in range (state_len):
sym, ts = trans[j]
if state_len == 1:
print " state = %d;" % (ts,)
elif j == state_len - 1:
# last range is always a catch-all
print " } else {"
print " state = %d;" % (ts,)
print " }"
else:
invert, ranges = sym.as_ranges()
test = []
for start, end in ranges:
if start == end:
test.append ("(ch == %d)" % (start,))
else:
test.append ("(ch >= %d && ch <= %d)" % (start, end))
test = ' || '.join (test)
if invert:
test = '!(%s)' % (test,)
if j == 0:
print " if (%s) {" % (test,)
else:
print " } else if (%s) {" % (test,)
print " state = %d;" % (ts,)
print " }"
print " return state;"
print "}"
def find_sink (self):
# is the sink is always the second state?
for i in range (len (self.dfa)):
entry = self.dfa[i]
if len(entry) == 1:
[(sym, ts)] = entry
if sym == charset.DOT and ts == i:
return i
else:
raise ValueError
def gen_irken (self, file):
# generate a table-based dfa, using strings
assert (len (self.dfa) < 256)
W = file.write
W (";; generated by lexer.py\n")
tables = []
for i in range (len (self.dfa)):
table = [1] * 256
for sym, ts in self.dfa[i]:
for i in range (256):
if sym[i]:
table[i] = ts
table = ''.join ([chr(x) for x in table])
tables.append (table)
unique = {}
for t in tables:
if not unique.has_key (t):
unique[t] = len (unique)
indirect = []
for t in tables:
indirect.append (unique[t])
W ("(define dfa \n")
W (" (let* (\n")
items = unique.items()
items.sort (lambda a,b: cmp (a[1], b[1]))
for table, index in items:
W (" (t%d \"%s\")\n" % (index, ''.join (["\\x%02x" % (ord (x),) for x in table])))
W (" )\n")
W (" #(%s)))\n" % (" ".join (["t%d" % (x,) for x in indirect])))
# find the sink state
sink = self.find_sink()
W ("(define finals\n")
W (" '#(\n")
for i in range (len (self.dfa)):
f = self.actions.get (i, None)
if f:
W (" %s\n" % f)
else:
W (" not-final\n")
W (" ))\n")
W ("(define (step ch state)\n")
W (" (char->ascii (string-ref dfa[state] (char->ascii ch))))\n")
return tables
keywords = '|'.join ('and is in not if then else elif yield while for try def class'.split())
augassign = ('(' + '|'.join (r'\+ - \* / % & \| \^ << >> \*\* //'.split()) + ')=')
python_lexicon = [
(r'[0-9]+', 'number'),
(r'[A-Za-z_][A-Za-z0-9_]*', 'ident'),
(r'"([^\\\n"]|(\\.))*"', 'string1'),
(r"'([^\\\n']|(\\.))*'", 'string2'),
(r'[ \t]*#[^\n]*\n', 'comment'),
(r'[-+]', 'addop'),
(r'([*/%]|//)', 'mulop'),
(r'<|>|<=|>=|==', 'compare'),
(r'<<|>>', 'shift'),
(r'\*\*', 'power'),
(r'[ \t]+', 'whitespace'),
(r'[\n]', 'newline'),
(r'\.', 'getattr'),
(r'=', 'assign'),
(r'\(', 'lparen'),
(r'\)', 'rparen'),
(r',', 'comma'),
(r':', 'colon'),
(r';', 'semicolon'),
(r'{', 'lbracket'),
(r'}', 'rbracket'),
(r'\[', 'lbrace'),
(r'\]', 'rbrace'),
(r'&', 'bitand'),
(r'\|', 'bitor'),
(r'^', 'bitxor'),
(r'~', 'bitnot'),
(keywords, 'keyword'),
(augassign, 'augassign'),
]
if __name__ == '__main__':
m = lexer (python_lexicon)
#m.gen_scheme_code()
#m.gen_scheme_table()
m.gen_irken (open ("lexstep.scm", 'wb'))
#m.read (open ("../nodes.py"))