# -*- Mode: Python; tab-width: 4 -*-
import string
import sys
# Short alias for the builtin isinstance().
is_a = isinstance
# XXX: the <atom> class is used for every kind of atom except symbols,
# which are simply unadulterated Python strings.
# This inconsistency should be fixed.
class atom:
    """A tagged literal value produced by the reader.

    kind  -- a string naming the type of literal (e.g. 'int', 'char',
             'string', 'vector', 'record', 'undefined').
    value -- the Python value carried by the literal.

    Two atoms are equal when both their kind and their value are equal;
    an atom never compares equal to a non-atom.
    """

    def __init__ (self, kind, value):
        self.kind = kind
        self.value = value

    def __eq__ (self, other):
        return (
            isinstance (other, atom)
            and other.kind == self.kind
            and other.value == self.value
        )

    def __ne__ (self, other):
        # Python 2 does not derive != from ==; without this, two equal
        # atoms would still compare as "not equal".
        return not self.__eq__ (other)

    def __hash__ (self):
        # Keep hashing consistent with __eq__.  Raises TypeError when
        # <value> is itself unhashable (e.g. a list).
        return hash ((self.kind, self.value))

    def __repr__ (self):
        return '<A %s %r>' % (self.kind, self.value)
class reader:
    """Character-at-a-time reader for a Scheme-like s-expression syntax.

    <file> only needs a read(n) method that returns text, with '' marking
    the end of input.

    Parsed forms are represented as follows:

      symbol               plain Python string
      ( ... )              Python list
      int/char/string      <atom> with kind 'int', 'char' or 'string'
      #( ... )             atom ('vector', [...])
      { a=1 b=2 }          atom ('record', [(name, value), ...])
      #u                   atom ('undefined', 'undefined')
      #t / #f              [['colon', 'bool', 'true']] / [... 'false']
      'x  `x  ,x           ['quote', x]  ['backquote', x]  ['comma', x]
      :x                   ['colon', None, x]
      a:b                  ['colon', a, b]
      a[i]                 ['%%array-ref', a, i]
    """

    # escapes recognized after a backslash inside a string literal;
    # any other escaped character stands for itself.
    special = {'n':'\n','t':'\t'}

    # single-character reader macros and the symbol each expands to.
    _prefixes = {"'": 'quote', "`": 'backquote', ",": 'comma'}

    # named character constants accepted after '#\'.
    _char_names = {
        'newline': '\n',
        'space': ' ',
        'return': '\r',
        'eof': 'eof',
        'nul': '\000',
        'tab': '\t',
    }

    # radix selected by the letter following '#'.
    _radix = {'x': 16, 'o': 8, 'b': 2}

    def __init__ (self, file):
        self.file = file
        # one character of lookahead; None until the first peek.
        self.char = None
        # current line number (1-based), advanced by next().
        self.line = 1

    def peek (self):
        """Return the next character without consuming it ('' at EOF)."""
        if self.char is None:
            self.char = self.file.read (1)
        return self.char

    def next (self):
        """Consume and return the next character ('' at EOF)."""
        # go through peek() so that a call made before any lookahead has
        # been loaded returns the first character rather than None.
        result = self.peek()
        self.char = self.file.read (1)
        if result == '\n':
            self.line += 1
        return result

    def skip_whitespace (self):
        """Skip whitespace and ';' comments (which run to end of line)."""
        while 1:
            ch = self.peek()
            if not ch:
                break
            elif ch == ';':
                # consume through the end of the line, or to EOF.
                while 1:
                    ch = self.next()
                    if not ch or ch in '\r\n':
                        break
            elif ch in string.whitespace:
                self.next()
            else:
                break

    def read (self):
        """Read and return one complete form.

        Raises EOFError when the input ends where a form is expected,
        SyntaxError on an illegal '#' escape or malformed literal, and
        ValueError on an unknown character constant or a malformed
        radix/hex number.
        """
        self.skip_whitespace()
        ch = self.peek()
        if ch == '':
            raise EOFError ("Unexpected end of file")
        elif ch == '(':
            result = self.read_list()
        elif ch == '{':
            result = self.read_record()
        elif ch == '"':
            result = self.read_string()
        elif ch in self._prefixes:
            self.next()
            result = [self._prefixes[ch], self.read()]
        elif ch == ':':
            # (for constructor syntax)
            self.next()
            result = ['colon', None, self.read()]
        # unquote, etc.. can be found in old lumberjack code if needed.
        elif ch == '#':
            result = self._read_hash_syntax()
        elif ch in '-0123456789':
            result = self._read_number_or_symbol()
        else:
            result = self.read_atom()
        self.skip_whitespace()
        ch = self.peek()
        if ch == '[':
            # support postfix array-reference syntax
            index = self.read_array_index()
            return ['%%array-ref', result, index]
        elif ch == ':':
            # support infix colon syntax
            self.next()
            rhs = self.read()
            return ['colon', result, rhs]
        else:
            return result

    def _read_hash_syntax (self):
        # Read one '#'-prefixed form; the lookahead character is the '#'.
        self.next()
        ch = self.peek()
        if ch == '':
            raise EOFError ("Unexpected end of file")
        elif ch == '\\':
            self.next()
            probe = self.read_atom()
            if probe == '':
                raise EOFError ("Unexpected end of file")
            elif probe in self._char_names:
                return atom ('char', self._char_names[probe])
            elif len (probe) > 1:
                raise ValueError ("unknown character constant: %r" % probe)
            else:
                return atom ('char', probe)
        elif ch.lower() in self._radix:
            self.next()
            return atom ('int', int (self.read_atom(), self._radix[ch.lower()]))
        elif ch in 'Tt':
            self.next()
            return [['colon', 'bool', 'true']]
        elif ch in 'Ff':
            self.next()
            return [['colon', 'bool', 'false']]
        elif ch in 'Uu':
            self.next()
            return atom ('undefined', 'undefined')
        elif ch == '(':
            # hmm... a vector shouldn't be an atom, I think.
            return atom ('vector', self.read_list())
        # it's arguable: "{...}" or "#{...}" - the latter is more scheme-like
        # but pointlessly noisier.
        else:
            raise SyntaxError ('Illegal #-escape character: "%s"' % ch)

    def _read_number_or_symbol (self):
        # Read a token starting with '-' or a digit: an optional '-'
        # followed only by digits is an int atom, anything else
        # (including a lone '-', or tokens like '1-2') is a symbol.
        a = self.read_atom()
        if a.startswith ('-'):
            digits = a[1:]
        else:
            digits = a
        if digits and all (c in '0123456789' for c in digits):
            return atom ('int', int (a))
        else:
            return a

    def read_atom (self):
        """Read a bare token and return it as a string.

        At least one character is always consumed; reading then continues
        up to (not including) whitespace, one of ()[]{}: or end of input.
        """
        parts = [self.next()]
        while 1:
            ch = self.peek()
            if not ch or ch in string.whitespace or ch in '()[]{}:':
                return ''.join (parts)
            parts.append (self.next())

    def read_string (self):
        """Read a double-quoted string literal; returns atom ('string', s).

        Backslash escapes: \\n, \\t, \\xHH (exactly two hex digits); any
        other escaped character stands for itself.  Raises SyntaxError if
        the input ends before the closing quote.
        """
        line = self.line
        parts = []
        # throw away the quote.
        self.next()
        while 1:
            ch = self.next()
            if ch == '':
                raise SyntaxError (
                    "unterminated string literal (starting at line %d)" % line
                )
            elif ch == '"':
                return atom ('string', ''.join (parts))
            elif ch == '\\':
                ch = self.next()
                if ch == '':
                    raise SyntaxError (
                        "unterminated string literal (starting at line %d)" % line
                    )
                elif ch in 'xX':
                    # ascii escapes introduced only R6RS, *however*, theirs
                    # is terminated by a semicolon and can be more than two hex
                    # digits.
                    hex0 = self.next()
                    hex1 = self.next()
                    parts.append (chr (int (hex0 + hex1, 16)))
                else:
                    parts.append (self.special.get (ch, ch))
            else:
                parts.append (ch)

    def read_list (self):
        """Read a parenthesized list and return it as a Python list.

        An element of the form (include "file") is replaced by the forms
        read from that file.  Raises EOFError (via read) if the input ends
        before the closing paren.
        """
        result = []
        # throw away the paren
        self.next()
        while 1:
            self.skip_whitespace()
            if self.peek() == ')':
                # throw away the paren
                self.next()
                return result
            exp = self.read()
            if isinstance (exp, list) and exp and exp[0] == 'include':
                self.read_include (exp, result)
            else:
                result.append (exp)

    def read_name (self):
        """Read a record field name: ASCII letters, digits and '-'.

        The special name '...' is recognized when it appears at the start
        of the name.  Returns the (possibly empty) name as a string.
        """
        result = []
        while 1:
            p = self.peek()
            if p == '.' and not result:
                # special case '...' name.  The dots are consumed outside of
                # any assert so that running under -O cannot skip them.
                for _ in range (3):
                    if self.next() != '.':
                        raise SyntaxError ("expected '...' in record literal")
                return '...'
            elif p and (p in string.ascii_letters or p in string.digits or p == '-'):
                # record names have a more limited space
                result.append (p)
                self.next()
            else:
                # any other character, or end of input, ends the name.
                return ''.join (result)

    def read_record (self):
        """Read a record literal: { label=value label=value }.

        Returns atom ('record', [(label, value), ...]).  A trailing '...'
        is recorded as ('...', None) and must be followed by '}'.
        """
        result = []
        # skip open bracket
        self.next()
        while 1:
            self.skip_whitespace()
            if self.peek() == '}':
                self.next()
                return atom ('record', result)
            name = self.read_name()
            self.skip_whitespace()
            if name == '...':
                result.append (('...', None))
                if self.peek() != '}':
                    raise SyntaxError ("expected '}' after '...' in record literal")
                self.next()
                return atom ('record', result)
            if self.next() != '=':
                raise SyntaxError ("expected '=' in record literal")
            result.append ((name, self.read()))

    def read_array_index (self):
        """Read the '[index]' part of a postfix array reference.

        Returns the index form.  The closing character may be ']' or '}'.
        """
        # throw away open bracket
        self.next()
        exp = self.read()
        # consume the closer directly: going through read() here would
        # apply postfix '[' / ':' handling to the bracket itself.
        self.skip_whitespace()
        closer = self.next()
        if closer == '':
            raise EOFError ("Unexpected end of file")
        if closer not in (']', '}'):
            raise SyntaxError ("expected closing ']/}' character")
        return exp

    def read_all (self):
        """Read forms until end of input and return them as a list.

        Top-level (include "file") forms are replaced by the forms read
        from that file.  NOTE: any EOFError ends the loop, so input that
        is truncated in the middle of a list is dropped silently.
        """
        forms = []
        try:
            while 1:
                form = self.read()
                # guard against the empty list, as read_list does.
                if isinstance (form, list) and form and form[0] == 'include':
                    self.read_include (form, forms)
                else:
                    forms.append (form)
        except EOFError:
            return forms

    # XXX I'm not happy with this here, but if I put it in the transformer, it
    # will require an extra pass *before* the transformer, because expand_body()
    # will not recognize things hidden in an include (e.g., 'define' forms).
    def read_include (self, exp, result):
        """Append to <result> every form read from the file named by <exp>.

        <exp> is an (include <filename-atom>) form; the filename is taken
        from the .value of its second element.
        """
        if len (exp) < 2 or not isinstance (exp[1], atom):
            raise SyntaxError ("malformed include form: %r" % (exp,))
        filename = exp[1].value
        # text mode, matching how the __main__ driver opens its input.
        included = open (filename, 'r')
        try:
            result.extend (reader (included).read_all())
        finally:
            included.close()
if __name__ == '__main__':
    # Command-line driver: read every form from the file named by the
    # first argument (or from stdin when no argument is given) and
    # pretty-print the resulting list.
    import pprint
    import sys
    if len (sys.argv) < 2:
        pprint.pprint (reader (sys.stdin).read_all())
    else:
        # close the input file even if reading fails.
        source = open (sys.argv[1], 'r')
        try:
            pprint.pprint (reader (source).read_all())
        finally:
            source.close()