;; -*- Mode: Irken -*-
;; An Earley Parser.
;;
;; This is based on the description from https://en.wikipedia.org/wiki/Earley_parser
;;
(datatype prod
(:nt symbol)
(:t symbol)
)
(define terminal?
(prod:t _) -> #t
_ -> #f
)
(define prod->name
(prod:nt name) -> name
(prod:t name) -> name
)
(define prod-repr
(prod:nt name) -> (format "<" (sym name) ">")
(prod:t name) -> (format "{" (sym name) "}")
)
(define EOF (prod:t 'eof))
(define NULTOK {kind='nul val=""})
(typealias token {kind=symbol val=string})
(datatype parse
(:nt symbol (list parse))
(:t token)
)
(define parse-repr
(parse:nt kind subs)
-> (format "(" (sym kind) " " (join parse-repr " " subs) ")")
(parse:t tok)
-> (format "{" (sym tok.kind) ":" tok.val "}")
)
(define parse->sexp
(parse:nt kind subs)
-> (sexp1 kind (map parse->sexp subs))
(parse:t tok)
-> (sexp:list (LIST (sexp:symbol tok.kind) (sexp:string tok.val)))
)
(define (token-repr tok)
(format "{" (sym tok.kind) ":" (string tok.val) "}"))
(define (state-repr s)
(let ((prod (map prod-repr s.prod)))
(insert! prod s.dot "•")
(set! prod (list:cons (prod-repr s.nt) (list:cons "→" prod)))
(format "(" (join " " prod) " @" (int s.start) ")")))
;; a growable vector object
(define (make-gvec len item)
(let ((ob {v=(make-vector len item) len0=len len1=len}))
(define (ref n) ob.v[n])
(define (set! n item) (set! ob.v[n] item))
(define (len) ob.len0)
(define (list)
(let ((r '()))
(for-range i ob.len0
(PUSH r ob.v[i]))
(reverse r)))
(define (append item)
(if (= ob.len0 ob.len1)
;; we need to grow
(let ((newlen (+ 1 (/ (* ob.len0 3) 2))) ;; 50%
(newvec (make-vector newlen item)))
(for-range i ob.len0
(set! newvec[i] ob.v[i]))
;; note: done above.
;; (set! newvec[ob.len0] item)
(set! ob.v newvec)
(set! ob.len1 newlen)
(set! ob.len0 (+ 1 ob.len0)))
;; room
(begin
(set! ob.v[ob.len0] item)
(set! ob.len0 (+ 1 ob.len0)))))
{ref=ref set=set! len=len append=append list=list}
))
(define (completed? state)
(= state.dot (length state.prod)))
(define (prod=? p0 p1)
(eq? (prod->name p0) (prod->name p1)))
(define (earley grammar nt0 tokgen)
(let ((S (make-gvec 1 (make-gvec 1 {nt=nt0 dot=0 prod=(LIST nt0 EOF) start=0})))
(k 0)
(toks (make-gvec 0 NULTOK)))
(define get-prod
(prod:nt name)
-> (alist/get grammar name "no such production")
x -> (error1 "get-prod: not an NT?" x)
)
(define (maybe-add index state)
(let ((states (S.ref index))
(found #f))
(for-range i (states.len)
(if (magic=? state (states.ref i))
(set! found #t)))
(when (not found)
(states.append state))))
(define (bump-dot x)
{nt=x.nt dot=(+ 1 x.dot) prod=x.prod start=x.start}
)
(define (completer nt start)
;; For every state in S(k) of the form (X → γ •, j), find states
;; in S(j) of the form (Y → α • X β, i) and add (Y → α X • β, i)
;; to S(k).
(let ((states (S.ref start)))
(for-range i (states.len)
(let ((state (states.ref i)))
(if (and
(> (length state.prod) state.dot)
(prod=? nt (nth state.prod state.dot)))
(maybe-add k (bump-dot state))
)))))
(define (predictor nt)
;; For every state in S(k) of the form (X → α • Y β, j) (where j
;; is the origin position as above), add (Y → • γ, k) to S(k)
;; for every production in the grammar with Y on the left-hand
;; side (Y → γ).
(for-list prod (get-prod nt)
(maybe-add k {nt=nt dot=0 prod=prod start=k})))
(define (add-next state)
(if (= (S.len) (+ k 1))
(S.append (make-gvec 1 state))
(maybe-add (+ 1 k) state)))
(define (step tok)
(let ((states (S.ref k))
(j 0)
(scanned? #f))
(while (< j (states.len))
(let ((state (states.ref j)))
(set! j (+ j 1))
;; each state in the set falls into one of three categories:
;; 1) the state is complete (i.e., the dot is at the end)
;; 2) the state expects a terminal
;; 3) the state expects a non-terminal
(if (completed? state)
;; 1 state is complete
(completer state.nt state.start)
(let ((nextprod (nth state.prod state.dot)))
(if (terminal? nextprod)
;; 2) expects a terminal
(when (eq? tok.kind (prod->name nextprod))
;; If a is the next symbol in the input stream, for every state
;; in S(k) of the form (X → α • a β, j), add (X → α a • β, j) to
;; S(k+1).
(set! scanned? #t)
(add-next (bump-dot state)))
;; 3) a non-terminal - predict it
(predictor nextprod))))))
(when (not scanned?)
(raise (:NoParse)))
))
(define (build-parse-tree)
;; this uses the technique described in "Parsing Techniques - A Practical Guide".
;; [https://dickgrune.com/Books/PTAPG_1st_Edition/]
;; the method described by Earley is confusing (and wrong in some cases).
;; convert to a vector of lists, including only completed states.
;; also: reverse the states since we want to visit them in that order.
(define (complete-states)
(list->vector
(map (lambda (x)
(reverse (filter completed? (x.list))))
(S.list))))
;; remove non-completed states
(let ((all (complete-states)))
(define (walk d nt end)
(let/cc return
(while (> (length all[end]) 0)
;; pop each state off of the list in order to avoid
;; infinite recursion on the same rule.
(let ((item (pop all[end])))
(if (prod=? item.nt nt)
(let ((r '()))
(for-list x (reverse item.prod)
(if (not (terminal? x))
(let (((y end0) (walk (+ d 1) x end)))
(PUSH r y)
(set! end end0))
(begin
(set! end (- end 1))
(PUSH r (parse:t (toks.ref end)))
)))
(return (:tuple (parse:nt (prod->name nt) r) end)))))
)
(raise (:NoParse))
))
(let ((end0 (- (vector-length all) 1))
((r end) (walk 0 nt0 end0)))
(match r with
;; strip off outer result caused by the fake production
;; rule we put in of `root := root EOF`.
(parse:nt _ (root eof))
-> root
_ -> (impossible)
))))
(for tok tokgen
(step tok)
(toks.append tok)
(set! k (+ 1 k)))
(build-parse-tree)
))