Compare commits

...

16 Commits

Author SHA1 Message Date
Calum Grant
8938cd0db8 WIP 2024-05-20 08:49:19 +01:00
Calum Grant
89738825ac WIP 2024-05-18 08:18:55 +01:00
Calum Grant
5749fa51b0 WIP 2024-05-18 08:07:17 +01:00
Calum Grant
cc4aa16257 Add test case 2024-05-18 07:40:09 +01:00
Calum Grant
31612b0028 Final version for now 2024-05-17 18:01:26 +01:00
Calum Grant
d88aae424e Add a test 2024-05-17 17:12:28 +01:00
Calum Grant
faf480c995 Reformat 2024-05-17 17:00:00 +01:00
Calum Grant
b200caba1b It works! 2024-05-17 16:50:10 +01:00
Calum Grant
26f7bf9ac4 WIP 2024-05-17 16:29:29 +01:00
Calum Grant
64ecf647a4 Write out closures a bit better 2024-05-17 14:20:21 +01:00
Calum Grant
2faf9a71e6 Important fix 2024-05-17 13:37:58 +01:00
Calum Grant
d7500b14c3 WIP 2024-05-17 13:30:58 +01:00
Calum Grant
fd8c1c4e97 WIP 2024-05-17 13:11:12 +01:00
Calum Grant
a05cb8ad02 WIP 2024-05-17 11:34:14 +01:00
Calum Grant
bcee43d806 WIP 2024-05-17 08:33:06 +01:00
Calum Grant
b5432cf08a WIP 2024-05-17 07:24:11 +01:00
5 changed files with 506 additions and 0 deletions

View File

@@ -0,0 +1,441 @@
/*
* GLR parser framework.
* This is purely for fun.
*/
// Specify the grammar as a set of rules of the form "A -> B C D"
// The right hand side can be empty
signature string grammar();
// Specify the input as a sequence of tokens, with the token being equal to a symbol in the grammar
// There is no need to terminate with $ as this is done automatically.
// The input is indexed by an integer starting at 0.
signature string input(int i);
// A GLR parser
module GLR<grammar/0 g> {
// We need to add a special rule that includes the end of file symbol $
string initialRule() { result = "Start_ -> G $" }
// Gets the left hand side of a rule - for internal use
string getRuleLhs(Rule rule) { result = rule.splitAt(" ", 0) }
// Gets a right hand side of a rule - for internal use
string getRuleRhs(Rule rule, int i) { i >= 0 and result = rule.splitAt(" ", 2 + i) }
// A rule, which is either from the supplied rules, or the bootstrap rule
class Rule extends string {
Rule() { this = [g(), initialRule()] }
NonTerminal getLhs() { result = getRuleLhs(this) }
Symbol getRhs(int i) { result = getRuleRhs(this, i) }
int getLength() { result = count(int i | exists(this.getRhs(i))) }
}
// Any symbol in the grammar
class Symbol extends string {
Symbol() { exists(Rule rule | this = getRuleLhs(rule) or this = getRuleRhs(rule, _)) }
// Determines whether this symbol can be empty
// There must exist one rule that may be empty
// For terminal symbols, with no rules, this will be false
predicate maybeEmpty() {
exists(Rule rule |
rule.getLhs() = this and
forall(int i | exists(rule.getRhs(i)) | rule.getRhs(i).(NonTerminal).maybeEmpty())
)
}
// Gets a terminal symbol that may start this rule.
// This is needed in order to find lookahead symbols
Terminal getFirst() {
// We are a terminal
result = this
or
// We are a nonterminal
// Also handle the possibility that symbols may be empty
exists(Rule rule, int i |
rule.getLhs() = this and
forall(int j | j in [0 .. i - 1] | rule.getRhs(j).(Symbol).maybeEmpty()) and
result = rule.getRhs(i).(Symbol).getFirst()
)
}
}
// A non-terminal symbol, which means it has at leaset one rule in the grammar
class NonTerminal extends Symbol {
NonTerminal() { this = getRuleLhs(any(Rule r)) }
}
// A terminal symbol, as implied by the fact that it has no rules in the grammar
class Terminal extends Symbol {
Terminal() { not this instanceof NonTerminal }
}
// The end of file symbol
class Eof extends Terminal {
Eof() { this = "$" }
}
// What terminal symbol can follow a symbol in a rule
// Used for calculate lookaheads
private Terminal follows(Rule rule, int dot, Terminal lookahead) {
dot >= 0 and
(
dot + 1 = rule.getLength() and lookahead = result
or
rule.getRhs(dot).maybeEmpty() and result = follows(rule, dot + 1, lookahead)
or
result = rule.getRhs(dot + 1).getFirst()
)
}
// For a given rule, what symbols (terminals or non-terminals) can be shifted?
// We automatically implement closures at this point
private predicate transition(
Rule rule1, int dot1, Terminal lookahead1, Symbol symbol, Rule rule2, int dot2,
Terminal lookahead2
) {
// We shift the symbol directly (a terminal or non-terminal)
rule1.getRhs(dot1) = symbol and
rule2 = rule1 and
dot2 = dot1 + 1 and
lookahead2 = lookahead1
or
exists(Rule rule3 | rule3.getLhs() = rule1.getRhs(dot1) |
transition(rule3, 0, follows(rule1, dot1, lookahead1), symbol, rule2, dot2, lookahead2)
)
}
bindingset[dot, lookahead]
string itemToString(Rule rule, int dot, Terminal lookahead) {
result =
rule.getLhs() + " -> " + concat(int i | i in [0 .. dot - 1] | rule.getRhs(i) + " " order by i)
+ ". " +
concat(int i | i in [dot .. rule.getLength() - 1] | rule.getRhs(i) + " " order by i) + "{" +
lookahead + "}"
}
// Convert a kernel item (excluding lookahead) into a string
string kernelItem(Rule rule, int dot) {
dot = [0 .. rule.getLength()] and
result =
rule.getLhs() + " -> " + concat(int i | i in [0 .. dot - 1] | rule.getRhs(i) + " " order by i)
+ ". " +
concat(int i | i in [dot .. rule.getLength() - 1] | rule.getRhs(i) + " " order by i)
}
// The initial state of the parser, expressed as an item.
predicate initialState(Rule rule, int dot, Terminal lookahead) {
rule = initialRule() and dot = 0 and lookahead = "$"
}
// What are the items in the state graph
// This can be computed without actually computing the states
predicate item(Rule rule, int dot, Terminal lookahead) {
initialState(rule, dot, lookahead)
or
exists(Rule rule0, int dot0, Terminal lookahead0, Symbol s |
item(rule0, dot0, lookahead0) and
transition(rule0, dot0, lookahead0, s, rule, dot, lookahead)
)
}
// The initial state string (just an identifier).
// We use strings to identify states
string getInitialState(Rule rule, int dot, Terminal lookahead) {
initialState(rule, dot, lookahead) and
result = "$" + kernelItem(rule, dot)
}
// A state after a transition from "previous"
// This is really the heart of the state computation graph, where we'll concat a string of all items
// reachable from the previous state, and give the items in the new state.
// We can decide whether to go LALR or CLR at this point, by deciding how to construct the state string.
// In this implementation, we'll keep it as small as possible using LALR by removing lookaheads from the state identifier (using kernelItem())
// Amazingly, recursion doesn't blow up by pulling transitionState() out of the concat.
string transitionState(string previous, Symbol s, Rule rule, int dot, Terminal lookahead) {
exists(Rule rule0, int dot0, Terminal lookahead0 | item(rule0, dot0, lookahead0) |
previous =
[transitionState(_, _, rule0, dot0, lookahead0), getInitialState(rule0, dot0, lookahead0)] and
transition(rule0, dot0, lookahead0, s, rule, dot, lookahead) and
// Concern here that if there are multiple transitions from the previous state, then
// we'll get multiple result strings
result =
concat(string itemstring |
exists(Rule rule3, int dot3, Terminal lookahead3 |
transition(rule0, dot0, lookahead0, s, rule3, dot3, lookahead3) and
itemstring = kernelItem(rule3, dot3)
)
|
itemstring + "; " order by itemstring
)
)
}
// Helper class for managing states in the state machine
// Each state is a constructed string, but getNumber() converts states to integers.
class State extends string {
State() {
this = getInitialState(_, _, _)
or
this = transitionState(_, _, _, _, _)
}
// Gets a kernel item from the state
predicate hasItem(Rule rule, int dot, Terminal lookahead) {
this = getInitialState(rule, dot, lookahead)
or
this = transitionState(_, _, rule, dot, lookahead)
}
// Gets a transition in this state, by shifting a terminal or non-terminal symbol
State getTransition(Symbol s) { result = transitionState(this, s, _, _, _) }
// Gets an internal state number for this state
int getNumber() { this = rank[result](State s) }
// Gets a goto transition for this state
State goto(NonTerminal s) { result = this.getTransition(s) }
// In this state, when the lookahead symbol is `s`, we can shift to `result`
State shift(Terminal s) { result = this.getTransition(s) }
// In this state, when the lookahead symbol is `s`, we can reduce rule `result`
Rule reduce(Terminal s) { this.hasItem(result, result.getLength(), s) }
// This state has a reduce/reduce conflict, given by the possible shift to `state` or reduce by `rule`
predicate shiftReduceConflict(Terminal s, State state, Rule rule) {
this.shift(s) = state and this.reduce(s) = rule
}
// This state has a reduce/reduce conflict, given by the possible reduce by `rule1` or `rule2`
predicate reduceReduceConflict(Terminal s, Rule rule1, Rule rule2) {
this.reduce(s) = rule1 and this.reduce(s) = rule2 and rule1 != rule2
}
// Gets all of the closure (expanded) items in this state
// This is only used for debugging/printing.
predicate hasClosure(Rule rule, int dot, Terminal lookahead) {
this.hasItem(rule, dot, lookahead)
or
exists(Rule rule0, int dot0, Terminal lookahead0 |
this.hasClosure(rule0, dot0, lookahead0) and
rule.getLhs() = rule0.getRhs(dot0) and
dot = 0 and
lookahead = follows(rule0, dot0, lookahead0)
)
}
// Gets a string showing the closure of the state for debugging purposes
string getClosureString() {
result =
concat(string s |
exists(Rule rule, int dot, Terminal lookahead | this.hasClosure(rule, dot, lookahead) |
s = itemToString(rule, dot, lookahead)
)
|
s + "; "
)
}
}
// The initial state
class InitialState extends State {
InitialState() { this = getInitialState(_, _, _) }
}
// Module containing the parsing algorithm
module GLRparser<input/1 i> {
// We need to insert $ at the end of the input
string getInputToken(int inputPosition) {
result = i(inputPosition)
or
inputPosition = max(int p | exists(i(p))) + 1 and result = "$"
}
/*
* The parse stack implements a linked list where the "previous" column is the previous node on the stack.
* With reductions `TReduce` on the stack, `skip()` skips large chunks of the stack.
*/
newtype Stack =
TEmpty() or
TShift(int position, State state, Terminal terminal, ParseNode previous) {
position = previous.getInputPosition() + 1 and
terminal = getInputToken(position - 1) and
state = previous.getState().shift(terminal)
} or
TReduce(int position, State state, Rule rule, ParseNode previous) {
position = previous.getInputPosition() and
rule = previous.getState().reduce(getInputToken(position)) and
state = previous.skip(rule.getLength()).getState().goto(rule.getLhs())
}
// A node in the parse tree, corresponding exactly to a node in the stack
class ParseNode extends Stack {
abstract string toString();
// The input position of the node
// Which is actually the rightmost position of the node due to LR parsing
abstract int getInputPosition();
// Gets the state on top of the stack
abstract State getState();
// Gets the previous node on the stack
// with the stack being implemented as a linked list.
abstract ParseNode getPrevious();
// Gets the symbol of this parse node
abstract Symbol getSymbol();
string debugString() { result = this.toString() + "->" + this.getPrevious().debugString() }
// Move i places up the parse stack.
ParseNode skip(int i) {
i = 0 and result = this
or
i > 0 and result = this.getPrevious().skip(i - 1)
}
predicate isReachable() { exists(Root r | this = r.getAChild*()) }
ParseNode getChild(int i) { none() }
ParseNode getAChild() { result = this.getChild(_) }
}
// The bottom of the stack, which needs a representation
class EmptyNode extends ParseNode, TEmpty {
override string toString() { result = "Empty" }
override int getInputPosition() { result = 0 }
override State getState() { result instanceof InitialState }
override ParseNode getPrevious() { none() }
override Symbol getSymbol() { none() }
override string debugString() { result = "ø" }
}
// A terminal node, created by shifting a token onto the stack
class TerminalNode extends ParseNode, TShift {
int position;
State state;
Terminal terminal;
ParseNode previous;
TerminalNode() { this = TShift(position, state, terminal, previous) }
override string toString() { result = terminal + "@" + position }
override string debugString() {
result =
terminal + "@" + position + " state " + state.getNumber() + "->" + previous.debugString()
}
override State getState() { result = state }
override ParseNode getPrevious() { result = previous }
// Gets the input position that matched this node
// This will be on the right hand side of the node.
override int getInputPosition() { result = position }
override Symbol getSymbol() { result = terminal }
}
// A non-terminal node, created by reducing a rule on the stack
class NonTerminalNode extends ParseNode, TReduce {
int position;
State state;
Rule rule;
ParseNode previous;
NonTerminalNode() { this = TReduce(position, state, rule, previous) }
override string toString() { result = rule + "@" + position }
override State getState() { result = state }
// When we reduce, we collapse the top of the stack.
// Since we can't actually do that in CodeQL, we just skip the correct number
// of places in the stack instead.
override ParseNode getPrevious() { result = previous.skip(rule.getLength()) }
override int getInputPosition() { result = position }
override Symbol getSymbol() { result = rule.getLhs() }
// The children of this node are stored on the stack below us.
override ParseNode getChild(int i) {
i in [0 .. rule.getLength() - 1] and
result = previous.skip(rule.getLength() - i - 1)
}
}
// The root node, which is called G.
// There can be multiple root nodes.
class Root extends NonTerminalNode {
Root() { rule.getLhs() = "G" }
}
// Dump the parser stack
predicate dumpNodes(
ParseNode node, int position, State state, string closure, int stateNumber, string debug
) {
position = node.getInputPosition() and
state = node.getState() and
stateNumber = state.getNumber() and
closure = state.getClosureString() and
debug = node.debugString()
}
// Simple dump of the parse tree
predicate tree(NonTerminalNode node, int i, ParseNode child) {
node.getChild(i) = child and child.isReachable()
}
// Gives the position of the syntax error if there is one
int syntax_error_position() {
result = max(int i | exists(ParseNode node | i = node.getInputPosition())) and
not exists(Root r)
}
}
// A combination of the actions and gotos table in one convenient output
predicate parseTable(int state, string closure, Symbol symbol, string action) {
exists(State s | s.getNumber() = state and closure = s.getClosureString() |
action = "shift to state " + s.shift(symbol).getNumber() or
action = "reduce by rule " + s.reduce(symbol) or
action = "goto state " + s.goto(symbol).getNumber()
)
}
string conflicts() {
exists(int sr, int rr |
sr = count(State s | s.shiftReduceConflict(_, _, _)) and
rr = count(State s | s.reduceReduceConflict(_, _, _))
|
(sr > 0 or rr > 0) and
result = "There are " + sr + " shift/reduce conflicts and " + rr + " reduce/reduce conflicts."
)
}
// This should be empty
predicate debugStates(State state1, Symbol s) {
count(state1.getTransition(s))>1
}
}
// Just some debugging information needed to quickeval predicates in modules
string test_input(int i) { result = "I+I-I".charAt(i) }
string test_grammar() { result = ["G -> E", "E -> E + I", "E -> E - I", "E -> I"] }
module DebugContext = GLR<test_grammar/0>::GLRparser<test_input/1>;

View File

@@ -0,0 +1,13 @@
import glr
string grammar1() {
result = ["G -> E", "E -> x", "E -> E x"]
}
string input1(int i) { result = "xxxx".charAt(i) }
module Test1 = GLR<grammar1/0>::GLRparser<input1/1>;
from Test1::ParseNode parent, int i, Test1::ParseNode child
where Test1::tree(parent,i,child)
select parent, i, child

View File

@@ -0,0 +1,18 @@
import glr
string grammar2() {
result = ["G -> E", "E -> E - I", "E -> I"]
}
query string conflicts() { result = GLR<grammar2/0>::conflicts() }
query int syntax_error() { result = Test2::syntax_error_position() }
string input2(int i) { result = "I-I-I".charAt(i) }
module Test2 = GLR<grammar2/0>::GLRparser<input2/1>;
from Test2::ParseNode parent, int i, Test2::ParseNode child
where Test2::tree(parent,i,child)
select parent, i, child

View File

@@ -0,0 +1,17 @@
import glr
string grammar3() {
result = ["G -> S", "S -> x", "S -> x S x"]
}
string input3(int i) { result = "xxxxxxx".charAt(i) }
module Test3 = GLR<grammar3/0>::GLRparser<input3/1>;
query string conflicts() { result = GLR<grammar3/0>::conflicts() }
query int syntax_error() { result = Test3::syntax_error_position() }
from Test3::ParseNode parent, int i, Test3::ParseNode child
where Test3::tree(parent,i,child)
select parent, i, child

View File

@@ -0,0 +1,17 @@
import glr
string grammar4() {
result = ["G -> E", "E -> E + I", "E -> E - I", "E -> I"]
}
string input4(int i) { result = "I+I-I".charAt(i) }
module Test4 = GLR<grammar4/0>::GLRparser<input4/1>;
query string conflicts() { result = GLR<grammar4/0>::conflicts() }
query int syntax_error() { result = Test4::syntax_error_position() }
from Test4::ParseNode parent, int i, Test4::ParseNode child
where Test4::tree(parent,i,child)
select parent, i, child