mirror of
https://github.com/github/codeql.git
synced 2025-12-16 16:53:25 +01:00
93 lines
3.2 KiB
Python
93 lines
3.2 KiB
Python
|
|
'''
|
|
Explanation of the syntax
|
|
|
|
start_decl: The starting transition table
|
|
alias_decl: Declare short hand, e.g. digits = '0' or '1' or ...
|
|
table_decl: Declare transition table: name and list of transitions.
|
|
transition: Transitions from one state to another. Form is: state (or choice of states) -> new-state for possible-characters [ do action or actions; ]
|
|
action: Actions are:
|
|
"emit(kind [, text])": emits a token of kind using the given text or text from the stream. The token starts at the last mark and ends at the current location.
|
|
"push(table)": pushes a transition table to the stack.
|
|
"pop" : pops a transition table from the stack.
|
|
"pushback": pushes the last character back to the stream.
|
|
"mark": marks the current location as the start of the next token.
|
|
"emit_indent": Emits zero or more INDENT or DEDENT tokens depending on current indentation.
|
|
"newline": Increments the line number and sets the column offset back to zero.
|
|
|
|
States:
|
|
All states are given names.
|
|
The state "0" is the start state and always exists.
|
|
All other states are implicitly defined when used (this is for Python after all :)
|
|
'*' means all characters for which a transition is not explicitly defined.
|
|
So the transitions:
|
|
0 -> end for '\n'
|
|
0 -> other for *
|
|
0 -> a_b for 'a' or 'b'
|
|
mean that '0' will transition to 'other' for all characters other than 'a', 'b' and `\n`.
|
|
The order of transitions in the state machine description is irrelevant.
|
|
'''
|
|
|
|
|
|
# Lark grammar for the state-machine description syntax explained in the
# module docstring. This string is handed to Lark by the Parser class.
grammar = r"""
|
|
start : machine
|
|
machine : declaration+
|
|
declaration : alias_decl | table_decl | start_decl
|
|
start_decl : "start" ":" IDENTIFIER
|
|
table_decl : table_header "{" transition+ "}"
|
|
table_header : "table" IDENTIFIER ( "(" IDENTIFIER ")" )?
|
|
alias_decl : IDENTIFIER "=" choice
|
|
choice : item ( "or" item)*
|
|
item : alias | char
|
|
alias : IDENTIFIER
|
|
char : LITERAL
|
|
transition : state_choice "->" state "for" (choice | any) action_list?
|
|
any : "*"
|
|
state_choice : state ( "or" state)*
|
|
state : IDENTIFIER | DIGIT
|
|
action_list : "do" action ";" (action ";")*
|
|
action : emit | pop | push | pushback | mark | emit_indent | newline
|
|
emit : "emit" "(" IDENTIFIER optional_text? ")"
|
|
optional_text : "," LITERAL
|
|
pop : "pop"
|
|
push : "push" "(" IDENTIFIER ")"
|
|
pushback : "pushback"
|
|
mark : "mark"
|
|
emit_indent : "emit_indent"
|
|
newline : "newline"
|
|
|
|
LITERAL : ("\"" /[^"]/* "\"") | ("'" /[^']/* "'")
|
|
IDENTIFIER : LETTER ( LETTER | DIGIT | "_" )*
|
|
LETTER : "A".."Z" | "a".."z"
|
|
DIGIT : "0".."9"
|
|
WHITESPACE : (" " | "\t" | "\r" | "\n")+
|
|
|
|
%import common.NEWLINE
|
|
COMMENT : "#" /(.)*/ NEWLINE
|
|
%ignore WHITESPACE
|
|
%ignore COMMENT
|
|
"""
|
|
|
|
|
|
|
|
from lark import Lark
|
|
|
|
class Parser(Lark):
    """Lark parser preconfigured with this module's ``grammar``.

    The parser is built with the "earley" parsing algorithm and the
    "standard" lexer.
    """

    def __init__(self):
        super(Parser, self).__init__(grammar, parser="earley", lexer="standard")
|
|
|
|
def parse(src):
    """Parse *src* with a newly constructed ``Parser`` and return the result.

    A fresh parser is built on every call.
    """
    return Parser().parse(src)
|
|
|
|
def main():
    """Parse the file named on the command line and print its tree.

    The first command-line argument is the path of the file to parse.
    When it is missing, the process exits with a usage message instead of
    failing with a bare ``IndexError``.
    """
    import sys

    if len(sys.argv) < 2:
        prog = sys.argv[0] if sys.argv else "parser"
        # A string argument makes sys.exit print it to stderr and use a
        # non-zero exit status.
        sys.exit("usage: %s FILE" % prog)

    # Named ``path`` rather than ``file`` to avoid shadowing the Python 2
    # builtin of that name.
    path = sys.argv[1]
    with open(path) as fd:
        tree = parse(fd.read())
    print(tree.pretty())
|
|
|
|
if __name__ == "__main__":
    # Run the command-line entry point only when executed as a script.
    main()
|