Python: Copy Python extractor to codeql repo

Taus
2024-02-28 15:15:21 +00:00
parent 297a17975d
commit 6dec323cfc
369 changed files with 165346 additions and 0 deletions


@@ -0,0 +1,19 @@
Copyright © 2017 Erez Shinan
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


@@ -0,0 +1,7 @@
from .tree import Tree
from .visitors import Transformer, Visitor, v_args, Discard
from .visitors import InlineTransformer, inline_args # XXX Deprecated
from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters
from .lark import Lark
__version__ = "0.6.3"
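The exports above are the package's entire public surface. A minimal end-to-end sketch (toy grammar made up for illustration, assuming the bundled lark 0.6.x API):

from lark import Lark

# Toy grammar: the literal word "hello" followed by a lowercase name.
parser = Lark(r"""
    start: "hello" NAME
    NAME: /[a-z]+/
    %ignore " "
""")

tree = parser.parse("hello world")
print(tree.pretty())   # a 'start' node holding a single NAME token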


@@ -0,0 +1,87 @@
import re
import sys
from .utils import get_regexp_width
Py36 = (sys.version_info[:2] >= (3, 6))
###{standalone
###}
class LexerConf:
def __init__(self, tokens, ignore=(), postlex=None, callbacks=None):
self.tokens = tokens
self.ignore = ignore
self.postlex = postlex
self.callbacks = callbacks or {}
class ParserConf:
def __init__(self, rules, callback, start):
self.rules = rules
self.callback = callback
self.start = start
class Pattern(object):
def __init__(self, value, flags=()):
self.value = value
self.flags = frozenset(flags)
def __repr__(self):
return repr(self.to_regexp())
# Pattern Hashing assumes all subclasses have a different priority!
def __hash__(self):
return hash((type(self), self.value, self.flags))
def __eq__(self, other):
return type(self) == type(other) and self.value == other.value and self.flags == other.flags
def to_regexp(self):
raise NotImplementedError()
if Py36:
# Python 3.6 changed syntax for flags in regular expression
def _get_flags(self, value):
for f in self.flags:
value = ('(?%s:%s)' % (f, value))
return value
else:
def _get_flags(self, value):
for f in self.flags:
value = ('(?%s)' % f) + value
return value
class PatternStr(Pattern):
def to_regexp(self):
return self._get_flags(re.escape(self.value))
@property
def min_width(self):
return len(self.value)
max_width = min_width
class PatternRE(Pattern):
def to_regexp(self):
return self._get_flags(self.value)
@property
def min_width(self):
return get_regexp_width(self.to_regexp())[0]
@property
def max_width(self):
return get_regexp_width(self.to_regexp())[1]
class TokenDef(object):
def __init__(self, name, pattern, priority=1):
assert isinstance(pattern, Pattern), pattern
self.name = name
self.pattern = pattern
self.priority = priority
def __repr__(self):
return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
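A short illustrative sketch of the classes above (assuming the module is importable as lark.common, as in upstream lark): PatternStr escapes its literal, PatternRE passes the regexp through, and both wrap any flags via _get_flags().

from lark.common import PatternStr, PatternRE, TokenDef

dot = PatternStr('.')                  # literal, escaped when converted to a regexp
num = PatternRE(r'\d+', flags=['i'])   # raw regexp carrying the 'i' flag

print(dot.to_regexp())                 # \.
print(num.to_regexp())                 # (?i:\d+) on Python 3.6+, (?i)\d+ before
print(TokenDef('NUMBER', num, priority=2))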


@@ -0,0 +1,86 @@
from .utils import STRING_TYPE
class LarkError(Exception):
pass
class GrammarError(LarkError):
pass
class ParseError(LarkError):
pass
class LexError(LarkError):
pass
class UnexpectedInput(LarkError):
pos_in_stream = None
def get_context(self, text, span=40):
pos = self.pos_in_stream
start = max(pos - span, 0)
end = pos + span
before = text[start:pos].rsplit('\n', 1)[-1]
after = text[pos:end].split('\n', 1)[0]
return before + after + '\n' + ' ' * len(before) + '^\n'
def match_examples(self, parse_fn, examples):
""" Given a parser instance and a dictionary mapping some label with
some malformed syntax examples, it'll return the label for the
example that bests matches the current error.
"""
assert self.state is not None, "Not supported for this exception"
candidate = None
for label, example in examples.items():
assert not isinstance(example, STRING_TYPE)
for malformed in example:
try:
parse_fn(malformed)
except UnexpectedInput as ut:
if ut.state == self.state:
try:
if ut.token == self.token: # Try exact match first
return label
except AttributeError:
pass
if not candidate:
candidate = label
return candidate
class UnexpectedCharacters(LexError, UnexpectedInput):
def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None):
message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)
self.line = line
self.column = column
self.allowed = allowed
self.considered_tokens = considered_tokens
self.pos_in_stream = lex_pos
self.state = state
message += '\n\n' + self.get_context(seq)
if allowed:
message += '\nExpecting: %s\n' % allowed
super(UnexpectedCharacters, self).__init__(message)
class UnexpectedToken(ParseError, UnexpectedInput):
def __init__(self, token, expected, considered_rules=None, state=None):
self.token = token
self.expected = expected # XXX str shouldn't be necessary
self.line = getattr(token, 'line', '?')
self.column = getattr(token, 'column', '?')
self.considered_rules = considered_rules
self.state = state
self.pos_in_stream = getattr(token, 'pos_in_stream', None)
message = ("Unexpected token %r at line %s, column %s.\n"
"Expected: %s\n"
% (token, self.line, self.column, ', '.join(self.expected)))
super(UnexpectedToken, self).__init__(message)
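get_context() and match_examples() above drive the error messages that the grammar loader later in this commit produces. A small sketch of get_context() with a made-up one-terminal grammar:

from lark import Lark, UnexpectedInput

parser = Lark('start: "a"+', parser='lalr')
text = 'aab'
try:
    parser.parse(text)
except UnexpectedInput as e:
    # Prints the offending line with a caret under the unexpected character.
    print(e.get_context(text))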


@@ -0,0 +1,60 @@
class Symbol(object):
is_term = NotImplemented
def __init__(self, name):
self.name = name
def __eq__(self, other):
assert isinstance(other, Symbol), other
return self.is_term == other.is_term and self.name == other.name
def __ne__(self, other):
return not (self == other)
def __hash__(self):
return hash(self.name)
def __repr__(self):
return '%s(%r)' % (type(self).__name__, self.name)
class Terminal(Symbol):
is_term = True
def __init__(self, name, filter_out=False):
self.name = name
self.filter_out = filter_out
class NonTerminal(Symbol):
is_term = False
class Rule(object):
"""
origin : a symbol
expansion : a list of symbols
"""
def __init__(self, origin, expansion, alias=None, options=None):
self.origin = origin
self.expansion = expansion
self.alias = alias
self.options = options
def __str__(self):
return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion)))
def __repr__(self):
return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options)
class RuleOptions:
def __init__(self, keep_all_tokens=False, expand1=False, priority=None):
self.keep_all_tokens = keep_all_tokens
self.expand1 = expand1
self.priority = priority
def __repr__(self):
return 'RuleOptions(%r, %r, %r)' % (
self.keep_all_tokens,
self.expand1,
self.priority,
)
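These classes are the compiled-grammar representation that the rest of the package passes around. A tiny construction sketch (assuming the module is importable as lark.grammar, as in upstream lark; the rule itself is invented):

from lark.grammar import NonTerminal, Rule, RuleOptions, Terminal

pair = Rule(NonTerminal('pair'),
            [Terminal('NAME'), Terminal('COLON', filter_out=True), NonTerminal('value')],
            options=RuleOptions(keep_all_tokens=False, expand1=False))

print(pair)          # '<origin : expansion>' form, per __str__ above
print(pair.options)  # RuleOptions(False, False, None)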


@@ -0,0 +1,49 @@
//
// Numbers
//
DIGIT: "0".."9"
HEXDIGIT: "a".."f"|"A".."F"|DIGIT
INT: DIGIT+
SIGNED_INT: ["+"|"-"] INT
DECIMAL: INT "." INT? | "." INT
// float = /-?\d+(\.\d+)?([eE][+-]?\d+)?/
_EXP: ("e"|"E") SIGNED_INT
FLOAT: INT _EXP | DECIMAL _EXP?
SIGNED_FLOAT: ["+"|"-"] FLOAT
NUMBER: FLOAT | INT
SIGNED_NUMBER: ["+"|"-"] NUMBER
//
// Strings
//
//STRING: /"(\\\"|\\\\|[^"\n])*?"i?/
STRING_INNER: ("\\\""|/[^"]/)
ESCAPED_STRING: "\"" STRING_INNER* "\""
//
// Names (Variables)
//
LCASE_LETTER: "a".."z"
UCASE_LETTER: "A".."Z"
LETTER: UCASE_LETTER | LCASE_LETTER
WORD: LETTER+
CNAME: ("_"|LETTER) ("_"|LETTER|DIGIT)*
//
// Whitespace
//
WS_INLINE: (" "|/\t/)+
WS: /[ \t\f\r\n]/+
CR : /\r/
LF : /\n/
NEWLINE: (CR? LF)+
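These shared terminals are meant to be pulled into user grammars with %import. A quick sketch (hypothetical grammar, assuming this file is installed as package data under lark.grammars so the loader further down can find it):

from lark import Lark

parser = Lark(r"""
    start: SIGNED_NUMBER+

    %import common.SIGNED_NUMBER
    %import common.WS
    %ignore WS
""", parser='lalr')

print(parser.parse("12 -3.5 7e2"))   # a 'start' tree with three SIGNED_NUMBER tokens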


@@ -0,0 +1,55 @@
"Provides Indentation services for languages with indentation similar to Python"
from .lexer import Token
###{standalone
class Indenter:
def __init__(self):
self.paren_level = 0
self.indent_level = [0]
def handle_NL(self, token):
if self.paren_level > 0:
return
yield token
indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces
indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len
if indent > self.indent_level[-1]:
self.indent_level.append(indent)
yield Token.new_borrow_pos(self.INDENT_type, indent_str, token)
else:
while indent < self.indent_level[-1]:
self.indent_level.pop()
yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token)
assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1])
def process(self, stream):
for token in stream:
if token.type == self.NL_type:
for t in self.handle_NL(token):
yield t
else:
yield token
if token.type in self.OPEN_PAREN_types:
self.paren_level += 1
elif token.type in self.CLOSE_PAREN_types:
self.paren_level -= 1
assert self.paren_level >= 0
while len(self.indent_level) > 1:
self.indent_level.pop()
yield Token(self.DEDENT_type, '')
assert self.indent_level == [0], self.indent_level
# XXX Hack for ContextualLexer. Maybe there's a more elegant solution?
@property
def always_accept(self):
return (self.NL_type,)
###}
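Indenter is abstract: a subclass supplies the token-type names and tab width, and an instance is passed as postlex so that process() can weave _INDENT/_DEDENT tokens into the stream. A sketch along the lines of the usual tree-grammar example (grammar and names are illustrative, assuming the module is importable as lark.indenter):

from lark import Lark
from lark.indenter import Indenter

class TreeIndenter(Indenter):
    NL_type = '_NL'
    OPEN_PAREN_types = []
    CLOSE_PAREN_types = []
    INDENT_type = '_INDENT'
    DEDENT_type = '_DEDENT'
    tab_len = 8

parser = Lark(r"""
    ?start: _NL* tree
    tree: NAME _NL [_INDENT tree+ _DEDENT]

    NAME: /\w+/
    _NL: /(\r?\n[\t ]*)+/

    %declare _INDENT _DEDENT
""", parser='lalr', postlex=TreeIndenter())

print(parser.parse("a\n    b\n    c\n").pretty())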


@@ -0,0 +1,235 @@
from __future__ import absolute_import
import os
import time
from collections import defaultdict
from io import open
from .utils import STRING_TYPE
from .load_grammar import load_grammar
from .tree import Tree
from .common import LexerConf, ParserConf
from .lexer import Lexer, TraditionalLexer
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import get_frontend
class LarkOptions(object):
"""Specifies the options for Lark
"""
OPTIONS_DOC = """
parser - Decides which parser engine to use, "earley" or "lalr". (Default: "earley")
Note: "lalr" requires a lexer
lexer - Decides whether or not to use a lexer stage
"standard": Use a standard lexer
"contextual": Stronger lexer (only works with parser="lalr")
"dynamic": Flexible and powerful (only with parser="earley")
"dynamic_complete": Same as dynamic, but tries *every* variation
of tokenizing possible. (only with parser="earley")
"auto" (default): Choose for me based on grammar and parser
ambiguity - Decides how to handle ambiguity in the parse. Only relevant if parser="earley"
"resolve": The parser will automatically choose the simplest derivation
(it chooses consistently: greedy for tokens, non-greedy for rules)
"explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest).
transformer - Applies the transformer to every parse tree
debug - Affects verbosity (default: False)
keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False)
cache_grammar - Cache the Lark grammar (Default: False)
postlex - Lexer post-processing (Requires standard lexer. Default: None)
start - The start symbol (Default: start)
profile - Measure run-time usage in Lark. Read results from the profiler property (Default: False)
propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches.
lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
"""
__doc__ = OPTIONS_DOC
def __init__(self, options_dict):
o = dict(options_dict)
self.debug = bool(o.pop('debug', False))
self.keep_all_tokens = bool(o.pop('keep_all_tokens', False))
self.tree_class = o.pop('tree_class', Tree)
self.cache_grammar = o.pop('cache_grammar', False)
self.postlex = o.pop('postlex', None)
self.parser = o.pop('parser', 'earley')
self.lexer = o.pop('lexer', 'auto')
self.transformer = o.pop('transformer', None)
self.start = o.pop('start', 'start')
self.profile = o.pop('profile', False)
self.ambiguity = o.pop('ambiguity', 'auto')
self.propagate_positions = o.pop('propagate_positions', False)
self.earley__predict_all = o.pop('earley__predict_all', False)
self.lexer_callbacks = o.pop('lexer_callbacks', {})
assert self.parser in ('earley', 'lalr', 'cyk', None)
if self.parser == 'earley' and self.transformer:
raise ValueError('Cannot specify an embedded transformer when using the Earley algorithm. '
'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. lalr)')
if o:
raise ValueError("Unknown options: %s" % o.keys())
class Profiler:
def __init__(self):
self.total_time = defaultdict(float)
self.cur_section = '__init__'
self.last_enter_time = time.time()
def enter_section(self, name):
cur_time = time.time()
self.total_time[self.cur_section] += cur_time - self.last_enter_time
self.last_enter_time = cur_time
self.cur_section = name
def make_wrapper(self, name, f):
def wrapper(*args, **kwargs):
last_section = self.cur_section
self.enter_section(name)
try:
return f(*args, **kwargs)
finally:
self.enter_section(last_section)
return wrapper
class Lark:
def __init__(self, grammar, **options):
"""
grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax)
options : a dictionary controlling various aspects of Lark.
"""
self.options = LarkOptions(options)
# Some, but not all file-like objects have a 'name' attribute
try:
self.source = grammar.name
except AttributeError:
self.source = '<string>'
cache_file = "larkcache_%s" % str(hash(grammar)%(2**32))
else:
cache_file = "larkcache_%s" % os.path.basename(self.source)
# Drain file-like objects to get their contents
try:
read = grammar.read
except AttributeError:
pass
else:
grammar = read()
assert isinstance(grammar, STRING_TYPE)
if self.options.cache_grammar:
raise NotImplementedError("Not available yet")
assert not self.options.profile, "Feature temporarily disabled"
self.profiler = Profiler() if self.options.profile else None
if self.options.lexer == 'auto':
if self.options.parser == 'lalr':
self.options.lexer = 'contextual'
elif self.options.parser == 'earley':
self.options.lexer = 'dynamic'
elif self.options.parser == 'cyk':
self.options.lexer = 'standard'
else:
assert False, self.options.parser
lexer = self.options.lexer
assert lexer in ('standard', 'contextual', 'dynamic', 'dynamic_complete') or issubclass(lexer, Lexer)
if self.options.ambiguity == 'auto':
if self.options.parser == 'earley':
self.options.ambiguity = 'resolve'
else:
disambig_parsers = ['earley', 'cyk']
assert self.options.parser in disambig_parsers, (
'Only %s supports disambiguation right now') % ', '.join(disambig_parsers)
assert self.options.ambiguity in ('resolve', 'explicit', 'auto', 'resolve__antiscore_sum')
# Parse the grammar file and compose the grammars (TODO)
self.grammar = load_grammar(grammar, self.source)
# Compile the EBNF grammar into BNF
self.terminals, self.rules, self.ignore_tokens = self.grammar.compile()
self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks)
if self.options.parser:
self.parser = self._build_parser()
elif lexer:
self.lexer = self._build_lexer()
if self.profiler: self.profiler.enter_section('outside_lark')
__init__.__doc__ = "\nOPTIONS:" + LarkOptions.OPTIONS_DOC
def _build_lexer(self):
return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
def _build_parser(self):
self.parser_class = get_frontend(self.options.parser, self.options.lexer)
self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr')
callback = self._parse_tree_builder.create_callback(self.options.transformer)
if self.profiler:
for f in dir(callback):
if not (f.startswith('__') and f.endswith('__')):
setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f)))
parser_conf = ParserConf(self.rules, callback, self.options.start)
return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
@classmethod
def open(cls, grammar_filename, rel_to=None, **options):
"""Create an instance of Lark with the grammar given by its filename
If rel_to is provided, the function will find the grammar filename in relation to it.
Example:
>>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr")
Lark(...)
"""
if rel_to:
basepath = os.path.dirname(rel_to)
grammar_filename = os.path.join(basepath, grammar_filename)
with open(grammar_filename, encoding='utf8') as f:
return cls(f, **options)
def __repr__(self):
return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source, self.options.parser, self.options.lexer)
def lex(self, text):
"Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'"
if not hasattr(self, 'lexer'):
self.lexer = self._build_lexer()
stream = self.lexer.lex(text)
if self.options.postlex:
return self.options.postlex.process(stream)
return stream
def parse(self, text):
"Parse the given text, according to the options provided. Returns a tree, unless specified otherwise."
return self.parser.parse(text)
# if self.profiler:
# self.profiler.enter_section('lex')
# l = list(self.lex(text))
# self.profiler.enter_section('parse')
# try:
# return self.parser.parse(l)
# finally:
# self.profiler.enter_section('outside_lark')
# else:
# l = list(self.lex(text))
# return self.parser.parse(l)
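Most of the knobs documented in OPTIONS_DOC surface directly as keyword arguments to Lark. A small sketch (hypothetical grammar) of two of them, start and keep_all_tokens:

from lark import Lark

grammar = r"""
    assign: NAME "=" NUMBER ";"

    NAME: /[a-z]+/
    NUMBER: /\d+/
    %ignore " "
"""

# By default the anonymous "=" and ";" tokens are filtered out of the tree...
terse = Lark(grammar, parser='lalr', start='assign')
# ...while keep_all_tokens=True keeps the punctuation.
verbose = Lark(grammar, parser='lalr', start='assign', keep_all_tokens=True)

print(terse.parse("x = 1;").children)
print(verbose.parse("x = 1;").children)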


@@ -0,0 +1,252 @@
## Lexer Implementation
import re
from .utils import Str, classify
from .common import PatternStr, PatternRE, TokenDef
from .exceptions import UnexpectedCharacters, LexError
###{standalone
class Token(Str):
__slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')
def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
self = super(Token, cls).__new__(cls, value)
self.type = type_
self.pos_in_stream = pos_in_stream
self.value = value
self.line = line
self.column = column
self.end_line = None
self.end_column = None
return self
@classmethod
def new_borrow_pos(cls, type_, value, borrow_t):
return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)
def __reduce__(self):
return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, ))
def __repr__(self):
return 'Token(%s, %r)' % (self.type, self.value)
def __deepcopy__(self, memo):
return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)
def __eq__(self, other):
if isinstance(other, Token) and self.type != other.type:
return False
return Str.__eq__(self, other)
__hash__ = Str.__hash__
class LineCounter:
def __init__(self):
self.newline_char = '\n'
self.char_pos = 0
self.line = 1
self.column = 1
self.line_start_pos = 0
def feed(self, token, test_newline=True):
"""Consume a token and calculate the new line & column.
As an optional optimization, set test_newline=False if the token doesn't contain a newline.
"""
if test_newline:
newlines = token.count(self.newline_char)
if newlines:
self.line += newlines
self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
self.char_pos += len(token)
self.column = self.char_pos - self.line_start_pos + 1
class _Lex:
"Built to serve both Lexer and ContextualLexer"
def __init__(self, lexer, state=None):
self.lexer = lexer
self.state = state
def lex(self, stream, newline_types, ignore_types):
newline_types = list(newline_types)
ignore_types = list(ignore_types)
line_ctr = LineCounter()
t = None
while True:
lexer = self.lexer
for mre, type_from_index in lexer.mres:
m = mre.match(stream, line_ctr.char_pos)
if m:
value = m.group(0)
type_ = type_from_index[m.lastindex]
if type_ not in ignore_types:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
if t.type in lexer.callback:
t = lexer.callback[t.type](t)
yield t
else:
if type_ in lexer.callback:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
lexer.callback[type_](t)
line_ctr.feed(value, type_ in newline_types)
if t:
t.end_line = line_ctr.line
t.end_column = line_ctr.column
break
else:
if line_ctr.char_pos < len(stream):
raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, state=self.state)
break
class UnlessCallback:
def __init__(self, mres):
self.mres = mres
def __call__(self, t):
for mre, type_from_index in self.mres:
m = mre.match(t.value)
if m:
t.type = type_from_index[m.lastindex]
break
return t
###}
def _create_unless(tokens):
tokens_by_type = classify(tokens, lambda t: type(t.pattern))
assert len(tokens_by_type) <= 2, tokens_by_type.keys()
embedded_strs = set()
callback = {}
for retok in tokens_by_type.get(PatternRE, []):
unless = [] # {}
for strtok in tokens_by_type.get(PatternStr, []):
if strtok.priority > retok.priority:
continue
s = strtok.pattern.value
m = re.match(retok.pattern.to_regexp(), s)
if m and m.group(0) == s:
unless.append(strtok)
if strtok.pattern.flags <= retok.pattern.flags:
embedded_strs.add(strtok)
if unless:
callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True))
tokens = [t for t in tokens if t not in embedded_strs]
return tokens, callback
def _build_mres(tokens, max_size, match_whole):
# Python sets an unreasonable group limit (currently 100) in its re module
# Worse, the only way to know we reached it is by catching an AssertionError!
# This function recursively tries less and less groups until it's successful.
postfix = '$' if match_whole else ''
mres = []
while tokens:
try:
mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in tokens[:max_size]))
except AssertionError: # Yes, this is what Python provides us.. :/
return _build_mres(tokens, max_size//2, match_whole)
mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
tokens = tokens[max_size:]
return mres
def build_mres(tokens, match_whole=False):
return _build_mres(tokens, len(tokens), match_whole)
def _regexp_has_newline(r):
return '\n' in r or '\\n' in r or ('(?s' in r and '.' in r)
class Lexer:
"""Lexer interface
Method Signatures:
lex(self, stream) -> Iterator[Token]
set_parser_state(self, state) # Optional
"""
set_parser_state = NotImplemented
lex = NotImplemented
class TraditionalLexer(Lexer):
def __init__(self, tokens, ignore=(), user_callbacks={}):
assert all(isinstance(t, TokenDef) for t in tokens), tokens
tokens = list(tokens)
# Sanitization
for t in tokens:
try:
re.compile(t.pattern.to_regexp())
except:
raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))
if t.pattern.min_width == 0:
raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))
assert set(ignore) <= {t.name for t in tokens}
# Init
self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
self.ignore_types = list(ignore)
tokens.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
tokens, self.callback = _create_unless(tokens)
assert all(self.callback.values())
for type_, f in user_callbacks.items():
assert type_ not in self.callback
self.callback[type_] = f
self.tokens = tokens
self.mres = build_mres(tokens)
def lex(self, stream):
return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
class ContextualLexer(Lexer):
def __init__(self, tokens, states, ignore=(), always_accept=(), user_callbacks={}):
tokens_by_name = {}
for t in tokens:
assert t.name not in tokens_by_name, t
tokens_by_name[t.name] = t
lexer_by_tokens = {}
self.lexers = {}
for state, accepts in states.items():
key = frozenset(accepts)
try:
lexer = lexer_by_tokens[key]
except KeyError:
accepts = set(accepts) | set(ignore) | set(always_accept)
state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
lexer_by_tokens[key] = lexer
self.lexers[state] = lexer
self.root_lexer = TraditionalLexer(tokens, ignore=ignore, user_callbacks=user_callbacks)
self.set_parser_state(None) # Needs to be set on the outside
def set_parser_state(self, state):
self.parser_state = state
def lex(self, stream):
l = _Lex(self.lexers[self.parser_state], self.parser_state)
for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
yield x
l.lexer = self.lexers[self.parser_state]
l.state = self.parser_state
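The lexer_callbacks option wired through LexerConf ends up in the callback dict consulted by _Lex above; note that ignored terminals still trigger their callback even though no token is emitted to the parser. A sketch (hypothetical grammar) that collects comments on the side:

from lark import Lark

comments = []

parser = Lark(r"""
    start: NAME+

    NAME: /[a-z]+/
    COMMENT: /#[^\n]*/

    %ignore COMMENT
    %ignore /\s+/
""", parser='lalr', lexer='standard', lexer_callbacks={'COMMENT': comments.append})

parser.parse("foo  # first\nbar\n")
print(comments)                                  # the ignored COMMENT token, captured by the callback
print([t.type for t in parser.lex("foo bar")])   # lex() alone also works when lexer='standard'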


@@ -0,0 +1,741 @@
"Parses and creates Grammar objects"
import os.path
import sys
from itertools import chain
import re
from ast import literal_eval
from copy import deepcopy
import pkgutil
from .lexer import Token
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR_TraditionalLexer
from .common import LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
from .utils import classify, suppress
from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken
from .tree import Tree, SlottedTree as ST
from .visitors import Transformer, Visitor, v_args, Transformer_InPlace
inline_args = v_args(inline=True)
__path__ = os.path.dirname(__file__)
GRAMMAR_PACKAGES = ['lark.grammars']
EXT = '.lark'
_RE_FLAGS = 'imslux'
def is_terminal(sym):
return sym.isupper()
_TERMINAL_NAMES = {
'.' : 'DOT',
',' : 'COMMA',
':' : 'COLON',
';' : 'SEMICOLON',
'+' : 'PLUS',
'-' : 'MINUS',
'*' : 'STAR',
'/' : 'SLASH',
'\\' : 'BACKSLASH',
'|' : 'VBAR',
'?' : 'QMARK',
'!' : 'BANG',
'@' : 'AT',
'#' : 'HASH',
'$' : 'DOLLAR',
'%' : 'PERCENT',
'^' : 'CIRCUMFLEX',
'&' : 'AMPERSAND',
'_' : 'UNDERSCORE',
'<' : 'LESSTHAN',
'>' : 'MORETHAN',
'=' : 'EQUAL',
'"' : 'DBLQUOTE',
'\'' : 'QUOTE',
'`' : 'BACKQUOTE',
'~' : 'TILDE',
'(' : 'LPAR',
')' : 'RPAR',
'{' : 'LBRACE',
'}' : 'RBRACE',
'[' : 'LSQB',
']' : 'RSQB',
'\n' : 'NEWLINE',
'\r\n' : 'CRLF',
'\t' : 'TAB',
' ' : 'SPACE',
}
# Grammar Parser
TERMINALS = {
'_LPAR': r'\(',
'_RPAR': r'\)',
'_LBRA': r'\[',
'_RBRA': r'\]',
'OP': '[+*][?]?|[?](?![a-z])',
'_COLON': ':',
'_COMMA': ',',
'_OR': r'\|',
'_DOT': r'\.',
'TILDE': '~',
'RULE': '!?[_?]?[a-z][_a-z0-9]*',
'TERMINAL': '_?[A-Z][_A-Z0-9]*',
'STRING': r'"(\\"|\\\\|[^"\n])*?"i?',
'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS,
'_NL': r'(\r?\n)+\s*',
'WS': r'[ \t]+',
'COMMENT': r'//[^\n]*',
'_TO': '->',
'_IGNORE': r'%ignore',
'_DECLARE': r'%declare',
'_IMPORT': r'%import',
'NUMBER': r'\d+',
}
RULES = {
'start': ['_list'],
'_list': ['_item', '_list _item'],
'_item': ['rule', 'token', 'statement', '_NL'],
'rule': ['RULE _COLON expansions _NL',
'RULE _DOT NUMBER _COLON expansions _NL'],
'expansions': ['alias',
'expansions _OR alias',
'expansions _NL _OR alias'],
'?alias': ['expansion _TO RULE', 'expansion'],
'expansion': ['_expansion'],
'_expansion': ['', '_expansion expr'],
'?expr': ['atom',
'atom OP',
'atom TILDE NUMBER',
'atom TILDE NUMBER _DOT _DOT NUMBER',
],
'?atom': ['_LPAR expansions _RPAR',
'maybe',
'value'],
'value': ['terminal',
'nonterminal',
'literal',
'range'],
'terminal': ['TERMINAL'],
'nonterminal': ['RULE'],
'?name': ['RULE', 'TERMINAL'],
'maybe': ['_LBRA expansions _RBRA'],
'range': ['STRING _DOT _DOT STRING'],
'token': ['TERMINAL _COLON expansions _NL',
'TERMINAL _DOT NUMBER _COLON expansions _NL'],
'statement': ['ignore', 'import', 'declare'],
'ignore': ['_IGNORE expansions _NL'],
'declare': ['_DECLARE _declare_args _NL'],
'import': ['_IMPORT _import_path _NL',
'_IMPORT _import_path _LPAR name_list _RPAR _NL',
'_IMPORT _import_path _TO TERMINAL _NL'],
'_import_path': ['import_lib', 'import_rel'],
'import_lib': ['_import_args'],
'import_rel': ['_DOT _import_args'],
'_import_args': ['name', '_import_args _DOT name'],
'name_list': ['_name_list'],
'_name_list': ['name', '_name_list _COMMA name'],
'_declare_args': ['name', '_declare_args name'],
'literal': ['REGEXP', 'STRING'],
}
@inline_args
class EBNF_to_BNF(Transformer_InPlace):
def __init__(self):
self.new_rules = []
self.rules_by_expr = {}
self.prefix = 'anon'
self.i = 0
self.rule_options = None
def _add_recurse_rule(self, type_, expr):
if expr in self.rules_by_expr:
return self.rules_by_expr[expr]
new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
self.i += 1
t = NonTerminal(Token('RULE', new_name, -1))
tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])])
self.new_rules.append((new_name, tree, self.rule_options))
self.rules_by_expr[expr] = t
return t
def expr(self, rule, op, *args):
if op.value == '?':
return ST('expansions', [rule, ST('expansion', [])])
elif op.value == '+':
# a : b c+ d
# -->
# a : b _c d
# _c : _c c | c;
return self._add_recurse_rule('plus', rule)
elif op.value == '*':
# a : b c* d
# -->
# a : b _c? d
# _c : _c c | c;
new_name = self._add_recurse_rule('star', rule)
return ST('expansions', [new_name, ST('expansion', [])])
elif op.value == '~':
if len(args) == 1:
mn = mx = int(args[0])
else:
mn, mx = map(int, args)
if mx < mn:
raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
assert False, op
class SimplifyRule_Visitor(Visitor):
@staticmethod
def _flatten(tree):
while True:
to_expand = [i for i, child in enumerate(tree.children)
if isinstance(child, Tree) and child.data == tree.data]
if not to_expand:
break
tree.expand_kids_by_index(*to_expand)
def expansion(self, tree):
# rules_list unpacking
# a : b (c|d) e
# -->
# a : b c e | b d e
#
# In AST terms:
# expansion(b, expansions(c, d), e)
# -->
# expansions( expansion(b, c, e), expansion(b, d, e) )
self._flatten(tree)
for i, child in enumerate(tree.children):
if isinstance(child, Tree) and child.data == 'expansions':
tree.data = 'expansions'
tree.children = [self.visit(ST('expansion', [option if i==j else other
for j, other in enumerate(tree.children)]))
for option in set(child.children)]
self._flatten(tree)
break
def alias(self, tree):
rule, alias_name = tree.children
if rule.data == 'expansions':
aliases = []
for child in tree.children[0].children:
aliases.append(ST('alias', [child, alias_name]))
tree.data = 'expansions'
tree.children = aliases
def expansions(self, tree):
self._flatten(tree)
tree.children = list(set(tree.children))
class RuleTreeToText(Transformer):
def expansions(self, x):
return x
def expansion(self, symbols):
return symbols, None
def alias(self, x):
(expansion, _alias), alias = x
assert _alias is None, (alias, expansion, '-', _alias) # Double alias not allowed
return expansion, alias.value
@inline_args
class CanonizeTree(Transformer_InPlace):
def maybe(self, expr):
return ST('expr', [expr, Token('OP', '?', -1)])
def tokenmods(self, *args):
if len(args) == 1:
return list(args)
tokenmods, value = args
return tokenmods + [value]
class PrepareAnonTerminals(Transformer_InPlace):
"Create a unique list of anonymous tokens. Attempt to give meaningful names to them when we add them"
def __init__(self, tokens):
self.tokens = tokens
self.token_set = {td.name for td in self.tokens}
self.token_reverse = {td.pattern: td for td in tokens}
self.i = 0
@inline_args
def pattern(self, p):
value = p.value
if p in self.token_reverse and p.flags != self.token_reverse[p].pattern.flags:
raise GrammarError(u'Conflicting flags for the same terminal: %s' % p)
token_name = None
if isinstance(p, PatternStr):
try:
# If already defined, use the user-defined token name
token_name = self.token_reverse[p].name
except KeyError:
# Try to assign an indicative anon-token name
try:
token_name = _TERMINAL_NAMES[value]
except KeyError:
if value.isalnum() and value[0].isalpha() and value.upper() not in self.token_set:
with suppress(UnicodeEncodeError):
value.upper().encode('ascii') # Make sure we don't have unicode in our token names
token_name = value.upper()
elif isinstance(p, PatternRE):
if p in self.token_reverse: # Kind of a weird placement
token_name = self.token_reverse[p].name
else:
assert False, p
if token_name is None:
token_name = '__ANON_%d' % self.i
self.i += 1
if token_name not in self.token_set:
assert p not in self.token_reverse
self.token_set.add(token_name)
tokendef = TokenDef(token_name, p)
self.token_reverse[p] = tokendef
self.tokens.append(tokendef)
return Terminal(token_name, filter_out=isinstance(p, PatternStr))
def _rfind(s, choices):
return max(s.rfind(c) for c in choices)
def _fix_escaping(s):
w = ''
i = iter(s)
for n in i:
w += n
if n == '\\':
n2 = next(i)
if n2 == '\\':
w += '\\\\'
elif n2 not in 'unftr':
w += '\\'
w += n2
w = w.replace('\\"', '"').replace("'", "\\'")
to_eval = "u'''%s'''" % w
try:
s = literal_eval(to_eval)
except SyntaxError as e:
raise ValueError(s, e)
return s
def _literal_to_pattern(literal):
v = literal.value
flag_start = _rfind(v, '/"')+1
assert flag_start > 0
flags = v[flag_start:]
assert all(f in _RE_FLAGS for f in flags), flags
v = v[:flag_start]
assert v[0] == v[-1] and v[0] in '"/'
x = v[1:-1]
s = _fix_escaping(x)
if literal.type == 'STRING':
s = s.replace('\\\\', '\\')
return { 'STRING': PatternStr,
'REGEXP': PatternRE }[literal.type](s, flags)
@inline_args
class PrepareLiterals(Transformer_InPlace):
def literal(self, literal):
return ST('pattern', [_literal_to_pattern(literal)])
def range(self, start, end):
assert start.type == end.type == 'STRING'
start = start.value[1:-1]
end = end.value[1:-1]
assert len(start) == len(end) == 1, (start, end, len(start), len(end))
regexp = '[%s-%s]' % (start, end)
return ST('pattern', [PatternRE(regexp)])
class TokenTreeToPattern(Transformer):
def pattern(self, ps):
p ,= ps
return p
def expansion(self, items):
assert items
if len(items) == 1:
return items[0]
if len({i.flags for i in items}) > 1:
raise GrammarError("Lark doesn't support joining tokens with conflicting flags!")
return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags if items else ())
def expansions(self, exps):
if len(exps) == 1:
return exps[0]
if len({i.flags for i in exps}) > 1:
raise GrammarError("Lark doesn't support joining tokens with conflicting flags!")
return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)), exps[0].flags)
def expr(self, args):
inner, op = args[:2]
if op == '~':
if len(args) == 3:
op = "{%d}" % int(args[2])
else:
mn, mx = map(int, args[2:])
if mx < mn:
raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (inner, mn, mx))
op = "{%d,%d}" % (mn, mx)
else:
assert len(args) == 2
return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags)
def alias(self, t):
raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)")
def value(self, v):
return v[0]
class PrepareSymbols(Transformer_InPlace):
def value(self, v):
v ,= v
if isinstance(v, Tree):
return v
elif v.type == 'RULE':
return NonTerminal(v.value)
elif v.type == 'TERMINAL':
return Terminal(v.value, filter_out=v.startswith('_'))
assert False
def _choice_of_rules(rules):
return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules])
class Grammar:
def __init__(self, rule_defs, token_defs, ignore):
self.token_defs = token_defs
self.rule_defs = rule_defs
self.ignore = ignore
def compile(self):
# We change the trees in-place (to support huge grammars)
# So deepcopy allows calling compile more than once.
token_defs = deepcopy(list(self.token_defs))
rule_defs = deepcopy(self.rule_defs)
# =================
# Compile Tokens
# =================
# Convert token-trees to strings/regexps
transformer = PrepareLiterals() * TokenTreeToPattern()
for name, (token_tree, priority) in token_defs:
if token_tree is None: # Terminal added through %declare
continue
expansions = list(token_tree.find_data('expansion'))
if len(expansions) == 1 and not expansions[0].children:
raise GrammarError("Terminals cannot be empty (%s)" % name)
tokens = [TokenDef(name, transformer.transform(token_tree), priority)
for name, (token_tree, priority) in token_defs if token_tree]
# =================
# Compile Rules
# =================
# 1. Pre-process terminals
transformer = PrepareLiterals() * PrepareSymbols() * PrepareAnonTerminals(tokens) # Adds to tokens
# 2. Convert EBNF to BNF (and apply step 1)
ebnf_to_bnf = EBNF_to_BNF()
rules = []
for name, rule_tree, options in rule_defs:
ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
tree = transformer.transform(rule_tree)
rules.append((name, ebnf_to_bnf.transform(tree), options))
rules += ebnf_to_bnf.new_rules
assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision"
# 3. Compile tree to Rule objects
rule_tree_to_text = RuleTreeToText()
simplify_rule = SimplifyRule_Visitor()
compiled_rules = []
for name, tree, options in rules:
simplify_rule.visit(tree)
expansions = rule_tree_to_text.transform(tree)
for expansion, alias in expansions:
if alias and name.startswith('_'):
raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))
assert all(isinstance(x, Symbol) for x in expansion), expansion
rule = Rule(NonTerminal(name), expansion, alias, options)
compiled_rules.append(rule)
return tokens, compiled_rules, self.ignore
_imported_grammars = {}
def import_grammar(grammar_path, base_paths=()):
    if grammar_path not in _imported_grammars:
        # Check base_paths on disk first (for relative %import), then fall back to the bundled package data.
        for base_path in base_paths:
            full_path = os.path.join(base_path, grammar_path)
            if os.path.exists(full_path):
                with open(full_path, 'rb') as f:
                    text = f.read().decode("utf-8")
                break
        else:
            for package in GRAMMAR_PACKAGES:
                text = pkgutil.get_data(package, grammar_path).decode("utf-8")
        grammar = load_grammar(text, grammar_path)
        _imported_grammars[grammar_path] = grammar
    return _imported_grammars[grammar_path]
def resolve_token_references(token_defs):
# TODO Cycles detection
# TODO Solve with transitive closure (maybe)
token_dict = {k:t for k, (t,_p) in token_defs}
assert len(token_dict) == len(token_defs), "Same name defined twice?"
while True:
changed = False
for name, (token_tree, _p) in token_defs:
if token_tree is None: # Terminal added through %declare
continue
for exp in token_tree.find_data('value'):
item ,= exp.children
if isinstance(item, Token):
if item.type == 'RULE':
raise GrammarError("Rules aren't allowed inside terminals (%s in %s)" % (item, name))
if item.type == 'TERMINAL':
exp.children[0] = token_dict[item]
changed = True
if not changed:
break
def options_from_rule(name, *x):
if len(x) > 1:
priority, expansions = x
priority = int(priority)
else:
expansions ,= x
priority = None
keep_all_tokens = name.startswith('!')
name = name.lstrip('!')
expand1 = name.startswith('?')
name = name.lstrip('?')
return name, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority)
def symbols_from_strcase(expansion):
return [Terminal(x, filter_out=x.startswith('_')) if is_terminal(x) else NonTerminal(x) for x in expansion]
@inline_args
class PrepareGrammar(Transformer_InPlace):
def terminal(self, name):
return name
def nonterminal(self, name):
return name
class GrammarLoader:
def __init__(self):
tokens = [TokenDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
rules = [options_from_rule(name, x) for name, x in RULES.items()]
rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), None, o) for r, xs, o in rules for x in xs]
callback = ParseTreeBuilder(rules, ST).create_callback()
lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
parser_conf = ParserConf(rules, callback, 'start')
self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
self.canonize_tree = CanonizeTree()
def load_grammar(self, grammar_text, grammar_name='<?>'):
"Parse grammar_text, verify, and create Grammar object. Display nice messages on error."
try:
tree = self.canonize_tree.transform( self.parser.parse(grammar_text+'\n') )
except UnexpectedCharacters as e:
context = e.get_context(grammar_text)
raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" %
(e.line, e.column, grammar_name, context))
except UnexpectedToken as e:
context = e.get_context(grammar_text)
error = e.match_examples(self.parser.parse, {
'Unclosed parenthesis': ['a: (\n'],
'Unmatched closing parenthesis': ['a: )\n', 'a: [)\n', 'a: (]\n'],
'Expecting rule or token definition (missing colon)': ['a\n', 'a->\n', 'A->\n', 'a A\n'],
'Alias expects lowercase name': ['a: -> "a"\n'],
'Unexpected colon': ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n'],
'Misplaced operator': ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n'],
'Expecting option ("|") or a new rule or token definition': ['a:a\n()\n'],
'%import expects a name': ['%import "a"\n'],
'%ignore expects a value': ['%ignore %import\n'],
})
if error:
raise GrammarError("%s at line %s column %s\n\n%s" % (error, e.line, e.column, context))
elif 'STRING' in e.expected:
raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context))
raise
tree = PrepareGrammar().transform(tree)
# Extract grammar items
defs = classify(tree.children, lambda c: c.data, lambda c: c.children)
token_defs = defs.pop('token', [])
rule_defs = defs.pop('rule', [])
statements = defs.pop('statement', [])
assert not defs
token_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in token_defs]
token_defs = [(name.value, (t, int(p))) for name, p, t in token_defs]
# Execute statements
ignore = []
declared = []
for (stmt,) in statements:
if stmt.data == 'ignore':
t ,= stmt.children
ignore.append(t)
elif stmt.data == 'import':
if len(stmt.children) > 1:
path_node, arg1 = stmt.children
else:
path_node ,= stmt.children
arg1 = None
dotted_path = path_node.children
if isinstance(arg1, Tree): # Multi import
names = arg1.children
aliases = names # Can't have aliased multi import, so all aliases will be the same as names
else: # Single import
names = [dotted_path[-1]] # Get name from dotted path
aliases = [arg1] if arg1 else names # Aliases if exist
dotted_path = dotted_path[:-1]
grammar_path = os.path.join(*dotted_path) + EXT
if path_node.data == 'import_lib': # Import from library
g = import_grammar(grammar_path)
else: # Relative import
if grammar_name == '<string>': # Import relative to script file path if grammar is coded in script
base_file = os.path.abspath(sys.modules['__main__'].__file__)
else:
base_file = grammar_name # Import relative to grammar file path if external grammar file
base_path = os.path.split(base_file)[0]
g = import_grammar(grammar_path, base_paths=[base_path])
for name, alias in zip(names, aliases):
token_options = dict(g.token_defs)[name]
assert isinstance(token_options, tuple) and len(token_options)==2
token_defs.append([alias.value, token_options])
elif stmt.data == 'declare':
for t in stmt.children:
token_defs.append([t.value, (None, None)])
else:
assert False, stmt
# Verify correctness 1
for name, _ in token_defs:
if name.startswith('__'):
raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
# Handle ignore tokens
# XXX A slightly hacky solution. Recognition of %ignore TERMINAL as separate comes from the lexer's
# inability to handle duplicate tokens (two names, one value)
ignore_names = []
for t in ignore:
if t.data=='expansions' and len(t.children) == 1:
t2 ,= t.children
if t2.data=='expansion' and len(t2.children) == 1:
item ,= t2.children
if item.data == 'value':
item ,= item.children
if isinstance(item, Token) and item.type == 'TERMINAL':
ignore_names.append(item.value)
continue
name = '__IGNORE_%d'% len(ignore_names)
ignore_names.append(name)
token_defs.append((name, (t, 0)))
# Verify correctness 2
token_names = set()
for name, _ in token_defs:
if name in token_names:
raise GrammarError("Token '%s' defined more than once" % name)
token_names.add(name)
if set(ignore_names) - token_names:
raise GrammarError("Tokens %s were marked to ignore but were not defined!" % (set(ignore_names) - token_names))
# Resolve token references
resolve_token_references(token_defs)
rules = [options_from_rule(*x) for x in rule_defs]
rule_names = set()
for name, _x, _o in rules:
if name.startswith('__'):
raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
if name in rule_names:
raise GrammarError("Rule '%s' defined more than once" % name)
rule_names.add(name)
for name, expansions, _o in rules:
used_symbols = {t for x in expansions.find_data('expansion')
for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
for sym in used_symbols:
if is_terminal(sym):
if sym not in token_names:
raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))
else:
if sym not in rule_names:
raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name))
# TODO don't include unused tokens, they can only cause trouble!
return Grammar(rules, token_defs, ignore_names)
load_grammar = GrammarLoader().load_grammar
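For reference, the loader above is what gives the grammar DSL its rule modifiers: '?' marks a rule for inlining, '!' keeps all of its tokens, and '->' aliases a branch. A sketch (hypothetical grammar):

from lark import Lark

parser = Lark(r"""
    ?value: pair
          | NUMBER      -> number

    !pair: NAME ":" NUMBER

    NAME: /[a-z]+/
    NUMBER: /\d+/
    %ignore " "
""", parser='lalr', start='value')

print(parser.parse("x : 7").pretty())   # 'pair' keeps its ":" token because of '!'
print(parser.parse("42").pretty())      # the aliased branch yields a 'number' tree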


@@ -0,0 +1,164 @@
from .exceptions import GrammarError
from .utils import suppress
from .lexer import Token
from .grammar import Rule
from .tree import Tree
from .visitors import InlineTransformer # XXX Deprecated
###{standalone
from functools import partial, wraps
class ExpandSingleChild:
def __init__(self, node_builder):
self.node_builder = node_builder
def __call__(self, children):
if len(children) == 1:
return children[0]
else:
return self.node_builder(children)
class PropagatePositions:
def __init__(self, node_builder):
self.node_builder = node_builder
def __call__(self, children):
res = self.node_builder(children)
if children and isinstance(res, Tree):
for a in children:
if isinstance(a, Tree):
res.meta.line = a.meta.line
res.meta.column = a.meta.column
elif isinstance(a, Token):
res.meta.line = a.line
res.meta.column = a.column
break
for a in reversed(children):
# with suppress(AttributeError):
if isinstance(a, Tree):
res.meta.end_line = a.meta.end_line
res.meta.end_column = a.meta.end_column
elif isinstance(a, Token):
res.meta.end_line = a.end_line
res.meta.end_column = a.end_column
break
return res
class ChildFilter:
def __init__(self, to_include, node_builder):
self.node_builder = node_builder
self.to_include = to_include
def __call__(self, children):
filtered = []
for i, to_expand in self.to_include:
if to_expand:
filtered += children[i].children
else:
filtered.append(children[i])
return self.node_builder(filtered)
class ChildFilterLALR(ChildFilter):
"Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)"
def __call__(self, children):
filtered = []
for i, to_expand in self.to_include:
if to_expand:
if filtered:
filtered += children[i].children
else: # Optimize for left-recursion
filtered = children[i].children
else:
filtered.append(children[i])
return self.node_builder(filtered)
def _should_expand(sym):
return not sym.is_term and sym.name.startswith('_')
def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous):
to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion)
if keep_all_tokens or not (sym.is_term and sym.filter_out)]
if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include)
class Callback(object):
pass
def inline_args(func):
@wraps(func)
def f(children):
return func(*children)
return f
class ParseTreeBuilder:
def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False):
self.tree_class = tree_class
self.propagate_positions = propagate_positions
self.always_keep_all_tokens = keep_all_tokens
self.ambiguous = ambiguous
self.rule_builders = list(self._init_builders(rules))
self.user_aliases = {}
def _init_builders(self, rules):
for rule in rules:
options = rule.options
keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False)
expand_single_child = options.expand1 if options else False
wrapper_chain = filter(None, [
(expand_single_child and not rule.alias) and ExpandSingleChild,
maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous),
self.propagate_positions and PropagatePositions,
])
yield rule, wrapper_chain
def create_callback(self, transformer=None):
callback = Callback()
i = 0
for rule, wrapper_chain in self.rule_builders:
internal_callback_name = '_cb%d_%s' % (i, rule.origin)
i += 1
user_callback_name = rule.alias or rule.origin.name
try:
f = getattr(transformer, user_callback_name)
assert not getattr(f, 'meta', False), "Meta args not supported for internal transformer"
# XXX InlineTransformer is deprecated!
if getattr(f, 'inline', False) or isinstance(transformer, InlineTransformer):
f = inline_args(f)
except AttributeError:
f = partial(self.tree_class, user_callback_name)
self.user_aliases[rule] = rule.alias
rule.alias = internal_callback_name
for w in wrapper_chain:
f = w(f)
if hasattr(callback, internal_callback_name):
raise GrammarError("Rule '%s' already exists" % (rule,))
setattr(callback, internal_callback_name, f)
return callback
###}
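create_callback() above resolves callbacks by rule name or '->' alias; a standalone Transformer applied after parsing uses the same names. A sketch (hypothetical grammar and transformer):

from lark import Lark, Transformer

class ToPython(Transformer):
    # Method names match the rule aliases used in the grammar below.
    def number(self, children):
        return int(children[0])
    def neg(self, children):
        return -children[0]

parser = Lark(r"""
    ?start: "-" start   -> neg
          | INT         -> number

    INT: /\d+/
""", parser='lalr')

print(ToPython().transform(parser.parse("--7")))   # 7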


@@ -0,0 +1,189 @@
import re
from functools import partial
from .utils import get_regexp_width
from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token
from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk
from .tree import Tree
class WithLexer:
lexer = None
parser = None
lexer_conf = None
def init_traditional_lexer(self, lexer_conf):
self.lexer_conf = lexer_conf
self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
def init_contextual_lexer(self, lexer_conf):
self.lexer_conf = lexer_conf
states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
self.lexer = ContextualLexer(lexer_conf.tokens, states,
ignore=lexer_conf.ignore,
always_accept=always_accept,
user_callbacks=lexer_conf.callbacks)
def lex(self, text):
stream = self.lexer.lex(text)
if self.lexer_conf.postlex:
return self.lexer_conf.postlex.process(stream)
return stream
def parse(self, text):
token_stream = self.lex(text)
sps = self.lexer.set_parser_state
return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else [])
class LALR_TraditionalLexer(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
self.parser = lalr_parser.Parser(parser_conf)
self.init_traditional_lexer(lexer_conf)
class LALR_ContextualLexer(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
self.parser = lalr_parser.Parser(parser_conf)
self.init_contextual_lexer(lexer_conf)
class LALR_CustomLexer(WithLexer):
def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
self.parser = lalr_parser.Parser(parser_conf)
self.lexer_conf = lexer_conf
self.lexer = lexer_cls(lexer_conf)
def get_ambiguity_resolver(options):
if not options or options.ambiguity == 'resolve':
return resolve_ambig.standard_resolve_ambig
elif options.ambiguity == 'resolve__antiscore_sum':
return resolve_ambig.antiscore_sum_resolve_ambig
elif options.ambiguity == 'explicit':
return None
raise ValueError(options)
def tokenize_text(text):
line = 1
col_start_pos = 0
for i, ch in enumerate(text):
if '\n' in ch:
line += ch.count('\n')
col_start_pos = i + ch.rindex('\n')
yield Token('CHAR', ch, line=line, column=i - col_start_pos)
class Earley(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
self.init_traditional_lexer(lexer_conf)
self.parser = earley.Parser(parser_conf, self.match,
resolve_ambiguity=get_ambiguity_resolver(options))
def match(self, term, token):
return term.name == token.type
class XEarley:
def __init__(self, lexer_conf, parser_conf, options=None, **kw):
self.token_by_name = {t.name:t for t in lexer_conf.tokens}
self._prepare_match(lexer_conf)
self.parser = xearley.Parser(parser_conf,
self.match,
resolve_ambiguity=get_ambiguity_resolver(options),
ignore=lexer_conf.ignore,
predict_all=options.earley__predict_all,
**kw
)
def match(self, term, text, index=0):
return self.regexps[term.name].match(text, index)
def _prepare_match(self, lexer_conf):
self.regexps = {}
for t in lexer_conf.tokens:
regexp = t.pattern.to_regexp()
try:
width = get_regexp_width(regexp)[0]
except ValueError:
raise ValueError("Bad regexp in token %s: %s" % (t.name, regexp))
else:
if width == 0:
raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)
self.regexps[t.name] = re.compile(regexp)
def parse(self, text):
return self.parser.parse(text)
class XEarley_CompleteLex(XEarley):
def __init__(self, *args, **kw):
super(XEarley_CompleteLex, self).__init__(*args, complete_lex=True, **kw)
class CYK(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
self.init_traditional_lexer(lexer_conf)
self._analysis = GrammarAnalyzer(parser_conf)
self._parser = cyk.Parser(parser_conf.rules, parser_conf.start)
self._postprocess = {}
for rule in parser_conf.rules:
a = rule.alias
self._postprocess[a] = a if callable(a) else (a and getattr(parser_conf.callback, a))
def parse(self, text):
tokens = list(self.lex(text))
parse = self._parser.parse(tokens)
parse = self._transform(parse)
return parse
def _transform(self, tree):
subtrees = list(tree.iter_subtrees())
for subtree in subtrees:
subtree.children = [self._apply_callback(c) if isinstance(c, Tree) else c for c in subtree.children]
return self._apply_callback(tree)
def _apply_callback(self, tree):
children = tree.children
callback = self._postprocess[tree.rule.alias]
assert callback, tree.rule.alias
r = callback(children)
return r
def get_frontend(parser, lexer):
if parser=='lalr':
if lexer is None:
raise ValueError('The LALR parser requires use of a lexer')
elif lexer == 'standard':
return LALR_TraditionalLexer
elif lexer == 'contextual':
return LALR_ContextualLexer
elif issubclass(lexer, Lexer):
return partial(LALR_CustomLexer, lexer)
else:
raise ValueError('Unknown lexer: %s' % lexer)
elif parser=='earley':
if lexer=='standard':
return Earley
elif lexer=='dynamic':
return XEarley
elif lexer=='dynamic_complete':
return XEarley_CompleteLex
elif lexer=='contextual':
raise ValueError('The Earley parser does not support the contextual lexer')
else:
raise ValueError('Unknown lexer: %s' % lexer)
elif parser == 'cyk':
if lexer == 'standard':
return CYK
else:
raise ValueError('The CYK parser requires using the standard lexer.')
else:
raise ValueError('Unknown parser: %s' % parser)
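get_frontend() is the dispatch table Lark uses internally; the same combinations can be requested explicitly through the parser and lexer options. A quick sketch (hypothetical grammar) running one grammar through several of the frontends above:

from lark import Lark

grammar = r"""
    start: WORD ","? WORD

    WORD: /[a-z]+/
    %ignore /\s+/
"""

for parser_name, lexer_name in [('lalr', 'contextual'),
                                ('lalr', 'standard'),
                                ('earley', 'dynamic'),
                                ('earley', 'standard')]:
    p = Lark(grammar, parser=parser_name, lexer=lexer_name)
    print(parser_name, lexer_name, p.parse("hello, world"))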


@@ -0,0 +1,342 @@
"""This module implements a CYK parser."""
# Author: https://github.com/ehudt (2018)
#
# Adapted by Erez
from collections import defaultdict
import itertools
from ..exceptions import ParseError
from ..lexer import Token
from ..tree import Tree
from ..grammar import Terminal as T, NonTerminal as NT, Symbol
try:
xrange
except NameError:
xrange = range
def match(t, s):
assert isinstance(t, T)
return t.name == s.type
class Rule(object):
"""Context-free grammar rule."""
def __init__(self, lhs, rhs, weight, alias):
super(Rule, self).__init__()
assert isinstance(lhs, NT), lhs
assert all(isinstance(x, NT) or isinstance(x, T) for x in rhs), rhs
self.lhs = lhs
self.rhs = rhs
self.weight = weight
self.alias = alias
def __str__(self):
return '%s -> %s' % (str(self.lhs), ' '.join(str(x) for x in self.rhs))
def __repr__(self):
return str(self)
def __hash__(self):
return hash((self.lhs, tuple(self.rhs)))
def __eq__(self, other):
return self.lhs == other.lhs and self.rhs == other.rhs
def __ne__(self, other):
return not (self == other)
class Grammar(object):
"""Context-free grammar."""
def __init__(self, rules):
self.rules = frozenset(rules)
def __eq__(self, other):
return self.rules == other.rules
def __str__(self):
return '\n' + '\n'.join(sorted(repr(x) for x in self.rules)) + '\n'
def __repr__(self):
return str(self)
# Parse tree data structures
class RuleNode(object):
"""A node in the parse tree, which also contains the full rhs rule."""
def __init__(self, rule, children, weight=0):
self.rule = rule
self.children = children
self.weight = weight
def __repr__(self):
return 'RuleNode(%s, [%s])' % (repr(self.rule.lhs), ', '.join(str(x) for x in self.children))
class Parser(object):
"""Parser wrapper."""
def __init__(self, rules, start):
super(Parser, self).__init__()
self.orig_rules = {rule.alias: rule for rule in rules}
rules = [self._to_rule(rule) for rule in rules]
self.grammar = to_cnf(Grammar(rules))
self.start = NT(start)
def _to_rule(self, lark_rule):
"""Converts a lark rule, (lhs, rhs, callback, options), to a Rule."""
assert isinstance(lark_rule.origin, NT)
assert all(isinstance(x, Symbol) for x in lark_rule.expansion)
return Rule(
lark_rule.origin, lark_rule.expansion,
weight=lark_rule.options.priority if lark_rule.options and lark_rule.options.priority else 0,
alias=lark_rule.alias)
def parse(self, tokenized): # pylint: disable=invalid-name
"""Parses input, which is a list of tokens."""
table, trees = _parse(tokenized, self.grammar)
# Check if the parse succeeded.
if all(r.lhs != self.start for r in table[(0, len(tokenized) - 1)]):
raise ParseError('Parsing failed.')
parse = trees[(0, len(tokenized) - 1)][self.start]
return self._to_tree(revert_cnf(parse))
def _to_tree(self, rule_node):
"""Converts a RuleNode parse tree to a lark Tree."""
orig_rule = self.orig_rules[rule_node.rule.alias]
children = []
for child in rule_node.children:
if isinstance(child, RuleNode):
children.append(self._to_tree(child))
else:
assert isinstance(child.name, Token)
children.append(child.name)
t = Tree(orig_rule.origin, children)
t.rule=orig_rule
return t
def print_parse(node, indent=0):
if isinstance(node, RuleNode):
print(' ' * (indent * 2) + str(node.rule.lhs))
for child in node.children:
print_parse(child, indent + 1)
else:
print(' ' * (indent * 2) + str(node.s))
def _parse(s, g):
"""Parses sentence 's' using CNF grammar 'g'."""
# The CYK table. Indexed with a 2-tuple: (start pos, end pos)
table = defaultdict(set)
# Top-level structure is similar to the CYK table. Each cell is a dict from
# rule name to the best (lightest) tree for that rule.
trees = defaultdict(dict)
# Populate base case with existing terminal production rules
for i, w in enumerate(s):
for terminal, rules in g.terminal_rules.items():
if match(terminal, w):
for rule in rules:
table[(i, i)].add(rule)
if (rule.lhs not in trees[(i, i)] or
rule.weight < trees[(i, i)][rule.lhs].weight):
trees[(i, i)][rule.lhs] = RuleNode(rule, [T(w)], weight=rule.weight)
# Iterate over lengths of sub-sentences
for l in xrange(2, len(s) + 1):
# Iterate over sub-sentences with the given length
for i in xrange(len(s) - l + 1):
# Choose partition of the sub-sentence in [1, l)
for p in xrange(i + 1, i + l):
span1 = (i, p - 1)
span2 = (p, i + l - 1)
for r1, r2 in itertools.product(table[span1], table[span2]):
for rule in g.nonterminal_rules.get((r1.lhs, r2.lhs), []):
table[(i, i + l - 1)].add(rule)
r1_tree = trees[span1][r1.lhs]
r2_tree = trees[span2][r2.lhs]
rule_total_weight = rule.weight + r1_tree.weight + r2_tree.weight
if (rule.lhs not in trees[(i, i + l - 1)]
or rule_total_weight < trees[(i, i + l - 1)][rule.lhs].weight):
trees[(i, i + l - 1)][rule.lhs] = RuleNode(rule, [r1_tree, r2_tree], weight=rule_total_weight)
return table, trees
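# A short worked illustration (hypothetical three-token input at positions 0..2, not part
# of the original module): the base case fills table[(0,0)], table[(1,1)] and table[(2,2)]
# from the terminal rules. For length l=2 the loop combines (0,0)+(1,1) into table[(0,1)]
# and (1,1)+(2,2) into table[(1,2)]. For l=3 both partitions of (0,2) are tried,
# (0,0)+(1,2) and (0,1)+(2,2), and trees[(0,2)] keeps only the lightest RuleNode per
# left-hand side.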
# This section implements a converter of context-free grammars to Chomsky normal form.
# It also implements a conversion of parse trees from the CNF grammar back to the
# original grammar.
# Overview:
# Applies the following operations in this order:
# * TERM: Eliminates non-solitary terminals from all rules
# * BIN: Eliminates rules with more than 2 symbols on their right-hand-side.
# * UNIT: Eliminates non-terminal unit rules
#
# The following grammar features are not supported:
# * Start symbol appears on RHS
# * Empty rules (epsilon rules)
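# Illustration (the grammar below is an assumed example, not taken from this module):
# for a rule like  expr -> expr "+" term , TERM introduces a fresh non-terminal whose
# only production is the "+" terminal and substitutes it into the body; BIN then splits
# the resulting three-symbol body into a chain of two-symbol rules (see _split); and
# UNIT removes any remaining single-non-terminal rule such as expr -> term, recording
# the skipped rule in a UnitSkipRule so revert_cnf can restore the original tree shape.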
class CnfWrapper(object):
"""CNF wrapper for grammar.
Validates that the input grammar is CNF and provides helper data structures.
"""
def __init__(self, grammar):
super(CnfWrapper, self).__init__()
self.grammar = grammar
self.rules = grammar.rules
self.terminal_rules = defaultdict(list)
self.nonterminal_rules = defaultdict(list)
for r in self.rules:
# Validate that the grammar is CNF and populate auxiliary data structures.
assert isinstance(r.lhs, NT), r
assert len(r.rhs) in [1, 2], r
if len(r.rhs) == 1 and isinstance(r.rhs[0], T):
self.terminal_rules[r.rhs[0]].append(r)
elif len(r.rhs) == 2 and all(isinstance(x, NT) for x in r.rhs):
self.nonterminal_rules[tuple(r.rhs)].append(r)
else:
assert False, r
def __eq__(self, other):
return self.grammar == other.grammar
def __repr__(self):
return repr(self.grammar)
class UnitSkipRule(Rule):
"""A rule that records NTs that were skipped during transformation."""
def __init__(self, lhs, rhs, skipped_rules, weight, alias):
super(UnitSkipRule, self).__init__(lhs, rhs, weight, alias)
self.skipped_rules = skipped_rules
def __eq__(self, other):
return isinstance(other, type(self)) and self.skipped_rules == other.skipped_rules
__hash__ = Rule.__hash__
def build_unit_skiprule(unit_rule, target_rule):
skipped_rules = []
if isinstance(unit_rule, UnitSkipRule):
skipped_rules += unit_rule.skipped_rules
skipped_rules.append(target_rule)
if isinstance(target_rule, UnitSkipRule):
skipped_rules += target_rule.skipped_rules
return UnitSkipRule(unit_rule.lhs, target_rule.rhs, skipped_rules,
weight=unit_rule.weight + target_rule.weight, alias=unit_rule.alias)
def get_any_nt_unit_rule(g):
"""Returns a non-terminal unit rule from 'g', or None if there is none."""
for rule in g.rules:
if len(rule.rhs) == 1 and isinstance(rule.rhs[0], NT):
return rule
return None
def _remove_unit_rule(g, rule):
"""Removes 'rule' from 'g' without changing the langugage produced by 'g'."""
new_rules = [x for x in g.rules if x != rule]
refs = [x for x in g.rules if x.lhs == rule.rhs[0]]
new_rules += [build_unit_skiprule(rule, ref) for ref in refs]
return Grammar(new_rules)
def _split(rule):
"""Splits a rule whose len(rhs) > 2 into shorter rules."""
rule_str = str(rule.lhs) + '__' + '_'.join(str(x) for x in rule.rhs)
rule_name = '__SP_%s' % (rule_str) + '_%d'
yield Rule(rule.lhs, [rule.rhs[0], NT(rule_name % 1)], weight=rule.weight, alias=rule.alias)
for i in xrange(1, len(rule.rhs) - 2):
yield Rule(NT(rule_name % i), [rule.rhs[i], NT(rule_name % (i + 1))], weight=0, alias='Split')
yield Rule(NT(rule_name % (len(rule.rhs) - 2)), rule.rhs[-2:], weight=0, alias='Split')
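# Example (hypothetical rule, for illustration): splitting  A -> B C D E  yields
#   A -> B __SP_..._1
#   __SP_..._1 -> C __SP_..._2
#   __SP_..._2 -> D E
# where the intermediate names are derived from the original rule as in rule_name above.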
def _term(g):
"""Applies the TERM rule on 'g' (see top comment)."""
all_t = {x for rule in g.rules for x in rule.rhs if isinstance(x, T)}
t_rules = {t: Rule(NT('__T_%s' % str(t)), [t], weight=0, alias='Term') for t in all_t}
new_rules = []
for rule in g.rules:
if len(rule.rhs) > 1 and any(isinstance(x, T) for x in rule.rhs):
new_rhs = [t_rules[x].lhs if isinstance(x, T) else x for x in rule.rhs]
new_rules.append(Rule(rule.lhs, new_rhs, weight=rule.weight, alias=rule.alias))
new_rules.extend(v for k, v in t_rules.items() if k in rule.rhs)
else:
new_rules.append(rule)
return Grammar(new_rules)
def _bin(g):
"""Applies the BIN rule to 'g' (see top comment)."""
new_rules = []
for rule in g.rules:
if len(rule.rhs) > 2:
new_rules += _split(rule)
else:
new_rules.append(rule)
return Grammar(new_rules)
def _unit(g):
"""Applies the UNIT rule to 'g' (see top comment)."""
nt_unit_rule = get_any_nt_unit_rule(g)
while nt_unit_rule:
g = _remove_unit_rule(g, nt_unit_rule)
nt_unit_rule = get_any_nt_unit_rule(g)
return g
def to_cnf(g):
"""Creates a CNF grammar from a general context-free grammar 'g'."""
g = _unit(_bin(_term(g)))
return CnfWrapper(g)
def unroll_unit_skiprule(lhs, orig_rhs, skipped_rules, children, weight, alias):
if not skipped_rules:
return RuleNode(Rule(lhs, orig_rhs, weight=weight, alias=alias), children, weight=weight)
else:
weight = weight - skipped_rules[0].weight
return RuleNode(
Rule(lhs, [skipped_rules[0].lhs], weight=weight, alias=alias), [
unroll_unit_skiprule(skipped_rules[0].lhs, orig_rhs,
skipped_rules[1:], children,
skipped_rules[0].weight, skipped_rules[0].alias)
], weight=weight)
def revert_cnf(node):
"""Reverts a parse tree (RuleNode) to its original non-CNF form (Node)."""
if isinstance(node, T):
return node
# Reverts TERM rule.
if node.rule.lhs.name.startswith('__T_'):
return node.children[0]
else:
children = []
for child in map(revert_cnf, node.children):
# Reverts BIN rule.
if isinstance(child, RuleNode) and child.rule.lhs.name.startswith('__SP_'):
children += child.children
else:
children.append(child)
# Reverts UNIT rule.
if isinstance(node.rule, UnitSkipRule):
return unroll_unit_skiprule(node.rule.lhs, node.rule.rhs,
node.rule.skipped_rules, children,
node.rule.weight, node.rule.alias)
else:
return RuleNode(node.rule, children)

View File

@@ -0,0 +1,239 @@
"This module implements an Earley Parser"
# The parser uses a parse-forest to keep track of derivations and ambiguities.
# When the parse ends successfully, a disambiguation stage resolves all ambiguity
# (right now ambiguity resolution is not developed beyond the needs of lark)
# Afterwards the parse tree is reduced (transformed) according to user callbacks.
# I use the no-recursion version of Transformer, because the tree might be
# deeper than Python's recursion limit (a bit absurd, but that's life)
#
# The algorithm keeps track of each state set, using a corresponding Column instance.
# Column keeps track of new items using NewsList instances.
#
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
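# A brief sketch of the three operations driving the loop below (illustrative grammar,
# not from this module): with  sum : sum PLUS num | num , an Item records a rule, a
# pointer into its expansion, and the column where it started. predict() adds fresh
# items for an expected non-terminal, scan() advances items whose expected symbol is a
# terminal matching the next token, and complete() advances the items in a finished
# item's start column that were waiting for that rule's origin.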
from ..tree import Tree
from ..visitors import Transformer_InPlace, v_args
from ..exceptions import ParseError, UnexpectedToken
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal
class Derivation(Tree):
def __init__(self, rule, items=None):
Tree.__init__(self, 'drv', items or [])
self.meta.rule = rule
self._hash = None
def _pretty_label(self): # Nicer pretty for debugging the parser
return self.meta.rule.origin if self.meta.rule else self.data
def __hash__(self):
if self._hash is None:
self._hash = Tree.__hash__(self)
return self._hash
class Item(object):
"An Earley Item, the atom of the algorithm."
def __init__(self, rule, ptr, start, tree):
self.rule = rule
self.ptr = ptr
self.start = start
self.tree = tree if tree is not None else Derivation(self.rule)
@property
def expect(self):
return self.rule.expansion[self.ptr]
@property
def is_complete(self):
return self.ptr == len(self.rule.expansion)
def advance(self, tree):
assert self.tree.data == 'drv'
new_tree = Derivation(self.rule, self.tree.children + [tree])
return self.__class__(self.rule, self.ptr+1, self.start, new_tree)
def __eq__(self, other):
return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule
def __hash__(self):
return hash((self.rule, self.ptr, id(self.start))) # Always runs Derivation.__hash__
def __repr__(self):
before = list(map(str, self.rule.expansion[:self.ptr]))
after = list(map(str, self.rule.expansion[self.ptr:]))
return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after))
class NewsList(list):
"Keeps track of newly added items (append-only)"
def __init__(self, initial=None):
list.__init__(self, initial or [])
self.last_iter = 0
def get_news(self):
i = self.last_iter
self.last_iter = len(self)
return self[i:]
class Column:
"An entry in the table, aka Earley Chart. Contains lists of items."
def __init__(self, i, FIRST, predict_all=False):
self.i = i
self.to_reduce = NewsList()
self.to_predict = NewsList()
self.to_scan = []
self.item_count = 0
self.FIRST = FIRST
self.predicted = set()
self.completed = {}
self.predict_all = predict_all
def add(self, items):
"""Sort items into scan/predict/reduce newslists
Makes sure only unique items are added.
"""
for item in items:
item_key = item, item.tree # Elsewhere, tree is not part of the comparison
if item.is_complete:
# XXX Potential bug: What happens if there's ambiguity in an empty rule?
if item.rule.expansion and item_key in self.completed:
old_tree = self.completed[item_key].tree
if old_tree == item.tree:
is_empty = not self.FIRST[item.rule.origin]
if not is_empty:
continue
if old_tree.data != '_ambig':
new_tree = old_tree.copy()
new_tree.meta.rule = old_tree.meta.rule
old_tree.set('_ambig', [new_tree])
old_tree.meta.rule = None # No longer a 'drv' node
if item.tree.children[0] is old_tree: # XXX a little hacky!
raise ParseError("Infinite recursion in grammar! (Rule %s)" % item.rule)
if item.tree not in old_tree.children:
old_tree.children.append(item.tree)
# old_tree.children.append(item.tree)
else:
self.completed[item_key] = item
self.to_reduce.append(item)
else:
if item.expect.is_term:
self.to_scan.append(item)
else:
k = item_key if self.predict_all else item
if k in self.predicted:
continue
self.predicted.add(k)
self.to_predict.append(item)
self.item_count += 1 # Only count if actually added
def __bool__(self):
return bool(self.item_count)
__nonzero__ = __bool__ # Py2 backwards-compatibility
class Parser:
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None):
analysis = GrammarAnalyzer(parser_conf)
self.parser_conf = parser_conf
self.resolve_ambiguity = resolve_ambiguity
self.FIRST = analysis.FIRST
self.postprocess = {}
self.predictions = {}
for rule in parser_conf.rules:
self.postprocess[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias)
self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]
self.term_matcher = term_matcher
def parse(self, stream, start_symbol=None):
# Define parser functions
start_symbol = NonTerminal(start_symbol or self.parser_conf.start)
_Item = Item
match = self.term_matcher
def predict(nonterm, column):
assert not nonterm.is_term, nonterm
return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
def complete(item):
name = item.rule.origin
return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name]
def predict_and_complete(column):
while True:
to_predict = {x.expect for x in column.to_predict.get_news()
if x.ptr} # if not part of an already predicted batch
to_reduce = set(column.to_reduce.get_news())
if not (to_predict or to_reduce):
break
for nonterm in to_predict:
column.add( predict(nonterm, column) )
for item in to_reduce:
new_items = list(complete(item))
if item in new_items:
raise ParseError('Infinite recursion detected! (rule %s)' % item.rule)
column.add(new_items)
def scan(i, token, column):
next_set = Column(i, self.FIRST)
next_set.add(item.advance(token) for item in column.to_scan if match(item.expect, token))
if not next_set:
expect = {i.expect.name for i in column.to_scan}
raise UnexpectedToken(token, expect, considered_rules=set(column.to_scan))
return next_set
# Main loop starts
column0 = Column(0, self.FIRST)
column0.add(predict(start_symbol, column0))
column = column0
for i, token in enumerate(stream):
predict_and_complete(column)
column = scan(i, token, column)
predict_and_complete(column)
# Parse ended. Now build a parse tree
solutions = [n.tree for n in column.to_reduce
if n.rule.origin==start_symbol and n.start is column0]
if not solutions:
raise ParseError('Incomplete parse: Could not find a solution to input')
elif len(solutions) == 1:
tree = solutions[0]
else:
tree = Tree('_ambig', solutions)
if self.resolve_ambiguity:
tree = self.resolve_ambiguity(tree)
return ApplyCallbacks(self.postprocess).transform(tree)
class ApplyCallbacks(Transformer_InPlace):
def __init__(self, postprocess):
self.postprocess = postprocess
@v_args(meta=True)
def drv(self, children, meta):
return self.postprocess[meta.rule](children)

View File

@@ -0,0 +1,148 @@
from ..utils import bfs, fzset, classify
from ..exceptions import GrammarError
from ..grammar import Rule, Terminal, NonTerminal
class RulePtr(object):
__slots__ = ('rule', 'index')
def __init__(self, rule, index):
assert isinstance(rule, Rule)
assert index <= len(rule.expansion)
self.rule = rule
self.index = index
def __repr__(self):
before = self.rule.expansion[:self.index]
after = self.rule.expansion[self.index:]
return '<%s : %s * %s>' % (self.rule.origin, ' '.join(map(str, before)), ' '.join(map(str, after)))
@property
def next(self):
return self.rule.expansion[self.index]
def advance(self, sym):
assert self.next == sym
return RulePtr(self.rule, self.index+1)
@property
def is_satisfied(self):
return self.index == len(self.rule.expansion)
def __eq__(self, other):
return self.rule == other.rule and self.index == other.index
def __hash__(self):
return hash((self.rule, self.index))
def update_set(set1, set2):
if not set2:
return False
copy = set(set1)
set1 |= set2
return set1 != copy
def calculate_sets(rules):
"""Calculate FOLLOW sets.
Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets"""
symbols = {sym for rule in rules for sym in rule.expansion} | {rule.origin for rule in rules}
# foreach grammar rule X ::= Y(1) ... Y(k)
# if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
# NULLABLE = NULLABLE union {X}
# for i = 1 to k
# if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
# FIRST(X) = FIRST(X) union FIRST(Y(i))
# for j = i+1 to k
# if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then
# FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
# if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
# FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
# until none of NULLABLE,FIRST,FOLLOW changed in last iteration
NULLABLE = set()
FIRST = {}
FOLLOW = {}
for sym in symbols:
FIRST[sym]={sym} if sym.is_term else set()
FOLLOW[sym]=set()
# Calculate NULLABLE and FIRST
changed = True
while changed:
changed = False
for rule in rules:
if set(rule.expansion) <= NULLABLE:
if update_set(NULLABLE, {rule.origin}):
changed = True
for i, sym in enumerate(rule.expansion):
if set(rule.expansion[:i]) <= NULLABLE:
if update_set(FIRST[rule.origin], FIRST[sym]):
changed = True
# Calculate FOLLOW
changed = True
while changed:
changed = False
for rule in rules:
for i, sym in enumerate(rule.expansion):
if i==len(rule.expansion)-1 or set(rule.expansion[i:]) <= NULLABLE:
if update_set(FOLLOW[sym], FOLLOW[rule.origin]):
changed = True
for j in range(i+1, len(rule.expansion)):
if set(rule.expansion[i+1:j]) <= NULLABLE:
if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]):
changed = True
return FIRST, FOLLOW, NULLABLE
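# Worked example (an assumed toy grammar, for illustration only):
#   S -> A 'b'        A -> 'a'        A -> (empty)
# gives NULLABLE = {A}, FIRST(A) = {'a'}, FIRST(S) = {'a', 'b'} (because A is nullable,
# FIRST('b') also flows into FIRST(S)), and FOLLOW(A) = {'b'}.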
class GrammarAnalyzer(object):
def __init__(self, parser_conf, debug=False):
self.debug = debug
rules = parser_conf.rules + [Rule(NonTerminal('$root'), [NonTerminal(parser_conf.start), Terminal('$END')])]
self.rules_by_origin = classify(rules, lambda r: r.origin)
assert len(rules) == len(set(rules))
for r in rules:
for sym in r.expansion:
if not (sym.is_term or sym in self.rules_by_origin):
raise GrammarError("Using an undefined rule: %s" % sym) # TODO test validation
self.start_state = self.expand_rule(NonTerminal('$root'))
self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)
def expand_rule(self, rule):
"Returns all init_ptrs accessible by rule (recursive)"
init_ptrs = set()
def _expand_rule(rule):
assert not rule.is_term, rule
for r in self.rules_by_origin[rule]:
init_ptr = RulePtr(r, 0)
init_ptrs.add(init_ptr)
if r.expansion: # if not empty rule
new_r = init_ptr.next
if not new_r.is_term:
yield new_r
for _ in bfs([rule], _expand_rule):
pass
return fzset(init_ptrs)
def _first(self, r):
if r.is_term:
return {r}
else:
return {rp.next for rp in self.expand_rule(r) if rp.next.is_term}

View File

@@ -0,0 +1,108 @@
"""This module builds a LALR(1) transition-table for lalr_parser.py
For now, shift/reduce conflicts are automatically resolved as shifts.
"""
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
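# Example of the shift-preference policy (classic dangling-else case, illustration only):
# in a state containing both  stmt : "if" expr "then" stmt *  and
# stmt : "if" expr "then" stmt * "else" stmt , the lookahead "else" yields one Reduce
# and one Shift action; keeping the Shift binds the "else" to the nearest "if".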
import logging
from collections import defaultdict
from ..utils import classify, classify_bool, bfs, fzset
from ..exceptions import GrammarError
from .grammar_analysis import GrammarAnalyzer, Terminal
class Action:
def __init__(self, name):
self.name = name
def __str__(self):
return self.name
def __repr__(self):
return str(self)
Shift = Action('Shift')
Reduce = Action('Reduce')
class ParseTable:
def __init__(self, states, start_state, end_state):
self.states = states
self.start_state = start_state
self.end_state = end_state
class IntParseTable(ParseTable):
@classmethod
def from_ParseTable(cls, parse_table):
enum = list(parse_table.states)
state_to_idx = {s:i for i,s in enumerate(enum)}
int_states = {}
for s, la in parse_table.states.items():
la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v
for k,v in la.items()}
int_states[ state_to_idx[s] ] = la
start_state = state_to_idx[parse_table.start_state]
end_state = state_to_idx[parse_table.end_state]
return cls(int_states, start_state, end_state)
class LALR_Analyzer(GrammarAnalyzer):
def compute_lookahead(self):
self.end_states = []
self.states = {}
def step(state):
lookahead = defaultdict(list)
sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied)
for rp in sat:
for term in self.FOLLOW.get(rp.rule.origin, ()):
lookahead[term].append((Reduce, rp.rule))
d = classify(unsat, lambda rp: rp.next)
for sym, rps in d.items():
rps = {rp.advance(sym) for rp in rps}
for rp in set(rps):
if not rp.is_satisfied and not rp.next.is_term:
rps |= self.expand_rule(rp.next)
new_state = fzset(rps)
lookahead[sym].append((Shift, new_state))
if sym == Terminal('$END'):
self.end_states.append( new_state )
yield new_state
for k, v in lookahead.items():
if len(v) > 1:
if self.debug:
logging.warning("Shift/reduce conflict for %s: %s. Resolving as shift.", k, v)
for x in v:
# XXX resolving shift/reduce into shift, like PLY
# Give a proper warning
if x[0] is Shift:
lookahead[k] = [x]
for k, v in lookahead.items():
if len(v) != 1:
raise GrammarError("Collision in %s: %s" %(k, ', '.join(['\n * %s: %s' % x for x in v])))
self.states[state] = {k.name:v[0] for k, v in lookahead.items()}
for _ in bfs([self.start_state], step):
pass
self.end_state ,= self.end_states
self._parse_table = ParseTable(self.states, self.start_state, self.end_state)
if self.debug:
self.parse_table = self._parse_table
else:
self.parse_table = IntParseTable.from_ParseTable(self._parse_table)

View File

@@ -0,0 +1,90 @@
"""This module implements a LALR(1) Parser
"""
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
from ..exceptions import UnexpectedToken
from .lalr_analysis import LALR_Analyzer, Shift
class Parser:
def __init__(self, parser_conf):
assert all(r.options is None or r.options.priority is None
for r in parser_conf.rules), "LALR doesn't yet support prioritization"
analysis = LALR_Analyzer(parser_conf)
analysis.compute_lookahead()
callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None)
for rule in parser_conf.rules}
self._parse_table = analysis.parse_table
self.parser_conf = parser_conf
self.parser = _Parser(analysis.parse_table, callbacks)
self.parse = self.parser.parse
###{standalone
class _Parser:
def __init__(self, parse_table, callbacks):
self.states = parse_table.states
self.start_state = parse_table.start_state
self.end_state = parse_table.end_state
self.callbacks = callbacks
def parse(self, seq, set_state=None):
i = 0
token = None
stream = iter(seq)
states = self.states
state_stack = [self.start_state]
value_stack = []
if set_state: set_state(self.start_state)
def get_action(key):
state = state_stack[-1]
try:
return states[state][key]
except KeyError:
expected = states[state].keys()
raise UnexpectedToken(token, expected, state=state) # TODO filter out rules from expected
def reduce(rule):
size = len(rule.expansion)
if size:
s = value_stack[-size:]
del state_stack[-size:]
del value_stack[-size:]
else:
s = []
value = self.callbacks[rule](s)
_action, new_state = get_action(rule.origin.name)
assert _action is Shift
state_stack.append(new_state)
value_stack.append(value)
# Main LALR-parser loop
for i, token in enumerate(stream):
while True:
action, arg = get_action(token.type)
assert arg != self.end_state
if action is Shift:
state_stack.append(arg)
value_stack.append(token)
if set_state: set_state(arg)
break # next token
else:
reduce(arg)
while True:
_action, arg = get_action('$END')
if _action is Shift:
assert arg == self.end_state
val ,= value_stack
return val
else:
reduce(arg)
###}
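# Rough trace of the loop above (hypothetical grammar  sum : sum PLUS NUM | NUM , input
# "1 + 2", illustration only): NUM(1) is shifted; on PLUS the table orders a reduce of
# sum : NUM, then PLUS and NUM(2) are shifted; at $END  sum : sum PLUS NUM  is reduced,
# leaving a single value on value_stack, which is returned.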

View File

@@ -0,0 +1,109 @@
from ..utils import compare
from functools import cmp_to_key
from ..tree import Tree
# Standard ambiguity resolver (uses comparison)
#
# Author: Erez Sh
def _compare_rules(rule1, rule2):
return -compare( len(rule1.expansion), len(rule2.expansion))
def _sum_priority(tree):
p = 0
for n in tree.iter_subtrees():
try:
p += n.meta.rule.options.priority or 0
except AttributeError:
pass
return p
def _compare_priority(tree1, tree2):
tree1.iter_subtrees()
def _compare_drv(tree1, tree2):
try:
rule1 = tree1.meta.rule
except AttributeError:
rule1 = None
try:
rule2 = tree2.meta.rule
except AttributeError:
rule2 = None
if None == rule1 == rule2:
return compare(tree1, tree2)
elif rule1 is None:
return -1
elif rule2 is None:
return 1
assert tree1.data != '_ambig'
assert tree2.data != '_ambig'
p1 = _sum_priority(tree1)
p2 = _sum_priority(tree2)
c = (p1 or p2) and compare(p1, p2)
if c:
return c
c = _compare_rules(tree1.meta.rule, tree2.meta.rule)
if c:
return c
# rules are "equal", so compare trees
if len(tree1.children) == len(tree2.children):
for t1, t2 in zip(tree1.children, tree2.children):
c = _compare_drv(t1, t2)
if c:
return c
return compare(len(tree1.children), len(tree2.children))
def _standard_resolve_ambig(tree):
assert tree.data == '_ambig'
key_f = cmp_to_key(_compare_drv)
best = max(tree.children, key=key_f)
assert best.data == 'drv'
tree.set('drv', best.children)
tree.meta.rule = best.meta.rule # needed for applying callbacks
def standard_resolve_ambig(tree):
for ambig in tree.find_data('_ambig'):
_standard_resolve_ambig(ambig)
return tree
# Anti-score Sum
#
# Author: Uriva (https://github.com/uriva)
def _antiscore_sum_drv(tree):
if not isinstance(tree, Tree):
return 0
assert tree.data != '_ambig'
return _sum_priority(tree)
def _antiscore_sum_resolve_ambig(tree):
assert tree.data == '_ambig'
best = min(tree.children, key=_antiscore_sum_drv)
assert best.data == 'drv'
tree.set('drv', best.children)
tree.meta.rule = best.meta.rule # needed for applying callbacks
def antiscore_sum_resolve_ambig(tree):
for ambig in tree.find_data('_ambig'):
_antiscore_sum_resolve_ambig(ambig)
return tree

View File

@@ -0,0 +1,156 @@
"This module implements an experimental Earley Parser with a dynamic lexer"
# The parser uses a parse-forest to keep track of derivations and ambiguities.
# When the parse ends successfully, a disambiguation stage resolves all ambiguity
# (right now ambiguity resolution is not developed beyond the needs of lark)
# Afterwards the parse tree is reduced (transformed) according to user callbacks.
# I use the no-recursion version of Transformer and Visitor, because the tree might be
# deeper than Python's recursion limit (a bit absurd, but that's life)
#
# The algorithm keeps track of each state set, using a corresponding Column instance.
# Column keeps track of new items using NewsList instances.
#
# Instead of running a lexer beforehand, or using a costly char-by-char method, this parser
# matches terminals with regular expressions as it scans, achieving high performance while
# maintaining all of Earley's power in parsing any CFG.
#
#
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
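# One point worth spelling out about the scanning scheme below: a terminal regexp that
# matches characters i..j-1 of the text stores its advanced items in delayed_matches[j],
# and those items only enter the chart once the main loop reaches column j, so the
# columns still advance strictly one character at a time.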
from collections import defaultdict
from ..exceptions import ParseError, UnexpectedCharacters
from ..lexer import Token
from ..tree import Tree
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal, Terminal
from .earley import ApplyCallbacks, Item, Column
class Parser:
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False, complete_lex=False):
self.analysis = GrammarAnalyzer(parser_conf)
self.parser_conf = parser_conf
self.resolve_ambiguity = resolve_ambiguity
self.ignore = [Terminal(t) for t in ignore]
self.predict_all = predict_all
self.complete_lex = complete_lex
self.FIRST = self.analysis.FIRST
self.postprocess = {}
self.predictions = {}
for rule in parser_conf.rules:
self.postprocess[rule] = getattr(parser_conf.callback, rule.alias)
self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
self.term_matcher = term_matcher
def parse(self, stream, start_symbol=None):
# Define parser functions
start_symbol = NonTerminal(start_symbol or self.parser_conf.start)
delayed_matches = defaultdict(list)
match = self.term_matcher
text_line = 1
text_column = 1
def predict(nonterm, column):
assert not nonterm.is_term, nonterm
return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
def complete(item):
name = item.rule.origin
return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name]
def predict_and_complete(column):
while True:
to_predict = {x.expect for x in column.to_predict.get_news()
if x.ptr} # if not part of an already predicted batch
to_reduce = column.to_reduce.get_news()
if not (to_predict or to_reduce):
break
for nonterm in to_predict:
column.add( predict(nonterm, column) )
for item in to_reduce:
new_items = list(complete(item))
if item in new_items:
raise ParseError('Infinite recursion detected! (rule %s)' % item.rule)
column.add(new_items)
def scan(i, column):
to_scan = column.to_scan
for x in self.ignore:
m = match(x, stream, i)
if m:
delayed_matches[m.end()] += set(to_scan)
delayed_matches[m.end()] += set(column.to_reduce)
# TODO add partial matches for ignore too?
# s = m.group(0)
# for j in range(1, len(s)):
# m = x.match(s[:-j])
# if m:
# delayed_matches[m.end()] += to_scan
for item in to_scan:
m = match(item.expect, stream, i)
if m:
t = Token(item.expect.name, m.group(0), i, text_line, text_column)
delayed_matches[m.end()].append(item.advance(t))
if self.complete_lex:
s = m.group(0)
for j in range(1, len(s)):
m = match(item.expect, s[:-j])
if m:
t = Token(item.expect.name, m.group(0), i, text_line, text_column)
delayed_matches[i+m.end()].append(item.advance(t))
next_set = Column(i+1, self.FIRST, predict_all=self.predict_all)
next_set.add(delayed_matches[i+1])
del delayed_matches[i+1] # No longer needed, so unburden memory
if not next_set and not delayed_matches:
raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect for item in to_scan}, set(to_scan))
return next_set
# Main loop starts
column0 = Column(0, self.FIRST, predict_all=self.predict_all)
column0.add(predict(start_symbol, column0))
column = column0
for i, token in enumerate(stream):
predict_and_complete(column)
column = scan(i, column)
if token == '\n':
text_line += 1
text_column = 1
else:
text_column += 1
predict_and_complete(column)
# Parse ended. Now build a parse tree
solutions = [n.tree for n in column.to_reduce
if n.rule.origin==start_symbol and n.start is column0]
if not solutions:
expected_tokens = [t.expect for t in column.to_scan]
raise ParseError('Unexpected end of input! Expecting one of the following terminals: %s' % expected_tokens)
elif len(solutions) == 1:
tree = solutions[0]
else:
tree = Tree('_ambig', solutions)
if self.resolve_ambiguity:
tree = self.resolve_ambiguity(tree)
return ApplyCallbacks(self.postprocess).transform(tree)

View File

@@ -0,0 +1,129 @@
from collections import defaultdict
from .tree import Tree
from .visitors import Transformer_InPlace
from .common import ParserConf, PatternStr
from .lexer import Token
from .parsers import earley, resolve_ambig
from .grammar import Rule, Terminal, NonTerminal
def is_discarded_terminal(t):
return t.is_term and t.filter_out
def is_iter_empty(i):
try:
_ = next(i)
return False
except StopIteration:
return True
class WriteTokensTransformer(Transformer_InPlace):
def __init__(self, tokens):
self.tokens = tokens
def __default__(self, data, children, meta):
# if not isinstance(t, MatchTree):
# return t
if not getattr(meta, 'match_tree', False):
return Tree(data, children)
iter_args = iter(children)
to_write = []
for sym in meta.orig_expansion:
if is_discarded_terminal(sym):
t = self.tokens[sym.name]
assert isinstance(t.pattern, PatternStr)
to_write.append(t.pattern.value)
else:
x = next(iter_args)
if isinstance(x, list):
to_write += x
else:
if isinstance(x, Token):
assert Terminal(x.type) == sym, x
else:
assert NonTerminal(x.data) == sym, (sym, x)
to_write.append(x)
assert is_iter_empty(iter_args)
return to_write
class MatchTree(Tree):
pass
class MakeMatchTree:
def __init__(self, name, expansion):
self.name = name
self.expansion = expansion
def __call__(self, args):
t = MatchTree(self.name, args)
t.meta.match_tree = True
t.meta.orig_expansion = self.expansion
return t
class Reconstructor:
def __init__(self, parser):
# XXX TODO calling compile twice returns different results!
tokens, rules, _grammar_extra = parser.grammar.compile()
self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens})
self.rules = list(self._build_recons_rules(rules))
def _build_recons_rules(self, rules):
expand1s = {r.origin for r in rules if r.options and r.options.expand1}
aliases = defaultdict(list)
for r in rules:
if r.alias:
aliases[r.origin].append( r.alias )
rule_names = {r.origin for r in rules}
nonterminals = {sym for sym in rule_names
if sym.name.startswith('_') or sym in expand1s or sym in aliases }
for r in rules:
recons_exp = [sym if sym in nonterminals else Terminal(sym.name)
for sym in r.expansion if not is_discarded_terminal(sym)]
# Skip self-recursive constructs
if recons_exp == [r.origin]:
continue
sym = NonTerminal(r.alias) if r.alias else r.origin
yield Rule(sym, recons_exp, MakeMatchTree(sym.name, r.expansion))
for origin, rule_aliases in aliases.items():
for alias in rule_aliases:
yield Rule(origin, [Terminal(alias)], MakeMatchTree(origin.name, [NonTerminal(alias)]))
yield Rule(origin, [Terminal(origin.name)], MakeMatchTree(origin.name, [origin]))
def _match(self, term, token):
if isinstance(token, Tree):
return Terminal(token.data) == term
elif isinstance(token, Token):
return term == Terminal(token.type)
assert False
def _reconstruct(self, tree):
# TODO: ambiguity?
parser = earley.Parser(ParserConf(self.rules, None, tree.data), self._match, resolve_ambiguity=resolve_ambig.standard_resolve_ambig)
unreduced_tree = parser.parse(tree.children) # find a full derivation
assert unreduced_tree.data == tree.data
res = self.write_tokens.transform(unreduced_tree)
for item in res:
if isinstance(item, Tree):
for x in self._reconstruct(item):
yield x
else:
yield item
def reconstruct(self, tree):
return ''.join(self._reconstruct(tree))
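# Minimal usage sketch (illustration only; `grammar` and `text` are assumed inputs):
#   parser = Lark(grammar)
#   tree = parser.parse(text)
#   new_text = Reconstructor(parser).reconstruct(tree)
# The reconstructed text follows the tree's structure; whitespace that was ignored
# during parsing is not recovered.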

View File

View File

@@ -0,0 +1,186 @@
"Converts between Lark and Nearley grammars. Work in progress!"
import os.path
import sys
import codecs
from lark import Lark, InlineTransformer
nearley_grammar = r"""
start: (ruledef|directive)+
directive: "@" NAME (STRING|NAME)
| "@" JS -> js_code
ruledef: NAME "->" expansions
| NAME REGEXP "->" expansions -> macro
expansions: expansion ("|" expansion)*
expansion: expr+ js
?expr: item [":" /[+*?]/]
?item: rule|string|regexp
| "(" expansions ")"
rule: NAME
string: STRING
regexp: REGEXP
JS: /{%.*?%}/s
js: JS?
NAME: /[a-zA-Z_$]\w*/
COMMENT: /#[^\n]*/
REGEXP: /\[.*?\]/
STRING: /".*?"/
%import common.WS
%ignore WS
%ignore COMMENT
"""
nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='standard')
def _get_rulename(name):
name = {'_': '_ws_maybe', '__':'_ws'}.get(name, name)
return 'n_' + name.replace('$', '__DOLLAR__').lower()
class NearleyToLark(InlineTransformer):
def __init__(self):
self._count = 0
self.extra_rules = {}
self.extra_rules_rev = {}
self.alias_js_code = {}
def _new_function(self, code):
name = 'alias_%d' % self._count
self._count += 1
self.alias_js_code[name] = code
return name
def _extra_rule(self, rule):
if rule in self.extra_rules_rev:
return self.extra_rules_rev[rule]
name = 'xrule_%d' % len(self.extra_rules)
assert name not in self.extra_rules
self.extra_rules[name] = rule
self.extra_rules_rev[rule] = name
return name
def rule(self, name):
return _get_rulename(name)
def ruledef(self, name, exps):
return '!%s: %s' % (_get_rulename(name), exps)
def expr(self, item, op):
rule = '(%s)%s' % (item, op)
return self._extra_rule(rule)
def regexp(self, r):
return '/%s/' % r
def string(self, s):
return self._extra_rule(s)
def expansion(self, *x):
x, js = x[:-1], x[-1]
if js.children:
js_code ,= js.children
js_code = js_code[2:-2]
alias = '-> ' + self._new_function(js_code)
else:
alias = ''
return ' '.join(x) + alias
def expansions(self, *x):
return '%s' % ('\n |'.join(x))
def start(self, *rules):
return '\n'.join(filter(None, rules))
def _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, includes):
rule_defs = []
tree = nearley_grammar_parser.parse(g)
for statement in tree.children:
if statement.data == 'directive':
directive, arg = statement.children
if directive in ('builtin', 'include'):
folder = builtin_path if directive == 'builtin' else folder_path
path = os.path.join(folder, arg[1:-1])
if path not in includes:
includes.add(path)
with codecs.open(path, encoding='utf8') as f:
text = f.read()
rule_defs += _nearley_to_lark(text, builtin_path, n2l, js_code, os.path.abspath(os.path.dirname(path)), includes)
else:
assert False, directive
elif statement.data == 'js_code':
code ,= statement.children
code = code[2:-2]
js_code.append(code)
elif statement.data == 'macro':
pass # TODO Add support for macros!
elif statement.data == 'ruledef':
rule_defs.append( n2l.transform(statement) )
else:
raise Exception("Unknown statement: %s" % statement)
return rule_defs
def create_code_for_nearley_grammar(g, start, builtin_path, folder_path):
import js2py
emit_code = []
def emit(x=None):
if x:
emit_code.append(x)
emit_code.append('\n')
js_code = ['function id(x) {return x[0];}']
n2l = NearleyToLark()
rule_defs = _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, set())
lark_g = '\n'.join(rule_defs)
lark_g += '\n'+'\n'.join('!%s: %s' % item for item in n2l.extra_rules.items())
emit('from lark import Lark, Transformer')
emit()
emit('grammar = ' + repr(lark_g))
emit()
for alias, code in n2l.alias_js_code.items():
js_code.append('%s = (%s);' % (alias, code))
emit(js2py.translate_js('\n'.join(js_code)))
emit('class TransformNearley(Transformer):')
for alias in n2l.alias_js_code:
emit(" %s = var.get('%s').to_python()" % (alias, alias))
emit(" __default__ = lambda self, n, c, m: c if c else None")
emit()
emit('parser = Lark(grammar, start="n_%s")' % start)
emit('def parse(text):')
emit(' return TransformNearley().transform(parser.parse(text))')
return ''.join(emit_code)
def main(fn, start, nearley_lib):
with codecs.open(fn, encoding='utf8') as f:
grammar = f.read()
return create_code_for_nearley_grammar(grammar, start, os.path.join(nearley_lib, 'builtin'), os.path.abspath(os.path.dirname(fn)))
if __name__ == '__main__':
if len(sys.argv) < 4:
print("Reads Nearley grammar (with js functions) outputs an equivalent lark parser.")
print("Usage: %s <nearley_grammar_path> <start_rule> <nearley_lib_path>" % sys.argv[0])
sys.exit(1)
fn, start, nearley_lib = sys.argv[1:]
print(main(fn, start, nearley_lib))

View File

@@ -0,0 +1,7 @@
# This file used to contain the Lark standalone tool.
#
# We do not use it, and it is licensed under the GPL, which is much
# more restrictive than the rest of Lark. In order to avoid depending
# on it accidentally, we exclude it from our repository and distribution.
# When LARK is upgraded, this file should be kept in preference to the
# original.

View File

@@ -0,0 +1,162 @@
try:
from future_builtins import filter
except ImportError:
pass
from copy import deepcopy
class Meta:
pass
###{standalone
class Tree(object):
def __init__(self, data, children, meta=None):
self.data = data
self.children = children
self._meta = meta
@property
def meta(self):
if self._meta is None:
self._meta = Meta()
return self._meta
def __repr__(self):
return 'Tree(%s, %s)' % (self.data, self.children)
def _pretty_label(self):
return self.data
def _pretty(self, level, indent_str):
if len(self.children) == 1 and not isinstance(self.children[0], Tree):
return [ indent_str*level, self._pretty_label(), '\t', '%s' % (self.children[0],), '\n']
l = [ indent_str*level, self._pretty_label(), '\n' ]
for n in self.children:
if isinstance(n, Tree):
l += n._pretty(level+1, indent_str)
else:
l += [ indent_str*(level+1), '%s' % (n,), '\n' ]
return l
def pretty(self, indent_str=' '):
return ''.join(self._pretty(0, indent_str))
###}
def expand_kids_by_index(self, *indices):
"Expand (inline) children at the given indices"
for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices
kid = self.children[i]
self.children[i:i+1] = kid.children
def __eq__(self, other):
try:
return self.data == other.data and self.children == other.children
except AttributeError:
return False
def __ne__(self, other):
return not (self == other)
def __hash__(self):
return hash((self.data, tuple(self.children)))
def find_pred(self, pred):
"Find all nodes where pred(tree) == True"
return filter(pred, self.iter_subtrees())
def find_data(self, data):
"Find all nodes where tree.data == data"
return self.find_pred(lambda t: t.data == data)
def scan_values(self, pred):
for c in self.children:
if isinstance(c, Tree):
for t in c.scan_values(pred):
yield t
else:
if pred(c):
yield c
def iter_subtrees(self):
# TODO: Re-write as a more efficient version
visited = set()
q = [self]
l = []
while q:
subtree = q.pop()
l.append( subtree )
if id(subtree) in visited:
continue # already been here from another branch
visited.add(id(subtree))
q += [c for c in subtree.children if isinstance(c, Tree)]
seen = set()
for x in reversed(l):
if id(x) not in seen:
yield x
seen.add(id(x))
def __deepcopy__(self, memo):
return type(self)(self.data, deepcopy(self.children, memo))
def copy(self):
return type(self)(self.data, self.children)
def set(self, data, children):
self.data = data
self.children = children
# XXX Deprecated! Here for backwards compatibility <0.6.0
@property
def line(self):
return self.meta.line
@property
def column(self):
return self.meta.column
@property
def end_line(self):
return self.meta.end_line
@property
def end_column(self):
return self.meta.end_column
class SlottedTree(Tree):
__slots__ = 'data', 'children', 'rule', '_meta'
def pydot__tree_to_png(tree, filename):
"Creates a colorful image that represents the tree (data+children, without meta)"
import pydot
graph = pydot.Dot(graph_type='digraph', rankdir="LR")
i = [0]
def new_leaf(leaf):
node = pydot.Node(i[0], label=repr(leaf))
i[0] += 1
graph.add_node(node)
return node
def _to_pydot(subtree):
color = hash(subtree.data) & 0xffffff
color |= 0x808080
subnodes = [_to_pydot(child) if isinstance(child, Tree) else new_leaf(child)
for child in subtree.children]
node = pydot.Node(i[0], style="filled", fillcolor="#%x"%color, label=subtree.data)
i[0] += 1
graph.add_node(node)
for subnode in subnodes:
graph.add_edge(pydot.Edge(node, subnode))
return node
_to_pydot(tree)
graph.write_png(filename)
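# Usage sketch (requires the optional `pydot` package; `parser` and `text` are assumed):
#   pydot__tree_to_png(parser.parse(text), 'tree.png')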

View File

@@ -0,0 +1,127 @@
from collections import deque
import sys
class fzset(frozenset):
def __repr__(self):
return '{%s}' % ', '.join(map(repr, self))
def classify_bool(seq, pred):
true_elems = []
false_elems = []
for elem in seq:
if pred(elem):
true_elems.append(elem)
else:
false_elems.append(elem)
return true_elems, false_elems
def classify(seq, key=None, value=None):
d = {}
for item in seq:
k = key(item) if (key is not None) else item
v = value(item) if (value is not None) else item
if k in d:
d[k].append(v)
else:
d[k] = [v]
return d
def bfs(initial, expand):
open_q = deque(list(initial))
visited = set(open_q)
while open_q:
node = open_q.popleft()
yield node
for next_node in expand(node):
if next_node not in visited:
visited.add(next_node)
open_q.append(next_node)
try:
STRING_TYPE = basestring
except NameError: # Python 3
STRING_TYPE = str
###{standalone
import types
from functools import wraps, partial
from contextlib import contextmanager
Str = type(u'')
def smart_decorator(f, create_decorator):
if isinstance(f, types.FunctionType):
return wraps(f)(create_decorator(f, True))
elif isinstance(f, (type, types.BuiltinFunctionType)):
return wraps(f)(create_decorator(f, False))
elif isinstance(f, types.MethodType):
return wraps(f)(create_decorator(f.__func__, True))
elif isinstance(f, partial):
# wraps does not work for partials in 2.7: https://bugs.python.org/issue3445
return create_decorator(f.__func__, True)
else:
return create_decorator(f.__func__.__call__, True)
try:
from contextlib import suppress # Python 3
except ImportError:
@contextmanager
def suppress(*excs):
'''Catch and dismiss the provided exception
>>> x = 'hello'
>>> with suppress(IndexError):
... x = x[10]
>>> x
'hello'
'''
try:
yield
except excs:
pass
###}
try:
compare = cmp
except NameError:
def compare(a, b):
if a == b:
return 0
elif a > b:
return 1
return -1
def get_regexp_width(regexp):
# in 3.11 sre_parse was replaced with re._parser
# see implementation in https://github.com/python/cpython/blob/3.11/Lib/sre_parse.py
if sys.version_info >= (3, 11):
import re
try:
return re._parser.parse(regexp).getwidth()
except re.error:
raise ValueError(regexp)
else:
import sre_constants
import sre_parse
try:
return sre_parse.parse(regexp).getwidth()
except sre_constants.error:
raise ValueError(regexp)
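# Examples (illustrative):
#   get_regexp_width('ab?') == (1, 2)
#   get_regexp_width('a{2,5}') == (2, 5)
# Unbounded patterns such as 'a+' report a very large maximum width.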

View File

@@ -0,0 +1,250 @@
from inspect import getmembers, getmro
from functools import wraps
from .utils import smart_decorator
from .tree import Tree
class Discard(Exception):
pass
# Transformers
class Transformer:
"""Visits the tree recursively, starting with the leaves and finally the root (bottom-up)
Calls its methods (provided by user via inheritance) according to tree.data
The returned value replaces the old one in the structure.
Can be used to implement map or reduce.
"""
def _call_userfunc(self, tree, new_children=None):
# Assumes tree is already transformed
children = new_children if new_children is not None else tree.children
try:
f = getattr(self, tree.data)
except AttributeError:
return self.__default__(tree.data, children, tree.meta)
else:
if getattr(f, 'meta', False):
return f(children, tree.meta)
elif getattr(f, 'inline', False):
return f(*children)
elif getattr(f, 'whole_tree', False):
if new_children is not None:
raise NotImplementedError("Doesn't work with the base Transformer class")
return f(tree)
else:
return f(children)
def _transform_children(self, children):
for c in children:
try:
yield self._transform_tree(c) if isinstance(c, Tree) else c
except Discard:
pass
def _transform_tree(self, tree):
children = list(self._transform_children(tree.children))
return self._call_userfunc(tree, children)
def transform(self, tree):
return self._transform_tree(tree)
def __mul__(self, other):
return TransformerChain(self, other)
def __default__(self, data, children, meta):
"Default operation on tree (for override)"
return Tree(data, children, meta)
@classmethod
def _apply_decorator(cls, decorator, **kwargs):
mro = getmro(cls)
assert mro[0] is cls
libmembers = {name for _cls in mro[1:] for name, _ in getmembers(_cls)}
for name, value in getmembers(cls):
if name.startswith('_') or name in libmembers:
continue
setattr(cls, name, decorator(value, **kwargs))
return cls
class InlineTransformer(Transformer): # XXX Deprecated
def _call_userfunc(self, tree, new_children=None):
# Assumes tree is already transformed
children = new_children if new_children is not None else tree.children
try:
f = getattr(self, tree.data)
except AttributeError:
return self.__default__(tree.data, children, tree.meta)
else:
return f(*children)
class TransformerChain(object):
def __init__(self, *transformers):
self.transformers = transformers
def transform(self, tree):
for t in self.transformers:
tree = t.transform(tree)
return tree
def __mul__(self, other):
return TransformerChain(*self.transformers + (other,))
class Transformer_InPlace(Transformer):
"Non-recursive. Changes the tree in-place instead of returning new instances"
def _transform_tree(self, tree): # Cancel recursion
return self._call_userfunc(tree)
def transform(self, tree):
for subtree in tree.iter_subtrees():
subtree.children = list(self._transform_children(subtree.children))
return self._transform_tree(tree)
class Transformer_InPlaceRecursive(Transformer):
"Recursive. Changes the tree in-place instead of returning new instances"
def _transform_tree(self, tree):
tree.children = list(self._transform_children(tree.children))
return self._call_userfunc(tree)
# Visitors
class VisitorBase:
def _call_userfunc(self, tree):
return getattr(self, tree.data, self.__default__)(tree)
def __default__(self, tree):
"Default operation on tree (for override)"
return tree
class Visitor(VisitorBase):
"""Bottom-up visitor, non-recursive
Visits the tree, starting with the leaves and finally the root (bottom-up)
Calls its methods (provided by user via inheritance) according to tree.data
"""
def visit(self, tree):
for subtree in tree.iter_subtrees():
self._call_userfunc(subtree)
return tree
class Visitor_Recursive(VisitorBase):
"""Bottom-up visitor, recursive
Visits the tree, starting with the leaves and finally the root (bottom-up)
Calls its methods (provided by user via inheritance) according to tree.data
"""
def visit(self, tree):
for child in tree.children:
if isinstance(child, Tree):
self.visit(child)
f = getattr(self, tree.data, self.__default__)
f(tree)
return tree
def visit_children_decor(func):
"See Interpreter"
@wraps(func)
def inner(cls, tree):
values = cls.visit_children(tree)
return func(cls, values)
return inner
class Interpreter:
"""Top-down visitor, recursive
Visits the tree, starting with the root and finally the leaves (top-down)
Calls its methods (provided by user via inheritance) according to tree.data
Unlike Transformer and Visitor, the Interpreter doesn't automatically visit its sub-branches.
The user has to explicitly call visit_children, or use the @visit_children_decor
"""
def visit(self, tree):
return getattr(self, tree.data)(tree)
def visit_children(self, tree):
return [self.visit(child) if isinstance(child, Tree) else child
for child in tree.children]
def __getattr__(self, name):
return self.__default__
def __default__(self, tree):
return self.visit_children(tree)
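# Minimal sketch of the explicit-descent style (hypothetical subclass, illustration only):
#   class CalcInterpreter(Interpreter):
#       def add(self, tree):
#           left, right = self.visit_children(tree)   # descend explicitly
#           return left + right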
# Decorators
def _apply_decorator(obj, decorator, **kwargs):
try:
_apply = obj._apply_decorator
except AttributeError:
return decorator(obj, **kwargs)
else:
return _apply(decorator, **kwargs)
def _inline_args__func(func):
@wraps(func)
def create_decorator(_f, with_self):
if with_self:
def f(self, children):
return _f(self, *children)
else:
def f(self, children):
return _f(*children)
return f
return smart_decorator(func, create_decorator)
def inline_args(obj): # XXX Deprecated
return _apply_decorator(obj, _inline_args__func)
def _visitor_args_func_dec(func, inline=False, meta=False, whole_tree=False):
assert [whole_tree, meta, inline].count(True) <= 1
def create_decorator(_f, with_self):
if with_self:
def f(self, *args, **kwargs):
return _f(self, *args, **kwargs)
else:
def f(self, *args, **kwargs):
return _f(*args, **kwargs)
return f
f = smart_decorator(func, create_decorator)
f.inline = inline
f.meta = meta
f.whole_tree = whole_tree
return f
def v_args(inline=False, meta=False, tree=False):
"A convenience decorator factory, for modifying the behavior of user-supplied visitor methods"
if [tree, meta, inline].count(True) > 1:
raise ValueError("Visitor functions can either accept tree, or meta, or be inlined. These cannot be combined.")
def _visitor_args_dec(obj):
return _apply_decorator(obj, _visitor_args_func_dec, inline=inline, meta=meta, whole_tree=tree)
return _visitor_args_dec
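# Typical usage (hypothetical transformer, illustration only): with inline=True the
# matched children are passed as positional arguments instead of a single list.
#   @v_args(inline=True)
#   class CalcTransformer(Transformer):
#       def add(self, left, right):
#           return left + right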