Mirror of https://github.com/github/codeql.git, synced 2025-12-16 16:53:25 +01:00
Python: Copy Python extractor to codeql repo
python/extractor/lark/LICENSE (new file, 19 lines)
@@ -0,0 +1,19 @@
Copyright © 2017 Erez Shinan

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
python/extractor/lark/__init__.py (new file, 7 lines)
@@ -0,0 +1,7 @@
from .tree import Tree
from .visitors import Transformer, Visitor, v_args, Discard
from .visitors import InlineTransformer, inline_args    # XXX Deprecated
from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters
from .lark import Lark

__version__ = "0.6.3"
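For orientation, a minimal usage sketch of the public API re-exported by this __init__.py. It is not part of the commit; the grammar and the SumInts class are illustrative, and it assumes the vendored package is importable as `lark`.

from lark import Lark, Transformer

parser = Lark('''
    start: INT ("+" INT)*
    %import common.INT
    %import common.WS
    %ignore WS
''', parser="lalr")

class SumInts(Transformer):
    # Rule callbacks receive the matched children as a list of Tokens;
    # the anonymous "+" tokens are filtered out automatically.
    def start(self, items):
        return sum(int(tok) for tok in items)

print(SumInts().transform(parser.parse("1 + 2 + 3")))   # prints 6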
python/extractor/lark/common.py (new file, 87 lines)
@@ -0,0 +1,87 @@
import re
import sys

from .utils import get_regexp_width

Py36 = (sys.version_info[:2] >= (3, 6))


###{standalone
###}


class LexerConf:
    def __init__(self, tokens, ignore=(), postlex=None, callbacks=None):
        self.tokens = tokens
        self.ignore = ignore
        self.postlex = postlex
        self.callbacks = callbacks or {}

class ParserConf:
    def __init__(self, rules, callback, start):
        self.rules = rules
        self.callback = callback
        self.start = start


class Pattern(object):
    def __init__(self, value, flags=()):
        self.value = value
        self.flags = frozenset(flags)

    def __repr__(self):
        return repr(self.to_regexp())

    # Pattern Hashing assumes all subclasses have a different priority!
    def __hash__(self):
        return hash((type(self), self.value, self.flags))
    def __eq__(self, other):
        return type(self) == type(other) and self.value == other.value and self.flags == other.flags

    def to_regexp(self):
        raise NotImplementedError()

    if Py36:
        # Python 3.6 changed syntax for flags in regular expression
        def _get_flags(self, value):
            for f in self.flags:
                value = ('(?%s:%s)' % (f, value))
            return value

    else:
        def _get_flags(self, value):
            for f in self.flags:
                value = ('(?%s)' % f) + value
            return value

class PatternStr(Pattern):
    def to_regexp(self):
        return self._get_flags(re.escape(self.value))

    @property
    def min_width(self):
        return len(self.value)
    max_width = min_width

class PatternRE(Pattern):
    def to_regexp(self):
        return self._get_flags(self.value)

    @property
    def min_width(self):
        return get_regexp_width(self.to_regexp())[0]
    @property
    def max_width(self):
        return get_regexp_width(self.to_regexp())[1]

class TokenDef(object):
    def __init__(self, name, pattern, priority=1):
        assert isinstance(pattern, Pattern), pattern
        self.name = name
        self.pattern = pattern
        self.priority = priority

    def __repr__(self):
        return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
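A small illustrative sketch (not part of the commit) of how the Pattern classes above differ; it assumes the vendored package is importable as `lark`, and the terminal names are made up.

from lark.common import PatternStr, PatternRE, TokenDef

kw = TokenDef('IF', PatternStr('if'))                     # literal: value is re.escape()d
num = TokenDef('NUMBER', PatternRE(r'\d+'), priority=2)   # regexp: value used as-is

print(kw.pattern.to_regexp())    # 'if'
print(num.pattern.min_width)     # 1, computed via get_regexp_width()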
python/extractor/lark/exceptions.py (new file, 86 lines)
@@ -0,0 +1,86 @@
from .utils import STRING_TYPE

class LarkError(Exception):
    pass

class GrammarError(LarkError):
    pass

class ParseError(LarkError):
    pass

class LexError(LarkError):
    pass

class UnexpectedInput(LarkError):
    pos_in_stream = None

    def get_context(self, text, span=40):
        pos = self.pos_in_stream
        start = max(pos - span, 0)
        end = pos + span
        before = text[start:pos].rsplit('\n', 1)[-1]
        after = text[pos:end].split('\n', 1)[0]
        return before + after + '\n' + ' ' * len(before) + '^\n'

    def match_examples(self, parse_fn, examples):
        """ Given a parser instance and a dictionary mapping some label with
            some malformed syntax examples, it'll return the label for the
            example that best matches the current error.
        """
        assert self.state is not None, "Not supported for this exception"

        candidate = None
        for label, example in examples.items():
            assert not isinstance(example, STRING_TYPE)

            for malformed in example:
                try:
                    parse_fn(malformed)
                except UnexpectedInput as ut:
                    if ut.state == self.state:
                        try:
                            if ut.token == self.token:  # Try exact match first
                                return label
                        except AttributeError:
                            pass
                        if not candidate:
                            candidate = label

        return candidate


class UnexpectedCharacters(LexError, UnexpectedInput):
    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None):
        message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)

        self.line = line
        self.column = column
        self.allowed = allowed
        self.considered_tokens = considered_tokens
        self.pos_in_stream = lex_pos
        self.state = state

        message += '\n\n' + self.get_context(seq)
        if allowed:
            message += '\nExpecting: %s\n' % allowed

        super(UnexpectedCharacters, self).__init__(message)


class UnexpectedToken(ParseError, UnexpectedInput):
    def __init__(self, token, expected, considered_rules=None, state=None):
        self.token = token
        self.expected = expected    # XXX str shouldn't necessary
        self.line = getattr(token, 'line', '?')
        self.column = getattr(token, 'column', '?')
        self.considered_rules = considered_rules
        self.state = state
        self.pos_in_stream = getattr(token, 'pos_in_stream', None)

        message = ("Unexpected token %r at line %s, column %s.\n"
                   "Expected: %s\n"
                   % (token, self.line, self.column, ', '.join(self.expected)))

        super(UnexpectedToken, self).__init__(message)
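The match_examples helper above is easiest to see with a concrete error-reporting sketch. This is illustrative only (not part of the commit), assumes the vendored package is importable as `lark`, and the tiny grammar and labels are made up.

from lark import Lark, UnexpectedInput

parser = Lark('start: "a" "b"', parser="lalr")

labels = {
    'missing second token': ['a'],
    'wrong first token': ['b', 'c'],
}

try:
    parser.parse('a')
except UnexpectedInput as err:
    # Replays the known-bad example inputs and returns the label whose
    # parser state (and, if possible, offending token) matches this error.
    print(err.match_examples(parser.parse, labels))   # 'missing second token'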
python/extractor/lark/grammar.py (new file, 60 lines)
@@ -0,0 +1,60 @@
class Symbol(object):
    is_term = NotImplemented

    def __init__(self, name):
        self.name = name

    def __eq__(self, other):
        assert isinstance(other, Symbol), other
        return self.is_term == other.is_term and self.name == other.name

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        return hash(self.name)

    def __repr__(self):
        return '%s(%r)' % (type(self).__name__, self.name)

class Terminal(Symbol):
    is_term = True

    def __init__(self, name, filter_out=False):
        self.name = name
        self.filter_out = filter_out


class NonTerminal(Symbol):
    is_term = False

class Rule(object):
    """
        origin : a symbol
        expansion : a list of symbols
    """
    def __init__(self, origin, expansion, alias=None, options=None):
        self.origin = origin
        self.expansion = expansion
        self.alias = alias
        self.options = options

    def __str__(self):
        return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion)))

    def __repr__(self):
        return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options)


class RuleOptions:
    def __init__(self, keep_all_tokens=False, expand1=False, priority=None):
        self.keep_all_tokens = keep_all_tokens
        self.expand1 = expand1
        self.priority = priority

    def __repr__(self):
        return 'RuleOptions(%r, %r, %r)' % (
            self.keep_all_tokens,
            self.expand1,
            self.priority,
        )
python/extractor/lark/grammars/__init__.py (new file, 0 lines)
python/extractor/lark/grammars/common.lark (new file, 49 lines)
@@ -0,0 +1,49 @@
//
// Numbers
//

DIGIT: "0".."9"
HEXDIGIT: "a".."f"|"A".."F"|DIGIT

INT: DIGIT+
SIGNED_INT: ["+"|"-"] INT
DECIMAL: INT "." INT? | "." INT

// float = /-?\d+(\.\d+)?([eE][+-]?\d+)?/
_EXP: ("e"|"E") SIGNED_INT
FLOAT: INT _EXP | DECIMAL _EXP?
SIGNED_FLOAT: ["+"|"-"] FLOAT

NUMBER: FLOAT | INT
SIGNED_NUMBER: ["+"|"-"] NUMBER

//
// Strings
//
//STRING: /"(\\\"|\\\\|[^"\n])*?"i?/
STRING_INNER: ("\\\""|/[^"]/)
ESCAPED_STRING: "\"" STRING_INNER* "\""


//
// Names (Variables)
//
LCASE_LETTER: "a".."z"
UCASE_LETTER: "A".."Z"

LETTER: UCASE_LETTER | LCASE_LETTER
WORD: LETTER+

CNAME: ("_"|LETTER) ("_"|LETTER|DIGIT)*


//
// Whitespace
//
WS_INLINE: (" "|/\t/)+
WS: /[ \t\f\r\n]/+

CR : /\r/
LF : /\n/
NEWLINE: (CR? LF)+
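The terminals above are what user grammars pull in via %import common.<NAME>. An illustrative sketch, not part of the commit, assuming the vendored package is importable as `lark`:

from lark import Lark

numbers = Lark('''
    start: SIGNED_NUMBER ("," SIGNED_NUMBER)*
    %import common.SIGNED_NUMBER
    %import common.WS
    %ignore WS
''', parser="lalr")

print(numbers.parse("1, -2.5, 3e4").pretty())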
python/extractor/lark/indenter.py (new file, 55 lines)
@@ -0,0 +1,55 @@
"Provides Indentation services for languages with indentation similar to Python"

from .lexer import Token

###{standalone
class Indenter:
    def __init__(self):
        self.paren_level = 0
        self.indent_level = [0]

    def handle_NL(self, token):
        if self.paren_level > 0:
            return

        yield token

        indent_str = token.rsplit('\n', 1)[1]  # Tabs and spaces
        indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len

        if indent > self.indent_level[-1]:
            self.indent_level.append(indent)
            yield Token.new_borrow_pos(self.INDENT_type, indent_str, token)
        else:
            while indent < self.indent_level[-1]:
                self.indent_level.pop()
                yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token)

            assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1])

    def process(self, stream):
        for token in stream:
            if token.type == self.NL_type:
                for t in self.handle_NL(token):
                    yield t
            else:
                yield token

            if token.type in self.OPEN_PAREN_types:
                self.paren_level += 1
            elif token.type in self.CLOSE_PAREN_types:
                self.paren_level -= 1
                assert self.paren_level >= 0

        while len(self.indent_level) > 1:
            self.indent_level.pop()
            yield Token(self.DEDENT_type, '')

        assert self.indent_level == [0], self.indent_level

    # XXX Hack for ContextualLexer. Maybe there's a more elegant solution?
    @property
    def always_accept(self):
        return (self.NL_type,)

###}
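Indenter is meant to be subclassed and passed to Lark as postlex. The following sketch loosely follows lark's documented indented-block example; it is illustrative, not part of the commit, the grammar and class names are made up, and it assumes the vendored package is importable as `lark`.

from lark import Lark
from lark.indenter import Indenter

class BlockIndenter(Indenter):
    # The base class expects these attributes; the token names must match
    # terminals used or %declare'd in the grammar below.
    NL_type = '_NEWLINE'
    OPEN_PAREN_types = []
    CLOSE_PAREN_types = []
    INDENT_type = '_INDENT'
    DEDENT_type = '_DEDENT'
    tab_len = 8

grammar = r'''
    start: (_NEWLINE | stmt)*
    stmt: NAME ":" _NEWLINE _INDENT stmt+ _DEDENT
        | NAME _NEWLINE
    %declare _INDENT _DEDENT
    _NEWLINE: /(\r?\n[\t ]*)+/
    NAME: /[a-z]+/
    %ignore /[\t ]+/
'''

parser = Lark(grammar, parser='lalr', postlex=BlockIndenter())
print(parser.parse("a:\n  b\n  c\n").pretty())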
python/extractor/lark/lark.py (new file, 235 lines)
@@ -0,0 +1,235 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import os
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from io import open
|
||||
|
||||
from .utils import STRING_TYPE
|
||||
from .load_grammar import load_grammar
|
||||
from .tree import Tree
|
||||
from .common import LexerConf, ParserConf
|
||||
|
||||
from .lexer import Lexer, TraditionalLexer
|
||||
from .parse_tree_builder import ParseTreeBuilder
|
||||
from .parser_frontends import get_frontend
|
||||
|
||||
|
||||
class LarkOptions(object):
|
||||
"""Specifies the options for Lark
|
||||
|
||||
"""
|
||||
OPTIONS_DOC = """
|
||||
parser - Decides which parser engine to use, "earley" or "lalr". (Default: "earley")
|
||||
Note: "lalr" requires a lexer
|
||||
|
||||
lexer - Decides whether or not to use a lexer stage
|
||||
"standard": Use a standard lexer
|
||||
"contextual": Stronger lexer (only works with parser="lalr")
|
||||
"dynamic": Flexible and powerful (only with parser="earley")
|
||||
"dynamic_complete": Same as dynamic, but tries *every* variation
|
||||
of tokenizing possible. (only with parser="earley")
|
||||
"auto" (default): Choose for me based on grammar and parser
|
||||
|
||||
ambiguity - Decides how to handle ambiguity in the parse. Only relevant if parser="earley"
|
||||
"resolve": The parser will automatically choose the simplest derivation
|
||||
(it chooses consistently: greedy for tokens, non-greedy for rules)
|
||||
"explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest).
|
||||
|
||||
transformer - Applies the transformer to every parse tree
|
||||
debug - Affects verbosity (default: False)
|
||||
keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False)
|
||||
cache_grammar - Cache the Lark grammar (Default: False)
|
||||
postlex - Lexer post-processing (Requires standard lexer. Default: None)
|
||||
start - The start symbol (Default: start)
|
||||
profile - Measure run-time usage in Lark. Read results from the profiler proprety (Default: False)
|
||||
propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches.
|
||||
lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
|
||||
"""
|
||||
__doc__ = OPTIONS_DOC
|
||||
def __init__(self, options_dict):
|
||||
o = dict(options_dict)
|
||||
|
||||
self.debug = bool(o.pop('debug', False))
|
||||
self.keep_all_tokens = bool(o.pop('keep_all_tokens', False))
|
||||
self.tree_class = o.pop('tree_class', Tree)
|
||||
self.cache_grammar = o.pop('cache_grammar', False)
|
||||
self.postlex = o.pop('postlex', None)
|
||||
self.parser = o.pop('parser', 'earley')
|
||||
self.lexer = o.pop('lexer', 'auto')
|
||||
self.transformer = o.pop('transformer', None)
|
||||
self.start = o.pop('start', 'start')
|
||||
self.profile = o.pop('profile', False)
|
||||
self.ambiguity = o.pop('ambiguity', 'auto')
|
||||
self.propagate_positions = o.pop('propagate_positions', False)
|
||||
self.earley__predict_all = o.pop('earley__predict_all', False)
|
||||
self.lexer_callbacks = o.pop('lexer_callbacks', {})
|
||||
|
||||
assert self.parser in ('earley', 'lalr', 'cyk', None)
|
||||
|
||||
if self.parser == 'earley' and self.transformer:
|
||||
raise ValueError('Cannot specify an embedded transformer when using the Earley algorithm.'
|
||||
'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. lalr)')
|
||||
|
||||
if o:
|
||||
raise ValueError("Unknown options: %s" % o.keys())
|
||||
|
||||
|
||||
class Profiler:
|
||||
def __init__(self):
|
||||
self.total_time = defaultdict(float)
|
||||
self.cur_section = '__init__'
|
||||
self.last_enter_time = time.time()
|
||||
|
||||
def enter_section(self, name):
|
||||
cur_time = time.time()
|
||||
self.total_time[self.cur_section] += cur_time - self.last_enter_time
|
||||
self.last_enter_time = cur_time
|
||||
self.cur_section = name
|
||||
|
||||
def make_wrapper(self, name, f):
|
||||
def wrapper(*args, **kwargs):
|
||||
last_section = self.cur_section
|
||||
self.enter_section(name)
|
||||
try:
|
||||
return f(*args, **kwargs)
|
||||
finally:
|
||||
self.enter_section(last_section)
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
class Lark:
|
||||
def __init__(self, grammar, **options):
|
||||
"""
|
||||
grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax)
|
||||
options : a dictionary controlling various aspects of Lark.
|
||||
"""
|
||||
self.options = LarkOptions(options)
|
||||
|
||||
# Some, but not all file-like objects have a 'name' attribute
|
||||
try:
|
||||
self.source = grammar.name
|
||||
except AttributeError:
|
||||
self.source = '<string>'
|
||||
cache_file = "larkcache_%s" % str(hash(grammar)%(2**32))
|
||||
else:
|
||||
cache_file = "larkcache_%s" % os.path.basename(self.source)
|
||||
|
||||
# Drain file-like objects to get their contents
|
||||
try:
|
||||
read = grammar.read
|
||||
except AttributeError:
|
||||
pass
|
||||
else:
|
||||
grammar = read()
|
||||
|
||||
assert isinstance(grammar, STRING_TYPE)
|
||||
|
||||
if self.options.cache_grammar:
|
||||
raise NotImplementedError("Not available yet")
|
||||
|
||||
assert not self.options.profile, "Feature temporarily disabled"
|
||||
self.profiler = Profiler() if self.options.profile else None
|
||||
|
||||
if self.options.lexer == 'auto':
|
||||
if self.options.parser == 'lalr':
|
||||
self.options.lexer = 'contextual'
|
||||
elif self.options.parser == 'earley':
|
||||
self.options.lexer = 'dynamic'
|
||||
elif self.options.parser == 'cyk':
|
||||
self.options.lexer = 'standard'
|
||||
else:
|
||||
assert False, self.options.parser
|
||||
lexer = self.options.lexer
|
||||
assert lexer in ('standard', 'contextual', 'dynamic', 'dynamic_complete') or issubclass(lexer, Lexer)
|
||||
|
||||
if self.options.ambiguity == 'auto':
|
||||
if self.options.parser == 'earley':
|
||||
self.options.ambiguity = 'resolve'
|
||||
else:
|
||||
disambig_parsers = ['earley', 'cyk']
|
||||
assert self.options.parser in disambig_parsers, (
|
||||
'Only %s supports disambiguation right now') % ', '.join(disambig_parsers)
|
||||
assert self.options.ambiguity in ('resolve', 'explicit', 'auto', 'resolve__antiscore_sum')
|
||||
|
||||
# Parse the grammar file and compose the grammars (TODO)
|
||||
self.grammar = load_grammar(grammar, self.source)
|
||||
|
||||
# Compile the EBNF grammar into BNF
|
||||
self.terminals, self.rules, self.ignore_tokens = self.grammar.compile()
|
||||
|
||||
self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks)
|
||||
|
||||
if self.options.parser:
|
||||
self.parser = self._build_parser()
|
||||
elif lexer:
|
||||
self.lexer = self._build_lexer()
|
||||
|
||||
if self.profiler: self.profiler.enter_section('outside_lark')
|
||||
|
||||
__init__.__doc__ = "\nOPTIONS:" + LarkOptions.OPTIONS_DOC
|
||||
|
||||
def _build_lexer(self):
|
||||
return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
|
||||
|
||||
def _build_parser(self):
|
||||
self.parser_class = get_frontend(self.options.parser, self.options.lexer)
|
||||
|
||||
self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr')
|
||||
callback = self._parse_tree_builder.create_callback(self.options.transformer)
|
||||
if self.profiler:
|
||||
for f in dir(callback):
|
||||
if not (f.startswith('__') and f.endswith('__')):
|
||||
setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f)))
|
||||
|
||||
parser_conf = ParserConf(self.rules, callback, self.options.start)
|
||||
|
||||
return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
|
||||
|
||||
@classmethod
|
||||
def open(cls, grammar_filename, rel_to=None, **options):
|
||||
"""Create an instance of Lark with the grammar given by its filename
|
||||
|
||||
If rel_to is provided, the function will find the grammar filename in relation to it.
|
||||
|
||||
Example:
|
||||
|
||||
>>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr")
|
||||
Lark(...)
|
||||
|
||||
"""
|
||||
if rel_to:
|
||||
basepath = os.path.dirname(rel_to)
|
||||
grammar_filename = os.path.join(basepath, grammar_filename)
|
||||
with open(grammar_filename, encoding='utf8') as f:
|
||||
return cls(f, **options)
|
||||
|
||||
def __repr__(self):
|
||||
return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source, self.options.parser, self.options.lexer)
|
||||
|
||||
|
||||
def lex(self, text):
|
||||
"Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'"
|
||||
if not hasattr(self, 'lexer'):
|
||||
self.lexer = self._build_lexer()
|
||||
stream = self.lexer.lex(text)
|
||||
if self.options.postlex:
|
||||
return self.options.postlex.process(stream)
|
||||
return stream
|
||||
|
||||
def parse(self, text):
|
||||
"Parse the given text, according to the options provided. Returns a tree, unless specified otherwise."
|
||||
return self.parser.parse(text)
|
||||
|
||||
# if self.profiler:
|
||||
# self.profiler.enter_section('lex')
|
||||
# l = list(self.lex(text))
|
||||
# self.profiler.enter_section('parse')
|
||||
# try:
|
||||
# return self.parser.parse(l)
|
||||
# finally:
|
||||
# self.profiler.enter_section('outside_lark')
|
||||
# else:
|
||||
# l = list(self.lex(text))
|
||||
# return self.parser.parse(l)
|
||||
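A short sketch (illustrative, not part of the commit, assuming the vendored package is importable as `lark`) of two of the options described in LarkOptions.OPTIONS_DOC above:

from lark import Lark

p = Lark('''
    start: "(" NAME ")"
    NAME: /[a-z]+/
''', parser="lalr", keep_all_tokens=True, propagate_positions=True)

tree = p.parse("(abc)")
print(tree.children)                      # parentheses kept because keep_all_tokens=True
print(tree.meta.line, tree.meta.column)   # positions propagated into the tree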
python/extractor/lark/lexer.py (new file, 252 lines)
@@ -0,0 +1,252 @@
|
||||
## Lexer Implementation
|
||||
|
||||
import re
|
||||
|
||||
from .utils import Str, classify
|
||||
from .common import PatternStr, PatternRE, TokenDef
|
||||
from .exceptions import UnexpectedCharacters, LexError
|
||||
|
||||
###{standalone
|
||||
class Token(Str):
|
||||
__slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')
|
||||
|
||||
def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
|
||||
self = super(Token, cls).__new__(cls, value)
|
||||
self.type = type_
|
||||
self.pos_in_stream = pos_in_stream
|
||||
self.value = value
|
||||
self.line = line
|
||||
self.column = column
|
||||
self.end_line = None
|
||||
self.end_column = None
|
||||
return self
|
||||
|
||||
@classmethod
|
||||
def new_borrow_pos(cls, type_, value, borrow_t):
|
||||
return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)
|
||||
|
||||
def __reduce__(self):
|
||||
return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, ))
|
||||
|
||||
def __repr__(self):
|
||||
return 'Token(%s, %r)' % (self.type, self.value)
|
||||
|
||||
def __deepcopy__(self, memo):
|
||||
return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, Token) and self.type != other.type:
|
||||
return False
|
||||
|
||||
return Str.__eq__(self, other)
|
||||
|
||||
__hash__ = Str.__hash__
|
||||
|
||||
|
||||
class LineCounter:
|
||||
def __init__(self):
|
||||
self.newline_char = '\n'
|
||||
self.char_pos = 0
|
||||
self.line = 1
|
||||
self.column = 1
|
||||
self.line_start_pos = 0
|
||||
|
||||
def feed(self, token, test_newline=True):
|
||||
"""Consume a token and calculate the new line & column.
|
||||
|
||||
As an optional optimization, set test_newline=False is token doesn't contain a newline.
|
||||
"""
|
||||
if test_newline:
|
||||
newlines = token.count(self.newline_char)
|
||||
if newlines:
|
||||
self.line += newlines
|
||||
self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
|
||||
|
||||
self.char_pos += len(token)
|
||||
self.column = self.char_pos - self.line_start_pos + 1
|
||||
|
||||
class _Lex:
|
||||
"Built to serve both Lexer and ContextualLexer"
|
||||
def __init__(self, lexer, state=None):
|
||||
self.lexer = lexer
|
||||
self.state = state
|
||||
|
||||
def lex(self, stream, newline_types, ignore_types):
|
||||
newline_types = list(newline_types)
|
||||
ignore_types = list(ignore_types)
|
||||
line_ctr = LineCounter()
|
||||
|
||||
t = None
|
||||
while True:
|
||||
lexer = self.lexer
|
||||
for mre, type_from_index in lexer.mres:
|
||||
m = mre.match(stream, line_ctr.char_pos)
|
||||
if m:
|
||||
value = m.group(0)
|
||||
type_ = type_from_index[m.lastindex]
|
||||
if type_ not in ignore_types:
|
||||
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
|
||||
if t.type in lexer.callback:
|
||||
t = lexer.callback[t.type](t)
|
||||
yield t
|
||||
else:
|
||||
if type_ in lexer.callback:
|
||||
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
|
||||
lexer.callback[type_](t)
|
||||
|
||||
line_ctr.feed(value, type_ in newline_types)
|
||||
if t:
|
||||
t.end_line = line_ctr.line
|
||||
t.end_column = line_ctr.column
|
||||
|
||||
break
|
||||
else:
|
||||
if line_ctr.char_pos < len(stream):
|
||||
raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, state=self.state)
|
||||
break
|
||||
|
||||
class UnlessCallback:
|
||||
def __init__(self, mres):
|
||||
self.mres = mres
|
||||
|
||||
def __call__(self, t):
|
||||
for mre, type_from_index in self.mres:
|
||||
m = mre.match(t.value)
|
||||
if m:
|
||||
t.type = type_from_index[m.lastindex]
|
||||
break
|
||||
return t
|
||||
|
||||
###}
|
||||
|
||||
|
||||
|
||||
def _create_unless(tokens):
|
||||
tokens_by_type = classify(tokens, lambda t: type(t.pattern))
|
||||
assert len(tokens_by_type) <= 2, tokens_by_type.keys()
|
||||
embedded_strs = set()
|
||||
callback = {}
|
||||
for retok in tokens_by_type.get(PatternRE, []):
|
||||
unless = [] # {}
|
||||
for strtok in tokens_by_type.get(PatternStr, []):
|
||||
if strtok.priority > retok.priority:
|
||||
continue
|
||||
s = strtok.pattern.value
|
||||
m = re.match(retok.pattern.to_regexp(), s)
|
||||
if m and m.group(0) == s:
|
||||
unless.append(strtok)
|
||||
if strtok.pattern.flags <= retok.pattern.flags:
|
||||
embedded_strs.add(strtok)
|
||||
if unless:
|
||||
callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True))
|
||||
|
||||
tokens = [t for t in tokens if t not in embedded_strs]
|
||||
return tokens, callback
|
||||
|
||||
|
||||
def _build_mres(tokens, max_size, match_whole):
|
||||
# Python sets an unreasonable group limit (currently 100) in its re module
|
||||
# Worse, the only way to know we reached it is by catching an AssertionError!
|
||||
# This function recursively tries less and less groups until it's successful.
|
||||
postfix = '$' if match_whole else ''
|
||||
mres = []
|
||||
while tokens:
|
||||
try:
|
||||
mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in tokens[:max_size]))
|
||||
except AssertionError: # Yes, this is what Python provides us.. :/
|
||||
return _build_mres(tokens, max_size//2, match_whole)
|
||||
|
||||
mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
|
||||
tokens = tokens[max_size:]
|
||||
return mres
|
||||
|
||||
def build_mres(tokens, match_whole=False):
|
||||
return _build_mres(tokens, len(tokens), match_whole)
|
||||
|
||||
def _regexp_has_newline(r):
|
||||
return '\n' in r or '\\n' in r or ('(?s' in r and '.' in r)
|
||||
|
||||
class Lexer:
|
||||
"""Lexer interface
|
||||
|
||||
Method Signatures:
|
||||
lex(self, stream) -> Iterator[Token]
|
||||
|
||||
set_parser_state(self, state) # Optional
|
||||
"""
|
||||
set_parser_state = NotImplemented
|
||||
lex = NotImplemented
|
||||
|
||||
class TraditionalLexer(Lexer):
|
||||
def __init__(self, tokens, ignore=(), user_callbacks={}):
|
||||
assert all(isinstance(t, TokenDef) for t in tokens), tokens
|
||||
|
||||
tokens = list(tokens)
|
||||
|
||||
# Sanitization
|
||||
for t in tokens:
|
||||
try:
|
||||
re.compile(t.pattern.to_regexp())
|
||||
except:
|
||||
raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))
|
||||
|
||||
if t.pattern.min_width == 0:
|
||||
raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))
|
||||
|
||||
assert set(ignore) <= {t.name for t in tokens}
|
||||
|
||||
# Init
|
||||
self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
|
||||
self.ignore_types = list(ignore)
|
||||
|
||||
tokens.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
|
||||
|
||||
tokens, self.callback = _create_unless(tokens)
|
||||
assert all(self.callback.values())
|
||||
|
||||
for type_, f in user_callbacks.items():
|
||||
assert type_ not in self.callback
|
||||
self.callback[type_] = f
|
||||
|
||||
self.tokens = tokens
|
||||
|
||||
self.mres = build_mres(tokens)
|
||||
|
||||
def lex(self, stream):
|
||||
return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
|
||||
|
||||
|
||||
class ContextualLexer(Lexer):
|
||||
def __init__(self, tokens, states, ignore=(), always_accept=(), user_callbacks={}):
|
||||
tokens_by_name = {}
|
||||
for t in tokens:
|
||||
assert t.name not in tokens_by_name, t
|
||||
tokens_by_name[t.name] = t
|
||||
|
||||
lexer_by_tokens = {}
|
||||
self.lexers = {}
|
||||
for state, accepts in states.items():
|
||||
key = frozenset(accepts)
|
||||
try:
|
||||
lexer = lexer_by_tokens[key]
|
||||
except KeyError:
|
||||
accepts = set(accepts) | set(ignore) | set(always_accept)
|
||||
state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
|
||||
lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
|
||||
lexer_by_tokens[key] = lexer
|
||||
|
||||
self.lexers[state] = lexer
|
||||
|
||||
self.root_lexer = TraditionalLexer(tokens, ignore=ignore, user_callbacks=user_callbacks)
|
||||
|
||||
self.set_parser_state(None) # Needs to be set on the outside
|
||||
|
||||
def set_parser_state(self, state):
|
||||
self.parser_state = state
|
||||
|
||||
def lex(self, stream):
|
||||
l = _Lex(self.lexers[self.parser_state], self.parser_state)
|
||||
for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
|
||||
yield x
|
||||
l.lexer = self.lexers[self.parser_state]
|
||||
l.state = self.parser_state
|
||||
python/extractor/lark/load_grammar.py (new file, 741 lines)
@@ -0,0 +1,741 @@
|
||||
"Parses and creates Grammar objects"
|
||||
|
||||
import os.path
|
||||
import sys
|
||||
from itertools import chain
|
||||
import re
|
||||
from ast import literal_eval
|
||||
from copy import deepcopy
|
||||
import pkgutil
|
||||
|
||||
from .lexer import Token
|
||||
|
||||
|
||||
from .parse_tree_builder import ParseTreeBuilder
|
||||
from .parser_frontends import LALR_TraditionalLexer
|
||||
from .common import LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
|
||||
from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
|
||||
from .utils import classify, suppress
|
||||
from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken
|
||||
|
||||
from .tree import Tree, SlottedTree as ST
|
||||
from .visitors import Transformer, Visitor, v_args, Transformer_InPlace
|
||||
inline_args = v_args(inline=True)
|
||||
|
||||
__path__ = os.path.dirname(__file__)
|
||||
|
||||
GRAMMAR_PACKAGES = ['lark.grammars']
|
||||
|
||||
EXT = '.lark'
|
||||
|
||||
_RE_FLAGS = 'imslux'
|
||||
|
||||
def is_terminal(sym):
|
||||
return sym.isupper()
|
||||
|
||||
_TERMINAL_NAMES = {
|
||||
'.' : 'DOT',
|
||||
',' : 'COMMA',
|
||||
':' : 'COLON',
|
||||
';' : 'SEMICOLON',
|
||||
'+' : 'PLUS',
|
||||
'-' : 'MINUS',
|
||||
'*' : 'STAR',
|
||||
'/' : 'SLASH',
|
||||
'\\' : 'BACKSLASH',
|
||||
'|' : 'VBAR',
|
||||
'?' : 'QMARK',
|
||||
'!' : 'BANG',
|
||||
'@' : 'AT',
|
||||
'#' : 'HASH',
|
||||
'$' : 'DOLLAR',
|
||||
'%' : 'PERCENT',
|
||||
'^' : 'CIRCUMFLEX',
|
||||
'&' : 'AMPERSAND',
|
||||
'_' : 'UNDERSCORE',
|
||||
'<' : 'LESSTHAN',
|
||||
'>' : 'MORETHAN',
|
||||
'=' : 'EQUAL',
|
||||
'"' : 'DBLQUOTE',
|
||||
'\'' : 'QUOTE',
|
||||
'`' : 'BACKQUOTE',
|
||||
'~' : 'TILDE',
|
||||
'(' : 'LPAR',
|
||||
')' : 'RPAR',
|
||||
'{' : 'LBRACE',
|
||||
'}' : 'RBRACE',
|
||||
'[' : 'LSQB',
|
||||
']' : 'RSQB',
|
||||
'\n' : 'NEWLINE',
|
||||
'\r\n' : 'CRLF',
|
||||
'\t' : 'TAB',
|
||||
' ' : 'SPACE',
|
||||
}
|
||||
|
||||
# Grammar Parser
|
||||
TERMINALS = {
|
||||
'_LPAR': r'\(',
|
||||
'_RPAR': r'\)',
|
||||
'_LBRA': r'\[',
|
||||
'_RBRA': r'\]',
|
||||
'OP': '[+*][?]?|[?](?![a-z])',
|
||||
'_COLON': ':',
|
||||
'_COMMA': ',',
|
||||
'_OR': r'\|',
|
||||
'_DOT': r'\.',
|
||||
'TILDE': '~',
|
||||
'RULE': '!?[_?]?[a-z][_a-z0-9]*',
|
||||
'TERMINAL': '_?[A-Z][_A-Z0-9]*',
|
||||
'STRING': r'"(\\"|\\\\|[^"\n])*?"i?',
|
||||
'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS,
|
||||
'_NL': r'(\r?\n)+\s*',
|
||||
'WS': r'[ \t]+',
|
||||
'COMMENT': r'//[^\n]*',
|
||||
'_TO': '->',
|
||||
'_IGNORE': r'%ignore',
|
||||
'_DECLARE': r'%declare',
|
||||
'_IMPORT': r'%import',
|
||||
'NUMBER': r'\d+',
|
||||
}
|
||||
|
||||
RULES = {
|
||||
'start': ['_list'],
|
||||
'_list': ['_item', '_list _item'],
|
||||
'_item': ['rule', 'token', 'statement', '_NL'],
|
||||
|
||||
'rule': ['RULE _COLON expansions _NL',
|
||||
'RULE _DOT NUMBER _COLON expansions _NL'],
|
||||
'expansions': ['alias',
|
||||
'expansions _OR alias',
|
||||
'expansions _NL _OR alias'],
|
||||
|
||||
'?alias': ['expansion _TO RULE', 'expansion'],
|
||||
'expansion': ['_expansion'],
|
||||
|
||||
'_expansion': ['', '_expansion expr'],
|
||||
|
||||
'?expr': ['atom',
|
||||
'atom OP',
|
||||
'atom TILDE NUMBER',
|
||||
'atom TILDE NUMBER _DOT _DOT NUMBER',
|
||||
],
|
||||
|
||||
'?atom': ['_LPAR expansions _RPAR',
|
||||
'maybe',
|
||||
'value'],
|
||||
|
||||
'value': ['terminal',
|
||||
'nonterminal',
|
||||
'literal',
|
||||
'range'],
|
||||
|
||||
'terminal': ['TERMINAL'],
|
||||
'nonterminal': ['RULE'],
|
||||
|
||||
'?name': ['RULE', 'TERMINAL'],
|
||||
|
||||
'maybe': ['_LBRA expansions _RBRA'],
|
||||
'range': ['STRING _DOT _DOT STRING'],
|
||||
|
||||
'token': ['TERMINAL _COLON expansions _NL',
|
||||
'TERMINAL _DOT NUMBER _COLON expansions _NL'],
|
||||
'statement': ['ignore', 'import', 'declare'],
|
||||
'ignore': ['_IGNORE expansions _NL'],
|
||||
'declare': ['_DECLARE _declare_args _NL'],
|
||||
'import': ['_IMPORT _import_path _NL',
|
||||
'_IMPORT _import_path _LPAR name_list _RPAR _NL',
|
||||
'_IMPORT _import_path _TO TERMINAL _NL'],
|
||||
|
||||
'_import_path': ['import_lib', 'import_rel'],
|
||||
'import_lib': ['_import_args'],
|
||||
'import_rel': ['_DOT _import_args'],
|
||||
'_import_args': ['name', '_import_args _DOT name'],
|
||||
|
||||
'name_list': ['_name_list'],
|
||||
'_name_list': ['name', '_name_list _COMMA name'],
|
||||
|
||||
'_declare_args': ['name', '_declare_args name'],
|
||||
'literal': ['REGEXP', 'STRING'],
|
||||
}
|
||||
|
||||
|
||||
@inline_args
|
||||
class EBNF_to_BNF(Transformer_InPlace):
|
||||
def __init__(self):
|
||||
self.new_rules = []
|
||||
self.rules_by_expr = {}
|
||||
self.prefix = 'anon'
|
||||
self.i = 0
|
||||
self.rule_options = None
|
||||
|
||||
def _add_recurse_rule(self, type_, expr):
|
||||
if expr in self.rules_by_expr:
|
||||
return self.rules_by_expr[expr]
|
||||
|
||||
new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
|
||||
self.i += 1
|
||||
t = NonTerminal(Token('RULE', new_name, -1))
|
||||
tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])])
|
||||
self.new_rules.append((new_name, tree, self.rule_options))
|
||||
self.rules_by_expr[expr] = t
|
||||
return t
|
||||
|
||||
def expr(self, rule, op, *args):
|
||||
if op.value == '?':
|
||||
return ST('expansions', [rule, ST('expansion', [])])
|
||||
elif op.value == '+':
|
||||
# a : b c+ d
|
||||
# -->
|
||||
# a : b _c d
|
||||
# _c : _c c | c;
|
||||
return self._add_recurse_rule('plus', rule)
|
||||
elif op.value == '*':
|
||||
# a : b c* d
|
||||
# -->
|
||||
# a : b _c? d
|
||||
# _c : _c c | c;
|
||||
new_name = self._add_recurse_rule('star', rule)
|
||||
return ST('expansions', [new_name, ST('expansion', [])])
|
||||
elif op.value == '~':
|
||||
if len(args) == 1:
|
||||
mn = mx = int(args[0])
|
||||
else:
|
||||
mn, mx = map(int, args)
|
||||
if mx < mn:
|
||||
raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
|
||||
return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
|
||||
assert False, op
|
||||
|
||||
|
||||
class SimplifyRule_Visitor(Visitor):
|
||||
|
||||
@staticmethod
|
||||
def _flatten(tree):
|
||||
while True:
|
||||
to_expand = [i for i, child in enumerate(tree.children)
|
||||
if isinstance(child, Tree) and child.data == tree.data]
|
||||
if not to_expand:
|
||||
break
|
||||
tree.expand_kids_by_index(*to_expand)
|
||||
|
||||
def expansion(self, tree):
|
||||
# rules_list unpacking
|
||||
# a : b (c|d) e
|
||||
# -->
|
||||
# a : b c e | b d e
|
||||
#
|
||||
# In AST terms:
|
||||
# expansion(b, expansions(c, d), e)
|
||||
# -->
|
||||
# expansions( expansion(b, c, e), expansion(b, d, e) )
|
||||
|
||||
self._flatten(tree)
|
||||
|
||||
for i, child in enumerate(tree.children):
|
||||
if isinstance(child, Tree) and child.data == 'expansions':
|
||||
tree.data = 'expansions'
|
||||
tree.children = [self.visit(ST('expansion', [option if i==j else other
|
||||
for j, other in enumerate(tree.children)]))
|
||||
for option in set(child.children)]
|
||||
self._flatten(tree)
|
||||
break
|
||||
|
||||
def alias(self, tree):
|
||||
rule, alias_name = tree.children
|
||||
if rule.data == 'expansions':
|
||||
aliases = []
|
||||
for child in tree.children[0].children:
|
||||
aliases.append(ST('alias', [child, alias_name]))
|
||||
tree.data = 'expansions'
|
||||
tree.children = aliases
|
||||
|
||||
def expansions(self, tree):
|
||||
self._flatten(tree)
|
||||
tree.children = list(set(tree.children))
|
||||
|
||||
|
||||
class RuleTreeToText(Transformer):
|
||||
def expansions(self, x):
|
||||
return x
|
||||
def expansion(self, symbols):
|
||||
return symbols, None
|
||||
def alias(self, x):
|
||||
(expansion, _alias), alias = x
|
||||
assert _alias is None, (alias, expansion, '-', _alias) # Double alias not allowed
|
||||
return expansion, alias.value
|
||||
|
||||
|
||||
@inline_args
|
||||
class CanonizeTree(Transformer_InPlace):
|
||||
def maybe(self, expr):
|
||||
return ST('expr', [expr, Token('OP', '?', -1)])
|
||||
|
||||
def tokenmods(self, *args):
|
||||
if len(args) == 1:
|
||||
return list(args)
|
||||
tokenmods, value = args
|
||||
return tokenmods + [value]
|
||||
|
||||
class PrepareAnonTerminals(Transformer_InPlace):
|
||||
"Create a unique list of anonymous tokens. Attempt to give meaningful names to them when we add them"
|
||||
|
||||
def __init__(self, tokens):
|
||||
self.tokens = tokens
|
||||
self.token_set = {td.name for td in self.tokens}
|
||||
self.token_reverse = {td.pattern: td for td in tokens}
|
||||
self.i = 0
|
||||
|
||||
|
||||
@inline_args
|
||||
def pattern(self, p):
|
||||
value = p.value
|
||||
if p in self.token_reverse and p.flags != self.token_reverse[p].pattern.flags:
|
||||
raise GrammarError(u'Conflicting flags for the same terminal: %s' % p)
|
||||
|
||||
token_name = None
|
||||
|
||||
if isinstance(p, PatternStr):
|
||||
try:
|
||||
# If already defined, use the user-defined token name
|
||||
token_name = self.token_reverse[p].name
|
||||
except KeyError:
|
||||
# Try to assign an indicative anon-token name
|
||||
try:
|
||||
token_name = _TERMINAL_NAMES[value]
|
||||
except KeyError:
|
||||
if value.isalnum() and value[0].isalpha() and value.upper() not in self.token_set:
|
||||
with suppress(UnicodeEncodeError):
|
||||
value.upper().encode('ascii') # Make sure we don't have unicode in our token names
|
||||
token_name = value.upper()
|
||||
|
||||
elif isinstance(p, PatternRE):
|
||||
if p in self.token_reverse: # Kind of a wierd placement.name
|
||||
token_name = self.token_reverse[p].name
|
||||
else:
|
||||
assert False, p
|
||||
|
||||
if token_name is None:
|
||||
token_name = '__ANON_%d' % self.i
|
||||
self.i += 1
|
||||
|
||||
if token_name not in self.token_set:
|
||||
assert p not in self.token_reverse
|
||||
self.token_set.add(token_name)
|
||||
tokendef = TokenDef(token_name, p)
|
||||
self.token_reverse[p] = tokendef
|
||||
self.tokens.append(tokendef)
|
||||
|
||||
return Terminal(token_name, filter_out=isinstance(p, PatternStr))
|
||||
|
||||
|
||||
def _rfind(s, choices):
|
||||
return max(s.rfind(c) for c in choices)
|
||||
|
||||
|
||||
|
||||
def _fix_escaping(s):
|
||||
w = ''
|
||||
i = iter(s)
|
||||
for n in i:
|
||||
w += n
|
||||
if n == '\\':
|
||||
n2 = next(i)
|
||||
if n2 == '\\':
|
||||
w += '\\\\'
|
||||
elif n2 not in 'unftr':
|
||||
w += '\\'
|
||||
w += n2
|
||||
w = w.replace('\\"', '"').replace("'", "\\'")
|
||||
|
||||
to_eval = "u'''%s'''" % w
|
||||
try:
|
||||
s = literal_eval(to_eval)
|
||||
except SyntaxError as e:
|
||||
raise ValueError(s, e)
|
||||
|
||||
return s
|
||||
|
||||
|
||||
def _literal_to_pattern(literal):
|
||||
v = literal.value
|
||||
flag_start = _rfind(v, '/"')+1
|
||||
assert flag_start > 0
|
||||
flags = v[flag_start:]
|
||||
assert all(f in _RE_FLAGS for f in flags), flags
|
||||
|
||||
v = v[:flag_start]
|
||||
assert v[0] == v[-1] and v[0] in '"/'
|
||||
x = v[1:-1]
|
||||
|
||||
s = _fix_escaping(x)
|
||||
|
||||
if literal.type == 'STRING':
|
||||
s = s.replace('\\\\', '\\')
|
||||
|
||||
return { 'STRING': PatternStr,
|
||||
'REGEXP': PatternRE }[literal.type](s, flags)
|
||||
|
||||
|
||||
@inline_args
|
||||
class PrepareLiterals(Transformer_InPlace):
|
||||
def literal(self, literal):
|
||||
return ST('pattern', [_literal_to_pattern(literal)])
|
||||
|
||||
def range(self, start, end):
|
||||
assert start.type == end.type == 'STRING'
|
||||
start = start.value[1:-1]
|
||||
end = end.value[1:-1]
|
||||
assert len(start) == len(end) == 1, (start, end, len(start), len(end))
|
||||
regexp = '[%s-%s]' % (start, end)
|
||||
return ST('pattern', [PatternRE(regexp)])
|
||||
|
||||
|
||||
class TokenTreeToPattern(Transformer):
|
||||
def pattern(self, ps):
|
||||
p ,= ps
|
||||
return p
|
||||
|
||||
def expansion(self, items):
|
||||
assert items
|
||||
if len(items) == 1:
|
||||
return items[0]
|
||||
if len({i.flags for i in items}) > 1:
|
||||
raise GrammarError("Lark doesn't support joining tokens with conflicting flags!")
|
||||
return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags if items else ())
|
||||
|
||||
def expansions(self, exps):
|
||||
if len(exps) == 1:
|
||||
return exps[0]
|
||||
if len({i.flags for i in exps}) > 1:
|
||||
raise GrammarError("Lark doesn't support joining tokens with conflicting flags!")
|
||||
return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)), exps[0].flags)
|
||||
|
||||
def expr(self, args):
|
||||
inner, op = args[:2]
|
||||
if op == '~':
|
||||
if len(args) == 3:
|
||||
op = "{%d}" % int(args[2])
|
||||
else:
|
||||
mn, mx = map(int, args[2:])
|
||||
if mx < mn:
|
||||
raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (inner, mn, mx))
|
||||
op = "{%d,%d}" % (mn, mx)
|
||||
else:
|
||||
assert len(args) == 2
|
||||
return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags)
|
||||
|
||||
def alias(self, t):
|
||||
raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)")
|
||||
|
||||
def value(self, v):
|
||||
return v[0]
|
||||
|
||||
class PrepareSymbols(Transformer_InPlace):
|
||||
def value(self, v):
|
||||
v ,= v
|
||||
if isinstance(v, Tree):
|
||||
return v
|
||||
elif v.type == 'RULE':
|
||||
return NonTerminal(v.value)
|
||||
elif v.type == 'TERMINAL':
|
||||
return Terminal(v.value, filter_out=v.startswith('_'))
|
||||
assert False
|
||||
|
||||
def _choice_of_rules(rules):
|
||||
return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules])
|
||||
|
||||
class Grammar:
|
||||
def __init__(self, rule_defs, token_defs, ignore):
|
||||
self.token_defs = token_defs
|
||||
self.rule_defs = rule_defs
|
||||
self.ignore = ignore
|
||||
|
||||
def compile(self):
|
||||
# We change the trees in-place (to support huge grammars)
|
||||
# So deepcopy allows calling compile more than once.
|
||||
token_defs = deepcopy(list(self.token_defs))
|
||||
rule_defs = deepcopy(self.rule_defs)
|
||||
|
||||
# =================
|
||||
# Compile Tokens
|
||||
# =================
|
||||
|
||||
# Convert token-trees to strings/regexps
|
||||
transformer = PrepareLiterals() * TokenTreeToPattern()
|
||||
for name, (token_tree, priority) in token_defs:
|
||||
if token_tree is None: # Terminal added through %declare
|
||||
continue
|
||||
expansions = list(token_tree.find_data('expansion'))
|
||||
if len(expansions) == 1 and not expansions[0].children:
|
||||
raise GrammarError("Terminals cannot be empty (%s)" % name)
|
||||
|
||||
tokens = [TokenDef(name, transformer.transform(token_tree), priority)
|
||||
for name, (token_tree, priority) in token_defs if token_tree]
|
||||
|
||||
# =================
|
||||
# Compile Rules
|
||||
# =================
|
||||
|
||||
# 1. Pre-process terminals
|
||||
transformer = PrepareLiterals() * PrepareSymbols() * PrepareAnonTerminals(tokens) # Adds to tokens
|
||||
|
||||
# 2. Convert EBNF to BNF (and apply step 1)
|
||||
ebnf_to_bnf = EBNF_to_BNF()
|
||||
rules = []
|
||||
for name, rule_tree, options in rule_defs:
|
||||
ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
|
||||
tree = transformer.transform(rule_tree)
|
||||
rules.append((name, ebnf_to_bnf.transform(tree), options))
|
||||
rules += ebnf_to_bnf.new_rules
|
||||
|
||||
assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision"
|
||||
|
||||
# 3. Compile tree to Rule objects
|
||||
rule_tree_to_text = RuleTreeToText()
|
||||
|
||||
simplify_rule = SimplifyRule_Visitor()
|
||||
compiled_rules = []
|
||||
for name, tree, options in rules:
|
||||
simplify_rule.visit(tree)
|
||||
expansions = rule_tree_to_text.transform(tree)
|
||||
|
||||
for expansion, alias in expansions:
|
||||
if alias and name.startswith('_'):
|
||||
raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))
|
||||
|
||||
assert all(isinstance(x, Symbol) for x in expansion), expansion
|
||||
|
||||
rule = Rule(NonTerminal(name), expansion, alias, options)
|
||||
compiled_rules.append(rule)
|
||||
|
||||
return tokens, compiled_rules, self.ignore
|
||||
|
||||
|
||||
_imported_grammars = {}
|
||||
def import_grammar(grammar_path):
|
||||
if grammar_path not in _imported_grammars:
|
||||
for package in GRAMMAR_PACKAGES:
|
||||
text = pkgutil.get_data(package, grammar_path).decode("utf-8")
|
||||
grammar = load_grammar(text, grammar_path)
|
||||
_imported_grammars[grammar_path] = grammar
|
||||
|
||||
return _imported_grammars[grammar_path]
|
||||
|
||||
|
||||
def resolve_token_references(token_defs):
|
||||
# TODO Cycles detection
|
||||
# TODO Solve with transitive closure (maybe)
|
||||
|
||||
token_dict = {k:t for k, (t,_p) in token_defs}
|
||||
assert len(token_dict) == len(token_defs), "Same name defined twice?"
|
||||
|
||||
while True:
|
||||
changed = False
|
||||
for name, (token_tree, _p) in token_defs:
|
||||
if token_tree is None: # Terminal added through %declare
|
||||
continue
|
||||
for exp in token_tree.find_data('value'):
|
||||
item ,= exp.children
|
||||
if isinstance(item, Token):
|
||||
if item.type == 'RULE':
|
||||
raise GrammarError("Rules aren't allowed inside terminals (%s in %s)" % (item, name))
|
||||
if item.type == 'TERMINAL':
|
||||
exp.children[0] = token_dict[item]
|
||||
changed = True
|
||||
if not changed:
|
||||
break
|
||||
|
||||
def options_from_rule(name, *x):
|
||||
if len(x) > 1:
|
||||
priority, expansions = x
|
||||
priority = int(priority)
|
||||
else:
|
||||
expansions ,= x
|
||||
priority = None
|
||||
|
||||
keep_all_tokens = name.startswith('!')
|
||||
name = name.lstrip('!')
|
||||
expand1 = name.startswith('?')
|
||||
name = name.lstrip('?')
|
||||
|
||||
return name, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority)
|
||||
|
||||
|
||||
def symbols_from_strcase(expansion):
|
||||
return [Terminal(x, filter_out=x.startswith('_')) if is_terminal(x) else NonTerminal(x) for x in expansion]
|
||||
|
||||
@inline_args
|
||||
class PrepareGrammar(Transformer_InPlace):
|
||||
def terminal(self, name):
|
||||
return name
|
||||
def nonterminal(self, name):
|
||||
return name
|
||||
|
||||
|
||||
class GrammarLoader:
|
||||
def __init__(self):
|
||||
tokens = [TokenDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
|
||||
|
||||
rules = [options_from_rule(name, x) for name, x in RULES.items()]
|
||||
rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), None, o) for r, xs, o in rules for x in xs]
|
||||
callback = ParseTreeBuilder(rules, ST).create_callback()
|
||||
lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
|
||||
|
||||
parser_conf = ParserConf(rules, callback, 'start')
|
||||
self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
|
||||
|
||||
self.canonize_tree = CanonizeTree()
|
||||
|
||||
def load_grammar(self, grammar_text, grammar_name='<?>'):
|
||||
"Parse grammar_text, verify, and create Grammar object. Display nice messages on error."
|
||||
|
||||
try:
|
||||
tree = self.canonize_tree.transform( self.parser.parse(grammar_text+'\n') )
|
||||
except UnexpectedCharacters as e:
|
||||
context = e.get_context(grammar_text)
|
||||
raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" %
|
||||
(e.line, e.column, grammar_name, context))
|
||||
except UnexpectedToken as e:
|
||||
context = e.get_context(grammar_text)
|
||||
error = e.match_examples(self.parser.parse, {
|
||||
'Unclosed parenthesis': ['a: (\n'],
|
||||
'Umatched closing parenthesis': ['a: )\n', 'a: [)\n', 'a: (]\n'],
|
||||
'Expecting rule or token definition (missing colon)': ['a\n', 'a->\n', 'A->\n', 'a A\n'],
|
||||
'Alias expects lowercase name': ['a: -> "a"\n'],
|
||||
'Unexpected colon': ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n'],
|
||||
'Misplaced operator': ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n'],
|
||||
'Expecting option ("|") or a new rule or token definition': ['a:a\n()\n'],
|
||||
'%import expects a name': ['%import "a"\n'],
|
||||
'%ignore expects a value': ['%ignore %import\n'],
|
||||
})
|
||||
if error:
|
||||
raise GrammarError("%s at line %s column %s\n\n%s" % (error, e.line, e.column, context))
|
||||
elif 'STRING' in e.expected:
|
||||
raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context))
|
||||
raise
|
||||
|
||||
tree = PrepareGrammar().transform(tree)
|
||||
|
||||
# Extract grammar items
|
||||
defs = classify(tree.children, lambda c: c.data, lambda c: c.children)
|
||||
token_defs = defs.pop('token', [])
|
||||
rule_defs = defs.pop('rule', [])
|
||||
statements = defs.pop('statement', [])
|
||||
assert not defs
|
||||
|
||||
token_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in token_defs]
|
||||
token_defs = [(name.value, (t, int(p))) for name, p, t in token_defs]
|
||||
|
||||
# Execute statements
|
||||
ignore = []
|
||||
declared = []
|
||||
for (stmt,) in statements:
|
||||
if stmt.data == 'ignore':
|
||||
t ,= stmt.children
|
||||
ignore.append(t)
|
||||
elif stmt.data == 'import':
|
||||
if len(stmt.children) > 1:
|
||||
path_node, arg1 = stmt.children
|
||||
else:
|
||||
path_node ,= stmt.children
|
||||
arg1 = None
|
||||
|
||||
dotted_path = path_node.children
|
||||
|
||||
if isinstance(arg1, Tree): # Multi import
|
||||
names = arg1.children
|
||||
aliases = names # Can't have aliased multi import, so all aliases will be the same as names
|
||||
else: # Single import
|
||||
names = [dotted_path[-1]] # Get name from dotted path
|
||||
aliases = [arg1] if arg1 else names # Aliases if exist
|
||||
dotted_path = dotted_path[:-1]
|
||||
|
||||
grammar_path = os.path.join(*dotted_path) + EXT
|
||||
|
||||
if path_node.data == 'import_lib': # Import from library
|
||||
g = import_grammar(grammar_path)
|
||||
else: # Relative import
|
||||
if grammar_name == '<string>': # Import relative to script file path if grammar is coded in script
|
||||
base_file = os.path.abspath(sys.modules['__main__'].__file__)
|
||||
else:
|
||||
base_file = grammar_name # Import relative to grammar file path if external grammar file
|
||||
base_path = os.path.split(base_file)[0]
|
||||
g = import_grammar(grammar_path, base_paths=[base_path])
|
||||
|
||||
for name, alias in zip(names, aliases):
|
||||
token_options = dict(g.token_defs)[name]
|
||||
assert isinstance(token_options, tuple) and len(token_options)==2
|
||||
token_defs.append([alias.value, token_options])
|
||||
|
||||
elif stmt.data == 'declare':
|
||||
for t in stmt.children:
|
||||
token_defs.append([t.value, (None, None)])
|
||||
else:
|
||||
assert False, stmt
|
||||
|
||||
|
||||
# Verify correctness 1
|
||||
for name, _ in token_defs:
|
||||
if name.startswith('__'):
|
||||
raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
|
||||
|
||||
# Handle ignore tokens
|
||||
# XXX A slightly hacky solution. Recognition of %ignore TERMINAL as separate comes from the lexer's
|
||||
# inability to handle duplicate tokens (two names, one value)
|
||||
ignore_names = []
|
||||
for t in ignore:
|
||||
if t.data=='expansions' and len(t.children) == 1:
|
||||
t2 ,= t.children
|
||||
if t2.data=='expansion' and len(t2.children) == 1:
|
||||
item ,= t2.children
|
||||
if item.data == 'value':
|
||||
item ,= item.children
|
||||
if isinstance(item, Token) and item.type == 'TERMINAL':
|
||||
ignore_names.append(item.value)
|
||||
continue
|
||||
|
||||
name = '__IGNORE_%d'% len(ignore_names)
|
||||
ignore_names.append(name)
|
||||
token_defs.append((name, (t, 0)))
|
||||
|
||||
# Verify correctness 2
|
||||
token_names = set()
|
||||
for name, _ in token_defs:
|
||||
if name in token_names:
|
||||
raise GrammarError("Token '%s' defined more than once" % name)
|
||||
token_names.add(name)
|
||||
|
||||
if set(ignore_names) > token_names:
|
||||
raise GrammarError("Tokens %s were marked to ignore but were not defined!" % (set(ignore_names) - token_names))
|
||||
|
||||
# Resolve token references
|
||||
resolve_token_references(token_defs)
|
||||
|
||||
rules = [options_from_rule(*x) for x in rule_defs]
|
||||
|
||||
rule_names = set()
|
||||
for name, _x, _o in rules:
|
||||
if name.startswith('__'):
|
||||
raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
|
||||
if name in rule_names:
|
||||
raise GrammarError("Rule '%s' defined more than once" % name)
|
||||
rule_names.add(name)
|
||||
|
||||
for name, expansions, _o in rules:
|
||||
used_symbols = {t for x in expansions.find_data('expansion')
|
||||
for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
|
||||
for sym in used_symbols:
|
||||
if is_terminal(sym):
|
||||
if sym not in token_names:
|
||||
raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))
|
||||
else:
|
||||
if sym not in rule_names:
|
||||
raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name))
|
||||
|
||||
# TODO don't include unused tokens, they can only cause trouble!
|
||||
|
||||
return Grammar(rules, token_defs, ignore_names)
|
||||
|
||||
|
||||
|
||||
load_grammar = GrammarLoader().load_grammar
|
||||
python/extractor/lark/parse_tree_builder.py (new file, 164 lines)
@@ -0,0 +1,164 @@
|
||||
from .exceptions import GrammarError
|
||||
from .utils import suppress
|
||||
from .lexer import Token
|
||||
from .grammar import Rule
|
||||
from .tree import Tree
|
||||
from .visitors import InlineTransformer # XXX Deprecated
|
||||
|
||||
###{standalone
|
||||
from functools import partial, wraps
|
||||
|
||||
|
||||
class ExpandSingleChild:
|
||||
def __init__(self, node_builder):
|
||||
self.node_builder = node_builder
|
||||
|
||||
def __call__(self, children):
|
||||
if len(children) == 1:
|
||||
return children[0]
|
||||
else:
|
||||
return self.node_builder(children)
|
||||
|
||||
|
||||
class PropagatePositions:
|
||||
def __init__(self, node_builder):
|
||||
self.node_builder = node_builder
|
||||
|
||||
def __call__(self, children):
|
||||
res = self.node_builder(children)
|
||||
|
||||
if children and isinstance(res, Tree):
|
||||
for a in children:
|
||||
if isinstance(a, Tree):
|
||||
res.meta.line = a.meta.line
|
||||
res.meta.column = a.meta.column
|
||||
elif isinstance(a, Token):
|
||||
res.meta.line = a.line
|
||||
res.meta.column = a.column
|
||||
break
|
||||
|
||||
for a in reversed(children):
|
||||
# with suppress(AttributeError):
|
||||
if isinstance(a, Tree):
|
||||
res.meta.end_line = a.meta.end_line
|
||||
res.meta.end_column = a.meta.end_column
|
||||
elif isinstance(a, Token):
|
||||
res.meta.end_line = a.end_line
|
||||
res.meta.end_column = a.end_column
|
||||
|
||||
break
|
||||
|
||||
return res
|
||||
|
||||
|
||||
class ChildFilter:
|
||||
def __init__(self, to_include, node_builder):
|
||||
self.node_builder = node_builder
|
||||
self.to_include = to_include
|
||||
|
||||
def __call__(self, children):
|
||||
filtered = []
|
||||
for i, to_expand in self.to_include:
|
||||
if to_expand:
|
||||
filtered += children[i].children
|
||||
else:
|
||||
filtered.append(children[i])
|
||||
|
||||
return self.node_builder(filtered)
|
||||
|
||||
class ChildFilterLALR(ChildFilter):
|
||||
"Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)"
|
||||
|
||||
def __call__(self, children):
|
||||
filtered = []
|
||||
for i, to_expand in self.to_include:
|
||||
if to_expand:
|
||||
if filtered:
|
||||
filtered += children[i].children
|
||||
else: # Optimize for left-recursion
|
||||
filtered = children[i].children
|
||||
else:
|
||||
filtered.append(children[i])
|
||||
|
||||
return self.node_builder(filtered)
|
||||
|
||||
def _should_expand(sym):
|
||||
return not sym.is_term and sym.name.startswith('_')
|
||||
|
||||
def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous):
|
||||
to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion)
|
||||
if keep_all_tokens or not (sym.is_term and sym.filter_out)]
|
||||
|
||||
if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
|
||||
return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include)
|
||||
|
||||
|
||||
class Callback(object):
|
||||
pass
|
||||
|
||||
|
||||
def inline_args(func):
|
||||
@wraps(func)
|
||||
def f(children):
|
||||
return func(*children)
|
||||
return f
|
||||
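A tiny standalone sketch (not part of the original file) of what the inline_args helper above does: it adapts a user callback from the child-list calling convention to positional arguments. The add callback is hypothetical.

def add(a, b):               # hypothetical user callback
    return int(a) + int(b)

add_cb = inline_args(add)    # now accepts a single list of children
assert add_cb(['2', '3']) == 5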
|
||||
|
||||
|
||||
class ParseTreeBuilder:
|
||||
def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False):
|
||||
self.tree_class = tree_class
|
||||
self.propagate_positions = propagate_positions
|
||||
self.always_keep_all_tokens = keep_all_tokens
|
||||
self.ambiguous = ambiguous
|
||||
|
||||
self.rule_builders = list(self._init_builders(rules))
|
||||
|
||||
self.user_aliases = {}
|
||||
|
||||
def _init_builders(self, rules):
|
||||
for rule in rules:
|
||||
options = rule.options
|
||||
keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False)
|
||||
expand_single_child = options.expand1 if options else False
|
||||
|
||||
wrapper_chain = filter(None, [
|
||||
(expand_single_child and not rule.alias) and ExpandSingleChild,
|
||||
maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous),
|
||||
self.propagate_positions and PropagatePositions,
|
||||
])
|
||||
|
||||
yield rule, wrapper_chain
|
||||
|
||||
|
||||
def create_callback(self, transformer=None):
|
||||
callback = Callback()
|
||||
|
||||
i = 0
|
||||
for rule, wrapper_chain in self.rule_builders:
|
||||
internal_callback_name = '_cb%d_%s' % (i, rule.origin)
|
||||
i += 1
|
||||
|
||||
user_callback_name = rule.alias or rule.origin.name
|
||||
try:
|
||||
f = getattr(transformer, user_callback_name)
|
||||
assert not getattr(f, 'meta', False), "Meta args not supported for internal transformer"
|
||||
# XXX InlineTransformer is deprecated!
|
||||
if getattr(f, 'inline', False) or isinstance(transformer, InlineTransformer):
|
||||
f = inline_args(f)
|
||||
except AttributeError:
|
||||
f = partial(self.tree_class, user_callback_name)
|
||||
|
||||
self.user_aliases[rule] = rule.alias
|
||||
rule.alias = internal_callback_name
|
||||
|
||||
for w in wrapper_chain:
|
||||
f = w(f)
|
||||
|
||||
if hasattr(callback, internal_callback_name):
|
||||
raise GrammarError("Rule '%s' already exists" % (rule,))
|
||||
setattr(callback, internal_callback_name, f)
|
||||
|
||||
return callback
|
||||
|
||||
###}
|
||||
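A minimal sketch of how the wrapper chain above composes around a node builder; make_node and the sample children are hypothetical stand-ins, not lark objects.

def make_node(children):               # stand-in for partial(tree_class, name)
    return ('node', children)

build = ExpandSingleChild(make_node)   # unwraps single-child results
assert build(['only']) == 'only'
assert build(['a', 'b']) == ('node', ['a', 'b'])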
189
python/extractor/lark/parser_frontends.py
Normal file
@@ -0,0 +1,189 @@
|
||||
import re
|
||||
from functools import partial
|
||||
|
||||
from .utils import get_regexp_width
|
||||
from .parsers.grammar_analysis import GrammarAnalyzer
|
||||
from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token
|
||||
|
||||
from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk
|
||||
from .tree import Tree
|
||||
|
||||
class WithLexer:
|
||||
lexer = None
|
||||
parser = None
|
||||
lexer_conf = None
|
||||
|
||||
def init_traditional_lexer(self, lexer_conf):
|
||||
self.lexer_conf = lexer_conf
|
||||
self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
|
||||
|
||||
def init_contextual_lexer(self, lexer_conf):
|
||||
self.lexer_conf = lexer_conf
|
||||
states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
|
||||
always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
|
||||
self.lexer = ContextualLexer(lexer_conf.tokens, states,
|
||||
ignore=lexer_conf.ignore,
|
||||
always_accept=always_accept,
|
||||
user_callbacks=lexer_conf.callbacks)
|
||||
|
||||
def lex(self, text):
|
||||
stream = self.lexer.lex(text)
|
||||
if self.lexer_conf.postlex:
|
||||
return self.lexer_conf.postlex.process(stream)
|
||||
return stream
|
||||
|
||||
def parse(self, text):
|
||||
token_stream = self.lex(text)
|
||||
sps = self.lexer.set_parser_state
|
||||
return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else [])
|
||||
|
||||
class LALR_TraditionalLexer(WithLexer):
|
||||
def __init__(self, lexer_conf, parser_conf, options=None):
|
||||
self.parser = lalr_parser.Parser(parser_conf)
|
||||
self.init_traditional_lexer(lexer_conf)
|
||||
|
||||
class LALR_ContextualLexer(WithLexer):
|
||||
def __init__(self, lexer_conf, parser_conf, options=None):
|
||||
self.parser = lalr_parser.Parser(parser_conf)
|
||||
self.init_contextual_lexer(lexer_conf)
|
||||
|
||||
class LALR_CustomLexer(WithLexer):
|
||||
def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
|
||||
self.parser = lalr_parser.Parser(parser_conf)
|
||||
self.lexer_conf = lexer_conf
|
||||
self.lexer = lexer_cls(lexer_conf)
|
||||
|
||||
|
||||
def get_ambiguity_resolver(options):
|
||||
if not options or options.ambiguity == 'resolve':
|
||||
return resolve_ambig.standard_resolve_ambig
|
||||
elif options.ambiguity == 'resolve__antiscore_sum':
|
||||
return resolve_ambig.antiscore_sum_resolve_ambig
|
||||
elif options.ambiguity == 'explicit':
|
||||
return None
|
||||
raise ValueError(options)
|
||||
|
||||
def tokenize_text(text):
|
||||
line = 1
|
||||
col_start_pos = 0
|
||||
for i, ch in enumerate(text):
|
||||
if '\n' in ch:
|
||||
line += ch.count('\n')
|
||||
col_start_pos = i + ch.rindex('\n')
|
||||
yield Token('CHAR', ch, line=line, column=i - col_start_pos)
|
||||
|
||||
class Earley(WithLexer):
|
||||
def __init__(self, lexer_conf, parser_conf, options=None):
|
||||
self.init_traditional_lexer(lexer_conf)
|
||||
|
||||
self.parser = earley.Parser(parser_conf, self.match,
|
||||
resolve_ambiguity=get_ambiguity_resolver(options))
|
||||
|
||||
def match(self, term, token):
|
||||
return term.name == token.type
|
||||
|
||||
|
||||
class XEarley:
|
||||
def __init__(self, lexer_conf, parser_conf, options=None, **kw):
|
||||
self.token_by_name = {t.name:t for t in lexer_conf.tokens}
|
||||
|
||||
self._prepare_match(lexer_conf)
|
||||
|
||||
self.parser = xearley.Parser(parser_conf,
|
||||
self.match,
|
||||
resolve_ambiguity=get_ambiguity_resolver(options),
|
||||
ignore=lexer_conf.ignore,
|
||||
predict_all=options.earley__predict_all,
|
||||
**kw
|
||||
)
|
||||
|
||||
def match(self, term, text, index=0):
|
||||
return self.regexps[term.name].match(text, index)
|
||||
|
||||
def _prepare_match(self, lexer_conf):
|
||||
self.regexps = {}
|
||||
for t in lexer_conf.tokens:
|
||||
regexp = t.pattern.to_regexp()
|
||||
try:
|
||||
width = get_regexp_width(regexp)[0]
|
||||
except ValueError:
|
||||
raise ValueError("Bad regexp in token %s: %s" % (t.name, regexp))
|
||||
else:
|
||||
if width == 0:
|
||||
raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)
|
||||
|
||||
self.regexps[t.name] = re.compile(regexp)
|
||||
|
||||
def parse(self, text):
|
||||
return self.parser.parse(text)
|
||||
|
||||
class XEarley_CompleteLex(XEarley):
|
||||
def __init__(self, *args, **kw):
|
||||
super(XEarley_CompleteLex, self).__init__(*args, complete_lex=True, **kw)
|
||||
|
||||
|
||||
|
||||
class CYK(WithLexer):
|
||||
|
||||
def __init__(self, lexer_conf, parser_conf, options=None):
|
||||
self.init_traditional_lexer(lexer_conf)
|
||||
|
||||
self._analysis = GrammarAnalyzer(parser_conf)
|
||||
self._parser = cyk.Parser(parser_conf.rules, parser_conf.start)
|
||||
|
||||
self._postprocess = {}
|
||||
for rule in parser_conf.rules:
|
||||
a = rule.alias
|
||||
self._postprocess[a] = a if callable(a) else (a and getattr(parser_conf.callback, a))
|
||||
|
||||
def parse(self, text):
|
||||
tokens = list(self.lex(text))
|
||||
parse = self._parser.parse(tokens)
|
||||
parse = self._transform(parse)
|
||||
return parse
|
||||
|
||||
def _transform(self, tree):
|
||||
subtrees = list(tree.iter_subtrees())
|
||||
for subtree in subtrees:
|
||||
subtree.children = [self._apply_callback(c) if isinstance(c, Tree) else c for c in subtree.children]
|
||||
|
||||
return self._apply_callback(tree)
|
||||
|
||||
def _apply_callback(self, tree):
|
||||
children = tree.children
|
||||
callback = self._postprocess[tree.rule.alias]
|
||||
assert callback, tree.rule.alias
|
||||
r = callback(children)
|
||||
return r
|
||||
|
||||
|
||||
def get_frontend(parser, lexer):
|
||||
if parser=='lalr':
|
||||
if lexer is None:
|
||||
raise ValueError('The LALR parser requires use of a lexer')
|
||||
elif lexer == 'standard':
|
||||
return LALR_TraditionalLexer
|
||||
elif lexer == 'contextual':
|
||||
return LALR_ContextualLexer
|
||||
elif issubclass(lexer, Lexer):
|
||||
return partial(LALR_CustomLexer, lexer)
|
||||
else:
|
||||
raise ValueError('Unknown lexer: %s' % lexer)
|
||||
elif parser=='earley':
|
||||
if lexer=='standard':
|
||||
return Earley
|
||||
elif lexer=='dynamic':
|
||||
return XEarley
|
||||
elif lexer=='dynamic_complete':
|
||||
return XEarley_CompleteLex
|
||||
elif lexer=='contextual':
|
||||
raise ValueError('The Earley parser does not support the contextual lexer')
|
||||
else:
|
||||
raise ValueError('Unknown lexer: %s' % lexer)
|
||||
elif parser == 'cyk':
|
||||
if lexer == 'standard':
|
||||
return CYK
|
||||
else:
|
||||
raise ValueError('The CYK parser requires using the standard lexer.')
|
||||
else:
|
||||
raise ValueError('Unknown parser: %s' % parser)
|
||||
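A hedged usage sketch: the parser/lexer combinations accepted by get_frontend() above correspond to Lark's constructor options. The grammar text is illustrative, not taken from this repo.

from lark import Lark

grammar = 'start: "a"+'

lalr_parser = Lark(grammar, parser='lalr', lexer='contextual')   # -> LALR_ContextualLexer
earley_parser = Lark(grammar, parser='earley', lexer='dynamic')  # -> XEarley

print(lalr_parser.parse('aaa').pretty())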
0
python/extractor/lark/parsers/__init__.py
Normal file
342
python/extractor/lark/parsers/cyk.py
Normal file
@@ -0,0 +1,342 @@
|
||||
"""This module implements a CYK parser."""
|
||||
|
||||
# Author: https://github.com/ehudt (2018)
|
||||
#
|
||||
# Adapted by Erez
|
||||
|
||||
|
||||
from collections import defaultdict
|
||||
import itertools
|
||||
|
||||
from ..exceptions import ParseError
|
||||
from ..lexer import Token
|
||||
from ..tree import Tree
|
||||
from ..grammar import Terminal as T, NonTerminal as NT, Symbol
|
||||
|
||||
try:
|
||||
xrange
|
||||
except NameError:
|
||||
xrange = range
|
||||
|
||||
def match(t, s):
|
||||
assert isinstance(t, T)
|
||||
return t.name == s.type
|
||||
|
||||
|
||||
class Rule(object):
|
||||
"""Context-free grammar rule."""
|
||||
|
||||
def __init__(self, lhs, rhs, weight, alias):
|
||||
super(Rule, self).__init__()
|
||||
assert isinstance(lhs, NT), lhs
|
||||
assert all(isinstance(x, NT) or isinstance(x, T) for x in rhs), rhs
|
||||
self.lhs = lhs
|
||||
self.rhs = rhs
|
||||
self.weight = weight
|
||||
self.alias = alias
|
||||
|
||||
def __str__(self):
|
||||
return '%s -> %s' % (str(self.lhs), ' '.join(str(x) for x in self.rhs))
|
||||
|
||||
def __repr__(self):
|
||||
return str(self)
|
||||
|
||||
def __hash__(self):
|
||||
return hash((self.lhs, tuple(self.rhs)))
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.lhs == other.lhs and self.rhs == other.rhs
|
||||
|
||||
def __ne__(self, other):
|
||||
return not (self == other)
|
||||
|
||||
|
||||
class Grammar(object):
|
||||
"""Context-free grammar."""
|
||||
|
||||
def __init__(self, rules):
|
||||
self.rules = frozenset(rules)
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.rules == other.rules
|
||||
|
||||
def __str__(self):
|
||||
return '\n' + '\n'.join(sorted(repr(x) for x in self.rules)) + '\n'
|
||||
|
||||
def __repr__(self):
|
||||
return str(self)
|
||||
|
||||
|
||||
# Parse tree data structures
|
||||
class RuleNode(object):
|
||||
"""A node in the parse tree, which also contains the full rhs rule."""
|
||||
|
||||
def __init__(self, rule, children, weight=0):
|
||||
self.rule = rule
|
||||
self.children = children
|
||||
self.weight = weight
|
||||
|
||||
def __repr__(self):
|
||||
return 'RuleNode(%s, [%s])' % (repr(self.rule.lhs), ', '.join(str(x) for x in self.children))
|
||||
|
||||
|
||||
|
||||
class Parser(object):
|
||||
"""Parser wrapper."""
|
||||
|
||||
def __init__(self, rules, start):
|
||||
super(Parser, self).__init__()
|
||||
self.orig_rules = {rule.alias: rule for rule in rules}
|
||||
rules = [self._to_rule(rule) for rule in rules]
|
||||
self.grammar = to_cnf(Grammar(rules))
|
||||
self.start = NT(start)
|
||||
|
||||
def _to_rule(self, lark_rule):
|
||||
"""Converts a lark rule, (lhs, rhs, callback, options), to a Rule."""
|
||||
assert isinstance(lark_rule.origin, NT)
|
||||
assert all(isinstance(x, Symbol) for x in lark_rule.expansion)
|
||||
return Rule(
|
||||
lark_rule.origin, lark_rule.expansion,
|
||||
weight=lark_rule.options.priority if lark_rule.options and lark_rule.options.priority else 0,
|
||||
alias=lark_rule.alias)
|
||||
|
||||
def parse(self, tokenized): # pylint: disable=invalid-name
|
||||
"""Parses input, which is a list of tokens."""
|
||||
table, trees = _parse(tokenized, self.grammar)
|
||||
# Check if the parse succeeded.
|
||||
if all(r.lhs != self.start for r in table[(0, len(tokenized) - 1)]):
|
||||
raise ParseError('Parsing failed.')
|
||||
parse = trees[(0, len(tokenized) - 1)][self.start]
|
||||
return self._to_tree(revert_cnf(parse))
|
||||
|
||||
def _to_tree(self, rule_node):
|
||||
"""Converts a RuleNode parse tree to a lark Tree."""
|
||||
orig_rule = self.orig_rules[rule_node.rule.alias]
|
||||
children = []
|
||||
for child in rule_node.children:
|
||||
if isinstance(child, RuleNode):
|
||||
children.append(self._to_tree(child))
|
||||
else:
|
||||
assert isinstance(child.name, Token)
|
||||
children.append(child.name)
|
||||
t = Tree(orig_rule.origin, children)
|
||||
t.rule=orig_rule
|
||||
return t
|
||||
|
||||
|
||||
def print_parse(node, indent=0):
|
||||
if isinstance(node, RuleNode):
|
||||
print(' ' * (indent * 2) + str(node.rule.lhs))
|
||||
for child in node.children:
|
||||
print_parse(child, indent + 1)
|
||||
else:
|
||||
print(' ' * (indent * 2) + str(node.s))
|
||||
|
||||
|
||||
def _parse(s, g):
|
||||
"""Parses sentence 's' using CNF grammar 'g'."""
|
||||
# The CYK table. Indexed with a 2-tuple: (start pos, end pos)
|
||||
table = defaultdict(set)
|
||||
# Top-level structure is similar to the CYK table. Each cell is a dict from
|
||||
# rule name to the best (lightest) tree for that rule.
|
||||
trees = defaultdict(dict)
|
||||
# Populate base case with existing terminal production rules
|
||||
for i, w in enumerate(s):
|
||||
for terminal, rules in g.terminal_rules.items():
|
||||
if match(terminal, w):
|
||||
for rule in rules:
|
||||
table[(i, i)].add(rule)
|
||||
if (rule.lhs not in trees[(i, i)] or
|
||||
rule.weight < trees[(i, i)][rule.lhs].weight):
|
||||
trees[(i, i)][rule.lhs] = RuleNode(rule, [T(w)], weight=rule.weight)
|
||||
|
||||
# Iterate over lengths of sub-sentences
|
||||
for l in xrange(2, len(s) + 1):
|
||||
# Iterate over sub-sentences with the given length
|
||||
for i in xrange(len(s) - l + 1):
|
||||
# Choose partition of the sub-sentence in [1, l)
|
||||
for p in xrange(i + 1, i + l):
|
||||
span1 = (i, p - 1)
|
||||
span2 = (p, i + l - 1)
|
||||
for r1, r2 in itertools.product(table[span1], table[span2]):
|
||||
for rule in g.nonterminal_rules.get((r1.lhs, r2.lhs), []):
|
||||
table[(i, i + l - 1)].add(rule)
|
||||
r1_tree = trees[span1][r1.lhs]
|
||||
r2_tree = trees[span2][r2.lhs]
|
||||
rule_total_weight = rule.weight + r1_tree.weight + r2_tree.weight
|
||||
if (rule.lhs not in trees[(i, i + l - 1)]
|
||||
or rule_total_weight < trees[(i, i + l - 1)][rule.lhs].weight):
|
||||
trees[(i, i + l - 1)][rule.lhs] = RuleNode(rule, [r1_tree, r2_tree], weight=rule_total_weight)
|
||||
return table, trees
|
||||
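A standalone illustration (plain Python, no lark objects) of the span bookkeeping in _parse above: each table cell (i, j) is combined from the two smaller spans (i, p-1) and (p, j), for every split point p.

n = 3  # a 3-token input
for l in range(2, n + 1):              # length of the sub-sentence
    for i in range(n - l + 1):         # start position
        j = i + l - 1                  # end position (inclusive)
        splits = [((i, p - 1), (p, j)) for p in range(i + 1, i + l)]
        print((i, j), '<-', splits)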
|
||||
|
||||
# This section implements context-free grammar converter to Chomsky normal form.
|
||||
# It also implements a conversion of parse trees from its CNF to the original
|
||||
# grammar.
|
||||
# Overview:
|
||||
# Applies the following operations in this order:
|
||||
# * TERM: Eliminates non-solitary terminals from all rules
|
||||
# * BIN: Eliminates rules with more than 2 symbols on their right-hand-side.
|
||||
# * UNIT: Eliminates non-terminal unit rules
|
||||
#
|
||||
# The following grammar characteristics aren't featured:
|
||||
# * Start symbol appears on RHS
|
||||
# * Empty rules (epsilon rules)
|
||||
|
||||
|
||||
class CnfWrapper(object):
|
||||
"""CNF wrapper for grammar.
|
||||
|
||||
Validates that the input grammar is CNF and provides helper data structures.
|
||||
"""
|
||||
|
||||
def __init__(self, grammar):
|
||||
super(CnfWrapper, self).__init__()
|
||||
self.grammar = grammar
|
||||
self.rules = grammar.rules
|
||||
self.terminal_rules = defaultdict(list)
|
||||
self.nonterminal_rules = defaultdict(list)
|
||||
for r in self.rules:
|
||||
# Validate that the grammar is CNF and populate auxiliary data structures.
|
||||
assert isinstance(r.lhs, NT), r
|
||||
assert len(r.rhs) in [1, 2], r
|
||||
if len(r.rhs) == 1 and isinstance(r.rhs[0], T):
|
||||
self.terminal_rules[r.rhs[0]].append(r)
|
||||
elif len(r.rhs) == 2 and all(isinstance(x, NT) for x in r.rhs):
|
||||
self.nonterminal_rules[tuple(r.rhs)].append(r)
|
||||
else:
|
||||
assert False, r
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.grammar == other.grammar
|
||||
|
||||
def __repr__(self):
|
||||
return repr(self.grammar)
|
||||
|
||||
|
||||
class UnitSkipRule(Rule):
|
||||
"""A rule that records NTs that were skipped during transformation."""
|
||||
|
||||
def __init__(self, lhs, rhs, skipped_rules, weight, alias):
|
||||
super(UnitSkipRule, self).__init__(lhs, rhs, weight, alias)
|
||||
self.skipped_rules = skipped_rules
|
||||
|
||||
def __eq__(self, other):
|
||||
return isinstance(other, type(self)) and self.skipped_rules == other.skipped_rules
|
||||
|
||||
__hash__ = Rule.__hash__
|
||||
|
||||
|
||||
def build_unit_skiprule(unit_rule, target_rule):
|
||||
skipped_rules = []
|
||||
if isinstance(unit_rule, UnitSkipRule):
|
||||
skipped_rules += unit_rule.skipped_rules
|
||||
skipped_rules.append(target_rule)
|
||||
if isinstance(target_rule, UnitSkipRule):
|
||||
skipped_rules += target_rule.skipped_rules
|
||||
return UnitSkipRule(unit_rule.lhs, target_rule.rhs, skipped_rules,
|
||||
weight=unit_rule.weight + target_rule.weight, alias=unit_rule.alias)
|
||||
|
||||
|
||||
def get_any_nt_unit_rule(g):
|
||||
"""Returns a non-terminal unit rule from 'g', or None if there is none."""
|
||||
for rule in g.rules:
|
||||
if len(rule.rhs) == 1 and isinstance(rule.rhs[0], NT):
|
||||
return rule
|
||||
return None
|
||||
|
||||
|
||||
def _remove_unit_rule(g, rule):
|
||||
"""Removes 'rule' from 'g' without changing the langugage produced by 'g'."""
|
||||
new_rules = [x for x in g.rules if x != rule]
|
||||
refs = [x for x in g.rules if x.lhs == rule.rhs[0]]
|
||||
new_rules += [build_unit_skiprule(rule, ref) for ref in refs]
|
||||
return Grammar(new_rules)
|
||||
|
||||
|
||||
def _split(rule):
|
||||
"""Splits a rule whose len(rhs) > 2 into shorter rules."""
|
||||
rule_str = str(rule.lhs) + '__' + '_'.join(str(x) for x in rule.rhs)
|
||||
rule_name = '__SP_%s' % (rule_str) + '_%d'
|
||||
yield Rule(rule.lhs, [rule.rhs[0], NT(rule_name % 1)], weight=rule.weight, alias=rule.alias)
|
||||
for i in xrange(1, len(rule.rhs) - 2):
|
||||
yield Rule(NT(rule_name % i), [rule.rhs[i], NT(rule_name % (i + 1))], weight=0, alias='Split')
|
||||
yield Rule(NT(rule_name % (len(rule.rhs) - 2)), rule.rhs[-2:], weight=0, alias='Split')
|
||||
|
||||
|
||||
def _term(g):
|
||||
"""Applies the TERM rule on 'g' (see top comment)."""
|
||||
all_t = {x for rule in g.rules for x in rule.rhs if isinstance(x, T)}
|
||||
t_rules = {t: Rule(NT('__T_%s' % str(t)), [t], weight=0, alias='Term') for t in all_t}
|
||||
new_rules = []
|
||||
for rule in g.rules:
|
||||
if len(rule.rhs) > 1 and any(isinstance(x, T) for x in rule.rhs):
|
||||
new_rhs = [t_rules[x].lhs if isinstance(x, T) else x for x in rule.rhs]
|
||||
new_rules.append(Rule(rule.lhs, new_rhs, weight=rule.weight, alias=rule.alias))
|
||||
new_rules.extend(v for k, v in t_rules.items() if k in rule.rhs)
|
||||
else:
|
||||
new_rules.append(rule)
|
||||
return Grammar(new_rules)
|
||||
|
||||
|
||||
def _bin(g):
|
||||
"""Applies the BIN rule to 'g' (see top comment)."""
|
||||
new_rules = []
|
||||
for rule in g.rules:
|
||||
if len(rule.rhs) > 2:
|
||||
new_rules += _split(rule)
|
||||
else:
|
||||
new_rules.append(rule)
|
||||
return Grammar(new_rules)
|
||||
|
||||
|
||||
def _unit(g):
|
||||
"""Applies the UNIT rule to 'g' (see top comment)."""
|
||||
nt_unit_rule = get_any_nt_unit_rule(g)
|
||||
while nt_unit_rule:
|
||||
g = _remove_unit_rule(g, nt_unit_rule)
|
||||
nt_unit_rule = get_any_nt_unit_rule(g)
|
||||
return g
|
||||
|
||||
|
||||
def to_cnf(g):
|
||||
"""Creates a CNF grammar from a general context-free grammar 'g'."""
|
||||
g = _unit(_bin(_term(g)))
|
||||
return CnfWrapper(g)
|
||||
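A hedged sketch of the TERM and BIN steps, using the Rule/Grammar/to_cnf definitions in this module; the two toy rules are illustrative and not taken from any real grammar.

toy_rules = [
    Rule(NT('s'), [T('A'), NT('x'), T('B')], weight=0, alias='s'),  # mixed, 3-symbol body
    Rule(NT('x'), [T('A')], weight=0, alias='x'),
]
cnf = to_cnf(Grammar(toy_rules))
# TERM wraps the inline terminals in __T_A / __T_B rules and BIN splits the
# 3-symbol body into binary __SP_* rules, so every remaining rule is CNF-shaped.
for r in sorted(cnf.rules, key=str):
    print(r)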
|
||||
|
||||
def unroll_unit_skiprule(lhs, orig_rhs, skipped_rules, children, weight, alias):
|
||||
if not skipped_rules:
|
||||
return RuleNode(Rule(lhs, orig_rhs, weight=weight, alias=alias), children, weight=weight)
|
||||
else:
|
||||
weight = weight - skipped_rules[0].weight
|
||||
return RuleNode(
|
||||
Rule(lhs, [skipped_rules[0].lhs], weight=weight, alias=alias), [
|
||||
unroll_unit_skiprule(skipped_rules[0].lhs, orig_rhs,
|
||||
skipped_rules[1:], children,
|
||||
skipped_rules[0].weight, skipped_rules[0].alias)
|
||||
], weight=weight)
|
||||
|
||||
|
||||
def revert_cnf(node):
|
||||
"""Reverts a parse tree (RuleNode) to its original non-CNF form (Node)."""
|
||||
if isinstance(node, T):
|
||||
return node
|
||||
# Reverts TERM rule.
|
||||
if node.rule.lhs.name.startswith('__T_'):
|
||||
return node.children[0]
|
||||
else:
|
||||
children = []
|
||||
for child in map(revert_cnf, node.children):
|
||||
# Reverts BIN rule.
|
||||
if isinstance(child, RuleNode) and child.rule.lhs.name.startswith('__SP_'):
|
||||
children += child.children
|
||||
else:
|
||||
children.append(child)
|
||||
# Reverts UNIT rule.
|
||||
if isinstance(node.rule, UnitSkipRule):
|
||||
return unroll_unit_skiprule(node.rule.lhs, node.rule.rhs,
|
||||
node.rule.skipped_rules, children,
|
||||
node.rule.weight, node.rule.alias)
|
||||
else:
|
||||
return RuleNode(node.rule, children)
|
||||
239
python/extractor/lark/parsers/earley.py
Normal file
@@ -0,0 +1,239 @@
|
||||
"This module implements an Earley Parser"
|
||||
|
||||
# The parser uses a parse-forest to keep track of derivations and ambiguities.
|
||||
# When the parse ends successfully, a disambiguation stage resolves all ambiguity
|
||||
# (right now ambiguity resolution is not developed beyond the needs of lark)
|
||||
# Afterwards the parse tree is reduced (transformed) according to user callbacks.
|
||||
# I use the no-recursion version of Transformer, because the tree might be
|
||||
# deeper than Python's recursion limit (a bit absurd, but that's life)
|
||||
#
|
||||
# The algorithm keeps track of each state set, using a corresponding Column instance.
|
||||
# Column keeps track of new items using NewsList instances.
|
||||
#
|
||||
# Author: Erez Shinan (2017)
|
||||
# Email : erezshin@gmail.com
|
||||
|
||||
from ..tree import Tree
|
||||
from ..visitors import Transformer_InPlace, v_args
|
||||
from ..exceptions import ParseError, UnexpectedToken
|
||||
from .grammar_analysis import GrammarAnalyzer
|
||||
from ..grammar import NonTerminal
|
||||
|
||||
|
||||
class Derivation(Tree):
|
||||
def __init__(self, rule, items=None):
|
||||
Tree.__init__(self, 'drv', items or [])
|
||||
self.meta.rule = rule
|
||||
self._hash = None
|
||||
|
||||
def _pretty_label(self): # Nicer pretty for debugging the parser
|
||||
return self.rule.origin if self.rule else self.data
|
||||
|
||||
def __hash__(self):
|
||||
if self._hash is None:
|
||||
self._hash = Tree.__hash__(self)
|
||||
return self._hash
|
||||
|
||||
class Item(object):
|
||||
"An Earley Item, the atom of the algorithm."
|
||||
|
||||
def __init__(self, rule, ptr, start, tree):
|
||||
self.rule = rule
|
||||
self.ptr = ptr
|
||||
self.start = start
|
||||
self.tree = tree if tree is not None else Derivation(self.rule)
|
||||
|
||||
@property
|
||||
def expect(self):
|
||||
return self.rule.expansion[self.ptr]
|
||||
|
||||
@property
|
||||
def is_complete(self):
|
||||
return self.ptr == len(self.rule.expansion)
|
||||
|
||||
def advance(self, tree):
|
||||
assert self.tree.data == 'drv'
|
||||
new_tree = Derivation(self.rule, self.tree.children + [tree])
|
||||
return self.__class__(self.rule, self.ptr+1, self.start, new_tree)
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule
|
||||
|
||||
def __hash__(self):
|
||||
return hash((self.rule, self.ptr, id(self.start))) # Always runs Derivation.__hash__
|
||||
|
||||
def __repr__(self):
|
||||
before = list(map(str, self.rule.expansion[:self.ptr]))
|
||||
after = list(map(str, self.rule.expansion[self.ptr:]))
|
||||
return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after))
|
||||
|
||||
class NewsList(list):
|
||||
"Keeps track of newly added items (append-only)"
|
||||
|
||||
def __init__(self, initial=None):
|
||||
list.__init__(self, initial or [])
|
||||
self.last_iter = 0
|
||||
|
||||
def get_news(self):
|
||||
i = self.last_iter
|
||||
self.last_iter = len(self)
|
||||
return self[i:]
|
||||
|
||||
|
||||
|
||||
class Column:
|
||||
"An entry in the table, aka Earley Chart. Contains lists of items."
|
||||
def __init__(self, i, FIRST, predict_all=False):
|
||||
self.i = i
|
||||
self.to_reduce = NewsList()
|
||||
self.to_predict = NewsList()
|
||||
self.to_scan = []
|
||||
self.item_count = 0
|
||||
self.FIRST = FIRST
|
||||
|
||||
self.predicted = set()
|
||||
self.completed = {}
|
||||
self.predict_all = predict_all
|
||||
|
||||
def add(self, items):
|
||||
"""Sort items into scan/predict/reduce newslists
|
||||
|
||||
Makes sure only unique items are added.
|
||||
"""
|
||||
for item in items:
|
||||
|
||||
item_key = item, item.tree # Elsewhere, tree is not part of the comparison
|
||||
if item.is_complete:
|
||||
# XXX Potential bug: What happens if there's ambiguity in an empty rule?
|
||||
if item.rule.expansion and item_key in self.completed:
|
||||
old_tree = self.completed[item_key].tree
|
||||
if old_tree == item.tree:
|
||||
is_empty = not self.FIRST[item.rule.origin]
|
||||
if not is_empty:
|
||||
continue
|
||||
|
||||
if old_tree.data != '_ambig':
|
||||
new_tree = old_tree.copy()
|
||||
new_tree.meta.rule = old_tree.meta.rule
|
||||
old_tree.set('_ambig', [new_tree])
|
||||
old_tree.meta.rule = None # No longer a 'drv' node
|
||||
|
||||
if item.tree.children[0] is old_tree: # XXX a little hacky!
|
||||
raise ParseError("Infinite recursion in grammar! (Rule %s)" % item.rule)
|
||||
|
||||
if item.tree not in old_tree.children:
|
||||
old_tree.children.append(item.tree)
|
||||
# old_tree.children.append(item.tree)
|
||||
else:
|
||||
self.completed[item_key] = item
|
||||
self.to_reduce.append(item)
|
||||
else:
|
||||
if item.expect.is_term:
|
||||
self.to_scan.append(item)
|
||||
else:
|
||||
k = item_key if self.predict_all else item
|
||||
if k in self.predicted:
|
||||
continue
|
||||
self.predicted.add(k)
|
||||
self.to_predict.append(item)
|
||||
|
||||
self.item_count += 1 # Only count if actually added
|
||||
|
||||
|
||||
def __bool__(self):
|
||||
return bool(self.item_count)
|
||||
__nonzero__ = __bool__ # Py2 backwards-compatibility
|
||||
|
||||
class Parser:
|
||||
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None):
|
||||
analysis = GrammarAnalyzer(parser_conf)
|
||||
self.parser_conf = parser_conf
|
||||
self.resolve_ambiguity = resolve_ambiguity
|
||||
|
||||
self.FIRST = analysis.FIRST
|
||||
self.postprocess = {}
|
||||
self.predictions = {}
|
||||
for rule in parser_conf.rules:
|
||||
self.postprocess[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias)
|
||||
self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]
|
||||
|
||||
self.term_matcher = term_matcher
|
||||
|
||||
|
||||
def parse(self, stream, start_symbol=None):
|
||||
# Define parser functions
|
||||
start_symbol = NonTerminal(start_symbol or self.parser_conf.start)
|
||||
|
||||
_Item = Item
|
||||
match = self.term_matcher
|
||||
|
||||
def predict(nonterm, column):
|
||||
assert not nonterm.is_term, nonterm
|
||||
return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
|
||||
|
||||
def complete(item):
|
||||
name = item.rule.origin
|
||||
return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name]
|
||||
|
||||
def predict_and_complete(column):
|
||||
while True:
|
||||
to_predict = {x.expect for x in column.to_predict.get_news()
|
||||
if x.ptr} # if not part of an already predicted batch
|
||||
to_reduce = set(column.to_reduce.get_news())
|
||||
if not (to_predict or to_reduce):
|
||||
break
|
||||
|
||||
for nonterm in to_predict:
|
||||
column.add( predict(nonterm, column) )
|
||||
|
||||
for item in to_reduce:
|
||||
new_items = list(complete(item))
|
||||
if item in new_items:
|
||||
raise ParseError('Infinite recursion detected! (rule %s)' % item.rule)
|
||||
column.add(new_items)
|
||||
|
||||
def scan(i, token, column):
|
||||
next_set = Column(i, self.FIRST)
|
||||
next_set.add(item.advance(token) for item in column.to_scan if match(item.expect, token))
|
||||
|
||||
if not next_set:
|
||||
expect = {i.expect.name for i in column.to_scan}
|
||||
raise UnexpectedToken(token, expect, considered_rules=set(column.to_scan))
|
||||
|
||||
return next_set
|
||||
|
||||
# Main loop starts
|
||||
column0 = Column(0, self.FIRST)
|
||||
column0.add(predict(start_symbol, column0))
|
||||
|
||||
column = column0
|
||||
for i, token in enumerate(stream):
|
||||
predict_and_complete(column)
|
||||
column = scan(i, token, column)
|
||||
|
||||
predict_and_complete(column)
|
||||
|
||||
# Parse ended. Now build a parse tree
|
||||
solutions = [n.tree for n in column.to_reduce
|
||||
if n.rule.origin==start_symbol and n.start is column0]
|
||||
|
||||
if not solutions:
|
||||
raise ParseError('Incomplete parse: Could not find a solution to input')
|
||||
elif len(solutions) == 1:
|
||||
tree = solutions[0]
|
||||
else:
|
||||
tree = Tree('_ambig', solutions)
|
||||
|
||||
if self.resolve_ambiguity:
|
||||
tree = self.resolve_ambiguity(tree)
|
||||
|
||||
return ApplyCallbacks(self.postprocess).transform(tree)
|
||||
|
||||
|
||||
class ApplyCallbacks(Transformer_InPlace):
|
||||
def __init__(self, postprocess):
|
||||
self.postprocess = postprocess
|
||||
|
||||
@v_args(meta=True)
|
||||
def drv(self, children, meta):
|
||||
return self.postprocess[meta.rule](children)
|
||||
148
python/extractor/lark/parsers/grammar_analysis.py
Normal file
@@ -0,0 +1,148 @@
|
||||
|
||||
from ..utils import bfs, fzset, classify
|
||||
from ..exceptions import GrammarError
|
||||
from ..grammar import Rule, Terminal, NonTerminal
|
||||
|
||||
|
||||
class RulePtr(object):
|
||||
__slots__ = ('rule', 'index')
|
||||
|
||||
def __init__(self, rule, index):
|
||||
assert isinstance(rule, Rule)
|
||||
assert index <= len(rule.expansion)
|
||||
self.rule = rule
|
||||
self.index = index
|
||||
|
||||
def __repr__(self):
|
||||
before = self.rule.expansion[:self.index]
|
||||
after = self.rule.expansion[self.index:]
|
||||
return '<%s : %s * %s>' % (self.rule.origin, ' '.join(str(x) for x in before), ' '.join(str(x) for x in after))
|
||||
|
||||
@property
|
||||
def next(self):
|
||||
return self.rule.expansion[self.index]
|
||||
|
||||
def advance(self, sym):
|
||||
assert self.next == sym
|
||||
return RulePtr(self.rule, self.index+1)
|
||||
|
||||
@property
|
||||
def is_satisfied(self):
|
||||
return self.index == len(self.rule.expansion)
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.rule == other.rule and self.index == other.index
|
||||
def __hash__(self):
|
||||
return hash((self.rule, self.index))
|
||||
|
||||
|
||||
def update_set(set1, set2):
|
||||
if not set2:
|
||||
return False
|
||||
|
||||
copy = set(set1)
|
||||
set1 |= set2
|
||||
return set1 != copy
|
||||
|
||||
def calculate_sets(rules):
|
||||
"""Calculate FOLLOW sets.
|
||||
|
||||
Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets"""
|
||||
symbols = {sym for rule in rules for sym in rule.expansion} | {rule.origin for rule in rules}
|
||||
|
||||
# foreach grammar rule X ::= Y(1) ... Y(k)
|
||||
# if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
|
||||
# NULLABLE = NULLABLE union {X}
|
||||
# for i = 1 to k
|
||||
# if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
|
||||
# FIRST(X) = FIRST(X) union FIRST(Y(i))
|
||||
# for j = i+1 to k
|
||||
# if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then
|
||||
# FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
|
||||
# if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
|
||||
# FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
|
||||
# until none of NULLABLE,FIRST,FOLLOW changed in last iteration
|
||||
|
||||
NULLABLE = set()
|
||||
FIRST = {}
|
||||
FOLLOW = {}
|
||||
for sym in symbols:
|
||||
FIRST[sym]={sym} if sym.is_term else set()
|
||||
FOLLOW[sym]=set()
|
||||
|
||||
# Calculate NULLABLE and FIRST
|
||||
changed = True
|
||||
while changed:
|
||||
changed = False
|
||||
|
||||
for rule in rules:
|
||||
if set(rule.expansion) <= NULLABLE:
|
||||
if update_set(NULLABLE, {rule.origin}):
|
||||
changed = True
|
||||
|
||||
for i, sym in enumerate(rule.expansion):
|
||||
if set(rule.expansion[:i]) <= NULLABLE:
|
||||
if update_set(FIRST[rule.origin], FIRST[sym]):
|
||||
changed = True
|
||||
|
||||
# Calculate FOLLOW
|
||||
changed = True
|
||||
while changed:
|
||||
changed = False
|
||||
|
||||
for rule in rules:
|
||||
for i, sym in enumerate(rule.expansion):
|
||||
if i==len(rule.expansion)-1 or set(rule.expansion[i:]) <= NULLABLE:
|
||||
if update_set(FOLLOW[sym], FOLLOW[rule.origin]):
|
||||
changed = True
|
||||
|
||||
for j in range(i+1, len(rule.expansion)):
|
||||
if set(rule.expansion[i+1:j]) <= NULLABLE:
|
||||
if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]):
|
||||
changed = True
|
||||
|
||||
return FIRST, FOLLOW, NULLABLE
|
||||
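A hedged sketch of calculate_sets() on the toy grammar s -> A s | A. The Rule construction mirrors the $root rule built in GrammarAnalyzer below; the grammar itself is illustrative.

toy = [
    Rule(NonTerminal('s'), [Terminal('A'), NonTerminal('s')]),
    Rule(NonTerminal('s'), [Terminal('A')]),
]
FIRST, FOLLOW, NULLABLE = calculate_sets(toy)
# Every expansion of s starts with the terminal A and none of them is empty,
# so we expect FIRST[s] == {Terminal('A')} and an empty NULLABLE set.
print(FIRST[NonTerminal('s')], NULLABLE)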
|
||||
|
||||
class GrammarAnalyzer(object):
|
||||
def __init__(self, parser_conf, debug=False):
|
||||
self.debug = debug
|
||||
|
||||
rules = parser_conf.rules + [Rule(NonTerminal('$root'), [NonTerminal(parser_conf.start), Terminal('$END')])]
|
||||
self.rules_by_origin = classify(rules, lambda r: r.origin)
|
||||
|
||||
assert len(rules) == len(set(rules))
|
||||
for r in rules:
|
||||
for sym in r.expansion:
|
||||
if not (sym.is_term or sym in self.rules_by_origin):
|
||||
raise GrammarError("Using an undefined rule: %s" % sym) # TODO test validation
|
||||
|
||||
self.start_state = self.expand_rule(NonTerminal('$root'))
|
||||
|
||||
self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)
|
||||
|
||||
def expand_rule(self, rule):
|
||||
"Returns all init_ptrs accessible by rule (recursive)"
|
||||
init_ptrs = set()
|
||||
def _expand_rule(rule):
|
||||
assert not rule.is_term, rule
|
||||
|
||||
for r in self.rules_by_origin[rule]:
|
||||
init_ptr = RulePtr(r, 0)
|
||||
init_ptrs.add(init_ptr)
|
||||
|
||||
if r.expansion: # if not empty rule
|
||||
new_r = init_ptr.next
|
||||
if not new_r.is_term:
|
||||
yield new_r
|
||||
|
||||
for _ in bfs([rule], _expand_rule):
|
||||
pass
|
||||
|
||||
return fzset(init_ptrs)
|
||||
|
||||
def _first(self, r):
|
||||
if r.is_term:
|
||||
return {r}
|
||||
else:
|
||||
return {rp.next for rp in self.expand_rule(r) if rp.next.is_term}
|
||||
108
python/extractor/lark/parsers/lalr_analysis.py
Normal file
@@ -0,0 +1,108 @@
|
||||
"""This module builds a LALR(1) transition-table for lalr_parser.py
|
||||
|
||||
For now, shift/reduce conflicts are automatically resolved as shifts.
|
||||
"""
|
||||
|
||||
# Author: Erez Shinan (2017)
|
||||
# Email : erezshin@gmail.com
|
||||
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
|
||||
from ..utils import classify, classify_bool, bfs, fzset
|
||||
from ..exceptions import GrammarError
|
||||
|
||||
from .grammar_analysis import GrammarAnalyzer, Terminal
|
||||
|
||||
class Action:
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
def __str__(self):
|
||||
return self.name
|
||||
def __repr__(self):
|
||||
return str(self)
|
||||
|
||||
Shift = Action('Shift')
|
||||
Reduce = Action('Reduce')
|
||||
|
||||
class ParseTable:
|
||||
def __init__(self, states, start_state, end_state):
|
||||
self.states = states
|
||||
self.start_state = start_state
|
||||
self.end_state = end_state
|
||||
|
||||
class IntParseTable(ParseTable):
|
||||
|
||||
@classmethod
|
||||
def from_ParseTable(cls, parse_table):
|
||||
enum = list(parse_table.states)
|
||||
state_to_idx = {s:i for i,s in enumerate(enum)}
|
||||
int_states = {}
|
||||
|
||||
for s, la in parse_table.states.items():
|
||||
la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v
|
||||
for k,v in la.items()}
|
||||
int_states[ state_to_idx[s] ] = la
|
||||
|
||||
|
||||
start_state = state_to_idx[parse_table.start_state]
|
||||
end_state = state_to_idx[parse_table.end_state]
|
||||
return cls(int_states, start_state, end_state)
|
||||
|
||||
|
||||
|
||||
|
||||
class LALR_Analyzer(GrammarAnalyzer):
|
||||
|
||||
def compute_lookahead(self):
|
||||
self.end_states = []
|
||||
|
||||
self.states = {}
|
||||
def step(state):
|
||||
lookahead = defaultdict(list)
|
||||
sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied)
|
||||
for rp in sat:
|
||||
for term in self.FOLLOW.get(rp.rule.origin, ()):
|
||||
lookahead[term].append((Reduce, rp.rule))
|
||||
|
||||
d = classify(unsat, lambda rp: rp.next)
|
||||
for sym, rps in d.items():
|
||||
rps = {rp.advance(sym) for rp in rps}
|
||||
|
||||
for rp in set(rps):
|
||||
if not rp.is_satisfied and not rp.next.is_term:
|
||||
rps |= self.expand_rule(rp.next)
|
||||
|
||||
new_state = fzset(rps)
|
||||
lookahead[sym].append((Shift, new_state))
|
||||
if sym == Terminal('$END'):
|
||||
self.end_states.append( new_state )
|
||||
yield new_state
|
||||
|
||||
for k, v in lookahead.items():
|
||||
if len(v) > 1:
|
||||
if self.debug:
|
||||
logging.warn("Shift/reduce conflict for %s: %s. Resolving as shift.", k, v)
|
||||
for x in v:
|
||||
# XXX resolving shift/reduce into shift, like PLY
|
||||
# Give a proper warning
|
||||
if x[0] is Shift:
|
||||
lookahead[k] = [x]
|
||||
|
||||
for k, v in lookahead.items():
|
||||
if not len(v) == 1:
|
||||
raise GrammarError("Collision in %s: %s" %(k, ', '.join(['\n * %s: %s' % x for x in v])))
|
||||
|
||||
self.states[state] = {k.name:v[0] for k, v in lookahead.items()}
|
||||
|
||||
for _ in bfs([self.start_state], step):
|
||||
pass
|
||||
|
||||
self.end_state ,= self.end_states
|
||||
|
||||
self._parse_table = ParseTable(self.states, self.start_state, self.end_state)
|
||||
|
||||
if self.debug:
|
||||
self.parse_table = self._parse_table
|
||||
else:
|
||||
self.parse_table = IntParseTable.from_ParseTable(self._parse_table)
|
||||
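A hedged sketch of the shift-over-reduce policy above: a dangling-else style grammar has a shift/reduce conflict on the "else" token, which this analyzer should resolve as a shift, so the else binds to the nearest if. The grammar text is illustrative.

from lark import Lark

dangling = Lark('''
    start: stmt
    stmt: "if" stmt ["else" stmt]
        | "x"
    %import common.WS
    %ignore WS
''', parser='lalr')

print(dangling.parse('if if x else x').pretty())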
90
python/extractor/lark/parsers/lalr_parser.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""This module implements a LALR(1) Parser
|
||||
"""
|
||||
# Author: Erez Shinan (2017)
|
||||
# Email : erezshin@gmail.com
|
||||
from ..exceptions import UnexpectedToken
|
||||
|
||||
from .lalr_analysis import LALR_Analyzer, Shift
|
||||
|
||||
class Parser:
|
||||
def __init__(self, parser_conf):
|
||||
assert all(r.options is None or r.options.priority is None
|
||||
for r in parser_conf.rules), "LALR doesn't yet support prioritization"
|
||||
analysis = LALR_Analyzer(parser_conf)
|
||||
analysis.compute_lookahead()
|
||||
callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None)
|
||||
for rule in parser_conf.rules}
|
||||
|
||||
self._parse_table = analysis.parse_table
|
||||
self.parser_conf = parser_conf
|
||||
self.parser = _Parser(analysis.parse_table, callbacks)
|
||||
self.parse = self.parser.parse
|
||||
|
||||
###{standalone
|
||||
|
||||
class _Parser:
|
||||
def __init__(self, parse_table, callbacks):
|
||||
self.states = parse_table.states
|
||||
self.start_state = parse_table.start_state
|
||||
self.end_state = parse_table.end_state
|
||||
self.callbacks = callbacks
|
||||
|
||||
def parse(self, seq, set_state=None):
|
||||
i = 0
|
||||
token = None
|
||||
stream = iter(seq)
|
||||
states = self.states
|
||||
|
||||
state_stack = [self.start_state]
|
||||
value_stack = []
|
||||
|
||||
if set_state: set_state(self.start_state)
|
||||
|
||||
def get_action(key):
|
||||
state = state_stack[-1]
|
||||
try:
|
||||
return states[state][key]
|
||||
except KeyError:
|
||||
expected = states[state].keys()
|
||||
raise UnexpectedToken(token, expected, state=state) # TODO filter out rules from expected
|
||||
|
||||
def reduce(rule):
|
||||
size = len(rule.expansion)
|
||||
if size:
|
||||
s = value_stack[-size:]
|
||||
del state_stack[-size:]
|
||||
del value_stack[-size:]
|
||||
else:
|
||||
s = []
|
||||
|
||||
value = self.callbacks[rule](s)
|
||||
|
||||
_action, new_state = get_action(rule.origin.name)
|
||||
assert _action is Shift
|
||||
state_stack.append(new_state)
|
||||
value_stack.append(value)
|
||||
|
||||
# Main LALR-parser loop
|
||||
for i, token in enumerate(stream):
|
||||
while True:
|
||||
action, arg = get_action(token.type)
|
||||
assert arg != self.end_state
|
||||
|
||||
if action is Shift:
|
||||
state_stack.append(arg)
|
||||
value_stack.append(token)
|
||||
if set_state: set_state(arg)
|
||||
break # next token
|
||||
else:
|
||||
reduce(arg)
|
||||
|
||||
while True:
|
||||
_action, arg = get_action('$END')
|
||||
if _action is Shift:
|
||||
assert arg == self.end_state
|
||||
val ,= value_stack
|
||||
return val
|
||||
else:
|
||||
reduce(arg)
|
||||
|
||||
###}
|
||||
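A hedged usage sketch: the shift/reduce loop above is what runs when a Lark instance is built with parser='lalr' (the grammar text is illustrative).

from lark import Lark

calc = Lark('''
    start: NUMBER ("+" NUMBER)*
    %import common.NUMBER
    %import common.WS
    %ignore WS
''', parser='lalr')

print(calc.parse('1 + 2 + 3').pretty())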
109
python/extractor/lark/parsers/resolve_ambig.py
Normal file
@@ -0,0 +1,109 @@
|
||||
from ..utils import compare
|
||||
from functools import cmp_to_key
|
||||
|
||||
from ..tree import Tree
|
||||
|
||||
|
||||
# Standard ambiguity resolver (uses comparison)
|
||||
#
|
||||
# Author: Erez Sh
|
||||
|
||||
def _compare_rules(rule1, rule2):
|
||||
return -compare( len(rule1.expansion), len(rule2.expansion))
|
||||
|
||||
def _sum_priority(tree):
|
||||
p = 0
|
||||
|
||||
for n in tree.iter_subtrees():
|
||||
try:
|
||||
p += n.meta.rule.options.priority or 0
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
return p
|
||||
|
||||
def _compare_priority(tree1, tree2):
|
||||
tree1.iter_subtrees()
|
||||
|
||||
def _compare_drv(tree1, tree2):
|
||||
try:
|
||||
rule1 = tree1.meta.rule
|
||||
except AttributeError:
|
||||
rule1 = None
|
||||
|
||||
try:
|
||||
rule2 = tree2.meta.rule
|
||||
except AttributeError:
|
||||
rule2 = None
|
||||
|
||||
if None == rule1 == rule2:
|
||||
return compare(tree1, tree2)
|
||||
elif rule1 is None:
|
||||
return -1
|
||||
elif rule2 is None:
|
||||
return 1
|
||||
|
||||
assert tree1.data != '_ambig'
|
||||
assert tree2.data != '_ambig'
|
||||
|
||||
p1 = _sum_priority(tree1)
|
||||
p2 = _sum_priority(tree2)
|
||||
c = (p1 or p2) and compare(p1, p2)
|
||||
if c:
|
||||
return c
|
||||
|
||||
c = _compare_rules(tree1.meta.rule, tree2.meta.rule)
|
||||
if c:
|
||||
return c
|
||||
|
||||
# rules are "equal", so compare trees
|
||||
if len(tree1.children) == len(tree2.children):
|
||||
for t1, t2 in zip(tree1.children, tree2.children):
|
||||
c = _compare_drv(t1, t2)
|
||||
if c:
|
||||
return c
|
||||
|
||||
return compare(len(tree1.children), len(tree2.children))
|
||||
|
||||
|
||||
def _standard_resolve_ambig(tree):
|
||||
assert tree.data == '_ambig'
|
||||
key_f = cmp_to_key(_compare_drv)
|
||||
best = max(tree.children, key=key_f)
|
||||
assert best.data == 'drv'
|
||||
tree.set('drv', best.children)
|
||||
tree.meta.rule = best.meta.rule # needed for applying callbacks
|
||||
|
||||
def standard_resolve_ambig(tree):
|
||||
for ambig in tree.find_data('_ambig'):
|
||||
_standard_resolve_ambig(ambig)
|
||||
|
||||
return tree
|
||||
|
||||
|
||||
|
||||
|
||||
# Anti-score Sum
|
||||
#
|
||||
# Author: Uriva (https://github.com/uriva)
|
||||
|
||||
def _antiscore_sum_drv(tree):
|
||||
if not isinstance(tree, Tree):
|
||||
return 0
|
||||
|
||||
assert tree.data != '_ambig'
|
||||
|
||||
return _sum_priority(tree)
|
||||
|
||||
def _antiscore_sum_resolve_ambig(tree):
|
||||
assert tree.data == '_ambig'
|
||||
best = min(tree.children, key=_antiscore_sum_drv)
|
||||
assert best.data == 'drv'
|
||||
tree.set('drv', best.children)
|
||||
tree.meta.rule = best.meta.rule # needed for applying callbacks
|
||||
|
||||
def antiscore_sum_resolve_ambig(tree):
|
||||
for ambig in tree.find_data('_ambig'):
|
||||
_antiscore_sum_resolve_ambig(ambig)
|
||||
|
||||
return tree
|
||||
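A hedged sketch of how these resolvers are selected: ambiguity='resolve' (the default) runs standard_resolve_ambig above, while ambiguity='explicit' skips resolution and leaves '_ambig' nodes in the returned tree. The grammar is illustrative and deliberately ambiguous.

from lark import Lark

amb = Lark('''
    start: word+
    word: "a"+
''', parser='earley', ambiguity='explicit')

tree = amb.parse('aaa')
print(tree.pretty())   # should contain an _ambig node listing the possible word splits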
156
python/extractor/lark/parsers/xearley.py
Normal file
@@ -0,0 +1,156 @@
|
||||
"This module implements an experimental Earley Parser with a dynamic lexer"
|
||||
|
||||
# The parser uses a parse-forest to keep track of derivations and ambiguities.
|
||||
# When the parse ends successfully, a disambiguation stage resolves all ambiguity
|
||||
# (right now ambiguity resolution is not developed beyond the needs of lark)
|
||||
# Afterwards the parse tree is reduced (transformed) according to user callbacks.
|
||||
# I use the no-recursion version of Transformer and Visitor, because the tree might be
|
||||
# deeper than Python's recursion limit (a bit absurd, but that's life)
|
||||
#
|
||||
# The algorithm keeps track of each state set, using a corresponding Column instance.
|
||||
# Column keeps track of new items using NewsList instances.
|
||||
#
|
||||
# Instead of running a lexer beforehand, or using a costly char-by-char method, this parser
|
||||
# uses regular expressions by necessity, achieving high performance while maintaining all of
|
||||
# Earley's power in parsing any CFG.
|
||||
#
|
||||
#
|
||||
# Author: Erez Shinan (2017)
|
||||
# Email : erezshin@gmail.com
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
from ..exceptions import ParseError, UnexpectedCharacters
|
||||
from ..lexer import Token
|
||||
from ..tree import Tree
|
||||
from .grammar_analysis import GrammarAnalyzer
|
||||
from ..grammar import NonTerminal, Terminal
|
||||
|
||||
from .earley import ApplyCallbacks, Item, Column
|
||||
|
||||
|
||||
class Parser:
|
||||
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False, complete_lex=False):
|
||||
self.analysis = GrammarAnalyzer(parser_conf)
|
||||
self.parser_conf = parser_conf
|
||||
self.resolve_ambiguity = resolve_ambiguity
|
||||
self.ignore = [Terminal(t) for t in ignore]
|
||||
self.predict_all = predict_all
|
||||
self.complete_lex = complete_lex
|
||||
|
||||
self.FIRST = self.analysis.FIRST
|
||||
self.postprocess = {}
|
||||
self.predictions = {}
|
||||
for rule in parser_conf.rules:
|
||||
self.postprocess[rule] = getattr(parser_conf.callback, rule.alias)
|
||||
self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
|
||||
|
||||
self.term_matcher = term_matcher
|
||||
|
||||
|
||||
def parse(self, stream, start_symbol=None):
|
||||
# Define parser functions
|
||||
start_symbol = NonTerminal(start_symbol or self.parser_conf.start)
|
||||
delayed_matches = defaultdict(list)
|
||||
match = self.term_matcher
|
||||
|
||||
text_line = 1
|
||||
text_column = 1
|
||||
|
||||
def predict(nonterm, column):
|
||||
assert not nonterm.is_term, nonterm
|
||||
return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
|
||||
|
||||
def complete(item):
|
||||
name = item.rule.origin
|
||||
return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name]
|
||||
|
||||
def predict_and_complete(column):
|
||||
while True:
|
||||
to_predict = {x.expect for x in column.to_predict.get_news()
|
||||
if x.ptr} # if not part of an already predicted batch
|
||||
to_reduce = column.to_reduce.get_news()
|
||||
if not (to_predict or to_reduce):
|
||||
break
|
||||
|
||||
for nonterm in to_predict:
|
||||
column.add( predict(nonterm, column) )
|
||||
for item in to_reduce:
|
||||
new_items = list(complete(item))
|
||||
if item in new_items:
|
||||
raise ParseError('Infinite recursion detected! (rule %s)' % item.rule)
|
||||
column.add(new_items)
|
||||
|
||||
def scan(i, column):
|
||||
to_scan = column.to_scan
|
||||
|
||||
for x in self.ignore:
|
||||
m = match(x, stream, i)
|
||||
if m:
|
||||
delayed_matches[m.end()] += set(to_scan)
|
||||
delayed_matches[m.end()] += set(column.to_reduce)
|
||||
|
||||
# TODO add partial matches for ignore too?
|
||||
# s = m.group(0)
|
||||
# for j in range(1, len(s)):
|
||||
# m = x.match(s[:-j])
|
||||
# if m:
|
||||
# delayed_matches[m.end()] += to_scan
|
||||
|
||||
for item in to_scan:
|
||||
m = match(item.expect, stream, i)
|
||||
if m:
|
||||
t = Token(item.expect.name, m.group(0), i, text_line, text_column)
|
||||
delayed_matches[m.end()].append(item.advance(t))
|
||||
|
||||
if self.complete_lex:
|
||||
s = m.group(0)
|
||||
for j in range(1, len(s)):
|
||||
m = match(item.expect, s[:-j])
|
||||
if m:
|
||||
t = Token(item.expect.name, m.group(0), i, text_line, text_column)
|
||||
delayed_matches[i+m.end()].append(item.advance(t))
|
||||
|
||||
next_set = Column(i+1, self.FIRST, predict_all=self.predict_all)
|
||||
next_set.add(delayed_matches[i+1])
|
||||
del delayed_matches[i+1] # No longer needed, so unburden memory
|
||||
|
||||
if not next_set and not delayed_matches:
|
||||
raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect for item in to_scan}, set(to_scan))
|
||||
|
||||
return next_set
|
||||
|
||||
# Main loop starts
|
||||
column0 = Column(0, self.FIRST, predict_all=self.predict_all)
|
||||
column0.add(predict(start_symbol, column0))
|
||||
|
||||
column = column0
|
||||
for i, token in enumerate(stream):
|
||||
predict_and_complete(column)
|
||||
column = scan(i, column)
|
||||
|
||||
if token == '\n':
|
||||
text_line += 1
|
||||
text_column = 1
|
||||
else:
|
||||
text_column += 1
|
||||
|
||||
predict_and_complete(column)
|
||||
|
||||
# Parse ended. Now build a parse tree
|
||||
solutions = [n.tree for n in column.to_reduce
|
||||
if n.rule.origin==start_symbol and n.start is column0]
|
||||
|
||||
if not solutions:
|
||||
expected_tokens = [t.expect for t in column.to_scan]
|
||||
raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens)
|
||||
|
||||
elif len(solutions) == 1:
|
||||
tree = solutions[0]
|
||||
else:
|
||||
tree = Tree('_ambig', solutions)
|
||||
|
||||
if self.resolve_ambiguity:
|
||||
tree = self.resolve_ambiguity(tree)
|
||||
|
||||
return ApplyCallbacks(self.postprocess).transform(tree)
|
||||
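A hedged usage sketch of the dynamic lexer implemented above: terminals are matched directly against the input text instead of a pre-tokenized stream. Grammar and input are illustrative.

from lark import Lark

names = Lark('''
    start: NAME "," NAME
    NAME: /[a-z]+/
    %ignore " "
''', parser='earley', lexer='dynamic')

print(names.parse('apple, banana').pretty())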
129
python/extractor/lark/reconstruct.py
Normal file
@@ -0,0 +1,129 @@
|
||||
from collections import defaultdict
|
||||
|
||||
from .tree import Tree
|
||||
from .visitors import Transformer_InPlace
|
||||
from .common import ParserConf, PatternStr
|
||||
from .lexer import Token
|
||||
from .parsers import earley, resolve_ambig
|
||||
from .grammar import Rule, Terminal, NonTerminal
|
||||
|
||||
|
||||
|
||||
def is_discarded_terminal(t):
|
||||
return t.is_term and t.filter_out
|
||||
|
||||
def is_iter_empty(i):
|
||||
try:
|
||||
_ = next(i)
|
||||
return False
|
||||
except StopIteration:
|
||||
return True
|
||||
|
||||
class WriteTokensTransformer(Transformer_InPlace):
|
||||
def __init__(self, tokens):
|
||||
self.tokens = tokens
|
||||
|
||||
def __default__(self, data, children, meta):
|
||||
# if not isinstance(t, MatchTree):
|
||||
# return t
|
||||
if not getattr(meta, 'match_tree', False):
|
||||
return Tree(data, children)
|
||||
|
||||
iter_args = iter(children)
|
||||
to_write = []
|
||||
for sym in meta.orig_expansion:
|
||||
if is_discarded_terminal(sym):
|
||||
t = self.tokens[sym.name]
|
||||
assert isinstance(t.pattern, PatternStr)
|
||||
to_write.append(t.pattern.value)
|
||||
else:
|
||||
x = next(iter_args)
|
||||
if isinstance(x, list):
|
||||
to_write += x
|
||||
else:
|
||||
if isinstance(x, Token):
|
||||
assert Terminal(x.type) == sym, x
|
||||
else:
|
||||
assert NonTerminal(x.data) == sym, (sym, x)
|
||||
to_write.append(x)
|
||||
|
||||
assert is_iter_empty(iter_args)
|
||||
return to_write
|
||||
|
||||
|
||||
class MatchTree(Tree):
|
||||
pass
|
||||
|
||||
class MakeMatchTree:
|
||||
def __init__(self, name, expansion):
|
||||
self.name = name
|
||||
self.expansion = expansion
|
||||
|
||||
def __call__(self, args):
|
||||
t = MatchTree(self.name, args)
|
||||
t.meta.match_tree = True
|
||||
t.meta.orig_expansion = self.expansion
|
||||
return t
|
||||
|
||||
class Reconstructor:
|
||||
def __init__(self, parser):
|
||||
# XXX TODO calling compile twice returns different results!
|
||||
tokens, rules, _grammar_extra = parser.grammar.compile()
|
||||
|
||||
self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens})
|
||||
self.rules = list(self._build_recons_rules(rules))
|
||||
|
||||
def _build_recons_rules(self, rules):
|
||||
expand1s = {r.origin for r in rules if r.options and r.options.expand1}
|
||||
|
||||
aliases = defaultdict(list)
|
||||
for r in rules:
|
||||
if r.alias:
|
||||
aliases[r.origin].append( r.alias )
|
||||
|
||||
rule_names = {r.origin for r in rules}
|
||||
nonterminals = {sym for sym in rule_names
|
||||
if sym.name.startswith('_') or sym in expand1s or sym in aliases }
|
||||
|
||||
for r in rules:
|
||||
recons_exp = [sym if sym in nonterminals else Terminal(sym.name)
|
||||
for sym in r.expansion if not is_discarded_terminal(sym)]
|
||||
|
||||
# Skip self-recursive constructs
|
||||
if recons_exp == [r.origin]:
|
||||
continue
|
||||
|
||||
sym = NonTerminal(r.alias) if r.alias else r.origin
|
||||
|
||||
yield Rule(sym, recons_exp, MakeMatchTree(sym.name, r.expansion))
|
||||
|
||||
for origin, rule_aliases in aliases.items():
|
||||
for alias in rule_aliases:
|
||||
yield Rule(origin, [Terminal(alias)], MakeMatchTree(origin.name, [NonTerminal(alias)]))
|
||||
|
||||
yield Rule(origin, [Terminal(origin.name)], MakeMatchTree(origin.name, [origin]))
|
||||
|
||||
|
||||
|
||||
def _match(self, term, token):
|
||||
if isinstance(token, Tree):
|
||||
return Terminal(token.data) == term
|
||||
elif isinstance(token, Token):
|
||||
return term == Terminal(token.type)
|
||||
assert False
|
||||
|
||||
def _reconstruct(self, tree):
|
||||
# TODO: ambiguity?
|
||||
parser = earley.Parser(ParserConf(self.rules, None, tree.data), self._match, resolve_ambiguity=resolve_ambig.standard_resolve_ambig)
|
||||
unreduced_tree = parser.parse(tree.children) # find a full derivation
|
||||
assert unreduced_tree.data == tree.data
|
||||
res = self.write_tokens.transform(unreduced_tree)
|
||||
for item in res:
|
||||
if isinstance(item, Tree):
|
||||
for x in self._reconstruct(item):
|
||||
yield x
|
||||
else:
|
||||
yield item
|
||||
|
||||
def reconstruct(self, tree):
|
||||
return ''.join(self._reconstruct(tree))
|
||||
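A minimal usage sketch of the Reconstructor above (illustrative only; the grammar and expected behaviour are assumptions, not part of this diff). Tokens the parser filtered out are written back from their string patterns; ignored whitespace is not reproduced, so the output may differ cosmetically from the input.

from lark import Lark
from lark.reconstruct import Reconstructor

# Hypothetical grammar, chosen only to exercise the API.
parser = Lark(r'''
    start: "hello" NAME
    NAME: /\w+/
    %ignore " "
''')

tree = parser.parse("hello world")
text = Reconstructor(parser).reconstruct(tree)   # should parse back to an equal tree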
0
python/extractor/lark/tools/__init__.py
Normal file
186
python/extractor/lark/tools/nearley.py
Normal file
@@ -0,0 +1,186 @@
"Converts between Lark and Nearley grammars. Work in progress!"

import os.path
import sys
import codecs


from lark import Lark, InlineTransformer

nearley_grammar = r"""
    start: (ruledef|directive)+

    directive: "@" NAME (STRING|NAME)
             | "@" JS  -> js_code
    ruledef: NAME "->" expansions
           | NAME REGEXP "->" expansions -> macro
    expansions: expansion ("|" expansion)*

    expansion: expr+ js

    ?expr: item [":" /[+*?]/]

    ?item: rule|string|regexp
         | "(" expansions ")"

    rule: NAME
    string: STRING
    regexp: REGEXP
    JS: /{%.*?%}/s
    js: JS?

    NAME: /[a-zA-Z_$]\w*/
    COMMENT: /#[^\n]*/
    REGEXP: /\[.*?\]/
    STRING: /".*?"/

    %import common.WS
    %ignore WS
    %ignore COMMENT

    """

nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='standard')

def _get_rulename(name):
    name = {'_': '_ws_maybe', '__':'_ws'}.get(name, name)
    return 'n_' + name.replace('$', '__DOLLAR__').lower()

class NearleyToLark(InlineTransformer):
    def __init__(self):
        self._count = 0
        self.extra_rules = {}
        self.extra_rules_rev = {}
        self.alias_js_code = {}

    def _new_function(self, code):
        name = 'alias_%d' % self._count
        self._count += 1

        self.alias_js_code[name] = code
        return name

    def _extra_rule(self, rule):
        if rule in self.extra_rules_rev:
            return self.extra_rules_rev[rule]

        name = 'xrule_%d' % len(self.extra_rules)
        assert name not in self.extra_rules
        self.extra_rules[name] = rule
        self.extra_rules_rev[rule] = name
        return name

    def rule(self, name):
        return _get_rulename(name)

    def ruledef(self, name, exps):
        return '!%s: %s' % (_get_rulename(name), exps)

    def expr(self, item, op):
        rule = '(%s)%s' % (item, op)
        return self._extra_rule(rule)

    def regexp(self, r):
        return '/%s/' % r

    def string(self, s):
        return self._extra_rule(s)

    def expansion(self, *x):
        x, js = x[:-1], x[-1]
        if js.children:
            js_code ,= js.children
            js_code = js_code[2:-2]
            alias = '-> ' + self._new_function(js_code)
        else:
            alias = ''
        return ' '.join(x) + alias

    def expansions(self, *x):
        return '%s' % ('\n |'.join(x))

    def start(self, *rules):
        return '\n'.join(filter(None, rules))

def _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, includes):
    rule_defs = []

    tree = nearley_grammar_parser.parse(g)
    for statement in tree.children:
        if statement.data == 'directive':
            directive, arg = statement.children
            if directive in ('builtin', 'include'):
                folder = builtin_path if directive == 'builtin' else folder_path
                path = os.path.join(folder, arg[1:-1])
                if path not in includes:
                    includes.add(path)
                    with codecs.open(path, encoding='utf8') as f:
                        text = f.read()
                    rule_defs += _nearley_to_lark(text, builtin_path, n2l, js_code, os.path.abspath(os.path.dirname(path)), includes)
            else:
                assert False, directive
        elif statement.data == 'js_code':
            code ,= statement.children
            code = code[2:-2]
            js_code.append(code)
        elif statement.data == 'macro':
            pass    # TODO Add support for macros!
        elif statement.data == 'ruledef':
            rule_defs.append( n2l.transform(statement) )
        else:
            raise Exception("Unknown statement: %s" % statement)

    return rule_defs


def create_code_for_nearley_grammar(g, start, builtin_path, folder_path):
    import js2py

    emit_code = []
    def emit(x=None):
        if x:
            emit_code.append(x)
        emit_code.append('\n')

    js_code = ['function id(x) {return x[0];}']
    n2l = NearleyToLark()
    rule_defs = _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, set())
    lark_g = '\n'.join(rule_defs)
    lark_g += '\n'+'\n'.join('!%s: %s' % item for item in n2l.extra_rules.items())

    emit('from lark import Lark, Transformer')
    emit()
    emit('grammar = ' + repr(lark_g))
    emit()

    for alias, code in n2l.alias_js_code.items():
        js_code.append('%s = (%s);' % (alias, code))

    emit(js2py.translate_js('\n'.join(js_code)))
    emit('class TransformNearley(Transformer):')
    for alias in n2l.alias_js_code:
        emit("    %s = var.get('%s').to_python()" % (alias, alias))
    emit("    __default__ = lambda self, n, c, m: c if c else None")

    emit()
    emit('parser = Lark(grammar, start="n_%s")' % start)
    emit('def parse(text):')
    emit('    return TransformNearley().transform(parser.parse(text))')

    return ''.join(emit_code)

def main(fn, start, nearley_lib):
    with codecs.open(fn, encoding='utf8') as f:
        grammar = f.read()
    return create_code_for_nearley_grammar(grammar, start, os.path.join(nearley_lib, 'builtin'), os.path.abspath(os.path.dirname(fn)))


if __name__ == '__main__':
    if len(sys.argv) < 4:
        print("Reads Nearley grammar (with js functions) outputs an equivalent lark parser.")
        print("Usage: %s <nearley_grammar_path> <start_rule> <nearley_lib_path>" % sys.argv[0])
        sys.exit(1)

    fn, start, nearley_lib = sys.argv[1:]

    print(main(fn, start, nearley_lib))
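A hypothetical way to drive the converter above from Python rather than from the command line. The file names and the ./nearley checkout path are placeholders, and js2py must be installed for the generated code step to work.

from lark.tools.nearley import main

# 'grammar.ne' and './nearley' are placeholder paths to a Nearley grammar
# and a local checkout of the nearley library.
generated_source = main('grammar.ne', 'start', './nearley')
with open('generated_parser.py', 'w') as f:
    f.write(generated_source)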
7
python/extractor/lark/tools/standalone.py
Normal file
@@ -0,0 +1,7 @@
# This file used to contain the Lark standalone tool.
#
# We do not use it, and it is licensed under the GPL, which is much
# more restrictive than the rest of Lark. In order to avoid depending
# on it accidentally, we exclude it from our repository and distribution.
# When LARK is upgraded, this file should be kept in preference to the
# original.
162
python/extractor/lark/tree.py
Normal file
@@ -0,0 +1,162 @@
try:
    from future_builtins import filter
except ImportError:
    pass

from copy import deepcopy

class Meta:
    pass

###{standalone
class Tree(object):
    def __init__(self, data, children, meta=None):
        self.data = data
        self.children = children
        self._meta = meta

    @property
    def meta(self):
        if self._meta is None:
            self._meta = Meta()
        return self._meta

    def __repr__(self):
        return 'Tree(%s, %s)' % (self.data, self.children)

    def _pretty_label(self):
        return self.data

    def _pretty(self, level, indent_str):
        if len(self.children) == 1 and not isinstance(self.children[0], Tree):
            return [ indent_str*level, self._pretty_label(), '\t', '%s' % (self.children[0],), '\n']

        l = [ indent_str*level, self._pretty_label(), '\n' ]
        for n in self.children:
            if isinstance(n, Tree):
                l += n._pretty(level+1, indent_str)
            else:
                l += [ indent_str*(level+1), '%s' % (n,), '\n' ]

        return l

    def pretty(self, indent_str='  '):
        return ''.join(self._pretty(0, indent_str))
###}

    def expand_kids_by_index(self, *indices):
        "Expand (inline) children at the given indices"
        for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices
            kid = self.children[i]
            self.children[i:i+1] = kid.children

    def __eq__(self, other):
        try:
            return self.data == other.data and self.children == other.children
        except AttributeError:
            return False

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        return hash((self.data, tuple(self.children)))

    def find_pred(self, pred):
        "Find all nodes where pred(tree) == True"
        return filter(pred, self.iter_subtrees())

    def find_data(self, data):
        "Find all nodes where tree.data == data"
        return self.find_pred(lambda t: t.data == data)

    def scan_values(self, pred):
        for c in self.children:
            if isinstance(c, Tree):
                for t in c.scan_values(pred):
                    yield t
            else:
                if pred(c):
                    yield c

    def iter_subtrees(self):
        # TODO: Re-write as a more efficient version

        visited = set()
        q = [self]

        l = []
        while q:
            subtree = q.pop()
            l.append( subtree )
            if id(subtree) in visited:
                continue    # already been here from another branch
            visited.add(id(subtree))
            q += [c for c in subtree.children if isinstance(c, Tree)]

        seen = set()
        for x in reversed(l):
            if id(x) not in seen:
                yield x
                seen.add(id(x))


    def __deepcopy__(self, memo):
        return type(self)(self.data, deepcopy(self.children, memo))

    def copy(self):
        return type(self)(self.data, self.children)

    def set(self, data, children):
        self.data = data
        self.children = children

    # XXX Deprecated! Here for backwards compatibility <0.6.0
    @property
    def line(self):
        return self.meta.line
    @property
    def column(self):
        return self.meta.column
    @property
    def end_line(self):
        return self.meta.end_line
    @property
    def end_column(self):
        return self.meta.end_column


class SlottedTree(Tree):
    __slots__ = 'data', 'children', 'rule', '_meta'


def pydot__tree_to_png(tree, filename):
    "Creates a colorful image that represents the tree (data+children, without meta)"

    import pydot
    graph = pydot.Dot(graph_type='digraph', rankdir="LR")

    i = [0]

    def new_leaf(leaf):
        node = pydot.Node(i[0], label=repr(leaf))
        i[0] += 1
        graph.add_node(node)
        return node

    def _to_pydot(subtree):
        color = hash(subtree.data) & 0xffffff
        color |= 0x808080

        subnodes = [_to_pydot(child) if isinstance(child, Tree) else new_leaf(child)
                    for child in subtree.children]
        node = pydot.Node(i[0], style="filled", fillcolor="#%x"%color, label=subtree.data)
        i[0] += 1
        graph.add_node(node)

        for subnode in subnodes:
            graph.add_edge(pydot.Edge(node, subnode))

        return node

    _to_pydot(tree)
    graph.write_png(filename)
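A small, self-contained sketch of how the Tree API above is typically used (the tree shape here is made up for illustration):

from lark.tree import Tree

t = Tree('start', [Tree('item', ['a']), Tree('item', ['b'])])
print(t.pretty())                              # indented, human-readable rendering
items = list(t.find_data('item'))              # the two 'item' subtrees, bottom-up
leaves = list(t.scan_values(lambda v: True))   # ['a', 'b']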
127
python/extractor/lark/utils.py
Normal file
@@ -0,0 +1,127 @@
from collections import deque
import sys

class fzset(frozenset):
    def __repr__(self):
        return '{%s}' % ', '.join(map(repr, self))


def classify_bool(seq, pred):
    true_elems = []
    false_elems = []

    for elem in seq:
        if pred(elem):
            true_elems.append(elem)
        else:
            false_elems.append(elem)

    return true_elems, false_elems

def classify(seq, key=None, value=None):
    d = {}
    for item in seq:
        k = key(item) if (key is not None) else item
        v = value(item) if (value is not None) else item
        if k in d:
            d[k].append(v)
        else:
            d[k] = [v]
    return d

def bfs(initial, expand):
    open_q = deque(list(initial))
    visited = set(open_q)
    while open_q:
        node = open_q.popleft()
        yield node
        for next_node in expand(node):
            if next_node not in visited:
                visited.add(next_node)
                open_q.append(next_node)



try:
    STRING_TYPE = basestring
except NameError:   # Python 3
    STRING_TYPE = str

###{standalone

import types
from functools import wraps, partial
from contextlib import contextmanager

Str = type(u'')

def smart_decorator(f, create_decorator):
    if isinstance(f, types.FunctionType):
        return wraps(f)(create_decorator(f, True))

    elif isinstance(f, (type, types.BuiltinFunctionType)):
        return wraps(f)(create_decorator(f, False))

    elif isinstance(f, types.MethodType):
        return wraps(f)(create_decorator(f.__func__, True))

    elif isinstance(f, partial):
        # wraps does not work for partials in 2.7: https://bugs.python.org/issue3445
        return create_decorator(f.__func__, True)

    else:
        return create_decorator(f.__func__.__call__, True)



try:
    from contextlib import suppress     # Python 3
except ImportError:
    @contextmanager
    def suppress(*excs):
        '''Catch and dismiss the provided exception

        >>> x = 'hello'
        >>> with suppress(IndexError):
        ...     x = x[10]
        >>> x
        'hello'
        '''
        try:
            yield
        except excs:
            pass

###}


try:
    compare = cmp
except NameError:
    def compare(a, b):
        if a == b:
            return 0
        elif a > b:
            return 1
        return -1


def get_regexp_width(regexp):
    # in 3.11 sre_parse was replaced with re._parser
    # see implementation in https://github.com/python/cpython/blob/3.11/Lib/sre_parse.py
    if sys.version_info >= (3, 11):
        import re
        try:
            return re._parser.parse(regexp).getwidth()
        except re.error:
            raise ValueError(regexp)
    else:
        import sre_constants
        import sre_parse
        try:
            return sre_parse.parse(regexp).getwidth()
        except sre_constants.error:
            raise ValueError(regexp)
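A few illustrative calls into the helpers above (inputs are made up; the expected results follow from the definitions in this file):

from lark.utils import classify, bfs, get_regexp_width

groups = classify(['ant', 'ape', 'bat'], key=lambda w: w[0])
# {'a': ['ant', 'ape'], 'b': ['bat']}

graph = {1: [2, 3], 2: [4], 3: [], 4: []}
order = list(bfs([1], lambda n: graph[n]))   # [1, 2, 3, 4]

width = get_regexp_width(r'ab\d')            # (3, 3): minimum and maximum match length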
250
python/extractor/lark/visitors.py
Normal file
@@ -0,0 +1,250 @@
from inspect import getmembers, getmro
from functools import wraps

from .utils import smart_decorator
from .tree import Tree

class Discard(Exception):
    pass


# Transformers

class Transformer:
    """Visits the tree recursively, starting with the leaves and finally the root (bottom-up)

    Calls its methods (provided by user via inheritance) according to tree.data
    The returned value replaces the old one in the structure.

    Can be used to implement map or reduce.
    """

    def _call_userfunc(self, tree, new_children=None):
        # Assumes tree is already transformed
        children = new_children if new_children is not None else tree.children
        try:
            f = getattr(self, tree.data)
        except AttributeError:
            return self.__default__(tree.data, children, tree.meta)
        else:
            if getattr(f, 'meta', False):
                return f(children, tree.meta)
            elif getattr(f, 'inline', False):
                return f(*children)
            elif getattr(f, 'whole_tree', False):
                if new_children is not None:
                    raise NotImplementedError("Doesn't work with the base Transformer class")
                return f(tree)
            else:
                return f(children)

    def _transform_children(self, children):
        for c in children:
            try:
                yield self._transform_tree(c) if isinstance(c, Tree) else c
            except Discard:
                pass

    def _transform_tree(self, tree):
        children = list(self._transform_children(tree.children))
        return self._call_userfunc(tree, children)

    def transform(self, tree):
        return self._transform_tree(tree)

    def __mul__(self, other):
        return TransformerChain(self, other)

    def __default__(self, data, children, meta):
        "Default operation on tree (for override)"
        return Tree(data, children, meta)

    @classmethod
    def _apply_decorator(cls, decorator, **kwargs):
        mro = getmro(cls)
        assert mro[0] is cls
        libmembers = {name for _cls in mro[1:] for name, _ in getmembers(_cls)}
        for name, value in getmembers(cls):
            if name.startswith('_') or name in libmembers:
                continue

            setattr(cls, name, decorator(value, **kwargs))
        return cls


class InlineTransformer(Transformer):   # XXX Deprecated
    def _call_userfunc(self, tree, new_children=None):
        # Assumes tree is already transformed
        children = new_children if new_children is not None else tree.children
        try:
            f = getattr(self, tree.data)
        except AttributeError:
            return self.__default__(tree.data, children, tree.meta)
        else:
            return f(*children)


class TransformerChain(object):
    def __init__(self, *transformers):
        self.transformers = transformers

    def transform(self, tree):
        for t in self.transformers:
            tree = t.transform(tree)
        return tree

    def __mul__(self, other):
        return TransformerChain(*self.transformers + (other,))


class Transformer_InPlace(Transformer):
    "Non-recursive. Changes the tree in-place instead of returning new instances"
    def _transform_tree(self, tree):           # Cancel recursion
        return self._call_userfunc(tree)

    def transform(self, tree):
        for subtree in tree.iter_subtrees():
            subtree.children = list(self._transform_children(subtree.children))

        return self._transform_tree(tree)


class Transformer_InPlaceRecursive(Transformer):
    "Recursive. Changes the tree in-place instead of returning new instances"
    def _transform_tree(self, tree):
        tree.children = list(self._transform_children(tree.children))
        return self._call_userfunc(tree)



# Visitors

class VisitorBase:
    def _call_userfunc(self, tree):
        return getattr(self, tree.data, self.__default__)(tree)

    def __default__(self, tree):
        "Default operation on tree (for override)"
        return tree


class Visitor(VisitorBase):
    """Bottom-up visitor, non-recursive

    Visits the tree, starting with the leaves and finally the root (bottom-up)
    Calls its methods (provided by user via inheritance) according to tree.data
    """


    def visit(self, tree):
        for subtree in tree.iter_subtrees():
            self._call_userfunc(subtree)
        return tree

class Visitor_Recursive(VisitorBase):
    """Bottom-up visitor, recursive

    Visits the tree, starting with the leaves and finally the root (bottom-up)
    Calls its methods (provided by user via inheritance) according to tree.data
    """

    def visit(self, tree):
        for child in tree.children:
            if isinstance(child, Tree):
                self.visit(child)

        f = getattr(self, tree.data, self.__default__)
        f(tree)
        return tree



def visit_children_decor(func):
    "See Interpreter"
    @wraps(func)
    def inner(cls, tree):
        values = cls.visit_children(tree)
        return func(cls, values)
    return inner


class Interpreter:
    """Top-down visitor, recursive

    Visits the tree, starting with the root and finally the leaves (top-down)
    Calls its methods (provided by user via inheritance) according to tree.data

    Unlike Transformer and Visitor, the Interpreter doesn't automatically visit its sub-branches.
    The user has to explicitly call visit_children, or use the @visit_children_decor
    """
    def visit(self, tree):
        return getattr(self, tree.data)(tree)

    def visit_children(self, tree):
        return [self.visit(child) if isinstance(child, Tree) else child
                for child in tree.children]

    def __getattr__(self, name):
        return self.__default__

    def __default__(self, tree):
        return self.visit_children(tree)



# Decorators

def _apply_decorator(obj, decorator, **kwargs):
    try:
        _apply = obj._apply_decorator
    except AttributeError:
        return decorator(obj, **kwargs)
    else:
        return _apply(decorator, **kwargs)



def _inline_args__func(func):
    @wraps(func)
    def create_decorator(_f, with_self):
        if with_self:
            def f(self, children):
                return _f(self, *children)
        else:
            def f(self, children):
                return _f(*children)
        return f

    return smart_decorator(func, create_decorator)


def inline_args(obj):   # XXX Deprecated
    return _apply_decorator(obj, _inline_args__func)



def _visitor_args_func_dec(func, inline=False, meta=False, whole_tree=False):
    assert [whole_tree, meta, inline].count(True) <= 1
    def create_decorator(_f, with_self):
        if with_self:
            def f(self, *args, **kwargs):
                return _f(self, *args, **kwargs)
        else:
            def f(self, *args, **kwargs):
                return _f(*args, **kwargs)
        return f

    f = smart_decorator(func, create_decorator)
    f.inline = inline
    f.meta = meta
    f.whole_tree = whole_tree
    return f

def v_args(inline=False, meta=False, tree=False):
    "A convenience decorator factory, for modifying the behavior of user-supplied visitor methods"
    if [tree, meta, inline].count(True) > 1:
        raise ValueError("Visitor functions can either accept tree, or meta, or be inlined. These cannot be combined.")
    def _visitor_args_dec(obj):
        return _apply_decorator(obj, _visitor_args_func_dec, inline=inline, meta=meta, whole_tree=tree)
    return _visitor_args_dec
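A rough sketch of the Transformer and v_args machinery above in use (the grammar is invented for illustration; behaviour assumes the default Earley parser and standard lexer):

from lark import Lark, Transformer, v_args

parser = Lark(r'''
    start: NUMBER "+" NUMBER
    NUMBER: /\d+/
    %ignore " "
''')

@v_args(inline=True)              # children are passed as individual arguments
class SumTransformer(Transformer):
    def start(self, left, right):
        return int(left) + int(right)

print(SumTransformer().transform(parser.parse("2 + 3")))   # 5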