# codeql/python/extractor/tokenizer_generator/gen_state_machine.py
'''
Generate a state-machine based tokenizer from a state transition description and a template.

Parses the state transition description to compute a set of transition tables.
Each table maps (state, character-class) pairs to (state, action) pairs.
During tokenization each input character is converted to a class, then a new state and action
are looked up using the current state and character-class.

The generated tables are:

CLASS_TABLE:
    Maps ASCII code points to character classes.
ID_TABLE:
    Maps all Unicode code points to one of Identifier, Identifier-continuation, or other.
The transition tables:
    Each table maps each state to a per-class transition table.
    Each per-class transition table maps each character-class to an index in the action table.
ACTION_TABLE:
    Embedded in code as `action_table`; maps each index to a (state, action) pair.

Since the number of character-classes, states and (state, action) pairs is small, everything is
represented as a byte, and the tables as `bytes` objects for Python 3 or `array.array` objects
for Python 2.
'''
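
# Rough sketch of how the generated tokenizer consumes these tables at run time.
# The variable names below are illustrative only; the real ones live in the template:
#
#     cls = CLASS_TABLE[code_point]             # non-ASCII goes through ID_INDEX/ID_CHUNKS
#     row = current_super_state_table[state]    # one tuple of rows per super-state
#     state, action = action_table[row[cls]]    # action is a bound method or None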
from .parser import parse
from . import machine
from .compiled import StateActionListPair, IdentifierTable
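
# Emit the identifier table as a two-level lookup: ID_INDEX selects a chunk for a block
# of code points and ID_CHUNKS holds the per-chunk class bytes.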
def emit_id_bytes(id_table):
    chunks, index = id_table.as_two_level_table()
    print("# %d entries in ID index" % len(index))
    index_bytes = bytes(index)
    print("ID_INDEX = toarray(")
    for n in range(0, len(index_bytes), 32):
        print(" %r" % index_bytes[n:n+32])
    print(")")
    print("ID_CHUNKS = (")
    for chunk in chunks:
        print(" toarray(%r)," % chunk)
    print(")")
def emit_transition_table(table, verbose=False):
    print("%s = (" % table.name.upper(), end="")
    for trans in table.as_list_of_transitions():
        print("B%02d," % trans.id, end=" ")
    print(")")
emitted_rows = set()
def emit_rows(table):
    for trans in table.as_list_of_transitions():
        id = trans.id
        if id in emitted_rows:
            continue
        emitted_rows.add(id)
        print("B%02d = toarray(%r)" % (id, trans.as_bytes()))
action_names = {}
next_action_id = 0
def get_action_id(action):
    global next_action_id
    assert action is not None
    if action in action_names:
        return action_names[action]
    result = next_action_id
    next_action_id += 1
    action_names[action] = result
    return result
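
# Emit one action_<n> method for every distinct action list referenced by the
# (state, action-list) pairs.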
def emit_actions(table, indent=""):
    for pair in table:
        if pair.actionlist is None:
            continue
        action = pair.actionlist
        get_action_function(action, indent)
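
# Build the source text of `action_table`, mapping each index to a
# (state-id, bound action method or None) pair, four entries per line.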
def generate_action_table(table, indent):
    result = []
    result.append(indent + "action_table = [\n " + indent)
    for i, pair in enumerate(table):
        if pair.actionlist is None:
            result.append("(%d, None), " % pair.state.id)
        else:
            result.append("(%d, self.action_%s), " % (pair.state.id, pair.actionlist.id))
        if (i & 3) == 3:
            result.append("\n " + indent)
    result.append("\n" + indent + "]")
    return "".join(result)
action_functions = set()
def get_action_function(actionlist, indent=""):
    if actionlist in action_functions:
        return
    action_functions.add(actionlist)
    last = actionlist.actions[-1]
    print(indent + "def action_%d(self):" % actionlist.id)
    emit = False
    for action in actionlist.actions:
        if action is machine.PUSHBACK:
            print(indent + " self.index -= 1")
            continue
        elif action is machine.POP:
            print(indent + " self.super_state = self.state_stack.pop()")
        elif isinstance(action, machine.Push):
            print(indent + " self.state_stack.append(self.super_state)")
            print(indent + " self.super_state = %s" % action.state.name.upper())
        elif action is machine.MARK:
            print(indent + " self.token_start_index = self.index")
            print(indent + " self.token_start = self.line, self.index-self.line_start_index")
        elif isinstance(action, machine.Emit):
            emit = True
            print(indent + " end = self.line, self.index-self.line_start_index+1")
            if action.text is None:
                print(indent + " result = [%s, self.text[self.token_start_index:self.index+1], self.token_start, end]" % action.kind)
            else:
                print(indent + " result = [%s, u%s, (self.line, self.index-self.line_start_index), end]" % (action.kind, action.text))
            print(indent + " self.token_start = end")
            print(indent + " self.token_start_index = self.index+1")
        elif action is machine.NEWLINE:
            print(indent + " self.line_start_index = self.index+1")
            print(indent + " self.line += 1")
        elif action is machine.EMIT_INDENT:
            assert action is last
            print(indent + " return self.emit_indent()")
            print()
            return
        else:
            assert False, "Unexpected action: %s" % action
    print(indent + " self.index += 1")
    if emit:
        print(indent + " return result")
    else:
        print(indent + " return None")
    print()
    return
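
# Emit CLASS_TABLE, mapping the 128 ASCII code points to character-class ids
# (the IDENTIFIER and IDENTIFIER_CONTINUE sentinel keys are skipped since they
# are not single characters).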
def emit_char_classes(char_classes, verbose=False):
    for cls in sorted(set(char_classes.values()), key=lambda x : x.id):
        print("#%d = %r" % (cls.id, cls))
    table = [None] * 128
    by_id = {
        machine.IDENTIFIER_CLASS.id : machine.IDENTIFIER_CLASS,
        machine.IDENTIFIER_CONTINUE_CLASS.id : machine.IDENTIFIER_CONTINUE_CLASS,
        machine.ERROR_CLASS.id : machine.ERROR_CLASS
    }
    for c, cls in char_classes.items():
        by_id[cls.id] = cls
        if c is machine.IDENTIFIER or c is machine.IDENTIFIER_CONTINUE:
            continue
        table[ord(c)] = cls.id
        by_id[cls.id] = cls
    for i in range(128):
        assert table[i] is not None
    bytes_table = bytes(table)
    if verbose:
        print("# Class Table")
        for i in range(len(bytes_table)):
            b = bytes_table[i]
            print("# %r -> %s" % (chr(i), by_id[b]))
    print("CLASS_TABLE = toarray(%r)" % bytes_table)
PREFACE = """
import codecs
import re
import sys
from blib2to3.pgen2.token import *
if sys.version < '3':
from array import array
def toarray(b):
return array('B', b)
else:
def toarray(b):
return b
"""
def main():
    verbose = False
    import sys
    if len(sys.argv) != 3:
        print("Usage %s DESCRIPTION TEMPLATE" % sys.argv[0])
        sys.exit(1)
    descriptionfile = sys.argv[1]
    with open(descriptionfile) as fd:
        m = machine.Machine.load(fd.read())
    templatefile = sys.argv[2]
    with open(templatefile) as fd:
        template = fd.read()
    print("# This file is AUTO-GENERATED. DO NOT MODIFY")
    print('# To regenerate: run "python3 -m tokenizer_generator.gen_state_machine %s %s"' % (descriptionfile, templatefile))
    print(PREFACE)
    print("IDENTIFIER_CLASS = %d" % machine.IDENTIFIER_CLASS.id)
    print("IDENTIFIER_CONTINUE_CLASS = %d" % machine.IDENTIFIER_CONTINUE_CLASS.id)
    print("ERROR_CLASS = %d" % machine.ERROR_CLASS.id)
    emit_id_bytes(IdentifierTable())
    char_classes = m.get_classes()
    emit_char_classes(char_classes, verbose)
    print()
    tables = [state.compile(char_classes) for state in m.states.values()]
    for table in tables:
        emit_rows(table)
    print()
    for table in tables:
        #pprint(table)
        emit_transition_table(table, verbose)
    print()
    print("TRANSITION_STATE_NAMES = {")
    for state in m.states.values():
        print(" id(%s): '%s'," % (state.name.upper(), state.name))
    print("}")
    print("START_SUPER_STATE = %s" % m.start.name.upper())
    prefix, suffix = template.split("#ACTIONS-HERE")
    print(prefix)
    actions = StateActionListPair.listall()
    emit_actions(actions, " ")
    action_table = generate_action_table(actions, " ")
    print(suffix.replace("#ACTION_TABLE_HERE", action_table))

if __name__ == "__main__":
    main()