# codeql/python/extractor/tokenizer_generator/tokenizer_template.py

'''
Lookup table based tokenizer with state popping and pushing capabilities.
The ability to push and pop state is required for handling parenthesised expressions,
indentation, and f-strings. We also use it for handling the different quotation mark types,
but it is not essential for that, merely convenient.

'''


class Tokenizer(object):
    '''
    Table-driven tokenizer over a single source string.

    Each character is mapped to a character class, and the pair
    (current state, class) is looked up in the current "super state" table to
    find an action. Super states can be saved on `state_stack`, and the
    indentation levels currently open are tracked in `indents`.

    NOTE(review): the two all-caps marker comments at column 0 inside this
    class look like substitution points for the tokenizer generator (the
    action methods and the local `action_table` are not defined in this
    template). Presumably they must be kept exactly as written -- confirm
    against the generator before touching them.
    '''

    def __init__(self, text):
        '''Initialise the tokenizer at the first character of `text`.'''
        self.text = text
        # Offset into `text` of the character currently being examined.
        self.index = 0
        # Offset into `text` at which the current line starts.
        self.line_start_index = 0
        # Offset into `text` at which the current token starts.
        self.token_start_index = 0
        # (line, column) position at which the current token starts.
        self.token_start = 1, 0
        # Current line number; starts at 1.
        self.line = 1
        # Transition table currently in use, indexed as [state][cls].
        self.super_state = START_SUPER_STATE
        # Saved super states (see the push in emit_indent).
        self.state_stack = []
        # Stack of open indentation widths; the bottom entry is always 0.
        self.indents = [0]
#ACTIONS-HERE
    def tokens(self, debug=False):
        '''
        Generate the tokens of `self.text`.

        Yields whatever truthy values the transition callables return, then any
        tokens produced by `emit_indent` at the end of the input, and finally an
        ENDMARKER tuple `(ENDMARKER, u"", start, end)`.

        If `debug` is true, the character, its class and every state change are
        printed while tokenizing.

        The main loop has no explicit end-of-input test: it stops when
        `text[self.index]` raises IndexError. An IndexError raised while
        `self.index` is not at the end of the text is re-raised.
        '''
        # Bind the lookup tables to locals for use in the loops below.
        text = self.text
        cls_table = CLASS_TABLE
        id_index = ID_INDEX
        id_chunks = ID_CHUNKS
        # Code points at or above this value have no entry in the chunked
        # tables (one `id_index` entry covers 256 code points).
        max_id = len(id_index)*256
#ACTION_TABLE_HERE
        state = 0
        try:
            if debug:
                # Same loop as the non-debug branch below, plus tracing output.
                while True:
                    c = ord(text[self.index])
                    if c < 128:
                        # ASCII: direct table lookup.
                        cls = cls_table[c]
                    elif c >= max_id:
                        cls = ERROR_CLASS
                    else:
                        # Two-level lookup: the high bits (c>>8) select a
                        # chunk, (c>>2)&63 selects an entry within it, and each
                        # entry packs four 2-bit classes selected by c&3.
                        b = id_chunks[id_index[c>>8]][(c>>2)&63]
                        cls = (b>>((c&3)*2))&3
                    prev_state = state
                    print("char = '%s', state=%d, cls=%d" % (text[self.index], state, cls))
                    state, transition = action_table[self.super_state[state][cls]]
                    print ("%s -> %s on %r in %s" % (prev_state, state, text[self.index], TRANSITION_STATE_NAMES[id(self.super_state)]))
                    if transition:
                        # The loop does not advance `self.index` on this path;
                        # presumably the transition callable does so itself
                        # -- TODO confirm against the generated actions.
                        tkn = transition()
                        if tkn:
                            yield tkn
                    else:
                        self.index += 1
            else:
                while True:
                    c = ord(text[self.index])
                    if c < 128:
                        # ASCII: direct table lookup.
                        cls = cls_table[c]
                    elif c >= max_id:
                        cls = ERROR_CLASS
                    else:
                        # Two-level lookup; see the debug branch above.
                        b = id_chunks[id_index[c>>8]][(c>>2)&63]
                        cls = (b>>((c&3)*2))&3
                    state, transition = action_table[self.super_state[state][cls]]
                    if transition:
                        tkn = transition()
                        if tkn:
                            yield tkn
                    else:
                        self.index += 1
        except IndexError as ex:
            if self.index != len(text):
                # Not at the end of the input, so this is a real error rather
                # than normal termination. Repeat the table lookups one step at
                # a time so that, if one of them is the culprit, the error is
                # raised from the failing lookup.
                #Reraise index error
                cls = cls_table[c]
                trans = self.super_state[state]
                action_index = trans[cls]
                action_table[action_index]
                # Not raised? Must have been raised in transition function.
                raise ex
            # End of input: keep calling emit_indent until it returns None,
            # yielding each token it produces.
            tkn = self.emit_indent()
            while tkn is not None:
                yield tkn
                tkn = self.emit_indent()
            end = self.line, self.index-self.line_start_index
            yield ENDMARKER, u"", self.token_start, end
            return

    def emit_indent(self):
        '''
        Compare the indentation of the current line with the indent stack.

        The indentation width is computed from the characters between
        `self.line_start_index` and `self.index`: a space adds one column, a
        tab advances to the next multiple of 8, and a form feed resets the
        width to 0.

        Returns a 4-tuple `(kind, text, start, end)` where kind is INDENT,
        DEDENT, ILLEGALINDENT or ERRORTOKEN, or None if the width equals the
        top of `self.indents`.
        '''
        indent = 0
        index = self.line_start_index
        current = self.index
        # (line, column) of the current position.
        here = self.line, current-self.line_start_index
        while index < current:
            if self.text[index] == ' ':
                indent += 1
            elif self.text[index] == '\t':
                # Round up to the next multiple of 8 strictly above `indent`.
                indent = (indent+8) & -8
            elif self.text[index] == '\f':
                indent = 0
            else:
                #Unexpected state. Emit error token
                # Drop every open indentation level, leaving only the base 0.
                while len(self.indents) > 1:
                    self.indents.pop()
                result = ERRORTOKEN, self.text[self.token_start_index:self.index+1], self.token_start, here
                self.token_start = here
                # Move the line start up to the current position, so a later
                # call scans no characters.
                self.line_start_index = self.index
                return result
            index += 1
        if indent == self.indents[-1]:
            # Same level as before: no token, just mark where the next token starts.
            self.token_start = here
            self.token_start_index = self.index
            return None
        elif indent > self.indents[-1]:
            # Deeper than the current level: open a new level. The token text
            # is the leading whitespace of the line.
            self.indents.append(indent)
            start = self.line, 0
            result = INDENT, self.text[self.line_start_index:current], start, here
            self.token_start = here
            self.token_start_index = current
            return result
        else:
            # Shallower than the current level: close one level per call.
            self.indents.pop()
            if indent > self.indents[-1]:
                #Illegal indent
                # The width falls between two open levels.
                result = ILLEGALINDENT, u"", here, here
            else:
                result = DEDENT, u"", here, here
                if indent < self.indents[-1]:
                    #More dedents to do
                    # Save the current super state and switch to
                    # PENDING_DEDENT; presumably that state calls back here to
                    # emit the remaining DEDENTs -- TODO confirm.
                    self.state_stack.append(self.super_state)
                    self.super_state = PENDING_DEDENT
            self.token_start = here
            self.token_start_index = self.index
            return result


# Matches a line containing "coding:" or "coding=" followed by an encoding
# name; group 1 is the declared name.
ENCODING_RE = re.compile(br'.*coding[:=]\s*([-\w.]+).*')
NEWLINE_BYTES = b'\n'

def encoding_from_source(source):
    '''
    Determine the text encoding of `source` (bytes).

    A leading UTF-8 or UTF-16 byte order mark decides the encoding and is
    removed from the returned source. Otherwise the first two lines are
    searched for an encoding declaration matching ENCODING_RE, and a few
    non-standard spellings are normalised to "utf-8" or "iso-8859-1".

    Returns an `(encoding, source)` tuple. The encoding is 'utf-8' when no
    BOM or declaration is found, or when the lookup fails.
    '''
    #Check for BOM
    if source.startswith(codecs.BOM_UTF8):
        return 'utf8', source[len(codecs.BOM_UTF8):]
    if source.startswith(codecs.BOM_UTF16_BE):
        return 'utf-16be', source[len(codecs.BOM_UTF16_BE):]
    if source.startswith(codecs.BOM_UTF16_LE):
        return 'utf-16le', source[len(codecs.BOM_UTF16_LE):]
    try:
        # Only the first two lines may carry a declaration. Split instead of
        # slicing at the result of find(): find() returns -1 when there is no
        # (further) newline, and slicing with -1 would silently drop the last
        # byte of an unterminated line, truncating the encoding name.
        match = None
        for line in source.split(NEWLINE_BYTES, 2)[:2]:
            match = ENCODING_RE.match(line)
            if match:
                break
        if match:
            encoding = match.group(1).decode("ascii")
            # Handle non-standard encodings that are recognised by the interpreter.
            if encoding.startswith("utf-8-"):
                encoding = "utf-8"
            elif encoding == "iso-latin-1" or encoding.startswith(
                    ("latin-1-", "iso-8859-1-", "iso-latin-1-")):
                encoding = "iso-8859-1"
            return encoding, source
    except Exception as ex:
        #Failed to determine encoding -- Just treat as default.
        print(ex)
    return 'utf-8', source