mirror of
https://github.com/github/codeql.git
synced 2026-01-18 17:04:50 +01:00
Python: Copy Python extractor to codeql repo
This commit is contained in:
6615
python/extractor/tsg-python/tree-sitter-python/src/grammar.json
Normal file
6615
python/extractor/tsg-python/tree-sitter-python/src/grammar.json
Normal file
File diff suppressed because it is too large
Load Diff
4064
python/extractor/tsg-python/tree-sitter-python/src/node-types.json
Normal file
4064
python/extractor/tsg-python/tree-sitter-python/src/node-types.json
Normal file
File diff suppressed because it is too large
Load Diff
76504
python/extractor/tsg-python/tree-sitter-python/src/parser.c
Normal file
76504
python/extractor/tsg-python/tree-sitter-python/src/parser.c
Normal file
File diff suppressed because it is too large
Load Diff
402
python/extractor/tsg-python/tree-sitter-python/src/scanner.cc
Normal file
402
python/extractor/tsg-python/tree-sitter-python/src/scanner.cc
Normal file
@@ -0,0 +1,402 @@
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include <cwctype>
|
||||
#include <stdio.h>
|
||||
#include <tree_sitter/parser.h>
|
||||
#include <vector>
|
||||
namespace {
|
||||
|
||||
using std::vector;
|
||||
using std::iswspace;
|
||||
using std::memcpy;
|
||||
|
||||
// Kinds of token this external scanner can emit. Each value is used both as
// an index into the `valid_symbols` array passed to Scanner::scan and as the
// value stored in `lexer->result_symbol`, so the numbering must not change.
enum TokenType {
  NEWLINE = 0,
  INDENT = 1,
  DEDENT = 2,
  STRING_START = 3,
  STRING_CONTENT = 4,
  STRING_END = 5,
};
|
||||
|
||||
// Describes the opening delimiter of a string literal as a set of bit flags
// packed into one byte: which quote character closes the string, and which
// modifiers (raw, format, bytes, triple-quoted) were seen on the opener.
// The Scanner constructor asserts that this type is exactly one byte wide,
// because delimiters are copied to and from the serialization buffer with
// memcpy.
struct Delimiter {
  enum {
    SingleQuote = 1 << 0,
    DoubleQuote = 1 << 1,
    BackQuote = 1 << 2,
    Raw = 1 << 3,
    Format = 1 << 4,
    Triple = 1 << 5,
    Bytes = 1 << 6,
  };

  // A fresh delimiter has no quote character and no modifiers.
  Delimiter() : flags(0) {}

  // --- Queries -------------------------------------------------------------

  bool is_format() const { return (flags & Format) != 0; }

  bool is_raw() const { return (flags & Raw) != 0; }

  bool is_triple() const { return (flags & Triple) != 0; }

  bool is_bytes() const { return (flags & Bytes) != 0; }

  // Returns the quote character that closes the string, or 0 when no quote
  // flag has been recorded yet. Single quote takes precedence over double
  // quote, which takes precedence over back quote.
  int32_t end_character() const {
    if ((flags & SingleQuote) != 0) return '\'';
    if ((flags & DoubleQuote) != 0) return '"';
    if ((flags & BackQuote) != 0) return '`';
    return 0;
  }

  // --- Mutators ------------------------------------------------------------

  void set_format() { flags |= Format; }

  void set_raw() { flags |= Raw; }

  void set_triple() { flags |= Triple; }

  void set_bytes() { flags |= Bytes; }

  // Records the closing quote character. Only ', " and ` are accepted; any
  // other value trips an assertion (and is ignored when asserts are off).
  void set_end_character(int32_t character) {
    if (character == '\'') {
      flags |= SingleQuote;
    } else if (character == '"') {
      flags |= DoubleQuote;
    } else if (character == '`') {
      flags |= BackQuote;
    } else {
      assert(false);
    }
  }

  // Bit set built from the enumerators above.
  char flags;
};
|
||||
|
||||
struct Scanner {
|
||||
Scanner() {
|
||||
assert(sizeof(Delimiter) == sizeof(char));
|
||||
deserialize(NULL, 0);
|
||||
}
|
||||
|
||||
unsigned serialize(char *buffer) {
|
||||
size_t i = 0;
|
||||
|
||||
size_t delimiter_count = delimiter_stack.size();
|
||||
if (delimiter_count > UINT8_MAX) delimiter_count = UINT8_MAX;
|
||||
buffer[i++] = delimiter_count;
|
||||
|
||||
if (delimiter_count > 0) {
|
||||
memcpy(&buffer[i], delimiter_stack.data(), delimiter_count);
|
||||
}
|
||||
i += delimiter_count;
|
||||
|
||||
vector<uint16_t>::iterator
|
||||
iter = indent_length_stack.begin() + 1,
|
||||
end = indent_length_stack.end();
|
||||
|
||||
for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {
|
||||
buffer[i++] = *iter;
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
void deserialize(const char *buffer, unsigned length) {
|
||||
delimiter_stack.clear();
|
||||
indent_length_stack.clear();
|
||||
indent_length_stack.push_back(0);
|
||||
|
||||
if (length > 0) {
|
||||
size_t i = 0;
|
||||
|
||||
size_t delimiter_count = (uint8_t)buffer[i++];
|
||||
delimiter_stack.resize(delimiter_count);
|
||||
if (delimiter_count > 0) {
|
||||
memcpy(delimiter_stack.data(), &buffer[i], delimiter_count);
|
||||
}
|
||||
i += delimiter_count;
|
||||
|
||||
for (; i < length; i++) {
|
||||
indent_length_stack.push_back(buffer[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void advance(TSLexer *lexer) {
|
||||
lexer->advance(lexer, false);
|
||||
}
|
||||
|
||||
void skip(TSLexer *lexer) {
|
||||
lexer->advance(lexer, true);
|
||||
}
|
||||
|
||||
bool scan(TSLexer *lexer, const bool *valid_symbols) {
|
||||
if (valid_symbols[STRING_CONTENT] && !valid_symbols[INDENT] && !delimiter_stack.empty()) {
|
||||
Delimiter delimiter = delimiter_stack.back();
|
||||
int32_t end_character = delimiter.end_character();
|
||||
bool has_content = false;
|
||||
while (lexer->lookahead) {
|
||||
if ((lexer->lookahead == '{' || lexer->lookahead == '}') && delimiter.is_format()) {
|
||||
lexer->mark_end(lexer);
|
||||
lexer->result_symbol = STRING_CONTENT;
|
||||
return has_content;
|
||||
} else if (lexer->lookahead == '\\') {
|
||||
if (delimiter.is_raw()) {
|
||||
lexer->advance(lexer, false);
|
||||
continue;
|
||||
} else if (delimiter.is_bytes()) {
|
||||
lexer->mark_end(lexer);
|
||||
lexer->advance(lexer, false);
|
||||
if (lexer->lookahead == 'N' || lexer->lookahead == 'u' || lexer->lookahead == 'U') {
|
||||
// In bytes string, \N{...}, \uXXXX and \UXXXXXXXX are not escape sequences
|
||||
// https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
|
||||
lexer->advance(lexer, false);
|
||||
} else {
|
||||
lexer->result_symbol = STRING_CONTENT;
|
||||
return has_content;
|
||||
}
|
||||
} else {
|
||||
lexer->mark_end(lexer);
|
||||
lexer->result_symbol = STRING_CONTENT;
|
||||
return has_content;
|
||||
}
|
||||
} else if (lexer->lookahead == end_character) {
|
||||
if (delimiter.is_triple()) {
|
||||
lexer->mark_end(lexer);
|
||||
lexer->advance(lexer, false);
|
||||
if (lexer->lookahead == end_character) {
|
||||
lexer->advance(lexer, false);
|
||||
if (lexer->lookahead == end_character) {
|
||||
if (has_content) {
|
||||
lexer->result_symbol = STRING_CONTENT;
|
||||
} else {
|
||||
lexer->advance(lexer, false);
|
||||
lexer->mark_end(lexer);
|
||||
delimiter_stack.pop_back();
|
||||
lexer->result_symbol = STRING_END;
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
lexer->mark_end(lexer);
|
||||
lexer->result_symbol = STRING_CONTENT;
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
lexer->mark_end(lexer);
|
||||
lexer->result_symbol = STRING_CONTENT;
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
if (has_content) {
|
||||
lexer->result_symbol = STRING_CONTENT;
|
||||
} else {
|
||||
lexer->advance(lexer, false);
|
||||
delimiter_stack.pop_back();
|
||||
lexer->result_symbol = STRING_END;
|
||||
}
|
||||
lexer->mark_end(lexer);
|
||||
return true;
|
||||
}
|
||||
} else if (lexer->lookahead == '\n' && has_content && !delimiter.is_triple()) {
|
||||
return false;
|
||||
}
|
||||
advance(lexer);
|
||||
has_content = true;
|
||||
}
|
||||
}
|
||||
|
||||
lexer->mark_end(lexer);
|
||||
|
||||
bool found_end_of_line = false;
|
||||
uint32_t indent_length = 0;
|
||||
int32_t first_comment_indent_length = -1;
|
||||
for (;;) {
|
||||
if (lexer->lookahead == '\n') {
|
||||
found_end_of_line = true;
|
||||
indent_length = 0;
|
||||
skip(lexer);
|
||||
} else if (lexer->lookahead == ' ') {
|
||||
indent_length++;
|
||||
skip(lexer);
|
||||
} else if (lexer->lookahead == '\r') {
|
||||
indent_length = 0;
|
||||
skip(lexer);
|
||||
} else if (lexer->lookahead == '\t') {
|
||||
indent_length += 8;
|
||||
skip(lexer);
|
||||
} else if (lexer->lookahead == '#') {
|
||||
if (first_comment_indent_length == -1) {
|
||||
first_comment_indent_length = (int32_t)indent_length;
|
||||
}
|
||||
while (lexer->lookahead && lexer->lookahead != '\n') {
|
||||
skip(lexer);
|
||||
}
|
||||
skip(lexer);
|
||||
indent_length = 0;
|
||||
} else if (lexer->lookahead == '\\') {
|
||||
skip(lexer);
|
||||
if (lexer->lookahead == '\r') {
|
||||
skip(lexer);
|
||||
}
|
||||
if (lexer->lookahead == '\n') {
|
||||
skip(lexer);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else if (lexer->lookahead == '\f') {
|
||||
indent_length = 0;
|
||||
skip(lexer);
|
||||
} else if (lexer->lookahead == 0) {
|
||||
indent_length = 0;
|
||||
found_end_of_line = true;
|
||||
break;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (found_end_of_line) {
|
||||
if (!indent_length_stack.empty()) {
|
||||
uint16_t current_indent_length = indent_length_stack.back();
|
||||
|
||||
if (
|
||||
valid_symbols[INDENT] &&
|
||||
indent_length > current_indent_length
|
||||
) {
|
||||
indent_length_stack.push_back(indent_length);
|
||||
lexer->result_symbol = INDENT;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (
|
||||
valid_symbols[DEDENT] &&
|
||||
indent_length < current_indent_length &&
|
||||
|
||||
// Wait to create a dedent token until we've consumed any comments
|
||||
// whose indentation matches the current block.
|
||||
first_comment_indent_length < (int32_t)current_indent_length
|
||||
) {
|
||||
indent_length_stack.pop_back();
|
||||
lexer->result_symbol = DEDENT;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (valid_symbols[NEWLINE]) {
|
||||
lexer->result_symbol = NEWLINE;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (first_comment_indent_length == -1 && valid_symbols[STRING_START]) {
|
||||
Delimiter delimiter;
|
||||
|
||||
bool has_flags = false;
|
||||
while (lexer->lookahead) {
|
||||
if (lexer->lookahead == 'f' || lexer->lookahead == 'F') {
|
||||
delimiter.set_format();
|
||||
} else if (lexer->lookahead == 'r' || lexer->lookahead == 'R') {
|
||||
delimiter.set_raw();
|
||||
} else if (lexer->lookahead == 'b' || lexer->lookahead == 'B') {
|
||||
delimiter.set_bytes();
|
||||
} else if (lexer->lookahead != 'u' && lexer->lookahead != 'U') {
|
||||
break;
|
||||
}
|
||||
has_flags = true;
|
||||
advance(lexer);
|
||||
}
|
||||
|
||||
if (lexer->lookahead == '`') {
|
||||
delimiter.set_end_character('`');
|
||||
advance(lexer);
|
||||
lexer->mark_end(lexer);
|
||||
} else if (lexer->lookahead == '\'') {
|
||||
delimiter.set_end_character('\'');
|
||||
advance(lexer);
|
||||
lexer->mark_end(lexer);
|
||||
if (lexer->lookahead == '\'') {
|
||||
advance(lexer);
|
||||
if (lexer->lookahead == '\'') {
|
||||
advance(lexer);
|
||||
lexer->mark_end(lexer);
|
||||
delimiter.set_triple();
|
||||
}
|
||||
}
|
||||
} else if (lexer->lookahead == '"') {
|
||||
delimiter.set_end_character('"');
|
||||
advance(lexer);
|
||||
lexer->mark_end(lexer);
|
||||
if (lexer->lookahead == '"') {
|
||||
advance(lexer);
|
||||
if (lexer->lookahead == '"') {
|
||||
advance(lexer);
|
||||
lexer->mark_end(lexer);
|
||||
delimiter.set_triple();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (delimiter.end_character()) {
|
||||
delimiter_stack.push_back(delimiter);
|
||||
lexer->result_symbol = STRING_START;
|
||||
return true;
|
||||
} else if (has_flags) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
vector<uint16_t> indent_length_stack;
|
||||
vector<Delimiter> delimiter_stack;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
|
||||
void *tree_sitter_python_external_scanner_create() {
|
||||
return new Scanner();
|
||||
}
|
||||
|
||||
bool tree_sitter_python_external_scanner_scan(void *payload, TSLexer *lexer,
|
||||
const bool *valid_symbols) {
|
||||
Scanner *scanner = static_cast<Scanner *>(payload);
|
||||
return scanner->scan(lexer, valid_symbols);
|
||||
}
|
||||
|
||||
unsigned tree_sitter_python_external_scanner_serialize(void *payload, char *buffer) {
|
||||
Scanner *scanner = static_cast<Scanner *>(payload);
|
||||
return scanner->serialize(buffer);
|
||||
}
|
||||
|
||||
void tree_sitter_python_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
|
||||
Scanner *scanner = static_cast<Scanner *>(payload);
|
||||
scanner->deserialize(buffer, length);
|
||||
}
|
||||
|
||||
// Destroys the Scanner behind `payload`, which must have been produced by
// tree_sitter_python_external_scanner_create.
void tree_sitter_python_external_scanner_destroy(void *payload) {
  delete static_cast<Scanner *>(payload);
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,224 @@
|
||||
#ifndef TREE_SITTER_PARSER_H_
|
||||
#define TREE_SITTER_PARSER_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Built-in symbol ids: the error symbol is the largest TSSymbol value
// (all bits set), the end symbol is 0.
#define ts_builtin_sym_error ((TSSymbol)-1)
|
||||
#define ts_builtin_sym_end 0
|
||||
// Size limit, in bytes, that an external scanner's serialize routine
// respects when writing its state (see the Python scanner's serialize loop).
#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024
|
||||
|
||||
typedef uint16_t TSStateId;
|
||||
|
||||
#ifndef TREE_SITTER_API_H_
|
||||
typedef uint16_t TSSymbol;
|
||||
typedef uint16_t TSFieldId;
|
||||
typedef struct TSLanguage TSLanguage;
|
||||
#endif
|
||||
|
||||
// One entry of the field map: a field id together with a child index and an
// `inherited` flag.
// NOTE(review): the exact meaning of `inherited` is defined by the
// tree-sitter runtime and is not visible in this header - confirm there.
typedef struct {
|
||||
TSFieldId field_id;
|
||||
uint8_t child_index;
|
||||
bool inherited;
|
||||
} TSFieldMapEntry;
|
||||
|
||||
// An (index, length) pair.
// NOTE(review): presumably selects a run of TSFieldMapEntry values inside
// TSLanguage::field_map_entries - confirm against the runtime.
typedef struct {
|
||||
uint16_t index;
|
||||
uint16_t length;
|
||||
} TSFieldMapSlice;
|
||||
|
||||
// Three boolean properties recorded per symbol (see
// TSLanguage::symbol_metadata): visible, named and supertype.
typedef struct {
|
||||
bool visible;
|
||||
bool named;
|
||||
bool supertype;
|
||||
} TSSymbolMetadata;
|
||||
|
||||
typedef struct TSLexer TSLexer;
|
||||
|
||||
// Interface through which lexing code reads input and reports a token.
//   lookahead     - the current character; the lexer macros below and the
//                   external scanner read it directly.
//   result_symbol - set by ACCEPT_TOKEN / the external scanner to the symbol
//                   that was recognised.
//   advance       - consumes the current character; the bool argument is the
//                   `skip` flag (true in SKIP, false in ADVANCE).
//   mark_end      - called when a token is accepted to record its end.
//   get_column, is_at_included_range_start, eof - not used by the code in
//                   this header. NOTE(review): semantics come from the
//                   tree-sitter runtime; confirm there.
struct TSLexer {
|
||||
int32_t lookahead;
|
||||
TSSymbol result_symbol;
|
||||
void (*advance)(TSLexer *, bool);
|
||||
void (*mark_end)(TSLexer *);
|
||||
uint32_t (*get_column)(TSLexer *);
|
||||
bool (*is_at_included_range_start)(const TSLexer *);
|
||||
bool (*eof)(const TSLexer *);
|
||||
};
|
||||
|
||||
// Discriminator stored in the `type` field of a TSParseAction. The SHIFT*,
// REDUCE, RECOVER and ACCEPT_INPUT macros below each set one of these.
typedef enum {
|
||||
TSParseActionTypeShift,
|
||||
TSParseActionTypeReduce,
|
||||
TSParseActionTypeAccept,
|
||||
TSParseActionTypeRecover,
|
||||
} TSParseActionType;
|
||||
|
||||
// A single parse action. Every member of the union starts with a uint8_t
// `type` field, so the kind of action can be read through the bare `type`
// member regardless of which variant was initialised.
//   shift  - target state plus the `extra` and `repetition` flags.
//   reduce - child count, resulting symbol, dynamic precedence and
//            production id.
typedef union {
|
||||
struct {
|
||||
uint8_t type;
|
||||
TSStateId state;
|
||||
bool extra;
|
||||
bool repetition;
|
||||
} shift;
|
||||
struct {
|
||||
uint8_t type;
|
||||
uint8_t child_count;
|
||||
TSSymbol symbol;
|
||||
int16_t dynamic_precedence;
|
||||
uint16_t production_id;
|
||||
} reduce;
|
||||
uint8_t type;
|
||||
} TSParseAction;
|
||||
|
||||
// A pair of lex state ids: one for the generated lexer and one for the
// external scanner.
// NOTE(review): presumably one entry per parse state via
// TSLanguage::lex_modes - confirm against the runtime.
typedef struct {
|
||||
uint16_t lex_state;
|
||||
uint16_t external_lex_state;
|
||||
} TSLexMode;
|
||||
|
||||
// Element type of TSLanguage::parse_actions. Each element is either a parse
// action or an `entry` header carrying a count and a `reusable` flag.
// NOTE(review): presumably a header is followed by `count` actions in the
// array - confirm against the generated parser.c.
typedef union {
|
||||
TSParseAction action;
|
||||
struct {
|
||||
uint8_t count;
|
||||
bool reusable;
|
||||
} entry;
|
||||
} TSParseActionEntry;
|
||||
|
||||
// Description of a generated language: table sizes, pointers to the
// generated parse/lex tables and name arrays, the lexer entry points, and
// the hooks of the optional external scanner.
// NOTE(review): the field order is presumably part of the ABI shared with
// the generated parser.c and the tree-sitter runtime - do not reorder.
//
// The five function pointers in `external_scanner` have the same signatures
// as the tree_sitter_python_external_scanner_{create,destroy,scan,
// serialize,deserialize} functions exported by the scanner.
struct TSLanguage {
|
||||
uint32_t version;
|
||||
uint32_t symbol_count;
|
||||
uint32_t alias_count;
|
||||
uint32_t token_count;
|
||||
uint32_t external_token_count;
|
||||
uint32_t state_count;
|
||||
uint32_t large_state_count;
|
||||
uint32_t production_id_count;
|
||||
uint32_t field_count;
|
||||
uint16_t max_alias_sequence_length;
|
||||
const uint16_t *parse_table;
|
||||
const uint16_t *small_parse_table;
|
||||
const uint32_t *small_parse_table_map;
|
||||
const TSParseActionEntry *parse_actions;
|
||||
const char * const *symbol_names;
|
||||
const char * const *field_names;
|
||||
const TSFieldMapSlice *field_map_slices;
|
||||
const TSFieldMapEntry *field_map_entries;
|
||||
const TSSymbolMetadata *symbol_metadata;
|
||||
const TSSymbol *public_symbol_map;
|
||||
const uint16_t *alias_map;
|
||||
const TSSymbol *alias_sequences;
|
||||
const TSLexMode *lex_modes;
|
||||
bool (*lex_fn)(TSLexer *, TSStateId);
|
||||
bool (*keyword_lex_fn)(TSLexer *, TSStateId);
|
||||
TSSymbol keyword_capture_token;
|
||||
struct {
|
||||
const bool *states;
|
||||
const TSSymbol *symbol_map;
|
||||
void *(*create)(void);
|
||||
void (*destroy)(void *);
|
||||
bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist);
|
||||
unsigned (*serialize)(void *, char *);
|
||||
void (*deserialize)(void *, const char *, unsigned);
|
||||
} external_scanner;
|
||||
const TSStateId *primary_state_ids;
|
||||
};
|
||||
|
||||
/*
|
||||
* Lexer Macros
|
||||
*/
|
||||
|
||||
// Prologue of a lex function. Declares the locals `result`, `skip`, `eof`
// and `lookahead`, and defines two labels:
//   next_state - advances the lexer (honouring the pending `skip` flag) and
//                falls through to `start`;
//   start      - clears `skip` and reloads `lookahead` from the lexer.
// Execution jumps straight to `start` the first time, so nothing is consumed
// before the first state runs. Expects a `lexer` variable in scope.
#define START_LEXER() \
|
||||
bool result = false; \
|
||||
bool skip = false; \
|
||||
bool eof = false; \
|
||||
int32_t lookahead; \
|
||||
goto start; \
|
||||
next_state: \
|
||||
lexer->advance(lexer, skip); \
|
||||
start: \
|
||||
skip = false; \
|
||||
lookahead = lexer->lookahead;
|
||||
|
||||
// Switches to lex state `state_value` and jumps to `next_state`, which
// consumes the current character. Relies on the `state` variable and the
// `next_state` label set up by START_LEXER.
#define ADVANCE(state_value) \
|
||||
{ \
|
||||
state = state_value; \
|
||||
goto next_state; \
|
||||
}
|
||||
|
||||
// Like ADVANCE, but sets `skip` first so that the character is consumed with
// the lexer's skip flag set.
#define SKIP(state_value) \
|
||||
{ \
|
||||
skip = true; \
|
||||
state = state_value; \
|
||||
goto next_state; \
|
||||
}
|
||||
|
||||
// Records `symbol_value` as the recognised token: sets `result`, stores the
// symbol in the lexer and marks the token end at the current position.
// NOTE: expands to three separate statements, so it must not be used as the
// unbraced body of an `if`/`else`.
#define ACCEPT_TOKEN(symbol_value) \
|
||||
result = true; \
|
||||
lexer->result_symbol = symbol_value; \
|
||||
lexer->mark_end(lexer);
|
||||
|
||||
// Ends the current lex state by returning `result` (true once ACCEPT_TOKEN
// has run, false otherwise).
#define END_STATE() return result;
|
||||
|
||||
/*
|
||||
* Parse Table Macros
|
||||
*/
|
||||
|
||||
// Index helpers used when spelling out the generated parse tables.
//
// SMALL_STATE converts a state id into an offset relative to
// LARGE_STATE_COUNT (which must be defined by the including file). The
// expansion is fully parenthesized so that it evaluates correctly inside a
// larger expression, e.g. `2 * SMALL_STATE(x)` or `SMALL_STATE(a + b)`.
#define SMALL_STATE(id) ((id) - LARGE_STATE_COUNT)

// STATE and ACTIONS are identity wrappers that label the meaning of a table
// value; parenthesized for the same reason as above.
#define STATE(id) (id)

#define ACTIONS(id) (id)
|
||||
|
||||
// Brace initializer for a shift action that moves to `state_value`.
// NOTE(review): presumably used for TSParseActionEntry elements in the
// generated parse_actions table - confirm against parser.c.
#define SHIFT(state_value) \
|
||||
{{ \
|
||||
.shift = { \
|
||||
.type = TSParseActionTypeShift, \
|
||||
.state = state_value \
|
||||
} \
|
||||
}}
|
||||
|
||||
// Same as SHIFT, with the `repetition` flag of the shift action set.
#define SHIFT_REPEAT(state_value) \
|
||||
{{ \
|
||||
.shift = { \
|
||||
.type = TSParseActionTypeShift, \
|
||||
.state = state_value, \
|
||||
.repetition = true \
|
||||
} \
|
||||
}}
|
||||
|
||||
// Brace initializer for a shift action with the `extra` flag set and no
// explicit target state (the state field is left zero-initialised).
#define SHIFT_EXTRA() \
|
||||
{{ \
|
||||
.shift = { \
|
||||
.type = TSParseActionTypeShift, \
|
||||
.extra = true \
|
||||
} \
|
||||
}}
|
||||
|
||||
// Brace initializer for a reduce action producing `symbol_val` from
// `child_count_val` children. Any further arguments are pasted in as extra
// designated initializers of the `reduce` struct (its remaining fields are
// dynamic_precedence and production_id).
#define REDUCE(symbol_val, child_count_val, ...) \
|
||||
{{ \
|
||||
.reduce = { \
|
||||
.type = TSParseActionTypeReduce, \
|
||||
.symbol = symbol_val, \
|
||||
.child_count = child_count_val, \
|
||||
__VA_ARGS__ \
|
||||
}, \
|
||||
}}
|
||||
|
||||
// Brace initializer for an action whose type is TSParseActionTypeRecover.
#define RECOVER() \
|
||||
{{ \
|
||||
.type = TSParseActionTypeRecover \
|
||||
}}
|
||||
|
||||
// Brace initializer for an action whose type is TSParseActionTypeAccept.
#define ACCEPT_INPUT() \
|
||||
{{ \
|
||||
.type = TSParseActionTypeAccept \
|
||||
}}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // TREE_SITTER_PARSER_H_
|
||||
Reference in New Issue
Block a user