mirror of
https://github.com/github/codeql.git
synced 2026-01-12 06:00:23 +01:00
The current name results in a path that is more than 260 characters long, and this causes issues for the build on Windows.
403 lines
10 KiB
C++
403 lines
10 KiB
C++
#include <cassert>
|
|
#include <cstring>
|
|
#include <cwctype>
|
|
#include <stdio.h>
|
|
#include <tree_sitter/parser.h>
|
|
#include <vector>
|
|
namespace {
|
|
|
|
using std::vector;
|
|
using std::iswspace;
|
|
using std::memcpy;
|
|
|
|
// External token kinds produced by this scanner.
//
// The enumerator values are used as indices into the `valid_symbols` array
// passed to Scanner::scan and are stored in `lexer->result_symbol`.
// NOTE(review): the numbering presumably has to match the order of the
// externals declared in the grammar -- confirm before reordering.
enum TokenType {
  NEWLINE        = 0,  // end of a logical line
  INDENT         = 1,  // indentation deeper than the enclosing block
  DEDENT         = 2,  // indentation shallower than the enclosing block
  STRING_START   = 3,  // optional prefix letters plus the opening quote(s)
  STRING_CONTENT = 4,  // a run of characters inside a string
  STRING_END     = 5,  // the closing quote(s)
};
|
|
|
|
// Describes the opening delimiter of a string literal: which quote character
// closes it and which prefix/shape modifiers apply. Everything is packed into
// a single `char` of bit flags; Scanner copies Delimiter objects byte-for-byte
// with memcpy and asserts that sizeof(Delimiter) == sizeof(char).
struct Delimiter {
  enum {
    SingleQuote = 0x01,  // closed by '
    DoubleQuote = 0x02,  // closed by "
    BackQuote   = 0x04,  // closed by `
    Raw         = 0x08,  // r / R prefix
    Format      = 0x10,  // f / F prefix
    Triple      = 0x20,  // triple-quoted
    Bytes       = 0x40,  // b / B prefix
  };

  // Starts with no quote kind and no modifiers.
  Delimiter() : flags(0) {}

  // --- modifier queries -----------------------------------------------------

  bool is_format() const { return (flags & Format) != 0; }

  bool is_raw() const { return (flags & Raw) != 0; }

  bool is_triple() const { return (flags & Triple) != 0; }

  bool is_bytes() const { return (flags & Bytes) != 0; }

  // Returns the closing quote character, or 0 when no quote kind has been
  // recorded yet. If several quote bits are set, single quote takes
  // precedence over double quote, which takes precedence over back quote.
  int32_t end_character() const {
    int32_t closing = 0;
    if (flags & SingleQuote) {
      closing = '\'';
    } else if (flags & DoubleQuote) {
      closing = '"';
    } else if (flags & BackQuote) {
      closing = '`';
    }
    return closing;
  }

  // --- modifier setters -----------------------------------------------------

  void set_format() { flags = flags | Format; }

  void set_raw() { flags = flags | Raw; }

  void set_triple() { flags = flags | Triple; }

  void set_bytes() { flags = flags | Bytes; }

  // Records the quote kind for `character`, which must be one of ' " `.
  // Any other value trips an assertion (and is ignored when assertions are
  // compiled out).
  void set_end_character(int32_t character) {
    if (character == '\'') {
      flags |= SingleQuote;
    } else if (character == '"') {
      flags |= DoubleQuote;
    } else if (character == '`') {
      flags |= BackQuote;
    } else {
      assert(false);
    }
  }

  // Bit set built from the enum above. Must remain the only data member so
  // the struct stays exactly one byte wide.
  char flags;
};
|
|
|
|
struct Scanner {
|
|
Scanner() {
|
|
assert(sizeof(Delimiter) == sizeof(char));
|
|
deserialize(NULL, 0);
|
|
}
|
|
|
|
unsigned serialize(char *buffer) {
|
|
size_t i = 0;
|
|
|
|
size_t delimiter_count = delimiter_stack.size();
|
|
if (delimiter_count > UINT8_MAX) delimiter_count = UINT8_MAX;
|
|
buffer[i++] = delimiter_count;
|
|
|
|
if (delimiter_count > 0) {
|
|
memcpy(&buffer[i], delimiter_stack.data(), delimiter_count);
|
|
}
|
|
i += delimiter_count;
|
|
|
|
vector<uint16_t>::iterator
|
|
iter = indent_length_stack.begin() + 1,
|
|
end = indent_length_stack.end();
|
|
|
|
for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {
|
|
buffer[i++] = *iter;
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
void deserialize(const char *buffer, unsigned length) {
|
|
delimiter_stack.clear();
|
|
indent_length_stack.clear();
|
|
indent_length_stack.push_back(0);
|
|
|
|
if (length > 0) {
|
|
size_t i = 0;
|
|
|
|
size_t delimiter_count = (uint8_t)buffer[i++];
|
|
delimiter_stack.resize(delimiter_count);
|
|
if (delimiter_count > 0) {
|
|
memcpy(delimiter_stack.data(), &buffer[i], delimiter_count);
|
|
}
|
|
i += delimiter_count;
|
|
|
|
for (; i < length; i++) {
|
|
indent_length_stack.push_back(buffer[i]);
|
|
}
|
|
}
|
|
}
|
|
|
|
void advance(TSLexer *lexer) {
|
|
lexer->advance(lexer, false);
|
|
}
|
|
|
|
void skip(TSLexer *lexer) {
|
|
lexer->advance(lexer, true);
|
|
}
|
|
|
|
bool scan(TSLexer *lexer, const bool *valid_symbols) {
|
|
if (valid_symbols[STRING_CONTENT] && !valid_symbols[INDENT] && !delimiter_stack.empty()) {
|
|
Delimiter delimiter = delimiter_stack.back();
|
|
int32_t end_character = delimiter.end_character();
|
|
bool has_content = false;
|
|
while (lexer->lookahead) {
|
|
if ((lexer->lookahead == '{' || lexer->lookahead == '}') && delimiter.is_format()) {
|
|
lexer->mark_end(lexer);
|
|
lexer->result_symbol = STRING_CONTENT;
|
|
return has_content;
|
|
} else if (lexer->lookahead == '\\') {
|
|
if (delimiter.is_raw()) {
|
|
lexer->advance(lexer, false);
|
|
continue;
|
|
} else if (delimiter.is_bytes()) {
|
|
lexer->mark_end(lexer);
|
|
lexer->advance(lexer, false);
|
|
if (lexer->lookahead == 'N' || lexer->lookahead == 'u' || lexer->lookahead == 'U') {
|
|
// In bytes string, \N{...}, \uXXXX and \UXXXXXXXX are not escape sequences
|
|
// https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
|
|
lexer->advance(lexer, false);
|
|
} else {
|
|
lexer->result_symbol = STRING_CONTENT;
|
|
return has_content;
|
|
}
|
|
} else {
|
|
lexer->mark_end(lexer);
|
|
lexer->result_symbol = STRING_CONTENT;
|
|
return has_content;
|
|
}
|
|
} else if (lexer->lookahead == end_character) {
|
|
if (delimiter.is_triple()) {
|
|
lexer->mark_end(lexer);
|
|
lexer->advance(lexer, false);
|
|
if (lexer->lookahead == end_character) {
|
|
lexer->advance(lexer, false);
|
|
if (lexer->lookahead == end_character) {
|
|
if (has_content) {
|
|
lexer->result_symbol = STRING_CONTENT;
|
|
} else {
|
|
lexer->advance(lexer, false);
|
|
lexer->mark_end(lexer);
|
|
delimiter_stack.pop_back();
|
|
lexer->result_symbol = STRING_END;
|
|
}
|
|
return true;
|
|
} else {
|
|
lexer->mark_end(lexer);
|
|
lexer->result_symbol = STRING_CONTENT;
|
|
return true;
|
|
}
|
|
} else {
|
|
lexer->mark_end(lexer);
|
|
lexer->result_symbol = STRING_CONTENT;
|
|
return true;
|
|
}
|
|
} else {
|
|
if (has_content) {
|
|
lexer->result_symbol = STRING_CONTENT;
|
|
} else {
|
|
lexer->advance(lexer, false);
|
|
delimiter_stack.pop_back();
|
|
lexer->result_symbol = STRING_END;
|
|
}
|
|
lexer->mark_end(lexer);
|
|
return true;
|
|
}
|
|
} else if (lexer->lookahead == '\n' && has_content && !delimiter.is_triple()) {
|
|
return false;
|
|
}
|
|
advance(lexer);
|
|
has_content = true;
|
|
}
|
|
}
|
|
|
|
lexer->mark_end(lexer);
|
|
|
|
bool found_end_of_line = false;
|
|
uint32_t indent_length = 0;
|
|
int32_t first_comment_indent_length = -1;
|
|
for (;;) {
|
|
if (lexer->lookahead == '\n') {
|
|
found_end_of_line = true;
|
|
indent_length = 0;
|
|
skip(lexer);
|
|
} else if (lexer->lookahead == ' ') {
|
|
indent_length++;
|
|
skip(lexer);
|
|
} else if (lexer->lookahead == '\r') {
|
|
indent_length = 0;
|
|
skip(lexer);
|
|
} else if (lexer->lookahead == '\t') {
|
|
indent_length += 8;
|
|
skip(lexer);
|
|
} else if (lexer->lookahead == '#') {
|
|
if (first_comment_indent_length == -1) {
|
|
first_comment_indent_length = (int32_t)indent_length;
|
|
}
|
|
while (lexer->lookahead && lexer->lookahead != '\n') {
|
|
skip(lexer);
|
|
}
|
|
skip(lexer);
|
|
indent_length = 0;
|
|
} else if (lexer->lookahead == '\\') {
|
|
skip(lexer);
|
|
if (lexer->lookahead == '\r') {
|
|
skip(lexer);
|
|
}
|
|
if (lexer->lookahead == '\n') {
|
|
skip(lexer);
|
|
} else {
|
|
return false;
|
|
}
|
|
} else if (lexer->lookahead == '\f') {
|
|
indent_length = 0;
|
|
skip(lexer);
|
|
} else if (lexer->lookahead == 0) {
|
|
indent_length = 0;
|
|
found_end_of_line = true;
|
|
break;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (found_end_of_line) {
|
|
if (!indent_length_stack.empty()) {
|
|
uint16_t current_indent_length = indent_length_stack.back();
|
|
|
|
if (
|
|
valid_symbols[INDENT] &&
|
|
indent_length > current_indent_length
|
|
) {
|
|
indent_length_stack.push_back(indent_length);
|
|
lexer->result_symbol = INDENT;
|
|
return true;
|
|
}
|
|
|
|
if (
|
|
valid_symbols[DEDENT] &&
|
|
indent_length < current_indent_length &&
|
|
|
|
// Wait to create a dedent token until we've consumed any comments
|
|
// whose indentation matches the current block.
|
|
first_comment_indent_length < (int32_t)current_indent_length
|
|
) {
|
|
indent_length_stack.pop_back();
|
|
lexer->result_symbol = DEDENT;
|
|
return true;
|
|
}
|
|
}
|
|
|
|
if (valid_symbols[NEWLINE]) {
|
|
lexer->result_symbol = NEWLINE;
|
|
return true;
|
|
}
|
|
}
|
|
|
|
if (first_comment_indent_length == -1 && valid_symbols[STRING_START]) {
|
|
Delimiter delimiter;
|
|
|
|
bool has_flags = false;
|
|
while (lexer->lookahead) {
|
|
if (lexer->lookahead == 'f' || lexer->lookahead == 'F') {
|
|
delimiter.set_format();
|
|
} else if (lexer->lookahead == 'r' || lexer->lookahead == 'R') {
|
|
delimiter.set_raw();
|
|
} else if (lexer->lookahead == 'b' || lexer->lookahead == 'B') {
|
|
delimiter.set_bytes();
|
|
} else if (lexer->lookahead != 'u' && lexer->lookahead != 'U') {
|
|
break;
|
|
}
|
|
has_flags = true;
|
|
advance(lexer);
|
|
}
|
|
|
|
if (lexer->lookahead == '`') {
|
|
delimiter.set_end_character('`');
|
|
advance(lexer);
|
|
lexer->mark_end(lexer);
|
|
} else if (lexer->lookahead == '\'') {
|
|
delimiter.set_end_character('\'');
|
|
advance(lexer);
|
|
lexer->mark_end(lexer);
|
|
if (lexer->lookahead == '\'') {
|
|
advance(lexer);
|
|
if (lexer->lookahead == '\'') {
|
|
advance(lexer);
|
|
lexer->mark_end(lexer);
|
|
delimiter.set_triple();
|
|
}
|
|
}
|
|
} else if (lexer->lookahead == '"') {
|
|
delimiter.set_end_character('"');
|
|
advance(lexer);
|
|
lexer->mark_end(lexer);
|
|
if (lexer->lookahead == '"') {
|
|
advance(lexer);
|
|
if (lexer->lookahead == '"') {
|
|
advance(lexer);
|
|
lexer->mark_end(lexer);
|
|
delimiter.set_triple();
|
|
}
|
|
}
|
|
}
|
|
|
|
if (delimiter.end_character()) {
|
|
delimiter_stack.push_back(delimiter);
|
|
lexer->result_symbol = STRING_START;
|
|
return true;
|
|
} else if (has_flags) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
vector<uint16_t> indent_length_stack;
|
|
vector<Delimiter> delimiter_stack;
|
|
};
|
|
|
|
}
|
|
|
|
extern "C" {
|
|
|
|
void *tree_sitter_python_external_scanner_create() {
|
|
return new Scanner();
|
|
}
|
|
|
|
bool tree_sitter_python_external_scanner_scan(void *payload, TSLexer *lexer,
|
|
const bool *valid_symbols) {
|
|
Scanner *scanner = static_cast<Scanner *>(payload);
|
|
return scanner->scan(lexer, valid_symbols);
|
|
}
|
|
|
|
unsigned tree_sitter_python_external_scanner_serialize(void *payload, char *buffer) {
|
|
Scanner *scanner = static_cast<Scanner *>(payload);
|
|
return scanner->serialize(buffer);
|
|
}
|
|
|
|
void tree_sitter_python_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
|
|
Scanner *scanner = static_cast<Scanner *>(payload);
|
|
scanner->deserialize(buffer, length);
|
|
}
|
|
|
|
// Destroys the Scanner stored in `payload` (the counterpart of
// tree_sitter_python_external_scanner_create).
void tree_sitter_python_external_scanner_destroy(void *payload) {
  delete static_cast<Scanner *>(payload);
}
|
|
|
|
}
|