mirror of
https://github.com/github/codeql.git
synced 2026-02-13 05:31:22 +01:00
- Extends the scanner with a new token kind representing the start of a template string. This is used to distinguish template strings from regular strings (because only a template string will start with a `_template_string_start` external token). - Cleans up the logic surrounding interpolations (and the method names) so that format strings and template strings behave the same in this case. Finally, we add two new node types in the tree-sitter grammar: - `template_string` behaves like format strings, but is a distinct type (mainly so that an implicit concatenation between template strings and regular strings becomes a syntax error). - `concatenated_template_string` is the counterpart of `concatenated_string`. However, internally, the string parts of template strings are just the same `string_content` nodes that are used in regular format strings. We will disambiguate these inside `tsg-python`.
437 lines
12 KiB
C++
437 lines
12 KiB
C++
#include <cassert>
|
|
#include <cstring>
|
|
#include <cwctype>
|
|
#include <stdio.h>
|
|
#include <tree_sitter/parser.h>
|
|
#include <vector>
|
|
namespace {
|
|
|
|
using std::vector;
|
|
using std::iswspace;
|
|
using std::memcpy;
|
|
|
|
// Kinds of token this external scanner can hand back through
// `lexer->result_symbol`. The same values index the `valid_symbols` array
// passed to Scanner::scan.
// NOTE(review): presumably the order has to match the order of the external
// tokens declared in the grammar -- confirm before reordering.
enum TokenType {
  // Emitted once the end of a line (or the end of input) has been reached.
  NEWLINE = 0,
  // Emitted when a new line is indented deeper than the current block.
  INDENT = 1,
  // Emitted when a new line is indented less than the current block.
  DEDENT = 2,
  // Prefix characters plus opening quote(s) of a string without a t/T prefix.
  STRING_START = 3,
  // A run of literal text inside an open string.
  STRING_CONTENT = 4,
  // The closing quote(s) of the innermost open string.
  STRING_END = 5,
  // Prefix characters plus opening quote(s) of a string with a t/T prefix.
  TEMPLATE_STRING_START = 6,
};
|
|
|
|
// Describes one open string literal: which quote character closes it and
// which modifiers (raw, format, template, bytes, triple-quoted) apply.
//
// Everything is packed into a single byte of bit flags. Scanner copies
// Delimiter objects byte-for-byte when it saves and restores its state, so
// this struct must stay exactly one byte wide.
struct Delimiter {
  enum {
    SingleQuote = 1 << 0,
    DoubleQuote = 1 << 1,
    BackQuote = 1 << 2,
    Raw = 1 << 3,
    Format = 1 << 4,
    Triple = 1 << 5,
    Bytes = 1 << 6,
    Template = 1 << 7,
  };

  // A fresh delimiter has no quote character and no modifiers.
  Delimiter() : flags(0) {}

  // True when the string was opened with an f/F prefix.
  bool is_format() const {
    return (flags & Format) != 0;
  }

  // True when the string was opened with a t/T prefix.
  bool is_template() const {
    return (flags & Template) != 0;
  }

  // True when `{` and `}` interrupt the string content, i.e. for both format
  // strings and template strings.
  bool can_interpolate() const {
    return is_format() || is_template();
  }

  // True when the string was opened with an r/R prefix.
  bool is_raw() const {
    return (flags & Raw) != 0;
  }

  // True when the string is delimited by three quote characters.
  bool is_triple() const {
    return (flags & Triple) != 0;
  }

  // True when the string was opened with a b/B prefix.
  bool is_bytes() const {
    return (flags & Bytes) != 0;
  }

  // Returns the quote character that closes this string, or 0 when no quote
  // character has been recorded yet.
  int32_t end_character() const {
    if (flags & SingleQuote) return '\'';
    if (flags & DoubleQuote) return '"';
    if (flags & BackQuote) return '`';
    return 0;
  }

  void set_format() {
    flags |= Format;
  }

  void set_template() {
    flags |= Template;
  }

  void set_raw() {
    flags |= Raw;
  }

  void set_triple() {
    flags |= Triple;
  }

  void set_bytes() {
    flags |= Bytes;
  }

  // Records the closing quote character. Only ', " and ` are accepted; any
  // other value trips the assertion (and is ignored when assertions are
  // compiled out).
  void set_end_character(int32_t character) {
    switch (character) {
      case '\'':
        flags |= SingleQuote;
        break;
      case '"':
        flags |= DoubleQuote;
        break;
      case '`':
        flags |= BackQuote;
        break;
      default:
        assert(false);
    }
  }

  // Unsigned on purpose: Template occupies bit 7, which does not fit in a
  // signed char. With a plain `char` the stored value could become negative
  // on platforms where `char` is signed.
  unsigned char flags;
};
|
|
|
|
struct Scanner {
|
|
Scanner() {
|
|
assert(sizeof(Delimiter) == sizeof(char));
|
|
deserialize(NULL, 0);
|
|
}
|
|
|
|
unsigned serialize(char *buffer) {
|
|
size_t i = 0;
|
|
|
|
size_t delimiter_count = delimiter_stack.size();
|
|
if (delimiter_count > UINT8_MAX) delimiter_count = UINT8_MAX;
|
|
buffer[i++] = delimiter_count;
|
|
|
|
if (delimiter_count > 0) {
|
|
memcpy(&buffer[i], delimiter_stack.data(), delimiter_count);
|
|
}
|
|
i += delimiter_count;
|
|
|
|
vector<uint16_t>::iterator
|
|
iter = indent_length_stack.begin() + 1,
|
|
end = indent_length_stack.end();
|
|
|
|
for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {
|
|
buffer[i++] = *iter;
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
void deserialize(const char *buffer, unsigned length) {
|
|
delimiter_stack.clear();
|
|
indent_length_stack.clear();
|
|
indent_length_stack.push_back(0);
|
|
|
|
if (length > 0) {
|
|
size_t i = 0;
|
|
|
|
size_t delimiter_count = (uint8_t)buffer[i++];
|
|
delimiter_stack.resize(delimiter_count);
|
|
if (delimiter_count > 0) {
|
|
memcpy(delimiter_stack.data(), &buffer[i], delimiter_count);
|
|
}
|
|
i += delimiter_count;
|
|
|
|
for (; i < length; i++) {
|
|
indent_length_stack.push_back(buffer[i]);
|
|
}
|
|
}
|
|
}
|
|
|
|
void advance(TSLexer *lexer) {
|
|
lexer->advance(lexer, false);
|
|
}
|
|
|
|
void skip(TSLexer *lexer) {
|
|
lexer->advance(lexer, true);
|
|
}
|
|
|
|
bool scan(TSLexer *lexer, const bool *valid_symbols) {
|
|
if (valid_symbols[STRING_CONTENT] && !valid_symbols[INDENT] && !delimiter_stack.empty()) {
|
|
Delimiter delimiter = delimiter_stack.back();
|
|
int32_t end_character = delimiter.end_character();
|
|
bool has_content = false;
|
|
while (lexer->lookahead) {
|
|
if ((lexer->lookahead == '{' || lexer->lookahead == '}') && delimiter.can_interpolate()) {
|
|
lexer->mark_end(lexer);
|
|
lexer->result_symbol = STRING_CONTENT;
|
|
return has_content;
|
|
} else if (lexer->lookahead == '\\') {
|
|
if (delimiter.is_raw()) {
|
|
lexer->advance(lexer, false);
|
|
// In raw strings, backslashes _can_ escape the same kind of quotes as the outer
|
|
// string, so we must take care to traverse any such escaped quotes now. If we don't do
|
|
// this, we will mistakenly consider the string to end at that escaped quote.
|
|
// Likewise, this also extends to escaped backslashes.
|
|
if (lexer->lookahead == end_character || lexer->lookahead == '\\') {
|
|
lexer->advance(lexer, false);
|
|
}
|
|
// Newlines after backslashes also cause issues, so we explicitly step over them here.
|
|
if (lexer->lookahead == '\r') {
|
|
lexer->advance(lexer, false);
|
|
if (lexer->lookahead == '\n') {
|
|
lexer->advance(lexer, false);
|
|
}
|
|
} else if (lexer->lookahead == '\n') {
|
|
lexer->advance(lexer, false);
|
|
}
|
|
continue;
|
|
} else if (delimiter.is_bytes()) {
|
|
lexer->mark_end(lexer);
|
|
lexer->advance(lexer, false);
|
|
if (lexer->lookahead == 'N' || lexer->lookahead == 'u' || lexer->lookahead == 'U') {
|
|
// In bytes string, \N{...}, \uXXXX and \UXXXXXXXX are not escape sequences
|
|
// https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
|
|
lexer->advance(lexer, false);
|
|
} else {
|
|
lexer->result_symbol = STRING_CONTENT;
|
|
return has_content;
|
|
}
|
|
} else {
|
|
lexer->mark_end(lexer);
|
|
lexer->result_symbol = STRING_CONTENT;
|
|
return has_content;
|
|
}
|
|
} else if (lexer->lookahead == end_character) {
|
|
if (delimiter.is_triple()) {
|
|
lexer->mark_end(lexer);
|
|
lexer->advance(lexer, false);
|
|
if (lexer->lookahead == end_character) {
|
|
lexer->advance(lexer, false);
|
|
if (lexer->lookahead == end_character) {
|
|
if (has_content) {
|
|
lexer->result_symbol = STRING_CONTENT;
|
|
} else {
|
|
lexer->advance(lexer, false);
|
|
lexer->mark_end(lexer);
|
|
delimiter_stack.pop_back();
|
|
lexer->result_symbol = STRING_END;
|
|
}
|
|
return true;
|
|
} else {
|
|
lexer->mark_end(lexer);
|
|
lexer->result_symbol = STRING_CONTENT;
|
|
return true;
|
|
}
|
|
} else {
|
|
lexer->mark_end(lexer);
|
|
lexer->result_symbol = STRING_CONTENT;
|
|
return true;
|
|
}
|
|
} else {
|
|
if (has_content) {
|
|
lexer->result_symbol = STRING_CONTENT;
|
|
} else {
|
|
lexer->advance(lexer, false);
|
|
delimiter_stack.pop_back();
|
|
lexer->result_symbol = STRING_END;
|
|
}
|
|
lexer->mark_end(lexer);
|
|
return true;
|
|
}
|
|
} else if (lexer->lookahead == '\n' && has_content && !delimiter.is_triple()) {
|
|
return false;
|
|
}
|
|
advance(lexer);
|
|
has_content = true;
|
|
}
|
|
}
|
|
|
|
lexer->mark_end(lexer);
|
|
|
|
bool found_end_of_line = false;
|
|
uint32_t indent_length = 0;
|
|
int32_t first_comment_indent_length = -1;
|
|
for (;;) {
|
|
if (lexer->lookahead == '\n') {
|
|
found_end_of_line = true;
|
|
indent_length = 0;
|
|
skip(lexer);
|
|
} else if (lexer->lookahead == ' ') {
|
|
indent_length++;
|
|
skip(lexer);
|
|
} else if (lexer->lookahead == '\r') {
|
|
indent_length = 0;
|
|
skip(lexer);
|
|
} else if (lexer->lookahead == '\t') {
|
|
indent_length += 8;
|
|
skip(lexer);
|
|
} else if (lexer->lookahead == '#') {
|
|
if (first_comment_indent_length == -1) {
|
|
first_comment_indent_length = (int32_t)indent_length;
|
|
}
|
|
while (lexer->lookahead && lexer->lookahead != '\n') {
|
|
skip(lexer);
|
|
}
|
|
skip(lexer);
|
|
indent_length = 0;
|
|
} else if (lexer->lookahead == '\\') {
|
|
skip(lexer);
|
|
if (lexer->lookahead == '\r') {
|
|
skip(lexer);
|
|
}
|
|
if (lexer->lookahead == '\n') {
|
|
skip(lexer);
|
|
} else {
|
|
return false;
|
|
}
|
|
} else if (lexer->lookahead == '\f') {
|
|
indent_length = 0;
|
|
skip(lexer);
|
|
} else if (lexer->lookahead == 0) {
|
|
indent_length = 0;
|
|
found_end_of_line = true;
|
|
break;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (found_end_of_line) {
|
|
if (!indent_length_stack.empty()) {
|
|
uint16_t current_indent_length = indent_length_stack.back();
|
|
|
|
if (
|
|
valid_symbols[INDENT] &&
|
|
indent_length > current_indent_length
|
|
) {
|
|
indent_length_stack.push_back(indent_length);
|
|
lexer->result_symbol = INDENT;
|
|
return true;
|
|
}
|
|
|
|
if (
|
|
valid_symbols[DEDENT] &&
|
|
indent_length < current_indent_length &&
|
|
|
|
// Wait to create a dedent token until we've consumed any comments
|
|
// whose indentation matches the current block.
|
|
first_comment_indent_length < (int32_t)current_indent_length
|
|
) {
|
|
indent_length_stack.pop_back();
|
|
lexer->result_symbol = DEDENT;
|
|
return true;
|
|
}
|
|
}
|
|
|
|
if (valid_symbols[NEWLINE]) {
|
|
lexer->result_symbol = NEWLINE;
|
|
return true;
|
|
}
|
|
}
|
|
|
|
bool expects_string_start = valid_symbols[STRING_START] || valid_symbols[TEMPLATE_STRING_START];
|
|
|
|
if (first_comment_indent_length == -1 && expects_string_start) {
|
|
Delimiter delimiter;
|
|
|
|
bool has_flags = false;
|
|
while (lexer->lookahead) {
|
|
if (lexer->lookahead == 'f' || lexer->lookahead == 'F') {
|
|
delimiter.set_format();
|
|
} else if (lexer->lookahead == 't' || lexer->lookahead == 'T') {
|
|
delimiter.set_template();
|
|
} else if (lexer->lookahead == 'r' || lexer->lookahead == 'R') {
|
|
delimiter.set_raw();
|
|
} else if (lexer->lookahead == 'b' || lexer->lookahead == 'B') {
|
|
delimiter.set_bytes();
|
|
} else if (lexer->lookahead != 'u' && lexer->lookahead != 'U') {
|
|
break;
|
|
}
|
|
has_flags = true;
|
|
advance(lexer);
|
|
}
|
|
|
|
if (lexer->lookahead == '`') {
|
|
delimiter.set_end_character('`');
|
|
advance(lexer);
|
|
lexer->mark_end(lexer);
|
|
} else if (lexer->lookahead == '\'') {
|
|
delimiter.set_end_character('\'');
|
|
advance(lexer);
|
|
lexer->mark_end(lexer);
|
|
if (lexer->lookahead == '\'') {
|
|
advance(lexer);
|
|
if (lexer->lookahead == '\'') {
|
|
advance(lexer);
|
|
lexer->mark_end(lexer);
|
|
delimiter.set_triple();
|
|
}
|
|
}
|
|
} else if (lexer->lookahead == '"') {
|
|
delimiter.set_end_character('"');
|
|
advance(lexer);
|
|
lexer->mark_end(lexer);
|
|
if (lexer->lookahead == '"') {
|
|
advance(lexer);
|
|
if (lexer->lookahead == '"') {
|
|
advance(lexer);
|
|
lexer->mark_end(lexer);
|
|
delimiter.set_triple();
|
|
}
|
|
}
|
|
}
|
|
|
|
if (delimiter.end_character()) {
|
|
delimiter_stack.push_back(delimiter);
|
|
lexer->result_symbol = delimiter.is_template() ? TEMPLATE_STRING_START : STRING_START;
|
|
return true;
|
|
} else if (has_flags) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
vector<uint16_t> indent_length_stack;
|
|
vector<Delimiter> delimiter_stack;
|
|
};
|
|
|
|
}
|
|
|
|
extern "C" {
|
|
|
|
void *tree_sitter_python_external_scanner_create() {
|
|
return new Scanner();
|
|
}
|
|
|
|
bool tree_sitter_python_external_scanner_scan(void *payload, TSLexer *lexer,
|
|
const bool *valid_symbols) {
|
|
Scanner *scanner = static_cast<Scanner *>(payload);
|
|
return scanner->scan(lexer, valid_symbols);
|
|
}
|
|
|
|
unsigned tree_sitter_python_external_scanner_serialize(void *payload, char *buffer) {
|
|
Scanner *scanner = static_cast<Scanner *>(payload);
|
|
return scanner->serialize(buffer);
|
|
}
|
|
|
|
void tree_sitter_python_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
|
|
Scanner *scanner = static_cast<Scanner *>(payload);
|
|
scanner->deserialize(buffer, length);
|
|
}
|
|
|
|
// C-linkage hook: treats `payload` as a Scanner and deletes it.
void tree_sitter_python_external_scanner_destroy(void *payload) {
  delete static_cast<Scanner *>(payload);
}
|
|
|
|
}
|