Files
codeql/python/extractor/tsg-python/tsp/src/scanner.cc
Taus 1e51703ce9 Python: Allow escaped quotes/backslashes in raw strings
Quoting the Python documentation (last paragraph of
https://docs.python.org/3/reference/lexical_analysis.html#escape-sequences):

"Even in a raw literal, quotes can be escaped with a backslash, but the
backslash remains in the result; for example, r"\"" is a valid string
literal consisting of two characters: a backslash and a double quote;
r"\" is not a valid string literal (even a raw string cannot end in an
odd number of backslashes)."

We did not handle this correctly in the scanner, as we only consumed the
backslash but not the following single or double quote, resulting in
that character getting interpreted as the end of the string.

To fix this, we do a second lookahead after consuming the backslash, and
if the next character is the end character for the string, we advance
the lexer across it as well.

Similarly, backslashes in raw strings can escape other backslashes.
Thus, for a raw string like r'\\' we must consume the second backslash,
otherwise we'll interpret it as escaping the end quote.
2024-10-28 14:40:24 +00:00

419 lines
11 KiB
C++

#include <cassert>
#include <cstring>
#include <cwctype>
#include <stdio.h>
#include <tree_sitter/parser.h>
#include <vector>
namespace {
using std::vector;
using std::iswspace;
using std::memcpy;
// Kinds of token this external scanner can produce. The enumerator values are
// used as indices into the `valid_symbols` array handed to Scanner::scan() and
// are written to `lexer->result_symbol`.
// NOTE(review): the order presumably has to match the grammar's list of
// external tokens - confirm before reordering.
enum TokenType {
  NEWLINE = 0,         // end of a logical line
  INDENT = 1,          // indentation grew relative to the enclosing block
  DEDENT = 2,          // indentation shrank relative to the enclosing block
  STRING_START = 3,    // optional prefix letters plus the opening quote(s)
  STRING_CONTENT = 4,  // a run of literal text inside a string
  STRING_END = 5,      // the closing quote(s)
};
// Describes the string literal that is currently open: which quote character
// closes it and which prefix/shape modifiers apply. Everything is packed into
// the single `flags` byte so that a stack of delimiters can be copied around as
// raw bytes.
struct Delimiter {
  // Bit masks stored in `flags`.
  enum {
    SingleQuote = 0x01,
    DoubleQuote = 0x02,
    BackQuote = 0x04,
    Raw = 0x08,
    Format = 0x10,
    Triple = 0x20,
    Bytes = 0x40,
  };

  // A fresh delimiter has no quote character and no modifiers.
  Delimiter() : flags(0) {}

  // --- Modifier queries ---------------------------------------------------

  bool is_format() const { return (flags & Format) != 0; }

  bool is_raw() const { return (flags & Raw) != 0; }

  bool is_triple() const { return (flags & Triple) != 0; }

  bool is_bytes() const { return (flags & Bytes) != 0; }

  // Returns the character that closes the string, or 0 when no quote kind has
  // been recorded yet. If several quote bits are set, single quote wins over
  // double quote, which wins over back quote.
  int32_t end_character() const {
    return (flags & SingleQuote) ? '\''
         : (flags & DoubleQuote) ? '"'
         : (flags & BackQuote)   ? '`'
                                 : 0;
  }

  // --- Modifier setters (bits are only ever added, never cleared) ----------

  void set_format() { flags = static_cast<char>(flags | Format); }

  void set_raw() { flags = static_cast<char>(flags | Raw); }

  void set_triple() { flags = static_cast<char>(flags | Triple); }

  void set_bytes() { flags = static_cast<char>(flags | Bytes); }

  // Records which quote character closes the string. Only ', " and ` are
  // accepted; anything else trips the assertion.
  void set_end_character(int32_t character) {
    if (character == '\'') {
      flags = static_cast<char>(flags | SingleQuote);
    } else if (character == '"') {
      flags = static_cast<char>(flags | DoubleQuote);
    } else if (character == '`') {
      flags = static_cast<char>(flags | BackQuote);
    } else {
      assert(false);
    }
  }

  // Bit set built from the masks above.
  char flags;
};
// State of the external scanner.
//
// Two stacks are carried from one scan() call to the next:
//   * indent_length_stack - indentation widths of the currently open blocks;
//     it always starts with a 0 entry.
//   * delimiter_stack     - delimiters of the string literals that have been
//     opened (STRING_START) but not yet closed (STRING_END).
//
// serialize()/deserialize() flatten both stacks into a byte buffer.
struct Scanner {
  Scanner() {
    // The delimiter stack is memcpy'd to and from the serialization buffer,
    // which only works if a Delimiter is exactly one byte.
    assert(sizeof(Delimiter) == sizeof(char));
    deserialize(NULL, 0);
  }

  // Writes the scanner state into `buffer` and returns the number of bytes
  // written.
  //
  // Layout: [delimiter count][delimiter bytes...][indent bytes...]
  //   * the delimiter count is clamped to UINT8_MAX;
  //   * the first indent entry is not written (deserialize() re-creates it
  //     as 0);
  //   * each indent width is narrowed to a single char, and writing stops once
  //     TREE_SITTER_SERIALIZATION_BUFFER_SIZE bytes have been produced.
  unsigned serialize(char *buffer) {
    size_t i = 0;

    size_t delimiter_count = delimiter_stack.size();
    if (delimiter_count > UINT8_MAX) delimiter_count = UINT8_MAX;
    buffer[i++] = delimiter_count;

    if (delimiter_count > 0) {
      memcpy(&buffer[i], delimiter_stack.data(), delimiter_count);
    }
    i += delimiter_count;

    vector<uint16_t>::iterator
      iter = indent_length_stack.begin() + 1,
      end = indent_length_stack.end();

    for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {
      buffer[i++] = *iter;
    }

    return i;
  }

  // Restores the scanner state from the first `length` bytes of `buffer`
  // (layout as produced by serialize()). With `length == 0` the scanner is
  // reset: no open strings, and a single indent entry of 0.
  void deserialize(const char *buffer, unsigned length) {
    delimiter_stack.clear();
    indent_length_stack.clear();
    indent_length_stack.push_back(0);

    if (length > 0) {
      size_t i = 0;

      size_t delimiter_count = static_cast<uint8_t>(buffer[i++]);
      delimiter_stack.resize(delimiter_count);
      if (delimiter_count > 0) {
        memcpy(delimiter_stack.data(), &buffer[i], delimiter_count);
      }
      i += delimiter_count;

      for (; i < length; i++) {
        // Read the byte as unsigned. Plain `char` may be signed, in which case
        // widening it directly to uint16_t would sign-extend: an indent of 200
        // would come back as 65480 and corrupt the indent stack.
        indent_length_stack.push_back(static_cast<uint8_t>(buffer[i]));
      }
    }
  }

  // Consumes the lookahead character as part of the current token.
  void advance(TSLexer *lexer) {
    lexer->advance(lexer, false);
  }

  // Consumes the lookahead character via the lexer's "skip" mode (second
  // argument `true`), i.e. without making it part of the token.
  void skip(TSLexer *lexer) {
    lexer->advance(lexer, true);
  }

  // Tries to recognise one external token at the current position.
  //
  // `valid_symbols` is indexed by TokenType and tells which tokens the caller
  // is willing to accept. On success `lexer->result_symbol` is set and true is
  // returned; otherwise false is returned.
  //
  // The function works in three phases:
  //   1. inside an open string: STRING_CONTENT / STRING_END;
  //   2. whitespace, comments and line continuations: NEWLINE / INDENT /
  //      DEDENT;
  //   3. STRING_START (prefix letters plus opening quotes).
  bool scan(TSLexer *lexer, const bool *valid_symbols) {
    // ---- Phase 1: content or end of the innermost open string -------------
    // NOTE(review): the `!valid_symbols[INDENT]` guard presumably filters out
    // error-recovery calls in which every symbol is marked valid - confirm.
    if (valid_symbols[STRING_CONTENT] && !valid_symbols[INDENT] && !delimiter_stack.empty()) {
      Delimiter delimiter = delimiter_stack.back();
      int32_t end_character = delimiter.end_character();
      // True once at least one character has been consumed through the
      // ordinary path at the bottom of the loop (or the bytes \N, \u, \U path).
      bool has_content = false;

      while (lexer->lookahead) {
        if ((lexer->lookahead == '{' || lexer->lookahead == '}') && delimiter.is_format()) {
          // Braces in a format string end the current run of content; report
          // the content gathered so far, if any.
          lexer->mark_end(lexer);
          lexer->result_symbol = STRING_CONTENT;
          return has_content;
        } else if (lexer->lookahead == '\\') {
          if (delimiter.is_raw()) {
            lexer->advance(lexer, false);
            // In raw strings, backslashes _can_ escape the same kind of quotes as the outer
            // string, so we must take care to traverse any such escaped quotes now. If we don't do
            // this, we will mistakenly consider the string to end at that escaped quote.
            // Likewise, this also extends to escaped backslashes.
            if (lexer->lookahead == end_character || lexer->lookahead == '\\') {
              lexer->advance(lexer, false);
            }
            // Newlines after backslashes also cause issues, so we explicitly step over them here.
            if (lexer->lookahead == '\r') {
              lexer->advance(lexer, false);
              if (lexer->lookahead == '\n') {
                lexer->advance(lexer, false);
              }
            } else if (lexer->lookahead == '\n') {
              lexer->advance(lexer, false);
            }
            // Note: this path deliberately leaves has_content untouched.
            continue;
          } else if (delimiter.is_bytes()) {
            lexer->mark_end(lexer);
            lexer->advance(lexer, false);
            if (lexer->lookahead == 'N' || lexer->lookahead == 'u' || lexer->lookahead == 'U') {
              // In bytes string, \N{...}, \uXXXX and \UXXXXXXXX are not escape sequences
              // https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
              lexer->advance(lexer, false);
              // The backslash and the letter are ordinary content. Go back to
              // the top of the loop so the *next* character is examined
              // properly; falling through to the unconditional advance below
              // would swallow it blindly, which loses the closing quote in
              // b"\u" and hides the second escape in b"\N\x41".
              has_content = true;
              continue;
            } else {
              // A real escape sequence: stop the content token in front of the
              // backslash (marked above).
              lexer->result_symbol = STRING_CONTENT;
              return has_content;
            }
          } else {
            // Escape sequence in an ordinary string: stop in front of it.
            lexer->mark_end(lexer);
            lexer->result_symbol = STRING_CONTENT;
            return has_content;
          }
        } else if (lexer->lookahead == end_character) {
          if (delimiter.is_triple()) {
            // Remember the position in front of the first quote, then look for
            // three quotes in a row.
            lexer->mark_end(lexer);
            lexer->advance(lexer, false);
            if (lexer->lookahead == end_character) {
              lexer->advance(lexer, false);
              if (lexer->lookahead == end_character) {
                if (has_content) {
                  // Emit the pending content first; it ends at the mark in
                  // front of the quotes.
                  lexer->result_symbol = STRING_CONTENT;
                } else {
                  // Nothing pending: consume the third quote and close the
                  // string.
                  lexer->advance(lexer, false);
                  lexer->mark_end(lexer);
                  delimiter_stack.pop_back();
                  lexer->result_symbol = STRING_END;
                }
                return true;
              } else {
                // Only two quotes: they are content.
                lexer->mark_end(lexer);
                lexer->result_symbol = STRING_CONTENT;
                return true;
              }
            } else {
              // A lone quote: it is content.
              lexer->mark_end(lexer);
              lexer->result_symbol = STRING_CONTENT;
              return true;
            }
          } else {
            if (has_content) {
              // Emit the pending content; the quote is left for the next call.
              lexer->result_symbol = STRING_CONTENT;
            } else {
              lexer->advance(lexer, false);
              delimiter_stack.pop_back();
              lexer->result_symbol = STRING_END;
            }
            lexer->mark_end(lexer);
            return true;
          }
        } else if (lexer->lookahead == '\n' && has_content && !delimiter.is_triple()) {
          // A newline inside a single-line string after some content: give up.
          return false;
        }
        advance(lexer);
        has_content = true;
      }
    }

    // ---- Phase 2: whitespace, comments, line continuations ----------------
    lexer->mark_end(lexer);

    bool found_end_of_line = false;
    uint32_t indent_length = 0;
    // Indentation of the first comment met while skipping, or -1 if none.
    int32_t first_comment_indent_length = -1;
    for (;;) {
      if (lexer->lookahead == '\n') {
        found_end_of_line = true;
        indent_length = 0;
        skip(lexer);
      } else if (lexer->lookahead == ' ') {
        indent_length++;
        skip(lexer);
      } else if (lexer->lookahead == '\r') {
        indent_length = 0;
        skip(lexer);
      } else if (lexer->lookahead == '\t') {
        // A tab counts as eight columns.
        indent_length += 8;
        skip(lexer);
      } else if (lexer->lookahead == '#') {
        if (first_comment_indent_length == -1) {
          first_comment_indent_length = (int32_t)indent_length;
        }
        // Skip the rest of the comment line, then the newline itself.
        while (lexer->lookahead && lexer->lookahead != '\n') {
          skip(lexer);
        }
        skip(lexer);
        indent_length = 0;
      } else if (lexer->lookahead == '\\') {
        // Line continuation: the backslash must be followed by a newline
        // (optionally preceded by a carriage return).
        skip(lexer);
        if (lexer->lookahead == '\r') {
          skip(lexer);
        }
        if (lexer->lookahead == '\n') {
          skip(lexer);
        } else {
          return false;
        }
      } else if (lexer->lookahead == '\f') {
        indent_length = 0;
        skip(lexer);
      } else if (lexer->lookahead == 0) {
        // End of input is treated like an end of line at column 0.
        indent_length = 0;
        found_end_of_line = true;
        break;
      } else {
        break;
      }
    }

    if (found_end_of_line) {
      if (!indent_length_stack.empty()) {
        uint16_t current_indent_length = indent_length_stack.back();

        if (
          valid_symbols[INDENT] &&
          indent_length > current_indent_length
        ) {
          indent_length_stack.push_back(indent_length);
          lexer->result_symbol = INDENT;
          return true;
        }

        if (
          valid_symbols[DEDENT] &&
          indent_length < current_indent_length &&
          // Wait to create a dedent token until we've consumed any comments
          // whose indentation matches the current block.
          first_comment_indent_length < (int32_t)current_indent_length
        ) {
          indent_length_stack.pop_back();
          lexer->result_symbol = DEDENT;
          return true;
        }
      }

      if (valid_symbols[NEWLINE]) {
        lexer->result_symbol = NEWLINE;
        return true;
      }
    }

    // ---- Phase 3: start of a string literal -------------------------------
    // Not attempted if a comment was skipped above.
    if (first_comment_indent_length == -1 && valid_symbols[STRING_START]) {
      Delimiter delimiter;

      // Prefix letters: f/F, r/R and b/B set a flag; u/U is accepted but has
      // no effect. Any other character ends the prefix.
      while (lexer->lookahead) {
        if (lexer->lookahead == 'f' || lexer->lookahead == 'F') {
          delimiter.set_format();
        } else if (lexer->lookahead == 'r' || lexer->lookahead == 'R') {
          delimiter.set_raw();
        } else if (lexer->lookahead == 'b' || lexer->lookahead == 'B') {
          delimiter.set_bytes();
        } else if (lexer->lookahead != 'u' && lexer->lookahead != 'U') {
          break;
        }
        advance(lexer);
      }

      if (lexer->lookahead == '`') {
        delimiter.set_end_character('`');
        advance(lexer);
        lexer->mark_end(lexer);
      } else if (lexer->lookahead == '\'') {
        delimiter.set_end_character('\'');
        advance(lexer);
        // Tentatively end the token after one quote; only extend it when two
        // more quotes follow. For an empty string ('') the second quote is
        // thus left to be scanned as STRING_END.
        lexer->mark_end(lexer);
        if (lexer->lookahead == '\'') {
          advance(lexer);
          if (lexer->lookahead == '\'') {
            advance(lexer);
            lexer->mark_end(lexer);
            delimiter.set_triple();
          }
        }
      } else if (lexer->lookahead == '"') {
        delimiter.set_end_character('"');
        advance(lexer);
        // Same tentative-mark logic as for single quotes.
        lexer->mark_end(lexer);
        if (lexer->lookahead == '"') {
          advance(lexer);
          if (lexer->lookahead == '"') {
            advance(lexer);
            lexer->mark_end(lexer);
            delimiter.set_triple();
          }
        }
      }

      if (delimiter.end_character()) {
        delimiter_stack.push_back(delimiter);
        lexer->result_symbol = STRING_START;
        return true;
      }
      // No opening quote (with or without prefix letters): not a string start.
    }

    return false;
  }

  // Indentation widths of the open blocks, innermost last.
  vector<uint16_t> indent_length_stack;
  // Delimiters of the open string literals, innermost last.
  vector<Delimiter> delimiter_stack;
};
}
extern "C" {
void *tree_sitter_python_external_scanner_create() {
return new Scanner();
}
bool tree_sitter_python_external_scanner_scan(void *payload, TSLexer *lexer,
const bool *valid_symbols) {
Scanner *scanner = static_cast<Scanner *>(payload);
return scanner->scan(lexer, valid_symbols);
}
unsigned tree_sitter_python_external_scanner_serialize(void *payload, char *buffer) {
Scanner *scanner = static_cast<Scanner *>(payload);
return scanner->serialize(buffer);
}
void tree_sitter_python_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
Scanner *scanner = static_cast<Scanner *>(payload);
scanner->deserialize(buffer, length);
}
// Destroys the Scanner stored behind `payload`, which was allocated by
// tree_sitter_python_external_scanner_create().
void tree_sitter_python_external_scanner_destroy(void *payload) {
  delete static_cast<Scanner *>(payload);
}
}