mirror of
https://github.com/github/codeql.git
synced 2026-04-08 00:24:03 +02:00
Python: Copy Python extractor to codeql repo
This commit is contained in:
7
python/extractor/tsg-python/tree-sitter-python/.gitignore
vendored
Normal file
7
python/extractor/tsg-python/tree-sitter-python/.gitignore
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
Cargo.lock
|
||||
package-lock.json
|
||||
node_modules
|
||||
build
|
||||
*.log
|
||||
/examples/*/
|
||||
/target/
|
||||
@@ -0,0 +1,6 @@
|
||||
corpus
|
||||
examples
|
||||
build
|
||||
script
|
||||
target
|
||||
bindings/rust
|
||||
38
python/extractor/tsg-python/tree-sitter-python/BUILD.bazel
Normal file
38
python/extractor/tsg-python/tree-sitter-python/BUILD.bazel
Normal file
@@ -0,0 +1,38 @@
|
||||
load("@rules_rust//cargo:defs.bzl", "cargo_build_script")
|
||||
load("@rules_rust//rust:defs.bzl", "rust_library")
|
||||
load("@tsg_python_crate_index//:defs.bzl", "aliases", "all_crate_deps")
|
||||
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
# This will run the build script from the root of the workspace, and
|
||||
# collect the outputs.
|
||||
cargo_build_script(
|
||||
name = "tsg-build-script",
|
||||
srcs = ["bindings/rust/build.rs"],
|
||||
data = glob([
|
||||
"src/**",
|
||||
]),
|
||||
deps = all_crate_deps(
|
||||
build = True,
|
||||
),
|
||||
)
|
||||
|
||||
rust_library(
|
||||
name = "tree-sitter-python",
|
||||
srcs = [
|
||||
"bindings/rust/lib.rs",
|
||||
],
|
||||
aliases = aliases(),
|
||||
compile_data = glob([
|
||||
"src/**",
|
||||
"queries/**",
|
||||
]) + [
|
||||
"grammar.js",
|
||||
],
|
||||
proc_macro_deps = all_crate_deps(
|
||||
proc_macro = True,
|
||||
),
|
||||
deps = [":tsg-build-script"] + all_crate_deps(
|
||||
normal = True,
|
||||
),
|
||||
)
|
||||
31
python/extractor/tsg-python/tree-sitter-python/Cargo.toml
Normal file
31
python/extractor/tsg-python/tree-sitter-python/Cargo.toml
Normal file
@@ -0,0 +1,31 @@
|
||||
[package]
|
||||
name = "tree-sitter-python"
|
||||
description = "Python grammar for the tree-sitter parsing library"
|
||||
version = "0.19.0"
|
||||
authors = [
|
||||
"Max Brunsfeld <maxbrunsfeld@gmail.com>",
|
||||
"Douglas Creager <dcreager@dcreager.net>",
|
||||
]
|
||||
license = "MIT"
|
||||
readme = "bindings/rust/README.md"
|
||||
keywords = ["incremental", "parsing", "python"]
|
||||
categories = ["parsing", "text-editors"]
|
||||
repository = "https://github.com/tree-sitter/tree-sitter-python"
|
||||
edition = "2018"
|
||||
|
||||
build = "bindings/rust/build.rs"
|
||||
include = [
|
||||
"bindings/rust/*",
|
||||
"grammar.js",
|
||||
"queries/*",
|
||||
"src/*",
|
||||
]
|
||||
|
||||
[lib]
|
||||
path = "bindings/rust/lib.rs"
|
||||
|
||||
[dependencies]
|
||||
tree-sitter = ">= 0.20, < 0.21"
|
||||
|
||||
[build-dependencies]
|
||||
cc = "1.0"
|
||||
21
python/extractor/tsg-python/tree-sitter-python/LICENSE
Normal file
21
python/extractor/tsg-python/tree-sitter-python/LICENSE
Normal file
@@ -0,0 +1,21 @@
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2016 Max Brunsfeld
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
13
python/extractor/tsg-python/tree-sitter-python/README.md
Normal file
13
python/extractor/tsg-python/tree-sitter-python/README.md
Normal file
@@ -0,0 +1,13 @@
|
||||
tree-sitter-python
|
||||
==================
|
||||
|
||||
[](https://github.com/tree-sitter/tree-sitter-python/actions/workflows/ci.yml)
|
||||
|
||||
Python grammar for [tree-sitter][].
|
||||
|
||||
[tree-sitter]: https://github.com/tree-sitter/tree-sitter
|
||||
|
||||
#### References
|
||||
|
||||
* [Python 2 Grammar](https://docs.python.org/2/reference/grammar.html)
|
||||
* [Python 3 Grammar](https://docs.python.org/3/reference/grammar.html)
|
||||
19
python/extractor/tsg-python/tree-sitter-python/binding.gyp
Normal file
19
python/extractor/tsg-python/tree-sitter-python/binding.gyp
Normal file
@@ -0,0 +1,19 @@
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"target_name": "tree_sitter_python_binding",
|
||||
"include_dirs": [
|
||||
"<!(node -e \"require('nan')\")",
|
||||
"src"
|
||||
],
|
||||
"sources": [
|
||||
"src/parser.c",
|
||||
"bindings/node/binding.cc",
|
||||
"src/scanner.cc"
|
||||
],
|
||||
"cflags_c": [
|
||||
"-std=c99",
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
#include "tree_sitter/parser.h"
|
||||
#include <node.h>
|
||||
#include "nan.h"
|
||||
|
||||
using namespace v8;
|
||||
|
||||
extern "C" TSLanguage * tree_sitter_python();
|
||||
|
||||
namespace {
|
||||
|
||||
NAN_METHOD(New) {}
|
||||
|
||||
void Init(Local<Object> exports, Local<Object> module) {
|
||||
Local<FunctionTemplate> tpl = Nan::New<FunctionTemplate>(New);
|
||||
tpl->SetClassName(Nan::New("Language").ToLocalChecked());
|
||||
tpl->InstanceTemplate()->SetInternalFieldCount(1);
|
||||
|
||||
Local<Function> constructor = Nan::GetFunction(tpl).ToLocalChecked();
|
||||
Local<Object> instance = constructor->NewInstance(Nan::GetCurrentContext()).ToLocalChecked();
|
||||
Nan::SetInternalFieldPointer(instance, 0, tree_sitter_python());
|
||||
|
||||
Nan::Set(instance, Nan::New("name").ToLocalChecked(), Nan::New("python").ToLocalChecked());
|
||||
Nan::Set(module, Nan::New("exports").ToLocalChecked(), instance);
|
||||
}
|
||||
|
||||
NODE_MODULE(tree_sitter_python_binding, Init)
|
||||
|
||||
} // namespace
|
||||
@@ -0,0 +1,19 @@
|
||||
try {
|
||||
module.exports = require("../../build/Release/tree_sitter_python_binding");
|
||||
} catch (error1) {
|
||||
if (error1.code !== 'MODULE_NOT_FOUND') {
|
||||
throw error1;
|
||||
}
|
||||
try {
|
||||
module.exports = require("../../build/Debug/tree_sitter_python_binding");
|
||||
} catch (error2) {
|
||||
if (error2.code !== 'MODULE_NOT_FOUND') {
|
||||
throw error2;
|
||||
}
|
||||
throw error1
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
module.exports.nodeTypeInfo = require("../../src/node-types.json");
|
||||
} catch (_) {}
|
||||
@@ -0,0 +1,36 @@
|
||||
# tree-sitter-python
|
||||
|
||||
This crate provides a Python grammar for the [tree-sitter][] parsing library.
|
||||
To use this crate, add it to the `[dependencies]` section of your `Cargo.toml`
|
||||
file. (Note that you will probably also need to depend on the
|
||||
[`tree-sitter`][tree-sitter crate] crate to use the parsed result in any useful
|
||||
way.)
|
||||
|
||||
``` toml
|
||||
[dependencies]
|
||||
tree-sitter = "0.17"
|
||||
tree-sitter-python = "0.17"
|
||||
```
|
||||
|
||||
Typically, you will use the [language][language func] function to add this
|
||||
grammar to a tree-sitter [Parser][], and then use the parser to parse some code:
|
||||
|
||||
``` rust
|
||||
let code = r#"
|
||||
def double(x):
|
||||
return x * 2
|
||||
"#;
|
||||
let mut parser = Parser::new();
|
||||
parser.set_language(tree_sitter_python::language()).expect("Error loading Python grammar");
|
||||
let parsed = parser.parse(code, None);
|
||||
```
|
||||
|
||||
If you have any questions, please reach out to us in the [tree-sitter
|
||||
discussions] page.
|
||||
|
||||
[Language]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Language.html
|
||||
[language func]: https://docs.rs/tree-sitter-python/*/tree_sitter_python/fn.language.html
|
||||
[Parser]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Parser.html
|
||||
[tree-sitter]: https://tree-sitter.github.io/
|
||||
[tree-sitter crate]: https://crates.io/crates/tree-sitter
|
||||
[tree-sitter discussions]: https://github.com/tree-sitter/tree-sitter/discussions
|
||||
@@ -0,0 +1,28 @@
|
||||
use std::path::Path;
|
||||
extern crate cc;
|
||||
|
||||
fn main() {
|
||||
let src_dir = Path::new("src");
|
||||
|
||||
let mut c_config = cc::Build::new();
|
||||
c_config.include(&src_dir);
|
||||
c_config
|
||||
.flag_if_supported("-Wno-unused-parameter")
|
||||
.flag_if_supported("-Wno-unused-but-set-variable")
|
||||
.flag_if_supported("-Wno-trigraphs");
|
||||
let parser_path = src_dir.join("parser.c");
|
||||
c_config.file(&parser_path);
|
||||
println!("cargo:rerun-if-changed={}", parser_path.to_str().unwrap());
|
||||
c_config.compile("parser");
|
||||
|
||||
let mut cpp_config = cc::Build::new();
|
||||
cpp_config.cpp(true);
|
||||
cpp_config.include(&src_dir);
|
||||
cpp_config
|
||||
.flag_if_supported("-Wno-unused-parameter")
|
||||
.flag_if_supported("-Wno-unused-but-set-variable");
|
||||
let scanner_path = src_dir.join("scanner.cc");
|
||||
cpp_config.file(&scanner_path);
|
||||
println!("cargo:rerun-if-changed={}", scanner_path.to_str().unwrap());
|
||||
cpp_config.compile("scanner");
|
||||
}
|
||||
@@ -0,0 +1,68 @@
|
||||
// -*- coding: utf-8 -*-
|
||||
// ------------------------------------------------------------------------------------------------
|
||||
// Copyright © 2020, tree-sitter-python authors.
|
||||
// See the LICENSE file in this repo for license details.
|
||||
// ------------------------------------------------------------------------------------------------
|
||||
|
||||
//! This crate provides a Python grammar for the [tree-sitter][] parsing library.
|
||||
//!
|
||||
//! Typically, you will use the [language][language func] function to add this grammar to a
|
||||
//! tree-sitter [Parser][], and then use the parser to parse some code:
|
||||
//!
|
||||
//! ```
|
||||
//! use tree_sitter::Parser;
|
||||
//!
|
||||
//! let code = r#"
|
||||
//! def double(x):
|
||||
//! return x * 2
|
||||
//! "#;
|
||||
//! let mut parser = Parser::new();
|
||||
//! parser.set_language(tree_sitter_python::language()).expect("Error loading Python grammar");
|
||||
//! let parsed = parser.parse(code, None);
|
||||
//! # let parsed = parsed.unwrap();
|
||||
//! # let root = parsed.root_node();
|
||||
//! # assert!(!root.has_error());
|
||||
//! ```
|
||||
//!
|
||||
//! [Language]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Language.html
|
||||
//! [language func]: fn.language.html
|
||||
//! [Parser]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Parser.html
|
||||
//! [tree-sitter]: https://tree-sitter.github.io/
|
||||
|
||||
use tree_sitter::Language;
|
||||
|
||||
extern "C" {
|
||||
fn tree_sitter_python() -> Language;
|
||||
}
|
||||
|
||||
/// Returns the tree-sitter [Language][] for this grammar.
|
||||
///
|
||||
/// [Language]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Language.html
|
||||
pub fn language() -> Language {
|
||||
unsafe { tree_sitter_python() }
|
||||
}
|
||||
|
||||
/// The source of the Python tree-sitter grammar description.
|
||||
pub const GRAMMAR: &'static str = include_str!("../../grammar.js");
|
||||
|
||||
/// The syntax highlighting query for this language.
|
||||
pub const HIGHLIGHT_QUERY: &'static str = include_str!("../../queries/highlights.scm");
|
||||
|
||||
/// The content of the [`node-types.json`][] file for this grammar.
|
||||
///
|
||||
/// [`node-types.json`]: https://tree-sitter.github.io/tree-sitter/using-parsers#static-node-types
|
||||
pub const NODE_TYPES: &'static str = include_str!("../../src/node-types.json");
|
||||
|
||||
/// The symbol tagging query for this language.
|
||||
pub const TAGGING_QUERY: &'static str = include_str!("../../queries/tags.scm");
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
#[test]
|
||||
fn can_load_grammar() {
|
||||
let mut parser = tree_sitter::Parser::new();
|
||||
parser
|
||||
.set_language(super::language())
|
||||
.expect("Error loading Python grammar");
|
||||
}
|
||||
}
|
||||
1230
python/extractor/tsg-python/tree-sitter-python/grammar.js
Normal file
1230
python/extractor/tsg-python/tree-sitter-python/grammar.js
Normal file
File diff suppressed because it is too large
Load Diff
1687
python/extractor/tsg-python/tree-sitter-python/log.html
Normal file
1687
python/extractor/tsg-python/tree-sitter-python/log.html
Normal file
File diff suppressed because it is too large
Load Diff
33
python/extractor/tsg-python/tree-sitter-python/package.json
Normal file
33
python/extractor/tsg-python/tree-sitter-python/package.json
Normal file
@@ -0,0 +1,33 @@
|
||||
{
|
||||
"name": "tree-sitter-python",
|
||||
"version": "0.19.0",
|
||||
"description": "Python grammar for tree-sitter",
|
||||
"main": "bindings/node",
|
||||
"keywords": [
|
||||
"parser",
|
||||
"lexer"
|
||||
],
|
||||
"author": "Max Brunsfeld",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"nan": "^2.14.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"tree-sitter-cli": "^0.19.3"
|
||||
},
|
||||
"scripts": {
|
||||
"build": "tree-sitter generate && node-gyp build",
|
||||
"test": "tree-sitter test && script/parse-examples",
|
||||
"parse": "tree-sitter parse",
|
||||
"test-windows": "tree-sitter test"
|
||||
},
|
||||
"repository": "https://github.com/tree-sitter/tree-sitter-python",
|
||||
"tree-sitter": [
|
||||
{
|
||||
"scope": "source.python",
|
||||
"file-types": [
|
||||
"py"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,124 @@
|
||||
; Identifier naming conventions
|
||||
|
||||
((identifier) @constructor
|
||||
(#match? @constructor "^[A-Z]"))
|
||||
|
||||
((identifier) @constant
|
||||
(#match? @constant "^[A-Z][A-Z_]*$"))
|
||||
|
||||
; Builtin functions
|
||||
|
||||
((call
|
||||
function: (identifier) @function.builtin)
|
||||
(#match?
|
||||
@function.builtin
|
||||
"^(abs|all|any|ascii|bin|bool|breakpoint|bytearray|bytes|callable|chr|classmethod|compile|complex|delattr|dict|dir|divmod|enumerate|eval|exec|filter|float|format|frozenset|getattr|globals|hasattr|hash|help|hex|id|input|int|isinstance|issubclass|iter|len|list|locals|map|max|memoryview|min|next|object|oct|open|ord|pow|print|property|range|repr|reversed|round|set|setattr|slice|sorted|staticmethod|str|sum|super|tuple|type|vars|zip|__import__)$"))
|
||||
|
||||
; Function calls
|
||||
|
||||
(decorator) @function
|
||||
|
||||
(call
|
||||
function: (attribute attribute: (identifier) @function.method))
|
||||
(call
|
||||
function: (identifier) @function)
|
||||
|
||||
; Function definitions
|
||||
|
||||
(function_definition
|
||||
name: (identifier) @function)
|
||||
|
||||
(identifier) @variable
|
||||
(attribute attribute: (identifier) @property)
|
||||
(type (identifier) @type)
|
||||
|
||||
; Literals
|
||||
|
||||
[
|
||||
(none)
|
||||
(true)
|
||||
(false)
|
||||
] @constant.builtin
|
||||
|
||||
[
|
||||
(integer)
|
||||
(float)
|
||||
] @number
|
||||
|
||||
(comment) @comment
|
||||
(string) @string
|
||||
(escape_sequence) @escape
|
||||
|
||||
(interpolation
|
||||
"{" @punctuation.special
|
||||
"}" @punctuation.special) @embedded
|
||||
|
||||
[
|
||||
"-"
|
||||
"-="
|
||||
"!="
|
||||
"*"
|
||||
"**"
|
||||
"**="
|
||||
"*="
|
||||
"/"
|
||||
"//"
|
||||
"//="
|
||||
"/="
|
||||
"&"
|
||||
"%"
|
||||
"%="
|
||||
"^"
|
||||
"+"
|
||||
"->"
|
||||
"+="
|
||||
"<"
|
||||
"<<"
|
||||
"<="
|
||||
"<>"
|
||||
"="
|
||||
":="
|
||||
"=="
|
||||
">"
|
||||
">="
|
||||
">>"
|
||||
"|"
|
||||
"~"
|
||||
"and"
|
||||
"in"
|
||||
"is"
|
||||
"not"
|
||||
"or"
|
||||
] @operator
|
||||
|
||||
[
|
||||
"as"
|
||||
"assert"
|
||||
"async"
|
||||
"await"
|
||||
"break"
|
||||
"class"
|
||||
"continue"
|
||||
"def"
|
||||
"del"
|
||||
"elif"
|
||||
"else"
|
||||
"except"
|
||||
"exec"
|
||||
"finally"
|
||||
"for"
|
||||
"from"
|
||||
"global"
|
||||
"if"
|
||||
"import"
|
||||
"lambda"
|
||||
"nonlocal"
|
||||
"pass"
|
||||
"print"
|
||||
"raise"
|
||||
"return"
|
||||
"try"
|
||||
"while"
|
||||
"with"
|
||||
"yield"
|
||||
] @keyword
|
||||
@@ -0,0 +1,12 @@
|
||||
(class_definition
|
||||
name: (identifier) @name) @definition.class
|
||||
|
||||
(function_definition
|
||||
name: (identifier) @name) @definition.function
|
||||
|
||||
(call
|
||||
function: [
|
||||
(identifier) @name
|
||||
(attribute
|
||||
attribute: (identifier) @name)
|
||||
]) @reference.call
|
||||
6615
python/extractor/tsg-python/tree-sitter-python/src/grammar.json
Normal file
6615
python/extractor/tsg-python/tree-sitter-python/src/grammar.json
Normal file
File diff suppressed because it is too large
Load Diff
4064
python/extractor/tsg-python/tree-sitter-python/src/node-types.json
Normal file
4064
python/extractor/tsg-python/tree-sitter-python/src/node-types.json
Normal file
File diff suppressed because it is too large
Load Diff
76504
python/extractor/tsg-python/tree-sitter-python/src/parser.c
Normal file
76504
python/extractor/tsg-python/tree-sitter-python/src/parser.c
Normal file
File diff suppressed because it is too large
Load Diff
402
python/extractor/tsg-python/tree-sitter-python/src/scanner.cc
Normal file
402
python/extractor/tsg-python/tree-sitter-python/src/scanner.cc
Normal file
@@ -0,0 +1,402 @@
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include <cwctype>
|
||||
#include <stdio.h>
|
||||
#include <tree_sitter/parser.h>
|
||||
#include <vector>
|
||||
namespace {
|
||||
|
||||
using std::vector;
|
||||
using std::iswspace;
|
||||
using std::memcpy;
|
||||
|
||||
enum TokenType {
|
||||
NEWLINE,
|
||||
INDENT,
|
||||
DEDENT,
|
||||
STRING_START,
|
||||
STRING_CONTENT,
|
||||
STRING_END,
|
||||
};
|
||||
|
||||
struct Delimiter {
|
||||
enum {
|
||||
SingleQuote = 1 << 0,
|
||||
DoubleQuote = 1 << 1,
|
||||
BackQuote = 1 << 2,
|
||||
Raw = 1 << 3,
|
||||
Format = 1 << 4,
|
||||
Triple = 1 << 5,
|
||||
Bytes = 1 << 6,
|
||||
};
|
||||
|
||||
Delimiter() : flags(0) {}
|
||||
|
||||
bool is_format() const {
|
||||
return flags & Format;
|
||||
}
|
||||
|
||||
bool is_raw() const {
|
||||
return flags & Raw;
|
||||
}
|
||||
|
||||
bool is_triple() const {
|
||||
return flags & Triple;
|
||||
}
|
||||
|
||||
bool is_bytes() const {
|
||||
return flags & Bytes;
|
||||
}
|
||||
|
||||
int32_t end_character() const {
|
||||
if (flags & SingleQuote) return '\'';
|
||||
if (flags & DoubleQuote) return '"';
|
||||
if (flags & BackQuote) return '`';
|
||||
return 0;
|
||||
}
|
||||
|
||||
void set_format() {
|
||||
flags |= Format;
|
||||
}
|
||||
|
||||
void set_raw() {
|
||||
flags |= Raw;
|
||||
}
|
||||
|
||||
void set_triple() {
|
||||
flags |= Triple;
|
||||
}
|
||||
|
||||
void set_bytes() {
|
||||
flags |= Bytes;
|
||||
}
|
||||
|
||||
void set_end_character(int32_t character) {
|
||||
switch (character) {
|
||||
case '\'':
|
||||
flags |= SingleQuote;
|
||||
break;
|
||||
case '"':
|
||||
flags |= DoubleQuote;
|
||||
break;
|
||||
case '`':
|
||||
flags |= BackQuote;
|
||||
break;
|
||||
default:
|
||||
assert(false);
|
||||
}
|
||||
}
|
||||
|
||||
char flags;
|
||||
};
|
||||
|
||||
struct Scanner {
|
||||
Scanner() {
|
||||
assert(sizeof(Delimiter) == sizeof(char));
|
||||
deserialize(NULL, 0);
|
||||
}
|
||||
|
||||
unsigned serialize(char *buffer) {
|
||||
size_t i = 0;
|
||||
|
||||
size_t delimiter_count = delimiter_stack.size();
|
||||
if (delimiter_count > UINT8_MAX) delimiter_count = UINT8_MAX;
|
||||
buffer[i++] = delimiter_count;
|
||||
|
||||
if (delimiter_count > 0) {
|
||||
memcpy(&buffer[i], delimiter_stack.data(), delimiter_count);
|
||||
}
|
||||
i += delimiter_count;
|
||||
|
||||
vector<uint16_t>::iterator
|
||||
iter = indent_length_stack.begin() + 1,
|
||||
end = indent_length_stack.end();
|
||||
|
||||
for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {
|
||||
buffer[i++] = *iter;
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
void deserialize(const char *buffer, unsigned length) {
|
||||
delimiter_stack.clear();
|
||||
indent_length_stack.clear();
|
||||
indent_length_stack.push_back(0);
|
||||
|
||||
if (length > 0) {
|
||||
size_t i = 0;
|
||||
|
||||
size_t delimiter_count = (uint8_t)buffer[i++];
|
||||
delimiter_stack.resize(delimiter_count);
|
||||
if (delimiter_count > 0) {
|
||||
memcpy(delimiter_stack.data(), &buffer[i], delimiter_count);
|
||||
}
|
||||
i += delimiter_count;
|
||||
|
||||
for (; i < length; i++) {
|
||||
indent_length_stack.push_back(buffer[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void advance(TSLexer *lexer) {
|
||||
lexer->advance(lexer, false);
|
||||
}
|
||||
|
||||
void skip(TSLexer *lexer) {
|
||||
lexer->advance(lexer, true);
|
||||
}
|
||||
|
||||
bool scan(TSLexer *lexer, const bool *valid_symbols) {
|
||||
if (valid_symbols[STRING_CONTENT] && !valid_symbols[INDENT] && !delimiter_stack.empty()) {
|
||||
Delimiter delimiter = delimiter_stack.back();
|
||||
int32_t end_character = delimiter.end_character();
|
||||
bool has_content = false;
|
||||
while (lexer->lookahead) {
|
||||
if ((lexer->lookahead == '{' || lexer->lookahead == '}') && delimiter.is_format()) {
|
||||
lexer->mark_end(lexer);
|
||||
lexer->result_symbol = STRING_CONTENT;
|
||||
return has_content;
|
||||
} else if (lexer->lookahead == '\\') {
|
||||
if (delimiter.is_raw()) {
|
||||
lexer->advance(lexer, false);
|
||||
continue;
|
||||
} else if (delimiter.is_bytes()) {
|
||||
lexer->mark_end(lexer);
|
||||
lexer->advance(lexer, false);
|
||||
if (lexer->lookahead == 'N' || lexer->lookahead == 'u' || lexer->lookahead == 'U') {
|
||||
// In bytes string, \N{...}, \uXXXX and \UXXXXXXXX are not escape sequences
|
||||
// https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
|
||||
lexer->advance(lexer, false);
|
||||
} else {
|
||||
lexer->result_symbol = STRING_CONTENT;
|
||||
return has_content;
|
||||
}
|
||||
} else {
|
||||
lexer->mark_end(lexer);
|
||||
lexer->result_symbol = STRING_CONTENT;
|
||||
return has_content;
|
||||
}
|
||||
} else if (lexer->lookahead == end_character) {
|
||||
if (delimiter.is_triple()) {
|
||||
lexer->mark_end(lexer);
|
||||
lexer->advance(lexer, false);
|
||||
if (lexer->lookahead == end_character) {
|
||||
lexer->advance(lexer, false);
|
||||
if (lexer->lookahead == end_character) {
|
||||
if (has_content) {
|
||||
lexer->result_symbol = STRING_CONTENT;
|
||||
} else {
|
||||
lexer->advance(lexer, false);
|
||||
lexer->mark_end(lexer);
|
||||
delimiter_stack.pop_back();
|
||||
lexer->result_symbol = STRING_END;
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
lexer->mark_end(lexer);
|
||||
lexer->result_symbol = STRING_CONTENT;
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
lexer->mark_end(lexer);
|
||||
lexer->result_symbol = STRING_CONTENT;
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
if (has_content) {
|
||||
lexer->result_symbol = STRING_CONTENT;
|
||||
} else {
|
||||
lexer->advance(lexer, false);
|
||||
delimiter_stack.pop_back();
|
||||
lexer->result_symbol = STRING_END;
|
||||
}
|
||||
lexer->mark_end(lexer);
|
||||
return true;
|
||||
}
|
||||
} else if (lexer->lookahead == '\n' && has_content && !delimiter.is_triple()) {
|
||||
return false;
|
||||
}
|
||||
advance(lexer);
|
||||
has_content = true;
|
||||
}
|
||||
}
|
||||
|
||||
lexer->mark_end(lexer);
|
||||
|
||||
bool found_end_of_line = false;
|
||||
uint32_t indent_length = 0;
|
||||
int32_t first_comment_indent_length = -1;
|
||||
for (;;) {
|
||||
if (lexer->lookahead == '\n') {
|
||||
found_end_of_line = true;
|
||||
indent_length = 0;
|
||||
skip(lexer);
|
||||
} else if (lexer->lookahead == ' ') {
|
||||
indent_length++;
|
||||
skip(lexer);
|
||||
} else if (lexer->lookahead == '\r') {
|
||||
indent_length = 0;
|
||||
skip(lexer);
|
||||
} else if (lexer->lookahead == '\t') {
|
||||
indent_length += 8;
|
||||
skip(lexer);
|
||||
} else if (lexer->lookahead == '#') {
|
||||
if (first_comment_indent_length == -1) {
|
||||
first_comment_indent_length = (int32_t)indent_length;
|
||||
}
|
||||
while (lexer->lookahead && lexer->lookahead != '\n') {
|
||||
skip(lexer);
|
||||
}
|
||||
skip(lexer);
|
||||
indent_length = 0;
|
||||
} else if (lexer->lookahead == '\\') {
|
||||
skip(lexer);
|
||||
if (lexer->lookahead == '\r') {
|
||||
skip(lexer);
|
||||
}
|
||||
if (lexer->lookahead == '\n') {
|
||||
skip(lexer);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else if (lexer->lookahead == '\f') {
|
||||
indent_length = 0;
|
||||
skip(lexer);
|
||||
} else if (lexer->lookahead == 0) {
|
||||
indent_length = 0;
|
||||
found_end_of_line = true;
|
||||
break;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (found_end_of_line) {
|
||||
if (!indent_length_stack.empty()) {
|
||||
uint16_t current_indent_length = indent_length_stack.back();
|
||||
|
||||
if (
|
||||
valid_symbols[INDENT] &&
|
||||
indent_length > current_indent_length
|
||||
) {
|
||||
indent_length_stack.push_back(indent_length);
|
||||
lexer->result_symbol = INDENT;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (
|
||||
valid_symbols[DEDENT] &&
|
||||
indent_length < current_indent_length &&
|
||||
|
||||
// Wait to create a dedent token until we've consumed any comments
|
||||
// whose indentation matches the current block.
|
||||
first_comment_indent_length < (int32_t)current_indent_length
|
||||
) {
|
||||
indent_length_stack.pop_back();
|
||||
lexer->result_symbol = DEDENT;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (valid_symbols[NEWLINE]) {
|
||||
lexer->result_symbol = NEWLINE;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (first_comment_indent_length == -1 && valid_symbols[STRING_START]) {
|
||||
Delimiter delimiter;
|
||||
|
||||
bool has_flags = false;
|
||||
while (lexer->lookahead) {
|
||||
if (lexer->lookahead == 'f' || lexer->lookahead == 'F') {
|
||||
delimiter.set_format();
|
||||
} else if (lexer->lookahead == 'r' || lexer->lookahead == 'R') {
|
||||
delimiter.set_raw();
|
||||
} else if (lexer->lookahead == 'b' || lexer->lookahead == 'B') {
|
||||
delimiter.set_bytes();
|
||||
} else if (lexer->lookahead != 'u' && lexer->lookahead != 'U') {
|
||||
break;
|
||||
}
|
||||
has_flags = true;
|
||||
advance(lexer);
|
||||
}
|
||||
|
||||
if (lexer->lookahead == '`') {
|
||||
delimiter.set_end_character('`');
|
||||
advance(lexer);
|
||||
lexer->mark_end(lexer);
|
||||
} else if (lexer->lookahead == '\'') {
|
||||
delimiter.set_end_character('\'');
|
||||
advance(lexer);
|
||||
lexer->mark_end(lexer);
|
||||
if (lexer->lookahead == '\'') {
|
||||
advance(lexer);
|
||||
if (lexer->lookahead == '\'') {
|
||||
advance(lexer);
|
||||
lexer->mark_end(lexer);
|
||||
delimiter.set_triple();
|
||||
}
|
||||
}
|
||||
} else if (lexer->lookahead == '"') {
|
||||
delimiter.set_end_character('"');
|
||||
advance(lexer);
|
||||
lexer->mark_end(lexer);
|
||||
if (lexer->lookahead == '"') {
|
||||
advance(lexer);
|
||||
if (lexer->lookahead == '"') {
|
||||
advance(lexer);
|
||||
lexer->mark_end(lexer);
|
||||
delimiter.set_triple();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (delimiter.end_character()) {
|
||||
delimiter_stack.push_back(delimiter);
|
||||
lexer->result_symbol = STRING_START;
|
||||
return true;
|
||||
} else if (has_flags) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
vector<uint16_t> indent_length_stack;
|
||||
vector<Delimiter> delimiter_stack;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
|
||||
void *tree_sitter_python_external_scanner_create() {
|
||||
return new Scanner();
|
||||
}
|
||||
|
||||
bool tree_sitter_python_external_scanner_scan(void *payload, TSLexer *lexer,
|
||||
const bool *valid_symbols) {
|
||||
Scanner *scanner = static_cast<Scanner *>(payload);
|
||||
return scanner->scan(lexer, valid_symbols);
|
||||
}
|
||||
|
||||
unsigned tree_sitter_python_external_scanner_serialize(void *payload, char *buffer) {
|
||||
Scanner *scanner = static_cast<Scanner *>(payload);
|
||||
return scanner->serialize(buffer);
|
||||
}
|
||||
|
||||
void tree_sitter_python_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
|
||||
Scanner *scanner = static_cast<Scanner *>(payload);
|
||||
scanner->deserialize(buffer, length);
|
||||
}
|
||||
|
||||
void tree_sitter_python_external_scanner_destroy(void *payload) {
|
||||
Scanner *scanner = static_cast<Scanner *>(payload);
|
||||
delete scanner;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,224 @@
|
||||
#ifndef TREE_SITTER_PARSER_H_
|
||||
#define TREE_SITTER_PARSER_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#define ts_builtin_sym_error ((TSSymbol)-1)
|
||||
#define ts_builtin_sym_end 0
|
||||
#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024
|
||||
|
||||
typedef uint16_t TSStateId;
|
||||
|
||||
#ifndef TREE_SITTER_API_H_
|
||||
typedef uint16_t TSSymbol;
|
||||
typedef uint16_t TSFieldId;
|
||||
typedef struct TSLanguage TSLanguage;
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
TSFieldId field_id;
|
||||
uint8_t child_index;
|
||||
bool inherited;
|
||||
} TSFieldMapEntry;
|
||||
|
||||
typedef struct {
|
||||
uint16_t index;
|
||||
uint16_t length;
|
||||
} TSFieldMapSlice;
|
||||
|
||||
typedef struct {
|
||||
bool visible;
|
||||
bool named;
|
||||
bool supertype;
|
||||
} TSSymbolMetadata;
|
||||
|
||||
typedef struct TSLexer TSLexer;
|
||||
|
||||
struct TSLexer {
|
||||
int32_t lookahead;
|
||||
TSSymbol result_symbol;
|
||||
void (*advance)(TSLexer *, bool);
|
||||
void (*mark_end)(TSLexer *);
|
||||
uint32_t (*get_column)(TSLexer *);
|
||||
bool (*is_at_included_range_start)(const TSLexer *);
|
||||
bool (*eof)(const TSLexer *);
|
||||
};
|
||||
|
||||
typedef enum {
|
||||
TSParseActionTypeShift,
|
||||
TSParseActionTypeReduce,
|
||||
TSParseActionTypeAccept,
|
||||
TSParseActionTypeRecover,
|
||||
} TSParseActionType;
|
||||
|
||||
typedef union {
|
||||
struct {
|
||||
uint8_t type;
|
||||
TSStateId state;
|
||||
bool extra;
|
||||
bool repetition;
|
||||
} shift;
|
||||
struct {
|
||||
uint8_t type;
|
||||
uint8_t child_count;
|
||||
TSSymbol symbol;
|
||||
int16_t dynamic_precedence;
|
||||
uint16_t production_id;
|
||||
} reduce;
|
||||
uint8_t type;
|
||||
} TSParseAction;
|
||||
|
||||
typedef struct {
|
||||
uint16_t lex_state;
|
||||
uint16_t external_lex_state;
|
||||
} TSLexMode;
|
||||
|
||||
typedef union {
|
||||
TSParseAction action;
|
||||
struct {
|
||||
uint8_t count;
|
||||
bool reusable;
|
||||
} entry;
|
||||
} TSParseActionEntry;
|
||||
|
||||
struct TSLanguage {
|
||||
uint32_t version;
|
||||
uint32_t symbol_count;
|
||||
uint32_t alias_count;
|
||||
uint32_t token_count;
|
||||
uint32_t external_token_count;
|
||||
uint32_t state_count;
|
||||
uint32_t large_state_count;
|
||||
uint32_t production_id_count;
|
||||
uint32_t field_count;
|
||||
uint16_t max_alias_sequence_length;
|
||||
const uint16_t *parse_table;
|
||||
const uint16_t *small_parse_table;
|
||||
const uint32_t *small_parse_table_map;
|
||||
const TSParseActionEntry *parse_actions;
|
||||
const char * const *symbol_names;
|
||||
const char * const *field_names;
|
||||
const TSFieldMapSlice *field_map_slices;
|
||||
const TSFieldMapEntry *field_map_entries;
|
||||
const TSSymbolMetadata *symbol_metadata;
|
||||
const TSSymbol *public_symbol_map;
|
||||
const uint16_t *alias_map;
|
||||
const TSSymbol *alias_sequences;
|
||||
const TSLexMode *lex_modes;
|
||||
bool (*lex_fn)(TSLexer *, TSStateId);
|
||||
bool (*keyword_lex_fn)(TSLexer *, TSStateId);
|
||||
TSSymbol keyword_capture_token;
|
||||
struct {
|
||||
const bool *states;
|
||||
const TSSymbol *symbol_map;
|
||||
void *(*create)(void);
|
||||
void (*destroy)(void *);
|
||||
bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist);
|
||||
unsigned (*serialize)(void *, char *);
|
||||
void (*deserialize)(void *, const char *, unsigned);
|
||||
} external_scanner;
|
||||
const TSStateId *primary_state_ids;
|
||||
};
|
||||
|
||||
/*
|
||||
* Lexer Macros
|
||||
*/
|
||||
|
||||
#define START_LEXER() \
|
||||
bool result = false; \
|
||||
bool skip = false; \
|
||||
bool eof = false; \
|
||||
int32_t lookahead; \
|
||||
goto start; \
|
||||
next_state: \
|
||||
lexer->advance(lexer, skip); \
|
||||
start: \
|
||||
skip = false; \
|
||||
lookahead = lexer->lookahead;
|
||||
|
||||
#define ADVANCE(state_value) \
|
||||
{ \
|
||||
state = state_value; \
|
||||
goto next_state; \
|
||||
}
|
||||
|
||||
#define SKIP(state_value) \
|
||||
{ \
|
||||
skip = true; \
|
||||
state = state_value; \
|
||||
goto next_state; \
|
||||
}
|
||||
|
||||
#define ACCEPT_TOKEN(symbol_value) \
|
||||
result = true; \
|
||||
lexer->result_symbol = symbol_value; \
|
||||
lexer->mark_end(lexer);
|
||||
|
||||
#define END_STATE() return result;
|
||||
|
||||
/*
|
||||
* Parse Table Macros
|
||||
*/
|
||||
|
||||
#define SMALL_STATE(id) id - LARGE_STATE_COUNT
|
||||
|
||||
#define STATE(id) id
|
||||
|
||||
#define ACTIONS(id) id
|
||||
|
||||
#define SHIFT(state_value) \
|
||||
{{ \
|
||||
.shift = { \
|
||||
.type = TSParseActionTypeShift, \
|
||||
.state = state_value \
|
||||
} \
|
||||
}}
|
||||
|
||||
#define SHIFT_REPEAT(state_value) \
|
||||
{{ \
|
||||
.shift = { \
|
||||
.type = TSParseActionTypeShift, \
|
||||
.state = state_value, \
|
||||
.repetition = true \
|
||||
} \
|
||||
}}
|
||||
|
||||
#define SHIFT_EXTRA() \
|
||||
{{ \
|
||||
.shift = { \
|
||||
.type = TSParseActionTypeShift, \
|
||||
.extra = true \
|
||||
} \
|
||||
}}
|
||||
|
||||
#define REDUCE(symbol_val, child_count_val, ...) \
|
||||
{{ \
|
||||
.reduce = { \
|
||||
.type = TSParseActionTypeReduce, \
|
||||
.symbol = symbol_val, \
|
||||
.child_count = child_count_val, \
|
||||
__VA_ARGS__ \
|
||||
}, \
|
||||
}}
|
||||
|
||||
#define RECOVER() \
|
||||
{{ \
|
||||
.type = TSParseActionTypeRecover \
|
||||
}}
|
||||
|
||||
#define ACCEPT_INPUT() \
|
||||
{{ \
|
||||
.type = TSParseActionTypeAccept \
|
||||
}}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // TREE_SITTER_PARSER_H_
|
||||
Reference in New Issue
Block a user