Python: Copy Python extractor to codeql repo

2026-04-08 00:24:03 +02:00 · 2024-02-28 15:15:21 +00:00
parent 297a17975d
commit 6dec323cfc
369 changed files with 165346 additions and 0 deletions
--- a/python/extractor/tsg-python/tree-sitter-python/.gitignore
+++ b/python/extractor/tsg-python/tree-sitter-python/.gitignore
@@ -0,0 +1,7 @@
+Cargo.lock
+package-lock.json
+node_modules
+build
+*.log
+/examples/*/
+/target/
--- a/python/extractor/tsg-python/tree-sitter-python/.npmignore
+++ b/python/extractor/tsg-python/tree-sitter-python/.npmignore
@@ -0,0 +1,6 @@
+corpus
+examples
+build
+script
+target
+bindings/rust
--- a/python/extractor/tsg-python/tree-sitter-python/BUILD.bazel
+++ b/python/extractor/tsg-python/tree-sitter-python/BUILD.bazel
@@ -0,0 +1,38 @@
+load("@rules_rust//cargo:defs.bzl", "cargo_build_script")
+load("@rules_rust//rust:defs.bzl", "rust_library")
+load("@tsg_python_crate_index//:defs.bzl", "aliases", "all_crate_deps")
+
+package(default_visibility = ["//visibility:public"])
+
+# This will run the build script from the root of the workspace, and
+# collect the outputs.
+cargo_build_script(
+    name = "tsg-build-script",
+    srcs = ["bindings/rust/build.rs"],  # The Cargo build script; it compiles src/parser.c and src/scanner.cc.
+    data = glob([
+        "src/**",  # Sources and headers that build.rs reads from the src directory.
+    ]),
+    deps = all_crate_deps(
+        build = True,  # NOTE(review): presumably selects the [build-dependencies] (the cc crate) - confirm.
+    ),
+)
+
+rust_library(
+    name = "tree-sitter-python",
+    srcs = [
+        "bindings/rust/lib.rs",  # Crate root of the Rust binding.
+    ],
+    aliases = aliases(),
+    compile_data = glob([
+        "src/**",  # lib.rs embeds src/node-types.json via include_str!.
+        "queries/**",  # lib.rs embeds queries/highlights.scm and queries/tags.scm via include_str!.
+    ]) + [
+        "grammar.js",  # lib.rs embeds grammar.js via include_str!.
+    ],
+    proc_macro_deps = all_crate_deps(
+        proc_macro = True,
+    ),
+    deps = [":tsg-build-script"] + all_crate_deps(  # NOTE(review): presumably the build-script dep supplies the compiled parser/scanner objects - confirm.
+        normal = True,
+    ),
+)
--- a/python/extractor/tsg-python/tree-sitter-python/Cargo.toml
+++ b/python/extractor/tsg-python/tree-sitter-python/Cargo.toml
@@ -0,0 +1,31 @@
+[package]
+name = "tree-sitter-python"
+description = "Python grammar for the tree-sitter parsing library"
+version = "0.19.0"
+authors = [
+    "Max Brunsfeld <maxbrunsfeld@gmail.com>",
+    "Douglas Creager <dcreager@dcreager.net>",
+]
+license = "MIT"
+readme = "bindings/rust/README.md"
+keywords = ["incremental", "parsing", "python"]
+categories = ["parsing", "text-editors"]
+repository = "https://github.com/tree-sitter/tree-sitter-python"
+edition = "2018"
+
+build = "bindings/rust/build.rs"
+include = [
+  "bindings/rust/*",
+  "grammar.js",
+  "queries/*",
+  "src/*",
+]
+
+[lib]
+path = "bindings/rust/lib.rs"
+
+[dependencies]
+tree-sitter = ">= 0.20, < 0.21"
+
+[build-dependencies]
+cc = "1.0"
--- a/python/extractor/tsg-python/tree-sitter-python/LICENSE
+++ b/python/extractor/tsg-python/tree-sitter-python/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2016 Max Brunsfeld
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/python/extractor/tsg-python/tree-sitter-python/README.md
+++ b/python/extractor/tsg-python/tree-sitter-python/README.md
@@ -0,0 +1,13 @@
+tree-sitter-python
+==================
+
+[![build](https://github.com/tree-sitter/tree-sitter-python/actions/workflows/ci.yml/badge.svg)](https://github.com/tree-sitter/tree-sitter-python/actions/workflows/ci.yml)
+
+Python grammar for [tree-sitter][].
+
+[tree-sitter]: https://github.com/tree-sitter/tree-sitter
+
+#### References
+
+* [Python 2 Grammar](https://docs.python.org/2/reference/grammar.html)
+* [Python 3 Grammar](https://docs.python.org/3/reference/grammar.html)
--- a/python/extractor/tsg-python/tree-sitter-python/binding.gyp
+++ b/python/extractor/tsg-python/tree-sitter-python/binding.gyp
@@ -0,0 +1,19 @@
+{
+  "targets": [
+    {
+      "target_name": "tree_sitter_python_binding",
+      "include_dirs": [
+        "<!(node -e \"require('nan')\")",
+        "src"
+      ],
+      "sources": [
+        "src/parser.c",
+        "bindings/node/binding.cc",
+        "src/scanner.cc"
+      ],
+      "cflags_c": [
+        "-std=c99",
+      ]
+    }
+  ]
+}
--- a/python/extractor/tsg-python/tree-sitter-python/bindings/node/binding.cc
+++ b/python/extractor/tsg-python/tree-sitter-python/bindings/node/binding.cc
@@ -0,0 +1,28 @@
+#include "tree_sitter/parser.h"
+#include <node.h>
+#include "nan.h"
+
+using namespace v8;
+
+extern "C" TSLanguage * tree_sitter_python();
+
+namespace {
+
+NAN_METHOD(New) {}  // Empty constructor callback; only used to build the function template below.
+
+void Init(Local<Object> exports, Local<Object> module) {  // Addon entry point: exports one "Language" instance wrapping the grammar.
+  Local<FunctionTemplate> tpl = Nan::New<FunctionTemplate>(New);
+  tpl->SetClassName(Nan::New("Language").ToLocalChecked());
+  tpl->InstanceTemplate()->SetInternalFieldCount(1);  // One internal slot, used for the language pointer.
+
+  Local<Function> constructor = Nan::GetFunction(tpl).ToLocalChecked();
+  Local<Object> instance = constructor->NewInstance(Nan::GetCurrentContext()).ToLocalChecked();
+  Nan::SetInternalFieldPointer(instance, 0, tree_sitter_python());  // Store the TSLanguage* returned by the grammar in slot 0.
+
+  Nan::Set(instance, Nan::New("name").ToLocalChecked(), Nan::New("python").ToLocalChecked());  // instance.name = "python"
+  Nan::Set(module, Nan::New("exports").ToLocalChecked(), instance);  // module.exports = instance (the exports parameter itself is not used).
+}
+
+NODE_MODULE(tree_sitter_python_binding, Init)  // Registers Init under the same name as the gyp target.
+
+}  // namespace
--- a/python/extractor/tsg-python/tree-sitter-python/bindings/node/index.js
+++ b/python/extractor/tsg-python/tree-sitter-python/bindings/node/index.js
@@ -0,0 +1,19 @@
+try {
+  module.exports = require("../../build/Release/tree_sitter_python_binding");  // Prefer the Release build of the native addon.
+} catch (error1) {
+  if (error1.code !== 'MODULE_NOT_FOUND') {  // Only a missing module falls through to the Debug build; other load errors propagate.
+    throw error1;
+  }
+  try {
+    module.exports = require("../../build/Debug/tree_sitter_python_binding");  // Fall back to the Debug build.
+  } catch (error2) {
+    if (error2.code !== 'MODULE_NOT_FOUND') {
+      throw error2;
+    }
+    throw error1  // Neither build exists: report the original (Release) failure.
+  }
+}
+
+try {
+  module.exports.nodeTypeInfo = require("../../src/node-types.json");  // Attach the static node type description when it can be loaded.
+} catch (_) {}  // Best effort: any failure to load node-types.json is ignored.
--- a/python/extractor/tsg-python/tree-sitter-python/bindings/rust/README.md
+++ b/python/extractor/tsg-python/tree-sitter-python/bindings/rust/README.md
@@ -0,0 +1,36 @@
+# tree-sitter-python
+
+This crate provides a Python grammar for the [tree-sitter][] parsing library.
+To use this crate, add it to the `[dependencies]` section of your `Cargo.toml`
+file.  (Note that you will probably also need to depend on the
+[`tree-sitter`][tree-sitter crate] crate to use the parsed result in any useful
+way.)
+
+``` toml
+[dependencies]
+tree-sitter = "0.20"
+tree-sitter-python = "0.19"
+```
+
+Typically, you will use the [language][language func] function to add this
+grammar to a tree-sitter [Parser][], and then use the parser to parse some code:
+
+``` rust
+let code = r#"
+    def double(x):
+        return x * 2
+"#;
+let mut parser = tree_sitter::Parser::new();
+parser.set_language(tree_sitter_python::language()).expect("Error loading Python grammar");
+let parsed = parser.parse(code, None);
+```
+
+If you have any questions, please reach out to us in the [tree-sitter
+discussions] page.
+
+[Language]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Language.html
+[language func]: https://docs.rs/tree-sitter-python/*/tree_sitter_python/fn.language.html
+[Parser]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Parser.html
+[tree-sitter]: https://tree-sitter.github.io/
+[tree-sitter crate]: https://crates.io/crates/tree-sitter
+[tree-sitter discussions]: https://github.com/tree-sitter/tree-sitter/discussions
--- a/python/extractor/tsg-python/tree-sitter-python/bindings/rust/build.rs
+++ b/python/extractor/tsg-python/tree-sitter-python/bindings/rust/build.rs
@@ -0,0 +1,28 @@
+use std::path::Path;
+extern crate cc;
+
+fn main() {  // Build script: compiles the C parser and the C++ external scanner as two separate units.
+    let src_dir = Path::new("src");  // Relative directory holding parser.c and scanner.cc; also used as the include path.
+
+    let mut c_config = cc::Build::new();  // C build for the parser.
+    c_config.include(&src_dir);
+    c_config
+        .flag_if_supported("-Wno-unused-parameter")  // Warning suppressions, added only when the compiler accepts the flag.
+        .flag_if_supported("-Wno-unused-but-set-variable")
+        .flag_if_supported("-Wno-trigraphs");
+    let parser_path = src_dir.join("parser.c");
+    c_config.file(&parser_path);
+    println!("cargo:rerun-if-changed={}", parser_path.to_str().unwrap());  // Ask Cargo to rerun this script when parser.c changes.
+    c_config.compile("parser");
+
+    let mut cpp_config = cc::Build::new();  // Separate build because the scanner is C++.
+    cpp_config.cpp(true);
+    cpp_config.include(&src_dir);
+    cpp_config
+        .flag_if_supported("-Wno-unused-parameter")
+        .flag_if_supported("-Wno-unused-but-set-variable");
+    let scanner_path = src_dir.join("scanner.cc");
+    cpp_config.file(&scanner_path);
+    println!("cargo:rerun-if-changed={}", scanner_path.to_str().unwrap());  // Ask Cargo to rerun this script when scanner.cc changes.
+    cpp_config.compile("scanner");
+}
--- a/python/extractor/tsg-python/tree-sitter-python/bindings/rust/lib.rs
+++ b/python/extractor/tsg-python/tree-sitter-python/bindings/rust/lib.rs
@@ -0,0 +1,68 @@
+// -*- coding: utf-8 -*-
+// ------------------------------------------------------------------------------------------------
+// Copyright © 2020, tree-sitter-python authors.
+// See the LICENSE file in this repo for license details.
+// ------------------------------------------------------------------------------------------------
+
+//! This crate provides a Python grammar for the [tree-sitter][] parsing library.
+//!
+//! Typically, you will use the [language][language func] function to add this grammar to a
+//! tree-sitter [Parser][], and then use the parser to parse some code:
+//!
+//! ```
+//! use tree_sitter::Parser;
+//!
+//! let code = r#"
+//!     def double(x):
+//!         return x * 2
+//! "#;
+//! let mut parser = Parser::new();
+//! parser.set_language(tree_sitter_python::language()).expect("Error loading Python grammar");
+//! let parsed = parser.parse(code, None);
+//! # let parsed = parsed.unwrap();
+//! # let root = parsed.root_node();
+//! # assert!(!root.has_error());
+//! ```
+//!
+//! [Language]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Language.html
+//! [language func]: fn.language.html
+//! [Parser]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Parser.html
+//! [tree-sitter]: https://tree-sitter.github.io/
+
+use tree_sitter::Language;
+
+extern "C" {
+    fn tree_sitter_python() -> Language;
+}
+
+/// Returns the tree-sitter [Language][] for this grammar.
+///
+/// [Language]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Language.html
+pub fn language() -> Language {
+    unsafe { tree_sitter_python() }  // FFI call to the symbol declared in the extern "C" block above; NOTE(review): presumably defined by src/parser.c, which build.rs compiles - confirm.
+}
+
+/// The source of the Python tree-sitter grammar description.
+pub const GRAMMAR: &'static str = include_str!("../../grammar.js");
+
+/// The syntax highlighting query for this language.
+pub const HIGHLIGHT_QUERY: &'static str = include_str!("../../queries/highlights.scm");
+
+/// The content of the [`node-types.json`][] file for this grammar.
+///
+/// [`node-types.json`]: https://tree-sitter.github.io/tree-sitter/using-parsers#static-node-types
+pub const NODE_TYPES: &'static str = include_str!("../../src/node-types.json");
+
+/// The symbol tagging query for this language.
+pub const TAGGING_QUERY: &'static str = include_str!("../../queries/tags.scm");
+
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn can_load_grammar() {  // Smoke test: a fresh Parser accepts the Language returned by language().
+        let mut parser = tree_sitter::Parser::new();
+        parser
+            .set_language(super::language())
+            .expect("Error loading Python grammar");  // set_language returning an error fails the test with this message.
+    }
+}
--- a/python/extractor/tsg-python/tree-sitter-python/grammar.js
+++ b/python/extractor/tsg-python/tree-sitter-python/grammar.js
--- a/python/extractor/tsg-python/tree-sitter-python/log.html
+++ b/python/extractor/tsg-python/tree-sitter-python/log.html
--- a/python/extractor/tsg-python/tree-sitter-python/package.json
+++ b/python/extractor/tsg-python/tree-sitter-python/package.json
@@ -0,0 +1,33 @@
+{
+  "name": "tree-sitter-python",
+  "version": "0.19.0",
+  "description": "Python grammar for tree-sitter",
+  "main": "bindings/node",
+  "keywords": [
+    "parser",
+    "lexer"
+  ],
+  "author": "Max Brunsfeld",
+  "license": "MIT",
+  "dependencies": {
+    "nan": "^2.14.0"
+  },
+  "devDependencies": {
+    "tree-sitter-cli": "^0.19.3"
+  },
+  "scripts": {
+    "build": "tree-sitter generate && node-gyp build",
+    "test": "tree-sitter test && script/parse-examples",
+    "parse": "tree-sitter parse",
+    "test-windows": "tree-sitter test"
+  },
+  "repository": "https://github.com/tree-sitter/tree-sitter-python",
+  "tree-sitter": [
+    {
+      "scope": "source.python",
+      "file-types": [
+        "py"
+      ]
+    }
+  ]
+}
--- a/python/extractor/tsg-python/tree-sitter-python/queries/highlights.scm
+++ b/python/extractor/tsg-python/tree-sitter-python/queries/highlights.scm
@@ -0,0 +1,124 @@
+; Identifier naming conventions
+
+((identifier) @constructor
+ (#match? @constructor "^[A-Z]"))
+
+((identifier) @constant
+ (#match? @constant "^[A-Z][A-Z_]*$"))
+
+; Builtin functions
+
+((call
+  function: (identifier) @function.builtin)
+ (#match?
+   @function.builtin
+   "^(abs|all|any|ascii|bin|bool|breakpoint|bytearray|bytes|callable|chr|classmethod|compile|complex|delattr|dict|dir|divmod|enumerate|eval|exec|filter|float|format|frozenset|getattr|globals|hasattr|hash|help|hex|id|input|int|isinstance|issubclass|iter|len|list|locals|map|max|memoryview|min|next|object|oct|open|ord|pow|print|property|range|repr|reversed|round|set|setattr|slice|sorted|staticmethod|str|sum|super|tuple|type|vars|zip|__import__)$"))
+
+; Function calls
+
+(decorator) @function
+
+(call
+  function: (attribute attribute: (identifier) @function.method))
+(call
+  function: (identifier) @function)
+
+; Function definitions
+
+(function_definition
+  name: (identifier) @function)
+
+(identifier) @variable
+(attribute attribute: (identifier) @property)
+(type (identifier) @type)
+
+; Literals
+
+[
+  (none)
+  (true)
+  (false)
+] @constant.builtin
+
+[
+  (integer)
+  (float)
+] @number
+
+(comment) @comment
+(string) @string
+(escape_sequence) @escape
+
+(interpolation
+  "{" @punctuation.special
+  "}" @punctuation.special) @embedded
+
+[
+  "-"
+  "-="
+  "!="
+  "*"
+  "**"
+  "**="
+  "*="
+  "/"
+  "//"
+  "//="
+  "/="
+  "&"
+  "%"
+  "%="
+  "^"
+  "+"
+  "->"
+  "+="
+  "<"
+  "<<"
+  "<="
+  "<>"
+  "="
+  ":="
+  "=="
+  ">"
+  ">="
+  ">>"
+  "|"
+  "~"
+  "and"
+  "in"
+  "is"
+  "not"
+  "or"
+] @operator
+
+[
+  "as"
+  "assert"
+  "async"
+  "await"
+  "break"
+  "class"
+  "continue"
+  "def"
+  "del"
+  "elif"
+  "else"
+  "except"
+  "exec"
+  "finally"
+  "for"
+  "from"
+  "global"
+  "if"
+  "import"
+  "lambda"
+  "nonlocal"
+  "pass"
+  "print"
+  "raise"
+  "return"
+  "try"
+  "while"
+  "with"
+  "yield"
+] @keyword
--- a/python/extractor/tsg-python/tree-sitter-python/queries/tags.scm
+++ b/python/extractor/tsg-python/tree-sitter-python/queries/tags.scm
@@ -0,0 +1,12 @@
+(class_definition
+  name: (identifier) @name) @definition.class
+
+(function_definition
+  name: (identifier) @name) @definition.function
+
+(call
+  function: [
+      (identifier) @name
+      (attribute
+        attribute: (identifier) @name)
+  ]) @reference.call
--- a/python/extractor/tsg-python/tree-sitter-python/src/grammar.json
+++ b/python/extractor/tsg-python/tree-sitter-python/src/grammar.json
--- a/python/extractor/tsg-python/tree-sitter-python/src/node-types.json
+++ b/python/extractor/tsg-python/tree-sitter-python/src/node-types.json
--- a/python/extractor/tsg-python/tree-sitter-python/src/parser.c
+++ b/python/extractor/tsg-python/tree-sitter-python/src/parser.c
--- a/python/extractor/tsg-python/tree-sitter-python/src/scanner.cc
+++ b/python/extractor/tsg-python/tree-sitter-python/src/scanner.cc
@@ -0,0 +1,402 @@
+#include <cassert>
+#include <cstring>
+#include <cwctype>
+#include <stdio.h>
+#include <tree_sitter/parser.h>
+#include <vector>
+namespace {
+
+using std::vector;
+using std::iswspace;
+using std::memcpy;
+
+enum TokenType {
+  NEWLINE,
+  INDENT,
+  DEDENT,
+  STRING_START,
+  STRING_CONTENT,
+  STRING_END,
+};
+
+struct Delimiter {  // Describes one open string literal as a set of bit flags stored in a single char.
+  enum {
+    SingleQuote = 1 << 0,  // String is delimited by '
+    DoubleQuote = 1 << 1,  // String is delimited by "
+    BackQuote = 1 << 2,  // String is delimited by `
+    Raw = 1 << 3,  // Set for an r/R prefix.
+    Format = 1 << 4,  // Set for an f/F prefix.
+    Triple = 1 << 5,  // Set when the opening quote is repeated three times.
+    Bytes = 1 << 6,  // Set for a b/B prefix.
+  };
+
+  Delimiter() : flags(0) {}  // Starts with no quote kind and no prefix flags.
+
+  bool is_format() const {
+    return flags & Format;
+  }
+
+  bool is_raw() const {
+    return flags & Raw;
+  }
+
+  bool is_triple() const {
+    return flags & Triple;
+  }
+
+  bool is_bytes() const {
+    return flags & Bytes;
+  }
+
+  int32_t end_character() const {  // Returns the closing quote character, or 0 when no quote flag is set.
+    if (flags & SingleQuote) return '\'';
+    if (flags & DoubleQuote) return '"';
+    if (flags & BackQuote) return '`';
+    return 0;
+  }
+
+  void set_format() {
+    flags |= Format;
+  }
+
+  void set_raw() {
+    flags |= Raw;
+  }
+
+  void set_triple() {
+    flags |= Triple;
+  }
+
+  void set_bytes() {
+    flags |= Bytes;
+  }
+
+  void set_end_character(int32_t character) {  // Records which quote character closes the string.
+    switch (character) {
+      case '\'':
+        flags |= SingleQuote;
+        break;
+      case '"':
+        flags |= DoubleQuote;
+        break;
+      case '`':
+        flags |= BackQuote;
+        break;
+      default:
+        assert(false);  // Only the three quote characters above are valid arguments.
+    }
+  }
+
+  char flags;  // Sole data member: the Scanner memcpy's a stack of Delimiters as raw bytes, one per entry.
+};
+
+struct Scanner {
+  Scanner() {  // Starts in the initial state: no open strings and a single base indent of 0.
+    static_assert(sizeof(Delimiter) == sizeof(char), "Delimiter must be one byte");  // serialize()/deserialize() memcpy the delimiter stack one byte per entry; checked at compile time so the guard survives NDEBUG builds.
+    deserialize(NULL, 0);  // A zero-length buffer only performs the reset part of deserialize().
+  }
+
+  unsigned serialize(char *buffer) {  // Writes the scanner state into buffer and returns the number of bytes written.
+    size_t i = 0;
+
+    size_t delimiter_count = delimiter_stack.size();
+    if (delimiter_count > UINT8_MAX) delimiter_count = UINT8_MAX;  // The count is stored in one byte, so at most 255 delimiters are saved.
+    buffer[i++] = delimiter_count;  // Byte 0: number of delimiter bytes that follow.
+
+    if (delimiter_count > 0) {
+      memcpy(&buffer[i], delimiter_stack.data(), delimiter_count);  // One byte per Delimiter.
+    }
+    i += delimiter_count;
+
+    vector<uint16_t>::iterator
+      iter = indent_length_stack.begin() + 1,  // Skip the first entry; deserialize() always re-creates it as 0.
+      end = indent_length_stack.end();
+
+    for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {  // Stop early once the buffer size limit is reached.
+      buffer[i++] = *iter;  // NOTE(review): each uint16_t indent is narrowed to one char here, so values above 255 are not preserved - confirm this is acceptable.
+    }
+
+    return i;
+  }
+
+  void deserialize(const char *buffer, unsigned length) {  // Restores the state written by serialize(); length == 0 resets to the initial state and buffer is not read.
+    delimiter_stack.clear();
+    indent_length_stack.clear();
+    indent_length_stack.push_back(0);  // Base indentation level; serialize() never writes it.
+
+    if (length > 0) {
+      size_t i = 0;
+
+      size_t delimiter_count = (uint8_t)buffer[i++];  // Byte 0: number of delimiter bytes that follow.
+      delimiter_stack.resize(delimiter_count);
+      if (delimiter_count > 0) {
+        memcpy(delimiter_stack.data(), &buffer[i], delimiter_count);  // One byte per Delimiter.
+      }
+      i += delimiter_count;
+
+      for (; i < length; i++) {  // The remaining bytes are indent lengths, one byte each.
+        indent_length_stack.push_back((uint8_t)buffer[i]);  // Read as unsigned: where char is signed, a stored indent >= 128 would otherwise sign-extend into a huge uint16_t.
+      }
+    }
+  }
+
+  void advance(TSLexer *lexer) {  // Moves past the lookahead character with the skip flag off.
+    lexer->advance(lexer, false);
+  }
+
+  void skip(TSLexer *lexer) {  // Moves past the lookahead character with the skip flag on; NOTE(review): presumably this excludes it from the token text - confirm.
+    lexer->advance(lexer, true);
+  }
+
+  bool scan(TSLexer *lexer, const bool *valid_symbols) {  // Tries, in order: string content/end, then NEWLINE/INDENT/DEDENT, then string start. Returns true when lexer->result_symbol holds a recognized token.
+    if (valid_symbols[STRING_CONTENT] && !valid_symbols[INDENT] && !delimiter_stack.empty()) {  // Inside an open string: scan its content.
+      Delimiter delimiter = delimiter_stack.back();
+      int32_t end_character = delimiter.end_character();
+      bool has_content = false;  // True once at least one character has been consumed via the loop tail below.
+      while (lexer->lookahead) {
+        if ((lexer->lookahead == '{' || lexer->lookahead == '}') && delimiter.is_format()) {  // In format strings a brace ends the current content chunk.
+          lexer->mark_end(lexer);
+          lexer->result_symbol = STRING_CONTENT;
+          return has_content;  // An empty chunk is not reported as a token.
+        } else if (lexer->lookahead == '\\') {
+          if (delimiter.is_raw()) {  // Raw string: the backslash is consumed and scanning resumes at the next character.
+            lexer->advance(lexer, false);
+            continue;  // NOTE(review): a quote right after the backslash is then treated as a closing delimiter by the branch below, while Python keeps \" inside raw strings - confirm whether this is handled elsewhere.
+          } else if (delimiter.is_bytes()) {
+              lexer->mark_end(lexer);  // Mark the position before the backslash.
+              lexer->advance(lexer, false);
+              if (lexer->lookahead == 'N' || lexer->lookahead == 'u' || lexer->lookahead == 'U') {
+                // In bytes string, \N{...}, \uXXXX and \UXXXXXXXX are not escape sequences
+                // https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
+                lexer->advance(lexer, false);
+              } else {
+                  lexer->result_symbol = STRING_CONTENT;
+                  return has_content;  // The chunk ends at the mark placed before the backslash.
+              }
+          } else {  // Ordinary string: end the chunk before the backslash.
+            lexer->mark_end(lexer);
+            lexer->result_symbol = STRING_CONTENT;
+            return has_content;
+          }
+        } else if (lexer->lookahead == end_character) {
+          if (delimiter.is_triple()) {  // A triple-quoted string closes only on three consecutive quote characters.
+            lexer->mark_end(lexer);  // Mark the position before the first quote.
+            lexer->advance(lexer, false);
+            if (lexer->lookahead == end_character) {
+              lexer->advance(lexer, false);
+              if (lexer->lookahead == end_character) {
+                if (has_content) {
+                  lexer->result_symbol = STRING_CONTENT;  // Report the pending content; it ends at the mark placed before the quotes.
+                } else {
+                  lexer->advance(lexer, false);  // Consume the third quote.
+                  lexer->mark_end(lexer);
+                  delimiter_stack.pop_back();  // The string is closed.
+                  lexer->result_symbol = STRING_END;
+                }
+                return true;
+              } else {  // Only two quotes in a row: they are part of the content.
+                lexer->mark_end(lexer);
+                lexer->result_symbol = STRING_CONTENT;
+                return true;
+              }
+            } else {  // A lone quote inside a triple-quoted string is content.
+              lexer->mark_end(lexer);
+              lexer->result_symbol = STRING_CONTENT;
+              return true;
+            }
+          } else {  // Single-quoted form: the first matching quote closes the string.
+            if (has_content) {
+              lexer->result_symbol = STRING_CONTENT;  // Report the pending content; the quote itself is not consumed here.
+            } else {
+              lexer->advance(lexer, false);
+              delimiter_stack.pop_back();
+              lexer->result_symbol = STRING_END;
+            }
+            lexer->mark_end(lexer);
+            return true;
+          }
+        } else if (lexer->lookahead == '\n' && has_content && !delimiter.is_triple()) {  // A newline after content in a non-triple string: no token is produced.
+          return false;
+        }
+        advance(lexer);
+        has_content = true;
+      }
+    }
+
+    lexer->mark_end(lexer);  // Marked before the whitespace scan below; NOTE(review): presumably so NEWLINE/INDENT/DEDENT do not span the skipped text - confirm.
+
+    bool found_end_of_line = false;
+    uint32_t indent_length = 0;  // Indentation measured on the most recent line seen.
+    int32_t first_comment_indent_length = -1;  // Indentation of the first comment skipped, or -1 if none.
+    for (;;) {
+      if (lexer->lookahead == '\n') {
+        found_end_of_line = true;
+        indent_length = 0;
+        skip(lexer);
+      } else if (lexer->lookahead == ' ') {
+        indent_length++;
+        skip(lexer);
+      } else if (lexer->lookahead == '\r') {
+        indent_length = 0;
+        skip(lexer);
+      } else if (lexer->lookahead == '\t') {
+        indent_length += 8;  // A tab counts as 8 columns.
+        skip(lexer);
+      } else if (lexer->lookahead == '#') {
+        if (first_comment_indent_length == -1) {
+          first_comment_indent_length = (int32_t)indent_length;
+        }
+        while (lexer->lookahead && lexer->lookahead != '\n') {  // Skip the rest of the comment line.
+          skip(lexer);
+        }
+        skip(lexer);
+        indent_length = 0;
+      } else if (lexer->lookahead == '\\') {  // Backslash line continuation.
+        skip(lexer);
+        if (lexer->lookahead == '\r') {
+          skip(lexer);
+        }
+        if (lexer->lookahead == '\n') {
+          skip(lexer);
+        } else {
+          return false;  // A backslash not followed by a line break: no token from this scanner.
+        }
+      } else if (lexer->lookahead == '\f') {
+        indent_length = 0;
+        skip(lexer);
+      } else if (lexer->lookahead == 0) {  // A zero lookahead is handled like an end of line with no indentation.
+        indent_length = 0;
+        found_end_of_line = true;
+        break;
+      } else {
+        break;
+      }
+    }
+
+    if (found_end_of_line) {
+      if (!indent_length_stack.empty()) {
+        uint16_t current_indent_length = indent_length_stack.back();
+
+        if (
+          valid_symbols[INDENT] &&
+          indent_length > current_indent_length
+        ) {
+          indent_length_stack.push_back(indent_length);  // Enter a deeper block.
+          lexer->result_symbol = INDENT;
+          return true;
+        }
+
+        if (
+          valid_symbols[DEDENT] &&
+          indent_length < current_indent_length &&
+
+          // Wait to create a dedent token until we've consumed any comments
+          // whose indentation matches the current block.
+          first_comment_indent_length < (int32_t)current_indent_length
+        ) {
+          indent_length_stack.pop_back();  // Leave one block level per DEDENT token.
+          lexer->result_symbol = DEDENT;
+          return true;
+        }
+      }
+
+      if (valid_symbols[NEWLINE]) {
+        lexer->result_symbol = NEWLINE;
+        return true;
+      }
+    }
+
+    if (first_comment_indent_length == -1 && valid_symbols[STRING_START]) {  // Try a string start only when no comment was skipped above.
+      Delimiter delimiter;
+
+      bool has_flags = false;  // True once at least one prefix letter has been consumed.
+      while (lexer->lookahead) {
+        if (lexer->lookahead == 'f' || lexer->lookahead == 'F') {
+          delimiter.set_format();
+        } else if (lexer->lookahead == 'r' || lexer->lookahead == 'R') {
+          delimiter.set_raw();
+        } else if (lexer->lookahead == 'b' || lexer->lookahead == 'B') {
+          delimiter.set_bytes();
+        } else if (lexer->lookahead != 'u' && lexer->lookahead != 'U') {  // u/U is accepted as a prefix letter but sets no flag.
+          break;
+        }
+        has_flags = true;
+        advance(lexer);
+      }
+
+      if (lexer->lookahead == '`') {
+        delimiter.set_end_character('`');
+        advance(lexer);
+        lexer->mark_end(lexer);
+      } else if (lexer->lookahead == '\'') {
+        delimiter.set_end_character('\'');
+        advance(lexer);
+        lexer->mark_end(lexer);  // Token ends after one quote unless all three are found below.
+        if (lexer->lookahead == '\'') {
+          advance(lexer);
+          if (lexer->lookahead == '\'') {
+            advance(lexer);
+            lexer->mark_end(lexer);  // Three quotes: extend the token and flag the string as triple-quoted.
+            delimiter.set_triple();
+          }
+        }
+      } else if (lexer->lookahead == '"') {
+        delimiter.set_end_character('"');
+        advance(lexer);
+        lexer->mark_end(lexer);  // Token ends after one quote unless all three are found below.
+        if (lexer->lookahead == '"') {
+          advance(lexer);
+          if (lexer->lookahead == '"') {
+            advance(lexer);
+            lexer->mark_end(lexer);  // Three quotes: extend the token and flag the string as triple-quoted.
+            delimiter.set_triple();
+          }
+        }
+      }
+
+      if (delimiter.end_character()) {  // A quote was found: the string is now open.
+        delimiter_stack.push_back(delimiter);
+        lexer->result_symbol = STRING_START;
+        return true;
+      } else if (has_flags) {  // Prefix letters with no quote after them: not a string.
+        return false;
+      }
+    }
+
+    return false;
+  }
+
+  vector<uint16_t> indent_length_stack;
+  vector<Delimiter> delimiter_stack;
+};
+
+}
+
+extern "C" {
+
+void *tree_sitter_python_external_scanner_create() {  // Allocates a Scanner; the returned pointer is the payload passed to the functions below.
+  return new Scanner();
+}
+
+bool tree_sitter_python_external_scanner_scan(void *payload, TSLexer *lexer,
+                                            const bool *valid_symbols) {  // Forwards to Scanner::scan.
+  Scanner *scanner = static_cast<Scanner *>(payload);
+  return scanner->scan(lexer, valid_symbols);
+}
+
+unsigned tree_sitter_python_external_scanner_serialize(void *payload, char *buffer) {  // Forwards to Scanner::serialize; returns the number of bytes written to buffer.
+  Scanner *scanner = static_cast<Scanner *>(payload);
+  return scanner->serialize(buffer);
+}
+
+void tree_sitter_python_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {  // Forwards to Scanner::deserialize.
+  Scanner *scanner = static_cast<Scanner *>(payload);
+  scanner->deserialize(buffer, length);
+}
+
+void tree_sitter_python_external_scanner_destroy(void *payload) {  // Frees the Scanner allocated by the create function.
+  Scanner *scanner = static_cast<Scanner *>(payload);
+  delete scanner;
+}
+
+}
--- a/python/extractor/tsg-python/tree-sitter-python/src/tree_sitter/parser.h
+++ b/python/extractor/tsg-python/tree-sitter-python/src/tree_sitter/parser.h
@@ -0,0 +1,224 @@
+#ifndef TREE_SITTER_PARSER_H_
+#define TREE_SITTER_PARSER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#define ts_builtin_sym_error ((TSSymbol)-1)
+#define ts_builtin_sym_end 0
+#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024
+
+typedef uint16_t TSStateId;
+
+#ifndef TREE_SITTER_API_H_
+typedef uint16_t TSSymbol;
+typedef uint16_t TSFieldId;
+typedef struct TSLanguage TSLanguage;
+#endif
+
+typedef struct {
+  TSFieldId field_id;
+  uint8_t child_index;
+  bool inherited;
+} TSFieldMapEntry;
+
+typedef struct {
+  uint16_t index;
+  uint16_t length;
+} TSFieldMapSlice;
+
+typedef struct {
+  bool visible;
+  bool named;
+  bool supertype;
+} TSSymbolMetadata;
+
+typedef struct TSLexer TSLexer;
+
+struct TSLexer {
+  int32_t lookahead;  // Character currently being examined; START_LEXER copies it into a local before each state.
+  TSSymbol result_symbol;  // Assigned by ACCEPT_TOKEN to the recognized symbol.
+  void (*advance)(TSLexer *, bool);  // Moves to the next character; the bool is the skip flag set by the SKIP macro.
+  void (*mark_end)(TSLexer *);  // Called by ACCEPT_TOKEN; NOTE(review): presumably records the current position as the token end - confirm.
+  uint32_t (*get_column)(TSLexer *);
+  bool (*is_at_included_range_start)(const TSLexer *);
+  bool (*eof)(const TSLexer *);
+};
+
+typedef enum {
+  TSParseActionTypeShift,
+  TSParseActionTypeReduce,
+  TSParseActionTypeAccept,
+  TSParseActionTypeRecover,
+} TSParseActionType;
+
+typedef union {
+  struct {
+    uint8_t type;
+    TSStateId state;
+    bool extra;
+    bool repetition;
+  } shift;
+  struct {
+    uint8_t type;
+    uint8_t child_count;
+    TSSymbol symbol;
+    int16_t dynamic_precedence;
+    uint16_t production_id;
+  } reduce;
+  uint8_t type;
+} TSParseAction;
+
+typedef struct {
+  uint16_t lex_state;
+  uint16_t external_lex_state;
+} TSLexMode;
+
+typedef union {
+  TSParseAction action;
+  struct {
+    uint8_t count;
+    bool reusable;
+  } entry;
+} TSParseActionEntry;
+
+struct TSLanguage {
+  uint32_t version;
+  uint32_t symbol_count;
+  uint32_t alias_count;
+  uint32_t token_count;
+  uint32_t external_token_count;
+  uint32_t state_count;
+  uint32_t large_state_count;
+  uint32_t production_id_count;
+  uint32_t field_count;
+  uint16_t max_alias_sequence_length;
+  const uint16_t *parse_table;
+  const uint16_t *small_parse_table;
+  const uint32_t *small_parse_table_map;
+  const TSParseActionEntry *parse_actions;
+  const char * const *symbol_names;
+  const char * const *field_names;
+  const TSFieldMapSlice *field_map_slices;
+  const TSFieldMapEntry *field_map_entries;
+  const TSSymbolMetadata *symbol_metadata;
+  const TSSymbol *public_symbol_map;
+  const uint16_t *alias_map;
+  const TSSymbol *alias_sequences;
+  const TSLexMode *lex_modes;
+  bool (*lex_fn)(TSLexer *, TSStateId);
+  bool (*keyword_lex_fn)(TSLexer *, TSStateId);
+  TSSymbol keyword_capture_token;
+  struct {  // Hooks for a hand-written external scanner.
+    const bool *states;
+    const TSSymbol *symbol_map;
+    void *(*create)(void);  // Returns the opaque payload handed to the other hooks.
+    void (*destroy)(void *);
+    bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist);  // payload, lexer, and a per-symbol array of booleans.
+    unsigned (*serialize)(void *, char *);  // payload and output buffer; returns an unsigned value.
+    void (*deserialize)(void *, const char *, unsigned);  // payload, input buffer and an unsigned value.
+  } external_scanner;
+  const TSStateId *primary_state_ids;
+};
+
+/*
+ *  Lexer Macros
+ */
+
+#define START_LEXER()           \
+  bool result = false;          \
+  bool skip = false;            \
+  bool eof = false;             \
+  int32_t lookahead;            \
+  goto start;                   \
+  next_state:                   \
+  lexer->advance(lexer, skip);  \
+  start:                        \
+  skip = false;                 \
+  lookahead = lexer->lookahead;
+
+#define ADVANCE(state_value) \
+  {                          \
+    state = state_value;     \
+    goto next_state;         \
+  }
+
+#define SKIP(state_value) \
+  {                       \
+    skip = true;          \
+    state = state_value;  \
+    goto next_state;      \
+  }
+
+#define ACCEPT_TOKEN(symbol_value)     \
+  result = true;                       \
+  lexer->result_symbol = symbol_value; \
+  lexer->mark_end(lexer);
+
+#define END_STATE() return result;
+
+/*
+ *  Parse Table Macros
+ */
+
+#define SMALL_STATE(id) id - LARGE_STATE_COUNT
+
+#define STATE(id) id
+
+#define ACTIONS(id) id
+
+#define SHIFT(state_value)            \
+  {{                                  \
+    .shift = {                        \
+      .type = TSParseActionTypeShift, \
+      .state = state_value            \
+    }                                 \
+  }}
+
+#define SHIFT_REPEAT(state_value)     \
+  {{                                  \
+    .shift = {                        \
+      .type = TSParseActionTypeShift, \
+      .state = state_value,           \
+      .repetition = true              \
+    }                                 \
+  }}
+
+#define SHIFT_EXTRA()                 \
+  {{                                  \
+    .shift = {                        \
+      .type = TSParseActionTypeShift, \
+      .extra = true                   \
+    }                                 \
+  }}
+
+#define REDUCE(symbol_val, child_count_val, ...) \
+  {{                                             \
+    .reduce = {                                  \
+      .type = TSParseActionTypeReduce,           \
+      .symbol = symbol_val,                      \
+      .child_count = child_count_val,            \
+      __VA_ARGS__                                \
+    },                                           \
+  }}
+
+#define RECOVER()                    \
+  {{                                 \
+    .type = TSParseActionTypeRecover \
+  }}
+
+#define ACCEPT_INPUT()              \
+  {{                                \
+    .type = TSParseActionTypeAccept \
+  }}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // TREE_SITTER_PARSER_H_