Python: Copy Python extractor to codeql repo

This commit is contained in:
Taus
2024-02-28 15:15:21 +00:00
parent 297a17975d
commit 6dec323cfc
369 changed files with 165346 additions and 0 deletions

View File

@@ -0,0 +1 @@
target/

View File

@@ -0,0 +1,16 @@
load("@tsg_python_crate_index//:defs.bzl", "aliases", "all_crate_deps")
load("//:common.bzl", "codeql_rust_binary")
codeql_rust_binary(
name = "tsg-python",
srcs = ["src/main.rs"],
aliases = aliases(),
data = ["python.tsg"],
proc_macro_deps = all_crate_deps(
proc_macro = True,
),
visibility = ["//visibility:public"],
deps = all_crate_deps(
normal = True,
) + ["//extractor-python/tsg-python/tree-sitter-python"],
)

File diff suppressed because it is too large Load Diff

331
python/extractor/tsg-python/Cargo.lock generated Normal file
View File

@@ -0,0 +1,331 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "ahash"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "739f4a8db6605981345c5654f3a85b056ce52f37a39d34da03f25bf2151ea16e"
[[package]]
name = "aho-corasick"
version = "0.7.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
dependencies = [
"memchr",
]
[[package]]
name = "ansi_term"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b"
dependencies = [
"winapi",
]
[[package]]
name = "anyhow"
version = "1.0.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61604a8f862e1d5c3229fdd78f8b02c68dcf73a4c4b05fd636d12240aaa242c1"
[[package]]
name = "atty"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
dependencies = [
"hermit-abi",
"libc",
"winapi",
]
[[package]]
name = "bitflags"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "cc"
version = "1.0.70"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d26a6ce4b6a484fa3edb70f7efa6fc430fd2b87285fe8b84304fd0936faa0dc0"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "clap"
version = "2.33.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002"
dependencies = [
"ansi_term",
"atty",
"bitflags",
"strsim",
"textwrap",
"unicode-width",
"vec_map",
]
[[package]]
name = "hashbrown"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04"
dependencies = [
"ahash",
]
[[package]]
name = "hermit-abi"
version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
dependencies = [
"libc",
]
[[package]]
name = "itoa"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35"
[[package]]
name = "libc"
version = "0.2.101"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3cb00336871be5ed2c8ed44b60ae9959dc5b9f08539422ed43f09e34ecaeba21"
[[package]]
name = "log"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710"
dependencies = [
"cfg-if",
]
[[package]]
name = "memchr"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a"
[[package]]
name = "proc-macro2"
version = "1.0.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9f5105d4fdaab20335ca9565e106a5d9b82b6219b5ba735731124ac6711d23d"
dependencies = [
"unicode-xid",
]
[[package]]
name = "quote"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7"
dependencies = [
"proc-macro2",
]
[[package]]
name = "regex"
version = "1.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.6.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
[[package]]
name = "ryu"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73b4b750c782965c211b42f022f59af1fbceabdd026623714f104152f1ec149f"
[[package]]
name = "serde"
version = "1.0.136"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce31e24b01e1e524df96f1c2fdd054405f8d7376249a5110886fb4b658484789"
[[package]]
name = "serde_json"
version = "1.0.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e8d9fa5c3b304765ce1fd9c4c8a3de2c8db365a5b91be52f186efc675681d95"
dependencies = [
"itoa",
"ryu",
"serde",
]
[[package]]
name = "smallvec"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e"
[[package]]
name = "string-interner"
version = "0.12.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "383196d1876517ee6f9f0864d1fc1070331b803335d3c6daaa04bbcccd823c08"
dependencies = [
"cfg-if",
"hashbrown",
]
[[package]]
name = "strsim"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
[[package]]
name = "syn"
version = "1.0.76"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6f107db402c2c2055242dbf4d2af0e69197202e9faacbef9571bbe47f5a1b84"
dependencies = [
"proc-macro2",
"quote",
"unicode-xid",
]
[[package]]
name = "textwrap"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
dependencies = [
"unicode-width",
]
[[package]]
name = "thiserror"
version = "1.0.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "602eca064b2d83369e2b2f34b09c70b605402801927c65c11071ac911d299b88"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bad553cc2c78e8de258400763a647e80e6d1b31ee237275d756f6836d204494c"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "tree-sitter"
version = "0.20.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e34327f8eac545e3f037382471b2b19367725a242bba7bc45edb9efb49fe39a"
dependencies = [
"cc",
"regex",
]
[[package]]
name = "tree-sitter-graph"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "639d21e886f581d293de5f5081f09af003c54607ff3fa85efa159b243ba1f97a"
dependencies = [
"log",
"regex",
"serde",
"serde_json",
"smallvec",
"string-interner",
"thiserror",
"tree-sitter",
]
[[package]]
name = "tree-sitter-python"
version = "0.19.0"
dependencies = [
"cc",
"tree-sitter",
]
[[package]]
name = "tsg-python"
version = "0.1.0"
dependencies = [
"anyhow",
"clap",
"regex",
"smallvec",
"string-interner",
"thiserror",
"tree-sitter",
"tree-sitter-graph",
"tree-sitter-python",
]
[[package]]
name = "unicode-width"
version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3"
[[package]]
name = "unicode-xid"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
[[package]]
name = "vec_map"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"

View File

@@ -0,0 +1,26 @@
[workspace]
[package]
name = "tsg-python"
version = "0.1.0"
authors = ["Taus Brock-Nannestad <tausbn@github.com>"]
edition = "2018"
# When changing/updating these, the `Cargo.Bazel.lock` file has to be regenerated.
# Check out the documentation at https://bazelbuild.github.io/rules_rust/crate_universe.html#repinning--updating-dependencies
# for how to do so. The bazel repository for the tsg-python project is called `tsg_python_crate_index`,
# and instead of calling `bazel sync`, `./build --bazel sync` should be used instead, to always use the correct bazel version.
[dependencies]
anyhow = "1.0"
regex = "1"
smallvec = { version="1.6", features=["union"] }
thiserror = "1.0"
tree-sitter = "0.20.4"
tree-sitter-graph = "0.7.0"
tree-sitter-python = {path = "tree-sitter-python"}
clap = "2.32"
[dependencies.string-interner]
version = "0.12"
default-features = false
features = ["std", "inline-more", "backends"]

View File

@@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2021 stack-graphs authors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,624 @@
# `tsg-python`
Run `tree-sitter-graph` queries against Python source files.
## How to build
Run `cargo build --release`. The resulting binary can be found in the `target/release` directory.
## How to invoke
`tsg-python tsg-file.tsg python-file.py`
Output is emitted on `stdout`.
If you're impatient, you can also build and run using `cargo run` followed by the arguments given
above.
## How to use
To use `tsg-python`, you must have an appropriate `.tsg` file containing the directions for how to
construct a Python AST from the output of `tree-sitter-python`.
### A quick primer on `tree-sitter-graph` syntax
A file consists of a sequence of stanzas. Each stanza consists of a query (using the [tree-sitter
query syntax](https://tree-sitter.github.io/tree-sitter/using-parsers#pattern-matching-with-queries)) and a sequence of nodes and edges to define for each query match in the source file.
Queries will (almost always) include captures like `@foo`, which means any occurrence of `@foo` in
the corresponding stanza will refer to a particular syntax node in the bit that the query matches.
Stanzas are executed in order, and a stanza is only run when all possible matches have been
exhausted for all preceding stanzas. (Since the syntax tree that is matched against never changes,
execution never jumps back to an earlier stanza.)
Inside stanzas, scoped variables have the form `@foo.bar` where `@foo` is a capture in the
associated query, and `bar` is an identifier. This should be thought of as a variable that is
"attached" to the `tree-sitter` node that `@foo` refers to. If `@baz` is another reference to the same node as
`@foo` (perhaps even in a different stanza), then `@baz.bar` will be a reference to the _same_
scoped variable. This permits information to be linked across different stanzas.
Assigning a value to a scoped variable is done using the syntax `let @foo.bar = some-expr` (`let`
for immutable variables, `var` for mutable variables, which may be mutated using `set`). Note that
scoped variables only exist during the execution of the `.tsg` file's stanzas, and are not
immediately part of the output graph.
To actually produce output, we must specify some `node`s or `edge`s and possibly `attr`ibutes
thereof.
To produce a node, we declare `node @foo.bar` (which is equivalent to `let @foo.bar = (node)`, the
right hand side being a function that creates a new node). In the output, nodes are simply integers.
To assign an attribute to a node, we write `attr (@foo.bar) identifier = expr`, for some suitable
choice of `identifier` and `expr`. In the output, attributes are given alongside nodes in a `key:
value` notation.
For edges and their attributes, the syntax is similar:
`edge @foo.bar -> @baz.quux`
and
`attr (@foo.bar -> @baz.quux) identifier = expr`.
Note that it is an error to declare the same node, edge, (or attribute of either of these) twice.
### The general scheme:
For fields that point to some literal value
```tsg
<some capture involving @nd>
{
attr (@nd.node) field_name = some_value
}
```
For fields that point directly to an AST node:
```tsg
<some capture involving @parent and @child>
{
attr (@parent.node) field_name = @child.node
}
```
For fields that point to lists of AST nodes:
```tsg
<some capture involving @parent and @child>
{
edge @parent.node -> @child.node
attr (@parent.node -> @child.node) field_name = <index of @child in the resulting list>
}
```
Scoped variables of the form `@foo.node` are used to tie the AST together, and so it's important
that this is set for nodes that map directly onto `tree-sitter-python` nodes. Thus, for instance
for binary operators, the stanza could look as follows:
```tsg
(binary_operator
left: (_) @left
right: (_) @right
) @bin
{
attr (@bin.node) left = @left.node
attr (@bin.node) right = @right.node
}
```
Note in particular the `@left.node` and `@right.node` references. In order for the above stanza to
work, these scoped variables _must_ exist and point to suitable graph `node`s.
In practice, the setting up of all of these scoped variables (and creation of output graph nodes)
will happen at the very top of the `.tsg` file, to ensure that these scoped variables are defined
for the remainder of the file.
To ease the creation of these variables, we have the `ast-node` convenience function. For binary
operators, it would take the following form:
```tsg
(binary_operator) @bin
{
let @bin.node = (ast-node @bin "BinOp")
}
```
Here, the two arguments are respectively
- a `tree-sitter` node (which is used to set the location of `@bin.node`), and
- a string (which is used to set the "kind" of `@bin.node`)
In effect, the call
```tsg
let @bin.node = (ast-node @bin "BinOp")
```
is exactly equivalent to the more verbose
```tsg
node @bin.node ; or equivalently `let @bin.node = (node)`
attr (@bin.node) _location = (location @bin)
attr (@bin.node) _kind = "BinOp"
```
As the above suggests, attributes that start with an underscore are interpreted in a special way
when reconstructing the AST.
### Special attributes
#### The `_kind` attribute (mandatory)
Should be set to a string consisting of the name of the corresponding Python AST class. This
information will be used to build the AST, and so it is an error if this is left out.
Generally, this (and `_location`) will be set using the `ast-node` function.
#### The `_skip_to` attribute (optional)
This is used to indicate that the present graph node should _not_ be turned into an AST node, but that the
graph node contained in this attribute should be used instead. That graph node may _also_ contain a
`_skip_to` field, in which case the entire chain is followed until a node is encountered that does
not have a `_skip_to` field. (Please ensure that there are no cycles of `_skip_to` pointers.)
Example:
In `tree-sitter-python`, assignment statements are a form of `expression_statement`, and this node
type also encompasses things like expressions (e.g. `2+2`) appearing at the level of statements. In
the internal Python AST, we need to separate the assignment from such expressions. The assignment should be present as an `Assign` node, but `2+2` should be
wrapped in an `Expr` node. To solve this, we create an `Expr` for each `expression_statement`, and
then explicitly skip this node in the AST if it contains an `assignment`. This is implemented as
follows:
```tsg
(expression_statement (assignment) @inner) @outer
{
attr (@outer.node) _skip_to = @inner.node
}
```
#### The `_location` attribute (optional)
This attribute is used to indicate the location of the corresponding AST node. As with `_kind` it
should be set using the `ast-node` function.
#### The `_location_start` and `_location_end` attributes (optional)
These attributes are used to indicate the start or end of the location of the AST node. They can be
used for nodes where `_location` has already been set, in which case they override the relevant part
of that location. For an example of this see the worked example on `if` statements below.
#### The `_start_line`, `_start_column`, `_end_line`, and `_end_column` attributes (optional)
These can be used to set the start or end position of an AST node with even greater detail than the
preceding attributes. As with the `_location_start` and `_location_end` attributes, these will
override the values of the corresponding part of the location.
In general, these attributes should be used sparingly, as they are quite verbose.
### Built-in functions
#### `(source-text` _`tree-sitter-node`_`)` (built-in)
This function returns the source text of the `tree-sitter` node it receives as an argument.
Example:
Extracting the operator from a binary expression:
```tsg
(binary_operator
operator: _ @op
) @bin
{
attr (@bin.node) op = (source-text @op)
}
```
#### `(ast-node` _`tree-sitter-node`_ _`string`_`)` (`tsg-python` only)
Creates a new graph node with the given `_kind` and sets the `_location` attribute to the location
of the given `tree-sitter` node.
#### `(child-index` _`tree-sitter-node`_`)` (built-in)
Returns the index of the given `tree-sitter` node in its parent.
#### `(location` _`tree-sitter-node`_`)` (`tsg-python` only)
Returns the location of the given `tree-sitter` node as a list containing four integers
corresponding to the start row and column, followed by the end row and column.
#### `(location-start` _`tree-sitter-node`_`)` and `(location-end` _`tree-sitter-node`_`)` (`tsg-python` only)
Returns the start or end position (row followed by column) of the given `tree-sitter` node (as a list containing two integers).
#### `start-row`, `start-column`, `end-row`, and `end-column` (built-in)
(All of these take a `tree-sitter-node` as an argument.)
Returns an integer corresponding to the appropriate part of the location of the given `tree-sitter` node.
### A worked example: `if` statements
The way the current parser handles `if` statements means we cannot do a straight mapping from the tree-sitter grammar to the AST. In particular, a block of code such as
```python
if x: do_x
elif y: do_y
elif z: do_z
else: do_else
```
is unrolled into the following form by the current parser:
```python
if x: do_x
else:
if y: do_y
else:
if z: do_z
else: do_else
```
This means we have to synthesise nodes for the inner `if` statements.
However, this should be straightforward -- we simply have to make sure that `elif_clause`s also
produce the appropriate kind of node, and that everything is linked up correctly.
For reference, here are the productions for `if_statement`, `else_clause` and `elif_clause` in
`tree-sitter-python`:
```javascript
if_statement: $ => seq(
'if',
field('condition', $.expression),
':',
field('consequence', $._suite),
repeat(field('alternative', $.elif_clause)),
optional(field('alternative', $.else_clause))
),
elif_clause: $ => seq(
'elif',
field('condition', $.expression),
':',
field('consequence', $._suite)
),
else_clause: $ => seq(
'else',
':',
field('body', $._suite)
),
```
First, we'll set up all of the relevant nodes with corresponding nodes in the AST:
```tsg
(if_statement)
@tree_sitter_node
{
let @tree_sitter_node.node = (ast-node @tree_sitter_node "If")
}
```
This ensures that we can reference the `.node` scoped variable on the above nodes.
(We named the capture `@tree_sitter_node` above to make it more clear, but in general something like
`@if` would be more appropriate.)
In particular, since we want `elif`s to be turned into nested `if`s, it makes sense to apply the
`If` kind to `elif_clauses` as well:
```tsg
(elif_clause) @elif
{
let @elif.node = (ast-node @elif "If")
}
```
Whenever we refer to a node, we must ensure that it has first been defined, however there is no
need to do this separately for each node.
Next, for both `if`s and `elif`s, we want to record the `test` and the `body`. The `test` we do as follows:
```tsg
[
(if_statement
condition: (_) @test) @if
(elif_clause
condition: (_) @test) @if
]
{
attr (@if.node) test = @test.node
}
```
For `body`, in the Python AST this is simply a list of nodes, whereas for the `tree-sitter` parse tree, it
will contain a `block` node. Because there is no Python AST equivalent for `block`, we skip over
this node when linking the `if`-statement to its body:
```tsg
[
(if_statement
consequence: (block (_) @stmt)) @parent
(elif_clause
consequence: (block (_) @stmt)) @parent
]
{
edge @parent.node -> @stmt.node
attr (@parent.node -> @stmt.node) body = (child-index @stmt)
}
```
The above shows how we handle fields containing lists of items: we add an edge from the parent node
to each child node, and put an attribute on that edge. The name of the attribute will be the name of
the field, and the value will be the index of this node among the children of its `tree-sitter` parent.
Now we can begin unwinding the nesting. First of all, the first `elif` should be the `orelse` of the
initial `if_statement`:
```tsg
(if_statement
consequence: (_)
.
(elif_clause) @elif
) @if
{
edge @if.node -> @elif.node
attr (@if.node -> @elif.node) orelse = 0
}
```
(The `.` acts as an anchor, forcing its two neighbours to be adjacent in the tree. So in this case,
we get the first `elif` after the body of the `if`)
Next, whenever we have two adjacent `elif`s, we want the `orelse` of the first one to be the second one:
```tsg
(
(elif_clause) @elif1
.
(elif_clause) @elif2
)
{
edge @elif1.node -> @elif2.node
attr (@elif1.node -> @elif2.node) orelse = 0
}
```
Finally, the `else` branch of the outermost `if` should be the `orelse` of the _last_ `elif`:
```tsg
(if_statement
(elif_clause) @elif
.
alternative: (else_clause body: (block (_) @orelse))
)
{
edge @elif.node -> @orelse.node
attr (@elif.node -> @orelse.node) orelse = (child-index @orelse)
}
```
The above gives us the correct tree structure, but we're still missing a few bits (such as
locations). To capture location information we use the following stanza:
```tsg
[
(if_statement
condition: (_)
":" @colon) @if
(elif_clause
condition: (_)
":" @colon) @if
]
{
attr (@if.node) _location_end = (location-end @colon)
}
```
Because `tree-sitter-python` disagrees with the Python AST about the location of the `If` node, we
have to adjust it. We do this by setting the `_location_end` attribute to the end of the `:` token.
(Note that the _start_ of this location was set when we called `ast-node` above. As we don't have to
change this part of the location, we simply leave it as is.)
### Synthesizing nodes
In many cases it will be sufficient to hook up AST nodes to the corresponding `tree-sitter` nodes,
but occasionally we want the tree structure to be different. One example of this would be the
`class` statement. For instance, a class declaration such as
```python
class Foo(int, object, metaclass=type):
x = 5
```
has a `tree-sitter-python` parse tree that looks like this:
```
module [0, 0] - [2, 0]
class_definition [0, 0] - [1, 9]
name: identifier [0, 6] - [0, 9]
superclasses: argument_list [0, 9] - [0, 38]
identifier [0, 10] - [0, 13]
identifier [0, 15] - [0, 21]
keyword_argument [0, 23] - [0, 37]
name: identifier [0, 23] - [0, 32]
value: identifier [0, 33] - [0, 37]
body: block [1, 4] - [1, 9]
expression_statement [1, 4] - [1, 9]
assignment [1, 4] - [1, 9]
left: identifier [1, 4] - [1, 5]
right: integer [1, 8] - [1, 9]
```
but the Python AST looks like _this_:
```
Module: [1, 0] - [3, 0]
body: [
Assign: [1, 0] - [1, 39]
targets: [
Name: [1, 6] - [1, 9]
variable: Variable('Foo', None)
ctx: Store
]
value:
ClassExpr: [1, 0] - [1, 39]
name: 'Foo'
bases: [
Name: [1, 10] - [1, 13]
variable: Variable('int', None)
ctx: Load
Name: [1, 15] - [1, 21]
variable: Variable('object', None)
ctx: Load
]
keywords: [
keyword: [1, 23] - [1, 37]
arg: 'metaclass'
value:
Name: [1, 33] - [1, 37]
variable: Variable('type', None)
ctx: Load
]
inner_scope:
Class: [1, 0] - [1, 39]
name: 'Foo'
body: [
Assign: [2, 4] - [2, 9]
targets: [
Name: [2, 4] - [2, 5]
variable: Variable('x', None)
ctx: Store
]
value:
Num: [2, 8] - [2, 9]
n: 5
text: '5'
]
]
```
In particular, we unroll the `class` statement into an explicit assignment (which is the top node
for this statement in the AST) of a synthetic `ClassExpr`, which in turn contains a `Class` node
(which holds things like the body of the class). This requires too many nodes to simply reuse what's given to
us by `tree-sitter-python`, and so we must _synthesize_ additional nodes.
First of all, let us set up the outer node to be an `Assign` node:
```tsg
(class_definition) @class
{
let @class.node = (ast-node @class "Assign")
}
```
Next, we can do most of the work in a single stanza:
```tsg
(class_definition
name: (identifier) @name
":" @colon
) @class
{
; To make it clearer that the outer node is an assignment, we create an alias for it.
let @class.assign = @class.node
; Synthesized nodes: the left-hand side of the assignment, the class_expr node, and the class
; node.
let @class.assign_lhs = (ast-node @name "Name")
let @class.class_expr = (ast-node @class "ClassExpr")
let @class.inner_scope = (ast-node @class "Class")
edge @class.assign -> @class.assign_lhs
attr (@class.assign -> @class.assign_lhs) targets = 0
attr (@class.assign) value = @class.class_expr
attr (@class.assign) _location_end = (location-end @colon)
let class_name = (source-text @name)
; The left-hand side of the assignment, a `Name`.
attr (@class.assign_lhs) variable = class_name
attr (@class.assign_lhs) ctx = "store"
; The right hand side of the assignment, a `ClassExpr`.
attr (@class.class_expr) name = class_name
attr (@class.class_expr) inner_scope = @class.inner_scope
; `bases` will be set elsewhere
; `keywords` will be set elsewhere
attr (@class.class_expr) _location_end = (location-end @colon)
; The inner scope of the class_expr, a `Class`.
attr (@class.inner_scope) name = class_name
; body will be set in a separate stanza.
attr (@class.inner_scope) _location_end = (location-end @colon)
}
```
Let's go over these lines bit by bit. First, we create an alias for the outermost node (which will
become an assignment node) in order to make it clearer that it's an assignment. Next, we create
_new_ nodes for the inner synthesized nodes. Note that we can't assign these to `@class.node` as
that already points to the node that will become the assignment node. Instead, we create new scoped
variables (with suitable names), and assign them nodes (with appropriate kinds and locations using
`ast-node`).
```tsg
; To make it clearer that the outer node is an assignment, we create an alias for it.
let @class.assign = @class.node
; Synthesized nodes: the left-hand side of the assignment, the class_expr node, and the class
; node.
let @class.assign_lhs = (ast-node @name "Name")
let @class.class_expr = (ast-node @class "ClassExpr")
let @class.inner_scope = (ast-node @class "Class")
```
Next, we set up the outer assignment:
```tsg
edge @class.assign -> @class.assign_lhs
attr (@class.assign -> @class.assign_lhs) targets = 0
attr (@class.assign) value = @class.class_expr
attr (@class.assign) _location_end = (location-end @colon)
```
The remaining nodes all contain a field that refers to the name of the class, so we put this in a local
variable for convenience:
```tsg
let class_name = (source-text @name)
```
We set up the left hand side of the assignment:
```tsg
; The left-hand side of the assignment, a `Name`.
attr (@class.assign_lhs) variable = class_name
attr (@class.assign_lhs) ctx = "store"
```
The `ClassExpr`:
```tsg
; The right hand side of the assignment, a `ClassExpr`.
attr (@class.class_expr) name = class_name
attr (@class.class_expr) inner_scope = @class.inner_scope
; `bases` will be set elsewhere
; `keywords` will be set elsewhere
attr (@class.class_expr) _location_end = (location-end @colon)
```
The `Class`:
```tsg
; The inner scope of the class_expr, a `Class`.
attr (@class.inner_scope) name = class_name
; body will be set elsewhere
attr (@class.inner_scope) _location_end = (location-end @colon)
```
The remaining stanzas take care of setting up the fields that contain lists of nodes, and these
follow the same scheme as before.
```tsg
; Class.body
(class_definition
body: (block (_) @stmt)
) @class
{
edge @class.inner_scope -> @stmt.node
attr (@class.inner_scope -> @stmt.node) body = (child-index @stmt)
}
; Class.bases
(class_definition
superclasses: (argument_list (identifier) @arg)
) @class
{
edge @class.class_expr -> @arg.node
attr (@class.class_expr -> @arg.node) bases = (child-index @arg)
attr (@arg.node) ctx = "load"
}
; Class.keywords
(class_definition
superclasses: (argument_list (keyword_argument) @arg)
) @class
{
edge @class.class_expr -> @arg.node
attr (@class.class_expr -> @arg.node) keywords = (child-index @arg)
}
```

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,7 @@
# This file specifies the Rust version used to develop and test the Python
# extractor. It is set to the lowest version of Rust we want to support.
[toolchain]
channel = "1.68"
profile = "minimal"
components = [ "rustfmt" ]

View File

@@ -0,0 +1,572 @@
// -*- coding: utf-8 -*-
// ------------------------------------------------------------------------------------------------
// Copyright © 2021, GitHub.
// Licensed under either of Apache License, Version 2.0, or MIT license, at your option.
// Please see the LICENSE-APACHE or LICENSE-MIT files in this distribution for license details.
// ------------------------------------------------------------------------------------------------
use std::path::Path;
use anyhow::anyhow;
use anyhow::Context as _;
use anyhow::Result;
use clap::App;
use clap::Arg;
use tree_sitter::Parser;
use tree_sitter_graph::ast::File;
use tree_sitter_graph::functions::Functions;
use tree_sitter_graph::ExecutionConfig;
use tree_sitter_graph::Identifier;
use tree_sitter_graph::NoCancellation;
use tree_sitter_graph::Variables;
const BUILD_VERSION: &'static str = env!("CARGO_PKG_VERSION");
/// Extra graph-construction functions made available to the TSG file
/// (`python.tsg`) on top of the `tree-sitter-graph` standard library.
/// Each `pub struct` below is registered under a hyphenated name in `main`.
pub mod extra_functions {
    use tree_sitter_graph::functions::{Function, Parameters};
    use tree_sitter_graph::graph::{Graph, Value};
    use tree_sitter_graph::{ExecutionError, Identifier};

    /// `(location node)`: the full source location of a syntax node.
    pub struct Location;

    /// Returns `[start-row, start-column, end-row, end-column]` of the syntax
    /// node referenced by `node`, as a list of integer values.
    fn get_location(node: Value, graph: &Graph) -> Result<Value, ExecutionError> {
        let node = graph[node.into_syntax_node_ref()?];
        let start = node.start_position();
        let end = node.end_position();
        Ok(Value::List(
            vec![start.row, start.column, end.row, end.column]
                .into_iter()
                .map(|v| Value::from(v as u32))
                .collect(),
        ))
    }

    impl Function for Location {
        fn call(
            &self,
            graph: &mut Graph,
            _source: &str,
            parameters: &mut dyn Parameters,
        ) -> Result<Value, ExecutionError> {
            let node = parameters.param()?;
            parameters.finish()?;
            get_location(node, graph)
        }
    }

    /// `(location-start node)`: `[row, column]` of the node's start position.
    pub struct LocationStart;

    impl Function for LocationStart {
        fn call(
            &self,
            graph: &mut Graph,
            _source: &str,
            parameters: &mut dyn Parameters,
        ) -> Result<Value, ExecutionError> {
            let node = graph[parameters.param()?.into_syntax_node_ref()?];
            parameters.finish()?;
            let start = node.start_position();
            Ok(Value::List(
                vec![start.row, start.column]
                    .into_iter()
                    .map(|v| Value::from(v as u32))
                    .collect(),
            ))
        }
    }

    /// `(location-end node)`: `[row, column]` of the node's end position.
    pub struct LocationEnd;

    impl Function for LocationEnd {
        fn call(
            &self,
            graph: &mut Graph,
            _source: &str,
            parameters: &mut dyn Parameters,
        ) -> Result<Value, ExecutionError> {
            let node = graph[parameters.param()?.into_syntax_node_ref()?];
            parameters.finish()?;
            let end = node.end_position();
            Ok(Value::List(
                vec![end.row, end.column]
                    .into_iter()
                    .map(|v| Value::from(v as u32))
                    .collect(),
            ))
        }
    }

    /// `(ast-node node kind)`: creates a fresh graph node whose `_location`
    /// attribute is the location of the given syntax node and whose `_kind`
    /// attribute is the given AST kind string.
    pub struct AstNode;

    impl Function for AstNode {
        fn call(
            &self,
            graph: &mut Graph,
            _source: &str,
            parameters: &mut dyn Parameters,
        ) -> Result<Value, ExecutionError> {
            let tree_sitter_node = parameters.param()?;
            let kind = parameters.param()?;
            parameters.finish()?;
            let node = graph.add_graph_node();
            let loc = get_location(tree_sitter_node, graph)?;
            // A freshly added node has no attributes, so these `add` calls can
            // only fail if the attribute names collide — report that clearly.
            graph[node]
                .attributes
                .add(Identifier::from("_location"), loc)
                .map_err(|_| {
                    ExecutionError::DuplicateAttribute(format!(
                        " _location on graph node ({:?})",
                        node
                    ))
                })?;
            graph[node]
                .attributes
                .add(Identifier::from("_kind"), kind)
                .map_err(|_| {
                    ExecutionError::DuplicateAttribute(format!(" _kind on graph node ({:?})", node))
                })?;
            Ok(Value::GraphNode(node))
        }
    }

    /// A struct representing the prefix on a Python string.
    struct Prefix {
        // Flag characters preceding the quotes (lowercased by `get_prefix`),
        // e.g. "rb" in `rb'''...'''`.
        flags: String,
        // The opening quote characters, e.g. `'`, `"`, `'''`, or `"""`.
        quotes: String,
    }

    impl Prefix {
        /// The complete prefix: flags followed by the opening quotes.
        fn full(&self) -> String {
            format!("{}{}", self.flags, self.quotes)
        }

        /// A copy of this prefix with any `f`/`F` (format-string) flag removed.
        // NOTE(review): `get_prefix` already lowercases `flags`, so the
        // `replace("F", "")` appears redundant — harmless either way.
        fn safe(&self) -> Prefix {
            Prefix {
                flags: self.flags.clone().replace("f", "").replace("F", ""),
                quotes: self.quotes.clone(),
            }
        }
    }

    /// Parses the prefix (flag characters plus opening quotes) of the source
    /// text `s` of a Python string literal.
    fn get_prefix(s: &str) -> Prefix {
        // Up to two of the string-prefix letters b/f/u/r, in either case.
        let flags_matcher = regex::Regex::new("^[bfurBFUR]{0,2}").unwrap();
        let mut end = 0;
        let flags = match flags_matcher.find(s) {
            Some(m) => {
                end = m.end();
                &s[m.start()..m.end()]
            }
            None => "",
        };
        // Check the longest quote forms first so `'''` is not mistaken for `'`.
        let mut quotes = "";
        if s[end..].starts_with("\"\"\"") {
            quotes = "\"\"\"";
        } else if s[end..].starts_with("'''") {
            quotes = "'''";
        } else if s[end..].starts_with('"') {
            quotes = "\"";
        } else if s[end..].starts_with('\'') {
            quotes = "'";
        } else if s[end..].starts_with('}') {
            // NOTE(review): presumably the trailing fragment of an f-string
            // that starts at the closing `}` of an interpolation hole —
            // confirm against the call sites in python.tsg.
            quotes = "}";
        }
        Prefix {
            flags: flags.to_lowercase().to_owned(),
            quotes: quotes.to_owned(),
        }
    }

    #[test]
    fn test_get_prefix() {
        let p = get_prefix("rb'''hello'''");
        assert_eq!(p.flags, "rb");
        assert_eq!(p.quotes, "'''");
        let p = get_prefix("Br\"\"\"hello\"\"\"");
        assert_eq!(p.flags, "Br");
        assert_eq!(p.quotes, "\"\"\"");
        let p = get_prefix("FR\"hello\"");
        assert_eq!(p.flags, "FR");
        assert_eq!(p.quotes, "\"");
        let p = get_prefix("uR'hello'");
        assert_eq!(p.flags, "uR");
        assert_eq!(p.quotes, "'");
        let p = get_prefix("''");
        assert_eq!(p.flags, "");
        assert_eq!(p.quotes, "'");
        let p = get_prefix("\"\"");
        assert_eq!(p.flags, "");
        assert_eq!(p.quotes, "\"");
        let p = get_prefix("\"\"\"\"\"\"");
        assert_eq!(p.flags, "");
        assert_eq!(p.quotes, "\"\"\"");
    }

    /// Strips the prefix (flags plus opening quotes) and the closing quotes
    /// from the literal source text, leaving only the string's contents.
    // NOTE(review): the `unwrap`s assume `s` is a well-formed string literal
    // (prefix and closing quotes both present) — guaranteed by the tree-sitter
    // grammar for the nodes this is called on, presumably; confirm.
    fn get_string_contents(s: String) -> String {
        let prefix = get_prefix(&s);
        let contents = s.clone();
        let contents = contents.strip_prefix(prefix.full().as_str()).unwrap();
        let contents = contents.strip_suffix(prefix.quotes.as_str()).unwrap();
        contents.to_owned()
    }

    #[test]
    fn test_get_string_contents() {
        let s = "rb'''hello'''";
        assert_eq!(get_string_contents(s.to_owned()), "hello");
        let s = "Br\"\"\"hello\"\"\"";
        assert_eq!(get_string_contents(s.to_owned()), "hello");
        let s = "FR\"hello\"";
        assert_eq!(get_string_contents(s.to_owned()), "hello");
        let s = "uR'hello'";
        assert_eq!(get_string_contents(s.to_owned()), "hello");
        let s = "''";
        assert_eq!(get_string_contents(s.to_owned()), "");
        let s = "\"\"";
        assert_eq!(get_string_contents(s.to_owned()), "");
        let s = "\"\"\"\"\"\"";
        assert_eq!(get_string_contents(s.to_owned()), "");
        let s = "''''''";
        assert_eq!(get_string_contents(s.to_owned()), "");
    }

    /// `(string-prefix node)`: the full prefix (flags + opening quotes) of a
    /// string literal node's source text.
    pub struct StringPrefix;

    impl Function for StringPrefix {
        fn call(
            &self,
            graph: &mut Graph,
            source: &str,
            parameters: &mut dyn Parameters,
        ) -> Result<Value, ExecutionError> {
            let node = graph[parameters.param()?.into_syntax_node_ref()?];
            parameters.finish()?;
            let prefix = get_prefix(&source[node.byte_range()]).full();
            Ok(Value::String(prefix))
        }
    }

    /// `(string-contents node)`: the contents of a string literal node,
    /// without prefix or quotes.
    pub struct StringContents;

    impl Function for StringContents {
        fn call(
            &self,
            graph: &mut Graph,
            source: &str,
            parameters: &mut dyn Parameters,
        ) -> Result<Value, ExecutionError> {
            let node = graph[parameters.param()?.into_syntax_node_ref()?];
            parameters.finish()?;
            let contents = get_string_contents(source[node.byte_range()].to_owned());
            Ok(Value::String(contents))
        }
    }

    /// `(string-quotes node)`: just the quote characters of a string literal.
    pub struct StringQuotes;

    impl Function for StringQuotes {
        fn call(
            &self,
            graph: &mut Graph,
            source: &str,
            parameters: &mut dyn Parameters,
        ) -> Result<Value, ExecutionError> {
            let node = graph[parameters.param()?.into_syntax_node_ref()?];
            parameters.finish()?;
            let prefix = get_prefix(&source[node.byte_range()]);
            Ok(Value::String(prefix.quotes))
        }
    }

    // Gets a version of the prefix that can be used in a call to `literal_eval`. To do so, we must remove
    // any `f` or `F` characters, if present.
    pub struct StringSafePrefix;

    impl Function for StringSafePrefix {
        fn call(
            &self,
            graph: &mut Graph,
            source: &str,
            parameters: &mut dyn Parameters,
        ) -> Result<Value, ExecutionError> {
            let node = graph[parameters.param()?.into_syntax_node_ref()?];
            parameters.finish()?;
            let prefix = get_prefix(&source[node.byte_range()]).full();
            let prefix = prefix.replace("f", "").replace("F", "");
            Ok(Value::String(prefix))
        }
    }

    // Gets a version of the string where `f` and `F` have been stripped from the prefix.
    pub struct SafeString;

    impl Function for SafeString {
        fn call(
            &self,
            graph: &mut Graph,
            source: &str,
            parameters: &mut dyn Parameters,
        ) -> Result<Value, ExecutionError> {
            let node = graph[parameters.param()?.into_syntax_node_ref()?];
            parameters.finish()?;
            let prefix = get_prefix(&source[node.byte_range()]);
            let contents = get_string_contents(source[node.byte_range()].to_owned());
            // Reassemble: de-f'd prefix, original contents, original quotes.
            let s = format!("{}{}{}", prefix.safe().full(), contents, prefix.quotes);
            Ok(Value::String(s))
        }
    }

    /// `(unnamed-child-index node)`: the index of `node` among *all* children
    /// (named and unnamed) of its tree-sitter parent.
    pub struct UnnamedChildIndex;

    impl Function for UnnamedChildIndex {
        fn call(
            &self,
            graph: &mut Graph,
            _source: &str,
            parameters: &mut dyn Parameters,
        ) -> Result<Value, ExecutionError> {
            let node = graph[parameters.param()?.into_syntax_node_ref()?];
            parameters.finish()?;
            let parent = match node.parent() {
                Some(parent) => parent,
                None => {
                    // NOTE(review): message says `child-index` but this
                    // function is registered as `unnamed-child-index`.
                    return Err(ExecutionError::FunctionFailed(
                        "unnamed-child-index".into(),
                        format!("Cannot call child-index on the root node"),
                    ))
                }
            };
            let mut tree_cursor = parent.walk();
            let index = parent
                .children(&mut tree_cursor)
                .position(|child| child == node)
                .ok_or_else(|| {
                    ExecutionError::FunctionFailed(
                        "unnamed-child-index".into(),
                        format!("Called child-index on a non-named child"),
                    )
                })?;
            Ok(Value::Integer(index as u32))
        }
    }

    /// `(concatenate-strings s1 s2 ...)`: concatenates any number of string
    /// arguments into one string.
    pub struct ConcatenateStrings;

    impl Function for ConcatenateStrings {
        fn call(
            &self,
            _graph: &mut Graph,
            _source: &str,
            parameters: &mut dyn Parameters,
        ) -> Result<Value, ExecutionError> {
            let mut result = String::new();
            // Consumes arguments until `param()` errors, which is taken to
            // mean the argument list is exhausted (so `finish` is not called).
            while let Ok(param) = parameters.param() {
                let string = param.into_string()?;
                result.push_str(string.as_str());
            }
            Ok(Value::String(result))
        }
    }

    /// `(instance-of node kind)`: true iff the syntax node's tree-sitter kind
    /// equals the given string.
    pub struct InstanceOf;

    impl Function for InstanceOf {
        fn call(
            &self,
            graph: &mut Graph,
            _source: &str,
            parameters: &mut dyn Parameters,
        ) -> Result<Value, ExecutionError> {
            let node = graph[parameters.param()?.into_syntax_node_ref()?];
            let class_name = parameters.param()?.into_string()?;
            parameters.finish()?;
            let node_type = node.kind();
            let class_name = class_name.as_str();
            let is_instance = node_type == class_name;
            Ok(Value::Boolean(is_instance))
        }
    }

    /// `(get-parent node)`: the tree-sitter parent of a syntax node; fails on
    /// the root node.
    pub struct GetParent;

    impl Function for GetParent {
        fn call(
            &self,
            graph: &mut Graph,
            _source: &str,
            parameters: &mut dyn Parameters,
        ) -> Result<Value, ExecutionError> {
            let node = graph[parameters.param()?.into_syntax_node_ref()?];
            parameters.finish()?;
            let parent = node.parent().ok_or_else(|| {
                ExecutionError::FunctionFailed(
                    "get-parent".into(),
                    format!("Cannot call get-parent on the root node"),
                )
            })?;
            Ok(Value::SyntaxNode(graph.add_syntax_node(parent)))
        }
    }

    /// `(has-named-child node field)`: true iff the node has a child in the
    /// given tree-sitter field.
    pub struct HasNamedChild;

    impl Function for HasNamedChild {
        fn call(
            &self,
            graph: &mut Graph,
            _source: &str,
            parameters: &mut dyn Parameters,
        ) -> Result<Value, ExecutionError> {
            let node = graph[parameters.param()?.into_syntax_node_ref()?];
            let field_name = parameters.param()?.into_string()?;
            parameters.finish()?;
            let field_name = field_name.as_str();
            let has_named_child = node.child_by_field_name(field_name).is_some();
            Ok(Value::Boolean(has_named_child))
        }
    }

    /// `(is-boolean-operator node op)`: true iff the node's `operator` field
    /// exists and its source text equals `op`; false if the field is absent.
    pub struct IsBooleanOperator;

    impl Function for IsBooleanOperator {
        fn call(
            &self,
            graph: &mut Graph,
            source: &str,
            parameters: &mut dyn Parameters,
        ) -> Result<Value, ExecutionError> {
            let node = graph[parameters.param()?.into_syntax_node_ref()?];
            let expected_op_type = parameters.param()?.into_string()?;
            parameters.finish()?;
            if let Some(op) = node.child_by_field_name("operator") {
                let op_type = source[op.byte_range()].to_string();
                let is_boolean_op = expected_op_type == op_type;
                Ok(Value::Boolean(is_boolean_op))
            } else {
                Ok(Value::Boolean(false))
            }
        }
    }

    /// `(mod a b)`: integer remainder of `a` divided by `b`.
    pub struct Modulo;

    impl Function for Modulo {
        fn call(
            &self,
            _graph: &mut Graph,
            _source: &str,
            parameters: &mut dyn Parameters,
        ) -> Result<Value, ExecutionError> {
            let left = parameters.param()?.into_integer()?;
            let right = parameters.param()?.into_integer()?;
            parameters.finish()?;
            // NOTE(review): panics if `right` is 0 — presumably python.tsg
            // never passes a zero divisor; confirm.
            Ok(Value::Integer(left % right))
        }
    }
}
/// Entry point: parses a Python source file with tree-sitter-python, runs the
/// TSG rules (either a file given via `--tsg` or the bundled `python.tsg`)
/// over the parse tree, and prints the resulting graph to stdout.
fn main() -> Result<()> {
    let matches = App::new("tsg-python")
        .version(BUILD_VERSION)
        .author("Taus Brock-Nannestad <tausbn@github.com>")
        .about("Extracts a Python AST from the parse tree given by tree-sitter-python")
        .arg(
            Arg::with_name("tsg")
                .short("t")
                .long("tsg")
                .takes_value(true)
                .required(false),
        )
        .arg(Arg::with_name("source").index(1).required(true))
        .get_matches();
    // `tsg_path` is only used for error messages when the bundled TSG is used.
    let tsg_path = if matches.is_present("tsg") {
        Path::new(matches.value_of("tsg").unwrap())
            .display()
            .to_string()
    } else {
        "bundled `python.tsg`".to_owned()
    };
    let source_path = Path::new(matches.value_of("source").unwrap());
    let language = tree_sitter_python::language();
    let mut parser = Parser::new();
    parser.set_language(language)?;
    // Statically include `python.tsg`:
    let tsg = if matches.is_present("tsg") {
        std::fs::read(&tsg_path).with_context(|| format!("Error reading TSG file {}", tsg_path))?
    } else {
        include_bytes!("../python.tsg").to_vec()
    };
    let tsg = String::from_utf8(tsg)?;
    let source = std::fs::read(source_path)
        .with_context(|| format!("Error reading source file {}", source_path.display()))?;
    let source = String::from_utf8(source)?;
    let tree = parser
        .parse(&source, None)
        .ok_or_else(|| anyhow!("Could not parse {}", source_path.display()))?;
    let file = File::from_str(language, &tsg)
        .with_context(|| anyhow!("Error parsing TSG file {}", tsg_path))?;
    // Register the extra functions (see `extra_functions`) on top of the
    // tree-sitter-graph standard library, under their TSG-visible names.
    let mut functions = Functions::stdlib();
    functions.add(Identifier::from("location"), extra_functions::Location);
    functions.add(
        Identifier::from("location-start"),
        extra_functions::LocationStart,
    );
    functions.add(
        Identifier::from("location-end"),
        extra_functions::LocationEnd,
    );
    functions.add(
        Identifier::from("string-prefix"),
        extra_functions::StringPrefix,
    );
    functions.add(
        Identifier::from("string-contents"),
        extra_functions::StringContents,
    );
    functions.add(
        Identifier::from("string-quotes"),
        extra_functions::StringQuotes,
    );
    functions.add(
        Identifier::from("string-safe-prefix"),
        extra_functions::StringSafePrefix,
    );
    functions.add(Identifier::from("safe-string"), extra_functions::SafeString);
    functions.add(
        Identifier::from("unnamed-child-index"),
        extra_functions::UnnamedChildIndex,
    );
    functions.add(Identifier::from("ast-node"), extra_functions::AstNode);
    functions.add(
        Identifier::from("concatenate-strings"),
        extra_functions::ConcatenateStrings,
    );
    functions.add(Identifier::from("instance-of"), extra_functions::InstanceOf);
    functions.add(Identifier::from("get-parent"), extra_functions::GetParent);
    functions.add(
        Identifier::from("has-named-child"),
        extra_functions::HasNamedChild,
    );
    functions.add(
        Identifier::from("is-boolean-operator"),
        extra_functions::IsBooleanOperator,
    );
    functions.add(Identifier::from("mod"), extra_functions::Modulo);
    // No global variables are injected; eager (non-lazy) evaluation is used.
    let globals = Variables::new();
    let mut config = ExecutionConfig::new(&mut functions, &globals).lazy(false);
    let graph = file
        .execute(&tree, &source, &mut config, &NoCancellation)
        .with_context(|| format!("Could not execute TSG file {}", tsg_path))?;
    print!("{}", graph.pretty_print());
    Ok(())
}

View File

@@ -0,0 +1,7 @@
Cargo.lock
package-lock.json
node_modules
build
*.log
/examples/*/
/target/

View File

@@ -0,0 +1,6 @@
corpus
examples
build
script
target
bindings/rust

View File

@@ -0,0 +1,38 @@
load("@rules_rust//cargo:defs.bzl", "cargo_build_script")
load("@rules_rust//rust:defs.bzl", "rust_library")
load("@tsg_python_crate_index//:defs.bzl", "aliases", "all_crate_deps")
package(default_visibility = ["//visibility:public"])

# This will run the build script from the root of the workspace, and
# collect the outputs.
cargo_build_script(
    name = "tsg-build-script",
    srcs = ["bindings/rust/build.rs"],
    # The build script compiles the generated parser/scanner, so the C/C++
    # sources must be available to it.
    data = glob([
        "src/**",
    ]),
    deps = all_crate_deps(
        build = True,
    ),
)

# Rust bindings for the vendored tree-sitter-python grammar.
rust_library(
    name = "tree-sitter-python",
    srcs = [
        "bindings/rust/lib.rs",
    ],
    aliases = aliases(),
    # `lib.rs` pulls these in via `include_str!`, so they are needed at
    # compile time (not just at build-script time).
    compile_data = glob([
        "src/**",
        "queries/**",
    ]) + [
        "grammar.js",
    ],
    proc_macro_deps = all_crate_deps(
        proc_macro = True,
    ),
    deps = [":tsg-build-script"] + all_crate_deps(
        normal = True,
    ),
)

View File

@@ -0,0 +1,31 @@
[package]
name = "tree-sitter-python"
description = "Python grammar for the tree-sitter parsing library"
version = "0.19.0"
authors = [
"Max Brunsfeld <maxbrunsfeld@gmail.com>",
"Douglas Creager <dcreager@dcreager.net>",
]
license = "MIT"
readme = "bindings/rust/README.md"
keywords = ["incremental", "parsing", "python"]
categories = ["parsing", "text-editors"]
repository = "https://github.com/tree-sitter/tree-sitter-python"
edition = "2018"
build = "bindings/rust/build.rs"
include = [
"bindings/rust/*",
"grammar.js",
"queries/*",
"src/*",
]
[lib]
path = "bindings/rust/lib.rs"
[dependencies]
tree-sitter = ">= 0.20, < 0.21"
[build-dependencies]
cc = "1.0"

View File

@@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2016 Max Brunsfeld
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,13 @@
tree-sitter-python
==================
[![build](https://github.com/tree-sitter/tree-sitter-python/actions/workflows/ci.yml/badge.svg)](https://github.com/tree-sitter/tree-sitter-python/actions/workflows/ci.yml)
Python grammar for [tree-sitter][].
[tree-sitter]: https://github.com/tree-sitter/tree-sitter
#### References
* [Python 2 Grammar](https://docs.python.org/2/reference/grammar.html)
* [Python 3 Grammar](https://docs.python.org/3/reference/grammar.html)

View File

@@ -0,0 +1,19 @@
{
"targets": [
{
"target_name": "tree_sitter_python_binding",
"include_dirs": [
"<!(node -e \"require('nan')\")",
"src"
],
"sources": [
"src/parser.c",
"bindings/node/binding.cc",
"src/scanner.cc"
],
"cflags_c": [
"-std=c99",
]
}
]
}

View File

@@ -0,0 +1,28 @@
#include "tree_sitter/parser.h"
#include <node.h>
#include "nan.h"
using namespace v8;
extern "C" TSLanguage * tree_sitter_python();
namespace {

// Constructor callback for the wrapper object template; intentionally empty —
// instances are only created internally in `Init` below.
NAN_METHOD(New) {}

// Node.js module initialisation: wraps the generated `tree_sitter_python`
// language pointer in a JS object with a `name` property, and makes that
// object the module's export.
void Init(Local<Object> exports, Local<Object> module) {
  Local<FunctionTemplate> tpl = Nan::New<FunctionTemplate>(New);
  tpl->SetClassName(Nan::New("Language").ToLocalChecked());
  tpl->InstanceTemplate()->SetInternalFieldCount(1);
  Local<Function> constructor = Nan::GetFunction(tpl).ToLocalChecked();
  Local<Object> instance = constructor->NewInstance(Nan::GetCurrentContext()).ToLocalChecked();
  // Internal field 0 holds the TSLanguage* (presumably read back by the
  // tree-sitter Node runtime — confirm against its binding code).
  Nan::SetInternalFieldPointer(instance, 0, tree_sitter_python());
  Nan::Set(instance, Nan::New("name").ToLocalChecked(), Nan::New("python").ToLocalChecked());
  Nan::Set(module, Nan::New("exports").ToLocalChecked(), instance);
}

NODE_MODULE(tree_sitter_python_binding, Init)

} // namespace

View File

@@ -0,0 +1,19 @@
// Load the compiled native binding, preferring the Release build and falling
// back to the Debug build. Any error other than "module not found" (e.g. an
// ABI mismatch) is re-raised immediately.
try {
  module.exports = require("../../build/Release/tree_sitter_python_binding");
} catch (error1) {
  if (error1.code !== 'MODULE_NOT_FOUND') {
    throw error1;
  }
  try {
    module.exports = require("../../build/Debug/tree_sitter_python_binding");
  } catch (error2) {
    if (error2.code !== 'MODULE_NOT_FOUND') {
      throw error2;
    }
    // Neither build exists: report the original (Release) failure.
    throw error1
  }
}

// Best-effort: attach the static node-type metadata when it is present.
try {
  module.exports.nodeTypeInfo = require("../../src/node-types.json");
} catch (_) {}

View File

@@ -0,0 +1,36 @@
# tree-sitter-python
This crate provides a Python grammar for the [tree-sitter][] parsing library.
To use this crate, add it to the `[dependencies]` section of your `Cargo.toml`
file. (Note that you will probably also need to depend on the
[`tree-sitter`][tree-sitter crate] crate to use the parsed result in any useful
way.)
``` toml
[dependencies]
tree-sitter = "0.17"
tree-sitter-python = "0.17"
```
Typically, you will use the [language][language func] function to add this
grammar to a tree-sitter [Parser][], and then use the parser to parse some code:
``` rust
let code = r#"
def double(x):
return x * 2
"#;
let mut parser = Parser::new();
parser.set_language(tree_sitter_python::language()).expect("Error loading Python grammar");
let parsed = parser.parse(code, None);
```
If you have any questions, please reach out to us in the [tree-sitter
discussions] page.
[Language]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Language.html
[language func]: https://docs.rs/tree-sitter-python/*/tree_sitter_python/fn.language.html
[Parser]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Parser.html
[tree-sitter]: https://tree-sitter.github.io/
[tree-sitter crate]: https://crates.io/crates/tree-sitter
[tree-sitter discussions]: https://github.com/tree-sitter/tree-sitter/discussions

View File

@@ -0,0 +1,28 @@
use std::path::Path;
extern crate cc;
/// Cargo build script: compiles the generated C parser (`src/parser.c`) and
/// the hand-written C++ scanner (`src/scanner.cc`) into static libraries
/// (`parser` and `scanner`) that the Rust bindings link against.
fn main() {
    let source_dir = Path::new("src");

    // --- C parser (generated by tree-sitter) ---
    let parser_source = source_dir.join("parser.c");
    let mut parser_build = cc::Build::new();
    parser_build.include(&source_dir);
    // Silence warnings we cannot fix in generated code; each flag is applied
    // only if the active compiler understands it.
    parser_build.flag_if_supported("-Wno-unused-parameter");
    parser_build.flag_if_supported("-Wno-unused-but-set-variable");
    parser_build.flag_if_supported("-Wno-trigraphs");
    parser_build.file(&parser_source);
    println!("cargo:rerun-if-changed={}", parser_source.to_str().unwrap());
    parser_build.compile("parser");

    // --- C++ scanner ---
    let scanner_source = source_dir.join("scanner.cc");
    let mut scanner_build = cc::Build::new();
    scanner_build.cpp(true);
    scanner_build.include(&source_dir);
    scanner_build.flag_if_supported("-Wno-unused-parameter");
    scanner_build.flag_if_supported("-Wno-unused-but-set-variable");
    scanner_build.file(&scanner_source);
    println!("cargo:rerun-if-changed={}", scanner_source.to_str().unwrap());
    scanner_build.compile("scanner");
}

View File

@@ -0,0 +1,68 @@
// -*- coding: utf-8 -*-
// ------------------------------------------------------------------------------------------------
// Copyright © 2020, tree-sitter-python authors.
// See the LICENSE file in this repo for license details.
// ------------------------------------------------------------------------------------------------
//! This crate provides a Python grammar for the [tree-sitter][] parsing library.
//!
//! Typically, you will use the [language][language func] function to add this grammar to a
//! tree-sitter [Parser][], and then use the parser to parse some code:
//!
//! ```
//! use tree_sitter::Parser;
//!
//! let code = r#"
//! def double(x):
//! return x * 2
//! "#;
//! let mut parser = Parser::new();
//! parser.set_language(tree_sitter_python::language()).expect("Error loading Python grammar");
//! let parsed = parser.parse(code, None);
//! # let parsed = parsed.unwrap();
//! # let root = parsed.root_node();
//! # assert!(!root.has_error());
//! ```
//!
//! [Language]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Language.html
//! [language func]: fn.language.html
//! [Parser]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Parser.html
//! [tree-sitter]: https://tree-sitter.github.io/
use tree_sitter::Language;
// The generated C parser exposes its `TSLanguage` value through this
// symbol; the crate's build script compiles and links it in.
extern "C" {
    fn tree_sitter_python() -> Language;
}
/// Returns the tree-sitter [Language][] for this grammar.
///
/// [Language]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Language.html
pub fn language() -> Language {
    // Safety: the symbol is provided by the statically linked generated
    // parser and returns a valid language description.
    unsafe { tree_sitter_python() }
}
/// The source of the Python tree-sitter grammar description.
// `const` string references are implicitly `'static`; spelling the
// lifetime out is redundant (clippy::redundant_static_lifetimes).
pub const GRAMMAR: &str = include_str!("../../grammar.js");
/// The syntax highlighting query for this language.
pub const HIGHLIGHT_QUERY: &str = include_str!("../../queries/highlights.scm");
/// The content of the [`node-types.json`][] file for this grammar.
///
/// [`node-types.json`]: https://tree-sitter.github.io/tree-sitter/using-parsers#static-node-types
pub const NODE_TYPES: &str = include_str!("../../src/node-types.json");
/// The symbol tagging query for this language.
pub const TAGGING_QUERY: &str = include_str!("../../queries/tags.scm");
#[cfg(test)]
mod tests {
    /// Smoke test: the compiled grammar must load into a fresh parser.
    #[test]
    fn can_load_grammar() {
        let result = tree_sitter::Parser::new().set_language(super::language());
        result.expect("Error loading Python grammar");
    }
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,33 @@
{
"name": "tree-sitter-python",
"version": "0.19.0",
"description": "Python grammar for tree-sitter",
"main": "bindings/node",
"keywords": [
"parser",
"lexer"
],
"author": "Max Brunsfeld",
"license": "MIT",
"dependencies": {
"nan": "^2.14.0"
},
"devDependencies": {
"tree-sitter-cli": "^0.19.3"
},
"scripts": {
"build": "tree-sitter generate && node-gyp build",
"test": "tree-sitter test && script/parse-examples",
"parse": "tree-sitter parse",
"test-windows": "tree-sitter test"
},
"repository": "https://github.com/tree-sitter/tree-sitter-python",
"tree-sitter": [
{
"scope": "source.python",
"file-types": [
"py"
]
}
]
}

View File

@@ -0,0 +1,124 @@
; Syntax highlighting queries for tree-sitter-python: each pattern maps
; a syntax node (or anonymous token) to a highlight capture name.
; Patterns are tried in order; earlier captures win for a given node.

; Identifier naming conventions
((identifier) @constructor
 (#match? @constructor "^[A-Z]"))
((identifier) @constant
 (#match? @constant "^[A-Z][A-Z_]*$"))

; Builtin functions
((call
  function: (identifier) @function.builtin)
 (#match?
  @function.builtin
  "^(abs|all|any|ascii|bin|bool|breakpoint|bytearray|bytes|callable|chr|classmethod|compile|complex|delattr|dict|dir|divmod|enumerate|eval|exec|filter|float|format|frozenset|getattr|globals|hasattr|hash|help|hex|id|input|int|isinstance|issubclass|iter|len|list|locals|map|max|memoryview|min|next|object|oct|open|ord|pow|print|property|range|repr|reversed|round|set|setattr|slice|sorted|staticmethod|str|sum|super|tuple|type|vars|zip|__import__)$"))

; Function calls
(decorator) @function
(call
  function: (attribute attribute: (identifier) @function.method))
(call
  function: (identifier) @function)

; Function definitions
(function_definition
  name: (identifier) @function)

; Fallbacks for plain identifiers, attribute accesses and type positions
(identifier) @variable
(attribute attribute: (identifier) @property)
(type (identifier) @type)

; Literals
[
  (none)
  (true)
  (false)
] @constant.builtin
[
  (integer)
  (float)
] @number
(comment) @comment
(string) @string
(escape_sequence) @escape

; f-string interpolation holes
(interpolation
  "{" @punctuation.special
  "}" @punctuation.special) @embedded

; Operator tokens
[
  "-"
  "-="
  "!="
  "*"
  "**"
  "**="
  "*="
  "/"
  "//"
  "//="
  "/="
  "&"
  "%"
  "%="
  "^"
  "+"
  "->"
  "+="
  "<"
  "<<"
  "<="
  "<>"
  "="
  ":="
  "=="
  ">"
  ">="
  ">>"
  "|"
  "~"
  "and"
  "in"
  "is"
  "not"
  "or"
] @operator

; Keyword tokens (includes Python 2 keywords such as "print" and "exec")
[
  "as"
  "assert"
  "async"
  "await"
  "break"
  "class"
  "continue"
  "def"
  "del"
  "elif"
  "else"
  "except"
  "exec"
  "finally"
  "for"
  "from"
  "global"
  "if"
  "import"
  "lambda"
  "nonlocal"
  "pass"
  "print"
  "raise"
  "return"
  "try"
  "while"
  "with"
  "yield"
] @keyword

View File

@@ -0,0 +1,12 @@
; Symbol tagging queries: map Python definitions and call sites to
; tree-sitter tag captures (used for code navigation / symbol search).

; Class definitions
(class_definition
  name: (identifier) @name) @definition.class

; Function definitions
(function_definition
  name: (identifier) @name) @definition.function

; Call sites: either a plain name or an attribute access (method call)
(call
  function: [
    (identifier) @name
    (attribute
      attribute: (identifier) @name)
  ]) @reference.call

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,402 @@
#include <cassert>
#include <cstring>
#include <cwctype>
#include <stdio.h>
#include <tree_sitter/parser.h>
#include <vector>
namespace {
using std::vector;
using std::iswspace;
using std::memcpy;
// External token types produced by this scanner. The order must match the
// `externals` list in the grammar, since tree-sitter indexes
// `valid_symbols` by this enumeration.
enum TokenType {
  NEWLINE,
  INDENT,
  DEDENT,
  STRING_START,
  STRING_CONTENT,
  STRING_END,
};
// Packed description of the string literal currently being scanned.
// All state is a bit set in a single byte so the whole delimiter stack
// can be memcpy'd in and out of tree-sitter's serialization buffer.
struct Delimiter {
  enum {
    SingleQuote = 1 << 0,
    DoubleQuote = 1 << 1,
    BackQuote = 1 << 2,
    Raw = 1 << 3,
    Format = 1 << 4,
    Triple = 1 << 5,
    Bytes = 1 << 6,
  };

  Delimiter() : flags(0) {}

  // String-prefix / shape queries.
  bool is_format() const { return (flags & Format) != 0; }
  bool is_raw() const { return (flags & Raw) != 0; }
  bool is_triple() const { return (flags & Triple) != 0; }
  bool is_bytes() const { return (flags & Bytes) != 0; }

  // Quote character that terminates this string, or 0 if no quote flag
  // has been recorded yet.
  int32_t end_character() const {
    if (flags & SingleQuote) return '\'';
    if (flags & DoubleQuote) return '"';
    if (flags & BackQuote) return '`';
    return 0;
  }

  // String-prefix / shape setters.
  void set_format() { flags |= Format; }
  void set_raw() { flags |= Raw; }
  void set_triple() { flags |= Triple; }
  void set_bytes() { flags |= Bytes; }

  // Record which quote character opened the string. Only the three
  // quote characters are legal here.
  void set_end_character(int32_t character) {
    if (character == '\'') {
      flags |= SingleQuote;
    } else if (character == '"') {
      flags |= DoubleQuote;
    } else if (character == '`') {
      flags |= BackQuote;
    } else {
      assert(false);
    }
  }

  char flags;
};
// External scanner state for Python. It tracks the two pieces of context a
// context-free grammar cannot express:
//   * the stack of indentation levels (for NEWLINE / INDENT / DEDENT), and
//   * the stack of open string delimiters (for STRING_START /
//     STRING_CONTENT / STRING_END, including f-string holes).
struct Scanner {
  Scanner() {
    // Delimiters are (de)serialized with a raw memcpy, so each must fit in
    // exactly one byte.
    assert(sizeof(Delimiter) == sizeof(char));
    deserialize(NULL, 0);
  }
  // Pack the scanner state into tree-sitter's buffer as:
  //   [delimiter count (1 byte)] [delimiters...] [indent lengths...]
  // The bottom-of-stack indent sentinel (0) is not stored.
  unsigned serialize(char *buffer) {
    size_t i = 0;
    size_t delimiter_count = delimiter_stack.size();
    // The count is stored in a single byte; deeper nesting is truncated.
    if (delimiter_count > UINT8_MAX) delimiter_count = UINT8_MAX;
    buffer[i++] = delimiter_count;
    if (delimiter_count > 0) {
      memcpy(&buffer[i], delimiter_stack.data(), delimiter_count);
    }
    i += delimiter_count;
    // Skip the sentinel 0 at index 0; stop if the buffer would overflow.
    // NOTE(review): indent lengths are narrowed to one byte here, so
    // levels above 255 would not round-trip — presumably unreachable.
    vector<uint16_t>::iterator
      iter = indent_length_stack.begin() + 1,
      end = indent_length_stack.end();
    for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {
      buffer[i++] = *iter;
    }
    return i;
  }
  // Inverse of serialize(). Called with (NULL, 0) to reset to the initial
  // state: one indent level of 0 and no open strings.
  void deserialize(const char *buffer, unsigned length) {
    delimiter_stack.clear();
    indent_length_stack.clear();
    indent_length_stack.push_back(0);
    if (length > 0) {
      size_t i = 0;
      size_t delimiter_count = (uint8_t)buffer[i++];
      delimiter_stack.resize(delimiter_count);
      if (delimiter_count > 0) {
        memcpy(delimiter_stack.data(), &buffer[i], delimiter_count);
      }
      i += delimiter_count;
      // Remaining bytes are the stored indent lengths.
      for (; i < length; i++) {
        indent_length_stack.push_back(buffer[i]);
      }
    }
  }
  // Consume one character, keeping it in the current token.
  void advance(TSLexer *lexer) {
    lexer->advance(lexer, false);
  }
  // Consume one character without adding it to the current token.
  void skip(TSLexer *lexer) {
    lexer->advance(lexer, true);
  }
  // Main entry point, called by tree-sitter whenever an external token may
  // appear. Returns true and sets lexer->result_symbol on success.
  bool scan(TSLexer *lexer, const bool *valid_symbols) {
    // ---- String interior -------------------------------------------------
    // If we are inside a string (delimiter stack non-empty) and the parser
    // expects string content, scan up to the next escape, interpolation
    // hole, or closing quote. (!valid_symbols[INDENT] excludes error
    // recovery, where every symbol is marked valid.)
    if (valid_symbols[STRING_CONTENT] && !valid_symbols[INDENT] && !delimiter_stack.empty()) {
      Delimiter delimiter = delimiter_stack.back();
      int32_t end_character = delimiter.end_character();
      bool has_content = false;
      while (lexer->lookahead) {
        // In f-strings, '{' / '}' terminate the content token so the
        // grammar can parse the interpolation hole.
        if ((lexer->lookahead == '{' || lexer->lookahead == '}') && delimiter.is_format()) {
          lexer->mark_end(lexer);
          lexer->result_symbol = STRING_CONTENT;
          return has_content;
        } else if (lexer->lookahead == '\\') {
          if (delimiter.is_raw()) {
            // Raw strings: a backslash is ordinary content.
            lexer->advance(lexer, false);
            continue;
          } else if (delimiter.is_bytes()) {
              lexer->mark_end(lexer);
              lexer->advance(lexer, false);
              if (lexer->lookahead == 'N' || lexer->lookahead == 'u' || lexer->lookahead == 'U') {
                // In bytes string, \N{...}, \uXXXX and \UXXXXXXXX are not escape sequences
                // https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
                lexer->advance(lexer, false);
              } else {
                lexer->result_symbol = STRING_CONTENT;
                return has_content;
              }
          } else {
            // Ordinary string: stop before the escape sequence token.
            lexer->mark_end(lexer);
            lexer->result_symbol = STRING_CONTENT;
            return has_content;
          }
        } else if (lexer->lookahead == end_character) {
          if (delimiter.is_triple()) {
            // Need three consecutive quotes to actually close; fewer are
            // just content.
            lexer->mark_end(lexer);
            lexer->advance(lexer, false);
            if (lexer->lookahead == end_character) {
              lexer->advance(lexer, false);
              if (lexer->lookahead == end_character) {
                if (has_content) {
                  // Emit pending content first; the closer is produced on
                  // the next call.
                  lexer->result_symbol = STRING_CONTENT;
                } else {
                  lexer->advance(lexer, false);
                  lexer->mark_end(lexer);
                  delimiter_stack.pop_back();
                  lexer->result_symbol = STRING_END;
                }
                return true;
              } else {
                lexer->mark_end(lexer);
                lexer->result_symbol = STRING_CONTENT;
                return true;
              }
            } else {
              lexer->mark_end(lexer);
              lexer->result_symbol = STRING_CONTENT;
              return true;
            }
          } else {
            if (has_content) {
              lexer->result_symbol = STRING_CONTENT;
            } else {
              lexer->advance(lexer, false);
              delimiter_stack.pop_back();
              lexer->result_symbol = STRING_END;
            }
            lexer->mark_end(lexer);
            return true;
          }
        } else if (lexer->lookahead == '\n' && has_content && !delimiter.is_triple()) {
          // Unterminated single-line string: let normal error recovery run.
          return false;
        }
        advance(lexer);
        has_content = true;
      }
    }
    lexer->mark_end(lexer);
    // ---- Indentation -----------------------------------------------------
    // Skip whitespace, comments and line continuations while measuring the
    // indentation of the next logical line.
    bool found_end_of_line = false;
    uint32_t indent_length = 0;
    int32_t first_comment_indent_length = -1;
    for (;;) {
      if (lexer->lookahead == '\n') {
        found_end_of_line = true;
        indent_length = 0;
        skip(lexer);
      } else if (lexer->lookahead == ' ') {
        indent_length++;
        skip(lexer);
      } else if (lexer->lookahead == '\r') {
        indent_length = 0;
        skip(lexer);
      } else if (lexer->lookahead == '\t') {
        // A tab counts as 8 columns of indentation.
        indent_length += 8;
        skip(lexer);
      } else if (lexer->lookahead == '#') {
        // Remember where the first comment started (see the DEDENT check
        // below), then skip the comment through end of line.
        if (first_comment_indent_length == -1) {
          first_comment_indent_length = (int32_t)indent_length;
        }
        while (lexer->lookahead && lexer->lookahead != '\n') {
          skip(lexer);
        }
        skip(lexer);
        indent_length = 0;
      } else if (lexer->lookahead == '\\') {
        // Backslash line continuation: the next line is part of the same
        // logical line, so no NEWLINE/INDENT/DEDENT is produced.
        skip(lexer);
        if (lexer->lookahead == '\r') {
          skip(lexer);
        }
        if (lexer->lookahead == '\n') {
          skip(lexer);
        } else {
          return false;
        }
      } else if (lexer->lookahead == '\f') {
        indent_length = 0;
        skip(lexer);
      } else if (lexer->lookahead == 0) {
        // End of file behaves like an end of line with zero indentation so
        // all open blocks can be dedented.
        indent_length = 0;
        found_end_of_line = true;
        break;
      } else {
        break;
      }
    }
    if (found_end_of_line) {
      if (!indent_length_stack.empty()) {
        uint16_t current_indent_length = indent_length_stack.back();
        if (
          valid_symbols[INDENT] &&
          indent_length > current_indent_length
        ) {
          indent_length_stack.push_back(indent_length);
          lexer->result_symbol = INDENT;
          return true;
        }
        if (
          valid_symbols[DEDENT] &&
          indent_length < current_indent_length &&
          // Wait to create a dedent token until we've consumed any comments
          // whose indentation matches the current block.
          first_comment_indent_length < (int32_t)current_indent_length
        ) {
          indent_length_stack.pop_back();
          lexer->result_symbol = DEDENT;
          return true;
        }
      }
      if (valid_symbols[NEWLINE]) {
        lexer->result_symbol = NEWLINE;
        return true;
      }
    }
    // ---- String start ----------------------------------------------------
    // Try to recognize an opening quote, optionally preceded by prefix
    // letters (r, b, f, u in any case/combination). Not attempted after a
    // comment, and only when the parser can accept STRING_START.
    if (first_comment_indent_length == -1 && valid_symbols[STRING_START]) {
      Delimiter delimiter;
      bool has_flags = false;
      while (lexer->lookahead) {
        if (lexer->lookahead == 'f' || lexer->lookahead == 'F') {
          delimiter.set_format();
        } else if (lexer->lookahead == 'r' || lexer->lookahead == 'R') {
          delimiter.set_raw();
        } else if (lexer->lookahead == 'b' || lexer->lookahead == 'B') {
          delimiter.set_bytes();
        } else if (lexer->lookahead != 'u' && lexer->lookahead != 'U') {
          break;
        }
        has_flags = true;
        advance(lexer);
      }
      if (lexer->lookahead == '`') {
        delimiter.set_end_character('`');
        advance(lexer);
        lexer->mark_end(lexer);
      } else if (lexer->lookahead == '\'') {
        delimiter.set_end_character('\'');
        advance(lexer);
        lexer->mark_end(lexer);
        // Two more quotes upgrade this to a triple-quoted string.
        if (lexer->lookahead == '\'') {
          advance(lexer);
          if (lexer->lookahead == '\'') {
            advance(lexer);
            lexer->mark_end(lexer);
            delimiter.set_triple();
          }
        }
      } else if (lexer->lookahead == '"') {
        delimiter.set_end_character('"');
        advance(lexer);
        lexer->mark_end(lexer);
        if (lexer->lookahead == '"') {
          advance(lexer);
          if (lexer->lookahead == '"') {
            advance(lexer);
            lexer->mark_end(lexer);
            delimiter.set_triple();
          }
        }
      }
      if (delimiter.end_character()) {
        delimiter_stack.push_back(delimiter);
        lexer->result_symbol = STRING_START;
        return true;
      } else if (has_flags) {
        // Prefix letters not followed by a quote: not a string start.
        return false;
      }
    }
    return false;
  }
  // Stack of indentation levels; index 0 is always the sentinel 0.
  vector<uint16_t> indent_length_stack;
  // Stack of currently open string delimiters (innermost last).
  vector<Delimiter> delimiter_stack;
};
}
extern "C" {
void *tree_sitter_python_external_scanner_create() {
return new Scanner();
}
bool tree_sitter_python_external_scanner_scan(void *payload, TSLexer *lexer,
const bool *valid_symbols) {
Scanner *scanner = static_cast<Scanner *>(payload);
return scanner->scan(lexer, valid_symbols);
}
unsigned tree_sitter_python_external_scanner_serialize(void *payload, char *buffer) {
Scanner *scanner = static_cast<Scanner *>(payload);
return scanner->serialize(buffer);
}
void tree_sitter_python_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
Scanner *scanner = static_cast<Scanner *>(payload);
scanner->deserialize(buffer, length);
}
void tree_sitter_python_external_scanner_destroy(void *payload) {
Scanner *scanner = static_cast<Scanner *>(payload);
delete scanner;
}
}

View File

@@ -0,0 +1,224 @@
#ifndef TREE_SITTER_PARSER_H_
#define TREE_SITTER_PARSER_H_

#ifdef __cplusplus
extern "C" {
#endif

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* Sentinel symbol values used by generated parsers. */
#define ts_builtin_sym_error ((TSSymbol)-1)
#define ts_builtin_sym_end 0
/* Fixed size of the buffer passed to external-scanner (de)serialization. */
#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024

typedef uint16_t TSStateId;

/* These are also declared by the public API header; avoid redefinition. */
#ifndef TREE_SITTER_API_H_
typedef uint16_t TSSymbol;
typedef uint16_t TSFieldId;
typedef struct TSLanguage TSLanguage;
#endif

/* Associates one child of a production with a named field. */
typedef struct {
  TSFieldId field_id;
  uint8_t child_index;
  bool inherited;
} TSFieldMapEntry;

/* A slice into the field map entry array for one production. */
typedef struct {
  uint16_t index;
  uint16_t length;
} TSFieldMapSlice;

/* Per-symbol flags describing how a symbol appears in the tree. */
typedef struct {
  bool visible;
  bool named;
  bool supertype;
} TSSymbolMetadata;

typedef struct TSLexer TSLexer;

/* Interface through which the generated lexer and external scanners read
 * input and report the token they matched. */
struct TSLexer {
  int32_t lookahead;
  TSSymbol result_symbol;
  void (*advance)(TSLexer *, bool);
  void (*mark_end)(TSLexer *);
  uint32_t (*get_column)(TSLexer *);
  bool (*is_at_included_range_start)(const TSLexer *);
  bool (*eof)(const TSLexer *);
};

typedef enum {
  TSParseActionTypeShift,
  TSParseActionTypeReduce,
  TSParseActionTypeAccept,
  TSParseActionTypeRecover,
} TSParseActionType;

/* One parse-table action; `type` discriminates which variant is active. */
typedef union {
  struct {
    uint8_t type;
    TSStateId state;
    bool extra;
    bool repetition;
  } shift;
  struct {
    uint8_t type;
    uint8_t child_count;
    TSSymbol symbol;
    int16_t dynamic_precedence;
    uint16_t production_id;
  } reduce;
  uint8_t type;
} TSParseAction;

/* Lexer states to use (main and external) for a given parse state. */
typedef struct {
  uint16_t lex_state;
  uint16_t external_lex_state;
} TSLexMode;

/* Entry in the parse-action table: either a header giving the number of
 * actions that follow, or one of those actions. */
typedef union {
  TSParseAction action;
  struct {
    uint8_t count;
    bool reusable;
  } entry;
} TSParseActionEntry;

/* Static description of a generated grammar: all tables, metadata, and
 * external-scanner callbacks. Instantiated once per language. */
struct TSLanguage {
  uint32_t version;
  uint32_t symbol_count;
  uint32_t alias_count;
  uint32_t token_count;
  uint32_t external_token_count;
  uint32_t state_count;
  uint32_t large_state_count;
  uint32_t production_id_count;
  uint32_t field_count;
  uint16_t max_alias_sequence_length;
  const uint16_t *parse_table;
  const uint16_t *small_parse_table;
  const uint32_t *small_parse_table_map;
  const TSParseActionEntry *parse_actions;
  const char * const *symbol_names;
  const char * const *field_names;
  const TSFieldMapSlice *field_map_slices;
  const TSFieldMapEntry *field_map_entries;
  const TSSymbolMetadata *symbol_metadata;
  const TSSymbol *public_symbol_map;
  const uint16_t *alias_map;
  const TSSymbol *alias_sequences;
  const TSLexMode *lex_modes;
  bool (*lex_fn)(TSLexer *, TSStateId);
  bool (*keyword_lex_fn)(TSLexer *, TSStateId);
  TSSymbol keyword_capture_token;
  struct {
    const bool *states;
    const TSSymbol *symbol_map;
    void *(*create)(void);
    void (*destroy)(void *);
    bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist);
    unsigned (*serialize)(void *, char *);
    void (*deserialize)(void *, const char *, unsigned);
  } external_scanner;
  const TSStateId *primary_state_ids;
};

/*
 * Lexer Macros
 *
 * Used by the generated lex functions; they rely on local variables
 * `result`, `skip`, `eof`, `lookahead` and `state` plus the `lexer`
 * parameter, all established by START_LEXER().
 */

#define START_LEXER()           \
  bool result = false;          \
  bool skip = false;            \
  bool eof = false;             \
  int32_t lookahead;            \
  goto start;                   \
  next_state:                   \
  lexer->advance(lexer, skip);  \
  start:                        \
  skip = false;                 \
  lookahead = lexer->lookahead;

/* Move to `state_value`, consuming the lookahead into the token. */
#define ADVANCE(state_value) \
  {                          \
    state = state_value;     \
    goto next_state;         \
  }

/* Move to `state_value`, consuming the lookahead as whitespace. */
#define SKIP(state_value) \
  {                       \
    skip = true;          \
    state = state_value;  \
    goto next_state;      \
  }

/* Record that a complete token with the given symbol was matched. */
#define ACCEPT_TOKEN(symbol_value)     \
  result = true;                       \
  lexer->result_symbol = symbol_value; \
  lexer->mark_end(lexer);

#define END_STATE() return result;

/*
 * Parse Table Macros
 */

#define SMALL_STATE(id) id - LARGE_STATE_COUNT

#define STATE(id) id

#define ACTIONS(id) id

#define SHIFT(state_value)            \
  {{                                  \
    .shift = {                        \
      .type = TSParseActionTypeShift, \
      .state = state_value            \
    }                                 \
  }}

#define SHIFT_REPEAT(state_value)     \
  {{                                  \
    .shift = {                        \
      .type = TSParseActionTypeShift, \
      .state = state_value,           \
      .repetition = true              \
    }                                 \
  }}

#define SHIFT_EXTRA()                 \
  {{                                  \
    .shift = {                        \
      .type = TSParseActionTypeShift, \
      .extra = true                   \
    }                                 \
  }}

#define REDUCE(symbol_val, child_count_val, ...) \
  {{                                             \
    .reduce = {                                  \
      .type = TSParseActionTypeReduce,           \
      .symbol = symbol_val,                      \
      .child_count = child_count_val,            \
      __VA_ARGS__                                \
    },                                           \
  }}

#define RECOVER()                    \
  {{                                 \
    .type = TSParseActionTypeRecover \
  }}

#define ACCEPT_INPUT()              \
  {{                                \
    .type = TSParseActionTypeAccept \
  }}

#ifdef __cplusplus
}
#endif

#endif  // TREE_SITTER_PARSER_H_

View File

@@ -0,0 +1,60 @@
# Convert output of tree-sitter-graph to dot format.
#
# Usage: python tsg2dot.py <tsg-output-file> <dot-output-file>
import sys
import re

# regular expression to match a node
node_re = re.compile(r"node (?P<id>\d+)")
# regular expression to match an edge
edge_re = re.compile(r"edge (?P<from>\d+) -> (?P<to>\d+)")
# regular expression to match a property
prop_re = re.compile(r"\s+(?P<key>\w+): (?P<value>.*)")
# regular expression to match a link: "[graph node n]"
link_re = re.compile(r"\[graph node (?P<id>\d+)\]")


def _flush(out, label, node_id, links):
    """Terminate the pending node/edge statement and emit its link edges.

    Writes the accumulated property lines as the statement's label, then
    one extra edge per property whose value referenced another graph node.
    """
    out.write('\\n'.join(label) + "\"];\n")
    for key, target in links.items():
        out.write("{} -> {} [label=\"{}\"];\n".format(node_id, target, key))


def convert(src_path, dst_path):
    """Translate a tree-sitter-graph dump at *src_path* into Graphviz dot.

    Nodes become dot nodes labelled with their properties; edges become dot
    edges; properties whose value is "[graph node n]" additionally become
    edges to node n, labelled with the property key.
    """
    with open(src_path, 'r') as f, open(dst_path, 'w') as out:
        out.write("digraph G {\n")
        label = []        # property lines of the statement being built
        inside = False    # is a node/edge statement currently open?
        node_id = 0       # id of the current node (0 while inside an edge)
        links = {}        # property key -> referenced node id
        for line in f:
            m = node_re.match(line)
            if m:
                # A new node closes whatever statement was open.
                if inside:
                    _flush(out, label, node_id, links)
                out.write("{id} [label=\"".format(**m.groupdict()))
                label = ["id={id}".format(**m.groupdict())]
                inside = True
                node_id = m.group('id')
                links = {}
            m = edge_re.match(line)
            if m:
                if inside:
                    _flush(out, label, node_id, links)
                out.write("{from} -> {to} [label=\"".format(**m.groupdict()))
                label = []
                inside = True
                node_id = 0
                links = {}
            m = prop_re.match(line)
            if m:
                # escape quotes in value
                label.append("{key}={value}".format(**m.groupdict()).replace('"', '\\"').replace('\\\\"', ''))
                l = link_re.match(m.group('value'))
                if l:
                    links[m.group('key')] = l.group('id')
        # Bug fix: only flush if a statement is actually open; the original
        # emitted a stray `"];` line for input containing no nodes or edges.
        if inside:
            _flush(out, label, node_id, links)
        out.write("}\n")


if __name__ == "__main__":
    convert(sys.argv[1], sys.argv[2])