Files
codeql/python/extractor/tsg-python/src/main.rs
2025-09-03 11:55:49 +00:00

695 lines
22 KiB
Rust

// -*- coding: utf-8 -*-
// ------------------------------------------------------------------------------------------------
// Copyright © 2021, GitHub.
// Licensed under either of Apache License, Version 2.0, or MIT license, at your option.
// Please see the LICENSE-APACHE or LICENSE-MIT files in this distribution for license details.
// ------------------------------------------------------------------------------------------------
use std::path::Path;
use anyhow::Context as _;
use anyhow::Result;
use anyhow::anyhow;
use clap::{Arg, ArgAction, Command};
use tree_sitter::Parser;
use tree_sitter_graph::ExecutionConfig;
use tree_sitter_graph::Identifier;
use tree_sitter_graph::NoCancellation;
use tree_sitter_graph::Variables;
use tree_sitter_graph::ast::File;
use tree_sitter_graph::functions::Functions;
const BUILD_VERSION: &str = env!("CARGO_PKG_VERSION");
pub mod extra_functions {
use tree_sitter_graph::functions::{Function, Parameters};
use tree_sitter_graph::graph::{Graph, Value};
use tree_sitter_graph::{ExecutionError, Identifier};
/// Graph function registered as `location`: yields the source span of a syntax node.
pub struct Location;

impl Function for Location {
    /// Expects exactly one parameter (a syntax node reference) and returns the list produced by
    /// `get_location` for it.
    fn call(
        &self,
        graph: &mut Graph,
        _source: &str,
        parameters: &mut dyn Parameters,
    ) -> Result<Value, ExecutionError> {
        let target = parameters.param()?;
        parameters.finish()?;
        get_location(target, graph)
    }
}

/// Builds the list `[start row, start column, end row, end column]` for the syntax node that
/// `node` refers to.
///
/// Returns an error when `node` is not a syntax node reference. Every coordinate is narrowed
/// with an `as u32` cast.
fn get_location(node: Value, graph: &Graph) -> Result<Value, ExecutionError> {
    let syntax_node = graph[node.into_syntax_node_ref()?];
    let (from, to) = (syntax_node.start_position(), syntax_node.end_position());
    let coordinates = [from.row, from.column, to.row, to.column];
    Ok(Value::List(
        coordinates
            .iter()
            .map(|&coordinate| Value::from(coordinate as u32))
            .collect(),
    ))
}
/// Graph function registered as `location-start`: yields `[row, column]` of the position where
/// a syntax node starts.
pub struct LocationStart;

impl Function for LocationStart {
    /// Expects exactly one parameter, a syntax node reference.
    fn call(
        &self,
        graph: &mut Graph,
        _source: &str,
        parameters: &mut dyn Parameters,
    ) -> Result<Value, ExecutionError> {
        let syntax_ref = parameters.param()?.into_syntax_node_ref()?;
        let syntax_node = graph[syntax_ref];
        parameters.finish()?;
        let point = syntax_node.start_position();
        Ok(Value::List(vec![
            Value::from(point.row as u32),
            Value::from(point.column as u32),
        ]))
    }
}
/// Graph function registered as `location-end`: yields `[row, column]` of the position where a
/// syntax node ends.
pub struct LocationEnd;

impl Function for LocationEnd {
    /// Expects exactly one parameter, a syntax node reference.
    fn call(
        &self,
        graph: &mut Graph,
        _source: &str,
        parameters: &mut dyn Parameters,
    ) -> Result<Value, ExecutionError> {
        let syntax_ref = parameters.param()?.into_syntax_node_ref()?;
        let syntax_node = graph[syntax_ref];
        parameters.finish()?;
        let point = syntax_node.end_position();
        Ok(Value::List(vec![
            Value::from(point.row as u32),
            Value::from(point.column as u32),
        ]))
    }
}
/// Graph function registered as `ast-node`: creates a graph node that is pre-populated with a
/// `_location` and a `_kind` attribute.
pub struct AstNode;

impl Function for AstNode {
    /// Expects two parameters: the syntax node whose location is recorded, and the value to
    /// store as `_kind`. Returns the newly created graph node.
    fn call(
        &self,
        graph: &mut Graph,
        _source: &str,
        parameters: &mut dyn Parameters,
    ) -> Result<Value, ExecutionError> {
        let syntax_node = parameters.param()?;
        let kind = parameters.param()?;
        parameters.finish()?;

        // The graph node is created before the location is computed, exactly as before, so a
        // failing location lookup still leaves the fresh node in the graph.
        let node = graph.add_graph_node();
        let location = get_location(syntax_node, graph)?;

        let attributes = &mut graph[node].attributes;
        if attributes
            .add(Identifier::from("_location"), location)
            .is_err()
        {
            return Err(ExecutionError::DuplicateAttribute(format!(
                " _location on graph node ({:?})",
                node
            )));
        }
        if attributes.add(Identifier::from("_kind"), kind).is_err() {
            return Err(ExecutionError::DuplicateAttribute(format!(
                " _kind on graph node ({:?})",
                node
            )));
        }
        Ok(Value::GraphNode(node))
    }
}
/// A struct representing the prefix on a Python string.
struct Prefix {
    /// The prefix letters (`b`, `f`, `u`, `r`). `get_prefix` stores them lower-cased.
    flags: String,
    /// The opening delimiter recognised by `get_prefix`, or the empty string if there was none.
    quotes: String,
}

impl Prefix {
    /// Returns the flags immediately followed by the quotes.
    fn full(&self) -> String {
        format!("{}{}", self.flags, self.quotes)
    }

    /// Returns a copy of this prefix with every `f`/`F` removed from the flags.
    fn safe(&self) -> Prefix {
        Prefix {
            flags: self.flags.replace('f', "").replace('F', ""),
            quotes: self.quotes.clone(),
        }
    }
}

/// Splits the prefix off the start of the text `s` of a Python string literal.
///
/// The flags are the longest run of at most two leading letters from `bfurBFUR`, returned in
/// lower case. The quotes are the first of `"""`, `'''`, `"`, `'` or `}` that the remaining text
/// starts with, or the empty string when it starts with none of them.
// NOTE(review): `}` presumably covers the part of an f-string that follows a replacement
// field -- confirm against the grammar.
fn get_prefix(s: &str) -> Prefix {
    // Same result as matching `^[bfurBFUR]{0,2}`, without compiling a regex on every call. The
    // accepted letters are ASCII, so the number of matching bytes is a valid split index.
    let flags_len = s
        .bytes()
        .take(2)
        .take_while(|byte| b"bfurBFUR".contains(byte))
        .count();
    let (flags, rest) = s.split_at(flags_len);

    // Triple quotes have to be tried before their single-character counterparts.
    let quotes = ["\"\"\"", "'''", "\"", "'", "}"]
        .iter()
        .copied()
        .find(|delimiter| rest.starts_with(*delimiter))
        .unwrap_or("");

    Prefix {
        flags: flags.to_lowercase(),
        quotes: quotes.to_owned(),
    }
}
/// Checks how `get_prefix` splits flags and quotes for the supported delimiters.
#[test]
fn test_get_prefix() {
    // `get_prefix` lower-cases the flags, so mixed-case input comes back in lower case.
    let p = get_prefix("rb'''hello'''");
    assert_eq!(p.flags, "rb");
    assert_eq!(p.quotes, "'''");
    let p = get_prefix("Br\"\"\"hello\"\"\"");
    assert_eq!(p.flags, "br");
    assert_eq!(p.quotes, "\"\"\"");
    let p = get_prefix("FR\"hello\"");
    assert_eq!(p.flags, "fr");
    assert_eq!(p.quotes, "\"");
    let p = get_prefix("uR'hello'");
    assert_eq!(p.flags, "ur");
    assert_eq!(p.quotes, "'");
    let p = get_prefix("''");
    assert_eq!(p.flags, "");
    assert_eq!(p.quotes, "'");
    let p = get_prefix("\"\"");
    assert_eq!(p.flags, "");
    assert_eq!(p.quotes, "\"");
    let p = get_prefix("\"\"\"\"\"\"");
    assert_eq!(p.flags, "");
    assert_eq!(p.quotes, "\"\"\"");
}
/// Returns the text of the string literal `s` without its prefix (flags and opening quotes) and
/// without its closing delimiter.
///
/// The closing delimiter is taken to be the same text as the opening quotes found by
/// `get_prefix`. If `s` does not end with it, the text after the prefix is returned unchanged
/// instead of panicking.
fn get_string_contents(s: String) -> String {
    let prefix = get_prefix(&s);
    // `get_prefix` lower-cases the flags, so `prefix.full()` need not be a textual prefix of `s`
    // (e.g. `R"x"`). The flags are ASCII letters, whose lower-casing keeps the byte length, so
    // the prefix is skipped by length rather than by content.
    let prefix_len = prefix.flags.len() + prefix.quotes.len();
    let contents = &s[prefix_len..];
    let contents = contents
        .strip_suffix(prefix.quotes.as_str())
        .unwrap_or(contents);
    contents.to_owned()
}
/// Checks that `get_string_contents` strips the prefix and the closing delimiter.
#[test]
fn test_get_string_contents() {
    // (literal, expected contents)
    let cases = [
        ("rb'''hello'''", "hello"),
        ("Br\"\"\"hello\"\"\"", "hello"),
        ("FR\"hello\"", "hello"),
        ("uR'hello'", "hello"),
        ("''", ""),
        ("\"\"", ""),
        ("\"\"\"\"\"\"", ""),
        ("''''''", ""),
    ];
    for &(literal, expected) in &cases {
        assert_eq!(get_string_contents(literal.to_owned()), expected);
    }
}
/// Graph function registered as `string-prefix`: yields the flags and opening quotes found at
/// the start of a syntax node's source text.
pub struct StringPrefix;

impl Function for StringPrefix {
    /// Expects exactly one parameter, a syntax node reference.
    fn call(
        &self,
        graph: &mut Graph,
        source: &str,
        parameters: &mut dyn Parameters,
    ) -> Result<Value, ExecutionError> {
        let syntax_ref = parameters.param()?.into_syntax_node_ref()?;
        let syntax_node = graph[syntax_ref];
        parameters.finish()?;
        let literal = &source[syntax_node.byte_range()];
        Ok(Value::String(get_prefix(literal).full()))
    }
}
/// Graph function registered as `string-contents`: yields a syntax node's source text with the
/// string prefix and closing delimiter removed (see `get_string_contents`).
pub struct StringContents;

impl Function for StringContents {
    /// Expects exactly one parameter, a syntax node reference.
    fn call(
        &self,
        graph: &mut Graph,
        source: &str,
        parameters: &mut dyn Parameters,
    ) -> Result<Value, ExecutionError> {
        let syntax_ref = parameters.param()?.into_syntax_node_ref()?;
        let syntax_node = graph[syntax_ref];
        parameters.finish()?;
        let literal = source[syntax_node.byte_range()].to_owned();
        Ok(Value::String(get_string_contents(literal)))
    }
}
pub struct StringQuotes;
impl Function for StringQuotes {
fn call(
&self,
graph: &mut Graph,
source: &str,
parameters: &mut dyn Parameters,
) -> Result<Value, ExecutionError> {
let node = graph[parameters.param()?.into_syntax_node_ref()?];
parameters.finish()?;
let prefix = get_prefix(&source[node.byte_range()]);
Ok(Value::String(prefix.quotes))
}
}
/// Graph function registered as `string-safe-prefix`.
///
/// Yields a version of the string prefix that can be used in a call to `literal_eval`: the
/// flags with any `f` or `F` removed, followed by the opening quotes.
pub struct StringSafePrefix;

impl Function for StringSafePrefix {
    /// Expects exactly one parameter, a syntax node reference.
    fn call(
        &self,
        graph: &mut Graph,
        source: &str,
        parameters: &mut dyn Parameters,
    ) -> Result<Value, ExecutionError> {
        let syntax_ref = parameters.param()?.into_syntax_node_ref()?;
        let syntax_node = graph[syntax_ref];
        parameters.finish()?;
        let literal = &source[syntax_node.byte_range()];
        // None of the recognised quote delimiters contains an `f`, so stripping it from the
        // flags alone gives the same text as stripping it from the full prefix.
        Ok(Value::String(get_prefix(literal).safe().full()))
    }
}
/// Graph function registered as `safe-string`.
///
/// Yields the string literal re-assembled from its prefix with `f` and `F` stripped, its
/// contents, and its opening quotes repeated as the closing delimiter.
pub struct SafeString;

impl Function for SafeString {
    /// Expects exactly one parameter, a syntax node reference.
    fn call(
        &self,
        graph: &mut Graph,
        source: &str,
        parameters: &mut dyn Parameters,
    ) -> Result<Value, ExecutionError> {
        let syntax_ref = parameters.param()?.into_syntax_node_ref()?;
        let syntax_node = graph[syntax_ref];
        parameters.finish()?;

        let literal = &source[syntax_node.byte_range()];
        let prefix = get_prefix(literal);
        let contents = get_string_contents(literal.to_owned());

        let mut safe = prefix.safe().full();
        safe.push_str(&contents);
        safe.push_str(&prefix.quotes);
        Ok(Value::String(safe))
    }
}
/// Graph function registered as `unnamed-child-index`: yields the position of a syntax node
/// within the full list of its parent's children.
pub struct UnnamedChildIndex;

impl Function for UnnamedChildIndex {
    /// Expects exactly one parameter, a syntax node reference. Fails when the node has no
    /// parent or cannot be found among the parent's children.
    fn call(
        &self,
        graph: &mut Graph,
        _source: &str,
        parameters: &mut dyn Parameters,
    ) -> Result<Value, ExecutionError> {
        let syntax_ref = parameters.param()?.into_syntax_node_ref()?;
        let node = graph[syntax_ref];
        parameters.finish()?;

        let parent = node.parent().ok_or_else(|| {
            ExecutionError::FunctionFailed(
                "unnamed-child-index".into(),
                "Cannot call child-index on the root node".to_string(),
            )
        })?;

        let mut cursor = parent.walk();
        let position = parent
            .children(&mut cursor)
            .position(|sibling| sibling == node);
        match position {
            Some(index) => Ok(Value::Integer(index as u32)),
            None => Err(ExecutionError::FunctionFailed(
                "unnamed-child-index".into(),
                "Called child-index on a non-named child".to_string(),
            )),
        }
    }
}
/// Graph function registered as `concatenate-strings`: joins all of its parameters, in order,
/// into a single string.
pub struct ConcatenateStrings;

impl Function for ConcatenateStrings {
    /// Accepts any number of parameters. Parameters are consumed until `param()` stops
    /// returning `Ok`; a parameter that is not a string makes the call fail.
    fn call(
        &self,
        _graph: &mut Graph,
        _source: &str,
        parameters: &mut dyn Parameters,
    ) -> Result<Value, ExecutionError> {
        let mut joined = String::new();
        while let Ok(piece) = parameters.param() {
            joined += piece.into_string()?.as_str();
        }
        Ok(Value::String(joined))
    }
}
/// Graph function registered as `instance-of`: tells whether a syntax node's kind equals a
/// given name.
pub struct InstanceOf;

impl Function for InstanceOf {
    /// Expects two parameters: a syntax node reference and the kind name to compare against.
    fn call(
        &self,
        graph: &mut Graph,
        _source: &str,
        parameters: &mut dyn Parameters,
    ) -> Result<Value, ExecutionError> {
        let syntax_ref = parameters.param()?.into_syntax_node_ref()?;
        let node = graph[syntax_ref];
        let class_name = parameters.param()?.into_string()?;
        parameters.finish()?;
        Ok(Value::Boolean(node.kind() == class_name.as_str()))
    }
}
/// Graph function registered as `get-parent`: yields the parent of a syntax node.
pub struct GetParent;

impl Function for GetParent {
    /// Expects exactly one parameter, a syntax node reference. Fails when the node has no
    /// parent.
    fn call(
        &self,
        graph: &mut Graph,
        _source: &str,
        parameters: &mut dyn Parameters,
    ) -> Result<Value, ExecutionError> {
        let syntax_ref = parameters.param()?.into_syntax_node_ref()?;
        let node = graph[syntax_ref];
        parameters.finish()?;
        match node.parent() {
            Some(parent) => Ok(Value::SyntaxNode(graph.add_syntax_node(parent))),
            None => Err(ExecutionError::FunctionFailed(
                "get-parent".into(),
                "Cannot call get-parent on the root node".to_string(),
            )),
        }
    }
}
/// Graph function registered as `has-named-child`: tells whether a syntax node has a child in
/// the field with the given name.
pub struct HasNamedChild;

impl Function for HasNamedChild {
    /// Expects two parameters: a syntax node reference and a field name.
    fn call(
        &self,
        graph: &mut Graph,
        _source: &str,
        parameters: &mut dyn Parameters,
    ) -> Result<Value, ExecutionError> {
        let syntax_ref = parameters.param()?.into_syntax_node_ref()?;
        let node = graph[syntax_ref];
        let field_name = parameters.param()?.into_string()?;
        parameters.finish()?;
        let child = node.child_by_field_name(field_name.as_str());
        Ok(Value::Boolean(child.is_some()))
    }
}
/// Graph function registered as `is-boolean-operator`: tells whether the source text of a
/// node's `operator` field equals a given string.
pub struct IsBooleanOperator;

impl Function for IsBooleanOperator {
    /// Expects two parameters: a syntax node reference and the expected operator text. Yields
    /// `false` when the node has no `operator` field.
    fn call(
        &self,
        graph: &mut Graph,
        source: &str,
        parameters: &mut dyn Parameters,
    ) -> Result<Value, ExecutionError> {
        let syntax_ref = parameters.param()?.into_syntax_node_ref()?;
        let node = graph[syntax_ref];
        let expected_op_type = parameters.param()?.into_string()?;
        parameters.finish()?;
        let matches_expected = match node.child_by_field_name("operator") {
            Some(op) => expected_op_type.as_str() == &source[op.byte_range()],
            None => false,
        };
        Ok(Value::Boolean(matches_expected))
    }
}
pub struct Modulo;
impl Function for Modulo {
fn call(
&self,
_graph: &mut Graph,
_source: &str,
parameters: &mut dyn Parameters,
) -> Result<Value, ExecutionError> {
let left = parameters.param()?.into_integer()?;
let right = parameters.param()?.into_integer()?;
parameters.finish()?;
Ok(Value::Integer(left % right))
}
}
/// Graph function registered as `get-last-element`: yields the last element of a list, or the
/// null value when the list is empty.
pub struct GetLastElement;

impl Function for GetLastElement {
    /// Expects exactly one parameter, a list.
    fn call(
        &self,
        _graph: &mut Graph,
        _source: &str,
        parameters: &mut dyn Parameters,
    ) -> Result<Value, ExecutionError> {
        let elements = parameters.param()?.into_list()?;
        parameters.finish()?;
        Ok(elements.last().cloned().unwrap_or(Value::Null))
    }
}
}
/// Iterator over a syntax node and all of its descendants, in depth-first pre-order.
struct TreeIterator<'a> {
    /// Stack of nodes that still have to be yielded; the next one is at the end.
    nodes_to_visit: Vec<tree_sitter::Node<'a>>,
}

impl<'a> TreeIterator<'a> {
    /// Creates an iterator whose first item is `root`.
    fn new(root: tree_sitter::Node<'a>) -> Self {
        let nodes_to_visit = vec![root];
        Self { nodes_to_visit }
    }
}

impl<'a> Iterator for TreeIterator<'a> {
    type Item = tree_sitter::Node<'a>;

    /// Pops the next node and schedules its children before returning it.
    fn next(&mut self) -> Option<Self::Item> {
        let node = self.nodes_to_visit.pop()?;
        // Children are pushed last-to-first so that the first child is popped next.
        let children = (0..node.child_count())
            .rev()
            .filter_map(|i| node.child(i));
        self.nodes_to_visit.extend(children);
        Some(node)
    }
}
/// An error or missing node found in a parse tree.
#[derive(Debug, Clone)]
struct SyntaxError {
    /// Position at which the offending node starts.
    start_pos: tree_sitter::Point,
    /// Position at which the offending node ends.
    end_pos: tree_sitter::Point,
    /// Source text covered by the offending node (empty if the range could not be sliced).
    source: String,
}

/// Walks the tree below `root` and lazily yields a `SyntaxError` for every node that is an
/// error node or a missing node.
///
/// `source` is the text the tree was parsed from; it supplies the `source` field of each error.
fn syntax_errors_from_tree<'a>(
    root: tree_sitter::Node<'a>,
    source: &'a str,
) -> impl Iterator<Item = SyntaxError> + 'a {
    TreeIterator::new(root).filter_map(move |node| {
        if !(node.is_error() || node.is_missing()) {
            return None;
        }
        // `get` rather than indexing: an unusable byte range yields an empty text, not a panic.
        let text = source.get(node.byte_range()).unwrap_or("");
        Some(SyntaxError {
            start_pos: node.start_position(),
            end_pos: node.end_position(),
            source: text.to_string(),
        })
    })
}
fn add_syntax_error_nodes(graph: &mut tree_sitter_graph::graph::Graph, errors: &[SyntaxError]) {
for error in errors {
let error_node = graph.add_graph_node();
// Add _kind attribute
graph[error_node]
.attributes
.add(
tree_sitter_graph::Identifier::from("_kind"),
tree_sitter_graph::graph::Value::String("SyntaxErrorNode".to_string()),
)
.expect("Fresh node should not have duplicate attributes");
// Add _location attribute
let location = tree_sitter_graph::graph::Value::List(
vec![
error.start_pos.row,
error.start_pos.column,
error.end_pos.row,
error.end_pos.column,
]
.into_iter()
.map(|v| tree_sitter_graph::graph::Value::from(v as u32))
.collect(),
);
graph[error_node]
.attributes
.add(tree_sitter_graph::Identifier::from("_location"), location)
.expect("Fresh node should not have duplicate attributes");
// Add source attribute
graph[error_node]
.attributes
.add(
tree_sitter_graph::Identifier::from("source"),
tree_sitter_graph::graph::Value::String(error.source.clone()),
)
.expect("Fresh node should not have duplicate attributes");
}
}
fn main() -> Result<()> {
let matches = Command::new("tsg-python")
.version(BUILD_VERSION)
.author("Taus Brock-Nannestad <tausbn@github.com>")
.about("Extracts a Python AST from the parse tree given by tree-sitter-python")
.arg(
Arg::new("tsg")
.short('t')
.long("tsg")
.action(ArgAction::Set)
.required(false),
)
.arg(Arg::new("source").index(1).required(true))
.get_matches();
let tsg_path = matches
.get_one::<String>("tsg")
.map(|s| Path::new(s).display().to_string())
.unwrap_or_else(|| "bundled `python.tsg`".to_owned());
let source_path = Path::new(matches.get_one::<String>("source").unwrap());
let language = tsp::language();
let mut parser = Parser::new();
parser.set_language(&language)?;
// Statically include `python.tsg`:
let tsg = if matches.contains_id("tsg") {
std::fs::read(&tsg_path).with_context(|| format!("Error reading TSG file {}", tsg_path))?
} else {
include_bytes!("../python.tsg").to_vec()
};
let tsg = String::from_utf8(tsg)?;
let source = std::fs::read(source_path)
.with_context(|| format!("Error reading source file {}", source_path.display()))?;
let source = String::from_utf8(source)?;
let tree = parser
.parse(&source, None)
.ok_or_else(|| anyhow!("Could not parse {}", source_path.display()))?;
let file = File::from_str(language, &tsg)
.with_context(|| anyhow!("Error parsing TSG file {}", tsg_path))?;
let mut functions = Functions::stdlib();
functions.add(Identifier::from("location"), extra_functions::Location);
functions.add(
Identifier::from("location-start"),
extra_functions::LocationStart,
);
functions.add(
Identifier::from("location-end"),
extra_functions::LocationEnd,
);
functions.add(
Identifier::from("string-prefix"),
extra_functions::StringPrefix,
);
functions.add(
Identifier::from("string-contents"),
extra_functions::StringContents,
);
functions.add(
Identifier::from("string-quotes"),
extra_functions::StringQuotes,
);
functions.add(
Identifier::from("string-safe-prefix"),
extra_functions::StringSafePrefix,
);
functions.add(Identifier::from("safe-string"), extra_functions::SafeString);
functions.add(
Identifier::from("unnamed-child-index"),
extra_functions::UnnamedChildIndex,
);
functions.add(Identifier::from("ast-node"), extra_functions::AstNode);
functions.add(
Identifier::from("concatenate-strings"),
extra_functions::ConcatenateStrings,
);
functions.add(Identifier::from("instance-of"), extra_functions::InstanceOf);
functions.add(Identifier::from("get-parent"), extra_functions::GetParent);
functions.add(
Identifier::from("has-named-child"),
extra_functions::HasNamedChild,
);
functions.add(
Identifier::from("is-boolean-operator"),
extra_functions::IsBooleanOperator,
);
functions.add(Identifier::from("mod"), extra_functions::Modulo);
functions.add(
Identifier::from("get-last-element"),
extra_functions::GetLastElement,
);
let globals = Variables::new();
let config = ExecutionConfig::new(&functions, &globals).lazy(false);
let mut graph = file
.execute(&tree, &source, &config, &NoCancellation)
.with_context(|| format!("Could not execute TSG file {}", tsg_path))?;
// Collect and add syntax error nodes to the graph
if tree.root_node().has_error() {
let syntax_errors: Vec<SyntaxError> =
syntax_errors_from_tree(tree.root_node(), &source).collect();
add_syntax_error_nodes(&mut graph, &syntax_errors);
}
print!("{}", graph.pretty_print());
Ok(())
}