Initial version of extractor based on tree-sitter grammar

This commit is contained in:
Arthur Baars
2020-10-20 13:08:13 +02:00
parent d00c956028
commit 47ccc33ab3
5 changed files with 790 additions and 11 deletions

108
Cargo.lock generated
View File

@@ -9,12 +9,53 @@ dependencies = [
"memchr",
]
[[package]]
name = "ansi_term"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b"
dependencies = [
"winapi",
]
[[package]]
name = "atty"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
dependencies = [
"hermit-abi",
"libc",
"winapi",
]
[[package]]
name = "bitflags"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
[[package]]
name = "cc"
version = "1.0.61"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed67cbde08356238e75fc4656be4749481eeffb09e19f320a25237d5221c985d"
[[package]]
name = "clap"
version = "2.33.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002"
dependencies = [
"ansi_term",
"atty",
"bitflags",
"strsim",
"textwrap",
"unicode-width",
"vec_map",
]
[[package]]
name = "generator"
version = "0.1.0"
@@ -22,6 +63,15 @@ dependencies = [
"node-types",
]
[[package]]
name = "hermit-abi"
version = "0.1.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5aca5565f760fb5b220e499d72710ed156fdb74e631659e99377d9ebfbd13ae8"
dependencies = [
"libc",
]
[[package]]
name = "itoa"
version = "0.4.6"
@@ -34,6 +84,12 @@ version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
version = "0.2.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2448f6066e80e3bfc792e9c98bf705b4b0fc6e8ef5b43e5889aff0eaa9c58743"
[[package]]
name = "memchr"
version = "2.3.3"
@@ -89,6 +145,9 @@ name = "ruby-extractor"
version = "0.1.0"
dependencies = [
"cc",
"clap",
"serde",
"serde_json",
"tree-sitter",
]
@@ -129,6 +188,12 @@ dependencies = [
"serde",
]
[[package]]
name = "strsim"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
[[package]]
name = "syn"
version = "1.0.46"
@@ -140,6 +205,15 @@ dependencies = [
"unicode-xid",
]
[[package]]
name = "textwrap"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
dependencies = [
"unicode-width",
]
[[package]]
name = "thread_local"
version = "1.0.1"
@@ -159,8 +233,42 @@ dependencies = [
"regex",
]
[[package]]
name = "unicode-width"
version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3"
[[package]]
name = "unicode-xid"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
[[package]]
name = "vec_map"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"

View File

@@ -8,6 +8,9 @@ edition = "2018"
[dependencies]
tree-sitter = "0.17.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
clap = "2.33"
[build-dependencies]
cc="*"

430
extractor/src/extractor.rs Normal file
View File

@@ -0,0 +1,430 @@
use super::nodes_types::{Entry, Field, Storage, TypeName};
use std::collections::BTreeMap as Map;
use std::collections::BTreeSet as Set;
use std::fmt;
use std::path::Path;
use tree_sitter::{Language, Node, Parser, Tree};
/// Bundles a tree-sitter parser with the schema entries that drive extraction.
pub struct Extractor {
/// Parser used by `extract` to parse each file; `create` sets its language.
pub parser: Parser,
/// Schema entries from which `extract` builds its table and union lookups.
pub schema: Vec<Entry>,
}
/// Builds an [`Extractor`] whose parser is configured for `language`.
///
/// # Panics
///
/// Panics if the parser rejects `language` (the result of `set_language` is
/// unwrapped).
pub fn create(language: Language, schema: Vec<Entry>) -> Extractor {
    let parser = {
        let mut configured = Parser::new();
        configured.set_language(language).unwrap();
        configured
    };
    Extractor { parser, schema }
}
impl Extractor {
    /// Reads the file at `path`, parses it and converts the syntax tree into a
    /// [`Program`] of facts.
    ///
    /// The program always starts with a comment fact naming the file. The
    /// parser is reset once the traversal has finished.
    ///
    /// # Errors
    ///
    /// Returns the I/O error if the file cannot be read.
    ///
    /// # Panics
    ///
    /// Panics if the parser yields no tree for the file.
    pub fn extract<'a>(&'a mut self, path: &Path) -> std::io::Result<Program> {
        let source = std::fs::read(path)?;
        let tree = self
            .parser
            .parse(&source, None)
            .expect("Failed to parse file");

        let header = Fact::Comment(format!(
            "Auto-generated FACT file for {}, generated by the cool kids",
            path.display()
        ));
        let mut visitor = Visitor {
            source: &source,
            program: vec![header],
            // Incremented before each use, so the first identifier is 0.
            counter: -1,
            // TODO: should we handle path strings that are not valid UTF8 better?
            path: path.display().to_string(),
            stack: Vec::new(),
            tables: build_schema_lookup(&self.schema),
            union_types: build_union_type_lookup(&self.schema),
        };

        traverse(&tree, &mut visitor);
        self.parser.reset();
        Ok(Program(visitor.program))
    }
}
/// Indexes every `Entry::Table` in `schema` by its type name.
///
/// Union entries are skipped. If two tables share a type name, the later one
/// replaces the earlier one.
fn build_schema_lookup<'a>(schema: &'a Vec<Entry>) -> Map<&'a TypeName, &'a Entry> {
    let mut tables: Map<&'a TypeName, &'a Entry> = Map::new();
    tables.extend(schema.iter().filter_map(|entry| match entry {
        Entry::Table { type_name, .. } => Some((type_name, entry)),
        Entry::Union { .. } => None,
    }));
    tables
}
/// Maps the type name of every `Entry::Union` in `schema` to its member set.
///
/// Table entries are skipped. If two unions share a type name, the later one
/// replaces the earlier one.
fn build_union_type_lookup<'a>(schema: &'a Vec<Entry>) -> Map<&'a TypeName, &'a Set<TypeName>> {
    let mut unions: Map<&'a TypeName, &'a Set<TypeName>> = Map::new();
    unions.extend(schema.iter().filter_map(|entry| match entry {
        Entry::Union { type_name, members } => Some((type_name, members)),
        Entry::Table { .. } => None,
    }));
    unions
}
/// Mutable state carried through the traversal of one syntax tree.
struct Visitor<'a> {
/// Raw bytes of the file being extracted; sliced to obtain the text of field-less nodes.
source: &'a Vec<u8>,
/// Facts emitted so far, in emission order.
program: Vec<Fact>,
/// Last number handed out for an `Id`/`Loc` pair; incremented before each use
/// (`extract` starts it at -1, so the first pair is numbered 0).
counter: i32,
/// Path of the file being extracted, as written into location facts.
path: String,
/// One entry per node that has been entered but not yet left; each entry collects
/// the (field name, id, type) of that node's already-finished children.
stack: Vec<Vec<(Option<&'static str>, Id, TypeName)>>,
/// Table entries of the schema, keyed by type name.
tables: Map<&'a TypeName, &'a Entry>,
/// Member sets of the schema's union types, keyed by the union's type name.
union_types: Map<&'a TypeName, &'a Set<TypeName>>,
}
impl Visitor<'_> {
fn enter_node(&mut self, node: Node) {
if node.is_extra() {
return;
}
self.stack.push(Vec::new());
}
fn leave_node(&mut self, field_name: Option<&'static str>, node: Node) {
if node.is_extra() {
return;
}
let child_nodes = self.stack.pop().expect("Vistor: empty stack");
let table = self.tables.get(&TypeName {
kind: node.kind().to_owned(),
named: node.is_named(),
});
if let Some(Entry::Table { fields, .. }) = table {
self.counter += 1;
let id = Id(self.counter);
let loc = Loc(self.counter);
self.program.push(Fact::New(Arg::IdArg(id)));
self.program.push(Fact::New(Arg::LocArg(loc)));
self.program.push(location_for(&self.path, loc, node));
let table_name = node_type_name(node.kind(), node.is_named());
let args: Option<Vec<Arg>>;
if fields.is_empty() {
args = Some(vec![sliced_source_arg(self.source, node)]);
} else {
args = self.complex_node(fields, child_nodes, id);
}
if let Some(args) = args {
self.program
.push(Fact::Definition(table_name, id, args, loc));
}
if let Some(parent) = self.stack.last_mut() {
parent.push((
field_name,
id,
TypeName {
kind: node.kind().to_owned(),
named: node.is_named(),
},
))
};
} else {
panic!(format!("Unknown table type: '{}'", node.kind()))
}
}
fn complex_node(
&mut self,
fields: &Vec<Field>,
child_nodes: Vec<(Option<&str>, Id, TypeName)>,
parent_id: Id,
) -> Option<Vec<Arg>> {
let mut map: Map<&Option<String>, (&Field, Vec<Id>)> = std::collections::BTreeMap::new();
for field in fields {
map.insert(&field.name, (field, Vec::new()));
}
for (child_field, child_id, child_type) in child_nodes {
if let Some((field, values)) = map.get_mut(&child_field.map(|x| x.to_owned())) {
//TODO: handle error and missing nodes
if self.type_matches(&child_type, &field.types) {
values.push(child_id);
} else if field.name.is_some() {
println!(
"Type mismatch for field {:?} with type {:?} != {:?}",
child_field, child_type, field.types
)
}
} else {
println!(
"Value for unknown field: {:?} and type {:?}",
&child_field, &child_type
);
}
}
let mut args = Vec::new();
let mut is_valid = true;
for field in fields {
let child_ids = &map.get(&field.name).unwrap().1;
match &field.storage {
Storage::Column => {
if child_ids.len() == 1 {
args.push(Arg::IdArg(*child_ids.first().unwrap()));
} else {
is_valid = false;
println!("Argument count mismatch for field {:?}", field.name);
}
}
Storage::Table { parent, index } => {
for child_id in child_ids {
self.program.push(Fact::ChildOf(
node_type_name(&parent.kind, parent.named),
parent_id,
match &field.name {
Some(name) => name.to_owned(),
None => "child".to_owned(),
},
Index(*index),
*child_id,
));
}
}
}
}
if is_valid {
Some(args)
} else {
None
}
}
fn type_matches(&self, tp: &TypeName, types: &Set<TypeName>) -> bool {
if types.contains(tp) {
return true;
}
for other in types.iter() {
if let Some(x) = self.union_types.get(other) {
if self.type_matches(tp, x) {
return true;
}
}
}
return false;
}
}
/// Returns the source text covered by node `n` as a string argument.
///
/// Bytes that are not valid UTF-8 are replaced with U+FFFD instead of
/// aborting the extraction, so a source file in another encoding cannot
/// crash the whole run.
///
/// # Panics
///
/// Panics if the node's byte range lies outside `source`.
fn sliced_source_arg(source: &[u8], n: Node) -> Arg {
    let text = String::from_utf8_lossy(&source[n.byte_range()]);
    Arg::StringArg(text.into_owned())
}
/// Builds the `Located` fact tying `ident` to the span of node `n` in file `fp`.
///
/// The arguments are, in order: the location id, the file path, start row,
/// start column, end row and end column. Rows and columns are passed through
/// exactly as reported by the node; no offset is applied.
fn location_for(fp: &str, ident: Loc, n: Node) -> Fact {
    let start = n.start_position();
    let end = n.end_position();
    Fact::Located(vec![
        Arg::LocArg(ident),
        Arg::StringArg(fp.to_owned()),
        Arg::IntArg(start.row),
        Arg::IntArg(start.column),
        Arg::IntArg(end.row),
        Arg::IntArg(end.column),
    ])
}
/// Walks `tree` depth-first, calling `visitor.enter_node` when a node is first
/// reached and `visitor.leave_node` once all of its children have been
/// visited.
fn traverse(tree: &Tree, visitor: &mut Visitor) {
    let mut cursor = tree.walk();
    visitor.enter_node(cursor.node());
    // `descend` is false right after moving back up to a parent, so that the
    // parent's children are not visited a second time.
    let mut descend = true;
    loop {
        if descend && cursor.goto_first_child() {
            visitor.enter_node(cursor.node());
            continue;
        }
        visitor.leave_node(cursor.field_name(), cursor.node());
        if cursor.goto_next_sibling() {
            descend = true;
            visitor.enter_node(cursor.node());
        } else if cursor.goto_parent() {
            descend = false;
        } else {
            // Back at the root with nothing left to visit.
            break;
        }
    }
}
/// The ordered list of facts extracted from one source file.
pub struct Program(Vec<Fact>);

impl fmt::Display for Program {
    /// Renders one fact per line, each terminated by a newline.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let rendered: String = self
            .0
            .iter()
            .map(|fact| format!("{}\n", fact))
            .collect();
        write!(f, "{}", rendered)
    }
}
/// A single line of extractor output; see the `Display` impl for the exact rendering.
#[derive(Debug)]
enum Fact {
/// Declares a fresh identifier; rendered as `<arg> = *`.
New(Arg),
/// Defines a node: (table name, node id, column arguments, location id).
/// Rendered as `<table>_def(<id>, <args...>, <loc>)` with the name escaped.
Definition(String, Id, Vec<Arg>, Loc),
/// Links a child to its parent: (parent table name, parent id, field name, index, child id).
/// Rendered as `<parent>_<field>(<parent id>, <index>, <child id>)` with the name escaped.
ChildOf(String, Id, String, Index, Id),
/// A location record; rendered as `location(loc, path, r1, c1, r2, c2)`.
/// The `Display` impl requires at least six arguments.
Located(Vec<Arg>),
/// Free-form text; rendered as a `// ...` comment line.
Comment(String),
}
impl fmt::Display for Fact {
    /// Renders the fact in its textual output form.
    ///
    /// # Panics
    ///
    /// Panics when a `Located` fact carries fewer than six arguments.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Fact::New(id) => write!(f, "{} = *", id),
            Fact::Definition(table, id, args, loc) => {
                let name = escape_name(&format!("{}_def", &table));
                // Every argument carries its own trailing ", " separator, so
                // the location can follow directly.
                let rendered_args: String =
                    args.iter().map(|arg| format!("{}, ", arg)).collect();
                write!(f, "{}({}, {}{})", name, id, rendered_args, loc)
            }
            Fact::ChildOf(parent_name, id, field_name, index, other) => {
                let name = escape_name(&format!("{}_{}", &parent_name, &field_name));
                write!(f, "{}({}, {}, {})", name, id, index, other)
            }
            Fact::Located(args) => write!(
                f,
                "location({}, {}, {}, {}, {}, {})",
                args.get(0).unwrap(),
                args.get(1).unwrap(),
                args.get(2).unwrap(),
                args.get(3).unwrap(),
                args.get(4).unwrap(),
                args.get(5).unwrap(),
            ),
            Fact::Comment(text) => write!(f, "// {}", text),
        }
    }
}
/// An entity identifier, rendered as `#0`, `#1`, ...
#[derive(Debug, Copy, Clone)]
struct Id(i32);

impl fmt::Display for Id {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let Id(number) = self;
        write!(f, "#{}", number)
    }
}
/// A location identifier, rendered as `#0_loc`, `#1_loc`, ...
#[derive(Debug, Copy, Clone)]
struct Loc(i32);

impl fmt::Display for Loc {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let Loc(number) = self;
        write!(f, "#{}_loc", number)
    }
}
/// A plain numeric index, rendered as the bare number.
#[derive(Debug, Copy, Clone)]
struct Index(usize);

impl fmt::Display for Index {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let Index(position) = self;
        write!(f, "{}", position)
    }
}
/// An untyped argument of a fact.
#[derive(Debug)]
enum Arg {
    /// Rendered as the bare number.
    IntArg(usize),
    /// Rendered in double quotes, with embedded double quotes doubled.
    StringArg(String),
    /// Rendered like the wrapped [`Id`].
    IdArg(Id),
    /// Rendered like the wrapped [`Loc`].
    LocArg(Loc),
}

impl fmt::Display for Arg {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Arg::IntArg(number) => write!(f, "{}", number),
            Arg::StringArg(text) => {
                let escaped = text.replace("\"", "\"\"");
                write!(f, "\"{}\"", escaped)
            }
            Arg::IdArg(id) => write!(f, "{}", id),
            Arg::LocArg(loc) => write!(f, "{}", loc),
        }
    }
}
/// Names that get a `__` suffix appended by [`escape_name`].
const RESERVED_KEYWORDS: [&str; 14] = [
    "boolean", "case", "date", "float", "int", "key", "of", "order", "ref", "string", "subtype",
    "type", "unique", "varchar",
];

/// Returns a copy of `name`, escaped so that it is a valid QL identifier.
///
/// * A leading underscore is prefixed with `underscore` (the underscore itself
///   is kept, giving `underscore_...`).
/// * Punctuation characters are spelled out (`{` becomes `lbrace`, ...).
/// * Every other character is lower-cased.
/// * If the result equals a reserved keyword, `__` is appended.
pub fn escape_name(name: &str) -> String {
    let mut escaped = String::new();
    if name.starts_with('_') {
        escaped.push_str("underscore");
    }
    for c in name.chars() {
        let spelled_out = match c {
            '{' => "lbrace",
            '}' => "rbrace",
            '<' => "langle",
            '>' => "rangle",
            '[' => "lbracket",
            ']' => "rbracket",
            '(' => "lparen",
            ')' => "rparen",
            '|' => "pipe",
            '=' => "equal",
            '~' => "tilde",
            '?' => "question",
            '`' => "backtick",
            '^' => "caret",
            '!' => "bang",
            '#' => "hash",
            '%' => "percent",
            '&' => "ampersand",
            '.' => "dot",
            ',' => "comma",
            '/' => "slash",
            ':' => "colon",
            ';' => "semicolon",
            '"' => "dquote",
            '*' => "star",
            '+' => "plus",
            '-' => "minus",
            '@' => "at",
            _ => {
                // Anything else is copied in lower case.
                escaped.extend(c.to_lowercase());
                continue;
            }
        };
        escaped.push_str(spelled_out);
    }
    if RESERVED_KEYWORDS.contains(&escaped.as_str()) {
        escaped.push_str("__");
    }
    escaped
}
/// Builds the (unescaped) name used to refer to the QL type of a tree-sitter
/// node type, given its `(kind, named)` pair.
///
/// Named node types keep their kind; unnamed ones get an `_unnamed` suffix.
fn node_type_name(kind: &str, named: bool) -> String {
    let mut name = kind.to_string();
    if !named {
        name.push_str("_unnamed");
    }
    name
}

View File

@@ -1,18 +1,86 @@
use tree_sitter::{Language, Parser};
mod extractor;
mod nodes_types;
fn main() {
let mut parser = Parser::new();
use clap;
use std::fs;
use std::io::BufRead;
use std::path::{Path, PathBuf};
use tree_sitter::Language;
/// Entry point of the Ruby extractor.
///
/// Reads the list of files named by `--file-list` (one path per line). For
/// each file it extracts a fact program, copies the file into the source
/// archive directory, and writes the program to a `.trap` file below the
/// output directory.
///
/// # Errors
///
/// Returns the first I/O error encountered while reading the file list or
/// the schema, or while extracting, copying or writing a file.
fn main() -> std::io::Result<()> {
    extern "C" {
        fn tree_sitter_ruby() -> Language;
    }
    let matches = clap::App::new("Ruby extractor")
        .version("1.0")
        .author("GitHub")
        .about("CodeQL Ruby extractor")
        .args_from_usage(
            "--source-archive-dir=<DIR> 'Sets a custom source archive folder'
--output-dir=<DIR> 'Sets a custom trap folder'
--file-list=<FILE_LIST> 'A text files containing the paths of the files to extract'",
        )
        .get_matches();
    let src_archive_dir = matches
        .value_of("source-archive-dir")
        .expect("missing --source-archive-dir");
    let src_archive_dir = PathBuf::from(src_archive_dir);
    let trap_dir = matches
        .value_of("output-dir")
        .expect("missing --output-dir");
    let trap_dir = PathBuf::from(trap_dir);
    let file_list = matches.value_of("file-list").expect("missing --file-list");
    let file_list = fs::File::open(file_list)?;
    let node_types_path = PathBuf::from("tree-sitter-ruby/src/node-types.json");

    // SAFETY: assumes the `tree_sitter_ruby` symbol is provided by the linked
    // tree-sitter-ruby grammar and takes no arguments — TODO confirm against
    // the build script.
    let language = unsafe { tree_sitter_ruby() };
    let schema = nodes_types::read_node_types(&node_types_path)?;
    let mut extractor = extractor::create(language, schema);

    for line in std::io::BufReader::new(file_list).lines() {
        let path = PathBuf::from(line?);
        let trap_file = path_for(&trap_dir, &path, ".trap");
        let src_archive_file = path_for(&src_archive_dir, &path, "");
        let trap = extractor.extract(&path)?;
        std::fs::create_dir_all(&src_archive_file.parent().unwrap())?;
        std::fs::copy(&path, &src_archive_file)?;
        std::fs::create_dir_all(&trap_file.parent().unwrap())?;
        let mut trap_file = std::fs::File::create(&trap_file)?;
        // Going through a trait object makes `write!` usable without having
        // `std::io::Write` imported.
        let trap_file: &mut dyn std::io::Write = &mut trap_file;
        write!(trap_file, "{}", trap)?;
    }
    Ok(())
}
/// Maps `path` to a location below `dir` and appends the suffix `ext` (for
/// example `".trap"`, or `""` for no suffix) to the file name.
///
/// Prefix, root and `.` components of `path` are dropped. A `..` component
/// removes the previously added component, but never climbs above `dir`, so
/// the result always stays inside `dir`.
///
/// When the file already has an extension, `ext` is appended to it verbatim
/// (`foo.rb` + `.trap` gives `foo.rb.trap`). When it has none, the leading dot
/// of `ext` is dropped before handing it to `set_extension`, which adds its
/// own dot (`Rakefile` + `.trap` gives `Rakefile.trap`, not `Rakefile..trap`).
fn path_for(dir: &Path, path: &Path, ext: &str) -> PathBuf {
    use std::path::Component;

    let mut result = PathBuf::from(dir);
    // Number of components of `path` currently appended to `dir`.
    let mut depth: usize = 0;
    for component in path.components() {
        match component {
            Component::Prefix(_) => {
                // skip for now
                // TODO: handle this properly for Windows
            }
            Component::RootDir | Component::CurDir => {
                // skip
            }
            Component::Normal(name) => {
                result.push(name);
                depth += 1;
            }
            Component::ParentDir => {
                // Only undo components that came from `path`; never pop
                // parts of `dir` itself.
                if depth > 0 {
                    result.pop();
                    depth -= 1;
                }
            }
        }
    }
    match result.extension() {
        Some(existing) => {
            let mut new_ext = existing.to_os_string();
            new_ext.push(ext);
            result.set_extension(new_ext);
        }
        None => {
            result.set_extension(ext.strip_prefix('.').unwrap_or(ext));
        }
    }
    result
}

View File

@@ -0,0 +1,170 @@
use serde::Deserialize;
use std::collections::BTreeMap as Map;
use std::collections::BTreeSet as Set;
use std::fs;
use std::path::Path;
/// One schema entry derived from a node-types record (see `convert_nodes`).
#[derive(Debug)]
pub enum Entry {
/// Created for a node that lists subtypes; `members` holds those subtypes.
Union {
type_name: TypeName,
members: Set<TypeName>,
},
/// Created for every other node; `fields` holds its named fields followed by
/// the anonymous children entry, if present.
Table {
type_name: TypeName,
fields: Vec<Field>,
},
}
/// Identifies a node type by its kind string and its `named` flag.
/// Orderable so that it can serve as a key in ordered maps and sets.
#[derive(Debug, Ord, PartialOrd, Eq, PartialEq)]
pub struct TypeName {
/// The node kind, taken from the `type` key of the node-types JSON.
pub kind: String,
/// The `named` flag of the node type, copied from the node-types JSON.
pub named: bool,
}
/// A field (or the anonymous children entry) of a table entry.
#[derive(Debug)]
pub struct Field {
/// The node types a value of this field may have.
pub types: Set<TypeName>,
/// The name of the field or None for the anonymous 'children'
/// entry from node_types.json
pub name: Option<String>,
/// Where values of this field are stored (see `Storage`).
pub storage: Storage,
}
/// Describes where the values of a field are stored.
#[derive(Debug)]
pub enum Storage {
/// The field is stored as a column in the parent table. Chosen by `add_field`
/// for fields that are required and not multiple.
Column,
/// The field is stored in a link table. `parent` is the type owning the field and
/// `index` is the number of fields that had been added to the parent before this one.
Table {
parent: TypeName,
index: usize,
},
}
pub fn read_node_types(node_types_path: &Path) -> std::io::Result<Vec<Entry>> {
let file = fs::File::open(node_types_path)?;
let node_types = serde_json::from_reader(file)?;
Ok(convert_nodes(node_types))
}
/// Copies the kind and `named` flag of `node_type` into a new [`TypeName`].
fn convert_type(node_type: &NodeType) -> TypeName {
    let kind = node_type.kind.clone();
    let named = node_type.named;
    TypeName { kind, named }
}
/// Converts a list of node types into the set of their type names.
///
/// Duplicates collapse into a single set element.
fn convert_types(node_types: &[NodeType]) -> Set<TypeName> {
    // Collect straight into the return type. Routing the collection through
    // `BTreeSet::from` leaves the intermediate type to be inferred from the
    // available `From` impls, which is ambiguous when there is more than one.
    node_types.iter().map(convert_type).collect()
}
pub fn convert_nodes(nodes: Vec<NodeInfo>) -> Vec<Entry> {
let mut entries: Vec<Entry> = Vec::new();
for node in nodes {
if let Some(subtypes) = &node.subtypes {
// It's a tree-sitter supertype node, for which we create a union
// type.
entries.push(Entry::Union {
type_name: TypeName {
kind: node.kind,
named: node.named,
},
members: convert_types(&subtypes),
});
} else {
// It's a product type, defined by a table.
let type_name = TypeName {
kind: node.kind,
named: node.named,
};
let mut fields = Vec::new();
// If the type also has fields or children, then we create either
// auxiliary tables or columns in the defining table for them.
if let Some(node_fields) = &node.fields {
for (field_name, field_info) in node_fields {
add_field(
&type_name,
Some(field_name.to_string()),
field_info,
&mut fields,
);
}
}
if let Some(children) = &node.children {
// Treat children as if they were a field called 'child'.
add_field(&type_name, None, children, &mut fields);
}
entries.push(Entry::Table { type_name, fields });
}
}
entries
}
/// Appends the schema field described by `field_info` to `fields`.
///
/// A field that is required and not multiple is stored as a column of the
/// parent table. Any other field is stored in an auxiliary table that records
/// the parent type and the field's position in `fields`.
fn add_field(
    parent_type_name: &TypeName,
    field_name: Option<String>,
    field_info: &FieldInfo,
    fields: &mut Vec<Field>,
) {
    let appears_exactly_once = field_info.required && !field_info.multiple;
    let storage = if appears_exactly_once {
        Storage::Column
    } else {
        Storage::Table {
            parent: TypeName {
                kind: parent_type_name.kind.clone(),
                named: parent_type_name.named,
            },
            index: fields.len(),
        }
    };
    let types = convert_types(&field_info.types);
    fields.push(Field {
        types,
        name: field_name,
        storage,
    });
}
/// One top-level record deserialized from the node-types JSON file.
// NOTE(review): `skip_serializing_if` only influences serialization, and this struct
// derives just `Deserialize`, so these attributes look like no-ops here — confirm
// before removing them.
#[derive(Deserialize)]
pub struct NodeInfo {
/// The node kind; read from the JSON key `type`.
#[serde(rename = "type")]
pub kind: String,
/// The `named` flag of the node type.
pub named: bool,
/// Named fields of the node, keyed by field name, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub fields: Option<Map<String, FieldInfo>>,
/// Description of the node's anonymous children, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub children: Option<FieldInfo>,
/// Subtypes of the node; when present, `convert_nodes` turns the node into a union.
#[serde(skip_serializing_if = "Option::is_none")]
pub subtypes: Option<Vec<NodeType>>,
}
/// A reference to a node type as it appears inside the node-types JSON.
#[derive(Deserialize)]
pub struct NodeType {
/// The node kind; read from the JSON key `type`.
#[serde(rename = "type")]
pub kind: String,
/// The `named` flag of the node type.
pub named: bool,
}
/// Describes a field (or the children) of a node in the node-types JSON.
#[derive(Deserialize)]
pub struct FieldInfo {
/// The `multiple` flag from the JSON. `add_field` uses a column only when this is false.
pub multiple: bool,
/// The `required` flag from the JSON. `add_field` uses a column only when this is true.
pub required: bool,
/// The node types a value of this field may have.
pub types: Vec<NodeType>,
}
impl Default for FieldInfo {
fn default() -> Self {
FieldInfo {
multiple: false,
required: true,
types: Vec::new(),
}
}
}