mirror of
https://github.com/github/codeql.git
synced 2026-02-19 08:23:45 +01:00
Initial version of extractor based on tree-sitter grammar
This commit is contained in:
108
Cargo.lock
generated
108
Cargo.lock
generated
@@ -9,12 +9,53 @@ dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ansi_term"
|
||||
version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "atty"
|
||||
version = "0.2.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"libc",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.0.61"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed67cbde08356238e75fc4656be4749481eeffb09e19f320a25237d5221c985d"
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "2.33.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002"
|
||||
dependencies = [
|
||||
"ansi_term",
|
||||
"atty",
|
||||
"bitflags",
|
||||
"strsim",
|
||||
"textwrap",
|
||||
"unicode-width",
|
||||
"vec_map",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "generator"
|
||||
version = "0.1.0"
|
||||
@@ -22,6 +63,15 @@ dependencies = [
|
||||
"node-types",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.1.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5aca5565f760fb5b220e499d72710ed156fdb74e631659e99377d9ebfbd13ae8"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "0.4.6"
|
||||
@@ -34,6 +84,12 @@ version = "1.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.79"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2448f6066e80e3bfc792e9c98bf705b4b0fc6e8ef5b43e5889aff0eaa9c58743"
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.3.3"
|
||||
@@ -89,6 +145,9 @@ name = "ruby-extractor"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"clap",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tree-sitter",
|
||||
]
|
||||
|
||||
@@ -129,6 +188,12 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "strsim"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "1.0.46"
|
||||
@@ -140,6 +205,15 @@ dependencies = [
|
||||
"unicode-xid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "textwrap"
|
||||
version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
|
||||
dependencies = [
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thread_local"
|
||||
version = "1.0.1"
|
||||
@@ -159,8 +233,42 @@ dependencies = [
|
||||
"regex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-width"
|
||||
version = "0.1.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-xid"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
|
||||
|
||||
[[package]]
|
||||
name = "vec_map"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
|
||||
dependencies = [
|
||||
"winapi-i686-pc-windows-gnu",
|
||||
"winapi-x86_64-pc-windows-gnu",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi-i686-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||
|
||||
[[package]]
|
||||
name = "winapi-x86_64-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||
|
||||
@@ -8,6 +8,9 @@ edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
tree-sitter = "0.17.0"
|
||||
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0"
|
||||
clap = "2.33"
|
||||
[build-dependencies]
|
||||
cc="*"
|
||||
|
||||
|
||||
430
extractor/src/extractor.rs
Normal file
430
extractor/src/extractor.rs
Normal file
@@ -0,0 +1,430 @@
|
||||
use super::nodes_types::{Entry, Field, Storage, TypeName};
|
||||
|
||||
use std::collections::BTreeMap as Map;
|
||||
use std::collections::BTreeSet as Set;
|
||||
use std::fmt;
|
||||
use std::path::Path;
|
||||
use tree_sitter::{Language, Node, Parser, Tree};
|
||||
|
||||
pub struct Extractor {
|
||||
pub parser: Parser,
|
||||
pub schema: Vec<Entry>,
|
||||
}
|
||||
|
||||
pub fn create(language: Language, schema: Vec<Entry>) -> Extractor {
|
||||
let mut parser = Parser::new();
|
||||
parser.set_language(language).unwrap();
|
||||
|
||||
Extractor { parser, schema }
|
||||
}
|
||||
impl Extractor {
|
||||
pub fn extract<'a>(&'a mut self, path: &Path) -> std::io::Result<Program> {
|
||||
let source = std::fs::read(&path)?;
|
||||
let tree = &self
|
||||
.parser
|
||||
.parse(&source, None)
|
||||
.expect("Failed to parse file");
|
||||
let mut visitor = Visitor {
|
||||
source: &source,
|
||||
program: vec![Fact::Comment(format!(
|
||||
"Auto-generated FACT file for {}, generated by the cool kids",
|
||||
path.display()
|
||||
))],
|
||||
counter: -1,
|
||||
// TODO: should we handle path strings that are not valid UTF8 better?
|
||||
path: format!("{}", path.display()),
|
||||
stack: Vec::new(),
|
||||
tables: build_schema_lookup(&self.schema),
|
||||
union_types: build_union_type_lookup(&self.schema),
|
||||
};
|
||||
traverse(&tree, &mut visitor);
|
||||
|
||||
&self.parser.reset();
|
||||
Ok(Program(visitor.program))
|
||||
}
|
||||
}
|
||||
|
||||
fn build_schema_lookup<'a>(schema: &'a Vec<Entry>) -> Map<&'a TypeName, &'a Entry> {
|
||||
let mut map = std::collections::BTreeMap::new();
|
||||
for entry in schema {
|
||||
if let Entry::Table { type_name, .. } = entry {
|
||||
map.insert(type_name, entry);
|
||||
}
|
||||
}
|
||||
map
|
||||
}
|
||||
|
||||
fn build_union_type_lookup<'a>(schema: &'a Vec<Entry>) -> Map<&'a TypeName, &'a Set<TypeName>> {
|
||||
let mut union_types = std::collections::BTreeMap::new();
|
||||
for entry in schema {
|
||||
if let Entry::Union { type_name, members } = entry {
|
||||
union_types.insert(type_name, members);
|
||||
}
|
||||
}
|
||||
union_types
|
||||
}
|
||||
|
||||
struct Visitor<'a> {
|
||||
source: &'a Vec<u8>,
|
||||
program: Vec<Fact>,
|
||||
counter: i32,
|
||||
path: String,
|
||||
stack: Vec<Vec<(Option<&'static str>, Id, TypeName)>>,
|
||||
tables: Map<&'a TypeName, &'a Entry>,
|
||||
union_types: Map<&'a TypeName, &'a Set<TypeName>>,
|
||||
}
|
||||
|
||||
impl Visitor<'_> {
|
||||
fn enter_node(&mut self, node: Node) {
|
||||
if node.is_extra() {
|
||||
return;
|
||||
}
|
||||
self.stack.push(Vec::new());
|
||||
}
|
||||
|
||||
fn leave_node(&mut self, field_name: Option<&'static str>, node: Node) {
|
||||
if node.is_extra() {
|
||||
return;
|
||||
}
|
||||
let child_nodes = self.stack.pop().expect("Vistor: empty stack");
|
||||
let table = self.tables.get(&TypeName {
|
||||
kind: node.kind().to_owned(),
|
||||
named: node.is_named(),
|
||||
});
|
||||
if let Some(Entry::Table { fields, .. }) = table {
|
||||
self.counter += 1;
|
||||
let id = Id(self.counter);
|
||||
let loc = Loc(self.counter);
|
||||
self.program.push(Fact::New(Arg::IdArg(id)));
|
||||
self.program.push(Fact::New(Arg::LocArg(loc)));
|
||||
self.program.push(location_for(&self.path, loc, node));
|
||||
let table_name = node_type_name(node.kind(), node.is_named());
|
||||
let args: Option<Vec<Arg>>;
|
||||
if fields.is_empty() {
|
||||
args = Some(vec![sliced_source_arg(self.source, node)]);
|
||||
} else {
|
||||
args = self.complex_node(fields, child_nodes, id);
|
||||
}
|
||||
if let Some(args) = args {
|
||||
self.program
|
||||
.push(Fact::Definition(table_name, id, args, loc));
|
||||
}
|
||||
if let Some(parent) = self.stack.last_mut() {
|
||||
parent.push((
|
||||
field_name,
|
||||
id,
|
||||
TypeName {
|
||||
kind: node.kind().to_owned(),
|
||||
named: node.is_named(),
|
||||
},
|
||||
))
|
||||
};
|
||||
} else {
|
||||
panic!(format!("Unknown table type: '{}'", node.kind()))
|
||||
}
|
||||
}
|
||||
|
||||
fn complex_node(
|
||||
&mut self,
|
||||
fields: &Vec<Field>,
|
||||
child_nodes: Vec<(Option<&str>, Id, TypeName)>,
|
||||
parent_id: Id,
|
||||
) -> Option<Vec<Arg>> {
|
||||
let mut map: Map<&Option<String>, (&Field, Vec<Id>)> = std::collections::BTreeMap::new();
|
||||
for field in fields {
|
||||
map.insert(&field.name, (field, Vec::new()));
|
||||
}
|
||||
for (child_field, child_id, child_type) in child_nodes {
|
||||
if let Some((field, values)) = map.get_mut(&child_field.map(|x| x.to_owned())) {
|
||||
//TODO: handle error and missing nodes
|
||||
if self.type_matches(&child_type, &field.types) {
|
||||
values.push(child_id);
|
||||
} else if field.name.is_some() {
|
||||
println!(
|
||||
"Type mismatch for field {:?} with type {:?} != {:?}",
|
||||
child_field, child_type, field.types
|
||||
)
|
||||
}
|
||||
} else {
|
||||
println!(
|
||||
"Value for unknown field: {:?} and type {:?}",
|
||||
&child_field, &child_type
|
||||
);
|
||||
}
|
||||
}
|
||||
let mut args = Vec::new();
|
||||
let mut is_valid = true;
|
||||
for field in fields {
|
||||
let child_ids = &map.get(&field.name).unwrap().1;
|
||||
match &field.storage {
|
||||
Storage::Column => {
|
||||
if child_ids.len() == 1 {
|
||||
args.push(Arg::IdArg(*child_ids.first().unwrap()));
|
||||
} else {
|
||||
is_valid = false;
|
||||
println!("Argument count mismatch for field {:?}", field.name);
|
||||
}
|
||||
}
|
||||
Storage::Table { parent, index } => {
|
||||
for child_id in child_ids {
|
||||
self.program.push(Fact::ChildOf(
|
||||
node_type_name(&parent.kind, parent.named),
|
||||
parent_id,
|
||||
match &field.name {
|
||||
Some(name) => name.to_owned(),
|
||||
None => "child".to_owned(),
|
||||
},
|
||||
Index(*index),
|
||||
*child_id,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if is_valid {
|
||||
Some(args)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
fn type_matches(&self, tp: &TypeName, types: &Set<TypeName>) -> bool {
|
||||
if types.contains(tp) {
|
||||
return true;
|
||||
}
|
||||
for other in types.iter() {
|
||||
if let Some(x) = self.union_types.get(other) {
|
||||
if self.type_matches(tp, x) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Emit a slice of a source file as an Arg.
|
||||
fn sliced_source_arg(source: &Vec<u8>, n: Node) -> Arg {
|
||||
let range = n.byte_range();
|
||||
Arg::StringArg(String::from(
|
||||
std::str::from_utf8(&source[range.start..range.end]).expect("Failed to decode string"),
|
||||
))
|
||||
}
|
||||
|
||||
// Emit a 'Located' fact for the provided node, appropriately calibrated.
|
||||
fn location_for<'a>(fp: &String, ident: Loc, n: Node) -> Fact {
|
||||
let start_line = n.start_position().row;
|
||||
let start_col = n.start_position().column;
|
||||
let end_line = n.end_position().row;
|
||||
let end_col = n.end_position().column;
|
||||
Fact::Located(vec![
|
||||
Arg::LocArg(ident),
|
||||
Arg::StringArg(fp.to_owned()),
|
||||
Arg::IntArg(start_line),
|
||||
Arg::IntArg(start_col),
|
||||
Arg::IntArg(end_line),
|
||||
Arg::IntArg(end_col),
|
||||
])
|
||||
}
|
||||
|
||||
/// Walks `tree` depth-first with a cursor, calling `enter_node` when a node is
/// first reached and `leave_node` after all of its children have been visited.
fn traverse(tree: &Tree, visitor: &mut Visitor) {
    let mut cursor = tree.walk();
    visitor.enter_node(cursor.node());
    // `descend` is false right after moving back up to a parent, so that the
    // parent's children are not visited a second time.
    let mut descend = true;
    loop {
        if descend && cursor.goto_first_child() {
            visitor.enter_node(cursor.node());
            continue;
        }
        visitor.leave_node(cursor.field_name(), cursor.node());

        descend = cursor.goto_next_sibling();
        if descend {
            visitor.enter_node(cursor.node());
        } else if !cursor.goto_parent() {
            // Back at the root with nothing left to visit.
            break;
        }
    }
}
|
||||
pub struct Program(Vec<Fact>);
|
||||
|
||||
impl fmt::Display for Program {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let mut text = String::new();
|
||||
for fact in &self.0 {
|
||||
text.push_str(&format!("{}\n", fact));
|
||||
}
|
||||
write!(f, "{}", text)
|
||||
}
|
||||
}
|
||||
#[derive(Debug)]
|
||||
enum Fact {
|
||||
// @id = *@
|
||||
New(Arg),
|
||||
// @node_def(self, arg?, location)@
|
||||
Definition(String, Id, Vec<Arg>, Loc),
|
||||
// @node_child(self, index, parent)@
|
||||
ChildOf(String, Id, String, Index, Id),
|
||||
// @location(loc, path, r1, c1, r2, c2)
|
||||
Located(Vec<Arg>),
|
||||
Comment(String),
|
||||
}
|
||||
impl fmt::Display for Fact {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
Fact::New(id) => write!(f, "{} = *", id),
|
||||
Fact::Definition(n, id, args, loc) => {
|
||||
let mut args_str = String::new();
|
||||
for arg in args {
|
||||
args_str.push_str(&format!("{}, ", arg));
|
||||
}
|
||||
write!(
|
||||
f,
|
||||
"{}({}, {}{})",
|
||||
escape_name(&format!("{}_def", &n)),
|
||||
id,
|
||||
args_str,
|
||||
loc
|
||||
)
|
||||
}
|
||||
Fact::ChildOf(pname, id, fname, idx, p) => write!(
|
||||
f,
|
||||
"{}({}, {}, {})",
|
||||
escape_name(&format!("{}_{}", &pname, &fname)),
|
||||
id,
|
||||
idx,
|
||||
p
|
||||
),
|
||||
Fact::Located(args) => write!(
|
||||
f,
|
||||
"location({}, {}, {}, {}, {}, {})",
|
||||
args.get(0).unwrap(),
|
||||
args.get(1).unwrap(),
|
||||
args.get(2).unwrap(),
|
||||
args.get(3).unwrap(),
|
||||
args.get(4).unwrap(),
|
||||
args.get(5).unwrap(),
|
||||
),
|
||||
Fact::Comment(line) => write!(f, "// {}", line),
|
||||
}
|
||||
}
|
||||
}
|
||||
// Node identifiers, printed as #0, #1...
#[derive(Debug, Copy, Clone)]
struct Id(i32);

impl fmt::Display for Id {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let Id(number) = self;
        write!(f, "#{}", number)
    }
}
|
||||
// Location identifiers, printed as #0_loc, #1_loc...
#[derive(Debug, Copy, Clone)]
struct Loc(i32);

impl fmt::Display for Loc {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let Loc(number) = self;
        write!(f, "#{}_loc", number)
    }
}
|
||||
// Numeric indices, printed as plain decimal numbers.
#[derive(Debug, Copy, Clone)]
struct Index(usize);

impl fmt::Display for Index {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let Index(position) = self;
        write!(f, "{}", position)
    }
}
|
||||
|
||||
// Some untyped argument to a fact.
|
||||
#[derive(Debug)]
|
||||
enum Arg {
|
||||
IntArg(usize),
|
||||
StringArg(String),
|
||||
IdArg(Id),
|
||||
LocArg(Loc),
|
||||
}
|
||||
|
||||
impl fmt::Display for Arg {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
Arg::IntArg(x) => write!(f, "{}", x),
|
||||
Arg::StringArg(x) => write!(f, "\"{}\"", x.replace("\"", "\"\"")),
|
||||
Arg::IdArg(x) => write!(f, "{}", x),
|
||||
Arg::LocArg(x) => write!(f, "{}", x),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Names that `escape_name` must not return verbatim; a result equal to one of
/// these gets `__` appended.
const RESERVED_KEYWORDS: [&str; 14] = [
    "boolean", "case", "date", "float", "int", "key", "of", "order", "ref", "string", "subtype",
    "type", "unique", "varchar",
];

/// Returns a string that's a copy of `name` but suitably escaped to be a valid
/// QL identifier.
///
/// * A leading underscore is prefixed with `underscore` (`_x` -> `underscore_x`).
/// * Each listed punctuation character is spelled out (`<` -> `langle`).
/// * All other characters are lower-cased.
/// * If the result is a reserved keyword, `__` is appended.
pub fn escape_name(name: &str) -> String {
    let mut result = String::with_capacity(name.len());

    // The underscore itself is kept by the loop below, so only the word
    // "underscore" is added in front of it.
    if name.starts_with('_') {
        result.push_str("underscore");
    }
    for c in name.chars() {
        let spelled_out = match c {
            '{' => "lbrace",
            '}' => "rbrace",
            '<' => "langle",
            '>' => "rangle",
            '[' => "lbracket",
            ']' => "rbracket",
            '(' => "lparen",
            ')' => "rparen",
            '|' => "pipe",
            '=' => "equal",
            '~' => "tilde",
            '?' => "question",
            '`' => "backtick",
            '^' => "caret",
            '!' => "bang",
            '#' => "hash",
            '%' => "percent",
            '&' => "ampersand",
            '.' => "dot",
            ',' => "comma",
            '/' => "slash",
            ':' => "colon",
            ';' => "semicolon",
            '"' => "dquote",
            '*' => "star",
            '+' => "plus",
            '-' => "minus",
            '@' => "at",
            _ => {
                // Lower-casing may yield several chars; append them directly
                // instead of building a temporary String per character.
                result.extend(c.to_lowercase());
                continue;
            }
        };
        result.push_str(spelled_out);
    }

    if RESERVED_KEYWORDS
        .iter()
        .any(|keyword| *keyword == result.as_str())
    {
        result.push_str("__");
    }

    result
}
|
||||
|
||||
/// Builds the (unescaped) name used for the QL type that corresponds to a
/// tree-sitter node type, given its (kind, named) pair: named node types keep
/// their kind as-is, unnamed ones get an `_unnamed` suffix.
fn node_type_name(kind: &str, named: bool) -> String {
    match named {
        true => kind.to_owned(),
        false => [kind, "_unnamed"].concat(),
    }
}
|
||||
@@ -1,18 +1,86 @@
|
||||
use tree_sitter::{Language, Parser};
|
||||
mod extractor;
|
||||
mod nodes_types;
|
||||
|
||||
fn main() {
|
||||
let mut parser = Parser::new();
|
||||
use clap;
|
||||
use std::fs;
|
||||
use std::io::BufRead;
|
||||
use std::path::{Path, PathBuf};
|
||||
use tree_sitter::Language;
|
||||
|
||||
fn main() -> std::io::Result<()> {
|
||||
extern "C" {
|
||||
fn tree_sitter_ruby() -> Language;
|
||||
}
|
||||
|
||||
let matches = clap::App::new("Ruby extractor")
|
||||
.version("1.0")
|
||||
.author("GitHub")
|
||||
.about("CodeQL Ruby extractor")
|
||||
.args_from_usage(
|
||||
"--source-archive-dir=<DIR> 'Sets a custom source archive folder'
|
||||
--output-dir=<DIR> 'Sets a custom trap folder'
|
||||
--file-list=<FILE_LIST> 'A text files containing the paths of the files to extract'",
|
||||
)
|
||||
.get_matches();
|
||||
let src_archive_dir = matches
|
||||
.value_of("source-archive-dir")
|
||||
.expect("missing --source-archive-dir");
|
||||
let src_archive_dir = PathBuf::from(src_archive_dir);
|
||||
|
||||
let trap_dir = matches
|
||||
.value_of("output-dir")
|
||||
.expect("missing --output-dir");
|
||||
let trap_dir = PathBuf::from(trap_dir);
|
||||
|
||||
let file_list = matches.value_of("file-list").expect("missing --file-list");
|
||||
let file_list = fs::File::open(file_list)?;
|
||||
|
||||
let node_types_path = PathBuf::from("tree-sitter-ruby/src/node-types.json");
|
||||
let language = unsafe { tree_sitter_ruby() };
|
||||
parser.set_language(language).unwrap();
|
||||
|
||||
let src = "def foo\n puts \"hello\"\nend";
|
||||
let tree = parser.parse(src, None).unwrap();
|
||||
let root_node = tree.root_node();
|
||||
|
||||
println!("Root: {}", root_node.to_sexp());
|
||||
let schema = nodes_types::read_node_types(&node_types_path)?;
|
||||
let mut extractor = extractor::create(language, schema);
|
||||
for line in std::io::BufReader::new(file_list).lines() {
|
||||
let path = PathBuf::from(line?);
|
||||
let trap_file = path_for(&trap_dir, &path, ".trap");
|
||||
let src_archive_file = path_for(&src_archive_dir, &path, "");
|
||||
let trap = extractor.extract(&path)?;
|
||||
std::fs::create_dir_all(&src_archive_file.parent().unwrap())?;
|
||||
std::fs::copy(&path, &src_archive_file)?;
|
||||
std::fs::create_dir_all(&trap_file.parent().unwrap())?;
|
||||
let mut trap_file = std::fs::File::create(&trap_file)?;
|
||||
let trap_file: &mut dyn std::io::Write = &mut trap_file;
|
||||
write!(trap_file, "{}", trap)?;
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
/// Maps a source file `path` to its counterpart below `dir`, appending `ext`
/// to the file name.
///
/// The prefix, root and `.` components of `path` are dropped. `..` removes the
/// previously added component but never climbs above `dir`. `ext` is appended
/// verbatim (`foo.rb` + `.trap` -> `foo.rb.trap`, `Rakefile` + `.trap` ->
/// `Rakefile.trap`); an empty `ext` leaves the name unchanged. If `path` has
/// no normal components, `dir` is returned unchanged.
fn path_for(dir: &Path, path: &Path, ext: &str) -> PathBuf {
    use std::path::Component;

    let mut result = PathBuf::from(dir);
    // Number of components of `path` currently appended to `dir`.
    let mut depth = 0usize;
    for component in path.components() {
        match component {
            // TODO: handle the prefix properly for Windows
            Component::Prefix(_) | Component::RootDir | Component::CurDir => {}
            Component::Normal(name) => {
                result.push(name);
                depth += 1;
            }
            Component::ParentDir => {
                // Only undo components that came from `path`, so the result
                // cannot escape `dir`.
                if depth > 0 {
                    result.pop();
                    depth -= 1;
                }
            }
        }
    }
    // Append to the raw name rather than using `set_extension`, which would
    // insert an extra '.' in front of an `ext` that already starts with one.
    if depth > 0 && !ext.is_empty() {
        let mut file_name = result.into_os_string();
        file_name.push(ext);
        result = PathBuf::from(file_name);
    }
    result
}
|
||||
|
||||
170
extractor/src/nodes_types.rs
Normal file
170
extractor/src/nodes_types.rs
Normal file
@@ -0,0 +1,170 @@
|
||||
use serde::Deserialize;
|
||||
|
||||
use std::collections::BTreeMap as Map;
|
||||
use std::collections::BTreeSet as Set;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum Entry {
|
||||
Union {
|
||||
type_name: TypeName,
|
||||
members: Set<TypeName>,
|
||||
},
|
||||
Table {
|
||||
type_name: TypeName,
|
||||
fields: Vec<Field>,
|
||||
},
|
||||
}
|
||||
|
||||
/// Identifies a tree-sitter node type by its (kind, named) pair.
///
/// Values order by `kind` first and `named` second.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct TypeName {
    /// The node kind string.
    pub kind: String,
    /// Whether the node type is a named one.
    pub named: bool,
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Field {
|
||||
pub types: Set<TypeName>,
|
||||
/// The name of the field or None for the anonymous 'children'
|
||||
/// entry from node_types.json
|
||||
pub name: Option<String>,
|
||||
pub storage: Storage,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum Storage {
|
||||
/// the field is stored as a column in the parent table
|
||||
Column,
|
||||
// the field is store in a link table
|
||||
Table {
|
||||
parent: TypeName,
|
||||
index: usize,
|
||||
},
|
||||
}
|
||||
|
||||
pub fn read_node_types(node_types_path: &Path) -> std::io::Result<Vec<Entry>> {
|
||||
let file = fs::File::open(node_types_path)?;
|
||||
let node_types = serde_json::from_reader(file)?;
|
||||
Ok(convert_nodes(node_types))
|
||||
}
|
||||
|
||||
fn convert_type(node_type: &NodeType) -> TypeName {
|
||||
TypeName {
|
||||
kind: node_type.kind.to_string(),
|
||||
named: node_type.named,
|
||||
}
|
||||
}
|
||||
|
||||
fn convert_types(node_types: &Vec<NodeType>) -> Set<TypeName> {
|
||||
let iter = node_types.iter().map(convert_type).collect();
|
||||
std::collections::BTreeSet::from(iter)
|
||||
}
|
||||
pub fn convert_nodes(nodes: Vec<NodeInfo>) -> Vec<Entry> {
|
||||
let mut entries: Vec<Entry> = Vec::new();
|
||||
|
||||
for node in nodes {
|
||||
if let Some(subtypes) = &node.subtypes {
|
||||
// It's a tree-sitter supertype node, for which we create a union
|
||||
// type.
|
||||
entries.push(Entry::Union {
|
||||
type_name: TypeName {
|
||||
kind: node.kind,
|
||||
named: node.named,
|
||||
},
|
||||
members: convert_types(&subtypes),
|
||||
});
|
||||
} else {
|
||||
// It's a product type, defined by a table.
|
||||
let type_name = TypeName {
|
||||
kind: node.kind,
|
||||
named: node.named,
|
||||
};
|
||||
let mut fields = Vec::new();
|
||||
|
||||
// If the type also has fields or children, then we create either
|
||||
// auxiliary tables or columns in the defining table for them.
|
||||
if let Some(node_fields) = &node.fields {
|
||||
for (field_name, field_info) in node_fields {
|
||||
add_field(
|
||||
&type_name,
|
||||
Some(field_name.to_string()),
|
||||
field_info,
|
||||
&mut fields,
|
||||
);
|
||||
}
|
||||
}
|
||||
if let Some(children) = &node.children {
|
||||
// Treat children as if they were a field called 'child'.
|
||||
add_field(&type_name, None, children, &mut fields);
|
||||
}
|
||||
entries.push(Entry::Table { type_name, fields });
|
||||
}
|
||||
}
|
||||
entries
|
||||
}
|
||||
|
||||
fn add_field(
|
||||
parent_type_name: &TypeName,
|
||||
field_name: Option<String>,
|
||||
field_info: &FieldInfo,
|
||||
fields: &mut Vec<Field>,
|
||||
) {
|
||||
let storage;
|
||||
if !field_info.multiple && field_info.required {
|
||||
// This field must appear exactly once, so we add it as
|
||||
// a column to the main table for the node type.
|
||||
storage = Storage::Column;
|
||||
} else {
|
||||
// This field can appear zero or multiple times, so put
|
||||
// it in an auxiliary table.
|
||||
storage = Storage::Table {
|
||||
parent: TypeName {
|
||||
kind: parent_type_name.kind.to_string(),
|
||||
named: parent_type_name.named,
|
||||
},
|
||||
index: fields.len(),
|
||||
};
|
||||
}
|
||||
fields.push(Field {
|
||||
types: convert_types(&field_info.types),
|
||||
name: field_name,
|
||||
storage,
|
||||
});
|
||||
}
|
||||
#[derive(Deserialize)]
|
||||
pub struct NodeInfo {
|
||||
#[serde(rename = "type")]
|
||||
pub kind: String,
|
||||
pub named: bool,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub fields: Option<Map<String, FieldInfo>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub children: Option<FieldInfo>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub subtypes: Option<Vec<NodeType>>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
pub struct NodeType {
|
||||
#[serde(rename = "type")]
|
||||
pub kind: String,
|
||||
pub named: bool,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
pub struct FieldInfo {
|
||||
pub multiple: bool,
|
||||
pub required: bool,
|
||||
pub types: Vec<NodeType>,
|
||||
}
|
||||
|
||||
impl Default for FieldInfo {
|
||||
fn default() -> Self {
|
||||
FieldInfo {
|
||||
multiple: false,
|
||||
required: true,
|
||||
types: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user