Compare commits

...

7 Commits

Author SHA1 Message Date
Nick Rolfe
b8146a1089 Merge remote-tracking branch 'origin/main' into nickrolfe/extractor-performance 2022-01-27 15:06:05 +00:00
Nick Rolfe
ea5d696d55 Ruby: use IndexMap
This is the same idea as Java's LinkedHashMap: it gives the same O(1)
insertion and lookup as HashMap, but preserves insertion order for
iteration.
2021-11-23 11:08:18 +00:00
Nick Rolfe
6908a0dc12 Ruby: avoid repeated construction of table name strings 2021-11-23 11:08:18 +00:00
Nick Rolfe
189e75bfe2 Sort TRAP output
First, emit labels with fresh ids. Then other labels. Then tuples,
grouped by name. Hopefully this will help both with the compression
ratio but also with branch prediction in the TRAP importer.
2021-11-23 11:08:18 +00:00
Nick Rolfe
b502e68783 Ruby: compute path string only once 2021-11-23 11:08:18 +00:00
Nick Rolfe
6d28e87f57 Ruby: separate trap-writer into its own module 2021-11-23 11:08:18 +00:00
Nick Rolfe
5cada400f1 Ruby: pre-compute set of valid types for each field
We were previously doing this during extraction, i.e. for each field
node we encouter, which meant we were repeating a lot of work. The
`type_matches_set` function was a fairly significant hot-spot in
profiling results, so this should improve performance.
2021-11-23 11:08:18 +00:00
8 changed files with 586 additions and 533 deletions

BIN
ruby/Cargo.lock generated

Binary file not shown.

View File

@@ -18,3 +18,4 @@ tracing-subscriber = { version = "0.3.3", features = ["env-filter"] }
rayon = "1.5.0"
num_cpus = "1.13.0"
regex = "1.4.3"
indexmap = "1.7.0"

View File

@@ -1,161 +1,111 @@
use crate::trap;
use indexmap::IndexMap;
use node_types::{EntryKind, Field, NodeTypeMap, Storage, TypeName};
use std::borrow::Cow;
use std::collections::BTreeMap as Map;
use std::collections::BTreeSet as Set;
use std::fmt;
use std::io::Write;
use std::path::Path;
use tracing::{error, info, span, Level};
use tree_sitter::{Language, Node, Parser, Range, Tree};
pub struct TrapWriter {
/// The accumulated trap entries
trap_output: Vec<TrapEntry>,
/// A counter for generating fresh labels
counter: u32,
/// cache of global keys
global_keys: std::collections::HashMap<String, Label>,
pub fn populate_file(writer: &mut trap::Writer, absolute_path: &Path) -> trap::Label {
let (file_label, fresh) =
writer.global_id(trap::full_id_for_file(&normalize_path(absolute_path)));
if fresh {
writer.add_tuple(
"files",
vec![
trap::Arg::Label(file_label),
trap::Arg::String(normalize_path(absolute_path)),
],
);
populate_parent_folders(writer, file_label, absolute_path.parent());
}
file_label
}
pub fn new_trap_writer() -> TrapWriter {
TrapWriter {
counter: 0,
trap_output: Vec::new(),
global_keys: std::collections::HashMap::new(),
fn populate_empty_file(writer: &mut trap::Writer) -> trap::Label {
let (file_label, fresh) = writer.global_id("empty;sourcefile".to_owned());
if fresh {
writer.add_tuple(
"files",
vec![
trap::Arg::Label(file_label),
trap::Arg::String("".to_string()),
],
);
}
file_label
}
impl TrapWriter {
/// Gets a label that will hold the unique ID of the passed string at import time.
/// This can be used for incrementally importable TRAP files -- use globally unique
/// strings to compute a unique ID for table tuples.
///
/// Note: You probably want to make sure that the key strings that you use are disjoint
/// for disjoint column types; the standard way of doing this is to prefix (or append)
/// the column type name to the ID. Thus, you might identify methods in Java by the
/// full ID "methods_com.method.package.DeclaringClass.method(argumentList)".
pub fn populate_empty_location(writer: &mut trap::Writer) {
let file_label = populate_empty_file(writer);
location(writer, file_label, 0, 0, 0, 0);
}
fn fresh_id(&mut self) -> Label {
let label = Label(self.counter);
self.counter += 1;
self.trap_output.push(TrapEntry::FreshId(label));
label
}
fn global_id(&mut self, key: &str) -> (Label, bool) {
if let Some(label) = self.global_keys.get(key) {
return (*label, false);
}
let label = Label(self.counter);
self.counter += 1;
self.global_keys.insert(key.to_owned(), label);
self.trap_output
.push(TrapEntry::MapLabelToKey(label, key.to_owned()));
(label, true)
}
fn add_tuple(&mut self, table_name: &str, args: Vec<Arg>) {
self.trap_output
.push(TrapEntry::GenericTuple(table_name.to_owned(), args))
}
fn populate_file(&mut self, absolute_path: &Path) -> Label {
let (file_label, fresh) = self.global_id(&full_id_for_file(absolute_path));
if fresh {
self.add_tuple(
"files",
vec![
Arg::Label(file_label),
Arg::String(normalize_path(absolute_path)),
],
);
self.populate_parent_folders(file_label, absolute_path.parent());
}
file_label
}
fn populate_empty_file(&mut self) -> Label {
let (file_label, fresh) = self.global_id("empty;sourcefile");
if fresh {
self.add_tuple(
"files",
vec![Arg::Label(file_label), Arg::String("".to_string())],
);
}
file_label
}
pub fn populate_empty_location(&mut self) {
let file_label = self.populate_empty_file();
self.location(file_label, 0, 0, 0, 0);
}
fn populate_parent_folders(&mut self, child_label: Label, path: Option<&Path>) {
let mut path = path;
let mut child_label = child_label;
loop {
match path {
None => break,
Some(folder) => {
let (folder_label, fresh) = self.global_id(&full_id_for_folder(folder));
self.add_tuple(
"containerparent",
vec![Arg::Label(folder_label), Arg::Label(child_label)],
pub fn populate_parent_folders(
writer: &mut trap::Writer,
child_label: trap::Label,
path: Option<&Path>,
) {
let mut path = path;
let mut child_label = child_label;
loop {
match path {
None => break,
Some(folder) => {
let (folder_label, fresh) =
writer.global_id(trap::full_id_for_folder(&normalize_path(folder)));
writer.add_tuple(
"containerparent",
vec![
trap::Arg::Label(folder_label),
trap::Arg::Label(child_label),
],
);
if fresh {
writer.add_tuple(
"folders",
vec![
trap::Arg::Label(folder_label),
trap::Arg::String(normalize_path(folder)),
],
);
if fresh {
self.add_tuple(
"folders",
vec![
Arg::Label(folder_label),
Arg::String(normalize_path(folder)),
],
);
path = folder.parent();
child_label = folder_label;
} else {
break;
}
path = folder.parent();
child_label = folder_label;
} else {
break;
}
}
}
}
}
fn location(
&mut self,
file_label: Label,
start_line: usize,
start_column: usize,
end_line: usize,
end_column: usize,
) -> Label {
let (loc_label, fresh) = self.global_id(&format!(
"loc,{{{}}},{},{},{},{}",
file_label, start_line, start_column, end_line, end_column
));
if fresh {
self.add_tuple(
"locations_default",
vec![
Arg::Label(loc_label),
Arg::Label(file_label),
Arg::Int(start_line),
Arg::Int(start_column),
Arg::Int(end_line),
Arg::Int(end_column),
],
);
}
loc_label
}
fn comment(&mut self, text: String) {
self.trap_output.push(TrapEntry::Comment(text));
}
pub fn output(self, writer: &mut dyn Write) -> std::io::Result<()> {
write!(writer, "{}", Program(self.trap_output))
fn location(
writer: &mut trap::Writer,
file_label: trap::Label,
start_line: usize,
start_column: usize,
end_line: usize,
end_column: usize,
) -> trap::Label {
let (loc_label, fresh) = writer.global_id(format!(
"loc,{{{}}},{},{},{},{}",
file_label, start_line, start_column, end_line, end_column
));
if fresh {
writer.add_tuple(
"locations_default",
vec![
trap::Arg::Label(loc_label),
trap::Arg::Label(file_label),
trap::Arg::Int(start_line),
trap::Arg::Int(start_column),
trap::Arg::Int(end_line),
trap::Arg::Int(end_column),
],
);
}
loc_label
}
/// Extracts the source file at `path`, which is assumed to be canonicalized.
@@ -163,71 +113,42 @@ pub fn extract(
language: Language,
language_prefix: &str,
schema: &NodeTypeMap,
trap_writer: &mut TrapWriter,
trap_writer: &mut trap::Writer,
path: &Path,
source: &[u8],
ranges: &[Range],
) -> std::io::Result<()> {
let path_str = format!("{}", path.display());
let span = span!(
Level::TRACE,
"extract",
file = %path.display()
file = %path_str
);
let _enter = span.enter();
info!("extracting: {}", path.display());
info!("extracting: {}", path_str);
let mut parser = Parser::new();
parser.set_language(language).unwrap();
parser.set_included_ranges(ranges).unwrap();
let tree = parser.parse(&source, None).expect("Failed to parse file");
trap_writer.comment(format!("Auto-generated TRAP file for {}", path.display()));
let file_label = &trap_writer.populate_file(path);
let mut visitor = Visitor {
let file_label = populate_file(trap_writer, path);
let mut visitor = Visitor::new(
source,
trap_writer,
// TODO: should we handle path strings that are not valid UTF8 better?
path: format!("{}", path.display()),
file_label: *file_label,
toplevel_child_counter: 0,
stack: Vec::new(),
&path_str,
file_label,
language_prefix,
schema,
};
);
traverse(&tree, &mut visitor);
parser.reset();
Ok(())
}
/// Escapes a string for use in a TRAP key, by replacing special characters with
/// HTML entities.
fn escape_key<'a, S: Into<Cow<'a, str>>>(key: S) -> Cow<'a, str> {
fn needs_escaping(c: char) -> bool {
matches!(c, '&' | '{' | '}' | '"' | '@' | '#')
}
let key = key.into();
if key.contains(needs_escaping) {
let mut escaped = String::with_capacity(2 * key.len());
for c in key.chars() {
match c {
'&' => escaped.push_str("&amp;"),
'{' => escaped.push_str("&lbrace;"),
'}' => escaped.push_str("&rbrace;"),
'"' => escaped.push_str("&quot;"),
'@' => escaped.push_str("&commat;"),
'#' => escaped.push_str("&num;"),
_ => escaped.push(c),
}
}
Cow::Owned(escaped)
} else {
key
}
}
/// Normalizes the path according the common CodeQL specification. Assumes that
/// `path` has already been canonicalized using `std::fs::canonicalize`.
fn normalize_path(path: &Path) -> String {
@@ -267,34 +188,28 @@ fn normalize_path(path: &Path) -> String {
}
}
fn full_id_for_file(path: &Path) -> String {
format!("{};sourcefile", escape_key(&normalize_path(path)))
}
fn full_id_for_folder(path: &Path) -> String {
format!("{};folder", escape_key(&normalize_path(path)))
}
struct ChildNode {
field_name: Option<&'static str>,
label: Label,
label: trap::Label,
type_name: TypeName,
}
struct Visitor<'a> {
/// The file path of the source code (as string)
path: String,
path: &'a str,
/// The label to use whenever we need to refer to the `@file` entity of this
/// source file.
file_label: Label,
file_label: trap::Label,
/// The source code as a UTF-8 byte array
source: &'a [u8],
/// A TrapWriter to accumulate trap entries
trap_writer: &'a mut TrapWriter,
/// A trap::Writer to accumulate trap entries
trap_writer: &'a mut trap::Writer,
/// A counter for top-level child nodes
toplevel_child_counter: usize,
/// Language prefix
language_prefix: &'a str,
/// Language-specific name of the AST parent table
ast_node_parent_table_name: String,
/// Language-specific name of the tokeninfo table
tokeninfo_table_name: String,
/// A lookup table from type name to node types
schema: &'a NodeTypeMap,
/// A stack for gathering information from child nodes. Whenever a node is
@@ -303,39 +218,62 @@ struct Visitor<'a> {
/// node the list containing the child data is popped from the stack and
/// matched against the dbscheme for the node. If the expectations are met
/// the corresponding row definitions are added to the trap_output.
stack: Vec<(Label, usize, Vec<ChildNode>)>,
stack: Vec<(trap::Label, usize, Vec<ChildNode>)>,
}
impl Visitor<'_> {
impl<'a> Visitor<'a> {
fn new(
source: &'a [u8],
trap_writer: &'a mut trap::Writer,
path: &'a str,
file_label: trap::Label,
language_prefix: &str,
schema: &'a NodeTypeMap,
) -> Visitor<'a> {
Visitor {
path,
file_label,
source,
trap_writer,
toplevel_child_counter: 0,
ast_node_parent_table_name: format!("{}_ast_node_parent", language_prefix),
tokeninfo_table_name: format!("{}_tokeninfo", language_prefix),
schema,
stack: Vec::new(),
}
}
fn record_parse_error(
&mut self,
error_message: String,
full_error_message: String,
loc: Label,
loc: trap::Label,
) {
error!("{}", full_error_message);
let id = self.trap_writer.fresh_id();
self.trap_writer.add_tuple(
"diagnostics",
vec![
Arg::Label(id),
Arg::Int(40), // severity 40 = error
Arg::String("parse_error".to_string()),
Arg::String(error_message),
Arg::String(full_error_message),
Arg::Label(loc),
trap::Arg::Label(id),
trap::Arg::Int(40), // severity 40 = error
trap::Arg::String("parse_error".to_string()),
trap::Arg::String(error_message),
trap::Arg::String(full_error_message),
trap::Arg::Label(loc),
],
);
}
fn record_parse_error_for_node(
&mut self,
error_message: String,
full_error_message: String,
node: Node,
) {
fn record_parse_error_for_node(&mut self, error_message: String, node: &Node) {
let full_error_message = format!(
"{}:{}: {}",
&self.path,
node.start_position().row + 1,
&error_message
);
let (start_line, start_column, end_line, end_column) = location_for(self.source, node);
let loc = self.trap_writer.location(
let loc = location(
self.trap_writer,
self.file_label,
start_line,
start_column,
@@ -345,20 +283,14 @@ impl Visitor<'_> {
self.record_parse_error(error_message, full_error_message, loc);
}
fn enter_node(&mut self, node: Node) -> bool {
fn enter_node(&mut self, node: &Node) -> bool {
if node.is_error() || node.is_missing() {
let error_message = if node.is_missing() {
format!("parse error: expecting '{}'", node.kind())
} else {
"parse error".to_string()
};
let full_error_message = format!(
"{}:{}: {}",
&self.path,
node.start_position().row + 1,
error_message
);
self.record_parse_error_for_node(error_message, full_error_message, node);
self.record_parse_error_for_node(error_message, node);
return false;
}
@@ -368,13 +300,14 @@ impl Visitor<'_> {
true
}
fn leave_node(&mut self, field_name: Option<&'static str>, node: Node) {
fn leave_node(&mut self, field_name: Option<&'static str>, node: &Node) {
if node.is_error() || node.is_missing() {
return;
}
let (id, _, child_nodes) = self.stack.pop().expect("Vistor: empty stack");
let (start_line, start_column, end_line, end_column) = location_for(self.source, node);
let loc = self.trap_writer.location(
let loc = location(
self.trap_writer,
self.file_label,
start_line,
start_column,
@@ -402,20 +335,20 @@ impl Visitor<'_> {
match &table.kind {
EntryKind::Token { kind_id, .. } => {
self.trap_writer.add_tuple(
&format!("{}_ast_node_parent", self.language_prefix),
&self.ast_node_parent_table_name,
vec![
Arg::Label(id),
Arg::Label(parent_id),
Arg::Int(parent_index),
trap::Arg::Label(id),
trap::Arg::Label(parent_id),
trap::Arg::Int(parent_index),
],
);
self.trap_writer.add_tuple(
&format!("{}_tokeninfo", self.language_prefix),
&self.tokeninfo_table_name,
vec![
Arg::Label(id),
Arg::Int(*kind_id),
trap::Arg::Label(id),
trap::Arg::Int(*kind_id),
sliced_source_arg(self.source, node),
Arg::Label(loc),
trap::Arg::Label(loc),
],
);
}
@@ -423,18 +356,18 @@ impl Visitor<'_> {
fields,
name: table_name,
} => {
if let Some(args) = self.complex_node(&node, fields, &child_nodes, id) {
if let Some(args) = self.complex_node(node, fields, &child_nodes, id) {
self.trap_writer.add_tuple(
&format!("{}_ast_node_parent", self.language_prefix),
&self.ast_node_parent_table_name,
vec![
Arg::Label(id),
Arg::Label(parent_id),
Arg::Int(parent_index),
trap::Arg::Label(id),
trap::Arg::Label(parent_id),
trap::Arg::Int(parent_index),
],
);
let mut all_args = vec![Arg::Label(id)];
let mut all_args = vec![trap::Arg::Label(id)];
all_args.extend(args);
all_args.push(Arg::Label(loc));
all_args.push(trap::Arg::Label(loc));
self.trap_writer.add_tuple(table_name, all_args);
}
}
@@ -472,9 +405,9 @@ impl Visitor<'_> {
node: &Node,
fields: &[Field],
child_nodes: &[ChildNode],
parent_id: Label,
) -> Option<Vec<Arg>> {
let mut map: Map<&Option<String>, (&Field, Vec<Arg>)> = Map::new();
parent_id: trap::Label,
) -> Option<Vec<trap::Arg>> {
let mut map: IndexMap<&Option<String>, (&Field, Vec<trap::Arg>)> = IndexMap::new();
for field in fields {
map.insert(&field.name, (field, Vec::new()));
}
@@ -482,46 +415,47 @@ impl Visitor<'_> {
if let Some((field, values)) = map.get_mut(&child_node.field_name.map(|x| x.to_owned()))
{
//TODO: handle error and missing nodes
if self.type_matches(&child_node.type_name, &field.type_info) {
if let node_types::FieldTypeInfo::ReservedWordInt(int_mapping) =
&field.type_info
if field.type_info.valid_types.contains(&child_node.type_name) {
if let node_types::FieldTypeKind::ReservedWordInt(int_mapping) =
&field.type_info.kind
{
// We can safely unwrap because type_matches checks the key is in the map.
let (int_value, _) = int_mapping.get(&child_node.type_name.kind).unwrap();
values.push(Arg::Int(*int_value));
match int_mapping.get(&child_node.type_name.kind) {
Some((int_value, _)) => values.push(trap::Arg::Int(*int_value)),
None => self.record_parse_error_for_node(
format!(
"could not map field {}::{} with type {:?} to an integer value",
node.kind(),
child_node.field_name.unwrap_or("child"),
child_node.type_name
),
node,
),
};
} else {
values.push(Arg::Label(child_node.label));
values.push(trap::Arg::Label(child_node.label));
}
} else if field.name.is_some() {
let error_message = format!(
"type mismatch for field {}::{} with type {:?} != {:?}",
node.kind(),
child_node.field_name.unwrap_or("child"),
child_node.type_name,
field.type_info
self.record_parse_error_for_node(
format!(
"type mismatch for field {}::{} with type {:?} != {:?}",
node.kind(),
child_node.field_name.unwrap_or("child"),
child_node.type_name,
field.type_info
),
node,
);
let full_error_message = format!(
"{}:{}: {}",
&self.path,
node.start_position().row + 1,
error_message
);
self.record_parse_error_for_node(error_message, full_error_message, *node);
}
} else if child_node.field_name.is_some() || child_node.type_name.named {
let error_message = format!(
"value for unknown field: {}::{} and type {:?}",
node.kind(),
&child_node.field_name.unwrap_or("child"),
&child_node.type_name
self.record_parse_error_for_node(
format!(
"value for unknown field: {}::{} and type {:?}",
node.kind(),
&child_node.field_name.unwrap_or("child"),
&child_node.type_name
),
node,
);
let full_error_message = format!(
"{}:{}: {}",
&self.path,
node.start_position().row + 1,
error_message
);
self.record_parse_error_for_node(error_message, full_error_message, *node);
}
}
let mut args = Vec::new();
@@ -534,23 +468,19 @@ impl Visitor<'_> {
args.push(child_values.first().unwrap().clone());
} else {
is_valid = false;
let error_message = format!(
"{} for field: {}::{}",
if child_values.is_empty() {
"missing value"
} else {
"too many values"
},
node.kind(),
column_name
self.record_parse_error_for_node(
format!(
"{} for field: {}::{}",
if child_values.is_empty() {
"missing value"
} else {
"too many values"
},
node.kind(),
column_name
),
node,
);
let full_error_message = format!(
"{}:{}: {}",
&self.path,
node.start_position().row + 1,
error_message
);
self.record_parse_error_for_node(error_message, full_error_message, *node);
}
}
Storage::Table {
@@ -569,9 +499,9 @@ impl Visitor<'_> {
);
break;
}
let mut args = vec![Arg::Label(parent_id)];
let mut args = vec![trap::Arg::Label(parent_id)];
if *has_index {
args.push(Arg::Int(index))
args.push(trap::Arg::Int(index))
}
args.push(child_value.clone());
self.trap_writer.add_tuple(table_name, args);
@@ -585,55 +515,18 @@ impl Visitor<'_> {
None
}
}
fn type_matches(&self, tp: &TypeName, type_info: &node_types::FieldTypeInfo) -> bool {
match type_info {
node_types::FieldTypeInfo::Single(single_type) => {
if tp == single_type {
return true;
}
if let EntryKind::Union { members } = &self.schema.get(single_type).unwrap().kind {
if self.type_matches_set(tp, members) {
return true;
}
}
}
node_types::FieldTypeInfo::Multiple { types, .. } => {
return self.type_matches_set(tp, types);
}
node_types::FieldTypeInfo::ReservedWordInt(int_mapping) => {
return !tp.named && int_mapping.contains_key(&tp.kind)
}
}
false
}
fn type_matches_set(&self, tp: &TypeName, types: &Set<TypeName>) -> bool {
if types.contains(tp) {
return true;
}
for other in types.iter() {
if let EntryKind::Union { members } = &self.schema.get(other).unwrap().kind {
if self.type_matches_set(tp, members) {
return true;
}
}
}
false
}
}
// Emit a slice of a source file as an Arg.
fn sliced_source_arg(source: &[u8], n: Node) -> Arg {
// Emit a slice of a source file as a trap::Arg.
fn sliced_source_arg(source: &[u8], n: &Node) -> trap::Arg {
let range = n.byte_range();
Arg::String(String::from_utf8_lossy(&source[range.start..range.end]).into_owned())
trap::Arg::String(String::from_utf8_lossy(&source[range.start..range.end]).into_owned())
}
// Emit a pair of `TrapEntry`s for the provided node, appropriately calibrated.
// The first is the location and label definition, and the second is the
// 'Located' entry.
fn location_for(source: &[u8], n: Node) -> (usize, usize, usize, usize) {
fn location_for(source: &[u8], n: &Node) -> (usize, usize, usize, usize) {
// Tree-sitter row, column values are 0-based while CodeQL starts
// counting at 1. In addition Tree-sitter's row and column for the
// end position are exclusive while CodeQL's end positions are inclusive.
@@ -680,16 +573,16 @@ fn location_for(source: &[u8], n: Node) -> (usize, usize, usize, usize) {
fn traverse(tree: &Tree, visitor: &mut Visitor) {
let cursor = &mut tree.walk();
visitor.enter_node(cursor.node());
visitor.enter_node(&cursor.node());
let mut recurse = true;
loop {
if recurse && cursor.goto_first_child() {
recurse = visitor.enter_node(cursor.node());
recurse = visitor.enter_node(&cursor.node());
} else {
visitor.leave_node(cursor.field_name(), cursor.node());
visitor.leave_node(cursor.field_name(), &cursor.node());
if cursor.goto_next_sibling() {
recurse = visitor.enter_node(cursor.node());
recurse = visitor.enter_node(&cursor.node());
} else if cursor.goto_parent() {
recurse = false;
} else {
@@ -699,59 +592,6 @@ fn traverse(tree: &Tree, visitor: &mut Visitor) {
}
}
pub struct Program(Vec<TrapEntry>);
impl fmt::Display for Program {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut text = String::new();
for trap_entry in &self.0 {
text.push_str(&format!("{}\n", trap_entry));
}
write!(f, "{}", text)
}
}
enum TrapEntry {
/// Maps the label to a fresh id, e.g. `#123=*`.
FreshId(Label),
/// Maps the label to a key, e.g. `#7=@"foo"`.
MapLabelToKey(Label, String),
/// foo_bar(arg*)
GenericTuple(String, Vec<Arg>),
Comment(String),
}
impl fmt::Display for TrapEntry {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
TrapEntry::FreshId(label) => write!(f, "{}=*", label),
TrapEntry::MapLabelToKey(label, key) => {
write!(f, "{}=@\"{}\"", label, key.replace("\"", "\"\""))
}
TrapEntry::GenericTuple(name, args) => {
write!(f, "{}(", name)?;
for (index, arg) in args.iter().enumerate() {
if index > 0 {
write!(f, ",")?;
}
write!(f, "{}", arg)?;
}
write!(f, ")")
}
TrapEntry::Comment(line) => write!(f, "// {}", line),
}
}
}
#[derive(Debug, Copy, Clone)]
// Identifiers of the form #0, #1...
struct Label(u32);
impl fmt::Display for Label {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "#{:x}", self.0)
}
}
// Numeric indices.
#[derive(Debug, Copy, Clone)]
struct Index(usize);
@@ -761,69 +601,3 @@ impl fmt::Display for Index {
write!(f, "{}", self.0)
}
}
// Some untyped argument to a TrapEntry.
#[derive(Debug, Clone)]
enum Arg {
Label(Label),
Int(usize),
String(String),
}
const MAX_STRLEN: usize = 1048576;
impl fmt::Display for Arg {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Arg::Label(x) => write!(f, "{}", x),
Arg::Int(x) => write!(f, "{}", x),
Arg::String(x) => write!(
f,
"\"{}\"",
limit_string(x, MAX_STRLEN).replace("\"", "\"\"")
),
}
}
}
/// Limit the length (in bytes) of a string. If the string's length in bytes is
/// less than or equal to the limit then the entire string is returned. Otherwise
/// the string is sliced at the provided limit. If there is a multi-byte character
/// at the limit then the returned slice will be slightly shorter than the limit to
/// avoid splitting that multi-byte character.
fn limit_string(string: &str, max_size: usize) -> &str {
if string.len() <= max_size {
return string;
}
let p = string.as_bytes();
let mut index = max_size;
// We want to clip the string at [max_size]; however, the character at that position
// may span several bytes. We need to find the first byte of the character. In UTF-8
// encoded data any byte that matches the bit pattern 10XXXXXX is not a start byte.
// Therefore we decrement the index as long as there are bytes matching this pattern.
// This ensures we cut the string at the border between one character and another.
while index > 0 && (p[index] & 0b11000000) == 0b10000000 {
index -= 1;
}
&string[0..index]
}
#[test]
fn limit_string_test() {
assert_eq!("hello", limit_string(&"hello world".to_owned(), 5));
assert_eq!("hi ☹", limit_string(&"hi ☹☹".to_owned(), 6));
assert_eq!("hi ", limit_string(&"hi ☹☹".to_owned(), 5));
}
#[test]
fn escape_key_test() {
assert_eq!("foo!", escape_key("foo!"));
assert_eq!("foo&lbrace;&rbrace;", escape_key("foo{}"));
assert_eq!("&lbrace;&rbrace;", escape_key("{}"));
assert_eq!("", escape_key(""));
assert_eq!("/path/to/foo.rb", escape_key("/path/to/foo.rb"));
assert_eq!(
"/path/to/foo&amp;&lbrace;&rbrace;&quot;&commat;&num;.rb",
escape_key("/path/to/foo&{}\"@#.rb")
);
}

View File

@@ -1,51 +1,15 @@
mod extractor;
mod trap;
extern crate num_cpus;
use clap::arg;
use flate2::write::GzEncoder;
use rayon::prelude::*;
use std::fs;
use std::io::{BufRead, BufWriter};
use std::io::BufRead;
use std::path::{Path, PathBuf};
use tree_sitter::{Language, Parser, Range};
enum TrapCompression {
None,
Gzip,
}
impl TrapCompression {
fn from_env() -> TrapCompression {
match std::env::var("CODEQL_RUBY_TRAP_COMPRESSION") {
Ok(method) => match TrapCompression::from_string(&method) {
Some(c) => c,
None => {
tracing::error!("Unknown compression method '{}'; using gzip.", &method);
TrapCompression::Gzip
}
},
// Default compression method if the env var isn't set:
Err(_) => TrapCompression::Gzip,
}
}
fn from_string(s: &str) -> Option<TrapCompression> {
match s.to_lowercase().as_ref() {
"none" => Some(TrapCompression::None),
"gzip" => Some(TrapCompression::Gzip),
_ => None,
}
}
fn extension(&self) -> &str {
match self {
TrapCompression::None => "trap",
TrapCompression::Gzip => "trap.gz",
}
}
}
/**
* Gets the number of threads the extractor should use, by reading the
* CODEQL_THREADS environment variable and using it as described in the
@@ -118,7 +82,7 @@ fn main() -> std::io::Result<()> {
.value_of("output-dir")
.expect("missing --output-dir");
let trap_dir = PathBuf::from(trap_dir);
let trap_compression = TrapCompression::from_env();
let trap_compression = trap::Compression::from_env("CODEQL_RUBY_TRAP_COMPRESSION");
let file_list = matches.value_of("file-list").expect("missing --file-list");
let file_list = fs::File::open(file_list)?;
@@ -141,7 +105,7 @@ fn main() -> std::io::Result<()> {
let src_archive_file = path_for(&src_archive_dir, &path, "");
let mut source = std::fs::read(&path)?;
let code_ranges;
let mut trap_writer = extractor::new_trap_writer();
let mut trap_writer = trap::Writer::new();
if path.extension().map_or(false, |x| x == "erb") {
tracing::info!("scanning: {}", path.display());
extractor::extract(
@@ -186,28 +150,20 @@ fn main() -> std::io::Result<()> {
.expect("failed to extract files");
let path = PathBuf::from("extras");
let mut trap_writer = extractor::new_trap_writer();
trap_writer.populate_empty_location();
let mut trap_writer = trap::Writer::new();
extractor::populate_empty_location(&mut trap_writer);
write_trap(&trap_dir, path, trap_writer, &trap_compression)
}
fn write_trap(
trap_dir: &Path,
path: PathBuf,
trap_writer: extractor::TrapWriter,
trap_compression: &TrapCompression,
trap_writer: trap::Writer,
trap_compression: &trap::Compression,
) -> std::io::Result<()> {
let trap_file = path_for(trap_dir, &path, trap_compression.extension());
std::fs::create_dir_all(&trap_file.parent().unwrap())?;
let trap_file = std::fs::File::create(&trap_file)?;
let mut trap_file = BufWriter::new(trap_file);
match trap_compression {
TrapCompression::None => trap_writer.output(&mut trap_file),
TrapCompression::Gzip => {
let mut compressed_writer = GzEncoder::new(trap_file, flate2::Compression::fast());
trap_writer.output(&mut compressed_writer)
}
}
trap_writer.write_to_file(&trap_file, trap_compression)
}
fn scan_erb(

247
ruby/extractor/src/trap.rs Normal file
View File

@@ -0,0 +1,247 @@
use std::borrow::Cow;
use std::fmt;
use std::io::BufWriter;
use std::path::Path;
use flate2::write::GzEncoder;
use indexmap::IndexMap;
pub struct Writer {
/// Labels that should be assigned fresh ids, e.g. `#123=*`.
fresh_ids: Vec<Label>,
/// Labels that should be assigned trap keys, e.g. `#7=@"foo"`.
global_keys: IndexMap<String, Label>,
/// Database rows to emit. Each key is the tuple name, each value is a list.
/// Each member of *that* list represents an instance of that tuple,
/// containing a list of the arguments/column values.
tuples: IndexMap<String, Vec<Vec<Arg>>>,
/// A counter for generating fresh labels
counter: u32,
}
impl Writer {
pub fn new() -> Writer {
Writer {
fresh_ids: Vec::new(),
tuples: IndexMap::new(),
global_keys: IndexMap::new(),
counter: 0,
}
}
/// Gets a label that will hold the unique ID of the passed string at import time.
/// This can be used for incrementally importable TRAP files -- use globally unique
/// strings to compute a unique ID for table tuples.
///
/// Note: You probably want to make sure that the key strings that you use are disjoint
/// for disjoint column types; the standard way of doing this is to prefix (or append)
/// the column type name to the ID. Thus, you might identify methods in Java by the
/// full ID "methods_com.method.package.DeclaringClass.method(argumentList)".
pub fn fresh_id(&mut self) -> Label {
let label = Label(self.counter);
self.counter += 1;
self.fresh_ids.push(label);
label
}
pub fn global_id(&mut self, key: String) -> (Label, bool) {
if let Some(label) = self.global_keys.get(&key) {
return (*label, false);
}
let label = Label(self.counter);
self.counter += 1;
self.global_keys.insert(key, label);
(label, true)
}
pub fn add_tuple(&mut self, table_name: &str, args: Vec<Arg>) {
self.tuples
.entry(table_name.to_owned())
.or_insert_with(Vec::new)
.push(args);
}
fn write<T: std::io::Write>(&self, dest: &mut T) -> std::io::Result<()> {
for label in &self.fresh_ids {
writeln!(dest, "{}=*", label)?;
}
for (key, label) in &self.global_keys {
writeln!(dest, "{}=@\"{}\"", label, key.replace("\"", "\"\""))?;
}
for (name, instances) in &self.tuples {
for instance in instances {
write!(dest, "{}(", name)?;
for (index, arg) in instance.iter().enumerate() {
if index > 0 {
write!(dest, ",")?;
}
write!(dest, "{}", arg)?;
}
writeln!(dest, ")")?;
}
}
Ok(())
}
pub fn write_to_file(&self, path: &Path, compression: &Compression) -> std::io::Result<()> {
let trap_file = std::fs::File::create(path)?;
let mut trap_file = BufWriter::new(trap_file);
match compression {
Compression::None => self.write(&mut trap_file),
Compression::Gzip => {
let mut compressed_writer = GzEncoder::new(trap_file, flate2::Compression::fast());
self.write(&mut compressed_writer)
}
}
}
}
#[derive(Debug, Copy, Clone)]
// Identifiers of the form #0, #1...
pub struct Label(u32);
impl fmt::Display for Label {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "#{:x}", self.0)
}
}
// Some untyped argument to a TrapEntry.
#[derive(Debug, Clone)]
pub enum Arg {
Label(Label),
Int(usize),
String(String),
}
const MAX_STRLEN: usize = 1048576;
impl fmt::Display for Arg {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Arg::Label(x) => write!(f, "{}", x),
Arg::Int(x) => write!(f, "{}", x),
Arg::String(x) => write!(
f,
"\"{}\"",
limit_string(x, MAX_STRLEN).replace("\"", "\"\"")
),
}
}
}
pub fn full_id_for_file(normalized_path: &str) -> String {
format!("{};sourcefile", escape_key(normalized_path))
}
pub fn full_id_for_folder(normalized_path: &str) -> String {
format!("{};folder", escape_key(normalized_path))
}
/// Escapes a string for use in a TRAP key, by replacing special characters with
/// HTML entities.
fn escape_key<'a, S: Into<Cow<'a, str>>>(key: S) -> Cow<'a, str> {
fn needs_escaping(c: char) -> bool {
matches!(c, '&' | '{' | '}' | '"' | '@' | '#')
}
let key = key.into();
if key.contains(needs_escaping) {
let mut escaped = String::with_capacity(2 * key.len());
for c in key.chars() {
match c {
'&' => escaped.push_str("&amp;"),
'{' => escaped.push_str("&lbrace;"),
'}' => escaped.push_str("&rbrace;"),
'"' => escaped.push_str("&quot;"),
'@' => escaped.push_str("&commat;"),
'#' => escaped.push_str("&num;"),
_ => escaped.push(c),
}
}
Cow::Owned(escaped)
} else {
key
}
}
/// Limit the length (in bytes) of a string. If the string's length in bytes is
/// less than or equal to the limit then the entire string is returned. Otherwise
/// the string is sliced at the provided limit. If there is a multi-byte character
/// at the limit then the returned slice will be slightly shorter than the limit to
/// avoid splitting that multi-byte character.
fn limit_string(string: &str, max_size: usize) -> &str {
if string.len() <= max_size {
return string;
}
let p = string.as_bytes();
let mut index = max_size;
// We want to clip the string at [max_size]; however, the character at that position
// may span several bytes. We need to find the first byte of the character. In UTF-8
// encoded data any byte that matches the bit pattern 10XXXXXX is not a start byte.
// Therefore we decrement the index as long as there are bytes matching this pattern.
// This ensures we cut the string at the border between one character and another.
while index > 0 && (p[index] & 0b11000000) == 0b10000000 {
index -= 1;
}
&string[0..index]
}
pub enum Compression {
None,
Gzip,
}
impl Compression {
pub fn from_env(var_name: &str) -> Compression {
match std::env::var(var_name) {
Ok(method) => match Compression::from_string(&method) {
Some(c) => c,
None => {
tracing::error!("Unknown compression method '{}'; using gzip.", &method);
Compression::Gzip
}
},
// Default compression method if the env var isn't set:
Err(_) => Compression::Gzip,
}
}
pub fn from_string(s: &str) -> Option<Compression> {
match s.to_lowercase().as_ref() {
"none" => Some(Compression::None),
"gzip" => Some(Compression::Gzip),
_ => None,
}
}
pub fn extension(&self) -> &str {
match self {
Compression::None => "trap",
Compression::Gzip => "trap.gz",
}
}
}
#[test]
fn limit_string_test() {
assert_eq!("hello", limit_string(&"hello world".to_owned(), 5));
assert_eq!("hi ☹", limit_string(&"hi ☹☹".to_owned(), 6));
assert_eq!("hi ", limit_string(&"hi ☹☹".to_owned(), 5));
}
#[test]
fn escape_key_test() {
assert_eq!("foo!", escape_key("foo!"));
assert_eq!("foo&lbrace;&rbrace;", escape_key("foo{}"));
assert_eq!("&lbrace;&rbrace;", escape_key("{}"));
assert_eq!("", escape_key(""));
assert_eq!("/path/to/foo.rb", escape_key("/path/to/foo.rb"));
assert_eq!(
"/path/to/foo&amp;&lbrace;&rbrace;&quot;&commat;&num;.rb",
escape_key("/path/to/foo&{}\"@#.rb")
);
}

View File

@@ -20,8 +20,8 @@ fn make_field_type<'a>(
field: &'a node_types::Field,
nodes: &'a node_types::NodeTypeMap,
) -> (ql::Type<'a>, Option<dbscheme::Entry<'a>>) {
match &field.type_info {
node_types::FieldTypeInfo::Multiple {
match &field.type_info.kind {
node_types::FieldTypeKind::Multiple {
types,
dbscheme_union,
ql_class: _,
@@ -40,11 +40,11 @@ fn make_field_type<'a>(
})),
)
}
node_types::FieldTypeInfo::Single(t) => {
node_types::FieldTypeKind::Single(t) => {
let dbscheme_name = &nodes.get(t).unwrap().dbscheme_name;
(ql::Type::At(dbscheme_name), None)
}
node_types::FieldTypeInfo::ReservedWordInt(int_mapping) => {
node_types::FieldTypeKind::ReservedWordInt(int_mapping) => {
// The field will be an `int` in the db, and we add a case split to
// create other db types for each integer value.
let mut branches: Vec<(usize, &'a str)> = Vec::new();

View File

@@ -345,16 +345,16 @@ fn create_field_getters<'a>(
field: &'a node_types::Field,
nodes: &'a node_types::NodeTypeMap,
) -> (ql::Predicate<'a>, Option<ql::Expression<'a>>) {
let return_type = match &field.type_info {
node_types::FieldTypeInfo::Single(t) => {
let return_type = match &field.type_info.kind {
node_types::FieldTypeKind::Single(t) => {
Some(ql::Type::Normal(&nodes.get(t).unwrap().ql_class_name))
}
node_types::FieldTypeInfo::Multiple {
node_types::FieldTypeKind::Multiple {
types: _,
dbscheme_union: _,
ql_class,
} => Some(ql::Type::Normal(ql_class)),
node_types::FieldTypeInfo::ReservedWordInt(_) => Some(ql::Type::String),
node_types::FieldTypeKind::ReservedWordInt(_) => Some(ql::Type::String),
};
let formal_parameters = match &field.storage {
node_types::Storage::Column { .. } => vec![],
@@ -372,10 +372,10 @@ fn create_field_getters<'a>(
// For the expression to get a value, what variable name should the result
// be bound to?
let get_value_result_var_name = match &field.type_info {
node_types::FieldTypeInfo::ReservedWordInt(_) => "value",
node_types::FieldTypeInfo::Single(_) => "result",
node_types::FieldTypeInfo::Multiple { .. } => "result",
let get_value_result_var_name = match &field.type_info.kind {
node_types::FieldTypeKind::ReservedWordInt(_) => "value",
node_types::FieldTypeKind::Single(_) => "result",
node_types::FieldTypeKind::Multiple { .. } => "result",
};
// Two expressions for getting the value. One that's suitable use in the
@@ -418,8 +418,8 @@ fn create_field_getters<'a>(
),
),
};
let (body, optional_expr) = match &field.type_info {
node_types::FieldTypeInfo::ReservedWordInt(int_mapping) => {
let (body, optional_expr) = match &field.type_info.kind {
node_types::FieldTypeKind::ReservedWordInt(int_mapping) => {
// Create an expression that binds the corresponding string to `result` for each `value`, e.g.:
// result = "foo" and value = 0 or
// result = "bar" and value = 1 or
@@ -454,7 +454,7 @@ fn create_field_getters<'a>(
None,
)
}
node_types::FieldTypeInfo::Single(_) | node_types::FieldTypeInfo::Multiple { .. } => {
node_types::FieldTypeKind::Single(_) | node_types::FieldTypeKind::Multiple { .. } => {
(get_value, Some(get_value_any_index))
}
};

View File

@@ -1,5 +1,5 @@
use serde::Deserialize;
use std::collections::BTreeMap;
use std::collections::{BTreeMap, HashMap, HashSet};
use std::path::Path;
use std::collections::BTreeSet as Set;
@@ -22,14 +22,23 @@ pub enum EntryKind {
Token { kind_id: usize },
}
#[derive(Debug, Ord, PartialOrd, Eq, PartialEq)]
#[derive(Debug, Ord, PartialOrd, Eq, PartialEq, Hash, Clone)]
pub struct TypeName {
pub kind: String,
pub named: bool,
}
#[derive(Debug)]
pub enum FieldTypeInfo {
#[derive(Debug, Eq, PartialEq)]
pub struct FieldTypeInfo {
pub kind: FieldTypeKind,
/// The set of types this field is allowed to take, after recursively
/// expanding subtypes.
pub valid_types: HashSet<TypeName>,
}
#[derive(Debug, Eq, PartialEq)]
pub enum FieldTypeKind {
/// The field has a single type.
Single(TypeName),
@@ -103,7 +112,49 @@ fn convert_types(node_types: &[NodeType]) -> Set<TypeName> {
node_types.iter().map(convert_type).collect()
}
fn get_matching_types(
node: &NodeInfo,
type_map: &HashMap<NodeType, &NodeInfo>,
) -> HashSet<TypeName> {
let mut result = HashSet::new();
let node_type_name = TypeName {
kind: node.kind.clone(),
named: node.named,
};
result.insert(node_type_name);
if let Some(subtypes) = &node.subtypes {
for subtype in subtypes {
let subtype = type_map.get(subtype).unwrap();
result.extend(get_matching_types(subtype, type_map));
}
}
result
}
pub fn convert_nodes(prefix: &str, nodes: &[NodeInfo]) -> NodeTypeMap {
// Since the nodes contain only weak references to their subtypes, build a
// map so we can resolve them.
let mut type_map: HashMap<NodeType, &NodeInfo> = HashMap::new();
for node in nodes {
let node_type = NodeType {
kind: node.kind.clone(),
named: node.named,
};
type_map.insert(node_type, node);
}
// Now recursively expand subtypes so that for each tree-sitter node type,
// we have a set of all matching types.
let mut transitive_type_map = HashMap::new();
for node in nodes {
let type_name = TypeName {
kind: node.kind.clone(),
named: node.named,
};
let matching_types = get_matching_types(node, &type_map);
transitive_type_map.insert(type_name, matching_types);
}
let mut entries = NodeTypeMap::new();
let mut token_kinds = Set::new();
@@ -166,6 +217,7 @@ pub fn convert_nodes(prefix: &str, nodes: &[NodeInfo]) -> NodeTypeMap {
field_info,
&mut fields,
&token_kinds,
&transitive_type_map,
);
}
}
@@ -178,6 +230,7 @@ pub fn convert_nodes(prefix: &str, nodes: &[NodeInfo]) -> NodeTypeMap {
children,
&mut fields,
&token_kinds,
&transitive_type_map,
);
}
entries.insert(
@@ -222,6 +275,7 @@ fn add_field(
field_info: &FieldInfo,
fields: &mut Vec<Field>,
token_kinds: &Set<TypeName>,
transitive_type_map: &HashMap<TypeName, HashSet<TypeName>>,
) {
let parent_flattened_name = node_type_name(&parent_type_name.kind, parent_type_name.named);
let column_name = escape_name(&name_for_field_or_child(&field_name));
@@ -245,6 +299,18 @@ fn add_field(
}
};
let converted_types = convert_types(&field_info.types);
// Use the transitive type map we built earlier to create the set of all
// possible types this field could take.
let mut valid_types: HashSet<TypeName> = HashSet::new();
for type_name in &converted_types {
if let Some(types) = transitive_type_map.get(type_name) {
for t in types {
valid_types.insert(t.clone());
}
}
}
let type_info = if field_info
.types
.iter()
@@ -259,20 +325,29 @@ fn add_field(
escape_name(&format!("{}_{}_{}", &prefix, parent_flattened_name, t.kind));
field_token_ints.insert(t.kind.to_owned(), (counter, dbscheme_variant_name));
}
FieldTypeInfo::ReservedWordInt(field_token_ints)
FieldTypeInfo {
kind: FieldTypeKind::ReservedWordInt(field_token_ints),
valid_types,
}
} else if field_info.types.len() == 1 {
FieldTypeInfo::Single(converted_types.into_iter().next().unwrap())
FieldTypeInfo {
kind: FieldTypeKind::Single(converted_types.into_iter().next().unwrap()),
valid_types,
}
} else {
// The dbscheme type for this field will be a union. In QL, it'll just be AstNode.
FieldTypeInfo::Multiple {
types: converted_types,
dbscheme_union: format!(
"{}_{}_{}_type",
&prefix,
&parent_flattened_name,
&name_for_field_or_child(&field_name)
),
ql_class: "AstNode".to_owned(),
FieldTypeInfo {
kind: FieldTypeKind::Multiple {
types: converted_types,
dbscheme_union: format!(
"{}_{}_{}_type",
&prefix,
&parent_flattened_name,
&name_for_field_or_child(&field_name)
),
ql_class: "AstNode".to_owned(),
},
valid_types,
}
};
let getter_name = format!(
@@ -303,7 +378,7 @@ pub struct NodeInfo {
pub subtypes: Option<Vec<NodeType>>,
}
#[derive(Deserialize)]
#[derive(Deserialize, Hash, Eq, PartialEq)]
pub struct NodeType {
#[serde(rename = "type")]
pub kind: String,