From 4d5d80c749c08ab46c551539a52c555c4919ae34 Mon Sep 17 00:00:00 2001 From: Nick Rolfe Date: Thu, 29 Oct 2020 19:41:41 +0000 Subject: [PATCH 01/13] Fix location handling to match common db schema requirements --- extractor/src/extractor.rs | 47 ++++++++-- generator/src/main.rs | 175 +++++++++++++++++++++++++++++++++++-- generator/src/ql.rs | 36 ++++++++ generator/src/ql_gen.rs | 84 +++++++++++++++--- ql/src/codeql_ruby/ast.qll | 11 ++- ql/src/ruby.dbscheme | 38 +++++++- 6 files changed, 361 insertions(+), 30 deletions(-) diff --git a/extractor/src/extractor.rs b/extractor/src/extractor.rs index 143aeccf8a8..33a52efb0ff 100644 --- a/extractor/src/extractor.rs +++ b/extractor/src/extractor.rs @@ -34,15 +34,31 @@ impl Extractor { .parser .parse(&source, None) .expect("Failed to parse file"); + let mut counter = -1; + // Create a label for the current file and increment the counter so that + // label doesn't get redefined. + counter += 1; + let file_label = Label::Normal(counter); let mut visitor = Visitor { source: &source, - trap_output: vec![TrapEntry::Comment(format!( - "Auto-generated TRAP file for {}", - path.display() - ))], - counter: -1, + trap_output: vec![ + TrapEntry::Comment(format!("Auto-generated TRAP file for {}", path.display())), + TrapEntry::New(file_label), + TrapEntry::GenericTuple( + "files".to_owned(), + vec![ + Arg::Label(file_label), + Arg::String(format!("{}", path.canonicalize()?.display())), + Arg::String(format!("{}", path.file_name().unwrap().to_string_lossy())), + Arg::String(format!("{}", path.extension().unwrap().to_string_lossy())), + Arg::Int(0), // 0 = unknown + ], + ), + ], + counter, // TODO: should we handle path strings that are not valid UTF8 better? path: format!("{}", path.display()), + file_label, stack: Vec::new(), tables: build_schema_lookup(&self.schema), union_types: build_union_type_lookup(&self.schema), @@ -77,6 +93,9 @@ fn build_union_type_lookup<'a>(schema: &'a Vec) -> Map<&'a TypeName, &'a struct Visitor<'a> { /// The file path of the source code (as string) path: String, + /// The label to use whenever we need to refer to the `@file` entity of this + /// source file. + file_label: Label, /// The source code as a UTF-8 byte array source: &'a Vec, /// The accumulated trap entries @@ -135,7 +154,7 @@ impl Visitor<'_> { self.trap_output.push(TrapEntry::New(id)); self.trap_output.push(TrapEntry::New(loc)); self.trap_output - .push(location_for(&self.source, &self.path, loc, node)); + .push(location_for(&self.source, &self.file_label, loc, node)); let table_name = node_type_name(node.kind(), node.is_named()); let args: Option>; if fields.is_empty() { @@ -283,7 +302,7 @@ fn sliced_source_arg(source: &Vec, n: Node) -> Arg { } // Emit a 'Located' TrapEntry for the provided node, appropriately calibrated. -fn location_for<'a>(source: &Vec, fp: &String, label: Label, n: Node) -> TrapEntry { +fn location_for<'a>(source: &Vec, file_label: &Label, label: Label, n: Node) -> TrapEntry { // Tree-sitter row, column values are 0-based while CodeQL starts // counting at 1. In addition Tree-sitter's row and column for the // end position are exclusive while CodeQL's end positions are inclusive. @@ -327,7 +346,7 @@ fn location_for<'a>(source: &Vec, fp: &String, label: Label, n: Node) -> Tra } TrapEntry::Located(vec![ Arg::Label(label), - Arg::String(fp.to_owned()), + Arg::Label(file_label.clone()), Arg::Int(start_line), Arg::Int(start_col), Arg::Int(end_line), @@ -377,6 +396,8 @@ enum TrapEntry { ChildOf(String, Label, String, Option, Label), // @location(loc, path, r1, c1, r2, c2) Located(Vec), + /// foo_bar(arg?) + GenericTuple(String, Vec), Comment(String), } impl fmt::Display for TrapEntry { @@ -424,6 +445,16 @@ impl fmt::Display for TrapEntry { args.get(4).unwrap(), args.get(5).unwrap(), ), + TrapEntry::GenericTuple(name, args) => { + write!(f, "{}(", name)?; + for (index, arg) in args.iter().enumerate() { + if index > 0 { + write!(f, ", ")?; + } + write!(f, "{}", arg)?; + } + write!(f, ")") + } TrapEntry::Comment(line) => write!(f, "// {}", line), } } diff --git a/generator/src/main.rs b/generator/src/main.rs index bdedcbdd938..b92cbce5c6c 100644 --- a/generator/src/main.rs +++ b/generator/src/main.rs @@ -113,7 +113,14 @@ fn add_field( /// Converts the given tree-sitter node types into CodeQL dbscheme entries. fn convert_nodes(nodes: &Vec) -> Vec { let mut entries: Vec = vec![ - create_location_table(), + create_location_union(), + create_locations_default_table(), + create_sourceline_union(), + create_numlines_table(), + create_files_table(), + create_folders_table(), + create_container_union(), + create_containerparent_table(), create_source_location_prefix_table(), ]; let mut top_members: Vec = Vec::new(); @@ -212,23 +219,103 @@ fn write_dbscheme(language: &Language, entries: &[dbscheme::Entry]) -> std::io:: dbscheme::write(&language.name, &mut file, &entries) } -fn create_location_table() -> dbscheme::Entry { +fn create_location_union() -> dbscheme::Entry { + dbscheme::Entry::Union(dbscheme::Union { + name: "location".to_owned(), + members: vec!["location_default".to_owned()], + }) +} + +fn create_files_table() -> dbscheme::Entry { dbscheme::Entry::Table(dbscheme::Table { - name: "location".to_string(), + name: "files".to_owned(), + keysets: None, + columns: vec![ + dbscheme::Column { + unique: true, + db_type: dbscheme::DbColumnType::Int, + name: "id".to_owned(), + ql_type: ql::Type::AtType("file".to_owned()), + ql_type_is_ref: false, + }, + dbscheme::Column { + db_type: dbscheme::DbColumnType::String, + name: "name".to_owned(), + unique: false, + ql_type: ql::Type::String, + ql_type_is_ref: true, + }, + dbscheme::Column { + db_type: dbscheme::DbColumnType::String, + name: "simple".to_owned(), + unique: false, + ql_type: ql::Type::String, + ql_type_is_ref: true, + }, + dbscheme::Column { + db_type: dbscheme::DbColumnType::String, + name: "ext".to_owned(), + unique: false, + ql_type: ql::Type::String, + ql_type_is_ref: true, + }, + dbscheme::Column { + db_type: dbscheme::DbColumnType::Int, + name: "fromSource".to_owned(), + unique: false, + ql_type: ql::Type::Int, + ql_type_is_ref: true, + }, + ], + }) +} +fn create_folders_table() -> dbscheme::Entry { + dbscheme::Entry::Table(dbscheme::Table { + name: "folders".to_owned(), + keysets: None, + columns: vec![ + dbscheme::Column { + unique: true, + db_type: dbscheme::DbColumnType::Int, + name: "id".to_owned(), + ql_type: ql::Type::AtType("folder".to_owned()), + ql_type_is_ref: false, + }, + dbscheme::Column { + db_type: dbscheme::DbColumnType::String, + name: "name".to_owned(), + unique: false, + ql_type: ql::Type::String, + ql_type_is_ref: true, + }, + dbscheme::Column { + db_type: dbscheme::DbColumnType::String, + name: "simple".to_owned(), + unique: false, + ql_type: ql::Type::String, + ql_type_is_ref: true, + }, + ], + }) +} + +fn create_locations_default_table() -> dbscheme::Entry { + dbscheme::Entry::Table(dbscheme::Table { + name: "locations_default".to_string(), keysets: None, columns: vec![ dbscheme::Column { unique: true, db_type: dbscheme::DbColumnType::Int, name: "id".to_string(), - ql_type: ql::Type::AtType("location".to_string()), + ql_type: ql::Type::AtType("location_default".to_string()), ql_type_is_ref: false, }, dbscheme::Column { unique: false, - db_type: dbscheme::DbColumnType::String, - name: "file_path".to_string(), - ql_type: ql::Type::String, + db_type: dbscheme::DbColumnType::Int, + name: "file".to_string(), + ql_type: ql::Type::AtType("file".to_owned()), ql_type_is_ref: true, }, dbscheme::Column { @@ -263,6 +350,80 @@ fn create_location_table() -> dbscheme::Entry { }) } +fn create_sourceline_union() -> dbscheme::Entry { + dbscheme::Entry::Union(dbscheme::Union { + name: "sourceline".to_owned(), + members: vec!["file".to_owned()], + }) +} + +fn create_numlines_table() -> dbscheme::Entry { + dbscheme::Entry::Table(dbscheme::Table { + name: "numlines".to_owned(), + columns: vec![ + dbscheme::Column { + unique: false, + db_type: dbscheme::DbColumnType::Int, + name: "element_id".to_string(), + ql_type: ql::Type::AtType("sourceline".to_owned()), + ql_type_is_ref: true, + }, + dbscheme::Column { + unique: false, + db_type: dbscheme::DbColumnType::Int, + name: "num_lines".to_string(), + ql_type: ql::Type::Int, + ql_type_is_ref: true, + }, + dbscheme::Column { + unique: false, + db_type: dbscheme::DbColumnType::Int, + name: "num_code".to_string(), + ql_type: ql::Type::Int, + ql_type_is_ref: true, + }, + dbscheme::Column { + unique: false, + db_type: dbscheme::DbColumnType::Int, + name: "num_comment".to_string(), + ql_type: ql::Type::Int, + ql_type_is_ref: true, + }, + ], + keysets: None, + }) +} + +fn create_container_union() -> dbscheme::Entry { + dbscheme::Entry::Union(dbscheme::Union { + name: "container".to_owned(), + members: vec!["folder".to_owned(), "file".to_owned()], + }) +} + +fn create_containerparent_table() -> dbscheme::Entry { + dbscheme::Entry::Table(dbscheme::Table { + name: "containerparent".to_owned(), + columns: vec![ + dbscheme::Column { + unique: false, + db_type: dbscheme::DbColumnType::Int, + name: "parent".to_string(), + ql_type: ql::Type::AtType("container".to_owned()), + ql_type_is_ref: true, + }, + dbscheme::Column { + unique: true, + db_type: dbscheme::DbColumnType::Int, + name: "child".to_string(), + ql_type: ql::Type::AtType("container".to_owned()), + ql_type_is_ref: true, + }, + ], + keysets: None, + }) +} + fn create_source_location_prefix_table() -> dbscheme::Entry { dbscheme::Entry::Table(dbscheme::Table { name: "sourceLocationPrefix".to_string(), diff --git a/generator/src/ql.rs b/generator/src/ql.rs index d422cde9f3a..4c6828d044e 100644 --- a/generator/src/ql.rs +++ b/generator/src/ql.rs @@ -79,7 +79,10 @@ pub enum Expression { String(String), Pred(String, Vec), Or(Vec), + And(Vec), Equals(Box, Box), + Exists(Vec, Box), + Dot(Box, String, Vec), } impl fmt::Display for Expression { @@ -110,7 +113,40 @@ impl fmt::Display for Expression { Ok(()) } } + Expression::And(conjuncts) => { + if conjuncts.is_empty() { + write!(f, "any()") + } else { + for (index, disjunct) in conjuncts.iter().enumerate() { + if index > 0 { + write!(f, " and ")?; + } + write!(f, "{}", disjunct)?; + } + Ok(()) + } + } Expression::Equals(a, b) => write!(f, "{} = {}", a, b), + Expression::Exists(params, formula) => { + write!(f, "exists(")?; + for (index, param) in params.iter().enumerate() { + if index > 0 { + write!(f, ", ")?; + } + write!(f, "{}", param)?; + } + write!(f, " | {})", formula) + } + Expression::Dot(x, member_pred, args) => { + write!(f, "{}.{}(", x, member_pred)?; + for (index, arg) in args.iter().enumerate() { + if index > 0 { + write!(f, ", ")?; + } + write!(f, "{}", arg)?; + } + write!(f, ")") + } } } } diff --git a/generator/src/ql_gen.rs b/generator/src/ql_gen.rs index d0251b92ee3..09054a43770 100644 --- a/generator/src/ql_gen.rs +++ b/generator/src/ql_gen.rs @@ -130,6 +130,48 @@ fn create_none_predicate( } } +/// Creates the hard-coded `File` class. +fn create_file_class() -> ql::Class { + let get_absolute_path = ql::Predicate { + name: "getAbsolutePath".to_owned(), + overridden: false, + return_type: Some(ql::Type::String), + formal_parameters: vec![], + //override string getAbsolutePath() { files(underlyingElement(this), result, _, _, _) } + body: ql::Expression::Pred( + "files".to_owned(), + vec![ + ql::Expression::Var("this".to_owned()), + ql::Expression::Var("result".to_owned()), + ql::Expression::Var("_".to_owned()), + ql::Expression::Var("_".to_owned()), + ql::Expression::Var("_".to_owned()), + ], + ), + }; + let to_string = ql::Predicate { + name: "toString".to_owned(), + overridden: false, + return_type: Some(ql::Type::String), + formal_parameters: vec![], + body: ql::Expression::Equals( + Box::new(ql::Expression::Var("result".to_owned())), + Box::new(ql::Expression::Dot( + Box::new(ql::Expression::Var("this".to_owned())), + "getAbsolutePath".to_owned(), + vec![], + )), + ), + }; + ql::Class { + name: "File".to_owned(), + is_abstract: false, + supertypes: vec![ql::Type::AtType("file".to_owned())], + characteristic_predicate: None, + predicates: vec![get_absolute_path, to_string], + } +} + /// Creates the special `Location` class to wrap the location table. fn create_location_class() -> ql::Class { let to_string = ql::Predicate { @@ -168,16 +210,32 @@ fn create_location_class() -> ql::Class { param_type: ql::Type::Int, }, ], - body: ql::Expression::Pred( - "location".to_owned(), - vec![ - ql::Expression::Var("this".to_owned()), - ql::Expression::Var("filePath".to_owned()), - ql::Expression::Var("startLine".to_owned()), - ql::Expression::Var("startColumn".to_owned()), - ql::Expression::Var("endLine".to_owned()), - ql::Expression::Var("endColumn".to_owned()), - ], + body: ql::Expression::Exists( + vec![ql::FormalParameter { + param_type: ql::Type::Normal("File".to_owned()), + name: "f".to_owned(), + }], + Box::new(ql::Expression::And(vec![ + ql::Expression::Pred( + "locations_default".to_owned(), + vec![ + ql::Expression::Var("this".to_owned()), + ql::Expression::Var("f".to_owned()), + ql::Expression::Var("startLine".to_owned()), + ql::Expression::Var("startColumn".to_owned()), + ql::Expression::Var("endLine".to_owned()), + ql::Expression::Var("endColumn".to_owned()), + ], + ), + ql::Expression::Equals( + Box::new(ql::Expression::Var("filePath".to_owned())), + Box::new(ql::Expression::Dot( + Box::new(ql::Expression::Var("f".to_owned())), + "getAbsolutePath".to_owned(), + vec![], + )), + ), + ])), ), }; ql::Class { @@ -441,7 +499,11 @@ fn create_field_getters( /// Converts the given node types into CodeQL classes wrapping the dbscheme. pub fn convert_nodes(nodes: &Vec) -> Vec { let supertype_map = create_supertype_map(nodes); - let mut classes: Vec = vec![create_location_class(), create_top_class()]; + let mut classes: Vec = vec![ + create_file_class(), + create_location_class(), + create_top_class(), + ]; for node in nodes { match &node { diff --git a/ql/src/codeql_ruby/ast.qll b/ql/src/codeql_ruby/ast.qll index c0ae4a8398e..3b4fead2ab7 100644 --- a/ql/src/codeql_ruby/ast.qll +++ b/ql/src/codeql_ruby/ast.qll @@ -3,13 +3,22 @@ * Automatically generated from the tree-sitter grammar; do not edit */ +class File extends @file { + string getAbsolutePath() { files(this, result, _, _, _) } + + string toString() { result = this.getAbsolutePath() } +} + class Location extends @location { string toString() { result = "Location" } predicate hasLocationInfo( string filePath, int startLine, int startColumn, int endLine, int endColumn ) { - location(this, filePath, startLine, startColumn, endLine, endColumn) + exists(File f | + locations_default(this, f, startLine, startColumn, endLine, endColumn) and + filePath = f.getAbsolutePath() + ) } } diff --git a/ql/src/ruby.dbscheme b/ql/src/ruby.dbscheme index daa970ea1cd..3456e12d155 100644 --- a/ql/src/ruby.dbscheme +++ b/ql/src/ruby.dbscheme @@ -1,15 +1,47 @@ // CodeQL database schema for Ruby // Automatically generated from the tree-sitter grammar; do not edit -location( - unique int id: @location, - string file_path: string ref, +@location = @location_default + +locations_default( + unique int id: @location_default, + int file: @file ref, int start_line: int ref, int start_column: int ref, int end_line: int ref, int end_column: int ref ); +@sourceline = @file + +numlines( + int element_id: @sourceline ref, + int num_lines: int ref, + int num_code: int ref, + int num_comment: int ref +); + +files( + unique int id: @file, + string name: string ref, + string simple: string ref, + string ext: string ref, + int fromSource: int ref +); + +folders( + unique int id: @folder, + string name: string ref, + string simple: string ref +); + +@container = @folder | @file + +containerparent( + int parent: @container ref, + unique int child: @container ref +); + sourceLocationPrefix( string prefix: string ref ); From 2232700428a3e2ee85e0fb855af43bddf89e51bf Mon Sep 17 00:00:00 2001 From: Nick Rolfe Date: Fri, 30 Oct 2020 10:24:24 +0000 Subject: [PATCH 02/13] Correct comment Co-authored-by: Arthur Baars --- extractor/src/extractor.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extractor/src/extractor.rs b/extractor/src/extractor.rs index 33a52efb0ff..0636ed3de1c 100644 --- a/extractor/src/extractor.rs +++ b/extractor/src/extractor.rs @@ -396,7 +396,7 @@ enum TrapEntry { ChildOf(String, Label, String, Option, Label), // @location(loc, path, r1, c1, r2, c2) Located(Vec), - /// foo_bar(arg?) + /// foo_bar(arg*) GenericTuple(String, Vec), Comment(String), } From f198dc530fc75fbaad5e134ea1d61f3a277f5fb9 Mon Sep 17 00:00:00 2001 From: Nick Rolfe Date: Fri, 30 Oct 2020 10:25:09 +0000 Subject: [PATCH 03/13] Use fromSource = 1 Co-authored-by: Arthur Baars --- extractor/src/extractor.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extractor/src/extractor.rs b/extractor/src/extractor.rs index 0636ed3de1c..7405f3392f3 100644 --- a/extractor/src/extractor.rs +++ b/extractor/src/extractor.rs @@ -51,7 +51,7 @@ impl Extractor { Arg::String(format!("{}", path.canonicalize()?.display())), Arg::String(format!("{}", path.file_name().unwrap().to_string_lossy())), Arg::String(format!("{}", path.extension().unwrap().to_string_lossy())), - Arg::Int(0), // 0 = unknown + Arg::Int(1), // 1 = from source ], ), ], From d47bd32b5859c858bf85bd7c74811a76dec675d7 Mon Sep 17 00:00:00 2001 From: Nick Rolfe Date: Fri, 30 Oct 2020 10:34:42 +0000 Subject: [PATCH 04/13] Now that we also generate conjunctions, use parentheses in disjunctions --- generator/src/ql.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/generator/src/ql.rs b/generator/src/ql.rs index 4c6828d044e..e54688e1ed1 100644 --- a/generator/src/ql.rs +++ b/generator/src/ql.rs @@ -108,7 +108,7 @@ impl fmt::Display for Expression { if index > 0 { write!(f, " or ")?; } - write!(f, "{}", disjunct)?; + write!(f, "({})", disjunct)?; } Ok(()) } @@ -117,11 +117,11 @@ impl fmt::Display for Expression { if conjuncts.is_empty() { write!(f, "any()") } else { - for (index, disjunct) in conjuncts.iter().enumerate() { + for (index, conjunct) in conjuncts.iter().enumerate() { if index > 0 { write!(f, " and ")?; } - write!(f, "{}", disjunct)?; + write!(f, "{}", conjunct)?; } Ok(()) } From 826b4571a098c8e57d6140658ed6f291d7ca5e21 Mon Sep 17 00:00:00 2001 From: Nick Rolfe Date: Fri, 30 Oct 2020 11:21:51 +0000 Subject: [PATCH 05/13] Canonicalize source file paths in main --- extractor/src/extractor.rs | 4 +++- extractor/src/main.rs | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/extractor/src/extractor.rs b/extractor/src/extractor.rs index 7405f3392f3..352aa38f85c 100644 --- a/extractor/src/extractor.rs +++ b/extractor/src/extractor.rs @@ -17,7 +17,9 @@ pub fn create(language: Language, schema: Vec) -> Extractor { Extractor { parser, schema } } + impl Extractor { + /// Extracts the source file at `path`, which is assumed to be canonicalized. pub fn extract<'a>(&'a mut self, path: &Path) -> std::io::Result { let span = span!( Level::TRACE, @@ -48,7 +50,7 @@ impl Extractor { "files".to_owned(), vec![ Arg::Label(file_label), - Arg::String(format!("{}", path.canonicalize()?.display())), + Arg::String(format!("{}", path.display())), Arg::String(format!("{}", path.file_name().unwrap().to_string_lossy())), Arg::String(format!("{}", path.extension().unwrap().to_string_lossy())), Arg::Int(1), // 1 = from source diff --git a/extractor/src/main.rs b/extractor/src/main.rs index 6f60b129a78..4dd1aa1e9fc 100644 --- a/extractor/src/main.rs +++ b/extractor/src/main.rs @@ -40,7 +40,7 @@ fn main() -> std::io::Result<()> { let schema = node_types::read_node_types_str(tree_sitter_ruby::NODE_TYPES)?; let mut extractor = extractor::create(language, schema); for line in std::io::BufReader::new(file_list).lines() { - let path = PathBuf::from(line?); + let path = PathBuf::from(line?).canonicalize()?; let trap_file = path_for(&trap_dir, &path, ".trap"); let src_archive_file = path_for(&src_archive_dir, &path, ""); let trap = extractor.extract(&path)?; From 7f03206b5262195c80d1139a698f1efc7436886e Mon Sep 17 00:00:00 2001 From: Nick Rolfe Date: Fri, 30 Oct 2020 11:29:04 +0000 Subject: [PATCH 06/13] Use a key id for file entities --- extractor/src/extractor.rs | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/extractor/src/extractor.rs b/extractor/src/extractor.rs index 352aa38f85c..d21336aa005 100644 --- a/extractor/src/extractor.rs +++ b/extractor/src/extractor.rs @@ -41,16 +41,17 @@ impl Extractor { // label doesn't get redefined. counter += 1; let file_label = Label::Normal(counter); + let path_string = format!("{}", path.display()); let mut visitor = Visitor { source: &source, trap_output: vec![ TrapEntry::Comment(format!("Auto-generated TRAP file for {}", path.display())), - TrapEntry::New(file_label), + TrapEntry::MapLabelToKey(file_label, path_string.clone()), TrapEntry::GenericTuple( "files".to_owned(), vec![ Arg::Label(file_label), - Arg::String(format!("{}", path.display())), + Arg::String(path_string.clone()), Arg::String(format!("{}", path.file_name().unwrap().to_string_lossy())), Arg::String(format!("{}", path.extension().unwrap().to_string_lossy())), Arg::Int(1), // 1 = from source @@ -59,7 +60,7 @@ impl Extractor { ], counter, // TODO: should we handle path strings that are not valid UTF8 better? - path: format!("{}", path.display()), + path: path_string, file_label, stack: Vec::new(), tables: build_schema_lookup(&self.schema), @@ -153,8 +154,8 @@ impl Visitor<'_> { self.counter += 1; let id = Label::Normal(self.counter); let loc = Label::Location(self.counter); - self.trap_output.push(TrapEntry::New(id)); - self.trap_output.push(TrapEntry::New(loc)); + self.trap_output.push(TrapEntry::FreshId(id)); + self.trap_output.push(TrapEntry::FreshId(loc)); self.trap_output .push(location_for(&self.source, &self.file_label, loc, node)); let table_name = node_type_name(node.kind(), node.is_named()); @@ -390,8 +391,10 @@ impl fmt::Display for Program { } enum TrapEntry { - // @id = *@ - New(Label), + /// Maps the label to a fresh id, e.g. `#123 = *`. + FreshId(Label), + /// Maps the label to a key, e.g. `#7 = @"foo"`. + MapLabelToKey(Label, String), // @node_def(self, arg?, location)@ Definition(String, Label, Vec, Label), // @node_child(self, index, parent)@ @@ -405,7 +408,8 @@ enum TrapEntry { impl fmt::Display for TrapEntry { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - TrapEntry::New(id) => write!(f, "{} = *", id), + TrapEntry::FreshId(label) => write!(f, "{} = *", label), + TrapEntry::MapLabelToKey(label, key) => write!(f, "{} = @\"{}\"", label, key), TrapEntry::Definition(n, id, args, loc) => { let mut args_str = String::new(); for arg in args { From 79d15051beadf31c9c4e34bb615c19b823dd9de9 Mon Sep 17 00:00:00 2001 From: Nick Rolfe Date: Fri, 30 Oct 2020 12:45:23 +0000 Subject: [PATCH 07/13] Fix full ids for files to match common spec --- extractor/src/extractor.rs | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/extractor/src/extractor.rs b/extractor/src/extractor.rs index d21336aa005..d318d397256 100644 --- a/extractor/src/extractor.rs +++ b/extractor/src/extractor.rs @@ -46,7 +46,7 @@ impl Extractor { source: &source, trap_output: vec![ TrapEntry::Comment(format!("Auto-generated TRAP file for {}", path.display())), - TrapEntry::MapLabelToKey(file_label, path_string.clone()), + TrapEntry::MapLabelToKey(file_label, full_id_for_file(path)), TrapEntry::GenericTuple( "files".to_owned(), vec![ @@ -73,6 +73,25 @@ impl Extractor { } } +fn full_id_for_file(path: &Path) -> String { + let full_id = format!("{};sourcefile", path.display()); + + if cfg!(windows) { + // Strip the Windows long path prefix, since std::fs::canonicalize adds it, + // but it's not part of the common CodeQL spec for file ids. + let win_long_path_prefix = r"\\?\"; + let full_id = match full_id.strip_prefix(win_long_path_prefix) { + Some(s) => s.to_owned(), + None => full_id, + }; + + // And replace backslashes with forward slashes. + full_id.replace(r"\", "/") + } else { + full_id + } +} + fn build_schema_lookup<'a>(schema: &'a Vec) -> Map<&'a TypeName, &'a Entry> { let mut map = std::collections::BTreeMap::new(); for entry in schema { From 4b8bbd101cb807488e283c9390794fa7ba966251 Mon Sep 17 00:00:00 2001 From: Nick Rolfe Date: Fri, 30 Oct 2020 13:06:21 +0000 Subject: [PATCH 08/13] Give locations full ids matching the common spec --- extractor/src/extractor.rs | 43 ++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/extractor/src/extractor.rs b/extractor/src/extractor.rs index d318d397256..98c24aa9b5a 100644 --- a/extractor/src/extractor.rs +++ b/extractor/src/extractor.rs @@ -174,9 +174,10 @@ impl Visitor<'_> { let id = Label::Normal(self.counter); let loc = Label::Location(self.counter); self.trap_output.push(TrapEntry::FreshId(id)); - self.trap_output.push(TrapEntry::FreshId(loc)); - self.trap_output - .push(location_for(&self.source, &self.file_label, loc, node)); + let (loc_label_def, loc_tuple) = + location_for(&self.source, &self.file_label, loc, node); + self.trap_output.push(loc_label_def); + self.trap_output.push(loc_tuple); let table_name = node_type_name(node.kind(), node.is_named()); let args: Option>; if fields.is_empty() { @@ -323,8 +324,15 @@ fn sliced_source_arg(source: &Vec, n: Node) -> Arg { )) } -// Emit a 'Located' TrapEntry for the provided node, appropriately calibrated. -fn location_for<'a>(source: &Vec, file_label: &Label, label: Label, n: Node) -> TrapEntry { +// Emit a pair of `TrapEntry`s for the provided node, appropriately calibrated. +// The first is the location and label definition, and the second is the +// 'Located' entry. +fn location_for<'a>( + source: &Vec, + file_label: &Label, + label: Label, + n: Node, +) -> (TrapEntry, TrapEntry) { // Tree-sitter row, column values are 0-based while CodeQL starts // counting at 1. In addition Tree-sitter's row and column for the // end position are exclusive while CodeQL's end positions are inclusive. @@ -366,14 +374,23 @@ fn location_for<'a>(source: &Vec, file_label: &Label, label: Label, n: Node) ); } } - TrapEntry::Located(vec![ - Arg::Label(label), - Arg::Label(file_label.clone()), - Arg::Int(start_line), - Arg::Int(start_col), - Arg::Int(end_line), - Arg::Int(end_col), - ]) + ( + TrapEntry::MapLabelToKey( + label, + format!( + "loc,{{{}}},{},{},{},{}", + file_label, start_line, start_col, end_line, end_col + ), + ), + TrapEntry::Located(vec![ + Arg::Label(label), + Arg::Label(file_label.clone()), + Arg::Int(start_line), + Arg::Int(start_col), + Arg::Int(end_line), + Arg::Int(end_col), + ]), + ) } fn traverse(tree: &Tree, visitor: &mut Visitor) { From a54f923a734fcc2e560bbdf7e61206fd168ff562 Mon Sep 17 00:00:00 2001 From: Nick Rolfe Date: Fri, 30 Oct 2020 13:22:58 +0000 Subject: [PATCH 09/13] Normalize the absolute path in the files table --- extractor/src/extractor.rs | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/extractor/src/extractor.rs b/extractor/src/extractor.rs index 98c24aa9b5a..ef73416c9c5 100644 --- a/extractor/src/extractor.rs +++ b/extractor/src/extractor.rs @@ -41,7 +41,6 @@ impl Extractor { // label doesn't get redefined. counter += 1; let file_label = Label::Normal(counter); - let path_string = format!("{}", path.display()); let mut visitor = Visitor { source: &source, trap_output: vec![ @@ -51,7 +50,7 @@ impl Extractor { "files".to_owned(), vec![ Arg::Label(file_label), - Arg::String(path_string.clone()), + Arg::String(normalize_path(path)), Arg::String(format!("{}", path.file_name().unwrap().to_string_lossy())), Arg::String(format!("{}", path.extension().unwrap().to_string_lossy())), Arg::Int(1), // 1 = from source @@ -60,7 +59,7 @@ impl Extractor { ], counter, // TODO: should we handle path strings that are not valid UTF8 better? - path: path_string, + path: format!("{}", path.display()), file_label, stack: Vec::new(), tables: build_schema_lookup(&self.schema), @@ -73,25 +72,30 @@ impl Extractor { } } -fn full_id_for_file(path: &Path) -> String { - let full_id = format!("{};sourcefile", path.display()); - +/// Normalizes the path according the common CodeQL specification. Assumes that +/// `path` has already been canonicalized using `std::fs::canonicalize`. +fn normalize_path(path: &Path) -> String { + let result = format!("{}", path.display()); if cfg!(windows) { // Strip the Windows long path prefix, since std::fs::canonicalize adds it, // but it's not part of the common CodeQL spec for file ids. let win_long_path_prefix = r"\\?\"; - let full_id = match full_id.strip_prefix(win_long_path_prefix) { + let result = match result.strip_prefix(win_long_path_prefix) { Some(s) => s.to_owned(), - None => full_id, + None => result, }; // And replace backslashes with forward slashes. - full_id.replace(r"\", "/") + result.replace(r"\", "/") } else { - full_id + result } } +fn full_id_for_file(path: &Path) -> String { + format!("{};sourcefile", normalize_path(path)) +} + fn build_schema_lookup<'a>(schema: &'a Vec) -> Map<&'a TypeName, &'a Entry> { let mut map = std::collections::BTreeMap::new(); for entry in schema { From 35cb379db7b836b38c9c98690166de8a666f44cf Mon Sep 17 00:00:00 2001 From: Nick Rolfe Date: Fri, 30 Oct 2020 13:24:16 +0000 Subject: [PATCH 10/13] Fix name of table for locations --- extractor/src/extractor.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extractor/src/extractor.rs b/extractor/src/extractor.rs index ef73416c9c5..cb7caeeda68 100644 --- a/extractor/src/extractor.rs +++ b/extractor/src/extractor.rs @@ -483,7 +483,7 @@ impl fmt::Display for TrapEntry { }, TrapEntry::Located(args) => write!( f, - "location({}, {}, {}, {}, {}, {})", + "locations_default({}, {}, {}, {}, {}, {})", args.get(0).unwrap(), args.get(1).unwrap(), args.get(2).unwrap(), From 0a754334cff73a44cca69dba3858a7f78ccced02 Mon Sep 17 00:00:00 2001 From: Nick Rolfe Date: Fri, 30 Oct 2020 13:41:27 +0000 Subject: [PATCH 11/13] Don't generate the QL `File` class --- generator/src/ql_gen.rs | 48 +---------------------------------------- 1 file changed, 1 insertion(+), 47 deletions(-) diff --git a/generator/src/ql_gen.rs b/generator/src/ql_gen.rs index 09054a43770..b67b0cc76b2 100644 --- a/generator/src/ql_gen.rs +++ b/generator/src/ql_gen.rs @@ -130,48 +130,6 @@ fn create_none_predicate( } } -/// Creates the hard-coded `File` class. -fn create_file_class() -> ql::Class { - let get_absolute_path = ql::Predicate { - name: "getAbsolutePath".to_owned(), - overridden: false, - return_type: Some(ql::Type::String), - formal_parameters: vec![], - //override string getAbsolutePath() { files(underlyingElement(this), result, _, _, _) } - body: ql::Expression::Pred( - "files".to_owned(), - vec![ - ql::Expression::Var("this".to_owned()), - ql::Expression::Var("result".to_owned()), - ql::Expression::Var("_".to_owned()), - ql::Expression::Var("_".to_owned()), - ql::Expression::Var("_".to_owned()), - ], - ), - }; - let to_string = ql::Predicate { - name: "toString".to_owned(), - overridden: false, - return_type: Some(ql::Type::String), - formal_parameters: vec![], - body: ql::Expression::Equals( - Box::new(ql::Expression::Var("result".to_owned())), - Box::new(ql::Expression::Dot( - Box::new(ql::Expression::Var("this".to_owned())), - "getAbsolutePath".to_owned(), - vec![], - )), - ), - }; - ql::Class { - name: "File".to_owned(), - is_abstract: false, - supertypes: vec![ql::Type::AtType("file".to_owned())], - characteristic_predicate: None, - predicates: vec![get_absolute_path, to_string], - } -} - /// Creates the special `Location` class to wrap the location table. fn create_location_class() -> ql::Class { let to_string = ql::Predicate { @@ -499,11 +457,7 @@ fn create_field_getters( /// Converts the given node types into CodeQL classes wrapping the dbscheme. pub fn convert_nodes(nodes: &Vec) -> Vec { let supertype_map = create_supertype_map(nodes); - let mut classes: Vec = vec![ - create_file_class(), - create_location_class(), - create_top_class(), - ]; + let mut classes: Vec = vec![create_location_class(), create_top_class()]; for node in nodes { match &node { From e73500ef7c14f02b40b68bac8a3ce04a56e32386 Mon Sep 17 00:00:00 2001 From: Nick Rolfe Date: Fri, 30 Oct 2020 14:38:24 +0000 Subject: [PATCH 12/13] Cope with empty filenames/extensions --- extractor/src/extractor.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/extractor/src/extractor.rs b/extractor/src/extractor.rs index cb7caeeda68..83e0ecf43c9 100644 --- a/extractor/src/extractor.rs +++ b/extractor/src/extractor.rs @@ -51,8 +51,14 @@ impl Extractor { vec![ Arg::Label(file_label), Arg::String(normalize_path(path)), - Arg::String(format!("{}", path.file_name().unwrap().to_string_lossy())), - Arg::String(format!("{}", path.extension().unwrap().to_string_lossy())), + Arg::String(match path.file_name() { + None => "".to_owned(), + Some(file_name) => format!("{}", file_name.to_string_lossy()), + }), + Arg::String(match path.extension() { + None => "".to_owned(), + Some(ext) => format!("{}", ext.to_string_lossy()), + }), Arg::Int(1), // 1 = from source ], ), From 075c72e6ef9c27523728caf3159c9d234e4c50b9 Mon Sep 17 00:00:00 2001 From: Nick Rolfe Date: Fri, 30 Oct 2020 15:26:46 +0000 Subject: [PATCH 13/13] Iterate through path components to 'normalize' paths on windows --- extractor/src/extractor.rs | 43 +++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/extractor/src/extractor.rs b/extractor/src/extractor.rs index 83e0ecf43c9..dec97617287 100644 --- a/extractor/src/extractor.rs +++ b/extractor/src/extractor.rs @@ -81,20 +81,39 @@ impl Extractor { /// Normalizes the path according the common CodeQL specification. Assumes that /// `path` has already been canonicalized using `std::fs::canonicalize`. fn normalize_path(path: &Path) -> String { - let result = format!("{}", path.display()); if cfg!(windows) { - // Strip the Windows long path prefix, since std::fs::canonicalize adds it, - // but it's not part of the common CodeQL spec for file ids. - let win_long_path_prefix = r"\\?\"; - let result = match result.strip_prefix(win_long_path_prefix) { - Some(s) => s.to_owned(), - None => result, - }; - - // And replace backslashes with forward slashes. - result.replace(r"\", "/") + // The way Rust canonicalizes paths doesn't match the CodeQL spec, so we + // have to do a bit of work removing certain prefixes and replacing + // backslashes. + let mut components: Vec = Vec::new(); + for component in path.components() { + match component { + std::path::Component::Prefix(prefix) => match prefix.kind() { + std::path::Prefix::Disk(letter) | std::path::Prefix::VerbatimDisk(letter) => { + components.push(format!("{}:", letter as char)); + } + std::path::Prefix::Verbatim(x) | std::path::Prefix::DeviceNS(x) => { + components.push(x.to_string_lossy().to_string()); + } + std::path::Prefix::UNC(server, share) + | std::path::Prefix::VerbatimUNC(server, share) => { + components.push(server.to_string_lossy().to_string()); + components.push(share.to_string_lossy().to_string()); + } + }, + std::path::Component::Normal(n) => { + components.push(n.to_string_lossy().to_string()); + } + std::path::Component::RootDir => {} + std::path::Component::CurDir => {} + std::path::Component::ParentDir => {} + } + } + components.join("/") } else { - result + // For other operating systems, we can use the canonicalized path + // without modifications. + format!("{}", path.display()) } }