diff --git a/extractor/src/extractor.rs b/extractor/src/extractor.rs index 143aeccf8a8..dec97617287 100644 --- a/extractor/src/extractor.rs +++ b/extractor/src/extractor.rs @@ -17,7 +17,9 @@ pub fn create(language: Language, schema: Vec) -> Extractor { Extractor { parser, schema } } + impl Extractor { + /// Extracts the source file at `path`, which is assumed to be canonicalized. pub fn extract<'a>(&'a mut self, path: &Path) -> std::io::Result { let span = span!( Level::TRACE, @@ -34,15 +36,37 @@ impl Extractor { .parser .parse(&source, None) .expect("Failed to parse file"); + let mut counter = -1; + // Create a label for the current file and increment the counter so that + // label doesn't get redefined. + counter += 1; + let file_label = Label::Normal(counter); let mut visitor = Visitor { source: &source, - trap_output: vec![TrapEntry::Comment(format!( - "Auto-generated TRAP file for {}", - path.display() - ))], - counter: -1, + trap_output: vec![ + TrapEntry::Comment(format!("Auto-generated TRAP file for {}", path.display())), + TrapEntry::MapLabelToKey(file_label, full_id_for_file(path)), + TrapEntry::GenericTuple( + "files".to_owned(), + vec![ + Arg::Label(file_label), + Arg::String(normalize_path(path)), + Arg::String(match path.file_name() { + None => "".to_owned(), + Some(file_name) => format!("{}", file_name.to_string_lossy()), + }), + Arg::String(match path.extension() { + None => "".to_owned(), + Some(ext) => format!("{}", ext.to_string_lossy()), + }), + Arg::Int(1), // 1 = from source + ], + ), + ], + counter, // TODO: should we handle path strings that are not valid UTF8 better? path: format!("{}", path.display()), + file_label, stack: Vec::new(), tables: build_schema_lookup(&self.schema), union_types: build_union_type_lookup(&self.schema), @@ -54,6 +78,49 @@ impl Extractor { } } +/// Normalizes the path according the common CodeQL specification. Assumes that +/// `path` has already been canonicalized using `std::fs::canonicalize`. +fn normalize_path(path: &Path) -> String { + if cfg!(windows) { + // The way Rust canonicalizes paths doesn't match the CodeQL spec, so we + // have to do a bit of work removing certain prefixes and replacing + // backslashes. + let mut components: Vec = Vec::new(); + for component in path.components() { + match component { + std::path::Component::Prefix(prefix) => match prefix.kind() { + std::path::Prefix::Disk(letter) | std::path::Prefix::VerbatimDisk(letter) => { + components.push(format!("{}:", letter as char)); + } + std::path::Prefix::Verbatim(x) | std::path::Prefix::DeviceNS(x) => { + components.push(x.to_string_lossy().to_string()); + } + std::path::Prefix::UNC(server, share) + | std::path::Prefix::VerbatimUNC(server, share) => { + components.push(server.to_string_lossy().to_string()); + components.push(share.to_string_lossy().to_string()); + } + }, + std::path::Component::Normal(n) => { + components.push(n.to_string_lossy().to_string()); + } + std::path::Component::RootDir => {} + std::path::Component::CurDir => {} + std::path::Component::ParentDir => {} + } + } + components.join("/") + } else { + // For other operating systems, we can use the canonicalized path + // without modifications. + format!("{}", path.display()) + } +} + +fn full_id_for_file(path: &Path) -> String { + format!("{};sourcefile", normalize_path(path)) +} + fn build_schema_lookup<'a>(schema: &'a Vec) -> Map<&'a TypeName, &'a Entry> { let mut map = std::collections::BTreeMap::new(); for entry in schema { @@ -77,6 +144,9 @@ fn build_union_type_lookup<'a>(schema: &'a Vec) -> Map<&'a TypeName, &'a struct Visitor<'a> { /// The file path of the source code (as string) path: String, + /// The label to use whenever we need to refer to the `@file` entity of this + /// source file. + file_label: Label, /// The source code as a UTF-8 byte array source: &'a Vec, /// The accumulated trap entries @@ -132,10 +202,11 @@ impl Visitor<'_> { self.counter += 1; let id = Label::Normal(self.counter); let loc = Label::Location(self.counter); - self.trap_output.push(TrapEntry::New(id)); - self.trap_output.push(TrapEntry::New(loc)); - self.trap_output - .push(location_for(&self.source, &self.path, loc, node)); + self.trap_output.push(TrapEntry::FreshId(id)); + let (loc_label_def, loc_tuple) = + location_for(&self.source, &self.file_label, loc, node); + self.trap_output.push(loc_label_def); + self.trap_output.push(loc_tuple); let table_name = node_type_name(node.kind(), node.is_named()); let args: Option>; if fields.is_empty() { @@ -282,8 +353,15 @@ fn sliced_source_arg(source: &Vec, n: Node) -> Arg { )) } -// Emit a 'Located' TrapEntry for the provided node, appropriately calibrated. -fn location_for<'a>(source: &Vec, fp: &String, label: Label, n: Node) -> TrapEntry { +// Emit a pair of `TrapEntry`s for the provided node, appropriately calibrated. +// The first is the location and label definition, and the second is the +// 'Located' entry. +fn location_for<'a>( + source: &Vec, + file_label: &Label, + label: Label, + n: Node, +) -> (TrapEntry, TrapEntry) { // Tree-sitter row, column values are 0-based while CodeQL starts // counting at 1. In addition Tree-sitter's row and column for the // end position are exclusive while CodeQL's end positions are inclusive. @@ -325,14 +403,23 @@ fn location_for<'a>(source: &Vec, fp: &String, label: Label, n: Node) -> Tra ); } } - TrapEntry::Located(vec![ - Arg::Label(label), - Arg::String(fp.to_owned()), - Arg::Int(start_line), - Arg::Int(start_col), - Arg::Int(end_line), - Arg::Int(end_col), - ]) + ( + TrapEntry::MapLabelToKey( + label, + format!( + "loc,{{{}}},{},{},{},{}", + file_label, start_line, start_col, end_line, end_col + ), + ), + TrapEntry::Located(vec![ + Arg::Label(label), + Arg::Label(file_label.clone()), + Arg::Int(start_line), + Arg::Int(start_col), + Arg::Int(end_line), + Arg::Int(end_col), + ]), + ) } fn traverse(tree: &Tree, visitor: &mut Visitor) { @@ -369,20 +456,25 @@ impl fmt::Display for Program { } enum TrapEntry { - // @id = *@ - New(Label), + /// Maps the label to a fresh id, e.g. `#123 = *`. + FreshId(Label), + /// Maps the label to a key, e.g. `#7 = @"foo"`. + MapLabelToKey(Label, String), // @node_def(self, arg?, location)@ Definition(String, Label, Vec, Label), // @node_child(self, index, parent)@ ChildOf(String, Label, String, Option, Label), // @location(loc, path, r1, c1, r2, c2) Located(Vec), + /// foo_bar(arg*) + GenericTuple(String, Vec), Comment(String), } impl fmt::Display for TrapEntry { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - TrapEntry::New(id) => write!(f, "{} = *", id), + TrapEntry::FreshId(label) => write!(f, "{} = *", label), + TrapEntry::MapLabelToKey(label, key) => write!(f, "{} = @\"{}\"", label, key), TrapEntry::Definition(n, id, args, loc) => { let mut args_str = String::new(); for arg in args { @@ -416,7 +508,7 @@ impl fmt::Display for TrapEntry { }, TrapEntry::Located(args) => write!( f, - "location({}, {}, {}, {}, {}, {})", + "locations_default({}, {}, {}, {}, {}, {})", args.get(0).unwrap(), args.get(1).unwrap(), args.get(2).unwrap(), @@ -424,6 +516,16 @@ impl fmt::Display for TrapEntry { args.get(4).unwrap(), args.get(5).unwrap(), ), + TrapEntry::GenericTuple(name, args) => { + write!(f, "{}(", name)?; + for (index, arg) in args.iter().enumerate() { + if index > 0 { + write!(f, ", ")?; + } + write!(f, "{}", arg)?; + } + write!(f, ")") + } TrapEntry::Comment(line) => write!(f, "// {}", line), } } diff --git a/extractor/src/main.rs b/extractor/src/main.rs index 6f60b129a78..4dd1aa1e9fc 100644 --- a/extractor/src/main.rs +++ b/extractor/src/main.rs @@ -40,7 +40,7 @@ fn main() -> std::io::Result<()> { let schema = node_types::read_node_types_str(tree_sitter_ruby::NODE_TYPES)?; let mut extractor = extractor::create(language, schema); for line in std::io::BufReader::new(file_list).lines() { - let path = PathBuf::from(line?); + let path = PathBuf::from(line?).canonicalize()?; let trap_file = path_for(&trap_dir, &path, ".trap"); let src_archive_file = path_for(&src_archive_dir, &path, ""); let trap = extractor.extract(&path)?; diff --git a/generator/src/main.rs b/generator/src/main.rs index bdedcbdd938..b92cbce5c6c 100644 --- a/generator/src/main.rs +++ b/generator/src/main.rs @@ -113,7 +113,14 @@ fn add_field( /// Converts the given tree-sitter node types into CodeQL dbscheme entries. fn convert_nodes(nodes: &Vec) -> Vec { let mut entries: Vec = vec![ - create_location_table(), + create_location_union(), + create_locations_default_table(), + create_sourceline_union(), + create_numlines_table(), + create_files_table(), + create_folders_table(), + create_container_union(), + create_containerparent_table(), create_source_location_prefix_table(), ]; let mut top_members: Vec = Vec::new(); @@ -212,23 +219,103 @@ fn write_dbscheme(language: &Language, entries: &[dbscheme::Entry]) -> std::io:: dbscheme::write(&language.name, &mut file, &entries) } -fn create_location_table() -> dbscheme::Entry { +fn create_location_union() -> dbscheme::Entry { + dbscheme::Entry::Union(dbscheme::Union { + name: "location".to_owned(), + members: vec!["location_default".to_owned()], + }) +} + +fn create_files_table() -> dbscheme::Entry { dbscheme::Entry::Table(dbscheme::Table { - name: "location".to_string(), + name: "files".to_owned(), + keysets: None, + columns: vec![ + dbscheme::Column { + unique: true, + db_type: dbscheme::DbColumnType::Int, + name: "id".to_owned(), + ql_type: ql::Type::AtType("file".to_owned()), + ql_type_is_ref: false, + }, + dbscheme::Column { + db_type: dbscheme::DbColumnType::String, + name: "name".to_owned(), + unique: false, + ql_type: ql::Type::String, + ql_type_is_ref: true, + }, + dbscheme::Column { + db_type: dbscheme::DbColumnType::String, + name: "simple".to_owned(), + unique: false, + ql_type: ql::Type::String, + ql_type_is_ref: true, + }, + dbscheme::Column { + db_type: dbscheme::DbColumnType::String, + name: "ext".to_owned(), + unique: false, + ql_type: ql::Type::String, + ql_type_is_ref: true, + }, + dbscheme::Column { + db_type: dbscheme::DbColumnType::Int, + name: "fromSource".to_owned(), + unique: false, + ql_type: ql::Type::Int, + ql_type_is_ref: true, + }, + ], + }) +} +fn create_folders_table() -> dbscheme::Entry { + dbscheme::Entry::Table(dbscheme::Table { + name: "folders".to_owned(), + keysets: None, + columns: vec![ + dbscheme::Column { + unique: true, + db_type: dbscheme::DbColumnType::Int, + name: "id".to_owned(), + ql_type: ql::Type::AtType("folder".to_owned()), + ql_type_is_ref: false, + }, + dbscheme::Column { + db_type: dbscheme::DbColumnType::String, + name: "name".to_owned(), + unique: false, + ql_type: ql::Type::String, + ql_type_is_ref: true, + }, + dbscheme::Column { + db_type: dbscheme::DbColumnType::String, + name: "simple".to_owned(), + unique: false, + ql_type: ql::Type::String, + ql_type_is_ref: true, + }, + ], + }) +} + +fn create_locations_default_table() -> dbscheme::Entry { + dbscheme::Entry::Table(dbscheme::Table { + name: "locations_default".to_string(), keysets: None, columns: vec![ dbscheme::Column { unique: true, db_type: dbscheme::DbColumnType::Int, name: "id".to_string(), - ql_type: ql::Type::AtType("location".to_string()), + ql_type: ql::Type::AtType("location_default".to_string()), ql_type_is_ref: false, }, dbscheme::Column { unique: false, - db_type: dbscheme::DbColumnType::String, - name: "file_path".to_string(), - ql_type: ql::Type::String, + db_type: dbscheme::DbColumnType::Int, + name: "file".to_string(), + ql_type: ql::Type::AtType("file".to_owned()), ql_type_is_ref: true, }, dbscheme::Column { @@ -263,6 +350,80 @@ fn create_location_table() -> dbscheme::Entry { }) } +fn create_sourceline_union() -> dbscheme::Entry { + dbscheme::Entry::Union(dbscheme::Union { + name: "sourceline".to_owned(), + members: vec!["file".to_owned()], + }) +} + +fn create_numlines_table() -> dbscheme::Entry { + dbscheme::Entry::Table(dbscheme::Table { + name: "numlines".to_owned(), + columns: vec![ + dbscheme::Column { + unique: false, + db_type: dbscheme::DbColumnType::Int, + name: "element_id".to_string(), + ql_type: ql::Type::AtType("sourceline".to_owned()), + ql_type_is_ref: true, + }, + dbscheme::Column { + unique: false, + db_type: dbscheme::DbColumnType::Int, + name: "num_lines".to_string(), + ql_type: ql::Type::Int, + ql_type_is_ref: true, + }, + dbscheme::Column { + unique: false, + db_type: dbscheme::DbColumnType::Int, + name: "num_code".to_string(), + ql_type: ql::Type::Int, + ql_type_is_ref: true, + }, + dbscheme::Column { + unique: false, + db_type: dbscheme::DbColumnType::Int, + name: "num_comment".to_string(), + ql_type: ql::Type::Int, + ql_type_is_ref: true, + }, + ], + keysets: None, + }) +} + +fn create_container_union() -> dbscheme::Entry { + dbscheme::Entry::Union(dbscheme::Union { + name: "container".to_owned(), + members: vec!["folder".to_owned(), "file".to_owned()], + }) +} + +fn create_containerparent_table() -> dbscheme::Entry { + dbscheme::Entry::Table(dbscheme::Table { + name: "containerparent".to_owned(), + columns: vec![ + dbscheme::Column { + unique: false, + db_type: dbscheme::DbColumnType::Int, + name: "parent".to_string(), + ql_type: ql::Type::AtType("container".to_owned()), + ql_type_is_ref: true, + }, + dbscheme::Column { + unique: true, + db_type: dbscheme::DbColumnType::Int, + name: "child".to_string(), + ql_type: ql::Type::AtType("container".to_owned()), + ql_type_is_ref: true, + }, + ], + keysets: None, + }) +} + fn create_source_location_prefix_table() -> dbscheme::Entry { dbscheme::Entry::Table(dbscheme::Table { name: "sourceLocationPrefix".to_string(), diff --git a/generator/src/ql.rs b/generator/src/ql.rs index d422cde9f3a..e54688e1ed1 100644 --- a/generator/src/ql.rs +++ b/generator/src/ql.rs @@ -79,7 +79,10 @@ pub enum Expression { String(String), Pred(String, Vec), Or(Vec), + And(Vec), Equals(Box, Box), + Exists(Vec, Box), + Dot(Box, String, Vec), } impl fmt::Display for Expression { @@ -105,12 +108,45 @@ impl fmt::Display for Expression { if index > 0 { write!(f, " or ")?; } - write!(f, "{}", disjunct)?; + write!(f, "({})", disjunct)?; + } + Ok(()) + } + } + Expression::And(conjuncts) => { + if conjuncts.is_empty() { + write!(f, "any()") + } else { + for (index, conjunct) in conjuncts.iter().enumerate() { + if index > 0 { + write!(f, " and ")?; + } + write!(f, "{}", conjunct)?; } Ok(()) } } Expression::Equals(a, b) => write!(f, "{} = {}", a, b), + Expression::Exists(params, formula) => { + write!(f, "exists(")?; + for (index, param) in params.iter().enumerate() { + if index > 0 { + write!(f, ", ")?; + } + write!(f, "{}", param)?; + } + write!(f, " | {})", formula) + } + Expression::Dot(x, member_pred, args) => { + write!(f, "{}.{}(", x, member_pred)?; + for (index, arg) in args.iter().enumerate() { + if index > 0 { + write!(f, ", ")?; + } + write!(f, "{}", arg)?; + } + write!(f, ")") + } } } } diff --git a/generator/src/ql_gen.rs b/generator/src/ql_gen.rs index d0251b92ee3..b67b0cc76b2 100644 --- a/generator/src/ql_gen.rs +++ b/generator/src/ql_gen.rs @@ -168,16 +168,32 @@ fn create_location_class() -> ql::Class { param_type: ql::Type::Int, }, ], - body: ql::Expression::Pred( - "location".to_owned(), - vec![ - ql::Expression::Var("this".to_owned()), - ql::Expression::Var("filePath".to_owned()), - ql::Expression::Var("startLine".to_owned()), - ql::Expression::Var("startColumn".to_owned()), - ql::Expression::Var("endLine".to_owned()), - ql::Expression::Var("endColumn".to_owned()), - ], + body: ql::Expression::Exists( + vec![ql::FormalParameter { + param_type: ql::Type::Normal("File".to_owned()), + name: "f".to_owned(), + }], + Box::new(ql::Expression::And(vec![ + ql::Expression::Pred( + "locations_default".to_owned(), + vec![ + ql::Expression::Var("this".to_owned()), + ql::Expression::Var("f".to_owned()), + ql::Expression::Var("startLine".to_owned()), + ql::Expression::Var("startColumn".to_owned()), + ql::Expression::Var("endLine".to_owned()), + ql::Expression::Var("endColumn".to_owned()), + ], + ), + ql::Expression::Equals( + Box::new(ql::Expression::Var("filePath".to_owned())), + Box::new(ql::Expression::Dot( + Box::new(ql::Expression::Var("f".to_owned())), + "getAbsolutePath".to_owned(), + vec![], + )), + ), + ])), ), }; ql::Class { diff --git a/ql/src/codeql_ruby/ast.qll b/ql/src/codeql_ruby/ast.qll index c0ae4a8398e..3b4fead2ab7 100644 --- a/ql/src/codeql_ruby/ast.qll +++ b/ql/src/codeql_ruby/ast.qll @@ -3,13 +3,22 @@ * Automatically generated from the tree-sitter grammar; do not edit */ +class File extends @file { + string getAbsolutePath() { files(this, result, _, _, _) } + + string toString() { result = this.getAbsolutePath() } +} + class Location extends @location { string toString() { result = "Location" } predicate hasLocationInfo( string filePath, int startLine, int startColumn, int endLine, int endColumn ) { - location(this, filePath, startLine, startColumn, endLine, endColumn) + exists(File f | + locations_default(this, f, startLine, startColumn, endLine, endColumn) and + filePath = f.getAbsolutePath() + ) } } diff --git a/ql/src/ruby.dbscheme b/ql/src/ruby.dbscheme index daa970ea1cd..3456e12d155 100644 --- a/ql/src/ruby.dbscheme +++ b/ql/src/ruby.dbscheme @@ -1,15 +1,47 @@ // CodeQL database schema for Ruby // Automatically generated from the tree-sitter grammar; do not edit -location( - unique int id: @location, - string file_path: string ref, +@location = @location_default + +locations_default( + unique int id: @location_default, + int file: @file ref, int start_line: int ref, int start_column: int ref, int end_line: int ref, int end_column: int ref ); +@sourceline = @file + +numlines( + int element_id: @sourceline ref, + int num_lines: int ref, + int num_code: int ref, + int num_comment: int ref +); + +files( + unique int id: @file, + string name: string ref, + string simple: string ref, + string ext: string ref, + int fromSource: int ref +); + +folders( + unique int id: @folder, + string name: string ref, + string simple: string ref +); + +@container = @folder | @file + +containerparent( + int parent: @container ref, + unique int child: @container ref +); + sourceLocationPrefix( string prefix: string ref );