Merge pull request #13969 from hmac/shared-extractor-globs

Shared extractor: support file path globs
This commit is contained in:
Harry Maclean
2023-08-23 16:41:39 +01:00
committed by GitHub
7 changed files with 184 additions and 81 deletions

BIN
ql/Cargo.lock generated

Binary file not shown.

View File

@@ -34,25 +34,25 @@ pub fn run(options: Options) -> std::io::Result<()> {
prefix: "ql",
ts_language: tree_sitter_ql::language(),
node_types: tree_sitter_ql::NODE_TYPES,
file_extensions: vec!["ql".into(), "qll".into()],
file_globs: vec!["*.ql".into(), "*.qll".into()],
},
simple::LanguageSpec {
prefix: "dbscheme",
ts_language: tree_sitter_ql_dbscheme::language(),
node_types: tree_sitter_ql_dbscheme::NODE_TYPES,
file_extensions: vec!["dbscheme".into()],
file_globs: vec!["*.dbscheme".into()],
},
simple::LanguageSpec {
prefix: "json",
ts_language: tree_sitter_json::language(),
node_types: tree_sitter_json::NODE_TYPES,
file_extensions: vec!["json".into(), "jsonl".into(), "jsonc".into()],
file_globs: vec!["*.json".into(), "*.jsonl".into(), "*.jsonc".into()],
},
simple::LanguageSpec {
prefix: "blame",
ts_language: tree_sitter_blame::language(),
node_types: tree_sitter_blame::NODE_TYPES,
file_extensions: vec!["blame".into()],
file_globs: vec!["*.blame".into()],
},
],
trap_dir: options.output_dir,

View File

@@ -1,11 +1,12 @@
[package]
name = "codeql-extractor"
version = "0.1.0"
version = "0.2.0"
edition = "2021"
authors = ["GitHub"]
[dependencies]
flate2 = "1.0"
globset = "0.4"
tree-sitter = "0.20"
tracing = "0.1"
rayon = "1.5.0"
@@ -19,4 +20,5 @@ num_cpus = "1.14.0"
[dev-dependencies]
tree-sitter-ql = { git = "https://github.com/tree-sitter/tree-sitter-ql" }
tree-sitter-json = {git = "https://github.com/tausbn/tree-sitter-json" }
rand = "0.8.5"

View File

@@ -1,7 +1,6 @@
use crate::trap;
use globset::{GlobBuilder, GlobSetBuilder};
use rayon::prelude::*;
use std::collections::HashMap;
use std::ffi::{OsStr, OsString};
use std::fs::File;
use std::io::BufRead;
use std::path::{Path, PathBuf};
@@ -13,7 +12,7 @@ pub struct LanguageSpec {
pub prefix: &'static str,
pub ts_language: tree_sitter::Language,
pub node_types: &'static str,
pub file_extensions: Vec<OsString>,
pub file_globs: Vec<String>,
}
pub struct Extractor {
@@ -83,16 +82,26 @@ impl Extractor {
schemas.push(schema);
}
// Construct a map from file extension -> LanguageSpec
let mut file_extension_language_mapping: HashMap<&OsStr, Vec<usize>> = HashMap::new();
for (i, lang) in self.languages.iter().enumerate() {
for (j, _ext) in lang.file_extensions.iter().enumerate() {
let indexes = file_extension_language_mapping
.entry(&lang.file_extensions[j])
.or_default();
indexes.push(i);
// Construct a single globset containing all language globs,
// and a mapping from glob index to language index.
let (globset, glob_language_mapping) = {
let mut builder = GlobSetBuilder::new();
let mut glob_lang_mapping = vec![];
for (i, lang) in self.languages.iter().enumerate() {
for glob_str in &lang.file_globs {
let glob = GlobBuilder::new(glob_str)
.literal_separator(true)
.build()
.expect("invalid glob");
builder.add(glob);
glob_lang_mapping.push(i);
}
}
}
(
builder.build().expect("failed to build globset"),
glob_lang_mapping,
)
};
let lines: std::io::Result<Vec<String>> =
std::io::BufReader::new(file_list).lines().collect();
@@ -108,18 +117,29 @@ impl Extractor {
let source = std::fs::read(&path)?;
let mut trap_writer = trap::Writer::new();
match path.extension() {
match path.file_name() {
None => {
tracing::error!(?path, "No extension found, skipping file.");
tracing::error!(?path, "No file name found, skipping file.");
}
Some(ext) => {
if let Some(indexes) = file_extension_language_mapping.get(ext) {
for i in indexes {
let lang = &self.languages[*i];
Some(filename) => {
let matches = globset.matches(filename);
if matches.is_empty() {
tracing::error!(?path, "No matching language found, skipping file.");
} else {
let mut languages_processed = vec![false; self.languages.len()];
for m in matches {
let i = glob_language_mapping[m];
if languages_processed[i] {
continue;
}
languages_processed[i] = true;
let lang = &self.languages[i];
crate::extractor::extract(
lang.ts_language,
lang.prefix,
&schemas[*i],
&schemas[i],
&mut diagnostics_writer,
&mut trap_writer,
&path,
@@ -130,11 +150,9 @@ impl Extractor {
std::fs::copy(&path, &src_archive_file)?;
write_trap(&self.trap_dir, &path, &trap_writer, trap_compression)?;
}
} else {
tracing::warn!(?path, "No language matches path, skipping file.");
}
}
};
}
Ok(()) as std::io::Result<()>
})
.expect("failed to extract files");

View File

@@ -0,0 +1,73 @@
use std::io::{Read, Write};
use std::{
fs::File,
path::{Path, PathBuf},
};
use flate2::read::GzDecoder;
/// Paths describing a scratch workspace prepared for an extractor test run.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SourceArchive {
    /// Canonicalized root of the scratch workspace.
    pub root_dir: PathBuf,
    /// Text file listing the source files to extract, one path per line.
    pub file_list: PathBuf,
    /// Directory holding the source files.
    pub source_archive_dir: PathBuf,
    /// Directory intended to receive the generated TRAP output.
    pub trap_dir: PathBuf,
}
/// Builds a scratch workspace for an extractor test under the system temp directory.
///
/// The workspace root is `codeql-extractor-<random u16>` inside
/// `std::env::temp_dir()`, canonicalized after creation. It contains a `trap`
/// directory, a `src` directory, and a `files.txt` listing.
///
/// `files` is a list of `(filename, contents)` pairs. Each file is written into
/// the `src` directory, and its full path is appended to `files.txt`, one path
/// per line.
///
/// # Panics
///
/// Panics if any directory or file cannot be created or written.
pub fn create_source_dir(files: Vec<(&'static str, &'static str)>) -> SourceArchive {
    let scratch = std::env::temp_dir().join(format!("codeql-extractor-{}", rand::random::<u16>()));
    std::fs::create_dir_all(&scratch).unwrap();
    let root_dir = scratch
        .canonicalize()
        .expect("failed to canonicalize root directory");

    let trap_dir = create_dir(&root_dir, "trap");
    let source_archive_dir = create_dir(&root_dir, "src");

    // Write every source file first, remembering where each one landed.
    let written: Vec<PathBuf> = files
        .into_iter()
        .map(|(name, body)| {
            let target = source_archive_dir.join(name);
            File::create(&target)
                .unwrap()
                .write_all(body.as_bytes())
                .unwrap();
            target
        })
        .collect();

    // Then emit the newline-terminated list of those paths.
    let file_list = root_dir.join("files.txt");
    let mut listing = File::create(&file_list).unwrap();
    for entry in &written {
        writeln!(listing, "{}", entry.display()).unwrap();
    }

    SourceArchive {
        root_dir,
        file_list,
        source_archive_dir,
        trap_dir,
    }
}
/// Asserts that a gzip-compressed TRAP file exists for `filename` and starts
/// with the expected header.
///
/// The file is looked up at
/// `<root_dir>/trap/<root_dir minus its leading "/">/src/<filename>.trap.gz`.
/// Its decompressed contents must begin with `// Auto-generated TRAP file for`.
///
/// # Panics
///
/// Panics if `root_dir` does not start with `/`, if the TRAP file cannot be
/// opened or decompressed, or if the header does not match.
pub fn expect_trap_file(root_dir: &Path, filename: &str) {
    const HEADER: &[u8] = b"// Auto-generated TRAP file for";

    // Make the root relative with the path API instead of a display-string
    // round-trip, which would replace any non-UTF-8 bytes in the path.
    let root_dir_relative = root_dir.strip_prefix("/").unwrap_or_else(|_| {
        panic!(
            "root directory {} does not start with \"/\"",
            root_dir.display()
        )
    });
    let trap_gz = root_dir
        .join("trap")
        .join(root_dir_relative)
        .join("src")
        .join(format!("{filename}.trap.gz"));

    let file = File::open(&trap_gz)
        .unwrap_or_else(|e| panic!("failed to open {}: {e}", trap_gz.display()));
    let mut decoder = GzDecoder::new(file);

    // Only the fixed-length header prefix is read; the rest of the file is not inspected.
    let mut first_line = [0u8; HEADER.len()];
    decoder
        .read_exact(&mut first_line)
        .unwrap_or_else(|e| panic!("failed to read header from {}: {e}", trap_gz.display()));
    assert_eq!(first_line.as_slice(), HEADER);
}
/// Creates `path` beneath `root`, including any missing parent directories,
/// and returns the joined path.
///
/// # Panics
///
/// Panics with "Failed to create directory" if the directory cannot be created.
fn create_dir(root: &Path, path: impl AsRef<Path>) -> PathBuf {
    let target = root.join(path);
    match std::fs::create_dir_all(&target) {
        Ok(()) => target,
        Err(err) => panic!("Failed to create directory: {err:?}"),
    }
}

View File

@@ -1,13 +1,12 @@
use std::fs::File;
use std::io::{Read, Write};
use std::path::{Path, PathBuf};
use codeql_extractor::extractor::simple;
use codeql_extractor::trap;
use flate2::read::GzDecoder;
use tree_sitter_ql;
/// An very simple happy-path test.
mod common;
use common::{create_source_dir, expect_trap_file, SourceArchive};
/// A very simple happy-path test.
/// We run the extractor using the tree-sitter-ql grammar and a single source file,
/// and check that we get a reasonable-looking trap file in the expected location.
#[test]
@@ -16,31 +15,15 @@ fn simple_extractor() {
prefix: "ql",
ts_language: tree_sitter_ql::language(),
node_types: tree_sitter_ql::NODE_TYPES,
file_extensions: vec!["qll".into()],
file_globs: vec!["*.qll".into()],
};
let root_dir = std::env::temp_dir().join(format!("codeql-extractor-{}", rand::random::<u16>()));
std::fs::create_dir_all(&root_dir).unwrap();
let trap_dir = create_dir(&root_dir, "trap");
let source_archive_dir = create_dir(&root_dir, "src");
// Create foo.qll source file
let foo_qll = {
let path = source_archive_dir.join("foo.qll");
let mut file = File::create(&path).expect("Failed to create src/foo.qll");
file.write_all(b"predicate p(int a) { a = 1 }")
.expect("Failed to write to foo.qll");
PathBuf::from(path)
};
let file_list = {
let path = root_dir.join("files.txt");
let mut file = File::create(&path).expect("Failed to create files.txt");
file.write_all(foo_qll.as_path().display().to_string().as_bytes())
.expect("Failed to write to files.txt");
path
};
let SourceArchive {
root_dir,
file_list,
source_archive_dir,
trap_dir,
} = create_source_dir(vec![("foo.qll", "predicate p(int a) { a = 1 }")]);
let extractor = simple::Extractor {
prefix: "ql".to_string(),
@@ -51,31 +34,7 @@ fn simple_extractor() {
trap_compression: Ok(trap::Compression::Gzip),
};
// The extractor should run successfully
extractor.run().unwrap();
// Check for the presence of $root/trap/$root/src/foo.qll
{
let root_dir_relative = {
let r = root_dir.as_path().display().to_string();
r.strip_prefix("/").unwrap().to_string()
};
let foo_qll_trap_gz = root_dir
.join("trap")
.join(root_dir_relative)
.join("src/foo.qll.trap.gz");
let mut decoder =
GzDecoder::new(File::open(foo_qll_trap_gz).expect("Failed to open foo.qll.trap.gz"));
let mut first_line = [0; 31];
decoder
.read_exact(&mut first_line)
.expect("Failed to read from foo.qll.trap.gz");
assert_eq!(first_line.as_slice(), b"// Auto-generated TRAP file for");
}
}
fn create_dir(root: &Path, path: impl AsRef<Path>) -> PathBuf {
let full_path = root.join(path);
std::fs::create_dir_all(&full_path).expect("Failed to create directory");
full_path.into()
expect_trap_file(&root_dir, "foo.qll");
}

View File

@@ -0,0 +1,51 @@
use codeql_extractor::extractor::simple;
use codeql_extractor::trap;
use tree_sitter_ql;
mod common;
use common::{create_source_dir, expect_trap_file, SourceArchive};
/// Runs the simple extractor with two languages (QL and JSON) over three files
/// and checks that a TRAP file is produced for each of them.
///
/// This mirrors the `simple_extractor` test. It is in a separate crate because
/// the simple extractor API sets up a global thread pool, and therefore can't
/// be called twice in the same process.
#[test]
fn multiple_language_extractor() {
    // (file name, contents) pairs; every entry is expected to yield a TRAP file.
    let fixtures = vec![
        ("foo.qll", "predicate p(int a) { a = 1 }"),
        ("bar.json", r#"{"a": 1}"#),
        ("Jsonfile", r#"{"b": 2}"#),
    ];
    let expected_names: Vec<&str> = fixtures.iter().map(|(name, _)| *name).collect();

    let languages = vec![
        simple::LanguageSpec {
            prefix: "ql",
            ts_language: tree_sitter_ql::language(),
            node_types: tree_sitter_ql::NODE_TYPES,
            file_globs: vec!["*.qll".into()],
        },
        simple::LanguageSpec {
            prefix: "json",
            ts_language: tree_sitter_json::language(),
            node_types: tree_sitter_json::NODE_TYPES,
            // The second glob matches the extension-less `Jsonfile` fixture.
            file_globs: vec!["*.json".into(), "*Jsonfile".into()],
        },
    ];

    let SourceArchive {
        root_dir,
        file_list,
        source_archive_dir,
        trap_dir,
    } = create_source_dir(fixtures);

    let extractor = simple::Extractor {
        prefix: "ql".to_string(),
        languages,
        trap_dir,
        source_archive_dir,
        file_list,
        trap_compression: Ok(trap::Compression::Gzip),
    };

    extractor.run().unwrap();

    for name in expected_names {
        expect_trap_file(&root_dir, name);
    }
}