mirror of
https://github.com/github/codeql.git
synced 2026-04-25 00:35:20 +02:00
Merge pull request #13969 from hmac/shared-extractor-globs
Shared extractor: support file path globs
This commit is contained in:
BIN
ql/Cargo.lock
generated
BIN
ql/Cargo.lock
generated
Binary file not shown.
@@ -34,25 +34,25 @@ pub fn run(options: Options) -> std::io::Result<()> {
|
||||
prefix: "ql",
|
||||
ts_language: tree_sitter_ql::language(),
|
||||
node_types: tree_sitter_ql::NODE_TYPES,
|
||||
file_extensions: vec!["ql".into(), "qll".into()],
|
||||
file_globs: vec!["*.ql".into(), "*.qll".into()],
|
||||
},
|
||||
simple::LanguageSpec {
|
||||
prefix: "dbscheme",
|
||||
ts_language: tree_sitter_ql_dbscheme::language(),
|
||||
node_types: tree_sitter_ql_dbscheme::NODE_TYPES,
|
||||
file_extensions: vec!["dbscheme".into()],
|
||||
file_globs: vec!["*.dbscheme".into()],
|
||||
},
|
||||
simple::LanguageSpec {
|
||||
prefix: "json",
|
||||
ts_language: tree_sitter_json::language(),
|
||||
node_types: tree_sitter_json::NODE_TYPES,
|
||||
file_extensions: vec!["json".into(), "jsonl".into(), "jsonc".into()],
|
||||
file_globs: vec!["*.json".into(), "*.jsonl".into(), "*.jsonc".into()],
|
||||
},
|
||||
simple::LanguageSpec {
|
||||
prefix: "blame",
|
||||
ts_language: tree_sitter_blame::language(),
|
||||
node_types: tree_sitter_blame::NODE_TYPES,
|
||||
file_extensions: vec!["blame".into()],
|
||||
file_globs: vec!["*.blame".into()],
|
||||
},
|
||||
],
|
||||
trap_dir: options.output_dir,
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
[package]
|
||||
name = "codeql-extractor"
|
||||
version = "0.1.0"
|
||||
version = "0.2.0"
|
||||
edition = "2021"
|
||||
authors = ["GitHub"]
|
||||
|
||||
[dependencies]
|
||||
flate2 = "1.0"
|
||||
globset = "0.4"
|
||||
tree-sitter = "0.20"
|
||||
tracing = "0.1"
|
||||
rayon = "1.5.0"
|
||||
@@ -19,4 +20,5 @@ num_cpus = "1.14.0"
|
||||
|
||||
[dev-dependencies]
|
||||
tree-sitter-ql = { git = "https://github.com/tree-sitter/tree-sitter-ql" }
|
||||
tree-sitter-json = {git = "https://github.com/tausbn/tree-sitter-json" }
|
||||
rand = "0.8.5"
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use crate::trap;
|
||||
use globset::{GlobBuilder, GlobSetBuilder};
|
||||
use rayon::prelude::*;
|
||||
use std::collections::HashMap;
|
||||
use std::ffi::{OsStr, OsString};
|
||||
use std::fs::File;
|
||||
use std::io::BufRead;
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -13,7 +12,7 @@ pub struct LanguageSpec {
|
||||
pub prefix: &'static str,
|
||||
pub ts_language: tree_sitter::Language,
|
||||
pub node_types: &'static str,
|
||||
pub file_extensions: Vec<OsString>,
|
||||
pub file_globs: Vec<String>,
|
||||
}
|
||||
|
||||
pub struct Extractor {
|
||||
@@ -83,16 +82,26 @@ impl Extractor {
|
||||
schemas.push(schema);
|
||||
}
|
||||
|
||||
// Construct a map from file extension -> LanguageSpec
|
||||
let mut file_extension_language_mapping: HashMap<&OsStr, Vec<usize>> = HashMap::new();
|
||||
for (i, lang) in self.languages.iter().enumerate() {
|
||||
for (j, _ext) in lang.file_extensions.iter().enumerate() {
|
||||
let indexes = file_extension_language_mapping
|
||||
.entry(&lang.file_extensions[j])
|
||||
.or_default();
|
||||
indexes.push(i);
|
||||
// Construct a single globset containing all language globs,
|
||||
// and a mapping from glob index to language index.
|
||||
let (globset, glob_language_mapping) = {
|
||||
let mut builder = GlobSetBuilder::new();
|
||||
let mut glob_lang_mapping = vec![];
|
||||
for (i, lang) in self.languages.iter().enumerate() {
|
||||
for glob_str in &lang.file_globs {
|
||||
let glob = GlobBuilder::new(glob_str)
|
||||
.literal_separator(true)
|
||||
.build()
|
||||
.expect("invalid glob");
|
||||
builder.add(glob);
|
||||
glob_lang_mapping.push(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
(
|
||||
builder.build().expect("failed to build globset"),
|
||||
glob_lang_mapping,
|
||||
)
|
||||
};
|
||||
|
||||
let lines: std::io::Result<Vec<String>> =
|
||||
std::io::BufReader::new(file_list).lines().collect();
|
||||
@@ -108,18 +117,29 @@ impl Extractor {
|
||||
let source = std::fs::read(&path)?;
|
||||
let mut trap_writer = trap::Writer::new();
|
||||
|
||||
match path.extension() {
|
||||
match path.file_name() {
|
||||
None => {
|
||||
tracing::error!(?path, "No extension found, skipping file.");
|
||||
tracing::error!(?path, "No file name found, skipping file.");
|
||||
}
|
||||
Some(ext) => {
|
||||
if let Some(indexes) = file_extension_language_mapping.get(ext) {
|
||||
for i in indexes {
|
||||
let lang = &self.languages[*i];
|
||||
Some(filename) => {
|
||||
let matches = globset.matches(filename);
|
||||
if matches.is_empty() {
|
||||
tracing::error!(?path, "No matching language found, skipping file.");
|
||||
} else {
|
||||
let mut languages_processed = vec![false; self.languages.len()];
|
||||
|
||||
for m in matches {
|
||||
let i = glob_language_mapping[m];
|
||||
if languages_processed[i] {
|
||||
continue;
|
||||
}
|
||||
languages_processed[i] = true;
|
||||
let lang = &self.languages[i];
|
||||
|
||||
crate::extractor::extract(
|
||||
lang.ts_language,
|
||||
lang.prefix,
|
||||
&schemas[*i],
|
||||
&schemas[i],
|
||||
&mut diagnostics_writer,
|
||||
&mut trap_writer,
|
||||
&path,
|
||||
@@ -130,11 +150,9 @@ impl Extractor {
|
||||
std::fs::copy(&path, &src_archive_file)?;
|
||||
write_trap(&self.trap_dir, &path, &trap_writer, trap_compression)?;
|
||||
}
|
||||
} else {
|
||||
tracing::warn!(?path, "No language matches path, skipping file.");
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
Ok(()) as std::io::Result<()>
|
||||
})
|
||||
.expect("failed to extract files");
|
||||
|
||||
73
shared/tree-sitter-extractor/tests/common/mod.rs
Normal file
73
shared/tree-sitter-extractor/tests/common/mod.rs
Normal file
@@ -0,0 +1,73 @@
|
||||
use std::io::{Read, Write};
|
||||
use std::{
|
||||
fs::File,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use flate2::read::GzDecoder;
|
||||
|
||||
/// Paths of the temporary on-disk layout produced by [`create_source_dir`].
///
/// All four paths are handed back to the caller so a test can pass them on to
/// the extractor and later inspect the output.
#[derive(Debug)]
pub struct SourceArchive {
    /// Canonicalized root directory that contains everything else.
    pub root_dir: PathBuf,
    /// `<root_dir>/files.txt`: lists the path of every created source file, one per line.
    pub file_list: PathBuf,
    /// `<root_dir>/src`: directory the source files are written into.
    pub source_archive_dir: PathBuf,
    /// `<root_dir>/trap`: directory created for TRAP output.
    pub trap_dir: PathBuf,
}
|
||||
|
||||
pub fn create_source_dir(files: Vec<(&'static str, &'static str)>) -> SourceArchive {
|
||||
let root_dir = std::env::temp_dir().join(format!("codeql-extractor-{}", rand::random::<u16>()));
|
||||
std::fs::create_dir_all(&root_dir).unwrap();
|
||||
let root_dir = root_dir
|
||||
.canonicalize()
|
||||
.expect("failed to canonicalize root directory");
|
||||
|
||||
let trap_dir = create_dir(&root_dir, "trap");
|
||||
let source_archive_dir = create_dir(&root_dir, "src");
|
||||
|
||||
let mut file_paths = vec![];
|
||||
for (filename, contents) in files {
|
||||
let path = source_archive_dir.join(filename);
|
||||
let mut file = File::create(&path).unwrap();
|
||||
file.write_all(contents.as_bytes()).unwrap();
|
||||
file_paths.push(PathBuf::from(path));
|
||||
}
|
||||
|
||||
let file_list = {
|
||||
let path = root_dir.join("files.txt");
|
||||
let mut file = File::create(&path).unwrap();
|
||||
for path in file_paths {
|
||||
file.write_all(path.as_path().display().to_string().as_bytes())
|
||||
.unwrap();
|
||||
file.write_all(b"\n").unwrap();
|
||||
}
|
||||
path
|
||||
};
|
||||
|
||||
SourceArchive {
|
||||
root_dir,
|
||||
file_list,
|
||||
source_archive_dir,
|
||||
trap_dir,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn expect_trap_file(root_dir: &Path, filename: &str) {
|
||||
let root_dir_relative = {
|
||||
let r = root_dir.display().to_string();
|
||||
r.strip_prefix("/").unwrap().to_string()
|
||||
};
|
||||
let trap_gz = root_dir
|
||||
.join("trap")
|
||||
.join(root_dir_relative)
|
||||
.join("src")
|
||||
.join(format!("{filename}.trap.gz"));
|
||||
let mut decoder = GzDecoder::new(File::open(trap_gz).unwrap());
|
||||
let mut first_line = [0; 31];
|
||||
decoder.read_exact(&mut first_line).unwrap();
|
||||
assert_eq!(first_line.as_slice(), b"// Auto-generated TRAP file for");
|
||||
}
|
||||
|
||||
/// Creates the directory `root/path`, including any missing parents, and
/// returns the joined path.
///
/// # Panics
///
/// Panics if the directory cannot be created.
fn create_dir(root: &Path, path: impl AsRef<Path>) -> PathBuf {
    let full_path = root.join(path);
    std::fs::create_dir_all(&full_path).expect("Failed to create directory");
    full_path
}
|
||||
@@ -1,13 +1,12 @@
|
||||
use std::fs::File;
|
||||
use std::io::{Read, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use codeql_extractor::extractor::simple;
|
||||
use codeql_extractor::trap;
|
||||
use flate2::read::GzDecoder;
|
||||
|
||||
use tree_sitter_ql;
|
||||
|
||||
/// An very simple happy-path test.
|
||||
mod common;
|
||||
use common::{create_source_dir, expect_trap_file, SourceArchive};
|
||||
|
||||
/// A very simple happy-path test.
|
||||
/// We run the extractor using the tree-sitter-ql grammar and a single source file,
|
||||
/// and check that we get a reasonable-looking trap file in the expected location.
|
||||
#[test]
|
||||
@@ -16,31 +15,15 @@ fn simple_extractor() {
|
||||
prefix: "ql",
|
||||
ts_language: tree_sitter_ql::language(),
|
||||
node_types: tree_sitter_ql::NODE_TYPES,
|
||||
file_extensions: vec!["qll".into()],
|
||||
file_globs: vec!["*.qll".into()],
|
||||
};
|
||||
|
||||
let root_dir = std::env::temp_dir().join(format!("codeql-extractor-{}", rand::random::<u16>()));
|
||||
std::fs::create_dir_all(&root_dir).unwrap();
|
||||
|
||||
let trap_dir = create_dir(&root_dir, "trap");
|
||||
let source_archive_dir = create_dir(&root_dir, "src");
|
||||
|
||||
// Create foo.qll source file
|
||||
let foo_qll = {
|
||||
let path = source_archive_dir.join("foo.qll");
|
||||
let mut file = File::create(&path).expect("Failed to create src/foo.qll");
|
||||
file.write_all(b"predicate p(int a) { a = 1 }")
|
||||
.expect("Failed to write to foo.qll");
|
||||
PathBuf::from(path)
|
||||
};
|
||||
|
||||
let file_list = {
|
||||
let path = root_dir.join("files.txt");
|
||||
let mut file = File::create(&path).expect("Failed to create files.txt");
|
||||
file.write_all(foo_qll.as_path().display().to_string().as_bytes())
|
||||
.expect("Failed to write to files.txt");
|
||||
path
|
||||
};
|
||||
let SourceArchive {
|
||||
root_dir,
|
||||
file_list,
|
||||
source_archive_dir,
|
||||
trap_dir,
|
||||
} = create_source_dir(vec![("foo.qll", "predicate p(int a) { a = 1 }")]);
|
||||
|
||||
let extractor = simple::Extractor {
|
||||
prefix: "ql".to_string(),
|
||||
@@ -51,31 +34,7 @@ fn simple_extractor() {
|
||||
trap_compression: Ok(trap::Compression::Gzip),
|
||||
};
|
||||
|
||||
// The extractor should run successfully
|
||||
extractor.run().unwrap();
|
||||
|
||||
// Check for the presence of $root/trap/$root/src/foo.qll
|
||||
{
|
||||
let root_dir_relative = {
|
||||
let r = root_dir.as_path().display().to_string();
|
||||
r.strip_prefix("/").unwrap().to_string()
|
||||
};
|
||||
let foo_qll_trap_gz = root_dir
|
||||
.join("trap")
|
||||
.join(root_dir_relative)
|
||||
.join("src/foo.qll.trap.gz");
|
||||
let mut decoder =
|
||||
GzDecoder::new(File::open(foo_qll_trap_gz).expect("Failed to open foo.qll.trap.gz"));
|
||||
let mut first_line = [0; 31];
|
||||
decoder
|
||||
.read_exact(&mut first_line)
|
||||
.expect("Failed to read from foo.qll.trap.gz");
|
||||
assert_eq!(first_line.as_slice(), b"// Auto-generated TRAP file for");
|
||||
}
|
||||
}
|
||||
|
||||
fn create_dir(root: &Path, path: impl AsRef<Path>) -> PathBuf {
|
||||
let full_path = root.join(path);
|
||||
std::fs::create_dir_all(&full_path).expect("Failed to create directory");
|
||||
full_path.into()
|
||||
expect_trap_file(&root_dir, "foo.qll");
|
||||
}
|
||||
|
||||
51
shared/tree-sitter-extractor/tests/multiple_languages.rs
Normal file
51
shared/tree-sitter-extractor/tests/multiple_languages.rs
Normal file
@@ -0,0 +1,51 @@
|
||||
use codeql_extractor::extractor::simple;
|
||||
use codeql_extractor::trap;
|
||||
use tree_sitter_ql;
|
||||
|
||||
mod common;
|
||||
use common::{create_source_dir, expect_trap_file, SourceArchive};
|
||||
|
||||
/// Like the `simple_extractor` test but with multiple languages.
/// This is in a separate crate because the simple extractor API sets up a
/// global thread pool, and therefore can't be called twice in the same process.
#[test]
fn multiple_language_extractor() {
    let lang_ql = simple::LanguageSpec {
        prefix: "ql",
        ts_language: tree_sitter_ql::language(),
        node_types: tree_sitter_ql::NODE_TYPES,
        file_globs: vec!["*.qll".into()],
    };
    let lang_json = simple::LanguageSpec {
        prefix: "json",
        ts_language: tree_sitter_json::language(),
        node_types: tree_sitter_json::NODE_TYPES,
        // The second glob targets a file that has no extension at all, which
        // a plain extension match could not select.
        file_globs: vec!["*.json".into(), "*Jsonfile".into()],
    };

    // One input per glob: `*.qll`, `*.json` and `*Jsonfile`.
    let SourceArchive {
        root_dir,
        file_list,
        source_archive_dir,
        trap_dir,
    } = create_source_dir(vec![
        ("foo.qll", "predicate p(int a) { a = 1 }"),
        ("bar.json", "{\"a\": 1}"),
        ("Jsonfile", "{\"b\": 2}"),
    ]);

    let extractor = simple::Extractor {
        prefix: "ql".to_string(),
        languages: vec![lang_ql, lang_json],
        trap_dir,
        source_archive_dir,
        file_list,
        trap_compression: Ok(trap::Compression::Gzip),
    };

    // The extractor should run successfully.
    extractor.run().unwrap();

    // Every input file must have produced a TRAP file.
    expect_trap_file(&root_dir, "foo.qll");
    expect_trap_file(&root_dir, "bar.json");
    expect_trap_file(&root_dir, "Jsonfile");
}
|
||||
Reference in New Issue
Block a user