Merge pull request #83 from github/threads

Parallelize extraction
This commit is contained in:
Nick Rolfe
2021-01-07 17:14:41 +00:00
committed by GitHub
5 changed files with 203 additions and 53 deletions

View File

@@ -27,6 +27,7 @@ jobs:
unzip -q codeql-linux64.zip
env:
GITHUB_TOKEN: ${{ github.token }}
CODEQL_THREADS: 4 # TODO: remove this once it's set by the CLI
- uses: actions/cache@v2
with:
path: |
@@ -46,6 +47,7 @@ jobs:
run: |
codeql/codeql database create \
--search-path "${{ github.workspace }}" \
--threads 4 \
--language ruby --source-root "${{ github.workspace }}/repo" \
"${{ runner.temp }}/database"
- name: Measure database

110
Cargo.lock generated
View File

@@ -108,6 +108,12 @@ dependencies = [
"vec_map",
]
[[package]]
name = "const_fn"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd51eab21ab4fd6a3bf889e2d0958c0a6e3a61ad04260325e919e652a2a62826"
[[package]]
name = "crc32fast"
version = "1.2.1"
@@ -117,6 +123,58 @@ dependencies = [
"cfg-if 1.0.0",
]
[[package]]
name = "crossbeam-channel"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775"
dependencies = [
"cfg-if 1.0.0",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94af6efb46fef72616855b036a624cf27ba656ffc9be1b9a3c931cfc7749a9a9"
dependencies = [
"cfg-if 1.0.0",
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1aaa739f95311c2c7887a76863f500026092fb1dce0161dab577e559ef3569d"
dependencies = [
"cfg-if 1.0.0",
"const_fn",
"crossbeam-utils",
"lazy_static",
"memoffset",
"scopeguard",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02d96d1e189ef58269ebe5b97953da3274d83a93af647c2ddd6f9dab28cedb8d"
dependencies = [
"autocfg",
"cfg-if 1.0.0",
"lazy_static",
]
[[package]]
name = "either"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
[[package]]
name = "flate2"
version = "1.0.19"
@@ -206,6 +264,15 @@ version = "2.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525"
[[package]]
name = "memoffset"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "157b4208e3059a8f9e78d559edc658e13df41410cb3ae03979c83130067fdd87"
dependencies = [
"autocfg",
]
[[package]]
name = "miniz_oxide"
version = "0.4.3"
@@ -243,6 +310,16 @@ dependencies = [
"autocfg",
]
[[package]]
name = "num_cpus"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3"
dependencies = [
"hermit-abi",
"libc",
]
[[package]]
name = "pin-project-lite"
version = "0.2.0"
@@ -267,6 +344,31 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "rayon"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b0d8e0819fadc20c74ea8373106ead0600e3a67ef1fe8da56e39b9ae7275674"
dependencies = [
"autocfg",
"crossbeam-deque",
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ab346ac5921dc62ffa9f89b7a773907511cdfa5490c572ae9be1be33e8afa4a"
dependencies = [
"crossbeam-channel",
"crossbeam-deque",
"crossbeam-utils",
"lazy_static",
"num_cpus",
]
[[package]]
name = "regex"
version = "1.4.2"
@@ -302,6 +404,8 @@ dependencies = [
"clap",
"flate2",
"node-types",
"num_cpus",
"rayon",
"tracing",
"tracing-subscriber",
"tree-sitter",
@@ -339,6 +443,12 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea6a9290e3c9cf0f18145ef7ffa62d68ee0bf5fcd651017e586dc7fd5da448c2"
[[package]]
name = "scopeguard"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "semver"
version = "0.9.0"

View File

@@ -14,3 +14,5 @@ tree-sitter-ruby = { git = "https://github.com/tree-sitter/tree-sitter-ruby.git"
clap = "2.33"
tracing = "0.1"
tracing-subscriber = { version = "0.2", features = ["env-filter"] }
rayon = "1.5.0"
num_cpus = "1.13.0"

View File

@@ -148,55 +148,40 @@ impl TrapWriter {
}
}
pub struct Extractor {
pub parser: Parser,
pub schema: NodeTypeMap,
}
/// Extracts the source file at `path`, which is assumed to be canonicalized.
pub fn extract(language: Language, schema: &NodeTypeMap, path: &Path) -> std::io::Result<Program> {
let span = span!(
Level::TRACE,
"extract",
file = %path.display()
);
let _enter = span.enter();
info!("extracting: {}", path.display());
pub fn create(language: Language, schema: NodeTypeMap) -> Extractor {
let mut parser = Parser::new();
parser.set_language(language).unwrap();
let source = std::fs::read(&path)?;
let tree = parser.parse(&source, None).expect("Failed to parse file");
let mut trap_writer = new_trap_writer();
trap_writer.comment(format!("Auto-generated TRAP file for {}", path.display()));
let file_label = &trap_writer.populate_file(path);
let mut visitor = Visitor {
source: &source,
trap_writer: trap_writer,
// TODO: should we handle path strings that are not valid UTF8 better?
path: format!("{}", path.display()),
file_label: *file_label,
token_counter: 0,
toplevel_child_counter: 0,
stack: Vec::new(),
schema,
};
traverse(&tree, &mut visitor);
Extractor { parser, schema }
}
impl Extractor {
/// Extracts the source file at `path`, which is assumed to be canonicalized.
pub fn extract<'a>(&'a mut self, path: &Path) -> std::io::Result<Program> {
let span = span!(
Level::TRACE,
"extract",
file = %path.display()
);
let _enter = span.enter();
info!("extracting: {}", path.display());
let source = std::fs::read(&path)?;
let tree = &self
.parser
.parse(&source, None)
.expect("Failed to parse file");
let mut trap_writer = new_trap_writer();
trap_writer.comment(format!("Auto-generated TRAP file for {}", path.display()));
let file_label = &trap_writer.populate_file(path);
let mut visitor = Visitor {
source: &source,
trap_writer: trap_writer,
// TODO: should we handle path strings that are not valid UTF8 better?
path: format!("{}", path.display()),
file_label: *file_label,
token_counter: 0,
toplevel_child_counter: 0,
stack: Vec::new(),
schema: &self.schema,
};
traverse(&tree, &mut visitor);
&self.parser.reset();
Ok(Program(visitor.trap_writer.trap_output))
}
parser.reset();
Ok(Program(visitor.trap_writer.trap_output))
}
/// Normalizes the path according to the common CodeQL specification. Assumes that

View File

@@ -1,7 +1,10 @@
mod extractor;
extern crate num_cpus;
use clap;
use flate2::write::GzEncoder;
use rayon::prelude::*;
use std::fs;
use std::io::{BufRead, BufWriter, Write};
use std::path::{Path, PathBuf};
@@ -42,6 +45,39 @@ impl TrapCompression {
}
}
/// Gets the number of threads the extractor should use, by reading the
/// `CODEQL_THREADS` environment variable and using it as described in the
/// extractor spec:
///
/// "If the number is positive, it indicates the number of threads that should
/// be used. If the number is negative or zero, it should be added to the number
/// of cores available on the machine to determine how many threads to use
/// (minimum of 1). If unspecified, should be considered as set to 1."
///
/// A value that cannot be parsed as an `i32` is logged as an error and treated
/// as 1. The returned count is always at least 1.
fn num_codeql_threads() -> usize {
    match std::env::var("CODEQL_THREADS") {
        // Use 1 thread if the environment variable isn't set (or can't be
        // read as valid Unicode).
        Err(_) => 1,
        Ok(num) => match num.parse::<i32>() {
            Ok(num) if num <= 0 => {
                // Widen before negating so that `i32::MIN` cannot overflow.
                let reduction = (-i64::from(num)) as usize;
                // Saturate instead of underflowing when the reduction is at
                // least the core count, then enforce the spec's minimum of 1.
                num_cpus::get().saturating_sub(reduction).max(1)
            }
            Ok(num) => num as usize,
            Err(_) => {
                tracing::error!(
                    "Unable to parse CODEQL_THREADS value '{}'; defaulting to 1 thread.",
                    &num
                );
                1
            }
        },
    }
}
fn main() -> std::io::Result<()> {
tracing_subscriber::fmt()
.with_target(false)
@@ -50,6 +86,21 @@ fn main() -> std::io::Result<()> {
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.init();
let num_threads = num_codeql_threads();
tracing::info!(
"Using {} {}",
num_threads,
if num_threads == 1 {
"thread"
} else {
"threads"
}
);
rayon::ThreadPoolBuilder::new()
.num_threads(num_threads)
.build_global()
.unwrap();
let matches = clap::App::new("Ruby extractor")
.version("1.0")
.author("GitHub")
@@ -76,12 +127,13 @@ fn main() -> std::io::Result<()> {
let language = tree_sitter_ruby::language();
let schema = node_types::read_node_types_str(tree_sitter_ruby::NODE_TYPES)?;
let mut extractor = extractor::create(language, schema);
for line in std::io::BufReader::new(file_list).lines() {
let path = PathBuf::from(line?).canonicalize()?;
let lines: std::io::Result<Vec<String>> = std::io::BufReader::new(file_list).lines().collect();
let lines = lines?;
lines.par_iter().try_for_each(|line| {
let path = PathBuf::from(line).canonicalize()?;
let trap_file = path_for(&trap_dir, &path, trap_compression.extension());
let src_archive_file = path_for(&src_archive_dir, &path, "");
let trap = extractor.extract(&path)?;
let trap = extractor::extract(language, &schema, &path)?;
std::fs::create_dir_all(&src_archive_file.parent().unwrap())?;
std::fs::copy(&path, &src_archive_file)?;
std::fs::create_dir_all(&trap_file.parent().unwrap())?;
@@ -89,15 +141,14 @@ fn main() -> std::io::Result<()> {
let mut trap_file = BufWriter::new(trap_file);
match trap_compression {
TrapCompression::None => {
write!(trap_file, "{}", trap)?;
write!(trap_file, "{}", trap)
}
TrapCompression::Gzip => {
let mut compressed_writer = GzEncoder::new(trap_file, flate2::Compression::fast());
write!(compressed_writer, "{}", trap)?;
write!(compressed_writer, "{}", trap)
}
}
}
return Ok(());
})
}
fn path_for(dir: &Path, path: &Path, ext: &str) -> PathBuf {