diff --git a/Cargo.lock b/Cargo.lock index a493f322a57..cdc026359c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -108,6 +108,12 @@ dependencies = [ "vec_map", ] +[[package]] +name = "const_fn" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd51eab21ab4fd6a3bf889e2d0958c0a6e3a61ad04260325e919e652a2a62826" + [[package]] name = "crc32fast" version = "1.2.1" @@ -117,6 +123,58 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "crossbeam-channel" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94af6efb46fef72616855b036a624cf27ba656ffc9be1b9a3c931cfc7749a9a9" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1aaa739f95311c2c7887a76863f500026092fb1dce0161dab577e559ef3569d" +dependencies = [ + "cfg-if 1.0.0", + "const_fn", + "crossbeam-utils", + "lazy_static", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02d96d1e189ef58269ebe5b97953da3274d83a93af647c2ddd6f9dab28cedb8d" +dependencies = [ + "autocfg", + "cfg-if 1.0.0", + "lazy_static", +] + +[[package]] +name = "either" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" + [[package]] name = "flate2" version = "1.0.19" @@ -206,6 +264,15 @@ version = "2.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525" +[[package]] +name = "memoffset" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "157b4208e3059a8f9e78d559edc658e13df41410cb3ae03979c83130067fdd87" +dependencies = [ + "autocfg", +] + [[package]] name = "miniz_oxide" version = "0.4.3" @@ -243,6 +310,16 @@ dependencies = [ "autocfg", ] +[[package]] +name = "num_cpus" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "pin-project-lite" version = "0.2.0" @@ -267,6 +344,31 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rayon" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b0d8e0819fadc20c74ea8373106ead0600e3a67ef1fe8da56e39b9ae7275674" +dependencies = [ + "autocfg", + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ab346ac5921dc62ffa9f89b7a773907511cdfa5490c572ae9be1be33e8afa4a" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "lazy_static", + "num_cpus", +] + [[package]] name = "regex" version = "1.4.2" @@ -302,6 +404,8 @@ dependencies = [ "clap", "flate2", "node-types", + "num_cpus", + "rayon", "tracing", "tracing-subscriber", "tree-sitter", @@ -339,6 +443,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea6a9290e3c9cf0f18145ef7ffa62d68ee0bf5fcd651017e586dc7fd5da448c2" +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + [[package]] name = "semver" version = "0.9.0" diff --git 
a/extractor/Cargo.toml b/extractor/Cargo.toml
index 180e767b53b..5d98f99bebe 100644
--- a/extractor/Cargo.toml
+++ b/extractor/Cargo.toml
@@ -14,3 +14,5 @@ tree-sitter-ruby = { git = "https://github.com/tree-sitter/tree-sitter-ruby.git"
 clap = "2.33"
 tracing = "0.1"
 tracing-subscriber = { version = "0.2", features = ["env-filter"] }
+rayon = "1.5.0"
+num_cpus = "1.13.0"
diff --git a/extractor/src/extractor.rs b/extractor/src/extractor.rs
index 11d101bd653..dcd762e43cb 100644
--- a/extractor/src/extractor.rs
+++ b/extractor/src/extractor.rs
@@ -4,7 +4,7 @@ use std::collections::BTreeSet as Set;
 use std::fmt;
 use std::path::Path;
 use tracing::{error, info, span, Level};
-use tree_sitter::{Language, Node, Parser, Tree};
+use tree_sitter::{Node, Parser, Tree};
 
 struct TrapWriter {
     /// The accumulated trap entries
@@ -148,55 +148,51 @@ impl TrapWriter {
     }
 }
 
-pub struct Extractor {
-    pub parser: Parser,
-    pub schema: NodeTypeMap,
-}
+/// Extracts the source file at `path`, which is assumed to be canonicalized.
+///
+/// The file is read from disk and parsed with `parser`; the resulting tree is
+/// traversed by a `Visitor` that holds `schema`, and the TRAP entries it
+/// accumulates are returned as a `Program`. On success `parser` is reset
+/// before returning, so the caller can reuse it for the next file.
+///
+/// # Errors
+///
+/// Returns the underlying I/O error if the file cannot be read.
+///
+/// # Panics
+///
+/// Panics if `parser` does not produce a syntax tree for the file.
+pub fn extract(parser: &mut Parser, schema: &NodeTypeMap, path: &Path) -> std::io::Result<Program> {
+    let span = span!(
+        Level::TRACE,
+        "extract",
+        file = %path.display()
+    );
 
-pub fn create(language: Language, schema: NodeTypeMap) -> Extractor {
-    let mut parser = Parser::new();
-    parser.set_language(language).unwrap();
+    let _enter = span.enter();
 
-    Extractor { parser, schema }
-}
+    info!("extracting: {}", path.display());
 
-impl Extractor {
-    /// Extracts the source file at `path`, which is assumed to be canonicalized.
-    pub fn extract<'a>(&'a mut self, path: &Path) -> std::io::Result<Program> {
-        let span = span!(
-            Level::TRACE,
-            "extract",
-            file = %path.display()
-        );
+    let source = std::fs::read(path)?;
+    let tree = parser.parse(&source, None).expect("Failed to parse file");
+    let mut trap_writer = new_trap_writer();
+    trap_writer.comment(format!("Auto-generated TRAP file for {}", path.display()));
+    let file_label = trap_writer.populate_file(path);
+    let mut visitor = Visitor {
+        source: &source,
+        trap_writer,
+        // TODO: should we handle path strings that are not valid UTF8 better?
+        path: path.display().to_string(),
+        file_label,
+        token_counter: 0,
+        toplevel_child_counter: 0,
+        stack: Vec::new(),
+        schema,
+    };
+    traverse(&tree, &mut visitor);
 
-        let _enter = span.enter();
-
-        info!("extracting: {}", path.display());
-
-        let source = std::fs::read(&path)?;
-        let tree = &self
-            .parser
-            .parse(&source, None)
-            .expect("Failed to parse file");
-        let mut trap_writer = new_trap_writer();
-        trap_writer.comment(format!("Auto-generated TRAP file for {}", path.display()));
-        let file_label = &trap_writer.populate_file(path);
-        let mut visitor = Visitor {
-            source: &source,
-            trap_writer: trap_writer,
-            // TODO: should we handle path strings that are not valid UTF8 better?
-            path: format!("{}", path.display()),
-            file_label: *file_label,
-            token_counter: 0,
-            toplevel_child_counter: 0,
-            stack: Vec::new(),
-            schema: &self.schema,
-        };
-        traverse(&tree, &mut visitor);
-
-        &self.parser.reset();
-        Ok(Program(visitor.trap_writer.trap_output))
-    }
+    parser.reset();
+    Ok(Program(visitor.trap_writer.trap_output))
 }
 
 /// Normalizes the path according the common CodeQL specification.
Assumes that
diff --git a/extractor/src/main.rs b/extractor/src/main.rs
index ca9747fb673..abcb1871dfe 100644
--- a/extractor/src/main.rs
+++ b/extractor/src/main.rs
@@ -1,10 +1,14 @@
 mod extractor;
 
+extern crate num_cpus;
+
 use clap;
 use flate2::write::GzEncoder;
+use rayon::prelude::*;
 use std::fs;
 use std::io::{BufRead, BufWriter, Write};
 use std::path::{Path, PathBuf};
+use tree_sitter::Parser;
 
 enum TrapCompression {
     None,
@@ -42,6 +46,44 @@ impl TrapCompression {
     }
 }
 
+/// Gets the number of threads the extractor should use, by reading the
+/// `CODEQL_THREADS` environment variable and using it as follows:
+///
+/// * If the number is positive, it is the number of threads to use.
+/// * If the number is negative or zero, it is added to the number of cores
+///   available on the machine to determine how many threads to use. The result
+///   is clamped to a minimum of 1, so a reduction that exceeds the core count
+///   yields a single thread instead of underflowing.
+/// * If the variable cannot be read (for example because it is unset), 1
+///   thread is used.
+/// * If the value is not a valid integer, an error is logged and 1 thread is
+///   used.
+fn num_codeql_threads() -> usize {
+    let setting = match std::env::var("CODEQL_THREADS") {
+        Ok(setting) => setting,
+        // Use 1 thread if the environment variable isn't set.
+        Err(_) => return 1,
+    };
+
+    match setting.parse::<i32>() {
+        Ok(num) if num > 0 => num as usize,
+        Ok(num) => {
+            // Negate in `i64` so that `i32::MIN` cannot overflow.
+            let reduction = (-i64::from(num)) as usize;
+            // Plain `usize` subtraction would underflow when the reduction is
+            // larger than the core count; saturate and enforce the minimum.
+            num_cpus::get().saturating_sub(reduction).max(1)
+        }
+        Err(_) => {
+            tracing::error!(
+                "Unable to parse CODEQL_THREADS value '{}'; defaulting to 1 thread.",
+                &setting
+            );
+            1
+        }
+    }
+}
+
 fn main() -> std::io::Result<()> {
     tracing_subscriber::fmt()
         .with_target(false)
@@ -50,6 +92,21 @@ fn main() -> std::io::Result<()> {
         .with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
         .init();
 
+    let num_threads = num_codeql_threads();
+    tracing::info!(
+        "Using {} {}",
+        num_threads,
+        if num_threads == 1 {
+            "thread"
+        } else {
+            "threads"
+        }
+    );
+    rayon::ThreadPoolBuilder::new()
+        .num_threads(num_threads)
+        .build_global()
+        .unwrap();
+
     let matches = clap::App::new("Ruby extractor")
         .version("1.0")
         .author("GitHub")
@@ -76,12 +133,17 @@ fn main() -> std::io::Result<()> {
 
     let language = tree_sitter_ruby::language();
     let schema = node_types::read_node_types_str(tree_sitter_ruby::NODE_TYPES)?;
-    let mut extractor = extractor::create(language, schema);
-    for line in std::io::BufReader::new(file_list).lines() {
-        let path = PathBuf::from(line?).canonicalize()?;
+    // Read the whole file list up front so that rayon can split it across threads.
+    let lines: std::io::Result<Vec<String>> = std::io::BufReader::new(file_list).lines().collect();
+    let lines = lines?;
+    lines.par_iter().try_for_each(|line| {
+        // `extract` needs exclusive access to a parser, so every task creates its own.
+        let mut parser = Parser::new();
+        parser.set_language(language).unwrap();
+        let path = PathBuf::from(line).canonicalize()?;
         let trap_file = path_for(&trap_dir, &path, trap_compression.extension());
         let src_archive_file = path_for(&src_archive_dir, &path, "");
-        let trap = extractor.extract(&path)?;
+        let trap = extractor::extract(&mut parser, &schema, &path)?;
         std::fs::create_dir_all(&src_archive_file.parent().unwrap())?;
         std::fs::copy(&path, &src_archive_file)?;
         std::fs::create_dir_all(&trap_file.parent().unwrap())?;
@@ -96,8 +158,10 @@ fn main() -> std::io::Result<()> {
                 write!(compressed_writer, "{}", trap)?;
             }
         }
-    }
-    return Ok(());
+        std::io::Result::Ok(())
+    })?;
+
+    Ok(())
 }
 
 fn path_for(dir: &Path, path: &Path, ext: &str) -> PathBuf {