Ruby: Merge extractor binaries into one

There is now one binary, codeql-ruby-extractor, which takes a positional argument specifying whether to extract, generate or autobuild.
2025-12-16 16:53:25 +01:00 · 2023-04-06 17:00:11 +08:00
parent 5a8a6f2971
commit 79089b40b9
5 changed files with 101 additions and 148 deletions
--- a/ruby/extractor/src/extractor.rs
+++ b/ruby/extractor/src/extractor.rs
@@ -0,0 +1,370 @@
+use clap::Args;
+use lazy_static::lazy_static;
+use rayon::prelude::*;
+use std::borrow::Cow;
+use std::fs;
+use std::io::BufRead;
+use std::path::{Path, PathBuf};
+use tree_sitter::{Language, Parser, Range};
+
+use codeql_extractor::{diagnostics, extractor, file_paths, node_types, trap};
+
+#[derive(Args)]
+pub struct Options {
+    /// Sets a custom source achive folder
+    #[arg(long)]
+    source_archive_dir: String,
+
+    /// Sets a custom trap folder
+    #[arg(long)]
+    output_dir: String,
+
+    /// A text file containing the paths of the files to extract
+    #[arg(long)]
+    file_list: String,
+}
+
+pub fn run(options: Options) -> std::io::Result<()> {
+    tracing_subscriber::fmt()
+        .with_target(false)
+        .without_time()
+        .with_level(true)
+        .with_env_filter(
+            tracing_subscriber::EnvFilter::try_from_default_env()
+                .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("ruby_extractor=warn")),
+        )
+        .init();
+    let diagnostics = diagnostics::DiagnosticLoggers::new("ruby");
+    let mut main_thread_logger = diagnostics.logger();
+    let num_threads = match codeql_extractor::options::num_threads() {
+        Ok(num) => num,
+        Err(e) => {
+            main_thread_logger.write(
+                main_thread_logger
+                    .new_entry("configuration-error", "Configuration error")
+                    .message(
+                        "{}; defaulting to 1 thread.",
+                        &[diagnostics::MessageArg::Code(&e)],
+                    )
+                    .severity(diagnostics::Severity::Warning),
+            );
+            1
+        }
+    };
+    tracing::info!(
+        "Using {} {}",
+        num_threads,
+        if num_threads == 1 {
+            "thread"
+        } else {
+            "threads"
+        }
+    );
+    let trap_compression =
+        match trap::Compression::from_env("CODEQL_EXTRACTOR_RUBY_OPTION_TRAP_COMPRESSION") {
+            Ok(x) => x,
+            Err(e) => {
+                main_thread_logger.write(
+                    main_thread_logger
+                        .new_entry("configuration-error", "Configuration error")
+                        .message("{}; using gzip.", &[diagnostics::MessageArg::Code(&e)])
+                        .severity(diagnostics::Severity::Warning),
+                );
+                trap::Compression::Gzip
+            }
+        };
+    drop(main_thread_logger);
+    rayon::ThreadPoolBuilder::new()
+        .num_threads(num_threads)
+        .build_global()
+        .unwrap();
+
+    let src_archive_dir = file_paths::path_from_string(&options.source_archive_dir);
+
+    let trap_dir = file_paths::path_from_string(&options.output_dir);
+
+    let file_list = fs::File::open(file_paths::path_from_string(&options.file_list))?;
+
+    let language = tree_sitter_ruby::language();
+    let erb = tree_sitter_embedded_template::language();
+    // Look up tree-sitter kind ids now, to avoid string comparisons when scanning ERB files.
+    let erb_directive_id = erb.id_for_node_kind("directive", true);
+    let erb_output_directive_id = erb.id_for_node_kind("output_directive", true);
+    let erb_code_id = erb.id_for_node_kind("code", true);
+    let schema = node_types::read_node_types_str("ruby", tree_sitter_ruby::NODE_TYPES)?;
+    let erb_schema =
+        node_types::read_node_types_str("erb", tree_sitter_embedded_template::NODE_TYPES)?;
+    let lines: std::io::Result<Vec<String>> = std::io::BufReader::new(file_list).lines().collect();
+    let lines = lines?;
+    lines
+        .par_iter()
+        .try_for_each(|line| {
+            let mut diagnostics_writer = diagnostics.logger();
+            let path = PathBuf::from(line).canonicalize()?;
+            let src_archive_file = file_paths::path_for(&src_archive_dir, &path, "");
+            let mut source = std::fs::read(&path)?;
+            let mut needs_conversion = false;
+            let code_ranges;
+            let mut trap_writer = trap::Writer::new();
+            if path.extension().map_or(false, |x| x == "erb") {
+                tracing::info!("scanning: {}", path.display());
+                extractor::extract(
+                    erb,
+                    "erb",
+                    &erb_schema,
+                    &mut diagnostics_writer,
+                    &mut trap_writer,
+                    &path,
+                    &source,
+                    &[],
+                );
+
+                let (ranges, line_breaks) = scan_erb(
+                    erb,
+                    &source,
+                    erb_directive_id,
+                    erb_output_directive_id,
+                    erb_code_id,
+                );
+                for i in line_breaks {
+                    if i < source.len() {
+                        source[i] = b'\n';
+                    }
+                }
+                code_ranges = ranges;
+            } else {
+                if let Some(encoding_name) = scan_coding_comment(&source) {
+                    // If the input is already UTF-8 then there is no need to recode the source
+                    // If the declared encoding is 'binary' or 'ascii-8bit' then it is not clear how
+                    // to interpret characters. In this case it is probably best to leave the input
+                    // unchanged.
+                    if !encoding_name.eq_ignore_ascii_case("utf-8")
+                        && !encoding_name.eq_ignore_ascii_case("ascii-8bit")
+                        && !encoding_name.eq_ignore_ascii_case("binary")
+                    {
+                        if let Some(encoding) = encoding_from_name(&encoding_name) {
+                            needs_conversion =
+                                encoding.whatwg_name().unwrap_or_default() != "utf-8";
+                            if needs_conversion {
+                                match encoding
+                                    .decode(&source, encoding::types::DecoderTrap::Replace)
+                                {
+                                    Ok(str) => source = str.as_bytes().to_owned(),
+                                    Err(msg) => {
+                                        needs_conversion = false;
+                                        diagnostics_writer.write(
+                                            diagnostics_writer
+                                                .new_entry(
+                                                    "character-decoding-error",
+                                                    "Character decoding error",
+                                                )
+                                                .file(&file_paths::normalize_path(&path))
+                                                .message(
+                                                    "Could not decode the file contents as {}: {}. The contents of the file must match the character encoding specified in the {} {}.",
+                                                    &[
+                                                        diagnostics::MessageArg::Code(&encoding_name),
+                                                        diagnostics::MessageArg::Code(&msg),
+                                                        diagnostics::MessageArg::Code("encoding:"),
+                                                        diagnostics::MessageArg::Link("directive", "https://docs.ruby-lang.org/en/master/syntax/comments_rdoc.html#label-encoding+Directive")
+                                                    ],
+                                                )
+                                                .status_page()
+                                                .severity(diagnostics::Severity::Warning),
+                                        );
+                                    }
+                                }
+                            }
+                        } else {
+                            diagnostics_writer.write(
+                                diagnostics_writer
+                                    .new_entry("unknown-character-encoding", "Could not process some files due to an unknown character encoding")
+                                    .file(&file_paths::normalize_path(&path))
+                                    .message(
+                                        "Unknown character encoding {} in {} {}.",
+                                        &[
+                                            diagnostics::MessageArg::Code(&encoding_name),
+                                            diagnostics::MessageArg::Code("#encoding:"),
+                                            diagnostics::MessageArg::Link("directive", "https://docs.ruby-lang.org/en/master/syntax/comments_rdoc.html#label-encoding+Directive")
+                                        ],
+                                    )
+                                    .status_page()
+                                    .severity(diagnostics::Severity::Warning),
+                            );
+                        }
+                    }
+                }
+                code_ranges = vec![];
+            }
+            extractor::extract(
+                language,
+                "ruby",
+                &schema,
+                &mut diagnostics_writer,
+                &mut trap_writer,
+                &path,
+                &source,
+                &code_ranges,
+            );
+            std::fs::create_dir_all(src_archive_file.parent().unwrap())?;
+            if needs_conversion {
+                std::fs::write(&src_archive_file, &source)?;
+            } else {
+                std::fs::copy(&path, &src_archive_file)?;
+            }
+            write_trap(&trap_dir, path, &trap_writer, trap_compression)
+        })
+        .expect("failed to extract files");
+
+    let path = PathBuf::from("extras");
+    let mut trap_writer = trap::Writer::new();
+    extractor::populate_empty_location(&mut trap_writer);
+    write_trap(&trap_dir, path, &trap_writer, trap_compression)
+}
+
+lazy_static! {
+    static ref CP_NUMBER: regex::Regex = regex::Regex::new("cp([0-9]+)").unwrap();
+}
+
+/// Returns the `encoding::Encoding` corresponding to the given encoding name, if one exists.
+fn encoding_from_name(encoding_name: &str) -> Option<&(dyn encoding::Encoding + Send + Sync)> {
+    match encoding::label::encoding_from_whatwg_label(encoding_name) {
+        s @ Some(_) => s,
+        None => CP_NUMBER.captures(encoding_name).and_then(|cap| {
+            encoding::label::encoding_from_windows_code_page(
+                str::parse(cap.get(1).unwrap().as_str()).unwrap(),
+            )
+        }),
+    }
+}
+
+fn write_trap(
+    trap_dir: &Path,
+    path: PathBuf,
+    trap_writer: &trap::Writer,
+    trap_compression: trap::Compression,
+) -> std::io::Result<()> {
+    let trap_file = file_paths::path_for(trap_dir, &path, trap_compression.extension());
+    std::fs::create_dir_all(trap_file.parent().unwrap())?;
+    trap_writer.write_to_file(&trap_file, trap_compression)
+}
+
+fn scan_erb(
+    erb: Language,
+    source: &[u8],
+    directive_id: u16,
+    output_directive_id: u16,
+    code_id: u16,
+) -> (Vec<Range>, Vec<usize>) {
+    let mut parser = Parser::new();
+    parser.set_language(erb).unwrap();
+    let tree = parser.parse(source, None).expect("Failed to parse file");
+    let mut result = Vec::new();
+    let mut line_breaks = vec![];
+
+    for n in tree.root_node().children(&mut tree.walk()) {
+        let kind_id = n.kind_id();
+        if kind_id == directive_id || kind_id == output_directive_id {
+            for c in n.children(&mut tree.walk()) {
+                if c.kind_id() == code_id {
+                    let mut range = c.range();
+                    if range.end_byte < source.len() {
+                        line_breaks.push(range.end_byte);
+                        range.end_byte += 1;
+                        range.end_point.column += 1;
+                    }
+                    result.push(range);
+                }
+            }
+        }
+    }
+
+    if result.is_empty() {
+        let root = tree.root_node();
+
+        // Add an empty range at the end of the file
+        result.push(Range {
+            start_byte: root.end_byte(),
+            end_byte: root.end_byte(),
+            start_point: root.end_position(),
+            end_point: root.end_position(),
+        });
+    }
+    (result, line_breaks)
+}
+
+/// Advance `index` to the next non-whitespace character.
+/// Newlines are **not** considered whitespace.
+fn skip_space(content: &[u8], index: usize) -> usize {
+    let mut index = index;
+    while index < content.len() {
+        let c = content[index] as char;
+        // white space except \n
+        let is_space = c == ' ' || ('\t'..='\r').contains(&c) && c != '\n';
+        if !is_space {
+            break;
+        }
+        index += 1;
+    }
+    index
+}
+fn scan_coding_comment(content: &[u8]) -> std::option::Option<Cow<str>> {
+    let mut index = 0;
+    // skip UTF-8 BOM marker if there is one
+    if content.len() >= 3 && content[0] == 0xef && content[1] == 0xbb && content[2] == 0xbf {
+        index += 3;
+    }
+    // skip #! line if there is one
+    if index + 1 < content.len()
+        && content[index] as char == '#'
+        && content[index + 1] as char == '!'
+    {
+        index += 2;
+        while index < content.len() && content[index] as char != '\n' {
+            index += 1
+        }
+        index += 1
+    }
+    index = skip_space(content, index);
+
+    if index >= content.len() || content[index] as char != '#' {
+        return None;
+    }
+    index += 1;
+
+    const CODING: [char; 12] = ['C', 'c', 'O', 'o', 'D', 'd', 'I', 'i', 'N', 'n', 'G', 'g'];
+    let mut word_index = 0;
+    while index < content.len() && word_index < CODING.len() && content[index] as char != '\n' {
+        if content[index] as char == CODING[word_index]
+            || content[index] as char == CODING[word_index + 1]
+        {
+            word_index += 2
+        } else {
+            word_index = 0;
+        }
+        index += 1;
+    }
+    if word_index < CODING.len() {
+        return None;
+    }
+    index = skip_space(content, index);
+
+    if index < content.len() && content[index] as char != ':' && content[index] as char != '=' {
+        return None;
+    }
+    index += 1;
+    index = skip_space(content, index);
+
+    let start = index;
+    while index < content.len() {
+        let c = content[index] as char;
+        if c == '-' || c == '_' || c.is_ascii_alphanumeric() {
+            index += 1;
+        } else {
+            break;
+        }
+    }
+    if index > start {
+        return Some(String::from_utf8_lossy(&content[start..index]));
+    }
+    None
+}