use clap::Args;
use lazy_static::lazy_static;
use rayon::prelude::*;
use std::borrow::Cow;
use std::fs;
use std::io::BufRead;
use std::path::{Path, PathBuf};
use tree_sitter::{Language, Parser, Range};

use codeql_extractor::{diagnostics, extractor, file_paths, node_types, trap};

#[derive(Args)]
pub struct Options {
    /// Sets a custom source archive folder
    #[arg(long)]
    source_archive_dir: String,

    /// Sets a custom trap folder
    #[arg(long)]
    output_dir: String,

    /// A text file containing the paths of the files to extract
    #[arg(long)]
    file_list: String,
}

pub fn run(options: Options) -> std::io::Result<()> {
    extractor::set_tracing_level("ruby");
    tracing::info!("Extraction started");

    let diagnostics = diagnostics::DiagnosticLoggers::new("ruby");
    let mut main_thread_logger = diagnostics.logger();
    let num_threads = match codeql_extractor::options::num_threads() {
        Ok(num) => num,
        Err(e) => {
            main_thread_logger.write(
                main_thread_logger
                    .new_entry("configuration-error", "Configuration error")
                    .message(
                        "{}; defaulting to 1 thread.",
                        &[diagnostics::MessageArg::Code(&e)],
                    )
                    .severity(diagnostics::Severity::Warning),
            );
            1
        }
    };
    tracing::info!(
        "Using {} {}",
        num_threads,
        if num_threads == 1 { "thread" } else { "threads" }
    );
    let trap_compression =
        match trap::Compression::from_env("CODEQL_EXTRACTOR_RUBY_OPTION_TRAP_COMPRESSION") {
            Ok(x) => x,
            Err(e) => {
                main_thread_logger.write(
                    main_thread_logger
                        .new_entry("configuration-error", "Configuration error")
                        .message("{}; using gzip.", &[diagnostics::MessageArg::Code(&e)])
                        .severity(diagnostics::Severity::Warning),
                );
                trap::Compression::Gzip
            }
        };
    drop(main_thread_logger);

    rayon::ThreadPoolBuilder::new()
        .num_threads(num_threads)
        .build_global()
        .unwrap();

    let src_archive_dir = file_paths::path_from_string(&options.source_archive_dir);
    let trap_dir = file_paths::path_from_string(&options.output_dir);
    let file_list = fs::File::open(file_paths::path_from_string(&options.file_list))?;

    let language: Language = tree_sitter_ruby::LANGUAGE.into();
    let erb: Language = tree_sitter_embedded_template::LANGUAGE.into();
    // Look up tree-sitter kind ids now, to avoid string comparisons when scanning ERB files.
    let erb_directive_id = erb.id_for_node_kind("directive", true);
    let erb_output_directive_id = erb.id_for_node_kind("output_directive", true);
    let erb_code_id = erb.id_for_node_kind("code", true);

    let schema = node_types::read_node_types_str("ruby", tree_sitter_ruby::NODE_TYPES)?;
    let erb_schema =
        node_types::read_node_types_str("erb", tree_sitter_embedded_template::NODE_TYPES)?;

    let lines: std::io::Result<Vec<String>> = std::io::BufReader::new(file_list).lines().collect();
    let lines = lines?;

    lines
        .par_iter()
        .try_for_each(|line| {
            let mut diagnostics_writer = diagnostics.logger();
            let path = PathBuf::from(line).canonicalize()?;
            let src_archive_file = file_paths::path_for(&src_archive_dir, &path, "");
            let mut source = std::fs::read(&path)?;
            let mut needs_conversion = false;
            let code_ranges;
            let mut trap_writer = trap::Writer::new();
            if path.extension().map_or(false, |x| x == "erb") {
                tracing::info!("scanning: {}", path.display());
                extractor::extract(
                    &erb,
                    "erb",
                    &erb_schema,
                    &mut diagnostics_writer,
                    &mut trap_writer,
                    &path,
                    &source,
                    &[],
                );
                let (ranges, line_breaks) = scan_erb(
                    &erb,
                    &source,
                    erb_directive_id,
                    erb_output_directive_id,
                    erb_code_id,
                );
                // Overwrite the byte just past each embedded code range with a newline,
                // so that consecutive Ruby fragments stay separated when the included
                // ranges are parsed together below.
                for i in line_breaks {
                    if i < source.len() {
                        source[i] = b'\n';
                    }
                }
                code_ranges = ranges;
            } else {
                if let Some(encoding_name) = scan_coding_comment(&source) {
                    // If the input is already UTF-8 then there is no need to recode the source.
                    // If the declared encoding is 'binary' or 'ascii-8bit' then it is not clear
                    // how to interpret characters. In this case it is probably best to leave the
                    // input unchanged.
                    if !encoding_name.eq_ignore_ascii_case("utf-8")
                        && !encoding_name.eq_ignore_ascii_case("ascii-8bit")
                        && !encoding_name.eq_ignore_ascii_case("binary")
                    {
                        if let Some(encoding) = encoding_from_name(&encoding_name) {
                            needs_conversion =
                                encoding.whatwg_name().unwrap_or_default() != "utf-8";
                            if needs_conversion {
                                match encoding
                                    .decode(&source, encoding::types::DecoderTrap::Replace)
                                {
                                    Ok(str) => source = str.as_bytes().to_owned(),
                                    Err(msg) => {
                                        needs_conversion = false;
                                        diagnostics_writer.write(
                                            diagnostics_writer
                                                .new_entry(
                                                    "character-decoding-error",
                                                    "Character decoding error",
                                                )
                                                .file(&file_paths::normalize_path(&path))
                                                .message(
                                                    "Could not decode the file contents as {}: {}. The contents of the file must match the character encoding specified in the {} {}.",
                                                    &[
                                                        diagnostics::MessageArg::Code(&encoding_name),
                                                        diagnostics::MessageArg::Code(&msg),
                                                        diagnostics::MessageArg::Code("encoding:"),
                                                        diagnostics::MessageArg::Link(
                                                            "directive",
                                                            "https://docs.ruby-lang.org/en/master/syntax/comments_rdoc.html#label-encoding+Directive",
                                                        ),
                                                    ],
                                                )
                                                .status_page()
                                                .severity(diagnostics::Severity::Warning),
                                        );
                                    }
                                }
                            }
                        } else {
                            diagnostics_writer.write(
                                diagnostics_writer
                                    .new_entry(
                                        "unknown-character-encoding",
                                        "Could not process some files due to an unknown character encoding",
                                    )
                                    .file(&file_paths::normalize_path(&path))
                                    .message(
                                        "Unknown character encoding {} in {} {}.",
                                        &[
                                            diagnostics::MessageArg::Code(&encoding_name),
                                            diagnostics::MessageArg::Code("#encoding:"),
                                            diagnostics::MessageArg::Link(
                                                "directive",
                                                "https://docs.ruby-lang.org/en/master/syntax/comments_rdoc.html#label-encoding+Directive",
                                            ),
                                        ],
                                    )
                                    .status_page()
                                    .severity(diagnostics::Severity::Warning),
                            );
                        }
                    }
                }
                code_ranges = vec![];
            }
            extractor::extract(
                &language,
                "ruby",
                &schema,
                &mut diagnostics_writer,
                &mut trap_writer,
                &path,
                &source,
                &code_ranges,
            );
            std::fs::create_dir_all(src_archive_file.parent().unwrap())?;
            if needs_conversion {
                std::fs::write(&src_archive_file, &source)?;
            } else {
                std::fs::copy(&path, &src_archive_file)?;
            }
            write_trap(&trap_dir, path, &trap_writer, trap_compression)
        })
        .expect("failed to extract files");

    // Write a final TRAP file holding the shared empty-location entry.
    let path = PathBuf::from("extras");
    let mut trap_writer = trap::Writer::new();
    extractor::populate_empty_location(&mut trap_writer);
    let res = write_trap(&trap_dir, path, &trap_writer, trap_compression);
    tracing::info!("Extraction complete");
    res
}

lazy_static! {
    static ref CP_NUMBER: regex::Regex = regex::Regex::new("cp([0-9]+)").unwrap();
}
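// Sanity check for the `CP_NUMBER` pattern used by `encoding_from_name` below: it is
// expected to capture the numeric Windows code page from labels of the form "cpNNNN".
// The labels used here are illustrative examples only.
#[test]
fn test_cp_number_pattern() {
    let caps = CP_NUMBER
        .captures("cp1252")
        .expect("cpNNNN labels should match");
    assert_eq!(caps.get(1).unwrap().as_str(), "1252");
    assert!(CP_NUMBER.captures("utf-8").is_none());
}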
/// Returns the `encoding::Encoding` corresponding to the given encoding name, if one exists.
fn encoding_from_name(encoding_name: &str) -> Option<&(dyn encoding::Encoding + Send + Sync)> {
    match encoding::label::encoding_from_whatwg_label(encoding_name) {
        s @ Some(_) => s,
        None => CP_NUMBER.captures(encoding_name).and_then(|cap| {
            encoding::label::encoding_from_windows_code_page(
                str::parse(cap.get(1).unwrap().as_str()).unwrap(),
            )
        }),
    }
}

/// Writes the TRAP file for `path` under `trap_dir`, creating parent directories as needed.
fn write_trap(
    trap_dir: &Path,
    path: PathBuf,
    trap_writer: &trap::Writer,
    trap_compression: trap::Compression,
) -> std::io::Result<()> {
    let trap_file = file_paths::path_for(trap_dir, &path, trap_compression.extension());
    std::fs::create_dir_all(trap_file.parent().unwrap())?;
    trap_writer.write_to_file(&trap_file, trap_compression)
}

/// Parses `source` as an ERB template and returns the ranges of the embedded Ruby
/// code fragments, together with the byte offsets at which the caller should
/// substitute a newline into the source.
fn scan_erb(
    erb: &Language,
    source: &[u8],
    directive_id: u16,
    output_directive_id: u16,
    code_id: u16,
) -> (Vec<Range>, Vec<usize>) {
    let mut parser = Parser::new();
    parser.set_language(erb).unwrap();
    let tree = parser.parse(source, None).expect("Failed to parse file");
    let mut result = Vec::new();
    let mut line_breaks = vec![];

    for n in tree.root_node().children(&mut tree.walk()) {
        let kind_id = n.kind_id();
        if kind_id == directive_id || kind_id == output_directive_id {
            for c in n.children(&mut tree.walk()) {
                if c.kind_id() == code_id {
                    let mut range = c.range();
                    if range.end_byte < source.len() {
                        line_breaks.push(range.end_byte);
                        range.end_byte += 1;
                        range.end_point.column += 1;
                    }
                    result.push(range);
                }
            }
        }
    }
    if result.is_empty() {
        let root = tree.root_node();
        // Add an empty range at the end of the file
        result.push(Range {
            start_byte: root.end_byte(),
            end_byte: root.end_byte(),
            start_point: root.end_position(),
            end_point: root.end_position(),
        });
    }
    (result, line_breaks)
}

/// Advance `index` to the next non-whitespace character.
/// Newlines are **not** considered whitespace.
fn skip_space(content: &[u8], index: usize) -> usize {
    let mut index = index;
    while index < content.len() {
        let c = content[index] as char;
        // white space except \n
        let is_space = c == ' ' || ('\t'..='\r').contains(&c) && c != '\n';
        if !is_space {
            break;
        }
        index += 1;
    }
    index
}
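// Minimal checks for `skip_space`, documenting the two properties the coding-comment
// scanner relies on: blanks and tabs are skipped, but a newline stops the scan.
// The byte strings are illustrative examples only.
#[test]
fn test_skip_space() {
    // Plain spaces and tabs before a token are skipped.
    assert_eq!(skip_space(b"  \t x", 0), 4);
    // A newline is not treated as whitespace, so scanning stops in front of it.
    assert_eq!(skip_space(b" \n # encoding: utf-8", 0), 1);
    // Starting at (or past) the end of the input leaves the index unchanged.
    assert_eq!(skip_space(b"abc", 3), 3);
}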
/// Scans the start of `content` for a Ruby magic comment (`# encoding: ...` or
/// `# coding: ...`) and returns the declared encoding name, if any.
fn scan_coding_comment(content: &[u8]) -> std::option::Option<Cow<str>> {
    let mut index = 0;
    // skip UTF-8 BOM marker if there is one
    if content.len() >= 3 && content[0] == 0xef && content[1] == 0xbb && content[2] == 0xbf {
        index += 3;
    }
    // skip #! line if there is one
    if index + 1 < content.len()
        && content[index] as char == '#'
        && content[index + 1] as char == '!'
    {
        index += 2;
        while index < content.len() && content[index] as char != '\n' {
            index += 1
        }
        index += 1
    }

    index = skip_space(content, index);

    if index >= content.len() || content[index] as char != '#' {
        return None;
    }
    index += 1;

    // The letters of "coding", upper- and lower-case, for a case-insensitive scan.
    const CODING: [char; 12] = [
        'C', 'c', 'O', 'o', 'D', 'd', 'I', 'i', 'N', 'n', 'G', 'g',
    ];
    let mut word_index = 0;
    while index < content.len() && word_index < CODING.len() && content[index] as char != '\n' {
        if content[index] as char == CODING[word_index]
            || content[index] as char == CODING[word_index + 1]
        {
            word_index += 2
        } else {
            word_index = 0;
        }
        index += 1;
    }
    if word_index < CODING.len() {
        return None;
    }
    index = skip_space(content, index);

    if index < content.len() && content[index] as char != ':' && content[index] as char != '=' {
        return None;
    }
    index += 1;
    index = skip_space(content, index);

    let start = index;
    while index < content.len() {
        let c = content[index] as char;
        if c == '-' || c == '_' || c.is_ascii_alphanumeric() {
            index += 1;
        } else {
            break;
        }
    }
    if index > start {
        return Some(String::from_utf8_lossy(&content[start..index]));
    }
    None
}

#[test]
fn test_scan_coding_comment() {
    let text = "# encoding: utf-8";
    let result = scan_coding_comment(text.as_bytes());
    assert_eq!(result, Some("utf-8".into()));

    let text = "#coding:utf-8";
    let result = scan_coding_comment(text.as_bytes());
    assert_eq!(result, Some("utf-8".into()));

    let text = "# foo\n# encoding: utf-8";
    let result = scan_coding_comment(text.as_bytes());
    assert_eq!(result, None);

    let text = "# encoding: latin1 encoding: utf-8";
    let result = scan_coding_comment(text.as_bytes());
    assert_eq!(result, Some("latin1".into()));

    let text = "# encoding: nonsense";
    let result = scan_coding_comment(text.as_bytes());
    assert_eq!(result, Some("nonsense".into()));

    let text = "# coding = utf-8";
    let result = scan_coding_comment(text.as_bytes());
    assert_eq!(result, Some("utf-8".into()));

    let text = "# CODING = utf-8";
    let result = scan_coding_comment(text.as_bytes());
    assert_eq!(result, Some("utf-8".into()));

    let text = "# CoDiNg = utf-8";
    let result = scan_coding_comment(text.as_bytes());
    assert_eq!(result, Some("utf-8".into()));

    let text = "# blah blahblahcoding = utf-8";
    let result = scan_coding_comment(text.as_bytes());
    assert_eq!(result, Some("utf-8".into()));

    // unicode BOM is ignored
    let text = "\u{FEFF}# encoding: utf-8";
    let result = scan_coding_comment(text.as_bytes());
    assert_eq!(result, Some("utf-8".into()));

    let text = "\u{FEFF} # encoding: utf-8";
    let result = scan_coding_comment(text.as_bytes());
    assert_eq!(result, Some("utf-8".into()));

    let text = "#! /usr/bin/env ruby\n # encoding: utf-8";
    let result = scan_coding_comment(text.as_bytes());
    assert_eq!(result, Some("utf-8".into()));

    let text = "\u{FEFF}#! /usr/bin/env ruby\n # encoding: utf-8";
    let result = scan_coding_comment(text.as_bytes());
    assert_eq!(result, Some("utf-8".into()));

    // A #! must be the first thing on a line, otherwise it's a normal comment
    let text = " #! /usr/bin/env ruby encoding = utf-8";
    let result = scan_coding_comment(text.as_bytes());
    assert_eq!(result, Some("utf-8".into()));

    let text = " #! /usr/bin/env ruby \n # encoding = utf-8";
    let result = scan_coding_comment(text.as_bytes());
    assert_eq!(result, None);
}
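// A small sketch exercising `encoding_from_name`. It assumes only that the `encoding`
// crate recognises common WHATWG labels ("utf-8", "latin1", "cp1252") and returns
// `None` for names it does not know; the exact set of supported labels is determined
// by that crate, so the checks here stay deliberately loose.
#[test]
fn test_encoding_from_name() {
    assert!(encoding_from_name("utf-8").is_some());
    // "latin1" is a WHATWG label, so it resolves without the code-page fallback.
    assert!(encoding_from_name("latin1").is_some());
    // "cp1252" resolves either as a WHATWG label or via the `CP_NUMBER`
    // Windows code-page fallback.
    assert!(encoding_from_name("cp1252").is_some());
    assert!(encoding_from_name("no-such-encoding").is_none());
}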