mirror of
https://github.com/github/codeql.git
synced 2025-12-16 08:43:11 +01:00
430 lines
16 KiB
Rust
430 lines
16 KiB
Rust
use clap::Args;
|
|
use lazy_static::lazy_static;
|
|
use rayon::prelude::*;
|
|
use std::borrow::Cow;
|
|
use std::fs;
|
|
use std::io::BufRead;
|
|
use std::path::{Path, PathBuf};
|
|
use tree_sitter::{Language, Parser, Range};
|
|
|
|
use codeql_extractor::{diagnostics, extractor, file_paths, node_types, trap};
|
|
|
|
#[derive(Args)]
|
|
pub struct Options {
|
|
/// Sets a custom source achive folder
|
|
#[arg(long)]
|
|
source_archive_dir: String,
|
|
|
|
/// Sets a custom trap folder
|
|
#[arg(long)]
|
|
output_dir: String,
|
|
|
|
/// A text file containing the paths of the files to extract
|
|
#[arg(long)]
|
|
file_list: String,
|
|
}
|
|
|
|
pub fn run(options: Options) -> std::io::Result<()> {
|
|
extractor::set_tracing_level("ruby");
|
|
tracing::info!("Extraction started");
|
|
let diagnostics = diagnostics::DiagnosticLoggers::new("ruby");
|
|
let mut main_thread_logger = diagnostics.logger();
|
|
let num_threads = match codeql_extractor::options::num_threads() {
|
|
Ok(num) => num,
|
|
Err(e) => {
|
|
main_thread_logger.write(
|
|
main_thread_logger
|
|
.new_entry("configuration-error", "Configuration error")
|
|
.message(
|
|
"{}; defaulting to 1 thread.",
|
|
&[diagnostics::MessageArg::Code(&e)],
|
|
)
|
|
.severity(diagnostics::Severity::Warning),
|
|
);
|
|
1
|
|
}
|
|
};
|
|
tracing::info!(
|
|
"Using {} {}",
|
|
num_threads,
|
|
if num_threads == 1 {
|
|
"thread"
|
|
} else {
|
|
"threads"
|
|
}
|
|
);
|
|
let trap_compression =
|
|
match trap::Compression::from_env("CODEQL_EXTRACTOR_RUBY_OPTION_TRAP_COMPRESSION") {
|
|
Ok(x) => x,
|
|
Err(e) => {
|
|
main_thread_logger.write(
|
|
main_thread_logger
|
|
.new_entry("configuration-error", "Configuration error")
|
|
.message("{}; using gzip.", &[diagnostics::MessageArg::Code(&e)])
|
|
.severity(diagnostics::Severity::Warning),
|
|
);
|
|
trap::Compression::Gzip
|
|
}
|
|
};
|
|
drop(main_thread_logger);
|
|
rayon::ThreadPoolBuilder::new()
|
|
.num_threads(num_threads)
|
|
.build_global()
|
|
.unwrap();
|
|
|
|
let src_archive_dir = file_paths::path_from_string(&options.source_archive_dir);
|
|
|
|
let trap_dir = file_paths::path_from_string(&options.output_dir);
|
|
|
|
let file_list = fs::File::open(file_paths::path_from_string(&options.file_list))?;
|
|
|
|
let language: Language = tree_sitter_ruby::LANGUAGE.into();
|
|
let erb: Language = tree_sitter_embedded_template::LANGUAGE.into();
|
|
// Look up tree-sitter kind ids now, to avoid string comparisons when scanning ERB files.
|
|
let erb_directive_id = erb.id_for_node_kind("directive", true);
|
|
let erb_output_directive_id = erb.id_for_node_kind("output_directive", true);
|
|
let erb_code_id = erb.id_for_node_kind("code", true);
|
|
let schema = node_types::read_node_types_str("ruby", tree_sitter_ruby::NODE_TYPES)?;
|
|
let erb_schema =
|
|
node_types::read_node_types_str("erb", tree_sitter_embedded_template::NODE_TYPES)?;
|
|
let lines: std::io::Result<Vec<String>> = std::io::BufReader::new(file_list).lines().collect();
|
|
let lines = lines?;
|
|
lines
|
|
.par_iter()
|
|
.try_for_each(|line| {
|
|
let mut diagnostics_writer = diagnostics.logger();
|
|
let path = PathBuf::from(line).canonicalize()?;
|
|
let src_archive_file = file_paths::path_for(&src_archive_dir, &path, "");
|
|
let mut source = std::fs::read(&path)?;
|
|
let mut needs_conversion = false;
|
|
let code_ranges;
|
|
let mut trap_writer = trap::Writer::new();
|
|
if path.extension().map_or(false, |x| x == "erb") {
|
|
tracing::info!("scanning: {}", path.display());
|
|
extractor::extract(
|
|
&erb,
|
|
"erb",
|
|
&erb_schema,
|
|
&mut diagnostics_writer,
|
|
&mut trap_writer,
|
|
&path,
|
|
&source,
|
|
&[],
|
|
);
|
|
|
|
let (ranges, line_breaks) = scan_erb(
|
|
&erb,
|
|
&source,
|
|
erb_directive_id,
|
|
erb_output_directive_id,
|
|
erb_code_id,
|
|
);
|
|
for i in line_breaks {
|
|
if i < source.len() {
|
|
source[i] = b'\n';
|
|
}
|
|
}
|
|
code_ranges = ranges;
|
|
} else {
|
|
if let Some(encoding_name) = scan_coding_comment(&source) {
|
|
// If the input is already UTF-8 then there is no need to recode the source
|
|
// If the declared encoding is 'binary' or 'ascii-8bit' then it is not clear how
|
|
// to interpret characters. In this case it is probably best to leave the input
|
|
// unchanged.
|
|
if !encoding_name.eq_ignore_ascii_case("utf-8")
|
|
&& !encoding_name.eq_ignore_ascii_case("ascii-8bit")
|
|
&& !encoding_name.eq_ignore_ascii_case("binary")
|
|
{
|
|
if let Some(encoding) = encoding_from_name(&encoding_name) {
|
|
needs_conversion =
|
|
encoding.whatwg_name().unwrap_or_default() != "utf-8";
|
|
if needs_conversion {
|
|
match encoding
|
|
.decode(&source, encoding::types::DecoderTrap::Replace)
|
|
{
|
|
Ok(str) => source = str.as_bytes().to_owned(),
|
|
Err(msg) => {
|
|
needs_conversion = false;
|
|
diagnostics_writer.write(
|
|
diagnostics_writer
|
|
.new_entry(
|
|
"character-decoding-error",
|
|
"Character decoding error",
|
|
)
|
|
.file(&file_paths::normalize_path(&path))
|
|
.message(
|
|
"Could not decode the file contents as {}: {}. The contents of the file must match the character encoding specified in the {} {}.",
|
|
&[
|
|
diagnostics::MessageArg::Code(&encoding_name),
|
|
diagnostics::MessageArg::Code(&msg),
|
|
diagnostics::MessageArg::Code("encoding:"),
|
|
diagnostics::MessageArg::Link("directive", "https://docs.ruby-lang.org/en/master/syntax/comments_rdoc.html#label-encoding+Directive")
|
|
],
|
|
)
|
|
.status_page()
|
|
.severity(diagnostics::Severity::Warning),
|
|
);
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
diagnostics_writer.write(
|
|
diagnostics_writer
|
|
.new_entry("unknown-character-encoding", "Could not process some files due to an unknown character encoding")
|
|
.file(&file_paths::normalize_path(&path))
|
|
.message(
|
|
"Unknown character encoding {} in {} {}.",
|
|
&[
|
|
diagnostics::MessageArg::Code(&encoding_name),
|
|
diagnostics::MessageArg::Code("#encoding:"),
|
|
diagnostics::MessageArg::Link("directive", "https://docs.ruby-lang.org/en/master/syntax/comments_rdoc.html#label-encoding+Directive")
|
|
],
|
|
)
|
|
.status_page()
|
|
.severity(diagnostics::Severity::Warning),
|
|
);
|
|
}
|
|
}
|
|
}
|
|
code_ranges = vec![];
|
|
}
|
|
extractor::extract(
|
|
&language,
|
|
"ruby",
|
|
&schema,
|
|
&mut diagnostics_writer,
|
|
&mut trap_writer,
|
|
&path,
|
|
&source,
|
|
&code_ranges,
|
|
);
|
|
std::fs::create_dir_all(src_archive_file.parent().unwrap())?;
|
|
if needs_conversion {
|
|
std::fs::write(&src_archive_file, &source)?;
|
|
} else {
|
|
std::fs::copy(&path, &src_archive_file)?;
|
|
}
|
|
write_trap(&trap_dir, path, &trap_writer, trap_compression)
|
|
})
|
|
.expect("failed to extract files");
|
|
|
|
let path = PathBuf::from("extras");
|
|
let mut trap_writer = trap::Writer::new();
|
|
extractor::populate_empty_location(&mut trap_writer);
|
|
let res = write_trap(&trap_dir, path, &trap_writer, trap_compression);
|
|
tracing::info!("Extraction complete");
|
|
res
|
|
}
|
|
|
|
lazy_static! {
|
|
static ref CP_NUMBER: regex::Regex = regex::Regex::new("cp([0-9]+)").unwrap();
|
|
}
|
|
|
|
/// Returns the `encoding::Encoding` corresponding to the given encoding name, if one exists.
|
|
fn encoding_from_name(encoding_name: &str) -> Option<&(dyn encoding::Encoding + Send + Sync)> {
|
|
match encoding::label::encoding_from_whatwg_label(encoding_name) {
|
|
s @ Some(_) => s,
|
|
None => CP_NUMBER.captures(encoding_name).and_then(|cap| {
|
|
encoding::label::encoding_from_windows_code_page(
|
|
str::parse(cap.get(1).unwrap().as_str()).unwrap(),
|
|
)
|
|
}),
|
|
}
|
|
}
|
|
|
|
fn write_trap(
|
|
trap_dir: &Path,
|
|
path: PathBuf,
|
|
trap_writer: &trap::Writer,
|
|
trap_compression: trap::Compression,
|
|
) -> std::io::Result<()> {
|
|
let trap_file = file_paths::path_for(trap_dir, &path, trap_compression.extension());
|
|
std::fs::create_dir_all(trap_file.parent().unwrap())?;
|
|
trap_writer.write_to_file(&trap_file, trap_compression)
|
|
}
|
|
|
|
fn scan_erb(
|
|
erb: &Language,
|
|
source: &[u8],
|
|
directive_id: u16,
|
|
output_directive_id: u16,
|
|
code_id: u16,
|
|
) -> (Vec<Range>, Vec<usize>) {
|
|
let mut parser = Parser::new();
|
|
parser.set_language(erb).unwrap();
|
|
let tree = parser.parse(source, None).expect("Failed to parse file");
|
|
let mut result = Vec::new();
|
|
let mut line_breaks = vec![];
|
|
|
|
for n in tree.root_node().children(&mut tree.walk()) {
|
|
let kind_id = n.kind_id();
|
|
if kind_id == directive_id || kind_id == output_directive_id {
|
|
for c in n.children(&mut tree.walk()) {
|
|
if c.kind_id() == code_id {
|
|
let mut range = c.range();
|
|
if range.end_byte < source.len() {
|
|
line_breaks.push(range.end_byte);
|
|
range.end_byte += 1;
|
|
range.end_point.column += 1;
|
|
}
|
|
result.push(range);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if result.is_empty() {
|
|
let root = tree.root_node();
|
|
|
|
// Add an empty range at the end of the file
|
|
result.push(Range {
|
|
start_byte: root.end_byte(),
|
|
end_byte: root.end_byte(),
|
|
start_point: root.end_position(),
|
|
end_point: root.end_position(),
|
|
});
|
|
}
|
|
(result, line_breaks)
|
|
}
|
|
|
|
/// Advance `index` to the next non-whitespace character.
|
|
/// Newlines are **not** considered whitespace.
|
|
fn skip_space(content: &[u8], index: usize) -> usize {
|
|
let mut index = index;
|
|
while index < content.len() {
|
|
let c = content[index] as char;
|
|
// white space except \n
|
|
let is_space = c == ' ' || ('\t'..='\r').contains(&c) && c != '\n';
|
|
if !is_space {
|
|
break;
|
|
}
|
|
index += 1;
|
|
}
|
|
index
|
|
}
|
|
fn scan_coding_comment(content: &[u8]) -> std::option::Option<Cow<str>> {
|
|
let mut index = 0;
|
|
// skip UTF-8 BOM marker if there is one
|
|
if content.len() >= 3 && content[0] == 0xef && content[1] == 0xbb && content[2] == 0xbf {
|
|
index += 3;
|
|
}
|
|
// skip #! line if there is one
|
|
if index + 1 < content.len()
|
|
&& content[index] as char == '#'
|
|
&& content[index + 1] as char == '!'
|
|
{
|
|
index += 2;
|
|
while index < content.len() && content[index] as char != '\n' {
|
|
index += 1
|
|
}
|
|
index += 1
|
|
}
|
|
index = skip_space(content, index);
|
|
|
|
if index >= content.len() || content[index] as char != '#' {
|
|
return None;
|
|
}
|
|
index += 1;
|
|
|
|
const CODING: [char; 12] = ['C', 'c', 'O', 'o', 'D', 'd', 'I', 'i', 'N', 'n', 'G', 'g'];
|
|
let mut word_index = 0;
|
|
while index < content.len() && word_index < CODING.len() && content[index] as char != '\n' {
|
|
if content[index] as char == CODING[word_index]
|
|
|| content[index] as char == CODING[word_index + 1]
|
|
{
|
|
word_index += 2
|
|
} else {
|
|
word_index = 0;
|
|
}
|
|
index += 1;
|
|
}
|
|
if word_index < CODING.len() {
|
|
return None;
|
|
}
|
|
index = skip_space(content, index);
|
|
|
|
if index < content.len() && content[index] as char != ':' && content[index] as char != '=' {
|
|
return None;
|
|
}
|
|
index += 1;
|
|
index = skip_space(content, index);
|
|
|
|
let start = index;
|
|
while index < content.len() {
|
|
let c = content[index] as char;
|
|
if c == '-' || c == '_' || c.is_ascii_alphanumeric() {
|
|
index += 1;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
if index > start {
|
|
return Some(String::from_utf8_lossy(&content[start..index]));
|
|
}
|
|
None
|
|
}
|
|
|
|
#[test]
|
|
fn test_scan_coding_comment() {
|
|
let text = "# encoding: utf-8";
|
|
let result = scan_coding_comment(text.as_bytes());
|
|
assert_eq!(result, Some("utf-8".into()));
|
|
|
|
let text = "#coding:utf-8";
|
|
let result = scan_coding_comment(&text.as_bytes());
|
|
assert_eq!(result, Some("utf-8".into()));
|
|
|
|
let text = "# foo\n# encoding: utf-8";
|
|
let result = scan_coding_comment(&text.as_bytes());
|
|
assert_eq!(result, None);
|
|
|
|
let text = "# encoding: latin1 encoding: utf-8";
|
|
let result = scan_coding_comment(&text.as_bytes());
|
|
assert_eq!(result, Some("latin1".into()));
|
|
|
|
let text = "# encoding: nonsense";
|
|
let result = scan_coding_comment(&text.as_bytes());
|
|
assert_eq!(result, Some("nonsense".into()));
|
|
|
|
let text = "# coding = utf-8";
|
|
let result = scan_coding_comment(&text.as_bytes());
|
|
assert_eq!(result, Some("utf-8".into()));
|
|
|
|
let text = "# CODING = utf-8";
|
|
let result = scan_coding_comment(&text.as_bytes());
|
|
assert_eq!(result, Some("utf-8".into()));
|
|
|
|
let text = "# CoDiNg = utf-8";
|
|
let result = scan_coding_comment(&text.as_bytes());
|
|
assert_eq!(result, Some("utf-8".into()));
|
|
|
|
let text = "# blah blahblahcoding = utf-8";
|
|
let result = scan_coding_comment(&text.as_bytes());
|
|
assert_eq!(result, Some("utf-8".into()));
|
|
|
|
// unicode BOM is ignored
|
|
let text = "\u{FEFF}# encoding: utf-8";
|
|
let result = scan_coding_comment(&text.as_bytes());
|
|
assert_eq!(result, Some("utf-8".into()));
|
|
|
|
let text = "\u{FEFF} # encoding: utf-8";
|
|
let result = scan_coding_comment(&text.as_bytes());
|
|
assert_eq!(result, Some("utf-8".into()));
|
|
|
|
let text = "#! /usr/bin/env ruby\n # encoding: utf-8";
|
|
let result = scan_coding_comment(&text.as_bytes());
|
|
assert_eq!(result, Some("utf-8".into()));
|
|
|
|
let text = "\u{FEFF}#! /usr/bin/env ruby\n # encoding: utf-8";
|
|
let result = scan_coding_comment(&text.as_bytes());
|
|
assert_eq!(result, Some("utf-8".into()));
|
|
|
|
// A #! must be the first thing on a line, otherwise it's a normal comment
|
|
let text = " #! /usr/bin/env ruby encoding = utf-8";
|
|
let result = scan_coding_comment(&text.as_bytes());
|
|
assert_eq!(result, Some("utf-8".into()));
|
|
let text = " #! /usr/bin/env ruby \n # encoding = utf-8";
|
|
let result = scan_coding_comment(&text.as_bytes());
|
|
assert_eq!(result, None);
|
|
}
|