mirror of
https://github.com/github/codeql.git
synced 2026-04-30 19:26:02 +02:00
Ruby: handle magic coding: comments
This commit is contained in:
BIN
ruby/Cargo.lock
generated
BIN
ruby/Cargo.lock
generated
Binary file not shown.
@@ -18,3 +18,5 @@ tracing-subscriber = { version = "0.3.3", features = ["env-filter"] }
|
||||
rayon = "1.5.0"
|
||||
num_cpus = "1.13.0"
|
||||
regex = "1.5.5"
|
||||
encoding = "0.2"
|
||||
lazy_static = "1.4.0"
|
||||
|
||||
@@ -1,10 +1,14 @@
|
||||
mod extractor;
|
||||
|
||||
#[macro_use]
|
||||
extern crate lazy_static;
|
||||
extern crate num_cpus;
|
||||
|
||||
use clap::arg;
|
||||
use encoding::{self};
|
||||
use flate2::write::GzEncoder;
|
||||
use rayon::prelude::*;
|
||||
use std::borrow::Cow;
|
||||
use std::fs;
|
||||
use std::io::{BufRead, BufWriter};
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -75,6 +79,25 @@ fn num_codeql_threads() -> usize {
|
||||
}
|
||||
}
|
||||
|
||||
lazy_static! {
    /// Matches a Windows code-page style name: the letters `cp` (in any letter case)
    /// followed by one or more decimal digits, anywhere in the input. Capture group 1
    /// holds the digits, i.e. the code-page number.
    ///
    /// The match is case-insensitive so that `CP932` and `cp932` are treated alike.
    static ref CP_NUMBER: regex::Regex =
        regex::Regex::new("(?i)cp([0-9]+)").expect("CP_NUMBER pattern is a valid regex");
}
|
||||
|
||||
fn encoding_from_name(encoding_name: &str) -> Option<&(dyn encoding::Encoding + Send + Sync)> {
|
||||
match encoding::label::encoding_from_whatwg_label(&encoding_name) {
|
||||
Some(e) => return Some(e),
|
||||
None => {
|
||||
if let Some(cap) = CP_NUMBER.captures(&encoding_name) {
|
||||
return encoding::label::encoding_from_windows_code_page(
|
||||
str::parse(cap.get(1).unwrap().as_str()).unwrap(),
|
||||
);
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> std::io::Result<()> {
|
||||
tracing_subscriber::fmt()
|
||||
.with_target(false)
|
||||
@@ -140,6 +163,7 @@ fn main() -> std::io::Result<()> {
|
||||
let path = PathBuf::from(line).canonicalize()?;
|
||||
let src_archive_file = path_for(&src_archive_dir, &path, "");
|
||||
let mut source = std::fs::read(&path)?;
|
||||
let mut needs_conversion = false;
|
||||
let code_ranges;
|
||||
let mut trap_writer = extractor::new_trap_writer();
|
||||
if path.extension().map_or(false, |x| x == "erb") {
|
||||
@@ -168,6 +192,43 @@ fn main() -> std::io::Result<()> {
|
||||
}
|
||||
code_ranges = ranges;
|
||||
} else {
|
||||
if let Some(encoding_name) = scan_coding_comment(&source) {
|
||||
// If the input is already UTF-8 then there is no need to recode the source
|
||||
// If the declared encoding is 'binary' or 'ascii-8bit' then it is not clear how
|
||||
// to interpret characters. In this case it is probably best to leave the input
|
||||
// unchanged.
|
||||
if !encoding_name.eq_ignore_ascii_case("utf-8")
|
||||
&& !encoding_name.eq_ignore_ascii_case("ascii-8bit")
|
||||
&& !encoding_name.eq_ignore_ascii_case("binary")
|
||||
{
|
||||
if let Some(encoding) = encoding_from_name(&encoding_name) {
|
||||
needs_conversion =
|
||||
encoding.whatwg_name().unwrap_or_default() != "utf-8";
|
||||
if needs_conversion {
|
||||
match encoding
|
||||
.decode(&source, encoding::types::DecoderTrap::Replace)
|
||||
{
|
||||
Ok(str) => source = str.as_bytes().to_owned(),
|
||||
Err(msg) => {
|
||||
needs_conversion = false;
|
||||
tracing::warn!(
|
||||
"{}: character decoding failure: {} ({})",
|
||||
&path.to_string_lossy(),
|
||||
msg,
|
||||
&encoding_name
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
tracing::warn!(
|
||||
"{}: unknown character encoding: '{}'",
|
||||
&path.to_string_lossy(),
|
||||
&encoding_name
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
code_ranges = vec![];
|
||||
}
|
||||
extractor::extract(
|
||||
@@ -180,7 +241,11 @@ fn main() -> std::io::Result<()> {
|
||||
&code_ranges,
|
||||
)?;
|
||||
std::fs::create_dir_all(&src_archive_file.parent().unwrap())?;
|
||||
std::fs::copy(&path, &src_archive_file)?;
|
||||
if needs_conversion {
|
||||
std::fs::write(&src_archive_file, &source)?;
|
||||
} else {
|
||||
std::fs::copy(&path, &src_archive_file)?;
|
||||
}
|
||||
write_trap(&trap_dir, path, trap_writer, &trap_compression)
|
||||
})
|
||||
.expect("failed to extract files");
|
||||
@@ -299,3 +364,143 @@ fn path_for(dir: &Path, path: &Path, ext: &str) -> PathBuf {
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
/// Returns the position of the first byte at or after `index` that is not horizontal
/// white space.
///
/// A byte counts as white space when it is a space, tab, vertical tab, form feed or
/// carriage return; a line feed is deliberately NOT skipped. If `index` is at or past
/// the end of `content`, `index` is returned unchanged.
fn skip_space(content: &[u8], index: usize) -> usize {
    let blanks = content
        .iter()
        .skip(index)
        // 0x0b..=b'\r' covers vertical tab, form feed and carriage return.
        .take_while(|&&byte| matches!(byte, b' ' | b'\t' | 0x0b..=b'\r'))
        .count();
    index + blanks
}
|
||||
|
||||
fn scan_coding_comment(content: &[u8]) -> std::option::Option<Cow<str>> {
|
||||
let mut index = 0;
|
||||
// skip UTF-8 BOM marker if there is one
|
||||
if content.len() >= 3 && content[0] == 0xef && content[1] == 0xbb && content[2] == 0xbf {
|
||||
index += 3;
|
||||
}
|
||||
// skip #! line if there is one
|
||||
if index + 1 < content.len()
|
||||
&& content[index] as char == '#'
|
||||
&& content[index + 1] as char == '!'
|
||||
{
|
||||
index += 2;
|
||||
while index < content.len() && content[index] as char != '\n' {
|
||||
index += 1
|
||||
}
|
||||
index += 1
|
||||
}
|
||||
index = skip_space(content, index);
|
||||
|
||||
if index >= content.len() || content[index] as char != '#' {
|
||||
return None;
|
||||
}
|
||||
index += 1;
|
||||
|
||||
const CODING: [char; 12] = ['C', 'c', 'O', 'o', 'D', 'd', 'I', 'i', 'N', 'n', 'G', 'g'];
|
||||
let mut word_index = 0;
|
||||
while index < content.len() && word_index < CODING.len() && content[index] as char != '\n' {
|
||||
if content[index] as char == CODING[word_index]
|
||||
|| content[index] as char == CODING[word_index + 1]
|
||||
{
|
||||
word_index += 2
|
||||
} else {
|
||||
word_index = 0;
|
||||
}
|
||||
index += 1;
|
||||
}
|
||||
if word_index < CODING.len() {
|
||||
return None;
|
||||
}
|
||||
index = skip_space(content, index);
|
||||
|
||||
if index < content.len() && content[index] as char != ':' && content[index] as char != '=' {
|
||||
return None;
|
||||
}
|
||||
index += 1;
|
||||
index = skip_space(content, index);
|
||||
|
||||
let start = index;
|
||||
while index < content.len() {
|
||||
let c = content[index] as char;
|
||||
if c == '-' || c == '_' || c.is_ascii_alphanumeric() {
|
||||
index += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if index > start {
|
||||
return Some(String::from_utf8_lossy(&content[start..index]));
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Table-driven check of `scan_coding_comment`: each entry pairs an input text with the
/// encoding name expected to be extracted from it (`None` when no magic comment should
/// be recognised).
#[test]
fn test_scan_coding_comment() {
    let cases: &[(&str, Option<&str>)] = &[
        ("# encoding: utf-8", Some("utf-8")),
        ("#coding:utf-8", Some("utf-8")),
        // only the first line is inspected
        ("# foo\n# encoding: utf-8", None),
        // the first declaration on the line wins
        ("# encoding: latin1 encoding: utf-8", Some("latin1")),
        // the name is not validated
        ("# encoding: nonsense", Some("nonsense")),
        ("# coding = utf-8", Some("utf-8")),
        ("# CODING = utf-8", Some("utf-8")),
        ("# CoDiNg = utf-8", Some("utf-8")),
        ("# blah blahblahcoding = utf-8", Some("utf-8")),
        // unicode BOM is ignored
        ("\u{FEFF}# encoding: utf-8", Some("utf-8")),
        ("\u{FEFF} # encoding: utf-8", Some("utf-8")),
        ("#! /usr/bin/env ruby\n # encoding: utf-8", Some("utf-8")),
        ("\u{FEFF}#! /usr/bin/env ruby\n # encoding: utf-8", Some("utf-8")),
        // A #! must be the first thing on a line, otherwise it's a normal comment
        (" #! /usr/bin/env ruby encoding = utf-8", Some("utf-8")),
        (" #! /usr/bin/env ruby \n # encoding = utf-8", None),
    ];

    for (text, expected) in cases.iter() {
        let result = scan_coding_comment(text.as_bytes());
        assert_eq!(result, expected.map(Cow::from), "input: {:?}", text);
    }
}
|
||||
|
||||
@@ -1762,6 +1762,12 @@ escape_sequences/escapes.rb:
|
||||
# 93| getStmt: [SymbolLiteral] :"\C-?"
|
||||
# 93| getComponent: [StringEscapeSequenceComponent] \C
|
||||
# 93| getComponent: [StringTextComponent] -?
|
||||
misc/iso-8859-15.rb:
|
||||
# 1| [Toplevel] iso-8859-15.rb
|
||||
# 4| getStmt: [MethodCall] call to print
|
||||
# 4| getReceiver: [SelfVariableAccess] self
|
||||
# 4| getArgument: [StringLiteral] "EUR = €"
|
||||
# 4| getComponent: [StringTextComponent] EUR = €
|
||||
literals/literals.rb:
|
||||
# 1| [Toplevel] literals.rb
|
||||
# 2| getStmt: [NilLiteral] nil
|
||||
|
||||
@@ -4604,6 +4604,17 @@ literals/literals.rb:
|
||||
# 193| cat file.txt
|
||||
# 193|
|
||||
# 195| 1: [HeredocEnd] SCRIPT
|
||||
misc/iso-8859-15.rb:
|
||||
# 1| [Program] Program
|
||||
# 4| 0: [Call] Call
|
||||
# 4| 0: [Identifier] print
|
||||
# 4| 1: [ArgumentList] ArgumentList
|
||||
# 4| 0: [String] String
|
||||
# 4| 0: [ReservedWord] "
|
||||
# 4| 1: [StringContent] EUR = €
|
||||
# 4| 2: [ReservedWord] "
|
||||
# 1| [Comment] #! /usr/bin/ruby
|
||||
# 2| [Comment] # coding: iso-8859-15
|
||||
misc/misc.erb:
|
||||
# 2| [Program] Program
|
||||
# 2| 0: [Call] Call
|
||||
|
||||
@@ -717,6 +717,7 @@ exprValue
|
||||
| literals/literals.rb:198:8:198:8 | 5 | 5 | int |
|
||||
| literals/literals.rb:199:2:199:2 | :y | :y | symbol |
|
||||
| literals/literals.rb:199:7:199:7 | :Z | :Z | symbol |
|
||||
| misc/iso-8859-15.rb:4:7:4:17 | "EUR = \u20ac" | EUR = \u20ac | string |
|
||||
| misc/misc.erb:2:15:2:37 | "main_include_admin.js" | main_include_admin.js | string |
|
||||
| misc/misc.rb:1:7:1:11 | "bar" | bar | string |
|
||||
| misc/misc.rb:3:7:3:9 | foo | foo | string |
|
||||
@@ -1592,6 +1593,7 @@ exprCfgNodeValue
|
||||
| literals/literals.rb:198:8:198:8 | 5 | 5 | int |
|
||||
| literals/literals.rb:199:2:199:2 | :y | :y | symbol |
|
||||
| literals/literals.rb:199:7:199:7 | :Z | :Z | symbol |
|
||||
| misc/iso-8859-15.rb:4:7:4:17 | "EUR = \u20ac" | EUR = \u20ac | string |
|
||||
| misc/misc.erb:2:15:2:37 | "main_include_admin.js" | main_include_admin.js | string |
|
||||
| misc/misc.rb:1:7:1:11 | "bar" | bar | string |
|
||||
| misc/misc.rb:3:7:3:9 | foo | foo | string |
|
||||
|
||||
4
ruby/ql/test/library-tests/ast/misc/iso-8859-15.rb
Normal file
4
ruby/ql/test/library-tests/ast/misc/iso-8859-15.rb
Normal file
@@ -0,0 +1,4 @@
|
||||
#! /usr/bin/ruby
|
||||
# coding: iso-8859-15
|
||||
|
||||
print "EUR = <20>"
|
||||
Reference in New Issue
Block a user