Ruby: handle magic coding: comments

This commit is contained in:
Arthur Baars
2022-07-18 15:01:25 +02:00
parent ca819573f5
commit 7be106d7bb
7 changed files with 231 additions and 1 deletions

BIN
ruby/Cargo.lock generated

Binary file not shown.

View File

@@ -18,3 +18,5 @@ tracing-subscriber = { version = "0.3.3", features = ["env-filter"] }
rayon = "1.5.0"
num_cpus = "1.13.0"
regex = "1.5.5"
encoding = "0.2"
lazy_static = "1.4.0"

View File

@@ -1,10 +1,14 @@
mod extractor;
#[macro_use]
extern crate lazy_static;
extern crate num_cpus;
use clap::arg;
use encoding::{self};
use flate2::write::GzEncoder;
use rayon::prelude::*;
use std::borrow::Cow;
use std::fs;
use std::io::{BufRead, BufWriter};
use std::path::{Path, PathBuf};
@@ -75,6 +79,25 @@ fn num_codeql_threads() -> usize {
}
}
// Matches the text "cp" followed by one or more decimal digits (for example "cp1252") and
// captures the digits in group 1. The pattern is unanchored and case-sensitive, so it matches
// anywhere inside the searched string and does not match an upper-case "CP".
lazy_static! {
static ref CP_NUMBER: regex::Regex = regex::Regex::new("cp([0-9]+)").unwrap();
}
/// Resolves a declared encoding name to an `encoding::Encoding` implementation.
///
/// The name is first looked up as a WHATWG label. If that lookup fails and the name contains
/// `cp<digits>` (see `CP_NUMBER`), the digits are tried as a Windows code page number.
///
/// Returns `None` when neither lookup succeeds, including when the digits do not fit the
/// integer type expected by the code page lookup.
fn encoding_from_name(encoding_name: &str) -> Option<&(dyn encoding::Encoding + Send + Sync)> {
    encoding::label::encoding_from_whatwg_label(encoding_name).or_else(|| {
        let digits = CP_NUMBER.captures(encoding_name)?.get(1)?.as_str();
        // The digits come straight from the file being extracted, so an absurdly long number
        // must yield `None` rather than panic on integer overflow.
        let code_page = digits.parse().ok()?;
        encoding::label::encoding_from_windows_code_page(code_page)
    })
}
fn main() -> std::io::Result<()> {
tracing_subscriber::fmt()
.with_target(false)
@@ -140,6 +163,7 @@ fn main() -> std::io::Result<()> {
let path = PathBuf::from(line).canonicalize()?;
let src_archive_file = path_for(&src_archive_dir, &path, "");
let mut source = std::fs::read(&path)?;
let mut needs_conversion = false;
let code_ranges;
let mut trap_writer = extractor::new_trap_writer();
if path.extension().map_or(false, |x| x == "erb") {
@@ -168,6 +192,43 @@ fn main() -> std::io::Result<()> {
}
code_ranges = ranges;
} else {
if let Some(encoding_name) = scan_coding_comment(&source) {
// If the input is already UTF-8 then there is no need to recode the source
// If the declared encoding is 'binary' or 'ascii-8bit' then it is not clear how
// to interpret characters. In this case it is probably best to leave the input
// unchanged.
if !encoding_name.eq_ignore_ascii_case("utf-8")
&& !encoding_name.eq_ignore_ascii_case("ascii-8bit")
&& !encoding_name.eq_ignore_ascii_case("binary")
{
if let Some(encoding) = encoding_from_name(&encoding_name) {
needs_conversion =
encoding.whatwg_name().unwrap_or_default() != "utf-8";
if needs_conversion {
match encoding
.decode(&source, encoding::types::DecoderTrap::Replace)
{
Ok(str) => source = str.as_bytes().to_owned(),
Err(msg) => {
needs_conversion = false;
tracing::warn!(
"{}: character decoding failure: {} ({})",
&path.to_string_lossy(),
msg,
&encoding_name
);
}
}
}
} else {
tracing::warn!(
"{}: unknown character encoding: '{}'",
&path.to_string_lossy(),
&encoding_name
);
}
}
}
code_ranges = vec![];
}
extractor::extract(
@@ -180,7 +241,11 @@ fn main() -> std::io::Result<()> {
&code_ranges,
)?;
std::fs::create_dir_all(&src_archive_file.parent().unwrap())?;
std::fs::copy(&path, &src_archive_file)?;
if needs_conversion {
std::fs::write(&src_archive_file, &source)?;
} else {
std::fs::copy(&path, &src_archive_file)?;
}
write_trap(&trap_dir, path, trap_writer, &trap_compression)
})
.expect("failed to extract files");
@@ -299,3 +364,143 @@ fn path_for(dir: &Path, path: &Path, ext: &str) -> PathBuf {
}
result
}
/// Returns the position of the first byte at or after `index` that is not white space.
///
/// White space here is a space or any byte from `\t` through `\r`, except `\n`, so the scan
/// never moves past the end of the current line. An `index` at or beyond the end of `content`
/// is returned unchanged.
fn skip_space(content: &[u8], index: usize) -> usize {
    let blanks = content
        .iter()
        .skip(index)
        // `\t`, vertical tab, form feed and `\r` are the non-newline members of `\t..=\r`
        .take_while(|&&byte| matches!(byte, b' ' | b'\t' | 0x0b | 0x0c | b'\r'))
        .count();
    index + blanks
}
/// Scans the beginning of a source file for a magic `coding` comment and returns the
/// encoding name it declares.
///
/// An optional UTF-8 BOM and an optional `#!` line are skipped first. The next thing on the
/// line (after white space) must be a `#` comment. Within that comment line the word
/// "coding" is searched for in any letter case and anywhere in the text, so "encoding" and
/// "fileencoding" qualify too. It must be followed, after optional white space, by `:` or
/// `=`, and then by a name made of ASCII letters, digits, `-` and `_`.
///
/// Returns `None` when there is no such comment or the name is empty. The name is returned
/// as written; it is not validated against any list of known encodings.
fn scan_coding_comment(content: &[u8]) -> Option<Cow<'_, str>> {
    const UTF8_BOM: &[u8] = &[0xef, 0xbb, 0xbf];
    const CODING: &[u8] = b"coding";

    // skip UTF-8 BOM marker if there is one
    let mut index = if content.starts_with(UTF8_BOM) {
        UTF8_BOM.len()
    } else {
        0
    };

    // skip #! line if there is one
    if content[index..].starts_with(b"#!") {
        index = match content[index..].iter().position(|&byte| byte == b'\n') {
            Some(offset) => index + offset + 1,
            // a lone #! line without a newline leaves nothing to scan
            None => content.len(),
        };
    }

    index = skip_space(content, index);
    if content.get(index) != Some(&b'#') {
        return None;
    }
    index += 1;

    // Search the rest of the comment line for "coding", ignoring ASCII case.
    let mut matched = 0;
    while matched < CODING.len() {
        let byte = match content.get(index) {
            Some(&byte) if byte != b'\n' => byte.to_ascii_lowercase(),
            // end of line or end of input before the word was completed
            _ => return None,
        };
        if byte == CODING[matched] {
            matched += 1;
        } else if byte == CODING[0] {
            // The mismatching byte may itself begin the word (e.g. "ccoding"), so restart
            // with one byte matched instead of discarding it.
            matched = 1;
        } else {
            matched = 0;
        }
        index += 1;
    }

    // the word must be followed by ':' or '=', optionally surrounded by white space
    index = skip_space(content, index);
    match content.get(index).copied() {
        Some(b':') | Some(b'=') => index += 1,
        _ => return None,
    }
    index = skip_space(content, index);

    let start = index;
    let length = content[start..]
        .iter()
        .take_while(|&&byte| byte == b'-' || byte == b'_' || byte.is_ascii_alphanumeric())
        .count();
    if length == 0 {
        return None;
    }
    Some(String::from_utf8_lossy(&content[start..start + length]))
}
#[test]
fn test_scan_coding_comment() {
    // Each entry pairs a source text with the encoding name expected from its leading comment.
    let cases: &[(&str, Option<&str>)] = &[
        ("# encoding: utf-8", Some("utf-8")),
        ("#coding:utf-8", Some("utf-8")),
        // only the first comment line is inspected
        ("# foo\n# encoding: utf-8", None),
        // the first declaration on the line wins
        ("# encoding: latin1 encoding: utf-8", Some("latin1")),
        // the name is reported as written, known or not
        ("# encoding: nonsense", Some("nonsense")),
        ("# coding = utf-8", Some("utf-8")),
        ("# CODING = utf-8", Some("utf-8")),
        ("# CoDiNg = utf-8", Some("utf-8")),
        ("# blah blahblahcoding = utf-8", Some("utf-8")),
        // unicode BOM is ignored
        ("\u{FEFF}# encoding: utf-8", Some("utf-8")),
        ("\u{FEFF} # encoding: utf-8", Some("utf-8")),
        ("#! /usr/bin/env ruby\n # encoding: utf-8", Some("utf-8")),
        ("\u{FEFF}#! /usr/bin/env ruby\n # encoding: utf-8", Some("utf-8")),
        // A #! must be the first thing on a line, otherwise it's a normal comment
        (" #! /usr/bin/env ruby encoding = utf-8", Some("utf-8")),
        (" #! /usr/bin/env ruby \n # encoding = utf-8", None),
    ];

    for &(text, expected) in cases {
        let found = scan_coding_comment(text.as_bytes());
        assert_eq!(found.as_deref(), expected, "input: {:?}", text);
    }
}

View File

@@ -1762,6 +1762,12 @@ escape_sequences/escapes.rb:
# 93| getStmt: [SymbolLiteral] :"\C-?"
# 93| getComponent: [StringEscapeSequenceComponent] \C
# 93| getComponent: [StringTextComponent] -?
misc/iso-8859-15.rb:
# 1| [Toplevel] iso-8859-15.rb
# 4| getStmt: [MethodCall] call to print
# 4| getReceiver: [SelfVariableAccess] self
# 4| getArgument: [StringLiteral] "EUR = €"
# 4| getComponent: [StringTextComponent] EUR = €
literals/literals.rb:
# 1| [Toplevel] literals.rb
# 2| getStmt: [NilLiteral] nil

View File

@@ -4604,6 +4604,17 @@ literals/literals.rb:
# 193| cat file.txt
# 193|
# 195| 1: [HeredocEnd] SCRIPT
misc/iso-8859-15.rb:
# 1| [Program] Program
# 4| 0: [Call] Call
# 4| 0: [Identifier] print
# 4| 1: [ArgumentList] ArgumentList
# 4| 0: [String] String
# 4| 0: [ReservedWord] "
# 4| 1: [StringContent] EUR = €
# 4| 2: [ReservedWord] "
# 1| [Comment] #! /usr/bin/ruby
# 2| [Comment] # coding: iso-8859-15
misc/misc.erb:
# 2| [Program] Program
# 2| 0: [Call] Call

View File

@@ -717,6 +717,7 @@ exprValue
| literals/literals.rb:198:8:198:8 | 5 | 5 | int |
| literals/literals.rb:199:2:199:2 | :y | :y | symbol |
| literals/literals.rb:199:7:199:7 | :Z | :Z | symbol |
| misc/iso-8859-15.rb:4:7:4:17 | "EUR = \u20ac" | EUR = \u20ac | string |
| misc/misc.erb:2:15:2:37 | "main_include_admin.js" | main_include_admin.js | string |
| misc/misc.rb:1:7:1:11 | "bar" | bar | string |
| misc/misc.rb:3:7:3:9 | foo | foo | string |
@@ -1592,6 +1593,7 @@ exprCfgNodeValue
| literals/literals.rb:198:8:198:8 | 5 | 5 | int |
| literals/literals.rb:199:2:199:2 | :y | :y | symbol |
| literals/literals.rb:199:7:199:7 | :Z | :Z | symbol |
| misc/iso-8859-15.rb:4:7:4:17 | "EUR = \u20ac" | EUR = \u20ac | string |
| misc/misc.erb:2:15:2:37 | "main_include_admin.js" | main_include_admin.js | string |
| misc/misc.rb:1:7:1:11 | "bar" | bar | string |
| misc/misc.rb:3:7:3:9 | foo | foo | string |

View File

@@ -0,0 +1,4 @@
#! /usr/bin/ruby
# coding: iso-8859-15
print "EUR = <20>"