skip binary files when extracting JavaScript

This commit is contained in:
Erik Krogh Kristensen
2020-08-13 13:49:30 +02:00
parent 3469ad7ca6
commit c28889225a
2 changed files with 90 additions and 63 deletions

View File

@@ -42,7 +42,68 @@ public class FileExtractor {
Pattern.compile("^(?s)\\s*\\{\\s*\"([^\"]|\\\\.)*\"\\s*:.*");
/** The charset for decoding UTF-8 strings. */
private static final Charset UTF8_CHARSET = Charset.forName("UTF-8");
private static final Charset UTF8_CHARSET = StandardCharsets.UTF_8;
/**
 * Reports whether the first {@code length} bytes of {@code bytes} contain an unprintable ASCII
 * character or a malformed UTF-8 multibyte sequence.
 *
 * <p>A leading byte-order mark recognised by {@code skipBOM} is skipped before scanning. Only the
 * bit patterns of lead and continuation bytes are examined. A multibyte sequence that is cut off
 * by the end of the inspected range is not reported as malformed.
 *
 * @param bytes the buffer to inspect
 * @param length the number of leading bytes of {@code bytes} to inspect
 * @return {@code true} if an unprintable or malformed byte is found, {@code false} otherwise
 */
private static boolean hasUnprintableUtf8(byte[] bytes, int length) {
  int pos = skipBOM(bytes, length);
  while (pos < length) {
    // Mask to an unsigned value so that sign extension cannot disturb the comparisons below.
    int lead = bytes[pos] & 0xff;
    int continuationCount;
    if (lead < 0x80) {
      // 0xxxxxxx: plain ASCII. Control characters 0-31 are unprintable unless they are the
      // whitespace characters 9-13; 127 is the unprintable DEL character.
      boolean whitespace = 9 <= lead && lead <= 13;
      if ((lead < 32 && !whitespace) || lead == 127) {
        return true;
      }
      continuationCount = 0;
    } else if (0xc0 <= lead && lead < 0xe0) {
      continuationCount = 1; // 110xxxxx 10xxxxxx
    } else if (0xe0 <= lead && lead < 0xf0) {
      continuationCount = 2; // 1110xxxx 10xxxxxx 10xxxxxx
    } else if (0xf0 <= lead && lead < 0xf8) {
      continuationCount = 3; // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    } else {
      // 10xxxxxx (a stray continuation byte) and 11111xxx cannot start a code point.
      return true;
    }
    for (int offset = 1; offset <= continuationCount; ++offset) {
      int next = pos + offset;
      if (next >= length) {
        // The sequence runs past the inspected range; nothing seen so far was invalid.
        return false;
      }
      // Every continuation byte must have the form 10xxxxxx.
      if ((bytes[next] & 0xc0) != 0x80) {
        return true;
      }
    }
    pos += continuationCount + 1;
  }
  return false;
}
/**
 * Returns the index of the first byte after a leading two-byte byte-order mark.
 *
 * <p>Only the byte pairs {@code FE FF} and {@code FF FE} are recognised.
 *
 * @param bytes the buffer to inspect
 * @param length the number of valid leading bytes in {@code bytes}
 * @return 2 if the buffer starts with one of the recognised marks, otherwise 0
 */
private static int skipBOM(byte[] bytes, int length) {
  if (length < 2) {
    return 0;
  }
  int first = bytes[0] & 0xff;
  int second = bytes[1] & 0xff;
  boolean feFf = first == 0xfe && second == 0xff;
  boolean ffFe = first == 0xff && second == 0xfe;
  return (feFf || ffFe) ? 2 : 0;
}
/** Information about supported file types. */
public static enum FileType {
@@ -66,6 +127,10 @@ public class FileExtractor {
@Override
protected boolean contains(File f, String lcExt, ExtractorConfig config) {
if (isBinaryFile(f, lcExt, config)) {
return false;
}
if (super.contains(f, lcExt, config)) return true;
// detect Node.js scripts that are meant to be run from
@@ -90,6 +155,29 @@ public class FileExtractor {
public String toString() {
return "javascript";
}
/** Number of bytes read from the start of a ".js" file when checking whether it is binary. */
private static final int fileHeaderSize = 128;

/**
 * Decides whether {@code f} looks like a binary file by inspecting the bytes returned by a single
 * read of up to {@code fileHeaderSize} bytes from its start.
 *
 * <p>The file is reported as binary only when the configured default encoding equals the name of
 * {@code UTF8_CHARSET} and the bytes read contain invalid UTF-8 or unprintable characters. If
 * nothing can be read, or an {@link IOException} occurs, the file is reported as not binary.
 *
 * @param f the file to inspect
 * @param lcExt not used by this check
 * @param config supplies the default encoding
 * @return {@code true} if the file should be treated as binary, {@code false} otherwise
 */
private boolean isBinaryFile(File f, String lcExt, ExtractorConfig config) {
  try (FileInputStream fis = new FileInputStream(f)) {
    byte[] header = new byte[fileHeaderSize];
    int bytesRead = fis.read(header);
    // End of stream reached immediately: there is nothing to inspect.
    if (bytesRead == -1) {
      return false;
    }
    // Avoid invalid or unprintable UTF-8 files; the check applies only to UTF-8 extraction.
    boolean extractedAsUtf8 = config.getDefaultEncoding().equals(UTF8_CHARSET.name());
    return extractedAsUtf8 && hasUnprintableUtf8(header, bytesRead);
  } catch (IOException e) {
    Exceptions.ignore(e, "Let extractor handle this one.");
    return false;
  }
}
},
JSON(".json") {
@@ -160,7 +248,7 @@ public class FileExtractor {
if (length == -1) return false;
// Avoid invalid or unprintable UTF-8 files.
if (config.getDefaultEncoding().equals("UTF-8") && hasUnprintableUtf8(bytes, length)) {
if (config.getDefaultEncoding().equals(UTF8_CHARSET.name()) && hasUnprintableUtf8(bytes, length)) {
return true;
}
@@ -182,17 +270,6 @@ public class FileExtractor {
return false;
}
/**
 * Returns the index of the first byte after a leading two-byte byte-order mark.
 *
 * <p>Only the byte pairs {@code FE FF} and {@code FF FE} are recognised.
 *
 * @param bytes the buffer to inspect
 * @param length the number of valid leading bytes in {@code bytes}
 * @return 2 if the buffer starts with one of the recognised marks, otherwise 0
 */
private int skipBOM(byte[] bytes, int length) {
  if (length < 2) {
    return 0;
  }
  int first = bytes[0] & 0xff;
  int second = bytes[1] & 0xff;
  boolean feFf = first == 0xfe && second == 0xff;
  boolean ffFe = first == 0xff && second == 0xfe;
  return (feFf || ffFe) ? 2 : 0;
}
private boolean isXml(byte[] bytes, int length) {
int startIndex = skipBOM(bytes, length);
// Check for `<` encoded in ASCII/UTF-8 or little-endian UTF-16.
@@ -211,56 +288,6 @@ public class FileExtractor {
return s.startsWith("! TOUCHSTONE file ") || s.startsWith("[Version] 2.0");
}
/**
 * Reports whether the first {@code length} bytes of {@code bytes} contain an unprintable ASCII
 * character or a malformed UTF-8 multibyte sequence.
 *
 * <p>A leading byte-order mark recognised by {@code skipBOM} is skipped before scanning. Only the
 * bit patterns of lead and continuation bytes are examined. A multibyte sequence that is cut off
 * by the end of the inspected range is not reported as malformed.
 *
 * @param bytes the buffer to inspect
 * @param length the number of leading bytes of {@code bytes} to inspect
 * @return {@code true} if an unprintable or malformed byte is found, {@code false} otherwise
 */
private boolean hasUnprintableUtf8(byte[] bytes, int length) {
  int pos = skipBOM(bytes, length);
  while (pos < length) {
    // Mask to an unsigned value so that sign extension cannot disturb the comparisons below.
    int lead = bytes[pos] & 0xff;
    int continuationCount;
    if (lead < 0x80) {
      // 0xxxxxxx: plain ASCII. Control characters 0-31 are unprintable unless they are the
      // whitespace characters 9-13; 127 is the unprintable DEL character.
      boolean whitespace = 9 <= lead && lead <= 13;
      if ((lead < 32 && !whitespace) || lead == 127) {
        return true;
      }
      continuationCount = 0;
    } else if (0xc0 <= lead && lead < 0xe0) {
      continuationCount = 1; // 110xxxxx 10xxxxxx
    } else if (0xe0 <= lead && lead < 0xf0) {
      continuationCount = 2; // 1110xxxx 10xxxxxx 10xxxxxx
    } else if (0xf0 <= lead && lead < 0xf8) {
      continuationCount = 3; // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    } else {
      // 10xxxxxx (a stray continuation byte) and 11111xxx cannot start a code point.
      return true;
    }
    for (int offset = 1; offset <= continuationCount; ++offset) {
      int next = pos + offset;
      if (next >= length) {
        // The sequence runs past the inspected range; nothing seen so far was invalid.
        return false;
      }
      // Every continuation byte must have the form 10xxxxxx.
      if ((bytes[next] & 0xc0) != 0x80) {
        return true;
      }
    }
    pos += continuationCount + 1;
  }
  return false;
}
/**
* Returns true if the byte sequence starts with a shebang line that is not recognized as a
* JavaScript interpreter.