mirror of
https://github.com/github/codeql.git
synced 2026-04-28 10:15:14 +02:00
skip binary files when extracting JavaScript
This commit is contained in:
@@ -42,7 +42,68 @@ public class FileExtractor {
|
||||
Pattern.compile("^(?s)\\s*\\{\\s*\"([^\"]|\\\\.)*\"\\s*:.*");
|
||||
|
||||
/** The charset for decoding UTF-8 strings. */
|
||||
private static final Charset UTF8_CHARSET = Charset.forName("UTF-8");
|
||||
private static final Charset UTF8_CHARSET = StandardCharsets.UTF_8;
|
||||
|
||||
/**
|
||||
* Returns true if the byte sequence contains invalid UTF-8 or unprintable ASCII characters.
|
||||
*/
|
||||
private static boolean hasUnprintableUtf8(byte[] bytes, int length) {
|
||||
// Constants for bytes with N high-order 1-bits.
|
||||
// They are typed as `int` as the subsequent byte-to-int promotion would
|
||||
// otherwise fill the high-order `int` bits with 1s.
|
||||
final int high1 = 0b10000000;
|
||||
final int high2 = 0b11000000;
|
||||
final int high3 = 0b11100000;
|
||||
final int high4 = 0b11110000;
|
||||
final int high5 = 0b11111000;
|
||||
|
||||
int startIndex = skipBOM(bytes, length);
|
||||
for (int i = startIndex; i < length; ++i) {
|
||||
int b = bytes[i];
|
||||
if ((b & high1) == 0) { // 0xxxxxxx is an ASCII character
|
||||
// ASCII values 0-31 are unprintable, except 9-13 are whitespace.
|
||||
// 127 is the unprintable DEL character.
|
||||
if (b <= 8 || 14 <= b && b <= 31 || b == 127) {
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
// Check for malformed UTF-8 multibyte code point
|
||||
int trailingBytes = 0;
|
||||
if ((b & high3) == high2) {
|
||||
trailingBytes = 1; // 110xxxxx 10xxxxxx
|
||||
} else if ((b & high4) == high3) {
|
||||
trailingBytes = 2; // 1110xxxx 10xxxxxx 10xxxxxx
|
||||
} else if ((b & high5) == high4) {
|
||||
trailingBytes = 3; // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
} else {
|
||||
return true; // 10xxxxxx and 11111xxx are not valid here.
|
||||
}
|
||||
// Trailing bytes must be of form 10xxxxxx
|
||||
while (trailingBytes > 0) {
|
||||
++i;
|
||||
--trailingBytes;
|
||||
if (i >= length) {
|
||||
return false;
|
||||
}
|
||||
if ((bytes[i] & high2) != high1) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Returns the index after the initial BOM, if any, otherwise 0. */
|
||||
private static int skipBOM(byte[] bytes, int length) {
|
||||
if (length >= 2
|
||||
&& (bytes[0] == (byte) 0xfe && bytes[1] == (byte) 0xff
|
||||
|| bytes[0] == (byte) 0xff && bytes[1] == (byte) 0xfe)) {
|
||||
return 2;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/** Information about supported file types. */
|
||||
public static enum FileType {
|
||||
@@ -66,6 +127,10 @@ public class FileExtractor {
|
||||
|
||||
@Override
|
||||
protected boolean contains(File f, String lcExt, ExtractorConfig config) {
|
||||
if (isBinaryFile(f, lcExt, config)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (super.contains(f, lcExt, config)) return true;
|
||||
|
||||
// detect Node.js scripts that are meant to be run from
|
||||
@@ -90,6 +155,29 @@ public class FileExtractor {
|
||||
public String toString() {
|
||||
return "javascript";
|
||||
}
|
||||
|
||||
/** Number of bytes to read from the beginning of a ".js" file to detect if it is a binary file. */
|
||||
private static final int fileHeaderSize = 128;
|
||||
|
||||
/** Computes if `f` is a binary file based on whether the initial `fileHeaderSize` bytes are printable UTF-8 chars. */
|
||||
private boolean isBinaryFile(File f, String lcExt, ExtractorConfig config) {
|
||||
try (FileInputStream fis = new FileInputStream(f)) {
|
||||
byte[] bytes = new byte[fileHeaderSize];
|
||||
int length = fis.read(bytes);
|
||||
|
||||
if (length == -1) return false;
|
||||
|
||||
// Avoid invalid or unprintable UTF-8 files.
|
||||
if (config.getDefaultEncoding().equals(UTF8_CHARSET.name()) && hasUnprintableUtf8(bytes, length)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
} catch (IOException e) {
|
||||
Exceptions.ignore(e, "Let extractor handle this one.");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
},
|
||||
|
||||
JSON(".json") {
|
||||
@@ -160,7 +248,7 @@ public class FileExtractor {
|
||||
if (length == -1) return false;
|
||||
|
||||
// Avoid invalid or unprintable UTF-8 files.
|
||||
if (config.getDefaultEncoding().equals("UTF-8") && hasUnprintableUtf8(bytes, length)) {
|
||||
if (config.getDefaultEncoding().equals(UTF8_CHARSET.name()) && hasUnprintableUtf8(bytes, length)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -182,17 +270,6 @@ public class FileExtractor {
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Returns the index after the initial BOM, if any, otherwise 0. */
|
||||
private int skipBOM(byte[] bytes, int length) {
|
||||
if (length >= 2
|
||||
&& (bytes[0] == (byte) 0xfe && bytes[1] == (byte) 0xff
|
||||
|| bytes[0] == (byte) 0xff && bytes[1] == (byte) 0xfe)) {
|
||||
return 2;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isXml(byte[] bytes, int length) {
|
||||
int startIndex = skipBOM(bytes, length);
|
||||
// Check for `<` encoded in ASCII/UTF-8 or little-endian UTF-16.
|
||||
@@ -211,56 +288,6 @@ public class FileExtractor {
|
||||
return s.startsWith("! TOUCHSTONE file ") || s.startsWith("[Version] 2.0");
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the byte sequence contains invalid UTF-8 or unprintable ASCII characters.
|
||||
*/
|
||||
private boolean hasUnprintableUtf8(byte[] bytes, int length) {
|
||||
// Constants for bytes with N high-order 1-bits.
|
||||
// They are typed as `int` as the subsequent byte-to-int promotion would
|
||||
// otherwise fill the high-order `int` bits with 1s.
|
||||
final int high1 = 0b10000000;
|
||||
final int high2 = 0b11000000;
|
||||
final int high3 = 0b11100000;
|
||||
final int high4 = 0b11110000;
|
||||
final int high5 = 0b11111000;
|
||||
|
||||
int startIndex = skipBOM(bytes, length);
|
||||
for (int i = startIndex; i < length; ++i) {
|
||||
int b = bytes[i];
|
||||
if ((b & high1) == 0) { // 0xxxxxxx is an ASCII character
|
||||
// ASCII values 0-31 are unprintable, except 9-13 are whitespace.
|
||||
// 127 is the unprintable DEL character.
|
||||
if (b <= 8 || 14 <= b && b <= 31 || b == 127) {
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
// Check for malformed UTF-8 multibyte code point
|
||||
int trailingBytes = 0;
|
||||
if ((b & high3) == high2) {
|
||||
trailingBytes = 1; // 110xxxxx 10xxxxxx
|
||||
} else if ((b & high4) == high3) {
|
||||
trailingBytes = 2; // 1110xxxx 10xxxxxx 10xxxxxx
|
||||
} else if ((b & high5) == high4) {
|
||||
trailingBytes = 3; // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
} else {
|
||||
return true; // 10xxxxxx and 11111xxx are not valid here.
|
||||
}
|
||||
// Trailing bytes must be of form 10xxxxxx
|
||||
while (trailingBytes > 0) {
|
||||
++i;
|
||||
--trailingBytes;
|
||||
if (i >= length) {
|
||||
return false;
|
||||
}
|
||||
if ((bytes[i] & high2) != high1) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the byte sequence starts with a shebang line that is not recognized as a
|
||||
* JavaScript interpreter.
|
||||
|
||||
Reference in New Issue
Block a user