skip binary files when extracting JavaScript

2026-04-28 10:15:14 +02:00 · 2020-08-13 13:49:30 +02:00
parent 3469ad7ca6
commit c28889225a
2 changed files with 90 additions and 63 deletions
--- a/javascript/extractor/src/com/semmle/js/extractor/FileExtractor.java
+++ b/javascript/extractor/src/com/semmle/js/extractor/FileExtractor.java
@@ -42,7 +42,68 @@ public class FileExtractor {
      Pattern.compile("^(?s)\\s*\\{\\s*\"([^\"]|\\\\.)*\"\\s*:.*");

  /** The charset for decoding UTF-8 strings. */
-  private static final Charset UTF8_CHARSET = Charset.forName("UTF-8");
+  private static final Charset UTF8_CHARSET = StandardCharsets.UTF_8;
+
+  /**
+   * Returns true if the first `length` bytes hold malformed UTF-8 or unprintable ASCII characters.
+   */
+  private static boolean hasUnprintableUtf8(byte[] bytes, int length) {
+    // Constants for bytes with N high-order 1-bits.
+    // They are typed as `int` as the subsequent byte-to-int promotion would
+    // otherwise fill the high-order `int` bits with 1s.
+    final int high1 = 0b10000000;
+    final int high2 = 0b11000000;
+    final int high3 = 0b11100000;
+    final int high4 = 0b11110000;
+    final int high5 = 0b11111000;
+
+    int startIndex = skipBOM(bytes, length); // Scanning starts after a leading BOM, if any.
+    for (int i = startIndex; i < length; ++i) {
+      int b = bytes[i];
+      if ((b & high1) == 0) { // 0xxxxxxx is an ASCII character
+        // ASCII values 0-31 are unprintable, except 9-13 are whitespace.
+        // 127 is the unprintable DEL character.
+        if (b <= 8 || 14 <= b && b <= 31 || b == 127) {
+          return true;
+        }
+      } else {
+        // Check for malformed UTF-8 multibyte code point
+        int trailingBytes = 0;
+        if ((b & high3) == high2) {
+          trailingBytes = 1; // 110xxxxx 10xxxxxx
+        } else if ((b & high4) == high3) {
+          trailingBytes = 2; // 1110xxxx 10xxxxxx 10xxxxxx
+        } else if ((b & high5) == high4) {
+          trailingBytes = 3; // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        } else {
+          return true; // 10xxxxxx and 11111xxx are not valid here.
+        }
+        // Trailing bytes must be of form 10xxxxxx
+        while (trailingBytes > 0) {
+          ++i;
+          --trailingBytes;
+          if (i >= length) {
+            return false; // A sequence cut off at the end of the buffer is accepted.
+          }
+          if ((bytes[i] & high2) != high1) {
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  }
+
+  /** Returns the index after an initial UTF-16 BOM (FE FF or FF FE), if any, otherwise 0. */
+  private static int skipBOM(byte[] bytes, int length) {
+    if (length >= 2
+        && (bytes[0] == (byte) 0xfe && bytes[1] == (byte) 0xff
+            || bytes[0] == (byte) 0xff && bytes[1] == (byte) 0xfe)) {
+      return 2;
+    } else {
+      return 0; // No UTF-16 BOM found; a UTF-8 BOM (EF BB BF) is not recognized here.
+    }
+  }

  /** Information about supported file types. */
  public static enum FileType {
@@ -66,6 +127,10 @@ public class FileExtractor {

      @Override
      protected boolean contains(File f, String lcExt, ExtractorConfig config) {
+        if (isBinaryFile(f, lcExt, config)) {
+          return false;
+        }
+
        if (super.contains(f, lcExt, config)) return true;

        // detect Node.js scripts that are meant to be run from
@@ -90,6 +155,29 @@ public class FileExtractor {
      public String toString() {
        return "javascript";
      }
+
+      /** Number of bytes to read from the beginning of a ".js" file to detect if it is a binary file. */
+      private static final int fileHeaderSize = 128;
+
+      /** Returns true if the default encoding is UTF-8 and the initial bytes of `f` (at most `fileHeaderSize`) are not printable UTF-8. */
+      private boolean isBinaryFile(File f, String lcExt, ExtractorConfig config) {
+        try (FileInputStream fis = new FileInputStream(f)) {
+          byte[] bytes = new byte[fileHeaderSize];
+          int length = fis.read(bytes); // May be fewer than `fileHeaderSize` bytes.
+
+          if (length == -1) return false; // Nothing could be read (end of file): not binary.
+
+          // Avoid invalid or unprintable UTF-8 files.
+          if (config.getDefaultEncoding().equals(UTF8_CHARSET.name()) && hasUnprintableUtf8(bytes, length)) {
+            return true;
+          }
+
+          return false;
+        } catch (IOException e) {
+          Exceptions.ignore(e, "Let extractor handle this one.");
+        }
+        return false; // Reached only after an IOException: the file is not reported as binary.
+      }
    },

    JSON(".json") {
@@ -160,7 +248,7 @@ public class FileExtractor {
          if (length == -1) return false;

          // Avoid invalid or unprintable UTF-8 files.
-          if (config.getDefaultEncoding().equals("UTF-8") && hasUnprintableUtf8(bytes, length)) {
+          if (config.getDefaultEncoding().equals(UTF8_CHARSET.name()) && hasUnprintableUtf8(bytes, length)) {
            return true;
          }

@@ -182,17 +270,6 @@ public class FileExtractor {
        return false;
      }

-      /** Returns the index after the initial BOM, if any, otherwise 0. */
-      private int skipBOM(byte[] bytes, int length) {
-        if (length >= 2
-            && (bytes[0] == (byte) 0xfe && bytes[1] == (byte) 0xff
-                || bytes[0] == (byte) 0xff && bytes[1] == (byte) 0xfe)) {
-          return 2;
-        } else {
-          return 0;
-        }
-      }
-
      private boolean isXml(byte[] bytes, int length) {
        int startIndex = skipBOM(bytes, length);
        // Check for `<` encoded in Ascii/UTF-8 or litte-endian UTF-16.
@@ -211,56 +288,6 @@ public class FileExtractor {
        return s.startsWith("! TOUCHSTONE file ") || s.startsWith("[Version] 2.0");
      }

-      /**
-       * Returns true if the byte sequence contains invalid UTF-8 or unprintable ASCII characters.
-       */
-      private boolean hasUnprintableUtf8(byte[] bytes, int length) {
-        // Constants for bytes with N high-order 1-bits.
-        // They are typed as `int` as the subsequent byte-to-int promotion would
-        // otherwise fill the high-order `int` bits with 1s.
-        final int high1 = 0b10000000;
-        final int high2 = 0b11000000;
-        final int high3 = 0b11100000;
-        final int high4 = 0b11110000;
-        final int high5 = 0b11111000;
-
-        int startIndex = skipBOM(bytes, length);
-        for (int i = startIndex; i < length; ++i) {
-          int b = bytes[i];
-          if ((b & high1) == 0) { // 0xxxxxxx is an ASCII character
-            // ASCII values 0-31 are unprintable, except 9-13 are whitespace.
-            // 127 is the unprintable DEL character.
-            if (b <= 8 || 14 <= b && b <= 31 || b == 127) {
-              return true;
-            }
-          } else {
-            // Check for malformed UTF-8 multibyte code point
-            int trailingBytes = 0;
-            if ((b & high3) == high2) {
-              trailingBytes = 1; // 110xxxxx 10xxxxxx
-            } else if ((b & high4) == high3) {
-              trailingBytes = 2; // 1110xxxx 10xxxxxx 10xxxxxx
-            } else if ((b & high5) == high4) {
-              trailingBytes = 3; // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-            } else {
-              return true; // 10xxxxxx and 11111xxx are not valid here.
-            }
-            // Trailing bytes must be of form 10xxxxxx
-            while (trailingBytes > 0) {
-              ++i;
-              --trailingBytes;
-              if (i >= length) {
-                return false;
-              }
-              if ((bytes[i] & high2) != high1) {
-                return true;
-              }
-            }
-          }
-        }
-        return false;
-      }
-
      /**
       * Returns true if the byte sequence starts with a shebang line that is not recognized as a
       * JavaScript interpreter.