Merge pull request #20940 from asgerf/js/detect-minified-files

JS: Skip minified file if avg line length > 200
2026-02-12 05:01:06 +01:00 · 2026-01-19 14:31:09 +01:00
parent dc7ce3fba3 06cc323aee
commit bedb80346a
9 changed files with 190 additions and 7 deletions
--- a/javascript/extractor/src/com/semmle/js/extractor/AutoBuild.java
+++ b/javascript/extractor/src/com/semmle/js/extractor/AutoBuild.java
@@ -408,8 +408,10 @@ public class AutoBuild {
    for (String extension : fileTypes.keySet()) patterns.add("**/*" + extension);

    // exclude files whose name strongly suggests they are minified
-    patterns.add("-**/*.min.js");
-    patterns.add("-**/*-min.js");
+    if (!EnvironmentVariables.allowMinifiedFiles()) {
+      patterns.add("-**/*.min.js");
+      patterns.add("-**/*-min.js");
+    }

    // exclude `node_modules` and `bower_components`
    patterns.add("-**/node_modules");
@@ -1074,6 +1076,7 @@ protected DependencyInstallationResult preparePackagesAndDependencies(Set<Path>
    config = config.withSourceType(getSourceType());
    config = config.withVirtualSourceRoot(virtualSourceRoot);
    if (defaultEncoding != null) config = config.withDefaultEncoding(defaultEncoding);
+    config = config.withAllowMinified(EnvironmentVariables.allowMinifiedFiles());
    return config;
  }

--- a/javascript/extractor/src/com/semmle/js/extractor/EnvironmentVariables.java
+++ b/javascript/extractor/src/com/semmle/js/extractor/EnvironmentVariables.java
@@ -101,4 +101,12 @@ public class EnvironmentVariables {
  public static boolean isActionsExtractor() {
    return Env.systemEnv().getNonEmpty(CODEQL_EXTRACTOR_ACTIONS_WIP_DATABASE_ENV_VAR) != null;
  }
+
+  /**
+   * Tells whether minified files may be extracted; false unless the variable parses as true.
+   */
+  public static boolean allowMinifiedFiles() {
+    String flag = Env.systemEnv().getNonEmpty("CODEQL_EXTRACTOR_JAVASCRIPT_ALLOW_MINIFIED_FILES");
+    return flag != null && Boolean.parseBoolean(flag);
+  }
 }
--- a/javascript/extractor/src/com/semmle/js/extractor/ExtractorConfig.java
+++ b/javascript/extractor/src/com/semmle/js/extractor/ExtractorConfig.java
@@ -205,6 +205,9 @@ public class ExtractorConfig {
  /** Should parse errors be reported as violations instead of aborting extraction? */
  private boolean tolerateParseErrors;

+  /** Should minified files be allowed? */
+  private boolean allowMinified;
+
  /** How should HTML files be extracted? */
  private HtmlPopulator.Config htmlHandling;

@@ -236,6 +239,7 @@ public class ExtractorConfig {
    this.sourceType = SourceType.AUTO;
    this.htmlHandling = HtmlPopulator.Config.ELEMENTS;
    this.tolerateParseErrors = true;
+    this.allowMinified = false;
    if (experimental) {
      this.mozExtensions = true;
      this.jscript = true;
@@ -258,6 +262,7 @@ public class ExtractorConfig {
    this.v8Extensions = that.v8Extensions;
    this.e4x = that.e4x;
    this.tolerateParseErrors = that.tolerateParseErrors;
+    this.allowMinified = that.allowMinified;
    this.fileType = that.fileType;
    this.sourceType = that.sourceType;
    this.htmlHandling = that.htmlHandling;
@@ -357,6 +362,16 @@ public class ExtractorConfig {
    return res;
  }

+  public boolean isAllowMinified() {
+    return this.allowMinified; // true if minified files should be allowed
+  }
+
+  public ExtractorConfig withAllowMinified(boolean allowMinified) {
+    ExtractorConfig copy = new ExtractorConfig(this); // modify a copy, not this instance
+    copy.allowMinified = allowMinified;
+    return copy;
+  }
+
  public boolean hasFileType() {
    return fileType != null;
  }
@@ -467,6 +482,8 @@ public class ExtractorConfig {
        + e4x
        + ", tolerateParseErrors="
        + tolerateParseErrors
+        + ", allowMinified="
+        + allowMinified
        + ", htmlHandling="
        + htmlHandling
        + ", fileType="
--- a/javascript/extractor/src/com/semmle/js/extractor/FileExtractor.java
+++ b/javascript/extractor/src/com/semmle/js/extractor/FileExtractor.java
@@ -549,10 +549,15 @@ public class FileExtractor {
          new TextualExtractor(
              trapwriter, locationManager, source, config.getExtractLines(), metrics, extractedFile);
      ParseResultInfo loc = extractor.extract(textualExtractor);
-      int numLines = textualExtractor.isSnippet() ? 0 : textualExtractor.getNumLines();
-      int linesOfCode = loc.getLinesOfCode(), linesOfComments = loc.getLinesOfComments();
-      trapwriter.addTuple("numlines", fileLabel, numLines, linesOfCode, linesOfComments);
-      trapwriter.addTuple("filetype", fileLabel, fileType.toString());
+      if (loc.getSkipReason() != null) {
+        System.err.println("Skipping file " + extractedFile + ": " + loc.getSkipReason());
+        System.err.flush();
+      } else {
+        int numLines = textualExtractor.isSnippet() ? 0 : textualExtractor.getNumLines();
+        int linesOfCode = loc.getLinesOfCode(), linesOfComments = loc.getLinesOfComments();
+        trapwriter.addTuple("numlines", fileLabel, numLines, linesOfCode, linesOfComments);
+        trapwriter.addTuple("filetype", fileLabel, fileType.toString());
+      }
      metrics.stopPhase(ExtractionPhase.FileExtractor_extractContents);
      metrics.writeTimingsToTrap(trapwriter);
      successful = true;
--- a/javascript/extractor/src/com/semmle/js/extractor/ParseResultInfo.java
+++ b/javascript/extractor/src/com/semmle/js/extractor/ParseResultInfo.java
@@ -10,6 +10,7 @@ import java.util.List;
 public class ParseResultInfo {
  private int linesOfCode, linesOfComments;
  private List<ParseError> parseErrors;
+  private String skipReason;

  public ParseResultInfo(int linesOfCode, int linesOfComments, List<ParseError> parseErrors) {
    this.linesOfCode = linesOfCode;
@@ -17,6 +18,19 @@ public class ParseResultInfo {
    this.parseErrors = new ArrayList<>(parseErrors);
  }

+  /** Creates an empty result (no lines, no parse errors) that carries the given skip reason. */
+  private ParseResultInfo(String skipReason) {
+    this.linesOfCode = 0;
+    this.linesOfComments = 0;
+    this.parseErrors = new ArrayList<>();
+    this.skipReason = skipReason;
+  }
+
+  /** Returns a result recording that extraction was skipped for the given {@code reason}. */
+  public static final ParseResultInfo skipped(String reason) {
+    return new ParseResultInfo(reason);
+  }
+
  public void add(ParseResultInfo that) {
    this.linesOfCode += that.linesOfCode;
    this.linesOfComments += that.linesOfComments;
@@ -41,4 +55,11 @@ public class ParseResultInfo {
  public List<ParseError> getParseErrors() {
    return parseErrors;
  }
+
+  /**
+   * Gets the reason extraction of this file was skipped, or {@code null} if it was not skipped.
+   */
+  public String getSkipReason() {
+    return this.skipReason;
+  }
 }
--- a/javascript/extractor/src/com/semmle/js/extractor/ScriptExtractor.java
+++ b/javascript/extractor/src/com/semmle/js/extractor/ScriptExtractor.java
@@ -38,10 +38,34 @@ public class ScriptExtractor implements IExtractor {
    return extension.equals(".cjs") || (extension.equals(".js") && "commonjs".equals(packageType));
  }

+  /** Returns true if the average line length of {@code source} exceeds 200 characters. */
+  private boolean isMinified(String source) {
+    int length = source.length();
+    int numberOfLines = 0;
+    for (int i = 0; i < length; i++) {
+      char c = source.charAt(i);
+      if (c == '\r' && i + 1 < length && source.charAt(i + 1) == '\n') {
+        i++; // \r\n is one line terminator, so skip its \n
+      }
+      if (c == '\n' || c == '\r') numberOfLines++;
+    }
+    // A last line without a terminator is still a line. Dividing by the number of line breaks
+    // alone would judge two 150-character lines lacking a final newline as one 301-character line.
+    if (length > 0 && source.charAt(length - 1) != '\n' && source.charAt(length - 1) != '\r') {
+      numberOfLines++;
+    }
+    return numberOfLines > 0 && length / numberOfLines > 200;
+  }
+
  @Override
  public ParseResultInfo extract(TextualExtractor textualExtractor) {
    LocationManager locationManager = textualExtractor.getLocationManager();
    String source = textualExtractor.getSource();
+
+    if (!config.isAllowMinified() && isMinified(source)) {
+      return ParseResultInfo.skipped("File appears to be minified.");
+    }
+
    String shebangLine = null, shebangLineTerm = null;

    if (source.startsWith("#!")) {
--- a/javascript/ql/src/change-notes/2025-12-05-skip-minified-files.md
+++ b/javascript/ql/src/change-notes/2025-12-05-skip-minified-files.md
@@ -0,0 +1,6 @@
+---
+category: majorAnalysis
+---
+* JavaScript files with an average line length greater than 200 characters are now considered minified and will no longer be analyzed.
+  For use cases where minified files should be analyzed, the original behavior can be restored by setting the environment variable
+  `CODEQL_EXTRACTOR_JAVASCRIPT_ALLOW_MINIFIED_FILES=true`.
--- a/javascript/ql/test/query-tests/Security/CWE-400/ReDoS/regexplib/dates.js
+++ b/javascript/ql/test/query-tests/Security/CWE-400/ReDoS/regexplib/dates.js
@@ -132,3 +132,103 @@
 /^(([0-9])|([0-1][0-9])|([2][0-3])):?([0-5][0-9])$/g;
 /^[\w-\.]+@([\w-]+\.)+[\w-]{2,3}$/g;
 /(((0[1-9]|[12][0-9]|3[01])([/])(0[13578]|10|12)([/])(\d{4}))|(([0][1-9]|[12][0-9]|30)([/])(0[469]|11)([/])(\d{4}))|((0[1-9]|1[0-9]|2[0-8])([/])(02)([/])(\d{4}))|((29)(\.|-|\/)(02)([/])([02468][048]00))|((29)([/])(02)([/])([13579][26]00))|((29)([/])(02)([/])([0-9][0-9][0][48]))|((29)([/])(02)([/])([0-9][0-9][2468][048]))|((29)([/])(02)([/])([0-9][0-9][13579][26])))/g;
+//
+// The comment-only lines below lower the average line length so the file is not classified as minified.
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
--- a/javascript/ql/test/query-tests/filters/ClassifyFiles/ClassifyFiles.expected
+++ b/javascript/ql/test/query-tests/filters/ClassifyFiles/ClassifyFiles.expected
@@ -17,7 +17,6 @@
 | jquery-datatables.js:0:0:0:0 | jquery-datatables.js | library |
 | jquery-jstree.js:0:0:0:0 | jquery-jstree.js | library |
 | jquery-snippet.js:0:0:0:0 | jquery-snippet.js | library |
-| json-like.js:0:0:0:0 | json-like.js | generated |
 | jsx-old.js:0:0:0:0 | jsx-old.js | generated |
 | jsx.js:0:0:0:0 | jsx.js | generated |
 | multi-part-bundle.html:0:0:0:0 | multi-part-bundle.html | generated |