Merge pull request #16306 from github/nickrolfe/js-sensitive

JS: do fewer regexp matches in SensitiveActions
2025-12-16 16:53:25 +01:00 · 2024-04-24 09:49:44 +01:00
parent de58ee5a22 003d208574
commit af72c0848e
5 changed files with 116 additions and 26 deletions
--- a/javascript/ql/lib/semmle/javascript/security/SensitiveActions.qll
+++ b/javascript/ql/lib/semmle/javascript/security/SensitiveActions.qll
@@ -86,39 +86,37 @@ private predicate writesProperty(DataFlow::Node node, string name) {
 /** A write to a variable or property that might contain sensitive data. */
 private class BasicSensitiveWrite extends SensitiveWrite {
-  SensitiveDataClassification classification;
+  string name;
  BasicSensitiveWrite() {
-    exists(string name |
+    /*
-      /*
+     * PERFORMANCE OPTIMISATION:
-       * PERFORMANCE OPTIMISATION:
+     * `nameIndicatesSensitiveData` performs a `regexpMatch` on `name`.
-       * `nameIndicatesSensitiveData` performs a `regexpMatch` on `name`.
+     * To carry out a regex match, we must first compute the Cartesian product
-       * To carry out a regex match, we must first compute the Cartesian product
+     * of all possible `name`s and regexes, then match.
-       * of all possible `name`s and regexes, then match.
+     * To keep this product as small as possible,
-       * To keep this product as small as possible,
+     * we want to filter `name` as much as possible before the product.
-       * we want to filter `name` as much as possible before the product.
+     *
-       *
+     * Do this by factoring out a helper predicate containing the filtering
-       * Do this by factoring out a helper predicate containing the filtering
+     * logic that restricts `name`. This helper predicate will get picked first
-       * logic that restricts `name`. This helper predicate will get picked first
+     * in the join order, since it is the only call here that binds `name`.
-       * in the join order, since it is the only call here that binds `name`.
+     */
-       */
-      writesProperty(this, name) and
+    writesProperty(this, name) and
-      nameIndicatesSensitiveData(name, classification)
+    nameIndicatesSensitiveData(name)
-    )
  }
  /** Gets a classification of the kind of sensitive data the write might handle. */
-  SensitiveDataClassification getClassification() { result = classification }
+  SensitiveDataClassification getClassification() { nameIndicatesSensitiveData(name, result) }
 }
 /** An access to a variable or property that might contain sensitive data. */
 private class BasicSensitiveVariableAccess extends SensitiveVariableAccess {
-  SensitiveDataClassification classification;
+  BasicSensitiveVariableAccess() { nameIndicatesSensitiveData(name) }
-  BasicSensitiveVariableAccess() { nameIndicatesSensitiveData(name, classification) }
+  override SensitiveDataClassification getClassification() {
-
+    nameIndicatesSensitiveData(name, result)
-  override SensitiveDataClassification getClassification() { result = classification }
+  }
 }
 /** A function name that suggests it may be sensitive. */
@@ -138,11 +136,11 @@ abstract class SensitiveDataFunctionName extends SensitiveFunctionName {
 /** A method that might return sensitive data, based on the name. */
 class CredentialsFunctionName extends SensitiveDataFunctionName {
-  SensitiveDataClassification classification;
+  CredentialsFunctionName() { nameIndicatesSensitiveData(this) }
-  CredentialsFunctionName() { nameIndicatesSensitiveData(this, classification) }
+  override SensitiveDataClassification getClassification() {
-
+    nameIndicatesSensitiveData(this, result)
-  override SensitiveDataClassification getClassification() { result = classification }
+  }
 }
 /**
--- a/javascript/ql/lib/semmle/javascript/security/internal/SensitiveDataHeuristics.qll
+++ b/javascript/ql/lib/semmle/javascript/security/internal/SensitiveDataHeuristics.qll
@@ -106,6 +106,25 @@ module HeuristicNames {
      "(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|(?<!pass)code)|certain|concert|secretar|accountant|accountab).*"
  }
  /**
   * Holds if `name` may indicate the presence of sensitive data, and `name` does not indicate that
   * the data is in fact non-sensitive (for example since it is hashed or encrypted).
   *
   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (for any
   * classification), and none of the regexps from `notSensitiveRegexp` matches `name`.
   */
  bindingset[name]
  predicate nameIndicatesSensitiveData(string name) {
    exists(string combinedRegexp |
      // Combine all the maybe-sensitive regexps into one using non-capturing groups and |.
      combinedRegexp =
        "(?:" + strictconcat(string r | r = maybeSensitiveRegexp(_) | r, ")|(?:") + ")"
    |
      name.regexpMatch(combinedRegexp)
    ) and
    not name.regexpMatch(notSensitiveRegexp())
  }
  /**
   * Holds if `name` may indicate the presence of sensitive data, and
   * `name` does not indicate that the data is in fact non-sensitive (for example since
@@ -115,6 +134,10 @@ module HeuristicNames {
   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the
   * given classification), and none of the regexps from `notSensitiveRegexp` matches
   * `name`.
   *
   * When the set of names is large, it's worth using `nameIndicatesSensitiveData/1` as a first
   * pass, since that combines all the regexps into one, and should be faster. Then call this
   * predicate to get the classification(s).
   */
  bindingset[name]
  predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {
--- a/python/ql/lib/semmle/python/security/internal/SensitiveDataHeuristics.qll
+++ b/python/ql/lib/semmle/python/security/internal/SensitiveDataHeuristics.qll
@@ -106,6 +106,25 @@ module HeuristicNames {
      "(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|(?<!pass)code)|certain|concert|secretar|accountant|accountab).*"
  }
  /**
   * Holds if `name` may indicate the presence of sensitive data, and `name` does not indicate that
   * the data is in fact non-sensitive (for example since it is hashed or encrypted).
   *
   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (for any
   * classification), and none of the regexps from `notSensitiveRegexp` matches `name`.
   */
  bindingset[name]
  predicate nameIndicatesSensitiveData(string name) {
    exists(string combinedRegexp |
      // Combine all the maybe-sensitive regexps into one using non-capturing groups and |.
      combinedRegexp =
        "(?:" + strictconcat(string r | r = maybeSensitiveRegexp(_) | r, ")|(?:") + ")"
    |
      name.regexpMatch(combinedRegexp)
    ) and
    not name.regexpMatch(notSensitiveRegexp())
  }
  /**
   * Holds if `name` may indicate the presence of sensitive data, and
   * `name` does not indicate that the data is in fact non-sensitive (for example since
@@ -115,6 +134,10 @@ module HeuristicNames {
   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the
   * given classification), and none of the regexps from `notSensitiveRegexp` matches
   * `name`.
   *
   * When the set of names is large, it's worth using `nameIndicatesSensitiveData/1` as a first
   * pass, since that combines all the regexps into one, and should be faster. Then call this
   * predicate to get the classification(s).
   */
  bindingset[name]
  predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {
--- a/ruby/ql/lib/codeql/ruby/security/internal/SensitiveDataHeuristics.qll
+++ b/ruby/ql/lib/codeql/ruby/security/internal/SensitiveDataHeuristics.qll
@@ -106,6 +106,25 @@ module HeuristicNames {
      "(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|(?<!pass)code)|certain|concert|secretar|accountant|accountab).*"
  }
  /**
   * Holds if `name` may indicate the presence of sensitive data, and `name` does not indicate that
   * the data is in fact non-sensitive (for example since it is hashed or encrypted).
   *
   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (for any
   * classification), and none of the regexps from `notSensitiveRegexp` matches `name`.
   */
  bindingset[name]
  predicate nameIndicatesSensitiveData(string name) {
    exists(string combinedRegexp |
      // Combine all the maybe-sensitive regexps into one using non-capturing groups and |.
      combinedRegexp =
        "(?:" + strictconcat(string r | r = maybeSensitiveRegexp(_) | r, ")|(?:") + ")"
    |
      name.regexpMatch(combinedRegexp)
    ) and
    not name.regexpMatch(notSensitiveRegexp())
  }
  /**
   * Holds if `name` may indicate the presence of sensitive data, and
   * `name` does not indicate that the data is in fact non-sensitive (for example since
@@ -115,6 +134,10 @@ module HeuristicNames {
   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the
   * given classification), and none of the regexps from `notSensitiveRegexp` matches
   * `name`.
   *
   * When the set of names is large, it's worth using `nameIndicatesSensitiveData/1` as a first
   * pass, since that combines all the regexps into one, and should be faster. Then call this
   * predicate to get the classification(s).
   */
  bindingset[name]
  predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {
--- a/swift/ql/lib/codeql/swift/security/internal/SensitiveDataHeuristics.qll
+++ b/swift/ql/lib/codeql/swift/security/internal/SensitiveDataHeuristics.qll
@@ -106,6 +106,25 @@ module HeuristicNames {
      "(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|(?<!pass)code)|certain|concert|secretar|accountant|accountab).*"
  }
  /**
   * Holds if `name` may indicate the presence of sensitive data, and `name` does not indicate that
   * the data is in fact non-sensitive (for example since it is hashed or encrypted).
   *
   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (for any
   * classification), and none of the regexps from `notSensitiveRegexp` matches `name`.
   */
  bindingset[name]
  predicate nameIndicatesSensitiveData(string name) {
    exists(string combinedRegexp |
      // Combine all the maybe-sensitive regexps into one using non-capturing groups and |.
      combinedRegexp =
        "(?:" + strictconcat(string r | r = maybeSensitiveRegexp(_) | r, ")|(?:") + ")"
    |
      name.regexpMatch(combinedRegexp)
    ) and
    not name.regexpMatch(notSensitiveRegexp())
  }
  /**
   * Holds if `name` may indicate the presence of sensitive data, and
   * `name` does not indicate that the data is in fact non-sensitive (for example since
@@ -115,6 +134,10 @@ module HeuristicNames {
   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the
   * given classification), and none of the regexps from `notSensitiveRegexp` matches
   * `name`.
   *
   * When the set of names is large, it's worth using `nameIndicatesSensitiveData/1` as a first
   * pass, since that combines all the regexps into one, and should be faster. Then call this
   * predicate to get the classification(s).
   */
  bindingset[name]
  predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {