JS: do fewer regexp matches in SensitiveActions

2025-12-16 16:53:25 +01:00 · 2024-04-23 14:51:01 +01:00
parent bea7b94537
commit 003d208574
5 changed files with 116 additions and 26 deletions
--- a/javascript/ql/lib/semmle/javascript/security/SensitiveActions.qll
+++ b/javascript/ql/lib/semmle/javascript/security/SensitiveActions.qll
@@ -86,39 +86,37 @@ private predicate writesProperty(DataFlow::Node node, string name) {

 /** A write to a variable or property that might contain sensitive data. */
 private class BasicSensitiveWrite extends SensitiveWrite {
-  SensitiveDataClassification classification;
+  string name;

  BasicSensitiveWrite() {
-    exists(string name |
-      /*
-       * PERFORMANCE OPTIMISATION:
-       * `nameIndicatesSensitiveData` performs a `regexpMatch` on `name`.
-       * To carry out a regex match, we must first compute the Cartesian product
-       * of all possible `name`s and regexes, then match.
-       * To keep this product as small as possible,
-       * we want to filter `name` as much as possible before the product.
-       *
-       * Do this by factoring out a helper predicate containing the filtering
-       * logic that restricts `name`. This helper predicate will get picked first
-       * in the join order, since it is the only call here that binds `name`.
-       */
+    /*
+     * PERFORMANCE OPTIMISATION:
+     * `nameIndicatesSensitiveData` performs a `regexpMatch` on `name`.
+     * To carry out a regex match, we must first compute the Cartesian product
+     * of all possible `name`s and regexes, then match.
+     * To keep this product as small as possible,
+     * we want to filter `name` as much as possible before the product.
+     *
+     * Do this by factoring out a helper predicate containing the filtering
+     * logic that restricts `name`. This helper predicate will get picked first
+     * in the join order, since it is the only call here that binds `name`.
+     */

-      writesProperty(this, name) and
-      nameIndicatesSensitiveData(name, classification)
-    )
+    writesProperty(this, name) and
+    nameIndicatesSensitiveData(name)
  }

  /** Gets a classification of the kind of sensitive data the write might handle. */
-  SensitiveDataClassification getClassification() { result = classification }
+  SensitiveDataClassification getClassification() { nameIndicatesSensitiveData(name, result) }
 }

 /** An access to a variable or property that might contain sensitive data. */
 private class BasicSensitiveVariableAccess extends SensitiveVariableAccess {
-  SensitiveDataClassification classification;
+  BasicSensitiveVariableAccess() { nameIndicatesSensitiveData(name) }

-  BasicSensitiveVariableAccess() { nameIndicatesSensitiveData(name, classification) }
-
-  override SensitiveDataClassification getClassification() { result = classification }
+  override SensitiveDataClassification getClassification() {
+    nameIndicatesSensitiveData(name, result)
+  }
 }

 /** A function name that suggests it may be sensitive. */
@@ -138,11 +136,11 @@ abstract class SensitiveDataFunctionName extends SensitiveFunctionName {

 /** A method that might return sensitive data, based on the name. */
 class CredentialsFunctionName extends SensitiveDataFunctionName {
-  SensitiveDataClassification classification;
+  CredentialsFunctionName() { nameIndicatesSensitiveData(this) }

-  CredentialsFunctionName() { nameIndicatesSensitiveData(this, classification) }
-
-  override SensitiveDataClassification getClassification() { result = classification }
+  override SensitiveDataClassification getClassification() {
+    nameIndicatesSensitiveData(this, result)
+  }
 }

 /**
--- a/javascript/ql/lib/semmle/javascript/security/internal/SensitiveDataHeuristics.qll
+++ b/javascript/ql/lib/semmle/javascript/security/internal/SensitiveDataHeuristics.qll
@@ -106,6 +106,25 @@ module HeuristicNames {
      "(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|(?<!pass)code)|certain|concert|secretar|accountant|accountab).*"
  }

+  /**
+   * Holds if `name` may indicate the presence of sensitive data, and `name` does not indicate that
+   * the data is in fact non-sensitive (for example since it is hashed or encrypted).
+   *
+   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (for any
+   * classification), and none of the regexps from `notSensitiveRegexp` matches `name`.
+   */
+  bindingset[name]
+  predicate nameIndicatesSensitiveData(string name) {
+    exists(string combinedRegexp |
+      // Combine all the maybe-sensitive regexps into one using non-capturing groups and |.
+      combinedRegexp =
+        "(?:" + strictconcat(string r | r = maybeSensitiveRegexp(_) | r, ")|(?:") + ")"
+    |
+      name.regexpMatch(combinedRegexp)
+    ) and
+    not name.regexpMatch(notSensitiveRegexp())
+  }
+
  /**
   * Holds if `name` may indicate the presence of sensitive data, and
   * `name` does not indicate that the data is in fact non-sensitive (for example since
@@ -115,6 +134,10 @@ module HeuristicNames {
   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the
   * given classification), and none of the regexps from `notSensitiveRegexp` matches
   * `name`.
+   *
+   * When the set of names is large, it's worth using `nameIndicatesSensitiveData/1` as a first
+   * pass, since that combines all the regexps into one, and should be faster. Then call this
+   * predicate to get the classification(s).
   */
  bindingset[name]
  predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {
--- a/python/ql/lib/semmle/python/security/internal/SensitiveDataHeuristics.qll
+++ b/python/ql/lib/semmle/python/security/internal/SensitiveDataHeuristics.qll
@@ -106,6 +106,25 @@ module HeuristicNames {
      "(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|(?<!pass)code)|certain|concert|secretar|accountant|accountab).*"
  }

+  /**
+   * Holds if `name` may indicate the presence of sensitive data, and `name` does not indicate that
+   * the data is in fact non-sensitive (for example since it is hashed or encrypted).
+   *
+   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (for any
+   * classification), and none of the regexps from `notSensitiveRegexp` matches `name`.
+   */
+  bindingset[name]
+  predicate nameIndicatesSensitiveData(string name) {
+    exists(string combinedRegexp |
+      // Combine all the maybe-sensitive regexps into one using non-capturing groups and |.
+      combinedRegexp =
+        "(?:" + strictconcat(string r | r = maybeSensitiveRegexp(_) | r, ")|(?:") + ")"
+    |
+      name.regexpMatch(combinedRegexp)
+    ) and
+    not name.regexpMatch(notSensitiveRegexp())
+  }
+
  /**
   * Holds if `name` may indicate the presence of sensitive data, and
   * `name` does not indicate that the data is in fact non-sensitive (for example since
@@ -115,6 +134,10 @@ module HeuristicNames {
   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the
   * given classification), and none of the regexps from `notSensitiveRegexp` matches
   * `name`.
+   *
+   * When the set of names is large, it's worth using `nameIndicatesSensitiveData/1` as a first
+   * pass, since that combines all the regexps into one, and should be faster. Then call this
+   * predicate to get the classification(s).
   */
  bindingset[name]
  predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {
--- a/ruby/ql/lib/codeql/ruby/security/internal/SensitiveDataHeuristics.qll
+++ b/ruby/ql/lib/codeql/ruby/security/internal/SensitiveDataHeuristics.qll
@@ -106,6 +106,25 @@ module HeuristicNames {
      "(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|(?<!pass)code)|certain|concert|secretar|accountant|accountab).*"
  }

+  /**
+   * Holds if `name` may indicate the presence of sensitive data, and `name` does not indicate that
+   * the data is in fact non-sensitive (for example since it is hashed or encrypted).
+   *
+   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (for any
+   * classification), and none of the regexps from `notSensitiveRegexp` matches `name`.
+   */
+  bindingset[name]
+  predicate nameIndicatesSensitiveData(string name) {
+    exists(string combinedRegexp |
+      // Combine all the maybe-sensitive regexps into one using non-capturing groups and |.
+      combinedRegexp =
+        "(?:" + strictconcat(string r | r = maybeSensitiveRegexp(_) | r, ")|(?:") + ")"
+    |
+      name.regexpMatch(combinedRegexp)
+    ) and
+    not name.regexpMatch(notSensitiveRegexp())
+  }
+
  /**
   * Holds if `name` may indicate the presence of sensitive data, and
   * `name` does not indicate that the data is in fact non-sensitive (for example since
@@ -115,6 +134,10 @@ module HeuristicNames {
   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the
   * given classification), and none of the regexps from `notSensitiveRegexp` matches
   * `name`.
+   *
+   * When the set of names is large, it's worth using `nameIndicatesSensitiveData/1` as a first
+   * pass, since that combines all the regexps into one, and should be faster. Then call this
+   * predicate to get the classification(s).
   */
  bindingset[name]
  predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {
--- a/swift/ql/lib/codeql/swift/security/internal/SensitiveDataHeuristics.qll
+++ b/swift/ql/lib/codeql/swift/security/internal/SensitiveDataHeuristics.qll
@@ -106,6 +106,25 @@ module HeuristicNames {
      "(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|(?<!pass)code)|certain|concert|secretar|accountant|accountab).*"
  }

+  /**
+   * Holds if `name` may indicate the presence of sensitive data, and `name` does not indicate that
+   * the data is in fact non-sensitive (for example since it is hashed or encrypted).
+   *
+   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (for any
+   * classification), and none of the regexps from `notSensitiveRegexp` matches `name`.
+   */
+  bindingset[name]
+  predicate nameIndicatesSensitiveData(string name) {
+    exists(string combinedRegexp |
+      // Combine all the maybe-sensitive regexps into one using non-capturing groups and |.
+      combinedRegexp =
+        "(?:" + strictconcat(string r | r = maybeSensitiveRegexp(_) | r, ")|(?:") + ")"
+    |
+      name.regexpMatch(combinedRegexp)
+    ) and
+    not name.regexpMatch(notSensitiveRegexp())
+  }
+
  /**
   * Holds if `name` may indicate the presence of sensitive data, and
   * `name` does not indicate that the data is in fact non-sensitive (for example since
@@ -115,6 +134,10 @@ module HeuristicNames {
   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the
   * given classification), and none of the regexps from `notSensitiveRegexp` matches
   * `name`.
+   *
+   * When the set of names is large, it's worth using `nameIndicatesSensitiveData/1` as a first
+   * pass, since that combines all the regexps into one, and should be faster. Then call this
+   * predicate to get the classification(s).
   */
  bindingset[name]
  predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {