Python: Fix bad join order for sensitive data

Not the prettiest of solutions, but it does the job. Basically, we were
calculating (and re-calculating) the same big relation between strings
and regexes and then checking whether the latter matched the former.

This resulted in tuple counts like the following:

```
[2021-07-12 16:09:24] (12s) Tuple counts for SensitiveDataSources::SensitiveDataModeling::SensitiveVariableAssignment#class#ff#shared/4@7489c6:
4918074 ~0%     {4} r1 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH Flow::NameNode::getId_dispred#ff CARTESIAN PRODUCT OUTPUT Lhs.0 'arg0', Lhs.1 'arg1', Rhs.0, Rhs.1 'arg3'
2654    ~0%     {4} r2 = JOIN r1 WITH PRIMITIVE regexpMatch#bb ON Lhs.3 'arg3',Lhs.1 'arg1'
                return r2
```
(The above being just the bit that handles `DefinitionNode` in
`SensitiveVariableAssignment`, and taking 12 seconds to evaluate.)
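
For reference, `nameIndicatesSensitiveData` (the predicate each of those call
sites goes through) takes an already-bound string and matches it against the
heuristic regexes -- roughly the following shape (a sketch, not the exact
library source):

```
// Sketch only: the string has to be bound by the caller, so every call site
// drags its own (large, duplicated) set of strings into the regex match.
bindingset[name]
predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {
  name.regexpMatch(maybeSensitiveRegexp(classification)) and
  not name.regexpMatch(notSensitiveRegexp())
}
```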

By applying a bit of manual inlining and magic, this becomes somewhat
more manageable:

```
[2021-07-12 15:59:44] (1s) Tuple counts for SensitiveDataSources::SensitiveDataModeling::sensitiveString#ff/2@8830e2:
27671  ~2%      {3} r1 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveParameterName#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0

334012 ~2%      {3} r2 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveName#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0

361683 ~11%     {3} r3 = r1 UNION r2

154644 ~0%      {3} r4 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveFunctionName#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0

149198 ~1%      {3} r5 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveStrConst#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0

124257 ~5%      {3} r6 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveAttributeName#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0

273455 ~21%     {3} r7 = r5 UNION r6
428099 ~30%     {3} r8 = r4 UNION r7
789782 ~78%     {3} r9 = r3 UNION r8
1121   ~77%     {3} r10 = JOIN r9 WITH PRIMITIVE regexpMatch#bb ON Lhs.2 'result',Lhs.1
1121   ~70%     {2} r11 = SCAN r10 OUTPUT In.0 'classification', In.2 'result'
                return r11
```
(The above being the total for all the sensitive names we care about,
taking only 1.2 seconds to evaluate.)

Incidentally, you may wonder why this has _fewer_ results than before.
The answer is control flow splitting -- every sensitively-named
`DefinitionNode` would have been matched in isolation previously. By
pre-matching on just the names of these, we can subsequently join
against those names that are known to be sensitive, which is a much
faster operation.
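
In code, the pre-match-then-join shape boils down to something like this
(condensed from the new predicates in the diff below; the real
`sensitiveString` unions several candidate predicates, not just variable
names):

```
// Candidate names are materialized once, behind pragma[nomagic], and only
// then matched against the sensitive-data regexes; call sites simply join
// against the resulting set of known-sensitive strings.
pragma[nomagic]
private string sensitiveNameCandidate() {
  result = any(Name n).getId() and
  not result.regexpMatch(notSensitiveRegexp())
}

pragma[nomagic]
private string sensitiveString(SensitiveDataClassification classification) {
  result = sensitiveNameCandidate() and
  result.regexpMatch(maybeSensitiveRegexp(classification))
}
```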

(We also get the benefit of deduplicating the strings that are matched,
before actually performing the match, so if, say, an attribute name and
a variable name are identical, then we'll only match them once.)

We also exclude docstrings from the set of relevant string constants, as
these presumably don't actually flow anywhere.

```
@@ -60,7 +60,7 @@ private module SensitiveDataModeling {
   ) {
     t.start() and
     exists(Function f |
-      nameIndicatesSensitiveData(f.getName(), classification) and
+      f.getName() = sensitiveString(classification) and
       result.asExpr() = f.getDefinition()
     )
     or
@@ -83,7 +83,7 @@ private module SensitiveDataModeling {
     // Note: If this is implemented with type-tracking, we will get cross-talk as
     // illustrated in python/ql/test/experimental/dataflow/sensitive-data/test.py
     exists(DataFlow::LocalSourceNode source |
-      nameIndicatesSensitiveData(source.asExpr().(StrConst).getText(), classification) and
+      source.asExpr().(StrConst).getText() = sensitiveString(classification) and
       source.flowsTo(result)
     )
   }
@@ -97,7 +97,7 @@ private module SensitiveDataModeling {
       or
       // to cover functions that we don't have the definition for, and where the
       // reference to the function has not already been marked as being sensitive
-      nameIndicatesSensitiveData(this.getFunction().asCfgNode().(NameNode).getId(), classification)
+      this.getFunction().asCfgNode().(NameNode).getId() = sensitiveString(classification)
     }

     override SensitiveDataClassification getClassification() { result = classification }
@@ -164,6 +164,46 @@ private module SensitiveDataModeling {
     nodeFrom = possibleSensitiveCallable()
   }

+  pragma[nomagic]
+  private string sensitiveStrConstCandidate() {
+    result = any(StrConst s | not s.isDocString()).getText() and
+    not result.regexpMatch(notSensitiveRegexp())
+  }
+
+  pragma[nomagic]
+  private string sensitiveAttributeNameCandidate() {
+    result = any(DataFlow::AttrRead a).getAttributeName() and
+    not result.regexpMatch(notSensitiveRegexp())
+  }
+
+  pragma[nomagic]
+  private string sensitiveParameterNameCandidate() {
+    result = any(Parameter p).getName() and
+    not result.regexpMatch(notSensitiveRegexp())
+  }
+
+  pragma[nomagic]
+  private string sensitiveFunctionNameCandidate() {
+    result = any(Function f).getName() and
+    not result.regexpMatch(notSensitiveRegexp())
+  }
+
+  pragma[nomagic]
+  private string sensitiveNameCandidate() {
+    result = any(Name n).getId() and
+    not result.regexpMatch(notSensitiveRegexp())
+  }
+
+  pragma[nomagic]
+  private string sensitiveString(SensitiveDataClassification classification) {
+    result in [
+        sensitiveNameCandidate(), sensitiveAttributeNameCandidate(),
+        sensitiveParameterNameCandidate(), sensitiveFunctionNameCandidate(),
+        sensitiveStrConstCandidate()
+      ] and
+    result.regexpMatch(maybeSensitiveRegexp(classification))
+  }
+
   /**
    * Any kind of variable assignment (also including with/for) where the name indicates
    * it contains sensitive data.
@@ -182,7 +222,7 @@ private module SensitiveDataModeling {
     SensitiveVariableAssignment() {
       exists(DefinitionNode def |
-        nameIndicatesSensitiveData(def.(NameNode).getId(), classification) and
+        def.(NameNode).getId() = sensitiveString(classification) and
         (
           this.asCfgNode() = def.getValue()
           or
@@ -193,7 +233,7 @@ private module SensitiveDataModeling {
       )
       or
       exists(With with |
-        nameIndicatesSensitiveData(with.getOptionalVars().(Name).getId(), classification) and
+        with.getOptionalVars().(Name).getId() = sensitiveString(classification) and
         this.asExpr() = with.getContextExpr()
       )
     }
@@ -209,7 +249,7 @@ private module SensitiveDataModeling {
      // Things like `foo.<sensitive-name>` or `from <module> import <sensitive-name>`
      // I considered excluding any `from ... import something_sensitive`, but then realized that
      // we should flag up `form ... import password as ...` as a password
-     nameIndicatesSensitiveData(this.(DataFlow::AttrRead).getAttributeName(), classification)
+     this.(DataFlow::AttrRead).getAttributeName() = sensitiveString(classification)
      or
      // Things like `getattr(foo, <reference-to-string>)`
      this.(DataFlow::AttrRead).getAttributeNameExpr() = sensitiveLookupStringConst(classification)
@@ -246,9 +286,7 @@ private module SensitiveDataModeling {
   class SensitiveParameter extends SensitiveDataSource::Range, DataFlow::ParameterNode {
     SensitiveDataClassification classification;

-    SensitiveParameter() {
-      nameIndicatesSensitiveData(this.getParameter().getName(), classification)
-    }
+    SensitiveParameter() { this.getParameter().getName() = sensitiveString(classification) }

     override SensitiveDataClassification getClassification() { result = classification }
   }
```