Python: Model sensitive data based on variable names

2026-04-30 11:15:13 +02:00 · 2021-06-04 11:10:50 +02:00
parent f5fd0f8d1c
commit 350f79e1e1
3 changed files with 49 additions and 3 deletions
--- a/python/ql/src/semmle/python/dataflow/new/SensitiveDataSources.qll
+++ b/python/ql/src/semmle/python/dataflow/new/SensitiveDataSources.qll
@@ -111,6 +111,43 @@ private module SensitiveDataModeling {
    override SensitiveDataClassification getClassification() { result = classification }
  }

+  /**
+   * Any kind of variable assignment (also including with/for) where the name indicates
+   * it contains sensitive data. Assigned values that are function or class definitions
+   * (`FunctionExpr`/`ClassExpr`) are excluded.
+   *
+   * Note: We _could_ make any access to a variable with a sensitive name a source of
+   * sensitive data, but that would make path explanations in data-flow/taint-tracking
+   * worse, and helping users understand the flow in the program is the whole point.
+   *
+   * Note: To make data-flow/taint-tracking work, the expression that is _assigned_ to
+   * the variable is marked as the source, not the variable itself. For a `for` loop
+   * this includes the iterated sequence; for a `with` statement, the context expression.
+   */
+  class SensitiveVariableAssignment extends SensitiveDataSource::Range {
+    SensitiveDataClassification classification;
+
+    SensitiveVariableAssignment() {
+      exists(DefinitionNode def |
+        nameIndicatesSensitiveData(def.(NameNode).getId(), classification) and
+        (
+          this.asCfgNode() = def.getValue()
+          or
+          this.asCfgNode() = def.getValue().(ForNode).getSequence()
+        ) and
+        not this.asExpr() instanceof FunctionExpr and
+        not this.asExpr() instanceof ClassExpr
+      )
+      or
+      exists(With with |
+        nameIndicatesSensitiveData(with.getOptionalVars().(Name).getId(), classification) and
+        this.asExpr() = with.getContextExpr()
+      )
+    }
+
+    override SensitiveDataClassification getClassification() { result = classification }
+  }
+
  /** An attribute access that is considered a source of sensitive data. */
  class SensitiveAttributeAccess extends SensitiveDataSource::Range {
    SensitiveDataClassification classification;
--- a/python/ql/test/experimental/dataflow/sensitive-data/TestSensitiveDataSources.ql
+++ b/python/ql/test/experimental/dataflow/sensitive-data/TestSensitiveDataSources.ql
@@ -1,5 +1,6 @@
 import python
 import semmle.python.dataflow.new.DataFlow
+import semmle.python.dataflow.new.TaintTracking
 import TestUtilities.InlineExpectationsTest
 import semmle.python.dataflow.new.SensitiveDataSources
 private import semmle.python.ApiGraphs
@@ -21,7 +22,7 @@ class SensitiveDataSourcesTest extends InlineExpectationsTest {
      or
      exists(DataFlow::Node use |
        use = API::builtin("print").getACall().getArg(_) and
-        DataFlow::localFlow(source, use) and
+        TaintTracking::localTaint(source, use) and
        location = use.getLocation() and
        element = use.toString() and
        value = source.getClassification() and
--- a/python/ql/test/experimental/dataflow/sensitive-data/test.py
+++ b/python/ql/test/experimental/dataflow/sensitive-data/test.py
@@ -29,12 +29,20 @@ foo = ObjectFromDatabase()
 foo.secret # $ SensitiveDataSource=secret
 foo.username # $ SensitiveDataSource=id

+
 # based on variable/parameter names
 def my_func(password): # $ SensitiveDataSource=password
    print(password) # $ SensitiveUse=password

-password = some_function()
-print(password) # $ MISSING: SensitiveUse=password
+password = some_function() # $ SensitiveDataSource=password
+print(password) # $ SensitiveUse=password
+
+for password in some_function2(): # $ SensitiveDataSource=password
+    print(password) # $ SensitiveUse=password
+
+with some_function3() as password: # $ SensitiveDataSource=password
+    print(password) # $ SensitiveUse=password
+

 # Special handling of lookups of sensitive properties
 request.args["password"], # $ SensitiveDataSource=password