Python: Port sensitive data modeling

No longer using points-to 🎉
2026-07-20 18:58:36 +02:00 · 2021-06-03 14:17:29 +02:00
parent 3b68c87b6c
commit 00a71a1c41
3 changed files with 110 additions and 17 deletions
--- a/python/ql/src/semmle/python/dataflow/new/SensitiveDataSources.qll
+++ b/python/ql/src/semmle/python/dataflow/new/SensitiveDataSources.qll
@@ -8,7 +8,6 @@ private import semmle.python.dataflow.new.DataFlow
 // Need to import since frameworks can extend `RemoteFlowSource::Range`
 private import semmle.python.Frameworks
 private import semmle.python.Concepts
-private import semmle.python.security.SensitiveData as OldSensitiveData
 private import semmle.python.security.internal.SensitiveDataHeuristics as SensitiveDataHeuristics

 // We export these explicitly, so we don't also export the `HeuristicNames` module.
@@ -49,17 +48,94 @@ module SensitiveDataSource {
  }
 }

-// TODO: rewrite this to not rely on the old points-to implementation
-private class PortOfOldModeling extends SensitiveDataSource::Range {
-  OldSensitiveData::SensitiveData::Source oldSensitiveSource;
+/** Actual sensitive data modeling */
+private module SensitiveDataModeling {
+  private import SensitiveDataHeuristics::HeuristicNames

-  PortOfOldModeling() { this.asCfgNode() = oldSensitiveSource }
+  /**
+   * Gets a reference to a function that is considered to be a sensitive source of
+   * `classification`.
+   */
+  private DataFlow::LocalSourceNode sensitiveFunction(
+    DataFlow::TypeTracker t, SensitiveDataClassification classification
+  ) {
+    t.start() and
+    exists(Function f |
+      nameIndicatesSensitiveData(f.getName(), classification) and
+      result.asExpr() = f.getDefinition()
+    )
+    or
+    exists(DataFlow::TypeTracker t2 | result = sensitiveFunction(t2, classification).track(t2, t))
+  }

-  override SensitiveDataClassification getClassification() {
-    exists(OldSensitiveData::SensitiveData classification |
-      oldSensitiveSource.isSourceOf(classification)
-    |
-      classification = "sensitive.data." + result
+  /**
+   * Gets a reference to a function that is considered to be a sensitive source of
+   * `classification`.
+   */
+  DataFlow::Node sensitiveFunction(SensitiveDataClassification classification) {
+    sensitiveFunction(DataFlow::TypeTracker::end(), classification).flowsTo(result)
+  }
+
+  /**
+   * Gets a reference to a string constant that, if used as the key in a lookup,
+   * indicates the presence of sensitive data with `classification`.
+   */
+  private DataFlow::LocalSourceNode sensitiveLookupStringConst(
+    DataFlow::TypeTracker t, SensitiveDataClassification classification
+  ) {
+    t.start() and
+    nameIndicatesSensitiveData(result.asExpr().(StrConst).getText(), classification)
+    or
+    exists(DataFlow::TypeTracker t2 |
+      result = sensitiveLookupStringConst(t2, classification).track(t2, t)
    )
  }
+
+  /**
+   * Gets a reference to a string constant that, if used as the key in a lookup,
+   * indicates the presence of sensitive data with `classification`.
+   */
+  DataFlow::Node sensitiveLookupStringConst(SensitiveDataClassification classification) {
+    sensitiveLookupStringConst(DataFlow::TypeTracker::end(), classification).flowsTo(result)
+  }
+
+  /** A function call that is considered a source of sensitive data. */
+  class SensitiveFunctionCall extends SensitiveDataSource::Range, DataFlow::CallCfgNode {
+    SensitiveDataClassification classification;
+
+    SensitiveFunctionCall() {
+      this.getFunction() = sensitiveFunction(classification)
+      or
+      nameIndicatesSensitiveData(this.getFunction().asCfgNode().(NameNode).getId(), classification)
+    }
+
+    override SensitiveDataClassification getClassification() { result = classification }
+  }
+
+  /** An attribute access that is considered a source of sensitive data. */
+  class SensitiveAttributeAccess extends SensitiveDataSource::Range {
+    SensitiveDataClassification classification;
+
+    SensitiveAttributeAccess() {
+      nameIndicatesSensitiveData(this.(DataFlow::AttrRead).getAttributeName(), classification)
+      or
+      // I considered excluding any `from ... import something_sensitive`, but then realized that
+      // we should flag up `from ... import password as ...` as a password
+      this.(DataFlow::AttrRead).getAttributeNameExpr() = sensitiveLookupStringConst(classification)
+    }
+
+    override SensitiveDataClassification getClassification() { result = classification }
+  }
+
+  /** A call to `get` on an object, where the key indicates the result will be sensitive data. */
+  class SensitiveGetCall extends SensitiveDataSource::Range, DataFlow::CallCfgNode {
+    SensitiveDataClassification classification;
+
+    SensitiveGetCall() {
+      this.getFunction().asCfgNode().(AttrNode).getName() = "get" and
+      this.getArg(0) = sensitiveLookupStringConst(classification)
+    }
+
+    override SensitiveDataClassification getClassification() { result = classification }
+  }
 }
--- a/python/ql/test/experimental/dataflow/sensitive-data/TestSensitiveDataSources.ql
+++ b/python/ql/test/experimental/dataflow/sensitive-data/TestSensitiveDataSources.ql
@@ -2,19 +2,32 @@ import python
 import semmle.python.dataflow.new.DataFlow
 import TestUtilities.InlineExpectationsTest
 import semmle.python.dataflow.new.SensitiveDataSources
+private import semmle.python.ApiGraphs

 class SensitiveDataSourcesTest extends InlineExpectationsTest {
  SensitiveDataSourcesTest() { this = "SensitiveDataSourcesTest" }

-  override string getARelevantTag() { result = "SensitiveDataSource" }
+  override string getARelevantTag() { result in ["SensitiveDataSource", "SensitiveUse"] }

  override predicate hasActualResult(Location location, string element, string tag, string value) {
    exists(location.getFile().getRelativePath()) and
    exists(SensitiveDataSource source |
-      location = source.getLocation() and
-      element = source.toString() and
-      value = source.getClassification() and
-      tag = "SensitiveDataSource"
+      (
+        location = source.getLocation() and
+        element = source.toString() and
+        value = source.getClassification() and
+        tag = "SensitiveDataSource"
+      )
+      or
+      exists(DataFlow::Node use |
+        use = API::builtin("print").getACall().getArg(_) and
+        DataFlow::localFlow(source, use) and
+        location = use.getLocation() and
+        element = use.toString() and
+        value = source.getClassification() and
+        tag = "SensitiveUse"
+
+      )
    )
  }
 }
--- a/python/ql/test/experimental/dataflow/sensitive-data/test.py
+++ b/python/ql/test/experimental/dataflow/sensitive-data/test.py
@@ -1,5 +1,6 @@

-from not_found import get_passwd, account_id
+from not_found import get_passwd # $ SensitiveDataSource=password
+from not_found import account_id # $ SensitiveDataSource=id

 def get_password():
    pass
@@ -30,7 +31,7 @@ foo.username # $ SensitiveDataSource=id

 # plain variables
 password = some_function()
-print(password) # $ MISSING: SensitiveDataSource=password
+print(password) # $ MISSING: SensitiveUse=password

 # Special handling of lookups of sensitive properties
 request.args["password"], # $ MISSING: SensitiveDataSource=password
@@ -41,3 +42,6 @@ request.args.get(x) # $ SensitiveDataSource=password

 # I don't think handling `getlist` is super important, just included it to show what we don't handle
 request.args.getlist("password")[0] # $ MISSING: SensitiveDataSource=password
+
+from not_found import password2 as foo # $ SensitiveDataSource=password
+print(foo) # $ SensitiveUse=password