Merge pull request #6013 from RasmusWL/sensitive-improvements

Python: Improve sensitive data modeling
This commit is contained in:
yoff
2021-06-15 14:45:40 +02:00
committed by GitHub
7 changed files with 363 additions and 29 deletions

View File

@@ -5,10 +5,14 @@
private import python
private import semmle.python.dataflow.new.DataFlow
// Need to import since frameworks can extend `RemoteFlowSource::Range`
// Need to import `semmle.python.Frameworks` since frameworks can extend `SensitiveDataSource::Range`
private import semmle.python.Frameworks
private import semmle.python.Concepts
private import semmle.python.security.SensitiveData as OldSensitiveData
private import semmle.python.security.internal.SensitiveDataHeuristics as SensitiveDataHeuristics
// We export these explicitly, so we don't also export the `HeuristicNames` module.
class SensitiveDataClassification = SensitiveDataHeuristics::SensitiveDataClassification;
module SensitiveDataClassification = SensitiveDataHeuristics::SensitiveDataClassification;
/**
* A data flow source of sensitive data, such as secrets, certificates, or passwords.
@@ -22,13 +26,9 @@ class SensitiveDataSource extends DataFlow::Node {
SensitiveDataSource() { this = range }
/**
* INTERNAL: Do not use.
*
* This will be rewritten to have better types soon, and therefore should only be used internally until then.
*
* Gets the classification of the sensitive data.
*/
string getClassification() { result = range.getClassification() }
SensitiveDataClassification getClassification() { result = range.getClassification() }
}
/** Provides a class for modeling new sources of sensitive data, such as secrets, certificates, or passwords. */
@@ -41,26 +41,225 @@ module SensitiveDataSource {
*/
abstract class Range extends DataFlow::Node {
/**
* INTERNAL: Do not use.
*
* This will be rewritten to have better types soon, and therefore should only be used internally until then.
*
* Gets the classification of the sensitive data.
*/
abstract string getClassification();
abstract SensitiveDataClassification getClassification();
}
}
private class PortOfOldModeling extends SensitiveDataSource::Range {
OldSensitiveData::SensitiveData::Source oldSensitiveSource;
/** Actual sensitive data modeling */
private module SensitiveDataModeling {
private import SensitiveDataHeuristics::HeuristicNames
PortOfOldModeling() { this.asCfgNode() = oldSensitiveSource }
/**
* Gets a reference to a function that is considered to be a sensitive source of
* `classification`.
*/
private DataFlow::LocalSourceNode sensitiveFunction(
DataFlow::TypeTracker t, SensitiveDataClassification classification
) {
t.start() and
exists(Function f |
nameIndicatesSensitiveData(f.getName(), classification) and
result.asExpr() = f.getDefinition()
)
or
exists(DataFlow::TypeTracker t2 | result = sensitiveFunction(t2, classification).track(t2, t))
}
override string getClassification() {
exists(OldSensitiveData::SensitiveData classification |
oldSensitiveSource.isSourceOf(classification)
|
classification = "sensitive.data." + result
/**
* Gets a reference to a function that is considered to be a sensitive source of
* `classification`.
*/
DataFlow::Node sensitiveFunction(SensitiveDataClassification classification) {
sensitiveFunction(DataFlow::TypeTracker::end(), classification).flowsTo(result)
}
/**
* Gets a reference to a string constant that, if used as the key in a lookup,
* indicates the presence of sensitive data with `classification`.
*/
private DataFlow::LocalSourceNode sensitiveLookupStringConst(
DataFlow::TypeTracker t, SensitiveDataClassification classification
) {
t.start() and
nameIndicatesSensitiveData(result.asExpr().(StrConst).getText(), classification)
or
exists(DataFlow::TypeTracker t2 |
result = sensitiveLookupStringConst(t2, classification).track(t2, t)
)
}
/**
* Gets a reference to a string constant that, if used as the key in a lookup,
* indicates the presence of sensitive data with `classification`.
*
* Also see `extraStepForCalls`.
*/
DataFlow::Node sensitiveLookupStringConst(SensitiveDataClassification classification) {
sensitiveLookupStringConst(DataFlow::TypeTracker::end(), classification).flowsTo(result)
}
/** A function call that is considered a source of sensitive data. */
class SensitiveFunctionCall extends SensitiveDataSource::Range, DataFlow::CallCfgNode {
SensitiveDataClassification classification;
SensitiveFunctionCall() {
this.getFunction() = sensitiveFunction(classification)
or
// to cover functions that we don't have the definition for, and where the
// reference to the function has not already been marked as being sensitive
nameIndicatesSensitiveData(this.getFunction().asCfgNode().(NameNode).getId(), classification)
}
override SensitiveDataClassification getClassification() { result = classification }
}
/**
* Tracks any modeled source of sensitive data (with any classification),
* to limit the scope of `extraStepForCalls`. See its QLDoc for more context.
*/
private DataFlow::LocalSourceNode possibleSensitiveCallable(DataFlow::TypeTracker t) {
t.start() and
result instanceof SensitiveDataSource
or
exists(DataFlow::TypeTracker t2 | result = possibleSensitiveCallable(t2).track(t2, t))
}
/**
* Tracks any modeled source of sensitive data (with any classification),
* to limit the scope of `extraStepForCalls`. See its QLDoc for more context.
*/
private DataFlow::Node possibleSensitiveCallable() {
possibleSensitiveCallable(DataFlow::TypeTracker::end()).flowsTo(result)
}
/**
* Holds if the step from `nodeFrom` to `nodeTo` should be considered a
* taint-flow step for sensitive-data, to ensure calls are handled correctly.
*
* To handle calls properly, while preserving a good source for path explanations,
* you need to include this predicate as an additional taint step in your taint-tracking
* configurations.
*
* The core problem can be illustrated by the example below. If we consider the
* `print` a sink, what path and what source do we want to show? My initial approach
* would be to use type-tracking to propagate from the `not_found.get_passwd` attribute
* lookup, to the use of `non_sensitive_name`, and then create a new `SensitiveDataSource::Range`
* like `SensitiveFunctionCall`. Although that seems likely to work, it will also end up
* with a non-optimal path, which starts at _bad source_, and therefore doesn't show
* how we figured out that `non_sensitive_name`
* could be a function that returns a password (and in cases where there are many calls to
* `my_func` it will be annoying for someone to figure this out manually).
*
* By including this additional taint-step in the taint-tracking configuration, it's possible
* to get a path explanation going from _good source_ to the sink.
*
* ```python
* def my_func(non_sensitive_name):
* x = non_sensitive_name() # <-- bad source
* print(x) # <-- sink
*
* import not_found
* f = not_found.get_passwd # <-- good source
* my_func(f)
* ```
*/
predicate extraStepForCalls(DataFlow::Node nodeFrom, DataFlow::CallCfgNode nodeTo) {
// However, we do still use the type-tracking approach to limit the size of this
// predicate.
nodeTo.getFunction() = nodeFrom and
nodeFrom = possibleSensitiveCallable()
}
/**
* Any kind of variable assignment (also including with/for) where the name indicates
* it contains sensitive data.
*
* Note: We _could_ make any access to a variable with a sensitive name a source of
* sensitive data, but to make path explanations in data-flow/taint-tracking good,
* we don't want that, since it works against allowing users to understand the flow
* in the program (which is the whole point).
*
* Note: To make data-flow/taint-tracking work, the expression that is _assigned_ to
* the variable is marked as the source (as compared to marking the variable as the
* source).
*/
class SensitiveVariableAssignment extends SensitiveDataSource::Range {
SensitiveDataClassification classification;
SensitiveVariableAssignment() {
exists(DefinitionNode def |
nameIndicatesSensitiveData(def.(NameNode).getId(), classification) and
(
this.asCfgNode() = def.getValue()
or
this.asCfgNode() = def.getValue().(ForNode).getSequence()
) and
not this.asExpr() instanceof FunctionExpr and
not this.asExpr() instanceof ClassExpr
)
or
exists(With with |
nameIndicatesSensitiveData(with.getOptionalVars().(Name).getId(), classification) and
this.asExpr() = with.getContextExpr()
)
}
override SensitiveDataClassification getClassification() { result = classification }
}
/** An attribute access that is considered a source of sensitive data. */
class SensitiveAttributeAccess extends SensitiveDataSource::Range {
SensitiveDataClassification classification;
SensitiveAttributeAccess() {
// Things like `foo.<sensitive-name>` or `from <module> import <sensitive-name>`
// I considered excluding any `from ... import something_sensitive`, but then realized that
// we should flag up `from ... import password as ...` as a password
nameIndicatesSensitiveData(this.(DataFlow::AttrRead).getAttributeName(), classification)
or
// Things like `getattr(foo, <reference-to-string>)`
this.(DataFlow::AttrRead).getAttributeNameExpr() = sensitiveLookupStringConst(classification)
}
override SensitiveDataClassification getClassification() { result = classification }
}
/** A subscript, where the key indicates the result will be sensitive data. */
class SensitiveSubscript extends SensitiveDataSource::Range {
SensitiveDataClassification classification;
SensitiveSubscript() {
this.asCfgNode().(SubscriptNode).getIndex() =
sensitiveLookupStringConst(classification).asCfgNode()
}
override SensitiveDataClassification getClassification() { result = classification }
}
/** A call to `get` on an object, where the key indicates the result will be sensitive data. */
class SensitiveGetCall extends SensitiveDataSource::Range, DataFlow::CallCfgNode {
SensitiveDataClassification classification;
SensitiveGetCall() {
this.getFunction().asCfgNode().(AttrNode).getName() = "get" and
this.getArg(0) = sensitiveLookupStringConst(classification)
}
override SensitiveDataClassification getClassification() { result = classification }
}
/** A parameter where the name indicates it will receive sensitive data. */
class SensitiveParameter extends SensitiveDataSource::Range, DataFlow::ParameterNode {
SensitiveDataClassification classification;
SensitiveParameter() {
nameIndicatesSensitiveData(this.getParameter().getName(), classification)
}
override SensitiveDataClassification getClassification() { result = classification }
}
}
predicate sensitiveDataExtraStepForCalls = SensitiveDataModeling::extraStepForCalls/2;

View File

@@ -13,6 +13,7 @@ private import semmle.python.dataflow.new.TaintTracking
private import semmle.python.Concepts
private import semmle.python.dataflow.new.RemoteFlowSources
private import semmle.python.dataflow.new.BarrierGuards
private import semmle.python.dataflow.new.SensitiveDataSources
/**
* Provides a taint-tracking configuration for detecting use of a broken or weak
@@ -38,6 +39,10 @@ module NormalHashFunction {
or
node instanceof Sanitizer
}
override predicate isAdditionalTaintStep(DataFlow::Node node1, DataFlow::Node node2) {
sensitiveDataExtraStepForCalls(node1, node2)
}
}
}
@@ -70,5 +75,9 @@ module ComputationallyExpensiveHashFunction {
or
node instanceof Sanitizer
}
override predicate isAdditionalTaintStep(DataFlow::Node node1, DataFlow::Node node2) {
sensitiveDataExtraStepForCalls(node1, node2)
}
}
}

View File

@@ -52,7 +52,9 @@ module NormalHashFunction {
* A source of sensitive data, considered as a flow source.
*/
class SensitiveDataSourceAsSource extends Source, SensitiveDataSource {
override string getClassification() { result = SensitiveDataSource.super.getClassification() }
override SensitiveDataClassification getClassification() {
result = SensitiveDataSource.super.getClassification()
}
}
/** The input to a hashing operation using a weak algorithm, considered as a flow sink. */
@@ -120,12 +122,12 @@ module ComputationallyExpensiveHashFunction {
*/
class PasswordSourceAsSource extends Source, SensitiveDataSource {
PasswordSourceAsSource() {
// TODO: once https://github.com/github/codeql/pull/5739 has been merged,
// don't use hardcoded value anymore
SensitiveDataSource.super.getClassification() = "password"
SensitiveDataSource.super.getClassification() = SensitiveDataClassification::password()
}
override string getClassification() { result = SensitiveDataSource.super.getClassification() }
override SensitiveDataClassification getClassification() {
result = SensitiveDataSource.super.getClassification()
}
}
/**