Python: ORM: Add data-flow plumbing for ORM modeling

The idea is that we will do `save ==> synthetic` and `synthetic ==> load`, so we don't need to do a CP (cartesian product) between save/load. This setup, with a synthetic node in the middle, also allows for a limited amount of the field-flow that we could do with real flow-summary support.
2026-05-02 12:15:17 +02:00 · 2022-02-16 16:38:57 +01:00
parent d3f07cdc10
commit ef39968a56
3 changed files with 94 additions and 2 deletions
--- a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll
+++ b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll
@@ -935,6 +935,24 @@ string ppReprType(DataFlowType t) { none() }
 * taken into account.
 */
 predicate jumpStep(Node nodeFrom, Node nodeTo) {
+  jumpStepSharedWithTypeTracker(nodeFrom, nodeTo)
+  or
+  jumpStepNotSharedWithTypeTracker(nodeFrom, nodeTo)
+}
+
+/**
+ * Set of jump steps that are shared with the type-tracker implementation.
+ *
+ * For ORM modeling we want to add jump steps to global dataflow, but since these are
+ * based on type-trackers, it's important that these new ORM jump steps are not used in
+ * the type-trackers as well, as that would make evaluation of type-tracking recursive
+ * with the new jump steps.
+ *
+ * Holds if `nodeFrom` can flow to `nodeTo`, by jumping from one callable to
+ * another. Additional steps specified by the configuration are *not*
+ * taken into account.
+ */
+predicate jumpStepSharedWithTypeTracker(Node nodeFrom, Node nodeTo) {
  runtimeJumpStep(nodeFrom, nodeTo)
  or
  // Read of module attribute:
@@ -948,6 +966,22 @@ predicate jumpStep(Node nodeFrom, Node nodeTo) {
  defaultValueFlowStep(nodeFrom, nodeTo)
 }

+/**
+ * Set of jump steps that are NOT shared with the type-tracker implementation.
+ *
+ * For ORM modeling we want to add jump steps to global dataflow, but since these are
+ * based on type-trackers, it's important that these new ORM jump steps are not used in
+ * the type-trackers as well, as that would make evaluation of type-tracking recursive
+ * with the new jump steps.
+ *
+ * Holds if `nodeFrom` can flow to `nodeTo`, by jumping from one callable to
+ * another. Additional steps specified by the configuration are *not*
+ * taken into account.
+ */
+predicate jumpStepNotSharedWithTypeTracker(Node nodeFrom, Node nodeTo) {
+  any(Orm::AdditionalOrmSteps es).jumpStep(nodeFrom, nodeTo)
+}
+
 /**
 * Holds if the module `m` defines a name `name` by assigning `defn` to it. This is an
 * overapproximation, as `name` may not in fact be exported (e.g. by defining an `__all__` that does
@@ -991,6 +1025,51 @@ predicate storeStep(Node nodeFrom, Content c, Node nodeTo) {
  kwOverflowStoreStep(nodeFrom, c, nodeTo)
  or
  matchStoreStep(nodeFrom, c, nodeTo)
+  or
+  any(Orm::AdditionalOrmSteps es).storeStep(nodeFrom, c, nodeTo)
+}
+
+/**
+ * INTERNAL: Do not use.
+ *
+ * Provides classes for modeling data-flow through ORM models saved in a DB.
+ */
+module Orm {
+  /**
+   * INTERNAL: Do not use.
+   *
+   * A unit class for adding additional data-flow steps for ORM models.
+   */
+  class AdditionalOrmSteps extends Unit {
+    /**
+     * Holds if data can flow from `nodeFrom` to `nodeTo` via an assignment to
+     * content `c`.
+     */
+    abstract predicate storeStep(Node nodeFrom, Content c, Node nodeTo);
+
+    /**
+     * Holds if `nodeFrom` can flow to `nodeTo`, by jumping from one callable to
+     * another. Additional steps specified by the configuration are *not*
+     * taken into account.
+     */
+    abstract predicate jumpStep(Node nodeFrom, Node nodeTo);
+  }
+
+  /** A synthetic node representing the data for an ORM model saved in a DB. */
+  class SyntheticOrmModelNode extends Node, TSyntheticOrmModelNode {
+    Class cls;
+
+    SyntheticOrmModelNode() { this = TSyntheticOrmModelNode(cls) }
+
+    override string toString() { result = "[orm-model] " + cls.toString() }
+
+    override Scope getScope() { result = cls.getEnclosingScope() }
+
+    override Location getLocation() { result = cls.getLocation() }
+
+    /** Gets the class that defines this ORM model. */
+    Class getClass() { result = cls }
+  }
 }

 /** Data flows from an element of a list to the list. */
--- a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll
+++ b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll
@@ -87,7 +87,20 @@ newtype TNode =
  /**
   * A synthetic node representing element content in a star pattern.
   */
-  TStarPatternElementNode(MatchStarPattern target)
+  TStarPatternElementNode(MatchStarPattern target) or
+  /**
+   * INTERNAL: Do not use.
+   *
+   * A synthetic node representing the data for an ORM model saved in a DB.
+   */
+  // TODO: Limiting the classes here to the ones that are actually ORM models was
+  // non-trivial, since that logic is based on API::Node results, and trying to do this
+  // causes non-monotonic recursion, and makes the API graph evaluation recursive with
+  // data-flow, which might do bad things for performance.
+  //
+  // So for now we live with having these synthetic ORM nodes for _all_ classes, which
+  // is a bit wasteful, but we don't think it will hurt too much.
+  TSyntheticOrmModelNode(Class cls)

 /** Helper for `Node::getEnclosingCallable`. */
 private DataFlowCallable getCallableScope(Scope s) {
--- a/python/ql/lib/semmle/python/dataflow/new/internal/TypeTrackerSpecific.qll
+++ b/python/ql/lib/semmle/python/dataflow/new/internal/TypeTrackerSpecific.qll
@@ -12,7 +12,7 @@ class TypeTrackingNode = DataFlowPublic::TypeTrackingNode;

 predicate simpleLocalFlowStep = DataFlowPrivate::simpleLocalFlowStep/2;

-predicate jumpStep = DataFlowPrivate::jumpStep/2;
+predicate jumpStep = DataFlowPrivate::jumpStepSharedWithTypeTracker/2;

 /**
 * Gets the name of a possible piece of content. For Python, this is currently only attribute names,