From 784eef3f2cc8fc0872846660afd1793560c51968 Mon Sep 17 00:00:00 2001
From: Anders Schack-Mulligen <aschackmull@github.com>
Date: Mon, 29 Aug 2022 11:45:42 +0200
Subject: [PATCH 1/2] Java: Support SCCs in TypeFlow.

---
 .../semmle/code/java/dataflow/TypeFlow.qll    | 228 ++++++++++++++----
 1 file changed, 184 insertions(+), 44 deletions(-)

diff --git a/java/ql/lib/semmle/code/java/dataflow/TypeFlow.qll b/java/ql/lib/semmle/code/java/dataflow/TypeFlow.qll
index 36eb301a2e1..ff4ea3a7209 100644
--- a/java/ql/lib/semmle/code/java/dataflow/TypeFlow.qll
+++ b/java/ql/lib/semmle/code/java/dataflow/TypeFlow.qll
@@ -53,6 +53,16 @@ private class TypeFlowNode extends TTypeFlowNode {
   }
 }
 
+private int getNodeKind(TypeFlowNode n) {
+  result = 1 and n instanceof TField
+  or
+  result = 2 and n instanceof TSsa
+  or
+  result = 3 and n instanceof TExpr
+  or
+  result = 4 and n instanceof TMethod
+}
+
 /** Gets `t` if it is a `RefType` or the boxed type if `t` is a primitive type. */
 private RefType boxIfNeeded(Type t) {
   t.(PrimitiveType).getBoxedType() = result or
@@ -146,27 +156,165 @@ private predicate joinStep(TypeFlowNode n1, TypeFlowNode n2) {
   joinStep0(n1, n2) and not isNull(n1)
 }
 
-private predicate joinStepRank1(int r, TypeFlowNode n1, TypeFlowNode n2) {
-  n1 =
-    rank[r](TypeFlowNode n |
-      joinStep(n, n2)
+private predicate anyStep(TypeFlowNode n1, TypeFlowNode n2) { joinStep(n1, n2) or step(n1, n2) }
+
+private predicate sccEdge(TypeFlowNode n1, TypeFlowNode n2) { anyStep(n1, n2) and anyStep+(n2, n1) }
+
+/*
+ * SCC reduction.
+ *
+ * This ought to be as easy as `equivalenceRelation(sccEdge/2)(n, scc)`, but
+ * this HOP is not currently supported for newtypes.
+ *
+ * A straightforward implementation would be:
+ * ```
+ * predicate sccRepr(TypeFlowNode n, TypeFlowNode scc) {
+ *  scc =
+ *    max(TypeFlowNode n2 |
+ *      sccEdge+(n, n2)
+ *    |
+ *      n2
+ *      order by
+ *        n2.getLocation().getStartLine(), n2.getLocation().getStartColumn(), getNodeKind(n2)
+ *    )
+ * }
+ *
+ * ```
+ * but this is quadratic in the size of the SCCs.
+ *
+ * Instead we find local maxima and determine the SCC representatives from those.
+ * (This is still worst-case quadratic in the size of the SCCs, but generally
+ * performs better.)
+ */
+
+private predicate sccEdgeWithMax(TypeFlowNode n1, TypeFlowNode n2, TypeFlowNode m) {
+  sccEdge(n1, n2) and
+  m =
+    max(TypeFlowNode n |
+      n = [n1, n2]
     |
-      n order by n.getLocation().getStartLine(), n.getLocation().getStartColumn()
+      n order by n.getLocation().getStartLine(), n.getLocation().getStartColumn(), getNodeKind(n)
     )
 }
 
-private predicate joinStepRank2(int r2, int r1, TypeFlowNode n) {
-  r1 = rank[r2](int r | joinStepRank1(r, _, n) | r)
-}
-
-private predicate joinStepRank(int r, TypeFlowNode n1, TypeFlowNode n2) {
-  exists(int r1 |
-    joinStepRank1(r1, n1, n2) and
-    joinStepRank2(r, r1, n2)
+private predicate hasLargerNeighbor(TypeFlowNode n) {
+  exists(TypeFlowNode n2 |
+    sccEdgeWithMax(n, n2, n2) and
+    not sccEdgeWithMax(n, n2, n)
+    or
+    sccEdgeWithMax(n2, n, n2) and
+    not sccEdgeWithMax(n2, n, n)
   )
 }
 
-private int lastRank(TypeFlowNode n) { result = max(int r | joinStepRank(r, _, n)) }
+private predicate localMax(TypeFlowNode m) {
+  sccEdgeWithMax(_, _, m) and
+  not hasLargerNeighbor(m)
+}
+
+private predicate sccReprFromLocalMax(TypeFlowNode scc) {
+  exists(TypeFlowNode m |
+    localMax(m) and
+    scc =
+      max(TypeFlowNode n2 |
+        sccEdge+(m, n2) and localMax(n2)
+      |
+        n2
+        order by
+          n2.getLocation().getStartLine(), n2.getLocation().getStartColumn(), getNodeKind(n2)
+      )
+  )
+}
+
+private predicate sccRepr(TypeFlowNode n, TypeFlowNode scc) {
+  sccEdge+(n, scc) and sccReprFromLocalMax(scc)
+}
+
+private predicate sccJoinStep(TypeFlowNode n, TypeFlowNode scc) {
+  exists(TypeFlowNode mid |
+    joinStep(n, mid) and
+    sccRepr(mid, scc) and
+    not sccRepr(n, scc)
+  )
+}
+
+private signature predicate edgeSig(TypeFlowNode n1, TypeFlowNode n2);
+
+private signature module RankedEdge {
+  predicate edgeRank(int r, TypeFlowNode n1, TypeFlowNode n2);
+
+  int lastRank(TypeFlowNode n);
+}
+
+private module RankEdge<edgeSig/2 edge> implements RankedEdge {
+  private predicate edgeRank1(int r, TypeFlowNode n1, TypeFlowNode n2) {
+    n1 =
+      rank[r](TypeFlowNode n |
+        edge(n, n2)
+      |
+        n order by n.getLocation().getStartLine(), n.getLocation().getStartColumn()
+      )
+  }
+
+  private predicate edgeRank2(int r2, int r1, TypeFlowNode n) {
+    r1 = rank[r2](int r | edgeRank1(r, _, n) | r)
+  }
+
+  predicate edgeRank(int r, TypeFlowNode n1, TypeFlowNode n2) {
+    exists(int r1 |
+      edgeRank1(r1, n1, n2) and
+      edgeRank2(r, r1, n2)
+    )
+  }
+
+  int lastRank(TypeFlowNode n) { result = max(int r | edgeRank(r, _, n)) }
+}
+
+private signature module TypePropagation {
+  predicate candType(TypeFlowNode n, RefType t);
+
+  bindingset[t]
+  predicate supportsType(TypeFlowNode n, RefType t);
+}
+
+/** Implements recursion through `forall` by way of edge ranking. */
+private module ForAll<RankedEdge Edge, TypePropagation T> {
+  /**
+   * Holds if `t` is a bound that holds on one of the incoming edges to `n` and
+   * thus is a candidate bound for `n`.
+   */
+  pragma[nomagic]
+  private predicate candJoinType(TypeFlowNode n, RefType t) {
+    exists(TypeFlowNode mid |
+      T::candType(mid, t) and
+      Edge::edgeRank(_, mid, n)
+    )
+  }
+
+  /**
+   * Holds if `t` is a candidate bound for `n` that is also valid for data coming
+   * through the edges into `n` ranked from `1` to `r`.
+   */
+  private predicate flowJoin(int r, TypeFlowNode n, RefType t) {
+    (
+      r = 1 and candJoinType(n, t)
+      or
+      flowJoin(r - 1, n, t) and Edge::edgeRank(r, _, n)
+    ) and
+    forall(TypeFlowNode mid | Edge::edgeRank(r, mid, n) | T::supportsType(mid, t))
+  }
+
+  /**
+   * Holds if `t` is a candidate bound for `n` that is also valid for data
+   * coming through all the incoming edges, and therefore is a valid bound for
+   * `n`.
+   */
+  predicate flowJoin(TypeFlowNode n, RefType t) { flowJoin(Edge::lastRank(n), n, t) }
+}
+
+module RankedJoinStep = RankEdge<joinStep/2>;
+
+module RankedSccJoinStep = RankEdge<sccJoinStep/2>;
 
 private predicate exactTypeBase(TypeFlowNode n, RefType t) {
   exists(ClassInstanceExpr e |
@@ -177,15 +325,10 @@ private predicate exactTypeBase(TypeFlowNode n, RefType t) {
   )
 }
 
-private predicate exactTypeRank(int r, TypeFlowNode n, RefType t) {
-  forall(TypeFlowNode mid | joinStepRank(r, mid, n) | exactType(mid, t)) and
-  joinStepRank(r, _, n)
-}
+private module ExactTypePropagation implements TypePropagation {
+  predicate candType = exactType/2;
 
-private predicate exactTypeJoin(int r, TypeFlowNode n, RefType t) {
-  exactTypeRank(1, n, t) and r = 1
-  or
-  exactTypeJoin(r - 1, n, t) and exactTypeRank(r, n, t)
+  predicate supportsType = exactType/2;
 }
 
 /**
@@ -199,7 +342,14 @@ private predicate exactType(TypeFlowNode n, RefType t) {
   or
   // The following is an optimized version of
   // `forex(TypeFlowNode mid | joinStep(mid, n) | exactType(mid, t))`
-  exactTypeJoin(lastRank(n), n, t)
+  ForAll<RankedJoinStep, ExactTypePropagation>::flowJoin(n, t)
+  or
+  exists(TypeFlowNode scc |
+    sccRepr(n, scc) and
+    // Optimized version of
+    // `forex(TypeFlowNode mid | sccJoinStep(mid, scc) | exactType(mid, t))`
+    ForAll<RankedSccJoinStep, ExactTypePropagation>::flowJoin(scc, t)
+  )
 }
 
 /**
@@ -343,30 +493,15 @@ private predicate typeFlowBase(TypeFlowNode n, RefType t) {
   )
 }
 
-/**
- * Holds if `t` is a bound that holds on one of the incoming edges to `n` and
- * thus is a candidate bound for `n`.
- */
-pragma[noinline]
-private predicate typeFlowJoinCand(TypeFlowNode n, RefType t) {
-  exists(TypeFlowNode mid | joinStep(mid, n) | typeFlow(mid, t))
-}
+private module TypeFlowPropagation implements TypePropagation {
+  predicate candType = typeFlow/2;
 
-/**
- * Holds if `t` is a candidate bound for `n` that is also valid for data coming
- * through the edges into `n` ranked from `1` to `r`.
- */
-private predicate typeFlowJoin(int r, TypeFlowNode n, RefType t) {
-  (
-    r = 1 and typeFlowJoinCand(n, t)
-    or
-    typeFlowJoin(r - 1, n, t) and joinStepRank(r, _, n)
-  ) and
-  forall(TypeFlowNode mid | joinStepRank(r, mid, n) |
+  bindingset[t]
+  predicate supportsType(TypeFlowNode mid, RefType t) {
     exists(RefType midtyp | exactType(mid, midtyp) or typeFlow(mid, midtyp) |
       pragma[only_bind_out](midtyp).getAnAncestor() = t
     )
-  )
+  }
 }
 
 /**
@@ -378,7 +513,12 @@ private predicate typeFlow(TypeFlowNode n, RefType t) {
   or
   exists(TypeFlowNode mid | typeFlow(mid, t) and step(mid, n))
   or
-  typeFlowJoin(lastRank(n), n, t)
+  ForAll<RankedJoinStep, TypeFlowPropagation>::flowJoin(n, t)
+  or
+  exists(TypeFlowNode scc |
+    sccRepr(n, scc) and
+    ForAll<RankedSccJoinStep, TypeFlowPropagation>::flowJoin(scc, t)
+  )
 }
 
 pragma[nomagic]

From bc57d87303816b92bf7c35df067f450191179554 Mon Sep 17 00:00:00 2001
From: Anders Schack-Mulligen <aschackmull@github.com>
Date: Tue, 6 Sep 2022 13:59:54 +0200
Subject: [PATCH 2/2] Java: Address comments.

---
 .../semmle/code/java/dataflow/TypeFlow.qll    | 114 ++++++++++--------
 1 file changed, 65 insertions(+), 49 deletions(-)

diff --git a/java/ql/lib/semmle/code/java/dataflow/TypeFlow.qll b/java/ql/lib/semmle/code/java/dataflow/TypeFlow.qll
index ff4ea3a7209..d5b66baafb2 100644
--- a/java/ql/lib/semmle/code/java/dataflow/TypeFlow.qll
+++ b/java/ql/lib/semmle/code/java/dataflow/TypeFlow.qll
@@ -158,16 +158,16 @@ private predicate joinStep(TypeFlowNode n1, TypeFlowNode n2) {
 
 private predicate anyStep(TypeFlowNode n1, TypeFlowNode n2) { joinStep(n1, n2) or step(n1, n2) }
 
-private predicate sccEdge(TypeFlowNode n1, TypeFlowNode n2) { anyStep(n1, n2) and anyStep+(n2, n1) }
+private import SccReduction
 
-/*
+/**
  * SCC reduction.
  *
  * This ought to be as easy as `equivalenceRelation(sccEdge/2)(n, scc)`, but
  * this HOP is not currently supported for newtypes.
  *
  * A straightforward implementation would be:
- * ```
+ * ```ql
  * predicate sccRepr(TypeFlowNode n, TypeFlowNode scc) {
  *  scc =
  *    max(TypeFlowNode n2 |
@@ -182,60 +182,67 @@ private predicate sccEdge(TypeFlowNode n1, TypeFlowNode n2) { anyStep(n1, n2) an
  * ```
  * but this is quadratic in the size of the SCCs.
  *
- * Instead we find local maxima and determine the SCC representatives from those.
+ * Instead we find local maxima by following SCC edges and determine the SCC
+ * representatives from those.
  * (This is still worst-case quadratic in the size of the SCCs, but generally
  * performs better.)
  */
+private module SccReduction {
+  private predicate sccEdge(TypeFlowNode n1, TypeFlowNode n2) {
+    anyStep(n1, n2) and anyStep+(n2, n1)
+  }
 
-private predicate sccEdgeWithMax(TypeFlowNode n1, TypeFlowNode n2, TypeFlowNode m) {
-  sccEdge(n1, n2) and
-  m =
-    max(TypeFlowNode n |
-      n = [n1, n2]
-    |
-      n order by n.getLocation().getStartLine(), n.getLocation().getStartColumn(), getNodeKind(n)
-    )
-}
-
-private predicate hasLargerNeighbor(TypeFlowNode n) {
-  exists(TypeFlowNode n2 |
-    sccEdgeWithMax(n, n2, n2) and
-    not sccEdgeWithMax(n, n2, n)
-    or
-    sccEdgeWithMax(n2, n, n2) and
-    not sccEdgeWithMax(n2, n, n)
-  )
-}
-
-private predicate localMax(TypeFlowNode m) {
-  sccEdgeWithMax(_, _, m) and
-  not hasLargerNeighbor(m)
-}
-
-private predicate sccReprFromLocalMax(TypeFlowNode scc) {
-  exists(TypeFlowNode m |
-    localMax(m) and
-    scc =
-      max(TypeFlowNode n2 |
-        sccEdge+(m, n2) and localMax(n2)
+  private predicate sccEdgeWithMax(TypeFlowNode n1, TypeFlowNode n2, TypeFlowNode m) {
+    sccEdge(n1, n2) and
+    m =
+      max(TypeFlowNode n |
+        n = [n1, n2]
       |
-        n2
-        order by
-          n2.getLocation().getStartLine(), n2.getLocation().getStartColumn(), getNodeKind(n2)
+        n order by n.getLocation().getStartLine(), n.getLocation().getStartColumn(), getNodeKind(n)
       )
-  )
-}
+  }
 
-private predicate sccRepr(TypeFlowNode n, TypeFlowNode scc) {
-  sccEdge+(n, scc) and sccReprFromLocalMax(scc)
-}
+  private predicate hasLargerNeighbor(TypeFlowNode n) {
+    exists(TypeFlowNode n2 |
+      sccEdgeWithMax(n, n2, n2) and
+      not sccEdgeWithMax(n, n2, n)
+      or
+      sccEdgeWithMax(n2, n, n2) and
+      not sccEdgeWithMax(n2, n, n)
+    )
+  }
 
-private predicate sccJoinStep(TypeFlowNode n, TypeFlowNode scc) {
-  exists(TypeFlowNode mid |
-    joinStep(n, mid) and
-    sccRepr(mid, scc) and
-    not sccRepr(n, scc)
-  )
+  private predicate localMax(TypeFlowNode m) {
+    sccEdgeWithMax(_, _, m) and
+    not hasLargerNeighbor(m)
+  }
+
+  private predicate sccReprFromLocalMax(TypeFlowNode scc) {
+    exists(TypeFlowNode m |
+      localMax(m) and
+      scc =
+        max(TypeFlowNode n2 |
+          sccEdge+(m, n2) and localMax(n2)
+        |
+          n2
+          order by
+            n2.getLocation().getStartLine(), n2.getLocation().getStartColumn(), getNodeKind(n2)
+        )
+    )
+  }
+
+  /** Holds if `n` is part of an SCC of size 2 or more represented by `scc`. */
+  predicate sccRepr(TypeFlowNode n, TypeFlowNode scc) {
+    sccEdge+(n, scc) and sccReprFromLocalMax(scc)
+  }
+
+  predicate sccJoinStep(TypeFlowNode n, TypeFlowNode scc) {
+    exists(TypeFlowNode mid |
+      joinStep(n, mid) and
+      sccRepr(mid, scc) and
+      not sccRepr(n, scc)
+    )
+  }
 }
 
 private signature predicate edgeSig(TypeFlowNode n1, TypeFlowNode n2);
@@ -247,6 +254,10 @@ private signature module RankedEdge {
 }
 
 private module RankEdge<edgeSig/2 edge> implements RankedEdge {
+  /**
+   * Holds if `r` is a ranking of the incoming edges `(n1,n2)` to `n2`. The used
+   * ordering is not necessarily total, so the ranking may have gaps.
+   */
   private predicate edgeRank1(int r, TypeFlowNode n1, TypeFlowNode n2) {
     n1 =
       rank[r](TypeFlowNode n |
@@ -256,10 +267,15 @@ private module RankEdge<edgeSig/2 edge> implements RankedEdge {
       )
   }
 
+  /**
+   * Holds if `r2` is a ranking of the ranks from `edgeRank1`. This removes the
+   * gaps from the ranking.
+   */
   private predicate edgeRank2(int r2, int r1, TypeFlowNode n) {
     r1 = rank[r2](int r | edgeRank1(r, _, n) | r)
   }
 
+  /** Holds if `r` is a ranking of the incoming edges `(n1,n2)` to `n2`. */
   predicate edgeRank(int r, TypeFlowNode n1, TypeFlowNode n2) {
     exists(int r1 |
       edgeRank1(r1, n1, n2) and