Merge pull request #12818 from MathiasVP/dataflow-for-missing-scanf-qery

C++: Use the new dataflow library in `cpp/missing-check-scanf`
2025-12-22 11:46:32 +01:00 · 2023-04-17 14:34:11 +01:00
parent 901db73d55 0db05fe4fa
commit d975ceb648
3 changed files with 113 additions and 145 deletions
--- a/cpp/ql/src/Critical/MissingCheckScanf.ql
+++ b/cpp/ql/src/Critical/MissingCheckScanf.ql
@@ -16,160 +16,133 @@
 import cpp
 import semmle.code.cpp.commons.Scanf
 import semmle.code.cpp.controlflow.Guards
-import semmle.code.cpp.ir.dataflow.DataFlow
+import semmle.code.cpp.dataflow.new.DataFlow::DataFlow
 import semmle.code.cpp.ir.IR
 import semmle.code.cpp.ir.ValueNumbering

-/**
- * Holds if `call` is a `scanf`-like function that may write to `output` at index `index`.
- *
- * Furthermore, `instr` is the instruction that defines the address of the `index`'th argument
- * of `call`, and `vn` is the value number of `instr.`
- */
-predicate isSource(ScanfFunctionCall call, int index, Instruction instr, ValueNumber vn, Expr output) {
-  output = call.getOutputArgument(index).getFullyConverted() and
-  instr.getConvertedResultExpression() = output and
-  vn.getAnInstruction() = instr
+/** Holds if `n` reaches an argument  to a call to a `scanf`-like function. */
+pragma[nomagic]
+predicate revFlow0(Node n) {
+  isSink(_, _, n, _)
+  or
+  exists(Node succ | revFlow0(succ) | localFlowStep(n, succ))
 }

 /**
- * Holds if `instr` is control-flow reachable in 0 or more steps from
- * a call to a `scanf`-like function.
+ * Holds if `n` represents an uninitialized stack-allocated variable, or a
+ * newly (and presumed uninitialized) heap allocation.
 */
+predicate isUninitialized(Node n) {
+  exists(n.asUninitialized()) or
+  n.asIndirectExpr(1) instanceof AllocationExpr
+}
+
 pragma[nomagic]
-predicate fwdFlow0(Instruction instr) {
-  isSource(_, _, instr, _, _)
-  or
-  exists(Instruction prev |
-    fwdFlow0(prev) and
-    prev.getASuccessor() = instr
+predicate fwdFlow0(Node n) {
+  revFlow0(n) and
+  (
+    isUninitialized(n)
+    or
+    exists(Node prev |
+      fwdFlow0(prev) and
+      localFlowStep(prev, n)
+    )
  )
 }

-/**
- * Holds if `instr` is part of the IR translation of `access` that
- * is not an expression being deallocated, and `instr` has value
- * number `vn`.
- */
-predicate isSink(Instruction instr, Access access, ValueNumber vn) {
-  instr.getAst() = access and
-  not any(DeallocationExpr dealloc).getFreedExpr() = access and
-  vn.getAnInstruction() = instr
+predicate isSink(ScanfFunctionCall call, int index, Node n, Expr input) {
+  input = call.getOutputArgument(index) and
+  n.asIndirectExpr() = input
 }

 /**
- * Holds if `instr` is part of a path from a call to a `scanf`-like function
+ * Holds if `call` is a `scanf`-like call and `output` is the `index`'th
+ * argument that has not been previously initialized.
+ */
+predicate isRelevantScanfCall(ScanfFunctionCall call, int index, Expr output) {
+  exists(Node n | fwdFlow0(n) and isSink(call, index, n, output))
+}
+
+/**
+ * Holds if `call` is a `scanf`-like function that may write to `output` at
+ * index `index` and `n` is the dataflow node that represents the data after
+ * it has been written to by `call`.
+ */
+predicate isSource(ScanfFunctionCall call, int index, Node n, Expr output) {
+  isRelevantScanfCall(call, index, output) and
+  output = call.getOutputArgument(index) and
+  n.asDefiningArgument() = output
+}
+
+/**
+ * Holds if `n` is reachable from an output argument of a relevant call to
+ * a `scanf`-like function.
+ */
+pragma[nomagic]
+predicate fwdFlow(Node n) {
+  isSource(_, _, n, _)
+  or
+  exists(Node prev |
+    fwdFlow(prev) and
+    localFlowStep(prev, n) and
+    not isSanitizerOut(prev)
+  )
+}
+
+/** Holds if `n` should not have outgoing flow. */
+predicate isSanitizerOut(Node n) {
+  // We disable flow out of sinks to reduce result duplication
+  isSink(n, _)
+  or
+  // If the node is being passed to a function it may be
+  // modified, and thus it's safe to later read the value.
+  exists(n.asIndirectArgument())
+}
+
+/**
+ * Holds if `n` is a node such that `n.asExpr() = e` and `e` is not an
+ * argument of a deallocation expression.
+ */
+predicate isSink(Node n, Expr e) {
+  n.asExpr() = e and
+  not any(DeallocationExpr dealloc).getFreedExpr() = e
+}
+
+/**
+ * Holds if `n` is part of a path from a call to a `scanf`-like function
 * to a use of the written variable.
 */
 pragma[nomagic]
-predicate revFlow0(Instruction instr) {
-  fwdFlow0(instr) and
+predicate revFlow(Node n) {
+  fwdFlow(n) and
  (
-    isSink(instr, _, _)
+    isSink(n, _)
    or
-    exists(Instruction succ | revFlow0(succ) | instr.getASuccessor() = succ)
-  )
-}
-
-/**
- * Holds if `instr` is part of a path from a call to a `scanf`-like function
- * that writes to a variable with value number `vn`, without passing through
- * redefinitions of the variable.
- */
-pragma[nomagic]
-private predicate fwdFlow(Instruction instr, ValueNumber vn) {
-  revFlow0(instr) and
-  (
-    isSource(_, _, instr, vn, _)
-    or
-    exists(Instruction prev |
-      fwdFlow(prev, vn) and
-      prev.getASuccessor() = instr and
-      not isBarrier(instr, vn)
+    exists(Node succ |
+      revFlow(succ) and
+      localFlowStep(n, succ) and
+      not isSanitizerOut(n)
    )
  )
 }

-/**
- * Holds if `instr` is part of a path from a call to a `scanf`-like function
- * that writes to a variable with value number `vn`, without passing through
- * redefinitions of the variable.
- *
- * Note: This predicate only holds for the `(intr, vn)` pairs that are also
- * control-flow reachable from an argument to a `scanf`-like function call.
- */
-pragma[nomagic]
-predicate revFlow(Instruction instr, ValueNumber vn) {
-  fwdFlow(instr, pragma[only_bind_out](vn)) and
-  (
-    isSink(instr, _, vn)
-    or
-    exists(Instruction succ | revFlow(succ, vn) |
-      instr.getASuccessor() = succ and
-      not isBarrier(succ, vn)
-    )
-  )
+/** A local flow step, restricted to relevant dataflow nodes. */
+private predicate step(Node n1, Node n2) {
+  revFlow(n1) and
+  revFlow(n2) and
+  localFlowStep(n1, n2)
 }

-/**
- * A type that bundles together a reachable instruction with the appropriate
- * value number (i.e., the value number that's transferred from the source
- * to the sink).
- */
-newtype TNode = MkNode(Instruction instr, ValueNumber vn) { revFlow(instr, vn) }
-
-class Node extends MkNode {
-  ValueNumber vn;
-  Instruction instr;
-
-  Node() { this = MkNode(instr, vn) }
-
-  final string toString() { result = instr.toString() }
-
-  final Node getASuccessor() { result = MkNode(pragma[only_bind_out](instr.getASuccessor()), vn) }
-
-  final Location getLocation() { result = instr.getLocation() }
-}
-
-/**
- * Holds if `instr` is an instruction with value number `vn` that is
- * used in a store operation, or is overwritten by another call to
- * a `scanf`-like function.
- */
-private predicate isBarrier(Instruction instr, ValueNumber vn) {
-  // We only need to compute barriers for instructions that we
-  // managed to hit during the initial flow stage.
-  revFlow0(pragma[only_bind_into](instr)) and
-  valueNumber(instr) = vn and
-  exists(Expr e | instr.getAst() = e |
-    instr = any(StoreInstruction s).getDestinationAddress()
-    or
-    isSource(_, _, _, _, [e, e.getParent().(AddressOfExpr)])
-  )
-}
-
-/** Holds if `n1` steps to `n2` in a single step. */
-predicate isSuccessor(Node n1, Node n2) { n1.getASuccessor() = n2 }
-
-predicate hasFlow(Node n1, Node n2) = fastTC(isSuccessor/2)(n1, n2)
-
-Node getNode(Instruction instr, ValueNumber vn) { result = MkNode(instr, vn) }
+predicate hasFlow(Node n1, Node n2) = fastTC(step/2)(n1, n2)

 /**
 * Holds if `source` is the `index`'th argument to the `scanf`-like call `call`, and `sink` is
- * an instruction that is part of the translation of `access` which is a transitive
- * control-flow successor of `call`.
- *
- * Furthermore, `source` and `sink` have identical global value numbers.
+ * a dataflow node that represents the expression `e`.
 */
-predicate hasFlow(
-  Instruction source, ScanfFunctionCall call, int index, Instruction sink, Access access
-) {
-  exists(ValueNumber vn |
-    isSource(call, index, source, vn, _) and
-    hasFlow(getNode(source, pragma[only_bind_into](vn)), getNode(sink, pragma[only_bind_into](vn))) and
-    isSink(sink, access, vn)
-  )
+predicate hasFlow(Node source, ScanfFunctionCall call, int index, Node sink, Expr e) {
+  isSource(call, index, source, _) and
+  hasFlow(source, sink) and
+  isSink(sink, e)
 }

 /**
@@ -177,7 +150,7 @@ predicate hasFlow(
 * success in writing the output argument at index `index`.
 */
 int getMinimumGuardConstant(ScanfFunctionCall call, int index) {
-  isSource(call, index, _, _, _) and
+  isSource(call, index, _, _) and
  result =
    index + 1 -
      count(ScanfFormatLiteral f, int n |
@@ -191,7 +164,7 @@ int getMinimumGuardConstant(ScanfFunctionCall call, int index) {
 * Holds the access to `e` isn't guarded by a check that ensures that `call` returned
 * at least `minGuard`.
 */
-predicate hasNonGuardedAccess(ScanfFunctionCall call, Access e, int minGuard) {
+predicate hasNonGuardedAccess(ScanfFunctionCall call, Expr e, int minGuard) {
  exists(int index |
    hasFlow(_, call, index, _, e) and
    minGuard = getMinimumGuardConstant(call, index)
@@ -211,7 +184,7 @@ BasicBlock blockGuardedBy(int value, string op, ScanfFunctionCall call) {
  exists(GuardCondition g, Expr left, Expr right |
    right = g.getAChild() and
    value = left.getValue().toInt() and
-    DataFlow::localExprFlow(call, right)
+    localExprFlow(call, right)
  |
    g.ensuresEq(left, right, 0, result, true) and op = "=="
    or
@@ -221,9 +194,9 @@ BasicBlock blockGuardedBy(int value, string op, ScanfFunctionCall call) {
  )
 }

-from ScanfFunctionCall call, Access access, int minGuard
-where hasNonGuardedAccess(call, access, minGuard)
-select access,
+from ScanfFunctionCall call, Expr e, int minGuard
+where hasNonGuardedAccess(call, e, minGuard)
+select e,
  "This variable is read, but may not have been written. " +
    "It should be guarded by a check that the $@ returns at least " + minGuard + ".", call,
  call.toString()
--- a/cpp/ql/test/query-tests/Critical/MissingCheckScanf/MissingCheckScanf.expected
+++ b/cpp/ql/test/query-tests/Critical/MissingCheckScanf/MissingCheckScanf.expected
@@ -1,9 +1,8 @@
 | test.cpp:35:7:35:7 | i | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:34:3:34:7 | call to scanf | call to scanf |
-| test.cpp:51:7:51:7 | i | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:50:3:50:7 | call to scanf | call to scanf |
 | test.cpp:68:7:68:7 | i | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:67:3:67:7 | call to scanf | call to scanf |
 | test.cpp:80:7:80:7 | i | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:79:3:79:7 | call to scanf | call to scanf |
-| test.cpp:90:8:90:8 | i | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:89:3:89:7 | call to scanf | call to scanf |
-| test.cpp:98:8:98:8 | i | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:97:3:97:7 | call to scanf | call to scanf |
+| test.cpp:90:7:90:8 | * ... | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:89:3:89:7 | call to scanf | call to scanf |
+| test.cpp:98:7:98:8 | * ... | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:97:3:97:7 | call to scanf | call to scanf |
 | test.cpp:108:7:108:7 | i | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:107:3:107:8 | call to fscanf | call to fscanf |
 | test.cpp:115:7:115:7 | i | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:114:3:114:8 | call to sscanf | call to sscanf |
 | test.cpp:164:8:164:8 | i | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:162:7:162:11 | call to scanf | call to scanf |
@@ -12,13 +11,9 @@
 | test.cpp:224:8:224:8 | j | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 2. | test.cpp:221:7:221:11 | call to scanf | call to scanf |
 | test.cpp:248:9:248:9 | d | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 2. | test.cpp:246:25:246:29 | call to scanf | call to scanf |
 | test.cpp:252:9:252:9 | d | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 2. | test.cpp:250:14:250:18 | call to scanf | call to scanf |
-| test.cpp:264:7:264:7 | i | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:263:3:263:7 | call to scanf | call to scanf |
 | test.cpp:272:7:272:7 | i | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:271:3:271:7 | call to scanf | call to scanf |
 | test.cpp:280:7:280:7 | i | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:279:3:279:7 | call to scanf | call to scanf |
 | test.cpp:292:7:292:7 | i | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:291:3:291:7 | call to scanf | call to scanf |
-| test.cpp:302:8:302:12 | ptr_i | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:301:3:301:7 | call to scanf | call to scanf |
-| test.cpp:310:7:310:7 | i | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:309:3:309:7 | call to scanf | call to scanf |
 | test.cpp:404:25:404:25 | u | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:403:6:403:11 | call to sscanf | call to sscanf |
 | test.cpp:416:7:416:7 | i | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:413:7:413:11 | call to scanf | call to scanf |
 | test.cpp:423:7:423:7 | i | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:420:7:420:11 | call to scanf | call to scanf |
-| test.cpp:430:6:430:6 | i | This variable is read, but may not have been written. It should be guarded by a check that the $@ returns at least 1. | test.cpp:429:2:429:6 | call to scanf | call to scanf |
--- a/cpp/ql/test/query-tests/Critical/MissingCheckScanf/test.cpp
+++ b/cpp/ql/test/query-tests/Critical/MissingCheckScanf/test.cpp
@@ -48,7 +48,7 @@ int main()
 		int i = 0;

 		scanf("%d", &i);
-		use(i); // BAD. Design choice: already initialized variables shouldn't make a difference.
+		use(i); // GOOD. Design choice: already initialized variables are fine.
 	}

 	{
@@ -261,7 +261,7 @@ int main()
 		i = 0;

 		scanf("%d", &i);
-		use(i); // BAD
+		use(i); // GOOD
 	}

 	{
@@ -269,7 +269,7 @@ int main()

 		set_by_ref(i);
 		scanf("%d", &i);
-		use(i); // BAD
+		use(i); // GOOD [FALSE POSITIVE]
 	}

 	{
@@ -277,7 +277,7 @@ int main()

 		set_by_ptr(&i);
 		scanf("%d", &i);
-		use(i); // BAD
+		use(i); // GOOD [FALSE POSITIVE]
 	}

 	{
@@ -299,7 +299,7 @@ int main()
 		int *ptr_i = &i;

 		scanf("%d", &i);
-		use(*ptr_i); // BAD: may not have written `i`
+		use(*ptr_i); // BAD [NOT DETECTED]: may not have written `i`
 	}

 	{
@@ -307,7 +307,7 @@ int main()
 		int *ptr_i = &i;

 		scanf("%d", ptr_i);
-		use(i); // BAD: may not have written `*ptr_i`
+		use(i); // BAD [NOT DETECTED]: may not have written `*ptr_i`
 	}

 	{
@@ -427,5 +427,5 @@ void scan_and_write() {
 void scan_and_static_variable() {
 	static int i;
 	scanf("%d", &i);
-	use(i);  // GOOD [FALSE POSITIVE]: static variables are always 0-initialized
+	use(i);  // GOOD: static variables are always 0-initialized
 }