argv-out-of-bounds-1.ql: ssa query identifying the out-of-bound access

2025-12-16 22:03:03 +01:00 · 2022-03-03 19:19:12 -08:00
parent 010254d3d4
commit ae4e677dec
2 changed files with 124 additions and 0 deletions
--- a/README.org
+++ b/README.org
@@ -338,6 +338,21 @@
                      | 640     }
   #+END_SRC
   Further exploration is done in [[./argv-out-of-bounds-1.ql]], starting with an
   attempt at using the =SimpleRangeAnalysis= library
   #+begin_src javascript
       lowerBound(cmp.getLeftOperand().getFullyConverted()),  "left lower bound",
       lowerBound(cmp.getRightOperand().getFullyConverted()), "right lower bound"
   #+end_src
   The bounds are correct bounds for the types, but too general -- they include
   the possible results of iteration.  We are only interested in initial bounds
   that are statically determinate, those before any iteration happens.
   Put another way, this is not a general data flow problem; we only want to check
   initial value propagation along certain execution paths.  The =for= loop
   complicates this, as do the operations within it.  We really want to see
   execution paths that bypass the loop altogether.  That is done in the latter
   parts of [[./argv-out-of-bounds-1.ql]], using a =SsaDefinition=.
 # TODO
 # ** Running the query from the command line
--- a/argv-out-of-bounds-1.ql
+++ b/argv-out-of-bounds-1.ql
@@ -0,0 +1,109 @@
 /**
 * @name Argv index out-of-bounds-1
 * @ kind problem
 * @id cpp/example/argv-out-of-bounds-1
 */
 import cpp
 import semmle.code.cpp.rangeanalysis.SimpleRangeAnalysis
 /*
 * Use
 * https://codeql.github.com/docs/codeql-language-guides/codeql-library-for-cpp/
 * to find the codeql class matching c++ syntax
 *
 * See https://codeql.github.com/docs/codeql-language-guides/using-range-analsis-in-cpp/
 * for the entry points to the range analysis library.  Namely,
 * `import semmle.code.cpp.rangeanalysis.SimpleRangeAnalysis`
 * and
 * `lowerBound(expr)`
 */
 from
  Parameter argc, RelationalOperation cmp, Variable n, Parameter argv, ArrayExpr argvAccess,
  AssignExpr argvSet, SsaDefinition invalidN, SsaDefinition invalidNDef
 where
  //
  //     consider argc and n when they occur in the same comparison
  //
  argc.getName() = "argc" and
  argc.getAnAccess() = cmp.getAnOperand().getAChild*() and
  n.getAnAccess() = cmp.getAnOperand().getAChild*() and
  not n instanceof Parameter and
  //
  //    find indexed reads of argv using n
  //
  argvAccess.getArrayBase() = argv.getAnAccess() and
  argvAccess.getArrayOffset() = n.getAnAccess() and
  //
  //   find the indexed writes of argv
  //
  exists(ArrayExpr aa |
    aa.getArrayBase() = argv.getAnAccess() and
    aa.getArrayOffset() = n.getAnAccess() and
    argvSet.getLValue() = aa and
    /*  Separate the read */
    aa != argvAccess
  ) and
  //
  //   ignore argv use when it's an argument of strcmp
  //
  not exists(FunctionCall strcmp |
    strcmp.getTarget().getName() = "strcmp" and
    strcmp.getAnArgument*() = argvAccess
  ) and
  // // ----------------------
  // //
  // // The first thing to try is the range library via these predicates:
  // select argc, "definition of argc", cmp, "use in comparison", n, "other var in comparison",
  //   argvAccess, "argv indexed", argvSet, "argv assigned",
  //   , lowerBound(cmp.getLeftOperand()), "left lower bound"
  //   , lowerBound(cmp.getRightOperand()), "right lower bound"
  // // ----------------------
  // //
  // // Both bounds are 0, which is somewhat surprising.  The reason is that
  // // results include possible increments from loop; with overflow and cast,
  // // this makes 0 the minimum.
  // // These are correct bounds for the types, but too general -- they include the possible results
  // // of iteration.  We are only interested in initial bounds that are statically determinate,
  // // those before any iteration happens.
  // // Just to check:
  // select argc, "definition of argc", cmp, "use in comparison", n, "other var in comparison",
  //   argvAccess, "argv indexed", argvSet, "argv assigned",
  //   lowerBound(cmp.getLeftOperand().getFullyConverted()),  "left lower bound",
  //   lowerBound(cmp.getRightOperand().getFullyConverted()), "right lower bound"
  // ----------------------
  //
  // Step back from the general range library and note a few details about this
  // problem:
  // - The range of argc is set by the OS, so it's known at compile time
  // - The first value of n is set in the code, it's also statically determined
  // - We are concerned about the lowest possible value of the argv index, no other
  //
  // We can rephrase the problem:
  //
  // Find an execution path (if any), using statically known values, that reaches
  // an argv assignment with invalid index.
  //
  // To track only values of the argv index that are too low, we need to stay on
  // certain branches of the CFG, namely those matching a SSA defition of the
  // index variable.
  //
  // A first attempt, starting from `cmp` is the following.  This finds two
  // invalidNDef locations, n++ and n = 1, and several `argvAccess`s
  cmp.getLesserOperand() = invalidN.getAUse(n) and 
  cmp.getGreaterOperand() = argc.getAnAccess() and
  invalidN.getAnUltimateDefiningValue(n).getValue().toInt() > 0 and
  invalidNDef = invalidN.getAnUltimateSsaDefinition(n)
  //
  // We still find an access inside the loop,
  //    opt_user = g_strdup (argv[n]);
  // so let's narrow:
  and 
  invalidN.getAUse(n) = argvAccess.getArrayOffset() and
  argvSet = cmp.getAFalseSuccessor().getASuccessor*() and
  argvAccess = cmp.getAFalseSuccessor().getASuccessor*() 
 select argc, "definition of argc", cmp, "use in comparison", n, "other var in comparison",
  argvAccess, "argv indexed", argvSet, "argv assigned", invalidNDef, "invalid n definition"