From 80c6f082d114ce5772dd38330b528868e3914363 Mon Sep 17 00:00:00 2001 From: Owen Mansel-Chan Date: Thu, 28 May 2026 11:34:02 +0100 Subject: [PATCH] Fix TODO in `containerStep` --- .../new/internal/TaintTrackingPrivate.qll | 57 ++++++++++--------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/TaintTrackingPrivate.qll b/python/ql/lib/semmle/python/dataflow/new/internal/TaintTrackingPrivate.qll index 2d2cceb73c1..de5a06eaaad 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/TaintTrackingPrivate.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/TaintTrackingPrivate.qll @@ -11,21 +11,35 @@ private import semmle.python.ApiGraphs */ predicate defaultTaintSanitizer(DataFlow::Node node) { none() } +/** + * Holds if default taint tracking should read content `contentSet` implicitly and + * propagate taint from a container to reads of that content. + */ +private predicate defaultTaintReadContent(DataFlow::ContentSet contentSet) { + // Tuple and dictionary content is precise, so use wildcard content sets to avoid + // blowing up the size of `Stage1::readSetEx` (otherwise this predicate would + // expand to one row per (node, distinct key or index) and the framework's + // read-set relation grows quadratically). `ContentSet.getAReadContent` expands + // these wildcards back to the specific contents when matching against stores. + contentSet.isAnyTupleElement() + or + contentSet.isAnyDictionaryElement() + or + // List and set element content is already imprecise, so no wildcard expansion is + // needed. + contentSet.getAStoreContent() instanceof DataFlow::ListElementContent + or + contentSet.getAStoreContent() instanceof DataFlow::SetElementContent +} + /** * Holds if default `TaintTracking::Configuration`s should allow implicit reads * of `c` at sinks and inputs to additional taint steps. */ bindingset[node] predicate defaultImplicitTaintRead(DataFlow::Node node, DataFlow::ContentSet c) { - // We allow implicit reads of precise content; imprecise content has already - // bubbled up. We use the wildcard content sets here rather than the - // per-key/per-index ones to avoid blowing up the size of `Stage1::readSetEx` - // (otherwise this predicate would expand to one row per (node, distinct key - // or index) and the framework's read-set relation grows quadratically). - // `ContentSet.getAReadContent` expands these wildcards back to the specific - // contents when matching against stores. exists(node) and - (c.isAnyTupleElement() or c.isAnyDictionaryElement()) + defaultTaintReadContent(c) } private module Cached { @@ -171,28 +185,15 @@ predicate stringManipulation(DataFlow::CfgNode nodeFrom, DataFlow::CfgNode nodeT } /** - * Holds if taint can flow from `nodeFrom` to `nodeTo` with a step related to containers - * (lists/sets/dictionaries): literals, constructor invocation, methods. Note that this - * is currently very imprecise, as an example, since we model `dict.get`, we treat any - * `.get()` will be tainted, whether it's true or not. + * Holds if taint can flow from `nodeFrom` to `nodeTo` with a step related to reading + * content from containers (lists/sets/dictionaries/tuples): subscripts, iteration, + * constructor invocation, methods. */ predicate containerStep(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) { - // construction by literal - // - // TODO: once we have proper flow-summary modeling, we might not need this step any - // longer -- but there needs to be a matching read-step for the store-step, and we - // don't provide that right now. - DataFlowPrivate::listStoreStep(nodeFrom, _, nodeTo) - or - DataFlowPrivate::setStoreStep(nodeFrom, _, nodeTo) - or - // comprehension, so there is taint-flow from `x` in `[x for x in xs]` to the - // resulting list of the list-comprehension. - // - // TODO: once we have proper flow-summary modeling, we might not need this step any - // longer -- but there needs to be a matching read-step for the store-step, and we - // don't provide that right now. - DataFlowPrivate::yieldStoreStep(nodeFrom, _, nodeTo) + exists(DataFlow::ContentSet contentSet | + DataFlowPrivate::readStep(nodeFrom, contentSet, nodeTo) and + defaultTaintReadContent(contentSet) + ) } /**