Add wildcard ContentSets to avoid performance problems

This commit is contained in:
Owen Mansel-Chan
2026-05-27 15:28:07 +01:00
parent e8779295ee
commit ec13e1bcd3
7 changed files with 162 additions and 105 deletions

View File

@@ -753,7 +753,7 @@ predicate jumpStepNotSharedWithTypeTracker(Node nodeFrom, Node nodeTo) {
* As of 2024-04-02 the type-tracking library only supports precise content, so there is
* no reason to include steps for list content right now.
*/
predicate storeStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) {
predicate storeStepCommon(Node nodeFrom, Content c, Node nodeTo) {
tupleStoreStep(nodeFrom, c, nodeTo)
or
dictStoreStep(nodeFrom, c, nodeTo)
@@ -767,29 +767,31 @@ predicate storeStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) {
* Holds if data can flow from `nodeFrom` to `nodeTo` via an assignment to
* content `c`.
*/
predicate storeStep(Node nodeFrom, ContentSet c, Node nodeTo) {
storeStepCommon(nodeFrom, c, nodeTo)
predicate storeStep(Node nodeFrom, ContentSet cs, Node nodeTo) {
exists(Content c | cs = singleton(c) |
storeStepCommon(nodeFrom, c, nodeTo)
or
listStoreStep(nodeFrom, c, nodeTo)
or
setStoreStep(nodeFrom, c, nodeTo)
or
attributeStoreStep(nodeFrom, c, nodeTo)
or
matchStoreStep(nodeFrom, c, nodeTo)
or
any(Orm::AdditionalOrmSteps es).storeStep(nodeFrom, c, nodeTo)
or
synthStarArgsElementParameterNodeStoreStep(nodeFrom, c, nodeTo)
or
synthDictSplatArgumentNodeStoreStep(nodeFrom, c, nodeTo)
or
yieldStoreStep(nodeFrom, c, nodeTo)
or
VariableCapture::storeStep(nodeFrom, c, nodeTo)
)
or
listStoreStep(nodeFrom, c, nodeTo)
or
setStoreStep(nodeFrom, c, nodeTo)
or
attributeStoreStep(nodeFrom, c, nodeTo)
or
matchStoreStep(nodeFrom, c, nodeTo)
or
any(Orm::AdditionalOrmSteps es).storeStep(nodeFrom, c, nodeTo)
or
FlowSummaryImpl::Private::Steps::summaryStoreStep(nodeFrom.(FlowSummaryNode).getSummaryNode(), c,
FlowSummaryImpl::Private::Steps::summaryStoreStep(nodeFrom.(FlowSummaryNode).getSummaryNode(), cs,
nodeTo.(FlowSummaryNode).getSummaryNode())
or
synthStarArgsElementParameterNodeStoreStep(nodeFrom, c, nodeTo)
or
synthDictSplatArgumentNodeStoreStep(nodeFrom, c, nodeTo)
or
yieldStoreStep(nodeFrom, c, nodeTo)
or
VariableCapture::storeStep(nodeFrom, c, nodeTo)
}
/**
@@ -985,7 +987,7 @@ predicate attributeStoreStep(Node nodeFrom, AttributeContent c, Node nodeTo) {
/**
* Subset of `readStep` that should be shared with type-tracking.
*/
predicate readStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) {
predicate readStepCommon(Node nodeFrom, Content c, Node nodeTo) {
subscriptReadStep(nodeFrom, c, nodeTo)
or
iterableUnpackingReadStep(nodeFrom, c, nodeTo)
@@ -994,23 +996,25 @@ predicate readStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) {
/**
* Holds if data can flow from `nodeFrom` to `nodeTo` via a read of content `c`.
*/
predicate readStep(Node nodeFrom, ContentSet c, Node nodeTo) {
readStepCommon(nodeFrom, c, nodeTo)
predicate readStep(Node nodeFrom, ContentSet cs, Node nodeTo) {
exists(Content c | cs = singleton(c) |
readStepCommon(nodeFrom, c, nodeTo)
or
matchReadStep(nodeFrom, c, nodeTo)
or
forReadStep(nodeFrom, c, nodeTo)
or
attributeReadStep(nodeFrom, c, nodeTo)
or
synthDictSplatParameterNodeReadStep(nodeFrom, c, nodeTo)
or
VariableCapture::readStep(nodeFrom, c, nodeTo)
)
or
matchReadStep(nodeFrom, c, nodeTo)
or
forReadStep(nodeFrom, c, nodeTo)
or
attributeReadStep(nodeFrom, c, nodeTo)
or
FlowSummaryImpl::Private::Steps::summaryReadStep(nodeFrom.(FlowSummaryNode).getSummaryNode(), c,
FlowSummaryImpl::Private::Steps::summaryReadStep(nodeFrom.(FlowSummaryNode).getSummaryNode(), cs,
nodeTo.(FlowSummaryNode).getSummaryNode())
or
synthDictSplatParameterNodeReadStep(nodeFrom, c, nodeTo)
or
VariableCapture::readStep(nodeFrom, c, nodeTo)
or
Conversions::readStep(nodeFrom, c, nodeTo)
Conversions::readStep(nodeFrom, cs, nodeTo)
}
/** Data flows from a sequence to a subscript of the sequence. */
@@ -1074,11 +1078,7 @@ module Conversions {
nodeFrom = decoding.getAnInput() and
nodeTo = decoding.getOutput()
) and
(
c instanceof TupleElementContent
or
c instanceof DictionaryElementContent
)
(c.isAnyTupleElement() or c.isAnyDictionaryElement())
}
predicate encoderReadStep(Node nodeFrom, ContentSet c, Node nodeTo) {
@@ -1086,11 +1086,7 @@ module Conversions {
nodeFrom = encoding.getAnInput() and
nodeTo = encoding.getOutput()
) and
(
c instanceof TupleElementContent
or
c instanceof DictionaryElementContent
)
(c.isAnyTupleElement() or c.isAnyDictionaryElement())
}
predicate formatReadStep(Node nodeFrom, ContentSet c, Node nodeTo) {
@@ -1099,13 +1095,13 @@ module Conversions {
fmt.getOp() instanceof Mod and
fmt.getRight() = nodeFrom.asCfgNode()
) and
c instanceof TupleElementContent
c.isAnyTupleElement()
or
// format_map
// see https://docs.python.org/3/library/stdtypes.html#str.format_map
nodeTo.(MethodCallNode).calls(_, "format_map") and
nodeTo.(MethodCallNode).getArg(0) = nodeFrom and
c instanceof DictionaryElementContent
c.isAnyDictionaryElement()
}
predicate readStep(Node nodeFrom, ContentSet c, Node nodeTo) {
@@ -1122,18 +1118,20 @@ module Conversions {
* any value stored inside `f` is cleared at the pre-update node associated with `x`
* in `x.f = newValue`.
*/
predicate clearsContent(Node n, ContentSet c) {
matchClearStep(n, c)
predicate clearsContent(Node n, ContentSet cs) {
exists(Content c | cs = singleton(c) |
matchClearStep(n, c)
or
attributeClearStep(n, c)
or
dictClearStep(n, c)
or
dictSplatParameterNodeClearStep(n, c)
or
VariableCapture::clearsContent(n, c)
)
or
attributeClearStep(n, c)
or
dictClearStep(n, c)
or
FlowSummaryImpl::Private::Steps::summaryClearsContent(n.(FlowSummaryNode).getSummaryNode(), c)
or
dictSplatParameterNodeClearStep(n, c)
or
VariableCapture::clearsContent(n, c)
FlowSummaryImpl::Private::Steps::summaryClearsContent(n.(FlowSummaryNode).getSummaryNode(), cs)
}
/**

View File

@@ -898,19 +898,65 @@ class CapturedVariableContent extends Content, TCapturedVariableContent {
override string getMaDRepresentation() { none() }
}
/**
* An entity that represents a set of `Content`s.
*
* Most `ContentSet`s are singletons (i.e. they consist of a single `Content`),
* but `AnyDictionaryElement` and `AnyTupleElement` act as wildcards on the
* read side: a read at such a `ContentSet` matches any specific dictionary
* key / tuple index store, as well as (for dictionaries) the
* "unknown-bucket" Content `DictionaryElementAnyContent`.
*
* Keeping these as wildcard `ContentSet`s (rather than enumerating one
* `ContentSet` per key/index) keeps the dataflow `readSetEx` relation small
* when implicit reads are used (e.g. at sinks via `defaultImplicitTaintRead`).
*/
private newtype TContentSet =
TSingletonContent(Content c) or
TAnyTupleElement() or
TAnyDictionaryElement()
/**
* An entity that represents a set of `Content`s.
*
* The set may be interpreted differently depending on whether it is
* stored into (`getAStoreContent`) or read from (`getAReadContent`).
*/
class ContentSet instanceof Content {
class ContentSet extends TContentSet {
/** Holds if this content set is the singleton `{c}`. */
predicate isSingleton(Content c) { this = TSingletonContent(c) }
/** Holds if this content set is the wildcard for all tuple elements. */
predicate isAnyTupleElement() { this = TAnyTupleElement() }
/** Holds if this content set is the wildcard for all dictionary elements. */
predicate isAnyDictionaryElement() { this = TAnyDictionaryElement() }
/** Gets a content that may be stored into when storing into this set. */
Content getAStoreContent() { result = this }
Content getAStoreContent() { this = TSingletonContent(result) }
/** Gets a content that may be read from when reading from this set. */
Content getAReadContent() { result = this }
Content getAReadContent() {
this = TSingletonContent(result)
or
// Wildcard expansion: a read at "any tuple element" matches a store at any
// specific tuple index. (Stores always target a specific index, so we don't
// need a `TupleElementAnyContent` Content kind here.)
this = TAnyTupleElement() and result instanceof TupleElementContent
or
this = TAnyDictionaryElement() and
(result instanceof DictionaryElementContent or result instanceof DictionaryElementAnyContent)
}
/** Gets a textual representation of this content set. */
string toString() { result = super.toString() }
string toString() {
exists(Content c | this = TSingletonContent(c) | result = c.toString())
or
this = TAnyTupleElement() and result = "Any tuple element"
or
this = TAnyDictionaryElement() and result = "Any dictionary element"
}
}
/** Gets the singleton `ContentSet` wrapping the `Content` `c`. */
ContentSet singleton(Content c) { result = TSingletonContent(c) }

View File

@@ -66,21 +66,27 @@ module Input implements InputSig<Location, DataFlowImplSpecific::PythonDataFlow>
}
string encodeContent(ContentSet cs, string arg) {
cs = TListElementContent() and result = "ListElement" and arg = ""
or
cs = TSetElementContent() and result = "SetElement" and arg = ""
or
exists(int index |
cs = TTupleElementContent(index) and result = "TupleElement" and arg = index.toString()
exists(Content c | cs.isSingleton(c) |
c = TListElementContent() and result = "ListElement" and arg = ""
or
c = TSetElementContent() and result = "SetElement" and arg = ""
or
exists(int index |
c = TTupleElementContent(index) and result = "TupleElement" and arg = index.toString()
)
or
exists(string key |
c = TDictionaryElementContent(key) and result = "DictionaryElement" and arg = key
)
or
c = TDictionaryElementAnyContent() and result = "DictionaryElementAny" and arg = ""
or
exists(string attr | c = TAttributeContent(attr) and result = "Attribute" and arg = attr)
)
or
exists(string key |
cs = TDictionaryElementContent(key) and result = "DictionaryElement" and arg = key
)
cs.isAnyTupleElement() and result = "AnyTupleElement" and arg = ""
or
cs = TDictionaryElementAnyContent() and result = "DictionaryElementAny" and arg = ""
or
exists(string attr | cs = TAttributeContent(attr) and result = "Attribute" and arg = attr)
cs.isAnyDictionaryElement() and result = "AnyDictionaryElement" and arg = ""
}
bindingset[token]
@@ -139,27 +145,29 @@ module Private {
predicate withContent = SC::withContent/1;
/** Gets a summary component that represents a list element. */
SummaryComponent listElement() { result = content(any(ListElementContent c)) }
SummaryComponent listElement() { result = content(singleton(any(ListElementContent c))) }
/** Gets a summary component that represents a set element. */
SummaryComponent setElement() { result = content(any(SetElementContent c)) }
SummaryComponent setElement() { result = content(singleton(any(SetElementContent c))) }
/** Gets a summary component that represents a tuple element. */
SummaryComponent tupleElement(int index) {
exists(TupleElementContent c | c.getIndex() = index and result = content(c))
exists(TupleElementContent c | c.getIndex() = index and result = content(singleton(c)))
}
/** Gets a summary component that represents a dictionary element. */
SummaryComponent dictionaryElement(string key) {
exists(DictionaryElementContent c | c.getKey() = key and result = content(c))
exists(DictionaryElementContent c | c.getKey() = key and result = content(singleton(c)))
}
/** Gets a summary component that represents a dictionary element at any key. */
SummaryComponent dictionaryElementAny() { result = content(any(DictionaryElementAnyContent c)) }
SummaryComponent dictionaryElementAny() {
result = content(singleton(any(DictionaryElementAnyContent c)))
}
/** Gets a summary component that represents an attribute element. */
SummaryComponent attribute(string attr) {
exists(AttributeContent c | c.getAttribute() = attr and result = content(c))
exists(AttributeContent c | c.getAttribute() = attr and result = content(singleton(c)))
}
/** Gets a summary component that represents the return value of a call. */

View File

@@ -17,14 +17,15 @@ predicate defaultTaintSanitizer(DataFlow::Node node) { none() }
*/
bindingset[node]
predicate defaultImplicitTaintRead(DataFlow::Node node, DataFlow::ContentSet c) {
// We allow implicit reads of precise content
// imprecise content has already bubled up.
// We allow implicit reads of precise content; imprecise content has already
// bubbled up. We use the wildcard content sets here rather than the
// per-key/per-index ones to avoid blowing up the size of `Stage1::readSetEx`
// (otherwise this predicate would expand to one row per (node, distinct key
// or index) and the framework's read-set relation grows quadratically).
// `ContentSet.getAReadContent` expands these wildcards back to the specific
// contents when matching against stores.
exists(node) and
(
c instanceof DataFlow::TupleElementContent
or
c instanceof DataFlow::DictionaryElementContent
)
(c.isAnyTupleElement() or c.isAnyDictionaryElement())
}
private module Cached {

View File

@@ -241,7 +241,7 @@ module TypeTrackingInput implements Shared::TypeTrackingInput<Location> {
// is only fed set/list content)
not nodeFrom instanceof DataFlowPublic::IterableElementNode
or
TypeTrackerSummaryFlow::basicStoreStep(nodeFrom, nodeTo, content)
TypeTrackerSummaryFlow::basicStoreStep(nodeFrom, nodeTo, DataFlowPublic::singleton(content))
}
/**
@@ -272,14 +272,15 @@ module TypeTrackingInput implements Shared::TypeTrackingInput<Location> {
nodeFrom.asCfgNode() instanceof SequenceNode
)
or
TypeTrackerSummaryFlow::basicLoadStep(nodeFrom, nodeTo, content)
TypeTrackerSummaryFlow::basicLoadStep(nodeFrom, nodeTo, DataFlowPublic::singleton(content))
}
/**
* Holds if the `loadContent` of `nodeFrom` is stored in the `storeContent` of `nodeTo`.
*/
predicate loadStoreStep(Node nodeFrom, Node nodeTo, Content loadContent, Content storeContent) {
TypeTrackerSummaryFlow::basicLoadStoreStep(nodeFrom, nodeTo, loadContent, storeContent)
TypeTrackerSummaryFlow::basicLoadStoreStep(nodeFrom, nodeTo,
DataFlowPublic::singleton(loadContent), DataFlowPublic::singleton(storeContent))
}
/**

View File

@@ -61,10 +61,13 @@ module EscapingCaptureFlowConfig implements DataFlow::ConfigSig {
predicate allowImplicitRead(DataFlow::Node node, DataFlow::ContentSet cs) {
isSink(node) and
(
cs.(DataFlow::TupleElementContent).getIndex() in [0 .. 10] or
cs instanceof DataFlow::ListElementContent or
cs instanceof DataFlow::SetElementContent or
cs instanceof DataFlow::DictionaryElementAnyContent
cs.isAnyTupleElement()
or
cs.isAnyDictionaryElement()
or
cs.getAStoreContent() instanceof DataFlow::ListElementContent
or
cs.getAStoreContent() instanceof DataFlow::SetElementContent
)
}
}

View File

@@ -6,16 +6,16 @@ pat = ... # some pattern
compiled_pat = re.compile(pat)
# see https://docs.python.org/3/library/re.html#functions
ensure_not_tainted(
# returns Match object, which is tested properly below. (note: with the flow summary
# modeling, objects containing tainted values are not themselves tainted).
re.search(pat, ts),
re.match(pat, ts),
re.fullmatch(pat, ts),
ensure_tainted(
# returns Match object, which is tested properly below. (note: the match objects contain
# tainted values but are not themselves tainted - this test relies on implicit reads at sinks).
re.search(pat, ts), # $ tainted
re.match(pat, ts), # $ tainted
re.fullmatch(pat, ts), # $ tainted
compiled_pat.search(ts),
compiled_pat.match(ts),
compiled_pat.fullmatch(ts),
compiled_pat.search(ts), # $ tainted
compiled_pat.match(ts), # $ tainted
compiled_pat.fullmatch(ts), # $ tainted
)
# Match object