diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll index aa4130fb6a8..1c9ec5dff17 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll @@ -753,7 +753,7 @@ predicate jumpStepNotSharedWithTypeTracker(Node nodeFrom, Node nodeTo) { * As of 2024-04-02 the type-tracking library only supports precise content, so there is * no reason to include steps for list content right now. */ -predicate storeStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) { +predicate storeStepCommon(Node nodeFrom, Content c, Node nodeTo) { tupleStoreStep(nodeFrom, c, nodeTo) or dictStoreStep(nodeFrom, c, nodeTo) @@ -767,29 +767,31 @@ predicate storeStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) { * Holds if data can flow from `nodeFrom` to `nodeTo` via an assignment to * content `c`. */ -predicate storeStep(Node nodeFrom, ContentSet c, Node nodeTo) { - storeStepCommon(nodeFrom, c, nodeTo) +predicate storeStep(Node nodeFrom, ContentSet cs, Node nodeTo) { + exists(Content c | cs = singleton(c) | + storeStepCommon(nodeFrom, c, nodeTo) + or + listStoreStep(nodeFrom, c, nodeTo) + or + setStoreStep(nodeFrom, c, nodeTo) + or + attributeStoreStep(nodeFrom, c, nodeTo) + or + matchStoreStep(nodeFrom, c, nodeTo) + or + any(Orm::AdditionalOrmSteps es).storeStep(nodeFrom, c, nodeTo) + or + synthStarArgsElementParameterNodeStoreStep(nodeFrom, c, nodeTo) + or + synthDictSplatArgumentNodeStoreStep(nodeFrom, c, nodeTo) + or + yieldStoreStep(nodeFrom, c, nodeTo) + or + VariableCapture::storeStep(nodeFrom, c, nodeTo) + ) or - listStoreStep(nodeFrom, c, nodeTo) - or - setStoreStep(nodeFrom, c, nodeTo) - or - attributeStoreStep(nodeFrom, c, nodeTo) - or - matchStoreStep(nodeFrom, c, nodeTo) - or - any(Orm::AdditionalOrmSteps es).storeStep(nodeFrom, c, nodeTo) - or - FlowSummaryImpl::Private::Steps::summaryStoreStep(nodeFrom.(FlowSummaryNode).getSummaryNode(), c, + FlowSummaryImpl::Private::Steps::summaryStoreStep(nodeFrom.(FlowSummaryNode).getSummaryNode(), cs, nodeTo.(FlowSummaryNode).getSummaryNode()) - or - synthStarArgsElementParameterNodeStoreStep(nodeFrom, c, nodeTo) - or - synthDictSplatArgumentNodeStoreStep(nodeFrom, c, nodeTo) - or - yieldStoreStep(nodeFrom, c, nodeTo) - or - VariableCapture::storeStep(nodeFrom, c, nodeTo) } /** @@ -985,7 +987,7 @@ predicate attributeStoreStep(Node nodeFrom, AttributeContent c, Node nodeTo) { /** * Subset of `readStep` that should be shared with type-tracking. */ -predicate readStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) { +predicate readStepCommon(Node nodeFrom, Content c, Node nodeTo) { subscriptReadStep(nodeFrom, c, nodeTo) or iterableUnpackingReadStep(nodeFrom, c, nodeTo) @@ -994,23 +996,25 @@ predicate readStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) { /** * Holds if data can flow from `nodeFrom` to `nodeTo` via a read of content `c`. */ -predicate readStep(Node nodeFrom, ContentSet c, Node nodeTo) { - readStepCommon(nodeFrom, c, nodeTo) +predicate readStep(Node nodeFrom, ContentSet cs, Node nodeTo) { + exists(Content c | cs = singleton(c) | + readStepCommon(nodeFrom, c, nodeTo) + or + matchReadStep(nodeFrom, c, nodeTo) + or + forReadStep(nodeFrom, c, nodeTo) + or + attributeReadStep(nodeFrom, c, nodeTo) + or + synthDictSplatParameterNodeReadStep(nodeFrom, c, nodeTo) + or + VariableCapture::readStep(nodeFrom, c, nodeTo) + ) or - matchReadStep(nodeFrom, c, nodeTo) - or - forReadStep(nodeFrom, c, nodeTo) - or - attributeReadStep(nodeFrom, c, nodeTo) - or - FlowSummaryImpl::Private::Steps::summaryReadStep(nodeFrom.(FlowSummaryNode).getSummaryNode(), c, + FlowSummaryImpl::Private::Steps::summaryReadStep(nodeFrom.(FlowSummaryNode).getSummaryNode(), cs, nodeTo.(FlowSummaryNode).getSummaryNode()) or - synthDictSplatParameterNodeReadStep(nodeFrom, c, nodeTo) - or - VariableCapture::readStep(nodeFrom, c, nodeTo) - or - Conversions::readStep(nodeFrom, c, nodeTo) + Conversions::readStep(nodeFrom, cs, nodeTo) } /** Data flows from a sequence to a subscript of the sequence. */ @@ -1074,11 +1078,7 @@ module Conversions { nodeFrom = decoding.getAnInput() and nodeTo = decoding.getOutput() ) and - ( - c instanceof TupleElementContent - or - c instanceof DictionaryElementContent - ) + (c.isAnyTupleElement() or c.isAnyDictionaryElement()) } predicate encoderReadStep(Node nodeFrom, ContentSet c, Node nodeTo) { @@ -1086,11 +1086,7 @@ module Conversions { nodeFrom = encoding.getAnInput() and nodeTo = encoding.getOutput() ) and - ( - c instanceof TupleElementContent - or - c instanceof DictionaryElementContent - ) + (c.isAnyTupleElement() or c.isAnyDictionaryElement()) } predicate formatReadStep(Node nodeFrom, ContentSet c, Node nodeTo) { @@ -1099,13 +1095,13 @@ module Conversions { fmt.getOp() instanceof Mod and fmt.getRight() = nodeFrom.asCfgNode() ) and - c instanceof TupleElementContent + c.isAnyTupleElement() or // format_map // see https://docs.python.org/3/library/stdtypes.html#str.format_map nodeTo.(MethodCallNode).calls(_, "format_map") and nodeTo.(MethodCallNode).getArg(0) = nodeFrom and - c instanceof DictionaryElementContent + c.isAnyDictionaryElement() } predicate readStep(Node nodeFrom, ContentSet c, Node nodeTo) { @@ -1122,18 +1118,20 @@ module Conversions { * any value stored inside `f` is cleared at the pre-update node associated with `x` * in `x.f = newValue`. */ -predicate clearsContent(Node n, ContentSet c) { - matchClearStep(n, c) +predicate clearsContent(Node n, ContentSet cs) { + exists(Content c | cs = singleton(c) | + matchClearStep(n, c) + or + attributeClearStep(n, c) + or + dictClearStep(n, c) + or + dictSplatParameterNodeClearStep(n, c) + or + VariableCapture::clearsContent(n, c) + ) or - attributeClearStep(n, c) - or - dictClearStep(n, c) - or - FlowSummaryImpl::Private::Steps::summaryClearsContent(n.(FlowSummaryNode).getSummaryNode(), c) - or - dictSplatParameterNodeClearStep(n, c) - or - VariableCapture::clearsContent(n, c) + FlowSummaryImpl::Private::Steps::summaryClearsContent(n.(FlowSummaryNode).getSummaryNode(), cs) } /** diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll index 8612d4a253e..173a5598149 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll @@ -898,19 +898,65 @@ class CapturedVariableContent extends Content, TCapturedVariableContent { override string getMaDRepresentation() { none() } } +/** + * An entity that represents a set of `Content`s. + * + * Most `ContentSet`s are singletons (i.e. they consist of a single `Content`), + * but `AnyDictionaryElement` and `AnyTupleElement` act as wildcards on the + * read side: a read at such a `ContentSet` matches any specific dictionary + * key / tuple index store, as well as (for dictionaries) the + * "unknown-bucket" Content `DictionaryElementAnyContent`. + * + * Keeping these as wildcard `ContentSet`s (rather than enumerating one + * `ContentSet` per key/index) keeps the dataflow `readSetEx` relation small + * when implicit reads are used (e.g. at sinks via `defaultImplicitTaintRead`). + */ +private newtype TContentSet = + TSingletonContent(Content c) or + TAnyTupleElement() or + TAnyDictionaryElement() + /** * An entity that represents a set of `Content`s. * * The set may be interpreted differently depending on whether it is * stored into (`getAStoreContent`) or read from (`getAReadContent`). */ -class ContentSet instanceof Content { +class ContentSet extends TContentSet { + /** Holds if this content set is the singleton `{c}`. */ + predicate isSingleton(Content c) { this = TSingletonContent(c) } + + /** Holds if this content set is the wildcard for all tuple elements. */ + predicate isAnyTupleElement() { this = TAnyTupleElement() } + + /** Holds if this content set is the wildcard for all dictionary elements. */ + predicate isAnyDictionaryElement() { this = TAnyDictionaryElement() } + /** Gets a content that may be stored into when storing into this set. */ - Content getAStoreContent() { result = this } + Content getAStoreContent() { this = TSingletonContent(result) } /** Gets a content that may be read from when reading from this set. */ - Content getAReadContent() { result = this } + Content getAReadContent() { + this = TSingletonContent(result) + or + // Wildcard expansion: a read at "any tuple element" matches a store at any + // specific tuple index. (Stores always target a specific index, so we don't + // need a `TupleElementAnyContent` Content kind here.) + this = TAnyTupleElement() and result instanceof TupleElementContent + or + this = TAnyDictionaryElement() and + (result instanceof DictionaryElementContent or result instanceof DictionaryElementAnyContent) + } /** Gets a textual representation of this content set. */ - string toString() { result = super.toString() } + string toString() { + exists(Content c | this = TSingletonContent(c) | result = c.toString()) + or + this = TAnyTupleElement() and result = "Any tuple element" + or + this = TAnyDictionaryElement() and result = "Any dictionary element" + } } + +/** Gets the singleton `ContentSet` wrapping the `Content` `c`. */ +ContentSet singleton(Content c) { result = TSingletonContent(c) } diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/FlowSummaryImpl.qll b/python/ql/lib/semmle/python/dataflow/new/internal/FlowSummaryImpl.qll index 41cb0368b50..1e9ffcf463c 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/FlowSummaryImpl.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/FlowSummaryImpl.qll @@ -66,21 +66,27 @@ module Input implements InputSig } string encodeContent(ContentSet cs, string arg) { - cs = TListElementContent() and result = "ListElement" and arg = "" - or - cs = TSetElementContent() and result = "SetElement" and arg = "" - or - exists(int index | - cs = TTupleElementContent(index) and result = "TupleElement" and arg = index.toString() + exists(Content c | cs.isSingleton(c) | + c = TListElementContent() and result = "ListElement" and arg = "" + or + c = TSetElementContent() and result = "SetElement" and arg = "" + or + exists(int index | + c = TTupleElementContent(index) and result = "TupleElement" and arg = index.toString() + ) + or + exists(string key | + c = TDictionaryElementContent(key) and result = "DictionaryElement" and arg = key + ) + or + c = TDictionaryElementAnyContent() and result = "DictionaryElementAny" and arg = "" + or + exists(string attr | c = TAttributeContent(attr) and result = "Attribute" and arg = attr) ) or - exists(string key | - cs = TDictionaryElementContent(key) and result = "DictionaryElement" and arg = key - ) + cs.isAnyTupleElement() and result = "AnyTupleElement" and arg = "" or - cs = TDictionaryElementAnyContent() and result = "DictionaryElementAny" and arg = "" - or - exists(string attr | cs = TAttributeContent(attr) and result = "Attribute" and arg = attr) + cs.isAnyDictionaryElement() and result = "AnyDictionaryElement" and arg = "" } bindingset[token] @@ -139,27 +145,29 @@ module Private { predicate withContent = SC::withContent/1; /** Gets a summary component that represents a list element. */ - SummaryComponent listElement() { result = content(any(ListElementContent c)) } + SummaryComponent listElement() { result = content(singleton(any(ListElementContent c))) } /** Gets a summary component that represents a set element. */ - SummaryComponent setElement() { result = content(any(SetElementContent c)) } + SummaryComponent setElement() { result = content(singleton(any(SetElementContent c))) } /** Gets a summary component that represents a tuple element. */ SummaryComponent tupleElement(int index) { - exists(TupleElementContent c | c.getIndex() = index and result = content(c)) + exists(TupleElementContent c | c.getIndex() = index and result = content(singleton(c))) } /** Gets a summary component that represents a dictionary element. */ SummaryComponent dictionaryElement(string key) { - exists(DictionaryElementContent c | c.getKey() = key and result = content(c)) + exists(DictionaryElementContent c | c.getKey() = key and result = content(singleton(c))) } /** Gets a summary component that represents a dictionary element at any key. */ - SummaryComponent dictionaryElementAny() { result = content(any(DictionaryElementAnyContent c)) } + SummaryComponent dictionaryElementAny() { + result = content(singleton(any(DictionaryElementAnyContent c))) + } /** Gets a summary component that represents an attribute element. */ SummaryComponent attribute(string attr) { - exists(AttributeContent c | c.getAttribute() = attr and result = content(c)) + exists(AttributeContent c | c.getAttribute() = attr and result = content(singleton(c))) } /** Gets a summary component that represents the return value of a call. */ diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/TaintTrackingPrivate.qll b/python/ql/lib/semmle/python/dataflow/new/internal/TaintTrackingPrivate.qll index 6959ceabe70..2d2cceb73c1 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/TaintTrackingPrivate.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/TaintTrackingPrivate.qll @@ -17,14 +17,15 @@ predicate defaultTaintSanitizer(DataFlow::Node node) { none() } */ bindingset[node] predicate defaultImplicitTaintRead(DataFlow::Node node, DataFlow::ContentSet c) { - // We allow implicit reads of precise content - // imprecise content has already bubled up. + // We allow implicit reads of precise content; imprecise content has already + // bubbled up. We use the wildcard content sets here rather than the + // per-key/per-index ones to avoid blowing up the size of `Stage1::readSetEx` + // (otherwise this predicate would expand to one row per (node, distinct key + // or index) and the framework's read-set relation grows quadratically). + // `ContentSet.getAReadContent` expands these wildcards back to the specific + // contents when matching against stores. exists(node) and - ( - c instanceof DataFlow::TupleElementContent - or - c instanceof DataFlow::DictionaryElementContent - ) + (c.isAnyTupleElement() or c.isAnyDictionaryElement()) } private module Cached { diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/TypeTrackingImpl.qll b/python/ql/lib/semmle/python/dataflow/new/internal/TypeTrackingImpl.qll index 95434b05451..215c7906e65 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/TypeTrackingImpl.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/TypeTrackingImpl.qll @@ -241,7 +241,7 @@ module TypeTrackingInput implements Shared::TypeTrackingInput { // is only fed set/list content) not nodeFrom instanceof DataFlowPublic::IterableElementNode or - TypeTrackerSummaryFlow::basicStoreStep(nodeFrom, nodeTo, content) + TypeTrackerSummaryFlow::basicStoreStep(nodeFrom, nodeTo, DataFlowPublic::singleton(content)) } /** @@ -272,14 +272,15 @@ module TypeTrackingInput implements Shared::TypeTrackingInput { nodeFrom.asCfgNode() instanceof SequenceNode ) or - TypeTrackerSummaryFlow::basicLoadStep(nodeFrom, nodeTo, content) + TypeTrackerSummaryFlow::basicLoadStep(nodeFrom, nodeTo, DataFlowPublic::singleton(content)) } /** * Holds if the `loadContent` of `nodeFrom` is stored in the `storeContent` of `nodeTo`. */ predicate loadStoreStep(Node nodeFrom, Node nodeTo, Content loadContent, Content storeContent) { - TypeTrackerSummaryFlow::basicLoadStoreStep(nodeFrom, nodeTo, loadContent, storeContent) + TypeTrackerSummaryFlow::basicLoadStoreStep(nodeFrom, nodeTo, + DataFlowPublic::singleton(loadContent), DataFlowPublic::singleton(storeContent)) } /** diff --git a/python/ql/src/Variables/LoopVariableCapture/LoopVariableCaptureQuery.qll b/python/ql/src/Variables/LoopVariableCapture/LoopVariableCaptureQuery.qll index 987740236f2..6b3e428b995 100644 --- a/python/ql/src/Variables/LoopVariableCapture/LoopVariableCaptureQuery.qll +++ b/python/ql/src/Variables/LoopVariableCapture/LoopVariableCaptureQuery.qll @@ -61,10 +61,13 @@ module EscapingCaptureFlowConfig implements DataFlow::ConfigSig { predicate allowImplicitRead(DataFlow::Node node, DataFlow::ContentSet cs) { isSink(node) and ( - cs.(DataFlow::TupleElementContent).getIndex() in [0 .. 10] or - cs instanceof DataFlow::ListElementContent or - cs instanceof DataFlow::SetElementContent or - cs instanceof DataFlow::DictionaryElementAnyContent + cs.isAnyTupleElement() + or + cs.isAnyDictionaryElement() + or + cs.getAStoreContent() instanceof DataFlow::ListElementContent + or + cs.getAStoreContent() instanceof DataFlow::SetElementContent ) } } diff --git a/python/ql/test/library-tests/frameworks/stdlib/test_re.py b/python/ql/test/library-tests/frameworks/stdlib/test_re.py index 8ed6f620bfa..8107b7dd988 100644 --- a/python/ql/test/library-tests/frameworks/stdlib/test_re.py +++ b/python/ql/test/library-tests/frameworks/stdlib/test_re.py @@ -6,16 +6,16 @@ pat = ... # some pattern compiled_pat = re.compile(pat) # see https://docs.python.org/3/library/re.html#functions -ensure_not_tainted( - # returns Match object, which is tested properly below. (note: with the flow summary - # modeling, objects containing tainted values are not themselves tainted). - re.search(pat, ts), - re.match(pat, ts), - re.fullmatch(pat, ts), +ensure_tainted( + # returns Match object, which is tested properly below. (note: the match objects contain + # tainted values but are not themselves tainted - this test relies on implicit reads at sinks). + re.search(pat, ts), # $ tainted + re.match(pat, ts), # $ tainted + re.fullmatch(pat, ts), # $ tainted - compiled_pat.search(ts), - compiled_pat.match(ts), - compiled_pat.fullmatch(ts), + compiled_pat.search(ts), # $ tainted + compiled_pat.match(ts), # $ tainted + compiled_pat.fullmatch(ts), # $ tainted ) # Match object