Merge pull request #15711 from RasmusWL/tt-content

Python: Add type tracking for content
This commit is contained in:
yoff
2024-04-09 10:37:43 +02:00
committed by GitHub
13 changed files with 360 additions and 84 deletions

View File

@@ -5,9 +5,14 @@
private import internal.TypeTrackingImpl as Impl
import Impl::Shared::TypeTracking<Impl::TypeTrackingInput>
private import semmle.python.dataflow.new.internal.DataFlowPublic as DataFlowPublic
/** A string that may appear as the name of an attribute or access path. */
class AttributeName = Impl::TypeTrackingInput::Content;
/**
* DEPRECATED.
*
* A string that may appear as the name of an attribute or access path.
*/
deprecated class AttributeName = Impl::TypeTrackingInput::Content;
/**
* A summary of the steps needed to track a value to a given dataflow node.
@@ -40,7 +45,11 @@ class TypeTracker extends Impl::TypeTracker {
* Holds if this is the starting point of type tracking, and the value starts in the attribute named `attrName`.
* The type tracking only ends after the attribute has been loaded.
*/
predicate startInAttr(string attrName) { this.startInContent(attrName) }
predicate startInAttr(string attrName) {
exists(DataFlowPublic::AttributeContent content | content.getAttribute() = attrName |
this.startInContent(content)
)
}
/**
* INTERNAL. DO NOT USE.
@@ -48,9 +57,8 @@ class TypeTracker extends Impl::TypeTracker {
* Gets the attribute associated with this type tracker.
*/
string getAttr() {
result = this.getContent().asSome()
or
this.getContent().isNone() and
result = ""
if this.getContent().asSome() instanceof DataFlowPublic::AttributeContent
then result = this.getContent().asSome().(DataFlowPublic::AttributeContent).getAttribute()
else result = ""
}
}

View File

@@ -642,23 +642,37 @@ predicate jumpStepNotSharedWithTypeTracker(Node nodeFrom, Node nodeTo) {
// Field flow
//--------
/**
* Holds if data can flow from `nodeFrom` to `nodeTo` via an assignment to
* content `c`.
* Subset of `storeStep` that should be shared with type-tracking.
*
* NOTE: This does not include attributeStoreStep right now, since it has its own
* modeling in the type-tracking library (which is slightly different due to
* PostUpdateNodes).
*
* As of 2024-04-02 the type-tracking library only supports precise content, so there is
* no reason to include steps for list content right now.
*/
predicate storeStep(Node nodeFrom, ContentSet c, Node nodeTo) {
listStoreStep(nodeFrom, c, nodeTo)
or
setStoreStep(nodeFrom, c, nodeTo)
or
predicate storeStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) {
tupleStoreStep(nodeFrom, c, nodeTo)
or
dictStoreStep(nodeFrom, c, nodeTo)
or
moreDictStoreSteps(nodeFrom, c, nodeTo)
or
comprehensionStoreStep(nodeFrom, c, nodeTo)
or
iterableUnpackingStoreStep(nodeFrom, c, nodeTo)
}
/**
* Holds if data can flow from `nodeFrom` to `nodeTo` via an assignment to
* content `c`.
*/
predicate storeStep(Node nodeFrom, ContentSet c, Node nodeTo) {
storeStepCommon(nodeFrom, c, nodeTo)
or
listStoreStep(nodeFrom, c, nodeTo)
or
setStoreStep(nodeFrom, c, nodeTo)
or
comprehensionStoreStep(nodeFrom, c, nodeTo)
or
attributeStoreStep(nodeFrom, c, nodeTo)
or
@@ -892,12 +906,19 @@ predicate attributeStoreStep(Node nodeFrom, AttributeContent c, Node nodeTo) {
}
/**
* Holds if data can flow from `nodeFrom` to `nodeTo` via a read of content `c`.
* Subset of `readStep` that should be shared with type-tracking.
*/
predicate readStep(Node nodeFrom, ContentSet c, Node nodeTo) {
predicate readStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) {
subscriptReadStep(nodeFrom, c, nodeTo)
or
iterableUnpackingReadStep(nodeFrom, c, nodeTo)
}
/**
* Holds if data can flow from `nodeFrom` to `nodeTo` via a read of content `c`.
*/
predicate readStep(Node nodeFrom, ContentSet c, Node nodeTo) {
readStepCommon(nodeFrom, c, nodeTo)
or
matchReadStep(nodeFrom, c, nodeTo)
or

View File

@@ -1,6 +1,7 @@
/** Step Summaries and Type Tracking */
private import TypeTrackerSpecific
private import semmle.python.dataflow.new.internal.DataFlowPublic as DataFlowPublic
cached
private module Cached {
@@ -12,10 +13,22 @@ private module Cached {
LevelStep() or
CallStep() or
ReturnStep() or
deprecated StoreStep(TypeTrackerContent content) { basicStoreStep(_, _, content) } or
deprecated LoadStep(TypeTrackerContent content) { basicLoadStep(_, _, content) } or
deprecated StoreStep(TypeTrackerContent content) {
exists(DataFlowPublic::AttributeContent dfc | dfc.getAttribute() = content |
basicStoreStep(_, _, dfc)
)
} or
deprecated LoadStep(TypeTrackerContent content) {
exists(DataFlowPublic::AttributeContent dfc | dfc.getAttribute() = content |
basicLoadStep(_, _, dfc)
)
} or
deprecated LoadStoreStep(TypeTrackerContent load, TypeTrackerContent store) {
basicLoadStoreStep(_, _, load, store)
exists(DataFlowPublic::AttributeContent dfcLoad, DataFlowPublic::AttributeContent dfcStore |
dfcLoad.getAttribute() = load and dfcStore.getAttribute() = store
|
basicLoadStoreStep(_, _, dfcLoad, dfcStore)
)
} or
deprecated WithContent(ContentFilter filter) { basicWithContentStep(_, _, filter) } or
deprecated WithoutContent(ContentFilter filter) { basicWithoutContentStep(_, _, filter) } or
@@ -29,13 +42,13 @@ private module Cached {
// Restrict `content` to those that might eventually match a load.
// We can't rely on `basicStoreStep` since `startInContent` might be used with
// a content that has no corresponding store.
exists(TypeTrackerContent loadContents |
exists(DataFlowPublic::AttributeContent loadContents |
(
basicLoadStep(_, _, loadContents)
or
basicLoadStoreStep(_, _, loadContents, _)
) and
compatibleContents(content, loadContents)
compatibleContents(content, loadContents.getAttribute())
)
}
@@ -45,13 +58,13 @@ private module Cached {
content = noContent()
or
// As in MkTypeTracker, restrict `content` to those that might eventually match a store.
exists(TypeTrackerContent storeContent |
exists(DataFlowPublic::AttributeContent storeContent |
(
basicStoreStep(_, _, storeContent)
or
basicLoadStoreStep(_, _, _, storeContent)
) and
compatibleContents(storeContent, content)
compatibleContents(storeContent.getAttribute(), content)
)
}
@@ -198,7 +211,10 @@ private module Cached {
flowsToStoreStep(nodeFrom, nodeTo, content) and
summary = StoreStep(content)
or
basicLoadStep(nodeFrom, nodeTo, content) and summary = LoadStep(content)
exists(DataFlowPublic::AttributeContent dfc | dfc.getAttribute() = content |
basicLoadStep(nodeFrom, nodeTo, dfc)
) and
summary = LoadStep(content)
)
or
exists(TypeTrackerContent loadContent, TypeTrackerContent storeContent |
@@ -281,7 +297,12 @@ deprecated private predicate smallstepProj(Node nodeFrom, StepSummary summary) {
deprecated private predicate flowsToStoreStep(
Node nodeFrom, TypeTrackingNode nodeTo, TypeTrackerContent content
) {
exists(Node obj | nodeTo.flowsTo(obj) and basicStoreStep(nodeFrom, obj, content))
exists(Node obj |
nodeTo.flowsTo(obj) and
exists(DataFlowPublic::AttributeContent dfc | dfc.getAttribute() = content |
basicStoreStep(nodeFrom, obj, dfc)
)
)
}
/**
@@ -292,7 +313,12 @@ deprecated private predicate flowsToLoadStoreStep(
TypeTrackerContent storeContent
) {
exists(Node obj |
nodeTo.flowsTo(obj) and basicLoadStoreStep(nodeFrom, obj, loadContent, storeContent)
nodeTo.flowsTo(obj) and
exists(DataFlowPublic::AttributeContent loadDfc, DataFlowPublic::AttributeContent storeDfc |
loadDfc.getAttribute() = loadContent and storeDfc.getAttribute() = storeContent
|
basicLoadStoreStep(nodeFrom, obj, loadDfc, storeDfc)
)
)
}

View File

@@ -15,7 +15,7 @@ deprecated class OptionalTypeTrackerContent extends string {
OptionalTypeTrackerContent() {
this = ""
or
this instanceof TypeTrackingImpl::TypeTrackingInput::Content
this = any(DataFlowPublic::AttributeContent dfc).getAttribute()
}
}

View File

@@ -8,6 +8,7 @@ private import semmle.python.dataflow.new.internal.DataFlowPrivate as DataFlowPr
private import codeql.typetracking.internal.SummaryTypeTracker as SummaryTypeTracker
private import semmle.python.dataflow.new.internal.FlowSummaryImpl as FlowSummaryImpl
private import semmle.python.dataflow.new.internal.DataFlowDispatch as DataFlowDispatch
private import semmle.python.dataflow.new.internal.IterableUnpacking as IterableUnpacking
private module SummaryTypeTrackerInput implements SummaryTypeTracker::Input {
// Dataflow nodes
@@ -97,24 +98,25 @@ private module SummaryTypeTrackerInput implements SummaryTypeTracker::Input {
private module TypeTrackerSummaryFlow = SummaryTypeTracker::SummaryFlow<SummaryTypeTrackerInput>;
/**
* Gets the name of a possible piece of content. For Python, this is currently only attribute names,
* using the name of the attribute for the corresponding content.
*/
private string getPossibleContentName() {
Stages::TypeTracking::ref() and // the TypeTracking::append() etc. predicates that we want to cache depend on this predicate, so we can place the `ref()` call here to get around identical files.
result = any(DataFlowPublic::AttrRef a).getAttributeName()
}
module TypeTrackingInput implements Shared::TypeTrackingInput {
class Node = DataFlowPublic::Node;
class LocalSourceNode = DataFlowPublic::LocalSourceNode;
class Content instanceof string {
Content() { this = getPossibleContentName() }
string toString() { result = this }
class Content extends DataFlowPublic::Content {
Content() {
// TODO: for now, it's not 100% clear if we should support non-precise content in
// type-tracking, or if it will lead to bad results. We start with only allowing
// precise content, which should always be a good improvement! It also simplifies
// the process of examining new results from non-precise content steps in the
// future, since you will _only_ have to look over the results from the new
// non-precise steps.
this instanceof DataFlowPublic::AttributeContent
or
this instanceof DataFlowPublic::DictionaryElementContent
or
this instanceof DataFlowPublic::TupleElementContent
}
}
/**
@@ -134,7 +136,27 @@ module TypeTrackingInput implements Shared::TypeTrackingInput {
}
/** Holds if there is a simple local flow step from `nodeFrom` to `nodeTo` */
predicate simpleLocalSmallStep = DataFlowPrivate::simpleLocalFlowStepForTypetracking/2;
predicate simpleLocalSmallStep(Node nodeFrom, Node nodeTo) {
DataFlowPrivate::simpleLocalFlowStepForTypetracking(nodeFrom, nodeTo) and
// for `for k,v in foo` there is no need for a local flow step from the synthetic
// sequence node for `k,v` to the tuple `k,v` -- since type-tracking only supports one
// level of content tracking, and one read-step from `foo` to the synthetic sequence
// node is already required, we can skip the flow step from the synthetic sequence
// node to the tuple itself, since the read-step from the tuple to the tuple elements
// will not matter.
not (
IterableUnpacking::iterableUnpackingForReadStep(_, _, nodeFrom) and
IterableUnpacking::iterableUnpackingTupleFlowStep(nodeFrom, nodeTo)
) and
// for nested iterable unpacking, such as `[[a]] = foo` or `((a,b),) = bar`, we can
// ignore the flow steps from the synthetic sequence node to the real sequence node,
// since we only support one level of content in type-trackers, and the nested
// structure requires two levels at least to be useful.
not exists(SequenceNode outer |
outer.getAnElement() = nodeTo.asCfgNode() and
IterableUnpacking::iterableUnpackingTupleFlowStep(nodeFrom, nodeTo)
)
}
/** Holds if there is a level step from `nodeFrom` to `nodeTo`, which may depend on the call graph. */
predicate levelStepCall(Node nodeFrom, LocalSourceNode nodeTo) { none() }
@@ -181,46 +203,68 @@ module TypeTrackingInput implements Shared::TypeTrackingInput {
* Holds if `nodeFrom` is being written to the `content` content of the object in `nodeTo`.
*/
predicate storeStep(Node nodeFrom, Node nodeTo, Content content) {
exists(DataFlowPublic::AttrWrite a |
a.mayHaveAttributeName(content) and
exists(DataFlowPublic::AttrWrite a, string attrName |
content.(DataFlowPublic::AttributeContent).getAttribute() = attrName and
a.mayHaveAttributeName(attrName) and
nodeFrom = a.getValue() and
nodeTo = a.getObject()
)
or
exists(DataFlowPublic::ContentSet contents |
contents.(DataFlowPublic::AttributeContent).getAttribute() = content
// type-tracking doesn't really handle PostUpdateNodes, so for some assignment steps
// like `my_dict["foo"] = foo` the data-flow step targets the PostUpdateNode for
// `my_dict`, where we want to translate that into a type-tracking step that targets
// the normal/non-PostUpdateNode for `my_dict`.
exists(DataFlowPublic::Node storeTarget |
DataFlowPrivate::storeStepCommon(nodeFrom, content, storeTarget)
|
TypeTrackerSummaryFlow::basicStoreStep(nodeFrom, nodeTo, contents)
)
not storeTarget instanceof DataFlowPrivate::SyntheticPostUpdateNode and
nodeTo = storeTarget
or
nodeTo = storeTarget.(DataFlowPrivate::SyntheticPostUpdateNode).getPreUpdateNode()
) and
// when only supporting precise content, no need for IterableElementNode (since it
// is only fed set/list content)
not nodeFrom instanceof DataFlowPublic::IterableElementNode
or
TypeTrackerSummaryFlow::basicStoreStep(nodeFrom, nodeTo, content)
}
/**
* Holds if `nodeTo` is the result of accessing the `content` content of `nodeFrom`.
*/
predicate loadStep(Node nodeFrom, LocalSourceNode nodeTo, Content content) {
exists(DataFlowPublic::AttrRead a |
a.mayHaveAttributeName(content) and
exists(DataFlowPublic::AttrRead a, string attrName |
content.(DataFlowPublic::AttributeContent).getAttribute() = attrName and
a.mayHaveAttributeName(attrName) and
nodeFrom = a.getObject() and
nodeTo = a
)
or
exists(DataFlowPublic::ContentSet contents |
contents.(DataFlowPublic::AttributeContent).getAttribute() = content
|
TypeTrackerSummaryFlow::basicLoadStep(nodeFrom, nodeTo, contents)
DataFlowPrivate::readStepCommon(nodeFrom, content, nodeTo) and
// Since we only support one level of content in type-trackers we don't actually
// support `(aa, ab), (ba, bb) = ...`. Therefore we exclude the read-step from `(aa,
// ab)` to `aa` (since it is not needed).
not exists(SequenceNode outer |
outer.getAnElement() = nodeFrom.asCfgNode() and
IterableUnpacking::iterableUnpackingTupleFlowStep(_, nodeFrom)
) and
// Again, due to only supporting one level of content, for `for (k,v) in ...` we exclude
// the read-step from the tuple to `k` and `v`.
not exists(DataFlowPublic::IterableSequenceNode seq, DataFlowPublic::IterableElementNode elem |
IterableUnpacking::iterableUnpackingForReadStep(_, _, seq) and
IterableUnpacking::iterableUnpackingConvertingReadStep(seq, _, elem) and
IterableUnpacking::iterableUnpackingConvertingStoreStep(elem, _, nodeFrom) and
nodeFrom.asCfgNode() instanceof SequenceNode
)
or
TypeTrackerSummaryFlow::basicLoadStep(nodeFrom, nodeTo, content)
}
/**
* Holds if the `loadContent` of `nodeFrom` is stored in the `storeContent` of `nodeTo`.
*/
predicate loadStoreStep(Node nodeFrom, Node nodeTo, Content loadContent, Content storeContent) {
exists(DataFlowPublic::ContentSet loadContents, DataFlowPublic::ContentSet storeContents |
loadContents.(DataFlowPublic::AttributeContent).getAttribute() = loadContent and
storeContents.(DataFlowPublic::AttributeContent).getAttribute() = storeContent
|
TypeTrackerSummaryFlow::basicLoadStoreStep(nodeFrom, nodeTo, loadContents, storeContents)
)
TypeTrackerSummaryFlow::basicLoadStoreStep(nodeFrom, nodeTo, loadContent, storeContent)
}
/**