initial MaD implementation for Python

This commit is contained in:
Erik Krogh Kristensen
2022-04-26 12:43:34 +02:00
parent 1d44694280
commit 1c2c9159a9
12 changed files with 1068 additions and 2 deletions

View File

@@ -339,6 +339,9 @@ module API {
result = callee.getReturn() and
result.getAnImmediateUse() = this
}
/** Gets the number of arguments of this call. */
int getNumArgument() { result = count(this.getArg(_)) }
}
/**

View File

@@ -12,6 +12,7 @@ private import semmle.python.frameworks.Asyncpg
private import semmle.python.frameworks.ClickhouseDriver
private import semmle.python.frameworks.Cryptodome
private import semmle.python.frameworks.Cryptography
private import semmle.python.frameworks.data.ModelsAsData
private import semmle.python.frameworks.Dill
private import semmle.python.frameworks.Django
private import semmle.python.frameworks.Fabric

View File

@@ -0,0 +1,46 @@
/**
* Provides classes for contributing a model, or using the interpreted results
* of a model represented as data.
*
* - Use the `ModelInput` module to contribute new models.
* - Use the `ModelOutput` module to access the model results in terms of API nodes.
*
* The package name refers to a Pip package name.
*/
private import python
private import internal.ApiGraphModels as Shared
private import internal.ApiGraphModelsSpecific as Specific
import Shared::ModelInput as ModelInput
import Shared::ModelOutput as ModelOutput
private import semmle.python.dataflow.new.RemoteFlowSources
private import semmle.python.dataflow.new.DataFlow
private import semmle.python.ApiGraphs
private import semmle.python.dataflow.new.TaintTracking
/**
* A remote flow source originating from a CSV source row.
*/
private class RemoteFlowSourceFromCsv extends RemoteFlowSource {
RemoteFlowSourceFromCsv() { this = ModelOutput::getASourceNode("remote").getAnImmediateUse() }
override string getSourceType() { result = "Remote flow" }
}
/**
* Like `ModelOutput::summaryStep` but with API nodes mapped to data-flow nodes.
*/
private predicate summaryStepNodes(DataFlow::Node pred, DataFlow::Node succ, string kind) {
exists(API::Node predNode, API::Node succNode |
Specific::summaryStep(predNode, succNode, kind) and
pred = predNode.getARhs() and
succ = succNode.getAnImmediateUse()
)
}
/** Taint steps induced by summary models of kind `taint`. */
private class TaintStepFromSummary extends TaintTracking::AdditionalTaintStep {
override predicate step(DataFlow::Node pred, DataFlow::Node succ) {
summaryStepNodes(pred, succ, "taint")
}
}

View File

@@ -0,0 +1,175 @@
/**
* Module for parsing access paths from CSV models, both the identifying access path used
* by dynamic languages, and the input/output specifications for summary steps.
*
* This file is used by the shared data flow library and by the JavaScript libraries
* (which does not use the shared data flow libraries).
*/
/**
* Convenience-predicate for extracting two capture groups at once.
*/
bindingset[input, regexp]
private predicate regexpCaptureTwo(string input, string regexp, string capture1, string capture2) {
capture1 = input.regexpCapture(regexp, 1) and
capture2 = input.regexpCapture(regexp, 2)
}
/** Companion module to the `AccessPath` class. */
module AccessPath {
/** A string that should be parsed as an access path. */
abstract class Range extends string {
bindingset[this]
Range() { any() }
}
/**
* Parses an integer constant `n` or interval `n1..n2` (inclusive) and gets the value
* of the constant or any value contained in the interval.
*/
bindingset[arg]
int parseInt(string arg) {
result = arg.toInt()
or
// Match "n1..n2"
exists(string lo, string hi |
regexpCaptureTwo(arg, "(-?\\d+)\\.\\.(-?\\d+)", lo, hi) and
result = [lo.toInt() .. hi.toInt()]
)
}
/**
* Parses a lower-bounded interval `n..` and gets the lower bound.
*/
bindingset[arg]
int parseLowerBound(string arg) { result = arg.regexpCapture("(-?\\d+)\\.\\.", 1).toInt() }
/**
* Parses an integer constant or interval (bounded or unbounded) that explicitly
* references the arity, such as `N-1` or `N-3..N-1`.
*
* Note that expressions of form `N-x` will never resolve to a negative index,
* even if `N` is zero (it will have no result in that case).
*/
bindingset[arg, arity]
private int parseIntWithExplicitArity(string arg, int arity) {
result >= 0 and // do not allow N-1 to resolve to a negative index
exists(string lo |
// N-x
lo = arg.regexpCapture("N-(\\d+)", 1) and
result = arity - lo.toInt()
or
// N-x..
lo = arg.regexpCapture("N-(\\d+)\\.\\.", 1) and
result = [arity - lo.toInt(), arity - 1]
)
or
exists(string lo, string hi |
// x..N-y
regexpCaptureTwo(arg, "(-?\\d+)\\.\\.N-(\\d+)", lo, hi) and
result = [lo.toInt() .. arity - hi.toInt()]
or
// N-x..N-y
regexpCaptureTwo(arg, "N-(\\d+)\\.\\.N-(\\d+)", lo, hi) and
result = [arity - lo.toInt() .. arity - hi.toInt()] and
result >= 0
or
// N-x..y
regexpCaptureTwo(arg, "N-(\\d+)\\.\\.(\\d+)", lo, hi) and
result = [arity - lo.toInt() .. hi.toInt()] and
result >= 0
)
}
/**
* Parses an integer constant or interval (bounded or unbounded) and gets any
* of the integers contained within (of which there may be infinitely many).
*
* Has no result for arguments involving an explicit arity, such as `N-1`.
*/
bindingset[arg, result]
int parseIntUnbounded(string arg) {
result = parseInt(arg)
or
result >= parseLowerBound(arg)
}
/**
* Parses an integer constant or interval (bounded or unbounded) that
* may reference the arity of a call, such as `N-1` or `N-3..N-1`.
*
* Note that expressions of form `N-x` will never resolve to a negative index,
* even if `N` is zero (it will have no result in that case).
*/
bindingset[arg, arity]
int parseIntWithArity(string arg, int arity) {
result = parseInt(arg)
or
result in [parseLowerBound(arg) .. arity - 1]
or
result = parseIntWithExplicitArity(arg, arity)
}
}
/** Gets the `n`th token on the access path as a string. */
private string getRawToken(AccessPath path, int n) {
// Avoid splitting by '.' since tokens may contain dots, e.g. `Field[foo.Bar.x]`.
// Instead use regexpFind to match valid tokens, and supplement with a final length
// check (in `AccessPath.hasSyntaxError`) to ensure all characters were included in a token.
result = path.regexpFind("\\w+(?:\\[[^\\]]*\\])?(?=\\.|$)", n, _)
}
/**
* A string that occurs as an access path (either identifying or input/output spec)
* which might be relevant for this database.
*/
class AccessPath extends string instanceof AccessPath::Range {
/** Holds if this string is not a syntactically valid access path. */
predicate hasSyntaxError() {
// If the lengths match, all characters must haven been included in a token
// or seen by the `.` lookahead pattern.
this != "" and
not this.length() = sum(int n | | getRawToken(this, n).length() + 1) - 1
}
/** Gets the `n`th token on the access path (if there are no syntax errors). */
AccessPathToken getToken(int n) {
result = getRawToken(this, n) and
not this.hasSyntaxError()
}
/** Gets the number of tokens on the path (if there are no syntax errors). */
int getNumToken() {
result = count(int n | exists(getRawToken(this, n))) and
not this.hasSyntaxError()
}
}
/**
* An access part token such as `Argument[1]` or `ReturnValue`, appearing in one or more access paths.
*/
class AccessPathToken extends string {
AccessPathToken() { this = getRawToken(_, _) }
private string getPart(int part) {
result = this.regexpCapture("([^\\[]+)(?:\\[([^\\]]*)\\])?", part)
}
/** Gets the name of the token, such as `Member` from `Member[x]` */
string getName() { result = this.getPart(1) }
/**
* Gets the argument list, such as `1,2` from `Member[1,2]`,
* or has no result if there are no arguments.
*/
string getArgumentList() { result = this.getPart(2) }
/** Gets the `n`th argument to this token, such as `x` or `y` from `Member[x,y]`. */
string getArgument(int n) { result = this.getArgumentList().splitAt(",", n).trim() }
/** Gets an argument to this token, such as `x` or `y` from `Member[x,y]`. */
string getAnArgument() { result = this.getArgument(_) }
/** Gets the number of arguments to this token, such as 2 for `Member[x,y]` or zero for `ReturnValue`. */
int getNumArgument() { result = count(int n | exists(this.getArgument(n))) }
}

View File

@@ -0,0 +1,522 @@
/**
* INTERNAL use only. This is an experimental API subject to change without notice.
*
* Provides classes and predicates for dealing with flow models specified in CSV format.
*
* The CSV specification has the following columns:
* - Sources:
* `package; type; path; kind`
* - Sinks:
* `package; type; path; kind`
* - Summaries:
* `package; type; path; input; output; kind`
* - Types:
* `package1; type1; package2; type2; path`
*
* The interpretation of a row is similar to API-graphs with a left-to-right
* reading.
* 1. The `package` column selects a package name, as it would be referenced in the source code,
* such as an NPM package, PIP package, or Ruby gem. (See `ModelsAsData.qll` for language-specific details).
* It may also be a synthetic package used for a type definition (see type definitions below).
* 2. The `type` column selects all instances of a named type originating from that package,
* or the empty string if referring to the package itself.
* It can also be a synthetic type name defined by a type definition (see type definitions below).
* 3. The `path` column is a `.`-separated list of "access path tokens" to resolve, starting at the node selected by `package` and `type`.
*
* Every language supports the following tokens:
* - Argument[n]: the n-th argument to a call. May be a range of form `x..y` (inclusive) and/or a comma-separated list.
* Additionally, `N-1` refers to the last argument, `N-2` refers to the second-last, and so on.
* - Parameter[n]: the n-th parameter of a callback. May be a range of form `x..y` (inclusive) and/or a comma-separated list.
* - ReturnValue: the value returned by a function call
* - WithArity[n]: match a call with the given arity. May be a range of form `x..y` (inclusive) and/or a comma-separated list.
*
* The following tokens are common and should be implemented for languages where it makes sense:
* - Member[x]: a member named `x`; exactly what a "member" is depends on the language. May be a comma-separated list of names.
* - Instance: an instance of a class
* - Subclass: a subclass of a class
* - ArrayElement: an element of array
* - Element: an element of a collection-like object
* - MapKey: a key in map-like object
* - MapValue: a value in a map-like object
* - Awaited: the value from a resolved promise/future-like object
*
* For the time being, please consult `ApiGraphModelsSpecific.qll` to see which language-specific tokens are currently supported.
*
* 4. The `input` and `output` columns specify how data enters and leaves the element selected by the
* first `(package, type, path)` tuple. Both strings are `.`-separated access paths
* of the same syntax as the `path` column.
* 5. The `kind` column is a tag that can be referenced from QL to determine to
* which classes the interpreted elements should be added. For example, for
* sources `"remote"` indicates a default remote flow source, and for summaries
* `"taint"` indicates a default additional taint step and `"value"` indicates a
* globally applicable value-preserving step.
*
* ### Types
*
* A type row of form `package1; type1; package2; type2; path` indicates that `package2; type2; path`
* should be seen as an instance of the type `package1; type1`.
*
* A `(package,type)` pair may refer to a static type or a synthetic type name used internally in the model.
* Synthetic type names can be used to reuse intermediate sub-paths, when there are multiple ways to access the same
* element.
* See `ModelsAsData.qll` for the langauge-specific interpretation of packages and static type names.
*
* By convention, if one wants to avoid clashes with static types from the package, the type name
* should be prefixed with a tilde character (`~`). For example, `(foo, ~Bar)` can be used to indicate that
* the type is related to the `foo` package but is not intended to match a static type.
*/
private import ApiGraphModelsSpecific as Specific
private class Unit = Specific::Unit;
private module API = Specific::API;
private import Specific::AccessPathSyntax
/** Module containing hooks for providing input data to be interpreted as a model. */
module ModelInput {
/**
* A unit class for adding additional source model rows.
*
* Extend this class to add additional source definitions.
*/
class SourceModelCsv extends Unit {
/**
* Holds if `row` specifies a source definition.
*
* A row of form
* ```
* package;type;path;kind
* ```
* indicates that the value at `(package, type, path)` should be seen as a flow
* source of the given `kind`.
*
* The kind `remote` represents a general remote flow source.
*/
abstract predicate row(string row);
}
/**
* A unit class for adding additional sink model rows.
*
* Extend this class to add additional sink definitions.
*/
class SinkModelCsv extends Unit {
/**
* Holds if `row` specifies a sink definition.
*
* A row of form
* ```
* package;type;path;kind
* ```
* indicates that the value at `(package, type, path)` should be seen as a sink
* of the given `kind`.
*/
abstract predicate row(string row);
}
/**
* A unit class for adding additional summary model rows.
*
* Extend this class to add additional flow summary definitions.
*/
class SummaryModelCsv extends Unit {
/**
* Holds if `row` specifies a summary definition.
*
* A row of form
* ```
* package;type;path;input;output;kind
* ```
* indicates that for each call to `(package, type, path)`, the value referred to by `input`
* can flow to the value referred to by `output`.
*
* `kind` should be either `value` or `taint`, for value-preserving or taint-preserving steps,
* respectively.
*/
abstract predicate row(string row);
}
/**
* A unit class for adding additional type model rows.
*
* Extend this class to add additional type definitions.
*/
class TypeModelCsv extends Unit {
/**
* Holds if `row` specifies a type definition.
*
* A row of form,
* ```
* package1;type1;package2;type2;path
* ```
* indicates that `(package2, type2, path)` should be seen as an instance of `(package1, type1)`.
*/
abstract predicate row(string row);
}
}
private import ModelInput
/**
* An empty class, except in specific tests.
*
* If this is non-empty, all models are parsed even if the package is not
* considered relevant for the current database.
*/
abstract class TestAllModels extends Unit { }
/**
* Append `;dummy` to the value of `s` to work around the fact that `string.split(delim,n)`
* does not preserve empty trailing substrings.
*/
bindingset[result]
private string inversePad(string s) { s = result + ";dummy" }
private predicate sourceModel(string row) { any(SourceModelCsv s).row(inversePad(row)) }
private predicate sinkModel(string row) { any(SinkModelCsv s).row(inversePad(row)) }
private predicate summaryModel(string row) { any(SummaryModelCsv s).row(inversePad(row)) }
private predicate typeModel(string row) { any(TypeModelCsv s).row(inversePad(row)) }
/** Holds if a source model exists for the given parameters. */
predicate sourceModel(string package, string type, string path, string kind) {
exists(string row |
sourceModel(row) and
row.splitAt(";", 0) = package and
row.splitAt(";", 1) = type and
row.splitAt(";", 2) = path and
row.splitAt(";", 3) = kind
)
}
/** Holds if a sink model exists for the given parameters. */
private predicate sinkModel(string package, string type, string path, string kind) {
exists(string row |
sinkModel(row) and
row.splitAt(";", 0) = package and
row.splitAt(";", 1) = type and
row.splitAt(";", 2) = path and
row.splitAt(";", 3) = kind
)
}
/** Holds if a summary model `row` exists for the given parameters. */
private predicate summaryModel(
string package, string type, string path, string input, string output, string kind
) {
exists(string row |
summaryModel(row) and
row.splitAt(";", 0) = package and
row.splitAt(";", 1) = type and
row.splitAt(";", 2) = path and
row.splitAt(";", 3) = input and
row.splitAt(";", 4) = output and
row.splitAt(";", 5) = kind
)
}
/** Holds if an type model exists for the given parameters. */
private predicate typeModel(
string package1, string type1, string package2, string type2, string path
) {
exists(string row |
typeModel(row) and
row.splitAt(";", 0) = package1 and
row.splitAt(";", 1) = type1 and
row.splitAt(";", 2) = package2 and
row.splitAt(";", 3) = type2 and
row.splitAt(";", 4) = path
)
}
/**
* Gets a package that should be seen as an alias for the given other `package`,
* or the `package` itself.
*/
bindingset[package]
bindingset[result]
string getAPackageAlias(string package) {
typeModel(package, "", result, "", "")
or
result = package
}
/**
* Holds if CSV rows involving `package` might be relevant for the analysis of this database.
*/
private predicate isRelevantPackage(string package) {
(
sourceModel(package, _, _, _) or
sinkModel(package, _, _, _) or
summaryModel(package, _, _, _, _, _) or
typeModel(package, _, _, _, _)
) and
(
Specific::isPackageUsed(package)
or
exists(TestAllModels t)
)
or
exists(string other |
isRelevantPackage(other) and
typeModel(package, _, other, _, _)
)
}
/**
* Holds if `package,type,path` is used in some CSV row.
*/
pragma[nomagic]
predicate isRelevantFullPath(string package, string type, string path) {
isRelevantPackage(package) and
(
sourceModel(package, type, path, _) or
sinkModel(package, type, path, _) or
summaryModel(package, type, path, _, _, _) or
typeModel(_, _, package, type, path)
)
}
/** A string from a CSV row that should be parsed as an access path. */
private class AccessPathRange extends AccessPath::Range {
AccessPathRange() {
isRelevantFullPath(_, _, this)
or
exists(string package | isRelevantPackage(package) |
summaryModel(package, _, _, this, _, _) or
summaryModel(package, _, _, _, this, _)
)
}
}
/**
* Gets a successor of `node` in the API graph.
*/
bindingset[token]
API::Node getSuccessorFromNode(API::Node node, AccessPathToken token) {
// API graphs use the same label for arguments and parameters. An edge originating from a
// use-node represents be an argument, and an edge originating from a def-node represents a parameter.
// We just map both to the same thing.
token.getName() = ["Argument", "Parameter"] and
result = node.getParameter(AccessPath::parseIntUnbounded(token.getAnArgument()))
or
token.getName() = "ReturnValue" and
result = node.getReturn()
or
// Language-specific tokens
result = Specific::getExtraSuccessorFromNode(node, token)
}
/**
* Gets an API-graph successor for the given invocation.
*/
bindingset[token]
API::Node getSuccessorFromInvoke(Specific::InvokeNode invoke, AccessPathToken token) {
token.getName() = "Argument" and
result =
invoke
.getParameter(AccessPath::parseIntWithArity(token.getAnArgument(), invoke.getNumArgument()))
or
token.getName() = "ReturnValue" and
result = invoke.getReturn()
or
// Language-specific tokens
result = Specific::getExtraSuccessorFromInvoke(invoke, token)
}
/**
* Holds if `invoke` invokes a call-site filter given by `token`.
*/
pragma[inline]
private predicate invocationMatchesCallSiteFilter(Specific::InvokeNode invoke, AccessPathToken token) {
token.getName() = "WithArity" and
invoke.getNumArgument() = AccessPath::parseIntUnbounded(token.getAnArgument())
or
Specific::invocationMatchesExtraCallSiteFilter(invoke, token)
}
/**
* Gets the API node identified by the first `n` tokens of `path` in the given `(package, type, path)` tuple.
*/
pragma[nomagic]
private API::Node getNodeFromPath(string package, string type, AccessPath path, int n) {
isRelevantFullPath(package, type, path) and
(
n = 0 and
exists(string package2, string type2, AccessPath path2 |
typeModel(package, type, package2, type2, path2) and
result = getNodeFromPath(package2, type2, path2, path2.getNumToken())
)
or
// Language-specific cases, such as handling of global variables
result = Specific::getExtraNodeFromPath(package, type, path, n)
)
or
result = getSuccessorFromNode(getNodeFromPath(package, type, path, n - 1), path.getToken(n - 1))
or
// Similar to the other recursive case, but where the path may have stepped through one or more call-site filters
result =
getSuccessorFromInvoke(getInvocationFromPath(package, type, path, n - 1), path.getToken(n - 1))
}
/** Gets the node identified by the given `(package, type, path)` tuple. */
API::Node getNodeFromPath(string package, string type, AccessPath path) {
result = getNodeFromPath(package, type, path, path.getNumToken())
}
/**
* Gets an invocation identified by the given `(package, type, path)` tuple.
*
* Unlike `getNodeFromPath`, the `path` may end with one or more call-site filters.
*/
Specific::InvokeNode getInvocationFromPath(string package, string type, AccessPath path, int n) {
result = Specific::getAnInvocationOf(getNodeFromPath(package, type, path, n))
or
result = getInvocationFromPath(package, type, path, n - 1) and
invocationMatchesCallSiteFilter(result, path.getToken(n - 1))
}
/** Gets an invocation identified by the given `(package, type, path)` tuple. */
Specific::InvokeNode getInvocationFromPath(string package, string type, AccessPath path) {
result = getInvocationFromPath(package, type, path, path.getNumToken())
}
/**
* Holds if `name` is a valid name for an access path token in the identifying access path.
*/
bindingset[name]
predicate isValidTokenNameInIdentifyingAccessPath(string name) {
name = ["Argument", "Parameter", "ReturnValue", "WithArity"]
or
Specific::isExtraValidTokenNameInIdentifyingAccessPath(name)
}
/**
* Holds if `name` is a valid name for an access path token with no arguments, occuring
* in an identifying access path.
*/
bindingset[name]
predicate isValidNoArgumentTokenInIdentifyingAccessPath(string name) {
name = "ReturnValue"
or
Specific::isExtraValidNoArgumentTokenInIdentifyingAccessPath(name)
}
/**
* Holds if `argument` is a valid argument to an access path token with the given `name`, occurring
* in an identifying access path.
*/
bindingset[name, argument]
predicate isValidTokenArgumentInIdentifyingAccessPath(string name, string argument) {
name = ["Argument", "Parameter"] and
argument.regexpMatch("(N-|-)?\\d+(\\.\\.((N-|-)?\\d+)?)?")
or
name = "WithArity" and
argument.regexpMatch("\\d+(\\.\\.(\\d+)?)?")
or
Specific::isExtraValidTokenArgumentInIdentifyingAccessPath(name, argument)
}
/**
* Module providing access to the imported models in terms of API graph nodes.
*/
module ModelOutput {
/**
* Holds if a CSV source model contributed `source` with the given `kind`.
*/
API::Node getASourceNode(string kind) {
exists(string package, string type, string path |
sourceModel(package, type, path, kind) and
result = getNodeFromPath(package, type, path)
)
}
/**
* Holds if a CSV sink model contributed `sink` with the given `kind`.
*/
API::Node getASinkNode(string kind) {
exists(string package, string type, string path |
sinkModel(package, type, path, kind) and
result = getNodeFromPath(package, type, path)
)
}
/**
* Holds if a relevant CSV summary exists for these parameters.
*/
predicate relevantSummaryModel(
string package, string type, string path, string input, string output, string kind
) {
isRelevantPackage(package) and
summaryModel(package, type, path, input, output, kind)
}
/**
* Holds if a `baseNode` is an invocation identified by the `package,type,path` part of a summary row.
*/
predicate resolvedSummaryBase(
string package, string type, string path, Specific::InvokeNode baseNode
) {
summaryModel(package, type, path, _, _, _) and
baseNode = getInvocationFromPath(package, type, path)
}
/**
* Holds if `node` is seen as an instance of `(package,type)` due to a type definition
* contributed by a CSV model.
*/
API::Node getATypeNode(string package, string type) {
exists(string package2, string type2, AccessPath path |
typeModel(package, type, package2, type2, path) and
result = getNodeFromPath(package2, type2, path)
)
}
/**
* Gets an error message relating to an invalid CSV row in a model.
*/
string getAWarning() {
// Check number of columns
exists(string row, string kind, int expectedArity, int actualArity |
any(SourceModelCsv csv).row(row) and kind = "source" and expectedArity = 4
or
any(SinkModelCsv csv).row(row) and kind = "sink" and expectedArity = 4
or
any(SummaryModelCsv csv).row(row) and kind = "summary" and expectedArity = 6
or
any(TypeModelCsv csv).row(row) and kind = "type" and expectedArity = 5
|
actualArity = count(row.indexOf(";")) + 1 and
actualArity != expectedArity and
result =
"CSV " + kind + " row should have " + expectedArity + " columns but has " + actualArity +
": " + row
)
or
// Check names and arguments of access path tokens
exists(AccessPath path, AccessPathToken token |
isRelevantFullPath(_, _, path) and
token = path.getToken(_)
|
not isValidTokenNameInIdentifyingAccessPath(token.getName()) and
result = "Invalid token name '" + token.getName() + "' in access path: " + path
or
isValidTokenNameInIdentifyingAccessPath(token.getName()) and
exists(string argument |
argument = token.getAnArgument() and
not isValidTokenArgumentInIdentifyingAccessPath(token.getName(), argument) and
result =
"Invalid argument '" + argument + "' in token '" + token + "' in access path: " + path
)
or
isValidTokenNameInIdentifyingAccessPath(token.getName()) and
token.getNumArgument() = 0 and
not isValidNoArgumentTokenInIdentifyingAccessPath(token.getName()) and
result = "Invalid token '" + token + "' is missing its arguments, in access path: " + path
)
}
}

View File

@@ -0,0 +1,173 @@
/**
* Contains the language-specific part of the models-as-data implementation found in `ApiGraphModels.qll`.
*
* It must export the following members:
* ```ql
* class Unit // a unit type
* module AccessPathSyntax // a re-export of the AccessPathSyntax module
* class InvokeNode // a type representing an invocation connected to the API graph
* module API // the API graph module
* predicate isPackageUsed(string package)
* API::Node getExtraNodeFromPath(string package, string type, string path, int n)
* API::Node getExtraSuccessorFromNode(API::Node node, AccessPathToken token)
* API::Node getExtraSuccessorFromInvoke(API::InvokeNode node, AccessPathToken token)
* predicate invocationMatchesExtraCallSiteFilter(API::InvokeNode invoke, AccessPathToken token)
* InvokeNode getAnInvocationOf(API::Node node)
* predicate isExtraValidTokenNameInIdentifyingAccessPath(string name)
* predicate isExtraValidNoArgumentTokenInIdentifyingAccessPath(string name)
* predicate isExtraValidTokenArgumentInIdentifyingAccessPath(string name, string argument)
* ```
*/
private import python as PY
import semmle.python.dataflow.new.DataFlow
private import ApiGraphModels
import semmle.python.ApiGraphs
class Unit = PY::Unit;
// Re-export libraries needed by ApiGraphModels.qll
import semmle.python.frameworks.data.internal.AccessPathSyntax as AccessPathSyntax
private import AccessPathSyntax
/**
* Holds if models describing `package` may be relevant for the analysis of this database.
*/
predicate isPackageUsed(string package) { exists(API::moduleImport(package)) }
/** Gets a Python-specific interpretation of the `(package, type, path)` tuple after resolving the first `n` access path tokens. */
bindingset[package, type, path]
API::Node getExtraNodeFromPath(string package, string type, AccessPath path, int n) {
type = "" and
n = 0 and
result = API::moduleImport(package) and
exists(path)
}
/**
* Gets a Python-specific API graph successor of `node` reachable by resolving `token`.
*/
bindingset[token]
API::Node getExtraSuccessorFromNode(API::Node node, AccessPathToken token) {
token.getName() = "Member" and
result = node.getMember(token.getAnArgument())
or
token.getName() = "Instance" and
result = node.getReturn() // commonly used Token. In Python `Instance` is just an alias for `ReturnValue`
or
token.getName() = "Awaited" and
result = node.getAwaited()
// Some features don't have MaD tokens yet, they would need to be added to API-graphs first.
// - decorators ("DecoratedClass", "DecoratedMember", "DecoratedParameter")
// - Array/Map elements ("ArrayElement", "Element", "MapKey", "MapValue")
}
/**
* Gets a Python-specific API graph successor of `node` reachable by resolving `token`.
*/
bindingset[token]
API::Node getExtraSuccessorFromInvoke(API::CallNode node, AccessPathToken token) {
token.getName() = "Instance" and
result = node.getReturn()
or
token.getName() = "Argument" and
token.getAnArgument() = "self" and
result.getARhs() = node.(DataFlow::MethodCallNode).getObject() // TODO: Get proper support for this in API-graphs?
or
token.getName() = "Argument" and
exists(string arg | arg + ":" = token.getAnArgument() | result = node.getKeywordParameter(arg))
}
/**
* Holds if `invoke` matches the PY-specific call site filter in `token`.
*/
bindingset[token]
predicate invocationMatchesExtraCallSiteFilter(API::CallNode invoke, AccessPathToken token) {
token.getName() = "Call" and exists(invoke) // there is only one kind of call in Python.
}
/**
* Holds if `path` is an input or output spec for a summary with the given `base` node.
*/
pragma[nomagic]
private predicate relevantInputOutputPath(API::CallNode base, AccessPath inputOrOutput) {
exists(string package, string type, string input, string output, string path |
ModelOutput::relevantSummaryModel(package, type, path, input, output, _) and
ModelOutput::resolvedSummaryBase(package, type, path, base) and
inputOrOutput = [input, output]
)
}
/**
* Gets the API node for the first `n` tokens of the given input/output path, evaluated relative to `baseNode`.
*/
private API::Node getNodeFromInputOutputPath(API::CallNode baseNode, AccessPath path, int n) {
relevantInputOutputPath(baseNode, path) and
(
n = 1 and
result = getSuccessorFromInvoke(baseNode, path.getToken(0))
or
result =
getSuccessorFromNode(getNodeFromInputOutputPath(baseNode, path, n - 1), path.getToken(n - 1))
)
}
/**
* Gets the API node for the given input/output path, evaluated relative to `baseNode`.
*/
private API::Node getNodeFromInputOutputPath(API::CallNode baseNode, AccessPath path) {
result = getNodeFromInputOutputPath(baseNode, path, path.getNumToken())
}
/**
* Holds if a CSV summary contributed the step `pred -> succ` of the given `kind`.
*/
predicate summaryStep(API::Node pred, API::Node succ, string kind) {
exists(
string package, string type, string path, API::CallNode base, AccessPath input,
AccessPath output
|
ModelOutput::relevantSummaryModel(package, type, path, input, output, kind) and
ModelOutput::resolvedSummaryBase(package, type, path, base) and
pred = getNodeFromInputOutputPath(base, input) and
succ = getNodeFromInputOutputPath(base, output)
)
}
class InvokeNode = API::CallNode;
/** Gets an `InvokeNode` corresponding to an invocation of `node`. */
InvokeNode getAnInvocationOf(API::Node node) { result = node.getACall() }
/**
* Holds if `name` is a valid name for an access path token in the identifying access path.
*/
bindingset[name]
predicate isExtraValidTokenNameInIdentifyingAccessPath(string name) {
name = ["Member", "Instance", "Awaited", "Call", "Method", "Subclass"]
}
/**
* Holds if `name` is a valid name for an access path token with no arguments, occuring
* in an identifying access path.
*/
predicate isExtraValidNoArgumentTokenInIdentifyingAccessPath(string name) {
name = ["Instance", "Awaited", "Call", "Subclass"]
}
/**
* Holds if `argument` is a valid argument to an access path token with the given `name`, occurring
* in an identifying access path.
*/
bindingset[name, argument]
predicate isExtraValidTokenArgumentInIdentifyingAccessPath(string name, string argument) {
name = ["Member"] and
exists(argument)
or
name = ["Argument", "Parameter"] and
(
argument = "self"
or
argument.regexpMatch("\\w+:") // keyword argument
)
}

View File

@@ -0,0 +1,18 @@
taintFlow
| test.py:3:5:3:15 | ControlFlowNode for getSource() | test.py:4:8:4:8 | ControlFlowNode for x |
isSink
| test.py:4:8:4:8 | ControlFlowNode for x | test-sink |
isSource
| test.py:3:5:3:15 | ControlFlowNode for getSource() | test-source |
syntaxErrors
| Member[foo |
| Member[foo] .Member[bar] |
| Member[foo] Member[bar] |
| Member[foo], Member[bar] |
| Member[foo],Member[bar] |
| Member[foo]. Member[bar] |
| Member[foo]..Member[bar] |
| Member[foo]Member[bar] |
| Member[foo]] |
| Member[foo]].Member[bar] |
warning

View File

@@ -0,0 +1,4 @@
from testlib import getSource, mySink
x = getSource()
mySink(x)

View File

@@ -0,0 +1,85 @@
import python
import semmle.python.frameworks.data.internal.AccessPathSyntax as AccessPathSyntax
import semmle.python.frameworks.data.ModelsAsData
import semmle.python.dataflow.new.TaintTracking
import semmle.python.dataflow.new.DataFlow
// TODO:
/*
* class Steps extends ModelInput::SummaryModelCsv {
* override predicate row(string row) {
* // package;type;path;input;output;kind
* row =
* [
* "testlib;;Member[preserveTaint];Argument[0];ReturnValue;taint",
* "testlib;;Member[taintIntoCallback];Argument[0];Argument[1..2].Parameter[0];taint",
* "testlib;;Member[taintIntoCallbackThis];Argument[0];Argument[1..2].Parameter[this];taint",
* "testlib;;Member[preserveArgZeroAndTwo];Argument[0,2];ReturnValue;taint",
* "testlib;;Member[preserveAllButFirstArgument];Argument[1..];ReturnValue;taint",
* "testlib;;Member[preserveAllIfCall].Call;Argument[0..];ReturnValue;taint",
* "testlib;;Member[getSource].ReturnValue.Member[continue];Argument[this];ReturnValue;taint",
* ]
* }
* }
*/
class Sinks extends ModelInput::SinkModelCsv {
override predicate row(string row) {
// package;type;path;kind
row = ["testlib;;Member[mySink].Argument[0];test-sink"]
}
}
// TODO: Test taint steps (include that the base path may end with ".Call")
// TODO: Ctrl + f: TODO
// TODO: // There are no API-graph edges for: ArrayElement, Element, MapKey, MapValue (remove from valid tokens list)
// TODO: Verify that the list of valid tokens matches the implementation.
class Sources extends ModelInput::SourceModelCsv {
// package;type;path;kind
override predicate row(string row) {
row = ["testlib;;Member[getSource].ReturnValue;test-source"]
}
}
class BasicTaintTracking extends TaintTracking::Configuration {
BasicTaintTracking() { this = "BasicTaintTracking" }
override predicate isSource(DataFlow::Node source) {
source = ModelOutput::getASourceNode("test-source").getAnImmediateUse()
}
override predicate isSink(DataFlow::Node sink) {
sink = ModelOutput::getASinkNode("test-sink").getARhs()
}
}
query predicate taintFlow(DataFlow::Node source, DataFlow::Node sink) {
any(BasicTaintTracking tr).hasFlow(source, sink)
}
query predicate isSink(DataFlow::Node node, string kind) {
node = ModelOutput::getASinkNode(kind).getARhs()
}
query predicate isSource(DataFlow::Node node, string kind) {
node = ModelOutput::getASourceNode(kind).getAnImmediateUse()
}
class SyntaxErrorTest extends ModelInput::SinkModelCsv {
override predicate row(string row) {
row =
[
"testlib;;Member[foo],Member[bar];test-sink", "testlib;;Member[foo] Member[bar];test-sink",
"testlib;;Member[foo]. Member[bar];test-sink",
"testlib;;Member[foo], Member[bar];test-sink",
"testlib;;Member[foo]..Member[bar];test-sink",
"testlib;;Member[foo] .Member[bar];test-sink", "testlib;;Member[foo]Member[bar];test-sink",
"testlib;;Member[foo;test-sink", "testlib;;Member[foo]];test-sink",
"testlib;;Member[foo]].Member[bar];test-sink"
]
}
}
query predicate syntaxErrors(AccessPathSyntax::AccessPath path) { path.hasSyntaxError() }
query predicate warning = ModelOutput::getAWarning/0;

View File

@@ -0,0 +1,12 @@
| CSV type row should have 5 columns but has 2: test;TooFewColumns |
| CSV type row should have 5 columns but has 8: test;TooManyColumns;;;Member[Foo].Instance;too;many;columns |
| Invalid argument '0-1' in token 'Argument[0-1]' in access path: Method[foo].Argument[0-1] |
| Invalid argument '*' in token 'Argument[*]' in access path: Method[foo].Argument[*] |
| Invalid argument 'foo' in token 'Method[foo]' in access path: Method[foo].Arg[0] |
| Invalid argument 'foo' in token 'Method[foo]' in access path: Method[foo].Argument |
| Invalid argument 'foo' in token 'Method[foo]' in access path: Method[foo].Argument[0-1] |
| Invalid argument 'foo' in token 'Method[foo]' in access path: Method[foo].Argument[*] |
| Invalid argument 'foo' in token 'Method[foo]' in access path: Method[foo].Member |
| Invalid token 'Argument' is missing its arguments, in access path: Method[foo].Argument |
| Invalid token 'Member' is missing its arguments, in access path: Method[foo].Member |
| Invalid token name 'Arg' in access path: Method[foo].Arg[0] |

View File

@@ -0,0 +1,25 @@
import python
import semmle.python.frameworks.data.internal.AccessPathSyntax as AccessPathSyntax
import semmle.python.frameworks.data.internal.ApiGraphModels as ApiGraphModels
import semmle.python.frameworks.data.ModelsAsData
private class InvalidTypeModel extends ModelInput::TypeModelCsv {
override predicate row(string row) {
row =
[
"test;TooManyColumns;;;Member[Foo].Instance;too;many;columns", //
"test;TooFewColumns", //
"test;X;test;Y;Method[foo].Arg[0]", //
"test;X;test;Y;Method[foo].Argument[0-1]", //
"test;X;test;Y;Method[foo].Argument[*]", //
"test;X;test;Y;Method[foo].Argument", //
"test;X;test;Y;Method[foo].Member", //
]
}
}
class IsTesting extends ApiGraphModels::TestAllModels {
IsTesting() { this = this }
}
query predicate warning = ModelOutput::getAWarning/0;