Files
codeql/java/ql/lib/semmle/code/java/dataflow/ExternalFlow.qll
2025-06-24 10:25:06 +02:00

646 lines
25 KiB
Plaintext

/**
* INTERNAL use only. This is an experimental API subject to change without notice.
*
* Provides classes and predicates for dealing with flow models specified
* in data extensions and CSV format.
*
* The CSV specification has the following columns:
* - Sources:
* `package; type; subtypes; name; signature; ext; output; kind; provenance`
* - Sinks:
* `package; type; subtypes; name; signature; ext; input; kind; provenance`
* - Summaries:
* `package; type; subtypes; name; signature; ext; input; output; kind; provenance`
* - Neutrals:
* `package; type; name; signature; kind; provenance`
* A neutral is used to indicate that a callable is neutral with respect to flow (no summary), source (is not a source) or sink (is not a sink).
*
* The interpretation of a row is similar to API-graphs with a left-to-right
* reading.
* 1. The `package` column selects a package.
* 2. The `type` column selects a type within that package.
* 3. The `subtypes` is a boolean that indicates whether to jump to an
* arbitrary subtype of that type.
* 4. The `name` column optionally selects a specific named member of the type.
* 5. The `signature` column optionally restricts the named member. If
* `signature` is blank then no such filtering is done. The format of the
* signature is a comma-separated list of types enclosed in parentheses. The
* types can be short names or fully qualified names (mixing these two options
* is not allowed within a single signature).
* 6. The `ext` column specifies additional API-graph-like edges. Currently
* there are only two valid values: "" and "Annotated". The empty string has no
* effect. "Annotated" applies if `name` and `signature` were left blank and
* acts by selecting an element that is annotated by the annotation type
* selected by the first 4 columns. This can be another member such as a field
* or method, or a parameter.
* 7. The `input` column specifies how data enters the element selected by the
* first 6 columns, and the `output` column specifies how data leaves the
* element selected by the first 6 columns. An `input` can be a dot separated
* path consisting of either "", "Argument[n]", "Argument[n1..n2]",
* "ReturnValue", "Element", "WithoutElement", or "WithElement":
* - "": Selects a write to the selected element in case this is a field.
* - "Argument[n]": Selects an argument in a call to the selected element.
* The arguments are zero-indexed, and `this` specifies the qualifier.
* - "Argument[n1..n2]": Similar to "Argument[n]" but select any argument in
* the given range. The range is inclusive at both ends.
* - "ReturnValue": Selects a value being returned by the selected element.
* This requires that the selected element is a method with a body.
* - "Element": Selects the collection elements of the selected element.
* - "WithoutElement": Selects the selected element but without
* its collection elements.
* - "WithElement": Selects the collection elements of the selected element, but
* points to the selected element.
*
* An `output` can be can be a dot separated path consisting of either "",
* "Argument[n]", "Argument[n1..n2]", "Parameter", "Parameter[n]",
* "Parameter[n1..n2]", "ReturnValue", or "Element":
* - "": Selects a read of a selected field, or a selected parameter.
* - "Argument[n]": Selects the post-update value of an argument in a call to the
* selected element. That is, the value of the argument after the call returns.
* The arguments are zero-indexed, and `this` specifies the qualifier.
* - "Argument[n1..n2]": Similar to "Argument[n]" but select any argument in
* the given range. The range is inclusive at both ends.
* - "Parameter": Selects the value of a parameter of the selected element.
* "Parameter" is also allowed in case the selected element is already a
* parameter itself.
* - "Parameter[n]": Similar to "Parameter" but restricted to a specific
* numbered parameter (zero-indexed, and `this` specifies the value of `this`).
* - "Parameter[n1..n2]": Similar to "Parameter[n]" but selects any parameter
* in the given range. The range is inclusive at both ends.
* - "ReturnValue": Selects the return value of a call to the selected element.
* - "Element": Selects the collection elements of the selected element.
* 8. The `kind` column is a tag that can be referenced from QL to determine to
* which classes the interpreted elements should be added. For example, for
* sources "remote" indicates a default remote flow source, and for summaries
* "taint" indicates a default additional taint step and "value" indicates a
* globally applicable value-preserving step. For neutrals the kind can be `summary`,
* `source` or `sink` to indicate that the neutral is neutral with respect to
* flow (no summary), source (is not a source) or sink (is not a sink).
* 9. The `provenance` column is a tag to indicate the origin and verification of a model.
* The format is {origin}-{verification} or just "manual" where the origin describes
* the origin of the model and verification describes how the model has been verified.
* Some examples are:
* - "df-generated": The model has been generated by the model generator tool.
* - "df-manual": The model has been generated by the model generator and verified by a human.
* - "manual": The model has been written by hand.
* This information is used in a heuristic for dataflow analysis to determine, if a
* model or source code should be used for determining flow.
*/
overlay[local?]
module;
import java
private import semmle.code.java.dataflow.DataFlow::DataFlow
private import FlowSummary as FlowSummary
private import internal.DataFlowPrivate
private import internal.FlowSummaryImpl
private import internal.FlowSummaryImpl::Public
private import internal.FlowSummaryImpl::Private
private import internal.FlowSummaryImpl::Private::External
private import internal.ExternalFlowExtensions as Extensions
private import codeql.mad.ModelValidation as SharedModelVal
/**
* A class for activating additional model rows.
*
* Extend this class to include experimental model rows with `this` name
* in data flow analysis.
*/
abstract private class ActiveExperimentalModelsInternal extends string {
bindingset[this]
ActiveExperimentalModelsInternal() { any() }
/**
* Holds if an experimental source model exists for the given parameters.
*/
predicate sourceModel(
string package, string type, boolean subtypes, string name, string signature, string ext,
string output, string kind, string provenance, QlBuiltins::ExtensionId madId
) {
Extensions::experimentalSourceModel(package, type, subtypes, name, signature, ext, output, kind,
provenance, this, madId)
}
/**
* Holds if an experimental sink model exists for the given parameters.
*/
predicate sinkModel(
string package, string type, boolean subtypes, string name, string signature, string ext,
string input, string kind, string provenance, QlBuiltins::ExtensionId madId
) {
Extensions::experimentalSinkModel(package, type, subtypes, name, signature, ext, input, kind,
provenance, this, madId)
}
/**
* Holds if an experimental summary model exists for the given parameters.
*/
predicate summaryModel(
string package, string type, boolean subtypes, string name, string signature, string ext,
string input, string output, string kind, string provenance, QlBuiltins::ExtensionId madId
) {
Extensions::experimentalSummaryModel(package, type, subtypes, name, signature, ext, input,
output, kind, provenance, this, madId)
}
}
deprecated class ActiveExperimentalModels = ActiveExperimentalModelsInternal;
/** Holds if a source model exists for the given parameters. */
predicate sourceModel(
string package, string type, boolean subtypes, string name, string signature, string ext,
string output, string kind, string provenance, QlBuiltins::ExtensionId madId
) {
(
Extensions::sourceModel(package, type, subtypes, name, signature, ext, output, kind, provenance,
madId)
or
any(ActiveExperimentalModelsInternal q)
.sourceModel(package, type, subtypes, name, signature, ext, output, kind, provenance, madId)
)
}
/** Holds if a sink model exists for the given parameters. */
predicate sinkModel(
string package, string type, boolean subtypes, string name, string signature, string ext,
string input, string kind, string provenance, QlBuiltins::ExtensionId madId
) {
(
Extensions::sinkModel(package, type, subtypes, name, signature, ext, input, kind, provenance,
madId)
or
any(ActiveExperimentalModelsInternal q)
.sinkModel(package, type, subtypes, name, signature, ext, input, kind, provenance, madId)
)
}
/** Holds if a summary model exists for the given parameters. */
predicate summaryModel(
string package, string type, boolean subtypes, string name, string signature, string ext,
string input, string output, string kind, string provenance, QlBuiltins::ExtensionId madId
) {
(
Extensions::summaryModel(package, type, subtypes, name, signature, ext, input, output, kind,
provenance, madId)
or
any(ActiveExperimentalModelsInternal q)
.summaryModel(package, type, subtypes, name, signature, ext, input, output, kind,
provenance, madId)
)
}
/**
* Holds if the given extension tuple `madId` should pretty-print as `model`.
*
* This predicate should only be used in tests.
*/
predicate interpretModelForTest(QlBuiltins::ExtensionId madId, string model) {
exists(
string package, string type, boolean subtypes, string name, string signature, string ext,
string output, string kind, string provenance
|
sourceModel(package, type, subtypes, name, signature, ext, output, kind, provenance, madId) or
Extensions::experimentalSourceModel(package, type, subtypes, name, signature, ext, output, kind,
provenance, _, madId)
|
model =
"Source: " + package + "; " + type + "; " + subtypes + "; " + name + "; " + signature + "; " +
ext + "; " + output + "; " + kind + "; " + provenance
)
or
exists(
string package, string type, boolean subtypes, string name, string signature, string ext,
string input, string kind, string provenance
|
sinkModel(package, type, subtypes, name, signature, ext, input, kind, provenance, madId) or
Extensions::experimentalSinkModel(package, type, subtypes, name, signature, ext, input, kind,
provenance, _, madId)
|
model =
"Sink: " + package + "; " + type + "; " + subtypes + "; " + name + "; " + signature + "; " +
ext + "; " + input + "; " + kind + "; " + provenance
)
or
exists(
string package, string type, boolean subtypes, string name, string signature, string ext,
string input, string output, string kind, string provenance
|
summaryModel(package, type, subtypes, name, signature, ext, input, output, kind, provenance,
madId) or
Extensions::experimentalSummaryModel(package, type, subtypes, name, signature, ext, input,
output, kind, provenance, _, madId)
|
model =
"Summary: " + package + "; " + type + "; " + subtypes + "; " + name + "; " + signature + "; " +
ext + "; " + input + "; " + output + "; " + kind + "; " + provenance
)
}
/** Holds if a neutral model exists for the given parameters. */
predicate neutralModel = Extensions::neutralModel/6;
private predicate relevantPackage(string package) {
sourceModel(package, _, _, _, _, _, _, _, _, _) or
sinkModel(package, _, _, _, _, _, _, _, _, _) or
summaryModel(package, _, _, _, _, _, _, _, _, _, _)
}
private predicate packageLink(string shortpkg, string longpkg) {
relevantPackage(shortpkg) and
relevantPackage(longpkg) and
longpkg.prefix(longpkg.indexOf(".")) = shortpkg
}
private predicate canonicalPackage(string package) {
relevantPackage(package) and not packageLink(_, package)
}
private predicate canonicalPkgLink(string package, string subpkg) {
canonicalPackage(package) and
(subpkg = package or packageLink(package, subpkg))
}
/**
* Holds if MaD framework coverage of `package` is `n` api endpoints of the
* kind `(kind, part)`, and `pkgs` is the number of subpackages of `package`
* which have MaD framework coverage (including `package` itself).
*/
predicate modelCoverage(string package, int pkgs, string kind, string part, int n) {
pkgs = strictcount(string subpkg | canonicalPkgLink(package, subpkg)) and
(
part = "source" and
n =
strictcount(string subpkg, string type, boolean subtypes, string name, string signature,
string ext, string output, string provenance |
canonicalPkgLink(package, subpkg) and
sourceModel(subpkg, type, subtypes, name, signature, ext, output, kind, provenance, _)
)
or
part = "sink" and
n =
strictcount(string subpkg, string type, boolean subtypes, string name, string signature,
string ext, string input, string provenance |
canonicalPkgLink(package, subpkg) and
sinkModel(subpkg, type, subtypes, name, signature, ext, input, kind, provenance, _)
)
or
part = "summary" and
n =
strictcount(string subpkg, string type, boolean subtypes, string name, string signature,
string ext, string input, string output, string provenance |
canonicalPkgLink(package, subpkg) and
summaryModel(subpkg, type, subtypes, name, signature, ext, input, output, kind, provenance,
_)
)
)
}
/** Provides a query predicate to check the MaD models for validation errors. */
module ModelValidation {
private import codeql.dataflow.internal.AccessPathSyntax as AccessPathSyntax
private predicate getRelevantAccessPath(string path) {
summaryModel(_, _, _, _, _, _, path, _, _, _, _) or
summaryModel(_, _, _, _, _, _, _, path, _, _, _) or
sinkModel(_, _, _, _, _, _, path, _, _, _) or
sourceModel(_, _, _, _, _, _, path, _, _, _)
}
private module MkAccessPath = AccessPathSyntax::AccessPath<getRelevantAccessPath/1>;
class AccessPath = MkAccessPath::AccessPath;
class AccessPathToken = MkAccessPath::AccessPathToken;
private string getInvalidModelInput() {
exists(string pred, AccessPath input, AccessPathToken part |
sinkModel(_, _, _, _, _, _, input, _, _, _) and pred = "sink"
or
summaryModel(_, _, _, _, _, _, input, _, _, _, _) and pred = "summary"
|
(
invalidSpecComponent(input, part) and
not part = "" and
not (part = "Argument" and pred = "sink") and
not parseArg(part, _) and
not part.getName() = "Field"
or
part = input.getToken(0) and
parseParam(part, _)
or
invalidIndexComponent(input, part)
) and
result = "Unrecognized input specification \"" + part + "\" in " + pred + " model."
)
}
private string getInvalidModelOutput() {
exists(string pred, AccessPath output, AccessPathToken part |
sourceModel(_, _, _, _, _, _, output, _, _, _) and pred = "source"
or
summaryModel(_, _, _, _, _, _, _, output, _, _, _) and pred = "summary"
|
(
invalidSpecComponent(output, part) and
not part = "" and
not (part = ["Argument", "Parameter"] and pred = "source") and
not part.getName() = "Field"
or
invalidIndexComponent(output, part)
) and
result = "Unrecognized output specification \"" + part + "\" in " + pred + " model."
)
}
private module KindValConfig implements SharedModelVal::KindValidationConfigSig {
predicate summaryKind(string kind) { summaryModel(_, _, _, _, _, _, _, _, kind, _, _) }
predicate sinkKind(string kind) { sinkModel(_, _, _, _, _, _, _, kind, _, _) }
predicate sourceKind(string kind) { sourceModel(_, _, _, _, _, _, _, kind, _, _) }
predicate neutralKind(string kind) { neutralModel(_, _, _, _, kind, _) }
}
private module KindVal = SharedModelVal::KindValidation<KindValConfig>;
private string getInvalidModelSignature() {
exists(
string pred, string package, string type, string name, string signature, string ext,
string provenance
|
sourceModel(package, type, _, name, signature, ext, _, _, provenance, _) and pred = "source"
or
sinkModel(package, type, _, name, signature, ext, _, _, provenance, _) and pred = "sink"
or
summaryModel(package, type, _, name, signature, ext, _, _, _, provenance, _) and
pred = "summary"
or
neutralModel(package, type, name, signature, _, provenance) and
ext = "" and
pred = "neutral"
|
not package.regexpMatch("[a-zA-Z0-9_\\.]*") and
result = "Dubious package \"" + package + "\" in " + pred + " model."
or
not type.regexpMatch("[a-zA-Z0-9_\\$<>]+") and
result = "Dubious type \"" + type + "\" in " + pred + " model."
or
not name.regexpMatch("[a-zA-Z0-9_\\-]*") and
result = "Dubious name \"" + name + "\" in " + pred + " model."
or
not signature.regexpMatch("|\\([a-zA-Z0-9_\\.\\$<>,\\[\\]]*\\)") and
result = "Dubious signature \"" + signature + "\" in " + pred + " model."
or
not ext.regexpMatch("|Annotated") and
result = "Unrecognized extra API graph element \"" + ext + "\" in " + pred + " model."
or
invalidProvenance(provenance) and
result = "Unrecognized provenance description \"" + provenance + "\" in " + pred + " model."
)
}
/** Holds if some row in a MaD flow model appears to contain typos. */
query predicate invalidModelRow(string msg) {
msg =
[
getInvalidModelSignature(), getInvalidModelInput(), getInvalidModelOutput(),
KindVal::getInvalidModelKind()
]
}
}
pragma[nomagic]
private predicate elementSpec(
string package, string type, boolean subtypes, string name, string signature, string ext
) {
sourceModel(package, type, subtypes, name, signature, ext, _, _, _, _)
or
sinkModel(package, type, subtypes, name, signature, ext, _, _, _, _)
or
summaryModel(package, type, subtypes, name, signature, ext, _, _, _, _, _)
or
neutralModel(package, type, name, signature, _, _) and ext = "" and subtypes = true
}
private string getNestedName(Type t) {
not t instanceof RefType and result = t.toString()
or
not t.(Array).getElementType() instanceof NestedType and result = t.(RefType).getNestedName()
or
result =
t.(Array).getElementType().(NestedType).getEnclosingType().getNestedName() + "$" + t.getName()
}
private string getQualifiedName(Type t) {
not t instanceof RefType and result = t.toString()
or
result = t.(RefType).getQualifiedName()
or
exists(Array a, Type c | a = t and c = a.getElementType() |
not c instanceof RefType and result = t.toString()
or
exists(string pkgName | pkgName = c.(RefType).getPackage().getName() |
if pkgName = "" then result = getNestedName(a) else result = pkgName + "." + getNestedName(a)
)
)
}
/**
* Gets a parenthesized string containing all parameter types of this callable, separated by a comma.
*
* Returns an empty parenthesized string if the callable has no parameters.
* Parameter types are represented by their type erasure.
*/
cached
string paramsString(Callable c) {
result =
"(" + concat(int i | | getNestedName(c.getParameterType(i).getErasure()), "," order by i) + ")"
}
private string paramsStringQualified(Callable c) {
result =
"(" + concat(int i | | getQualifiedName(c.getParameterType(i).getErasure()), "," order by i) +
")"
}
private Element interpretElement0(
string package, string type, boolean subtypes, string name, string signature, boolean isExact
) {
elementSpec(package, type, subtypes, name, signature, _) and
(
exists(Member m |
(
result = m and isExact = true
or
subtypes = true and result.(SrcMethod).overridesOrInstantiates+(m) and isExact = false
) and
m.hasQualifiedName(package, type, name)
|
signature = ""
or
paramsStringQualified(m) = signature
or
paramsString(m) = signature
)
or
exists(RefType t |
t.hasQualifiedName(package, type) and
isExact = false and
(if subtypes = true then result.(SrcRefType).getASourceSupertype*() = t else result = t) and
name = "" and
signature = ""
)
)
}
/** Gets the source/sink/summary/neutral element corresponding to the supplied parameters. */
cached
Element interpretElement(
string package, string type, boolean subtypes, string name, string signature, string ext,
boolean isExact
) {
elementSpec(package, type, subtypes, name, signature, ext) and
exists(Element e, boolean isExact0 |
e = interpretElement0(package, type, subtypes, name, signature, isExact0)
|
ext = "" and result = e and isExact = isExact0
or
ext = "Annotated" and result.(Annotatable).getAnAnnotation().getType() = e and isExact = false
)
}
private predicate parseField(AccessPathToken c, FieldContent f) {
exists(
string fieldRegex, string qualifiedName, string package, string className, string fieldName
|
c.getName() = "Field" and
qualifiedName = c.getAnArgument() and
fieldRegex = "^(.*)\\.([^.]+)\\.([^.]+)$" and
package = qualifiedName.regexpCapture(fieldRegex, 1) and
className = qualifiedName.regexpCapture(fieldRegex, 2) and
fieldName = qualifiedName.regexpCapture(fieldRegex, 3) and
f.getField().hasQualifiedName(package, className, fieldName)
)
}
/** A string representing a synthetic instance field. */
class SyntheticField extends string {
SyntheticField() { parseSynthField(_, this) }
/**
* Gets the type of this field. The default type is `Object`, but this can be
* overridden.
*/
Type getType() { result instanceof TypeObject }
}
private predicate parseSynthField(AccessPathToken c, string f) {
c.getName() = "SyntheticField" and
f = c.getAnArgument()
}
/** Holds if the specification component parses as a `Content`. */
predicate parseContent(AccessPathToken component, Content content) {
parseField(component, content)
or
parseSynthField(component, content.(SyntheticFieldContent).getField())
or
component = "ArrayElement" and content instanceof ArrayContent
or
component = "Element" and content instanceof CollectionContent
or
component = "MapKey" and content instanceof MapKeyContent
or
component = "MapValue" and content instanceof MapValueContent
}
cached
private module Cached {
/**
* Holds if `node` is specified as a source with the given kind in a MaD flow
* model.
*/
cached
predicate sourceNode(Node node, string kind, string model) {
exists(SourceSinkInterpretationInput::InterpretNode n |
isSourceNode(n, kind, model) and n.asNode() = node
)
}
/**
* Holds if `node` is specified as a sink with the given kind in a MaD flow
* model.
*/
cached
predicate sinkNode(Node node, string kind, string model) {
exists(SourceSinkInterpretationInput::InterpretNode n |
isSinkNode(n, kind, model) and n.asNode() = node
)
}
}
import Cached
/**
* Holds if `node` is specified as a source with the given kind in a MaD flow
* model.
*/
predicate sourceNode(Node node, string kind) { sourceNode(node, kind, _) }
/**
* Holds if `node` is specified as a sink with the given kind in a MaD flow
* model.
*/
predicate sinkNode(Node node, string kind) { sinkNode(node, kind, _) }
// adapter class for converting Mad summaries to `SummarizedCallable`s
private class SummarizedCallableAdapter extends SummarizedCallable {
SummarizedCallableAdapter() { summaryElement(this, _, _, _, _, _, _) }
private predicate relevantSummaryElementManual(
string input, string output, string kind, string model
) {
exists(Provenance provenance |
summaryElement(this, input, output, kind, provenance, model, _) and
provenance.isManual()
)
}
private predicate relevantSummaryElementGenerated(
string input, string output, string kind, string model
) {
exists(Provenance provenance |
summaryElement(this, input, output, kind, provenance, model, _) and
provenance.isGenerated()
) and
not exists(Provenance provenance |
neutralElement(this, "summary", provenance, _) and
provenance.isManual()
)
}
override predicate propagatesFlow(
string input, string output, boolean preservesValue, string model
) {
exists(string kind |
this.relevantSummaryElementManual(input, output, kind, model)
or
not this.relevantSummaryElementManual(_, _, _, _) and
this.relevantSummaryElementGenerated(input, output, kind, model)
|
if kind = "value" then preservesValue = true else preservesValue = false
)
}
override predicate hasProvenance(Provenance provenance) {
summaryElement(this, _, _, _, provenance, _, _)
}
override predicate hasExactModel() { summaryElement(this, _, _, _, _, _, true) }
}
final class SinkCallable = SinkModelCallable;
final class SourceCallable = SourceModelCallable;