Simplify AtmConfig:

- We no longer create new configs for each query we want to boost with ATM.
- Instead the `AtmConfig` module imports the configs for the Java queries it can and copies the configs for the ones that are defined in a ql file.
- The predicates that used to be defined in the `AtmConfig` class are now defined either in the candidate extraction query or (in the case of `isKnownSink`, which is used in more than one file) in `EndpointCharacteristics.qll`.
- Delete all the derived classes of AtmConfig.
- Surface all candidates that pass the endpoint filters, regardless of flow from a source.
This commit is contained in:
tiferet
2023-02-27 16:20:53 -08:00
parent efb6522656
commit f5833ffc3d
8 changed files with 141 additions and 328 deletions

View File

@@ -1,162 +1,89 @@
/**
* For internal use only.
*
* Configures boosting for adaptive threat modeling (ATM).
* Collects the query configurations to boost with ATM. Imports the configurations of supported Java queries where
* possible. Java queries that are defined in a `.ql` file get copied into this file.
*/
private import java as java
private import semmle.code.java.dataflow.TaintTracking
import semmle.code.java.security.RequestForgeryConfig
import semmle.code.java.security.SqlInjectionQuery
import EndpointTypes
import EndpointCharacteristics as EndpointCharacteristics
/* Copied from java/ql/src/Security/CWE/CWE-022/TaintedPath.ql */
private import semmle.code.java.dataflow.ExternalFlow
private import semmle.code.java.security.PathCreation
private import semmle.code.java.security.PathSanitizer
/**
* EXPERIMENTAL. This API may change in the future.
*
* A configuration class for defining known endpoints and endpoint filters for adaptive threat
* modeling (ATM). Each boosted query must define its own extension of this abstract class.
*
* A configuration defines a set of known sources (`isKnownSource`) and sinks (`isKnownSink`).
* It must also define a sink endpoint filter (`isEffectiveSink`) that filters candidate sinks
* predicted by the machine learning model to a set of effective sinks.
*
* To get started with ATM, you can copy-paste an implementation of the relevant predicates from a
* `DataFlow::Configuration` or `TaintTracking::Configuration` class for a standard security query.
* For example, for SQL injection you can start by defining the `isKnownSource` and `isKnownSink`
* predicates in the ATM configuration by copying and pasting the implementations of `isSource` and
* `isSink` from `SqlInjection::Configuration`.
*
* Note that if the security query configuration defines additional edges beyond the standard data
* flow edges, such as `NosqlInjection::Configuration`, you may need to replace the definition of
* `isAdditionalFlowStep` with a more generalised definition of additional edges. See
* `NosqlInjectionATM.qll` for an example of doing this.
/*
* Configurations that are copied from Java queries because they can't be directly imported.
*/
abstract class AtmConfig extends TaintTracking::Configuration {
bindingset[this]
AtmConfig() { any() }
/**
* Holds if `source` is a relevant taint source. When sources are not boosted, `isSource` is equivalent to
* `isKnownSource` (i.e. there are no "effective" sources to be classified by an ML model).
*/
override predicate isSource(DataFlow::Node source) { this.isKnownSource(source) }
/* TaintedPathConfig cannot be imported directly since it is defined in a .ql file. It is therefore copied here. */
/* Copied from java/ql/src/Security/CWE/CWE-022/TaintedPath.ql */
class TaintedPathConfig extends TaintTracking::Configuration {
TaintedPathConfig() { this = "TaintedPathConfig" }
override predicate isSource(DataFlow::Node source) { source instanceof RemoteFlowSource }
/**
* Holds if `sink` is a known taint sink or an "effective" sink (a candidate to be classified by an ML model).
*/
override predicate isSink(DataFlow::Node sink) {
this.isKnownSink(sink) or this.isEffectiveSink(sink)
}
/**
* EXPERIMENTAL. This API may change in the future.
*
* Holds if `source` is a known source of flow.
*/
abstract predicate isKnownSource(DataFlow::Node source);
/**
* EXPERIMENTAL. This API may change in the future.
*
* Holds if `sink` is a known sink for this query.
*/
final predicate isKnownSink(DataFlow::Node sink) {
// If the list of characteristics includes positive indicators with maximal confidence for this class, then it's a
// known sink for the class.
isKnownSink(sink, this.getASinkEndpointType())
}
/**
* Holds if `sink` is a known sink for this query of type `sinkType`.
*/
final predicate isKnownSink(DataFlow::Node sink, EndpointType sinkType) {
sinkType = this.getASinkEndpointType() and
// If the list of characteristics includes positive indicators with maximal confidence for this class, then it's a
// known sink for the class.
exists(EndpointCharacteristics::EndpointCharacteristic characteristic |
characteristic.appliesToEndpoint(sink) and
characteristic.hasImplications(sinkType, true, characteristic.maximalConfidence())
)
}
/**
* EXPERIMENTAL. This API may change in the future.
*
* Holds if the candidate source `candidateSource` predicted by the machine learning model should be
* an effective source, i.e. one considered as a possible source of flow in the boosted query.
*/
predicate isEffectiveSource(DataFlow::Node candidateSource) { none() }
/**
* EXPERIMENTAL. This API may change in the future.
*
* Holds if the candidate sink `candidateSink` predicted by the machine learning model should be
* an effective sink, i.e. one considered as a possible sink of flow in the boosted query.
*/
predicate isEffectiveSink(DataFlow::Node candidateSink) {
not exists(this.getAReasonSinkExcluded(candidateSink))
}
/**
* Gets the list of characteristics that cause `candidateSink` to be excluded as an effective sink.
*/
final EndpointCharacteristics::EndpointCharacteristic getAReasonSinkExcluded(
DataFlow::Node candidateSink
) {
// An endpoint is an effective sink (sink candidate) if none of its characteristics give much indication whether or
// not it is a sink. Historically, we used endpoint filters, and scored endpoints that are filtered out neither by
// a standard endpoint filter nor by an endpoint filter specific to this sink type.
result.appliesToEndpoint(candidateSink) and
// Exclude endpoints that have a characteristic that implies they're not sinks for _any_ sink type.
exists(float confidence |
confidence >= result.mediumConfidence() and
result.hasImplications(any(NegativeSinkType negative), true, confidence)
)
sink.asExpr() = any(PathCreation p).getAnInput()
or
// Exclude endpoints that have a characteristic that implies they're not sinks for _this particular_ sink type,
// for every sink type relevant to this query.
not exists(EndpointType sinkType |
sinkType = this.getASinkEndpointType() and
not exists(float confidence |
confidence >= result.mediumConfidence() and
result.hasImplications(sinkType, false, confidence)
)
)
sinkNode(sink, "create-file")
}
/**
* EXPERIMENTAL. This API may change in the future.
*
* Get an endpoint type for the sources of this query. A query may have multiple applicable
* endpoint types for its sources.
*/
EndpointType getASourceEndpointType() { none() }
/**
* EXPERIMENTAL. This API may change in the future.
*
* Get all sink types that can be sinks for this query. A query may have multiple applicable
* endpoint types for its sinks.
*/
abstract EndpointType getASinkEndpointType();
pragma[inline]
predicate isFlowLikelyInBaseQuery(DataFlow::Node source, DataFlow::Node sink) {
this.isKnownSource(source) and this.isKnownSink(sink)
override predicate isSanitizer(DataFlow::Node sanitizer) {
sanitizer.getType() instanceof BoxedType or
sanitizer.getType() instanceof PrimitiveType or
sanitizer.getType() instanceof NumberType or
sanitizer instanceof PathInjectionSanitizer
}
/**
* Holds if `sink` is an effective sink with flow from `source` which gets used as a sink candidate for scoring
* with the ML model.
*/
predicate isSinkCandidateWithFlow(DataFlow::PathNode sink) {
exists(DataFlow::PathNode source |
// Note: In JavaScript there's no need to check `isEffectiveSink` here explicitly, because `hasFlowPath` calls `isSink` which
// requires an endpoint to be either a known sink or an effective sink. Known sinks are later filtered out by
// `isFlowLikelyInBaseQuery`, leaving only effective sinks.
this.hasFlowPath(source, sink) and
not this.isFlowLikelyInBaseQuery(source.getNode(), sink.getNode()) and
isEffectiveSink(sink.getNode()) and
not isKnownSink(sink.getNode()) // As long as we're not boosting sources this is already implicitly checked by `isFlowLikelyInBaseQuery`
override predicate isAdditionalTaintStep(DataFlow::Node n1, DataFlow::Node n2) {
any(TaintedPathAdditionalTaintStep s).step(n1, n2)
}
}
/* TaintedPathCommon cannot be imported directly due to the hyphen in `CWE-022`. It is therefore copied here. */
/* Copied from java/ql/src/Security/CWE/CWE-022/TaintedPathCommon.qll */
/**
 * A unit class for adding additional taint steps.
 *
 * Extend this class and override `step` to add additional taint steps that should apply to tainted path flow
 * configurations. `TaintedPathConfig.isAdditionalTaintStep` holds for every pair of nodes accepted by the `step`
 * predicate of any extension of this class.
 */
class TaintedPathAdditionalTaintStep extends Unit {
/** Holds if taint should additionally flow from `n1` to `n2`. */
abstract predicate step(DataFlow::Node n1, DataFlow::Node n2);
}
/**
 * The built-in additional taint step for tainted path configurations: taint flows from an argument that is
 * passed for a `TaintPreservingUriCtorParam` to the call that receives that argument.
 */
private class DefaultTaintedPathAdditionalTaintStep extends TaintedPathAdditionalTaintStep {
  override predicate step(DataFlow::Node n1, DataFlow::Node n2) {
    exists(Argument arg |
      // Only arguments in one of the taint-preserving constructor positions qualify.
      arg = any(TaintPreservingUriCtorParam param).getAnArgument() and
      n1.asExpr() = arg and
      n2.asExpr() = arg.getCall()
    )
  }
}
/**
 * A parameter of a constructor declared by `TypeUri`, at one of the positions enumerated below. Arguments passed
 * for these parameters are the ones `DefaultTaintedPathAdditionalTaintStep` propagates taint from.
 */
private class TaintPreservingUriCtorParam extends Parameter {
  TaintPreservingUriCtorParam() {
    exists(Constructor uriCtor, int pos, int arity |
      uriCtor.getDeclaringType() instanceof TypeUri and
      arity = uriCtor.getNumberOfParameters() and
      this = uriCtor.getParameter(pos)
    |
      // URI(String scheme, String ssp, String fragment)
      arity = 3 and pos = 1
      or
      // URI(String scheme, String host, String path, String fragment)
      arity = 4 and pos = [1, 2]
      or
      // URI(String scheme, String authority, String path, String query, String fragment)
      arity = 5 and pos = 2
      or
      // URI(String scheme, String userInfo, String host, int port, String path, String query, String fragment)
      arity = 7 and pos = 4
    )
  }
}

View File

@@ -11,12 +11,26 @@ private import semmle.code.java.dataflow.ExternalFlow
private import semmle.code.java.dataflow.internal.FlowSummaryImpl as FlowSummaryImpl
import experimental.adaptivethreatmodeling.EndpointTypes
private import experimental.adaptivethreatmodeling.ATMConfig
private import experimental.adaptivethreatmodeling.SqlInjectionATM
private import experimental.adaptivethreatmodeling.TaintedPathATM
private import experimental.adaptivethreatmodeling.RequestForgeryATM
private import semmle.code.java.security.ExternalAPIs as ExternalAPIs
private import semmle.code.java.Expr as Expr
/*
* Predicates that are used to surface prompt examples and candidates for classification with an ML model.
*/
/**
 * Holds if `sink` is a known sink of type `sinkType`.
 *
 * A node is a known sink for `sinkType` if at least one characteristic that applies to it is a positive indicator
 * for `sinkType` with that characteristic's maximal confidence. Negative sink types never have known sinks.
 */
predicate isKnownSink(DataFlow::Node sink, SinkType sinkType) {
  // Test class membership rather than `sinkType != any(NegativeSinkType negative)`: the latter only asks for *some*
  // negative sink type that differs from `sinkType`, so it excludes negative types only while exactly one exists.
  not sinkType instanceof NegativeSinkType and
  // If the list of characteristics includes positive indicators with maximal confidence for this class, then it's a
  // known sink for the class.
  exists(EndpointCharacteristics::EndpointCharacteristic characteristic |
    characteristic.appliesToEndpoint(sink) and
    characteristic.hasImplications(sinkType, true, characteristic.maximalConfidence())
  )
}
/**
* Holds if the given endpoint has a self-contradictory combination of characteristics. Detects errors in our endpoint
* characteristics. Lists the problematic characteristics and their implications for all such endpoints, together with
@@ -127,6 +141,10 @@ predicate hasMetadata(DataFlow::Node n, string metadata) {
)
}
/*
* EndpointCharacteristic classes.
*/
/**
* A set of characteristics that a particular endpoint might have. This set of characteristics is used to make decisions
* about whether to include the endpoint in the training set and with what label, as well as whether to score the
@@ -379,7 +397,7 @@ private class IsSanitizerCharacteristic extends NotASinkCharacteristic {
IsSanitizerCharacteristic() { this = "sanitizer" }
override predicate appliesToEndpoint(DataFlow::Node n) {
exists(AtmConfig config | config.isSanitizer(n))
exists(TaintTracking::Configuration config | config.isSanitizer(n))
}
}

View File

@@ -53,8 +53,3 @@ class TaintedPathSinkType extends SinkType {
class RequestForgerySinkType extends SinkType {
RequestForgerySinkType() { this = "ssrf" }
}
/** Other sinks modeled by a MaD `kind` but not belonging to any of the existing sink types. */
class OtherMaDSinkType extends SinkType {
OtherMaDSinkType() { this = "other-sink" }
}

View File

@@ -1,38 +0,0 @@
/**
* For internal use only.
*
* A taint-tracking configuration for reasoning about SSRF (server side request forgery) vulnerabilities.
* Largely copied from java/ql/lib/semmle/code/java/security/RequestForgeryConfig.qll.
*
* Only import this directly from .ql files, to avoid the possibility of polluting the Configuration hierarchy
* accidentally.
*/
import ATMConfig
import semmle.code.java.dataflow.FlowSources
import semmle.code.java.security.RequestForgery
/**
 * The ATM configuration for the SSRF (server side request forgery) query. Its sinks are of type
 * `RequestForgerySinkType`.
 */
class RequestForgeryAtmConfig extends AtmConfig {
  RequestForgeryAtmConfig() { this = "RequestForgeryAtmConfig" }

  override EndpointType getASinkEndpointType() { result instanceof RequestForgerySinkType }

  override predicate isKnownSource(DataFlow::Node source) {
    // Exclude results of remote HTTP requests: fetching something else based on that result
    // is no worse than following a redirect returned by the remote server, and typically
    // we're requesting a resource via https which we trust to only send us to safe URLs.
    not source.asExpr().(MethodAccess).getCallee() instanceof UrlConnectionGetInputStreamMethod and
    source instanceof RemoteFlowSource
  }

  /*
   * This is largely a copy of the taint tracking configuration for the standard SSRF
   * query, except additional sinks have been added using the sink endpoint filter.
   */

  override predicate isSanitizer(DataFlow::Node node) { node instanceof RequestForgerySanitizer }

  override predicate isAdditionalTaintStep(DataFlow::Node pred, DataFlow::Node succ) {
    exists(RequestForgeryAdditionalTaintStep extraStep | extraStep.propagatesTaint(pred, succ))
  }
}

View File

@@ -1,36 +0,0 @@
/**
* For internal use only.
*
* A taint-tracking configuration for reasoning about SQL injection vulnerabilities.
* Defines shared code used by the SQL injection boosted query.
* Largely copied from semmle.code.java.security.SqlInjectionQuery.
*/
import ATMConfig
import semmle.code.java.dataflow.FlowSources
import semmle.code.java.security.QueryInjection
/**
 * The ATM configuration for the SQL injection query. Known sources are remote flow sources, and sinks are of type
 * `SqlSinkType`.
 */
class SqlInjectionAtmConfig extends AtmConfig {
  SqlInjectionAtmConfig() { this = "SqlInjectionAtmConfig" }

  override predicate isKnownSource(DataFlow::Node source) { source instanceof RemoteFlowSource }

  // The previous body was `result instanceof SqlSinkType or result instanceof SqlSinkType`; both disjuncts were
  // identical, so a single test has the same meaning.
  // NOTE(review): if the second disjunct was meant to name a different sink type, add it here.
  override EndpointType getASinkEndpointType() { result instanceof SqlSinkType }

  /*
   * This is largely a copy of the taint tracking configuration for the standard SQL injection
   * query, except additional sinks have been added using the sink endpoint filter.
   */

  override predicate isSanitizer(DataFlow::Node node) {
    node.getType() instanceof PrimitiveType or
    node.getType() instanceof BoxedType or
    node.getType() instanceof NumberType
  }

  override predicate isAdditionalTaintStep(DataFlow::Node node1, DataFlow::Node node2) {
    any(AdditionalQueryInjectionTaintStep s).step(node1, node2)
  }
}

View File

@@ -1,82 +0,0 @@
/**
* For internal use only.
*
* A taint-tracking configuration for reasoning about path injection vulnerabilities.
* Defines shared code used by the path injection boosted query.
* Largely copied from java/ql/src/Security/CWE/CWE-022/TaintedPath.ql.
*/
import java
import semmle.code.java.security.PathSanitizer
import ATMConfig
import semmle.code.java.dataflow.FlowSources
/**
 * The ATM configuration for the path injection query. Known sources are remote flow sources, and sinks are of
 * type `TaintedPathSinkType`.
 */
class TaintedPathAtmConfig extends AtmConfig {
  TaintedPathAtmConfig() { this = "TaintedPathAtmConfig" }

  override EndpointType getASinkEndpointType() { result instanceof TaintedPathSinkType }

  override predicate isKnownSource(DataFlow::Node source) { source instanceof RemoteFlowSource }

  /*
   * This is largely a copy of the taint tracking configuration for the standard path injection
   * query, except additional ATM sinks have been added to the `isSink` predicate.
   */

  override predicate isSanitizer(DataFlow::Node sanitizer) {
    sanitizer instanceof PathInjectionSanitizer
    or
    // Nodes of numeric, boxed or primitive type are treated as sanitizers.
    exists(Type nodeType | nodeType = sanitizer.getType() |
      nodeType instanceof NumberType or
      nodeType instanceof PrimitiveType or
      nodeType instanceof BoxedType
    )
  }

  override predicate isAdditionalTaintStep(DataFlow::Node n1, DataFlow::Node n2) {
    exists(TaintedPathAdditionalTaintStep extraStep | extraStep.step(n1, n2))
  }
}
/*
* Models a very basic guard for the tainted path queries.
* TODO: Copied from java/ql/src/Security/CWE/CWE-022/TaintedPathCommon.qll because I couldn't figure out how to import it.
*/
/**
 * A unit class for adding additional taint steps.
 *
 * Extend this class and override `step` to add additional taint steps that should apply to tainted path flow
 * configurations. `TaintedPathAtmConfig.isAdditionalTaintStep` holds for every pair of nodes accepted by the `step`
 * predicate of any extension of this class.
 */
class TaintedPathAdditionalTaintStep extends Unit {
/** Holds if taint should additionally flow from `n1` to `n2`. */
abstract predicate step(DataFlow::Node n1, DataFlow::Node n2);
}
/**
 * The built-in additional taint step for tainted path configurations: taint flows from an argument that is
 * passed for a `TaintPreservingUriCtorParam` to the call that receives that argument.
 */
private class DefaultTaintedPathAdditionalTaintStep extends TaintedPathAdditionalTaintStep {
  override predicate step(DataFlow::Node n1, DataFlow::Node n2) {
    exists(Argument arg |
      // Only arguments in one of the taint-preserving constructor positions qualify.
      arg = any(TaintPreservingUriCtorParam param).getAnArgument() and
      n1.asExpr() = arg and
      n2.asExpr() = arg.getCall()
    )
  }
}
/**
 * A parameter of a constructor declared by `TypeUri`, at one of the positions enumerated below. Arguments passed
 * for these parameters are the ones `DefaultTaintedPathAdditionalTaintStep` propagates taint from.
 */
private class TaintPreservingUriCtorParam extends Parameter {
  TaintPreservingUriCtorParam() {
    exists(Constructor uriCtor, int pos, int arity |
      uriCtor.getDeclaringType() instanceof TypeUri and
      arity = uriCtor.getNumberOfParameters() and
      this = uriCtor.getParameter(pos)
    |
      // URI(String scheme, String ssp, String fragment)
      arity = 3 and pos = 1
      or
      // URI(String scheme, String host, String path, String fragment)
      arity = 4 and pos = [1, 2]
      or
      // URI(String scheme, String authority, String path, String query, String fragment)
      arity = 5 and pos = 2
      or
      // URI(String scheme, String userInfo, String host, int port, String path, String query, String fragment)
      arity = 7 and pos = 4
    )
  }
}

View File

@@ -13,9 +13,6 @@ private import semmle.code.java.security.ExternalAPIs as ExternalAPIs
private import experimental.adaptivethreatmodeling.EndpointCharacteristics as EndpointCharacteristics
private import experimental.adaptivethreatmodeling.EndpointTypes
private import experimental.adaptivethreatmodeling.ATMConfig as AtmConfig
private import experimental.adaptivethreatmodeling.SqlInjectionATM as SqlInjectionAtm
private import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
private import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestForgeryAtm
/*
* ****** WARNING: ******
@@ -23,18 +20,18 @@ private import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestF
* the ML-generated, noisy sinks will end up polluting the positive examples used in the prompt!
*/
from DataFlow::Node sink, AtmConfig::AtmConfig config, EndpointType sinkType, string message
from DataFlow::Node sink, EndpointType sinkType, string message
where
// If there are _any_ erroneous endpoints, return nothing. This will prevent us from accidentally running this query
// when there's a codex-generated data extension file in `java/ql/lib/ext`.
not EndpointCharacteristics::erroneousEndpoints(_, _, _, _, _) and
// Extract positive examples of sinks belonging to the existing ATM query configurations.
(
config.isKnownSink(sink, sinkType) and
EndpointCharacteristics::isKnownSink(sink, sinkType) and
// It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be
// treated by the actual query as a sanitizer, since the final logic is something like
// `isSink(n) and not isSanitizer(n)`. We don't want to include such nodes as positive examples in the prompt.
not config.isSanitizer(sink) and
not exists(TaintTracking::Configuration config | config.isSanitizer(sink)) and
// Include only sinks that are arguments to an external API call, because these are the sinks we are most interested
// in.
sink instanceof ExternalAPIs::ExternalApiDataNode and

View File

@@ -1,6 +1,6 @@
/**
* Surfaces the endpoints that pass the endpoint filters and have flow from a source for each query config, and are
* therefore used as candidates for classification with an ML model.
* Surfaces the endpoints that pass the endpoint filters and are not already known to be sinks, and are therefore used
* as candidates for classification with an ML model.
*
* Note: This query does not actually classify the endpoints using the model.
*
@@ -17,30 +17,62 @@ private import semmle.code.java.dataflow.ExternalFlow
private import experimental.adaptivethreatmodeling.EndpointCharacteristics as EndpointCharacteristics
private import experimental.adaptivethreatmodeling.EndpointTypes
private import experimental.adaptivethreatmodeling.ATMConfig as AtmConfig
private import experimental.adaptivethreatmodeling.SqlInjectionATM as SqlInjectionAtm
private import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
private import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestForgeryAtm
from DataFlow::Node sink, string message
/**
 * Holds if the candidate sink `candidateSink` should be considered as a possible sink of type `sinkType`, and
 * classified by the ML model. A candidate sink is a node that cannot be excluded from `sinkType` based on its
 * characteristics, i.e. one for which `getAReasonSinkExcluded` has no result. Never holds for negative sink types.
 */
predicate isEffectiveSink(DataFlow::Node candidateSink, SinkType sinkType) {
  // Test class membership rather than `sinkType != any(NegativeSinkType negative)`: the latter only asks for *some*
  // negative sink type that differs from `sinkType`, so it excludes negative types only while exactly one exists.
  not sinkType instanceof NegativeSinkType and
  not exists(getAReasonSinkExcluded(candidateSink, sinkType))
}
/**
 * Gets a characteristic that causes `candidateSink` to be excluded as an effective sink for the sink type
 * `sinkType`.
 *
 * The result always applies to `candidateSink`, and with at least medium confidence it either is a positive
 * indicator for a negative sink type, or is a negative indicator for `sinkType` itself. There are no results for
 * negative sink types.
 */
EndpointCharacteristics::EndpointCharacteristic getAReasonSinkExcluded(
  DataFlow::Node candidateSink, SinkType sinkType
) {
  // An endpoint is a sink candidate if none of its characteristics give much indication whether or not it is a sink.
  //
  // Test class membership rather than `sinkType != any(NegativeSinkType negative)`: the latter only asks for *some*
  // negative sink type that differs from `sinkType`, so it excludes negative types only while exactly one exists.
  not sinkType instanceof NegativeSinkType and
  result.appliesToEndpoint(candidateSink) and
  // Both alternatives sit inside one `exists`, so each is conjoined with the two constraints above. Written as a
  // bare `... and A or B`, `and` binds tighter than `or` and `B` would hold for every node, whether or not the
  // characteristic applies to it.
  exists(float confidence | confidence >= result.mediumConfidence() |
    // Exclude endpoints that have a characteristic that implies they're not sinks for _any_ sink type.
    result.hasImplications(any(NegativeSinkType negative), true, confidence)
    or
    // Exclude endpoints that have a characteristic that implies they're not sinks for _this particular_ sink type.
    result.hasImplications(sinkType, false, confidence)
  )
}
from DataFlow::Node sinkCandidate, string message
where
// If a node is already a known sink for any of our existing ATM queries and is already modeled as a MaD sink, we
// don't include it as a candidate. Otherwise, we might include it as a candidate for query A, but the model will
// label it as a sink for one of the sink types of query B, for which it's already a known sink. This would result in
// overlap between our detected sinks and the pre-existing modeling. We assume that, if a sink has already been
// modeled in a MaD model, then it doesn't belong to any additional sink types, and we don't need to reexamine it.
not exists(AtmConfig::AtmConfig config, string kind |
config.isKnownSink(sink) and
sinkNode(sink, kind)
not exists(string kind |
sinkNode(sinkCandidate, kind)
// and EndpointCharacteristics::isKnownSink(sinkCandidate, sinkType) and kind = sinkType.getKind() // TODO: Uncomment this line once our sink types indeed correspond to MaD `kind`s.
) and
// The message is the concatenation of all relevant configs, and we surface only sinks that have at least one relevant
// config.
// The message is the concatenation of all sink types for which this endpoint is known neither to be a sink nor to be
// a non-sink, and we surface only endpoints that have at least one such sink type.
message =
strictconcat(AtmConfig::AtmConfig config, DataFlow::PathNode sinkPathNode |
config.isSinkCandidateWithFlow(sinkPathNode) and
sinkPathNode.getNode() = sink
strictconcat(SinkType sinkType |
not EndpointCharacteristics::isKnownSink(sinkCandidate, sinkType) and
isEffectiveSink(sinkCandidate, sinkType)
|
config, ", "
sinkType + ", "
) + "\n" +
// Extract the needed metadata for this endpoint.
any(string metadata | EndpointCharacteristics::hasMetadata(sink, metadata))
select sink, message
any(string metadata | EndpointCharacteristics::hasMetadata(sinkCandidate, metadata))
select sinkCandidate, message