Compare commits

...

1 Commits

Author SHA1 Message Date
Jean Helie
21d258fa37 wip 2023-03-01 13:17:01 +01:00
52 changed files with 2652 additions and 8 deletions

View File

@@ -8,16 +8,16 @@ provide:
- "cpp/ql/test/query-tests/Security/CWE/CWE-190/semmle/tainted/qlpack.yml"
- "go/ql/config/legacy-support/qlpack.yml"
- "go/build/codeql-extractor-go/codeql-extractor.yml"
- "javascript/ql/experimental/adaptivethreatmodeling/lib/qlpack.yml"
- "*/ql/experimental/adaptivethreatmodeling/lib/qlpack.yml"
# This pack is explicitly excluded from the workspace since most users
# will want to use a version of this pack from the package cache. Internal
# users can uncomment the following line and place a custom ML model
# in the corresponding pack to test a custom ML model within their local
# checkout.
# - "javascript/ql/experimental/adaptivethreatmodeling/model/qlpack.yml"
- "javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/qlpack.yml"
- "javascript/ql/experimental/adaptivethreatmodeling/src/qlpack.yml"
- "javascript/ql/experimental/adaptivethreatmodeling/test/qlpack.yml"
- "*/ql/experimental/adaptivethreatmodeling/model/qlpack.yml"
- "*/ql/experimental/adaptivethreatmodeling/modelbuilding/qlpack.yml"
- "*/ql/experimental/adaptivethreatmodeling/src/qlpack.yml"
- "*/ql/experimental/adaptivethreatmodeling/test/qlpack.yml"
- "csharp/ql/campaigns/Solorigate/lib/qlpack.yml"
- "csharp/ql/campaigns/Solorigate/src/qlpack.yml"
- "csharp/ql/campaigns/Solorigate/test/qlpack.yml"

View File

@@ -0,0 +1,161 @@
/**
* For internal use only.
*
* Configures boosting for adaptive threat modeling (ATM).
*/
private import java as java
private import semmle.code.java.dataflow.TaintTracking
import EndpointTypes
import EndpointCharacteristics as EndpointCharacteristics
import AdaptiveThreatModeling::ATM::ResultsInfo as AtmResultsInfo
/**
* EXPERIMENTAL. This API may change in the future.
*
* A configuration class for defining known endpoints and endpoint filters for adaptive threat
* modeling (ATM). Each boosted query must define its own extension of this abstract class.
*
* A configuration defines a set of known sources (`isKnownSource`) and sinks (`isKnownSink`).
* It must also define a sink endpoint filter (`isEffectiveSink`) that filters candidate sinks
* predicted by the machine learning model to a set of effective sinks.
*
* To get started with ATM, you can copy-paste an implementation of the relevant predicates from a
* `DataFlow::Configuration` or `TaintTracking::Configuration` class for a standard security query.
* For example, for SQL injection you can start by defining the `isKnownSource` and `isKnownSink`
* predicates in the ATM configuration by copying and pasting the implementations of `isSource` and
* `isSink` from `SqlInjection::Configuration`.
*
* Note that if the security query configuration defines additional edges beyond the standard data
* flow edges, such as `NosqlInjection::Configuration`, you may need to replace the definition of
* `isAdditionalFlowStep` with a more generalised definition of additional edges. See
* `NosqlInjectionATM.qll` for an example of doing this.
*/
abstract class AtmConfig extends TaintTracking::Configuration {
  bindingset[this]
  AtmConfig() { any() }

  /**
   * Holds if `source` is a relevant taint source. When sources are not boosted, `isSource` is equivalent to
   * `isKnownSource` (i.e. there are no "effective" sources to be classified by an ML model).
   */
  override predicate isSource(DataFlow::Node source) { this.isKnownSource(source) }

  /**
   * Holds if `sink` is a known taint sink or an "effective" sink (a candidate to be classified by an ML model).
   */
  override predicate isSink(DataFlow::Node sink) {
    this.isKnownSink(sink) or this.isEffectiveSink(sink)
  }

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Holds if `source` is a known source of flow.
   *
   * Holds for nothing by default; boosted queries override this with the `isSource` logic of the
   * corresponding standard security query.
   */
  predicate isKnownSource(DataFlow::Node source) { none() }

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Holds if `sink` is a known sink of flow.
   */
  final predicate isKnownSink(DataFlow::Node sink) {
    // If the list of characteristics includes positive indicators with maximal confidence for this class, then it's a
    // known sink for the class.
    exists(EndpointCharacteristics::EndpointCharacteristic characteristic |
      characteristic.appliesToEndpoint(sink) and
      characteristic
          .hasImplications(this.getASinkEndpointType(), true, characteristic.maximalConfidence())
    )
  }

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Holds if the candidate source `candidateSource` predicted by the machine learning model should be
   * an effective source, i.e. one considered as a possible source of flow in the boosted query.
   *
   * Holds for nothing by default (source boosting is not enabled by default).
   */
  predicate isEffectiveSource(DataFlow::Node candidateSource) { none() }

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Holds if the candidate sink `candidateSink` predicted by the machine learning model should be
   * an effective sink, i.e. one considered as a possible sink of flow in the boosted query.
   */
  predicate isEffectiveSink(DataFlow::Node candidateSink) {
    not exists(this.getAReasonSinkExcluded(candidateSink))
  }

  /**
   * Gets the list of characteristics that cause `candidateSink` to be excluded as an effective sink.
   */
  final EndpointCharacteristics::EndpointCharacteristic getAReasonSinkExcluded(
    DataFlow::Node candidateSink
  ) {
    // An endpoint is an effective sink (sink candidate) if none of its characteristics give much indication whether or
    // not it is a sink. Historically, we used endpoint filters, and scored endpoints that are filtered out neither by
    // a standard endpoint filter nor by an endpoint filter specific to this sink type.
    exists(EndpointCharacteristics::EndpointCharacteristic filter, float confidence |
      filter.appliesToEndpoint(candidateSink) and
      // Only medium-or-higher confidence indicators are strong enough to exclude a candidate.
      confidence >= filter.mediumConfidence() and
      (
        // Exclude endpoints that have a characteristic that implies they're not sinks for _any_ sink type.
        filter.hasImplications(any(NegativeType negative), true, confidence)
        or
        // Exclude endpoints that have a characteristic that implies they're not sinks for _this particular_ sink type.
        filter.hasImplications(this.getASinkEndpointType(), false, confidence)
      ) and
      result = filter
    )
  }

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Get an endpoint type for the sources of this query. A query may have multiple applicable
   * endpoint types for its sources.
   */
  EndpointType getASourceEndpointType() { none() }

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Get an endpoint type for the sinks of this query. A query may have multiple applicable
   * endpoint types for its sinks.
   */
  abstract EndpointType getASinkEndpointType();

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Specifies the default cut-off value that controls how many alerts are produced.
   * The cut-off value must be in the range [0,1].
   * A cut-off value of 0 only produces alerts that are likely true-positives.
   * A cut-off value of 1 produces all alerts including those that are likely false-positives.
   */
  float getScoreCutoff() { result = 0.0 }

  /**
   * Holds if there's an ATM alert (a flow path from `source` to `sink` with ML-determined likelihood `score`) according
   * to this ML-boosted configuration, whereas the unboosted base query does not contain this source and sink
   * combination.
   */
  predicate hasBoostedFlowPath(DataFlow::PathNode source, DataFlow::PathNode sink, float score) {
    this.hasFlowPath(source, sink) and
    not AtmResultsInfo::isFlowLikelyInBaseQuery(source.getNode(), sink.getNode()) and
    score = AtmResultsInfo::getScoreForFlow(source.getNode(), sink.getNode())
  }

  /**
   * Holds if `sink` is an effective sink with flow from `source` which gets used as a sink candidate for scoring
   * with the ML model.
   */
  predicate isSinkCandidateWithFlow(DataFlow::PathNode sink) {
    exists(DataFlow::PathNode source |
      this.hasFlowPath(source, sink) and
      // Exclude flows the base query would already report; only new candidates are scored.
      not AtmResultsInfo::isFlowLikelyInBaseQuery(source.getNode(), sink.getNode())
    )
  }
}

View File

@@ -0,0 +1,124 @@
/**
* For internal use only.
*
* Provides information about the results of boosted queries for use in adaptive threat modeling (ATM).
*/
private import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
import ATMConfig
private import BaseScoring
private import EndpointScoring as EndpointScoring
module ATM {
  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * This module contains informational predicates about the results returned by adaptive threat
   * modeling (ATM).
   */
  module ResultsInfo {
    /**
     * Indicates whether the flow from source to sink represents a result with
     * sufficiently high likelihood of being a true-positive.
     */
    pragma[inline]
    private predicate shouldResultBeIncluded(DataFlow::Node source, DataFlow::Node sink) {
      any(ScoringResults results).shouldResultBeIncluded(source, sink)
    }

    /**
     * EXPERIMENTAL. This API may change in the future.
     *
     * Returns the score for the flow between the source `source` and the `sink` sink in the
     * boosted query.
     */
    pragma[inline]
    float getScoreForFlow(DataFlow::Node source, DataFlow::Node sink) {
      any(DataFlow::Configuration cfg).hasFlow(source, sink) and
      shouldResultBeIncluded(source, sink) and
      // `unique` guards against a scoring model producing several scores for the same flow.
      result = unique(float s | s = any(ScoringResults results).getScoreForFlow(source, sink))
    }

    /**
     * Pad a score returned from `getKnownScoreForFlow` to a particular length by adding a decimal
     * point if one does not already exist, and "0"s after that decimal point.
     *
     * Note that this predicate must itself define an upper bound on `length`, so that it has a
     * finite number of results. Currently this is defined as 12.
     */
    private string paddedScore(float score, int length) {
      // In this definition, we must restrict the values that `length` and `score` can take on so
      // that the predicate has a finite number of results.
      (score = getScoreForFlow(_, _) or score = 0) and
      length = result.length() and
      (
        // We need to make sure the padded score contains a "." so lexically sorting the padded
        // scores is equivalent to numerically sorting the scores.
        score.toString().charAt(_) = "." and
        result = score.toString()
        or
        not score.toString().charAt(_) = "." and
        result = score.toString() + "."
      )
      or
      // Recursive case: append one "0" at a time, bounded at length 12 to keep the relation finite.
      result = paddedScore(score, length - 1) + "0" and
      length <= 12
    }

    /**
     * EXPERIMENTAL. This API may change in the future.
     *
     * Return a string representing the score of the flow between `source` and `sink` in the
     * boosted query.
     *
     * The returned string is a fixed length, such that lexically sorting the strings returned by
     * this predicate gives the same sort order as numerically sorting the scores of the flows.
     */
    pragma[inline]
    string getScoreStringForFlow(DataFlow::Node source, DataFlow::Node sink) {
      exists(float score |
        score = getScoreForFlow(source, sink) and
        (
          // A length of 12 is equivalent to 10 decimal places.
          score.toString().length() >= 12 and
          result = score.toString().substring(0, 12)
          or
          score.toString().length() < 12 and
          result = paddedScore(score, 12)
        )
      )
    }

    /**
     * EXPERIMENTAL. This API may change in the future.
     *
     * Indicates whether the flow from source to sink is likely to be reported by the base security
     * query.
     *
     * Currently this is a heuristic: it ignores potential differences in the definitions of
     * additional flow steps.
     */
    pragma[inline]
    predicate isFlowLikelyInBaseQuery(DataFlow::Node source, DataFlow::Node sink) {
      getCfg().isKnownSource(source) and getCfg().isKnownSink(sink)
    }

    /**
     * EXPERIMENTAL. This API may change in the future.
     *
     * Get additional information about why ATM included the flow from source to sink as an alert.
     */
    pragma[inline]
    string getAdditionalAlertInfo(DataFlow::Node source, DataFlow::Node sink) {
      exists(string sourceOrigins, string sinkOrigins |
        sourceOrigins = concat(any(ScoringResults results).getASourceOrigin(source), ", ") and
        sinkOrigins = concat(any(ScoringResults results).getASinkOrigin(sink), ", ") and
        // Fall back to "unknown" when a scoring model reports no origin for an endpoint.
        result =
          "[Source origins: " +
            any(string s | if sourceOrigins != "" then s = sourceOrigins else s = "unknown") +
            "; sink origins: " +
            any(string s | if sinkOrigins != "" then s = sinkOrigins else s = "unknown") + "]"
      )
    }
  }
}

View File

@@ -0,0 +1,55 @@
/**
* For internal use only.
*
* Provides shared scoring functionality for use in adaptive threat modeling (ATM).
*/
private import java
private import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
private import ATMConfig
/**
 * An external predicate enumerating the available ML models, identified by `modelChecksum` and
 * described by `modelLanguage`, `modelName` and `modelType`.
 *
 * NOTE(review): this relation is populated from outside the QL program; the exact row semantics
 * cannot be confirmed from this file.
 */
external predicate availableMlModels(
  string modelChecksum, string modelLanguage, string modelName, string modelType
);

/** Get the ATM configuration. Holds for any `AtmConfig` instance in scope. */
AtmConfig getCfg() { any() }
/**
 * A string containing scoring information produced by a scoring model.
 *
 * Scoring models include embedding models and endpoint scoring models.
 */
abstract class ScoringResults extends string {
  bindingset[this]
  ScoringResults() { any() }

  /**
   * Get ATM's confidence that a path between `source` and `sink` represents a security
   * vulnerability. This will be a number between 0.0 and 1.0.
   */
  abstract float getScoreForFlow(DataFlow::Node source, DataFlow::Node sink);

  /**
   * Get a string representing why ATM included the given source in the dataflow analysis.
   *
   * In general, there may be multiple reasons why ATM included the given source, in which case
   * this predicate should have multiple results.
   */
  abstract string getASourceOrigin(DataFlow::Node source);

  /**
   * Get a string representing why ATM included the given sink in the dataflow analysis.
   *
   * In general, there may be multiple reasons why ATM included the given sink, in which case this
   * predicate should have multiple results.
   */
  abstract string getASinkOrigin(DataFlow::Node sink);

  /**
   * Indicates whether the flow from source to sink represents a result with
   * sufficiently high likelihood of being a true-positive.
   */
  pragma[inline]
  abstract predicate shouldResultBeIncluded(DataFlow::Node source, DataFlow::Node sink);
}

View File

@@ -0,0 +1,607 @@
/**
* For internal use only.
*/
private import java as java
import semmle.code.java.dataflow.TaintTracking
import semmle.code.java.security.QueryInjection
import semmle.code.java.security.PathCreation
import semmle.code.java.security.RequestForgery
private import semmle.code.java.dataflow.ExternalFlow
import experimental.adaptivethreatmodeling.EndpointTypes
private import experimental.adaptivethreatmodeling.ATMConfig
private import experimental.adaptivethreatmodeling.SqlTaintedATM
private import experimental.adaptivethreatmodeling.TaintedPathATM
private import experimental.adaptivethreatmodeling.RequestForgeryATM
private import semmle.code.java.security.ExternalAPIs as ExternalAPIs
private import semmle.code.java.Expr as Expr
/**
 * Holds if the given endpoint has a self-contradictory combination of characteristics. Detects errors in our endpoint
 * characteristics. Lists the problematic characteristics and their implications for all such endpoints, together with
 * an error message indicating why this combination is problematic.
 *
 * Copied from javascript/ql/experimental/adaptivethreatmodeling/test/endpoint_large_scale/ContradictoryEndpointCharacteristics.ql
 */
query predicate erroneousEndpoints(
  DataFlow::Node endpoint, EndpointCharacteristic characteristic, EndpointType endpointClass,
  float confidence, string errorMessage
) {
  // An endpoint's characteristics should not include positive indicators with medium/high confidence for more than one
  // class.
  exists(EndpointCharacteristic characteristic2, EndpointType endpointClass2, float confidence2 |
    endpointClass.getEncoding() != endpointClass2.getEncoding() and
    characteristic.appliesToEndpoint(endpoint) and
    characteristic2.appliesToEndpoint(endpoint) and
    characteristic.hasImplications(endpointClass, true, confidence) and
    characteristic2.hasImplications(endpointClass2, true, confidence2) and
    confidence > characteristic.mediumConfidence() and
    confidence2 > characteristic2.mediumConfidence() and
    // It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be
    // treated by the actual query as a sanitizer, since the final logic is something like
    // `isSink(n) and not isSanitizer(n)`.
    not (
      characteristic instanceof IsSanitizerCharacteristic or
      characteristic2 instanceof IsSanitizerCharacteristic
    )
  ) and
  errorMessage = "Endpoint has high-confidence positive indicators for multiple classes"
  or
  // An endpoint's characteristics should not include positive indicators with medium/high confidence for some class and
  // also include negative indicators with medium/high confidence for this same class.
  exists(EndpointCharacteristic characteristic2, float confidence2 |
    characteristic.appliesToEndpoint(endpoint) and
    characteristic2.appliesToEndpoint(endpoint) and
    characteristic.hasImplications(endpointClass, true, confidence) and
    characteristic2.hasImplications(endpointClass, false, confidence2) and
    confidence > characteristic.mediumConfidence() and
    confidence2 > characteristic2.mediumConfidence()
  ) and
  errorMessage = "Endpoint has high-confidence positive and negative indicators for the same class"
}
/**
 * Holds if `characteristic` has an indicator whose `confidence` lies outside the legal range
 * [0, 1], together with an `errorMessage` describing the problem. Detects errors in our endpoint
 * characteristics.
 */
query predicate erroneousConfidences(
  EndpointCharacteristic characteristic, float confidence, string errorMessage
) {
  characteristic.hasImplications(_, _, confidence) and
  (confidence < 0 or confidence > 1) and
  errorMessage = "Characteristic has an indicator with confidence outside of [0, 1]"
}
/**
 * A set of characteristics that a particular endpoint might have. This set of characteristics is used to make decisions
 * about whether to include the endpoint in the training set and with what label, as well as whether to score the
 * endpoint at inference time.
 */
abstract class EndpointCharacteristic extends string {
  /**
   * Holds when the string matches the name of the characteristic, which should describe some characteristic of the
   * endpoint that is meaningful for determining whether it's a sink and if so of which type.
   */
  bindingset[this]
  EndpointCharacteristic() { any() }

  /**
   * Holds for endpoints that have this characteristic. This predicate contains the logic that applies characteristics
   * to the appropriate set of dataflow nodes.
   */
  abstract predicate appliesToEndpoint(DataFlow::Node n);

  /**
   * This predicate describes what the characteristic tells us about an endpoint.
   *
   * Params:
   * endpointClass: The sink type. Each EndpointType has a predicate getEncoding, which specifies the classifier
   * class for this sink type. Class 0 is the negative class (non-sink). Each positive int corresponds to a single
   * sink type.
   * isPositiveIndicator: If true, this characteristic indicates that this endpoint _is_ a member of the class; if
   * false, it indicates that it _isn't_ a member of the class.
   * confidence: A float in [0, 1], which tells us how strong an indicator this characteristic is for the endpoint
   * belonging / not belonging to the given class. A confidence near zero means this characteristic is a very weak
   * indicator of whether or not the endpoint belongs to the class. A confidence of 1 means that all endpoints with
   * this characteristic definitively do/don't belong to the class.
   */
  abstract predicate hasImplications(
    EndpointType endpointClass, boolean isPositiveIndicator, float confidence
  );

  /** Indicators with confidence at or above this threshold are considered to be high-confidence indicators. */
  final float getHighConfidenceThreshold() { result = 0.8 }

  // The following are some confidence values that are used in practice by the subclasses. They are defined as named
  // constants here to make it easier to change them in the future.
  /** The confidence value for indicators that are definitive (see `hasImplications`). */
  final float maximalConfidence() { result = 1.0 }

  /** The confidence value for strong, but not definitive, indicators. */
  final float highConfidence() { result = 0.9 }

  /** The confidence value for moderate indicators. */
  final float mediumConfidence() { result = 0.6 }
}
//----------------------------------------------------------------------------------------------------------------------
// Helper predicates.
//----------------------------------------------------------------------------------------------------------------------
/** Holds if the expression underlying the endpoint `n` is a `TypeAccess`. */
predicate isTypeAccess(DataFlow::Node n) { exists(TypeAccess access | n.asExpr() = access) }
/**
 * Holds if `n` has the given metadata.
 *
 * This is a helper function to extract and export needed information about each endpoint in the sink candidate query as
 * well as the queries that extract positive and negative examples for the prompt / training set. The metadata is
 * extracted as a string in the format of a Python dictionary.
 */
predicate hasMetadata(DataFlow::Node n, string metadata) {
  exists(
    Callable callee, Call call, string package, string type, boolean subtypes, string name,
    string signature, string ext, int input, string provenance, boolean isPublic,
    boolean isExternalApiDataNode
  |
    // `n` is the `input`-th argument of a call to `callee`.
    n.asExpr() = call.getArgument(input) and
    callee = call.getCallee() and
    package = callee.getDeclaringType().getPackage().getName() and
    type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations?
    (
      if callee.isFinal() or callee.getDeclaringType().isFinal()
      then subtypes = false // See https://github.com/github/codeql-java-team/issues/254#issuecomment-1422296423
      else subtypes = true
    ) and
    name = callee.getName() and // TODO: Will this work for constructors?
    signature = paramsString(callee) and // TODO: Why are brackets being escaped (`\[\]` vs `[]`)?
    ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
    provenance = "manual" and // TODO
    (if callee.isPublic() then isPublic = true else isPublic = false) and
    (
      if n instanceof ExternalAPIs::ExternalApiDataNode
      then isExternalApiDataNode = true
      else isExternalApiDataNode = false
    ) and
    // The keys below mirror the models-as-data column names (Package, Type, Subtypes, ...).
    metadata =
      "{'Package': '" + package + "', 'Type': '" + type + "', 'Subtypes': " + subtypes +
        ", 'Name': '" + name + "', 'Signature': '" + signature + "', 'Ext': '" + ext +
        "', 'Argument index': " + input + ", 'Provenance': '" + provenance + "', 'Is public': " +
        isPublic + ", 'Is passed to external API': " + isExternalApiDataNode + "}" // TODO: Why are the curly braces added twice?
  )
}
// private predicate isKnownExternalApiQuerySink(DataFlow::Node n) {
// n instanceof Xxe::Sink or
// n instanceof TaintedPath::Sink or
// n instanceof XpathInjection::Sink or
// n instanceof Xss::Sink or
// n instanceof ClientSideUrlRedirect::Sink or
// n instanceof CodeInjection::Sink or
// n instanceof RequestForgery::Sink or
// n instanceof CorsMisconfigurationForCredentials::Sink or
// n instanceof CommandInjection::Sink or
// n instanceof PrototypePollution::Sink or
// n instanceof UnvalidatedDynamicMethodCall::Sink or
// n instanceof TaintedFormatString::Sink or
// n instanceof NosqlInjection::Sink or
// n instanceof PostMessageStar::Sink or
// n instanceof RegExpInjection::Sink or
// n instanceof SqlTainted::Sink or
// n instanceof XmlBomb::Sink or
// n instanceof ZipSlip::Sink or
// n instanceof UnsafeDeserialization::Sink or
// n instanceof ServerSideUrlRedirect::Sink or
// n instanceof CleartextStorage::Sink or
// n instanceof HttpToFileAccess::Sink
// }
// /**
// * Holds if the node `n` is a known sink in a modeled library.
// */
// private predicate isKnownLibrarySink(DataFlow::Node n) {
// isKnownExternalApiQuerySink(n) or
// n instanceof CleartextLogging::Sink or
// n instanceof StackTraceExposure::Sink or
// n instanceof ShellCommandInjectionFromEnvironment::Sink or
// n instanceof InsecureRandomness::Sink or
// n instanceof FileAccessToHttp::Sink or
// n instanceof IndirectCommandInjection::Sink
// }
// /**
// * Holds if the node `n` is known as the predecessor in a modeled flow step.
// */
// private predicate isKnownStepSrc(DataFlow::Node n) {
// TaintTracking::sharedTaintStep(n, _) or
// DataFlow::SharedFlowStep::step(n, _) or
// DataFlow::SharedFlowStep::step(n, _, _, _)
// }
// /**
// * Holds if the data flow node is a (possibly indirect) argument of a likely external library call.
// *
// * This includes direct arguments of likely external library calls as well as nested object
// * literals within those calls.
// */
// private predicate flowsToArgumentOfLikelyExternalLibraryCall(DataFlow::Node n) {
// n = getACallWithoutCallee().getAnArgument()
// or
// exists(DataFlow::SourceNode src | flowsToArgumentOfLikelyExternalLibraryCall(src) |
// n = src.getAPropertyWrite().getRhs()
// )
// or
// exists(DataFlow::ArrayCreationNode arr | flowsToArgumentOfLikelyExternalLibraryCall(arr) |
// n = arr.getAnElement()
// )
// }
// /**
// * Get calls for which we do not have the callee (i.e. the definition of the called function). This
// * acts as a heuristic for identifying calls to external library functions.
// */
// private DataFlow::CallNode getACallWithoutCallee() {
// forall(Function callee | callee = result.getACallee() | callee.getTopLevel().isExterns()) and
// not exists(DataFlow::ParameterNode param, DataFlow::FunctionNode callback |
// param.flowsTo(result.getCalleeNode()) and
// callback = getACallback(param, DataFlow::TypeBackTracker::end())
// )
// }
// /**
// * Gets a node that flows to callback-parameter `p`.
// */
// private DataFlow::SourceNode getACallback(DataFlow::ParameterNode p, DataFlow::TypeBackTracker t) {
// t.start() and
// result = p and
// any(DataFlow::FunctionNode f).getLastParameter() = p and
// exists(p.getACall())
// or
// exists(DataFlow::TypeBackTracker t2 | result = getACallback(p, t2).backtrack(t2, t))
// }
//----------------------------------------------------------------------------------------------------------------------
// Characteristics that are indicative of a sink.
// NOTE: Initially each sink type has only one characteristic, which is that it's a sink of this type in the standard
// Java libraries.
//----------------------------------------------------------------------------------------------------------------------
// /**
// * Endpoints identified as "DomBasedXssSink" by the standard Java libraries are XSS sinks with maximal confidence.
// */
// private class DomBasedXssSinkCharacteristic extends EndpointCharacteristic {
// DomBasedXssSinkCharacteristic() { this = any(XssSinkType type).getDescription() }
// override predicate appliesToEndpoint(DataFlow::Node n) { n instanceof DomBasedXss::Sink }
// override predicate hasImplications(
// EndpointType endpointClass, boolean isPositiveIndicator, float confidence
// ) {
// endpointClass instanceof XssSinkType and
// isPositiveIndicator = true and
// confidence = maximalConfidence()
// }
// }
/**
 * A characteristic for endpoints the standard Java libraries identify as path-injection sinks
 * (either an input to a `PathCreation` or a "create-file" models-as-data sink). Such endpoints are
 * "TaintedPathSink" endpoints with maximal confidence.
 */
private class TaintedPathSinkCharacteristic extends EndpointCharacteristic {
  TaintedPathSinkCharacteristic() {
    exists(TaintedPathSinkType sinkType | this = sinkType.getDescription())
  }

  override predicate appliesToEndpoint(DataFlow::Node n) {
    sinkNode(n, "create-file")
    or
    exists(PathCreation pathCreation | n.asExpr() = pathCreation.getAnInput())
  }

  override predicate hasImplications(
    EndpointType endpointClass, boolean isPositiveIndicator, float confidence
  ) {
    isPositiveIndicator = true and
    confidence = maximalConfidence() and
    endpointClass instanceof TaintedPathSinkType
  }
}
/**
 * A characteristic for endpoints the standard Java libraries identify as SQL-injection sinks
 * (`QueryInjectionSink`). Such endpoints are "SqlTaintedSink" endpoints with maximal confidence.
 */
private class SqlTaintedSinkCharacteristic extends EndpointCharacteristic {
  SqlTaintedSinkCharacteristic() {
    exists(SqlTaintedSinkType sinkType | this = sinkType.getDescription())
  }

  override predicate appliesToEndpoint(DataFlow::Node n) { n instanceof QueryInjectionSink }

  override predicate hasImplications(
    EndpointType endpointClass, boolean isPositiveIndicator, float confidence
  ) {
    isPositiveIndicator = true and
    confidence = maximalConfidence() and
    endpointClass instanceof SqlTaintedSinkType
  }
}
/**
 * A characteristic for endpoints the standard Java libraries identify as server-side request
 * forgery sinks (`RequestForgerySink`). Such endpoints are "RequestForgerySink" endpoints with
 * maximal confidence.
 */
private class RequestForgerySinkCharacteristic extends EndpointCharacteristic {
  RequestForgerySinkCharacteristic() {
    exists(RequestForgerySinkType sinkType | this = sinkType.getDescription())
  }

  override predicate appliesToEndpoint(DataFlow::Node n) { n instanceof RequestForgerySink }

  override predicate hasImplications(
    EndpointType endpointClass, boolean isPositiveIndicator, float confidence
  ) {
    isPositiveIndicator = true and
    confidence = maximalConfidence() and
    endpointClass instanceof RequestForgerySinkType
  }
}
//----------------------------------------------------------------------------------------------------------------------
// Characteristics that are indicative of not being a sink of any type, and have historically been used to select
// negative samples for training.
//----------------------------------------------------------------------------------------------------------------------
/**
 * A characteristic that is an indicator of not being a sink of any type, because it's a modeled argument.
 *
 * This is an abstract grouping class; concrete subclasses supply `appliesToEndpoint` and
 * `hasImplications`.
 */
abstract class OtherModeledArgumentCharacteristic extends EndpointCharacteristic {
  bindingset[this]
  OtherModeledArgumentCharacteristic() { any() }
}
/**
 * A characteristic that is an indicator of not being a sink of any type, because it's an argument to a function of a
 * builtin object.
 *
 * NOTE(review): the "builtin object" wording appears to be carried over from the JavaScript
 * implementation of ATM — confirm it is still the intended meaning for Java.
 */
abstract private class ArgumentToBuiltinFunctionCharacteristic extends OtherModeledArgumentCharacteristic {
  bindingset[this]
  ArgumentToBuiltinFunctionCharacteristic() { any() }
}
/**
 * A high-confidence characteristic that indicates that an endpoint is not a sink of any type.
 */
abstract private class NotASinkCharacteristic extends EndpointCharacteristic {
  bindingset[this]
  NotASinkCharacteristic() { any() }

  override predicate hasImplications(
    EndpointType endpointClass, boolean isPositiveIndicator, float confidence
  ) {
    // A *positive* indicator for the negative class, i.e. high confidence that the endpoint is a non-sink.
    endpointClass instanceof NegativeType and
    isPositiveIndicator = true and
    confidence = highConfidence()
  }
}
/**
 * A medium-confidence characteristic that indicates that an endpoint is not a sink of any type.
 *
 * TODO: This class is currently not private, because the current extraction logic explicitly avoids including these
 * endpoints in the training data. We might want to change this in the future.
 */
abstract class LikelyNotASinkCharacteristic extends EndpointCharacteristic {
  bindingset[this]
  LikelyNotASinkCharacteristic() { any() }

  override predicate hasImplications(
    EndpointType endpointClass, boolean isPositiveIndicator, float confidence
  ) {
    // A *positive* indicator for the negative class, but only at medium confidence.
    endpointClass instanceof NegativeType and
    isPositiveIndicator = true and
    confidence = mediumConfidence()
  }
}
/**
 * An EndpointFilterCharacteristic holding for endpoints whose underlying expression is a type
 * access. Type accesses are not sinks.
 */
private class IsTypeAccessCharacteristic extends NotASinkCharacteristic {
  IsTypeAccessCharacteristic() { this = "type access" }

  override predicate appliesToEndpoint(DataFlow::Node n) { n.asExpr() instanceof TypeAccess }
}
/**
 * An EndpointFilterCharacteristic holding for endpoints that some ATM configuration treats as a
 * sanitizer. A sanitizer can never be a sink.
 */
private class IsSanitizerCharacteristic extends NotASinkCharacteristic {
  IsSanitizerCharacteristic() { this = "is sanitizer" }

  override predicate appliesToEndpoint(DataFlow::Node n) { any(AtmConfig config).isSanitizer(n) }
}
/**
 * An EndpointFilterCharacteristic that indicates that an endpoint is an argument to a safe external API method.
 *
 * Based on java/ql/lib/semmle/code/java/security/ExternalAPIs.qll.
 *
 * TODO: Is this correct?
 */
private class SafeExternalApiMethodCharacteristic extends NotASinkCharacteristic {
  // Shared prefix for the two concrete characteristic strings this class can be
  // ("... org.junit" and "... other than org.junit").
  string baseDescription;

  SafeExternalApiMethodCharacteristic() {
    baseDescription = "safe external API method " and
    this = any(string s | s = baseDescription + ["org.junit", "other than org.junit"])
  }

  override predicate appliesToEndpoint(DataFlow::Node n) {
    exists(Expr::Call call |
      n.asExpr() = call.getAnArgument() and
      call.getCallee() instanceof ExternalAPIs::SafeExternalApiMethod and
      (
        // The vast majority of calls to safe external API methods involve junit. To get a diverse set of negative
        // examples, we break those off into a separate characteristic.
        call.getCallee().getDeclaringType().getPackage().getName().matches("org.junit%") and
        this = baseDescription + "org.junit"
        or
        not call.getCallee().getDeclaringType().getPackage().getName().matches("org.junit%") and
        this = baseDescription + "other than org.junit"
      )
    )
  }
}
//----------------------------------------------------------------------------------------------------------------------
// Characteristics that have historically acted as endpoint filters to exclude endpoints from scoring at inference time.
//----------------------------------------------------------------------------------------------------------------------
/**
 * A characteristic that has historically acted as an endpoint filter for inference-time scoring.
 * Subclasses mark endpoints that should not be sent to the ML model at all.
 */
abstract class EndpointFilterCharacteristic extends EndpointCharacteristic {
  // `bindingset[this]` lets concrete subclasses bind `this` to their descriptive name.
  bindingset[this]
  EndpointFilterCharacteristic() { any() }
}
/**
 * An EndpointFilterCharacteristic that indicates that an endpoint is unlikely to be a sink of any type.
 */
abstract private class StandardEndpointFilterCharacteristic extends EndpointFilterCharacteristic {
  bindingset[this]
  StandardEndpointFilterCharacteristic() { any() }

  /**
   * Being filtered is a medium-confidence positive indicator of the negative (non-sink) class.
   */
  override predicate hasImplications(
    EndpointType endpointClass, boolean isPositiveIndicator, float confidence
  ) {
    endpointClass instanceof NegativeType and
    isPositiveIndicator = true and
    confidence = mediumConfidence()
  }
}
/**
 * An endpoint filter for constant expressions. A constant expression may technically be a sink,
 * but it can never carry tainted data: it always evaluates to a constant primitive value, so it
 * can never appear in an alert. These endpoints are therefore excluded from scoring at inference
 * time.
 *
 * WARNING: These endpoints should not be used as negative samples for training, because they are
 * not necessarily non-sinks. They are merely not interesting sinks to run through the ML model
 * because they can never be part of a tainted flow.
 */
class IsConstantExpressionCharacteristic extends StandardEndpointFilterCharacteristic {
  IsConstantExpressionCharacteristic() { this = "constant expression" }

  override predicate appliesToEndpoint(DataFlow::Node n) {
    exists(CompileTimeConstantExpr constant | constant = n.asExpr())
  }
}
/**
 * An endpoint filter for endpoints that are not part of the source code of the analyzed project
 * (i.e. whose file has no relative path within the source root).
 *
 * WARNING: These endpoints should not be used as negative samples for training, because they are
 * not necessarily non-sinks. They are merely not interesting sinks to run through the ML model.
 */
private class IsExternalCharacteristic extends StandardEndpointFilterCharacteristic {
  IsExternalCharacteristic() { this = "external" }

  override predicate appliesToEndpoint(DataFlow::Node n) {
    not exists(string relativePath | relativePath = n.getLocation().getFile().getRelativePath())
  }
}
/**
 * An EndpointFilterCharacteristic that indicates that an endpoint is not the final step in a taint propagation. This
 * prevents us from detecting expressions near sinks that are not the sink itself.
 *
 * WARNING: These endpoints should not be used as negative samples for training, because there are rare situations
 * where a node is both a sink and the `from` node of a flow step: when the called API uses the given value dangerously
 * and then returns the given value. Example: `stillTainted = dangerous(tainted)`, assuming that the implementation of
 * `dangerous(x)` eventually returns `x`.
 */
private class IsFlowStep extends StandardEndpointFilterCharacteristic {
  IsFlowStep() { this = "flow step" }

  override predicate appliesToEndpoint(DataFlow::Node n) { isKnownStepSrc(n) }

  /**
   * Holds if the node `n` is known as the predecessor in a modeled flow step.
   */
  private predicate isKnownStepSrc(DataFlow::Node n) {
    // Either a query-specific additional flow step or a standard local taint step.
    any(TaintTracking::Configuration c).isAdditionalFlowStep(n, _) or
    TaintTracking::localTaintStep(n, _)
  }
}
/**
 * An endpoint filter for endpoints that sit in a test file.
 *
 * WARNING: These endpoints should not be used as negative samples for training, because there can in fact be sinks in
 * test files -- we just don't care to model them because they aren't exploitable.
 */
private class TestFileCharacteristic extends StandardEndpointFilterCharacteristic {
  TestFileCharacteristic() { this = "test file" }

  override predicate appliesToEndpoint(DataFlow::Node n) {
    isInTestFile(n.getLocation().getFile())
  }

  /**
   * Holds if `file` is a test file. Copied from java/ql/src/utils/modelgenerator/internal/CaptureModelsSpecific.qll.
   *
   * TODO: Why can't I import utils.modelgenerator.internal.CaptureModelsSpecific?
   */
  private predicate isInTestFile(File file) {
    file.getAbsolutePath().matches(["%src/test/%", "%/guava-tests/%", "%/guava-testlib/%"])
  }
}
// class IsArgumentToModeledFunctionCharacteristic extends StandardEndpointFilterCharacteristic {
// IsArgumentToModeledFunctionCharacteristic() { this = "argument to modeled function" }
// override predicate appliesToEndpoint(DataFlow::Node n) {
// exists(DataFlow::InvokeNode invk, DataFlow::Node known |
// invk.getAnArgument() = n and
// invk.getAnArgument() = known and
// (
// isKnownLibrarySink(known)
// or
// isKnownStepSrc(known)
// or
// exists(OtherModeledArgumentCharacteristic characteristic |
// characteristic.appliesToEndpoint(known)
// )
// )
// )
// }
// }
// private class IsArgumentToSinklessLibraryCharacteristic extends StandardEndpointFilterCharacteristic {
// IsArgumentToSinklessLibraryCharacteristic() { this = "argument to sinkless library" }
// override predicate appliesToEndpoint(DataFlow::Node n) {
// exists(DataFlow::InvokeNode invk, DataFlow::SourceNode commonSafeLibrary, string libraryName |
// libraryName = ["slugify", "striptags", "marked"]
// |
// commonSafeLibrary = DataFlow::moduleImport(libraryName) and
// invk = [commonSafeLibrary, commonSafeLibrary.getAPropertyRead()].getAnInvocation() and
// n = invk.getAnArgument()
// )
// }
// }
// private class IsSanitizerCharacteristic extends StandardEndpointFilterCharacteristic {
// IsSanitizerCharacteristic() { this = "sanitizer" }
// override predicate appliesToEndpoint(DataFlow::Node n) {
// exists(DataFlow::CallNode call | n = call.getAnArgument() |
// call.getCalleeName().regexpMatch("(?i).*(escape|valid(ate)?|sanitize|purify).*")
// )
// }
// }
// private class IsPredicateCharacteristic extends StandardEndpointFilterCharacteristic {
// IsPredicateCharacteristic() { this = "predicate" }
// override predicate appliesToEndpoint(DataFlow::Node n) {
// exists(DataFlow::CallNode call | n = call.getAnArgument() |
// call.getCalleeName().regexpMatch("(equals|(|is|has|can)(_|[A-Z])).*")
// )
// }
// }
// private class IsHashCharacteristic extends StandardEndpointFilterCharacteristic {
// IsHashCharacteristic() { this = "hash" }
// override predicate appliesToEndpoint(DataFlow::Node n) {
// exists(DataFlow::CallNode call | n = call.getAnArgument() |
// call.getCalleeName().regexpMatch("(?i)^(sha\\d*|md5|hash)$")
// )
// }
// }
// private class IsNumericCharacteristic extends StandardEndpointFilterCharacteristic {
// IsNumericCharacteristic() { this = "numeric" }
// override predicate appliesToEndpoint(DataFlow::Node n) {
// SyntacticHeuristics::isReadFrom(n, ".*index.*")
// }
// }
// private class InIrrelevantFileCharacteristic extends StandardEndpointFilterCharacteristic {
// private string category;
// InIrrelevantFileCharacteristic() {
// this = "in " + category + " file" and category = ["externs", "generated", "library", "test"]
// }
// override predicate appliesToEndpoint(DataFlow::Node n) {
// // Ignore candidate sinks within externs, generated, library, and test code
// ClassifyFiles::classify(n.getFile(), category)
// }
// }

View File

@@ -0,0 +1,139 @@
/**
* For internal use only.
*
* Extracts data about the database for use in adaptive threat modeling (ATM).
*/
private import java
private import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
private import FeaturizationConfig
/**
 * Gets the value of the token-based feature named `featureName` for the endpoint `endpoint`.
 *
 * This is a single string containing a space-separated list of tokens.
 */
private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
  // Performance optimization: Restrict feature extraction to endpoints we've explicitly asked to featurize.
  endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
  featureName = getASupportedFeatureName() and
  exists(EndpointFeature feature |
    feature.getName() = featureName and
    result = feature.getValue(endpoint)
  )
}
/** Gets the name of a supported generic token-based feature. */
string getASupportedFeatureName() { exists(EndpointFeature f | result = f.getName()) }
/**
 * Generic token-based features for ATM.
 *
 * This predicate holds if the generic token-based feature named `featureName` has the value
 * `featureValue` for the endpoint `endpoint`.
 */
predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string featureValue) {
  // Performance optimization: Restrict feature extraction to endpoints we've explicitly asked to featurize.
  // (This restriction is also applied inside `getTokenFeature`; it is repeated here so this
  // public predicate is restricted even if `getTokenFeature` changes.)
  endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
  featureValue = getTokenFeature(endpoint, featureName)
}
/**
 * The set of token-based features. See `EndpointFeature` for the interface each branch's class
 * implements.
 */
private newtype TEndpointFeature =
  TEnclosingFunctionName() or // name of the callable enclosing the endpoint
  TInputArgumentIndex() or // argument position of the endpoint within its call
  TCalleeFlexibleAccessPath() or // qualified name of the callee the endpoint is an argument of
  TEnclosingFunctionSignature() or // parameter signature of the enclosing callable
  TContextFunctionInterfaces() // signatures of all methods in the endpoint's file
/**
 * An implementation of an endpoint feature: defines feature-name/value tuples for use in ML.
 */
abstract class EndpointFeature extends TEndpointFeature {
  /**
   * Gets the name of the feature. Used by the ML model.
   * Names are coupled to models: changing the name of a feature requires retraining the model.
   */
  abstract string getName();

  /**
   * Gets the value of the feature. Used by the ML model.
   * Models are trained based on feature values, so changing the value of a feature requires retraining the model.
   */
  abstract string getValue(DataFlow::Node endpoint);

  /** Gets a textual representation of this feature (its name). */
  string toString() { result = this.getName() }
}
//----------------------------------------------------------------------------------------------------------------------
// Feature: EnclosingFunctionName
//----------------------------------------------------------------------------------------------------------------------
/**
 * The feature giving the name of the callable that encloses the endpoint.
 */
class EnclosingFunctionName extends EndpointFeature, TEnclosingFunctionName {
  override string getName() { result = "enclosingFunctionName" }

  override string getValue(DataFlow::Node endpoint) {
    exists(Callable enclosing |
      enclosing = endpoint.getEnclosingCallable() and
      result = enclosing.getName()
    )
  }
}
//----------------------------------------------------------------------------------------------------------------------
// Feature: InputArgumentIndex
//----------------------------------------------------------------------------------------------------------------------
/**
 * The feature giving the argument position of the endpoint within the call it is an argument of.
 * Has no value for endpoints that are not call arguments.
 */
class InputArgumentIndex extends EndpointFeature, TInputArgumentIndex {
  // NOTE(review): this name is PascalCase while other features use camelCase. Feature names are
  // coupled to trained models, so renaming would require retraining -- confirm before changing.
  override string getName() { result = "InputArgumentIndex" }

  override string getValue(DataFlow::Node endpoint) {
    exists(Argument arg | endpoint.asExpr() = arg and result = arg.getPosition().toString())
  }
}
//----------------------------------------------------------------------------------------------------------------------
// Feature: CalleeFlexibleAccessPath
//----------------------------------------------------------------------------------------------------------------------
/**
 * The feature giving the fully-qualified name (`package.type.name`) of the callee of the call the
 * endpoint is an argument of. Has no value for endpoints that are not call arguments.
 */
class CalleeFlexibleAccessPath extends EndpointFeature, TCalleeFlexibleAccessPath {
  override string getName() { result = "CalleeFlexibleAccessPath" }

  override string getValue(DataFlow::Node endpoint) {
    exists(Callable callee, Call call, string package, string type, string name |
      endpoint.asExpr() = call.getAnArgument() and
      callee = call.getCallee() and
      package = callee.getDeclaringType().getPackage().getName() and
      type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations?
      name = callee.getName() and
      result = package + "." + type + "." + name
    )
  }
}
//----------------------------------------------------------------------------------------------------------------------
// Feature: EnclosingFunctionSignature
//----------------------------------------------------------------------------------------------------------------------
/**
 * The feature giving the parameter signature of the callable that encloses the endpoint.
 */
class EnclosingFunctionSignature extends EndpointFeature, TEnclosingFunctionSignature {
  override string getName() { result = "enclosingFunctionSignature" }

  override string getValue(DataFlow::Node endpoint) {
    result = endpoint.getEnclosingCallable().paramsString()
  }
}
//----------------------------------------------------------------------------------------------------------------------
// Feature: ContextFunctionInterfaces
//----------------------------------------------------------------------------------------------------------------------
/**
 * The feature giving the newline-separated string signatures of all methods defined in the same
 * file as the endpoint, sorted by signature.
 */
class ContextFunctionInterfaces extends EndpointFeature, TContextFunctionInterfaces {
  override string getName() { result = "contextFunctionInterfaces" }

  override string getValue(DataFlow::Node endpoint) {
    // One line per (method, signature) pair in the endpoint's file, concatenated in signature order.
    result =
      concat(Method method, string line |
        method.getLocation().getFile() = endpoint.getLocation().getFile() and
        line = method.getStringSignature()
      |
        line, "\n" order by line
      )
  }
}

View File

@@ -0,0 +1,154 @@
/**
* For internal use only.
*
* Provides an implementation of scoring alerts for use in adaptive threat modeling (ATM).
*/
private import java
private import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
private import BaseScoring
private import EndpointFeatures as EndpointFeatures
private import FeaturizationConfig
private import EndpointTypes
/**
 * Gets the checksum of an available ML model compatible with this query: a Java ATM
 * endpoint-scoring model.
 */
private string getACompatibleModelChecksum() {
  availableMlModels(result, "java", _, "atm-endpoint-scoring")
}
module ModelScoring {
  /**
   * A featurization config that only featurizes new candidate endpoints that are part of a flow
   * path.
   */
  class RelevantFeaturizationConfig extends FeaturizationConfig {
    RelevantFeaturizationConfig() { this = "RelevantFeaturization" }

    override DataFlow::Node getAnEndpointToFeaturize() {
      // Only effective sources/sinks that participate in some flow need to be scored.
      getCfg().isEffectiveSource(result) and any(DataFlow::Configuration cfg).hasFlow(result, _)
      or
      getCfg().isEffectiveSink(result) and any(DataFlow::Configuration cfg).hasFlow(_, result)
    }
  }

  /** Gets an endpoint the featurization config has requested features for. */
  DataFlow::Node getARequestedEndpoint() {
    result = any(FeaturizationConfig cfg).getAnEndpointToFeaturize()
  }

  /** Gets the integer encoding of an endpoint type the model may predict. */
  private int getARequestedEndpointType() { result = any(EndpointType type).getEncoding() }

  // Binds the external `scoreEndpoints` higher-order predicate to this query's endpoints,
  // token features, endpoint-type encodings, and compatible model checksum.
  predicate endpointScores(DataFlow::Node endpoint, int encodedEndpointType, float score) =
    scoreEndpoints(getARequestedEndpoint/0, EndpointFeatures::tokenFeatures/3,
      EndpointFeatures::getASupportedFeatureName/0, getARequestedEndpointType/0,
      getACompatibleModelChecksum/0)(endpoint, encodedEndpointType, score)
}
/**
 * Return ATM's confidence that `source` is a source for the given security query. This will be a
 * number between 0.0 and 1.0.
 *
 * Known sources score 1.0; effective (predicted) sources take the model's score for the query's
 * source endpoint type.
 */
private float getScoreForSource(DataFlow::Node source) {
  if getCfg().isKnownSource(source)
  then result = 1.0
  else (
    // This restriction on `source` has no semantic effect but improves performance.
    getCfg().isEffectiveSource(source) and
    ModelScoring::endpointScores(source, getCfg().getASourceEndpointType().getEncoding(), result)
  )
}
/**
 * Return ATM's confidence that `sink` is a sink for the given security query. This will be a
 * number between 0.0 and 1.0.
 *
 * Known sinks score 1.0; effective (predicted) sinks take the model's score for the query's sink
 * endpoint type.
 */
private float getScoreForSink(DataFlow::Node sink) {
  if getCfg().isKnownSink(sink)
  then result = 1.0
  else (
    // This restriction on `sink` has no semantic effect but improves performance.
    getCfg().isEffectiveSink(sink) and
    ModelScoring::endpointScores(sink, getCfg().getASinkEndpointType().getEncoding(), result)
  )
}
/**
 * Scoring results backed by an ML endpoint-scoring model. Only exists when a compatible model is
 * available (see `getACompatibleModelChecksum`).
 */
class EndpointScoringResults extends ScoringResults {
  EndpointScoringResults() {
    this = "EndpointScoringResults" and exists(getACompatibleModelChecksum())
  }

  /**
   * Get ATM's confidence that a path between `source` and `sink` represents a security
   * vulnerability. This will be a number between 0.0 and 1.0.
   */
  override float getScoreForFlow(DataFlow::Node source, DataFlow::Node sink) {
    // Treats source and sink confidences as independent probabilities.
    result = getScoreForSource(source) * getScoreForSink(sink)
  }

  /**
   * Get a string representing why ATM included the given source in the dataflow analysis.
   *
   * In general, there may be multiple reasons why ATM included the given source, in which case
   * this predicate should have multiple results.
   */
  pragma[inline]
  override string getASourceOrigin(DataFlow::Node source) {
    result = "known" and getCfg().isKnownSource(source)
    or
    result = "predicted" and getCfg().isEffectiveSource(source)
  }

  /**
   * Get a string representing why ATM included the given sink in the dataflow analysis.
   *
   * In general, there may be multiple reasons why ATM included the given sink, in which case
   * this predicate should have multiple results.
   */
  pragma[inline]
  override string getASinkOrigin(DataFlow::Node sink) {
    result = "known" and getCfg().isKnownSink(sink)
    or
    // For predicted sinks, include the model's score for every endpoint type for debuggability.
    not getCfg().isKnownSink(sink) and
    result =
      "predicted (scores: " +
        concat(EndpointType type, float score |
          ModelScoring::endpointScores(sink, type.getEncoding(), score)
        |
          type.getDescription() + "=" + score.toString(), ", " order by type.getEncoding()
        ) + ")" and
    getCfg().isEffectiveSink(sink)
  }

  pragma[inline]
  override predicate shouldResultBeIncluded(DataFlow::Node source, DataFlow::Node sink) {
    exists(source) and
    if getCfg().isKnownSink(sink)
    then any()
    else (
      // This restriction on `sink` has no semantic effect but improves performance.
      getCfg().isEffectiveSink(sink) and
      exists(float sinkScore |
        ModelScoring::endpointScores(sink, getCfg().getASinkEndpointType().getEncoding(), sinkScore) and
        // Include the endpoint if (a) the query endpoint type scores higher than all other
        // endpoint types, or (b) the query endpoint type scores at least
        // 0.5 - (getCfg().getScoreCutoff() / 2).
        sinkScore >=
          [
            max(float s | ModelScoring::endpointScores(sink, _, s)),
            0.5 - getCfg().getScoreCutoff() / 2
          ]
      )
    )
  }
}
/** Debug entry points for inspecting model inputs and result inclusion. */
module Debugging {
  /** Holds if `endpoint` was requested for featurization/scoring. */
  query predicate hopInputEndpoints(DataFlow::Node endpoint) {
    endpoint = ModelScoring::getARequestedEndpoint()
  }

  query predicate endpointScores = ModelScoring::endpointScores/3;

  /** Holds if the flow from `source` to `sink` would be included in the boosted query's results. */
  query predicate shouldResultBeIncluded(DataFlow::Node source, DataFlow::Node sink) {
    any(ScoringResults scoringResults).shouldResultBeIncluded(source, sink) and
    any(DataFlow::Configuration cfg).hasFlow(source, sink)
  }
}

View File

@@ -0,0 +1,71 @@
/**
* For internal use only.
*
* Defines the set of classes that endpoint scoring models can predict. Endpoint scoring models must
* only predict classes defined within this file. This file is the source of truth for the integer
* representation of each of these classes.
*/
/**
 * The classes an endpoint-scoring model can predict. Branch 0 is the negative (non-sink) class;
 * each sink type has one positive branch.
 *
 * NOTE(review): `TXssSinkType` and `TNosqlInjectionSinkType` have no corresponding `EndpointType`
 * subclass below, so they are currently dead branches -- confirm whether they should be removed
 * or implemented.
 */
newtype TEndpointType =
  TNegativeType() or
  TXssSinkType() or
  TNosqlInjectionSinkType() or
  TSqlTaintedSinkType() or
  TTaintedPathSinkType() or
  TRequestForgerySinkType()
/** A class that can be predicted by endpoint scoring models. */
abstract class EndpointType extends TEndpointType {
  /** Gets a human-readable description of this endpoint type. */
  abstract string getDescription();

  /**
   * Gets the integer representation of this endpoint type. This integer representation specifies the class number
   * used by the endpoint scoring model (the classifier) to represent this endpoint type. Class 0 is the negative
   * class (non-sink). Each positive int corresponds to a single sink type.
   */
  abstract int getEncoding();

  /**
   * Gets the name of the sink/source kind for this endpoint type as used in Models as Data.
   *
   * See https://github.com/github/codeql/blob/44213f0144fdd54bb679ca48d68b28dcf820f7a8/java/ql/lib/semmle/code/java/dataflow/ExternalFlow.qll#LL353C11-L357C31
   */
  abstract string getKind();

  /** Gets a textual representation of this endpoint type (its description). */
  string toString() { result = getDescription() }
}
/** The `Negative` class that can be predicted by endpoint scoring models. */
class NegativeType extends EndpointType, TNegativeType {
  override string getDescription() { result = "Negative" }

  override int getEncoding() { result = 0 }

  // The negative (non-sink) class corresponds to no Models-as-Data kind.
  override string getKind() { result = "" }
}
/** The `SqlTaintedSink` class that can be predicted by endpoint scoring models. */
class SqlTaintedSinkType extends EndpointType, TSqlTaintedSinkType {
  override string getDescription() { result = "SqlTaintedSink" }

  override int getEncoding() { result = 1 }

  override string getKind() { result = "sql" }
}
/** The `TaintedPathSink` class that can be predicted by endpoint scoring models. */
class TaintedPathSinkType extends EndpointType, TTaintedPathSinkType {
  override string getDescription() { result = "TaintedPathSink" }

  override int getEncoding() { result = 2 }

  override string getKind() { result = "create-file" }
}
/** The `RequestForgerySink` class that can be predicted by endpoint scoring models. */
class RequestForgerySinkType extends EndpointType, TRequestForgerySinkType {
  override string getDescription() { result = "RequestForgerySink" }

  override int getEncoding() { result = 3 }

  override string getKind() { result = "open-url" } // TODO: is this correct, or should it be "jdbc-url"?
}

View File

@@ -0,0 +1,15 @@
import java
private import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
/**
 * A configuration that defines which endpoints should be featurized.
 *
 * This is used as a performance optimization to ensure that we only featurize the endpoints we need
 * to featurize.
 */
abstract class FeaturizationConfig extends string {
  // `bindingset[this]` lets concrete subclasses bind `this` to their descriptive name.
  bindingset[this]
  FeaturizationConfig() { any() }

  /** Gets an endpoint whose features should be extracted. */
  abstract DataFlow::Node getAnEndpointToFeaturize();
}

View File

@@ -0,0 +1,102 @@
/**
* FunctionBodyFeatures.qll
*
* Contains logic relating to the `enclosingFunctionBody` and `enclosingFunctionName` features.
*/
import java
private import FeaturizationConfig
private import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
/**
 * Gets a tokenized representation of the AST node for use in the `enclosingFunctionBody` feature.
 * Only variable names, field names, and literal values contribute tokens.
 */
string getTokenizedAstNode(Top top) {
  result = top.(Variable).getName()
  or
  // NOTE(review): in the Java libraries `Field` appears to extend `Variable`, which would make
  // this disjunct redundant with the one above -- confirm and simplify if so.
  result = top.(Field).getName()
  or
  result = top.(Literal).getValue()
}
/** Gets an AST node (statement or expression) within the callable `c` that we should featurize. */
pragma[inline]
Element getAnAstNodeToFeaturize(Callable c) {
  result.(Stmt).getEnclosingCallable() = c or
  result.(Expr).getEnclosingCallable() = c
}

/** DEPRECATED: Alias for getAnAstNodeToFeaturize */
deprecated Top getAnASTNodeToFeaturize(Callable c) { result = getAnAstNodeToFeaturize(c) }
/**
 * Get the enclosing function for an endpoint.
 *
 * This is used to compute the `enclosingFunctionBody` and `enclosingFunctionName` features.
 */
Callable getRepresentativeFunctionForEndpoint(DataFlow::Node endpoint) {
  // Performance optimization: Restrict the set of endpoints to the endpoints to featurize.
  endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
  result = endpoint.getEnclosingCallable()
}
/** Returns an AST node within the callable `c` that has an associated token feature. */
Element getAnAstNodeWithAFeature(Callable c) {
  // Performance optimization: Restrict the set of functions to those containing an endpoint to featurize.
  c = getRepresentativeFunctionForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and
  result = getAnAstNodeToFeaturize(c)
}

/** DEPRECATED: Alias for getAnAstNodeWithAFeature */
deprecated Element getAnASTNodeWithAFeature(Callable c) { result = getAnAstNodeWithAFeature(c) }
/**
 * Returns the total number of characters in the tokenized AST nodes of callable `c`.
 * Has no result for callables with no tokenized nodes (`strictsum`).
 */
int getNumCharsInFunction(Callable c) {
  result =
    strictsum(Element element |
      element = getAnAstNodeWithAFeature(c)
    |
      getTokenizedAstNode(element).length()
    )
}
/**
 * Gets the maximum number of characters a feature can be.
 * The evaluator string limit is 5395415 characters. We choose a limit lower than this.
 */
private int getMaxChars() { result = 1000000 }
/**
 * Returns a featurized representation of the function that can be used to populate the
 * `enclosingFunctionBody` feature for an endpoint: the space-separated tokens of all featurizable
 * AST nodes, ordered by source location. Has no result (feature is absent) for oversized bodies.
 */
string getBodyTokensFeature(Callable c) {
  // Performance optimization: If a function has more than 256 body subtokens, then featurize it as
  // absent. This approximates the behavior of the classifier on non-generic body features where
  // large body features are replaced by the absent token.
  //
  // We count nodes instead of tokens because tokens are often not unique.
  strictcount(Element element |
    element = getAnAstNodeToFeaturize(c) and
    exists(getTokenizedAstNode(element))
  ) <= 256 and
  // Performance optimization: If a function has more than getMaxChars() characters in its body subtokens,
  // then featurize it as absent.
  getNumCharsInFunction(c) <= getMaxChars() and
  result =
    strictconcat(Location l, string token |
      // The use of a nested exists here allows us to avoid duplicates due to two AST nodes in the
      // same location featurizing to the same token. By using a nested exists, we take only unique
      // (location, token) pairs.
      exists(Element element |
        element = getAnAstNodeToFeaturize(c) and
        token = getTokenizedAstNode(element) and
        l = element.getLocation()
      )
    |
      token, " "
      order by
        l.getFile().getAbsolutePath(), l.getStartLine(), l.getStartColumn(), l.getEndLine(),
        l.getEndColumn(), token
    )
}

View File

@@ -0,0 +1,38 @@
/**
* For internal use only.
*
* A taint-tracking configuration for reasoning about SSRF (server side request forgery) vulnerabilities.
* Largely copied from java/ql/lib/semmle/code/java/security/RequestForgeryConfig.qll.
*
* Only import this directly from .ql files, to avoid the possibility of polluting the Configuration hierarchy
* accidentally.
*/
import ATMConfig
import semmle.code.java.dataflow.FlowSources
import semmle.code.java.security.RequestForgery
/** ATM boosting configuration for the request forgery (SSRF) query. */
class RequestForgeryAtmConfig extends AtmConfig {
  RequestForgeryAtmConfig() { this = "RequestForgeryAtmConfig" }

  /** Known sources: remote flow sources, excluding data read back from remote HTTP responses. */
  override predicate isKnownSource(DataFlow::Node source) {
    source instanceof RemoteFlowSource and
    // Exclude results of remote HTTP requests: fetching something else based on that result
    // is no worse than following a redirect returned by the remote server, and typically
    // we're requesting a resource via https which we trust to only send us to safe URLs.
    not source.asExpr().(MethodAccess).getCallee() instanceof UrlConnectionGetInputStreamMethod
  }

  override EndpointType getASinkEndpointType() { result instanceof RequestForgerySinkType }

  /*
   * This is largely a copy of the taint tracking configuration for the standard SSRF
   * query, except additional sinks have been added using the sink endpoint filter.
   */

  override predicate isAdditionalTaintStep(DataFlow::Node pred, DataFlow::Node succ) {
    any(RequestForgeryAdditionalTaintStep r).propagatesTaint(pred, succ)
  }

  override predicate isSanitizer(DataFlow::Node node) { node instanceof RequestForgerySanitizer }
}

View File

@@ -0,0 +1,34 @@
/**
* For internal use only.
*
* A taint-tracking configuration for reasoning about SQL injection vulnerabilities.
* Defines shared code used by the SQL injection boosted query.
* Largely copied from semmle.code.java.security.SqlInjectionQuery.
*/
import ATMConfig
import semmle.code.java.dataflow.FlowSources
import semmle.code.java.security.QueryInjection
/** ATM boosting configuration for the SQL injection query. */
class SqlTaintedAtmConfig extends AtmConfig {
  SqlTaintedAtmConfig() { this = "SqlTaintedAtmConfig" }

  override predicate isKnownSource(DataFlow::Node source) { source instanceof RemoteFlowSource }

  override EndpointType getASinkEndpointType() { result instanceof SqlTaintedSinkType }

  /*
   * This is largely a copy of the taint tracking configuration for the standard SQL injection
   * query, except additional sinks have been added using the sink endpoint filter.
   */

  /** Numeric and boxed types cannot carry an injectable SQL string. */
  override predicate isSanitizer(DataFlow::Node node) {
    node.getType() instanceof PrimitiveType or
    node.getType() instanceof BoxedType or
    node.getType() instanceof NumberType
  }

  override predicate isAdditionalTaintStep(DataFlow::Node node1, DataFlow::Node node2) {
    any(AdditionalQueryInjectionTaintStep s).step(node1, node2)
  }
}

View File

@@ -0,0 +1,82 @@
/**
* For internal use only.
*
* A taint-tracking configuration for reasoning about path injection vulnerabilities.
* Defines shared code used by the path injection boosted query.
* Largely copied from java/ql/src/Security/CWE/CWE-022/TaintedPath.ql.
*/
import java
import semmle.code.java.security.PathSanitizer
import ATMConfig
import semmle.code.java.dataflow.FlowSources
/** ATM boosting configuration for the path injection query. */
class TaintedPathAtmConfig extends AtmConfig {
  TaintedPathAtmConfig() { this = "TaintedPathAtmConfig" }

  override predicate isKnownSource(DataFlow::Node source) { source instanceof RemoteFlowSource }

  override EndpointType getASinkEndpointType() { result instanceof TaintedPathSinkType }

  /*
   * This is largely a copy of the taint tracking configuration for the standard path injection
   * query, except additional ATM sinks have been added to the `isSink` predicate.
   */

  /** Numeric/boxed types cannot carry a path string; also honor the standard path sanitizers. */
  override predicate isSanitizer(DataFlow::Node sanitizer) {
    sanitizer.getType() instanceof BoxedType or
    sanitizer.getType() instanceof PrimitiveType or
    sanitizer.getType() instanceof NumberType or
    sanitizer instanceof PathInjectionSanitizer
  }

  override predicate isAdditionalTaintStep(DataFlow::Node n1, DataFlow::Node n2) {
    any(TaintedPathAdditionalTaintStep s).step(n1, n2)
  }
}
/*
 * Models a very basic guard for the tainted path queries.
 * TODO: Copied from java/ql/src/Security/CWE/CWE-022/TaintedPathCommon.qll because I couldn't figure out how to import it.
 */

/**
 * A unit class for adding additional taint steps.
 *
 * Extend this class to add additional taint steps that should apply to tainted path flow configurations.
 */
class TaintedPathAdditionalTaintStep extends Unit {
  /** Holds if taint propagates from `n1` to `n2` as an additional tainted-path step. */
  abstract predicate step(DataFlow::Node n1, DataFlow::Node n2);
}
/**
 * Default additional step: taint flows from an argument bound to a taint-preserving `URI`
 * constructor parameter to the constructor call itself.
 */
private class DefaultTaintedPathAdditionalTaintStep extends TaintedPathAdditionalTaintStep {
  override predicate step(DataFlow::Node n1, DataFlow::Node n2) {
    exists(Argument a |
      a = n1.asExpr() and
      a.getCall() = n2.asExpr() and
      // Only arguments in positions known to be preserved in the constructed URI.
      a = any(TaintPreservingUriCtorParam tpp).getAnArgument()
    )
  }
}
/**
 * A parameter of a `java.net.URI` constructor whose value is preserved (in taint terms) in the
 * constructed URI, identified by constructor arity and parameter index.
 */
private class TaintPreservingUriCtorParam extends Parameter {
  TaintPreservingUriCtorParam() {
    exists(Constructor ctor, int idx, int nParams |
      ctor.getDeclaringType() instanceof TypeUri and
      this = ctor.getParameter(idx) and
      nParams = ctor.getNumberOfParameters()
    |
      // URI(String scheme, String ssp, String fragment)
      idx = 1 and nParams = 3
      or
      // URI(String scheme, String host, String path, String fragment)
      idx = [1, 2] and nParams = 4
      or
      // URI(String scheme, String authority, String path, String query, String fragment)
      idx = 2 and nParams = 5
      or
      // URI(String scheme, String userInfo, String host, int port, String path, String query, String fragment)
      idx = 4 and nParams = 7
    )
  }
}

View File

@@ -0,0 +1,10 @@
# QL pack manifest for the library pack backing the experimental ML-powered (ATM) Java queries.
name: codeql/java-experimental-atm-lib
description: CodeQL libraries for the experimental ML-powered queries
version: 0.4.5
extractor: java
library: true
groups:
- java
- experimental
dependencies:
codeql/java-all: ${workspace}

View File

@@ -0,0 +1,3 @@
# Avoid checking in ML models
# This matches the mlModels property of qlpack.yml.
resources/*.codeqlmodel

View File

@@ -0,0 +1,4 @@
---
dependencies: {}
compiled: false
lockVersion: 1.0.0

View File

@@ -0,0 +1,8 @@
name: codeql/java-experimental-atm-model
description: Machine learning model supporting the experimental ML-powered queries
version: 0.0.1
groups:
- java
- experimental
mlModels:
- "resources/*.codeqlmodel"

View File

@@ -0,0 +1,67 @@
/**
* @name Debug result inclusion
* @description Use this query to understand why some alerts are included or excluded from the
* results of boosted queries. The results for this query are the union of the alerts
* generated by each boosted query. Each alert includes an explanation why it was
* included or excluded for each of the four security queries.
* @kind problem
* @problem.severity error
* @id adaptive-threat-modeling/java/debug-result-inclusion
*/
import java
import experimental.adaptivethreatmodeling.ATMConfig
import extraction.ExtractEndpointDataTraining
private import experimental.adaptivethreatmodeling.SqlTaintedATM as SqlTaintedAtm
private import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
private import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestForgeryAtm
/**
 * Gets a human-readable reason why `sinkCandidate` was excluded as a sink by the ATM
 * configuration belonging to `query`, if any.
 *
 * Each of the three boosted queries (SQL injection, tainted path, request forgery)
 * delegates to its own ATM config's `getAReasonSinkExcluded` predicate.
 */
string getAReasonSinkExcluded(DataFlow::Node sinkCandidate, Query query) {
  query instanceof SqlTaintedQuery and
  result = any(SqlTaintedAtm::SqlTaintedAtmConfig cfg).getAReasonSinkExcluded(sinkCandidate)
  or
  query instanceof TaintedPathQuery and
  result = any(TaintedPathAtm::TaintedPathAtmConfig cfg).getAReasonSinkExcluded(sinkCandidate)
  or
  query instanceof RequestForgeryQuery and
  result = any(RequestForgeryAtm::RequestForgeryAtmConfig cfg).getAReasonSinkExcluded(sinkCandidate)
}
/**
 * Gets a description explaining, for `query`, whether the alert candidate
 * `(sourceCandidate, sinkCandidate)` was included in or excluded from the boosted
 * query's results, and why.
 */
pragma[inline]
string getDescriptionForAlertCandidate(
  DataFlow::Node sourceCandidate, DataFlow::Node sinkCandidate, Query query
) {
  // The sink candidate was rejected by a query-specific endpoint filter.
  result = "excluded[reason=" + getAReasonSinkExcluded(sinkCandidate, query) + "]"
  or
  // Known sinks are covered by the base security query rather than the boosted one.
  getDataFlowCfg(query).(AtmConfig).isKnownSink(sinkCandidate) and
  result = "excluded[reason=known-sink]"
  or
  // The sink candidate passed the filters, but no flow was found to it.
  not exists(getAReasonSinkExcluded(sinkCandidate, query)) and
  not getDataFlowCfg(query).hasFlow(sourceCandidate, sinkCandidate) and
  (
    if
      getDataFlowCfg(query).isSource(sourceCandidate) or
      getDataFlowCfg(query).isSource(sourceCandidate, _)
    then result = "no flow"
    else result = "not a known source"
  )
  or
  getDataFlowCfg(query).hasFlow(sourceCandidate, sinkCandidate) and
  result = "included"
}
/**
 * Gets a comma-separated description covering every query, explaining for each one why
 * the alert candidate `(sourceCandidate, sinkCandidate)` was included or excluded.
 */
pragma[inline]
string getDescriptionForAlert(DataFlow::Node sourceCandidate, DataFlow::Node sinkCandidate) {
  result =
    concat(Query query |
      |
      query.getName() + ": " +
        getDescriptionForAlertCandidate(sourceCandidate, sinkCandidate, query), ", "
    )
}
// Surface every source/sink pair with flow under any configuration, annotated with the
// per-query inclusion/exclusion explanation computed above.
from DataFlow::Configuration cfg, DataFlow::Node source, DataFlow::Node sink
where cfg.hasFlow(source, sink)
select sink,
  "This is an ATM result that may depend on $@ [" + getDescriptionForAlert(source, sink) + "]",
  source, "a user-provided value"

View File

@@ -0,0 +1,6 @@
---
dependencies:
codeql/java-experimental-atm-model:
version: 0.0.1
compiled: false
lockVersion: 1.0.0

View File

@@ -0,0 +1,21 @@
/**
* For internal use only.
*
*
* Count the number of sinks and alerts for a particular dataflow config.
*/
import java
import evaluation.EndToEndEvaluation
private import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
/**
 * Holds if `numAlerts` is the number of (configuration, source, sink) flow tuples that
 * survive the end-to-end evaluation exclusions, and `numSinks` is the number of distinct
 * sink nodes recognized by any configuration.
 */
query predicate countAlertsAndSinks(int numAlerts, int numSinks) {
  numAlerts =
    count(DataFlow::Configuration cfg, DataFlow::Node source, DataFlow::Node sink |
      cfg.hasFlow(source, sink) and not isFlowExcluded(source, sink)
    ) and
  numSinks =
    count(DataFlow::Node sink |
      // A node counts once even if several configurations (or flow labels) mark it a sink.
      exists(DataFlow::Configuration cfg | cfg.isSink(sink) or cfg.isSink(sink, _))
    )
}

View File

@@ -0,0 +1,9 @@
/*
* For internal use only.
*
*
* Count the number of sinks and alerts for the `RequestForgery` security query.
*/
import semmle.code.java.security.RequestForgery
import CountAlertsAndSinks

View File

@@ -0,0 +1,9 @@
/*
* For internal use only.
*
*
* Count the number of sinks and alerts for the `SqlTainted` security query.
*/
import semmle.code.java.security.SqlInjectionQuery
import CountAlertsAndSinks

View File

@@ -0,0 +1,74 @@
/*
* For internal use only.
*
*
* Count the number of sinks and alerts for the `TaintedPath` security query.
*/
//TODO no libraries for TaintedPath so we copy paste the config used in the TaintedPath.ql query.
import java
import DataFlow::PathGraph
private import semmle.code.java.dataflow.ExternalFlow
import semmle.code.java.security.PathCreation
import semmle.code.java.security.PathSanitizer
import semmle.code.java.dataflow.FlowSources
import CountAlertsAndSinks
/**
 * A taint-tracking configuration for tainted-path vulnerabilities, copied from the
 * standard `TaintedPath.ql` query (see the TODO above: no reusable library exists yet).
 */
class TaintedPathConfig extends TaintTracking::Configuration {
  TaintedPathConfig() { this = "TaintedPathConfig" }
  /** Remote user input is the only source. */
  override predicate isSource(DataFlow::Node source) { source instanceof RemoteFlowSource }
  /** Inputs to path-creation operations, plus MaD-modeled "create-file" sinks. */
  override predicate isSink(DataFlow::Node sink) {
    sink.asExpr() = any(PathCreation p).getAnInput()
    or
    sinkNode(sink, "create-file")
  }
  override predicate isSanitizer(DataFlow::Node sanitizer) {
    // Numeric/boxed/primitive values cannot carry a path, so they stop the flow.
    sanitizer.getType() instanceof BoxedType or
    sanitizer.getType() instanceof PrimitiveType or
    sanitizer.getType() instanceof NumberType or
    sanitizer instanceof PathInjectionSanitizer
  }
  /** Extra steps contributed by `TaintedPathAdditionalTaintStep` subclasses. */
  override predicate isAdditionalTaintStep(DataFlow::Node n1, DataFlow::Node n2) {
    any(TaintedPathAdditionalTaintStep s).step(n1, n2)
  }
}
/** A unit class for contributing additional taint steps specific to tainted-path flow. */
class TaintedPathAdditionalTaintStep extends Unit {
  /** Holds if taint should propagate from `n1` to `n2`. */
  abstract predicate step(DataFlow::Node n1, DataFlow::Node n2);
}
/**
 * The default additional step: taint flows from an argument of a `URI` constructor call
 * to the call itself, when the argument is a taint-preserving constructor parameter.
 */
private class DefaultTaintedPathAdditionalTaintStep extends TaintedPathAdditionalTaintStep {
  override predicate step(DataFlow::Node n1, DataFlow::Node n2) {
    exists(Argument a |
      a = n1.asExpr() and
      a.getCall() = n2.asExpr() and
      a = any(TaintPreservingUriCtorParam tpp).getAnArgument()
    )
  }
}
/**
 * A parameter of a `java.net.URI` constructor through which taint reaches the
 * constructed `URI` object (hierarchical components such as host, path, and query).
 */
private class TaintPreservingUriCtorParam extends Parameter {
  TaintPreservingUriCtorParam() {
    exists(Constructor c, int paramIndex, int arity |
      c.getDeclaringType() instanceof TypeUri and
      this = c.getParameter(paramIndex) and
      arity = c.getNumberOfParameters()
    |
      // URI(String scheme, String ssp, String fragment)
      arity = 3 and paramIndex = 1
      or
      // URI(String scheme, String host, String path, String fragment)
      arity = 4 and paramIndex = [1, 2]
      or
      // URI(String scheme, String authority, String path, String query, String fragment)
      arity = 5 and paramIndex = 2
      or
      // URI(String scheme, String userInfo, String host, int port, String path, String query, String fragment)
      arity = 7 and paramIndex = 4
    )
  }
}

View File

@@ -0,0 +1,12 @@
private import java
private import extraction.Exclusions as Exclusions
private import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
/**
* Holds if the flow from `source` to `sink` should be excluded from the results of an end-to-end
* evaluation query.
*/
pragma[inline]
predicate isFlowExcluded(DataFlow::Node source, DataFlow::Node sink) {
  // The flow is excluded if either endpoint lies in an excluded file
  // (test, generated, or library code, as defined in `Exclusions`).
  Exclusions::isFileExcluded([source.getLocation().getFile(), sink.getLocation().getFile()])
}

View File

@@ -0,0 +1,29 @@
/**
* EndpointScoresIntegrationTest.ql
*
* Extract scores for each test endpoint that is an argument to a function call in the database.
* This is used by integration tests to verify that QL and the modeling codebase agree on the scores
* of a set of test endpoints.
*/
import java
import experimental.adaptivethreatmodeling.ATMConfig
import experimental.adaptivethreatmodeling.FeaturizationConfig
import experimental.adaptivethreatmodeling.EndpointScoring::ModelScoring as ModelScoring
private import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
private import semmle.code.java.dataflow.internal.DataFlowPrivate as DataFlowPrivate
/**
* A featurization config that featurizes endpoints that are arguments to function calls.
*
* This should only be used in extraction queries and tests.
*/
class FunctionArgumentFeaturizationConfig extends FeaturizationConfig {
  FunctionArgumentFeaturizationConfig() { this = "FunctionArgumentFeaturization" }
  /** Featurize every expression that appears as an argument of some call. */
  override DataFlow::Node getAnEndpointToFeaturize() {
    exists(Call call | result.asExpr() = call.getAnArgument())
  }
}
query predicate endpointScores = ModelScoring::endpointScores/3;

View File

@@ -0,0 +1,16 @@
/**
* ModelCheck.ql
*
* Returns checksums of ATM models.
*/
/**
* The `availableMlModels` template predicate.
*
* This is populated by the evaluator with metadata for the available machine learning models.
*/
external predicate availableMlModels(
string modelChecksum, string modelLanguage, string modelName, string modelType
);
select any(string checksum | availableMlModels(checksum, "java", _, _))

View File

@@ -0,0 +1,67 @@
/*
* For internal use only.
*
* Counts sources and sinks for Java security queries.
*/
import java
import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
import semmle.code.java.dataflow.TaintTracking::TaintTracking as TaintTracking
// java/ql/lib/semmle/code/java/security$ ls *Query.qll | sed -e 's/\(.*\)Query.qll/import semmle.code.java.security.\1Query as \1/'
import semmle.code.java.security.AndroidIntentRedirectionQuery as AndroidIntentRedirection
import semmle.code.java.security.AndroidSensitiveCommunicationQuery as AndroidSensitiveCommunication
import semmle.code.java.security.AndroidWebViewCertificateValidationQuery as AndroidWebViewCertificateValidation
import semmle.code.java.security.CleartextStorageAndroidDatabaseQuery as CleartextStorageAndroidDatabase
import semmle.code.java.security.CleartextStorageAndroidFilesystemQuery as CleartextStorageAndroidFilesystem
import semmle.code.java.security.CleartextStorageClassQuery as CleartextStorageClass
import semmle.code.java.security.CleartextStorageCookieQuery as CleartextStorageCookie
import semmle.code.java.security.CleartextStoragePropertiesQuery as CleartextStorageProperties
import semmle.code.java.security.CleartextStorageQuery as CleartextStorage
import semmle.code.java.security.CleartextStorageSharedPrefsQuery as CleartextStorageSharedPrefs
import semmle.code.java.security.CommandLineQuery as CommandLine
import semmle.code.java.security.ConditionalBypassQuery as ConditionalBypass
import semmle.code.java.security.FragmentInjectionQuery as FragmentInjection
import semmle.code.java.security.GroovyInjectionQuery as GroovyInjection
import semmle.code.java.security.HardcodedCredentialsApiCallQuery as HardcodedCredentialsApiCall
import semmle.code.java.security.HardcodedCredentialsSourceCallQuery as HardcodedCredentialsSourceCall
import semmle.code.java.security.HttpsUrlsQuery as HttpsUrls
import semmle.code.java.security.ImplicitPendingIntentsQuery as ImplicitPendingIntents
import semmle.code.java.security.ImproperIntentVerificationQuery as ImproperIntentVerification
import semmle.code.java.security.InsecureBasicAuthQuery as InsecureBasicAuth
import semmle.code.java.security.InsecureTrustManagerQuery as InsecureTrustManager
import semmle.code.java.security.InsufficientKeySizeQuery as InsufficientKeySize
import semmle.code.java.security.IntentUriPermissionManipulationQuery as IntentUriPermissionManipulation
import semmle.code.java.security.JexlInjectionQuery as JexlInjection
import semmle.code.java.security.JndiInjectionQuery as JndiInjection
import semmle.code.java.security.LogInjectionQuery as LogInjection
import semmle.code.java.security.MissingJWTSignatureCheckQuery as MissingJWTSignatureCheck
import semmle.code.java.security.MvelInjectionQuery as MvelInjection
import semmle.code.java.security.OgnlInjectionQuery as OgnlInjection
import semmle.code.java.security.OverlyLargeRangeQuery as OverlyLargeRange
import semmle.code.java.security.PartialPathTraversalQuery as PartialPathTraversal
import semmle.code.java.security.RandomQuery as Random
import semmle.code.java.security.RsaWithoutOaepQuery as RsaWithoutOaep
import semmle.code.java.security.SensitiveKeyboardCacheQuery as SensitiveKeyboardCache
import semmle.code.java.security.SensitiveLoggingQuery as SensitiveLogging
import semmle.code.java.security.SpelInjectionQuery as SpelInjection
import semmle.code.java.security.SqlInjectionQuery as SqlInjection
import semmle.code.java.security.StaticInitializationVectorQuery as StaticInitializationVector
import semmle.code.java.security.TemplateInjectionQuery as TemplateInjection
import semmle.code.java.security.UnsafeAndroidAccessQuery as UnsafeAndroidAccess
import semmle.code.java.security.UnsafeCertTrustQuery as UnsafeCertTrust
import semmle.code.java.security.UnsafeContentUriResolutionQuery as UnsafeContentUriResolution
import semmle.code.java.security.UnsafeDeserializationQuery as UnsafeDeserialization
import semmle.code.java.security.WebviewDubuggingEnabledQuery as WebviewDubuggingEnabled
import semmle.code.java.security.XsltInjectionQuery as XsltInjection
/** Gets a sink of `cfg`, with or without a flow state. */
DataFlow::Node getASink(TaintTracking::Configuration cfg) {
  cfg.isSink(result) or cfg.isSink(result, _)
}
/** Gets a source of `cfg`, with or without a flow state. */
DataFlow::Node getASource(TaintTracking::Configuration cfg) {
  cfg.isSource(result) or cfg.isSource(result, _)
}
// Tabulate the number of sources and sinks per imported taint-tracking configuration.
from TaintTracking::Configuration cfg, int sources, int sinks
where count(getASource(cfg)) = sources and count(getASink(cfg)) = sinks
select cfg, sources, sinks

View File

@@ -0,0 +1,59 @@
/**
* For internal use only.
*
* Defines files that should be excluded from the evaluation of ML models.
*/
private import java
//TODO Couldn't find a library for the classifier so copy pasted predicate in java/ql/src/filters/ClassifyFiles.ql
/**
 * Holds if `f` is classified as `tag` ("generated" or "test"), based on generated-file
 * detection and on containing test classes or test methods.
 */
predicate classify(File f, string tag) {
  f instanceof GeneratedFile and tag = "generated"
  or
  exists(GeneratedClass gc | gc.getFile() = f | tag = "generated")
  or
  exists(TestClass tc | tc.getFile() = f | tag = "test")
  or
  exists(TestMethod tm | tm.getFile() = f | tag = "test")
}
/** Holds if the file should be excluded from end-to-end evaluation. */
predicate isFileExcluded(File file) {
  // Ignore files that are outside the root folder of the analyzed source location.
  //
  // If the file doesn't have a relative path, then the source file is located outside the root
  // folder of the analyzed source location, meaning that the files are additional files added to
  // the database like standard library files that we would like to ignore.
  not exists(file.getRelativePath())
  or
  // Ignore files based on their path.
  // Note: `separator` is referenced inside `ignorePattern` before the line that binds it;
  // this is fine because QL conjuncts are declarative, not ordered.
  exists(string ignorePattern, string separator |
    ignorePattern =
      // Exclude test files
      "(tests?|test[_-]?case|" +
      // Exclude library files
      //
      // - The Bower and npm package managers store packages in bower_components and node_modules
      //   folders respectively.
      // - Specific exclusion for end-to-end: `applications/examples/static/epydoc` contains
      //   library code from Epydoc.
      // NOTE(review): "resources" is also a common name for non-library asset folders;
      // confirm that excluding it wholesale is intended.
      "3rd[_-]?party|bower_components|extern(s|al)?|node_modules|resources|third[_-]?party|_?vendor|"
        + "applications" + separator + "examples" + separator + "static" + separator + "epydoc|" +
      // Exclude generated code
      "gen|\\.?generated|" +
      // Exclude benchmarks
      "benchmarks?|" +
      // Exclude documentation
      "docs?|documentation)" and
    separator = "(\\/|\\.)" and
    // Match the pattern as a whole path component: delimited by separators, or anchored
    // at the start/end of the (lower-cased) relative path.
    exists(
      file.getRelativePath()
          .toLowerCase()
          .regexpFind(separator + ignorePattern + separator + "|" + "^" + ignorePattern + separator +
              "|" + separator + ignorePattern + "$", _, _)
    )
  )
  or
  // Ignore generated, library, and test files.
  classify(file, ["externs", "generated", "library", "test"])
}

View File

@@ -0,0 +1,11 @@
/*
* For internal use only.
*
* Extracts training data we can use to train ML models for ML-powered queries.
*/
private import ExtractEndpointDataTraining as ExtractEndpointDataTraining
query predicate endpoints = ExtractEndpointDataTraining::reformattedTrainingEndpoints/5;
query predicate tokenFeatures = ExtractEndpointDataTraining::tokenFeatures/3;

View File

@@ -0,0 +1,251 @@
/**
* For internal use only.
*
* Extracts training data we can use to train ML models for ML-powered queries.
*/
import java
import experimental.adaptivethreatmodeling.EndpointCharacteristics
import experimental.adaptivethreatmodeling.EndpointFeatures as EndpointFeatures
import NoFeaturizationRestrictionsConfig
private import Exclusions as Exclusions
import Queries
private import experimental.adaptivethreatmodeling.SqlTaintedATM as SqlTaintedAtm
private import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
private import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestForgeryAtm
/**
* Gets the set of featureName-featureValue pairs for each endpoint in the training set.
*
* `EndpointFeatures::tokenFeatures` has no results when `featureName` is absent for the endpoint
* `endpoint`. To preserve compatibility with the data pipeline, this relation will instead set
* `featureValue` to the empty string in this case.
*/
predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string featureValue) {
  // Only endpoints that are part of the training set get features.
  trainingEndpoints(endpoint, _, _) and
  (
    EndpointFeatures::tokenFeatures(endpoint, featureName, featureValue)
    or
    // Performance note: this creates a Cartesian product between `endpoint` and `featureName`.
    // Pad missing features with the empty string so every endpoint has a value for every
    // supported feature name (required by the data pipeline, per the QLDoc above).
    featureName = EndpointFeatures::getASupportedFeatureName() and
    not exists(string value | EndpointFeatures::tokenFeatures(endpoint, featureName, value)) and
    featureValue = ""
  )
}
/**
* Holds if the given endpoint should be included in the training set as a sample belonging to endpointClass, and has
* the given characteristic. This query uses the endpoint characteristics to select and label endpoints for the training
* set, and provides a list of characteristics for each endpoint in the training set, which is used in the modeling
* code.
*
* Params:
* endpoint: The endpoint to include / exclude.
* endpointClass: The sink type. See the documentation of EndpointType.getEncoding for details about the relationship
* between an EndpointType and a class in the classifier.
* characteristic: Provides the list of characteristics that apply to the endpoint, which the modeling code currently
* uses for type balancing.
*
* Note: This predicate will produce multiple tuples for endpoints that have multiple characteristics, which we must
* then group together into a list of characteristics.
*/
query predicate trainingEndpoints(
  DataFlow::Node endpoint, EndpointType endpointClass, EndpointCharacteristic characteristic
) {
  characteristic.appliesToEndpoint(endpoint) and
  // Only consider the source code for the project being analyzed.
  exists(endpoint.getLocation().getFile().getRelativePath()) and
  // Only select endpoints that can be part of a tainted flow: Constant expressions always evaluate to a constant
  // primitive value. Therefore they can't ever appear in an alert, making them less interesting training examples.
  // TODO: Experiment with removing this requirement.
  // not endpoint.asExpr() instanceof CompileTimeConstantExpr and
  not exists(EndpointFilterCharacteristic efc | efc.appliesToEndpoint(endpoint)) and
  // Do not select endpoints filtered out by end-to-end evaluation.
  // TODO: Experiment with removing this requirement.
  not Exclusions::isFileExcluded(endpoint.getLocation().getFile()) and
  // Filter out negative examples that also have a LikelyNotASinkReason, because this is currently done here
  // https://github.com/github/codeql/blob/387e57546bf7352f7c1cfe781daa1a3799b7063e/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/extraction/ExtractEndpointData.qll#L77
  // TODO: Experiment with removing this requirement.
  not (
    endpointClass instanceof NegativeType and
    exists(EndpointCharacteristic c |
      c.appliesToEndpoint(endpoint) and
      c instanceof LikelyNotASinkCharacteristic
    )
  ) and
  // Don't surface endpoint filters as characteristics, because they were previously not surfaced.
  // TODO: Experiment with surfacing these to the modeling code by removing the following line (and then make
  // EndpointFilterCharacteristic private).
  not characteristic instanceof EndpointFilterCharacteristic and
  // The endpoint is labeled either positively (first disjunct) or negatively (second disjunct).
  (
    // If the list of characteristics includes positive indicators with high confidence for this class, select this as a
    // training sample belonging to the class.
    exists(EndpointCharacteristic characteristic2, float confidence |
      characteristic2.appliesToEndpoint(endpoint) and
      characteristic2.hasImplications(endpointClass, true, confidence) and
      confidence >= characteristic2.getHighConfidenceThreshold()
    ) and
    (
      // Temporarily limit this only to positive classes. For negative classes, additionally select only endpoints that
      // have no high confidence indicators that they are sinks, because this is what was previously done.
      // TODO: Experiment with removing this requirement, and instead ensuring that an endpoint never has both a high
      // confidence indicator that it _is_ a sink and a high confidence indicator that it is _not_ a sink.
      not endpointClass instanceof NegativeType
      or
      not exists(EndpointCharacteristic characteristic3, float confidence3, EndpointType posClass |
        characteristic3.appliesToEndpoint(endpoint) and
        characteristic3.hasImplications(posClass, true, confidence3) and
        confidence3 >= characteristic3.getHighConfidenceThreshold() and
        not posClass instanceof NegativeType
      )
    )
    or
    // If the list of characteristics includes negative indicators with high confidence for all classes other than 0,
    // select this as a training sample of class 0 (this means we had query-specific characteristics to decide this
    // endpoint isn't a sink for each of our sink types).
    endpointClass instanceof NegativeType and
    forall(EndpointType otherClass | not otherClass instanceof NegativeType |
      exists(EndpointCharacteristic characteristic2, float confidence |
        characteristic2.appliesToEndpoint(endpoint) and
        characteristic2.hasImplications(otherClass, false, confidence) and
        confidence >= characteristic2.getHighConfidenceThreshold()
      )
    )
  )
}
/**
* Temporary:
* Reformat the training data that was extracted with the new logic to match the format produced by the old predicate.
* This is the format expected by the endpoint pipeline.
*/
query predicate reformattedTrainingEndpoints(
  DataFlow::Node endpoint, string queryName, string key, string value, string valueType
) {
  trainingEndpoints(endpoint, _, _) and
  exists(Query query |
    queryName = query.getName() and
    // For sinks, only list that sink type, but for non-sinks, list all sink types.
    (
      // The endpoint is a sink of the type belonging to this query...
      exists(EndpointType endpointClass |
        endpointClass.getDescription().matches(queryName + "%") and
        not endpointClass instanceof NegativeType and
        trainingEndpoints(endpoint, endpointClass, _)
      )
      or
      // ...or a negative example, which is paired with every query.
      exists(EndpointType endpointClass |
        endpointClass instanceof NegativeType and
        trainingEndpoints(endpoint, endpointClass, _)
      )
    ) and
    // One row per (key, value, valueType) metadata triple for the endpoint.
    (
      // NOTE: We don't use hasFlowFromSource in training, so we could just hardcode it to be false.
      key = "hasFlowFromSource" and
      (
        if FlowFromSource::hasFlowFromSource(endpoint, query)
        then value = "true"
        else value = "false"
      ) and
      valueType = "boolean"
      or
      // Constant expressions always evaluate to a constant primitive value. Therefore they can't ever
      // appear in an alert, making them less interesting training examples.
      key = "isConstantExpression" and
      (
        if endpoint.asExpr() instanceof CompileTimeConstantExpr
        then value = "true"
        else value = "false"
      ) and
      valueType = "boolean"
      or
      // Holds if alerts involving the endpoint are excluded from the end-to-end evaluation.
      key = "isExcludedFromEndToEndEvaluation" and
      (
        if Exclusions::isFileExcluded(endpoint.getLocation().getFile())
        then value = "true"
        else value = "false"
      ) and
      valueType = "boolean"
      or
      // The label for this query, considering the endpoint as a sink.
      key = "sinkLabel" and
      valueType = "string" and
      value = "Sink" and
      exists(EndpointType endpointClass |
        endpointClass.getDescription().matches(queryName + "%") and
        not endpointClass instanceof NegativeType and
        trainingEndpoints(endpoint, endpointClass, _)
      )
      or
      key = "sinkLabel" and
      valueType = "string" and
      value = "NotASink" and
      exists(EndpointType endpointClass |
        endpointClass instanceof NegativeType and
        trainingEndpoints(endpoint, endpointClass, _)
      )
      or
      // The reason, or reasons, why the endpoint was labeled NotASink for this query, only for negative examples.
      key = "notASinkReason" and
      exists(EndpointCharacteristic characteristic, EndpointType endpointClass |
        characteristic.appliesToEndpoint(endpoint) and
        characteristic.hasImplications(endpointClass, true, _) and
        endpointClass instanceof NegativeType and
        // `EndpointCharacteristic` is string-valued, so the characteristic itself is the reason.
        value = characteristic
      ) and
      // Don't include a notASinkReason for endpoints that are also known sinks.
      not exists(EndpointCharacteristic characteristic3, float confidence3, EndpointType posClass |
        characteristic3.appliesToEndpoint(endpoint) and
        characteristic3.hasImplications(posClass, true, confidence3) and
        confidence3 >= characteristic3.getHighConfidenceThreshold() and
        not posClass instanceof NegativeType
      ) and
      // Don't surface endpoint filters as notASinkReasons, because they were previously not surfaced.
      // TODO: Experiment with surfacing these to the modeling code by removing the following line (and then make
      // EndpointFilterCharacteristic private).
      not value instanceof EndpointFilterCharacteristic and
      valueType = "string"
    )
  )
}
/**
* Gets the ATM data flow configuration for the specified query.
* TODO: Delete this once we are no longer surfacing `hasFlowFromSource`.
*/
DataFlow::Configuration getDataFlowCfg(Query query) {
  // Map each of the three boosted queries to its ATM data-flow configuration.
  query instanceof SqlTaintedQuery and result instanceof SqlTaintedAtm::SqlTaintedAtmConfig
  or
  query instanceof TaintedPathQuery and result instanceof TaintedPathAtm::TaintedPathAtmConfig
  or
  query instanceof RequestForgeryQuery and
  result instanceof RequestForgeryAtm::RequestForgeryAtmConfig
}
// TODO: Delete this once we are no longer surfacing `hasFlowFromSource`.
// TODO: Delete this once we are no longer surfacing `hasFlowFromSource`.
private module FlowFromSource {
  /** Holds if query `q`'s configuration finds flow from some known source to `endpoint`. */
  predicate hasFlowFromSource(DataFlow::Node endpoint, Query q) {
    exists(Configuration cfg | cfg.getQuery() = q | cfg.hasFlow(_, endpoint))
  }
  /**
   * A data flow configuration that replicates the data flow configuration for a specific query, but
   * replaces the set of sinks with the set of endpoints we're extracting.
   *
   * We use this to find out when there is flow to a particular endpoint from a known source.
   *
   * This configuration behaves in a very similar way to the `ForwardExploringConfiguration` class
   * from the CodeQL standard libraries for JavaScript.
   */
  private class Configuration extends DataFlow::Configuration {
    Query q;
    Configuration() { this = getDataFlowCfg(q) }
    Query getQuery() { result = q }
    /** Holds if `sink` is an endpoint we're extracting. */
    // Every node is treated as a candidate sink, so flow exploration reaches all endpoints.
    override predicate isSink(DataFlow::Node sink) { any() }
    // override predicate isSink(DataFlow::Node sink, DataFlow::FlowLabel lbl) { exists(lbl) }
  }
}

View File

@@ -0,0 +1,11 @@
/**
* @name Endpoint types
* @description Maps endpoint type encodings to human-readable descriptions.
* @kind table
* @id java/ml-powered/model-building/endpoint-type-encodings
*/
import experimental.adaptivethreatmodeling.EndpointTypes
from EndpointType type
select type.getEncoding() as label, type.getDescription() as labelName order by label

View File

@@ -0,0 +1,25 @@
/*
* For internal use only.
*
* Maps ML-powered queries to their `EndpointType` for clearer labelling while evaluating ML model during training.
*/
import experimental.adaptivethreatmodeling.SqlTaintedATM as SqlTaintedAtm
import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestForgeryAtm
import experimental.adaptivethreatmodeling.AdaptiveThreatModeling
from string queryName, AtmConfig c, EndpointType e
where
(
queryName = "SqlTainted" and
c instanceof SqlTaintedAtm::SqlTaintedAtmConfig
or
queryName = "TaintedPath" and
c instanceof TaintedPathAtm::TaintedPathAtmConfig
or
queryName = "RequestForgery" and
c instanceof RequestForgeryAtm::RequestForgeryAtmConfig
) and
e = c.getASinkEndpointType()
select queryName, e.getEncoding() as label

View File

@@ -0,0 +1,44 @@
/*
* For internal use only.
*
* Query for finding misclassified endpoints which we can use to debug ML-powered queries.
*/
import java
import experimental.adaptivethreatmodeling.AdaptiveThreatModeling
import experimental.adaptivethreatmodeling.ATMConfig
import experimental.adaptivethreatmodeling.BaseScoring
import experimental.adaptivethreatmodeling.EndpointFeatures as EndpointFeatures
import experimental.adaptivethreatmodeling.EndpointTypes
import semmle.code.java.security.QueryInjection
/** Gets the positive endpoint type for which you wish to find misclassified examples. */
EndpointType getEndpointType() { result instanceof SqlTaintedSinkType }
/** Get a positive endpoint. This will be run through the classifier to determine whether it is misclassified. */
DataFlow::Node getAPositiveEndpoint() { result instanceof QueryInjectionSink }
/** An ATM configuration to find misclassified endpoints of type `getEndpointType()`. */
/** An ATM configuration to find misclassified endpoints of type `getEndpointType()`. */
class ExtractMisclassifiedEndpointsAtmConfig extends AtmConfig {
  ExtractMisclassifiedEndpointsAtmConfig() { this = "ExtractMisclassifiedEndpointsATMConfig" }
  /** Every positive endpoint is treated as a candidate sink for scoring. */
  override predicate isEffectiveSink(DataFlow::Node sinkCandidate) {
    sinkCandidate = getAPositiveEndpoint()
  }
  override EndpointType getASinkEndpointType() { result = getEndpointType() }
}
/** Get an endpoint from `getAPositiveEndpoint()` that is incorrectly excluded from the results. */
/** Get an endpoint from `getAPositiveEndpoint()` that is incorrectly excluded from the results. */
// NOTE(review): the name has a typo ("Misclassifed"); renaming requires updating its callers.
DataFlow::Node getAMisclassifedEndpoint() {
  any(ExtractMisclassifiedEndpointsAtmConfig config).isEffectiveSink(result) and
  // Misclassified = a known-positive endpoint that the scoring results would not include.
  not any(ScoringResults results).shouldResultBeIncluded(_, result)
}
/** The token features for each misclassified endpoint. */
/** The token features for each misclassified endpoint. */
query predicate tokenFeaturesForMisclassifiedEndpoints(
  DataFlow::Node endpoint, string featureName, string featureValue
) {
  endpoint = getAMisclassifedEndpoint() and
  EndpointFeatures::tokenFeatures(endpoint, featureName, featureValue)
}

View File

@@ -0,0 +1,52 @@
/**
 * Surfaces endpoints that are non-sinks with high confidence, for use as negative examples in the prompt.
*
* @name Negative examples (experimental)
* @kind problem
* @id java/ml-powered/non-sink
* @tags experimental security
*/
private import java
import semmle.code.java.dataflow.TaintTracking
private import experimental.adaptivethreatmodeling.EndpointCharacteristics as EndpointCharacteristics
private import experimental.adaptivethreatmodeling.EndpointTypes
bindingset[rate]
DataFlow::Node getSampleFromSampleRate(float rate) {
exists(int r |
result =
rank[r](DataFlow::Node n, string path, int a, int b, int c, int d |
n.asExpr().getLocation().hasLocationInfo(path, a, b, c, d)
|
n order by path, a, b, c, d
) and
r % (1 / rate).ceil() = 0
)
}
from
DataFlow::Node endpoint, EndpointCharacteristics::EndpointCharacteristic characteristic,
float confidence
where
characteristic.appliesToEndpoint(endpoint) and
confidence >= characteristic.highConfidence() and
characteristic.hasImplications(any(NegativeType negative), true, confidence) and
// Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly
// certain about in the prompt.
not EndpointCharacteristics::erroneousEndpoints(endpoint, _, _, _, _) and
// It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be
// treated by the actual query as a sanitizer, since the final logic is something like
// `isSink(n) and not isSanitizer(n)`. We don't want to include such nodes as negative examples in the prompt, because
// they're ambiguous and might confuse the model, so we explicitly exclude all known sinks from the negative examples.
not exists(
EndpointCharacteristics::EndpointCharacteristic characteristic2, float confidence2,
EndpointType positiveType
|
characteristic2.appliesToEndpoint(endpoint) and
confidence2 >= characteristic2.maximalConfidence() and
not positiveType instanceof NegativeType and
characteristic2.hasImplications(positiveType, true, confidence2)
) and
endpoint = getSampleFromSampleRate(0.01)
select endpoint, "Non-sink of type " + characteristic + " with confidence " + confidence.toString()

View File

@@ -0,0 +1,34 @@
/**
 * Surfaces endpoints that are sinks with high confidence, for use as positive examples in the prompt.
*
* @name Positive examples (experimental)
* @kind problem
* @id java/ml-powered/known-sink
* @tags experimental security
*/
private import java
import semmle.code.java.dataflow.TaintTracking
private import experimental.adaptivethreatmodeling.EndpointCharacteristics as EndpointCharacteristics
private import experimental.adaptivethreatmodeling.ATMConfig as AtmConfig
private import experimental.adaptivethreatmodeling.SqlTaintedATM as SqlTaintednAtm
private import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
private import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestForgeryAtm
/*
* ****** WARNING: ******
* Before calling this query, make sure there's no codex-generated data extension file in `java/ql/lib/ext`. Otherwise,
 * the ML-generated, noisy sinks will end up polluting the positive examples used in the prompt!
*/
from DataFlow::Node sink, AtmConfig::AtmConfig config
where
config.isKnownSink(sink) and
// If there are _any_ erroneous endpoints, return nothing. This will prevent us from accidentally running this query
// when there's a codex-generated data extension file in `java/ql/lib/ext`.
not EndpointCharacteristics::erroneousEndpoints(_, _, _, _, _) and
// It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be
// treated by the actual query as a sanitizer, since the final logic is something like
// `isSink(n) and not isSanitizer(n)`. We don't want to include such nodes as positive examples in the prompt.
not config.isSanitizer(sink)
select sink, config.getASinkEndpointType().getDescription()

View File

@@ -0,0 +1,48 @@
/**
 * Surfaces the endpoints that pass the endpoint filters and have flow from a source for each query config, and are
 * therefore used as candidates for classification with an ML model.
 *
 * Note: This query does not actually classify the endpoints using the model.
 *
 * @name Sink candidates with flow (experimental)
 * @description Sink candidates with flow from a source
 * @kind problem
 * @id java/ml-powered/sink-candidates-with-flow
 * @tags experimental security
 */

private import java
import semmle.code.java.dataflow.TaintTracking
private import experimental.adaptivethreatmodeling.ATMConfig as AtmConfig
private import experimental.adaptivethreatmodeling.SqlTaintedATM as SqlTaintedAtm
private import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
private import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestForgeryAtm

from
  DataFlow::Node sink, string message, string package, string type, boolean subtypes, string name,
  string signature, string ext, string input, string provenance
where
  // Describe the sink as a call-argument: the callee's package/type/name/signature plus the
  // argument position. NOTE(review): these columns mirror the fields of a models-as-data sink
  // row — confirm against the models-as-data extension format before relying on them.
  exists(Callable callee, Call call, int index |
    sink.asExpr() = call.getArgument(index) and
    callee = call.getCallee() and
    package = callee.getDeclaringType().getPackage().getName() and
    type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations?
    subtypes = true and // TODO
    name = callee.getName() and // TODO: Will this work for constructors?
    signature = callee.paramsString() and
    ext = "" and // TODO
    input = "Argument[" + index + "]" and // TODO: why are slashes added?
    provenance = "manual" // TODO
  ) and
  // The message is the concatenation of all relevant configs, and we surface only sinks that have at least one relevant
  // config.
  // `strictconcat` has no result when no config matches, so such sinks are filtered out entirely.
  message =
    strictconcat(AtmConfig::AtmConfig config, DataFlow::PathNode sinkPathNode |
      config.isSinkCandidateWithFlow(sinkPathNode) and
      sinkPathNode.getNode() = sink
    |
      config.getASinkEndpointType().getDescription(), ", "
    ) + "\n{'package': '" + package + "', 'type': '" + type + "', 'subtypes': " + subtypes +
      ", 'name': '" + name + "', 'signature': '" + signature + "', 'ext': '" + ext + "', 'input': '"
      + input + "', 'provenance': '" + provenance + "'}" // TODO: Why are the curly braces added twice?
select sink, message

View File

@@ -0,0 +1,29 @@
/*
 * For internal use only.
 *
 * Labels used in training and evaluation data to indicate knowledge about whether an endpoint is a
 * sink for a particular security query.
 */

/** The three possible labels: known sink, known non-sink, or unknown. */
newtype TEndpointLabel =
  TSinkLabel() or
  TNotASinkLabel() or
  TUnknownLabel()

/**
 * An endpoint label. Each concrete subclass defines the string encoding used when the label is
 * written into training/evaluation data.
 */
abstract class EndpointLabel extends TEndpointLabel {
  /** Gets the string encoding of this label as it appears in the data. */
  abstract string getEncoding();

  /** Gets a textual representation of this label (its encoding). */
  string toString() { result = getEncoding() }
}

/** A label for an endpoint that is known to be a sink. */
class SinkLabel extends EndpointLabel, TSinkLabel {
  override string getEncoding() { result = "Sink" }
}

/** A label for an endpoint that is known not to be a sink. */
class NotASinkLabel extends EndpointLabel, TNotASinkLabel {
  override string getEncoding() { result = "NotASink" }
}

/** A label for an endpoint whose sink status is unknown. */
class UnknownLabel extends EndpointLabel, TUnknownLabel {
  override string getEncoding() { result = "Unknown" }
}

View File

@@ -0,0 +1,17 @@
/*
 * For internal use only.
 */

private import experimental.adaptivethreatmodeling.FeaturizationConfig
private import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow

/**
 * A featurization config that featurizes all endpoints.
 *
 * This should only be used in extraction queries and tests.
 */
class NoRestrictionsFeaturizationConfig extends FeaturizationConfig {
  NoRestrictionsFeaturizationConfig() { this = "NoRestrictionsFeaturization" }

  // The body `any()` imposes no constraint beyond the declared return type, so every
  // `DataFlow::Node` in the database is a result — i.e. no endpoint is excluded.
  override DataFlow::Node getAnEndpointToFeaturize() { any() }
}

View File

@@ -0,0 +1,28 @@
/*
 * For internal use only.
 *
 * Represents the security queries for which we currently have ML-powered versions.
 */

/** One branch per supported ML-powered query. */
newtype TQuery =
  TSqlTaintedQuery() or
  TTaintedPathQuery() or
  TRequestForgeryQuery()

/**
 * A security query with an ML-powered version. Each concrete subclass supplies the query's
 * canonical name.
 */
abstract class Query extends TQuery {
  /** Gets the canonical name of this query. */
  abstract string getName();

  /** Gets a textual representation of this query (its name). */
  string toString() { result = getName() }
}

/** The SQL-injection ("query built from user-controlled sources") query. */
class SqlTaintedQuery extends Query, TSqlTaintedQuery {
  override string getName() { result = "SqlTainted" }
}

/** The path-injection ("uncontrolled data used in path expression") query. */
class TaintedPathQuery extends Query, TTaintedPathQuery {
  override string getName() { result = "TaintedPath" }
}

/** The server-side request forgery query. */
class RequestForgeryQuery extends Query, TRequestForgeryQuery {
  override string getName() { result = "RequestForgery" }
}

View File

@@ -0,0 +1,10 @@
name: codeql/java-experimental-atm-model-building
description: CodeQL libraries for building machine learning models for the experimental ML-powered queries
extractor: java
library: false
groups:
- java
- experimental
dependencies:
codeql/java-experimental-atm-lib: ${workspace}
codeql/java-experimental-atm-model: "0.0.0"

View File

@@ -0,0 +1,25 @@
/**
 * For internal use only.
 *
 * @name Server-side request forgery (experimental)
 * @description Making web requests based on unvalidated user-input
 *              may cause the server to communicate with malicious servers.
 * @kind path-problem
 * @scored
 * @problem.severity error
 * @security-severity 9.1
 * @precision high
 * @id java/ml-powered/ssrf
 * @tags experimental security
 *       external/cwe/cwe-918
 */

import experimental.adaptivethreatmodeling.RequestForgeryATM
import AtmResultsInfo
import DataFlow::PathGraph

// Surface every source-to-sink path that survives ATM boosting, together with the
// score the model assigned to the sink.
from
  AtmConfig atmConfig, DataFlow::PathNode sourceNode, DataFlow::PathNode sinkNode, float sinkScore
where atmConfig.hasBoostedFlowPath(sourceNode, sinkNode, sinkScore)
select sinkNode.getNode(), sourceNode, sinkNode,
  "(Experimental) Potential server-side request forgery due to a $@.", sourceNode.getNode(),
  "user-provided value", sinkScore

View File

@@ -0,0 +1,25 @@
/**
 * For internal use only.
 *
 * @name Query built from user-controlled sources (experimental)
 * @description Building a SQL or Java Persistence query from user-controlled sources is vulnerable to insertion of
 *              malicious code by the user.
 * @kind path-problem
 * @scored
 * @problem.severity error
 * @security-severity 8.8
 * @precision high
 * @id java/ml-powered/sql-injection
 * @tags experimental security
 *       external/cwe/cwe-089
 *       external/cwe/cwe-564
 */

import experimental.adaptivethreatmodeling.SqlTaintedATM
import AtmResultsInfo
import DataFlow::PathGraph

// Surface every source-to-sink path that survives ATM boosting, together with the
// score the model assigned to the sink.
from
  AtmConfig atmConfig, DataFlow::PathNode sourceNode, DataFlow::PathNode sinkNode, float sinkScore
where atmConfig.hasBoostedFlowPath(sourceNode, sinkNode, sinkScore)
select sinkNode.getNode(), sourceNode, sinkNode, "(Experimental) This query depends on a $@.",
  sourceNode.getNode(), "user-provided value", sinkScore

View File

@@ -0,0 +1,26 @@
/**
 * For internal use only.
 *
 * @name Uncontrolled data used in path expression (experimental)
 * @description Accessing paths influenced by users can allow an attacker to access unexpected resources.
 * @kind path-problem
 * @scored
 * @problem.severity error
 * @security-severity 7.5
 * @precision high
 * @id java/ml-powered/path-injection
 * @tags experimental security
 *       external/cwe/cwe-022
 *       external/cwe/cwe-023
 *       external/cwe/cwe-036
 *       external/cwe/cwe-073
 */

import experimental.adaptivethreatmodeling.TaintedPathATM
import AtmResultsInfo
import DataFlow::PathGraph

// Surface every source-to-sink path that survives ATM boosting, together with the
// score the model assigned to the sink.
from
  AtmConfig atmConfig, DataFlow::PathNode sourceNode, DataFlow::PathNode sinkNode, float sinkScore
where atmConfig.hasBoostedFlowPath(sourceNode, sinkNode, sinkScore)
select sinkNode.getNode(), sourceNode, sinkNode, "(Experimental) This path depends on a $@.",
  sourceNode.getNode(), "user-provided value", sinkScore

View File

@@ -0,0 +1,6 @@
---
dependencies:
codeql/java-experimental-atm-model:
version: 0.0.1
compiled: false
lockVersion: 1.0.0

View File

@@ -0,0 +1,2 @@
- description: ATM boosted Code Scanning queries for Java
- queries: .

View File

@@ -0,0 +1,12 @@
name: codeql/java-experimental-atm-queries
description: Experimental ML-powered queries for Java
language: java
version: 0.4.5
suites: codeql-suites
defaultSuiteFile: codeql-suites/java-atm-code-scanning.qls
groups:
- java
- experimental
dependencies:
codeql/java-experimental-atm-lib: ${workspace}
codeql/java-experimental-atm-model: "0.0.1"

View File

@@ -0,0 +1,2 @@
**/*.testproj
**/*.actual

View File

@@ -0,0 +1,6 @@
---
dependencies:
codeql/java-experimental-atm-model:
version: 0.3.0
compiled: false
lockVersion: 1.0.0

View File

@@ -0,0 +1,4 @@
name: codeql/java-experimental-atm-tests
extractor: java
dependencies:
codeql/java-experimental-atm-model-building: ${workspace}

View File

@@ -5,4 +5,4 @@ groups:
- javascript
- experimental
mlModels:
- "resources/*.codeqlmodel"
- "resources/shellcommand.codeqlmodel"

View File

@@ -1,6 +1,6 @@
---
dependencies:
codeql/javascript-experimental-atm-model:
version: 0.3.0
dsp-testing/javascript-experimental-atm-model:
version: 0.3.1-2022-12-21-01h55m24s.gray-roof-szzhgkwk.689231edea6179400bcffbcb0e7f6eb2bacd29c6be27a2930dd4f63ccdb64f34
compiled: false
lockVersion: 1.0.0