mirror of
https://github.com/github/codeql.git
synced 2026-05-16 04:09:27 +02:00
Compare commits
1 Commits
codeql-cli
...
jhelie/add
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
21d258fa37 |
@@ -8,16 +8,16 @@ provide:
|
||||
- "cpp/ql/test/query-tests/Security/CWE/CWE-190/semmle/tainted/qlpack.yml"
|
||||
- "go/ql/config/legacy-support/qlpack.yml"
|
||||
- "go/build/codeql-extractor-go/codeql-extractor.yml"
|
||||
- "javascript/ql/experimental/adaptivethreatmodeling/lib/qlpack.yml"
|
||||
- "*/ql/experimental/adaptivethreatmodeling/lib/qlpack.yml"
|
||||
# This pack is explicitly excluded from the workspace since most users
|
||||
# will want to use a version of this pack from the package cache. Internal
|
||||
# users can uncomment the following line and place a custom ML model
|
||||
# in the corresponding pack to test a custom ML model within their local
|
||||
# checkout.
|
||||
# - "javascript/ql/experimental/adaptivethreatmodeling/model/qlpack.yml"
|
||||
- "javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/qlpack.yml"
|
||||
- "javascript/ql/experimental/adaptivethreatmodeling/src/qlpack.yml"
|
||||
- "javascript/ql/experimental/adaptivethreatmodeling/test/qlpack.yml"
|
||||
- "*/ql/experimental/adaptivethreatmodeling/model/qlpack.yml"
|
||||
- "*/ql/experimental/adaptivethreatmodeling/modelbuilding/qlpack.yml"
|
||||
- "*/ql/experimental/adaptivethreatmodeling/src/qlpack.yml"
|
||||
- "*/ql/experimental/adaptivethreatmodeling/test/qlpack.yml"
|
||||
- "csharp/ql/campaigns/Solorigate/lib/qlpack.yml"
|
||||
- "csharp/ql/campaigns/Solorigate/src/qlpack.yml"
|
||||
- "csharp/ql/campaigns/Solorigate/test/qlpack.yml"
|
||||
|
||||
@@ -0,0 +1,161 @@
|
||||
/**
|
||||
* For internal use only.
|
||||
*
|
||||
* Configures boosting for adaptive threat modeling (ATM).
|
||||
*/
|
||||
|
||||
private import java as java
|
||||
private import semmle.code.java.dataflow.TaintTracking
|
||||
import EndpointTypes
|
||||
import EndpointCharacteristics as EndpointCharacteristics
|
||||
import AdaptiveThreatModeling::ATM::ResultsInfo as AtmResultsInfo
|
||||
|
||||
/**
 * EXPERIMENTAL. This API may change in the future.
 *
 * A configuration class for defining known endpoints and endpoint filters for adaptive threat
 * modeling (ATM). Each boosted query must define its own extension of this abstract class.
 *
 * A configuration defines a set of known sources (`isKnownSource`) and sinks (`isKnownSink`).
 * It must also define a sink endpoint filter (`isEffectiveSink`) that filters candidate sinks
 * predicted by the machine learning model to a set of effective sinks.
 *
 * To get started with ATM, you can copy-paste an implementation of the relevant predicates from a
 * `DataFlow::Configuration` or `TaintTracking::Configuration` class for a standard security query.
 * For example, for SQL injection you can start by defining the `isKnownSource` and `isKnownSink`
 * predicates in the ATM configuration by copying and pasting the implementations of `isSource` and
 * `isSink` from `SqlInjection::Configuration`.
 *
 * Note that if the security query configuration defines additional edges beyond the standard data
 * flow edges, such as `NosqlInjection::Configuration`, you may need to replace the definition of
 * `isAdditionalFlowStep` with a more generalised definition of additional edges. See
 * `NosqlInjectionATM.qll` for an example of doing this.
 */
abstract class AtmConfig extends TaintTracking::Configuration {
  // No-op characteristic predicate: concrete subclasses supply the configuration's string value
  // in their own charpred, hence the `bindingset[this]`.
  bindingset[this]
  AtmConfig() { any() }

  /**
   * Holds if `source` is a relevant taint source. When sources are not boosted, `isSource` is
   * equivalent to `isKnownSource` (i.e. there are no "effective" sources to be classified by an
   * ML model).
   */
  override predicate isSource(DataFlow::Node source) { this.isKnownSource(source) }

  /**
   * Holds if `sink` is a known taint sink or an "effective" sink (a candidate to be classified by
   * an ML model).
   */
  override predicate isSink(DataFlow::Node sink) {
    this.isKnownSink(sink) or this.isEffectiveSink(sink)
  }

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Holds if `source` is a known source of flow. Empty by default; subclasses override this to
   * declare their known sources.
   */
  predicate isKnownSource(DataFlow::Node source) { none() }

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Holds if `sink` is a known sink of flow.
   */
  final predicate isKnownSink(DataFlow::Node sink) {
    // If the list of characteristics includes positive indicators with maximal confidence for this
    // class, then it's a known sink for the class.
    exists(EndpointCharacteristics::EndpointCharacteristic characteristic |
      characteristic.appliesToEndpoint(sink) and
      characteristic
          .hasImplications(this.getASinkEndpointType(), true, characteristic.maximalConfidence())
    )
  }

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Holds if the candidate source `candidateSource` predicted by the machine learning model should
   * be an effective source, i.e. one considered as a possible source of flow in the boosted query.
   * Empty by default; subclasses override this to boost sources.
   */
  predicate isEffectiveSource(DataFlow::Node candidateSource) { none() }

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Holds if the candidate sink `candidateSink` predicted by the machine learning model should be
   * an effective sink, i.e. one considered as a possible sink of flow in the boosted query.
   */
  predicate isEffectiveSink(DataFlow::Node candidateSink) {
    not exists(this.getAReasonSinkExcluded(candidateSink))
  }

  /**
   * Gets a characteristic that causes `candidateSink` to be excluded as an effective sink.
   */
  final EndpointCharacteristics::EndpointCharacteristic getAReasonSinkExcluded(
    DataFlow::Node candidateSink
  ) {
    // An endpoint is an effective sink (sink candidate) if none of its characteristics give much
    // indication whether or not it is a sink. Historically, we used endpoint filters, and scored
    // endpoints that are filtered out neither by a standard endpoint filter nor by an endpoint
    // filter specific to this sink type.
    exists(EndpointCharacteristics::EndpointCharacteristic filter, float confidence |
      filter.appliesToEndpoint(candidateSink) and
      confidence >= filter.mediumConfidence() and
      (
        // Exclude endpoints that have a characteristic that implies they're not sinks for _any_
        // sink type.
        filter.hasImplications(any(NegativeType negative), true, confidence)
        or
        // Exclude endpoints that have a characteristic that implies they're not sinks for _this
        // particular_ sink type.
        filter.hasImplications(this.getASinkEndpointType(), false, confidence)
      ) and
      result = filter
    )
  }

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Gets an endpoint type for the sources of this query. A query may have multiple applicable
   * endpoint types for its sources. Empty by default.
   */
  EndpointType getASourceEndpointType() { none() }

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Gets an endpoint type for the sinks of this query. A query may have multiple applicable
   * endpoint types for its sinks.
   */
  abstract EndpointType getASinkEndpointType();

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Specifies the default cut-off value that controls how many alerts are produced.
   * The cut-off value must be in the range [0,1].
   * A cut-off value of 0 only produces alerts that are likely true-positives.
   * A cut-off value of 1 produces all alerts including those that are likely false-positives.
   */
  float getScoreCutoff() { result = 0.0 }

  /**
   * Holds if there's an ATM alert (a flow path from `source` to `sink` with ML-determined
   * likelihood `score`) according to this ML-boosted configuration, whereas the unboosted base
   * query does not contain this source and sink combination.
   */
  predicate hasBoostedFlowPath(DataFlow::PathNode source, DataFlow::PathNode sink, float score) {
    this.hasFlowPath(source, sink) and
    not AtmResultsInfo::isFlowLikelyInBaseQuery(source.getNode(), sink.getNode()) and
    score = AtmResultsInfo::getScoreForFlow(source.getNode(), sink.getNode())
  }

  /**
   * Holds if `sink` is an effective sink with flow from some source, which gets used as a sink
   * candidate for scoring with the ML model.
   */
  predicate isSinkCandidateWithFlow(DataFlow::PathNode sink) {
    exists(DataFlow::PathNode source |
      this.hasFlowPath(source, sink) and
      not AtmResultsInfo::isFlowLikelyInBaseQuery(source.getNode(), sink.getNode())
    )
  }
}
|
||||
@@ -0,0 +1,124 @@
|
||||
/**
|
||||
* For internal use only.
|
||||
*
|
||||
* Provides information about the results of boosted queries for use in adaptive threat modeling (ATM).
|
||||
*/
|
||||
|
||||
private import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
|
||||
import ATMConfig
|
||||
private import BaseScoring
|
||||
private import EndpointScoring as EndpointScoring
|
||||
|
||||
module ATM {
  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * This module contains informational predicates about the results returned by adaptive threat
   * modeling (ATM).
   */
  module ResultsInfo {
    /**
     * Holds if the flow from `source` to `sink` represents a result with
     * sufficiently high likelihood of being a true-positive.
     */
    pragma[inline]
    private predicate shouldResultBeIncluded(DataFlow::Node source, DataFlow::Node sink) {
      any(ScoringResults results).shouldResultBeIncluded(source, sink)
    }

    /**
     * EXPERIMENTAL. This API may change in the future.
     *
     * Gets the score for the flow between the source `source` and the sink `sink` in the
     * boosted query.
     */
    pragma[inline]
    float getScoreForFlow(DataFlow::Node source, DataFlow::Node sink) {
      any(DataFlow::Configuration cfg).hasFlow(source, sink) and
      shouldResultBeIncluded(source, sink) and
      // `unique` means this predicate has no result at all (rather than several) if the scoring
      // results yield more than one distinct score for the same source/sink pair.
      result = unique(float s | s = any(ScoringResults results).getScoreForFlow(source, sink))
    }

    /**
     * Pad a score returned from `getScoreForFlow` to a particular length by adding a decimal
     * point if one does not already exist, and "0"s after that decimal point.
     *
     * Note that this predicate must itself define an upper bound on `length`, so that it has a
     * finite number of results. Currently this is defined as 12.
     */
    private string paddedScore(float score, int length) {
      // In this definition, we must restrict the values that `length` and `score` can take on so
      // that the predicate has a finite number of results.
      (score = getScoreForFlow(_, _) or score = 0) and
      length = result.length() and
      (
        // We need to make sure the padded score contains a "." so lexically sorting the padded
        // scores is equivalent to numerically sorting the scores.
        score.toString().charAt(_) = "." and
        result = score.toString()
        or
        not score.toString().charAt(_) = "." and
        result = score.toString() + "."
      )
      or
      // Recursive case: extend a shorter padded score with a trailing "0", bounded at length 12.
      result = paddedScore(score, length - 1) + "0" and
      length <= 12
    }

    /**
     * EXPERIMENTAL. This API may change in the future.
     *
     * Gets a string representing the score of the flow between `source` and `sink` in the
     * boosted query.
     *
     * The returned string is a fixed length, such that lexically sorting the strings returned by
     * this predicate gives the same sort order as numerically sorting the scores of the flows.
     */
    pragma[inline]
    string getScoreStringForFlow(DataFlow::Node source, DataFlow::Node sink) {
      exists(float score |
        score = getScoreForFlow(source, sink) and
        (
          // A length of 12 is equivalent to 10 decimal places.
          score.toString().length() >= 12 and
          result = score.toString().substring(0, 12)
          or
          score.toString().length() < 12 and
          result = paddedScore(score, 12)
        )
      )
    }

    /**
     * EXPERIMENTAL. This API may change in the future.
     *
     * Holds if the flow from `source` to `sink` is likely to be reported by the base security
     * query.
     *
     * Currently this is a heuristic: it ignores potential differences in the definitions of
     * additional flow steps.
     */
    pragma[inline]
    predicate isFlowLikelyInBaseQuery(DataFlow::Node source, DataFlow::Node sink) {
      getCfg().isKnownSource(source) and getCfg().isKnownSink(sink)
    }

    /**
     * EXPERIMENTAL. This API may change in the future.
     *
     * Gets additional information about why ATM included the flow from `source` to `sink` as an
     * alert, formatted as "[Source origins: ...; sink origins: ...]", with "unknown" standing in
     * for an empty origin list.
     */
    pragma[inline]
    string getAdditionalAlertInfo(DataFlow::Node source, DataFlow::Node sink) {
      exists(string sourceOrigins, string sinkOrigins |
        sourceOrigins = concat(any(ScoringResults results).getASourceOrigin(source), ", ") and
        sinkOrigins = concat(any(ScoringResults results).getASinkOrigin(sink), ", ") and
        result =
          "[Source origins: " +
            any(string s | if sourceOrigins != "" then s = sourceOrigins else s = "unknown") +
            "; sink origins: " +
            any(string s | if sinkOrigins != "" then s = sinkOrigins else s = "unknown") + "]"
      )
    }
  }
}
|
||||
@@ -0,0 +1,55 @@
|
||||
/**
|
||||
* For internal use only.
|
||||
*
|
||||
* Provides shared scoring functionality for use in adaptive threat modeling (ATM).
|
||||
*/
|
||||
|
||||
private import java
|
||||
private import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
|
||||
private import ATMConfig
|
||||
|
||||
/**
 * Holds if a machine learning model with the given checksum, language, name, and type is
 * available. Being `external`, this relation is supplied at evaluation time rather than
 * computed from the database.
 */
external predicate availableMlModels(
  string modelChecksum, string modelLanguage, string modelName, string modelType
);
|
||||
|
||||
/** Gets the ATM configuration. */
AtmConfig getCfg() { result = any(AtmConfig cfg) }
|
||||
|
||||
/**
 * A string containing scoring information produced by a scoring model.
 *
 * Scoring models include embedding models and endpoint scoring models.
 */
abstract class ScoringResults extends string {
  // No-op characteristic predicate: concrete subclasses supply the string value in their own
  // charpred, hence the `bindingset[this]`.
  bindingset[this]
  ScoringResults() { any() }

  /**
   * Gets ATM's confidence that a path between `source` and `sink` represents a security
   * vulnerability. This will be a number between 0.0 and 1.0.
   */
  abstract float getScoreForFlow(DataFlow::Node source, DataFlow::Node sink);

  /**
   * Gets a string representing why ATM included the given source in the dataflow analysis.
   *
   * In general, there may be multiple reasons why ATM included the given source, in which case
   * this predicate should have multiple results.
   */
  abstract string getASourceOrigin(DataFlow::Node source);

  /**
   * Gets a string representing why ATM included the given sink in the dataflow analysis.
   *
   * In general, there may be multiple reasons why ATM included the given sink, in which case this
   * predicate should have multiple results.
   */
  abstract string getASinkOrigin(DataFlow::Node sink);

  /**
   * Holds if the flow from `source` to `sink` represents a result with
   * sufficiently high likelihood of being a true-positive.
   */
  pragma[inline]
  abstract predicate shouldResultBeIncluded(DataFlow::Node source, DataFlow::Node sink);
}
|
||||
@@ -0,0 +1,607 @@
|
||||
/**
|
||||
* For internal use only.
|
||||
*/
|
||||
|
||||
private import java as java
|
||||
import semmle.code.java.dataflow.TaintTracking
|
||||
import semmle.code.java.security.QueryInjection
|
||||
import semmle.code.java.security.PathCreation
|
||||
import semmle.code.java.security.RequestForgery
|
||||
private import semmle.code.java.dataflow.ExternalFlow
|
||||
import experimental.adaptivethreatmodeling.EndpointTypes
|
||||
private import experimental.adaptivethreatmodeling.ATMConfig
|
||||
private import experimental.adaptivethreatmodeling.SqlTaintedATM
|
||||
private import experimental.adaptivethreatmodeling.TaintedPathATM
|
||||
private import experimental.adaptivethreatmodeling.RequestForgeryATM
|
||||
private import semmle.code.java.security.ExternalAPIs as ExternalAPIs
|
||||
private import semmle.code.java.Expr as Expr
|
||||
|
||||
/**
 * Holds if the given endpoint has a self-contradictory combination of characteristics. Detects errors in our endpoint
 * characteristics. Lists the problematic characteristics and their implications for all such endpoints, together with
 * an error message indicating why this combination is problematic.
 *
 * Copied from javascript/ql/experimental/adaptivethreatmodeling/test/endpoint_large_scale/ContradictoryEndpointCharacteristics.ql
 */
query predicate erroneousEndpoints(
  DataFlow::Node endpoint, EndpointCharacteristic characteristic, EndpointType endpointClass,
  float confidence, string errorMessage
) {
  // An endpoint's characteristics should not include positive indicators with medium/high confidence for more than one
  // class.
  exists(EndpointCharacteristic characteristic2, EndpointType endpointClass2, float confidence2 |
    endpointClass.getEncoding() != endpointClass2.getEncoding() and
    characteristic.appliesToEndpoint(endpoint) and
    characteristic2.appliesToEndpoint(endpoint) and
    characteristic.hasImplications(endpointClass, true, confidence) and
    characteristic2.hasImplications(endpointClass2, true, confidence2) and
    confidence > characteristic.mediumConfidence() and
    confidence2 > characteristic2.mediumConfidence() and
    // It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be
    // treated by the actual query as a sanitizer, since the final logic is something like
    // `isSink(n) and not isSanitizer(n)`.
    not (
      characteristic instanceof IsSanitizerCharacteristic or
      characteristic2 instanceof IsSanitizerCharacteristic
    )
  ) and
  errorMessage = "Endpoint has high-confidence positive indicators for multiple classes"
  or
  // An endpoint's characteristics should not include positive indicators with medium/high confidence for some class and
  // also include negative indicators with medium/high confidence for this same class.
  exists(EndpointCharacteristic characteristic2, float confidence2 |
    characteristic.appliesToEndpoint(endpoint) and
    characteristic2.appliesToEndpoint(endpoint) and
    characteristic.hasImplications(endpointClass, true, confidence) and
    characteristic2.hasImplications(endpointClass, false, confidence2) and
    confidence > characteristic.mediumConfidence() and
    confidence2 > characteristic2.mediumConfidence()
  ) and
  errorMessage = "Endpoint has high-confidence positive and negative indicators for the same class"
}
|
||||
|
||||
/**
 * Holds, with an explanatory `errorMessage`, if `characteristic` declares an implication whose
 * `confidence` value lies outside the valid range [0, 1].
 */
query predicate erroneousConfidences(
  EndpointCharacteristic characteristic, float confidence, string errorMessage
) {
  errorMessage = "Characteristic has an indicator with confidence outside of [0, 1]" and
  characteristic.hasImplications(_, _, confidence) and
  not (confidence >= 0 and confidence <= 1)
}
|
||||
|
||||
/**
 * A set of characteristics that a particular endpoint might have. This set of characteristics is used to make decisions
 * about whether to include the endpoint in the training set and with what label, as well as whether to score the
 * endpoint at inference time.
 */
abstract class EndpointCharacteristic extends string {
  /**
   * Holds when the string matches the name of the characteristic, which should describe some characteristic of the
   * endpoint that is meaningful for determining whether it's a sink and if so of which type.
   */
  bindingset[this]
  EndpointCharacteristic() { any() }

  /**
   * Holds for endpoints that have this characteristic. This predicate contains the logic that applies characteristics
   * to the appropriate set of dataflow nodes.
   */
  abstract predicate appliesToEndpoint(DataFlow::Node n);

  /**
   * This predicate describes what the characteristic tells us about an endpoint.
   *
   * Params:
   * endpointClass: The sink type. Each EndpointType has a predicate getEncoding, which specifies the classifier
   *   class for this sink type. Class 0 is the negative class (non-sink). Each positive int corresponds to a single
   *   sink type.
   * isPositiveIndicator: If true, this characteristic indicates that this endpoint _is_ a member of the class; if
   *   false, it indicates that it _isn't_ a member of the class.
   * confidence: A float in [0, 1], which tells us how strong an indicator this characteristic is for the endpoint
   *   belonging / not belonging to the given class. A confidence near zero means this characteristic is a very weak
   *   indicator of whether or not the endpoint belongs to the class. A confidence of 1 means that all endpoints with
   *   this characteristic definitively do/don't belong to the class.
   */
  abstract predicate hasImplications(
    EndpointType endpointClass, boolean isPositiveIndicator, float confidence
  );

  /** Indicators with confidence at or above this threshold are considered to be high-confidence indicators. */
  final float getHighConfidenceThreshold() { result = 0.8 }

  // The following are some confidence values that are used in practice by the subclasses. They are defined as named
  // constants here to make it easier to change them in the future.
  /** Gets the confidence value (1.0) of a definitive indicator. */
  final float maximalConfidence() { result = 1.0 }

  /** Gets the confidence value (0.9) of a strong, but not definitive, indicator. */
  final float highConfidence() { result = 0.9 }

  /** Gets the confidence value (0.6) of a moderate indicator. */
  final float mediumConfidence() { result = 0.6 }
}
|
||||
|
||||
//----------------------------------------------------------------------------------------------------------------------
|
||||
// Helper predicates.
|
||||
//----------------------------------------------------------------------------------------------------------------------
|
||||
/** Holds if the data flow node `n` corresponds to an expression that is a type access. */
predicate isTypeAccess(DataFlow::Node n) { exists(TypeAccess access | access = n.asExpr()) }
|
||||
|
||||
/**
 * Holds if `n` has the given metadata.
 *
 * This is a helper function to extract and export needed information about each endpoint in the sink candidate query as
 * well as the queries that extract positive and negative examples for the prompt / training set. The metadata is
 * extracted as a string in the format of a Python dictionary.
 */
predicate hasMetadata(DataFlow::Node n, string metadata) {
  exists(
    Callable callee, Call call, string package, string type, boolean subtypes, string name,
    string signature, string ext, int input, string provenance, boolean isPublic,
    boolean isExternalApiDataNode
  |
    // `n` is the `input`-th argument of a call whose target is `callee`.
    n.asExpr() = call.getArgument(input) and
    callee = call.getCallee() and
    package = callee.getDeclaringType().getPackage().getName() and
    type = callee.getDeclaringType().getName() and // TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations?
    (
      // A final callee on a final type cannot be overridden, so subtypes need not be considered.
      if callee.isFinal() or callee.getDeclaringType().isFinal()
      then subtypes = false // See https://github.com/github/codeql-java-team/issues/254#issuecomment-1422296423
      else subtypes = true
    ) and
    name = callee.getName() and // TODO: Will this work for constructors?
    signature = paramsString(callee) and // TODO: Why are brackets being escaped (`\[\]` vs `[]`)?
    ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
    provenance = "manual" and // TODO
    (if callee.isPublic() then isPublic = true else isPublic = false) and
    (
      if n instanceof ExternalAPIs::ExternalApiDataNode
      then isExternalApiDataNode = true
      else isExternalApiDataNode = false
    ) and
    metadata =
      "{'Package': '" + package + "', 'Type': '" + type + "', 'Subtypes': " + subtypes +
        ", 'Name': '" + name + "', 'Signature': '" + signature + "', 'Ext': '" + ext +
        "', 'Argument index': " + input + ", 'Provenance': '" + provenance + "', 'Is public': " +
        isPublic + ", 'Is passed to external API': " + isExternalApiDataNode + "}" // TODO: Why are the curly braces added twice?
  )
}
|
||||
|
||||
// private predicate isKnownExternalApiQuerySink(DataFlow::Node n) {
|
||||
// n instanceof Xxe::Sink or
|
||||
// n instanceof TaintedPath::Sink or
|
||||
// n instanceof XpathInjection::Sink or
|
||||
// n instanceof Xss::Sink or
|
||||
// n instanceof ClientSideUrlRedirect::Sink or
|
||||
// n instanceof CodeInjection::Sink or
|
||||
// n instanceof RequestForgery::Sink or
|
||||
// n instanceof CorsMisconfigurationForCredentials::Sink or
|
||||
// n instanceof CommandInjection::Sink or
|
||||
// n instanceof PrototypePollution::Sink or
|
||||
// n instanceof UnvalidatedDynamicMethodCall::Sink or
|
||||
// n instanceof TaintedFormatString::Sink or
|
||||
// n instanceof NosqlInjection::Sink or
|
||||
// n instanceof PostMessageStar::Sink or
|
||||
// n instanceof RegExpInjection::Sink or
|
||||
// n instanceof SqlTainted::Sink or
|
||||
// n instanceof XmlBomb::Sink or
|
||||
// n instanceof ZipSlip::Sink or
|
||||
// n instanceof UnsafeDeserialization::Sink or
|
||||
// n instanceof ServerSideUrlRedirect::Sink or
|
||||
// n instanceof CleartextStorage::Sink or
|
||||
// n instanceof HttpToFileAccess::Sink
|
||||
// }
|
||||
// /**
|
||||
// * Holds if the node `n` is a known sink in a modeled library.
|
||||
// */
|
||||
// private predicate isKnownLibrarySink(DataFlow::Node n) {
|
||||
// isKnownExternalApiQuerySink(n) or
|
||||
// n instanceof CleartextLogging::Sink or
|
||||
// n instanceof StackTraceExposure::Sink or
|
||||
// n instanceof ShellCommandInjectionFromEnvironment::Sink or
|
||||
// n instanceof InsecureRandomness::Sink or
|
||||
// n instanceof FileAccessToHttp::Sink or
|
||||
// n instanceof IndirectCommandInjection::Sink
|
||||
// }
|
||||
// /**
|
||||
// * Holds if the node `n` is known as the predecessor in a modeled flow step.
|
||||
// */
|
||||
// private predicate isKnownStepSrc(DataFlow::Node n) {
|
||||
// TaintTracking::sharedTaintStep(n, _) or
|
||||
// DataFlow::SharedFlowStep::step(n, _) or
|
||||
// DataFlow::SharedFlowStep::step(n, _, _, _)
|
||||
// }
|
||||
// /**
|
||||
// * Holds if the data flow node is a (possibly indirect) argument of a likely external library call.
|
||||
// *
|
||||
// * This includes direct arguments of likely external library calls as well as nested object
|
||||
// * literals within those calls.
|
||||
// */
|
||||
// private predicate flowsToArgumentOfLikelyExternalLibraryCall(DataFlow::Node n) {
|
||||
// n = getACallWithoutCallee().getAnArgument()
|
||||
// or
|
||||
// exists(DataFlow::SourceNode src | flowsToArgumentOfLikelyExternalLibraryCall(src) |
|
||||
// n = src.getAPropertyWrite().getRhs()
|
||||
// )
|
||||
// or
|
||||
// exists(DataFlow::ArrayCreationNode arr | flowsToArgumentOfLikelyExternalLibraryCall(arr) |
|
||||
// n = arr.getAnElement()
|
||||
// )
|
||||
// }
|
||||
// /**
|
||||
// * Get calls for which we do not have the callee (i.e. the definition of the called function). This
|
||||
// * acts as a heuristic for identifying calls to external library functions.
|
||||
// */
|
||||
// private DataFlow::CallNode getACallWithoutCallee() {
|
||||
// forall(Function callee | callee = result.getACallee() | callee.getTopLevel().isExterns()) and
|
||||
// not exists(DataFlow::ParameterNode param, DataFlow::FunctionNode callback |
|
||||
// param.flowsTo(result.getCalleeNode()) and
|
||||
// callback = getACallback(param, DataFlow::TypeBackTracker::end())
|
||||
// )
|
||||
// }
|
||||
// /**
|
||||
// * Gets a node that flows to callback-parameter `p`.
|
||||
// */
|
||||
// private DataFlow::SourceNode getACallback(DataFlow::ParameterNode p, DataFlow::TypeBackTracker t) {
|
||||
// t.start() and
|
||||
// result = p and
|
||||
// any(DataFlow::FunctionNode f).getLastParameter() = p and
|
||||
// exists(p.getACall())
|
||||
// or
|
||||
// exists(DataFlow::TypeBackTracker t2 | result = getACallback(p, t2).backtrack(t2, t))
|
||||
// }
|
||||
//----------------------------------------------------------------------------------------------------------------------
|
||||
// Characteristics that are indicative of a sink.
|
||||
// NOTE: Initially each sink type has only one characteristic, which is that it's a sink of this type in the standard
|
||||
// Java libraries.
|
||||
//----------------------------------------------------------------------------------------------------------------------
|
||||
// /**
|
||||
// * Endpoints identified as "DomBasedXssSink" by the standard Java libraries are XSS sinks with maximal confidence.
|
||||
// */
|
||||
// private class DomBasedXssSinkCharacteristic extends EndpointCharacteristic {
|
||||
// DomBasedXssSinkCharacteristic() { this = any(XssSinkType type).getDescription() }
|
||||
// override predicate appliesToEndpoint(DataFlow::Node n) { n instanceof DomBasedXss::Sink }
|
||||
// override predicate hasImplications(
|
||||
// EndpointType endpointClass, boolean isPositiveIndicator, float confidence
|
||||
// ) {
|
||||
// endpointClass instanceof XssSinkType and
|
||||
// isPositiveIndicator = true and
|
||||
// confidence = maximalConfidence()
|
||||
// }
|
||||
// }
|
||||
/**
 * Endpoints identified as "TaintedPathSink" by the standard Java libraries are path injection sinks
 * with maximal confidence.
 */
private class TaintedPathSinkCharacteristic extends EndpointCharacteristic {
  TaintedPathSinkCharacteristic() { this = any(TaintedPathSinkType type).getDescription() }

  override predicate appliesToEndpoint(DataFlow::Node n) {
    // Endpoints modeled as "create-file" sinks, plus inputs to path-creation operations.
    sinkNode(n, "create-file")
    or
    exists(PathCreation creation | n.asExpr() = creation.getAnInput())
  }

  override predicate hasImplications(
    EndpointType endpointClass, boolean isPositiveIndicator, float confidence
  ) {
    isPositiveIndicator = true and
    confidence = maximalConfidence() and
    endpointClass instanceof TaintedPathSinkType
  }
}
|
||||
|
||||
/**
 * Endpoints identified as "SqlTaintedSink" by the standard Java libraries are SQL injection sinks
 * with maximal confidence.
 */
private class SqlTaintedSinkCharacteristic extends EndpointCharacteristic {
  SqlTaintedSinkCharacteristic() { this = any(SqlTaintedSinkType type).getDescription() }

  override predicate appliesToEndpoint(DataFlow::Node n) {
    // Any node recognised as a query-injection sink by the standard library models.
    exists(QueryInjectionSink sink | sink = n)
  }

  override predicate hasImplications(
    EndpointType endpointClass, boolean isPositiveIndicator, float confidence
  ) {
    isPositiveIndicator = true and
    confidence = maximalConfidence() and
    endpointClass instanceof SqlTaintedSinkType
  }
}
|
||||
|
||||
/**
|
||||
* Endpoints identified as "RequestForgerySink" by the standard Java libraries are server-side request forgery sinks
|
||||
* with maximal confidence.
|
||||
*/
|
||||
private class RequestForgerySinkCharacteristic extends EndpointCharacteristic {
|
||||
RequestForgerySinkCharacteristic() { this = any(RequestForgerySinkType type).getDescription() }
|
||||
|
||||
override predicate appliesToEndpoint(DataFlow::Node n) { n instanceof RequestForgerySink }
|
||||
|
||||
override predicate hasImplications(
|
||||
EndpointType endpointClass, boolean isPositiveIndicator, float confidence
|
||||
) {
|
||||
endpointClass instanceof RequestForgerySinkType and
|
||||
isPositiveIndicator = true and
|
||||
confidence = maximalConfidence()
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------------------------------------------------
|
||||
// Characteristics that are indicative of not being a sink of any type, and have historically been used to select
|
||||
// negative samples for training.
|
||||
//----------------------------------------------------------------------------------------------------------------------
|
||||
/**
|
||||
* A characteristic that is an indicator of not being a sink of any type, because it's a modeled argument.
|
||||
*/
|
||||
abstract class OtherModeledArgumentCharacteristic extends EndpointCharacteristic {
|
||||
bindingset[this]
|
||||
OtherModeledArgumentCharacteristic() { any() }
|
||||
}
|
||||
|
||||
/**
|
||||
* A characteristic that is an indicator of not being a sink of any type, because it's an argument to a function of a
|
||||
* builtin object.
|
||||
*/
|
||||
abstract private class ArgumentToBuiltinFunctionCharacteristic extends OtherModeledArgumentCharacteristic {
|
||||
bindingset[this]
|
||||
ArgumentToBuiltinFunctionCharacteristic() { any() }
|
||||
}
|
||||
|
||||
/**
|
||||
* A high-confidence characteristic that indicates that an endpoint is not a sink of any type.
|
||||
*/
|
||||
abstract private class NotASinkCharacteristic extends EndpointCharacteristic {
|
||||
bindingset[this]
|
||||
NotASinkCharacteristic() { any() }
|
||||
|
||||
override predicate hasImplications(
|
||||
EndpointType endpointClass, boolean isPositiveIndicator, float confidence
|
||||
) {
|
||||
endpointClass instanceof NegativeType and
|
||||
isPositiveIndicator = true and
|
||||
confidence = highConfidence()
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A medium-confidence characteristic that indicates that an endpoint is not a sink of any type.
|
||||
*
|
||||
* TODO: This class is currently not private, because the current extraction logic explicitly avoids including these
|
||||
* endpoints in the training data. We might want to change this in the future.
|
||||
*/
|
||||
abstract class LikelyNotASinkCharacteristic extends EndpointCharacteristic {
|
||||
bindingset[this]
|
||||
LikelyNotASinkCharacteristic() { any() }
|
||||
|
||||
override predicate hasImplications(
|
||||
EndpointType endpointClass, boolean isPositiveIndicator, float confidence
|
||||
) {
|
||||
endpointClass instanceof NegativeType and
|
||||
isPositiveIndicator = true and
|
||||
confidence = mediumConfidence()
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* An EndpointFilterCharacteristic that indicates that an endpoint is a type access. Type accesses are not sinks.
|
||||
*/
|
||||
private class IsTypeAccessCharacteristic extends NotASinkCharacteristic {
|
||||
IsTypeAccessCharacteristic() { this = "type access" }
|
||||
|
||||
override predicate appliesToEndpoint(DataFlow::Node n) { isTypeAccess(n) }
|
||||
}
|
||||
|
||||
/**
|
||||
* An EndpointFilterCharacteristic that indicates that an endpoint is a sanitizer for some sink type. A sanitizer can
|
||||
* never be a sink.
|
||||
*/
|
||||
private class IsSanitizerCharacteristic extends NotASinkCharacteristic {
|
||||
IsSanitizerCharacteristic() { this = "is sanitizer" }
|
||||
|
||||
override predicate appliesToEndpoint(DataFlow::Node n) {
|
||||
exists(AtmConfig config | config.isSanitizer(n))
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* An EndpointFilterCharacteristic that indicates that an endpoint is an argument to a safe external API method.
|
||||
*
|
||||
* Based on java/ql/lib/semmle/code/java/security/ExternalAPIs.qll.
|
||||
*
|
||||
* TODO: Is this correct?
|
||||
*/
|
||||
private class SafeExternalApiMethodCharacteristic extends NotASinkCharacteristic {
|
||||
string baseDescription;
|
||||
|
||||
SafeExternalApiMethodCharacteristic() {
|
||||
baseDescription = "safe external API method " and
|
||||
this = any(string s | s = baseDescription + ["org.junit", "other than org.junit"])
|
||||
}
|
||||
|
||||
override predicate appliesToEndpoint(DataFlow::Node n) {
|
||||
exists(Expr::Call call |
|
||||
n.asExpr() = call.getAnArgument() and
|
||||
call.getCallee() instanceof ExternalAPIs::SafeExternalApiMethod and
|
||||
(
|
||||
// The vast majority of calls to safe external API methods involve junit. To get a diverse set of negative
|
||||
// examples, we break those off into a separate characteristic.
|
||||
call.getCallee().getDeclaringType().getPackage().getName().matches("org.junit%") and
|
||||
this = baseDescription + "org.junit"
|
||||
or
|
||||
not call.getCallee().getDeclaringType().getPackage().getName().matches("org.junit%") and
|
||||
this = baseDescription + "other than org.junit"
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------------------------------------------------
|
||||
// Characteristics that have historically acted as endpoint filters to exclude endpoints from scoring at inference time.
|
||||
//----------------------------------------------------------------------------------------------------------------------
|
||||
/** A characteristic that has historically acted as an endpoint filter for inference-time scoring. */
|
||||
abstract class EndpointFilterCharacteristic extends EndpointCharacteristic {
|
||||
bindingset[this]
|
||||
EndpointFilterCharacteristic() { any() }
|
||||
}
|
||||
|
||||
/**
|
||||
* An EndpointFilterCharacteristic that indicates that an endpoint is unlikely to be a sink of any type.
|
||||
*/
|
||||
abstract private class StandardEndpointFilterCharacteristic extends EndpointFilterCharacteristic {
|
||||
bindingset[this]
|
||||
StandardEndpointFilterCharacteristic() { any() }
|
||||
|
||||
override predicate hasImplications(
|
||||
EndpointType endpointClass, boolean isPositiveIndicator, float confidence
|
||||
) {
|
||||
endpointClass instanceof NegativeType and
|
||||
isPositiveIndicator = true and
|
||||
confidence = mediumConfidence()
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* An EndpointFilterCharacteristic that indicates that an endpoint is a constant expression. While a constant expression
|
||||
* can be a sink, it cannot be part of a tainted flow: Constant expressions always evaluate to a constant primitive
|
||||
* value, so they can't ever appear in an alert. These endpoints are therefore excluded from scoring at inference time.
|
||||
*
|
||||
* WARNING: These endpoints should not be used as negative samples for training, because they are not necessarily
|
||||
* non-sinks. They are merely not interesting sinks to run through the ML model because they can never be part of a
|
||||
* tainted flow.
|
||||
*/
|
||||
class IsConstantExpressionCharacteristic extends StandardEndpointFilterCharacteristic {
|
||||
IsConstantExpressionCharacteristic() { this = "constant expression" }
|
||||
|
||||
override predicate appliesToEndpoint(DataFlow::Node n) {
|
||||
n.asExpr() instanceof CompileTimeConstantExpr
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* An EndpointFilterCharacteristic that indicates that an endpoint is not part of the source code for the project being
|
||||
* analyzed.
|
||||
*
|
||||
* WARNING: These endpoints should not be used as negative samples for training, because they are not necessarily
|
||||
* non-sinks. They are merely not interesting sinks to run through the ML model.
|
||||
*/
|
||||
private class IsExternalCharacteristic extends StandardEndpointFilterCharacteristic {
|
||||
IsExternalCharacteristic() { this = "external" }
|
||||
|
||||
override predicate appliesToEndpoint(DataFlow::Node n) {
|
||||
not exists(n.getLocation().getFile().getRelativePath())
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* An EndpointFilterCharacteristic that indicates that an endpoint is not the final step in a taint propagation. This
|
||||
* prevents us from detecting expresssions near sinks that are not the sink itself.
|
||||
*
|
||||
* WARNING: These endpoints should not be used as negative samples for training, because a there are rare situations
|
||||
* where a node is both a sink and the `from` node of a flow step: when the called API uses the given value dangerously
|
||||
* and then returns the given value. Example: `stillTainted = dangerous(tainted)`, assuming that the implementation of
|
||||
* `dangerous(x)` eventually returns `x`.
|
||||
*/
|
||||
private class IsFlowStep extends StandardEndpointFilterCharacteristic {
|
||||
IsFlowStep() { this = "flow step" }
|
||||
|
||||
override predicate appliesToEndpoint(DataFlow::Node n) { isKnownStepSrc(n) }
|
||||
|
||||
/**
|
||||
* Holds if the node `n` is known as the predecessor in a modeled flow step.
|
||||
*/
|
||||
private predicate isKnownStepSrc(DataFlow::Node n) {
|
||||
any(TaintTracking::Configuration c).isAdditionalFlowStep(n, _) or
|
||||
TaintTracking::localTaintStep(n, _)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* An EndpointFilterCharacteristic that indicates that an endpoint sits in a test file.
|
||||
*
|
||||
* WARNING: These endpoints should not be used as negative samples for training, because there can in fact be sinks in
|
||||
* test files -- we just don't care to model them because they aren't exploitable.
|
||||
*/
|
||||
private class TestFileCharacteristic extends StandardEndpointFilterCharacteristic {
|
||||
TestFileCharacteristic() { this = "test file" }
|
||||
|
||||
override predicate appliesToEndpoint(DataFlow::Node n) {
|
||||
exists(File f | f = n.getLocation().getFile() and isInTestFile(f))
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if `file` is a test file. Copied from java/ql/src/utils/modelgenerator/internal/CaptureModelsSpecific.qll.
|
||||
*
|
||||
* TODO: Why can't I import utils.modelgenerator.internal.CaptureModelsSpecific?
|
||||
*/
|
||||
private predicate isInTestFile(File file) {
|
||||
file.getAbsolutePath().matches("%src/test/%") or
|
||||
file.getAbsolutePath().matches("%/guava-tests/%") or
|
||||
file.getAbsolutePath().matches("%/guava-testlib/%")
|
||||
}
|
||||
}
|
||||
// class IsArgumentToModeledFunctionCharacteristic extends StandardEndpointFilterCharacteristic {
|
||||
// IsArgumentToModeledFunctionCharacteristic() { this = "argument to modeled function" }
|
||||
// override predicate appliesToEndpoint(DataFlow::Node n) {
|
||||
// exists(DataFlow::InvokeNode invk, DataFlow::Node known |
|
||||
// invk.getAnArgument() = n and
|
||||
// invk.getAnArgument() = known and
|
||||
// (
|
||||
// isKnownLibrarySink(known)
|
||||
// or
|
||||
// isKnownStepSrc(known)
|
||||
// or
|
||||
// exists(OtherModeledArgumentCharacteristic characteristic |
|
||||
// characteristic.appliesToEndpoint(known)
|
||||
// )
|
||||
// )
|
||||
// )
|
||||
// }
|
||||
// }
|
||||
// private class IsArgumentToSinklessLibraryCharacteristic extends StandardEndpointFilterCharacteristic {
|
||||
// IsArgumentToSinklessLibraryCharacteristic() { this = "argument to sinkless library" }
|
||||
// override predicate appliesToEndpoint(DataFlow::Node n) {
|
||||
// exists(DataFlow::InvokeNode invk, DataFlow::SourceNode commonSafeLibrary, string libraryName |
|
||||
// libraryName = ["slugify", "striptags", "marked"]
|
||||
// |
|
||||
// commonSafeLibrary = DataFlow::moduleImport(libraryName) and
|
||||
// invk = [commonSafeLibrary, commonSafeLibrary.getAPropertyRead()].getAnInvocation() and
|
||||
// n = invk.getAnArgument()
|
||||
// )
|
||||
// }
|
||||
// }
|
||||
// private class IsSanitizerCharacteristic extends StandardEndpointFilterCharacteristic {
|
||||
// IsSanitizerCharacteristic() { this = "sanitizer" }
|
||||
// override predicate appliesToEndpoint(DataFlow::Node n) {
|
||||
// exists(DataFlow::CallNode call | n = call.getAnArgument() |
|
||||
// call.getCalleeName().regexpMatch("(?i).*(escape|valid(ate)?|sanitize|purify).*")
|
||||
// )
|
||||
// }
|
||||
// }
|
||||
// private class IsPredicateCharacteristic extends StandardEndpointFilterCharacteristic {
|
||||
// IsPredicateCharacteristic() { this = "predicate" }
|
||||
// override predicate appliesToEndpoint(DataFlow::Node n) {
|
||||
// exists(DataFlow::CallNode call | n = call.getAnArgument() |
|
||||
// call.getCalleeName().regexpMatch("(equals|(|is|has|can)(_|[A-Z])).*")
|
||||
// )
|
||||
// }
|
||||
// }
|
||||
// private class IsHashCharacteristic extends StandardEndpointFilterCharacteristic {
|
||||
// IsHashCharacteristic() { this = "hash" }
|
||||
// override predicate appliesToEndpoint(DataFlow::Node n) {
|
||||
// exists(DataFlow::CallNode call | n = call.getAnArgument() |
|
||||
// call.getCalleeName().regexpMatch("(?i)^(sha\\d*|md5|hash)$")
|
||||
// )
|
||||
// }
|
||||
// }
|
||||
// private class IsNumericCharacteristic extends StandardEndpointFilterCharacteristic {
|
||||
// IsNumericCharacteristic() { this = "numeric" }
|
||||
// override predicate appliesToEndpoint(DataFlow::Node n) {
|
||||
// SyntacticHeuristics::isReadFrom(n, ".*index.*")
|
||||
// }
|
||||
// }
|
||||
// private class InIrrelevantFileCharacteristic extends StandardEndpointFilterCharacteristic {
|
||||
// private string category;
|
||||
// InIrrelevantFileCharacteristic() {
|
||||
// this = "in " + category + " file" and category = ["externs", "generated", "library", "test"]
|
||||
// }
|
||||
// override predicate appliesToEndpoint(DataFlow::Node n) {
|
||||
// // Ignore candidate sinks within externs, generated, library, and test code
|
||||
// ClassifyFiles::classify(n.getFile(), category)
|
||||
// }
|
||||
// }
|
||||
@@ -0,0 +1,139 @@
|
||||
/**
|
||||
* For internal use only.
|
||||
*
|
||||
* Extracts data about the database for use in adaptive threat modeling (ATM).
|
||||
*/
|
||||
|
||||
private import java
|
||||
private import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
|
||||
private import FeaturizationConfig
|
||||
|
||||
/**
|
||||
* Gets the value of the token-based feature named `featureName` for the endpoint `endpoint`.
|
||||
*
|
||||
* This is a single string containing a space-separated list of tokens.
|
||||
*/
|
||||
private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
|
||||
// Performance optimization: Restrict feature extraction to endpoints we've explicitly asked to featurize.
|
||||
endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
|
||||
exists(EndpointFeature f | f.getName() = featureName and result = f.getValue(endpoint)) and
|
||||
featureName = getASupportedFeatureName()
|
||||
}
|
||||
|
||||
/** Get a name of a supported generic token-based feature. */
|
||||
string getASupportedFeatureName() { result = any(EndpointFeature f).getName() }
|
||||
|
||||
/**
|
||||
* Generic token-based features for ATM.
|
||||
*
|
||||
* This predicate holds if the generic token-based feature named `featureName` has the value
|
||||
* `featureValue` for the endpoint `endpoint`.
|
||||
*/
|
||||
predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string featureValue) {
|
||||
// Performance optimization: Restrict feature extraction to endpoints we've explicitly asked to featurize.
|
||||
endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
|
||||
featureValue = getTokenFeature(endpoint, featureName)
|
||||
}
|
||||
|
||||
/**
|
||||
* See EndpointFeature
|
||||
*/
|
||||
private newtype TEndpointFeature =
|
||||
TEnclosingFunctionName() or
|
||||
TInputArgumentIndex() or
|
||||
TCalleeFlexibleAccessPath() or
|
||||
TEnclosingFunctionSignature() or
|
||||
TContextFunctionInterfaces()
|
||||
|
||||
/**
|
||||
* An implementation of an endpoint feature: defines feature-name/value tuples for use in ML.
|
||||
*/
|
||||
abstract class EndpointFeature extends TEndpointFeature {
|
||||
/**
|
||||
* Gets the name of the feature. Used by the ML model.
|
||||
* Names are coupled to models: changing the name of a feature requires retraining the model.
|
||||
*/
|
||||
abstract string getName();
|
||||
|
||||
/**
|
||||
* Gets the value of the feature. Used by the ML model.
|
||||
* Models are trained based on feature values, so changing the value of a feature requires retraining the model.
|
||||
*/
|
||||
abstract string getValue(DataFlow::Node endpoint);
|
||||
|
||||
string toString() { result = this.getName() }
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------------------------------------------------
|
||||
// Feature: EnclosingFunctionName
|
||||
//----------------------------------------------------------------------------------------------------------------------
|
||||
/**
|
||||
* The feature for the name of the function that encloses the endpoint.
|
||||
*/
|
||||
class EnclosingFunctionName extends EndpointFeature, TEnclosingFunctionName {
|
||||
override string getName() { result = "enclosingFunctionName" }
|
||||
|
||||
override string getValue(DataFlow::Node endpoint) {
|
||||
result = endpoint.getEnclosingCallable().getName()
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------------------------------------------------
|
||||
// Feature: InputArgumentIndex
|
||||
//----------------------------------------------------------------------------------------------------------------------
|
||||
class InputArgumentIndex extends EndpointFeature, TInputArgumentIndex {
|
||||
override string getName() { result = "InputArgumentIndex" }
|
||||
|
||||
override string getValue(DataFlow::Node endpoint) {
|
||||
exists(Argument arg | endpoint.asExpr() = arg and result = arg.getPosition().toString())
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------------------------------------------------
|
||||
// Feature: CalleeFlexibleAccessPath
|
||||
//----------------------------------------------------------------------------------------------------------------------
|
||||
class CalleeFlexibleAccessPath extends EndpointFeature, TCalleeFlexibleAccessPath {
|
||||
override string getName() { result = "CalleeFlexibleAccessPath" }
|
||||
|
||||
override string getValue(DataFlow::Node endpoint) {
|
||||
exists(Callable callee, Call call, string package, string type, string name |
|
||||
endpoint.asExpr() = call.getAnArgument() and
|
||||
callee = call.getCallee() and
|
||||
package = callee.getDeclaringType().getPackage().getName() and
|
||||
type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations?
|
||||
name = callee.getName() and
|
||||
result = package + "." + type + "." + name
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------------------------------------------------
|
||||
// Feature: EnclosingFunctionSignature
|
||||
//----------------------------------------------------------------------------------------------------------------------
|
||||
class EnclosingFunctionSignature extends EndpointFeature, TEnclosingFunctionSignature {
|
||||
override string getName() { result = "enclosingFunctionSignature" }
|
||||
|
||||
override string getValue(DataFlow::Node endpoint) {
|
||||
exists(Callable callee |
|
||||
callee = endpoint.getEnclosingCallable() and
|
||||
result = callee.paramsString()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------------------------------------------------
|
||||
// Feature: ContextFunctionInterfaces
|
||||
//----------------------------------------------------------------------------------------------------------------------
|
||||
class ContextFunctionInterfaces extends EndpointFeature, TContextFunctionInterfaces {
|
||||
override string getName() { result = "contextFunctionInterfaces" }
|
||||
|
||||
override string getValue(DataFlow::Node endpoint) {
|
||||
result =
|
||||
concat(Method method, string line |
|
||||
method.getLocation().getFile() = endpoint.getLocation().getFile() and
|
||||
line = method.getStringSignature()
|
||||
|
|
||||
line, "\n" order by line
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,154 @@
|
||||
/**
|
||||
* For internal use only.
|
||||
*
|
||||
* Provides an implementation of scoring alerts for use in adaptive threat modeling (ATM).
|
||||
*/
|
||||
|
||||
private import java
|
||||
private import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
|
||||
private import BaseScoring
|
||||
private import EndpointFeatures as EndpointFeatures
|
||||
private import FeaturizationConfig
|
||||
private import EndpointTypes
|
||||
|
||||
private string getACompatibleModelChecksum() {
|
||||
availableMlModels(result, "java", _, "atm-endpoint-scoring")
|
||||
}
|
||||
|
||||
module ModelScoring {
|
||||
/**
|
||||
* A featurization config that only featurizes new candidate endpoints that are part of a flow
|
||||
* path.
|
||||
*/
|
||||
class RelevantFeaturizationConfig extends FeaturizationConfig {
|
||||
RelevantFeaturizationConfig() { this = "RelevantFeaturization" }
|
||||
|
||||
override DataFlow::Node getAnEndpointToFeaturize() {
|
||||
getCfg().isEffectiveSource(result) and any(DataFlow::Configuration cfg).hasFlow(result, _)
|
||||
or
|
||||
getCfg().isEffectiveSink(result) and any(DataFlow::Configuration cfg).hasFlow(_, result)
|
||||
}
|
||||
}
|
||||
|
||||
DataFlow::Node getARequestedEndpoint() {
|
||||
result = any(FeaturizationConfig cfg).getAnEndpointToFeaturize()
|
||||
}
|
||||
|
||||
private int getARequestedEndpointType() { result = any(EndpointType type).getEncoding() }
|
||||
|
||||
predicate endpointScores(DataFlow::Node endpoint, int encodedEndpointType, float score) =
|
||||
scoreEndpoints(getARequestedEndpoint/0, EndpointFeatures::tokenFeatures/3,
|
||||
EndpointFeatures::getASupportedFeatureName/0, getARequestedEndpointType/0,
|
||||
getACompatibleModelChecksum/0)(endpoint, encodedEndpointType, score)
|
||||
}
|
||||
|
||||
/**
|
||||
* Return ATM's confidence that `source` is a source for the given security query. This will be a
|
||||
* number between 0.0 and 1.0.
|
||||
*/
|
||||
private float getScoreForSource(DataFlow::Node source) {
|
||||
if getCfg().isKnownSource(source)
|
||||
then result = 1.0
|
||||
else (
|
||||
// This restriction on `source` has no semantic effect but improves performance.
|
||||
getCfg().isEffectiveSource(source) and
|
||||
ModelScoring::endpointScores(source, getCfg().getASourceEndpointType().getEncoding(), result)
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Return ATM's confidence that `sink` is a sink for the given security query. This will be a
|
||||
* number between 0.0 and 1.0.
|
||||
*/
|
||||
private float getScoreForSink(DataFlow::Node sink) {
|
||||
if getCfg().isKnownSink(sink)
|
||||
then result = 1.0
|
||||
else (
|
||||
// This restriction on `sink` has no semantic effect but improves performance.
|
||||
getCfg().isEffectiveSink(sink) and
|
||||
ModelScoring::endpointScores(sink, getCfg().getASinkEndpointType().getEncoding(), result)
|
||||
)
|
||||
}
|
||||
|
||||
class EndpointScoringResults extends ScoringResults {
|
||||
EndpointScoringResults() {
|
||||
this = "EndpointScoringResults" and exists(getACompatibleModelChecksum())
|
||||
}
|
||||
|
||||
/**
|
||||
* Get ATM's confidence that a path between `source` and `sink` represents a security
|
||||
* vulnerability. This will be a number between 0.0 and 1.0.
|
||||
*/
|
||||
override float getScoreForFlow(DataFlow::Node source, DataFlow::Node sink) {
|
||||
result = getScoreForSource(source) * getScoreForSink(sink)
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a string representing why ATM included the given source in the dataflow analysis.
|
||||
*
|
||||
* In general, there may be multiple reasons why ATM included the given source, in which case
|
||||
* this predicate should have multiple results.
|
||||
*/
|
||||
pragma[inline]
|
||||
override string getASourceOrigin(DataFlow::Node source) {
|
||||
result = "known" and getCfg().isKnownSource(source)
|
||||
or
|
||||
result = "predicted" and getCfg().isEffectiveSource(source)
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a string representing why ATM included the given sink in the dataflow analysis.
|
||||
*
|
||||
* In general, there may be multiple reasons why ATM included the given sink, in which case
|
||||
* this predicate should have multiple results.
|
||||
*/
|
||||
pragma[inline]
|
||||
override string getASinkOrigin(DataFlow::Node sink) {
|
||||
result = "known" and getCfg().isKnownSink(sink)
|
||||
or
|
||||
not getCfg().isKnownSink(sink) and
|
||||
result =
|
||||
"predicted (scores: " +
|
||||
concat(EndpointType type, float score |
|
||||
ModelScoring::endpointScores(sink, type.getEncoding(), score)
|
||||
|
|
||||
type.getDescription() + "=" + score.toString(), ", " order by type.getEncoding()
|
||||
) + ")" and
|
||||
getCfg().isEffectiveSink(sink)
|
||||
}
|
||||
|
||||
pragma[inline]
|
||||
override predicate shouldResultBeIncluded(DataFlow::Node source, DataFlow::Node sink) {
|
||||
exists(source) and
|
||||
if getCfg().isKnownSink(sink)
|
||||
then any()
|
||||
else (
|
||||
// This restriction on `sink` has no semantic effect but improves performance.
|
||||
getCfg().isEffectiveSink(sink) and
|
||||
exists(float sinkScore |
|
||||
ModelScoring::endpointScores(sink, getCfg().getASinkEndpointType().getEncoding(), sinkScore) and
|
||||
// Include the endpoint if (a) the query endpoint type scores higher than all other
|
||||
// endpoint types, or (b) the query endpoint type scores at least
|
||||
// 0.5 - (getCfg().getScoreCutoff() / 2).
|
||||
sinkScore >=
|
||||
[
|
||||
max(float s | ModelScoring::endpointScores(sink, _, s)),
|
||||
0.5 - getCfg().getScoreCutoff() / 2
|
||||
]
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
module Debugging {
|
||||
query predicate hopInputEndpoints(DataFlow::Node endpoint) {
|
||||
endpoint = ModelScoring::getARequestedEndpoint()
|
||||
}
|
||||
|
||||
query predicate endpointScores = ModelScoring::endpointScores/3;
|
||||
|
||||
query predicate shouldResultBeIncluded(DataFlow::Node source, DataFlow::Node sink) {
|
||||
any(ScoringResults scoringResults).shouldResultBeIncluded(source, sink) and
|
||||
any(DataFlow::Configuration cfg).hasFlow(source, sink)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,71 @@
|
||||
/**
|
||||
* For internal use only.
|
||||
*
|
||||
* Defines the set of classes that endpoint scoring models can predict. Endpoint scoring models must
|
||||
* only predict classes defined within this file. This file is the source of truth for the integer
|
||||
* representation of each of these classes.
|
||||
*/
|
||||
newtype TEndpointType =
|
||||
TNegativeType() or
|
||||
TXssSinkType() or
|
||||
TNosqlInjectionSinkType() or
|
||||
TSqlTaintedSinkType() or
|
||||
TTaintedPathSinkType() or
|
||||
TRequestForgerySinkType()
|
||||
|
||||
/** A class that can be predicted by endpoint scoring models. */
|
||||
abstract class EndpointType extends TEndpointType {
|
||||
abstract string getDescription();
|
||||
|
||||
/**
|
||||
* Gets the integer representation of this endpoint type. This integer representation specifies the class number
|
||||
* used by the endpoint scoring model (the classifier) to represent this endpoint type. Class 0 is the negative
|
||||
* class (non-sink). Each positive int corresponds to a single sink type.
|
||||
*/
|
||||
abstract int getEncoding();
|
||||
|
||||
/**
|
||||
* Gets the name of the sink/source kind for this endpoint type as used in Models as Data.
|
||||
*
|
||||
* See https://github.com/github/codeql/blob/44213f0144fdd54bb679ca48d68b28dcf820f7a8/java/ql/lib/semmle/code/java/dataflow/ExternalFlow.qll#LL353C11-L357C31
|
||||
*/
|
||||
abstract string getKind();
|
||||
|
||||
string toString() { result = getDescription() }
|
||||
}
|
||||
|
||||
/** The `Negative` class that can be predicted by endpoint scoring models. */
|
||||
class NegativeType extends EndpointType, TNegativeType {
|
||||
override string getDescription() { result = "Negative" }
|
||||
|
||||
override int getEncoding() { result = 0 }
|
||||
|
||||
override string getKind() { result = "" }
|
||||
}
|
||||
|
||||
/** The `SqlTaintedSink` class that can be predicted by endpoint scoring models. */
|
||||
class SqlTaintedSinkType extends EndpointType, TSqlTaintedSinkType {
|
||||
override string getDescription() { result = "SqlTaintedSink" }
|
||||
|
||||
override int getEncoding() { result = 1 }
|
||||
|
||||
override string getKind() { result = "sql" }
|
||||
}
|
||||
|
||||
/** The `TaintedPathSink` class that can be predicted by endpoint scoring models. */
|
||||
class TaintedPathSinkType extends EndpointType, TTaintedPathSinkType {
|
||||
override string getDescription() { result = "TaintedPathSink" }
|
||||
|
||||
override int getEncoding() { result = 2 }
|
||||
|
||||
override string getKind() { result = "create-file" }
|
||||
}
|
||||
|
||||
/** The `RequestForgerySinkType` class that can be predicted by endpoint scoring models. */
|
||||
class RequestForgerySinkType extends EndpointType, TRequestForgerySinkType {
|
||||
override string getDescription() { result = "RequestForgerySink" }
|
||||
|
||||
override int getEncoding() { result = 3 }
|
||||
|
||||
override string getKind() { result = "open-url" } // TODO: is this correct, or should it be “jdbc-url”?
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
import java
|
||||
private import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
|
||||
|
||||
/**
|
||||
* A configuration that defines which endpoints should be featurized.
|
||||
*
|
||||
* This is used as a performance optimization to ensure that we only featurize the endpoints we need
|
||||
* to featurize.
|
||||
*/
|
||||
abstract class FeaturizationConfig extends string {
|
||||
bindingset[this]
|
||||
FeaturizationConfig() { any() }
|
||||
|
||||
abstract DataFlow::Node getAnEndpointToFeaturize();
|
||||
}
|
||||
@@ -0,0 +1,102 @@
|
||||
/**
|
||||
* FunctionBodyFeatures.qll
|
||||
*
|
||||
* Contains logic relating to the `enclosingFunctionBody` and `enclosingFunctionName` features.
|
||||
*/
|
||||
|
||||
import java
|
||||
private import FeaturizationConfig
|
||||
private import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
|
||||
|
||||
/**
|
||||
* Gets a tokenized representation of the AST node for use in the `enclosingFunctionBody` feature.
|
||||
*/
|
||||
string getTokenizedAstNode(Top top) {
|
||||
result = top.(Variable).getName()
|
||||
or
|
||||
result = top.(Field).getName()
|
||||
or
|
||||
result = top.(Literal).getValue()
|
||||
}
|
||||
|
||||
/** Gets an AST node within the function `f` that we should featurize. */
|
||||
pragma[inline]
|
||||
Element getAnAstNodeToFeaturize(Callable c) {
|
||||
result.(Stmt).getEnclosingCallable() = c or
|
||||
result.(Expr).getEnclosingCallable() = c
|
||||
}
|
||||
|
||||
/** DEPRECATED: Alias for getAnAstNodeToFeaturize */
|
||||
deprecated Top getAnASTNodeToFeaturize(Callable c) { result = getAnAstNodeToFeaturize(c) }
|
||||
|
||||
/**
|
||||
* Get the enclosing function for an endpoint.
|
||||
*
|
||||
* This is used to compute the `enclosingFunctionBody` and `enclosingFunctionName` features.
|
||||
*/
|
||||
Callable getRepresentativeFunctionForEndpoint(DataFlow::Node endpoint) {
|
||||
// Performance optimization: Restrict the set of endpoints to the endpoints to featurize.
|
||||
endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
|
||||
result = endpoint.getEnclosingCallable()
|
||||
}
|
||||
|
||||
/** Returns an AST node within the function `f` that an associated token feature. */
|
||||
Element getAnAstNodeWithAFeature(Callable c) {
|
||||
// Performance optimization: Restrict the set of functions to those containing an endpoint to featurize.
|
||||
c = getRepresentativeFunctionForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and
|
||||
result = getAnAstNodeToFeaturize(c)
|
||||
}
|
||||
|
||||
/** DEPRECATED: Alias for getAnAstNodeWithAFeature */
|
||||
deprecated Element getAnASTNodeWithAFeature(Callable c) { result = getAnAstNodeWithAFeature(c) }
|
||||
|
||||
/** Returns the number of source-code characters in a function. */
|
||||
int getNumCharsInFunction(Callable c) {
|
||||
result =
|
||||
strictsum(Element element |
|
||||
element = getAnAstNodeWithAFeature(c)
|
||||
|
|
||||
getTokenizedAstNode(element).length()
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the maximum number of characters a feature can be.
|
||||
* The evaluator string limit is 5395415 characters. We choose a limit lower than this.
|
||||
*/
|
||||
private int getMaxChars() { result = 1000000 }
|
||||
|
||||
/**
|
||||
* Returns a featurized representation of the function that can be used to populate the
|
||||
* `enclosingFunctionBody` feature for an endpoint.
|
||||
*/
|
||||
string getBodyTokensFeature(Callable c) {
|
||||
// Performance optimization: If a function has more than 256 body subtokens, then featurize it as
|
||||
// absent. This approximates the behavior of the classifier on non-generic body features where
|
||||
// large body features are replaced by the absent token.
|
||||
//
|
||||
// We count nodes instead of tokens because tokens are often not unique.
|
||||
strictcount(Element element |
|
||||
element = getAnAstNodeToFeaturize(c) and
|
||||
exists(getTokenizedAstNode(element))
|
||||
) <= 256 and
|
||||
// Performance optimization: If a function has more than getMaxChars() characters in its body subtokens,
|
||||
// then featurize it as absent.
|
||||
getNumCharsInFunction(c) <= getMaxChars() and
|
||||
result =
|
||||
strictconcat(Location l, string token |
|
||||
// The use of a nested exists here allows us to avoid duplicates due to two AST nodes in the
|
||||
// same location featurizing to the same token. By using a nested exists, we take only unique
|
||||
// (location, token) pairs.
|
||||
exists(Element element |
|
||||
element = getAnAstNodeToFeaturize(c) and
|
||||
token = getTokenizedAstNode(element) and
|
||||
l = element.getLocation()
|
||||
)
|
||||
|
|
||||
token, " "
|
||||
order by
|
||||
l.getFile().getAbsolutePath(), l.getStartLine(), l.getStartColumn(), l.getEndLine(),
|
||||
l.getEndColumn(), token
|
||||
)
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
/**
|
||||
* For internal use only.
|
||||
*
|
||||
* A taint-tracking configuration for reasoning about SSRF (server side request forgery) vulnerabilities.
|
||||
* Largely copied from java/ql/lib/semmle/code/java/security/RequestForgeryConfig.qll.
|
||||
*
|
||||
* Only import this directly from .ql files, to avoid the possibility of polluting the Configuration hierarchy
|
||||
* accidentally.
|
||||
*/
|
||||
|
||||
import ATMConfig
|
||||
import semmle.code.java.dataflow.FlowSources
|
||||
import semmle.code.java.security.RequestForgery
|
||||
|
||||
class RequestForgeryAtmConfig extends AtmConfig {
|
||||
RequestForgeryAtmConfig() { this = "RequestForgeryAtmConfig" }
|
||||
|
||||
override predicate isKnownSource(DataFlow::Node source) {
|
||||
source instanceof RemoteFlowSource and
|
||||
// Exclude results of remote HTTP requests: fetching something else based on that result
|
||||
// is no worse than following a redirect returned by the remote server, and typically
|
||||
// we're requesting a resource via https which we trust to only send us to safe URLs.
|
||||
not source.asExpr().(MethodAccess).getCallee() instanceof UrlConnectionGetInputStreamMethod
|
||||
}
|
||||
|
||||
override EndpointType getASinkEndpointType() { result instanceof RequestForgerySinkType }
|
||||
|
||||
/*
|
||||
* This is largely a copy of the taint tracking configuration for the standard SSRF
|
||||
* query, except additional sinks have been added using the sink endpoint filter.
|
||||
*/
|
||||
|
||||
override predicate isAdditionalTaintStep(DataFlow::Node pred, DataFlow::Node succ) {
|
||||
any(RequestForgeryAdditionalTaintStep r).propagatesTaint(pred, succ)
|
||||
}
|
||||
|
||||
override predicate isSanitizer(DataFlow::Node node) { node instanceof RequestForgerySanitizer }
|
||||
}
|
||||
@@ -0,0 +1,34 @@
|
||||
/**
|
||||
* For internal use only.
|
||||
*
|
||||
* A taint-tracking configuration for reasoning about SQL injection vulnerabilities.
|
||||
* Defines shared code used by the SQL injection boosted query.
|
||||
* Largely copied from semmle.code.java.security.SqlInjectionQuery.
|
||||
*/
|
||||
|
||||
import ATMConfig
|
||||
import semmle.code.java.dataflow.FlowSources
|
||||
import semmle.code.java.security.QueryInjection
|
||||
|
||||
class SqlTaintedAtmConfig extends AtmConfig {
|
||||
SqlTaintedAtmConfig() { this = "SqlTaintedAtmConfig" }
|
||||
|
||||
override predicate isKnownSource(DataFlow::Node source) { source instanceof RemoteFlowSource }
|
||||
|
||||
override EndpointType getASinkEndpointType() { result instanceof SqlTaintedSinkType }
|
||||
|
||||
/*
|
||||
* This is largely a copy of the taint tracking configuration for the standard SQL injection
|
||||
* query, except additional sinks have been added using the sink endpoint filter.
|
||||
*/
|
||||
|
||||
override predicate isSanitizer(DataFlow::Node node) {
|
||||
node.getType() instanceof PrimitiveType or
|
||||
node.getType() instanceof BoxedType or
|
||||
node.getType() instanceof NumberType
|
||||
}
|
||||
|
||||
override predicate isAdditionalTaintStep(DataFlow::Node node1, DataFlow::Node node2) {
|
||||
any(AdditionalQueryInjectionTaintStep s).step(node1, node2)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,82 @@
|
||||
/**
|
||||
* For internal use only.
|
||||
*
|
||||
* A taint-tracking configuration for reasoning about path injection vulnerabilities.
|
||||
* Defines shared code used by the path injection boosted query.
|
||||
* Largely copied from java/ql/src/Security/CWE/CWE-022/TaintedPath.ql.
|
||||
*/
|
||||
|
||||
import java
|
||||
import semmle.code.java.security.PathSanitizer
|
||||
import ATMConfig
|
||||
import semmle.code.java.dataflow.FlowSources
|
||||
|
||||
class TaintedPathAtmConfig extends AtmConfig {
|
||||
TaintedPathAtmConfig() { this = "TaintedPathAtmConfig" }
|
||||
|
||||
override predicate isKnownSource(DataFlow::Node source) { source instanceof RemoteFlowSource }
|
||||
|
||||
override EndpointType getASinkEndpointType() { result instanceof TaintedPathSinkType }
|
||||
|
||||
/*
|
||||
* This is largely a copy of the taint tracking configuration for the standard path injection
|
||||
* query, except additional ATM sinks have been added to the `isSink` predicate.
|
||||
*/
|
||||
|
||||
override predicate isSanitizer(DataFlow::Node sanitizer) {
|
||||
sanitizer.getType() instanceof BoxedType or
|
||||
sanitizer.getType() instanceof PrimitiveType or
|
||||
sanitizer.getType() instanceof NumberType or
|
||||
sanitizer instanceof PathInjectionSanitizer
|
||||
}
|
||||
|
||||
override predicate isAdditionalTaintStep(DataFlow::Node n1, DataFlow::Node n2) {
|
||||
any(TaintedPathAdditionalTaintStep s).step(n1, n2)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Models a very basic guard for the tainted path queries.
|
||||
* TODO: Copied from java/ql/src/Security/CWE/CWE-022/TaintedPathCommon.qll because I couldn't figure out how to import it.
|
||||
*/
|
||||
|
||||
/**
|
||||
* A unit class for adding additional taint steps.
|
||||
*
|
||||
* Extend this class to add additional taint steps that should apply to tainted path flow configurations.
|
||||
*/
|
||||
class TaintedPathAdditionalTaintStep extends Unit {
|
||||
abstract predicate step(DataFlow::Node n1, DataFlow::Node n2);
|
||||
}
|
||||
|
||||
private class DefaultTaintedPathAdditionalTaintStep extends TaintedPathAdditionalTaintStep {
|
||||
override predicate step(DataFlow::Node n1, DataFlow::Node n2) {
|
||||
exists(Argument a |
|
||||
a = n1.asExpr() and
|
||||
a.getCall() = n2.asExpr() and
|
||||
a = any(TaintPreservingUriCtorParam tpp).getAnArgument()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
private class TaintPreservingUriCtorParam extends Parameter {
|
||||
TaintPreservingUriCtorParam() {
|
||||
exists(Constructor ctor, int idx, int nParams |
|
||||
ctor.getDeclaringType() instanceof TypeUri and
|
||||
this = ctor.getParameter(idx) and
|
||||
nParams = ctor.getNumberOfParameters()
|
||||
|
|
||||
// URI(String scheme, String ssp, String fragment)
|
||||
idx = 1 and nParams = 3
|
||||
or
|
||||
// URI(String scheme, String host, String path, String fragment)
|
||||
idx = [1, 2] and nParams = 4
|
||||
or
|
||||
// URI(String scheme, String authority, String path, String query, String fragment)
|
||||
idx = 2 and nParams = 5
|
||||
or
|
||||
// URI(String scheme, String userInfo, String host, int port, String path, String query, String fragment)
|
||||
idx = 4 and nParams = 7
|
||||
)
|
||||
}
|
||||
}
|
||||
10
java/ql/experimental/adaptivethreatmodeling/lib/qlpack.yml
Normal file
10
java/ql/experimental/adaptivethreatmodeling/lib/qlpack.yml
Normal file
@@ -0,0 +1,10 @@
|
||||
name: codeql/java-experimental-atm-lib
|
||||
description: CodeQL libraries for the experimental ML-powered queries
|
||||
version: 0.4.5
|
||||
extractor: java
|
||||
library: true
|
||||
groups:
|
||||
- java
|
||||
- experimental
|
||||
dependencies:
|
||||
codeql/java-all: ${workspace}
|
||||
3
java/ql/experimental/adaptivethreatmodeling/model/.gitignore
vendored
Normal file
3
java/ql/experimental/adaptivethreatmodeling/model/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
# Avoid checking in ML models
|
||||
# This matches the mlModels property of qlpack.yml.
|
||||
resources/*.codeqlmodel
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
dependencies: {}
|
||||
compiled: false
|
||||
lockVersion: 1.0.0
|
||||
@@ -0,0 +1,8 @@
|
||||
name: codeql/java-experimental-atm-model
|
||||
description: Machine learning model supporting the experimental ML-powered queries
|
||||
version: 0.0.1
|
||||
groups:
|
||||
- java
|
||||
- experimental
|
||||
mlModels:
|
||||
- "resources/*.codeqlmodel"
|
||||
@@ -0,0 +1,67 @@
|
||||
/**
|
||||
* @name Debug result inclusion
|
||||
* @description Use this query to understand why some alerts are included or excluded from the
|
||||
* results of boosted queries. The results for this query are the union of the alerts
|
||||
* generated by each boosted query. Each alert includes an explanation why it was
|
||||
* included or excluded for each of the four security queries.
|
||||
* @kind problem
|
||||
* @problem.severity error
|
||||
* @id adaptive-threat-modeling/java/debug-result-inclusion
|
||||
*/
|
||||
|
||||
import java
|
||||
import experimental.adaptivethreatmodeling.ATMConfig
|
||||
import extraction.ExtractEndpointDataTraining
|
||||
private import experimental.adaptivethreatmodeling.SqlTaintedATM as SqlTaintedAtm
|
||||
private import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
|
||||
private import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestForgeryAtm
|
||||
|
||||
string getAReasonSinkExcluded(DataFlow::Node sinkCandidate, Query query) {
|
||||
query instanceof SqlTaintedQuery and
|
||||
result = any(SqlTaintedAtm::SqlTaintedAtmConfig cfg).getAReasonSinkExcluded(sinkCandidate)
|
||||
or
|
||||
query instanceof TaintedPathQuery and
|
||||
result = any(TaintedPathAtm::TaintedPathAtmConfig cfg).getAReasonSinkExcluded(sinkCandidate)
|
||||
or
|
||||
query instanceof RequestForgeryQuery and
|
||||
result = any(RequestForgeryAtm::RequestForgeryAtmConfig cfg).getAReasonSinkExcluded(sinkCandidate)
|
||||
}
|
||||
|
||||
pragma[inline]
|
||||
string getDescriptionForAlertCandidate(
|
||||
DataFlow::Node sourceCandidate, DataFlow::Node sinkCandidate, Query query
|
||||
) {
|
||||
result = "excluded[reason=" + getAReasonSinkExcluded(sinkCandidate, query) + "]"
|
||||
or
|
||||
getDataFlowCfg(query).(AtmConfig).isKnownSink(sinkCandidate) and
|
||||
result = "excluded[reason=known-sink]"
|
||||
or
|
||||
not exists(getAReasonSinkExcluded(sinkCandidate, query)) and
|
||||
not getDataFlowCfg(query).hasFlow(sourceCandidate, sinkCandidate) and
|
||||
(
|
||||
if
|
||||
getDataFlowCfg(query).isSource(sourceCandidate) or
|
||||
getDataFlowCfg(query).isSource(sourceCandidate, _)
|
||||
then result = "no flow"
|
||||
else result = "not a known source"
|
||||
)
|
||||
or
|
||||
getDataFlowCfg(query).hasFlow(sourceCandidate, sinkCandidate) and
|
||||
result = "included"
|
||||
}
|
||||
|
||||
pragma[inline]
|
||||
string getDescriptionForAlert(DataFlow::Node sourceCandidate, DataFlow::Node sinkCandidate) {
|
||||
result =
|
||||
concat(Query query |
|
||||
|
|
||||
query.getName() + ": " +
|
||||
getDescriptionForAlertCandidate(sourceCandidate, sinkCandidate, query), ", "
|
||||
)
|
||||
}
|
||||
|
||||
from DataFlow::Configuration cfg, DataFlow::Node source, DataFlow::Node sink
|
||||
where cfg.hasFlow(source, sink)
|
||||
select sink,
|
||||
"This is an ATM result that may depend on $@ [" + getDescriptionForAlert(source, sink) + "]",
|
||||
source, "a user-provided value"
|
||||
@@ -0,0 +1,6 @@
|
||||
---
|
||||
dependencies:
|
||||
codeql/java-experimental-atm-model:
|
||||
version: 0.0.1
|
||||
compiled: false
|
||||
lockVersion: 1.0.0
|
||||
@@ -0,0 +1,21 @@
|
||||
/**
|
||||
* For internal use only.
|
||||
*
|
||||
*
|
||||
* Count the number of sinks and alerts for a particular dataflow config.
|
||||
*/
|
||||
|
||||
import java
|
||||
import evaluation.EndToEndEvaluation
|
||||
private import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
|
||||
|
||||
query predicate countAlertsAndSinks(int numAlerts, int numSinks) {
|
||||
numAlerts =
|
||||
count(DataFlow::Configuration cfg, DataFlow::Node source, DataFlow::Node sink |
|
||||
cfg.hasFlow(source, sink) and not isFlowExcluded(source, sink)
|
||||
) and
|
||||
numSinks =
|
||||
count(DataFlow::Node sink |
|
||||
exists(DataFlow::Configuration cfg | cfg.isSink(sink) or cfg.isSink(sink, _))
|
||||
)
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
/*
|
||||
* For internal use only.
|
||||
*
|
||||
*
|
||||
* Count the number of sinks and alerts for the `RequestForgery` security query.
|
||||
*/
|
||||
|
||||
import semmle.code.java.security.RequestForgery
|
||||
import CountAlertsAndSinks
|
||||
@@ -0,0 +1,9 @@
|
||||
/*
|
||||
* For internal use only.
|
||||
*
|
||||
*
|
||||
* Count the number of sinks and alerts for the `SqlTainted` security query.
|
||||
*/
|
||||
|
||||
import semmle.code.java.security.SqlInjectionQuery
|
||||
import CountAlertsAndSinks
|
||||
@@ -0,0 +1,74 @@
|
||||
/*
|
||||
* For internal use only.
|
||||
*
|
||||
*
|
||||
* Count the number of sinks and alerts for the `TaintedPath` security query.
|
||||
*/
|
||||
|
||||
//TODO no libraries for TaintedPath so we copy paste the config used in the TaintedPath.ql query.
|
||||
import java
|
||||
import DataFlow::PathGraph
|
||||
private import semmle.code.java.dataflow.ExternalFlow
|
||||
import semmle.code.java.security.PathCreation
|
||||
import semmle.code.java.security.PathSanitizer
|
||||
import semmle.code.java.dataflow.FlowSources
|
||||
import CountAlertsAndSinks
|
||||
|
||||
class TaintedPathConfig extends TaintTracking::Configuration {
|
||||
TaintedPathConfig() { this = "TaintedPathConfig" }
|
||||
|
||||
override predicate isSource(DataFlow::Node source) { source instanceof RemoteFlowSource }
|
||||
|
||||
override predicate isSink(DataFlow::Node sink) {
|
||||
sink.asExpr() = any(PathCreation p).getAnInput()
|
||||
or
|
||||
sinkNode(sink, "create-file")
|
||||
}
|
||||
|
||||
override predicate isSanitizer(DataFlow::Node sanitizer) {
|
||||
sanitizer.getType() instanceof BoxedType or
|
||||
sanitizer.getType() instanceof PrimitiveType or
|
||||
sanitizer.getType() instanceof NumberType or
|
||||
sanitizer instanceof PathInjectionSanitizer
|
||||
}
|
||||
|
||||
override predicate isAdditionalTaintStep(DataFlow::Node n1, DataFlow::Node n2) {
|
||||
any(TaintedPathAdditionalTaintStep s).step(n1, n2)
|
||||
}
|
||||
}
|
||||
|
||||
class TaintedPathAdditionalTaintStep extends Unit {
|
||||
abstract predicate step(DataFlow::Node n1, DataFlow::Node n2);
|
||||
}
|
||||
|
||||
private class DefaultTaintedPathAdditionalTaintStep extends TaintedPathAdditionalTaintStep {
|
||||
override predicate step(DataFlow::Node n1, DataFlow::Node n2) {
|
||||
exists(Argument a |
|
||||
a = n1.asExpr() and
|
||||
a.getCall() = n2.asExpr() and
|
||||
a = any(TaintPreservingUriCtorParam tpp).getAnArgument()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
private class TaintPreservingUriCtorParam extends Parameter {
|
||||
TaintPreservingUriCtorParam() {
|
||||
exists(Constructor ctor, int idx, int nParams |
|
||||
ctor.getDeclaringType() instanceof TypeUri and
|
||||
this = ctor.getParameter(idx) and
|
||||
nParams = ctor.getNumberOfParameters()
|
||||
|
|
||||
// URI(String scheme, String ssp, String fragment)
|
||||
idx = 1 and nParams = 3
|
||||
or
|
||||
// URI(String scheme, String host, String path, String fragment)
|
||||
idx = [1, 2] and nParams = 4
|
||||
or
|
||||
// URI(String scheme, String authority, String path, String query, String fragment)
|
||||
idx = 2 and nParams = 5
|
||||
or
|
||||
// URI(String scheme, String userInfo, String host, int port, String path, String query, String fragment)
|
||||
idx = 4 and nParams = 7
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
private import java
|
||||
private import extraction.Exclusions as Exclusions
|
||||
private import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
|
||||
|
||||
/**
|
||||
* Holds if the flow from `source` to `sink` should be excluded from the results of an end-to-end
|
||||
* evaluation query.
|
||||
*/
|
||||
pragma[inline]
|
||||
predicate isFlowExcluded(DataFlow::Node source, DataFlow::Node sink) {
|
||||
Exclusions::isFileExcluded([source.getLocation().getFile(), sink.getLocation().getFile()])
|
||||
}
|
||||
@@ -0,0 +1,29 @@
|
||||
/**
|
||||
* EndpointScoresIntegrationTest.ql
|
||||
*
|
||||
* Extract scores for each test endpoint that is an argument to a function call in the database.
|
||||
* This is used by integration tests to verify that QL and the modeling codebase agree on the scores
|
||||
* of a set of test endpoints.
|
||||
*/
|
||||
|
||||
import java
|
||||
import experimental.adaptivethreatmodeling.ATMConfig
|
||||
import experimental.adaptivethreatmodeling.FeaturizationConfig
|
||||
import experimental.adaptivethreatmodeling.EndpointScoring::ModelScoring as ModelScoring
|
||||
private import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
|
||||
private import semmle.code.java.dataflow.internal.DataFlowPrivate as DataFlowPrivate
|
||||
|
||||
/**
|
||||
* A featurization config that featurizes endpoints that are arguments to function calls.
|
||||
*
|
||||
* This should only be used in extraction queries and tests.
|
||||
*/
|
||||
class FunctionArgumentFeaturizationConfig extends FeaturizationConfig {
|
||||
FunctionArgumentFeaturizationConfig() { this = "FunctionArgumentFeaturization" }
|
||||
|
||||
override DataFlow::Node getAnEndpointToFeaturize() {
|
||||
exists(Call call | result.asExpr() = call.getAnArgument())
|
||||
}
|
||||
}
|
||||
|
||||
query predicate endpointScores = ModelScoring::endpointScores/3;
|
||||
@@ -0,0 +1,16 @@
|
||||
/**
|
||||
* ModelCheck.ql
|
||||
*
|
||||
* Returns checksums of ATM models.
|
||||
*/
|
||||
|
||||
/**
|
||||
* The `availableMlModels` template predicate.
|
||||
*
|
||||
* This is populated by the evaluator with metadata for the available machine learning models.
|
||||
*/
|
||||
external predicate availableMlModels(
|
||||
string modelChecksum, string modelLanguage, string modelName, string modelType
|
||||
);
|
||||
|
||||
select any(string checksum | availableMlModels(checksum, "java", _, _))
|
||||
@@ -0,0 +1,67 @@
|
||||
/*
|
||||
* For internal use only.
|
||||
*
|
||||
* Counts sources and sinks for Java security queries.
|
||||
*/
|
||||
|
||||
import java
|
||||
import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
|
||||
import semmle.code.java.dataflow.TaintTracking::TaintTracking as TaintTracking
|
||||
// java/ql/lib/semmle/code/java/security$ ls *Query.qll | sed -e 's/\(.*\)Query.qll/import semmle.code.java.security.\1Query as \1/'
|
||||
import semmle.code.java.security.AndroidIntentRedirectionQuery as AndroidIntentRedirection
|
||||
import semmle.code.java.security.AndroidSensitiveCommunicationQuery as AndroidSensitiveCommunication
|
||||
import semmle.code.java.security.AndroidWebViewCertificateValidationQuery as AndroidWebViewCertificateValidation
|
||||
import semmle.code.java.security.CleartextStorageAndroidDatabaseQuery as CleartextStorageAndroidDatabase
|
||||
import semmle.code.java.security.CleartextStorageAndroidFilesystemQuery as CleartextStorageAndroidFilesystem
|
||||
import semmle.code.java.security.CleartextStorageClassQuery as CleartextStorageClass
|
||||
import semmle.code.java.security.CleartextStorageCookieQuery as CleartextStorageCookie
|
||||
import semmle.code.java.security.CleartextStoragePropertiesQuery as CleartextStorageProperties
|
||||
import semmle.code.java.security.CleartextStorageQuery as CleartextStorage
|
||||
import semmle.code.java.security.CleartextStorageSharedPrefsQuery as CleartextStorageSharedPrefs
|
||||
import semmle.code.java.security.CommandLineQuery as CommandLine
|
||||
import semmle.code.java.security.ConditionalBypassQuery as ConditionalBypass
|
||||
import semmle.code.java.security.FragmentInjectionQuery as FragmentInjection
|
||||
import semmle.code.java.security.GroovyInjectionQuery as GroovyInjection
|
||||
import semmle.code.java.security.HardcodedCredentialsApiCallQuery as HardcodedCredentialsApiCall
|
||||
import semmle.code.java.security.HardcodedCredentialsSourceCallQuery as HardcodedCredentialsSourceCall
|
||||
import semmle.code.java.security.HttpsUrlsQuery as HttpsUrls
|
||||
import semmle.code.java.security.ImplicitPendingIntentsQuery as ImplicitPendingIntents
|
||||
import semmle.code.java.security.ImproperIntentVerificationQuery as ImproperIntentVerification
|
||||
import semmle.code.java.security.InsecureBasicAuthQuery as InsecureBasicAuth
|
||||
import semmle.code.java.security.InsecureTrustManagerQuery as InsecureTrustManager
|
||||
import semmle.code.java.security.InsufficientKeySizeQuery as InsufficientKeySize
|
||||
import semmle.code.java.security.IntentUriPermissionManipulationQuery as IntentUriPermissionManipulation
|
||||
import semmle.code.java.security.JexlInjectionQuery as JexlInjection
|
||||
import semmle.code.java.security.JndiInjectionQuery as JndiInjection
|
||||
import semmle.code.java.security.LogInjectionQuery as LogInjection
|
||||
import semmle.code.java.security.MissingJWTSignatureCheckQuery as MissingJWTSignatureCheck
|
||||
import semmle.code.java.security.MvelInjectionQuery as MvelInjection
|
||||
import semmle.code.java.security.OgnlInjectionQuery as OgnlInjection
|
||||
import semmle.code.java.security.OverlyLargeRangeQuery as OverlyLargeRange
|
||||
import semmle.code.java.security.PartialPathTraversalQuery as PartialPathTraversal
|
||||
import semmle.code.java.security.RandomQuery as Random
|
||||
import semmle.code.java.security.RsaWithoutOaepQuery as RsaWithoutOaep
|
||||
import semmle.code.java.security.SensitiveKeyboardCacheQuery as SensitiveKeyboardCache
|
||||
import semmle.code.java.security.SensitiveLoggingQuery as SensitiveLogging
|
||||
import semmle.code.java.security.SpelInjectionQuery as SpelInjection
|
||||
import semmle.code.java.security.SqlInjectionQuery as SqlInjection
|
||||
import semmle.code.java.security.StaticInitializationVectorQuery as StaticInitializationVector
|
||||
import semmle.code.java.security.TemplateInjectionQuery as TemplateInjection
|
||||
import semmle.code.java.security.UnsafeAndroidAccessQuery as UnsafeAndroidAccess
|
||||
import semmle.code.java.security.UnsafeCertTrustQuery as UnsafeCertTrust
|
||||
import semmle.code.java.security.UnsafeContentUriResolutionQuery as UnsafeContentUriResolution
|
||||
import semmle.code.java.security.UnsafeDeserializationQuery as UnsafeDeserialization
|
||||
import semmle.code.java.security.WebviewDubuggingEnabledQuery as WebviewDubuggingEnabled
|
||||
import semmle.code.java.security.XsltInjectionQuery as XsltInjection
|
||||
|
||||
DataFlow::Node getASink(TaintTracking::Configuration cfg) {
|
||||
cfg.isSink(result) or cfg.isSink(result, _)
|
||||
}
|
||||
|
||||
DataFlow::Node getASource(TaintTracking::Configuration cfg) {
|
||||
cfg.isSource(result) or cfg.isSource(result, _)
|
||||
}
|
||||
|
||||
from TaintTracking::Configuration cfg, int sources, int sinks
|
||||
where count(getASource(cfg)) = sources and count(getASink(cfg)) = sinks
|
||||
select cfg, sources, sinks
|
||||
@@ -0,0 +1,59 @@
|
||||
/**
|
||||
* For internal use only.
|
||||
*
|
||||
* Defines files that should be excluded from the evaluation of ML models.
|
||||
*/
|
||||
|
||||
private import java
|
||||
|
||||
//TODO Couldn't find a library for the classifier so copy pasted predicate in java/ql/src/filters/ClassifyFiles.ql
|
||||
predicate classify(File f, string tag) {
|
||||
f instanceof GeneratedFile and tag = "generated"
|
||||
or
|
||||
exists(GeneratedClass gc | gc.getFile() = f | tag = "generated")
|
||||
or
|
||||
exists(TestClass tc | tc.getFile() = f | tag = "test")
|
||||
or
|
||||
exists(TestMethod tm | tm.getFile() = f | tag = "test")
|
||||
}
|
||||
|
||||
/** Holds if the file should be excluded from end-to-end evaluation. */
|
||||
predicate isFileExcluded(File file) {
|
||||
// Ignore files that are outside the root folder of the analyzed source location.
|
||||
//
|
||||
// If the file doesn't have a relative path, then the source file is located outside the root
|
||||
// folder of the analyzed source location, meaning that the files are additional files added to
|
||||
// the database like standard library files that we would like to ignore.
|
||||
not exists(file.getRelativePath())
|
||||
or
|
||||
// Ignore files based on their path.
|
||||
exists(string ignorePattern, string separator |
|
||||
ignorePattern =
|
||||
// Exclude test files
|
||||
"(tests?|test[_-]?case|" +
|
||||
// Exclude library files
|
||||
//
|
||||
// - The Bower and npm package managers store packages in bower_components and node_modules
|
||||
// folders respectively.
|
||||
// - Specific exclusion for end-to-end: `applications/examples/static/epydoc` contains
|
||||
// library code from Epydoc.
|
||||
"3rd[_-]?party|bower_components|extern(s|al)?|node_modules|resources|third[_-]?party|_?vendor|"
|
||||
+ "applications" + separator + "examples" + separator + "static" + separator + "epydoc|" +
|
||||
// Exclude generated code
|
||||
"gen|\\.?generated|" +
|
||||
// Exclude benchmarks
|
||||
"benchmarks?|" +
|
||||
// Exclude documentation
|
||||
"docs?|documentation)" and
|
||||
separator = "(\\/|\\.)" and
|
||||
exists(
|
||||
file.getRelativePath()
|
||||
.toLowerCase()
|
||||
.regexpFind(separator + ignorePattern + separator + "|" + "^" + ignorePattern + separator +
|
||||
"|" + separator + ignorePattern + "$", _, _)
|
||||
)
|
||||
)
|
||||
or
|
||||
// Ignore generated, library, and test files.
|
||||
classify(file, ["externs", "generated", "library", "test"])
|
||||
}
|
||||
@@ -0,0 +1,11 @@
|
||||
/*
|
||||
* For internal use only.
|
||||
*
|
||||
* Extracts training data we can use to train ML models for ML-powered queries.
|
||||
*/
|
||||
|
||||
private import ExtractEndpointDataTraining as ExtractEndpointDataTraining
|
||||
|
||||
query predicate endpoints = ExtractEndpointDataTraining::reformattedTrainingEndpoints/5;
|
||||
|
||||
query predicate tokenFeatures = ExtractEndpointDataTraining::tokenFeatures/3;
|
||||
@@ -0,0 +1,251 @@
|
||||
/**
|
||||
* For internal use only.
|
||||
*
|
||||
* Extracts training data we can use to train ML models for ML-powered queries.
|
||||
*/
|
||||
|
||||
import java
|
||||
import experimental.adaptivethreatmodeling.EndpointCharacteristics
|
||||
import experimental.adaptivethreatmodeling.EndpointFeatures as EndpointFeatures
|
||||
import NoFeaturizationRestrictionsConfig
|
||||
private import Exclusions as Exclusions
|
||||
import Queries
|
||||
private import experimental.adaptivethreatmodeling.SqlTaintedATM as SqlTaintedAtm
|
||||
private import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
|
||||
private import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestForgeryAtm
|
||||
|
||||
/**
|
||||
* Gets the set of featureName-featureValue pairs for each endpoint in the training set.
|
||||
*
|
||||
* `EndpointFeatures::tokenFeatures` has no results when `featureName` is absent for the endpoint
|
||||
* `endpoint`. To preserve compatibility with the data pipeline, this relation will instead set
|
||||
* `featureValue` to the empty string in this case.
|
||||
*/
|
||||
predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string featureValue) {
|
||||
trainingEndpoints(endpoint, _, _) and
|
||||
(
|
||||
EndpointFeatures::tokenFeatures(endpoint, featureName, featureValue)
|
||||
or
|
||||
// Performance note: this creates a Cartesian product between `endpoint` and `featureName`.
|
||||
featureName = EndpointFeatures::getASupportedFeatureName() and
|
||||
not exists(string value | EndpointFeatures::tokenFeatures(endpoint, featureName, value)) and
|
||||
featureValue = ""
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if the given endpoint should be included in the training set as a sample belonging to endpointClass, and has
|
||||
* the given characteristic. This query uses the endpoint characteristics to select and label endpoints for the training
|
||||
* set, and provides a list of characteristics for each endpoint in the training set, which is used in the modeling
|
||||
* code.
|
||||
*
|
||||
* Params:
|
||||
* endpoint: The endpoint to include / exclude.
|
||||
* endpointClass: The sink type. See the documentation of EndpointType.getEncoding for details about the relationship
|
||||
* between an EndpointType and a class in the classifier.
|
||||
* characteristic: Provides the list of characteristics that apply to the endpoint, which the modeling code currently
|
||||
* uses for type balancing.
|
||||
*
|
||||
* Note: This predicate will produce multiple tuples for endpoints that have multiple characteristics, which we must
|
||||
* then group together into a list of characteristics.
|
||||
*/
|
||||
query predicate trainingEndpoints(
|
||||
DataFlow::Node endpoint, EndpointType endpointClass, EndpointCharacteristic characteristic
|
||||
) {
|
||||
characteristic.appliesToEndpoint(endpoint) and
|
||||
// Only consider the source code for the project being analyzed.
|
||||
exists(endpoint.getLocation().getFile().getRelativePath()) and
|
||||
// Only select endpoints that can be part of a tainted flow: Constant expressions always evaluate to a constant
|
||||
// primitive value. Therefore they can't ever appear in an alert, making them less interesting training examples.
|
||||
// TODO: Experiment with removing this requirement.
|
||||
// not endpoint.asExpr() instanceof CompileTimeConstantExpr and
|
||||
not exists(EndpointFilterCharacteristic efc | efc.appliesToEndpoint(endpoint)) and
|
||||
// Do not select endpoints filtered out by end-to-end evaluation.
|
||||
// TODO: Experiment with removing this requirement.
|
||||
not Exclusions::isFileExcluded(endpoint.getLocation().getFile()) and
|
||||
// Filter out negative examples that also have a LikelyNotASinkReason, because this is currently done here
|
||||
// https://github.com/github/codeql/blob/387e57546bf7352f7c1cfe781daa1a3799b7063e/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/extraction/ExtractEndpointData.qll#L77
|
||||
// TODO: Experiment with removing this requirement.
|
||||
not (
|
||||
endpointClass instanceof NegativeType and
|
||||
exists(EndpointCharacteristic c |
|
||||
c.appliesToEndpoint(endpoint) and
|
||||
c instanceof LikelyNotASinkCharacteristic
|
||||
)
|
||||
) and
|
||||
// Don't surface endpoint filters as characteristics, because they were previously not surfaced.
|
||||
// TODO: Experiment with surfacing these to the modeling code by removing the following line (and then make
|
||||
// EndpointFilterCharacteristic private).
|
||||
not characteristic instanceof EndpointFilterCharacteristic and
|
||||
(
|
||||
// If the list of characteristics includes positive indicators with high confidence for this class, select this as a
|
||||
// training sample belonging to the class.
|
||||
exists(EndpointCharacteristic characteristic2, float confidence |
|
||||
characteristic2.appliesToEndpoint(endpoint) and
|
||||
characteristic2.hasImplications(endpointClass, true, confidence) and
|
||||
confidence >= characteristic2.getHighConfidenceThreshold()
|
||||
) and
|
||||
(
|
||||
// Temporarily limit this only to positive classes. For negative classes, additionally select only endpoints that
|
||||
// have no high confidence indicators that they are sinks, because this is what was previously done.
|
||||
// TODO: Experiment with removing this requirement, and instead ensuring that an endpoint never has both a high
|
||||
// confidence indicator that it _is_ a sink and a high confidence indicator that it is _not_ a sink.
|
||||
not endpointClass instanceof NegativeType
|
||||
or
|
||||
not exists(EndpointCharacteristic characteristic3, float confidence3, EndpointType posClass |
|
||||
characteristic3.appliesToEndpoint(endpoint) and
|
||||
characteristic3.hasImplications(posClass, true, confidence3) and
|
||||
confidence3 >= characteristic3.getHighConfidenceThreshold() and
|
||||
not posClass instanceof NegativeType
|
||||
)
|
||||
)
|
||||
or
|
||||
// If the list of characteristics includes negative indicators with high confidence for all classes other than 0,
|
||||
// select this as a training sample of class 0 (this means we had query-specific characteristics to decide this
|
||||
// endpoint isn't a sink for each of our sink types).
|
||||
endpointClass instanceof NegativeType and
|
||||
forall(EndpointType otherClass | not otherClass instanceof NegativeType |
|
||||
exists(EndpointCharacteristic characteristic2, float confidence |
|
||||
characteristic2.appliesToEndpoint(endpoint) and
|
||||
characteristic2.hasImplications(otherClass, false, confidence) and
|
||||
confidence >= characteristic2.getHighConfidenceThreshold()
|
||||
)
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Temporary:
|
||||
* Reformat the training data that was extracted with the new logic to match the format produced by the old predicate.
|
||||
* This is the format expected by the endpoint pipeline.
|
||||
*/
|
||||
query predicate reformattedTrainingEndpoints(
|
||||
DataFlow::Node endpoint, string queryName, string key, string value, string valueType
|
||||
) {
|
||||
trainingEndpoints(endpoint, _, _) and
|
||||
exists(Query query |
|
||||
queryName = query.getName() and
|
||||
// For sinks, only list that sink type, but for non-sinks, list all sink types.
|
||||
(
|
||||
exists(EndpointType endpointClass |
|
||||
endpointClass.getDescription().matches(queryName + "%") and
|
||||
not endpointClass instanceof NegativeType and
|
||||
trainingEndpoints(endpoint, endpointClass, _)
|
||||
)
|
||||
or
|
||||
exists(EndpointType endpointClass |
|
||||
endpointClass instanceof NegativeType and
|
||||
trainingEndpoints(endpoint, endpointClass, _)
|
||||
)
|
||||
) and
|
||||
(
|
||||
// NOTE: We don't use hasFlowFromSource in training, so we could just hardcode it to be false.
|
||||
key = "hasFlowFromSource" and
|
||||
(
|
||||
if FlowFromSource::hasFlowFromSource(endpoint, query)
|
||||
then value = "true"
|
||||
else value = "false"
|
||||
) and
|
||||
valueType = "boolean"
|
||||
or
|
||||
// Constant expressions always evaluate to a constant primitive value. Therefore they can't ever
|
||||
// appear in an alert, making them less interesting training examples.
|
||||
key = "isConstantExpression" and
|
||||
(
|
||||
if endpoint.asExpr() instanceof CompileTimeConstantExpr
|
||||
then value = "true"
|
||||
else value = "false"
|
||||
) and
|
||||
valueType = "boolean"
|
||||
or
|
||||
// Holds if alerts involving the endpoint are excluded from the end-to-end evaluation.
|
||||
key = "isExcludedFromEndToEndEvaluation" and
|
||||
(
|
||||
if Exclusions::isFileExcluded(endpoint.getLocation().getFile())
|
||||
then value = "true"
|
||||
else value = "false"
|
||||
) and
|
||||
valueType = "boolean"
|
||||
or
|
||||
// The label for this query, considering the endpoint as a sink.
|
||||
key = "sinkLabel" and
|
||||
valueType = "string" and
|
||||
value = "Sink" and
|
||||
exists(EndpointType endpointClass |
|
||||
endpointClass.getDescription().matches(queryName + "%") and
|
||||
not endpointClass instanceof NegativeType and
|
||||
trainingEndpoints(endpoint, endpointClass, _)
|
||||
)
|
||||
or
|
||||
key = "sinkLabel" and
|
||||
valueType = "string" and
|
||||
value = "NotASink" and
|
||||
exists(EndpointType endpointClass |
|
||||
endpointClass instanceof NegativeType and
|
||||
trainingEndpoints(endpoint, endpointClass, _)
|
||||
)
|
||||
or
|
||||
// The reason, or reasons, why the endpoint was labeled NotASink for this query, only for negative examples.
|
||||
key = "notASinkReason" and
|
||||
exists(EndpointCharacteristic characteristic, EndpointType endpointClass |
|
||||
characteristic.appliesToEndpoint(endpoint) and
|
||||
characteristic.hasImplications(endpointClass, true, _) and
|
||||
endpointClass instanceof NegativeType and
|
||||
value = characteristic
|
||||
) and
|
||||
// Don't include a notASinkReason for endpoints that are also known sinks.
|
||||
not exists(EndpointCharacteristic characteristic3, float confidence3, EndpointType posClass |
|
||||
characteristic3.appliesToEndpoint(endpoint) and
|
||||
characteristic3.hasImplications(posClass, true, confidence3) and
|
||||
confidence3 >= characteristic3.getHighConfidenceThreshold() and
|
||||
not posClass instanceof NegativeType
|
||||
) and
|
||||
// Don't surface endpoint filters as notASinkReasons, because they were previously not surfaced.
|
||||
// TODO: Experiment with surfacing these to the modeling code by removing the following line (and then make
|
||||
// EndpointFilterCharacteristic private).
|
||||
not value instanceof EndpointFilterCharacteristic and
|
||||
valueType = "string"
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the ATM data flow configuration for the specified query.
|
||||
* TODO: Delete this once we are no longer surfacing `hasFlowFromSource`.
|
||||
*/
|
||||
DataFlow::Configuration getDataFlowCfg(Query query) {
|
||||
query instanceof SqlTaintedQuery and result instanceof SqlTaintedAtm::SqlTaintedAtmConfig
|
||||
or
|
||||
query instanceof TaintedPathQuery and result instanceof TaintedPathAtm::TaintedPathAtmConfig
|
||||
or
|
||||
query instanceof RequestForgeryQuery and
|
||||
result instanceof RequestForgeryAtm::RequestForgeryAtmConfig
|
||||
}
|
||||
|
||||
// TODO: Delete this once we are no longer surfacing `hasFlowFromSource`.
|
||||
private module FlowFromSource {
|
||||
predicate hasFlowFromSource(DataFlow::Node endpoint, Query q) {
|
||||
exists(Configuration cfg | cfg.getQuery() = q | cfg.hasFlow(_, endpoint))
|
||||
}
|
||||
|
||||
/**
|
||||
* A data flow configuration that replicates the data flow configuration for a specific query, but
|
||||
* replaces the set of sinks with the set of endpoints we're extracting.
|
||||
*
|
||||
* We use this to find out when there is flow to a particular endpoint from a known source.
|
||||
*
|
||||
* This configuration behaves in a very similar way to the `ForwardExploringConfiguration` class
|
||||
* from the CodeQL standard libraries for JavaScript.
|
||||
*/
|
||||
private class Configuration extends DataFlow::Configuration {
|
||||
Query q;
|
||||
|
||||
Configuration() { this = getDataFlowCfg(q) }
|
||||
|
||||
Query getQuery() { result = q }
|
||||
|
||||
/** Holds if `sink` is an endpoint we're extracting. */
|
||||
override predicate isSink(DataFlow::Node sink) { any() }
|
||||
// override predicate isSink(DataFlow::Node sink, DataFlow::FlowLabel lbl) { exists(lbl) }
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,11 @@
|
||||
/**
|
||||
* @name Endpoint types
|
||||
* @description Maps endpoint type encodings to human-readable descriptions.
|
||||
* @kind table
|
||||
* @id java/ml-powered/model-building/endpoint-type-encodings
|
||||
*/
|
||||
|
||||
import experimental.adaptivethreatmodeling.EndpointTypes
|
||||
|
||||
from EndpointType type
|
||||
select type.getEncoding() as label, type.getDescription() as labelName order by label
|
||||
@@ -0,0 +1,25 @@
|
||||
/*
|
||||
* For internal use only.
|
||||
*
|
||||
* Maps ML-powered queries to their `EndpointType` for clearer labelling while evaluating ML model during training.
|
||||
*/
|
||||
|
||||
import experimental.adaptivethreatmodeling.SqlTaintedATM as SqlTaintedAtm
|
||||
import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
|
||||
import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestForgeryAtm
|
||||
import experimental.adaptivethreatmodeling.AdaptiveThreatModeling
|
||||
|
||||
from string queryName, AtmConfig c, EndpointType e
|
||||
where
|
||||
(
|
||||
queryName = "SqlTainted" and
|
||||
c instanceof SqlTaintedAtm::SqlTaintedAtmConfig
|
||||
or
|
||||
queryName = "TaintedPath" and
|
||||
c instanceof TaintedPathAtm::TaintedPathAtmConfig
|
||||
or
|
||||
queryName = "RequestForgery" and
|
||||
c instanceof RequestForgeryAtm::RequestForgeryAtmConfig
|
||||
) and
|
||||
e = c.getASinkEndpointType()
|
||||
select queryName, e.getEncoding() as label
|
||||
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
* For internal use only.
|
||||
*
|
||||
* Query for finding misclassified endpoints which we can use to debug ML-powered queries.
|
||||
*/
|
||||
|
||||
import java
|
||||
import experimental.adaptivethreatmodeling.AdaptiveThreatModeling
|
||||
import experimental.adaptivethreatmodeling.ATMConfig
|
||||
import experimental.adaptivethreatmodeling.BaseScoring
|
||||
import experimental.adaptivethreatmodeling.EndpointFeatures as EndpointFeatures
|
||||
import experimental.adaptivethreatmodeling.EndpointTypes
|
||||
import semmle.code.java.security.QueryInjection
|
||||
|
||||
/** Gets the positive endpoint type for which you wish to find misclassified examples. */
|
||||
EndpointType getEndpointType() { result instanceof SqlTaintedSinkType }
|
||||
|
||||
/** Get a positive endpoint. This will be run through the classifier to determine whether it is misclassified. */
|
||||
DataFlow::Node getAPositiveEndpoint() { result instanceof QueryInjectionSink }
|
||||
|
||||
/** An ATM configuration to find misclassified endpoints of type `getEndpointType()`. */
|
||||
class ExtractMisclassifiedEndpointsAtmConfig extends AtmConfig {
|
||||
ExtractMisclassifiedEndpointsAtmConfig() { this = "ExtractMisclassifiedEndpointsATMConfig" }
|
||||
|
||||
override predicate isEffectiveSink(DataFlow::Node sinkCandidate) {
|
||||
sinkCandidate = getAPositiveEndpoint()
|
||||
}
|
||||
|
||||
override EndpointType getASinkEndpointType() { result = getEndpointType() }
|
||||
}
|
||||
|
||||
/** Get an endpoint from `getAPositiveEndpoint()` that is incorrectly excluded from the results. */
|
||||
DataFlow::Node getAMisclassifedEndpoint() {
|
||||
any(ExtractMisclassifiedEndpointsAtmConfig config).isEffectiveSink(result) and
|
||||
not any(ScoringResults results).shouldResultBeIncluded(_, result)
|
||||
}
|
||||
|
||||
/** The token features for each misclassified endpoint. */
|
||||
query predicate tokenFeaturesForMisclassifiedEndpoints(
|
||||
DataFlow::Node endpoint, string featureName, string featureValue
|
||||
) {
|
||||
endpoint = getAMisclassifedEndpoint() and
|
||||
EndpointFeatures::tokenFeatures(endpoint, featureName, featureValue)
|
||||
}
|
||||
@@ -0,0 +1,52 @@
|
||||
/**
|
||||
* Surfaces endpoints are non-sinks with high confidence, for use as negative examples in the prompt.
|
||||
*
|
||||
* @name Negative examples (experimental)
|
||||
* @kind problem
|
||||
* @id java/ml-powered/non-sink
|
||||
* @tags experimental security
|
||||
*/
|
||||
|
||||
private import java
|
||||
import semmle.code.java.dataflow.TaintTracking
|
||||
private import experimental.adaptivethreatmodeling.EndpointCharacteristics as EndpointCharacteristics
|
||||
private import experimental.adaptivethreatmodeling.EndpointTypes
|
||||
|
||||
bindingset[rate]
|
||||
DataFlow::Node getSampleFromSampleRate(float rate) {
|
||||
exists(int r |
|
||||
result =
|
||||
rank[r](DataFlow::Node n, string path, int a, int b, int c, int d |
|
||||
n.asExpr().getLocation().hasLocationInfo(path, a, b, c, d)
|
||||
|
|
||||
n order by path, a, b, c, d
|
||||
) and
|
||||
r % (1 / rate).ceil() = 0
|
||||
)
|
||||
}
|
||||
|
||||
from
|
||||
DataFlow::Node endpoint, EndpointCharacteristics::EndpointCharacteristic characteristic,
|
||||
float confidence
|
||||
where
|
||||
characteristic.appliesToEndpoint(endpoint) and
|
||||
confidence >= characteristic.highConfidence() and
|
||||
characteristic.hasImplications(any(NegativeType negative), true, confidence) and
|
||||
// Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly
|
||||
// certain about in the prompt.
|
||||
not EndpointCharacteristics::erroneousEndpoints(endpoint, _, _, _, _) and
|
||||
// It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be
|
||||
// treated by the actual query as a sanitizer, since the final logic is something like
|
||||
// `isSink(n) and not isSanitizer(n)`. We don't want to include such nodes as negative examples in the prompt, because
|
||||
// they're ambiguous and might confuse the model, so we explicitly exclude all known sinks from the negative examples.
|
||||
not exists(
|
||||
EndpointCharacteristics::EndpointCharacteristic characteristic2, float confidence2,
|
||||
EndpointType positiveType
|
||||
|
|
||||
characteristic2.appliesToEndpoint(endpoint) and
|
||||
confidence2 >= characteristic2.maximalConfidence() and
|
||||
not positiveType instanceof NegativeType and
|
||||
characteristic2.hasImplications(positiveType, true, confidence2)
|
||||
) and
|
||||
endpoint = getSampleFromSampleRate(0.01)
|
||||
select endpoint, "Non-sink of type " + characteristic + " with confidence " + confidence.toString()
|
||||
@@ -0,0 +1,34 @@
|
||||
/**
|
||||
* Surfaces endpoints are sinks with high confidence, for use as positive examples in the prompt.
|
||||
*
|
||||
* @name Positive examples (experimental)
|
||||
* @kind problem
|
||||
* @id java/ml-powered/known-sink
|
||||
* @tags experimental security
|
||||
*/
|
||||
|
||||
private import java
|
||||
import semmle.code.java.dataflow.TaintTracking
|
||||
private import experimental.adaptivethreatmodeling.EndpointCharacteristics as EndpointCharacteristics
|
||||
private import experimental.adaptivethreatmodeling.ATMConfig as AtmConfig
|
||||
private import experimental.adaptivethreatmodeling.SqlTaintedATM as SqlTaintednAtm
|
||||
private import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
|
||||
private import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestForgeryAtm
|
||||
|
||||
/*
|
||||
* ****** WARNING: ******
|
||||
* Before calling this query, make sure there's no codex-generated data extension file in `java/ql/lib/ext`. Otherwise,
|
||||
* the ML-gnerarated, noisy sinks will end up poluting the positive examples used in the prompt!
|
||||
*/
|
||||
|
||||
from DataFlow::Node sink, AtmConfig::AtmConfig config
|
||||
where
|
||||
config.isKnownSink(sink) and
|
||||
// If there are _any_ erroneous endpoints, return nothing. This will prevent us from accidentally running this query
|
||||
// when there's a codex-generated data extension file in `java/ql/lib/ext`.
|
||||
not EndpointCharacteristics::erroneousEndpoints(_, _, _, _, _) and
|
||||
// It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be
|
||||
// treated by the actual query as a sanitizer, since the final logic is something like
|
||||
// `isSink(n) and not isSanitizer(n)`. We don't want to include such nodes as positive examples in the prompt.
|
||||
not config.isSanitizer(sink)
|
||||
select sink, config.getASinkEndpointType().getDescription()
|
||||
@@ -0,0 +1,48 @@
|
||||
/**
|
||||
* Surfaces the endpoints that pass the endpoint filters and have flow from a source for each query config, and are
|
||||
* therefore used as candidates for classification with an ML model.
|
||||
*
|
||||
* Note: This query does not actually classify the endpoints using the model.
|
||||
*
|
||||
* @name Sink candidates with flow (experimental)
|
||||
* @description Sink candidates with flow from a source
|
||||
* @kind problem
|
||||
* @id java/ml-powered/sink-candidates-with-flow
|
||||
* @tags experimental security
|
||||
*/
|
||||
|
||||
private import java
|
||||
import semmle.code.java.dataflow.TaintTracking
|
||||
private import experimental.adaptivethreatmodeling.ATMConfig as AtmConfig
|
||||
private import experimental.adaptivethreatmodeling.SqlTaintedATM as SqlTaintedAtm
|
||||
private import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
|
||||
private import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestForgeryAtm
|
||||
|
||||
from
|
||||
DataFlow::Node sink, string message, string package, string type, boolean subtypes, string name,
|
||||
string signature, string ext, string input, string provenance
|
||||
where
|
||||
exists(Callable callee, Call call, int index |
|
||||
sink.asExpr() = call.getArgument(index) and
|
||||
callee = call.getCallee() and
|
||||
package = callee.getDeclaringType().getPackage().getName() and
|
||||
type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations?
|
||||
subtypes = true and // TODO
|
||||
name = callee.getName() and // TODO: Will this work for constructors?
|
||||
signature = callee.paramsString() and
|
||||
ext = "" and // TODO
|
||||
input = "Argument[" + index + "]" and // TODO: why are slashes added?
|
||||
provenance = "manual" // TODO
|
||||
) and
|
||||
// The message is the concatenation of all relevant configs, and we surface only sinks that have at least one relevant
|
||||
// config.
|
||||
message =
|
||||
strictconcat(AtmConfig::AtmConfig config, DataFlow::PathNode sinkPathNode |
|
||||
config.isSinkCandidateWithFlow(sinkPathNode) and
|
||||
sinkPathNode.getNode() = sink
|
||||
|
|
||||
config.getASinkEndpointType().getDescription(), ", "
|
||||
) + "\n{'package': '" + package + "', 'type': '" + type + "', 'subtypes': " + subtypes +
|
||||
", 'name': '" + name + "', 'signature': '" + signature + "', 'ext': '" + ext + "', 'input': '"
|
||||
+ input + "', 'provenance': '" + provenance + "'}" // TODO: Why are the curly braces added twice?
|
||||
select sink, message
|
||||
@@ -0,0 +1,29 @@
|
||||
/*
|
||||
* For internal use only.
|
||||
*
|
||||
* Labels used in training and evaluation data to indicate knowledge about whether an endpoint is a
|
||||
* sink for a particular security query.
|
||||
*/
|
||||
|
||||
newtype TEndpointLabel =
|
||||
TSinkLabel() or
|
||||
TNotASinkLabel() or
|
||||
TUnknownLabel()
|
||||
|
||||
abstract class EndpointLabel extends TEndpointLabel {
|
||||
abstract string getEncoding();
|
||||
|
||||
string toString() { result = getEncoding() }
|
||||
}
|
||||
|
||||
class SinkLabel extends EndpointLabel, TSinkLabel {
|
||||
override string getEncoding() { result = "Sink" }
|
||||
}
|
||||
|
||||
class NotASinkLabel extends EndpointLabel, TNotASinkLabel {
|
||||
override string getEncoding() { result = "NotASink" }
|
||||
}
|
||||
|
||||
class UnknownLabel extends EndpointLabel, TUnknownLabel {
|
||||
override string getEncoding() { result = "Unknown" }
|
||||
}
|
||||
@@ -0,0 +1,17 @@
|
||||
/*
|
||||
* For internal use only.
|
||||
*/
|
||||
|
||||
private import experimental.adaptivethreatmodeling.FeaturizationConfig
|
||||
private import semmle.code.java.dataflow.DataFlow::DataFlow as DataFlow
|
||||
|
||||
/**
|
||||
* A featurization config that featurizes all endpoints.
|
||||
*
|
||||
* This should only be used in extraction queries and tests.
|
||||
*/
|
||||
class NoRestrictionsFeaturizationConfig extends FeaturizationConfig {
|
||||
NoRestrictionsFeaturizationConfig() { this = "NoRestrictionsFeaturization" }
|
||||
|
||||
override DataFlow::Node getAnEndpointToFeaturize() { any() }
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
/*
|
||||
* For internal use only.
|
||||
*
|
||||
* Represents the security queries for which we currently have ML-powered versions.
|
||||
*/
|
||||
|
||||
newtype TQuery =
|
||||
TSqlTaintedQuery() or
|
||||
TTaintedPathQuery() or
|
||||
TRequestForgeryQuery()
|
||||
|
||||
abstract class Query extends TQuery {
|
||||
abstract string getName();
|
||||
|
||||
string toString() { result = getName() }
|
||||
}
|
||||
|
||||
class SqlTaintedQuery extends Query, TSqlTaintedQuery {
|
||||
override string getName() { result = "SqlTainted" }
|
||||
}
|
||||
|
||||
class TaintedPathQuery extends Query, TTaintedPathQuery {
|
||||
override string getName() { result = "TaintedPath" }
|
||||
}
|
||||
|
||||
class RequestForgeryQuery extends Query, TRequestForgeryQuery {
|
||||
override string getName() { result = "RequestForgery" }
|
||||
}
|
||||
@@ -0,0 +1,10 @@
|
||||
name: codeql/java-experimental-atm-model-building
|
||||
description: CodeQL libraries for building machine learning models for the experimental ML-powered queries
|
||||
extractor: java
|
||||
library: false
|
||||
groups:
|
||||
- java
|
||||
- experimental
|
||||
dependencies:
|
||||
codeql/java-experimental-atm-lib: ${workspace}
|
||||
codeql/java-experimental-atm-model: "0.0.0"
|
||||
@@ -0,0 +1,25 @@
|
||||
/**
|
||||
* For internal use only.
|
||||
*
|
||||
* @name Server-side request forgery (experimental)
|
||||
* @description Making web requests based on unvalidated user-input
|
||||
* may cause the server to communicate with malicious servers.
|
||||
* @kind path-problem
|
||||
* @scored
|
||||
* @problem.severity error
|
||||
* @security-severity 9.1
|
||||
* @precision high
|
||||
* @id java/ml-powered/ssrf
|
||||
* @tags experimental security
|
||||
* external/cwe/cwe-918
|
||||
*/
|
||||
|
||||
import experimental.adaptivethreatmodeling.RequestForgeryATM
|
||||
import AtmResultsInfo
|
||||
import DataFlow::PathGraph
|
||||
|
||||
from AtmConfig cfg, DataFlow::PathNode source, DataFlow::PathNode sink, float score
|
||||
where cfg.hasBoostedFlowPath(source, sink, score)
|
||||
select sink.getNode(), source, sink,
|
||||
"(Experimental) Potential server-side request forgery due to a $@.", source.getNode(),
|
||||
"user-provided value", score
|
||||
@@ -0,0 +1,25 @@
|
||||
/**
|
||||
* For internal use only.
|
||||
*
|
||||
* @name Query built from user-controlled sources (experimental)
|
||||
* @description Building a SQL or Java Persistence query from user-controlled sources is vulnerable to insertion of
|
||||
* malicious code by the user.
|
||||
* @kind path-problem
|
||||
* @scored
|
||||
* @problem.severity error
|
||||
* @security-severity 8.8
|
||||
* @precision high
|
||||
* @id java/ml-powered/sql-injection
|
||||
* @tags experimental security
|
||||
* external/cwe/cwe-089
|
||||
* external/cwe/cwe-564
|
||||
*/
|
||||
|
||||
import experimental.adaptivethreatmodeling.SqlTaintedATM
|
||||
import AtmResultsInfo
|
||||
import DataFlow::PathGraph
|
||||
|
||||
from AtmConfig cfg, DataFlow::PathNode source, DataFlow::PathNode sink, float score
|
||||
where cfg.hasBoostedFlowPath(source, sink, score)
|
||||
select sink.getNode(), source, sink, "(Experimental) This query depends on a $@.", source.getNode(),
|
||||
"user-provided value", score
|
||||
@@ -0,0 +1,26 @@
|
||||
/**
|
||||
* For internal use only.
|
||||
*
|
||||
* @name Uncontrolled data used in path expression (experimental)
|
||||
* @description Accessing paths influenced by users can allow an attacker to access unexpected resources.
|
||||
* @kind path-problem
|
||||
* @scored
|
||||
* @problem.severity error
|
||||
* @security-severity 7.5
|
||||
* @precision high
|
||||
* @id java/ml-powered/path-injection
|
||||
* @tags experimental security
|
||||
* external/cwe/cwe-022
|
||||
* external/cwe/cwe-023
|
||||
* external/cwe/cwe-036
|
||||
* external/cwe/cwe-073
|
||||
*/
|
||||
|
||||
import experimental.adaptivethreatmodeling.TaintedPathATM
|
||||
import AtmResultsInfo
|
||||
import DataFlow::PathGraph
|
||||
|
||||
from AtmConfig cfg, DataFlow::PathNode source, DataFlow::PathNode sink, float score
|
||||
where cfg.hasBoostedFlowPath(source, sink, score)
|
||||
select sink.getNode(), source, sink, "(Experimental) This path depends on a $@.", source.getNode(),
|
||||
"user-provided value", score
|
||||
@@ -0,0 +1,6 @@
|
||||
---
|
||||
dependencies:
|
||||
codeql/java-experimental-atm-model:
|
||||
version: 0.0.1
|
||||
compiled: false
|
||||
lockVersion: 1.0.0
|
||||
@@ -0,0 +1,2 @@
|
||||
- description: ATM boosted Code Scanning queries for Java
|
||||
- queries: .
|
||||
12
java/ql/experimental/adaptivethreatmodeling/src/qlpack.yml
Normal file
12
java/ql/experimental/adaptivethreatmodeling/src/qlpack.yml
Normal file
@@ -0,0 +1,12 @@
|
||||
name: codeql/java-experimental-atm-queries
|
||||
description: Experimental ML-powered queries for Java
|
||||
language: java
|
||||
version: 0.4.5
|
||||
suites: codeql-suites
|
||||
defaultSuiteFile: codeql-suites/java-atm-code-scanning.qls
|
||||
groups:
|
||||
- java
|
||||
- experimental
|
||||
dependencies:
|
||||
codeql/java-experimental-atm-lib: ${workspace}
|
||||
codeql/java-experimental-atm-model: "0.0.1"
|
||||
2
java/ql/experimental/adaptivethreatmodeling/test/.gitignore
vendored
Normal file
2
java/ql/experimental/adaptivethreatmodeling/test/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
**/*.testproj
|
||||
**/*.actual
|
||||
@@ -0,0 +1,6 @@
|
||||
---
|
||||
dependencies:
|
||||
codeql/java-experimental-atm-model:
|
||||
version: 0.3.0
|
||||
compiled: false
|
||||
lockVersion: 1.0.0
|
||||
@@ -0,0 +1,4 @@
|
||||
name: codeql/java-experimental-atm-tests
|
||||
extractor: java
|
||||
dependencies:
|
||||
codeql/java-experimental-atm-model-building: ${workspace}
|
||||
@@ -5,4 +5,4 @@ groups:
|
||||
- javascript
|
||||
- experimental
|
||||
mlModels:
|
||||
- "resources/*.codeqlmodel"
|
||||
- "resources/shellcommand.codeqlmodel"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
dependencies:
|
||||
codeql/javascript-experimental-atm-model:
|
||||
version: 0.3.0
|
||||
dsp-testing/javascript-experimental-atm-model:
|
||||
version: 0.3.1-2022-12-21-01h55m24s.gray-roof-szzhgkwk.689231edea6179400bcffbcb0e7f6eb2bacd29c6be27a2930dd4f63ccdb64f34
|
||||
compiled: false
|
||||
lockVersion: 1.0.0
|
||||
|
||||
Reference in New Issue
Block a user