Automodel extraction queries in java telemetry query directory

This commit is contained in:
Stephan Brandauer
2023-04-14 12:35:59 +02:00
parent 246d904712
commit 6eefb268dd
6 changed files with 757 additions and 0 deletions

View File

@@ -0,0 +1,320 @@
/**
* For internal use only.
*/
private import java
private import semmle.code.java.dataflow.DataFlow
private import semmle.code.java.dataflow.TaintTracking
private import semmle.code.java.security.PathCreation
private import semmle.code.java.dataflow.ExternalFlow as ExternalFlow
private import semmle.code.java.dataflow.internal.FlowSummaryImpl as FlowSummaryImpl
private import semmle.code.java.security.ExternalAPIs as ExternalAPIs
private import semmle.code.java.Expr as Expr
private import semmle.code.java.security.QueryInjection
private import semmle.code.java.security.RequestForgery
import AutomodelSharedCharacteristics as SharedCharacteristics
import AutomodelEndpointTypes as AutomodelEndpointTypes
/**
 * The Java implementation of `SharedCharacteristics::CandidateSig`.
 *
 * Endpoints are parameter nodes (`DataFlow::ParameterNode`); known sinks and neutrals are looked up through
 * `ExternalFlow::sinkModel` and `ExternalFlow::neutralModel`.
 */
module CandidatesImpl implements SharedCharacteristics::CandidateSig {
  class Endpoint = DataFlow::ParameterNode;

  class EndpointType = AutomodelEndpointTypes::EndpointType;

  /** Holds if `t` is the negative ("non-sink") endpoint type. */
  predicate isNegative(AutomodelEndpointTypes::EndpointType t) {
    t instanceof AutomodelEndpointTypes::NegativeSinkType
  }

  /** Gets the string form of the location of `e`. */
  string getLocationString(Endpoint e) { result = e.getLocation().toString() }

  /**
   * Holds if `label` is a sink label known to this module, `humanReadableLabel` is the name used for the
   * corresponding characteristic, and `type` is the endpoint type the label maps to.
   */
  predicate isKnownLabel(string label, string humanReadableLabel, EndpointType type) {
    label = "read-file" and
    humanReadableLabel = "read file" and
    type instanceof AutomodelEndpointTypes::TaintedPathSinkType
    or
    label = "create-file" and
    humanReadableLabel = "create file" and
    type instanceof AutomodelEndpointTypes::TaintedPathSinkType
    or
    label = "sql" and
    humanReadableLabel = "mad modeled sql" and
    type instanceof AutomodelEndpointTypes::SqlSinkType
    or
    label = "open-url" and
    humanReadableLabel = "open url" and
    type instanceof AutomodelEndpointTypes::RequestForgerySinkType
    or
    label = "jdbc-url" and
    humanReadableLabel = "jdbc url" and
    type instanceof AutomodelEndpointTypes::RequestForgerySinkType
    or
    label = "command-injection" and
    humanReadableLabel = "command injection" and
    type instanceof AutomodelEndpointTypes::CommandInjectionSinkType
  }

  /**
   * Holds if the specification computed by `sinkSpec` for `e` has a sink model with the given `label`.
   *
   * The model's signature column may either equal the computed signature or be empty.
   */
  predicate isSink(Endpoint e, string label) {
    exists(
      string package, string type, boolean subtypes, string name, string signature, string ext,
      string input
    |
      sinkSpec(e, package, type, subtypes, name, signature, ext, input) and
      ExternalFlow::sinkModel(package, type, subtypes, name, [signature, ""], ext, input, label, _)
    )
  }

  /**
   * Holds if the callable described by `sinkSpec` for `e` has a neutral model.
   *
   * The model's signature column may either equal the computed signature or be empty.
   */
  predicate isNeutral(Endpoint e) {
    exists(string package, string type, string name, string signature |
      sinkSpec(e, package, type, _, name, signature, _, _) and
      ExternalFlow::neutralModel(package, type, name, [signature, ""], _)
    )
  }

  /**
   * Holds if the remaining columns describe `e` in the column layout passed to `ExternalFlow::sinkModel`:
   * the package, type name, callable name and parameter signature of the enclosing callable, plus
   * `Argument[i]` for the parameter index `i` of `e`. `subtypes` is always `false` and `ext` is always empty.
   */
  additional predicate sinkSpec(
    Endpoint e, string package, string type, boolean subtypes, string name, string signature,
    string ext, string input
  ) {
    package = e.getEnclosingCallable().getDeclaringType().getPackage().toString() and
    type = e.getEnclosingCallable().getDeclaringType().getName() and
    subtypes = false and
    name = e.getEnclosingCallable().getName() and
    signature = ExternalFlow::paramsString(e.getEnclosingCallable()) and
    ext = "" and
    exists(int paramIdx | e.isParameterOf(_, paramIdx) | input = "Argument[" + paramIdx + "]")
  }

  /**
   * Holds if `metadata` is the metadata string exported for `n`, formatted like a Python dictionary.
   *
   * String values are wrapped in single quotes; the boolean and integer values (`Subtypes`,
   * `Argument index`, `Is public`) are emitted unquoted. Single quotes inside the Javadoc text are
   * replaced by double quotes so that they do not terminate the quoted value.
   */
  predicate hasMetadata(Endpoint n, string metadata) {
    exists(
      string package, string type, boolean subtypes, string name, string signature, string ext,
      int input, string provenance, boolean isPublic, boolean isFinal, string calleeJavaDoc
    |
      hasMetadata(n, package, type, name, signature, input, isFinal, isPublic, calleeJavaDoc) and
      // A final callable (or a callable of a final type) cannot be overridden, so subtypes are irrelevant.
      (if isFinal = true then subtypes = false else subtypes = true) and
      ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
      provenance = "ai-generated" and
      metadata =
        "{" //
          + "'Package': '" + package //
          + "', 'Type': '" + type //
          + "', 'Subtypes': " + subtypes //
          + ", 'Name': '" + name //
          + "', 'Signature': '" + signature //
          + "', 'Ext': '" + ext //
          + "', 'Argument index': " + input //
          + ", 'Provenance': '" + provenance //
          + "', 'Is public': " + isPublic //
          // `isPublic` is unquoted, so the next key is introduced by `, ` and not by `', `.
          + ", 'Callee JavaDoc': '" + calleeJavaDoc.replaceAll("'", "\"") //
          + "'}" // TODO: Why are the curly braces added twice?
    )
  }
}
/** The shared characteristics module, instantiated with the Java candidate implementation `CandidatesImpl`. */
module CharacteristicsImpl = SharedCharacteristics::SharedCharacteristics<CandidatesImpl>;
/** Alias for the endpoint characteristic class of the Java instantiation. */
class EndpointCharacteristic = CharacteristicsImpl::EndpointCharacteristic;
/** Alias for the endpoint class used by the Java candidate implementation. */
class Endpoint = CandidatesImpl::Endpoint;
/*
* Predicates that are used to surface prompt examples and candidates for classification with an ML model.
*/
/**
 * Relates the endpoint `n` to the individual pieces of metadata that are exported for it.
 *
 * `n` is the parameter at index `input` of some callable. The other columns describe that callable: the
 * package name of its declaring type (`package`), the nested name of the erasure of its declaring type
 * (`type`), the name of its source declaration (`name`), its parameter string (`signature`), whether the
 * callable or its declaring type is final (`isFinal`), whether it is public (`isPublic`), and the string
 * form of its Javadoc, or the empty string if it has none (`calleeJavaDoc`).
 */
predicate hasMetadata(
  Endpoint n, string package, string type, string name, string signature, int input,
  boolean isFinal, boolean isPublic, string calleeJavaDoc
) {
  exists(Callable callee |
    n.asParameter() = callee.getParameter(input) and
    // Identification of the callable.
    package = callee.getDeclaringType().getPackage().getName() and
    type = callee.getDeclaringType().getErasure().(RefType).nestedName() and
    name = callee.getSourceDeclaration().getName() and
    signature = ExternalFlow::paramsString(callee) and // TODO: Why are brackets being escaped (`\[\]` vs `[]`)?
    // Modifiers, each encoded as a boolean column.
    (
      (callee.isFinal() or callee.getDeclaringType().isFinal()) and isFinal = true
      or
      not (callee.isFinal() or callee.getDeclaringType().isFinal()) and isFinal = false
    ) and
    (
      callee.isPublic() and isPublic = true
      or
      not callee.isPublic() and isPublic = false
    ) and
    // Documentation, defaulting to the empty string.
    (
      exists(callee.(Documentable).getJavadoc()) and
      calleeJavaDoc = callee.(Documentable).getJavadoc().toString()
      or
      not exists(callee.(Documentable).getJavadoc()) and
      calleeJavaDoc = ""
    )
  )
}
/*
* EndpointCharacteristic classes that are specific to Automodel for Java.
*/
/**
 * A negative characteristic for parameters of is-style boolean methods.
 *
 * It applies to endpoints that are not modeled sinks and whose enclosing callable has a name beginning with
 * `is` and a boolean return type (e.g. `isDirectory`). Such methods normally only perform checks and are
 * called before the method that does the dangerous/interesting thing, so the latter is what should be
 * modeled as the sink.
 *
 * TODO: this might filter too much, it's possible that methods with more than one parameter contain interesting sinks
 */
private class UnexploitableIsCharacteristic extends CharacteristicsImpl::NotASinkCharacteristic {
  UnexploitableIsCharacteristic() { this = "unexploitable (is-style boolean method)" }

  override predicate appliesToEndpoint(Endpoint e) {
    exists(Callable callee | callee = e.getEnclosingCallable() |
      callee.getReturnType() instanceof BooleanType and
      callee.getName().matches("is%")
    ) and
    not CandidatesImpl::isSink(e, _)
  }
}
/**
 * A negative characteristic for parameters of existence-checking boolean methods.
 *
 * It applies to endpoints that are not modeled sinks and whose enclosing callable has a boolean return type
 * and a name that equals `exists` or `notExists` when compared case-insensitively. Such methods normally only
 * perform checks and are called before the method that does the dangerous/interesting thing, so the latter is
 * what should be modeled as the sink.
 */
private class UnexploitableExistsCharacteristic extends CharacteristicsImpl::NotASinkCharacteristic {
  UnexploitableExistsCharacteristic() { this = "unexploitable (existence-checking boolean method)" }

  override predicate appliesToEndpoint(Endpoint e) {
    exists(Callable callee | callee = e.getEnclosingCallable() |
      callee.getReturnType() instanceof BooleanType and
      callee.getName().toLowerCase() = ["exists", "notexists"]
    ) and
    not CandidatesImpl::isSink(e, _)
  }
}
/**
 * A negative characteristic for parameters of callables declared in a throwable type: the declaring type of
 * the enclosing callable is `Throwable` or has it as a (transitive) supertype. Such parameters are not sinks.
 */
private class ExceptionCharacteristic extends CharacteristicsImpl::NotASinkCharacteristic {
  ExceptionCharacteristic() { this = "exception" }

  override predicate appliesToEndpoint(Endpoint e) {
    exists(TypeThrowable throwable |
      throwable = e.getEnclosingCallable().getDeclaringType().getASupertype*()
    )
  }
}
/**
 * A negative characteristic for endpoints located in a test file.
 *
 * WARNING: These endpoints should not be used as negative samples for training, because there can in fact be sinks in
 * test files -- we just don't care to model them because they aren't exploitable.
 */
private class TestFileCharacteristic extends CharacteristicsImpl::LikelyNotASinkCharacteristic {
  TestFileCharacteristic() { this = "test file" }

  override predicate appliesToEndpoint(Endpoint e) {
    this.isInTestFile(e.getLocation().getFile())
  }

  /** Holds if the absolute path of `file` matches one of the known test-directory patterns. */
  private predicate isInTestFile(File file) {
    file.getAbsolutePath().matches(["%src/test/%", "%/guava-tests/%", "%/guava-testlib/%"])
  }
}
/**
 * A characteristic that filters out parameters of callables without Javadoc. The assumption is that methods
 * that are intended / likely to be called from outside the package are documented.
 *
 * Note that in practice we have seen some interesting sinks in methods that are external-facing but undocumented (and
 * appear in empty Javadoc pages), so this filter can be expected to lead to the loss of some interesting sinks.
 */
private class UndocumentedMethodCharacteristic extends CharacteristicsImpl::UninterestingToModelCharacteristic
{
  UndocumentedMethodCharacteristic() { this = "undocumented method" }

  override predicate appliesToEndpoint(Endpoint e) {
    not exists(Documentable callee |
      callee = e.getEnclosingCallable() and
      exists(callee.getJavadoc())
    )
  }
}
/**
 * A characteristic that filters out parameters of non-public callables. Non-public methods are not
 * interesting to include in the standard Java modeling, because they cannot be called from outside the package.
 */
private class NonPublicMethodCharacteristic extends CharacteristicsImpl::UninterestingToModelCharacteristic
{
  NonPublicMethodCharacteristic() { this = "non-public method" }

  override predicate appliesToEndpoint(Endpoint e) {
    not exists(Callable callee | callee = e.getEnclosingCallable() and callee.isPublic())
  }
}
/**
 * Holds if `endpoint` has a self-contradictory combination of characteristics, which indicates an error in
 * our endpoint characteristics. `characteristic` is one of the offending characteristics, `endpointType` and
 * `confidence` are its implication for the endpoint, and `errorMessage` explains why the combination is
 * problematic.
 *
 * With `ignoreKnownModelingErrors = true`, only the "multiple classes" error is reported, and only for pairs
 * that are not listed in `knownOverlappingCharacteristics`.
 *
 * Copied from
 * javascript/ql/experimental/adaptivethreatmodeling/test/endpoint_large_scale/ContradictoryEndpointCharacteristics.ql
 */
predicate erroneousEndpoints(
  Endpoint endpoint, EndpointCharacteristic characteristic,
  AutomodelEndpointTypes::EndpointType endpointType, float confidence, string errorMessage,
  boolean ignoreKnownModelingErrors
) {
  // Case 1: positive indicators above medium confidence for two different sink types (or two different source
  // types), where the negative type counts as a sink type.
  errorMessage = "Endpoint has high-confidence positive indicators for multiple classes" and
  exists(
    EndpointCharacteristic otherCharacteristic, AutomodelEndpointTypes::EndpointType otherType,
    float otherConfidence
  |
    // Both characteristics apply to the endpoint and are positive indicators above medium confidence.
    characteristic.appliesToEndpoint(endpoint) and
    characteristic.hasImplications(endpointType, true, confidence) and
    confidence > SharedCharacteristics::mediumConfidence() and
    otherCharacteristic.appliesToEndpoint(endpoint) and
    otherCharacteristic.hasImplications(otherType, true, otherConfidence) and
    otherConfidence > SharedCharacteristics::mediumConfidence() and
    // They point at different types of the same family.
    endpointType != otherType and
    (
      endpointType instanceof AutomodelEndpointTypes::SinkType and
      otherType instanceof AutomodelEndpointTypes::SinkType
      or
      endpointType instanceof AutomodelEndpointTypes::SourceType and
      otherType instanceof AutomodelEndpointTypes::SourceType
    ) and
    // Optionally suppress the pairs that are known to overlap.
    (
      ignoreKnownModelingErrors = false
      or
      ignoreKnownModelingErrors = true and
      not knownOverlappingCharacteristics(characteristic, otherCharacteristic)
    )
  )
  or
  // Case 2: a positive and a negative indicator, both above medium confidence, for the very same type.
  errorMessage = "Endpoint has high-confidence positive and negative indicators for the same class" and
  ignoreKnownModelingErrors = false and
  exists(EndpointCharacteristic contradicting, float contradictingConfidence |
    characteristic.appliesToEndpoint(endpoint) and
    characteristic.hasImplications(endpointType, true, confidence) and
    confidence > SharedCharacteristics::mediumConfidence() and
    contradicting.appliesToEndpoint(endpoint) and
    contradicting.hasImplications(endpointType, false, contradictingConfidence) and
    contradictingConfidence > SharedCharacteristics::mediumConfidence()
  )
}
/**
 * Holds if `characteristic1` and `characteristic2` are two distinct members of the list of positive
 * characteristics that are currently known to overlap in their results. Such overlap indicates a problem with
 * the underlying Java modeling. Specifically, `PathCreation` is prone to FPs.
 */
private predicate knownOverlappingCharacteristics(
  EndpointCharacteristic characteristic1, EndpointCharacteristic characteristic2
) {
  characteristic1 = ["mad taint step", "create path", "read file", "known non-sink"] and
  characteristic2 = ["mad taint step", "create path", "read file", "known non-sink"] and
  not characteristic1 = characteristic2
}

View File

@@ -0,0 +1,60 @@
/**
* For internal use only.
*
* Defines the set of classes that endpoint scoring models can predict. Endpoint scoring models must
* only predict classes defined within this file. This file is the source of truth for the string
* representation (the models-as-data kind name) of each of these classes.
*/
/** An endpoint type (class) that a classifier can predict, represented by its string name. */
abstract class EndpointType extends string {
  /**
   * Accepts any string; each concrete subclass pins the string to the name of its sink / source type.
   */
  bindingset[this]
  EndpointType() { any() }

  /**
   * Gets the name of the sink/source kind for this endpoint type as used in models-as-data, which is the
   * string value of this type itself.
   *
   * See https://github.com/github/codeql/blob/44213f0144fdd54bb679ca48d68b28dcf820f7a8/java/ql/lib/semmle/code/java/dataflow/ExternalFlow.qll#LL353C11-L357C31
   */
  final string getKind() { result = this }
}
/** An endpoint type for sinks that a classifier can predict. */
abstract class SinkType extends EndpointType {
  /** Accepts any string; concrete subclasses pin the string to a sink type name. */
  bindingset[this]
  SinkType() { any() }
}
/** An endpoint type for sources that a classifier can predict. */
abstract class SourceType extends EndpointType {
  /** Accepts any string; concrete subclasses pin the string to a source type name. */
  bindingset[this]
  SourceType() { any() }
}
/** The negative sink type, named `non-sink`, which denotes that an endpoint is not a sink. */
class NegativeSinkType extends SinkType {
NegativeSinkType() { this = "non-sink" }
}
/** The sink type, named `sql`, for sinks relevant to the SQL injection query. */
class SqlSinkType extends SinkType {
SqlSinkType() { this = "sql" }
}
/** The sink type, named `tainted-path`, for sinks relevant to the tainted path injection query. */
class TaintedPathSinkType extends SinkType {
TaintedPathSinkType() { this = "tainted-path" }
}
/** The sink type, named `ssrf`, for sinks relevant to the SSRF (server-side request forgery) query. */
class RequestForgerySinkType extends SinkType {
RequestForgerySinkType() { this = "ssrf" }
}
/** The sink type, named `command-injection`, for sinks relevant to the command injection query. */
class CommandInjectionSinkType extends SinkType {
CommandInjectionSinkType() { this = "command-injection" }
}

View File

@@ -0,0 +1,39 @@
/**
* Surfaces the endpoints that pass the endpoint filters and are not already known to be sinks, and are therefore used
* as candidates for classification with an ML model.
*
* Note: This query does not actually classify the endpoints using the model.
*
* @name Automodel candidates
* @description A query to extract automodel candidates.
* @kind problem
* @severity info
* @id java/ml-powered/extract-automodel-candidates
* @tags automodel extract candidates
*/
import AutomodelEndpointCharacteristics
from Endpoint candidate, string message
where
  // A node that is already modeled as a MaD sink is never a candidate. Otherwise we might surface it as a
  // candidate for query A while the model labels it with a sink type of query B, for which it is already a
  // known sink, producing overlap between detected sinks and the pre-existing modeling. We assume that a
  // sink that has already been modeled in MaD belongs to no additional sink types and need not be reexamined.
  not CharacteristicsImpl::isSink(candidate, _) and
  // Skip endpoints that are uninteresting to model.
  not exists(CharacteristicsImpl::UninterestingToModelCharacteristic uninteresting |
    uninteresting.appliesToEndpoint(candidate)
  ) and
  exists(string sinkTypes, string metadata |
    // All sink types for which this endpoint is known neither to be a sink nor to be a non-sink; `strictconcat`
    // makes sure that only endpoints with at least one such sink type are surfaced.
    sinkTypes =
      strictconcat(AutomodelEndpointTypes::SinkType sinkType |
        CharacteristicsImpl::isSinkCandidate(candidate, sinkType) and
        not CharacteristicsImpl::isKnownSink(candidate, sinkType)
      |
        sinkType + ", "
      ) and
    // The metadata needed for this endpoint.
    CharacteristicsImpl::hasMetadata(candidate, metadata) and
    message = sinkTypes + "\n" + metadata
  )
select candidate, message

View File

@@ -0,0 +1,36 @@
/**
* Surfaces endpoints that are non-sinks with high confidence, for use as negative examples in the prompt.
*
* @name Negative examples (experimental)
* @kind problem
* @severity info
* @id java/ml-powered/non-sink
* @tags automodel extract negative-examples
*/
import AutomodelEndpointCharacteristics
import AutomodelEndpointTypes
from Endpoint endpoint, EndpointCharacteristic characteristic, float confidence, string message
where
  // The characteristic applies and marks the endpoint as a non-sink with at least high confidence.
  characteristic.hasImplications(any(NegativeSinkType negative), true, confidence) and
  confidence >= SharedCharacteristics::highConfidence() and
  characteristic.appliesToEndpoint(endpoint) and
  // Only examples we are highly certain about belong in the prompt, so endpoints with contradictory
  // endpoint characteristics are dropped.
  not erroneousEndpoints(endpoint, _, _, _, _, false) and
  // A node may satisfy the logic for both `isSink` and `isSanitizer`; the actual query then treats it as a
  // sanitizer, since the final logic is something like `isSink(n) and not isSanitizer(n)`. Such nodes are
  // ambiguous and might confuse the model, so every known sink is explicitly excluded from the negative examples.
  not exists(EndpointCharacteristic sinkIndicator, float sinkConfidence, SinkType positiveType |
    sinkIndicator.appliesToEndpoint(endpoint) and
    sinkIndicator.hasImplications(positiveType, true, sinkConfidence) and
    sinkConfidence >= SharedCharacteristics::maximalConfidence() and
    not positiveType instanceof NegativeSinkType
  ) and
  // The message is the characteristic name followed by the metadata needed for this endpoint.
  exists(string metadata | CharacteristicsImpl::hasMetadata(endpoint, metadata) |
    message = characteristic + "\n" + metadata
  )
select endpoint, message

View File

@@ -0,0 +1,43 @@
/**
* Surfaces endpoints that are sinks with high confidence, for use as positive examples in the prompt.
*
* @name Positive examples (experimental)
* @kind problem
* @severity info
* @id java/ml-powered/known-sink
* @tags automodel extract positive-examples
*/
private import java
private import semmle.code.java.security.ExternalAPIs as ExternalAPIs
private import AutomodelEndpointCharacteristics
private import AutomodelEndpointTypes
// private import experimental.adaptivethreatmodeling.ATMConfigs // To import the configurations of all supported Java queries
/*
* ****** WARNING: ******
* Before calling this query, make sure there's no codex-generated data extension file in `java/ql/lib/ext`. Otherwise,
* the ML-generated, noisy sinks will end up polluting the positive examples used in the prompt!
*/
from Endpoint sink, SinkType sinkType, string message
where
  // Positive examples are the known sinks of the existing ATM query configurations.
  CharacteristicsImpl::isKnownSink(sink, sinkType) and
  // Only examples we are highly certain about belong in the prompt, so endpoints with contradictory
  // endpoint characteristics are dropped.
  not erroneousEndpoints(sink, _, _, _, _, false) and
  (
    // If there are _any_ erroneous endpoints, every row carries an error message. This prevents us from
    // accidentally running this query when there's a codex-generated data extension file in `java/ql/lib/ext`.
    erroneousEndpoints(_, _, _, _, _, true) and
    message =
      "Error: There are erroneous endpoints! Please check whether there's a codex-generated data extension file in `java/ql/lib/ext`."
    or
    // Otherwise the message is the sink type followed by the metadata needed for this endpoint.
    not erroneousEndpoints(_, _, _, _, _, true) and
    exists(string metadata | CharacteristicsImpl::hasMetadata(sink, metadata) |
      message = sinkType + "\n" + metadata
    )
  )
select sink, message

View File

@@ -0,0 +1,259 @@
/** Gets the confidence value (1.0) of indicators that are certain, e.g. those of `SinkCharacteristic`. */
float maximalConfidence() { result = 1.0 }
/** Gets the confidence value (0.9) of high-confidence indicators, e.g. those of `NotASinkCharacteristic`. */
float highConfidence() { result = 0.9 }
/** Gets the confidence value (0.6) of medium-confidence indicators, e.g. those of `LikelyNotASinkCharacteristic`. */
float mediumConfidence() { result = 0.6 }
/**
* The language-specific inputs that `SharedCharacteristics` is parameterized over.
*/
signature module CandidateSig {
/** The class of endpoints that characteristics are applied to. */
class Endpoint;
/** The class of endpoint types (sink / source classes) that characteristics can imply. */
class EndpointType;
/**
* Gets a string describing the location of `e`. `IsExternalCharacteristic` treats endpoints for which this
* has no result as external.
*/
string getLocationString(Endpoint e);
/**
* Defines what labels are known, and what endpoint type they correspond to.
*
* `humanReadableLabel` is used as the name of the characteristic created for `label`.
*/
predicate isKnownLabel(string label, string humanReadableLabel, EndpointType type);
/**
* EndpointType must have a 'negative' type that denotes the absence of any sink.
* This predicate should hold for that type, and that type only.
*/
predicate isNegative(EndpointType t);
/**
* Should hold for any endpoint that is a sink of the given (known or unknown) label.
*/
predicate isSink(Endpoint e, string label);
/**
* Should hold for any endpoint that is known to not be any sink.
*/
predicate isNeutral(Endpoint e);
/**
* Holds if `e` has the given metadata.
*
* This is a helper function to extract and export needed information about each endpoint in the sink candidate query
* as well as the queries that extract positive and negative examples for the prompt / training set. The metadata is
* extracted as a string in the format of a Python dictionary.
*/
predicate hasMetadata(Endpoint e, string metadata);
}
/**
 * The language-independent part of the endpoint characteristics: the characteristic base classes, plus the
 * predicates that derive known sinks and sink candidates from them. Everything language-specific is supplied
 * by the `Candidate` parameter.
 */
module SharedCharacteristics<CandidateSig Candidate> {
  /** Holds if `e` is the negative endpoint type according to `Candidate`. */
  predicate isNegative(Candidate::EndpointType e) { Candidate::isNegative(e) }

  /** Holds if `Candidate` reports `e` as a sink with the given `label`. */
  predicate isSink(Candidate::Endpoint e, string label) { Candidate::isSink(e, label) }

  /** Holds if `Candidate` reports `e` as neutral. */
  predicate isNeutral(Candidate::Endpoint e) { Candidate::isNeutral(e) }

  /**
   * Holds if `sink` is a known sink of type `endpointType`, i.e. `endpointType` is not the negative type and
   * some characteristic of `sink` is a positive indicator for it with maximal confidence.
   */
  predicate isKnownSink(Candidate::Endpoint sink, Candidate::EndpointType endpointType) {
    exists(EndpointCharacteristic positive |
      positive.appliesToEndpoint(sink) and
      positive.hasImplications(endpointType, true, maximalConfidence())
    ) and
    not isNegative(endpointType)
  }

  /**
   * Holds if `candidateSink` should be considered as a possible sink of type `sinkType`, and classified by
   * the ML model. A candidate sink is a node for which no characteristic gives a reason to exclude it from
   * `sinkType`.
   */
  predicate isSinkCandidate(Candidate::Endpoint candidateSink, Candidate::EndpointType sinkType) {
    not (
      isNegative(sinkType)
      or
      exists(getAReasonSinkExcluded(candidateSink, sinkType))
    )
  }

  /** Holds if `Candidate` reports `metadata` as the metadata of `n`. */
  predicate hasMetadata(Candidate::Endpoint n, string metadata) {
    Candidate::hasMetadata(n, metadata)
  }

  /**
   * Gets a characteristic that causes `candidateSink` to be excluded as an effective sink for the (non-negative)
   * sink type `sinkType`.
   */
  EndpointCharacteristic getAReasonSinkExcluded(
    Candidate::Endpoint candidateSink, Candidate::EndpointType sinkType
  ) {
    // An endpoint stays a sink candidate only if none of its characteristics says much about whether it is a sink.
    result.appliesToEndpoint(candidateSink) and
    not isNegative(sinkType) and
    exists(float confidence | confidence >= mediumConfidence() |
      // The characteristic implies that the endpoint is not a sink for _any_ sink type ...
      result.hasImplications(any(Candidate::EndpointType t | isNegative(t)), true, confidence)
      or
      // ... or that it is not a sink for _this particular_ sink type.
      result.hasImplications(sinkType, false, confidence)
    )
  }

  /**
   * A characteristic that an endpoint might have. The set of characteristics of an endpoint is used to decide
   * whether to include the endpoint in the training set and with what label, as well as whether to score the
   * endpoint at inference time.
   */
  abstract class EndpointCharacteristic extends string {
    /**
     * Accepts any string. The string is the name of the characteristic, which should describe some property of
     * the endpoint that is meaningful for determining whether it's a sink and if so of which type.
     */
    bindingset[this]
    EndpointCharacteristic() { any() }

    /**
     * Holds for the endpoints that have this characteristic. This predicate contains the logic that assigns
     * the characteristic to the appropriate set of endpoints.
     */
    abstract predicate appliesToEndpoint(Candidate::Endpoint n);

    /**
     * Describes what this characteristic tells us about an endpoint.
     *
     * Params:
     * endpointType: The sink/source type.
     * isPositiveIndicator: If true, this characteristic indicates that this endpoint _is_ a member of the class; if
     * false, it indicates that it _isn't_ a member of the class.
     * confidence: A float in [0, 1], which tells us how strong an indicator this characteristic is for the endpoint
     * belonging / not belonging to the given class. A confidence near zero means this characteristic is a very weak
     * indicator of whether or not the endpoint belongs to the class. A confidence of 1 means that all endpoints with
     * this characteristic definitively do/don't belong to the class.
     */
    abstract predicate hasImplications(
      Candidate::EndpointType endpointType, boolean isPositiveIndicator, float confidence
    );

    /** Indicators with confidence at or above this threshold are considered to be high-confidence indicators. */
    final float getHighConfidenceThreshold() { result = 0.8 }
  }

  /**
   * A characteristic that indicates, with maximal confidence, that an endpoint is a sink of the type given by
   * `getSinkType`. These endpoints can be used as positive samples for training or for a few-shot prompt.
   */
  abstract class SinkCharacteristic extends EndpointCharacteristic {
    bindingset[this]
    SinkCharacteristic() { any() }

    /** Gets the sink type that this characteristic indicates. */
    abstract Candidate::EndpointType getSinkType();

    final override predicate hasImplications(
      Candidate::EndpointType endpointType, boolean isPositiveIndicator, float confidence
    ) {
      confidence = maximalConfidence() and
      isPositiveIndicator = true and
      endpointType = this.getSinkType()
    }
  }

  /**
   * Endpoints identified as sinks by the MaD modeling are sinks with maximal confidence.
   *
   * There is one such characteristic per known label; it is named after the label's human-readable form.
   */
  private class KnownSinkCharacteristic extends SinkCharacteristic {
    string madLabel;
    Candidate::EndpointType endpointType;

    KnownSinkCharacteristic() { Candidate::isKnownLabel(madLabel, this, endpointType) }

    override predicate appliesToEndpoint(Candidate::Endpoint e) {
      Candidate::isSink(e, madLabel)
    }

    override Candidate::EndpointType getSinkType() { result = endpointType }
  }

  /**
   * A high-confidence characteristic that indicates that an endpoint is not a sink of any type. These endpoints
   * can be used as negative samples for training or for a few-shot prompt.
   */
  abstract class NotASinkCharacteristic extends EndpointCharacteristic {
    bindingset[this]
    NotASinkCharacteristic() { any() }

    override predicate hasImplications(
      Candidate::EndpointType endpointType, boolean isPositiveIndicator, float confidence
    ) {
      confidence = highConfidence() and
      isPositiveIndicator = true and
      Candidate::isNegative(endpointType)
    }
  }

  /**
   * A negative characteristic that indicates that an endpoint is not part of the source code for the project
   * being analyzed; it applies to endpoints for which `Candidate::getLocationString` has no result.
   *
   * WARNING: These endpoints should not be used as negative samples for training, because they are not necessarily
   * non-sinks. They are merely not interesting sinks to run through the ML model.
   */
  private class IsExternalCharacteristic extends LikelyNotASinkCharacteristic {
    IsExternalCharacteristic() { this = "external" }

    override predicate appliesToEndpoint(Candidate::Endpoint e) {
      not exists(Candidate::getLocationString(e))
    }
  }

  /**
   * A negative characteristic that indicates that an endpoint was manually modeled as a neutral model.
   *
   * TODO: It may be necessary to turn this into a LikelyNotASinkCharacteristic, pending answers to the definition of a
   * neutral model (https://github.com/github/codeql-java-team/issues/254#issuecomment-1435309148).
   */
  private class NeutralModelCharacteristic extends NotASinkCharacteristic {
    NeutralModelCharacteristic() { this = "known non-sink" }

    override predicate appliesToEndpoint(Candidate::Endpoint e) { Candidate::isNeutral(e) }
  }

  /**
   * A medium-confidence characteristic that indicates that an endpoint is unlikely to be a sink of any type.
   * These endpoints can be excluded from scoring at inference time, both to save time and to avoid false
   * positives. They should not, however, be used as negative samples for training or for a few-shot prompt,
   * because they may include a small number of sinks.
   */
  abstract class LikelyNotASinkCharacteristic extends EndpointCharacteristic {
    bindingset[this]
    LikelyNotASinkCharacteristic() { any() }

    override predicate hasImplications(
      Candidate::EndpointType endpointType, boolean isPositiveIndicator, float confidence
    ) {
      confidence = mediumConfidence() and
      isPositiveIndicator = true and
      Candidate::isNegative(endpointType)
    }
  }

  /**
   * A characteristic that indicates not necessarily that an endpoint is not a sink, but rather that it is not a
   * sink that's interesting to model in the standard Java libraries. These filters should be removed when
   * extracting sink candidates within a user's codebase for customized modeling.
   *
   * These endpoints should not be used as negative samples for training or for a few-shot prompt, because they are not
   * necessarily non-sinks.
   */
  abstract class UninterestingToModelCharacteristic extends EndpointCharacteristic {
    bindingset[this]
    UninterestingToModelCharacteristic() { any() }

    override predicate hasImplications(
      Candidate::EndpointType endpointType, boolean isPositiveIndicator, float confidence
    ) {
      confidence = mediumConfidence() and
      isPositiveIndicator = true and
      Candidate::isNegative(endpointType)
    }
  }
}