mirror of
https://github.com/github/codeql.git
synced 2026-04-24 00:05:14 +02:00
Automodel extraction queries in java telemetry query directory
This commit is contained in:
320
java/ql/src/Telemetry/AutomodelEndpointCharacteristics.qll
Normal file
320
java/ql/src/Telemetry/AutomodelEndpointCharacteristics.qll
Normal file
@@ -0,0 +1,320 @@
|
||||
/**
|
||||
* For internal use only.
|
||||
*/
|
||||
|
||||
private import java
|
||||
private import semmle.code.java.dataflow.DataFlow
|
||||
private import semmle.code.java.dataflow.TaintTracking
|
||||
private import semmle.code.java.security.PathCreation
|
||||
private import semmle.code.java.dataflow.ExternalFlow as ExternalFlow
|
||||
private import semmle.code.java.dataflow.internal.FlowSummaryImpl as FlowSummaryImpl
|
||||
private import semmle.code.java.security.ExternalAPIs as ExternalAPIs
|
||||
private import semmle.code.java.Expr as Expr
|
||||
private import semmle.code.java.security.QueryInjection
|
||||
private import semmle.code.java.security.RequestForgery
|
||||
import AutomodelSharedCharacteristics as SharedCharacteristics
|
||||
import AutomodelEndpointTypes as AutomodelEndpointTypes
|
||||
|
||||
/**
 * The Java implementation of `SharedCharacteristics::CandidateSig`.
 *
 * Endpoints are data-flow parameter nodes; known sinks and neutrals are looked up in the
 * models-as-data (MaD) predicates of `ExternalFlow`.
 */
module CandidatesImpl implements SharedCharacteristics::CandidateSig {
  /** An endpoint considered by the automodel queries: a parameter node. */
  class Endpoint = DataFlow::ParameterNode;

  /** The endpoint types that an endpoint can be classified as. */
  class EndpointType = AutomodelEndpointTypes::EndpointType;

  /** Holds if `t` is the negative endpoint type (`NegativeSinkType`). */
  predicate isNegative(AutomodelEndpointTypes::EndpointType t) {
    t instanceof AutomodelEndpointTypes::NegativeSinkType
  }

  /** Gets the string form of the location of `e`. */
  string getLocationString(Endpoint e) { result = e.getLocation().toString() }

  /**
   * Holds if `label` is a known sink label whose human-readable name is `humanReadableLabel`
   * and which corresponds to the endpoint type `type`.
   */
  predicate isKnownLabel(string label, string humanReadableLabel, EndpointType type) {
    label = "read-file" and
    humanReadableLabel = "read file" and
    type instanceof AutomodelEndpointTypes::TaintedPathSinkType
    or
    label = "create-file" and
    humanReadableLabel = "create file" and
    type instanceof AutomodelEndpointTypes::TaintedPathSinkType
    or
    label = "sql" and
    humanReadableLabel = "mad modeled sql" and
    type instanceof AutomodelEndpointTypes::SqlSinkType
    or
    label = "open-url" and
    humanReadableLabel = "open url" and
    type instanceof AutomodelEndpointTypes::RequestForgerySinkType
    or
    label = "jdbc-url" and
    humanReadableLabel = "jdbc url" and
    type instanceof AutomodelEndpointTypes::RequestForgerySinkType
    or
    label = "command-injection" and
    humanReadableLabel = "command injection" and
    type instanceof AutomodelEndpointTypes::CommandInjectionSinkType
  }

  /**
   * Holds if there is a MaD sink model with label `label` that matches the specification
   * computed for `e` by `sinkSpec`. Models with an empty signature also match.
   */
  predicate isSink(Endpoint e, string label) {
    exists(
      string package, string type, boolean subtypes, string name, string signature, string ext,
      string input
    |
      sinkSpec(e, package, type, subtypes, name, signature, ext, input) and
      ExternalFlow::sinkModel(package, type, subtypes, name, [signature, ""], ext, input, label, _)
    )
  }

  /**
   * Holds if there is a MaD neutral model that matches the callable enclosing `e`.
   * Models with an empty signature also match.
   */
  predicate isNeutral(Endpoint e) {
    exists(string package, string type, string name, string signature |
      sinkSpec(e, package, type, _, name, signature, _, _) and
      ExternalFlow::neutralModel(package, type, name, [signature, ""], _)
    )
  }

  /**
   * Holds if the remaining columns form the MaD-style specification of `e`, derived from the
   * callable that encloses `e` and the index of the parameter that `e` represents.
   *
   * `subtypes` is always `false` and `ext` is always the empty string.
   */
  additional predicate sinkSpec(
    Endpoint e, string package, string type, boolean subtypes, string name, string signature,
    string ext, string input
  ) {
    package = e.getEnclosingCallable().getDeclaringType().getPackage().toString() and
    type = e.getEnclosingCallable().getDeclaringType().getName() and
    subtypes = false and
    name = e.getEnclosingCallable().getName() and
    signature = ExternalFlow::paramsString(e.getEnclosingCallable()) and
    ext = "" and
    exists(int paramIdx | e.isParameterOf(_, paramIdx) | input = "Argument[" + paramIdx + "]")
  }

  /**
   * Holds if `metadata` is the dictionary-style string describing `n`.
   *
   * The individual fields come from the file-level `hasMetadata` predicate. `Subtypes` is the
   * negation of the callable's finality, `Ext` is always empty, and `Provenance` is always
   * `ai-generated`.
   */
  predicate hasMetadata(Endpoint n, string metadata) {
    exists(
      string package, string type, boolean subtypes, string name, string signature, string ext,
      int input, string provenance, boolean isPublic, boolean isFinal, string calleeJavaDoc
    |
      hasMetadata(n, package, type, name, signature, input, isFinal, isPublic, calleeJavaDoc) and
      (if isFinal = true then subtypes = false else subtypes = true) and
      ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
      provenance = "ai-generated" and
      metadata =
        "{" //
          + "'Package': '" + package //
          + "', 'Type': '" + type //
          + "', 'Subtypes': " + subtypes //
          + ", 'Name': '" + name //
          + "', 'Signature': '" + signature //
          + "', 'Ext': '" + ext //
          + "', 'Argument index': " + input //
          + ", 'Provenance': '" + provenance //
          + "', 'Is public': " + isPublic //
          // `isPublic` is emitted unquoted, so the next separator must not start with a quote.
          + ", 'Callee JavaDoc': '" + calleeJavaDoc.replaceAll("'", "\"") //
          + "'}" // TODO: Why are the curly braces added twice?
    )
  }
}
|
||||
|
||||
/** The shared characteristics module instantiated with the Java-specific `CandidatesImpl`. */
module CharacteristicsImpl = SharedCharacteristics::SharedCharacteristics<CandidatesImpl>;
|
||||
|
||||
/** An endpoint characteristic, as defined by `CharacteristicsImpl`. */
class EndpointCharacteristic = CharacteristicsImpl::EndpointCharacteristic;
|
||||
|
||||
/** An endpoint, as defined by `CandidatesImpl` (a data-flow parameter node). */
class Endpoint = CandidatesImpl::Endpoint;
|
||||
|
||||
/*
|
||||
* Predicates that are used to surface prompt examples and candidates for classification with an ML model.
|
||||
*/
|
||||
|
||||
/**
 * Holds if `n` is the parameter at index `input` of some callable, and the remaining columns
 * describe that callable.
 *
 * - `package` and `type` are the package name and the nested name of the erasure of the declaring type.
 * - `name` is the name of the callable's source declaration; `signature` is its parameter string.
 * - `isFinal` is `true` if the callable or its declaring type is final, and `false` otherwise.
 * - `isPublic` is `true` if the callable is public, and `false` otherwise.
 * - `calleeJavaDoc` is the string form of the callable's Javadoc, or the empty string if it has none.
 *
 * This helper extracts the information that is exported for each endpoint.
 */
predicate hasMetadata(
  Endpoint n, string package, string type, string name, string signature, int input,
  boolean isFinal, boolean isPublic, string calleeJavaDoc
) {
  exists(Callable callable, RefType declaringType |
    callable.getParameter(input) = n.asParameter() and
    declaringType = callable.getDeclaringType() and
    package = declaringType.getPackage().getName() and
    type = declaringType.getErasure().(RefType).nestedName() and
    name = callable.getSourceDeclaration().getName() and
    signature = ExternalFlow::paramsString(callable) and // TODO: Why are brackets being escaped (`\[\]` vs `[]`)?
    (
      isFinal = true and
      (callable.isFinal() or declaringType.isFinal())
      or
      isFinal = false and
      not callable.isFinal() and
      not declaringType.isFinal()
    ) and
    (
      isPublic = true and callable.isPublic()
      or
      isPublic = false and not callable.isPublic()
    ) and
    (
      calleeJavaDoc = callable.(Documentable).getJavadoc().toString()
      or
      not exists(callable.(Documentable).getJavadoc()) and
      calleeJavaDoc = ""
    )
  )
}
|
||||
|
||||
/*
|
||||
* EndpointCharacteristic classes that are specific to Automodel for Java.
|
||||
*/
|
||||
|
||||
/**
 * A negative characteristic for parameters of is-style boolean methods.
 *
 * It applies to an endpoint that is not a known sink and whose enclosing callable has a name
 * starting with `is` and a boolean return type (e.g. `isDirectory`). Such methods normally only
 * perform a check that precedes the call doing the dangerous/interesting thing, and it is that
 * later call we want modeled as the sink.
 *
 * TODO: this might filter too much, it's possible that methods with more than one parameter contain interesting sinks
 */
private class UnexploitableIsCharacteristic extends CharacteristicsImpl::NotASinkCharacteristic {
  UnexploitableIsCharacteristic() { this = "unexploitable (is-style boolean method)" }

  override predicate appliesToEndpoint(Endpoint e) {
    exists(Callable callee | callee = e.getEnclosingCallable() |
      callee.getReturnType() instanceof BooleanType and
      callee.getName().matches("is%")
    ) and
    not CandidatesImpl::isSink(e, _)
  }
}
|
||||
|
||||
/**
 * A negative characteristic for parameters of existence-checking boolean methods.
 *
 * It applies to an endpoint that is not a known sink and whose enclosing callable has a boolean
 * return type and a name equal to `exists` or `notExists`, compared case-insensitively. Such
 * methods normally only perform a check that precedes the call doing the dangerous/interesting
 * thing, and it is that later call we want modeled as the sink.
 */
private class UnexploitableExistsCharacteristic extends CharacteristicsImpl::NotASinkCharacteristic {
  UnexploitableExistsCharacteristic() { this = "unexploitable (existence-checking boolean method)" }

  override predicate appliesToEndpoint(Endpoint e) {
    exists(Callable callee | callee = e.getEnclosingCallable() |
      callee.getReturnType() instanceof BooleanType and
      callee.getName().toLowerCase() = ["exists", "notexists"]
    ) and
    not CandidatesImpl::isSink(e, _)
  }
}
|
||||
|
||||
/**
 * A negative characteristic for parameters of callables declared on exception types, which are
 * not sinks.
 *
 * It applies when the declaring type of the enclosing callable is, or transitively extends,
 * `TypeThrowable`.
 */
private class ExceptionCharacteristic extends CharacteristicsImpl::NotASinkCharacteristic {
  ExceptionCharacteristic() { this = "exception" }

  override predicate appliesToEndpoint(Endpoint e) {
    exists(RefType declaringType | declaringType = e.getEnclosingCallable().getDeclaringType() |
      declaringType.getASupertype*() instanceof TypeThrowable
    )
  }
}
|
||||
|
||||
/**
 * A negative characteristic for endpoints located in a test file.
 *
 * WARNING: Do not use these endpoints as negative samples for training. Test files can contain
 * real sinks -- we just don't care to model them because they aren't exploitable.
 */
private class TestFileCharacteristic extends CharacteristicsImpl::LikelyNotASinkCharacteristic {
  TestFileCharacteristic() { this = "test file" }

  override predicate appliesToEndpoint(Endpoint e) {
    this.isInTestFile(e.getLocation().getFile())
  }

  /** Holds if the absolute path of `file` matches one of the known test-directory patterns. */
  private predicate isInTestFile(File file) {
    file.getAbsolutePath().matches(["%src/test/%", "%/guava-tests/%", "%/guava-testlib/%"])
  }
}
|
||||
|
||||
/**
 * A negative characteristic that filters out parameters of callables without Javadoc. The
 * assumption is that methods intended / likely to be called from outside the package are
 * documented.
 *
 * Note that in practice we have seen some interesting sinks in methods that are external-facing
 * but undocumented (and appear in empty Javadoc pages), so this filter can be expected to lose
 * some interesting sinks.
 */
private class UndocumentedMethodCharacteristic extends CharacteristicsImpl::UninterestingToModelCharacteristic
{
  UndocumentedMethodCharacteristic() { this = "undocumented method" }

  override predicate appliesToEndpoint(Endpoint e) {
    not exists(Javadoc doc | doc = e.getEnclosingCallable().(Documentable).getJavadoc())
  }
}
|
||||
|
||||
/**
 * A negative characteristic that filters out parameters of non-public callables. These are not
 * interesting for the standard Java modeling, because they cannot be called from outside the
 * package.
 */
private class NonPublicMethodCharacteristic extends CharacteristicsImpl::UninterestingToModelCharacteristic
{
  NonPublicMethodCharacteristic() { this = "non-public method" }

  override predicate appliesToEndpoint(Endpoint e) {
    not exists(Callable callee | callee = e.getEnclosingCallable() and callee.isPublic())
  }
}
|
||||
|
||||
/**
 * Holds if `endpoint` has a self-contradictory combination of characteristics, which indicates
 * an error in our endpoint characteristics.
 *
 * `characteristic` is a positive indicator for `endpointType` with confidence `confidence`
 * (strictly above the medium confidence) that takes part in the contradiction, and
 * `errorMessage` explains why the combination is problematic. When `ignoreKnownModelingErrors`
 * is `true`, pairs listed in `knownOverlappingCharacteristics` are not reported, and only the
 * "multiple classes" kind of contradiction is considered.
 *
 * Copied from
 *   javascript/ql/experimental/adaptivethreatmodeling/test/endpoint_large_scale/ContradictoryEndpointCharacteristics.ql
 */
predicate erroneousEndpoints(
  Endpoint endpoint, EndpointCharacteristic characteristic,
  AutomodelEndpointTypes::EndpointType endpointType, float confidence, string errorMessage,
  boolean ignoreKnownModelingErrors
) {
  // Both kinds of contradiction start from a confident positive indicator on this endpoint.
  characteristic.appliesToEndpoint(endpoint) and
  characteristic.hasImplications(endpointType, true, confidence) and
  confidence > SharedCharacteristics::mediumConfidence() and
  (
    // An endpoint should not have confident positive indicators for more than one sink/source
    // type (including the negative type).
    errorMessage = "Endpoint has high-confidence positive indicators for multiple classes" and
    exists(
      EndpointCharacteristic other, AutomodelEndpointTypes::EndpointType otherType,
      float otherConfidence
    |
      other.appliesToEndpoint(endpoint) and
      other.hasImplications(otherType, true, otherConfidence) and
      otherConfidence > SharedCharacteristics::mediumConfidence() and
      otherType != endpointType and
      (
        endpointType instanceof AutomodelEndpointTypes::SinkType and
        otherType instanceof AutomodelEndpointTypes::SinkType
        or
        endpointType instanceof AutomodelEndpointTypes::SourceType and
        otherType instanceof AutomodelEndpointTypes::SourceType
      ) and
      (
        ignoreKnownModelingErrors = false
        or
        ignoreKnownModelingErrors = true and
        not knownOverlappingCharacteristics(characteristic, other)
      )
    )
    or
    // An endpoint should not have a confident positive indicator and a confident negative
    // indicator for the same class.
    errorMessage = "Endpoint has high-confidence positive and negative indicators for the same class" and
    ignoreKnownModelingErrors = false and
    exists(EndpointCharacteristic other, float otherConfidence |
      other.appliesToEndpoint(endpoint) and
      other.hasImplications(endpointType, false, otherConfidence) and
      otherConfidence > SharedCharacteristics::mediumConfidence()
    )
  )
}
|
||||
|
||||
/**
 * Holds if `characteristic1` and `characteristic2` are two distinct characteristics from the
 * set of positive characteristics currently known to overlap in their results. Such overlap
 * indicates a problem with the underlying Java modeling. Specifically, `PathCreation` is prone
 * to FPs.
 */
private predicate knownOverlappingCharacteristics(
  EndpointCharacteristic characteristic1, EndpointCharacteristic characteristic2
) {
  characteristic1 = overlappingCharacteristicName() and
  characteristic2 = overlappingCharacteristicName() and
  characteristic1 != characteristic2
}

/** Gets the name of a characteristic that is known to overlap with the others in this set. */
private string overlappingCharacteristicName() {
  result = ["mad taint step", "create path", "read file", "known non-sink"]
}
|
||||
60
java/ql/src/Telemetry/AutomodelEndpointTypes.qll
Normal file
60
java/ql/src/Telemetry/AutomodelEndpointTypes.qll
Normal file
@@ -0,0 +1,60 @@
|
||||
/**
|
||||
* For internal use only.
|
||||
*
|
||||
* Defines the set of classes that endpoint scoring models can predict. Endpoint scoring models must
|
||||
* only predict classes defined within this file. This file is the source of truth for the integer
|
||||
* representation of each of these classes.
|
||||
*/
|
||||
|
||||
/** A class that a classifier can predict for an endpoint. */
abstract class EndpointType extends string {
  /**
   * Accepts any bound string; the concrete subclasses fix the string to the name of their
   * sink / source type.
   */
  bindingset[this]
  EndpointType() { any() }

  /**
   * Gets the sink/source kind of this endpoint type as used in models-as-data, which is the
   * string value of the type itself.
   *
   * See https://github.com/github/codeql/blob/44213f0144fdd54bb679ca48d68b28dcf820f7a8/java/ql/lib/semmle/code/java/dataflow/ExternalFlow.qll#LL353C11-L357C31
   */
  final string getKind() { result = this }
}
|
||||
|
||||
/** An endpoint type for sinks that a classifier can predict. */
abstract class SinkType extends EndpointType {
  bindingset[this]
  SinkType() { any() }
}
|
||||
|
||||
/** An endpoint type for sources that a classifier can predict. */
abstract class SourceType extends EndpointType {
  bindingset[this]
  SourceType() { any() }
}
|
||||
|
||||
/** The `Negative` sink type, which marks an endpoint as a non-sink. */
class NegativeSinkType extends SinkType {
  NegativeSinkType() { this = "non-sink" }
}
|
||||
|
||||
/** The sink type for sinks relevant to the SQL injection query. */
class SqlSinkType extends SinkType {
  SqlSinkType() { this = "sql" }
}
|
||||
|
||||
/** The sink type for sinks relevant to the tainted path injection query. */
class TaintedPathSinkType extends SinkType {
  TaintedPathSinkType() { this = "tainted-path" }
}
|
||||
|
||||
/** The sink type for sinks relevant to the SSRF query. */
class RequestForgerySinkType extends SinkType {
  RequestForgerySinkType() { this = "ssrf" }
}
|
||||
|
||||
/** The sink type for sinks relevant to the command injection query. */
class CommandInjectionSinkType extends SinkType {
  CommandInjectionSinkType() { this = "command-injection" }
}
|
||||
39
java/ql/src/Telemetry/AutomodelExtractCandidates.ql
Normal file
39
java/ql/src/Telemetry/AutomodelExtractCandidates.ql
Normal file
@@ -0,0 +1,39 @@
|
||||
/**
|
||||
* Surfaces the endpoints that pass the endpoint filters and are not already known to be sinks, and are therefore used
|
||||
* as candidates for classification with an ML model.
|
||||
*
|
||||
* Note: This query does not actually classify the endpoints using the model.
|
||||
*
|
||||
* @name Automodel candidates
|
||||
* @description A query to extract automodel candidates.
|
||||
* @kind problem
|
||||
* @severity info
|
||||
* @id java/ml-powered/extract-automodel-candidates
|
||||
* @tags automodel extract candidates
|
||||
*/
|
||||
|
||||
import AutomodelEndpointCharacteristics
|
||||
|
||||
from Endpoint sinkCandidate, string message
where
  // If a node is already modeled as a MaD sink for any of our existing ATM queries, we don't
  // include it as a candidate. Otherwise, we might include it as a candidate for query A, but
  // the model will label it as a sink for one of the sink types of query B, for which it's
  // already a known sink. This would result in overlap between our detected sinks and the
  // pre-existing modeling. We assume that a sink that has already been modeled in a MaD model
  // doesn't belong to any additional sink types, so we don't need to reexamine it.
  not CharacteristicsImpl::isSink(sinkCandidate, _) and
  // Skip endpoints that some characteristic marks as uninteresting to model.
  not any(CharacteristicsImpl::UninterestingToModelCharacteristic u)
      .appliesToEndpoint(sinkCandidate) and
  exists(string sinkTypes, string metadata |
    // The concatenation of all sink types for which this endpoint is known neither to be a sink
    // nor to be a non-sink. `strictconcat` ensures that we only surface endpoints that have at
    // least one such sink type.
    sinkTypes =
      strictconcat(AutomodelEndpointTypes::SinkType sinkType |
        CharacteristicsImpl::isSinkCandidate(sinkCandidate, sinkType) and
        not CharacteristicsImpl::isKnownSink(sinkCandidate, sinkType)
      |
        sinkType + ", "
      ) and
    // Extract the needed metadata for this endpoint.
    CharacteristicsImpl::hasMetadata(sinkCandidate, metadata) and
    message = sinkTypes + "\n" + metadata
  )
select sinkCandidate, message
|
||||
36
java/ql/src/Telemetry/AutomodelExtractNegativeExamples.ql
Normal file
36
java/ql/src/Telemetry/AutomodelExtractNegativeExamples.ql
Normal file
@@ -0,0 +1,36 @@
|
||||
/**
|
||||
* Surfaces endpoints that are non-sinks with high confidence, for use as negative examples in the prompt.
|
||||
*
|
||||
* @name Negative examples (experimental)
|
||||
* @kind problem
|
||||
* @severity info
|
||||
* @id java/ml-powered/non-sink
|
||||
* @tags automodel extract negative-examples
|
||||
*/
|
||||
|
||||
import AutomodelEndpointCharacteristics
|
||||
import AutomodelEndpointTypes
|
||||
|
||||
from Endpoint endpoint, EndpointCharacteristic characteristic, float confidence, string message
where
  // The characteristic is a high-confidence positive indicator for the negative sink type.
  exists(NegativeSinkType negative |
    characteristic.hasImplications(negative, true, confidence)
  ) and
  confidence >= SharedCharacteristics::highConfidence() and
  characteristic.appliesToEndpoint(endpoint) and
  // Exclude endpoints that have contradictory endpoint characteristics, because we only want
  // examples we're highly certain about in the prompt.
  not erroneousEndpoints(endpoint, _, _, _, _, false) and
  // It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that
  // case the actual query treats it as a sanitizer, since the final logic is something like
  // `isSink(n) and not isSanitizer(n)`. Such nodes are ambiguous and might confuse the model, so
  // we explicitly exclude all known sinks from the negative examples.
  not exists(EndpointCharacteristic positive, float positiveConfidence, SinkType positiveType |
    positive.appliesToEndpoint(endpoint) and
    positive.hasImplications(positiveType, true, positiveConfidence) and
    positiveConfidence >= SharedCharacteristics::maximalConfidence() and
    not positiveType instanceof NegativeSinkType
  ) and
  exists(string metadata |
    // Extract the needed metadata for this endpoint.
    CharacteristicsImpl::hasMetadata(endpoint, metadata) and
    message = characteristic + "\n" + metadata
  )
select endpoint, message
|
||||
43
java/ql/src/Telemetry/AutomodelExtractPositiveExamples.ql
Normal file
43
java/ql/src/Telemetry/AutomodelExtractPositiveExamples.ql
Normal file
@@ -0,0 +1,43 @@
|
||||
/**
|
||||
* Surfaces endpoints that are sinks with high confidence, for use as positive examples in the prompt.
|
||||
*
|
||||
* @name Positive examples (experimental)
|
||||
* @kind problem
|
||||
* @severity info
|
||||
* @id java/ml-powered/known-sink
|
||||
* @tags automodel extract positive-examples
|
||||
*/
|
||||
|
||||
private import java
|
||||
private import semmle.code.java.security.ExternalAPIs as ExternalAPIs
|
||||
private import AutomodelEndpointCharacteristics
|
||||
private import AutomodelEndpointTypes
|
||||
|
||||
// private import experimental.adaptivethreatmodeling.ATMConfigs // To import the configurations of all supported Java queries
|
||||
/*
|
||||
* ****** WARNING: ******
|
||||
* Before calling this query, make sure there's no codex-generated data extension file in `java/ql/lib/ext`. Otherwise,
|
||||
* the ML-generated, noisy sinks will end up polluting the positive examples used in the prompt!
|
||||
*/
|
||||
|
||||
from Endpoint sink, SinkType sinkType, string message
where
  // Extract positive examples of sinks belonging to the existing ATM query configurations.
  CharacteristicsImpl::isKnownSink(sink, sinkType) and
  // Exclude endpoints that have contradictory endpoint characteristics, because we only want
  // examples we're highly certain about in the prompt.
  not erroneousEndpoints(sink, _, _, _, _, false) and
  (
    // If there are _any_ erroneous endpoints, return an error message for all rows. This
    // prevents us from accidentally running this query when there's a codex-generated data
    // extension file in `java/ql/lib/ext`.
    if erroneousEndpoints(_, _, _, _, _, true)
    then
      message =
        "Error: There are erroneous endpoints! Please check whether there's a codex-generated data extension file in `java/ql/lib/ext`."
    else
      exists(string metadata |
        // Extract the needed metadata for this endpoint.
        CharacteristicsImpl::hasMetadata(sink, metadata) and
        message = sinkType + "\n" + metadata
      )
  )
select sink, message
|
||||
259
java/ql/src/Telemetry/AutomodelSharedCharacteristics.qll
Normal file
259
java/ql/src/Telemetry/AutomodelSharedCharacteristics.qll
Normal file
@@ -0,0 +1,259 @@
|
||||
/** Gets the maximal confidence value (1.0), assigned to known-sink characteristics in this file. */
float maximalConfidence() { result = 1.0 }
|
||||
|
||||
/** Gets the high confidence value (0.9), assigned to `NotASinkCharacteristic` in this file. */
float highConfidence() { result = 0.9 }
|
||||
|
||||
/** Gets the medium confidence value (0.6), used as the threshold when excluding sink candidates. */
float mediumConfidence() { result = 0.6 }
|
||||
|
||||
/**
 * The language-specific pieces that must be supplied in order to instantiate
 * `SharedCharacteristics`.
 */
signature module CandidateSig {
  /** An endpoint that can be examined as a potential sink. */
  class Endpoint;

  /** A type that an endpoint can be classified as. */
  class EndpointType;

  /** Gets a string describing the location of `e`. */
  string getLocationString(Endpoint e);

  /**
   * Holds if `label` is a known label with human-readable name `humanReadableLabel` that
   * corresponds to the endpoint type `type`.
   */
  predicate isKnownLabel(string label, string humanReadableLabel, EndpointType type);

  /**
   * Holds if `t` is the 'negative' endpoint type, which denotes the absence of any sink.
   * `EndpointType` must have such a type, and this predicate should hold for that type only.
   */
  predicate isNegative(EndpointType t);

  /**
   * Should hold for any endpoint `e` that is a sink of the given (known or unknown) `label`.
   */
  predicate isSink(Endpoint e, string label);

  /**
   * Should hold for any endpoint `e` that is known to not be any sink.
   */
  predicate isNeutral(Endpoint e);

  /**
   * Holds if `e` has the given `metadata`.
   *
   * This helper exports the information needed about each endpoint, both in the sink candidate
   * query and in the queries that extract positive and negative examples for the prompt /
   * training set. The metadata is a string in the format of a Python dictionary.
   */
  predicate hasMetadata(Endpoint e, string metadata);
}
|
||||
|
||||
module SharedCharacteristics<CandidateSig Candidate> {
|
||||
/** Holds if `e` is the negative endpoint type according to `Candidate::isNegative`. */
predicate isNegative(Candidate::EndpointType e) { Candidate::isNegative(e) }
|
||||
|
||||
/** Holds if `e` is a sink with the given `label` according to `Candidate::isSink`. */
predicate isSink(Candidate::Endpoint e, string label) { Candidate::isSink(e, label) }
|
||||
|
||||
/** Holds if `e` is neutral according to `Candidate::isNeutral`. */
predicate isNeutral(Candidate::Endpoint e) { Candidate::isNeutral(e) }
|
||||
|
||||
/**
 * Holds if `sink` is a known sink of type `endpointType`.
 *
 * This is the case when `endpointType` is not the negative type and some characteristic that
 * applies to `sink` is a positive indicator for `endpointType` with maximal confidence.
 */
predicate isKnownSink(Candidate::Endpoint sink, Candidate::EndpointType endpointType) {
  any(EndpointCharacteristic characteristic | characteristic.appliesToEndpoint(sink))
      .hasImplications(endpointType, true, maximalConfidence()) and
  not isNegative(endpointType)
}
|
||||
|
||||
/**
 * Holds if `candidateSink` should be considered as a possible sink of type `sinkType`, and
 * classified by the ML model.
 *
 * A candidate sink is a node for which no characteristic gives a reason to exclude it from the
 * (non-negative) `sinkType`.
 */
predicate isSinkCandidate(Candidate::Endpoint candidateSink, Candidate::EndpointType sinkType) {
  not exists(EndpointCharacteristic reason |
    reason = getAReasonSinkExcluded(candidateSink, sinkType)
  ) and
  not isNegative(sinkType)
}
|
||||
|
||||
/** Holds if `n` has the given `metadata` according to `Candidate::hasMetadata`. */
predicate hasMetadata(Candidate::Endpoint n, string metadata) { Candidate::hasMetadata(n, metadata) }
|
||||
|
||||
/**
 * Gets a characteristic that causes `candidateSink` to be excluded as an effective sink for the
 * (non-negative) sink type `sinkType`.
 *
 * An endpoint is a sink candidate only if none of its characteristics give much indication
 * whether or not it is a sink.
 */
EndpointCharacteristic getAReasonSinkExcluded(
  Candidate::Endpoint candidateSink, Candidate::EndpointType sinkType
) {
  result.appliesToEndpoint(candidateSink) and
  not isNegative(sinkType) and
  exists(float confidence | confidence >= mediumConfidence() |
    // The characteristic implies that the endpoint is not a sink for _any_ sink type.
    exists(Candidate::EndpointType negativeType | isNegative(negativeType) |
      result.hasImplications(negativeType, true, confidence)
    )
    or
    // The characteristic implies that the endpoint is not a sink for _this particular_ sink type.
    result.hasImplications(sinkType, false, confidence)
  )
}
|
||||
|
||||
/**
 * A characteristic that a particular endpoint might have. The characteristics of an endpoint
 * are used to decide whether to include it in the training set and with what label, as well as
 * whether to score it at inference time.
 */
abstract class EndpointCharacteristic extends string {
  /**
   * Accepts any bound string. The string is the name of the characteristic, which should
   * describe some property of the endpoint that is meaningful for determining whether it's a
   * sink and, if so, of which type.
   */
  bindingset[this]
  EndpointCharacteristic() { any() }

  /**
   * Holds if the endpoint `n` has this characteristic. Implementations contain the logic that
   * applies the characteristic to the appropriate set of endpoints.
   */
  abstract predicate appliesToEndpoint(Candidate::Endpoint n);

  /**
   * Holds if this characteristic tells us the following about an endpoint that has it.
   *
   * Params:
   * endpointType: The sink/source type.
   * isPositiveIndicator: If true, the endpoint _is_ indicated to be a member of the class; if
   * false, it is indicated _not_ to be a member of the class.
   * confidence: A float in [0, 1] that says how strong an indicator this characteristic is for
   * the endpoint belonging / not belonging to the given class. A confidence near zero means a
   * very weak indicator. A confidence of 1 means that all endpoints with this characteristic
   * definitively do/don't belong to the class.
   */
  abstract predicate hasImplications(
    Candidate::EndpointType endpointType, boolean isPositiveIndicator, float confidence
  );

  /** Indicators with confidence at or above this threshold are considered to be high-confidence indicators. */
  final float getHighConfidenceThreshold() { result = 0.8 }
}
|
||||
|
||||
/**
 * A characteristic that marks an endpoint as a sink of a specified type with maximal
 * confidence. These endpoints can be used as positive samples for training or for a few-shot
 * prompt.
 */
abstract class SinkCharacteristic extends EndpointCharacteristic {
  bindingset[this]
  SinkCharacteristic() { any() }

  /** Gets the sink type that this characteristic indicates. */
  abstract Candidate::EndpointType getSinkType();

  final override predicate hasImplications(
    Candidate::EndpointType endpointType, boolean isPositiveIndicator, float confidence
  ) {
    confidence = maximalConfidence() and
    isPositiveIndicator = true and
    endpointType = this.getSinkType()
  }
}
|
||||
|
||||
/**
 * A characteristic for endpoints that the MaD modeling identifies as sinks; these are sinks
 * with maximal confidence.
 *
 * There is one such characteristic per known label. Its name is the human-readable label from
 * `Candidate::isKnownLabel`.
 */
private class KnownSinkCharacteristic extends SinkCharacteristic {
  string label;
  Candidate::EndpointType sinkType;

  KnownSinkCharacteristic() { Candidate::isKnownLabel(label, this, sinkType) }

  override predicate appliesToEndpoint(Candidate::Endpoint e) { Candidate::isSink(e, label) }

  override Candidate::EndpointType getSinkType() { result = sinkType }
}
|
||||
|
||||
/**
 * A characteristic that indicates, with `highConfidence()`, that an endpoint is not a sink of any type.
 * Endpoints with such a characteristic can serve as negative samples for training or for a few-shot prompt.
 */
abstract class NotASinkCharacteristic extends EndpointCharacteristic {
  bindingset[this]
  NotASinkCharacteristic() { any() }

  /**
   * Holds if `endpointType` is a negative type according to `Candidate::isNegative`, `isPositiveIndicator` is
   * `true`, and `confidence` is `highConfidence()`.
   */
  override predicate hasImplications(
    Candidate::EndpointType endpointType, boolean isPositiveIndicator, float confidence
  ) {
    isPositiveIndicator = true and
    confidence = highConfidence() and
    Candidate::isNegative(endpointType)
  }
}
|
||||
|
||||
/**
 * A negative characteristic for endpoints that are not part of the source code of the project being analyzed.
 * It applies to exactly those endpoints for which `Candidate::getLocationString` has no result.
 *
 * WARNING: Do not use these endpoints as negative samples for training: they are not necessarily non-sinks,
 * they are merely not interesting sinks to run through the ML model.
 */
private class IsExternalCharacteristic extends LikelyNotASinkCharacteristic {
  IsExternalCharacteristic() { this = "external" }

  /** Holds if no location string exists for `e`. */
  override predicate appliesToEndpoint(Candidate::Endpoint e) {
    not exists(string location | location = Candidate::getLocationString(e))
  }
}
|
||||
|
||||
/**
 * A negative characteristic for endpoints that were manually modeled as a neutral model.
 *
 * TODO: It may be necessary to turn this into a LikelyNotASinkCharacteristic, pending answers to the definition
 * of a neutral model (https://github.com/github/codeql-java-team/issues/254#issuecomment-1435309148).
 */
private class NeutralModelCharacteristic extends NotASinkCharacteristic {
  NeutralModelCharacteristic() { this = "known non-sink" }

  /** Holds if `e` is neutral according to `Candidate::isNeutral`. */
  override predicate appliesToEndpoint(Candidate::Endpoint e) {
    Candidate::isNeutral(e)
  }
}
|
||||
|
||||
/**
 * A characteristic that indicates, with `mediumConfidence()`, that an endpoint is unlikely to be a sink of any
 * type. Such endpoints can be excluded from scoring at inference time, both to save time and to avoid false
 * positives. They should not be used as negative samples for training or for a few-shot prompt, however,
 * because they may include a small number of sinks.
 */
abstract class LikelyNotASinkCharacteristic extends EndpointCharacteristic {
  bindingset[this]
  LikelyNotASinkCharacteristic() { any() }

  /**
   * Holds if `endpointType` is a negative type according to `Candidate::isNegative`, `isPositiveIndicator` is
   * `true`, and `confidence` is `mediumConfidence()`.
   */
  override predicate hasImplications(
    Candidate::EndpointType endpointType, boolean isPositiveIndicator, float confidence
  ) {
    isPositiveIndicator = true and
    confidence = mediumConfidence() and
    Candidate::isNegative(endpointType)
  }
}
|
||||
|
||||
/**
 * A characteristic that does not necessarily say an endpoint is not a sink, only that it is not a sink that's
 * interesting to model in the standard Java libraries. These filters should be removed when extracting sink
 * candidates within a user's codebase for customized modeling.
 *
 * Do not use these endpoints as negative samples for training or for a few-shot prompt: they are not
 * necessarily non-sinks.
 */
abstract class UninterestingToModelCharacteristic extends EndpointCharacteristic {
  bindingset[this]
  UninterestingToModelCharacteristic() { any() }

  /**
   * Holds if `endpointType` is a negative type according to `Candidate::isNegative`, `isPositiveIndicator` is
   * `true`, and `confidence` is `mediumConfidence()`.
   */
  override predicate hasImplications(
    Candidate::EndpointType endpointType, boolean isPositiveIndicator, float confidence
  ) {
    isPositiveIndicator = true and
    confidence = mediumConfidence() and
    Candidate::isNegative(endpointType)
  }
}
|
||||
}
|
||||
Reference in New Issue
Block a user