Create separate automodel pack.

This commit is contained in:
Anders Starcke Henriksen
2023-08-03 13:47:30 +02:00
parent 89aa86a1d6
commit e2abd3ff13
36 changed files with 7 additions and 0 deletions

View File

@@ -1,186 +0,0 @@
private import java
private import semmle.code.java.dataflow.ExternalFlow as ExternalFlow
private import semmle.code.java.dataflow.TaintTracking
private import semmle.code.java.security.RequestForgeryConfig
private import semmle.code.java.security.CommandLineQuery
private import semmle.code.java.security.SqlConcatenatedQuery
private import semmle.code.java.security.SqlInjectionQuery
private import semmle.code.java.security.UrlRedirectQuery
private import semmle.code.java.security.TaintedPathQuery
private import semmle.code.java.security.SqlInjectionQuery
private import AutomodelJavaUtil
private newtype TSinkModel =
MkSinkModel(
string package, string type, boolean subtypes, string name, string signature, string ext,
string input, string kind, string provenance
) {
ExternalFlow::sinkModel(package, type, subtypes, name, signature, ext, input, kind, provenance)
}
class SinkModel extends TSinkModel {
string package;
string type;
boolean subtypes;
string name;
string signature;
string ext;
string input;
string kind;
string provenance;
SinkModel() {
this = MkSinkModel(package, type, subtypes, name, signature, ext, input, kind, provenance)
}
/** Gets the package for this sink model. */
string getPackage() { result = package }
/** Gets the type for this sink model. */
string getType() { result = type }
/** Gets whether this sink model considers subtypes. */
boolean getSubtypes() { result = subtypes }
/** Gets the name for this sink model. */
string getName() { result = name }
/** Gets the signature for this sink model. */
string getSignature() { result = signature }
/** Gets the input for this sink model. */
string getInput() { result = input }
/** Gets the extension for this sink model. */
string getExt() { result = ext }
/** Gets the kind for this sink model. */
string getKind() { result = kind }
/** Gets the provenance for this sink model. */
string getProvenance() { result = provenance }
/** Gets the number of instances of this sink model. */
int getInstanceCount() { result = count(PotentialSinkModelExpr p | p.getSinkModel() = this) }
/** Gets a string representation of this sink model. */
string toString() {
result =
"SinkModel(" + package + ", " + type + ", " + subtypes + ", " + name + ", " + signature + ", "
+ ext + ", " + input + ", " + kind + ", " + provenance + ")"
}
/** Gets a string representation of this sink model as it would appear in a Models-as-Data file. */
string getRepr() {
result =
"\"" + package + "\", \"" + type + "\", " + pyBool(subtypes) + ", \"" + name + "\", \"" +
signature + "\", \"" + ext + "\", \"" + input + "\", \"" + kind + "\", \"" + provenance +
"\""
}
}
/** An expression that may correspond to a sink model. */
class PotentialSinkModelExpr extends Expr {
/**
* Holds if this expression has the given signature. The signature should contain enough
* information to determine a corresponding sink model, if one exists.
*/
pragma[nomagic]
predicate hasSignature(
string package, string type, boolean subtypes, string name, string signature, string input
) {
exists(Call call, Callable callable, int argIdx |
call.getCallee() = callable and
(
this = call.getArgument(argIdx)
or
this = call.getQualifier() and argIdx = -1
) and
input = getArgumentForIndex(argIdx) and
package = callable.getDeclaringType().getPackage().getName() and
type = callable.getDeclaringType().getErasure().(RefType).nestedName() and
subtypes = considerSubtypes(callable) and
name = callable.getName() and
signature = ExternalFlow::paramsString(callable)
)
}
/** Gets a sink model that corresponds to this expression. */
SinkModel getSinkModel() {
this.hasSignature(result.getPackage(), result.getType(), result.getSubtypes(), result.getName(),
result.getSignature(), result.getInput())
}
}
private string pyBool(boolean b) {
b = true and result = "True"
or
b = false and result = "False"
}
/**
* Gets a string representation of the existing sink model at the expression `e`, in the format in
* which it would appear in a Models-as-Data file. Also restricts the provenance of the sink model
* to be `ai-generated`.
*/
string getSinkModelRepr(PotentialSinkModelExpr e) {
result = e.getSinkModel().getRepr() and
e.getSinkModel().getProvenance() = "ai-generated"
}
/**
* Gets the string representation of a sink model in a format suitable for appending to an alert
* message.
*/
string getSinkModelQueryRepr(PotentialSinkModelExpr e) {
result = "\nsinkModel: " + getSinkModelRepr(e)
}
/**
* A parameterised module that takes a dataflow config, and exposes a predicate for counting the
* number of AI-generated sink models that appear in alerts for that query.
*/
private module SinkTallier<DataFlow::ConfigSig Config> {
module ConfigFlow = TaintTracking::Global<Config>;
predicate getSinkModelCount(int c, SinkModel s) {
s = any(ConfigFlow::PathNode sink).getNode().asExpr().(PotentialSinkModelExpr).getSinkModel() and
c =
strictcount(ConfigFlow::PathNode sink |
ConfigFlow::flowPath(_, sink) and
s = sink.getNode().asExpr().(PotentialSinkModelExpr).getSinkModel()
)
}
}
predicate sinkModelTallyPerQuery(string queryName, int alertCount, SinkModel sinkModel) {
queryName = "java/request-forgery" and
SinkTallier<RequestForgeryConfig>::getSinkModelCount(alertCount, sinkModel)
or
queryName = "java/command-line-injection" and
exists(int c1, int c2 |
SinkTallier<RemoteUserInputToArgumentToExecFlowConfig>::getSinkModelCount(c1, sinkModel) and
SinkTallier<LocalUserInputToArgumentToExecFlowConfig>::getSinkModelCount(c2, sinkModel) and
alertCount = c1 + c2
)
or
queryName = "java/concatenated-sql-query" and
SinkTallier<UncontrolledStringBuilderSourceFlowConfig>::getSinkModelCount(alertCount, sinkModel)
or
queryName = "java/ssrf" and
SinkTallier<RequestForgeryConfig>::getSinkModelCount(alertCount, sinkModel)
or
queryName = "java/path-injection" and
SinkTallier<TaintedPathConfig>::getSinkModelCount(alertCount, sinkModel)
or
queryName = "java/unvalidated-url-redirection" and
SinkTallier<UrlRedirectConfig>::getSinkModelCount(alertCount, sinkModel)
or
queryName = "java/sql-injection" and
SinkTallier<QueryInjectionFlowConfig>::getSinkModelCount(alertCount, sinkModel)
}
predicate sinkModelTally(int alertCount, SinkModel sinkModel) {
sinkModelTallyPerQuery(_, _, sinkModel) and
alertCount = sum(int c | sinkModelTallyPerQuery(_, c, sinkModel))
}

View File

@@ -1,16 +0,0 @@
/**
* @name Number of alerts per sink model
* @description Counts the number of alerts using `ai-generated` sink models.
* @kind table
* @id java/ml/metrics-count-alerts-per-sink-model
* @tags internal automodel metrics
*/
private import java
private import AutomodelAlertSinkUtil
from int alertCount, SinkModel s
where sinkModelTally(alertCount, s) and s.getProvenance() = "ai-generated"
select alertCount, s.getPackage() as package, s.getType() as type, s.getSubtypes() as subtypes,
s.getName() as name, s.getSignature() as signature, s.getInput() as input, s.getExt() as ext,
s.getKind() as kind, s.getProvenance() as provenance order by alertCount desc

View File

@@ -1,19 +0,0 @@
/**
* @name Number of alerts per sink model and query
* @description Counts the number of alerts per query using `ai-generated` sink models.
* @kind table
* @id java/ml/metrics-count-alerts-per-sink-model-and-query
* @tags internal automodel metrics
*/
private import java
private import AutomodelAlertSinkUtil
from string queryId, int alertCount, SinkModel s
where
sinkModelTallyPerQuery(queryId, alertCount, s) and
s.getProvenance() = "ai-generated"
select queryId, alertCount, s.getPackage() as package, s.getType() as type,
s.getSubtypes() as subtypes, s.getName() as name, s.getSignature() as signature,
s.getInput() as input, s.getExt() as ext, s.getKind() as kind, s.getProvenance() as provenance
order by queryId, alertCount desc

View File

@@ -1,450 +0,0 @@
/**
* For internal use only.
*/
private import java
private import semmle.code.Location as Location
private import semmle.code.java.dataflow.DataFlow
private import semmle.code.java.dataflow.TaintTracking
private import semmle.code.java.security.PathCreation
private import semmle.code.java.dataflow.ExternalFlow as ExternalFlow
private import semmle.code.java.dataflow.internal.FlowSummaryImpl as FlowSummaryImpl
private import semmle.code.java.security.ExternalAPIs as ExternalAPIs
private import semmle.code.java.Expr as Expr
private import semmle.code.java.security.QueryInjection
private import semmle.code.java.dataflow.internal.ModelExclusions as ModelExclusions
private import AutomodelJavaUtil as AutomodelJavaUtil
private import semmle.code.java.security.PathSanitizer as PathSanitizer
private import AutomodelSharedGetCallable as AutomodelSharedGetCallable
import AutomodelSharedCharacteristics as SharedCharacteristics
import AutomodelEndpointTypes as AutomodelEndpointTypes
newtype JavaRelatedLocationType = CallContext()
/**
* A class representing nodes that are arguments to calls.
*/
private class ArgumentNode extends DataFlow::Node {
Call c;
ArgumentNode() {
exists(Argument arg | this.asExpr() = arg and not arg.isVararg() and c = arg.getCall())
or
this.(DataFlow::ImplicitVarargsArray).getCall() = c
or
this = DataFlow::getInstanceArgument(c)
}
Call getCall() { result = c }
}
/**
* A candidates implementation.
*
* Some important notes:
* - This mode is using arguments as endpoints.
* - We use the `CallContext` (the surrounding call expression) as related location.
*/
module ApplicationCandidatesImpl implements SharedCharacteristics::CandidateSig {
// for documentation of the implementations here, see the QLDoc in the CandidateSig signature module.
class Endpoint = ArgumentNode;
class EndpointType = AutomodelEndpointTypes::EndpointType;
class NegativeEndpointType = AutomodelEndpointTypes::NegativeSinkType;
class RelatedLocation = Location::Top;
class RelatedLocationType = JavaRelatedLocationType;
// Sanitizers are currently not modeled in MaD. TODO: check if this has large negative impact.
predicate isSanitizer(Endpoint e, EndpointType t) {
exists(t) and
(
e.getType() instanceof BoxedType
or
e.getType() instanceof PrimitiveType
or
e.getType() instanceof NumberType
)
or
t instanceof AutomodelEndpointTypes::PathInjectionSinkType and
e instanceof PathSanitizer::PathInjectionSanitizer
}
RelatedLocation asLocation(Endpoint e) { result = e.asExpr() }
predicate isKnownKind = AutomodelJavaUtil::isKnownKind/2;
predicate isSink(Endpoint e, string kind, string provenance) {
exists(string package, string type, string name, string signature, string ext, string input |
sinkSpec(e, package, type, name, signature, ext, input) and
ExternalFlow::sinkModel(package, type, _, name, [signature, ""], ext, input, kind, provenance)
)
or
isCustomSink(e, kind) and provenance = "custom-sink"
}
predicate isNeutral(Endpoint e) {
exists(string package, string type, string name, string signature |
sinkSpec(e, package, type, name, signature, _, _) and
ExternalFlow::neutralModel(package, type, name, [signature, ""], "sink", _)
)
}
additional predicate sinkSpec(
Endpoint e, string package, string type, string name, string signature, string ext, string input
) {
ApplicationModeGetCallable::getCallable(e).hasQualifiedName(package, type, name) and
signature = ExternalFlow::paramsString(ApplicationModeGetCallable::getCallable(e)) and
ext = "" and
(
exists(Call c, int argIdx |
e.asExpr() = c.getArgument(argIdx) and
input = AutomodelJavaUtil::getArgumentForIndex(argIdx)
)
or
exists(Call c |
e.asExpr() = c.getQualifier() and input = AutomodelJavaUtil::getArgumentForIndex(-1)
)
)
}
/**
* Gets the related location for the given endpoint.
*
* The only related location we model is the the call expression surrounding to
* which the endpoint is either argument or qualifier (known as the call context).
*/
RelatedLocation getRelatedLocation(Endpoint e, RelatedLocationType type) {
type = CallContext() and
result = any(Call c | e.asExpr() = [c.getAnArgument(), c.getQualifier()])
}
}
private class JavaCallable = Callable;
private module ApplicationModeGetCallable implements AutomodelSharedGetCallable::GetCallableSig {
class Callable = JavaCallable;
class Endpoint = ApplicationCandidatesImpl::Endpoint;
/**
* Returns the API callable being modeled.
*/
Callable getCallable(Endpoint e) {
exists(Call c |
e.asExpr() = [c.getAnArgument(), c.getQualifier()] and
result = c.getCallee()
)
}
}
/**
* Contains endpoints that are defined in QL code rather than as a MaD model. Ideally this predicate
* should be empty.
*/
private predicate isCustomSink(Endpoint e, string kind) {
e instanceof QueryInjectionSink and kind = "sql"
}
module CharacteristicsImpl =
SharedCharacteristics::SharedCharacteristics<ApplicationCandidatesImpl>;
class EndpointCharacteristic = CharacteristicsImpl::EndpointCharacteristic;
class Endpoint = ApplicationCandidatesImpl::Endpoint;
/*
* Predicates that are used to surface prompt examples and candidates for classification with an ML model.
*/
/**
* A MetadataExtractor that extracts metadata for application mode.
*/
class ApplicationModeMetadataExtractor extends string {
ApplicationModeMetadataExtractor() { this = "ApplicationModeMetadataExtractor" }
predicate hasMetadata(
Endpoint e, string package, string type, string subtypes, string name, string signature,
string input
) {
exists(Call call, Callable callable, int argIdx |
call.getCallee() = callable and
(
e.asExpr() = call.getArgument(argIdx)
or
e.asExpr() = call.getQualifier() and argIdx = -1
) and
input = AutomodelJavaUtil::getArgumentForIndex(argIdx) and
package = callable.getDeclaringType().getPackage().getName() and
// we're using the erased types because the MaD convention is to not specify type parameters.
// Whether something is or isn't a sink doesn't usually depend on the type parameters.
type = callable.getDeclaringType().getErasure().(RefType).nestedName() and
subtypes = AutomodelJavaUtil::considerSubtypes(callable).toString() and
name = callable.getName() and
signature = ExternalFlow::paramsString(callable)
)
}
}
/*
* EndpointCharacteristic classes that are specific to Automodel for Java.
*/
/**
* A negative characteristic that indicates that an is-style boolean method is unexploitable even if it is a sink.
*
* A sink is highly unlikely to be exploitable if its callable's name starts with `is` and the callable has a boolean return
* type (e.g. `isDirectory`). These kinds of calls normally do only checks, and appear before the proper call that does
* the dangerous/interesting thing, so we want the latter to be modeled as the sink.
*
* TODO: this might filter too much, it's possible that methods with more than one parameter contain interesting sinks
*/
private class UnexploitableIsCharacteristic extends CharacteristicsImpl::NotASinkCharacteristic {
UnexploitableIsCharacteristic() { this = "unexploitable (is-style boolean method)" }
override predicate appliesToEndpoint(Endpoint e) {
not ApplicationCandidatesImpl::isSink(e, _, _) and
ApplicationModeGetCallable::getCallable(e).getName().matches("is%") and
ApplicationModeGetCallable::getCallable(e).getReturnType() instanceof BooleanType
}
}
/**
* A negative characteristic that indicates that an existence-checking boolean method is unexploitable even if it is a
* sink.
*
* A sink is highly unlikely to be exploitable if its callable's name is `exists` or `notExists` and the callable has a
* boolean return type. These kinds of calls normally do only checks, and appear before the proper call that does the
* dangerous/interesting thing, so we want the latter to be modeled as the sink.
*/
private class UnexploitableExistsCharacteristic extends CharacteristicsImpl::NotASinkCharacteristic {
UnexploitableExistsCharacteristic() { this = "unexploitable (existence-checking boolean method)" }
override predicate appliesToEndpoint(Endpoint e) {
not ApplicationCandidatesImpl::isSink(e, _, _) and
exists(Callable callable |
callable = ApplicationModeGetCallable::getCallable(e) and
callable.getName().toLowerCase() = ["exists", "notexists"] and
callable.getReturnType() instanceof BooleanType
)
}
}
/**
* A negative characteristic that indicates that an endpoint is an argument to an exception, which is not a sink.
*/
private class ExceptionCharacteristic extends CharacteristicsImpl::NotASinkCharacteristic {
ExceptionCharacteristic() { this = "exception" }
override predicate appliesToEndpoint(Endpoint e) {
ApplicationModeGetCallable::getCallable(e).getDeclaringType().getASupertype*() instanceof
TypeThrowable
}
}
/**
* A negative characteristic that indicates that an endpoint is a MaD taint step. MaD modeled taint steps are global,
* so they are not sinks for any query. Non-MaD taint steps might be specific to a particular query, so we don't
* filter those out.
*/
private class IsMaDTaintStepCharacteristic extends CharacteristicsImpl::NotASinkCharacteristic {
IsMaDTaintStepCharacteristic() { this = "taint step" }
override predicate appliesToEndpoint(Endpoint e) {
FlowSummaryImpl::Private::Steps::summaryThroughStepValue(e, _, _) or
FlowSummaryImpl::Private::Steps::summaryThroughStepTaint(e, _, _) or
FlowSummaryImpl::Private::Steps::summaryGetterStep(e, _, _, _) or
FlowSummaryImpl::Private::Steps::summarySetterStep(e, _, _, _)
}
}
/**
* A negative characteristic that filters out qualifiers that are classes (i.e. static calls). These
* are unlikely to have any non-trivial flow going into them.
*
* Technically, an accessed type _could_ come from outside of the source code, but there's not
* much likelihood of that being user-controlled.
*/
private class ClassQualifierCharacteristic extends CharacteristicsImpl::NotASinkCharacteristic {
ClassQualifierCharacteristic() { this = "class qualifier" }
override predicate appliesToEndpoint(Endpoint e) {
exists(Call c |
e.asExpr() = c.getQualifier() and
c.getQualifier() instanceof TypeAccess
)
}
}
/**
* A call to a method that's known locally will not be considered as a candidate to model.
*
* The reason is that we would expect data/taint flow into the method implementation to uncover
* any sinks that are present there.
*/
private class ArgumentToLocalCall extends CharacteristicsImpl::UninterestingToModelCharacteristic {
ArgumentToLocalCall() { this = "argument to local call" }
override predicate appliesToEndpoint(Endpoint e) {
ApplicationModeGetCallable::getCallable(e).fromSource()
}
}
/**
* A Characteristic that marks endpoints as uninteresting to model, according to the Java ModelExclusions module.
*/
private class ExcludedFromModeling extends CharacteristicsImpl::UninterestingToModelCharacteristic {
ExcludedFromModeling() { this = "excluded from modeling" }
override predicate appliesToEndpoint(Endpoint e) {
ModelExclusions::isUninterestingForModels(ApplicationModeGetCallable::getCallable(e))
}
}
/**
* A negative characteristic that filters out non-public methods. Non-public methods are not interesting to include in
* the standard Java modeling, because they cannot be called from outside the package.
*/
private class NonPublicMethodCharacteristic extends CharacteristicsImpl::UninterestingToModelCharacteristic
{
NonPublicMethodCharacteristic() { this = "non-public method" }
override predicate appliesToEndpoint(Endpoint e) {
not ApplicationModeGetCallable::getCallable(e).isPublic()
}
}
/**
* A negative characteristic that indicates that an endpoint is a non-sink argument to a method whose sinks have already
* been modeled _manually_. This is restricted to manual sinks only, because only during the manual process do we have
* the expectation that all sinks present in a method have been considered.
*
* WARNING: These endpoints should not be used as negative samples for training, because some sinks may have been missed
* when the method was modeled. Specifically, as we start using ATM to merge in new declarations, we can be less sure
* that a method with one argument modeled as a MaD sink has also had its remaining arguments manually reviewed. The
* ML model might have predicted argument 0 of some method to be a sink but not argument 1, when in fact argument 1 is
* also a sink.
*/
private class OtherArgumentToModeledMethodCharacteristic extends CharacteristicsImpl::LikelyNotASinkCharacteristic
{
OtherArgumentToModeledMethodCharacteristic() {
this = "other argument to a method that has already been modeled manually"
}
override predicate appliesToEndpoint(Endpoint e) {
not ApplicationCandidatesImpl::isSink(e, _, _) and
exists(Endpoint otherSink |
ApplicationCandidatesImpl::isSink(otherSink, _, "manual") and
e.getCall() = otherSink.getCall() and
e != otherSink
)
}
}
/**
* A characteristic that marks functional expression as likely not sinks.
*
* These expressions may well _contain_ sinks, but rarely are sinks themselves.
*/
private class FunctionValueCharacteristic extends CharacteristicsImpl::LikelyNotASinkCharacteristic {
FunctionValueCharacteristic() { this = "function value" }
override predicate appliesToEndpoint(Endpoint e) { e.asExpr() instanceof FunctionalExpr }
}
/**
* A negative characteristic that indicates that an endpoint is not a `to` node for any known taint step. Such a node
* cannot be tainted, because taint can't flow into it.
*
* WARNING: These endpoints should not be used as negative samples for training, because they may include sinks for
* which our taint tracking modeling is incomplete.
*/
private class CannotBeTaintedCharacteristic extends CharacteristicsImpl::LikelyNotASinkCharacteristic
{
CannotBeTaintedCharacteristic() { this = "cannot be tainted" }
override predicate appliesToEndpoint(Endpoint e) { not this.isKnownOutNodeForStep(e) }
/**
* Holds if the node `n` is known as the predecessor in a modeled flow step.
*/
private predicate isKnownOutNodeForStep(Endpoint e) {
e.asExpr() instanceof Call or // we just assume flow in that case
TaintTracking::localTaintStep(_, e) or
FlowSummaryImpl::Private::Steps::summaryThroughStepValue(_, e, _) or
FlowSummaryImpl::Private::Steps::summaryThroughStepTaint(_, e, _) or
FlowSummaryImpl::Private::Steps::summaryGetterStep(_, _, e, _) or
FlowSummaryImpl::Private::Steps::summarySetterStep(_, _, e, _)
}
}
/**
* Holds if the given endpoint has a self-contradictory combination of characteristics. Detects errors in our endpoint
* characteristics. Lists the problematic characteristics and their implications for all such endpoints, together with
* an error message indicating why this combination is problematic.
*
* Copied from
* javascript/ql/experimental/adaptivethreatmodeling/test/endpoint_large_scale/ContradictoryEndpointCharacteristics.ql
*/
predicate erroneousEndpoints(
Endpoint endpoint, EndpointCharacteristic characteristic,
AutomodelEndpointTypes::EndpointType endpointType, float confidence, string errorMessage,
boolean ignoreKnownModelingErrors
) {
// An endpoint's characteristics should not include positive indicators with medium/high confidence for more than one
// sink/source type (including the negative type).
exists(
EndpointCharacteristic characteristic2, AutomodelEndpointTypes::EndpointType endpointClass2,
float confidence2
|
endpointType != endpointClass2 and
(
endpointType instanceof AutomodelEndpointTypes::SinkType and
endpointClass2 instanceof AutomodelEndpointTypes::SinkType
or
endpointType instanceof AutomodelEndpointTypes::SourceType and
endpointClass2 instanceof AutomodelEndpointTypes::SourceType
) and
characteristic.appliesToEndpoint(endpoint) and
characteristic2.appliesToEndpoint(endpoint) and
characteristic.hasImplications(endpointType, true, confidence) and
characteristic2.hasImplications(endpointClass2, true, confidence2) and
confidence > SharedCharacteristics::mediumConfidence() and
confidence2 > SharedCharacteristics::mediumConfidence() and
(
ignoreKnownModelingErrors = true and
not knownOverlappingCharacteristics(characteristic, characteristic2)
or
ignoreKnownModelingErrors = false
)
) and
errorMessage = "Endpoint has high-confidence positive indicators for multiple classes"
or
// An endpoint's characteristics should not include positive indicators with medium/high confidence for some class and
// also include negative indicators with medium/high confidence for this same class.
exists(EndpointCharacteristic characteristic2, float confidence2 |
characteristic.appliesToEndpoint(endpoint) and
characteristic2.appliesToEndpoint(endpoint) and
characteristic.hasImplications(endpointType, true, confidence) and
characteristic2.hasImplications(endpointType, false, confidence2) and
confidence > SharedCharacteristics::mediumConfidence() and
confidence2 > SharedCharacteristics::mediumConfidence()
) and
ignoreKnownModelingErrors = false and
errorMessage = "Endpoint has high-confidence positive and negative indicators for the same class"
}
/**
* Holds if `characteristic1` and `characteristic2` are among the pairs of currently known positive characteristics that
* have some overlap in their results. This indicates a problem with the underlying Java modeling. Specifically,
* `PathCreation` is prone to FPs.
*/
private predicate knownOverlappingCharacteristics(
EndpointCharacteristic characteristic1, EndpointCharacteristic characteristic2
) {
characteristic1 != characteristic2 and
characteristic1 = ["mad taint step", "create path", "read file", "known non-sink"] and
characteristic2 = ["mad taint step", "create path", "read file", "known non-sink"]
}

View File

@@ -1,85 +0,0 @@
/**
* Surfaces the endpoints that are not already known to be sinks, and are therefore used as candidates for
* classification with an ML model.
*
* Note: This query does not actually classify the endpoints using the model.
*
* @name Automodel candidates (application mode)
* @description A query to extract automodel candidates in application mode.
* @kind problem
* @problem.severity recommendation
* @id java/ml/extract-automodel-application-candidates
* @tags internal extract automodel application-mode candidates
*/
import java
private import AutomodelApplicationModeCharacteristics
private import AutomodelJavaUtil
/**
* Gets a sample of endpoints (of at most `limit` samples) with the given method signature.
*
* The main purpose of this helper predicate is to avoid selecting too many candidates, as this may
* cause the SARIF file to exceed the maximum size limit.
*/
bindingset[limit]
private Endpoint getSampleForSignature(
int limit, string package, string type, string subtypes, string name, string signature,
string input
) {
exists(int n, int num_endpoints, ApplicationModeMetadataExtractor meta |
num_endpoints =
count(Endpoint e | meta.hasMetadata(e, package, type, subtypes, name, signature, input))
|
result =
rank[n](Endpoint e, Location loc |
loc = e.getLocation() and
meta.hasMetadata(e, package, type, subtypes, name, signature, input)
|
e
order by
loc.getFile().getAbsolutePath(), loc.getStartLine(), loc.getStartColumn(),
loc.getEndLine(), loc.getEndColumn()
) and
// To avoid selecting samples that are too close together (as the ranking above goes by file
// path first), we select `limit` evenly spaced samples from the ranked list of endpoints. By
// default this would always include the first sample, so we add a random-chosen prime offset
// to the first sample index, and reduce modulo the number of endpoints.
// Finally, we add 1 to the result, as ranking results in a 1-indexed relation.
n = 1 + (([0 .. limit - 1] * (num_endpoints / limit).floor() + 46337) % num_endpoints)
)
}
from
Endpoint endpoint, string message, ApplicationModeMetadataExtractor meta, DollarAtString package,
DollarAtString type, DollarAtString subtypes, DollarAtString name, DollarAtString signature,
DollarAtString input
where
not exists(CharacteristicsImpl::UninterestingToModelCharacteristic u |
u.appliesToEndpoint(endpoint)
) and
endpoint = getSampleForSignature(9, package, type, subtypes, name, signature, input) and
// If a node is already a known sink for any of our existing ATM queries and is already modeled as a MaD sink, we
// don't include it as a candidate. Otherwise, we might include it as a candidate for query A, but the model will
// label it as a sink for one of the sink types of query B, for which it's already a known sink. This would result in
// overlap between our detected sinks and the pre-existing modeling. We assume that, if a sink has already been
// modeled in a MaD model, then it doesn't belong to any additional sink types, and we don't need to reexamine it.
not CharacteristicsImpl::isSink(endpoint, _, _) and
meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input) and
// The message is the concatenation of all sink types for which this endpoint is known neither to be a sink nor to be
// a non-sink, and we surface only endpoints that have at least one such sink type.
message =
strictconcat(AutomodelEndpointTypes::SinkType sinkType |
not CharacteristicsImpl::isKnownSink(endpoint, sinkType) and
CharacteristicsImpl::isSinkCandidate(endpoint, sinkType)
|
sinkType, ", "
)
select endpoint, message + "\nrelated locations: $@." + "\nmetadata: $@, $@, $@, $@, $@, $@.", //
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, CallContext()), "CallContext", //
package, "package", //
type, "type", //
subtypes, "subtypes", //
name, "name", // method name
signature, "signature", //
input, "input" //

View File

@@ -1,73 +0,0 @@
/**
* Surfaces endpoints that are non-sinks with high confidence, for use as negative examples in the prompt.
*
* @name Negative examples (application mode)
* @kind problem
* @problem.severity recommendation
* @id java/ml/extract-automodel-application-negative-examples
* @tags internal extract automodel application-mode negative examples
*/
private import java
private import AutomodelApplicationModeCharacteristics
private import AutomodelEndpointTypes
private import AutomodelJavaUtil
/**
* Gets a sample of endpoints (of at most `limit` samples) for which the given characteristic applies.
*
* The main purpose of this helper predicate is to avoid selecting too many samples, as this may
* cause the SARIF file to exceed the maximum size limit.
*/
bindingset[limit]
Endpoint getSampleForCharacteristic(EndpointCharacteristic c, int limit) {
exists(int n, int num_endpoints | num_endpoints = count(Endpoint e | c.appliesToEndpoint(e)) |
result =
rank[n](Endpoint e, Location loc |
loc = e.getLocation() and c.appliesToEndpoint(e)
|
e
order by
loc.getFile().getAbsolutePath(), loc.getStartLine(), loc.getStartColumn(),
loc.getEndLine(), loc.getEndColumn()
) and
// To avoid selecting samples that are too close together (as the ranking above goes by file
// path first), we select `limit` evenly spaced samples from the ranked list of endpoints. By
// default this would always include the first sample, so we add a random-chosen prime offset
// to the first sample index, and reduce modulo the number of endpoints.
// Finally, we add 1 to the result, as ranking results in a 1-indexed relation.
n = 1 + (([0 .. limit - 1] * (num_endpoints / limit).floor() + 46337) % num_endpoints)
)
}
from
Endpoint endpoint, EndpointCharacteristic characteristic, float confidence, string message,
ApplicationModeMetadataExtractor meta, DollarAtString package, DollarAtString type,
DollarAtString subtypes, DollarAtString name, DollarAtString signature, DollarAtString input
where
endpoint = getSampleForCharacteristic(characteristic, 100) and
confidence >= SharedCharacteristics::highConfidence() and
characteristic.hasImplications(any(NegativeSinkType negative), true, confidence) and
// Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly
// certain about in the prompt.
not erroneousEndpoints(endpoint, _, _, _, _, false) and
meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input) and
// It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be
// treated by the actual query as a sanitizer, since the final logic is something like
// `isSink(n) and not isSanitizer(n)`. We don't want to include such nodes as negative examples in the prompt, because
// they're ambiguous and might confuse the model, so we explicitly exclude all known sinks from the negative examples.
not exists(EndpointCharacteristic characteristic2, float confidence2, SinkType positiveType |
not positiveType instanceof NegativeSinkType and
characteristic2.appliesToEndpoint(endpoint) and
confidence2 >= SharedCharacteristics::maximalConfidence() and
characteristic2.hasImplications(positiveType, true, confidence2)
) and
message = characteristic
select endpoint, message + "\nrelated locations: $@." + "\nmetadata: $@, $@, $@, $@, $@, $@.", //
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, CallContext()), "CallContext", //
package, "package", //
type, "type", //
subtypes, "subtypes", //
name, "name", //
signature, "signature", //
input, "input" //

View File

@@ -1,33 +0,0 @@
/**
* Surfaces endpoints that are sinks with high confidence, for use as positive examples in the prompt.
*
* @name Positive examples (application mode)
* @kind problem
* @problem.severity recommendation
* @id java/ml/extract-automodel-application-positive-examples
* @tags internal extract automodel application-mode positive examples
*/
private import AutomodelApplicationModeCharacteristics
private import AutomodelEndpointTypes
private import AutomodelJavaUtil
from
Endpoint endpoint, SinkType sinkType, ApplicationModeMetadataExtractor meta,
DollarAtString package, DollarAtString type, DollarAtString subtypes, DollarAtString name,
DollarAtString signature, DollarAtString input
where
// Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly
// certain about in the prompt.
not erroneousEndpoints(endpoint, _, _, _, _, false) and
meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input) and
// Extract positive examples of sinks belonging to the existing ATM query configurations.
CharacteristicsImpl::isKnownSink(endpoint, sinkType)
select endpoint, sinkType + "\nrelated locations: $@." + "\nmetadata: $@, $@, $@, $@, $@, $@.", //
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, CallContext()), "CallContext", //
package, "package", //
type, "type", //
subtypes, "subtypes", //
name, "name", //
signature, "signature", //
input, "input" //

View File

@@ -1,19 +0,0 @@
/**
* @name Number of instances of each sink model
* @description Counts the number of instances of `ai-generated` sink models.
* @kind table
* @id java/ml/metrics-count-instances-per-sink-model
* @tags internal automodel metrics
*/
private import java
private import AutomodelAlertSinkUtil
from int instanceCount, SinkModel s
where
instanceCount = s.getInstanceCount() and
instanceCount > 0 and
s.getProvenance() = "ai-generated"
select instanceCount, s.getPackage() as package, s.getType() as type, s.getSubtypes() as subtypes,
s.getName() as name, s.getSignature() as signature, s.getInput() as input, s.getExt() as ext,
s.getKind() as kind, s.getProvenance() as provenance order by instanceCount desc

View File

@@ -1,60 +0,0 @@
/**
* For internal use only.
*
* Defines the set of classes that endpoint scoring models can predict. Endpoint scoring models must
* only predict classes defined within this file. This file is the source of truth for the integer
* representation of each of these classes.
*/
/** A class that can be predicted by a classifier. */
abstract class EndpointType extends string {
/**
* Holds when the string matches the name of the sink / source type.
*/
bindingset[this]
EndpointType() { any() }
/**
* Gets the name of the sink/source kind for this endpoint type as used in models-as-data.
*
* See https://github.com/github/codeql/blob/44213f0144fdd54bb679ca48d68b28dcf820f7a8/java/ql/lib/semmle/code/java/dataflow/ExternalFlow.qll#LL353C11-L357C31
*/
final string getKind() { result = this }
}
/** A class for sink types that can be predicted by a classifier. */
abstract class SinkType extends EndpointType {
bindingset[this]
SinkType() { any() }
}
/** A class for source types that can be predicted by a classifier. */
abstract class SourceType extends EndpointType {
bindingset[this]
SourceType() { any() }
}
/** The `Negative` class for non-sinks. */
class NegativeSinkType extends SinkType {
NegativeSinkType() { this = "non-sink" }
}
/** A sink relevant to the SQL injection query */
class SqlInjectionSinkType extends SinkType {
SqlInjectionSinkType() { this = "sql-injection" }
}
/** A sink relevant to the tainted path injection query. */
class PathInjectionSinkType extends SinkType {
PathInjectionSinkType() { this = "path-injection" }
}
/** A sink relevant to the SSRF query. */
class RequestForgerySinkType extends SinkType {
RequestForgerySinkType() { this = "request-forgery" }
}
/** A sink relevant to the command injection query. */
class CommandInjectionSinkType extends SinkType {
CommandInjectionSinkType() { this = "command-injection" }
}

View File

@@ -1,343 +0,0 @@
/**
* For internal use only.
*/
private import java
private import semmle.code.Location as Location
private import semmle.code.java.dataflow.DataFlow
private import semmle.code.java.dataflow.TaintTracking
private import semmle.code.java.security.PathCreation
private import semmle.code.java.dataflow.ExternalFlow as ExternalFlow
private import semmle.code.java.dataflow.internal.FlowSummaryImpl as FlowSummaryImpl
private import semmle.code.java.security.ExternalAPIs as ExternalAPIs
private import semmle.code.java.Expr as Expr
private import semmle.code.java.security.QueryInjection
private import semmle.code.java.security.RequestForgery
private import semmle.code.java.dataflow.internal.ModelExclusions as ModelExclusions
private import AutomodelJavaUtil as AutomodelJavaUtil
private import AutomodelSharedGetCallable as AutomodelSharedGetCallable
import AutomodelSharedCharacteristics as SharedCharacteristics
import AutomodelEndpointTypes as AutomodelEndpointTypes
newtype JavaRelatedLocationType =
MethodDoc() or
ClassDoc()
newtype TFrameworkModeEndpoint =
TExplicitParameter(Parameter p) or
TQualifier(Callable c)
/**
* A framework mode endpoint.
*/
abstract class FrameworkModeEndpoint extends TFrameworkModeEndpoint {
/**
* Returns the parameter index of the endpoint.
*/
abstract int getIndex();
/**
* Returns the name of the parameter of the endpoint.
*/
abstract string getParamName();
/**
* Returns the callable that contains the endpoint.
*/
abstract Callable getEnclosingCallable();
abstract Top asTop();
string toString() { result = this.asTop().toString() }
Location getLocation() { result = this.asTop().getLocation() }
}
class ExplicitParameterEndpoint extends FrameworkModeEndpoint, TExplicitParameter {
Parameter param;
ExplicitParameterEndpoint() { this = TExplicitParameter(param) and param.fromSource() }
override int getIndex() { result = param.getPosition() }
override string getParamName() { result = param.getName() }
override Callable getEnclosingCallable() { result = param.getCallable() }
override Top asTop() { result = param }
}
class QualifierEndpoint extends FrameworkModeEndpoint, TQualifier {
Callable callable;
QualifierEndpoint() {
this = TQualifier(callable) and not callable.isStatic() and callable.fromSource()
}
override int getIndex() { result = -1 }
override string getParamName() { result = "this" }
override Callable getEnclosingCallable() { result = callable }
override Top asTop() { result = callable }
}
/**
* A candidates implementation for framework mode.
*
* Some important notes:
* - This mode is using parameters as endpoints.
* - Sink- and neutral-information is being used from MaD models.
* - When available, we use method- and class-java-docs as related locations.
*/
module FrameworkCandidatesImpl implements SharedCharacteristics::CandidateSig {
// for documentation of the implementations here, see the QLDoc in the CandidateSig signature module.
class Endpoint = FrameworkModeEndpoint;
class EndpointType = AutomodelEndpointTypes::EndpointType;
class NegativeEndpointType = AutomodelEndpointTypes::NegativeSinkType;
class RelatedLocation = Location::Top;
class RelatedLocationType = JavaRelatedLocationType;
// Sanitizers are currently not modeled in MaD. TODO: check if this has large negative impact.
predicate isSanitizer(Endpoint e, EndpointType t) { none() }
RelatedLocation asLocation(Endpoint e) { result = e.asTop() }
predicate isKnownKind = AutomodelJavaUtil::isKnownKind/2;
predicate isSink(Endpoint e, string kind, string provenance) {
exists(string package, string type, string name, string signature, string ext, string input |
sinkSpec(e, package, type, name, signature, ext, input) and
ExternalFlow::sinkModel(package, type, _, name, [signature, ""], ext, input, kind, provenance)
)
}
predicate isNeutral(Endpoint e) {
exists(string package, string type, string name, string signature |
sinkSpec(e, package, type, name, signature, _, _) and
ExternalFlow::neutralModel(package, type, name, [signature, ""], "sink", _)
)
}
additional predicate sinkSpec(
Endpoint e, string package, string type, string name, string signature, string ext, string input
) {
FrameworkModeGetCallable::getCallable(e).hasQualifiedName(package, type, name) and
signature = ExternalFlow::paramsString(FrameworkModeGetCallable::getCallable(e)) and
ext = "" and
input = AutomodelJavaUtil::getArgumentForIndex(e.getIndex())
}
/**
* Gets the related location for the given endpoint.
*
* Related locations can be JavaDoc comments of the class or the method.
*/
RelatedLocation getRelatedLocation(Endpoint e, RelatedLocationType type) {
type = MethodDoc() and
result = FrameworkModeGetCallable::getCallable(e).(Documentable).getJavadoc()
or
type = ClassDoc() and
result = FrameworkModeGetCallable::getCallable(e).getDeclaringType().(Documentable).getJavadoc()
}
}
private class JavaCallable = Callable;
private module FrameworkModeGetCallable implements AutomodelSharedGetCallable::GetCallableSig {
class Callable = JavaCallable;
class Endpoint = FrameworkCandidatesImpl::Endpoint;
/**
* Returns the callable that contains the given endpoint.
*
* Each Java mode should implement this predicate.
*/
Callable getCallable(Endpoint e) { result = e.getEnclosingCallable() }
}
module CharacteristicsImpl = SharedCharacteristics::SharedCharacteristics<FrameworkCandidatesImpl>;
class EndpointCharacteristic = CharacteristicsImpl::EndpointCharacteristic;
class Endpoint = FrameworkCandidatesImpl::Endpoint;
/*
* Predicates that are used to surface prompt examples and candidates for classification with an ML model.
*/
/**
* A MetadataExtractor that extracts metadata for framework mode.
*/
class FrameworkModeMetadataExtractor extends string {
FrameworkModeMetadataExtractor() { this = "FrameworkModeMetadataExtractor" }
predicate hasMetadata(
Endpoint e, string package, string type, string subtypes, string name, string signature,
string input, string parameterName
) {
parameterName = e.getParamName() and
name = e.getEnclosingCallable().getName() and
input = AutomodelJavaUtil::getArgumentForIndex(e.getIndex()) and
package = e.getEnclosingCallable().getDeclaringType().getPackage().getName() and
type = e.getEnclosingCallable().getDeclaringType().getErasure().(RefType).nestedName() and
subtypes = AutomodelJavaUtil::considerSubtypes(e.getEnclosingCallable()).toString() and
signature = ExternalFlow::paramsString(e.getEnclosingCallable())
}
}
/*
* EndpointCharacteristic classes that are specific to Automodel for Java.
*/
/**
* A negative characteristic that indicates that an is-style boolean method is unexploitable even if it is a sink.
*
* A sink is highly unlikely to be exploitable if its callable's name starts with `is` and the callable has a boolean return
* type (e.g. `isDirectory`). These kinds of calls normally do only checks, and appear before the proper call that does
* the dangerous/interesting thing, so we want the latter to be modeled as the sink.
*
* TODO: this might filter too much, it's possible that methods with more than one parameter contain interesting sinks
*/
private class UnexploitableIsCharacteristic extends CharacteristicsImpl::NotASinkCharacteristic {
UnexploitableIsCharacteristic() { this = "unexploitable (is-style boolean method)" }
override predicate appliesToEndpoint(Endpoint e) {
not FrameworkCandidatesImpl::isSink(e, _, _) and
FrameworkModeGetCallable::getCallable(e).getName().matches("is%") and
FrameworkModeGetCallable::getCallable(e).getReturnType() instanceof BooleanType
}
}
/**
* A negative characteristic that indicates that an existence-checking boolean method is unexploitable even if it is a
* sink.
*
* A sink is highly unlikely to be exploitable if its callable's name is `exists` or `notExists` and the callable has a
* boolean return type. These kinds of calls normally do only checks, and appear before the proper call that does the
* dangerous/interesting thing, so we want the latter to be modeled as the sink.
*/
private class UnexploitableExistsCharacteristic extends CharacteristicsImpl::NotASinkCharacteristic {
UnexploitableExistsCharacteristic() { this = "unexploitable (existence-checking boolean method)" }
override predicate appliesToEndpoint(Endpoint e) {
not FrameworkCandidatesImpl::isSink(e, _, _) and
exists(Callable callable |
callable = FrameworkModeGetCallable::getCallable(e) and
callable.getName().toLowerCase() = ["exists", "notexists"] and
callable.getReturnType() instanceof BooleanType
)
}
}
/**
* A negative characteristic that indicates that an endpoint is an argument to an exception, which is not a sink.
*/
private class ExceptionCharacteristic extends CharacteristicsImpl::NotASinkCharacteristic {
ExceptionCharacteristic() { this = "exception" }
override predicate appliesToEndpoint(Endpoint e) {
FrameworkModeGetCallable::getCallable(e).getDeclaringType().getASupertype*() instanceof
TypeThrowable
}
}
/**
* A characteristic that limits candidates to parameters of methods that are recognized as `ModelApi`, iow., APIs that
* are considered worth modeling.
*/
private class NotAModelApiParameter extends CharacteristicsImpl::UninterestingToModelCharacteristic {
NotAModelApiParameter() { this = "not a model API parameter" }
override predicate appliesToEndpoint(Endpoint e) {
not e.getEnclosingCallable() instanceof ModelExclusions::ModelApi
}
}
/**
* A negative characteristic that filters out non-public methods. Non-public methods are not interesting to include in
* the standard Java modeling, because they cannot be called from outside the package.
*/
private class NonPublicMethodCharacteristic extends CharacteristicsImpl::UninterestingToModelCharacteristic
{
NonPublicMethodCharacteristic() { this = "non-public method" }
override predicate appliesToEndpoint(Endpoint e) {
not FrameworkModeGetCallable::getCallable(e).isPublic()
}
}
/**
* Holds if the given endpoint has a self-contradictory combination of characteristics. Detects errors in our endpoint
* characteristics. Lists the problematic characteristics and their implications for all such endpoints, together with
* an error message indicating why this combination is problematic.
*
* Copied from
* javascript/ql/experimental/adaptivethreatmodeling/test/endpoint_large_scale/ContradictoryEndpointCharacteristics.ql
*/
predicate erroneousEndpoints(
Endpoint endpoint, EndpointCharacteristic characteristic,
AutomodelEndpointTypes::EndpointType endpointType, float confidence, string errorMessage,
boolean ignoreKnownModelingErrors
) {
// An endpoint's characteristics should not include positive indicators with medium/high confidence for more than one
// sink/source type (including the negative type).
exists(
EndpointCharacteristic characteristic2, AutomodelEndpointTypes::EndpointType endpointClass2,
float confidence2
|
endpointType != endpointClass2 and
(
endpointType instanceof AutomodelEndpointTypes::SinkType and
endpointClass2 instanceof AutomodelEndpointTypes::SinkType
or
endpointType instanceof AutomodelEndpointTypes::SourceType and
endpointClass2 instanceof AutomodelEndpointTypes::SourceType
) and
characteristic.appliesToEndpoint(endpoint) and
characteristic2.appliesToEndpoint(endpoint) and
characteristic.hasImplications(endpointType, true, confidence) and
characteristic2.hasImplications(endpointClass2, true, confidence2) and
confidence > SharedCharacteristics::mediumConfidence() and
confidence2 > SharedCharacteristics::mediumConfidence() and
(
ignoreKnownModelingErrors = true and
not knownOverlappingCharacteristics(characteristic, characteristic2)
or
ignoreKnownModelingErrors = false
)
) and
errorMessage = "Endpoint has high-confidence positive indicators for multiple classes"
or
// An endpoint's characteristics should not include positive indicators with medium/high confidence for some class and
// also include negative indicators with medium/high confidence for this same class.
exists(EndpointCharacteristic characteristic2, float confidence2 |
characteristic.appliesToEndpoint(endpoint) and
characteristic2.appliesToEndpoint(endpoint) and
characteristic.hasImplications(endpointType, true, confidence) and
characteristic2.hasImplications(endpointType, false, confidence2) and
confidence > SharedCharacteristics::mediumConfidence() and
confidence2 > SharedCharacteristics::mediumConfidence()
) and
ignoreKnownModelingErrors = false and
errorMessage = "Endpoint has high-confidence positive and negative indicators for the same class"
}
/**
* Holds if `characteristic1` and `characteristic2` are among the pairs of currently known positive characteristics that
* have some overlap in their results. This indicates a problem with the underlying Java modeling. Specifically,
* `PathCreation` is prone to FPs.
*/
private predicate knownOverlappingCharacteristics(
EndpointCharacteristic characteristic1, EndpointCharacteristic characteristic2
) {
characteristic1 != characteristic2 and
characteristic1 = ["mad taint step", "create path", "read file", "known non-sink"] and
characteristic2 = ["mad taint step", "create path", "read file", "known non-sink"]
}

View File

@@ -1,52 +0,0 @@
/**
* Surfaces the endpoints that are not already known to be sinks, and are therefore used as candidates for
* classification with an ML model.
*
* Note: This query does not actually classify the endpoints using the model.
*
* @name Automodel candidates (framework mode)
* @description A query to extract automodel candidates in framework mode.
* @kind problem
* @problem.severity recommendation
* @id java/ml/extract-automodel-framework-candidates
* @tags internal extract automodel framework-mode candidates
*/
private import AutomodelFrameworkModeCharacteristics
private import AutomodelJavaUtil
from
Endpoint endpoint, string message, FrameworkModeMetadataExtractor meta, DollarAtString package,
DollarAtString type, DollarAtString subtypes, DollarAtString name, DollarAtString signature,
DollarAtString input, DollarAtString parameterName
where
not exists(CharacteristicsImpl::UninterestingToModelCharacteristic u |
u.appliesToEndpoint(endpoint)
) and
// If a node is already a known sink for any of our existing ATM queries and is already modeled as a MaD sink, we
// don't include it as a candidate. Otherwise, we might include it as a candidate for query A, but the model will
// label it as a sink for one of the sink types of query B, for which it's already a known sink. This would result in
// overlap between our detected sinks and the pre-existing modeling. We assume that, if a sink has already been
// modeled in a MaD model, then it doesn't belong to any additional sink types, and we don't need to reexamine it.
not CharacteristicsImpl::isSink(endpoint, _, _) and
meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input, parameterName) and
// The message is the concatenation of all sink types for which this endpoint is known neither to be a sink nor to be
// a non-sink, and we surface only endpoints that have at least one such sink type.
message =
strictconcat(AutomodelEndpointTypes::SinkType sinkType |
not CharacteristicsImpl::isKnownSink(endpoint, sinkType) and
CharacteristicsImpl::isSinkCandidate(endpoint, sinkType)
|
sinkType, ", "
)
select endpoint,
message + "\nrelated locations: $@, $@." + "\nmetadata: $@, $@, $@, $@, $@, $@, $@.", //
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, MethodDoc()), "MethodDoc", //
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, ClassDoc()), "ClassDoc", //
package, "package", //
type, "type", //
subtypes, "subtypes", //
name, "name", //
signature, "signature", //
input, "input", //
parameterName, "parameterName" //

View File

@@ -1,49 +0,0 @@
/**
* Surfaces endpoints that are non-sinks with high confidence, for use as negative examples in the prompt.
*
* @name Negative examples (framework mode)
* @kind problem
* @problem.severity recommendation
* @id java/ml/extract-automodel-framework-negative-examples
* @tags internal extract automodel framework-mode negative examples
*/
private import AutomodelFrameworkModeCharacteristics
private import AutomodelEndpointTypes
private import AutomodelJavaUtil
from
Endpoint endpoint, EndpointCharacteristic characteristic, float confidence,
DollarAtString message, FrameworkModeMetadataExtractor meta, DollarAtString package,
DollarAtString type, DollarAtString subtypes, DollarAtString name, DollarAtString signature,
DollarAtString input, DollarAtString parameterName
where
characteristic.appliesToEndpoint(endpoint) and
confidence >= SharedCharacteristics::highConfidence() and
characteristic.hasImplications(any(NegativeSinkType negative), true, confidence) and
// Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly
// certain about in the prompt.
not erroneousEndpoints(endpoint, _, _, _, _, false) and
meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input, parameterName) and
// It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be
// treated by the actual query as a sanitizer, since the final logic is something like
// `isSink(n) and not isSanitizer(n)`. We don't want to include such nodes as negative examples in the prompt, because
// they're ambiguous and might confuse the model, so we explicitly exclude all known sinks from the negative examples.
not exists(EndpointCharacteristic characteristic2, float confidence2, SinkType positiveType |
not positiveType instanceof NegativeSinkType and
characteristic2.appliesToEndpoint(endpoint) and
confidence2 >= SharedCharacteristics::maximalConfidence() and
characteristic2.hasImplications(positiveType, true, confidence2)
) and
message = characteristic
select endpoint,
message + "\nrelated locations: $@, $@." + "\nmetadata: $@, $@, $@, $@, $@, $@, $@.", //
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, MethodDoc()), "MethodDoc", //
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, ClassDoc()), "ClassDoc", //
package, "package", //
type, "type", //
subtypes, "subtypes", //
name, "name", //
signature, "signature", //
input, "input", //
parameterName, "parameterName" //

View File

@@ -1,36 +0,0 @@
/**
* Surfaces endpoints that are sinks with high confidence, for use as positive examples in the prompt.
*
* @name Positive examples (framework mode)
* @kind problem
* @problem.severity recommendation
* @id java/ml/extract-automodel-framework-positive-examples
* @tags internal extract automodel framework-mode positive examples
*/
private import AutomodelFrameworkModeCharacteristics
private import AutomodelEndpointTypes
private import AutomodelJavaUtil
from
Endpoint endpoint, SinkType sinkType, FrameworkModeMetadataExtractor meta, DollarAtString package,
DollarAtString type, DollarAtString subtypes, DollarAtString name, DollarAtString signature,
DollarAtString input, DollarAtString parameterName
where
// Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly
// certain about in the prompt.
not erroneousEndpoints(endpoint, _, _, _, _, false) and
meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input, parameterName) and
// Extract positive examples of sinks belonging to the existing ATM query configurations.
CharacteristicsImpl::isKnownSink(endpoint, sinkType)
select endpoint,
sinkType + "\nrelated locations: $@, $@." + "\nmetadata: $@, $@, $@, $@, $@, $@, $@.", //
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, MethodDoc()), "MethodDoc", //
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, ClassDoc()), "ClassDoc", //
package, "package", //
type, "type", //
subtypes, "subtypes", //
name, "name", //
signature, "signature", //
input, "input", //
parameterName, "parameterName" //

View File

@@ -1,68 +0,0 @@
private import java
private import AutomodelEndpointTypes as AutomodelEndpointTypes
/**
* A helper class to represent a string value that can be returned by a query using $@ notation.
*
* It extends `string`, but adds a mock `hasLocationInfo` method that returns the string itself as the file name.
*
* Use this, when you want to return a string value from a query using $@ notation - the string value
* will be included in the sarif file.
*
*
* Background information on `hasLocationInfo`:
* https://codeql.github.com/docs/writing-codeql-queries/providing-locations-in-codeql-queries/#providing-location-information
*/
class DollarAtString extends string {
bindingset[this]
DollarAtString() { any() }
bindingset[this]
predicate hasLocationInfo(string path, int sl, int sc, int el, int ec) {
path = this and sl = 1 and sc = 1 and el = 1 and ec = 1
}
}
/**
* Holds for all combinations of MaD kinds (`kind`) and their human readable
* descriptions.
*/
predicate isKnownKind(string kind, AutomodelEndpointTypes::EndpointType type) {
kind = "path-injection" and
type instanceof AutomodelEndpointTypes::PathInjectionSinkType
or
kind = "sql-injection" and
type instanceof AutomodelEndpointTypes::SqlInjectionSinkType
or
kind = "request-forgery" and
type instanceof AutomodelEndpointTypes::RequestForgerySinkType
or
kind = "command-injection" and
type instanceof AutomodelEndpointTypes::CommandInjectionSinkType
}
/** Gets the models-as-data description for the method argument with the index `index`. */
bindingset[index]
string getArgumentForIndex(int index) {
index = -1 and result = "Argument[this]"
or
index >= 0 and result = "Argument[" + index + "]"
}
/**
* By convention, the subtypes property of the MaD declaration should only be
* true when there _can_ exist any subtypes with a different implementation.
*
* It would technically be ok to always use the value 'true', but this would
* break convention.
*/
pragma[nomagic]
boolean considerSubtypes(Callable callable) {
if
callable.isStatic() or
callable.getDeclaringType().isStatic() or
callable.isFinal() or
callable.getDeclaringType().isFinal()
then result = false
else result = true
}

View File

@@ -1,311 +0,0 @@
float maximalConfidence() { result = 1.0 }
float highConfidence() { result = 0.9 }
float mediumConfidence() { result = 0.6 }
/**
* A specification of how to instantiate the shared characteristics for a given candidate class.
*
* The `CandidateSig` implementation specifies a type to use for Endpoints (eg., `ParameterNode`), as well as a type
* to label endpoint classes (the `EndpointType`). One of the endpoint classes needs to be a 'negative' class, meaning
* "not any of the other known endpoint types".
*/
signature module CandidateSig {
/**
* An endpoint is a potential candidate for modeling. This will typically be bound to the language's
* DataFlow node class, or a subtype thereof.
*/
class Endpoint;
/**
* A related location for an endpoint. This will typically be bound to the supertype of all AST nodes (eg., `Top`).
*/
class RelatedLocation;
/**
* A label for a related location.
*
* Eg., method-doc, class-doc, etc.
*/
class RelatedLocationType;
/**
* A class kind for an endpoint.
*/
class EndpointType extends string;
/**
* An EndpointType that denotes the absence of any sink.
*/
class NegativeEndpointType extends EndpointType;
/**
* Gets the endpoint as a location.
*
* This is a utility function to convert an endpoint to its corresponding location.
*/
RelatedLocation asLocation(Endpoint e);
/**
* Defines what MaD kinds are known, and what endpoint type they correspond to.
*/
predicate isKnownKind(string kind, EndpointType type);
/**
* Holds if `e` is a flow sanitizer, and has type `t`.
*/
predicate isSanitizer(Endpoint e, EndpointType t);
/**
* Holds if `e` is a sink with the label `kind`, and provenance `provenance`.
*/
predicate isSink(Endpoint e, string kind, string provenance);
/**
* Holds if `e` is not a sink of any kind.
*/
predicate isNeutral(Endpoint e);
/**
* Gets a related location.
*
* A related location is a source code location that may hold extra information about an endpoint that can be useful
* to the machine learning model.
*
* For example, a related location for a method call may be the documentation comment of a method.
*/
RelatedLocation getRelatedLocation(Endpoint e, RelatedLocationType name);
}
/**
* A set of shared characteristics for a given candidate class.
*
* This module is language-agnostic, although the `CandidateSig` module will be language-specific.
*
* The language specific implementation can also further extend the behavior of this module by adding additional
* implementations of endpoint characteristics exported by this module.
*/
module SharedCharacteristics<CandidateSig Candidate> {
predicate isSink = Candidate::isSink/3;
predicate isNeutral = Candidate::isNeutral/1;
/**
* Holds if `sink` is a known sink of type `endpointType`.
*/
predicate isKnownSink(Candidate::Endpoint sink, Candidate::EndpointType endpointType) {
// If the list of characteristics includes positive indicators with maximal confidence for this class, then it's a
// known sink for the class.
not endpointType instanceof Candidate::NegativeEndpointType and
exists(EndpointCharacteristic characteristic |
characteristic.appliesToEndpoint(sink) and
characteristic.hasImplications(endpointType, true, maximalConfidence())
)
}
/**
* Holds if the candidate sink `candidateSink` should be considered as a possible sink of type `sinkType`, and
* classified by the ML model. A candidate sink is a node that cannot be excluded from `sinkType` based on its
* characteristics.
*/
predicate isSinkCandidate(Candidate::Endpoint candidateSink, Candidate::EndpointType sinkType) {
not sinkType instanceof Candidate::NegativeEndpointType and
not exists(getAReasonSinkExcluded(candidateSink, sinkType))
}
/**
* Gets the related location of `e` with name `name`, if it exists.
* Otherwise, gets the candidate itself.
*/
Candidate::RelatedLocation getRelatedLocationOrCandidate(
Candidate::Endpoint e, Candidate::RelatedLocationType type
) {
if exists(Candidate::getRelatedLocation(e, type))
then result = Candidate::getRelatedLocation(e, type)
else result = Candidate::asLocation(e)
}
/**
* Gets the list of characteristics that cause `candidateSink` to be excluded as an effective sink for a given sink
* type.
*/
EndpointCharacteristic getAReasonSinkExcluded(
Candidate::Endpoint candidateSink, Candidate::EndpointType sinkType
) {
// An endpoint is a sink candidate if none of its characteristics give much indication whether or not it is a sink.
not sinkType instanceof Candidate::NegativeEndpointType and
result.appliesToEndpoint(candidateSink) and
(
// Exclude endpoints that have a characteristic that implies they're not sinks for _any_ sink type.
exists(float confidence |
confidence >= mediumConfidence() and
result.hasImplications(any(Candidate::NegativeEndpointType t), true, confidence)
)
or
// Exclude endpoints that have a characteristic that implies they're not sinks for _this particular_ sink type.
exists(float confidence |
confidence >= mediumConfidence() and
result.hasImplications(sinkType, false, confidence)
)
)
}
/**
* A set of characteristics that a particular endpoint might have. This set of characteristics is used to make decisions
* about whether to include the endpoint in the training set and with what kind, as well as whether to score the
* endpoint at inference time.
*/
abstract class EndpointCharacteristic extends string {
/**
* Holds for the string that is the name of the characteristic. This should describe some property of an endpoint
* that is meaningful for determining whether it's a sink, and if so, of which sink type.
*/
bindingset[this]
EndpointCharacteristic() { any() }
/**
* Holds for endpoints that have this characteristic.
*/
abstract predicate appliesToEndpoint(Candidate::Endpoint n);
/**
* This predicate describes what the characteristic tells us about an endpoint.
*
* Params:
* endpointType: The sink/source type.
* isPositiveIndicator: If true, this characteristic indicates that this endpoint _is_ a member of the class; if
* false, it indicates that it _isn't_ a member of the class.
* confidence: A float in [0, 1], which tells us how strong an indicator this characteristic is for the endpoint
* belonging / not belonging to the given class. A confidence near zero means this characteristic is a very weak
* indicator of whether or not the endpoint belongs to the class. A confidence of 1 means that all endpoints with
* this characteristic definitively do/don't belong to the class.
*/
abstract predicate hasImplications(
Candidate::EndpointType endpointType, boolean isPositiveIndicator, float confidence
);
/** Indicators with confidence at or above this threshold are considered to be high-confidence indicators. */
final float getHighConfidenceThreshold() { result = 0.8 }
}
/**
* A high-confidence characteristic that indicates that an endpoint is a sink of a specified type. These endpoints can
* be used as positive samples for training or for a few-shot prompt.
*/
abstract class SinkCharacteristic extends EndpointCharacteristic {
bindingset[this]
SinkCharacteristic() { any() }
abstract Candidate::EndpointType getSinkType();
final override predicate hasImplications(
Candidate::EndpointType endpointType, boolean isPositiveIndicator, float confidence
) {
endpointType = this.getSinkType() and
isPositiveIndicator = true and
confidence = maximalConfidence()
}
}
/**
* A high-confidence characteristic that indicates that an endpoint is not a sink of any type. These endpoints can be
* used as negative samples for training or for a few-shot prompt.
*/
abstract class NotASinkCharacteristic extends EndpointCharacteristic {
bindingset[this]
NotASinkCharacteristic() { any() }
override predicate hasImplications(
Candidate::EndpointType endpointType, boolean isPositiveIndicator, float confidence
) {
endpointType instanceof Candidate::NegativeEndpointType and
isPositiveIndicator = true and
confidence = highConfidence()
}
}
/**
* A medium-confidence characteristic that indicates that an endpoint is unlikely to be a sink of any type. These
* endpoints can be excluded from scoring at inference time, both to save time and to avoid false positives. They should
* not, however, be used as negative samples for training or for a few-shot prompt, because they may include a small
* number of sinks.
*/
abstract class LikelyNotASinkCharacteristic extends EndpointCharacteristic {
bindingset[this]
LikelyNotASinkCharacteristic() { any() }
override predicate hasImplications(
Candidate::EndpointType endpointType, boolean isPositiveIndicator, float confidence
) {
endpointType instanceof Candidate::NegativeEndpointType and
isPositiveIndicator = true and
confidence = mediumConfidence()
}
}
/**
* A characteristic that indicates not necessarily that an endpoint is not a sink, but rather that it is not a sink
* that's interesting to model in the standard Java libraries. These filters should be removed when extracting sink
* candidates within a user's codebase for customized modeling.
*
* These endpoints should not be used as negative samples for training or for a few-shot prompt, because they are not
* necessarily non-sinks.
*/
abstract class UninterestingToModelCharacteristic extends EndpointCharacteristic {
bindingset[this]
UninterestingToModelCharacteristic() { any() }
override predicate hasImplications(
Candidate::EndpointType endpointType, boolean isPositiveIndicator, float confidence
) {
endpointType instanceof Candidate::NegativeEndpointType and
isPositiveIndicator = true and
confidence = mediumConfidence()
}
}
/**
* Contains default implementations that are derived solely from the `CandidateSig` implementation.
*/
private module DefaultCharacteristicImplementations {
/**
* Endpoints identified as sinks by the `CandidateSig` implementation are sinks with maximal confidence.
*/
private class KnownSinkCharacteristic extends SinkCharacteristic {
string madKind;
Candidate::EndpointType endpointType;
KnownSinkCharacteristic() {
Candidate::isKnownKind(madKind, endpointType) and
// bind "this" to a unique string differing from that of the SinkType classes
this = madKind + "-characteristic"
}
override predicate appliesToEndpoint(Candidate::Endpoint e) {
Candidate::isSink(e, madKind, _)
}
override Candidate::EndpointType getSinkType() { result = endpointType }
}
/**
* A negative characteristic that indicates that an endpoint was manually modeled as a neutral model.
*/
private class NeutralModelCharacteristic extends NotASinkCharacteristic {
NeutralModelCharacteristic() { this = "known non-sink" }
override predicate appliesToEndpoint(Candidate::Endpoint e) { Candidate::isNeutral(e) }
}
/**
* A negative characteristic that indicates that an endpoint is not part of the source code for the project being
* analyzed.
*/
private class IsSanitizerCharacteristic extends NotASinkCharacteristic {
IsSanitizerCharacteristic() { this = "known sanitizer" }
override predicate appliesToEndpoint(Candidate::Endpoint e) { Candidate::isSanitizer(e, _) }
}
}
}

View File

@@ -1,21 +0,0 @@
/**
* An automodel extraction mode instantiates this interface to define how to access
* the callable that's associated with an endpoint.
*/
signature module GetCallableSig {
/**
* A callable is the definition of a method, function, etc. - something that can be called.
*/
class Callable;
/**
* An endpoint is a potential candidate for modeling. This will typically be bound to the language's
* DataFlow node class, or a subtype thereof.
*/
class Endpoint;
/**
* Gets the callable that's associated with the given endpoint.
*/
Callable getCallable(Endpoint endpoint);
}

View File

@@ -1,62 +0,0 @@
/**
* This file contains query predicates for use when gathering metrics at scale using Multi Repo
* Variant Analysis.
*/
private import java
private import AutomodelAlertSinkUtil
/**
* Holds if `alertCount` is the number of alerts for the query with ID `queryId` for which the
* sinks correspond to the given `ai-generated` sink model.
*/
query predicate sinkModelCountPerQuery(
string queryId, int alertCount, string package, string type, boolean subtypes, string name,
string signature, string input, string ext, string kind, string provenance
) {
exists(SinkModel s |
sinkModelTallyPerQuery(queryId, alertCount, s) and
s.getProvenance() = "ai-generated" and
s.getPackage() = package and
s.getType() = type and
s.getSubtypes() = subtypes and
s.getName() = name and
s.getSignature() = signature and
s.getInput() = input and
s.getExt() = ext and
s.getKind() = kind and
s.getProvenance() = provenance
)
}
/**
* Holds if `instanceCount` is the number of instances corresponding to the given `ai-generated`
* sink model (as identified by the `package`, `name`, `input`, etc.).
*/
query predicate instanceCount(
int instanceCount, string package, string type, boolean subtypes, string name, string signature,
string input, string ext, string kind, string provenance
) {
exists(SinkModel s |
instanceCount = s.getInstanceCount() and
instanceCount > 0 and
s.getProvenance() = "ai-generated" and
s.getPackage() = package and
s.getType() = type and
s.getSubtypes() = subtypes and
s.getName() = name and
s.getSignature() = signature and
s.getInput() = input and
s.getExt() = ext and
s.getKind() = kind and
s.getProvenance() = provenance
)
}
// MRVA requires a select clause, so we repurpose it to tell us which query predicates had results.
from string hadResults
where
sinkModelCountPerQuery(_, _, _, _, _, _, _, _, _, _, _) and hadResults = "sinkModelCountPerQuery"
or
instanceCount(_, _, _, _, _, _, _, _, _, _) and hadResults = "instanceCount"
select hadResults