add typecheckable mechanism to enforce minimal set of metadata

This commit is contained in:
Stephan Brandauer
2023-05-03 16:00:50 +02:00
parent 6d29273c43
commit 32f2614fe0
5 changed files with 85 additions and 100 deletions

View File

@@ -12,9 +12,11 @@
* @tags internal automodel extract candidates
*/
import AutomodelEndpointCharacteristics
private import AutomodelFrameworkModeCharacteristics
from Endpoint endpoint, string message
from
Endpoint endpoint, string message, MetadataExtractor meta, string package, string type,
boolean subtypes, string name, string signature, int input
where
not exists(CharacteristicsImpl::UninterestingToModelCharacteristic u |
u.appliesToEndpoint(endpoint)
@@ -25,18 +27,20 @@ where
// overlap between our detected sinks and the pre-existing modeling. We assume that, if a sink has already been
// modeled in a MaD model, then it doesn't belong to any additional sink types, and we don't need to reexamine it.
not CharacteristicsImpl::isSink(endpoint, _) and
meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input) and
// The message is the concatenation of all sink types for which this endpoint is known neither to be a sink nor to be
// a non-sink, and we surface only endpoints that have at least one such sink type.
message =
strictconcat(AutomodelEndpointTypes::SinkType sinkType |
not CharacteristicsImpl::isKnownSink(endpoint, sinkType) and
CharacteristicsImpl::isSinkCandidate(endpoint, sinkType)
|
sinkType + ", "
) + "\n" +
// Extract the needed metadata for this endpoint.
any(string metadata | CharacteristicsImpl::hasMetadata(endpoint, metadata))
select endpoint, message + "\nrelated locations: $@, $@.", //
not CharacteristicsImpl::isKnownSink(endpoint, sinkType) and
CharacteristicsImpl::isSinkCandidate(endpoint, sinkType)
|
sinkType + ", "
)
select endpoint,
message + "\nrelated locations: $@, $@." + "\nmetadata: $@, $@, $@, $@, $@, $@.", //
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Callable-JavaDoc"),
"Callable-JavaDoc", //
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Class-JavaDoc"), "Class-JavaDoc" //
"Callable-JavaDoc", CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Class-JavaDoc"),
"Class-JavaDoc", //
package, "package", type, "type", subtypes.toString(), "subtypes", name, "name", signature,
"signature", input.toString(), "input" //

View File

@@ -8,10 +8,13 @@
* @tags internal automodel extract examples negative
*/
import AutomodelEndpointCharacteristics
import AutomodelEndpointTypes
private import AutomodelFrameworkModeCharacteristics
private import AutomodelEndpointTypes
from Endpoint endpoint, EndpointCharacteristic characteristic, float confidence, string message
from
Endpoint endpoint, EndpointCharacteristic characteristic, float confidence, string message,
MetadataExtractor meta, string package, string type, boolean subtypes, string name,
string signature, int input
where
characteristic.appliesToEndpoint(endpoint) and
confidence >= SharedCharacteristics::highConfidence() and
@@ -19,6 +22,7 @@ where
// Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly
// certain about in the prompt.
not erroneousEndpoints(endpoint, _, _, _, _, false) and
meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input) and
// It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be
// treated by the actual query as a sanitizer, since the final logic is something like
// `isSink(n) and not isSanitizer(n)`. We don't want to include such nodes as negative examples in the prompt, because
@@ -29,11 +33,11 @@ where
confidence2 >= SharedCharacteristics::maximalConfidence() and
characteristic2.hasImplications(positiveType, true, confidence2)
) and
message =
characteristic + "\n" +
// Extract the needed metadata for this endpoint.
any(string metadata | CharacteristicsImpl::hasMetadata(endpoint, metadata))
select endpoint, message + "\nrelated locations: $@, $@.",
message = characteristic
select endpoint,
message + "\nrelated locations: $@, $@." + "\nmetadata: $@, $@, $@, $@, $@, $@.", //
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Callable-JavaDoc"),
"Callable-JavaDoc", //
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Class-JavaDoc"), "Class-JavaDoc" //
"Callable-JavaDoc", CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Class-JavaDoc"),
"Class-JavaDoc", //
package, "package", type, "type", subtypes.toString(), "subtypes", name, "name", signature,
"signature", input.toString(), "input" //

View File

@@ -8,25 +8,23 @@
* @tags internal automodel extract examples positive
*/
private import java
private import semmle.code.java.security.ExternalAPIs as ExternalAPIs
private import AutomodelEndpointCharacteristics
private import AutomodelFrameworkModeCharacteristics
private import AutomodelEndpointTypes
from Endpoint sink, SinkType sinkType, string message
from
Endpoint endpoint, SinkType sinkType, MetadataExtractor meta, string package, string type,
boolean subtypes, string name, string signature, int input
where
// Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly
// certain about in the prompt.
not erroneousEndpoints(sink, _, _, _, _, false) and
not erroneousEndpoints(endpoint, _, _, _, _, false) and
meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input) and
// Extract positive examples of sinks belonging to the existing ATM query configurations.
(
CharacteristicsImpl::isKnownSink(sink, sinkType) and
message =
sinkType + "\n" +
// Extract the needed metadata for this endpoint.
any(string metadata | CharacteristicsImpl::hasMetadata(sink, metadata))
)
select sink, message + "\nrelated locations: $@, $@.",
CharacteristicsImpl::getRelatedLocationOrCandidate(sink, "Callable-JavaDoc"),
"Callable-JavaDoc", //
CharacteristicsImpl::getRelatedLocationOrCandidate(sink, "Class-JavaDoc"), "Class-JavaDoc" //
CharacteristicsImpl::isKnownSink(endpoint, sinkType)
select endpoint,
sinkType + "\nrelated locations: $@, $@." + "\nmetadata: $@, $@, $@, $@, $@, $@.", //
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Callable-JavaDoc"),
"Callable-JavaDoc", CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Class-JavaDoc"),
"Class-JavaDoc", //
package, "package", type, "type", subtypes.toString(), "subtypes", name, "name", signature,
"signature", input.toString(), "input" //

View File

@@ -17,6 +17,22 @@ private import semmle.code.java.dataflow.internal.ModelExclusions as ModelExclus
import AutomodelSharedCharacteristics as SharedCharacteristics
import AutomodelEndpointTypes as AutomodelEndpointTypes
/**
 * Gets the callable that encloses the parameter node `e`.
 */
Callable getCallable(DataFlow::ParameterNode e) {
  e.getEnclosingCallable() = result
}
/**
 * A metadata extractor. Any Java extraction mode needs to implement exactly
 * one instance of this class.
 *
 * The extraction queries declare a variable of this type and require
 * `hasMetadata` to hold for every endpoint they select, so the set of metadata
 * columns is enforced by the type checker rather than by a free-form string.
 *
 * The class extends `string` only so that each concrete extractor can identify
 * itself with a string literal in its characteristic predicate.
 */
abstract class MetadataExtractor extends string {
// `this` ranges over `string`, so it has to be declared as bound for the
// unrestricted characteristic predicate below; concrete subclasses pin `this`
// to a specific literal.
bindingset[this]
MetadataExtractor() { any() }
/**
 * Holds if endpoint `e` has the metadata tuple
 * (`package`, `type`, `subtypes`, `name`, `signature`, `input`).
 *
 * NOTE(review): the columns appear to mirror the fields of a Models-as-Data
 * row; the exact meaning of each column is defined by the implementing
 * subclass. Confirm against the consumer of the extraction queries.
 */
abstract predicate hasMetadata(
DataFlow::ParameterNode e, string package, string type, boolean subtypes, string name,
string signature, int input
);
}
module FrameworkCandidatesImpl implements SharedCharacteristics::CandidateSig {
class Endpoint = DataFlow::ParameterNode;
@@ -87,26 +103,6 @@ module FrameworkCandidatesImpl implements SharedCharacteristics::CandidateSig {
exists(int paramIdx | e.isParameterOf(_, paramIdx) | input = "Argument[" + paramIdx + "]")
}
predicate hasMetadata(Endpoint e, string metadata) {
exists(
string package, string type, boolean subtypes, string name, string signature, int input,
boolean isPublic, boolean isFinal, boolean isStatic
|
hasMetadata(e, package, type, name, signature, input, isFinal, isStatic, isPublic) and
(if isFinal = true or isStatic = true then subtypes = false else subtypes = true) and
metadata =
"{" //
+ "'Package': '" + package //
+ "', 'Type': '" + type //
+ "', 'Subtypes': " + subtypes //
+ ", 'Name': '" + name //
+ ", 'ParamName': '" + e.toString() //
+ "', 'Signature': '" + signature //
+ "', 'Argument index': " + input //
+ "'}" // TODO: Why are the curly braces added twice?
)
}
RelatedLocation getRelatedLocation(Endpoint e, string name) {
name = "Callable-JavaDoc" and
result = getCallable(e).(Documentable).getJavadoc()
@@ -116,8 +112,6 @@ module FrameworkCandidatesImpl implements SharedCharacteristics::CandidateSig {
}
}
Callable getCallable(Endpoint e) { result = e.getEnclosingCallable() }
module CharacteristicsImpl = SharedCharacteristics::SharedCharacteristics<FrameworkCandidatesImpl>;
class EndpointCharacteristic = CharacteristicsImpl::EndpointCharacteristic;
@@ -129,32 +123,32 @@ class Endpoint = FrameworkCandidatesImpl::Endpoint;
*/
/**
* Holds if `n` has the given metadata.
*
* This is a helper function to extract and export needed information about each endpoint.
* A MetadataExtractor that extracts metadata for framework mode.
*/
predicate hasMetadata(
Endpoint n, string package, string type, string name, string signature, int input,
boolean isFinal, boolean isStatic, boolean isPublic
) {
exists(Callable callable |
n.asParameter() = callable.getParameter(input) and
package = callable.getDeclaringType().getPackage().getName() and
type = callable.getDeclaringType().getErasure().(RefType).nestedName() and
(
if callable.isStatic() or callable.getDeclaringType().isStatic()
then isStatic = true
else isStatic = false
) and
(
if callable.isFinal() or callable.getDeclaringType().isFinal()
then isFinal = true
else isFinal = false
) and
name = callable.getSourceDeclaration().getName() and
signature = ExternalFlow::paramsString(callable) and // TODO: Why are brackets being escaped (`\[\]` vs `[]`)?
(if callable.isPublic() then isPublic = true else isPublic = false)
)
class FrameworkModeMetadataExtractor extends MetadataExtractor {
  FrameworkModeMetadataExtractor() { this = "FrameworkModeMetadataExtractor" }

  /**
   * Holds if the parameter endpoint `e` has the given metadata:
   *
   * - `package`: name of the package of the type declaring the enclosing callable.
   * - `type`: nested name of the erasure of that declaring type.
   * - `subtypes`: `false` if the callable or its declaring type is static or
   *   final, `true` otherwise.
   * - `name`: name of the callable's source declaration.
   * - `signature`: the callable's parameter string as produced by
   *   `ExternalFlow::paramsString`.
   * - `input`: index of the parameter `e` within the callable.
   */
  override predicate hasMetadata(
    Endpoint e, string package, string type, boolean subtypes, string name, string signature,
    int input
  ) {
    exists(Callable callable |
      e.asParameter() = callable.getParameter(input) and
      package = callable.getDeclaringType().getPackage().getName() and
      type = callable.getDeclaringType().getErasure().(RefType).nestedName() and
      (
        // A static or final callable (or one declared in a static or final type)
        // is reported with `subtypes = false`; everything else with `true`.
        // This restores the polarity of the string-based implementation that
        // this class replaces, which had been inverted.
        if
          callable.isStatic() or
          callable.getDeclaringType().isStatic() or
          callable.isFinal() or
          callable.getDeclaringType().isFinal()
        then subtypes = false
        else subtypes = true
      ) and
      // `name` is the callable's name, not the parameter's: `e.toString()` was
      // previously exported under the separate `ParamName` key.
      name = callable.getSourceDeclaration().getName() and
      signature = ExternalFlow::paramsString(callable)
    )
  }
}
/*

View File

@@ -55,19 +55,6 @@ signature module CandidateSig {
*/
predicate isNeutral(Endpoint e);
/**
* Holds if `e` has the given metadata.
*
* This is a helper function to extract and export needed information about each endpoint in the sink candidate query
* as well as the queries that extract positive and negative examples for the prompt / training set. The metadata is
* extracted as a string in the format of a Python dictionary, eg.:
*
* `{'Package': 'com.foo.util', 'Type': 'HelperClass', ... }`.
*
* The meta data will be passed on to the machine learning code by the extraction queries.
*/
predicate hasMetadata(Endpoint e, string metadata);
RelatedLocation getRelatedLocation(Endpoint e, string name);
}
@@ -107,8 +94,6 @@ module SharedCharacteristics<CandidateSig Candidate> {
not exists(getAReasonSinkExcluded(candidateSink, sinkType))
}
predicate hasMetadata = Candidate::hasMetadata/2;
/**
* If it exists, gets a related location for a given endpoint or candidate.
* If it doesn't exist, returns the candidate itself as a 'null' value.