From 32f2614fe0e324dc1c4fc87d81a2f0abc35eda1b Mon Sep 17 00:00:00 2001 From: Stephan Brandauer Date: Wed, 3 May 2023 16:00:50 +0200 Subject: [PATCH] add typecheckable mechanism to enforce minimal set of metadata --- .../Telemetry/AutomodelExtractCandidates.ql | 28 +++--- .../AutomodelExtractNegativeExamples.ql | 24 ++--- .../AutomodelExtractPositiveExamples.ql | 30 +++---- .../AutomodelFrameworkModeCharacteristics.qll | 88 +++++++++---------- .../AutomodelSharedCharacteristics.qll | 15 ---- 5 files changed, 85 insertions(+), 100 deletions(-) diff --git a/java/ql/src/Telemetry/AutomodelExtractCandidates.ql b/java/ql/src/Telemetry/AutomodelExtractCandidates.ql index a0b575f2ccf..eb94f1698fc 100644 --- a/java/ql/src/Telemetry/AutomodelExtractCandidates.ql +++ b/java/ql/src/Telemetry/AutomodelExtractCandidates.ql @@ -12,9 +12,11 @@ * @tags internal automodel extract candidates */ -import AutomodelEndpointCharacteristics +private import AutomodelFrameworkModeCharacteristics -from Endpoint endpoint, string message +from + Endpoint endpoint, string message, MetadataExtractor meta, string package, string type, + boolean subtypes, string name, string signature, int input where not exists(CharacteristicsImpl::UninterestingToModelCharacteristic u | u.appliesToEndpoint(endpoint) @@ -25,18 +27,20 @@ where // overlap between our detected sinks and the pre-existing modeling. We assume that, if a sink has already been // modeled in a MaD model, then it doesn't belong to any additional sink types, and we don't need to reexamine it. not CharacteristicsImpl::isSink(endpoint, _) and + meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input) and // The message is the concatenation of all sink types for which this endpoint is known neither to be a sink nor to be // a non-sink, and we surface only endpoints that have at least one such sink type. 
message = strictconcat(AutomodelEndpointTypes::SinkType sinkType | - not CharacteristicsImpl::isKnownSink(endpoint, sinkType) and - CharacteristicsImpl::isSinkCandidate(endpoint, sinkType) - | - sinkType + ", " - ) + "\n" + - // Extract the needed metadata for this endpoint. - any(string metadata | CharacteristicsImpl::hasMetadata(endpoint, metadata)) -select endpoint, message + "\nrelated locations: $@, $@.", // + not CharacteristicsImpl::isKnownSink(endpoint, sinkType) and + CharacteristicsImpl::isSinkCandidate(endpoint, sinkType) + | + sinkType + ", " + ) +select endpoint, + message + "\nrelated locations: $@, $@." + "\nmetadata: $@, $@, $@, $@, $@, $@.", // CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Callable-JavaDoc"), - "Callable-JavaDoc", // - CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Class-JavaDoc"), "Class-JavaDoc" // + "Callable-JavaDoc", CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Class-JavaDoc"), + "Class-JavaDoc", // + package, "package", type, "type", subtypes.toString(), "subtypes", name, "name", signature, + "signature", input.toString(), "input" // diff --git a/java/ql/src/Telemetry/AutomodelExtractNegativeExamples.ql b/java/ql/src/Telemetry/AutomodelExtractNegativeExamples.ql index 694637862e5..86dac852487 100644 --- a/java/ql/src/Telemetry/AutomodelExtractNegativeExamples.ql +++ b/java/ql/src/Telemetry/AutomodelExtractNegativeExamples.ql @@ -8,10 +8,13 @@ * @tags internal automodel extract examples negative */ -import AutomodelEndpointCharacteristics -import AutomodelEndpointTypes +private import AutomodelFrameworkModeCharacteristics +private import AutomodelEndpointTypes -from Endpoint endpoint, EndpointCharacteristic characteristic, float confidence, string message +from + Endpoint endpoint, EndpointCharacteristic characteristic, float confidence, string message, + MetadataExtractor meta, string package, string type, boolean subtypes, string name, + string signature, int input where 
characteristic.appliesToEndpoint(endpoint) and confidence >= SharedCharacteristics::highConfidence() and @@ -19,6 +22,7 @@ where // Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly // certain about in the prompt. not erroneousEndpoints(endpoint, _, _, _, _, false) and + meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input) and // It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be // treated by the actual query as a sanitizer, since the final logic is something like // `isSink(n) and not isSanitizer(n)`. We don't want to include such nodes as negative examples in the prompt, because @@ -29,11 +33,11 @@ where confidence2 >= SharedCharacteristics::maximalConfidence() and characteristic2.hasImplications(positiveType, true, confidence2) ) and - message = - characteristic + "\n" + - // Extract the needed metadata for this endpoint. - any(string metadata | CharacteristicsImpl::hasMetadata(endpoint, metadata)) -select endpoint, message + "\nrelated locations: $@, $@.", + message = characteristic +select endpoint, + message + "\nrelated locations: $@, $@." 
+ "\nmetadata: $@, $@, $@, $@, $@, $@.", // CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Callable-JavaDoc"), - "Callable-JavaDoc", // - CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Class-JavaDoc"), "Class-JavaDoc" // + "Callable-JavaDoc", CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Class-JavaDoc"), + "Class-JavaDoc", // + package, "package", type, "type", subtypes.toString(), "subtypes", name, "name", signature, + "signature", input.toString(), "input" // diff --git a/java/ql/src/Telemetry/AutomodelExtractPositiveExamples.ql b/java/ql/src/Telemetry/AutomodelExtractPositiveExamples.ql index 62470d19c89..af84d3a2db4 100644 --- a/java/ql/src/Telemetry/AutomodelExtractPositiveExamples.ql +++ b/java/ql/src/Telemetry/AutomodelExtractPositiveExamples.ql @@ -8,25 +8,23 @@ * @tags internal automodel extract examples positive */ -private import java -private import semmle.code.java.security.ExternalAPIs as ExternalAPIs -private import AutomodelEndpointCharacteristics +private import AutomodelFrameworkModeCharacteristics private import AutomodelEndpointTypes -from Endpoint sink, SinkType sinkType, string message +from + Endpoint endpoint, SinkType sinkType, MetadataExtractor meta, string package, string type, + boolean subtypes, string name, string signature, int input where // Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly // certain about in the prompt. - not erroneousEndpoints(sink, _, _, _, _, false) and + not erroneousEndpoints(endpoint, _, _, _, _, false) and + meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input) and // Extract positive examples of sinks belonging to the existing ATM query configurations. - ( - CharacteristicsImpl::isKnownSink(sink, sinkType) and - message = - sinkType + "\n" + - // Extract the needed metadata for this endpoint. 
- any(string metadata | CharacteristicsImpl::hasMetadata(sink, metadata)) - ) -select sink, message + "\nrelated locations: $@, $@.", - CharacteristicsImpl::getRelatedLocationOrCandidate(sink, "Callable-JavaDoc"), - "Callable-JavaDoc", // - CharacteristicsImpl::getRelatedLocationOrCandidate(sink, "Class-JavaDoc"), "Class-JavaDoc" // + CharacteristicsImpl::isKnownSink(endpoint, sinkType) +select endpoint, + sinkType + "\nrelated locations: $@, $@." + "\nmetadata: $@, $@, $@, $@, $@, $@.", // + CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Callable-JavaDoc"), + "Callable-JavaDoc", CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, "Class-JavaDoc"), + "Class-JavaDoc", // + package, "package", type, "type", subtypes.toString(), "subtypes", name, "name", signature, + "signature", input.toString(), "input" // diff --git a/java/ql/src/Telemetry/AutomodelFrameworkModeCharacteristics.qll b/java/ql/src/Telemetry/AutomodelFrameworkModeCharacteristics.qll index d4228536822..2290e86cec4 100644 --- a/java/ql/src/Telemetry/AutomodelFrameworkModeCharacteristics.qll +++ b/java/ql/src/Telemetry/AutomodelFrameworkModeCharacteristics.qll @@ -17,6 +17,22 @@ private import semmle.code.java.dataflow.internal.ModelExclusions as ModelExclus import AutomodelSharedCharacteristics as SharedCharacteristics import AutomodelEndpointTypes as AutomodelEndpointTypes +Callable getCallable(DataFlow::ParameterNode e) { result = e.getEnclosingCallable() } + +/** + * A meta data extractor. Any Java extraction mode needs to implement exactly + * one instance of this class. 
+ */ +abstract class MetadataExtractor extends string { + bindingset[this] + MetadataExtractor() { any() } + + abstract predicate hasMetadata( + DataFlow::ParameterNode e, string package, string type, boolean subtypes, string name, + string signature, int input + ); +} + module FrameworkCandidatesImpl implements SharedCharacteristics::CandidateSig { class Endpoint = DataFlow::ParameterNode; @@ -87,26 +103,6 @@ module FrameworkCandidatesImpl implements SharedCharacteristics::CandidateSig { exists(int paramIdx | e.isParameterOf(_, paramIdx) | input = "Argument[" + paramIdx + "]") } - predicate hasMetadata(Endpoint e, string metadata) { - exists( - string package, string type, boolean subtypes, string name, string signature, int input, - boolean isPublic, boolean isFinal, boolean isStatic - | - hasMetadata(e, package, type, name, signature, input, isFinal, isStatic, isPublic) and - (if isFinal = true or isStatic = true then subtypes = false else subtypes = true) and - metadata = - "{" // - + "'Package': '" + package // - + "', 'Type': '" + type // - + "', 'Subtypes': " + subtypes // - + ", 'Name': '" + name // - + ", 'ParamName': '" + e.toString() // - + "', 'Signature': '" + signature // - + "', 'Argument index': " + input // - + "'}" // TODO: Why are the curly braces added twice? - ) - } - RelatedLocation getRelatedLocation(Endpoint e, string name) { name = "Callable-JavaDoc" and result = getCallable(e).(Documentable).getJavadoc() @@ -116,8 +112,6 @@ module FrameworkCandidatesImpl implements SharedCharacteristics::CandidateSig { } } -Callable getCallable(Endpoint e) { result = e.getEnclosingCallable() } - module CharacteristicsImpl = SharedCharacteristics::SharedCharacteristics; class EndpointCharacteristic = CharacteristicsImpl::EndpointCharacteristic; @@ -129,32 +123,32 @@ class Endpoint = FrameworkCandidatesImpl::Endpoint; */ /** - * Holds if `n` has the given metadata. 
- * - * This is a helper function to extract and export needed information about each endpoint. + * A MetadataExtractor that extracts metadata for framework mode; `subtypes` is `false` if the callable or its declaring type is static or final, `true` otherwise. */ -predicate hasMetadata( - Endpoint n, string package, string type, string name, string signature, int input, - boolean isFinal, boolean isStatic, boolean isPublic -) { - exists(Callable callable | - n.asParameter() = callable.getParameter(input) and - package = callable.getDeclaringType().getPackage().getName() and - type = callable.getDeclaringType().getErasure().(RefType).nestedName() and - ( - if callable.isStatic() or callable.getDeclaringType().isStatic() - then isStatic = true - else isStatic = false - ) and - ( - if callable.isFinal() or callable.getDeclaringType().isFinal() - then isFinal = true - else isFinal = false - ) and - name = callable.getSourceDeclaration().getName() and - signature = ExternalFlow::paramsString(callable) and // TODO: Why are brackets being escaped (`\[\]` vs `[]`)? - (if callable.isPublic() then isPublic = true else isPublic = false) - ) +class FrameworkModeMetadataExtractor extends MetadataExtractor { + FrameworkModeMetadataExtractor() { this = "FrameworkModeMetadataExtractor" } + + override predicate hasMetadata( + Endpoint e, string package, string type, boolean subtypes, string name, string signature, + int input + ) { + exists(Callable callable | + e.asParameter() = callable.getParameter(input) and + package = callable.getDeclaringType().getPackage().getName() and + type = callable.getDeclaringType().getErasure().(RefType).nestedName() and + ( + if + callable.isStatic() or + callable.getDeclaringType().isStatic() or + callable.isFinal() or + callable.getDeclaringType().isFinal() + then subtypes = false + else subtypes = true + ) and + name = e.toString() and + signature = ExternalFlow::paramsString(callable) + ) + } } /* diff --git a/java/ql/src/Telemetry/AutomodelSharedCharacteristics.qll b/java/ql/src/Telemetry/AutomodelSharedCharacteristics.qll index 
abb549317f8..2cbb346005c 100644 --- a/java/ql/src/Telemetry/AutomodelSharedCharacteristics.qll +++ b/java/ql/src/Telemetry/AutomodelSharedCharacteristics.qll @@ -55,19 +55,6 @@ signature module CandidateSig { */ predicate isNeutral(Endpoint e); - /** - * Holds if `e` has the given metadata. - * - * This is a helper function to extract and export needed information about each endpoint in the sink candidate query - * as well as the queries that extract positive and negative examples for the prompt / training set. The metadata is - * extracted as a string in the format of a Python dictionary, eg.: - * - * `{'Package': 'com.foo.util', 'Type': 'HelperClass', ... }`. - * - * The meta data will be passed on to the machine learning code by the extraction queries. - */ - predicate hasMetadata(Endpoint e, string metadata); - RelatedLocation getRelatedLocation(Endpoint e, string name); } @@ -107,8 +94,6 @@ module SharedCharacteristics { not exists(getAReasonSinkExcluded(candidateSink, sinkType)) } - predicate hasMetadata = Candidate::hasMetadata/2; - /** * If it exists, gets a related location for a given endpoint or candidate. * If it doesn't exist, returns the candidate itself as a 'null' value.