From f32bb65c540bb8f93a56cfa6be1502bd1c0bc449 Mon Sep 17 00:00:00 2001 From: tiferet Date: Tue, 31 Jan 2023 13:02:50 -0800 Subject: [PATCH] Refactor the CodeQL code that extracts metadata for methods presented to Codex, to make it easy to add another field --- .../src/ExtractNegativeExamples.ql | 24 +++++------------- .../src/ExtractPositiveExamples.ql | 25 +++++-------------- .../src/ExtractSinkCandidatesWithFlow.ql | 25 ++++++------------- 3 files changed, 19 insertions(+), 55 deletions(-) diff --git a/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql b/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql index b83dfea3cc6..04bbf88616d 100644 --- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql +++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql @@ -27,8 +27,7 @@ DataFlow::Node getSampleFromSampleRate(float rate) { from DataFlow::Node endpoint, EndpointCharacteristics::EndpointCharacteristic characteristic, - float confidence, string message, string package, string type, boolean subtypes, string name, - string signature, string ext, string input, string provenance + float confidence, string message where characteristic.appliesToEndpoint(endpoint) and confidence >= characteristic.highConfidence() and @@ -52,21 +51,10 @@ where characteristic2.hasImplications(positiveType, true, confidence2) ) and endpoint = getSampleFromSampleRate(0.01) and - exists(Callable callee, Call call, int index | - endpoint.asExpr() = call.getArgument(index) and - callee = call.getCallee() and - package = callee.getDeclaringType().getPackage().getName() and - type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations? - subtypes = true and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069 - name = callee.getName() and // TODO: Will this work for constructors? - signature = callee.paramsString() and - ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069 - input = "Argument[" + index + "]" and // TODO: why are slashes added? - provenance = "manual" // TODO - ) and message = - "Non-sink of type " + characteristic + " with confidence " + confidence.toString() + - "\n{'Package': '" + package + "', 'Type': '" + type + "', 'Subtypes': " + subtypes + - ", 'Name': '" + name + "', 'Signature': '" + signature + "', 'Ext': '" + ext + - "', 'Argument index': '" + input + "', 'Provenance': '" + provenance + "'}" // TODO: Why are the curly braces added twice? + "Non-sink of type " + characteristic + " with confidence " + confidence.toString() + "\n" + + // Extract the needed metadata for this endpoint. + any(string concatenatedMetadata | + EndpointCharacteristics::hasMetaData(endpoint, concatenatedMetadata) + ) select endpoint, message diff --git a/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql b/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql index 959181c7c10..00a906e0f2e 100644 --- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql +++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql @@ -21,9 +21,7 @@ private import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestF * the ML-gnerarated, noisy sinks will end up poluting the positive examples used in the prompt! */ -from - DataFlow::Node sink, AtmConfig::AtmConfig config, string message, string package, string type, - boolean subtypes, string name, string signature, string ext, string input, string provenance +from DataFlow::Node sink, AtmConfig::AtmConfig config, string message where config.isKnownSink(sink) and // If there are _any_ erroneous endpoints, return nothing. This will prevent us from accidentally running this query @@ -33,21 +31,10 @@ where // treated by the actual query as a sanitizer, since the final logic is something like // `isSink(n) and not isSanitizer(n)`. We don't want to include such nodes as positive examples in the prompt. not config.isSanitizer(sink) and - exists(Callable callee, Call call, int index | - sink.asExpr() = call.getArgument(index) and - callee = call.getCallee() and - package = callee.getDeclaringType().getPackage().getName() and - type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations? - subtypes = true and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069 - name = callee.getName() and // TODO: Will this work for constructors? - signature = callee.paramsString() and - ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069 - input = "Argument[" + index + "]" and // TODO: why are slashes added? - provenance = "manual" // TODO - ) and message = - config.getASinkEndpointType().getDescription() + "\n{'Package': '" + package + "', 'Type': '" + - type + "', 'Subtypes': " + subtypes + ", 'Name': '" + name + "', 'Signature': '" + signature + - "', 'Ext': '" + ext + "', 'Argument index': '" + input + "', 'Provenance': '" + provenance + - "'}" // TODO: Why are the curly braces added twice? + config.getASinkEndpointType().getDescription() + "\n" + + // Extract the needed metadata for this endpoint. + any(string concatenatedMetadata | + EndpointCharacteristics::hasMetaData(sink, concatenatedMetadata) + ) select sink, message diff --git a/java/ql/experimental/adaptivethreatmodeling/src/ExtractSinkCandidatesWithFlow.ql b/java/ql/experimental/adaptivethreatmodeling/src/ExtractSinkCandidatesWithFlow.ql index 0a56397a91e..e0eabc349cf 100644 --- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractSinkCandidatesWithFlow.ql +++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractSinkCandidatesWithFlow.ql @@ -13,27 +13,14 @@ private import java import semmle.code.java.dataflow.TaintTracking +private import experimental.adaptivethreatmodeling.EndpointCharacteristics as EndpointCharacteristics private import experimental.adaptivethreatmodeling.ATMConfig as AtmConfig private import experimental.adaptivethreatmodeling.SqlInjectionATM as SqlInjectionAtm private import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm private import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestForgeryAtm -from - DataFlow::Node sink, string message, string package, string type, boolean subtypes, string name, - string signature, string ext, string input, string provenance +from DataFlow::Node sink, string message where - exists(Callable callee, Call call, int index | - sink.asExpr() = call.getArgument(index) and - callee = call.getCallee() and - package = callee.getDeclaringType().getPackage().getName() and - type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations? - subtypes = true and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069 - name = callee.getName() and // TODO: Will this work for constructors? - signature = callee.paramsString() and - ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069 - input = "Argument[" + index + "]" and // TODO: why are slashes added? - provenance = "manual" // TODO - ) and // The message is the concatenation of all relevant configs, and we surface only sinks that have at least one relevant // config. message = @@ -42,7 +29,9 @@ where sinkPathNode.getNode() = sink | config.getASinkEndpointType().getDescription(), ", " - ) + "\n{'Package': '" + package + "', 'Type': '" + type + "', 'Subtypes': " + subtypes + - ", 'Name': '" + name + "', 'Signature': '" + signature + "', 'Ext': '" + ext + - "', 'Argument index': '" + input + "', 'Provenance': '" + provenance + "'}" // TODO: Why are the curly braces added twice? + ) + "\n" + + // Extract the needed metadata for this endpoint. + any(string concatenatedMetadata | + EndpointCharacteristics::hasMetaData(sink, concatenatedMetadata) + ) select sink, message