From d3a5ee53c6be3e89de3bb5026aaff3f9d7fdb59b Mon Sep 17 00:00:00 2001 From: tiferet Date: Wed, 1 Feb 2023 12:14:03 -0800 Subject: [PATCH] Refactor the CodeQL code that extracts metadata for methods presented to Codex, to make it easy to add another field --- .../EndpointCharacteristics.qll | 29 +++++++++++++++++++ .../src/ExtractNegativeExamples.ql | 4 +-- .../src/ExtractPositiveExamples.ql | 4 +-- .../src/ExtractSinkCandidatesWithFlow.ql | 4 +-- 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/java/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll b/java/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll index 403d2a9898d..0a9b5cc21d4 100644 --- a/java/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll +++ b/java/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll @@ -70,6 +70,35 @@ predicate erroneousConfidences( predicate isTypeAccess(DataFlow::Node n) { n.asExpr() instanceof TypeAccess } +/** + * Holds if `n` is an argument of a call and `metadata` describes the callee of that call and the argument index. + * + * This is a helper predicate to extract and export needed information about each endpoint in the sink candidate query as + * well as the queries that extract positive and negative examples for the prompt / training set. The metadata is + * extracted as a string in the format of a Python dictionary. + */ +predicate hasMetadata(DataFlow::Node n, string metadata) { + exists( + Callable callee, Call call, int index, string package, string type, boolean subtypes, + string name, string signature, string ext, string input, string provenance + | + n.asExpr() = call.getArgument(index) and + callee = call.getCallee() and + package = callee.getDeclaringType().getPackage().getName() and + type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? 
What about lambdas? What about enums? What about interfaces? What about annotations? + subtypes = true and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069 + name = callee.getName() and // TODO: Will this work for constructors? + signature = callee.paramsString() and + ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069 + input = "Argument[" + index + "]" and // TODO: why are slashes added? + provenance = "manual" and // TODO + metadata = + "{'Package': '" + package + "', 'Type': '" + type + "', 'Subtypes': " + subtypes + + ", 'Name': '" + name + "', 'Signature': '" + signature + "', 'Ext': '" + ext + + "', 'Argument index': '" + input + "', 'Provenance': '" + provenance + "'}" // TODO: Why are the curly braces added twice? + ) +} + /** * A set of characteristics that a particular endpoint might have. This set of characteristics is used to make decisions * about whether to include the endpoint in the training set and with what label, as well as whether to score the diff --git a/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql b/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql index 04bbf88616d..8a90d99f2a8 100644 --- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql +++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql @@ -54,7 +54,5 @@ where message = "Non-sink of type " + characteristic + " with confidence " + confidence.toString() + "\n" + // Extract the needed metadata for this endpoint. 
- any(string concatenatedMetadata | - EndpointCharacteristics::hasMetaData(endpoint, concatenatedMetadata) - ) + any(string metadata | EndpointCharacteristics::hasMetadata(endpoint, metadata)) select endpoint, message diff --git a/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql b/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql index 00a906e0f2e..dc58251b10e 100644 --- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql +++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql @@ -34,7 +34,5 @@ where message = config.getASinkEndpointType().getDescription() + "\n" + // Extract the needed metadata for this endpoint. - any(string concatenatedMetadata | - EndpointCharacteristics::hasMetaData(sink, concatenatedMetadata) - ) + any(string metadata | EndpointCharacteristics::hasMetadata(sink, metadata)) select sink, message diff --git a/java/ql/experimental/adaptivethreatmodeling/src/ExtractSinkCandidatesWithFlow.ql b/java/ql/experimental/adaptivethreatmodeling/src/ExtractSinkCandidatesWithFlow.ql index e0eabc349cf..a7d296e243a 100644 --- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractSinkCandidatesWithFlow.ql +++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractSinkCandidatesWithFlow.ql @@ -31,7 +31,5 @@ where config.getASinkEndpointType().getDescription(), ", " ) + "\n" + // Extract the needed metadata for this endpoint. - any(string concatenatedMetadata | - EndpointCharacteristics::hasMetaData(sink, concatenatedMetadata) - ) + any(string metadata | EndpointCharacteristics::hasMetadata(sink, metadata)) select sink, message