Refactor the CodeQL code that extracts metadata for methods presented to Codex, to make it easy to add another field

This commit is contained in:
tiferet
2023-02-01 12:14:03 -08:00
parent f32bb65c54
commit d3a5ee53c6
4 changed files with 32 additions and 9 deletions

View File

@@ -70,6 +70,35 @@ predicate erroneousConfidences(
predicate isTypeAccess(DataFlow::Node n) { n.asExpr() instanceof TypeAccess }
/**
 * Holds if `n` is an argument of a call and `metadata` describes the callee of that call together with the
 * argument position of `n`.
 *
 * This helper predicate exports the information needed about each endpoint in the sink candidate query, as well as
 * in the queries that extract positive and negative examples for the prompt / training set. The metadata is
 * rendered as a single string in the format of a Python dictionary.
 */
predicate hasMetadata(DataFlow::Node n, string metadata) {
  exists(
    Call invocation, Callable target, int argPos, string packageName, string typeName,
    string calleeName, string paramSignature, string argumentSpec, boolean subtypes, string ext,
    string provenance
  |
    // Tie the endpoint to the call it is an argument of, and to the callee of that call.
    target = invocation.getCallee() and
    n.asExpr() = invocation.getArgument(argPos) and
    // Fields derived from the callee and the argument position.
    packageName = target.getDeclaringType().getPackage().getName() and
    typeName = target.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations?
    calleeName = target.getName() and // TODO: Will this work for constructors?
    paramSignature = target.paramsString() and
    argumentSpec = "Argument[" + argPos + "]" and // TODO: why are slashes added?
    // Fields that are currently fixed values.
    subtypes = true and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
    ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
    provenance = "manual" and // TODO
    // Assemble the dictionary-style string, one entry per line. Note that `subtypes` is the only value
    // emitted without surrounding single quotes.
    metadata =
      "{'Package': '" + packageName +
        "', 'Type': '" + typeName +
        "', 'Subtypes': " + subtypes +
        ", 'Name': '" + calleeName +
        "', 'Signature': '" + paramSignature +
        "', 'Ext': '" + ext +
        "', 'Argument index': '" + argumentSpec +
        "', 'Provenance': '" + provenance +
        "'}" // TODO: Why are the curly braces added twice?
  )
}
/**
* A set of characteristics that a particular endpoint might have. This set of characteristics is used to make decisions
* about whether to include the endpoint in the training set and with what label, as well as whether to score the

View File

@@ -54,7 +54,5 @@ where
message =
"Non-sink of type " + characteristic + " with confidence " + confidence.toString() + "\n" +
// Extract the needed metadata for this endpoint.
any(string concatenatedMetadata |
EndpointCharacteristics::hasMetaData(endpoint, concatenatedMetadata)
)
any(string metadata | EndpointCharacteristics::hasMetadata(endpoint, metadata))
select endpoint, message

View File

@@ -34,7 +34,5 @@ where
message =
config.getASinkEndpointType().getDescription() + "\n" +
// Extract the needed metadata for this endpoint.
any(string concatenatedMetadata |
EndpointCharacteristics::hasMetaData(sink, concatenatedMetadata)
)
any(string metadata | EndpointCharacteristics::hasMetadata(sink, metadata))
select sink, message

View File

@@ -31,7 +31,5 @@ where
config.getASinkEndpointType().getDescription(), ", "
) + "\n" +
// Extract the needed metadata for this endpoint.
any(string concatenatedMetadata |
EndpointCharacteristics::hasMetaData(sink, concatenatedMetadata)
)
any(string metadata | EndpointCharacteristics::hasMetadata(sink, metadata))
select sink, message