Refactor the CodeQL code that extracts metadata for methods presented to Codex, to make it easy to add another field

This commit is contained in:
tiferet
2023-01-31 13:02:50 -08:00
parent 633bfdba28
commit f32bb65c54
3 changed files with 19 additions and 55 deletions

View File

@@ -27,8 +27,7 @@ DataFlow::Node getSampleFromSampleRate(float rate) {
from
DataFlow::Node endpoint, EndpointCharacteristics::EndpointCharacteristic characteristic,
float confidence, string message, string package, string type, boolean subtypes, string name,
string signature, string ext, string input, string provenance
float confidence, string message
where
characteristic.appliesToEndpoint(endpoint) and
confidence >= characteristic.highConfidence() and
@@ -52,21 +51,10 @@ where
characteristic2.hasImplications(positiveType, true, confidence2)
) and
endpoint = getSampleFromSampleRate(0.01) and
exists(Callable callee, Call call, int index |
endpoint.asExpr() = call.getArgument(index) and
callee = call.getCallee() and
package = callee.getDeclaringType().getPackage().getName() and
type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations?
subtypes = true and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
name = callee.getName() and // TODO: Will this work for constructors?
signature = callee.paramsString() and
ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
input = "Argument[" + index + "]" and // TODO: why are slashes added?
provenance = "manual" // TODO
) and
message =
"Non-sink of type " + characteristic + " with confidence " + confidence.toString() +
"\n{'Package': '" + package + "', 'Type': '" + type + "', 'Subtypes': " + subtypes +
", 'Name': '" + name + "', 'Signature': '" + signature + "', 'Ext': '" + ext +
"', 'Argument index': '" + input + "', 'Provenance': '" + provenance + "'}" // TODO: Why are the curly braces added twice?
"Non-sink of type " + characteristic + " with confidence " + confidence.toString() + "\n" +
// Extract the needed metadata for this endpoint.
any(string concatenatedMetadata |
EndpointCharacteristics::hasMetaData(endpoint, concatenatedMetadata)
)
select endpoint, message

View File

@@ -21,9 +21,7 @@ private import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestF
* the ML-generated, noisy sinks will end up polluting the positive examples used in the prompt!
*/
from
DataFlow::Node sink, AtmConfig::AtmConfig config, string message, string package, string type,
boolean subtypes, string name, string signature, string ext, string input, string provenance
from DataFlow::Node sink, AtmConfig::AtmConfig config, string message
where
config.isKnownSink(sink) and
// If there are _any_ erroneous endpoints, return nothing. This will prevent us from accidentally running this query
@@ -33,21 +31,10 @@ where
// treated by the actual query as a sanitizer, since the final logic is something like
// `isSink(n) and not isSanitizer(n)`. We don't want to include such nodes as positive examples in the prompt.
not config.isSanitizer(sink) and
exists(Callable callee, Call call, int index |
sink.asExpr() = call.getArgument(index) and
callee = call.getCallee() and
package = callee.getDeclaringType().getPackage().getName() and
type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations?
subtypes = true and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
name = callee.getName() and // TODO: Will this work for constructors?
signature = callee.paramsString() and
ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
input = "Argument[" + index + "]" and // TODO: why are slashes added?
provenance = "manual" // TODO
) and
message =
config.getASinkEndpointType().getDescription() + "\n{'Package': '" + package + "', 'Type': '" +
type + "', 'Subtypes': " + subtypes + ", 'Name': '" + name + "', 'Signature': '" + signature +
"', 'Ext': '" + ext + "', 'Argument index': '" + input + "', 'Provenance': '" + provenance +
"'}" // TODO: Why are the curly braces added twice?
config.getASinkEndpointType().getDescription() + "\n" +
// Extract the needed metadata for this endpoint.
any(string concatenatedMetadata |
EndpointCharacteristics::hasMetaData(sink, concatenatedMetadata)
)
select sink, message

View File

@@ -13,27 +13,14 @@
private import java
import semmle.code.java.dataflow.TaintTracking
private import experimental.adaptivethreatmodeling.EndpointCharacteristics as EndpointCharacteristics
private import experimental.adaptivethreatmodeling.ATMConfig as AtmConfig
private import experimental.adaptivethreatmodeling.SqlInjectionATM as SqlInjectionAtm
private import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
private import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestForgeryAtm
from
DataFlow::Node sink, string message, string package, string type, boolean subtypes, string name,
string signature, string ext, string input, string provenance
from DataFlow::Node sink, string message
where
exists(Callable callee, Call call, int index |
sink.asExpr() = call.getArgument(index) and
callee = call.getCallee() and
package = callee.getDeclaringType().getPackage().getName() and
type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations?
subtypes = true and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
name = callee.getName() and // TODO: Will this work for constructors?
signature = callee.paramsString() and
ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
input = "Argument[" + index + "]" and // TODO: why are slashes added?
provenance = "manual" // TODO
) and
// The message is the concatenation of all relevant configs, and we surface only sinks that have at least one relevant
// config.
message =
@@ -42,7 +29,9 @@ where
sinkPathNode.getNode() = sink
|
config.getASinkEndpointType().getDescription(), ", "
) + "\n{'Package': '" + package + "', 'Type': '" + type + "', 'Subtypes': " + subtypes +
", 'Name': '" + name + "', 'Signature': '" + signature + "', 'Ext': '" + ext +
"', 'Argument index': '" + input + "', 'Provenance': '" + provenance + "'}" // TODO: Why are the curly braces added twice?
) + "\n" +
// Extract the needed metadata for this endpoint.
any(string concatenatedMetadata |
EndpointCharacteristics::hasMetaData(sink, concatenatedMetadata)
)
select sink, message