When extracting positive and negative examples for the Java prompt, also extract the data used in the Models-as-Data (MaD) extensible predicate.

This will enable the Codex prompt to optionally use this data in additional columns.
This commit is contained in:
tiferet
2023-01-20 11:18:02 -08:00
parent 7666843316
commit ec5425d952
3 changed files with 46 additions and 9 deletions

View File

@@ -27,7 +27,8 @@ DataFlow::Node getSampleFromSampleRate(float rate) {
from
DataFlow::Node endpoint, EndpointCharacteristics::EndpointCharacteristic characteristic,
float confidence
float confidence, string message, string package, string type, boolean subtypes, string name,
string signature, string ext, string input, string provenance
where
characteristic.appliesToEndpoint(endpoint) and
confidence >= characteristic.highConfidence() and
@@ -50,5 +51,22 @@ where
not positiveType instanceof NegativeType and
characteristic2.hasImplications(positiveType, true, confidence2)
) and
endpoint = getSampleFromSampleRate(0.01)
select endpoint, "Non-sink of type " + characteristic + " with confidence " + confidence.toString()
endpoint = getSampleFromSampleRate(0.01) and
exists(Callable callee, Call call, int index |
endpoint.asExpr() = call.getArgument(index) and
callee = call.getCallee() and
package = callee.getDeclaringType().getPackage().getName() and
type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations?
subtypes = true and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
name = callee.getName() and // TODO: Will this work for constructors?
signature = callee.paramsString() and
ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
input = "Argument[" + index + "]" and // TODO: why are slashes added?
provenance = "manual" // TODO
) and
message =
"Non-sink of type " + characteristic + " with confidence " + confidence.toString() +
"\n{'Package': '" + package + "', 'Type': '" + type + "', 'Subtypes': " + subtypes +
", 'Name': '" + name + "', 'Signature': '" + signature + "', 'Ext': '" + ext +
"', 'Argument index': '" + input + "', 'Provenance': '" + provenance + "'}" // TODO: Why are the curly braces added twice?
select endpoint, message

View File

@@ -21,7 +21,9 @@ private import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestF
* the ML-generated, noisy sinks will end up polluting the positive examples used in the prompt!
*/
from DataFlow::Node sink, AtmConfig::AtmConfig config
from
DataFlow::Node sink, AtmConfig::AtmConfig config, string message, string package, string type,
boolean subtypes, string name, string signature, string ext, string input, string provenance
where
config.isKnownSink(sink) and
// If there are _any_ erroneous endpoints, return nothing. This will prevent us from accidentally running this query
@@ -30,5 +32,22 @@ where
// It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be
// treated by the actual query as a sanitizer, since the final logic is something like
// `isSink(n) and not isSanitizer(n)`. We don't want to include such nodes as positive examples in the prompt.
not config.isSanitizer(sink)
select sink, config.getASinkEndpointType().getDescription()
not config.isSanitizer(sink) and
exists(Callable callee, Call call, int index |
sink.asExpr() = call.getArgument(index) and
callee = call.getCallee() and
package = callee.getDeclaringType().getPackage().getName() and
type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations?
subtypes = true and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
name = callee.getName() and // TODO: Will this work for constructors?
signature = callee.paramsString() and
ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
input = "Argument[" + index + "]" and // TODO: why are slashes added?
provenance = "manual" // TODO
) and
message =
config.getASinkEndpointType().getDescription() + "\n{'Package': '" + package + "', 'Type': '" +
type + "', 'Subtypes': " + subtypes + ", 'Name': '" + name + "', 'Signature': '" + signature +
"', 'Ext': '" + ext + "', 'Argument index': '" + input + "', 'Provenance': '" + provenance +
"'}" // TODO: Why are the curly braces added twice?
select sink, message

View File

@@ -42,7 +42,7 @@ where
sinkPathNode.getNode() = sink
|
config.getASinkEndpointType().getDescription(), ", "
) + "\n{'package': '" + package + "', 'type': '" + type + "', 'subtypes': " + subtypes +
", 'name': '" + name + "', 'signature': '" + signature + "', 'ext': '" + ext + "', 'input': '"
+ input + "', 'provenance': '" + provenance + "'}" // TODO: Why are the curly braces added twice?
) + "\n{'Package': '" + package + "', 'Type': '" + type + "', 'Subtypes': " + subtypes +
", 'Name': '" + name + "', 'Signature': '" + signature + "', 'Ext': '" + ext +
"', 'Argument index': '" + input + "', 'Provenance': '" + provenance + "'}" // TODO: Why are the curly braces added twice?
select sink, message