From ec5425d952cfdb41b39875a014d242cefc3ee3ab Mon Sep 17 00:00:00 2001 From: tiferet Date: Fri, 20 Jan 2023 11:18:02 -0800 Subject: [PATCH] When extracting positive and negative examples for the Java prompt, extract the data used in the MaD extensible predicate. This will enable the codex prompt to optionally use this data in additional columns. --- .../src/ExtractNegativeExamples.ql | 24 +++++++++++++++--- .../src/ExtractPositiveExamples.ql | 25 ++++++++++++++++--- .../src/ExtractSinkCandidatesWithFlow.ql | 6 ++--- 3 files changed, 46 insertions(+), 9 deletions(-) diff --git a/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql b/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql index 84111f67164..b83dfea3cc6 100644 --- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql +++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql @@ -27,7 +27,8 @@ DataFlow::Node getSampleFromSampleRate(float rate) { from DataFlow::Node endpoint, EndpointCharacteristics::EndpointCharacteristic characteristic, - float confidence + float confidence, string message, string package, string type, boolean subtypes, string name, + string signature, string ext, string input, string provenance where characteristic.appliesToEndpoint(endpoint) and confidence >= characteristic.highConfidence() and @@ -50,5 +51,22 @@ where not positiveType instanceof NegativeType and characteristic2.hasImplications(positiveType, true, confidence2) ) and - endpoint = getSampleFromSampleRate(0.01) -select endpoint, "Non-sink of type " + characteristic + " with confidence " + confidence.toString() + endpoint = getSampleFromSampleRate(0.01) and + exists(Callable callee, Call call, int index | + endpoint.asExpr() = call.getArgument(index) and + callee = call.getCallee() and + package = callee.getDeclaringType().getPackage().getName() and + type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations? + subtypes = true and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069 + name = callee.getName() and // TODO: Will this work for constructors? + signature = callee.paramsString() and + ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069 + input = "Argument[" + index + "]" and // TODO: why are slashes added? + provenance = "manual" // TODO + ) and + message = + "Non-sink of type " + characteristic + " with confidence " + confidence.toString() + + "\n{'Package': '" + package + "', 'Type': '" + type + "', 'Subtypes': " + subtypes + + ", 'Name': '" + name + "', 'Signature': '" + signature + "', 'Ext': '" + ext + + "', 'Argument index': '" + input + "', 'Provenance': '" + provenance + "'}" // TODO: Why are the curly braces added twice? +select endpoint, message diff --git a/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql b/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql index b790a417caf..959181c7c10 100644 --- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql +++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql @@ -21,7 +21,9 @@ private import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestF * the ML-gnerarated, noisy sinks will end up poluting the positive examples used in the prompt! */ -from DataFlow::Node sink, AtmConfig::AtmConfig config +from + DataFlow::Node sink, AtmConfig::AtmConfig config, string message, string package, string type, + boolean subtypes, string name, string signature, string ext, string input, string provenance where config.isKnownSink(sink) and // If there are _any_ erroneous endpoints, return nothing. This will prevent us from accidentally running this query @@ -30,5 +32,22 @@ where // It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be // treated by the actual query as a sanitizer, since the final logic is something like // `isSink(n) and not isSanitizer(n)`. We don't want to include such nodes as positive examples in the prompt. - not config.isSanitizer(sink) -select sink, config.getASinkEndpointType().getDescription() + not config.isSanitizer(sink) and + exists(Callable callee, Call call, int index | + sink.asExpr() = call.getArgument(index) and + callee = call.getCallee() and + package = callee.getDeclaringType().getPackage().getName() and + type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations? + subtypes = true and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069 + name = callee.getName() and // TODO: Will this work for constructors? + signature = callee.paramsString() and + ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069 + input = "Argument[" + index + "]" and // TODO: why are slashes added? + provenance = "manual" // TODO + ) and + message = + config.getASinkEndpointType().getDescription() + "\n{'Package': '" + package + "', 'Type': '" + + type + "', 'Subtypes': " + subtypes + ", 'Name': '" + name + "', 'Signature': '" + signature + + "', 'Ext': '" + ext + "', 'Argument index': '" + input + "', 'Provenance': '" + provenance + + "'}" // TODO: Why are the curly braces added twice? +select sink, message diff --git a/java/ql/experimental/adaptivethreatmodeling/src/ExtractSinkCandidatesWithFlow.ql b/java/ql/experimental/adaptivethreatmodeling/src/ExtractSinkCandidatesWithFlow.ql index ec2a2b3cda8..0a56397a91e 100644 --- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractSinkCandidatesWithFlow.ql +++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractSinkCandidatesWithFlow.ql @@ -42,7 +42,7 @@ where sinkPathNode.getNode() = sink | config.getASinkEndpointType().getDescription(), ", " - ) + "\n{'package': '" + package + "', 'type': '" + type + "', 'subtypes': " + subtypes + - ", 'name': '" + name + "', 'signature': '" + signature + "', 'ext': '" + ext + "', 'input': '" - + input + "', 'provenance': '" + provenance + "'}" // TODO: Why are the curly braces added twice? + ) + "\n{'Package': '" + package + "', 'Type': '" + type + "', 'Subtypes': " + subtypes + + ", 'Name': '" + name + "', 'Signature': '" + signature + "', 'Ext': '" + ext + + "', 'Argument index': '" + input + "', 'Provenance': '" + provenance + "'}" // TODO: Why are the curly braces added twice? select sink, message