When extracting positive and negative examples for the Java prompt, also extract the fields used by the MaD (Models as Data) extensible predicate.

This enables the Codex prompt to optionally include this data as additional columns.
2026-05-21 14:47:10 +02:00 · 2023-01-20 11:18:02 -08:00
parent 7666843316
commit ec5425d952
3 changed files with 46 additions and 9 deletions
--- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql
+++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql
@@ -27,7 +27,8 @@ DataFlow::Node getSampleFromSampleRate(float rate) {

 from
  DataFlow::Node endpoint, EndpointCharacteristics::EndpointCharacteristic characteristic,
-  float confidence
+  float confidence, string message, string package, string type, boolean subtypes, string name,
+  string signature, string ext, string input, string provenance
 where
  characteristic.appliesToEndpoint(endpoint) and
  confidence >= characteristic.highConfidence() and
@@ -50,5 +51,22 @@ where
    not positiveType instanceof NegativeType and
    characteristic2.hasImplications(positiveType, true, confidence2)
  ) and
-  endpoint = getSampleFromSampleRate(0.01)
-select endpoint, "Non-sink of type " + characteristic + " with confidence " + confidence.toString()
+  endpoint = getSampleFromSampleRate(0.01) and
+  exists(Callable callee, Call call, int index |
+    endpoint.asExpr() = call.getArgument(index) and
+    callee = call.getCallee() and
+    package = callee.getDeclaringType().getPackage().getName() and
+    type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations?
+    subtypes = true and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
+    name = callee.getName() and // TODO: Will this work for constructors?
+    signature = callee.paramsString() and
+    ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
+    input = "Argument[" + index + "]" and // TODO: why are slashes added?
+    provenance = "manual" // TODO
+  ) and
+  message =
+    "Non-sink of type " + characteristic + " with confidence " + confidence.toString() +
+      "\n{'Package': '" + package + "', 'Type': '" + type + "', 'Subtypes': " + subtypes +
+      ", 'Name': '" + name + "', 'Signature': '" + signature + "', 'Ext': '" + ext +
+      "', 'Argument index': '" + input + "', 'Provenance': '" + provenance + "'}" // TODO: Why are the curly braces added twice?
+select endpoint, message
--- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql
+++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql
@@ -21,7 +21,9 @@ private import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestF
 * the ML-gnerarated, noisy sinks will end up poluting the positive examples used in the prompt!
 */

-from DataFlow::Node sink, AtmConfig::AtmConfig config
+from
+  DataFlow::Node sink, AtmConfig::AtmConfig config, string message, string package, string type,
+  boolean subtypes, string name, string signature, string ext, string input, string provenance
 where
  config.isKnownSink(sink) and
  // If there are _any_ erroneous endpoints, return nothing. This will prevent us from accidentally running this query
@@ -30,5 +32,22 @@ where
  // It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be
  // treated by the actual query as a sanitizer, since the final logic is something like
  // `isSink(n) and not isSanitizer(n)`. We don't want to include such nodes as positive examples in the prompt.
-  not config.isSanitizer(sink)
-select sink, config.getASinkEndpointType().getDescription()
+  not config.isSanitizer(sink) and
+  exists(Callable callee, Call call, int index |
+    sink.asExpr() = call.getArgument(index) and
+    callee = call.getCallee() and
+    package = callee.getDeclaringType().getPackage().getName() and
+    type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations?
+    subtypes = true and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
+    name = callee.getName() and // TODO: Will this work for constructors?
+    signature = callee.paramsString() and
+    ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
+    input = "Argument[" + index + "]" and // TODO: why are slashes added?
+    provenance = "manual" // TODO
+  ) and
+  message =
+    config.getASinkEndpointType().getDescription() + "\n{'Package': '" + package + "', 'Type': '" +
+      type + "', 'Subtypes': " + subtypes + ", 'Name': '" + name + "', 'Signature': '" + signature +
+      "', 'Ext': '" + ext + "', 'Argument index': '" + input + "', 'Provenance': '" + provenance +
+      "'}" // TODO: Why are the curly braces added twice?
+select sink, message
--- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractSinkCandidatesWithFlow.ql
+++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractSinkCandidatesWithFlow.ql
@@ -42,7 +42,7 @@ where
        sinkPathNode.getNode() = sink
      |
        config.getASinkEndpointType().getDescription(), ", "
-      ) + "\n{'package': '" + package + "', 'type': '" + type + "', 'subtypes': " + subtypes +
-      ", 'name': '" + name + "', 'signature': '" + signature + "', 'ext': '" + ext + "', 'input': '"
-      + input + "', 'provenance': '" + provenance + "'}" // TODO: Why are the curly braces added twice?
+      ) + "\n{'Package': '" + package + "', 'Type': '" + type + "', 'Subtypes': " + subtypes +
+      ", 'Name': '" + name + "', 'Signature': '" + signature + "', 'Ext': '" + ext +
+      "', 'Argument index': '" + input + "', 'Provenance': '" + provenance + "'}" // TODO: Why are the curly braces added twice?
 select sink, message