Refactor the CodeQL code that extracts metadata for methods presented to Codex, to make it easy to add another field

2026-05-24 16:17:07 +02:00 · 2023-01-31 13:02:50 -08:00
parent 633bfdba28
commit f32bb65c54
3 changed files with 19 additions and 55 deletions
--- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql
+++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql
@@ -27,8 +27,7 @@ DataFlow::Node getSampleFromSampleRate(float rate) {

 from
  DataFlow::Node endpoint, EndpointCharacteristics::EndpointCharacteristic characteristic,
-  float confidence, string message, string package, string type, boolean subtypes, string name,
-  string signature, string ext, string input, string provenance
+  float confidence, string message
 where
  characteristic.appliesToEndpoint(endpoint) and
  confidence >= characteristic.highConfidence() and
@@ -52,21 +51,10 @@ where
    characteristic2.hasImplications(positiveType, true, confidence2)
  ) and
  endpoint = getSampleFromSampleRate(0.01) and
-  exists(Callable callee, Call call, int index |
-    endpoint.asExpr() = call.getArgument(index) and
-    callee = call.getCallee() and
-    package = callee.getDeclaringType().getPackage().getName() and
-    type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations?
-    subtypes = true and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
-    name = callee.getName() and // TODO: Will this work for constructors?
-    signature = callee.paramsString() and
-    ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
-    input = "Argument[" + index + "]" and // TODO: why are slashes added?
-    provenance = "manual" // TODO
-  ) and
  message =
-    "Non-sink of type " + characteristic + " with confidence " + confidence.toString() +
-      "\n{'Package': '" + package + "', 'Type': '" + type + "', 'Subtypes': " + subtypes +
-      ", 'Name': '" + name + "', 'Signature': '" + signature + "', 'Ext': '" + ext +
-      "', 'Argument index': '" + input + "', 'Provenance': '" + provenance + "'}" // TODO: Why are the curly braces added twice?
+    "Non-sink of type " + characteristic + " with confidence " + confidence.toString() + "\n" +
+      // Extract the needed metadata for this endpoint.
+      any(string concatenatedMetadata |
+        EndpointCharacteristics::hasMetaData(endpoint, concatenatedMetadata)
+      )
 select endpoint, message
--- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql
+++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql
@@ -21,9 +21,7 @@ private import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestF
 * the ML-generated, noisy sinks will end up polluting the positive examples used in the prompt!
 */

-from
-  DataFlow::Node sink, AtmConfig::AtmConfig config, string message, string package, string type,
-  boolean subtypes, string name, string signature, string ext, string input, string provenance
+from DataFlow::Node sink, AtmConfig::AtmConfig config, string message
 where
  config.isKnownSink(sink) and
  // If there are _any_ erroneous endpoints, return nothing. This will prevent us from accidentally running this query
@@ -33,21 +31,10 @@ where
  // treated by the actual query as a sanitizer, since the final logic is something like
  // `isSink(n) and not isSanitizer(n)`. We don't want to include such nodes as positive examples in the prompt.
  not config.isSanitizer(sink) and
-  exists(Callable callee, Call call, int index |
-    sink.asExpr() = call.getArgument(index) and
-    callee = call.getCallee() and
-    package = callee.getDeclaringType().getPackage().getName() and
-    type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations?
-    subtypes = true and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
-    name = callee.getName() and // TODO: Will this work for constructors?
-    signature = callee.paramsString() and
-    ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
-    input = "Argument[" + index + "]" and // TODO: why are slashes added?
-    provenance = "manual" // TODO
-  ) and
  message =
-    config.getASinkEndpointType().getDescription() + "\n{'Package': '" + package + "', 'Type': '" +
-      type + "', 'Subtypes': " + subtypes + ", 'Name': '" + name + "', 'Signature': '" + signature +
-      "', 'Ext': '" + ext + "', 'Argument index': '" + input + "', 'Provenance': '" + provenance +
-      "'}" // TODO: Why are the curly braces added twice?
+    config.getASinkEndpointType().getDescription() + "\n" +
+      // Extract the needed metadata for this endpoint.
+      any(string concatenatedMetadata |
+        EndpointCharacteristics::hasMetaData(sink, concatenatedMetadata)
+      )
 select sink, message
--- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractSinkCandidatesWithFlow.ql
+++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractSinkCandidatesWithFlow.ql
@@ -13,27 +13,14 @@

 private import java
 import semmle.code.java.dataflow.TaintTracking
+private import experimental.adaptivethreatmodeling.EndpointCharacteristics as EndpointCharacteristics
 private import experimental.adaptivethreatmodeling.ATMConfig as AtmConfig
 private import experimental.adaptivethreatmodeling.SqlInjectionATM as SqlInjectionAtm
 private import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
 private import experimental.adaptivethreatmodeling.RequestForgeryATM as RequestForgeryAtm

-from
-  DataFlow::Node sink, string message, string package, string type, boolean subtypes, string name,
-  string signature, string ext, string input, string provenance
+from DataFlow::Node sink, string message
 where
-  exists(Callable callee, Call call, int index |
-    sink.asExpr() = call.getArgument(index) and
-    callee = call.getCallee() and
-    package = callee.getDeclaringType().getPackage().getName() and
-    type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations?
-    subtypes = true and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
-    name = callee.getName() and // TODO: Will this work for constructors?
-    signature = callee.paramsString() and
-    ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
-    input = "Argument[" + index + "]" and // TODO: why are slashes added?
-    provenance = "manual" // TODO
-  ) and
  // The message is the concatenation of all relevant configs, and we surface only sinks that have at least one relevant
  // config.
  message =
@@ -42,7 +29,9 @@ where
        sinkPathNode.getNode() = sink
      |
        config.getASinkEndpointType().getDescription(), ", "
-      ) + "\n{'Package': '" + package + "', 'Type': '" + type + "', 'Subtypes': " + subtypes +
-      ", 'Name': '" + name + "', 'Signature': '" + signature + "', 'Ext': '" + ext +
-      "', 'Argument index': '" + input + "', 'Provenance': '" + provenance + "'}" // TODO: Why are the curly braces added twice?
+      ) + "\n" +
+      // Extract the needed metadata for this endpoint.
+      any(string concatenatedMetadata |
+        EndpointCharacteristics::hasMetaData(sink, concatenatedMetadata)
+      )
 select sink, message