Refactor the CodeQL code that extracts metadata for methods presented to Codex, to make it easy to add another field

2026-05-25 00:27:09 +02:00 · 2023-02-01 12:14:03 -08:00
parent f32bb65c54
commit d3a5ee53c6
4 changed files with 32 additions and 9 deletions
--- a/java/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll
+++ b/java/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll
@@ -70,6 +70,35 @@ predicate erroneousConfidences(

 predicate isTypeAccess(DataFlow::Node n) { n.asExpr() instanceof TypeAccess }

+/**
+ * Holds if `n` is an argument of a call and `metadata` describes that call's callee and the argument position.
+ *
+ * This is a helper predicate to extract and export needed information about each endpoint in the sink candidate query
+ * as well as the queries that extract positive and negative examples for the prompt / training set. The metadata is
+ * extracted as a string in the format of a Python dictionary.
+ */
+predicate hasMetadata(DataFlow::Node n, string metadata) {
+  exists(
+    Callable callee, Call call, int index, string package, string type, boolean subtypes,
+    string name, string signature, string ext, string input, string provenance
+  |
+    n.asExpr() = call.getArgument(index) and
+    callee = call.getCallee() and
+    package = callee.getDeclaringType().getPackage().getName() and
+    type = callee.getDeclaringType().getName() and //TODO: Will this work for inner classes? Will it produce X$Y? What about lambdas? What about enums? What about interfaces? What about annotations?
+    subtypes = true and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
+    name = callee.getName() and // TODO: Will this work for constructors?
+    signature = callee.paramsString() and
+    ext = "" and // see https://github.slack.com/archives/CP9127VUK/p1673979477496069
+    input = "Argument[" + index + "]" and // TODO: why are slashes added?
+    provenance = "manual" and // TODO
+    metadata =
+      "{'Package': '" + package + "', 'Type': '" + type + "', 'Subtypes': " + subtypes +
+        ", 'Name': '" + name + "', 'Signature': '" + signature + "', 'Ext': '" + ext +
+        "', 'Argument index': '" + input + "', 'Provenance': '" + provenance + "'}" // TODO: Why are the curly braces added twice?
+  )
+}
+
 /**
 * A set of characteristics that a particular endpoint might have. This set of characteristics is used to make decisions
 * about whether to include the endpoint in the training set and with what label, as well as whether to score the
--- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql
+++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql
@@ -54,7 +54,5 @@ where
  message =
    "Non-sink of type " + characteristic + " with confidence " + confidence.toString() + "\n" +
      // Extract the needed metadata for this endpoint.
-      any(string concatenatedMetadata |
-        EndpointCharacteristics::hasMetaData(endpoint, concatenatedMetadata)
-      )
+      any(string metadata | EndpointCharacteristics::hasMetadata(endpoint, metadata))
 select endpoint, message
--- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql
+++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql
@@ -34,7 +34,5 @@ where
  message =
    config.getASinkEndpointType().getDescription() + "\n" +
      // Extract the needed metadata for this endpoint.
-      any(string concatenatedMetadata |
-        EndpointCharacteristics::hasMetaData(sink, concatenatedMetadata)
-      )
+      any(string metadata | EndpointCharacteristics::hasMetadata(sink, metadata))
 select sink, message
--- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractSinkCandidatesWithFlow.ql
+++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractSinkCandidatesWithFlow.ql
@@ -31,7 +31,5 @@ where
        config.getASinkEndpointType().getDescription(), ", "
      ) + "\n" +
      // Extract the needed metadata for this endpoint.
-      any(string concatenatedMetadata |
-        EndpointCharacteristics::hasMetaData(sink, concatenatedMetadata)
-      )
+      any(string metadata | EndpointCharacteristics::hasMetadata(sink, metadata))
 select sink, message