Add an endpoint filter to filter out MaD-modeled taint steps.

This filter currently has some overlap with `CreatePathSinkCharacteristic`. We add a flag to `erroneousEndpoints` such that these known modeling errors can optionally be ignored. We turn the flag off when extracting prompt examples, to ensure the prompt contains only examples we're highly certain about. If there are errors even with this flag turned on, we return an error message in the query that extracts positive examples, to prevent us from accidentally running it when there's a codex-generated data extension file in `java/ql/lib/ext`.
2026-05-24 16:17:07 +02:00 · 2023-02-28 12:53:30 -08:00
parent 25f103a010
commit b48c6badba
3 changed files with 51 additions and 27 deletions
--- a/java/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll
+++ b/java/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll
@@ -40,7 +40,7 @@ predicate isKnownSink(DataFlow::Node sink, SinkType sinkType) {
 */
 predicate erroneousEndpoints(
  DataFlow::Node endpoint, EndpointCharacteristic characteristic, EndpointType endpointClass,
-  float confidence, string errorMessage
+  float confidence, string errorMessage, boolean ignoreKnownModelingErrors
 ) {
  // An endpoint's characteristics should not include positive indicators with medium/high confidence for more than one
  // sink/source type.
@@ -63,10 +63,13 @@ predicate erroneousEndpoints(
    not (
      characteristic instanceof IsSanitizerCharacteristic or
      characteristic2 instanceof IsSanitizerCharacteristic
+    ) and
+    (
+      ignoreKnownModelingErrors = true and
+      not knownOverlappingCharacteristics(characteristic, characteristic2)
+      or
+      ignoreKnownModelingErrors = false
    )
-    // and
-    // // We currently know of some sink types that overlap with one another.
-    // not knownOverlappingCharacteristics(characteristic, characteristic2)
  ) and
  errorMessage = "Endpoint has high-confidence positive indicators for multiple classes"
  or
@@ -80,6 +83,7 @@ predicate erroneousEndpoints(
    confidence > characteristic.mediumConfidence() and
    confidence2 > characteristic2.mediumConfidence()
  ) and
+  ignoreKnownModelingErrors = false and
  errorMessage = "Endpoint has high-confidence positive and negative indicators for the same class"
 }

@@ -104,6 +108,18 @@ predicate erroneousConfidences(
 //   characteristic1 = ["file creation sink", "other path injection sink"] and
 //   characteristic2 = ["file creation sink", "other path injection sink"]
 // }
+/**
+ * Holds if `characteristic1` and `characteristic2` are among the pairs of currently known positive characteristics that
+ * have some overlap in their results. This indicates a problem with the underlying Java modeling.
+ */
+private predicate knownOverlappingCharacteristics(
+  EndpointCharacteristic characteristic1, EndpointCharacteristic characteristic2
+) {
+  characteristic1 != characteristic2 and
+  characteristic1 = ["mad taint step", "create path"] and
+  characteristic2 = ["mad taint step", "create path"]
+}
+
 predicate isTypeAccess(DataFlow::Node n) { n.asExpr() instanceof TypeAccess }

 /**
@@ -374,21 +390,22 @@ private class IsSanitizerCharacteristic extends NotASinkCharacteristic {
  }
 }

-// /**
-//  * A negative characteristic that indicates that an endpoint is a MaD taint step. MaD modeled taint steps are global,
-//  * so they are not sinks for any query. Non-MaD taint steps might be specific to a particular query, so we don't
-//  * filter those out.
-//  */
-// private class IsMaDTaintStepCharacteristic extends NotASinkCharacteristic {
-//   IsMaDTaintStepCharacteristic() { this = "mad taint step" }
-//   override predicate appliesToEndpoint(DataFlow::Node n) {
-//     FlowSummaryImpl::Private::Steps::summaryThroughStepValue(n, _, _)
-//     or
-//     // FlowSummaryImpl::Private::Steps::summaryThroughStepTaint(n, _, _) or
-//     // FlowSummaryImpl::Private::Steps::summaryGetterStep(n, _, _, _) or
-//     FlowSummaryImpl::Private::Steps::summarySetterStep(n, _, _, _)
-//   }
-// }
+/**
+ * A negative characteristic that indicates that an endpoint is a MaD taint step. MaD modeled taint steps are global,
+ * so they are not sinks for any query. Non-MaD taint steps might be specific to a particular query, so we don't
+ * filter those out.
+ */
+private class IsMaDTaintStepCharacteristic extends NotASinkCharacteristic {
+  IsMaDTaintStepCharacteristic() { this = "mad taint step" }
+
+  override predicate appliesToEndpoint(DataFlow::Node n) {
+    FlowSummaryImpl::Private::Steps::summaryThroughStepValue(n, _, _) or
+    FlowSummaryImpl::Private::Steps::summaryThroughStepTaint(n, _, _) or
+    FlowSummaryImpl::Private::Steps::summaryGetterStep(n, _, _, _) or
+    FlowSummaryImpl::Private::Steps::summarySetterStep(n, _, _, _)
+  }
+}
+
 /**
 * A negative characteristic that indicates that an endpoint is an argument to a safe external API method.
 *
--- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql
+++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql
@@ -35,7 +35,7 @@ where
  characteristic.hasImplications(any(NegativeSinkType negative), true, confidence) and
  // Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly
  // certain about in the prompt.
-  not EndpointCharacteristics::erroneousEndpoints(endpoint, _, _, _, _) and
+  not EndpointCharacteristics::erroneousEndpoints(endpoint, _, _, _, _, false) and
  // Exclude type access nodes because they will never be on a flow path so they're not useful negative examples.
  not EndpointCharacteristics::isTypeAccess(endpoint) and
  // It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be
--- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql
+++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql
@@ -22,9 +22,9 @@ private import experimental.adaptivethreatmodeling.ATMConfigs // To import the c

 from DataFlow::Node sink, SinkType sinkType, string message
 where
-  // If there are _any_ erroneous endpoints, return nothing. This will prevent us from accidentally running this query
-  // when there's a codex-generated data extension file in `java/ql/lib/ext`.
-  not EndpointCharacteristics::erroneousEndpoints(_, _, _, _, _) and
+  // Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly
+  // certain about in the prompt.
+  not EndpointCharacteristics::erroneousEndpoints(sink, _, _, _, _, false) and
  // Extract positive examples of sinks belonging to the existing ATM query configurations.
  (
    EndpointCharacteristics::isKnownSink(sink, sinkType) and
@@ -35,9 +35,16 @@ where
    // Include only sinks that are arguments to an external API call, because these are the sinks we are most interested
    // in.
    sink instanceof ExternalAPIs::ExternalApiDataNode and
-    message =
-      sinkType + "\n" +
-        // Extract the needed metadata for this endpoint.
-        any(string metadata | EndpointCharacteristics::hasMetadata(sink, metadata))
+    // If there are _any_ erroneous endpoints, return an error message for all rows. This will prevent us from
+    // accidentally running this query when there's a codex-generated data extension file in `java/ql/lib/ext`.
+    if not EndpointCharacteristics::erroneousEndpoints(_, _, _, _, _, true)
+    then
+      message =
+        sinkType + "\n" +
+          // Extract the needed metadata for this endpoint.
+          any(string metadata | EndpointCharacteristics::hasMetadata(sink, metadata))
+    else
+      message =
+        "Error: There are erroneous endpoints! Please check whether there's a codex-generated data extension file in `java/ql/lib/ext`."
  )
 select sink, message