diff --git a/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql b/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql index 379eb113e3e..e27a565d173 100644 --- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql +++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql @@ -35,5 +35,18 @@ where // Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly // certain about in the prompt. not EndpointCharacteristics::erroneousEndpoints(endpoint, _, _, _, _) and + // It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be + // treated by the actual query as a sanitizer, since the final logic is something like + // `isSink(n) and not isSanitizer(n)`. We don't want to include such nodes as negative examples in the prompt, because + // they're ambiguous and might confuse the model, so we explicitly exclude all known sinks from the negative examples. + not exists( + EndpointCharacteristics::EndpointCharacteristic characteristic2, float confidence2, + EndpointType positiveType + | + characteristic2.appliesToEndpoint(endpoint) and + confidence2 >= characteristic2.maximalConfidence() and + not positiveType instanceof NegativeType and + characteristic2.hasImplications(positiveType, true, confidence2) + ) and endpoint = getSampleFromSampleRate(0.01) select endpoint, "Non-sink of type " + characteristic + " with confidence " + confidence.toString() diff --git a/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql b/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql index 7319c68e11d..b790a417caf 100644 --- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql +++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql @@ -26,5 +26,9 @@ where config.isKnownSink(sink) and // If there are _any_ erroneous endpoints, return nothing. This will prevent us from accidentally running this query // when there's a codex-generated data extension file in `java/ql/lib/ext`. - not EndpointCharacteristics::erroneousEndpoints(_, _, _, _, _) + not EndpointCharacteristics::erroneousEndpoints(_, _, _, _, _) and + // It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be + // treated by the actual query as a sanitizer, since the final logic is something like + // `isSink(n) and not isSanitizer(n)`. We don't want to include such nodes as positive examples in the prompt. + not config.isSanitizer(sink) select sink, config.getASinkEndpointType().getDescription()