diff --git a/java/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll b/java/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll index 355b94027d8..98433c4d8b0 100644 --- a/java/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll +++ b/java/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll @@ -16,6 +16,51 @@ private import experimental.adaptivethreatmodeling.RequestForgeryATM private import semmle.code.java.security.ExternalAPIs as ExternalAPIs private import semmle.code.java.Expr as Expr +/** + * Holds if the given endpoint has a self-contradictory combination of characteristics. Detects errors in our endpoint + * characteristics. Lists the problematic characteristics and their implications for all such endpoints, together with + * an error message indicating why this combination is problematic. + * + * Copied from javascript/ql/experimental/adaptivethreatmodeling/test/endpoint_large_scale/ContradictoryEndpointCharacteristics.ql + */ +query predicate erroneousEndpoints( + DataFlow::Node endpoint, EndpointCharacteristic characteristic, EndpointType endpointClass, + float confidence, string errorMessage +) { + // An endpoint's characteristics should not include positive indicators with confidence above `mediumConfidence()` for more than one + // class. 
+ exists(EndpointCharacteristic characteristic2, EndpointType endpointClass2, float confidence2 | + endpointClass.getEncoding() != endpointClass2.getEncoding() and + characteristic.appliesToEndpoint(endpoint) and + characteristic2.appliesToEndpoint(endpoint) and + characteristic.hasImplications(endpointClass, true, confidence) and + characteristic2.hasImplications(endpointClass2, true, confidence2) and + confidence > characteristic.mediumConfidence() and + confidence2 > characteristic2.mediumConfidence() + ) and + errorMessage = "Endpoint has high-confidence positive indicators for multiple classes" + or + // An endpoint's characteristics should not include positive indicators with confidence above `mediumConfidence()` for some class and + // also include negative indicators with confidence above `mediumConfidence()` for this same class. + exists(EndpointCharacteristic characteristic2, float confidence2 | + characteristic.appliesToEndpoint(endpoint) and + characteristic2.appliesToEndpoint(endpoint) and + characteristic.hasImplications(endpointClass, true, confidence) and + characteristic2.hasImplications(endpointClass, false, confidence2) and + confidence > characteristic.mediumConfidence() and + confidence2 > characteristic2.mediumConfidence() + ) and + errorMessage = "Endpoint has high-confidence positive and negative indicators for the same class" +} + +query predicate erroneousConfidences( + EndpointCharacteristic characteristic, float confidence, string errorMessage +) { + characteristic.hasImplications(_, _, confidence) and + (confidence < 0 or confidence > 1) and + errorMessage = "Characteristic has an indicator with confidence outside of [0, 1]" +} + /** * A set of characteristics that a particular endpoint might have. 
This set of characteristics is used to make decisions * about whether to include the endpoint in the training set and with what label, as well as whether to score the diff --git a/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql b/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql index 021cad6014e..379eb113e3e 100644 --- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql +++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql @@ -26,11 +26,14 @@ DataFlow::Node getSampleFromSampleRate(float rate) { } from - DataFlow::Node sink, EndpointCharacteristics::EndpointCharacteristic characteristic, + DataFlow::Node endpoint, EndpointCharacteristics::EndpointCharacteristic characteristic, float confidence where - characteristic.appliesToEndpoint(sink) and + characteristic.appliesToEndpoint(endpoint) and confidence >= characteristic.highConfidence() and characteristic.hasImplications(any(NegativeType negative), true, confidence) and - sink = getSampleFromSampleRate(0.01) -select sink, "Non-sink of type " + characteristic + " with confidence " + confidence.toString() + // Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly + // certain about in the prompt. 
+ not EndpointCharacteristics::erroneousEndpoints(endpoint, _, _, _, _) and + endpoint = getSampleFromSampleRate(0.01) +select endpoint, "Non-sink of type " + characteristic + " with confidence " + confidence.toString() diff --git a/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql b/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql index 3d908ed6c8f..cc27dbda245 100644 --- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql +++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql @@ -21,5 +21,8 @@ from where characteristic.appliesToEndpoint(sink) and confidence >= characteristic.maximalConfidence() and - characteristic.hasImplications(config.getASinkEndpointType(), true, confidence) + characteristic.hasImplications(config.getASinkEndpointType(), true, confidence) and + // Exclude sinks that have contradictory endpoint characteristics, because we only want examples we're highly certain + // about in the prompt. + not EndpointCharacteristics::erroneousEndpoints(sink, _, _, _, _) select sink, characteristic.toString()