From b48c6badba9814b1319987d2b54744a89d01dc0d Mon Sep 17 00:00:00 2001 From: tiferet Date: Tue, 28 Feb 2023 12:53:30 -0800 Subject: [PATCH] Add an endpoint filter to filter out MaD-modeled taint steps. This filter currently has some overlap with `CreatePathSinkCharacteristic`. We add a flag to `erroneousEndpoints` such that these known modeling errors can optionally be ignored. We turn the flag off when extracting prompt examples, to ensure the prompt contains only examples we're highly certain about. If there are errors even with this flag turned on, we return an error message in the query that extracts positive examples, to prevent us from accidentally running it when there's a codex-generated data extension file in `java/ql/lib/ext`. --- .../EndpointCharacteristics.qll | 55 ++++++++++++------- .../src/ExtractNegativeExamples.ql | 2 +- .../src/ExtractPositiveExamples.ql | 21 ++++--- 3 files changed, 51 insertions(+), 27 deletions(-) diff --git a/java/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll b/java/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll index 6e383be43ee..9011e64f526 100644 --- a/java/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll +++ b/java/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll @@ -40,7 +40,7 @@ predicate isKnownSink(DataFlow::Node sink, SinkType sinkType) { */ predicate erroneousEndpoints( DataFlow::Node endpoint, EndpointCharacteristic characteristic, EndpointType endpointClass, - float confidence, string errorMessage + float confidence, string errorMessage, boolean ignoreKnownModelingErrors ) { // An endpoint's characteristics should not include positive indicators with medium/high confidence for more than one // sink/source type. @@ -63,10 +63,13 @@ predicate erroneousEndpoints( not ( characteristic instanceof IsSanitizerCharacteristic or characteristic2 instanceof IsSanitizerCharacteristic + ) and + ( + ignoreKnownModelingErrors = true and + not knownOverlappingCharacteristics(characteristic, characteristic2) + or + ignoreKnownModelingErrors = false ) - // and - // // We currently know of some sink types that overlap with one another. - // not knownOverlappingCharacteristics(characteristic, characteristic2) ) and errorMessage = "Endpoint has high-confidence positive indicators for multiple classes" or @@ -80,6 +83,7 @@ predicate erroneousEndpoints( confidence > characteristic.mediumConfidence() and confidence2 > characteristic2.mediumConfidence() ) and + ignoreKnownModelingErrors = false and errorMessage = "Endpoint has high-confidence positive and negative indicators for the same class" } @@ -104,6 +108,18 @@ predicate erroneousConfidences( // characteristic1 = ["file creation sink", "other path injection sink"] and // characteristic2 = ["file creation sink", "other path injection sink"] // } +/** + * Holds if `characteristic1` and `characteristic2` are among the pairs of currently known positive characteristics that + * have some overlap in their results. This indicates a problem with the underlying Java modeling. + */ +private predicate knownOverlappingCharacteristics( + EndpointCharacteristic characteristic1, EndpointCharacteristic characteristic2 +) { + characteristic1 != characteristic2 and + characteristic1 = ["mad taint step", "create path"] and + characteristic2 = ["mad taint step", "create path"] +} + predicate isTypeAccess(DataFlow::Node n) { n.asExpr() instanceof TypeAccess } /** @@ -374,21 +390,22 @@ private class IsSanitizerCharacteristic extends NotASinkCharacteristic { } } -// /** -// * A negative characteristic that indicates that an endpoint is a MaD taint step. MaD modeled taint steps are global, -// * so they are not sinks for any query. Non-MaD taint steps might be specific to a particular query, so we don't -// * filter those out. -// */ -// private class IsMaDTaintStepCharacteristic extends NotASinkCharacteristic { -// IsMaDTaintStepCharacteristic() { this = "mad taint step" } -// override predicate appliesToEndpoint(DataFlow::Node n) { -// FlowSummaryImpl::Private::Steps::summaryThroughStepValue(n, _, _) -// or -// // FlowSummaryImpl::Private::Steps::summaryThroughStepTaint(n, _, _) or -// // FlowSummaryImpl::Private::Steps::summaryGetterStep(n, _, _, _) or -// FlowSummaryImpl::Private::Steps::summarySetterStep(n, _, _, _) -// } -// } +/** + * A negative characteristic that indicates that an endpoint is a MaD taint step. MaD modeled taint steps are global, + * so they are not sinks for any query. Non-MaD taint steps might be specific to a particular query, so we don't + * filter those out. + */ +private class IsMaDTaintStepCharacteristic extends NotASinkCharacteristic { + IsMaDTaintStepCharacteristic() { this = "mad taint step" } + + override predicate appliesToEndpoint(DataFlow::Node n) { + FlowSummaryImpl::Private::Steps::summaryThroughStepValue(n, _, _) or + FlowSummaryImpl::Private::Steps::summaryThroughStepTaint(n, _, _) or + FlowSummaryImpl::Private::Steps::summaryGetterStep(n, _, _, _) or + FlowSummaryImpl::Private::Steps::summarySetterStep(n, _, _, _) + } +} + /** * A negative characteristic that indicates that an endpoint is an argument to a safe external API method. * diff --git a/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql b/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql index 0600456b573..9741802db9e 100644 --- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql +++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractNegativeExamples.ql @@ -35,7 +35,7 @@ where characteristic.hasImplications(any(NegativeSinkType negative), true, confidence) and // Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly // certain about in the prompt. - not EndpointCharacteristics::erroneousEndpoints(endpoint, _, _, _, _) and + not EndpointCharacteristics::erroneousEndpoints(endpoint, _, _, _, _, false) and // Exclude type access nodes because they will never be on a flow path so they're not useful negative examples. not EndpointCharacteristics::isTypeAccess(endpoint) and // It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be diff --git a/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql b/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql index d69774cf60d..cc833e5d012 100644 --- a/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql +++ b/java/ql/experimental/adaptivethreatmodeling/src/ExtractPositiveExamples.ql @@ -22,9 +22,9 @@ private import experimental.adaptivethreatmodeling.ATMConfigs // To import the c from DataFlow::Node sink, SinkType sinkType, string message where - // If there are _any_ erroneous endpoints, return nothing. This will prevent us from accidentally running this query - // when there's a codex-generated data extension file in `java/ql/lib/ext`. - not EndpointCharacteristics::erroneousEndpoints(_, _, _, _, _) and + // Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly + // certain about in the prompt. + not EndpointCharacteristics::erroneousEndpoints(sink, _, _, _, _, false) and // Extract positive examples of sinks belonging to the existing ATM query configurations. ( EndpointCharacteristics::isKnownSink(sink, sinkType) and @@ -35,9 +35,16 @@ where // Include only sinks that are arguments to an external API call, because these are the sinks we are most interested // in. sink instanceof ExternalAPIs::ExternalApiDataNode and - message = - sinkType + "\n" + - // Extract the needed metadata for this endpoint. - any(string metadata | EndpointCharacteristics::hasMetadata(sink, metadata)) + // If there are _any_ erroneous endpoints, return an error message for all rows. This will prevent us from + // accidentally running this query when there's a codex-generated data extension file in `java/ql/lib/ext`. + if not EndpointCharacteristics::erroneousEndpoints(_, _, _, _, _, true) + then + message = + sinkType + "\n" + + // Extract the needed metadata for this endpoint. + any(string metadata | EndpointCharacteristics::hasMetadata(sink, metadata)) + else + message = + "Error: There are erroneous endpoints! Please check whether there's a codex-generated data extension file in `java/ql/lib/ext`." ) select sink, message