diff --git a/java/ql/src/Telemetry/AutomodelApplicationModeCharacteristics.qll b/java/ql/src/Telemetry/AutomodelApplicationModeCharacteristics.qll
new file mode 100644
index 00000000000..b0dca8018a9
--- /dev/null
+++ b/java/ql/src/Telemetry/AutomodelApplicationModeCharacteristics.qll
@@ -0,0 +1,360 @@
+/**
+ * For internal use only.
+ */
+
+private import java
+private import semmle.code.Location as Location
+private import semmle.code.java.dataflow.DataFlow
+private import semmle.code.java.dataflow.TaintTracking
+private import semmle.code.java.security.PathCreation
+private import semmle.code.java.dataflow.ExternalFlow as ExternalFlow
+private import semmle.code.java.dataflow.internal.FlowSummaryImpl as FlowSummaryImpl
+private import semmle.code.java.security.ExternalAPIs as ExternalAPIs
+private import semmle.code.java.Expr as Expr
+private import semmle.code.java.security.QueryInjection
+private import semmle.code.java.security.RequestForgery
+private import semmle.code.java.dataflow.internal.ModelExclusions as ModelExclusions
+import AutomodelSharedCharacteristics as SharedCharacteristics
+import AutomodelEndpointTypes as AutomodelEndpointTypes
+
+/**
+ * A metadata extractor. Any Java extraction mode needs to implement exactly
+ * one instance of this class.
+ */
+abstract class MetadataExtractor extends string {
+  bindingset[this]
+  MetadataExtractor() { any() }
+
+  abstract predicate hasMetadata(
+    Endpoint e, string package, string type, boolean subtypes, string name, string signature,
+    int input
+  );
+}
+
+newtype JavaRelatedLocationType = CallContext()
+
+/**
+ * A class representing nodes that are arguments or qualifiers of calls.
+ */
+private class ArgumentNode extends DataFlow::Node {
+  ArgumentNode() { this.asExpr() = [any(Call c).getAnArgument(), any(Call c).getQualifier()] }
+}
+
+/**
+ * A candidates implementation for application mode.
+ *
+ * Some important notes:
+ *  - This mode is using call arguments (including qualifiers) as endpoints.
+ *  - Sink- and neutral-information is being used from MaD models.
+ *  - The endpoint's own expression is used as related location (`CallContext`).
+ */
+module FrameworkCandidatesImpl implements SharedCharacteristics::CandidateSig {
+  // for documentation of the implementations here, see the QLDoc in the CandidateSig signature module.
+  class Endpoint = ArgumentNode;
+
+  class EndpointType = AutomodelEndpointTypes::EndpointType;
+
+  class NegativeEndpointType = AutomodelEndpointTypes::NegativeSinkType;
+
+  class RelatedLocation = Location::Top;
+
+  class RelatedLocationType = JavaRelatedLocationType;
+
+  // Sanitizers are currently not modeled in MaD. TODO: check if this has large negative impact.
+  predicate isSanitizer(Endpoint e, EndpointType t) { none() }
+
+  RelatedLocation asLocation(Endpoint e) { result = e.asExpr() }
+
+  predicate isKnownKind(string kind, string humanReadableKind, EndpointType type) {
+    kind = "read-file" and
+    humanReadableKind = "read file" and
+    type instanceof AutomodelEndpointTypes::TaintedPathSinkType
+    or
+    kind = "create-file" and
+    humanReadableKind = "create file" and
+    type instanceof AutomodelEndpointTypes::TaintedPathSinkType
+    or
+    kind = "sql" and
+    humanReadableKind = "mad modeled sql" and
+    type instanceof AutomodelEndpointTypes::SqlSinkType
+    or
+    kind = "open-url" and
+    humanReadableKind = "open url" and
+    type instanceof AutomodelEndpointTypes::RequestForgerySinkType
+    or
+    kind = "jdbc-url" and
+    humanReadableKind = "jdbc url" and
+    type instanceof AutomodelEndpointTypes::RequestForgerySinkType
+    or
+    kind = "command-injection" and
+    humanReadableKind = "command injection" and
+    type instanceof AutomodelEndpointTypes::CommandInjectionSinkType
+  }
+
+  predicate isSink(Endpoint e, string kind) {
+    exists(string package, string type, string name, string signature, string ext, string input |
+      sinkSpec(e, package, type, name, signature, ext, input) and
+      ExternalFlow::sinkModel(package, type, _, name, [signature, ""], ext, input, kind, _)
+    )
+  }
+
+  predicate isNeutral(Endpoint e) {
+    exists(string package, string type, string name, string signature |
+      sinkSpec(e, package, type, name, signature, _, _) and
+      ExternalFlow::neutralModel(package, type, name, [signature, ""], _, _)
+    )
+  }
+
+  additional predicate sinkSpec(
+    Endpoint e, string package, string type, string name, string signature, string ext, string input
+  ) {
+    FrameworkCandidatesImpl::getCallable(e).hasQualifiedName(package, type, name) and
+    signature = ExternalFlow::paramsString(getCallable(e)) and
+    ext = "" and
+    // `input` is rendered as `Argument[<index>]` for positional arguments and as `Argument[this]`
+    // for the qualifier, so that `isSink` can pass it unchanged as the `input` column of
+    // `ExternalFlow::sinkModel`.
+    (
+      exists(Call c, int argIdx |
+        e.asExpr() = c.getArgument(argIdx) and
+        input = "Argument[" + argIdx + "]"
+      )
+      or
+      exists(Call c | e.asExpr() = c.getQualifier() and input = "Argument[this]")
+    )
+  }
+
+  /**
+   * Returns the related location of the given type for the given endpoint.
+   *
+   * The only related location type is `CallContext`, which maps to the endpoint's own expression.
+   */
+  RelatedLocation getRelatedLocation(Endpoint e, RelatedLocationType type) {
+    type = CallContext() and
+    result = asLocation(e)
+  }
+
+  /**
+   * Returns the callee of the call that the given endpoint is an argument or qualifier of.
+   *
+   * Each Java mode should implement this predicate.
+   */
+  additional Callable getCallable(Endpoint e) {
+    exists(Call c |
+      e.asExpr() = [c.getAnArgument(), c.getQualifier()] and
+      result = c.getCallee()
+    )
+  }
+}
+
+module CharacteristicsImpl = SharedCharacteristics::SharedCharacteristics<FrameworkCandidatesImpl>;
+
+class EndpointCharacteristic = CharacteristicsImpl::EndpointCharacteristic;
+
+class Endpoint = FrameworkCandidatesImpl::Endpoint;
+
+/*
+ * Predicates that are used to surface prompt examples and candidates for classification with an ML model.
+ */
+
+/**
+ * A MetadataExtractor that extracts metadata for application mode.
+ */
+class FrameworkModeMetadataExtractor extends MetadataExtractor {
+  FrameworkModeMetadataExtractor() { this = "FrameworkModeMetadataExtractor" }
+
+  /**
+   * By convention, the subtypes property of the MaD declaration should only be
+   * true when there _can_ exist any subtypes with a different implementation.
+   *
+   * It would technically be ok to always use the value 'true', but this would
+   * break convention.
+   */
+  boolean considerSubtypes(Callable callable) {
+    if
+      callable.isStatic() or
+      callable.getDeclaringType().isStatic() or
+      callable.isFinal() or
+      callable.getDeclaringType().isFinal()
+    then result = false
+    else result = true
+  }
+
+  override predicate hasMetadata(
+    Endpoint e, string package, string type, boolean subtypes, string name, string signature,
+    int input
+  ) {
+    exists(Call call, Callable callable |
+      call.getCallee() = callable and
+      (
+        e.asExpr() = call.getArgument(input)
+        or
+        e.asExpr() = call.getQualifier() and input = -1
+      ) and
+      package = callable.getDeclaringType().getPackage().getName() and
+      type = callable.getDeclaringType().getErasure().(RefType).nestedName() and
+      subtypes = this.considerSubtypes(callable) and
+      name = callable.getName() and
+      signature = ExternalFlow::paramsString(callable)
+    )
+  }
+}
+
+/*
+ * EndpointCharacteristic classes that are specific to Automodel for Java.
+ */
+
+/**
+ * A negative characteristic that indicates that an is-style boolean method is unexploitable even if it is a sink.
+ *
+ * A sink is highly unlikely to be exploitable if its callable's name starts with `is` and the callable has a boolean return
+ * type (e.g. `isDirectory`). These kinds of calls normally do only checks, and appear before the proper call that does
+ * the dangerous/interesting thing, so we want the latter to be modeled as the sink.
+ *
+ * TODO: this might filter too much, it's possible that methods with more than one parameter contain interesting sinks
+ */
+private class UnexploitableIsCharacteristic extends CharacteristicsImpl::NotASinkCharacteristic {
+  UnexploitableIsCharacteristic() { this = "unexploitable (is-style boolean method)" }
+
+  override predicate appliesToEndpoint(Endpoint e) {
+    not FrameworkCandidatesImpl::isSink(e, _) and
+    FrameworkCandidatesImpl::getCallable(e).getName().matches("is%") and
+    FrameworkCandidatesImpl::getCallable(e).getReturnType() instanceof BooleanType
+  }
+}
+
+/**
+ * A negative characteristic that indicates that an existence-checking boolean method is unexploitable even if it is a
+ * sink.
+ *
+ * A sink is highly unlikely to be exploitable if its callable's name is `exists` or `notExists` and the callable has a
+ * boolean return type. These kinds of calls normally do only checks, and appear before the proper call that does the
+ * dangerous/interesting thing, so we want the latter to be modeled as the sink.
+ */
+private class UnexploitableExistsCharacteristic extends CharacteristicsImpl::NotASinkCharacteristic {
+  UnexploitableExistsCharacteristic() { this = "unexploitable (existence-checking boolean method)" }
+
+  override predicate appliesToEndpoint(Endpoint e) {
+    not FrameworkCandidatesImpl::isSink(e, _) and
+    exists(Callable callable |
+      callable = FrameworkCandidatesImpl::getCallable(e) and
+      callable.getName().toLowerCase() = ["exists", "notexists"] and
+      callable.getReturnType() instanceof BooleanType
+    )
+  }
+}
+
+/**
+ * A negative characteristic that indicates that an endpoint is an argument to an exception, which is not a sink.
+ */
+private class ExceptionCharacteristic extends CharacteristicsImpl::NotASinkCharacteristic {
+  ExceptionCharacteristic() { this = "exception" }
+
+  override predicate appliesToEndpoint(Endpoint e) {
+    FrameworkCandidatesImpl::getCallable(e).getDeclaringType().getASupertype*() instanceof
+      TypeThrowable
+  }
+}
+
+/**
+ * A characteristic that limits candidates to arguments and qualifiers of calls to callables that are recognized as
+ * `ModelApi`, iow., APIs that are considered worth modeling.
+ */
+private class NotAModelApiParameter extends CharacteristicsImpl::UninterestingToModelCharacteristic {
+  NotAModelApiParameter() { this = "not a model API parameter" }
+
+  override predicate appliesToEndpoint(Endpoint e) {
+    not exists(ModelExclusions::ModelApi api, Call c |
+      c.getCallee() = api and
+      (
+        // The qualifier has no formal parameter (there is no parameter -1), so it is matched directly.
+        e.asExpr() = c.getQualifier()
+        or
+        // Positional arguments are matched against the formal parameter at the same index.
+        exists(int argIdx | exists(api.getParameter(argIdx)) | e.asExpr() = c.getArgument(argIdx))
+      )
+    )
+  }
+}
+
+/**
+ * A negative characteristic that filters out non-public methods. Non-public methods are not interesting to include in
+ * the standard Java modeling, because they cannot be called from outside the package.
+ */
+private class NonPublicMethodCharacteristic extends CharacteristicsImpl::UninterestingToModelCharacteristic
+{
+  NonPublicMethodCharacteristic() { this = "non-public method" }
+
+  override predicate appliesToEndpoint(Endpoint e) {
+    not FrameworkCandidatesImpl::getCallable(e).isPublic()
+  }
+}
+
+/**
+ * Holds if the given endpoint has a self-contradictory combination of characteristics. Detects errors in our endpoint
+ * characteristics. Lists the problematic characteristics and their implications for all such endpoints, together with
+ * an error message indicating why this combination is problematic.
+ *
+ * Copied from
+ *   javascript/ql/experimental/adaptivethreatmodeling/test/endpoint_large_scale/ContradictoryEndpointCharacteristics.ql
+ */
+predicate erroneousEndpoints(
+  Endpoint endpoint, EndpointCharacteristic characteristic,
+  AutomodelEndpointTypes::EndpointType endpointType, float confidence, string errorMessage,
+  boolean ignoreKnownModelingErrors
+) {
+  // An endpoint's characteristics should not include positive indicators with medium/high confidence for more than one
+  // sink/source type (including the negative type).
+  exists(
+    EndpointCharacteristic characteristic2, AutomodelEndpointTypes::EndpointType endpointClass2,
+    float confidence2
+  |
+    endpointType != endpointClass2 and
+    (
+      endpointType instanceof AutomodelEndpointTypes::SinkType and
+      endpointClass2 instanceof AutomodelEndpointTypes::SinkType
+      or
+      endpointType instanceof AutomodelEndpointTypes::SourceType and
+      endpointClass2 instanceof AutomodelEndpointTypes::SourceType
+    ) and
+    characteristic.appliesToEndpoint(endpoint) and
+    characteristic2.appliesToEndpoint(endpoint) and
+    characteristic.hasImplications(endpointType, true, confidence) and
+    characteristic2.hasImplications(endpointClass2, true, confidence2) and
+    confidence > SharedCharacteristics::mediumConfidence() and
+    confidence2 > SharedCharacteristics::mediumConfidence() and
+    (
+      ignoreKnownModelingErrors = true and
+      not knownOverlappingCharacteristics(characteristic, characteristic2)
+      or
+      ignoreKnownModelingErrors = false
+    )
+  ) and
+  errorMessage = "Endpoint has high-confidence positive indicators for multiple classes"
+  or
+  // An endpoint's characteristics should not include positive indicators with medium/high confidence for some class and
+  // also include negative indicators with medium/high confidence for this same class.
+  exists(EndpointCharacteristic characteristic2, float confidence2 |
+    characteristic.appliesToEndpoint(endpoint) and
+    characteristic2.appliesToEndpoint(endpoint) and
+    characteristic.hasImplications(endpointType, true, confidence) and
+    characteristic2.hasImplications(endpointType, false, confidence2) and
+    confidence > SharedCharacteristics::mediumConfidence() and
+    confidence2 > SharedCharacteristics::mediumConfidence()
+  ) and
+  ignoreKnownModelingErrors = false and
+  errorMessage = "Endpoint has high-confidence positive and negative indicators for the same class"
+}
+
+/**
+ * Holds if `characteristic1` and `characteristic2` are among the pairs of currently known positive characteristics that
+ * have some overlap in their results. This indicates a problem with the underlying Java modeling. Specifically,
+ * `PathCreation` is prone to FPs.
+ */
+private predicate knownOverlappingCharacteristics(
+  EndpointCharacteristic characteristic1, EndpointCharacteristic characteristic2
+) {
+  characteristic1 != characteristic2 and
+  characteristic1 = ["mad taint step", "create path", "read file", "known non-sink"] and
+  characteristic2 = ["mad taint step", "create path", "read file", "known non-sink"]
+}
diff --git a/java/ql/src/Telemetry/AutomodelApplicationModeExtractCandidates.ql b/java/ql/src/Telemetry/AutomodelApplicationModeExtractCandidates.ql
new file mode 100644
index 00000000000..14ba7b60799
--- /dev/null
+++ b/java/ql/src/Telemetry/AutomodelApplicationModeExtractCandidates.ql
@@ -0,0 +1,48 @@
+/**
+ * Surfaces the endpoints that are not already known to be sinks, and are therefore used as candidates for
+ * classification with an ML model.
+ *
+ * Note: This query does not actually classify the endpoints using the model.
+ *
+ * @name Automodel candidates
+ * @description A query to extract automodel candidates.
+ * @kind problem
+ * @severity info
+ * @id java/ml/extract-automodel-application-candidates
+ * @tags internal automodel extract candidates
+ */
+
+private import AutomodelApplicationModeCharacteristics
+private import AutomodelSharedUtil
+
+from
+  Endpoint endpoint, string message, MetadataExtractor meta, string package, string type,
+  boolean subtypes, string name, string signature, int input
+where
+  not exists(CharacteristicsImpl::UninterestingToModelCharacteristic u |
+    u.appliesToEndpoint(endpoint)
+  ) and
+  // If a node is already a known sink for any of our existing ATM queries and is already modeled as a MaD sink, we
+  // don't include it as a candidate. Otherwise, we might include it as a candidate for query A, but the model will
+  // label it as a sink for one of the sink types of query B, for which it's already a known sink. This would result in
+  // overlap between our detected sinks and the pre-existing modeling. We assume that, if a sink has already been
+  // modeled in a MaD model, then it doesn't belong to any additional sink types, and we don't need to reexamine it.
+  not CharacteristicsImpl::isSink(endpoint, _) and
+  meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input) and
+  // The message is the concatenation of all sink types for which this endpoint is known neither to be a sink nor to be
+  // a non-sink, and we surface only endpoints that have at least one such sink type.
+  message =
+    strictconcat(AutomodelEndpointTypes::SinkType sinkType |
+      not CharacteristicsImpl::isKnownSink(endpoint, sinkType) and
+      CharacteristicsImpl::isSinkCandidate(endpoint, sinkType)
+    |
+      sinkType, ", "
+    )
+select endpoint, message + "\nrelated locations: $@." + "\nmetadata: $@, $@, $@, $@, $@, $@.", //
+  CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, CallContext()), "CallContext", //
+  package.(DollarAtString), "package", //
+  type.(DollarAtString), "type", //
+  subtypes.toString().(DollarAtString), "subtypes", //
+  name.(DollarAtString), "name", // method name
+  signature.(DollarAtString), "signature", //
+  input.toString().(DollarAtString), "input" //
diff --git a/java/ql/src/Telemetry/AutomodelFrameworkModeExtractCandidates.ql b/java/ql/src/Telemetry/AutomodelFrameworkModeExtractCandidates.ql
index a64327422a0..0fad9848efe 100644
--- a/java/ql/src/Telemetry/AutomodelFrameworkModeExtractCandidates.ql
+++ b/java/ql/src/Telemetry/AutomodelFrameworkModeExtractCandidates.ql
@@ -8,7 +8,7 @@
  * @description A query to extract automodel candidates.
  * @kind problem
  * @severity info
- * @id java/ml/extract-automodel-candidates
+ * @id java/ml/extract-automodel-framework-candidates
  * @tags internal automodel extract candidates
  */
 