Merge pull request #12830 from github/kaeluka/parameter-candidate-extraction

Java: Automodel Framework Mode Extraction Queries
2026-04-27 01:35:13 +02:00 · 2023-05-11 18:00:55 +02:00
parent 82e780d175 c31ad01579
commit 510febf46d
7 changed files with 848 additions and 0 deletions
--- a/java/ql/src/Telemetry/AutomodelEndpointTypes.qll
+++ b/java/ql/src/Telemetry/AutomodelEndpointTypes.qll
@@ -0,0 +1,60 @@
+/**
+ * For internal use only.
+ *
+ * Defines the set of classes that endpoint scoring models can predict. Endpoint scoring models must
+ * only predict classes defined within this file. This file is the source of truth for the integer
+ * representation of each of these classes.
+ */
+
+/** A class that can be predicted by a classifier. */
+abstract class EndpointType extends string {
+  /**
+   * Holds when the string matches the name of the sink / source type.
+   */
+  bindingset[this]
+  EndpointType() { any() }
+
+  /**
+   * Gets the name of the sink/source kind for this endpoint type as used in models-as-data.
+   *
+   * See https://github.com/github/codeql/blob/44213f0144fdd54bb679ca48d68b28dcf820f7a8/java/ql/lib/semmle/code/java/dataflow/ExternalFlow.qll#LL353C11-L357C31
+   */
+  final string getKind() { result = this }
+}
+
+/** A class for sink types that can be predicted by a classifier. */
+abstract class SinkType extends EndpointType {
+  bindingset[this]
+  SinkType() { any() }
+}
+
+/** A class for source types that can be predicted by a classifier. */
+abstract class SourceType extends EndpointType {
+  bindingset[this]
+  SourceType() { any() }
+}
+
+/** The negative sink type, labelled "non-sink": marks an endpoint as not being a sink. */
+class NegativeSinkType extends SinkType {
+  NegativeSinkType() { this = "non-sink" }
+}
+
+/** A sink relevant to the SQL injection query. */
+class SqlSinkType extends SinkType {
+  SqlSinkType() { this = "sql" }
+}
+
+/** A sink relevant to the tainted path injection query. */
+class TaintedPathSinkType extends SinkType {
+  TaintedPathSinkType() { this = "tainted-path" }
+}
+
+/** A sink relevant to the SSRF query. */
+class RequestForgerySinkType extends SinkType {
+  RequestForgerySinkType() { this = "ssrf" }
+}
+
+/** A sink relevant to the command injection query. */
+class CommandInjectionSinkType extends SinkType {
+  CommandInjectionSinkType() { this = "command-injection" }
+}
--- a/java/ql/src/Telemetry/AutomodelFrameworkModeCharacteristics.qll
+++ b/java/ql/src/Telemetry/AutomodelFrameworkModeCharacteristics.qll
@@ -0,0 +1,331 @@
+/**
+ * For internal use only.
+ */
+
+private import java
+private import semmle.code.Location as Location
+private import semmle.code.java.dataflow.DataFlow
+private import semmle.code.java.dataflow.TaintTracking
+private import semmle.code.java.security.PathCreation
+private import semmle.code.java.dataflow.ExternalFlow as ExternalFlow
+private import semmle.code.java.dataflow.internal.FlowSummaryImpl as FlowSummaryImpl
+private import semmle.code.java.security.ExternalAPIs as ExternalAPIs
+private import semmle.code.java.Expr as Expr
+private import semmle.code.java.security.QueryInjection
+private import semmle.code.java.security.RequestForgery
+private import semmle.code.java.dataflow.internal.ModelExclusions as ModelExclusions
+import AutomodelSharedCharacteristics as SharedCharacteristics
+import AutomodelEndpointTypes as AutomodelEndpointTypes
+
+/**
+ * A metadata extractor: `hasMetadata` relates an endpoint to the values reported as its `package`, `type`, `subtypes`,
+ * `name`, `signature` and `input` metadata. Any Java extraction mode needs to implement exactly one instance.
+ */
+abstract class MetadataExtractor extends string {
+  bindingset[this]
+  MetadataExtractor() { any() }
+
+  abstract predicate hasMetadata(
+    DataFlow::ParameterNode e, string package, string type, boolean subtypes, string name,
+    string signature, int input
+  );
+}
+
+newtype JavaRelatedLocationType =
+  MethodDoc() or
+  ClassDoc()
+
+/**
+ * A candidates implementation for framework mode.
+ *
+ * Some important notes:
+ *  - This mode is using parameters as endpoints.
+ *  - Sink- and neutral-information is being used from MaD models.
+ *  - When available, we use method- and class-java-docs as related locations.
+ */
+module FrameworkCandidatesImpl implements SharedCharacteristics::CandidateSig {
+  // for documentation of the implementations here, see the QLDoc in the CandidateSig signature module.
+  class Endpoint = DataFlow::ParameterNode;
+
+  class EndpointType = AutomodelEndpointTypes::EndpointType;
+
+  class NegativeEndpointType = AutomodelEndpointTypes::NegativeSinkType;
+
+  class RelatedLocation = Location::Top;
+
+  class RelatedLocationType = JavaRelatedLocationType;
+
+  // Sanitizers are currently not modeled in MaD. TODO: check if this has large negative impact.
+  predicate isSanitizer(Endpoint e, EndpointType t) { none() }
+
+  RelatedLocation asLocation(Endpoint e) { result = e.asParameter() }
+
+  predicate isKnownKind(string kind, string humanReadableKind, EndpointType type) {
+    kind = "read-file" and
+    humanReadableKind = "read file" and
+    type instanceof AutomodelEndpointTypes::TaintedPathSinkType
+    or
+    kind = "create-file" and
+    humanReadableKind = "create file" and
+    type instanceof AutomodelEndpointTypes::TaintedPathSinkType
+    or
+    kind = "sql" and
+    humanReadableKind = "mad modeled sql" and
+    type instanceof AutomodelEndpointTypes::SqlSinkType
+    or
+    kind = "open-url" and
+    humanReadableKind = "open url" and
+    type instanceof AutomodelEndpointTypes::RequestForgerySinkType
+    or
+    kind = "jdbc-url" and
+    humanReadableKind = "jdbc url" and
+    type instanceof AutomodelEndpointTypes::RequestForgerySinkType
+    or
+    kind = "command-injection" and
+    humanReadableKind = "command injection" and
+    type instanceof AutomodelEndpointTypes::CommandInjectionSinkType
+  }
+
+  predicate isSink(Endpoint e, string kind) {
+    exists(string package, string type, string name, string signature, string ext, string input |
+      sinkSpec(e, package, type, name, signature, ext, input) and
+      ExternalFlow::sinkModel(package, type, _, name, [signature, ""], ext, input, kind, _)
+    )
+  }
+
+  predicate isNeutral(Endpoint e) {
+    exists(string package, string type, string name, string signature |
+      sinkSpec(e, package, type, name, signature, _, _) and
+      ExternalFlow::neutralModel(package, type, name, [signature, ""], _, _)
+    )
+  }
+
+  additional predicate sinkSpec(
+    Endpoint e, string package, string type, string name, string signature, string ext, string input
+  ) {
+    FrameworkCandidatesImpl::getCallable(e).hasQualifiedName(package, type, name) and
+    signature = ExternalFlow::paramsString(getCallable(e)) and
+    ext = "" and
+    exists(int paramIdx | e.isParameterOf(_, paramIdx) |
+      if paramIdx = -1 then input = "Argument[this]" else input = "Argument[" + paramIdx + "]"
+    )
+  }
+
+  /**
+   * Gets the related location of the given type for the given endpoint.
+   *
+   * Related locations can be JavaDoc comments of the class or the method.
+   */
+  RelatedLocation getRelatedLocation(Endpoint e, RelatedLocationType type) {
+    type = MethodDoc() and
+    result = FrameworkCandidatesImpl::getCallable(e).(Documentable).getJavadoc()
+    or
+    type = ClassDoc() and
+    result = FrameworkCandidatesImpl::getCallable(e).getDeclaringType().(Documentable).getJavadoc()
+  }
+
+  /**
+   * Gets the callable that contains the given endpoint.
+   *
+   * Each Java mode should implement this predicate.
+   */
+  additional Callable getCallable(Endpoint e) { result = e.getEnclosingCallable() }
+}
+
+module CharacteristicsImpl = SharedCharacteristics::SharedCharacteristics<FrameworkCandidatesImpl>;
+
+class EndpointCharacteristic = CharacteristicsImpl::EndpointCharacteristic;
+
+class Endpoint = FrameworkCandidatesImpl::Endpoint;
+
+/*
+ * Predicates that are used to surface prompt examples and candidates for classification with an ML model.
+ */
+
+/**
+ * A MetadataExtractor for framework mode; `name` is the name of the callable that declares the parameter endpoint.
+ */
+class FrameworkModeMetadataExtractor extends MetadataExtractor {
+  FrameworkModeMetadataExtractor() { this = "FrameworkModeMetadataExtractor" }
+
+  /**
+   * By convention, the subtypes property of the MaD declaration should only be
+   * true when there _can_ exist any subtypes with a different implementation.
+   *
+   * It would technically be ok to always use the value 'true', but this would
+   * break convention.
+   */
+  boolean considerSubtypes(Callable callable) {
+    if
+      callable.isStatic() or
+      callable.getDeclaringType().isStatic() or
+      callable.isFinal() or
+      callable.getDeclaringType().isFinal()
+    then result = false
+    else result = true
+  }
+
+  override predicate hasMetadata(
+    Endpoint e, string package, string type, boolean subtypes, string name, string signature,
+    int input
+  ) {
+    exists(Callable callable |
+      e.asParameter() = callable.getParameter(input) and
+      package = callable.getDeclaringType().getPackage().getName() and
+      type = callable.getDeclaringType().getErasure().(RefType).nestedName() and
+      subtypes = this.considerSubtypes(callable) and
+      name = callable.getName() and
+      signature = ExternalFlow::paramsString(callable)
+    )
+  }
+}
+
+/*
+ * EndpointCharacteristic classes that are specific to Automodel for Java.
+ */
+
+/**
+ * A negative characteristic that indicates that an is-style boolean method is unexploitable even if it is a sink.
+ *
+ * A sink is highly unlikely to be exploitable if its callable's name starts with `is` and the callable has a boolean return
+ * type (e.g. `isDirectory`). These kinds of calls normally do only checks, and appear before the proper call that does
+ * the dangerous/interesting thing, so we want the latter to be modeled as the sink.
+ *
+ * TODO: this might filter too much, it's possible that methods with more than one parameter contain interesting sinks
+ */
+private class UnexploitableIsCharacteristic extends CharacteristicsImpl::NotASinkCharacteristic {
+  UnexploitableIsCharacteristic() { this = "unexploitable (is-style boolean method)" }
+
+  override predicate appliesToEndpoint(Endpoint e) {
+    not FrameworkCandidatesImpl::isSink(e, _) and
+    FrameworkCandidatesImpl::getCallable(e).getName().matches("is%") and
+    FrameworkCandidatesImpl::getCallable(e).getReturnType() instanceof BooleanType
+  }
+}
+
+/**
+ * A negative characteristic that indicates that an existence-checking boolean method is unexploitable even if it is a
+ * sink.
+ *
+ * A sink is highly unlikely to be exploitable if its callable's name is `exists` or `notExists` and the callable has a
+ * boolean return type. These kinds of calls normally do only checks, and appear before the proper call that does the
+ * dangerous/interesting thing, so we want the latter to be modeled as the sink.
+ */
+private class UnexploitableExistsCharacteristic extends CharacteristicsImpl::NotASinkCharacteristic {
+  UnexploitableExistsCharacteristic() { this = "unexploitable (existence-checking boolean method)" }
+
+  override predicate appliesToEndpoint(Endpoint e) {
+    not FrameworkCandidatesImpl::isSink(e, _) and
+    exists(Callable callable |
+      callable = FrameworkCandidatesImpl::getCallable(e) and
+      callable.getName().toLowerCase() = ["exists", "notexists"] and
+      callable.getReturnType() instanceof BooleanType
+    )
+  }
+}
+
+/**
+ * A negative characteristic that indicates that an endpoint is an argument to an exception, which is not a sink.
+ */
+private class ExceptionCharacteristic extends CharacteristicsImpl::NotASinkCharacteristic {
+  ExceptionCharacteristic() { this = "exception" }
+
+  override predicate appliesToEndpoint(Endpoint e) {
+    FrameworkCandidatesImpl::getCallable(e).getDeclaringType().getASupertype*() instanceof
+      TypeThrowable
+  }
+}
+
+/**
+ * A characteristic that limits candidates to parameters of methods that are recognized as `ModelApi`, iow., APIs that
+ * are considered worth modeling.
+ */
+private class NotAModelApiParameter extends CharacteristicsImpl::UninterestingToModelCharacteristic {
+  NotAModelApiParameter() { this = "not a model API parameter" }
+
+  override predicate appliesToEndpoint(Endpoint e) {
+    not exists(ModelExclusions::ModelApi api | api.getAParameter() = e.asParameter())
+  }
+}
+
+/**
+ * A negative characteristic that filters out non-public methods. Non-public methods are not interesting to include in
+ * the standard Java modeling, because they cannot be called from outside the package.
+ */
+private class NonPublicMethodCharacteristic extends CharacteristicsImpl::UninterestingToModelCharacteristic
+{
+  NonPublicMethodCharacteristic() { this = "non-public method" }
+
+  override predicate appliesToEndpoint(Endpoint e) {
+    not FrameworkCandidatesImpl::getCallable(e).isPublic()
+  }
+}
+
+/**
+ * Holds if the given endpoint has a self-contradictory combination of characteristics. Detects errors in our endpoint
+ * characteristics. Lists the problematic characteristics and their implications for all such endpoints, together with
+ * an error message indicating why this combination is problematic.
+ *
+ * Copied from
+ *   javascript/ql/experimental/adaptivethreatmodeling/test/endpoint_large_scale/ContradictoryEndpointCharacteristics.ql
+ */
+predicate erroneousEndpoints(
+  Endpoint endpoint, EndpointCharacteristic characteristic,
+  AutomodelEndpointTypes::EndpointType endpointType, float confidence, string errorMessage,
+  boolean ignoreKnownModelingErrors
+) {
+  // An endpoint's characteristics should not include positive indicators with above-medium confidence for more than
+  // one sink/source type (including the negative type).
+  exists(
+    EndpointCharacteristic characteristic2, AutomodelEndpointTypes::EndpointType endpointClass2,
+    float confidence2
+  |
+    endpointType != endpointClass2 and
+    (
+      endpointType instanceof AutomodelEndpointTypes::SinkType and
+      endpointClass2 instanceof AutomodelEndpointTypes::SinkType
+      or
+      endpointType instanceof AutomodelEndpointTypes::SourceType and
+      endpointClass2 instanceof AutomodelEndpointTypes::SourceType
+    ) and
+    characteristic.appliesToEndpoint(endpoint) and
+    characteristic2.appliesToEndpoint(endpoint) and
+    characteristic.hasImplications(endpointType, true, confidence) and
+    characteristic2.hasImplications(endpointClass2, true, confidence2) and
+    confidence > SharedCharacteristics::mediumConfidence() and
+    confidence2 > SharedCharacteristics::mediumConfidence() and
+    (
+      ignoreKnownModelingErrors = true and
+      not knownOverlappingCharacteristics(characteristic, characteristic2)
+      or
+      ignoreKnownModelingErrors = false
+    )
+  ) and
+  errorMessage = "Endpoint has high-confidence positive indicators for multiple classes"
+  or
+  // An endpoint's characteristics should not include positive indicators with above-medium confidence for some class
+  // and also include negative indicators with above-medium confidence for this same class.
+  exists(EndpointCharacteristic characteristic2, float confidence2 |
+    characteristic.appliesToEndpoint(endpoint) and
+    characteristic2.appliesToEndpoint(endpoint) and
+    characteristic.hasImplications(endpointType, true, confidence) and
+    characteristic2.hasImplications(endpointType, false, confidence2) and
+    confidence > SharedCharacteristics::mediumConfidence() and
+    confidence2 > SharedCharacteristics::mediumConfidence()
+  ) and
+  ignoreKnownModelingErrors = false and
+  errorMessage = "Endpoint has high-confidence positive and negative indicators for the same class"
+}
+
+/**
+ * Holds if `characteristic1` and `characteristic2` are among the pairs of currently known positive characteristics that
+ * have some overlap in their results. This indicates a problem with the underlying Java modeling. Specifically,
+ * `PathCreation` is prone to FPs.
+ */
+private predicate knownOverlappingCharacteristics(
+  EndpointCharacteristic characteristic1, EndpointCharacteristic characteristic2
+) {
+  characteristic1 != characteristic2 and
+  characteristic1 = ["mad taint step", "create path", "read file", "known non-sink"] and
+  characteristic2 = ["mad taint step", "create path", "read file", "known non-sink"]
+}
--- a/java/ql/src/Telemetry/AutomodelFrameworkModeExtractCandidates.ql
+++ b/java/ql/src/Telemetry/AutomodelFrameworkModeExtractCandidates.ql
@@ -0,0 +1,50 @@
+/**
+ * Surfaces the endpoints that are not already known to be sinks, and are therefore used as candidates for
+ * classification with an ML model.
+ *
+ * Note: This query does not actually classify the endpoints using the model.
+ *
+ * @name Automodel candidates
+ * @description A query to extract automodel candidates.
+ * @kind problem
+ * @severity info
+ * @id java/ml/extract-automodel-candidates
+ * @tags internal automodel extract candidates
+ */
+
+private import AutomodelFrameworkModeCharacteristics
+private import AutomodelSharedUtil
+
+from
+  Endpoint endpoint, string message, MetadataExtractor meta, string package, string type,
+  boolean subtypes, string name, string signature, int input
+where
+  not exists(CharacteristicsImpl::UninterestingToModelCharacteristic u |
+    u.appliesToEndpoint(endpoint)
+  ) and
+  // If a node is already a known sink for any of our existing ATM queries and is already modeled as a MaD sink, we
+  // don't include it as a candidate. Otherwise, we might include it as a candidate for query A, but the model will
+  // label it as a sink for one of the sink types of query B, for which it's already a known sink. This would result in
+  // overlap between our detected sinks and the pre-existing modeling. We assume that, if a sink has already been
+  // modeled in a MaD model, then it doesn't belong to any additional sink types, and we don't need to reexamine it.
+  not CharacteristicsImpl::isSink(endpoint, _) and
+  meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input) and
+  // The message is the concatenation of all sink types for which this endpoint is known neither to be a sink nor to be
+  // a non-sink, and we surface only endpoints that have at least one such sink type.
+  message =
+    strictconcat(AutomodelEndpointTypes::SinkType sinkType |
+      not CharacteristicsImpl::isKnownSink(endpoint, sinkType) and
+      CharacteristicsImpl::isSinkCandidate(endpoint, sinkType)
+    |
+      sinkType, ", "
+    )
+select endpoint,
+  message + "\nrelated locations: $@, $@." + "\nmetadata: $@, $@, $@, $@, $@, $@.", //
+  CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, MethodDoc()), "MethodDoc", //
+  CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, ClassDoc()), "ClassDoc", //
+  package.(DollarAtString), "package", //
+  type.(DollarAtString), "type", //
+  subtypes.toString().(DollarAtString), "subtypes", //
+  name.(DollarAtString), "name", //
+  signature.(DollarAtString), "signature", //
+  input.toString().(DollarAtString), "input" //
--- a/java/ql/src/Telemetry/AutomodelFrameworkModeExtractNegativeExamples.ql
+++ b/java/ql/src/Telemetry/AutomodelFrameworkModeExtractNegativeExamples.ql
@@ -0,0 +1,47 @@
+/**
+ * Surfaces endpoints that are non-sinks with high confidence, for use as negative examples in the prompt.
+ *
+ * @name Negative examples (experimental)
+ * @kind problem
+ * @severity info
+ * @id java/ml/non-sink
+ * @tags internal automodel extract examples negative
+ */
+
+private import AutomodelFrameworkModeCharacteristics
+private import AutomodelEndpointTypes
+private import AutomodelSharedUtil
+
+from
+  Endpoint endpoint, EndpointCharacteristic characteristic, float confidence, string message,
+  MetadataExtractor meta, string package, string type, boolean subtypes, string name,
+  string signature, int input
+where
+  characteristic.appliesToEndpoint(endpoint) and
+  confidence >= SharedCharacteristics::highConfidence() and
+  characteristic.hasImplications(any(NegativeSinkType negative), true, confidence) and
+  // Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly
+  // certain about in the prompt.
+  not erroneousEndpoints(endpoint, _, _, _, _, false) and
+  meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input) and
+  // It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be
+  // treated by the actual query as a sanitizer, since the final logic is something like
+  // `isSink(n) and not isSanitizer(n)`. We don't want to include such nodes as negative examples in the prompt, because
+  // they're ambiguous and might confuse the model, so we explicitly exclude all known sinks from the negative examples.
+  not exists(EndpointCharacteristic characteristic2, float confidence2, SinkType positiveType |
+    not positiveType instanceof NegativeSinkType and
+    characteristic2.appliesToEndpoint(endpoint) and
+    confidence2 >= SharedCharacteristics::maximalConfidence() and
+    characteristic2.hasImplications(positiveType, true, confidence2)
+  ) and
+  message = characteristic
+select endpoint,
+  message + "\nrelated locations: $@, $@." + "\nmetadata: $@, $@, $@, $@, $@, $@.", //
+  CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, MethodDoc()), "MethodDoc", //
+  CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, ClassDoc()), "ClassDoc", //
+  package.(DollarAtString), "package", //
+  type.(DollarAtString), "type", //
+  subtypes.toString().(DollarAtString), "subtypes", //
+  name.(DollarAtString), "name", //
+  signature.(DollarAtString), "signature", //
+  input.toString().(DollarAtString), "input" //
--- a/java/ql/src/Telemetry/AutomodelFrameworkModeExtractPositiveExamples.ql
+++ b/java/ql/src/Telemetry/AutomodelFrameworkModeExtractPositiveExamples.ql
@@ -0,0 +1,34 @@
+/**
+ * Surfaces endpoints that are sinks with high confidence, for use as positive examples in the prompt.
+ *
+ * @name Positive examples (experimental)
+ * @kind problem
+ * @severity info
+ * @id java/ml/known-sink
+ * @tags internal automodel extract examples positive
+ */
+
+private import AutomodelFrameworkModeCharacteristics
+private import AutomodelEndpointTypes
+private import AutomodelSharedUtil
+
+from
+  Endpoint endpoint, SinkType sinkType, MetadataExtractor meta, string package, string type,
+  boolean subtypes, string name, string signature, int input
+where
+  // Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly
+  // certain about in the prompt.
+  not erroneousEndpoints(endpoint, _, _, _, _, false) and
+  meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input) and
+  // Extract positive examples of sinks belonging to the existing ATM query configurations.
+  CharacteristicsImpl::isKnownSink(endpoint, sinkType)
+select endpoint,
+  sinkType + "\nrelated locations: $@, $@." + "\nmetadata: $@, $@, $@, $@, $@, $@.", //
+  CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, MethodDoc()), "MethodDoc", //
+  CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, ClassDoc()), "ClassDoc", //
+  package.(DollarAtString), "package", //
+  type.(DollarAtString), "type", //
+  subtypes.toString().(DollarAtString), "subtypes", //
+  name.(DollarAtString), "name", //
+  signature.(DollarAtString), "signature", //
+  input.toString().(DollarAtString), "input" //
--- a/java/ql/src/Telemetry/AutomodelSharedCharacteristics.qll
+++ b/java/ql/src/Telemetry/AutomodelSharedCharacteristics.qll
@@ -0,0 +1,305 @@
+float maximalConfidence() { result = 1.0 }
+
+float highConfidence() { result = 0.9 }
+
+float mediumConfidence() { result = 0.6 }
+
+/**
+ * A specification of how to instantiate the shared characteristics for a given candidate class.
+ *
+ * The `CandidateSig` implementation specifies a type to use for Endpoints (eg., `ParameterNode`), as well as a type
+ * to label endpoint classes (the `EndpointType`). One of the endpoint classes needs to be a 'negative' class, meaning
+ *   "not any of the other known endpoint types".
+ */
+signature module CandidateSig {
+  /**
+   * An endpoint is a potential candidate for modeling. This will typically be bound to the language's
+   * DataFlow node class, or a subtype thereof.
+   */
+  class Endpoint;
+
+  /**
+   * A related location for an endpoint. This will typically be bound to the supertype of all AST nodes (eg., `Top`).
+   */
+  class RelatedLocation;
+
+  /**
+   * A label for a related location.
+   *
+   * Eg., method-doc, class-doc, etc.
+   */
+  class RelatedLocationType;
+
+  /**
+   * A class kind for an endpoint.
+   */
+  class EndpointType extends string;
+
+  /**
+   * An EndpointType that denotes the absence of any sink.
+   */
+  class NegativeEndpointType extends EndpointType;
+
+  /**
+   * Gets the endpoint as a location.
+   *
+   * This is a utility function to convert an endpoint to its corresponding location.
+   */
+  RelatedLocation asLocation(Endpoint e);
+
+  /**
+   * Defines what MaD kinds are known, and what endpoint type they correspond to.
+   */
+  predicate isKnownKind(string kind, string humanReadableLabel, EndpointType type);
+
+  /**
+   * Holds if `e` is a flow sanitizer, and has type `t`.
+   */
+  predicate isSanitizer(Endpoint e, EndpointType t);
+
+  /**
+   * Holds if `e` is a sink with the label `kind`.
+   */
+  predicate isSink(Endpoint e, string kind);
+
+  /**
+   * Holds if `e` is not a sink of any kind.
+   */
+  predicate isNeutral(Endpoint e);
+
+  /**
+   * Gets a related location.
+   *
+   * A related location is a source code location that may hold extra information about an endpoint that can be useful
+   * to the machine learning model.
+   *
+   * For example, a related location for a method call may be the documentation comment of a method.
+   */
+  RelatedLocation getRelatedLocation(Endpoint e, RelatedLocationType name);
+}
+
+/**
+ * A set of shared characteristics for a given candidate class.
+ *
+ * This module is language-agnostic, although the `CandidateSig` module will be language-specific.
+ *
+ * The language specific implementation can also further extend the behavior of this module by adding additional
+ *   implementations of endpoint characteristics exported by this module.
+ */
+module SharedCharacteristics<CandidateSig Candidate> {
+  predicate isSink = Candidate::isSink/2;
+
+  predicate isNeutral = Candidate::isNeutral/1;
+
+  /**
+   * Holds if `sink` is a known sink of type `endpointType`.
+   */
+  predicate isKnownSink(Candidate::Endpoint sink, Candidate::EndpointType endpointType) {
+    // If the list of characteristics includes positive indicators with maximal confidence for this class, then it's a
+    // known sink for the class.
+    not endpointType instanceof Candidate::NegativeEndpointType and
+    exists(EndpointCharacteristic characteristic |
+      characteristic.appliesToEndpoint(sink) and
+      characteristic.hasImplications(endpointType, true, maximalConfidence())
+    )
+  }
+
+  /**
+   * Holds if the candidate sink `candidateSink` should be considered as a possible sink of type `sinkType`, and
+   * classified by the ML model. A candidate sink is a node that cannot be excluded from `sinkType` based on its
+   * characteristics.
+   */
+  predicate isSinkCandidate(Candidate::Endpoint candidateSink, Candidate::EndpointType sinkType) {
+    not sinkType instanceof Candidate::NegativeEndpointType and
+    not exists(getAReasonSinkExcluded(candidateSink, sinkType))
+  }
+
+  /**
+   * Gets the related location of `e` with type `type`, if it exists.
+   * Otherwise, gets the candidate itself.
+   */
+  Candidate::RelatedLocation getRelatedLocationOrCandidate(
+    Candidate::Endpoint e, Candidate::RelatedLocationType type
+  ) {
+    if exists(Candidate::getRelatedLocation(e, type))
+    then result = Candidate::getRelatedLocation(e, type)
+    else result = Candidate::asLocation(e)
+  }
+
+  /**
+   * Gets a characteristic that causes `candidateSink` to be excluded as an effective sink for the sink type
+   * `sinkType`.
+   */
+  EndpointCharacteristic getAReasonSinkExcluded(
+    Candidate::Endpoint candidateSink, Candidate::EndpointType sinkType
+  ) {
+    // An endpoint is a sink candidate if none of its characteristics give much indication whether or not it is a sink.
+    not sinkType instanceof Candidate::NegativeEndpointType and
+    result.appliesToEndpoint(candidateSink) and
+    (
+      // Exclude endpoints that have a characteristic that implies they're not sinks for _any_ sink type.
+      exists(float confidence |
+        confidence >= mediumConfidence() and
+        result.hasImplications(any(Candidate::NegativeEndpointType t), true, confidence)
+      )
+      or
+      // Exclude endpoints that have a characteristic that implies they're not sinks for _this particular_ sink type.
+      exists(float confidence |
+        confidence >= mediumConfidence() and
+        result.hasImplications(sinkType, false, confidence)
+      )
+    )
+  }
+
+  /**
+   * A set of characteristics that a particular endpoint might have. This set of characteristics is used to make decisions
+   * about whether to include the endpoint in the training set and with what kind, as well as whether to score the
+   * endpoint at inference time.
+   */
+  abstract class EndpointCharacteristic extends string {
+    /**
+     * Holds for the string that is the name of the characteristic. This should describe some property of an endpoint
+     * that is meaningful for determining whether it's a sink, and if so, of which sink type.
+     */
+    bindingset[this]
+    EndpointCharacteristic() { any() }
+
+    /**
+     * Holds for endpoints that have this characteristic.
+     */
+    abstract predicate appliesToEndpoint(Candidate::Endpoint n);
+
+    /**
+     * This predicate describes what the characteristic tells us about an endpoint.
+     *
+     * Params:
+     * endpointType: The sink/source type.
+     * isPositiveIndicator: If true, this characteristic indicates that this endpoint _is_ a member of the class; if
+     * false, it indicates that it _isn't_ a member of the class.
+     * confidence: A float in [0, 1], which tells us how strong an indicator this characteristic is for the endpoint
+     * belonging / not belonging to the given class. A confidence near zero means this characteristic is a very weak
+     * indicator of whether or not the endpoint belongs to the class. A confidence of 1 means that all endpoints with
+     * this characteristic definitively do/don't belong to the class.
+     */
+    abstract predicate hasImplications(
+      Candidate::EndpointType endpointType, boolean isPositiveIndicator, float confidence
+    );
+
+    /** Indicators with confidence at or above this threshold are considered to be high-confidence indicators. */
+    final float getHighConfidenceThreshold() { result = 0.8 }
+  }
+
+  /**
+   * A high-confidence characteristic that indicates that an endpoint is a sink of a specified type. These endpoints can
+   * be used as positive samples for training or for a few-shot prompt.
+   */
+  abstract class SinkCharacteristic extends EndpointCharacteristic {
+    bindingset[this]
+    SinkCharacteristic() { any() }
+
+    abstract Candidate::EndpointType getSinkType();
+
+    final override predicate hasImplications(
+      Candidate::EndpointType endpointType, boolean isPositiveIndicator, float confidence
+    ) {
+      endpointType = this.getSinkType() and
+      isPositiveIndicator = true and
+      confidence = maximalConfidence()
+    }
+  }
+
+  /**
+   * A high-confidence characteristic that indicates that an endpoint is not a sink of any type. These endpoints can be
+   * used as negative samples for training or for a few-shot prompt.
+   */
+  abstract class NotASinkCharacteristic extends EndpointCharacteristic {
+    bindingset[this]
+    NotASinkCharacteristic() { any() }
+
+    override predicate hasImplications(
+      Candidate::EndpointType endpointType, boolean isPositiveIndicator, float confidence
+    ) {
+      endpointType instanceof Candidate::NegativeEndpointType and
+      isPositiveIndicator = true and
+      confidence = highConfidence()
+    }
+  }
+
+  /**
+   * A medium-confidence characteristic that indicates that an endpoint is unlikely to be a sink of any type. These
+   * endpoints can be excluded from scoring at inference time, both to save time and to avoid false positives. They should
+   * not, however, be used as negative samples for training or for a few-shot prompt, because they may include a small
+   * number of sinks.
+   */
+  abstract class LikelyNotASinkCharacteristic extends EndpointCharacteristic {
+    bindingset[this]
+    LikelyNotASinkCharacteristic() { any() }
+
+    override predicate hasImplications(
+      Candidate::EndpointType endpointType, boolean isPositiveIndicator, float confidence
+    ) {
+      endpointType instanceof Candidate::NegativeEndpointType and
+      isPositiveIndicator = true and
+      confidence = mediumConfidence()
+    }
+  }
+
+  /**
+   * A characteristic that indicates not necessarily that an endpoint is not a sink, but rather that it is not a sink
+   * that's interesting to model in the standard Java libraries. These filters should be removed when extracting sink
+   * candidates within a user's codebase for customized modeling.
+   *
+   * These endpoints should not be used as negative samples for training or for a few-shot prompt, because they are not
+   * necessarily non-sinks.
+   */
+  abstract class UninterestingToModelCharacteristic extends EndpointCharacteristic {
+    bindingset[this]
+    UninterestingToModelCharacteristic() { any() }
+
+    override predicate hasImplications(
+      Candidate::EndpointType endpointType, boolean isPositiveIndicator, float confidence
+    ) {
+      endpointType instanceof Candidate::NegativeEndpointType and
+      isPositiveIndicator = true and
+      confidence = mediumConfidence()
+    }
+  }
+
+  /**
+   * Contains default implementations that are derived solely from the `CandidateSig` implementation.
+   */
+  private module DefaultCharacteristicImplementations {
+    /**
+     * Endpoints identified as sinks by the `CandidateSig` implementation are sinks with maximal confidence.
+     */
+    private class KnownSinkCharacteristic extends SinkCharacteristic {
+      string madKind;
+      Candidate::EndpointType endpointType;
+
+      KnownSinkCharacteristic() { Candidate::isKnownKind(madKind, this, endpointType) }
+
+      override predicate appliesToEndpoint(Candidate::Endpoint e) { Candidate::isSink(e, madKind) }
+
+      override Candidate::EndpointType getSinkType() { result = endpointType }
+    }
+
+    /**
+     * A negative characteristic that indicates that an endpoint was manually modeled as a neutral model.
+     */
+    private class NeutralModelCharacteristic extends NotASinkCharacteristic {
+      NeutralModelCharacteristic() { this = "known non-sink" }
+
+      override predicate appliesToEndpoint(Candidate::Endpoint e) { Candidate::isNeutral(e) }
+    }
+
+    /**
+     * A negative characteristic that indicates that an endpoint is a flow sanitizer according to the `CandidateSig`
+     * implementation. NOTE(review): the label "external" looks like a copy-paste leftover - confirm before renaming.
+     */
+    private class IsSanitizerCharacteristic extends NotASinkCharacteristic {
+      IsSanitizerCharacteristic() { this = "external" }
+
+      override predicate appliesToEndpoint(Candidate::Endpoint e) { Candidate::isSanitizer(e, _) }
+    }
+  }
+}
--- a/java/ql/src/Telemetry/AutomodelSharedUtil.qll
+++ b/java/ql/src/Telemetry/AutomodelSharedUtil.qll
@@ -0,0 +1,21 @@
+/**
+ * A helper class to represent a string value that can be returned by a query using $@ notation.
+ *
+ * It extends `string`, but adds a mock `hasLocationInfo` method that returns the string itself as the file name.
+ *
+ * Use this when you want to return a string value from a query using $@ notation - the string value
+ * will be included in the sarif file.
+ *
+ *
+ * Background information on `hasLocationInfo`:
+ * https://codeql.github.com/docs/writing-codeql-queries/providing-locations-in-codeql-queries/#providing-location-information
+ */
+class DollarAtString extends string {
+  bindingset[this]
+  DollarAtString() { any() }
+
+  bindingset[this]
+  predicate hasLocationInfo(string path, int sl, int sc, int el, int ec) {
+    path = this and sl = 1 and sc = 1 and el = 1 and ec = 1
+  }
+}