Merge branch 'main' into codeql-ci/atm/release-0.4.2

2026-05-05 05:35:13 +02:00 · 2022-11-24 14:41:49 +00:00
parent 78d49e44b1 adf905d838
commit 56e5f01ce0
1559 changed files with 129828 additions and 82528 deletions
--- a/javascript/ql/experimental/adaptivethreatmodeling/README.md
+++ b/javascript/ql/experimental/adaptivethreatmodeling/README.md
@@ -1,6 +1,8 @@
-# [Internal only] Adaptive Threat Modeling for JavaScript
+# Adaptive Threat Modeling for JavaScript

 This directory contains CodeQL libraries and queries that power adaptive threat modeling for JavaScript.
 All APIs are experimental and may change in the future.

-These queries can only be run by internal users; for external users they will return no results.
+Only internal users can run these queries directly. External users can run these queries when performing
+JavaScript analysis on Code Scanning. For more information, see 
+[Code scanning finds more vulnerabilities using machine learning](https://github.blog/2022-02-17-code-scanning-finds-vulnerabilities-using-machine-learning/).
--- a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/ATMConfig.qll
+++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/ATMConfig.qll
@@ -1,4 +1,4 @@
-/*
+/**
 * For internal use only.
 *
 * Configures boosting for adaptive threat modeling (ATM).
@@ -50,7 +50,8 @@ abstract class AtmConfig extends string {
    // known sink for the class.
    exists(EndpointCharacteristic characteristic |
      characteristic.getEndpoints(sink) and
-      characteristic.getImplications(this.getASinkEndpointType(), true, 1.0)
+      characteristic
+          .getImplications(this.getASinkEndpointType(), true, characteristic.maximalConfidence())
    )
  }

--- a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/AdaptiveThreatModeling.qll
+++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/AdaptiveThreatModeling.qll
@@ -1,4 +1,4 @@
-/*
+/**
 * For internal use only.
 *
 * Provides information about the results of boosted queries for use in adaptive threat modeling (ATM).
--- a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/BaseScoring.qll
+++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/BaseScoring.qll
@@ -1,4 +1,4 @@
-/*
+/**
 * For internal use only.
 *
 * Provides shared scoring functionality for use in adaptive threat modeling (ATM).
--- a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/CoreKnowledge.qll
+++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/CoreKnowledge.qll
@@ -1,4 +1,4 @@
-/*
+/**
 * For internal use only.
 *
 * Provides predicates that expose the knowledge of models
--- a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll
+++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll
@@ -30,18 +30,39 @@ abstract class EndpointCharacteristic extends string {
  /**
   * This predicate describes what the characteristic tells us about an endpoint.
   *
-   *  Params:
-   *  endpointClass: Class 0 is the negative class. Each positive int corresponds to a single sink type.
-   *  isPositiveIndicator: Does this characteristic indicate this endpoint _is_ a member of the class, or that it
-   *  _isn't_ a member of the class?
-   *  confidence: A number in [0, 1], which tells us how strong an indicator this characteristic is for the endpoint
-   *  belonging / not belonging to the given class.
+   * Params:
+   * endpointClass: The sink type. Each EndpointType has a predicate getEncoding, which specifies the classifier
+   * class for this sink type. Class 0 is the negative class (non-sink). Each positive int corresponds to a single
+   * sink type.
+   * isPositiveIndicator: If true, this characteristic indicates that this endpoint _is_ a member of the class; if
+   * false, it indicates that it _isn't_ a member of the class.
+   * confidence: A float in [0, 1], which tells us how strong an indicator this characteristic is for the endpoint
+   * belonging / not belonging to the given class. A confidence near zero means this characteristic is a very weak
+   * indicator of whether or not the endpoint belongs to the class. A confidence of 1 means that all endpoints with
+   * this characteristic definitively do/don't belong to the class.
   */
  abstract predicate getImplications(
    EndpointType endpointClass, boolean isPositiveIndicator, float confidence
  );
+
+  /**
+   * Gets the high-confidence threshold (0.8). Indicators with confidence at or above this threshold are considered
+   * to be high-confidence indicators.
+   */
+  final float getHighConfidenceThreshold() { result = 0.8 }
+
+  // The following are some confidence values that are used in practice by the subclasses. They are defined as named
+  // constants here to make it easier to change them in the future.
+  /**
+   * Gets the maximal confidence value (1.0). As described for `getImplications`, a confidence of 1 means that all
+   * endpoints with the characteristic definitively do/don't belong to the class.
+   */
+  final float maximalConfidence() { result = 1.0 }
+
+  /**
+   * Gets the confidence value (0.9) used by high-confidence indicators. This value is above the threshold returned
+   * by `getHighConfidenceThreshold()`.
+   */
+  final float highConfidence() { result = 0.9 }
+
+  /**
+   * Gets the confidence value (0.6) used by medium-confidence indicators. This value is below the threshold
+   * returned by `getHighConfidenceThreshold()`.
+   */
+  final float mediumConfidence() { result = 0.6 }
 }

+/*
+ * Characteristics that are indicative of a sink.
+ * NOTE: Initially each sink type has only one characteristic, which is that it's a sink of this type in the standard
+ * JavaScript libraries.
+ */
+
 /**
 * Endpoints identified as "DomBasedXssSink" by the standard JavaScript libraries are XSS sinks with maximal confidence.
 */
@@ -53,7 +74,9 @@ private class DomBasedXssSinkCharacteristic extends EndpointCharacteristic {
  override predicate getImplications(
    EndpointType endpointClass, boolean isPositiveIndicator, float confidence
  ) {
-    endpointClass instanceof XssSinkType and isPositiveIndicator = true and confidence = 1.0
+    endpointClass instanceof XssSinkType and
+    isPositiveIndicator = true and
+    confidence = maximalConfidence()
  }
 }

@@ -69,7 +92,9 @@ private class TaintedPathSinkCharacteristic extends EndpointCharacteristic {
  override predicate getImplications(
    EndpointType endpointClass, boolean isPositiveIndicator, float confidence
  ) {
-    endpointClass instanceof TaintedPathSinkType and isPositiveIndicator = true and confidence = 1.0
+    endpointClass instanceof TaintedPathSinkType and
+    isPositiveIndicator = true and
+    confidence = maximalConfidence()
  }
 }

@@ -87,7 +112,7 @@ private class SqlInjectionSinkCharacteristic extends EndpointCharacteristic {
  ) {
    endpointClass instanceof SqlInjectionSinkType and
    isPositiveIndicator = true and
-    confidence = 1.0
+    confidence = maximalConfidence()
  }
 }

@@ -105,6 +130,315 @@ private class NosqlInjectionSinkCharacteristic extends EndpointCharacteristic {
  ) {
    endpointClass instanceof NosqlInjectionSinkType and
    isPositiveIndicator = true and
-    confidence = 1.0
+    confidence = maximalConfidence()
+  }
+}
+
+/*
+ * Characteristics that are indicative of not being a sink of any type.
+ */
+
+/**
+ * A characteristic that is an indicator of not being a sink of any type, because it's an argument to a function of a
+ * builtin object.
+ *
+ * This class does not override `getImplications` itself. Its concrete subclasses (for example
+ * `ArgumentToArrayCharacteristic` and `ConstantReceiverCharacteristic`) additionally extend
+ * `LikelyNotASinkCharacteristic` or `NotASinkCharacteristic`, which supply the implication.
+ */
+abstract private class ArgumentToBuiltinFunctionCharacteristic extends EndpointCharacteristic {
+  // The characteristic predicate adds no restriction of its own (`any()`); each concrete subclass binds `this` to
+  // its own name.
+  bindingset[this]
+  ArgumentToBuiltinFunctionCharacteristic() { any() }
+}
+
+/**
+ * A high-confidence characteristic that indicates that an endpoint is not a sink of any type.
+ *
+ * Concrete subclasses bind `this` to a name and override `getEndpoints`; the implication is provided here.
+ */
+abstract private class NotASinkCharacteristic extends EndpointCharacteristic {
+  // The characteristic predicate adds no restriction of its own (`any()`); each concrete subclass binds `this` to
+  // its own name.
+  bindingset[this]
+  NotASinkCharacteristic() { any() }
+
+  /**
+   * Holds if `endpointClass` is a `NegativeType`, `isPositiveIndicator` is `true`, and `confidence` is
+   * `highConfidence()`. In other words: the endpoint is positively indicated to belong to the negative class.
+   */
+  override predicate getImplications(
+    EndpointType endpointClass, boolean isPositiveIndicator, float confidence
+  ) {
+    endpointClass instanceof NegativeType and
+    isPositiveIndicator = true and
+    confidence = highConfidence()
+  }
+}
+
+/**
+ * A medium-confidence characteristic that indicates that an endpoint is not a sink of any type.
+ *
+ * Concrete subclasses bind `this` to a name and override `getEndpoints`; the implication is provided here.
+ *
+ * TODO: This class is currently not private, because the current extraction logic explicitly avoids including these
+ * endpoints in the training data. We might want to change this in the future.
+ */
+abstract class LikelyNotASinkCharacteristic extends EndpointCharacteristic {
+  // The characteristic predicate adds no restriction of its own (`any()`); each concrete subclass binds `this` to
+  // its own name.
+  bindingset[this]
+  LikelyNotASinkCharacteristic() { any() }
+
+  /**
+   * Holds if `endpointClass` is a `NegativeType`, `isPositiveIndicator` is `true`, and `confidence` is
+   * `mediumConfidence()`. In other words: the endpoint is positively, but only with medium confidence, indicated to
+   * belong to the negative class.
+   */
+  override predicate getImplications(
+    EndpointType endpointClass, boolean isPositiveIndicator, float confidence
+  ) {
+    endpointClass instanceof NegativeType and
+    isPositiveIndicator = true and
+    confidence = mediumConfidence()
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument of a call to any `LodashUnderscore::Member`.
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ *
+ * NOTE(review): unlike its siblings, this class name has no `Characteristic` suffix and coincides with the
+ * `LodashUnderscore` module name used in `getEndpoints` - consider renaming for consistency.
+ */
+private class LodashUnderscore extends NotASinkCharacteristic {
+  LodashUnderscore() { this = "LodashUnderscoreArgument" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    any(LodashUnderscore::Member m).getACall().getAnArgument() = n
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument of any `JQuery::MethodCall`.
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class JQueryArgumentCharacteristic extends NotASinkCharacteristic {
+  JQueryArgumentCharacteristic() { this = "JQueryArgument" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    any(JQuery::MethodCall m).getAnArgument() = n
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument, the URL, the host, or a data node of some `ClientRequest`.
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class ClientRequestCharacteristic extends NotASinkCharacteristic {
+  ClientRequestCharacteristic() { this = "ClientRequest" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    exists(ClientRequest r |
+      r.getAnArgument() = n or n = r.getUrl() or n = r.getHost() or n = r.getADataNode()
+    )
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument of a call to the resolve or reject parameter of some
+ * `PromiseDefinition`. Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class PromiseDefinitionCharacteristic extends NotASinkCharacteristic {
+  PromiseDefinitionCharacteristic() { this = "PromiseDefinition" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    exists(PromiseDefinition p |
+      n = [p.getResolveParameter(), p.getRejectParameter()].getACall().getAnArgument()
+    )
+  }
+}
+
+/**
+ * A characteristic for endpoints that are instances of `CryptographicKey`.
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class CryptographicKeyCharacteristic extends NotASinkCharacteristic {
+  CryptographicKeyCharacteristic() { this = "CryptographicKey" }
+
+  override predicate getEndpoints(DataFlow::Node n) { n instanceof CryptographicKey }
+}
+
+/**
+ * A characteristic for endpoints that are the input of some `CryptographicOperation`.
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class CryptographicOperationFlowCharacteristic extends NotASinkCharacteristic {
+  CryptographicOperationFlowCharacteristic() { this = "CryptographicOperationFlow" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    any(CryptographicOperation op).getInput() = n
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument of a call whose callee name is one of the names returned by
+ * `getAStandardLoggerMethodName()`. The match is on the callee name only.
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class LoggerMethodCharacteristic extends NotASinkCharacteristic {
+  LoggerMethodCharacteristic() { this = "LoggerMethod" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    exists(DataFlow::CallNode call | n = call.getAnArgument() |
+      call.getCalleeName() = getAStandardLoggerMethodName()
+    )
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument of a call whose callee name is `setTimeout` or
+ * `clearTimeout`. The match is on the callee name only.
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class TimeoutCharacteristic extends NotASinkCharacteristic {
+  TimeoutCharacteristic() { this = "Timeout" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    exists(DataFlow::CallNode call | n = call.getAnArgument() |
+      call.getCalleeName() = ["setTimeout", "clearTimeout"]
+    )
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument of a call whose receiver is a reference to the global
+ * variable `localStorage` or `sessionStorage`.
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class ReceiverStorageCharacteristic extends NotASinkCharacteristic {
+  ReceiverStorageCharacteristic() { this = "ReceiverStorage" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    exists(DataFlow::CallNode call | n = call.getAnArgument() |
+      call.getReceiver() = DataFlow::globalVarRef(["localStorage", "sessionStorage"])
+    )
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument of a call that is a `StringOps::StartsWith`.
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class StringStartsWithCharacteristic extends NotASinkCharacteristic {
+  StringStartsWithCharacteristic() { this = "StringStartsWith" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    exists(DataFlow::CallNode call | n = call.getAnArgument() |
+      call instanceof StringOps::StartsWith
+    )
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument of a call that is a `StringOps::EndsWith`.
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class StringEndsWithCharacteristic extends NotASinkCharacteristic {
+  StringEndsWithCharacteristic() { this = "StringEndsWith" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    exists(DataFlow::CallNode call | n = call.getAnArgument() | call instanceof StringOps::EndsWith)
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument of a call that is a `StringOps::RegExpTest`.
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class StringRegExpTestCharacteristic extends NotASinkCharacteristic {
+  StringRegExpTestCharacteristic() { this = "StringRegExpTest" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    exists(DataFlow::CallNode call | n = call.getAnArgument() |
+      call instanceof StringOps::RegExpTest
+    )
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument of a call that is an `EventRegistration`.
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class EventRegistrationCharacteristic extends NotASinkCharacteristic {
+  EventRegistrationCharacteristic() { this = "EventRegistration" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    exists(DataFlow::CallNode call | n = call.getAnArgument() | call instanceof EventRegistration)
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument of a call that is an `EventDispatch`.
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class EventDispatchCharacteristic extends NotASinkCharacteristic {
+  EventDispatchCharacteristic() { this = "EventDispatch" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    exists(DataFlow::CallNode call | n = call.getAnArgument() | call instanceof EventDispatch)
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument of a call that is the test (`getTest()`) of some
+ * `MembershipCandidate`. Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class MembershipCandidateTestCharacteristic extends NotASinkCharacteristic {
+  MembershipCandidateTestCharacteristic() { this = "MembershipCandidateTest" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    exists(DataFlow::CallNode call | n = call.getAnArgument() |
+      call = any(MembershipCandidate c).getTest()
+    )
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument of a call that is a `FileSystemAccess`.
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class FileSystemAccessCharacteristic extends NotASinkCharacteristic {
+  FileSystemAccessCharacteristic() { this = "FileSystemAccess" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    exists(DataFlow::CallNode call | n = call.getAnArgument() | call instanceof FileSystemAccess)
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument of a call where either the call itself, or a method call made
+ * on it (`call.getAMethodCall()`), is a `DatabaseAccess`.
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class DatabaseAccessCharacteristic extends NotASinkCharacteristic {
+  DatabaseAccessCharacteristic() { this = "DatabaseAccess" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    // TODO database accesses are less well defined than database query sinks, so this may cover unmodeled sinks on
+    // existing database models
+    exists(DataFlow::CallNode call | n = call.getAnArgument() |
+      [
+        call, call.getAMethodCall()
+      /* command pattern where the query is built, and then exec'ed later */ ] instanceof
+        DatabaseAccess
+    )
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument of a call node that is itself a `DOM::domValueRef()`.
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class DomCharacteristic extends NotASinkCharacteristic {
+  DomCharacteristic() { this = "DOM" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    exists(DataFlow::CallNode call | n = call.getAnArgument() | call = DOM::domValueRef())
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument of a call whose callee name is `next` and which is a call to
+ * the last parameter of some function.
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class NextFunctionCallCharacteristic extends NotASinkCharacteristic {
+  NextFunctionCallCharacteristic() { this = "NextFunctionCall" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    exists(DataFlow::CallNode call | n = call.getAnArgument() |
+      call.getCalleeName() = "next" and
+      // Both conditions must hold: the name alone is not enough, the callee must be a function's last parameter.
+      exists(DataFlow::FunctionNode f | call = f.getLastParameter().getACall())
+    )
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument of a call to the `require` property read from the global
+ * variable `dojo`, i.e. `dojo.require(...)`.
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class DojoRequireCharacteristic extends NotASinkCharacteristic {
+  DojoRequireCharacteristic() { this = "DojoRequire" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    exists(DataFlow::CallNode call | n = call.getAnArgument() |
+      call = DataFlow::globalVarRef("dojo").getAPropertyRead("require").getACall()
+    )
+  }
+}
+
+/**
+ * A characteristic for endpoints that are the input of some `Base64::Decode` or `Base64::Encode`.
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class Base64ManipulationCharacteristic extends NotASinkCharacteristic {
+  Base64ManipulationCharacteristic() { this = "Base64Manipulation" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    exists(Base64::Decode d | n = d.getInput()) or
+    exists(Base64::Encode d | n = d.getInput())
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument of an invocation of `receiver`, or of a property read on
+ * `receiver`, where `receiver` is a `DataFlow::ArrayCreationNode` or an invocation of one.
+ * Such endpoints are medium-confidence non-sinks (see `LikelyNotASinkCharacteristic`).
+ */
+private class ArgumentToArrayCharacteristic extends ArgumentToBuiltinFunctionCharacteristic,
+  LikelyNotASinkCharacteristic {
+  ArgumentToArrayCharacteristic() { this = "ArgumentToArray" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    exists(DataFlow::SourceNode builtin, DataFlow::SourceNode receiver, DataFlow::InvokeNode invk |
+      builtin instanceof DataFlow::ArrayCreationNode
+    |
+      // `receiver` is the array creation node itself or the result of invoking it.
+      receiver = [builtin.getAnInvocation(), builtin] and
+      // `invk` invokes `receiver` directly or invokes one of its properties.
+      invk = [receiver, receiver.getAPropertyRead()].getAnInvocation() and
+      invk.getAnArgument() = n
+    )
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument of an invocation of `receiver`, or of a property read on
+ * `receiver`, where `receiver` is a reference to one of the listed global variables (`Map`, `Set`, `WeakMap`,
+ * `WeakSet`, `Number`, `Object`, `String`, `Array`, `Error`, `Math`, `Boolean`) or an invocation of one.
+ * Such endpoints are medium-confidence non-sinks (see `LikelyNotASinkCharacteristic`).
+ */
+private class ArgumentToBuiltinGlobalVarRefCharacteristic extends ArgumentToBuiltinFunctionCharacteristic,
+  LikelyNotASinkCharacteristic {
+  ArgumentToBuiltinGlobalVarRefCharacteristic() { this = "ArgumentToBuiltinGlobalVarRef" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    exists(DataFlow::SourceNode builtin, DataFlow::SourceNode receiver, DataFlow::InvokeNode invk |
+      builtin =
+        DataFlow::globalVarRef([
+            "Map", "Set", "WeakMap", "WeakSet", "Number", "Object", "String", "Array", "Error",
+            "Math", "Boolean"
+          ])
+    |
+      // `receiver` is the global reference itself or the result of invoking it.
+      receiver = [builtin.getAnInvocation(), builtin] and
+      // `invk` invokes `receiver` directly or invokes one of its properties.
+      invk = [receiver, receiver.getAPropertyRead()].getAnInvocation() and
+      invk.getAnArgument() = n
+    )
+  }
+}
+
+/**
+ * A characteristic for endpoints whose expression is an argument of a `MethodCallExpr` `c` for which
+ * `c.calls(primitive, _)` holds, where `primitive` is a `ConstantString`, a `NumberLiteral`, or a `BooleanLiteral`
+ * (presumably the receiver of the method call, as the class name suggests).
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class ConstantReceiverCharacteristic extends ArgumentToBuiltinFunctionCharacteristic,
+  NotASinkCharacteristic {
+  ConstantReceiverCharacteristic() { this = "ConstantReceiver" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    exists(Expr primitive, MethodCallExpr c |
+      primitive instanceof ConstantString or
+      primitive instanceof NumberLiteral or
+      primitive instanceof BooleanLiteral
+    |
+      c.calls(primitive, _) and
+      // This predicate works on AST expressions, so the data-flow node is mapped back with `asExpr()`.
+      c.getAnArgument() = n.asExpr()
+    )
+  }
+}
+
+/**
+ * A characteristic for endpoints that are an argument of a call whose callee name is one of a fixed list of names
+ * (`indexOf`, `hasOwnProperty`, `substring`, ...). The match is on the callee name only, regardless of receiver.
+ * Such endpoints are high-confidence non-sinks (see `NotASinkCharacteristic`).
+ */
+private class BuiltinCallNameCharacteristic extends ArgumentToBuiltinFunctionCharacteristic,
+  NotASinkCharacteristic {
+  BuiltinCallNameCharacteristic() { this = "BuiltinCallName" }
+
+  override predicate getEndpoints(DataFlow::Node n) {
+    exists(DataFlow::CallNode call |
+      call.getAnArgument() = n and
+      call.getCalleeName() =
+        [
+          "indexOf", "hasOwnProperty", "substring", "isDecimal", "decode", "encode", "keys",
+          "shift", "values", "forEach", "toString", "slice", "splice", "push", "isArray", "sort"
+        ]
+    )
  }
 }
--- a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointFeatures.qll
+++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointFeatures.qll
@@ -1,4 +1,4 @@
-/*
+/**
 * For internal use only.
 *
 * Extracts data about the database for use in adaptive threat modeling (ATM).
--- a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointScoring.qll
+++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointScoring.qll
@@ -1,4 +1,4 @@
-/*
+/**
 * For internal use only.
 *
 * Provides an implementation of scoring alerts for use in adaptive threat modeling (ATM).
--- a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointTypes.qll
+++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointTypes.qll
@@ -16,6 +16,11 @@ newtype TEndpointType =
 abstract class EndpointType extends TEndpointType {
  abstract string getDescription();

+  /**
+   * Gets the integer representation of this endpoint type. This integer representation specifies the class number
+   * used by the endpoint scoring model (the classifier) to represent this endpoint type. Class 0 is the negative
+   * class (non-sink). Each positive int corresponds to a single sink type.
+   */
  abstract int getEncoding();

  string toString() { result = getDescription() }
--- a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/FunctionBodyFeatures.qll
+++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/FunctionBodyFeatures.qll
@@ -1,4 +1,4 @@
-/*
+/**
 * FunctionBodyFeatures.qll
 *
 * Contains logic relating to the `enclosingFunctionBody` and `enclosingFunctionName` features.
--- a/javascript/ql/experimental/adaptivethreatmodeling/lib/qlpack.yml
+++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/qlpack.yml
@@ -1,4 +1,5 @@
 name: codeql/javascript-experimental-atm-lib
+description: CodeQL libraries for the experimental ML-powered queries
 version: 0.4.3
 extractor: javascript
 library: true
--- a/javascript/ql/experimental/adaptivethreatmodeling/model/qlpack.yml
+++ b/javascript/ql/experimental/adaptivethreatmodeling/model/qlpack.yml
@@ -1,4 +1,5 @@
 name: codeql/javascript-experimental-atm-model
+description: Machine learning model supporting the experimental ML-powered queries
 version: 0.3.1
 groups:
    - javascript
--- a/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/DebugResultInclusion.ql
+++ b/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/DebugResultInclusion.ql
@@ -11,7 +11,7 @@

 import javascript
 import experimental.adaptivethreatmodeling.ATMConfig
-import extraction.ExtractEndpointData
+import extraction.ExtractEndpointDataTraining

 string getAReasonSinkExcluded(DataFlow::Node sinkCandidate, Query query) {
  query instanceof NosqlInjectionQuery and
@@ -33,7 +33,7 @@ string getDescriptionForAlertCandidate(
 ) {
  result = "excluded[reason=" + getAReasonSinkExcluded(sinkCandidate, query) + "]"
  or
-  getAtmCfg(query).isKnownSink(sinkCandidate) and
+  getDataFlowCfg(query).(AtmConfig).isKnownSink(sinkCandidate) and
  result = "excluded[reason=known-sink]"
  or
  not exists(getAReasonSinkExcluded(sinkCandidate, query)) and
--- a/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/counting/CountAlertsAndSinks.qll
+++ b/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/counting/CountAlertsAndSinks.qll
@@ -1,4 +1,4 @@
-/*
+/**
 * For internal use only.
 *
 *
--- a/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/extraction/Exclusions.qll
+++ b/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/extraction/Exclusions.qll
@@ -1,4 +1,4 @@
-/*
+/**
 * For internal use only.
 *
 * Defines files that should be excluded from the evaluation of ML models.
--- a/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/extraction/ExtractEndpointData.ql
+++ b/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/extraction/ExtractEndpointData.ql
@@ -1,11 +0,0 @@
-/*
- * For internal use only.
- *
- * Extracts training and evaluation data we can use to train ML models for ML-powered queries.
- */
-
-import ExtractEndpointData as ExtractEndpointData
-
-query predicate endpoints = ExtractEndpointData::endpoints/5;
-
-query predicate tokenFeatures = ExtractEndpointData::tokenFeatures/3;
--- a/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/extraction/ExtractEndpointData.qll
+++ b/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/extraction/ExtractEndpointData.qll
@@ -1,215 +0,0 @@
-/*
- * For internal use only.
- *
- * Library code for training and evaluation data we can use to train ML models for ML-powered
- * queries.
- */
-
-import javascript
-import Exclusions as Exclusions
-import evaluation.EndToEndEvaluation as EndToEndEvaluation
-import experimental.adaptivethreatmodeling.ATMConfig
-import experimental.adaptivethreatmodeling.CoreKnowledge as CoreKnowledge
-import experimental.adaptivethreatmodeling.EndpointFeatures as EndpointFeatures
-import experimental.adaptivethreatmodeling.EndpointScoring as EndpointScoring
-import experimental.adaptivethreatmodeling.EndpointTypes
-import experimental.adaptivethreatmodeling.FilteringReasons
-import experimental.adaptivethreatmodeling.NosqlInjectionATM as NosqlInjectionAtm
-
-/** DEPRECATED: Alias for NosqlInjectionAtm */
-deprecated module NosqlInjectionATM = NosqlInjectionAtm;
-
-import experimental.adaptivethreatmodeling.SqlInjectionATM as SqlInjectionAtm
-
-/** DEPRECATED: Alias for SqlInjectionAtm */
-deprecated module SqlInjectionATM = SqlInjectionAtm;
-
-import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
-
-/** DEPRECATED: Alias for TaintedPathAtm */
-deprecated module TaintedPathATM = TaintedPathAtm;
-
-import experimental.adaptivethreatmodeling.XssATM as XssAtm
-
-/** DEPRECATED: Alias for XssAtm */
-deprecated module XssATM = XssAtm;
-
-import Labels
-import NoFeaturizationRestrictionsConfig
-import Queries
-
-/** Gets the ATM configuration object for the specified query. */
-AtmConfig getAtmCfg(Query query) {
-  query instanceof NosqlInjectionQuery and
-  result instanceof NosqlInjectionAtm::NosqlInjectionAtmConfig
-  or
-  query instanceof SqlInjectionQuery and result instanceof SqlInjectionAtm::SqlInjectionAtmConfig
-  or
-  query instanceof TaintedPathQuery and result instanceof TaintedPathAtm::TaintedPathAtmConfig
-  or
-  query instanceof XssQuery and result instanceof XssAtm::DomBasedXssAtmConfig
-}
-
-/** DEPRECATED: Alias for getAtmCfg */
-deprecated ATMConfig getATMCfg(Query query) { result = getAtmCfg(query) }
-
-/** Gets the ATM data flow configuration for the specified query. */
-DataFlow::Configuration getDataFlowCfg(Query query) {
-  query instanceof NosqlInjectionQuery and result instanceof NosqlInjectionAtm::Configuration
-  or
-  query instanceof SqlInjectionQuery and result instanceof SqlInjectionAtm::Configuration
-  or
-  query instanceof TaintedPathQuery and result instanceof TaintedPathAtm::Configuration
-  or
-  query instanceof XssQuery and result instanceof XssAtm::Configuration
-}
-
-/** Gets a known sink for the specified query. */
-private DataFlow::Node getASink(Query query) {
-  getAtmCfg(query).isKnownSink(result) and
-  // Only consider the source code for the project being analyzed.
-  exists(result.getFile().getRelativePath())
-}
-
-/** Gets a data flow node that is known not to be a sink for the specified query. */
-private DataFlow::Node getANotASink(NotASinkReason reason) {
-  CoreKnowledge::isOtherModeledArgument(result, reason) and
-  // Some endpoints can be assigned both a `NotASinkReason` and a `LikelyNotASinkReason`. We
-  // consider these endpoints to be `LikelyNotASink`, therefore this line excludes them from the
-  // definition of `NotASink`.
-  not CoreKnowledge::isOtherModeledArgument(result, any(LikelyNotASinkReason t)) and
-  not result = getASink(_) and
-  // Only consider the source code for the project being analyzed.
-  exists(result.getFile().getRelativePath())
-}
-
-/**
- * Gets a data flow node whose label is unknown for the specified query.
- *
- * In other words, this is an endpoint that is not `Sink`, `NotASink`, or `LikelyNotASink` for the
- * specified query.
- */
-private DataFlow::Node getAnUnknown(Query query) {
-  getAtmCfg(query).isEffectiveSink(result) and
-  // Effective sinks should exclude sinks but this is a defensive requirement
-  not result = getASink(query) and
-  // Effective sinks should exclude NotASink but for some queries (e.g. Xss) this is currently not always the case and
-  // so this is a defensive requirement
-  not result = getANotASink(_) and
-  // Only consider the source code for the project being analyzed.
-  exists(result.getFile().getRelativePath())
-}
-
-/** Gets the query-specific sink label for the given endpoint, if such a label exists. */
-private EndpointLabel getSinkLabelForEndpoint(DataFlow::Node endpoint, Query query) {
-  endpoint = getASink(query) and result instanceof SinkLabel
-  or
-  endpoint = getANotASink(_) and result instanceof NotASinkLabel
-  or
-  endpoint = getAnUnknown(query) and result instanceof UnknownLabel
-}
-
-/** Gets an endpoint that should be extracted. */
-DataFlow::Node getAnEndpoint(Query query) { exists(getSinkLabelForEndpoint(result, query)) }
-
-/**
- * Endpoints and associated metadata.
- *
- * Note that we draw a distinction between _features_, that are provided to the model at training
- * and query time, and _metadata_, that is only provided to the model at training time.
- *
- * Internal: See the design document for
- * [extensible extraction queries](https://docs.google.com/document/d/1g3ci2Nf1hGMG6ZUP0Y4PqCy_8elcoC_dhBvgTxdAWpg)
- * for technical information about the design of this predicate.
- */
-predicate endpoints(
-  DataFlow::Node endpoint, string queryName, string key, string value, string valueType
-) {
-  exists(Query query |
-    // Only provide metadata for labelled endpoints, since we do not extract all endpoints.
-    endpoint = getAnEndpoint(query) and
-    queryName = query.getName() and
-    (
-      // Holds if there is a taint flow path from a known source to the endpoint
-      key = "hasFlowFromSource" and
-      (
-        if FlowFromSource::hasFlowFromSource(endpoint, query)
-        then value = "true"
-        else value = "false"
-      ) and
-      valueType = "boolean"
-      or
-      // Constant expressions always evaluate to a constant primitive value. Therefore they can't ever
-      // appear in an alert, making them less interesting training examples.
-      key = "isConstantExpression" and
-      (if endpoint.asExpr() instanceof ConstantExpr then value = "true" else value = "false") and
-      valueType = "boolean"
-      or
-      // Holds if alerts involving the endpoint are excluded from the end-to-end evaluation.
-      key = "isExcludedFromEndToEndEvaluation" and
-      (if Exclusions::isFileExcluded(endpoint.getFile()) then value = "true" else value = "false") and
-      valueType = "boolean"
-      or
-      // The label for this query, considering the endpoint as a sink.
-      key = "sinkLabel" and
-      value = getSinkLabelForEndpoint(endpoint, query).getEncoding() and
-      valueType = "string"
-      or
-      // The reason, or reasons, why the endpoint was labeled NotASink for this query.
-      key = "notASinkReason" and
-      exists(FilteringReason reason |
-        endpoint = getANotASink(reason) and
-        value = reason.getDescription()
-      ) and
-      valueType = "string"
-    )
-  )
-}
-
-/**
- * `EndpointFeatures::tokenFeatures` has no results when `featureName` is absent for the endpoint
- * `endpoint`. To preserve compatibility with the data pipeline, this relation will instead set
- * `featureValue` to the empty string in this case.
- */
-predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string featureValue) {
-  endpoints(endpoint, _, _, _, _) and
-  (
-    EndpointFeatures::tokenFeatures(endpoint, featureName, featureValue)
-    or
-    // Performance note: this creates a Cartesian product between `endpoint` and `featureName`.
-    featureName = EndpointFeatures::getASupportedFeatureName() and
-    not exists(string value | EndpointFeatures::tokenFeatures(endpoint, featureName, value)) and
-    featureValue = ""
-  )
-}
-
-module FlowFromSource {
-  predicate hasFlowFromSource(DataFlow::Node endpoint, Query q) {
-    exists(Configuration cfg | cfg.getQuery() = q | cfg.hasFlow(_, endpoint))
-  }
-
-  /**
-   * A data flow configuration that replicates the data flow configuration for a specific query, but
-   * replaces the set of sinks with the set of endpoints we're extracting.
-   *
-   * We use this to find out when there is flow to a particular endpoint from a known source.
-   *
-   * This configuration behaves in a very similar way to the `ForwardExploringConfiguration` class
-   * from the CodeQL standard libraries for JavaScript.
-   */
-  private class Configuration extends DataFlow::Configuration {
-    Query q;
-
-    Configuration() { this = getDataFlowCfg(q) }
-
-    Query getQuery() { result = q }
-
-    /** Holds if `sink` is an endpoint we're extracting. */
-    override predicate isSink(DataFlow::Node sink) { sink = getAnEndpoint(q) }
-
-    /** Holds if `sink` is an endpoint we're extracting. */
-    override predicate isSink(DataFlow::Node sink, DataFlow::FlowLabel lbl) {
-      sink = getAnEndpoint(q) and exists(lbl)
-    }
-  }
-}
--- a/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/extraction/ExtractEndpointDataTraining.ql
+++ b/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/extraction/ExtractEndpointDataTraining.ql
@@ -4,23 +4,8 @@
 * Extracts training data we can use to train ML models for ML-powered queries.
 */

-import javascript
-import ExtractEndpointData as ExtractEndpointData
+private import ExtractEndpointDataTraining as ExtractEndpointDataTraining

-query predicate endpoints(
-  DataFlow::Node endpoint, string queryName, string key, string value, string valueType
-) {
-  ExtractEndpointData::endpoints(endpoint, queryName, key, value, valueType) and
-  // only select endpoints that are either Sink or NotASink
-  ExtractEndpointData::endpoints(endpoint, queryName, "sinkLabel", ["Sink", "NotASink"], "string") and
-  // do not select endpoints filtered out by end-to-end evaluation
-  ExtractEndpointData::endpoints(endpoint, queryName, "isExcludedFromEndToEndEvaluation", "false",
-    "boolean") and
-  // only select endpoints that can be part of a tainted flow
-  ExtractEndpointData::endpoints(endpoint, queryName, "isConstantExpression", "false", "boolean")
-}
+query predicate endpoints = ExtractEndpointDataTraining::reformattedTrainingEndpoints/5;

-query predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string featureValue) {
-  endpoints(endpoint, _, _, _, _) and
-  ExtractEndpointData::tokenFeatures(endpoint, featureName, featureValue)
-}
+query predicate tokenFeatures = ExtractEndpointDataTraining::tokenFeatures/3;
--- a/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/extraction/ExtractEndpointDataTraining.qll
+++ b/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/extraction/ExtractEndpointDataTraining.qll
@@ -0,0 +1,238 @@
+/**
+ * For internal use only.
+ *
+ * Extracts training data we can use to train ML models for ML-powered queries.
+ */
+
+import javascript
+import experimental.adaptivethreatmodeling.EndpointCharacteristics
+import experimental.adaptivethreatmodeling.EndpointFeatures as EndpointFeatures
+import NoFeaturizationRestrictionsConfig
+private import Exclusions as Exclusions
+import Queries
+import experimental.adaptivethreatmodeling.NosqlInjectionATM as NosqlInjectionAtm
+import experimental.adaptivethreatmodeling.SqlInjectionATM as SqlInjectionAtm
+import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
+import experimental.adaptivethreatmodeling.XssATM as XssAtm
+
+/**
+ * Holds if `featureValue` is the value of the token feature `featureName` for the training-set endpoint `endpoint`.
+ *
+ * `EndpointFeatures::tokenFeatures` has no results when `featureName` is absent for `endpoint`.
+ * To preserve compatibility with the data pipeline, this relation instead yields the empty string
+ * as `featureValue` for every supported feature name that has no value for `endpoint`.
+ */
+predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string featureValue) {
+  trainingEndpoints(endpoint, _, _) and
+  (
+    EndpointFeatures::tokenFeatures(endpoint, featureName, featureValue)
+    or
+    // Performance note: this creates a Cartesian product between `endpoint` and `featureName`.
+    featureName = EndpointFeatures::getASupportedFeatureName() and
+    not exists(string value | EndpointFeatures::tokenFeatures(endpoint, featureName, value)) and
+    featureValue = ""
+  )
+}
+
+/**
+ * Holds if `endpoint` should be included in the training set as a sample belonging to `endpointClass`, and has
+ * the given `characteristic`. This query uses the endpoint characteristics to select and label endpoints for the
+ * training set, and provides a list of characteristics for each endpoint in the training set, which is used in the
+ * modeling code.
+ *
+ * Params:
+ * endpoint: The endpoint to include / exclude.
+ * endpointClass: The sink type. See the documentation of EndpointType.getEncoding for details about the relationship
+ * between an EndpointType and a class in the classifier.
+ * characteristic: Provides the list of characteristics that apply to the endpoint, which the modeling code currently
+ * uses for type balancing.
+ *
+ * Note: This predicate will produce multiple tuples for endpoints that have multiple characteristics, which we must
+ * then group together into a list of characteristics.
+ */
+query predicate trainingEndpoints(
+  DataFlow::Node endpoint, EndpointType endpointClass, EndpointCharacteristic characteristic
+) {
+  characteristic.getEndpoints(endpoint) and
+  // Only consider the source code for the project being analyzed.
+  exists(endpoint.getFile().getRelativePath()) and
+  // Only select endpoints that can be part of a tainted flow: Constant expressions always evaluate to a constant
+  // primitive value. Therefore they can't ever appear in an alert, making them less interesting training examples.
+  // TODO: Experiment with removing this requirement.
+  not endpoint.asExpr() instanceof ConstantExpr and
+  // Do not select endpoints filtered out by end-to-end evaluation.
+  // TODO: Experiment with removing this requirement.
+  not Exclusions::isFileExcluded(endpoint.getFile()) and
+  // Filter out negative examples that also have a `LikelyNotASinkCharacteristic`, mirroring the existing logic at
+  // https://github.com/github/codeql/blob/387e57546bf7352f7c1cfe781daa1a3799b7063e/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/extraction/ExtractEndpointData.qll#L77
+  // TODO: Experiment with removing this requirement.
+  not (
+    endpointClass instanceof NegativeType and
+    exists(EndpointCharacteristic c |
+      c.getEndpoints(endpoint) and
+      c instanceof LikelyNotASinkCharacteristic
+    )
+  ) and
+  (
+    // If the list of characteristics includes positive indicators with high confidence for this class, select this as a
+    // training sample belonging to the class.
+    exists(EndpointCharacteristic characteristic2, float confidence |
+      characteristic2.getEndpoints(endpoint) and
+      characteristic2.getImplications(endpointClass, true, confidence) and
+      confidence >= characteristic2.getHighConfidenceThreshold()
+    ) and
+    (
+      // Temporarily limit this only to positive classes. For negative classes, additionally select only endpoints that
+      // have no high confidence indicators that they are sinks, because this is what was previously done.
+      // TODO: Experiment with removing this requirement, and instead ensuring that an endpoint never has both a high
+      // confidence indicator that it _is_ a sink and a high confidence indicator that it is _not_ a sink.
+      not endpointClass instanceof NegativeType
+      or
+      not exists(EndpointCharacteristic characteristic3, float confidence3, EndpointType posClass |
+        characteristic3.getEndpoints(endpoint) and
+        characteristic3.getImplications(posClass, true, confidence3) and
+        confidence3 >= characteristic3.getHighConfidenceThreshold() and
+        not posClass instanceof NegativeType
+      )
+    )
+    or
+    // If the list of characteristics includes negative indicators with high confidence for every class other than the
+    // negative class (class 0), select this as a training sample of the negative class (this means we had
+    // query-specific characteristics to decide this endpoint isn't a sink for each of our sink types).
+    endpointClass instanceof NegativeType and
+    forall(EndpointType otherClass | not otherClass instanceof NegativeType |
+      exists(EndpointCharacteristic characteristic2, float confidence |
+        characteristic2.getEndpoints(endpoint) and
+        characteristic2.getImplications(otherClass, false, confidence) and
+        confidence >= characteristic2.getHighConfidenceThreshold()
+      )
+    )
+  )
+}
+
+/**
+ * Temporary:
+ * Holds if the row (`queryName`, `key`, `value`, `valueType`) describes the training endpoint `endpoint`, in the
+ * format produced by the old extraction predicate. This is the format expected by the endpoint pipeline.
+ */
+query predicate reformattedTrainingEndpoints(
+  DataFlow::Node endpoint, string queryName, string key, string value, string valueType
+) {
+  trainingEndpoints(endpoint, _, _) and
+  exists(Query query |
+    queryName = query.getName() and
+    // For sinks, only emit rows for the query matching that sink type; for non-sinks, emit rows for every query.
+    (
+      exists(EndpointType endpointClass |
+        endpointClass.getDescription().matches(queryName + "%") and
+        not endpointClass instanceof NegativeType and
+        trainingEndpoints(endpoint, endpointClass, _)
+      )
+      or
+      exists(EndpointType endpointClass |
+        endpointClass instanceof NegativeType and
+        trainingEndpoints(endpoint, endpointClass, _)
+      )
+    ) and
+    (
+      // NOTE: We don't use hasFlowFromSource in training, so we could just hardcode it to be false.
+      key = "hasFlowFromSource" and
+      (
+        if FlowFromSource::hasFlowFromSource(endpoint, query)
+        then value = "true"
+        else value = "false"
+      ) and
+      valueType = "boolean"
+      or
+      // Constant expressions always evaluate to a constant primitive value. Therefore they can't ever
+      // appear in an alert, making them less interesting training examples.
+      key = "isConstantExpression" and
+      (if endpoint.asExpr() instanceof ConstantExpr then value = "true" else value = "false") and
+      valueType = "boolean"
+      or
+      // Holds if alerts involving the endpoint are excluded from the end-to-end evaluation.
+      key = "isExcludedFromEndToEndEvaluation" and
+      (if Exclusions::isFileExcluded(endpoint.getFile()) then value = "true" else value = "false") and
+      valueType = "boolean"
+      or
+      // The label for this query: "Sink" for this query's sink type; the next disjunct emits "NotASink" for negatives.
+      key = "sinkLabel" and
+      valueType = "string" and
+      value = "Sink" and
+      exists(EndpointType endpointClass |
+        endpointClass.getDescription().matches(queryName + "%") and
+        not endpointClass instanceof NegativeType and
+        trainingEndpoints(endpoint, endpointClass, _)
+      )
+      or
+      key = "sinkLabel" and
+      valueType = "string" and
+      value = "NotASink" and
+      exists(EndpointType endpointClass |
+        endpointClass instanceof NegativeType and
+        trainingEndpoints(endpoint, endpointClass, _)
+      )
+      or
+      // The reason, or reasons, why the endpoint was labeled NotASink for this query, only for negative examples.
+      key = "notASinkReason" and
+      exists(EndpointCharacteristic characteristic, EndpointType endpointClass |
+        characteristic.getEndpoints(endpoint) and
+        characteristic.getImplications(endpointClass, true, _) and
+        endpointClass instanceof NegativeType and
+        value = characteristic
+      ) and
+      // Don't include a notASinkReason for endpoints that are also known sinks.
+      not exists(EndpointCharacteristic characteristic3, float confidence3, EndpointType posClass |
+        characteristic3.getEndpoints(endpoint) and
+        characteristic3.getImplications(posClass, true, confidence3) and
+        confidence3 >= characteristic3.getHighConfidenceThreshold() and
+        not posClass instanceof NegativeType
+      ) and
+      valueType = "string"
+    )
+  )
+}
+
+/**
+ * Gets the ATM data flow configuration for `query` (NoSQL injection, SQL injection, tainted path or XSS).
+ * TODO: Delete this once we are no longer surfacing `hasFlowFromSource`.
+ */
+DataFlow::Configuration getDataFlowCfg(Query query) {
+  query instanceof NosqlInjectionQuery and result instanceof NosqlInjectionAtm::Configuration
+  or
+  query instanceof SqlInjectionQuery and result instanceof SqlInjectionAtm::Configuration
+  or
+  query instanceof TaintedPathQuery and result instanceof TaintedPathAtm::Configuration
+  or
+  query instanceof XssQuery and result instanceof XssAtm::Configuration
+}
+
+// TODO: Delete this once we are no longer surfacing `hasFlowFromSource`.
+private module FlowFromSource {
+  predicate hasFlowFromSource(DataFlow::Node endpoint, Query q) {
+    exists(Configuration cfg | cfg.getQuery() = q | cfg.hasFlow(_, endpoint))
+  }
+
+  /**
+   * A data flow configuration that replicates the data flow configuration for a specific query, but
+   * whose `isSink` overrides accept every data flow node rather than only extracted endpoints.
+   *
+   * We use this to find out when there is flow to a particular endpoint from a known source.
+   *
+   * This configuration behaves in a very similar way to the `ForwardExploringConfiguration` class
+   * from the CodeQL standard libraries for JavaScript.
+   */
+  private class Configuration extends DataFlow::Configuration {
+    Query q;
+
+    Configuration() { this = getDataFlowCfg(q) }
+
+    Query getQuery() { result = q }
+
+    /** Holds for every `sink`: any data flow node is accepted as a sink. */
+    override predicate isSink(DataFlow::Node sink) { any() }
+
+    /** Holds for every `sink` and every flow label `lbl`. */
+    override predicate isSink(DataFlow::Node sink, DataFlow::FlowLabel lbl) { exists(lbl) }
+  }
+}
--- a/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/qlpack.yml
+++ b/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/qlpack.yml
@@ -1,4 +1,5 @@
 name: codeql/javascript-experimental-atm-model-building
+description: CodeQL libraries for building machine learning models for the experimental ML-powered queries
 extractor: javascript
 library: false
 groups:
--- a/javascript/ql/experimental/adaptivethreatmodeling/src/qlpack.yml
+++ b/javascript/ql/experimental/adaptivethreatmodeling/src/qlpack.yml
@@ -1,4 +1,5 @@
 name: codeql/javascript-experimental-atm-queries
+description: Experimental ML-powered queries for JavaScript
 language: javascript
 version: 0.4.3
 suites: codeql-suites
--- a/javascript/ql/experimental/adaptivethreatmodeling/test/endpoint_large_scale/ExtractEndpointData.qlref
+++ b/javascript/ql/experimental/adaptivethreatmodeling/test/endpoint_large_scale/ExtractEndpointData.qlref
@@ -1 +0,0 @@
-extraction/ExtractEndpointData.ql
--- a/javascript/ql/experimental/adaptivethreatmodeling/test/endpoint_unit_tests/ExtractEndpointData.qlref
+++ b/javascript/ql/experimental/adaptivethreatmodeling/test/endpoint_unit_tests/ExtractEndpointData.qlref
@@ -1 +0,0 @@
-extraction/ExtractEndpointData.ql