Migrate representative entity -> representative function

2026-05-20 22:27:18 +02:00 · 2022-01-11 19:09:13 +00:00
parent 2edfb24c70
commit dd009d81a4
3 changed files with 54 additions and 98 deletions
--- a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/BaseScoring.qll
+++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/BaseScoring.qll
@@ -15,69 +15,18 @@ external predicate availableMlModels(
 ATMConfig getCfg() { any() }

 /**
- * This module provides functionality that takes an endpoint and provides an entity that encloses that
- * endpoint and is suitable for similarity analysis.
+ * This module provides functionality that takes an endpoint and provides a function that encloses
+ * that endpoint.
 */
-module EndpointToEntity {
+module EndpointToFunction {
  private import CodeToFeatures

  /**
-   * Get an entity enclosing the endpoint that is suitable for similarity analysis. In general,
-   * this may associate multiple entities to a single endpoint.
+   * Get a function containing the endpoint that is suitable for featurization. In general, this
+   * associates an endpoint with multiple functions, since several functions may enclose it.
   */
-  DatabaseFeatures::Entity getAnEntityForEndpoint(DataFlow::Node endpoint) {
-    DatabaseFeatures::entities(result, _, _, _, _, _, _, _, _) and
-    result.getDefinedFunction() = endpoint.getContainer().getEnclosingContainer*()
-  }
-}
-
-/**
- * This module provides functionality that takes an entity and provides effective endpoints within
- * that entity.
- *
- * We use the following terminology to describe endpoints:
- *
- * - The *candidate* endpoints are the set of data flow nodes that should be passed to the
- *   appropriate endpoint filter to produce the set of effective endpoints.
- *   When we have a model that beats the performance of the baseline, we will likely define the
- *   candidate endpoints based on the most confident predictions of the model.
- * - An *effective* endpoint is a candidate endpoint which passes through the endpoint filter.
- *   In other words, it is a candidate endpoint for which the `isEffectiveSink` (or
- *   `isEffectiveSource`) predicate defined in the `ATMConfig` instance in scope holds.
- */
-module EntityToEffectiveEndpoint {
-  private import CodeToFeatures
-
-  /**
-   * Returns endpoint candidates within the specified entities.
-   *
-   * The baseline implementation of this is that a candidate endpoint is any data flow node that is
-   * enclosed within the specified entity.
-   */
-  private DataFlow::Node getABaselineEndpointCandidate(DatabaseFeatures::Entity entity) {
-    result.getContainer().getEnclosingContainer*() = entity.getDefinedFunction()
-  }
-
-  /**
-   * Get an effective source enclosed by the specified entity.
-   *
-   * N.B. This is _not_ an inverse of `EndpointToEntity::getAnEntityForEndpoint`: the effective
-   * source may occur in a function defined within the specified entity.
-   */
-  DataFlow::Node getAnEffectiveSource(DatabaseFeatures::Entity entity) {
-    result = getABaselineEndpointCandidate(entity) and
-    getCfg().isEffectiveSource(result)
-  }
-
-  /**
-   * Get an effective sink enclosed by the specified entity.
-   *
-   * N.B. This is _not_ an inverse of `EndpointToEntity::getAnEntityForEndpoint`: the effective
-   * sink may occur in a function defined within the specified entity.
-   */
-  DataFlow::Node getAnEffectiveSink(DatabaseFeatures::Entity entity) {
-    result = getABaselineEndpointCandidate(entity) and
-    getCfg().isEffectiveSink(result)
+  Function getAFunctionForEndpoint(DataFlow::Node endpoint) {
+    result = endpoint.getContainer().getEnclosingContainer*()
  }
 }

--- a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointFeatures.qll
+++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointFeatures.qll
@@ -31,7 +31,9 @@ private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
  endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
  (
    // Features for endpoints that are contained within a function.
-    exists(DatabaseFeatures::Entity entity | entity = getRepresentativeEntityForEndpoint(endpoint) |
+    exists(DatabaseFeatures::Entity entity |
+      entity.getDefinedFunction() = getRepresentativeFunctionForEndpoint(endpoint)
+    |
      // The name of the function that encloses the endpoint.
      featureName = "enclosingFunctionName" and result = entity.getName()
      or
@@ -147,24 +149,27 @@ module FunctionBodies {
    result = node.(TemplateElement).getRawValue()
  }

+  /** Returns an AST node within the function `f` that we should featurize. */
+  pragma[inline]
+  ASTNode getAnASTNodeToFeaturize(Function f) {
+    result.getParent*() = f and
+    not result = f.getIdentifier() and
+    exists(getTokenizedAstNode(result))
+  }
+
  /** Holds if `location` is the location of an AST node within the entity `entity` and `token` is a node attribute associated with that AST node. */
  private predicate bodyTokens(DatabaseFeatures::Entity entity, Location location, string token) {
    // Performance optimization: Restrict the set of entities to those containing an endpoint to featurize.
-    entity =
-      getRepresentativeEntityForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and
+    entity.getDefinedFunction() =
+      getRepresentativeFunctionForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and
    // Performance optimization: If a function has more than 256 body subtokens, then featurize it as absent. This
    // approximates the behavior of the classifer on non-generic body features where large body
    // features are replaced by the absent token.
    //
    // We count nodes instead of tokens because tokens are often not unique.
-    strictcount(ASTNode node |
-      node.getParent*() = entity.getDefinedFunction() and
-      not node = entity.getDefinedFunction().getIdentifier() and
-      exists(getTokenizedAstNode(node))
-    ) <= 256 and
+    strictcount(getAnASTNodeToFeaturize(entity.getDefinedFunction())) <= 256 and
    exists(ASTNode node |
-      node.getParent*() = entity.getDefinedFunction() and
-      not node = entity.getDefinedFunction().getIdentifier() and
+      node = getAnASTNodeToFeaturize(entity.getDefinedFunction()) and
      token = getTokenizedAstNode(node) and
      location = node.getLocation()
    )
--- a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointScoring.qll
+++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointScoring.qll
@@ -15,8 +15,8 @@ private string getACompatibleModelChecksum() {
 }

 /**
- * The maximum number of AST nodes an entity containing an endpoint should have before we should
- * choose a smaller entity to represent the endpoint.
+ * The maximum number of AST nodes a function containing an endpoint should have before we should
+ * choose a smaller function to represent the endpoint.
 *
 * This is intended to represent a balance in terms of the amount of context we provide to the
 * model: we don't want the function to be too small, because then it doesn't contain very much
@@ -26,54 +26,56 @@ private string getACompatibleModelChecksum() {
 private int getMaxNumAstNodes() { result = 1024 }

 /**
- * Returns the number of AST nodes contained within the specified entity.
+ * Returns the number of AST nodes within the specified function that we should featurize.
 */
-private int getNumAstNodesInEntity(DatabaseFeatures::Entity entity) {
-  // Restrict the values `entity` can take on
-  entity = EndpointToEntity::getAnEntityForEndpoint(_) and
-  result =
-    count(DatabaseFeatures::AstNode astNode | DatabaseFeatures::astNodes(entity, _, _, astNode, _))
+private int getNumAstNodesInFunction(Function function) {
+  // Restrict the values `function` can take on
+  function = EndpointToFunction::getAFunctionForEndpoint(_) and
+  result = count(EndpointFeatures::FunctionBodies::getAnASTNodeToFeaturize(function))
 }

 /**
- * Get a single entity to use as the representative entity for the endpoint.
+ * Get the enclosing function for an endpoint.
+ *
+ * This is used to compute the `enclosingFunctionBody` and `enclosingFunctionName` features.
 *
- * We try to use the largest entity containing the endpoint that's below the AST node limit defined
- * in `getMaxNumAstNodes`. In the event of a tie, we use the entity that appears first within the
- * source archive.
+ * We try to use the largest function containing the endpoint that's within the AST node limit
+ * defined in `getMaxNumAstNodes`. In the event of a tie, we use the function that appears first
+ * within the source code.
 *
- * If no entities are smaller than the AST node limit, then we use the smallest entity containing
+ * If no functions are within the AST node limit, then we use the smallest function containing
 * the endpoint.
 */
-DatabaseFeatures::Entity getRepresentativeEntityForEndpoint(DataFlow::Node endpoint) {
-  // Check whether there's an entity containing the endpoint that's smaller than the AST node limit.
+Function getRepresentativeFunctionForEndpoint(DataFlow::Node endpoint) {
+  // Check whether there's a function containing the endpoint that's smaller than the AST node
+  // limit.
  if
-    getNumAstNodesInEntity(EndpointToEntity::getAnEntityForEndpoint(endpoint)) <=
+    getNumAstNodesInFunction(EndpointToFunction::getAFunctionForEndpoint(endpoint)) <=
      getMaxNumAstNodes()
  then
-    // Use the largest entity smaller than the AST node limit, resolving ties using the entity that
-    // appears first in the source archive.
+    // Use the largest function smaller than the AST node limit, resolving ties using the function
+    // that appears first in the source code.
    result =
-      min(DatabaseFeatures::Entity entity, int numAstNodes, Location l |
-        entity = EndpointToEntity::getAnEntityForEndpoint(endpoint) and
-        numAstNodes = getNumAstNodesInEntity(entity) and
+      min(Function function, int numAstNodes, Location l |
+        function = EndpointToFunction::getAFunctionForEndpoint(endpoint) and
+        numAstNodes = getNumAstNodesInFunction(function) and
        numAstNodes <= getMaxNumAstNodes() and
-        l = entity.getLocation()
+        l = function.getLocation()
      |
-        entity
+        function
        order by
          numAstNodes desc, l.getStartLine(), l.getStartColumn(), l.getEndLine(), l.getEndColumn()
      )
  else
-    // Use the smallest entity, resolving ties using the entity that
-    // appears first in the source archive.
+    // Use the smallest function, resolving ties using the function that appears first in the source
+    // code.
    result =
-      min(DatabaseFeatures::Entity entity, int numAstNodes, Location l |
-        entity = EndpointToEntity::getAnEntityForEndpoint(endpoint) and
-        numAstNodes = getNumAstNodesInEntity(entity) and
-        l = entity.getLocation()
+      min(Function function, int numAstNodes, Location l |
+        function = EndpointToFunction::getAFunctionForEndpoint(endpoint) and
+        numAstNodes = getNumAstNodesInFunction(function) and
+        l = function.getLocation()
      |
-        entity
+        function
        order by
          numAstNodes, l.getStartLine(), l.getStartColumn(), l.getEndLine(), l.getEndColumn()
      )