diff --git a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/BaseScoring.qll b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/BaseScoring.qll index e746c78bdde..72be8174dc8 100644 --- a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/BaseScoring.qll +++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/BaseScoring.qll @@ -15,69 +15,18 @@ external predicate availableMlModels( ATMConfig getCfg() { any() } /** - * This module provides functionality that takes an endpoint and provides an entity that encloses that - * endpoint and is suitable for similarity analysis. + * This module provides functionality that takes an endpoint and provides a function that encloses + * that endpoint. */ -module EndpointToEntity { +module EndpointToFunction { private import CodeToFeatures /** - * Get an entity enclosing the endpoint that is suitable for similarity analysis. In general, - * this may associate multiple entities to a single endpoint. + * Get a function containing the endpoint that is suitable for featurization. In general, + * this associates an endpoint to multiple functions, since functions may be nested. */ - DatabaseFeatures::Entity getAnEntityForEndpoint(DataFlow::Node endpoint) { - DatabaseFeatures::entities(result, _, _, _, _, _, _, _, _) and - result.getDefinedFunction() = endpoint.getContainer().getEnclosingContainer*() - } -} - -/** - * This module provides functionality that takes an entity and provides effective endpoints within - * that entity. - * - * We use the following terminology to describe endpoints: - * - * - The *candidate* endpoints are the set of data flow nodes that should be passed to the - * appropriate endpoint filter to produce the set of effective endpoints. 
- * When we have a model that beats the performance of the baseline, we will likely define the - * candidate endpoints based on the most confident predictions of the model. - * - An *effective* endpoint is a candidate endpoint which passes through the endpoint filter. - * In other words, it is a candidate endpoint for which the `isEffectiveSink` (or - * `isEffectiveSource`) predicate defined in the `ATMConfig` instance in scope holds. - */ -module EntityToEffectiveEndpoint { - private import CodeToFeatures - - /** - * Returns endpoint candidates within the specified entities. - * - * The baseline implementation of this is that a candidate endpoint is any data flow node that is - * enclosed within the specified entity. - */ - private DataFlow::Node getABaselineEndpointCandidate(DatabaseFeatures::Entity entity) { - result.getContainer().getEnclosingContainer*() = entity.getDefinedFunction() - } - - /** - * Get an effective source enclosed by the specified entity. - * - * N.B. This is _not_ an inverse of `EndpointToEntity::getAnEntityForEndpoint`: the effective - * source may occur in a function defined within the specified entity. - */ - DataFlow::Node getAnEffectiveSource(DatabaseFeatures::Entity entity) { - result = getABaselineEndpointCandidate(entity) and - getCfg().isEffectiveSource(result) - } - - /** - * Get an effective sink enclosed by the specified entity. - * - * N.B. This is _not_ an inverse of `EndpointToEntity::getAnEntityForEndpoint`: the effective - * sink may occur in a function defined within the specified entity. 
- */ - DataFlow::Node getAnEffectiveSink(DatabaseFeatures::Entity entity) { - result = getABaselineEndpointCandidate(entity) and - getCfg().isEffectiveSink(result) + Function getAFunctionForEndpoint(DataFlow::Node endpoint) { + result = endpoint.getContainer().getEnclosingContainer*() } } diff --git a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointFeatures.qll b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointFeatures.qll index d5b0c6e22d9..7484d5fc287 100644 --- a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointFeatures.qll +++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointFeatures.qll @@ -31,7 +31,9 @@ private string getTokenFeature(DataFlow::Node endpoint, string featureName) { endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and ( // Features for endpoints that are contained within a function. - exists(DatabaseFeatures::Entity entity | entity = getRepresentativeEntityForEndpoint(endpoint) | + exists(DatabaseFeatures::Entity entity | + entity.getDefinedFunction() = getRepresentativeFunctionForEndpoint(endpoint) + | // The name of the function that encloses the endpoint. featureName = "enclosingFunctionName" and result = entity.getName() or @@ -147,24 +149,27 @@ module FunctionBodies { result = node.(TemplateElement).getRawValue() } + /** Returns an AST node within the function `f` that we should featurize. */ + pragma[inline] + ASTNode getAnASTNodeToFeaturize(Function f) { + result.getParent*() = f and + not result = f.getIdentifier() and + exists(getTokenizedAstNode(result)) + } + /** Holds if `location` is the location of an AST node within the entity `entity` and `token` is a node attribute associated with that AST node. 
*/ private predicate bodyTokens(DatabaseFeatures::Entity entity, Location location, string token) { // Performance optimization: Restrict the set of entities to those containing an endpoint to featurize. - entity = - getRepresentativeEntityForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and + entity.getDefinedFunction() = + getRepresentativeFunctionForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and // Performance optimization: If a function has more than 256 body subtokens, then featurize it as absent. This // approximates the behavior of the classifer on non-generic body features where large body // features are replaced by the absent token. // // We count nodes instead of tokens because tokens are often not unique. - strictcount(ASTNode node | - node.getParent*() = entity.getDefinedFunction() and - not node = entity.getDefinedFunction().getIdentifier() and - exists(getTokenizedAstNode(node)) - ) <= 256 and + strictcount(getAnASTNodeToFeaturize(entity.getDefinedFunction())) <= 256 and exists(ASTNode node | - node.getParent*() = entity.getDefinedFunction() and - not node = entity.getDefinedFunction().getIdentifier() and + node = getAnASTNodeToFeaturize(entity.getDefinedFunction()) and token = getTokenizedAstNode(node) and location = node.getLocation() ) diff --git a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointScoring.qll b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointScoring.qll index 089db15fea3..1ed0f6aa480 100644 --- a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointScoring.qll +++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointScoring.qll @@ -15,8 +15,8 @@ private string getACompatibleModelChecksum() { } /** - * The maximum number of AST nodes an entity containing an endpoint should have before we should - * 
choose a smaller entity to represent the endpoint. + * The maximum number of AST nodes a function containing an endpoint should have before we should + * choose a smaller function to represent the endpoint. * * This is intended to represent a balance in terms of the amount of context we provide to the * model: we don't want the function to be too small, because then it doesn't contain very much @@ -26,54 +26,56 @@ private string getACompatibleModelChecksum() { private int getMaxNumAstNodes() { result = 1024 } /** - * Returns the number of AST nodes contained within the specified entity. + * Returns the number of AST nodes contained within the specified function. */ -private int getNumAstNodesInEntity(DatabaseFeatures::Entity entity) { - // Restrict the values `entity` can take on - entity = EndpointToEntity::getAnEntityForEndpoint(_) and - result = - count(DatabaseFeatures::AstNode astNode | DatabaseFeatures::astNodes(entity, _, _, astNode, _)) +private int getNumAstNodesInFunction(Function function) { + // Restrict the values `function` can take on + function = EndpointToFunction::getAFunctionForEndpoint(_) and + result = count(EndpointFeatures::FunctionBodies::getAnASTNodeToFeaturize(function)) } /** - * Get a single entity to use as the representative entity for the endpoint. + * Get the enclosing function for an endpoint. + * + * This is used to compute the `enclosingFunctionBody` and `enclosingFunctionName` features. * - * We try to use the largest entity containing the endpoint that's below the AST node limit defined - * in `getMaxNumAstNodes`. In the event of a tie, we use the entity that appears first within the - * source archive. + * We try to use the largest function containing the endpoint that's below the AST node limit + * defined in `getMaxNumAstNodes`. In the event of a tie, we use the function that appears first + * within the source code. 
* - * If no entities are smaller than the AST node limit, then we use the smallest entity containing + * If no functions are smaller than the AST node limit, then we use the smallest function containing * the endpoint. */ -DatabaseFeatures::Entity getRepresentativeEntityForEndpoint(DataFlow::Node endpoint) { - // Check whether there's an entity containing the endpoint that's smaller than the AST node limit. +Function getRepresentativeFunctionForEndpoint(DataFlow::Node endpoint) { + // Check whether there's a function containing the endpoint that's smaller than the AST node + // limit. if - getNumAstNodesInEntity(EndpointToEntity::getAnEntityForEndpoint(endpoint)) <= + getNumAstNodesInFunction(EndpointToFunction::getAFunctionForEndpoint(endpoint)) <= getMaxNumAstNodes() then - // Use the largest entity smaller than the AST node limit, resolving ties using the entity that - // appears first in the source archive. + // Use the largest function smaller than the AST node limit, resolving ties using the function + // that appears first in the source code. result = - min(DatabaseFeatures::Entity entity, int numAstNodes, Location l | - entity = EndpointToEntity::getAnEntityForEndpoint(endpoint) and - numAstNodes = getNumAstNodesInEntity(entity) and + min(Function function, int numAstNodes, Location l | + function = EndpointToFunction::getAFunctionForEndpoint(endpoint) and + numAstNodes = getNumAstNodesInFunction(function) and numAstNodes <= getMaxNumAstNodes() and - l = entity.getLocation() + l = function.getLocation() | - entity + function order by numAstNodes desc, l.getStartLine(), l.getStartColumn(), l.getEndLine(), l.getEndColumn() ) else - // Use the smallest entity, resolving ties using the entity that - // appears first in the source archive. + // Use the smallest function, resolving ties using the function that appears first in the source + // code. 
result = - min(DatabaseFeatures::Entity entity, int numAstNodes, Location l | - entity = EndpointToEntity::getAnEntityForEndpoint(endpoint) and - numAstNodes = getNumAstNodesInEntity(entity) and - l = entity.getLocation() + min(Function function, int numAstNodes, Location l | + function = EndpointToFunction::getAFunctionForEndpoint(endpoint) and + numAstNodes = getNumAstNodesInFunction(function) and + l = function.getLocation() | - entity + function order by numAstNodes, l.getStartLine(), l.getStartColumn(), l.getEndLine(), l.getEndColumn() )