Migrate representative entity -> representative function

This commit is contained in:
Henry Mercer
2022-01-11 19:09:13 +00:00
parent 2edfb24c70
commit dd009d81a4
3 changed files with 54 additions and 98 deletions

View File

@@ -15,69 +15,18 @@ external predicate availableMlModels(
ATMConfig getCfg() { any() }
/**
* This module provides functionality that takes an endpoint and provides an entity that encloses that
* endpoint and is suitable for similarity analysis.
* This module provides functionality that takes an endpoint and provides a function that encloses
* that endpoint.
*/
module EndpointToEntity {
module EndpointToFunction {
private import CodeToFeatures
/**
* Get an entity enclosing the endpoint that is suitable for similarity analysis. In general,
* this may associate multiple entities to a single endpoint.
* Get a function containing the endpoint that is suitable for featurization. In general,
* this associates an endpoint with multiple functions, since functions can be nested and so
* more than one function may enclose a single endpoint.
*/
DatabaseFeatures::Entity getAnEntityForEndpoint(DataFlow::Node endpoint) {
DatabaseFeatures::entities(result, _, _, _, _, _, _, _, _) and
result.getDefinedFunction() = endpoint.getContainer().getEnclosingContainer*()
}
}
/**
* This module provides functionality that takes an entity and provides effective endpoints within
* that entity.
*
* We use the following terminology to describe endpoints:
*
* - The *candidate* endpoints are the set of data flow nodes that should be passed to the
* appropriate endpoint filter to produce the set of effective endpoints.
* When we have a model that beats the performance of the baseline, we will likely define the
* candidate endpoints based on the most confident predictions of the model.
* - An *effective* endpoint is a candidate endpoint which passes through the endpoint filter.
* In other words, it is a candidate endpoint for which the `isEffectiveSink` (or
* `isEffectiveSource`) predicate defined in the `ATMConfig` instance in scope holds.
*/
module EntityToEffectiveEndpoint {
private import CodeToFeatures
/**
* Returns endpoint candidates within the specified entities.
*
* The baseline implementation of this is that a candidate endpoint is any data flow node that is
* enclosed within the specified entity.
*/
private DataFlow::Node getABaselineEndpointCandidate(DatabaseFeatures::Entity entity) {
result.getContainer().getEnclosingContainer*() = entity.getDefinedFunction()
}
/**
* Get an effective source enclosed by the specified entity.
*
* N.B. This is _not_ an inverse of `EndpointToEntity::getAnEntityForEndpoint`: the effective
* source may occur in a function defined within the specified entity.
*/
DataFlow::Node getAnEffectiveSource(DatabaseFeatures::Entity entity) {
result = getABaselineEndpointCandidate(entity) and
getCfg().isEffectiveSource(result)
}
/**
* Get an effective sink enclosed by the specified entity.
*
* N.B. This is _not_ an inverse of `EndpointToEntity::getAnEntityForEndpoint`: the effective
* sink may occur in a function defined within the specified entity.
*/
DataFlow::Node getAnEffectiveSink(DatabaseFeatures::Entity entity) {
result = getABaselineEndpointCandidate(entity) and
getCfg().isEffectiveSink(result)
Function getAFunctionForEndpoint(DataFlow::Node endpoint) {
  // `getEnclosingContainer*()` is a reflexive-transitive closure: it yields the endpoint's own
  // container as well as every container enclosing it. The `Function` result type then keeps
  // only those containers that are functions.
  endpoint.getContainer().getEnclosingContainer*() = result
}
}

View File

@@ -31,7 +31,9 @@ private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
(
// Features for endpoints that are contained within a function.
exists(DatabaseFeatures::Entity entity | entity = getRepresentativeEntityForEndpoint(endpoint) |
exists(DatabaseFeatures::Entity entity |
entity.getDefinedFunction() = getRepresentativeFunctionForEndpoint(endpoint)
|
// The name of the function that encloses the endpoint.
featureName = "enclosingFunctionName" and result = entity.getName()
or
@@ -147,24 +149,27 @@ module FunctionBodies {
result = node.(TemplateElement).getRawValue()
}
/**
 * Gets an AST node that should be featurized as part of the function `f`.
 *
 * A node qualifies when all of the following hold:
 * - `f` is among the node's reflexive-transitive parents (so `f` itself may qualify),
 * - the node is not the identifier of `f`, and
 * - `getTokenizedAstNode` yields a value for the node.
 */
pragma[inline]
ASTNode getAnASTNodeToFeaturize(Function f) {
  exists(getTokenizedAstNode(result)) and
  f = result.getParent*() and
  // Written as a negated equality (rather than `!=`) so that the condition also holds when `f`
  // has no identifier at all.
  not f.getIdentifier() = result
}
/** Holds if `location` is the location of an AST node within the entity `entity` and `token` is a node attribute associated with that AST node. */
private predicate bodyTokens(DatabaseFeatures::Entity entity, Location location, string token) {
// Performance optimization: Restrict the set of entities to those containing an endpoint to featurize.
entity =
getRepresentativeEntityForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and
entity.getDefinedFunction() =
getRepresentativeFunctionForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and
// Performance optimization: If a function has more than 256 body subtokens, then featurize it as absent. This
// approximates the behavior of the classifier on non-generic body features where large body
// features are replaced by the absent token.
//
// We count nodes instead of tokens because tokens are often not unique.
strictcount(ASTNode node |
node.getParent*() = entity.getDefinedFunction() and
not node = entity.getDefinedFunction().getIdentifier() and
exists(getTokenizedAstNode(node))
) <= 256 and
strictcount(getAnASTNodeToFeaturize(entity.getDefinedFunction())) <= 256 and
exists(ASTNode node |
node.getParent*() = entity.getDefinedFunction() and
not node = entity.getDefinedFunction().getIdentifier() and
node = getAnASTNodeToFeaturize(entity.getDefinedFunction()) and
token = getTokenizedAstNode(node) and
location = node.getLocation()
)

View File

@@ -15,8 +15,8 @@ private string getACompatibleModelChecksum() {
}
/**
* The maximum number of AST nodes an entity containing an endpoint should have before we should
* choose a smaller entity to represent the endpoint.
* The maximum number of AST nodes a function containing an endpoint should have before we should
* choose a smaller function to represent the endpoint.
*
* This is intended to represent a balance in terms of the amount of context we provide to the
* model: we don't want the function to be too small, because then it doesn't contain very much
@@ -26,54 +26,56 @@ private string getACompatibleModelChecksum() {
private int getMaxNumAstNodes() { result = 1024 }
/**
* Returns the number of AST nodes contained within the specified entity.
* Returns the number of AST nodes within the specified function that we would featurize, i.e. the
* nodes returned by `getAnASTNodeToFeaturize`.
*/
private int getNumAstNodesInEntity(DatabaseFeatures::Entity entity) {
// Restrict the values `entity` can take on
entity = EndpointToEntity::getAnEntityForEndpoint(_) and
result =
count(DatabaseFeatures::AstNode astNode | DatabaseFeatures::astNodes(entity, _, _, astNode, _))
private int getNumAstNodesInFunction(Function function) {
  // Limit `function` to functions that enclose at least one data flow node (the `_` stands for
  // any candidate endpoint), so the count is not computed for unrelated functions.
  EndpointToFunction::getAFunctionForEndpoint(_) = function and
  // `count` (unlike `strictcount`) yields 0 when the function has no nodes to featurize, so
  // every function admitted above gets a result.
  count(EndpointFeatures::FunctionBodies::getAnASTNodeToFeaturize(function)) = result
}
/**
* Get a single entity to use as the representative entity for the endpoint.
* Get the enclosing function for an endpoint.
*
* This is used to compute the `enclosingFunctionBody` and `enclosingFunctionName` features.
*
* We try to use the largest entity containing the endpoint that's below the AST node limit defined
* in `getMaxNumAstNodes`. In the event of a tie, we use the entity that appears first within the
* source archive.
* We try to use the largest function containing the endpoint that's below the AST node limit
* defined in `getMaxNumAstNodes`. In the event of a tie, we use the function that appears first
* within the source code.
*
* If no entities are smaller than the AST node limit, then we use the smallest entity containing
* If no functions are smaller than the AST node limit, then we use the smallest function containing
* the endpoint.
*/
DatabaseFeatures::Entity getRepresentativeEntityForEndpoint(DataFlow::Node endpoint) {
// Check whether there's an entity containing the endpoint that's smaller than the AST node limit.
Function getRepresentativeFunctionForEndpoint(DataFlow::Node endpoint) {
// Check whether there's a function containing the endpoint that's smaller than the AST node
// limit.
if
getNumAstNodesInEntity(EndpointToEntity::getAnEntityForEndpoint(endpoint)) <=
getNumAstNodesInFunction(EndpointToFunction::getAFunctionForEndpoint(endpoint)) <=
getMaxNumAstNodes()
then
// Use the largest entity smaller than the AST node limit, resolving ties using the entity that
// appears first in the source archive.
// Use the largest function smaller than the AST node limit, resolving ties using the function
// that appears first in the source code.
result =
min(DatabaseFeatures::Entity entity, int numAstNodes, Location l |
entity = EndpointToEntity::getAnEntityForEndpoint(endpoint) and
numAstNodes = getNumAstNodesInEntity(entity) and
min(Function function, int numAstNodes, Location l |
function = EndpointToFunction::getAFunctionForEndpoint(endpoint) and
numAstNodes = getNumAstNodesInFunction(function) and
numAstNodes <= getMaxNumAstNodes() and
l = entity.getLocation()
l = function.getLocation()
|
entity
function
order by
numAstNodes desc, l.getStartLine(), l.getStartColumn(), l.getEndLine(), l.getEndColumn()
)
else
// Use the smallest entity, resolving ties using the entity that
// appears first in the source archive.
// Use the smallest function, resolving ties using the function that appears first in the source
// code.
result =
min(DatabaseFeatures::Entity entity, int numAstNodes, Location l |
entity = EndpointToEntity::getAnEntityForEndpoint(endpoint) and
numAstNodes = getNumAstNodesInEntity(entity) and
l = entity.getLocation()
min(Function function, int numAstNodes, Location l |
function = EndpointToFunction::getAFunctionForEndpoint(endpoint) and
numAstNodes = getNumAstNodesInFunction(function) and
l = function.getLocation()
|
entity
function
order by
numAstNodes, l.getStartLine(), l.getStartColumn(), l.getEndLine(), l.getEndColumn()
)