mirror of
https://github.com/github/codeql.git
synced 2026-05-20 22:27:18 +02:00
Migrate representative entity -> representative function
This commit is contained in:
@@ -15,69 +15,18 @@ external predicate availableMlModels(
|
||||
ATMConfig getCfg() { any() }
|
||||
|
||||
/**
|
||||
* This module provides functionality that takes an endpoint and provides an entity that encloses that
|
||||
* endpoint and is suitable for similarity analysis.
|
||||
* This module provides functionality that takes an endpoint and provides a function that encloses
|
||||
* that endpoint.
|
||||
*/
|
||||
module EndpointToEntity {
|
||||
module EndpointToFunction {
|
||||
private import CodeToFeatures
|
||||
|
||||
/**
|
||||
* Get an entity enclosing the endpoint that is suitable for similarity analysis. In general,
|
||||
* this may associate multiple entities to a single endpoint.
|
||||
* Get a function containing the endpoint that is suitable for featurization. In general,
|
||||
* this associates an endpoint to multiple functions, since there may be more than one function enclosing a single endpoint.
|
||||
*/
|
||||
DatabaseFeatures::Entity getAnEntityForEndpoint(DataFlow::Node endpoint) {
|
||||
DatabaseFeatures::entities(result, _, _, _, _, _, _, _, _) and
|
||||
result.getDefinedFunction() = endpoint.getContainer().getEnclosingContainer*()
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This module provides functionality that takes an entity and provides effective endpoints within
|
||||
* that entity.
|
||||
*
|
||||
* We use the following terminology to describe endpoints:
|
||||
*
|
||||
* - The *candidate* endpoints are the set of data flow nodes that should be passed to the
|
||||
* appropriate endpoint filter to produce the set of effective endpoints.
|
||||
* When we have a model that beats the performance of the baseline, we will likely define the
|
||||
* candidate endpoints based on the most confident predictions of the model.
|
||||
* - An *effective* endpoint is a candidate endpoint which passes through the endpoint filter.
|
||||
* In other words, it is a candidate endpoint for which the `isEffectiveSink` (or
|
||||
* `isEffectiveSource`) predicate defined in the `ATMConfig` instance in scope holds.
|
||||
*/
|
||||
module EntityToEffectiveEndpoint {
|
||||
private import CodeToFeatures
|
||||
|
||||
/**
|
||||
* Returns endpoint candidates within the specified entities.
|
||||
*
|
||||
* The baseline implementation of this is that a candidate endpoint is any data flow node that is
|
||||
* enclosed within the specified entity.
|
||||
*/
|
||||
private DataFlow::Node getABaselineEndpointCandidate(DatabaseFeatures::Entity entity) {
|
||||
result.getContainer().getEnclosingContainer*() = entity.getDefinedFunction()
|
||||
}
|
||||
|
||||
/**
|
||||
* Get an effective source enclosed by the specified entity.
|
||||
*
|
||||
* N.B. This is _not_ an inverse of `EndpointToEntity::getAnEntityForEndpoint`: the effective
|
||||
* source may occur in a function defined within the specified entity.
|
||||
*/
|
||||
DataFlow::Node getAnEffectiveSource(DatabaseFeatures::Entity entity) {
|
||||
result = getABaselineEndpointCandidate(entity) and
|
||||
getCfg().isEffectiveSource(result)
|
||||
}
|
||||
|
||||
/**
|
||||
* Get an effective sink enclosed by the specified entity.
|
||||
*
|
||||
* N.B. This is _not_ an inverse of `EndpointToEntity::getAnEntityForEndpoint`: the effective
|
||||
* sink may occur in a function defined within the specified entity.
|
||||
*/
|
||||
DataFlow::Node getAnEffectiveSink(DatabaseFeatures::Entity entity) {
|
||||
result = getABaselineEndpointCandidate(entity) and
|
||||
getCfg().isEffectiveSink(result)
|
||||
Function getAFunctionForEndpoint(DataFlow::Node endpoint) {
|
||||
result = endpoint.getContainer().getEnclosingContainer*()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -31,7 +31,9 @@ private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
|
||||
endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
|
||||
(
|
||||
// Features for endpoints that are contained within a function.
|
||||
exists(DatabaseFeatures::Entity entity | entity = getRepresentativeEntityForEndpoint(endpoint) |
|
||||
exists(DatabaseFeatures::Entity entity |
|
||||
entity.getDefinedFunction() = getRepresentativeFunctionForEndpoint(endpoint)
|
||||
|
|
||||
// The name of the function that encloses the endpoint.
|
||||
featureName = "enclosingFunctionName" and result = entity.getName()
|
||||
or
|
||||
@@ -147,24 +149,27 @@ module FunctionBodies {
|
||||
result = node.(TemplateElement).getRawValue()
|
||||
}
|
||||
|
||||
/** Returns an AST node within the function `f` that we should featurize. */
|
||||
pragma[inline]
|
||||
ASTNode getAnASTNodeToFeaturize(Function f) {
|
||||
result.getParent*() = f and
|
||||
not result = f.getIdentifier() and
|
||||
exists(getTokenizedAstNode(result))
|
||||
}
|
||||
|
||||
/** Holds if `location` is the location of an AST node within the entity `entity` and `token` is a node attribute associated with that AST node. */
|
||||
private predicate bodyTokens(DatabaseFeatures::Entity entity, Location location, string token) {
|
||||
// Performance optimization: Restrict the set of entities to those containing an endpoint to featurize.
|
||||
entity =
|
||||
getRepresentativeEntityForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and
|
||||
entity.getDefinedFunction() =
|
||||
getRepresentativeFunctionForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and
|
||||
// Performance optimization: If a function has more than 256 body subtokens, then featurize it as absent. This
|
||||
// approximates the behavior of the classifier on non-generic body features where large body
|
||||
// features are replaced by the absent token.
|
||||
//
|
||||
// We count nodes instead of tokens because tokens are often not unique.
|
||||
strictcount(ASTNode node |
|
||||
node.getParent*() = entity.getDefinedFunction() and
|
||||
not node = entity.getDefinedFunction().getIdentifier() and
|
||||
exists(getTokenizedAstNode(node))
|
||||
) <= 256 and
|
||||
strictcount(getAnASTNodeToFeaturize(entity.getDefinedFunction())) <= 256 and
|
||||
exists(ASTNode node |
|
||||
node.getParent*() = entity.getDefinedFunction() and
|
||||
not node = entity.getDefinedFunction().getIdentifier() and
|
||||
node = getAnASTNodeToFeaturize(entity.getDefinedFunction()) and
|
||||
token = getTokenizedAstNode(node) and
|
||||
location = node.getLocation()
|
||||
)
|
||||
|
||||
@@ -15,8 +15,8 @@ private string getACompatibleModelChecksum() {
|
||||
}
|
||||
|
||||
/**
|
||||
* The maximum number of AST nodes an entity containing an endpoint should have before we should
|
||||
* choose a smaller entity to represent the endpoint.
|
||||
* The maximum number of AST nodes a function containing an endpoint should have before we should
|
||||
* choose a smaller function to represent the endpoint.
|
||||
*
|
||||
* This is intended to represent a balance in terms of the amount of context we provide to the
|
||||
* model: we don't want the function to be too small, because then it doesn't contain very much
|
||||
@@ -26,54 +26,56 @@ private string getACompatibleModelChecksum() {
|
||||
private int getMaxNumAstNodes() { result = 1024 }
|
||||
|
||||
/**
|
||||
* Returns the number of AST nodes contained within the specified entity.
|
||||
* Returns the number of AST nodes contained within the specified function.
|
||||
*/
|
||||
private int getNumAstNodesInEntity(DatabaseFeatures::Entity entity) {
|
||||
// Restrict the values `entity` can take on
|
||||
entity = EndpointToEntity::getAnEntityForEndpoint(_) and
|
||||
result =
|
||||
count(DatabaseFeatures::AstNode astNode | DatabaseFeatures::astNodes(entity, _, _, astNode, _))
|
||||
private int getNumAstNodesInFunction(Function function) {
|
||||
// Restrict the values `function` can take on
|
||||
function = EndpointToFunction::getAFunctionForEndpoint(_) and
|
||||
result = count(EndpointFeatures::FunctionBodies::getAnASTNodeToFeaturize(function))
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a single entity to use as the representative entity for the endpoint.
|
||||
* Get the enclosing function for an endpoint.
|
||||
*
|
||||
* This is used to compute the `enclosingFunctionBody` and `enclosingFunctionName` features.
|
||||
*
|
||||
* We try to use the largest entity containing the endpoint that's below the AST node limit defined
|
||||
* in `getMaxNumAstNodes`. In the event of a tie, we use the entity that appears first within the
|
||||
* source archive.
|
||||
* We try to use the largest function containing the endpoint that's below the AST node limit
|
||||
* defined in `getMaxNumAstNodes`. In the event of a tie, we use the function that appears first
|
||||
* within the source code.
|
||||
*
|
||||
* If no entities are smaller than the AST node limit, then we use the smallest entity containing
|
||||
* If no functions are smaller than the AST node limit, then we use the smallest function containing
|
||||
* the endpoint.
|
||||
*/
|
||||
DatabaseFeatures::Entity getRepresentativeEntityForEndpoint(DataFlow::Node endpoint) {
|
||||
// Check whether there's an entity containing the endpoint that's smaller than the AST node limit.
|
||||
Function getRepresentativeFunctionForEndpoint(DataFlow::Node endpoint) {
|
||||
// Check whether there's a function containing the endpoint that's smaller than the AST node
|
||||
// limit.
|
||||
if
|
||||
getNumAstNodesInEntity(EndpointToEntity::getAnEntityForEndpoint(endpoint)) <=
|
||||
getNumAstNodesInFunction(EndpointToFunction::getAFunctionForEndpoint(endpoint)) <=
|
||||
getMaxNumAstNodes()
|
||||
then
|
||||
// Use the largest entity smaller than the AST node limit, resolving ties using the entity that
|
||||
// appears first in the source archive.
|
||||
// Use the largest function smaller than the AST node limit, resolving ties using the function
|
||||
// that appears first in the source code.
|
||||
result =
|
||||
min(DatabaseFeatures::Entity entity, int numAstNodes, Location l |
|
||||
entity = EndpointToEntity::getAnEntityForEndpoint(endpoint) and
|
||||
numAstNodes = getNumAstNodesInEntity(entity) and
|
||||
min(Function function, int numAstNodes, Location l |
|
||||
function = EndpointToFunction::getAFunctionForEndpoint(endpoint) and
|
||||
numAstNodes = getNumAstNodesInFunction(function) and
|
||||
numAstNodes <= getMaxNumAstNodes() and
|
||||
l = entity.getLocation()
|
||||
l = function.getLocation()
|
||||
|
|
||||
entity
|
||||
function
|
||||
order by
|
||||
numAstNodes desc, l.getStartLine(), l.getStartColumn(), l.getEndLine(), l.getEndColumn()
|
||||
)
|
||||
else
|
||||
// Use the smallest entity, resolving ties using the entity that
|
||||
// appears first in the source archive.
|
||||
// Use the smallest function, resolving ties using the function that appears first in the source
|
||||
// code.
|
||||
result =
|
||||
min(DatabaseFeatures::Entity entity, int numAstNodes, Location l |
|
||||
entity = EndpointToEntity::getAnEntityForEndpoint(endpoint) and
|
||||
numAstNodes = getNumAstNodesInEntity(entity) and
|
||||
l = entity.getLocation()
|
||||
min(Function function, int numAstNodes, Location l |
|
||||
function = EndpointToFunction::getAFunctionForEndpoint(endpoint) and
|
||||
numAstNodes = getNumAstNodesInFunction(function) and
|
||||
l = function.getLocation()
|
||||
|
|
||||
entity
|
||||
function
|
||||
order by
|
||||
numAstNodes, l.getStartLine(), l.getStartColumn(), l.getEndLine(), l.getEndColumn()
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user