Extract body tokens from the JS AST, not the CodeToFeatures AST

This commit is contained in:
Henry Mercer
2022-01-11 17:34:41 +00:00
parent 9065a7f320
commit 6900323ced

View File

@@ -128,14 +128,44 @@ private string getACallBasedTokenFeatureComponent(
/** This module provides functionality for getting the function body feature associated with a particular entity. */
module FunctionBodies {
/**
 * Gets the token that represents the AST node `node` in a function body feature.
 *
 * A token is produced for identifiers, index expressions with a known property name,
 * literals, and template elements. There is no result for any other node.
 */
string getTokenizedAstNode(ASTNode node) {
  // Operator expressions (unary e.g. -a, binary e.g. a + b) and compound assignments
  // e.g. a += b get no token here: they can be identified by the expression type.
  exists(Identifier ident | ident = node | result = ident.getName())
  or
  // Computed property accesses whose property name can be predetermined.
  // Caveat: the token may alias with an operator, e.g. when '+' is used as a property name.
  exists(IndexExpr access | access = node | result = access.getPropertyName())
  or
  if node instanceof NumberLiteral
  then
    // The raw value keeps `0xa`, `0xA`, and `10` as distinct representations.
    result = node.(NumberLiteral).getRawValue()
  else
    // The value (rather than the raw value) gives `"a"` and `'a'` the same representation.
    result = node.(Literal).getValue()
  or
  exists(TemplateElement element | element = node | result = element.getRawValue())
}
/**
 * Holds if `location` is the location of an AST node within the entity `entity` and `token` is a
 * node attribute associated with that AST node.
 *
 * Only entities that represent an endpoint to featurize are considered, and only when the
 * entity's defined function contains at most 256 tokenizable AST nodes.
 */
private predicate bodyTokens(DatabaseFeatures::Entity entity, Location location, string token) {
// Performance optimization: Restrict the set of entities to those containing an endpoint to featurize.
entity =
getRepresentativeEntityForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and
// NOTE(review): the `exists(DatabaseFeatures::AstNode node |` opened on the next line has no
// matching close within this predicate as shown, and `token` is constrained both here and in the
// `ASTNode`-based `exists` further down. Presumably the next three lines are the pre-change
// lookup that the `ASTNode`-based lookup replaces — confirm against the actual file.
exists(DatabaseFeatures::AstNode node |
DatabaseFeatures::astNodes(entity, _, _, node, _) and
token = unique(string t | DatabaseFeatures::nodeAttributes(node, t)) and
// Performance optimization: If a function has more than 256 body subtokens, then featurize it as absent. This
// approximates the behavior of the classifier on non-generic body features where large body
// features are replaced by the absent token.
//
// We count nodes instead of tokens because tokens are often not unique.
// Counted nodes are those at or below the entity's defined function that have a token,
// excluding the function's own identifier.
strictcount(ASTNode node |
node.getParent*() = entity.getDefinedFunction() and
not node = entity.getDefinedFunction().getIdentifier() and
exists(getTokenizedAstNode(node))
) <= 256 and
// Yield each (location, token) pair, using the same node selection as the count above.
exists(ASTNode node |
node.getParent*() = entity.getDefinedFunction() and
not node = entity.getDefinedFunction().getIdentifier() and
token = getTokenizedAstNode(node) and
location = node.getLocation()
)
}
@@ -146,12 +176,6 @@ module FunctionBodies {
* This is a string containing natural language tokens in the order that they appear in the source code for the entity.
*/
string getBodyTokenFeatureForEntity(DatabaseFeatures::Entity entity) {
// If a function has more than 256 body subtokens, then featurize it as absent. This
// approximates the behavior of the classifier on non-generic body features where large body
// features are replaced by the absent token.
//
// We count locations instead of tokens because tokens are often not unique.
strictcount(Location l | bodyTokens(entity, l, _)) <= 256 and
result =
strictconcat(string token, Location l |
bodyTokens(entity, l, token)