Merge pull request #7357 from github/henrymercer/js-atm-only-featurize-with-flow

JS: Only featurize endpoints that are part of a flow path
This commit is contained in:
Henry Mercer
2021-12-17 18:03:40 +00:00
committed by GitHub
2 changed files with 88 additions and 56 deletions

View File

@@ -6,7 +6,20 @@
import javascript
import CodeToFeatures
import EndpointScoring
private import EndpointScoring
/**
* A configuration that defines which endpoints should be featurized.
*
* This is used as a performance optimization to ensure that we only featurize the endpoints we need
* to featurize.
*
* The feature-extraction predicates in this file restrict their `endpoint` to
* `any(FeaturizationConfig cfg).getAnEndpointToFeaturize()`, so the set of endpoints that gets
* featurized is the union of the endpoints selected by every configuration that exists.
*/
abstract class FeaturizationConfig extends string {
// The characteristic predicate puts no constraint on the string value, so it cannot bind `this`
// by itself; `bindingset[this]` declares that `this` is bound by the context instead. Concrete
// subclasses are expected to pin `this` to a specific string in their own characteristic
// predicate.
bindingset[this]
FeaturizationConfig() { any() }
/** Gets an endpoint that this configuration asks to have featurized. */
abstract DataFlow::Node getAnEndpointToFeaturize();
}
/**
* Gets the value of the token-based feature named `featureName` for the endpoint `endpoint`.
@@ -14,51 +27,55 @@ import EndpointScoring
* This is a single string containing a space-separated list of tokens.
*/
private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
// Features for endpoints that are contained within a function.
exists(DatabaseFeatures::Entity entity | entity = getRepresentativeEntityForEndpoint(endpoint) |
// The name of the function that encloses the endpoint.
featureName = "enclosingFunctionName" and result = entity.getName()
or
// A feature containing natural language tokens from the function that encloses the endpoint in
// the order that they appear in the source code.
featureName = "enclosingFunctionBody" and
result = unique(string x | x = FunctionBodies::getBodyTokenFeatureForEntity(entity))
)
or
result =
strictconcat(DataFlow::CallNode call, string component |
component = getACallBasedTokenFeatureComponent(endpoint, call, featureName)
|
component, " "
// Performance optimization: Restrict feature extraction to endpoints we've explicitly asked to featurize.
endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
(
// Features for endpoints that are contained within a function.
exists(DatabaseFeatures::Entity entity | entity = getRepresentativeEntityForEndpoint(endpoint) |
// The name of the function that encloses the endpoint.
featureName = "enclosingFunctionName" and result = entity.getName()
or
// A feature containing natural language tokens from the function that encloses the endpoint in
// the order that they appear in the source code.
featureName = "enclosingFunctionBody" and
result = unique(string x | x = FunctionBodies::getBodyTokenFeatureForEntity(entity))
)
or
// The access path of the function being called, both with and without structural info, if the
// function being called originates from an external API. For example, the endpoint here:
//
// ```js
// const mongoose = require('mongoose'),
// User = mongoose.model('User', null);
// User.findOne(ENDPOINT);
// ```
//
// would have a callee access path with structural info of
// `mongoose member model instanceorreturn member findOne instanceorreturn`, and a callee access
// path without structural info of `mongoose model findOne`.
//
// These features indicate that the callee comes from (reading the access path backwards) an
// instance of the `findOne` member of an instance of the `model` member of the `mongoose`
// external library.
exists(AccessPaths::Boolean includeStructuralInfo |
featureName =
"calleeAccessPath" +
any(string x | if includeStructuralInfo = true then x = "WithStructuralInfo" else x = "") and
or
result =
concat(API::Node node, string accessPath |
node.getInducingNode().(DataFlow::CallNode).getAnArgument() = endpoint and
AccessPaths::accessPaths(node, includeStructuralInfo, accessPath, _)
strictconcat(DataFlow::CallNode call, string component |
component = getACallBasedTokenFeatureComponent(endpoint, call, featureName)
|
accessPath, " "
component, " "
)
or
// The access path of the function being called, both with and without structural info, if the
// function being called originates from an external API. For example, the endpoint here:
//
// ```js
// const mongoose = require('mongoose'),
// User = mongoose.model('User', null);
// User.findOne(ENDPOINT);
// ```
//
// would have a callee access path with structural info of
// `mongoose member model instanceorreturn member findOne instanceorreturn`, and a callee access
// path without structural info of `mongoose model findOne`.
//
// These features indicate that the callee comes from (reading the access path backwards) an
// instance of the `findOne` member of an instance of the `model` member of the `mongoose`
// external library.
exists(AccessPaths::Boolean includeStructuralInfo |
featureName =
"calleeAccessPath" +
any(string x | if includeStructuralInfo = true then x = "WithStructuralInfo" else x = "") and
result =
concat(API::Node node, string accessPath |
node.getInducingNode().(DataFlow::CallNode).getAnArgument() = endpoint and
AccessPaths::accessPaths(node, includeStructuralInfo, accessPath, _)
|
accessPath, " "
)
)
)
}
@@ -77,6 +94,8 @@ private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
private string getACallBasedTokenFeatureComponent(
DataFlow::Node endpoint, DataFlow::CallNode call, string featureName
) {
// Performance optimization: Restrict feature extraction to endpoints we've explicitly asked to featurize.
endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
// Features for endpoints that are an argument to a function call.
endpoint = call.getAnArgument() and
(
@@ -111,6 +130,9 @@ private string getACallBasedTokenFeatureComponent(
module FunctionBodies {
/** Holds if `location` is the location of an AST node within the entity `entity` and `token` is a node attribute associated with that AST node. */
private predicate bodyTokens(DatabaseFeatures::Entity entity, Location location, string token) {
// Performance optimization: Restrict the set of entities to those containing an endpoint to featurize.
entity =
getRepresentativeEntityForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and
exists(DatabaseFeatures::AstNode node |
DatabaseFeatures::astNodes(entity, _, _, node, _) and
token = unique(string t | DatabaseFeatures::nodeAttributes(node, t)) and
@@ -276,7 +298,8 @@ private string getASupportedFeatureName() {
* `featureValue` for the endpoint `endpoint`.
*/
predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string featureValue) {
ModelScoring::endpoints(endpoint) and
// Performance optimization: Restrict feature extraction to endpoints we've explicitly asked to featurize.
endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
(
if strictcount(getTokenFeature(endpoint, featureName)) = 1
then featureValue = getTokenFeature(endpoint, featureName)

View File

@@ -80,23 +80,30 @@ DatabaseFeatures::Entity getRepresentativeEntityForEndpoint(DataFlow::Node endpo
}
module ModelScoring {
predicate endpoints(DataFlow::Node endpoint) {
getCfg().isEffectiveSource(endpoint) or
getCfg().isEffectiveSink(endpoint)
/**
* A featurization config that only featurizes new candidate endpoints that are part of a flow
* path.
*
* An endpoint is selected when it is either an effective source (according to `getCfg()`) that
* flows to some node, or an effective sink (according to `getCfg()`) that some node flows to.
*/
class RelevantFeaturizationConfig extends EndpointFeatures::FeaturizationConfig {
// Pins this configuration to a single string value.
RelevantFeaturizationConfig() { this = "RelevantFeaturization" }
/** Gets an effective source with outgoing flow, or an effective sink with incoming flow. */
override DataFlow::Node getAnEndpointToFeaturize() {
// NOTE(review): the `hasFlow` checks range over any `DataFlow::Configuration`, not only the one
// related to `getCfg()` -- presumably intentional, but `getCfg()` is not visible here; confirm.
// Effective sources that are the start of some flow path.
getCfg().isEffectiveSource(result) and any(DataFlow::Configuration cfg).hasFlow(result, _)
or
// Effective sinks that are the end of some flow path.
getCfg().isEffectiveSink(result) and any(DataFlow::Configuration cfg).hasFlow(_, result)
}
}
private int requestedEndpointTypes() { result = any(EndpointType type).getEncoding() }
private predicate relevantTokenFeatures(
DataFlow::Node endpoint, string featureName, string featureValue
) {
endpoints(endpoint) and
EndpointFeatures::tokenFeatures(endpoint, featureName, featureValue)
DataFlow::Node getARequestedEndpoint() {
result = any(EndpointFeatures::FeaturizationConfig cfg).getAnEndpointToFeaturize()
}
private int getARequestedEndpointType() { result = any(EndpointType type).getEncoding() }
predicate endpointScores(DataFlow::Node endpoint, int encodedEndpointType, float score) =
scoreEndpoints(endpoints/1, requestedEndpointTypes/0, relevantTokenFeatures/3,
getACompatibleModelChecksum/0)(endpoint, encodedEndpointType, score)
scoreEndpoints(getARequestedEndpoint/0, getARequestedEndpointType/0,
EndpointFeatures::tokenFeatures/3, getACompatibleModelChecksum/0)(endpoint,
encodedEndpointType, score)
}
/**
@@ -212,7 +219,9 @@ class EndpointScoringResults extends ScoringResults {
}
module Debugging {
query predicate hopInputEndpoints = ModelScoring::endpoints/1;
query predicate hopInputEndpoints(DataFlow::Node endpoint) {
endpoint = ModelScoring::getARequestedEndpoint()
}
query predicate endpointScores = ModelScoring::endpointScores/3;