mirror of
https://github.com/github/codeql.git
synced 2026-04-30 19:26:02 +02:00
Merge pull request #7357 from github/henrymercer/js-atm-only-featurize-with-flow
JS: Only featurize endpoints that are part of a flow path
This commit is contained in:
@@ -6,7 +6,20 @@
|
||||
|
||||
import javascript
|
||||
import CodeToFeatures
|
||||
import EndpointScoring
|
||||
private import EndpointScoring
|
||||
|
||||
/**
|
||||
* A configuration that defines which endpoints should be featurized.
|
||||
*
|
||||
* This is used as a performance optimization to ensure that we only featurize the endpoints we need
|
||||
* to featurize.
|
||||
*/
|
||||
abstract class FeaturizationConfig extends string {
|
||||
bindingset[this]
|
||||
FeaturizationConfig() { any() }
|
||||
|
||||
abstract DataFlow::Node getAnEndpointToFeaturize();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the value of the token-based feature named `featureName` for the endpoint `endpoint`.
|
||||
@@ -14,51 +27,55 @@ import EndpointScoring
|
||||
* This is a single string containing a space-separated list of tokens.
|
||||
*/
|
||||
private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
|
||||
// Features for endpoints that are contained within a function.
|
||||
exists(DatabaseFeatures::Entity entity | entity = getRepresentativeEntityForEndpoint(endpoint) |
|
||||
// The name of the function that encloses the endpoint.
|
||||
featureName = "enclosingFunctionName" and result = entity.getName()
|
||||
or
|
||||
// A feature containing natural language tokens from the function that encloses the endpoint in
|
||||
// the order that they appear in the source code.
|
||||
featureName = "enclosingFunctionBody" and
|
||||
result = unique(string x | x = FunctionBodies::getBodyTokenFeatureForEntity(entity))
|
||||
)
|
||||
or
|
||||
result =
|
||||
strictconcat(DataFlow::CallNode call, string component |
|
||||
component = getACallBasedTokenFeatureComponent(endpoint, call, featureName)
|
||||
|
|
||||
component, " "
|
||||
// Performance optimization: Restrict feature extraction to endpoints we've explicitly asked to featurize.
|
||||
endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
|
||||
(
|
||||
// Features for endpoints that are contained within a function.
|
||||
exists(DatabaseFeatures::Entity entity | entity = getRepresentativeEntityForEndpoint(endpoint) |
|
||||
// The name of the function that encloses the endpoint.
|
||||
featureName = "enclosingFunctionName" and result = entity.getName()
|
||||
or
|
||||
// A feature containing natural language tokens from the function that encloses the endpoint in
|
||||
// the order that they appear in the source code.
|
||||
featureName = "enclosingFunctionBody" and
|
||||
result = unique(string x | x = FunctionBodies::getBodyTokenFeatureForEntity(entity))
|
||||
)
|
||||
or
|
||||
// The access path of the function being called, both with and without structural info, if the
|
||||
// function being called originates from an external API. For example, the endpoint here:
|
||||
//
|
||||
// ```js
|
||||
// const mongoose = require('mongoose'),
|
||||
// User = mongoose.model('User', null);
|
||||
// User.findOne(ENDPOINT);
|
||||
// ```
|
||||
//
|
||||
// would have a callee access path with structural info of
|
||||
// `mongoose member model instanceorreturn member findOne instanceorreturn`, and a callee access
|
||||
// path without structural info of `mongoose model findOne`.
|
||||
//
|
||||
// These features indicate that the callee comes from (reading the access path backwards) an
|
||||
// instance of the `findOne` member of an instance of the `model` member of the `mongoose`
|
||||
// external library.
|
||||
exists(AccessPaths::Boolean includeStructuralInfo |
|
||||
featureName =
|
||||
"calleeAccessPath" +
|
||||
any(string x | if includeStructuralInfo = true then x = "WithStructuralInfo" else x = "") and
|
||||
or
|
||||
result =
|
||||
concat(API::Node node, string accessPath |
|
||||
node.getInducingNode().(DataFlow::CallNode).getAnArgument() = endpoint and
|
||||
AccessPaths::accessPaths(node, includeStructuralInfo, accessPath, _)
|
||||
strictconcat(DataFlow::CallNode call, string component |
|
||||
component = getACallBasedTokenFeatureComponent(endpoint, call, featureName)
|
||||
|
|
||||
accessPath, " "
|
||||
component, " "
|
||||
)
|
||||
or
|
||||
// The access path of the function being called, both with and without structural info, if the
|
||||
// function being called originates from an external API. For example, the endpoint here:
|
||||
//
|
||||
// ```js
|
||||
// const mongoose = require('mongoose'),
|
||||
// User = mongoose.model('User', null);
|
||||
// User.findOne(ENDPOINT);
|
||||
// ```
|
||||
//
|
||||
// would have a callee access path with structural info of
|
||||
// `mongoose member model instanceorreturn member findOne instanceorreturn`, and a callee access
|
||||
// path without structural info of `mongoose model findOne`.
|
||||
//
|
||||
// These features indicate that the callee comes from (reading the access path backwards) an
|
||||
// instance of the `findOne` member of an instance of the `model` member of the `mongoose`
|
||||
// external library.
|
||||
exists(AccessPaths::Boolean includeStructuralInfo |
|
||||
featureName =
|
||||
"calleeAccessPath" +
|
||||
any(string x | if includeStructuralInfo = true then x = "WithStructuralInfo" else x = "") and
|
||||
result =
|
||||
concat(API::Node node, string accessPath |
|
||||
node.getInducingNode().(DataFlow::CallNode).getAnArgument() = endpoint and
|
||||
AccessPaths::accessPaths(node, includeStructuralInfo, accessPath, _)
|
||||
|
|
||||
accessPath, " "
|
||||
)
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
@@ -77,6 +94,8 @@ private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
|
||||
private string getACallBasedTokenFeatureComponent(
|
||||
DataFlow::Node endpoint, DataFlow::CallNode call, string featureName
|
||||
) {
|
||||
// Performance optimization: Restrict feature extraction to endpoints we've explicitly asked to featurize.
|
||||
endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
|
||||
// Features for endpoints that are an argument to a function call.
|
||||
endpoint = call.getAnArgument() and
|
||||
(
|
||||
@@ -111,6 +130,9 @@ private string getACallBasedTokenFeatureComponent(
|
||||
module FunctionBodies {
|
||||
/** Holds if `location` is the location of an AST node within the entity `entity` and `token` is a node attribute associated with that AST node. */
|
||||
private predicate bodyTokens(DatabaseFeatures::Entity entity, Location location, string token) {
|
||||
// Performance optimization: Restrict the set of entities to those containing an endpoint to featurize.
|
||||
entity =
|
||||
getRepresentativeEntityForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and
|
||||
exists(DatabaseFeatures::AstNode node |
|
||||
DatabaseFeatures::astNodes(entity, _, _, node, _) and
|
||||
token = unique(string t | DatabaseFeatures::nodeAttributes(node, t)) and
|
||||
@@ -276,7 +298,8 @@ private string getASupportedFeatureName() {
|
||||
* `featureValue` for the endpoint `endpoint`.
|
||||
*/
|
||||
predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string featureValue) {
|
||||
ModelScoring::endpoints(endpoint) and
|
||||
// Performance optimization: Restrict feature extraction to endpoints we've explicitly asked to featurize.
|
||||
endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
|
||||
(
|
||||
if strictcount(getTokenFeature(endpoint, featureName)) = 1
|
||||
then featureValue = getTokenFeature(endpoint, featureName)
|
||||
|
||||
@@ -80,23 +80,30 @@ DatabaseFeatures::Entity getRepresentativeEntityForEndpoint(DataFlow::Node endpo
|
||||
}
|
||||
|
||||
module ModelScoring {
|
||||
predicate endpoints(DataFlow::Node endpoint) {
|
||||
getCfg().isEffectiveSource(endpoint) or
|
||||
getCfg().isEffectiveSink(endpoint)
|
||||
/**
|
||||
* A featurization config that only featurizes new candidate endpoints that are part of a flow
|
||||
* path.
|
||||
*/
|
||||
class RelevantFeaturizationConfig extends EndpointFeatures::FeaturizationConfig {
|
||||
RelevantFeaturizationConfig() { this = "RelevantFeaturization" }
|
||||
|
||||
override DataFlow::Node getAnEndpointToFeaturize() {
|
||||
getCfg().isEffectiveSource(result) and any(DataFlow::Configuration cfg).hasFlow(result, _)
|
||||
or
|
||||
getCfg().isEffectiveSink(result) and any(DataFlow::Configuration cfg).hasFlow(_, result)
|
||||
}
|
||||
}
|
||||
|
||||
private int requestedEndpointTypes() { result = any(EndpointType type).getEncoding() }
|
||||
|
||||
private predicate relevantTokenFeatures(
|
||||
DataFlow::Node endpoint, string featureName, string featureValue
|
||||
) {
|
||||
endpoints(endpoint) and
|
||||
EndpointFeatures::tokenFeatures(endpoint, featureName, featureValue)
|
||||
DataFlow::Node getARequestedEndpoint() {
|
||||
result = any(EndpointFeatures::FeaturizationConfig cfg).getAnEndpointToFeaturize()
|
||||
}
|
||||
|
||||
private int getARequestedEndpointType() { result = any(EndpointType type).getEncoding() }
|
||||
|
||||
predicate endpointScores(DataFlow::Node endpoint, int encodedEndpointType, float score) =
|
||||
scoreEndpoints(endpoints/1, requestedEndpointTypes/0, relevantTokenFeatures/3,
|
||||
getACompatibleModelChecksum/0)(endpoint, encodedEndpointType, score)
|
||||
scoreEndpoints(getARequestedEndpoint/0, getARequestedEndpointType/0,
|
||||
EndpointFeatures::tokenFeatures/3, getACompatibleModelChecksum/0)(endpoint,
|
||||
encodedEndpointType, score)
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -212,7 +219,9 @@ class EndpointScoringResults extends ScoringResults {
|
||||
}
|
||||
|
||||
module Debugging {
|
||||
query predicate hopInputEndpoints = ModelScoring::endpoints/1;
|
||||
query predicate hopInputEndpoints(DataFlow::Node endpoint) {
|
||||
endpoint = ModelScoring::getARequestedEndpoint()
|
||||
}
|
||||
|
||||
query predicate endpointScores = ModelScoring::endpointScores/3;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user