Compare commits

...

10 Commits

Author SHA1 Message Date
Ian Wright
a716d39370 repatch 2021-12-17 16:59:31 +00:00
Ian Wright
335b2466a9 patch again 2021-12-17 16:49:40 +00:00
Ian Wright
96ae9617ec post cherry-pick patch 2021-12-17 16:43:07 +00:00
Henry Mercer
82029663b2 JS: Push FeaturizationConfig context into more predicates 2021-12-17 16:03:15 +00:00
Henry Mercer
4cd15ba654 JS: Only featurize endpoints that are part of a flow path 2021-12-17 15:58:46 +00:00
Ian Wright
c17c10e450 Revert "JS: Push FeaturizationConfig context into more predicates"
This reverts commit a0f479d503.
2021-12-17 15:54:03 +00:00
Henry Mercer
a0f479d503 JS: Push FeaturizationConfig context into more predicates 2021-12-17 13:54:25 +00:00
Ian Wright
24a5e8a8e1 bump the release number 2021-12-17 13:12:05 +00:00
Henry Mercer
427cdf480a JS: Update featurization for absent features optimization
Absent features are now represented implicitly by the absence of a row
in the `tokenFeatures` relation, rather than explicitly by an empty
string. This leads to improved runtime performance. To enable this
implicit representation, we pass the set of supported token features to
the `scoreEndpoints` HOP. Requires CodeQL CLI v2.7.4.
2021-12-17 13:10:10 +00:00
Ian Wright
8e1f2645cb bump the release number 2021-12-17 13:10:10 +00:00
4 changed files with 87 additions and 66 deletions

View File

@@ -6,7 +6,20 @@
import javascript import javascript
import CodeToFeatures import CodeToFeatures
import EndpointScoring private import EndpointScoring
/**
* A configuration that defines which endpoints should be featurized.
*
* This is used as a performance optimization to ensure that we only featurize the endpoints we need
* to featurize.
*/
abstract class FeaturizationConfig extends string {
bindingset[this]
FeaturizationConfig() { any() }
abstract DataFlow::Node getAnEndpointToFeaturize();
}
/** /**
* Gets the value of the token-based feature named `featureName` for the endpoint `endpoint`. * Gets the value of the token-based feature named `featureName` for the endpoint `endpoint`.
@@ -14,51 +27,55 @@ import EndpointScoring
* This is a single string containing a space-separated list of tokens. * This is a single string containing a space-separated list of tokens.
*/ */
private string getTokenFeature(DataFlow::Node endpoint, string featureName) { private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
// Features for endpoints that are contained within a function. // Performance optimization: Restrict feature extraction to endpoints we've explicitly asked to featurize.
exists(DatabaseFeatures::Entity entity | entity = getRepresentativeEntityForEndpoint(endpoint) | endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
// The name of the function that encloses the endpoint. (
featureName = "enclosingFunctionName" and result = entity.getName() // Features for endpoints that are contained within a function.
or exists(DatabaseFeatures::Entity entity | entity = getRepresentativeEntityForEndpoint(endpoint) |
// A feature containing natural language tokens from the function that encloses the endpoint in // The name of the function that encloses the endpoint.
// the order that they appear in the source code. featureName = "enclosingFunctionName" and result = entity.getName()
featureName = "enclosingFunctionBody" and or
result = unique(string x | x = FunctionBodies::getBodyTokenFeatureForEntity(entity)) // A feature containing natural language tokens from the function that encloses the endpoint in
) // the order that they appear in the source code.
or featureName = "enclosingFunctionBody" and
result = result = unique(string x | x = FunctionBodies::getBodyTokenFeatureForEntity(entity))
strictconcat(DataFlow::CallNode call, string component |
component = getACallBasedTokenFeatureComponent(endpoint, call, featureName)
|
component, " "
) )
or or
// The access path of the function being called, both with and without structural info, if the
// function being called originates from an external API. For example, the endpoint here:
//
// ```js
// const mongoose = require('mongoose'),
// User = mongoose.model('User', null);
// User.findOne(ENDPOINT);
// ```
//
// would have a callee access path with structural info of
// `mongoose member model instanceorreturn member findOne instanceorreturn`, and a callee access
// path without structural info of `mongoose model findOne`.
//
// These features indicate that the callee comes from (reading the access path backwards) an
// instance of the `findOne` member of an instance of the `model` member of the `mongoose`
// external library.
exists(AccessPaths::Boolean includeStructuralInfo |
featureName =
"calleeAccessPath" +
any(string x | if includeStructuralInfo = true then x = "WithStructuralInfo" else x = "") and
result = result =
concat(API::Node node, string accessPath | strictconcat(DataFlow::CallNode call, string component |
node.getInducingNode().(DataFlow::CallNode).getAnArgument() = endpoint and component = getACallBasedTokenFeatureComponent(endpoint, call, featureName)
AccessPaths::accessPaths(node, includeStructuralInfo, accessPath, _)
| |
accessPath, " " component, " "
) )
or
// The access path of the function being called, both with and without structural info, if the
// function being called originates from an external API. For example, the endpoint here:
//
// ```js
// const mongoose = require('mongoose'),
// User = mongoose.model('User', null);
// User.findOne(ENDPOINT);
// ```
//
// would have a callee access path with structural info of
// `mongoose member model instanceorreturn member findOne instanceorreturn`, and a callee access
// path without structural info of `mongoose model findOne`.
//
// These features indicate that the callee comes from (reading the access path backwards) an
// instance of the `findOne` member of an instance of the `model` member of the `mongoose`
// external library.
exists(AccessPaths::Boolean includeStructuralInfo |
featureName =
"calleeAccessPath" +
any(string x | if includeStructuralInfo = true then x = "WithStructuralInfo" else x = "") and
result =
concat(API::Node node, string accessPath |
node.getInducingNode().(DataFlow::CallNode).getAnArgument() = endpoint and
AccessPaths::accessPaths(node, includeStructuralInfo, accessPath, _)
|
accessPath, " "
)
)
) )
} }
@@ -77,6 +94,8 @@ private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
private string getACallBasedTokenFeatureComponent( private string getACallBasedTokenFeatureComponent(
DataFlow::Node endpoint, DataFlow::CallNode call, string featureName DataFlow::Node endpoint, DataFlow::CallNode call, string featureName
) { ) {
// Performance optimization: Restrict feature extraction to endpoints we've explicitly asked to featurize.
endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
// Features for endpoints that are an argument to a function call. // Features for endpoints that are an argument to a function call.
endpoint = call.getAnArgument() and endpoint = call.getAnArgument() and
( (
@@ -111,6 +130,9 @@ private string getACallBasedTokenFeatureComponent(
module FunctionBodies { module FunctionBodies {
/** Holds if `location` is the location of an AST node within the entity `entity` and `token` is a node attribute associated with that AST node. */ /** Holds if `location` is the location of an AST node within the entity `entity` and `token` is a node attribute associated with that AST node. */
private predicate bodyTokens(DatabaseFeatures::Entity entity, Location location, string token) { private predicate bodyTokens(DatabaseFeatures::Entity entity, Location location, string token) {
// Performance optimization: Restrict the set of entities to those containing an endpoint to featurize.
entity =
getRepresentativeEntityForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and
exists(DatabaseFeatures::AstNode node | exists(DatabaseFeatures::AstNode node |
DatabaseFeatures::astNodes(entity, _, _, node, _) and DatabaseFeatures::astNodes(entity, _, _, node, _) and
token = unique(string t | DatabaseFeatures::nodeAttributes(node, t)) and token = unique(string t | DatabaseFeatures::nodeAttributes(node, t)) and
@@ -261,7 +283,7 @@ private module AccessPaths {
} }
/** Get a name of a supported generic token-based feature. */ /** Get a name of a supported generic token-based feature. */
private string getASupportedFeatureName() { string getASupportedFeatureName() {
result = result =
[ [
"enclosingFunctionName", "calleeName", "receiverName", "argumentIndex", "calleeApiName", "enclosingFunctionName", "calleeName", "receiverName", "argumentIndex", "calleeApiName",
@@ -276,13 +298,7 @@ private string getASupportedFeatureName() {
* `featureValue` for the endpoint `endpoint`. * `featureValue` for the endpoint `endpoint`.
*/ */
predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string featureValue) { predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string featureValue) {
ModelScoring::endpoints(endpoint) and // Performance optimization: Restrict feature extraction to endpoints we've explicitly asked to featurize.
( endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
if strictcount(getTokenFeature(endpoint, featureName)) = 1 featureValue = getTokenFeature(endpoint, featureName)
then featureValue = getTokenFeature(endpoint, featureName)
else (
// Performance note: this is a Cartesian product between all endpoints and feature names.
featureValue = "" and featureName = getASupportedFeatureName()
)
)
} }

View File

@@ -80,22 +80,25 @@ DatabaseFeatures::Entity getRepresentativeEntityForEndpoint(DataFlow::Node endpo
} }
module ModelScoring { module ModelScoring {
predicate endpoints(DataFlow::Node endpoint) { /**
getCfg().isEffectiveSource(endpoint) or * A featurization config that only featurizes new candidate endpoints that are part of a flow
getCfg().isEffectiveSink(endpoint) * path.
*/
class RelevantFeaturizationConfig extends EndpointFeatures::FeaturizationConfig {
RelevantFeaturizationConfig() { this = "RelevantFeaturization" }
override DataFlow::Node getAnEndpointToFeaturize() { getCfg().isEffectiveSource(result) and any(DataFlow::Configuration cfg).hasFlow(result, _)
or
getCfg().isEffectiveSink(result) and any(DataFlow::Configuration cfg).hasFlow(_, result) }
} }
private int requestedEndpointTypes() { result = any(EndpointType type).getEncoding() } DataFlow::Node getARequestedEndpoint() { result = any(EndpointFeatures::FeaturizationConfig cfg).getAnEndpointToFeaturize() }
private predicate relevantTokenFeatures( private int getARequestedEndpointType() { result = any(EndpointType type).getEncoding() }
DataFlow::Node endpoint, string featureName, string featureValue
) {
endpoints(endpoint) and
EndpointFeatures::tokenFeatures(endpoint, featureName, featureValue)
}
predicate endpointScores(DataFlow::Node endpoint, int encodedEndpointType, float score) = predicate endpointScores(DataFlow::Node endpoint, int encodedEndpointType, float score) =
scoreEndpoints(endpoints/1, requestedEndpointTypes/0, relevantTokenFeatures/3, scoreEndpoints(getARequestedEndpoint/0, EndpointFeatures::tokenFeatures/3,
EndpointFeatures::getASupportedFeatureName/0, getARequestedEndpointType/0,
getACompatibleModelChecksum/0)(endpoint, encodedEndpointType, score) getACompatibleModelChecksum/0)(endpoint, encodedEndpointType, score)
} }
@@ -212,7 +215,9 @@ class EndpointScoringResults extends ScoringResults {
} }
module Debugging { module Debugging {
query predicate hopInputEndpoints = ModelScoring::endpoints/1; query predicate hopInputEndpoints(DataFlow::Node endpoint) {
endpoint = ModelScoring::getARequestedEndpoint()
}
query predicate endpointScores = ModelScoring::endpointScores/3; query predicate endpointScores = ModelScoring::endpointScores/3;

View File

@@ -1,5 +1,5 @@
name: codeql/javascript-experimental-atm-lib name: codeql/javascript-experimental-atm-lib
version: 0.0.0 version: 0.0.2
extractor: javascript extractor: javascript
library: true library: true
dependencies: dependencies:

View File

@@ -1,6 +1,6 @@
name: codeql/javascript-experimental-atm-queries name: codeql/javascript-experimental-atm-queries
language: javascript language: javascript
version: 0.0.0 version: 0.0.2
suites: codeql-suites suites: codeql-suites
defaultSuiteFile: codeql-suites/javascript-atm-code-scanning.qls defaultSuiteFile: codeql-suites/javascript-atm-code-scanning.qls
dependencies: dependencies: