Compare commits

...

10 Commits

Author SHA1 Message Date
Ian Wright
a716d39370 repatch 2021-12-17 16:59:31 +00:00
Ian Wright
335b2466a9 patch again 2021-12-17 16:49:40 +00:00
Ian Wright
96ae9617ec post cherry-pick patch 2021-12-17 16:43:07 +00:00
Henry Mercer
82029663b2 JS: Push FeaturizationConfig context into more predicates 2021-12-17 16:03:15 +00:00
Henry Mercer
4cd15ba654 JS: Only featurize endpoints that are part of a flow path 2021-12-17 15:58:46 +00:00
Ian Wright
c17c10e450 Revert "JS: Push FeaturizationConfig context into more predicates"
This reverts commit a0f479d503.
2021-12-17 15:54:03 +00:00
Henry Mercer
a0f479d503 JS: Push FeaturizationConfig context into more predicates 2021-12-17 13:54:25 +00:00
Ian Wright
24a5e8a8e1 bump the release number 2021-12-17 13:12:05 +00:00
Henry Mercer
427cdf480a JS: Update featurization for absent features optimization
Absent features are now represented implicitly by the absence of a row
in the `tokenFeatures` relation, rather than explicitly by an empty
string. This leads to improved runtime performance. To enable this
implicit representation, we pass the set of supported token features to
the `scoreEndpoints` HOP. Requires CodeQL CLI v2.7.4.
2021-12-17 13:10:10 +00:00
Ian Wright
8e1f2645cb bump the release number 2021-12-17 13:10:10 +00:00
4 changed files with 87 additions and 66 deletions

View File

@@ -6,7 +6,20 @@
import javascript
import CodeToFeatures
import EndpointScoring
private import EndpointScoring
/**
* A configuration that defines which endpoints should be featurized.
*
* This is used as a performance optimization to ensure that we only featurize the endpoints we need
* to featurize.
*/
abstract class FeaturizationConfig extends string {
bindingset[this]
FeaturizationConfig() { any() }
abstract DataFlow::Node getAnEndpointToFeaturize();
}
/**
* Gets the value of the token-based feature named `featureName` for the endpoint `endpoint`.
@@ -14,51 +27,55 @@ import EndpointScoring
* This is a single string containing a space-separated list of tokens.
*/
private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
// Features for endpoints that are contained within a function.
exists(DatabaseFeatures::Entity entity | entity = getRepresentativeEntityForEndpoint(endpoint) |
// The name of the function that encloses the endpoint.
featureName = "enclosingFunctionName" and result = entity.getName()
or
// A feature containing natural language tokens from the function that encloses the endpoint in
// the order that they appear in the source code.
featureName = "enclosingFunctionBody" and
result = unique(string x | x = FunctionBodies::getBodyTokenFeatureForEntity(entity))
)
or
result =
strictconcat(DataFlow::CallNode call, string component |
component = getACallBasedTokenFeatureComponent(endpoint, call, featureName)
|
component, " "
// Performance optimization: Restrict feature extraction to endpoints we've explicitly asked to featurize.
endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
(
// Features for endpoints that are contained within a function.
exists(DatabaseFeatures::Entity entity | entity = getRepresentativeEntityForEndpoint(endpoint) |
// The name of the function that encloses the endpoint.
featureName = "enclosingFunctionName" and result = entity.getName()
or
// A feature containing natural language tokens from the function that encloses the endpoint in
// the order that they appear in the source code.
featureName = "enclosingFunctionBody" and
result = unique(string x | x = FunctionBodies::getBodyTokenFeatureForEntity(entity))
)
or
// The access path of the function being called, both with and without structural info, if the
// function being called originates from an external API. For example, the endpoint here:
//
// ```js
// const mongoose = require('mongoose'),
// User = mongoose.model('User', null);
// User.findOne(ENDPOINT);
// ```
//
// would have a callee access path with structural info of
// `mongoose member model instanceorreturn member findOne instanceorreturn`, and a callee access
// path without structural info of `mongoose model findOne`.
//
// These features indicate that the callee comes from (reading the access path backwards) an
// instance of the `findOne` member of an instance of the `model` member of the `mongoose`
// external library.
exists(AccessPaths::Boolean includeStructuralInfo |
featureName =
"calleeAccessPath" +
any(string x | if includeStructuralInfo = true then x = "WithStructuralInfo" else x = "") and
or
result =
concat(API::Node node, string accessPath |
node.getInducingNode().(DataFlow::CallNode).getAnArgument() = endpoint and
AccessPaths::accessPaths(node, includeStructuralInfo, accessPath, _)
strictconcat(DataFlow::CallNode call, string component |
component = getACallBasedTokenFeatureComponent(endpoint, call, featureName)
|
accessPath, " "
component, " "
)
or
// The access path of the function being called, both with and without structural info, if the
// function being called originates from an external API. For example, the endpoint here:
//
// ```js
// const mongoose = require('mongoose'),
// User = mongoose.model('User', null);
// User.findOne(ENDPOINT);
// ```
//
// would have a callee access path with structural info of
// `mongoose member model instanceorreturn member findOne instanceorreturn`, and a callee access
// path without structural info of `mongoose model findOne`.
//
// These features indicate that the callee comes from (reading the access path backwards) an
// instance of the `findOne` member of an instance of the `model` member of the `mongoose`
// external library.
exists(AccessPaths::Boolean includeStructuralInfo |
featureName =
"calleeAccessPath" +
any(string x | if includeStructuralInfo = true then x = "WithStructuralInfo" else x = "") and
result =
concat(API::Node node, string accessPath |
node.getInducingNode().(DataFlow::CallNode).getAnArgument() = endpoint and
AccessPaths::accessPaths(node, includeStructuralInfo, accessPath, _)
|
accessPath, " "
)
)
)
}
@@ -77,6 +94,8 @@ private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
private string getACallBasedTokenFeatureComponent(
DataFlow::Node endpoint, DataFlow::CallNode call, string featureName
) {
// Performance optimization: Restrict feature extraction to endpoints we've explicitly asked to featurize.
endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
// Features for endpoints that are an argument to a function call.
endpoint = call.getAnArgument() and
(
@@ -111,6 +130,9 @@ private string getACallBasedTokenFeatureComponent(
module FunctionBodies {
/** Holds if `location` is the location of an AST node within the entity `entity` and `token` is a node attribute associated with that AST node. */
private predicate bodyTokens(DatabaseFeatures::Entity entity, Location location, string token) {
// Performance optimization: Restrict the set of entities to those containing an endpoint to featurize.
entity =
getRepresentativeEntityForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and
exists(DatabaseFeatures::AstNode node |
DatabaseFeatures::astNodes(entity, _, _, node, _) and
token = unique(string t | DatabaseFeatures::nodeAttributes(node, t)) and
@@ -261,7 +283,7 @@ private module AccessPaths {
}
/** Get a name of a supported generic token-based feature. */
private string getASupportedFeatureName() {
string getASupportedFeatureName() {
result =
[
"enclosingFunctionName", "calleeName", "receiverName", "argumentIndex", "calleeApiName",
@@ -276,13 +298,7 @@ private string getASupportedFeatureName() {
* `featureValue` for the endpoint `endpoint`.
*/
predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string featureValue) {
ModelScoring::endpoints(endpoint) and
(
if strictcount(getTokenFeature(endpoint, featureName)) = 1
then featureValue = getTokenFeature(endpoint, featureName)
else (
// Performance note: this is a Cartesian product between all endpoints and feature names.
featureValue = "" and featureName = getASupportedFeatureName()
)
)
// Performance optimization: Restrict feature extraction to endpoints we've explicitly asked to featurize.
endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
featureValue = getTokenFeature(endpoint, featureName)
}

View File

@@ -80,22 +80,25 @@ DatabaseFeatures::Entity getRepresentativeEntityForEndpoint(DataFlow::Node endpo
}
module ModelScoring {
predicate endpoints(DataFlow::Node endpoint) {
getCfg().isEffectiveSource(endpoint) or
getCfg().isEffectiveSink(endpoint)
/**
* A featurization config that only featurizes new candidate endpoints that are part of a flow
* path.
*/
class RelevantFeaturizationConfig extends EndpointFeatures::FeaturizationConfig {
RelevantFeaturizationConfig() { this = "RelevantFeaturization" }
override DataFlow::Node getAnEndpointToFeaturize() { getCfg().isEffectiveSource(result) and any(DataFlow::Configuration cfg).hasFlow(result, _)
or
getCfg().isEffectiveSink(result) and any(DataFlow::Configuration cfg).hasFlow(_, result) }
}
private int requestedEndpointTypes() { result = any(EndpointType type).getEncoding() }
DataFlow::Node getARequestedEndpoint() { result = any(EndpointFeatures::FeaturizationConfig cfg).getAnEndpointToFeaturize() }
private predicate relevantTokenFeatures(
DataFlow::Node endpoint, string featureName, string featureValue
) {
endpoints(endpoint) and
EndpointFeatures::tokenFeatures(endpoint, featureName, featureValue)
}
private int getARequestedEndpointType() { result = any(EndpointType type).getEncoding() }
predicate endpointScores(DataFlow::Node endpoint, int encodedEndpointType, float score) =
scoreEndpoints(endpoints/1, requestedEndpointTypes/0, relevantTokenFeatures/3,
scoreEndpoints(getARequestedEndpoint/0, EndpointFeatures::tokenFeatures/3,
EndpointFeatures::getASupportedFeatureName/0, getARequestedEndpointType/0,
getACompatibleModelChecksum/0)(endpoint, encodedEndpointType, score)
}
@@ -212,7 +215,9 @@ class EndpointScoringResults extends ScoringResults {
}
module Debugging {
query predicate hopInputEndpoints = ModelScoring::endpoints/1;
query predicate hopInputEndpoints(DataFlow::Node endpoint) {
endpoint = ModelScoring::getARequestedEndpoint()
}
query predicate endpointScores = ModelScoring::endpointScores/3;

View File

@@ -1,5 +1,5 @@
name: codeql/javascript-experimental-atm-lib
version: 0.0.0
version: 0.0.2
extractor: javascript
library: true
dependencies:

View File

@@ -1,6 +1,6 @@
name: codeql/javascript-experimental-atm-queries
language: javascript
version: 0.0.0
version: 0.0.2
suites: codeql-suites
defaultSuiteFile: codeql-suites/javascript-atm-code-scanning.qls
dependencies: