Merge pull request #7307 from adityasharad/atm/perf-debugging

JS/ATM: Various compilation fixes and performance improvements
This commit is contained in:
Henry Mercer
2021-12-10 11:00:27 +00:00
committed by GitHub
2 changed files with 103 additions and 41 deletions

View File

@@ -25,9 +25,8 @@ private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
result = unique(string x | x = FunctionBodies::getBodyTokenFeatureForEntity(entity))
)
or
exists(getACallBasedTokenFeatureComponent(endpoint, _, featureName)) and
result =
concat(DataFlow::CallNode call, string component |
strictconcat(DataFlow::CallNode call, string component |
component = getACallBasedTokenFeatureComponent(endpoint, call, featureName)
|
component, " "
@@ -110,12 +109,13 @@ private string getACallBasedTokenFeatureComponent(
/** This module provides functionality for getting the function body feature associated with a particular entity. */
module FunctionBodies {
/** Holds if `node` is an AST node within the entity `entity` and `token` is a node attribute associated with `node`. */
private predicate bodyTokens(
DatabaseFeatures::Entity entity, DatabaseFeatures::AstNode node, string token
) {
DatabaseFeatures::astNodes(entity, _, _, node, _) and
token = unique(string t | DatabaseFeatures::nodeAttributes(node, t))
/** Holds if `location` is the location of an AST node within the entity `entity` and `token` is a node attribute associated with that AST node. */
private predicate bodyTokens(DatabaseFeatures::Entity entity, Location location, string token) {
exists(DatabaseFeatures::AstNode node |
DatabaseFeatures::astNodes(entity, _, _, node, _) and
token = unique(string t | DatabaseFeatures::nodeAttributes(node, t)) and
location = node.getLocation()
)
}
/**
@@ -127,23 +127,18 @@ module FunctionBodies {
// If a function has more than 256 body subtokens, then featurize it as absent. This
// approximates the behavior of the classifier on non-generic body features where large body
// features are replaced by the absent token.
if count(DatabaseFeatures::AstNode node, string token | bodyTokens(entity, node, token)) > 256
then result = ""
else
result =
concat(int i, string rankedToken |
rankedToken =
rank[i](DatabaseFeatures::AstNode node, string token, Location l |
bodyTokens(entity, node, token) and l = node.getLocation()
|
token
order by
l.getFile().getAbsolutePath(), l.getStartLine(), l.getStartColumn(), l.getEndLine(),
l.getEndColumn(), token
)
|
rankedToken, " " order by i
)
//
// We count locations instead of tokens because tokens are often not unique.
strictcount(Location l | bodyTokens(entity, l, _)) <= 256 and
result =
strictconcat(string token, Location l |
bodyTokens(entity, l, token)
|
token, " "
order by
l.getFile().getAbsolutePath(), l.getStartLine(), l.getStartColumn(), l.getEndLine(),
l.getEndColumn(), token
)
}
}
@@ -247,11 +242,12 @@ private module AccessPaths {
else accessPath = previousAccessPath + " " + paramName
)
or
exists(string callbackName, string index |
exists(string callbackName, int index |
node =
getNamedParameter(previousNode.getASuccessor("param " + index).getMember(callbackName),
paramName) and
index != "-1" and // ignore receiver
getNamedParameter(previousNode
.getASuccessor(API::Label::parameter(index))
.getMember(callbackName), paramName) and
index != -1 and // ignore receiver
if includeStructuralInfo = true
then
accessPath =
@@ -280,10 +276,13 @@ private string getASupportedFeatureName() {
* `featureValue` for the endpoint `endpoint`.
*/
predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string featureValue) {
featureName = getASupportedFeatureName() and
ModelScoring::endpoints(endpoint) and
(
featureValue = unique(string x | x = getTokenFeature(endpoint, featureName))
or
not exists(unique(string x | x = getTokenFeature(endpoint, featureName))) and featureValue = ""
if strictcount(getTokenFeature(endpoint, featureName)) = 1
then featureValue = getTokenFeature(endpoint, featureName)
else (
// Performance note: this is a Cartesian product between all endpoints and feature names.
featureValue = "" and featureName = getASupportedFeatureName()
)
)
}

View File

@@ -92,14 +92,31 @@ abstract class FrameworkLibraryWithMarkerComment extends FrameworkLibrary {
/**
* Gets a regular expression that can be used to identify an instance of
* this framework library.
* this framework library, with `<VERSION>` as a placeholder for version
* numbers.
*
* The first capture group of this regular expression should match
* the version number. Any occurrences of the string `<VERSION>` in
* the regular expression are replaced by `versionRegex()` before
* matching.
* the version number.
*
* Subclasses should implement this predicate.
*
* Callers should avoid using this predicate directly,
* and instead use `getAMarkerCommentRegexWithoutPlaceholders()`,
* which will replace any occurrences of the string `<VERSION>` in
* the regular expression with `versionRegex()`.
*/
abstract string getAMarkerCommentRegex();
/**
 * Gets a marker-comment regular expression for this framework library in
 * which every occurrence of the `<VERSION>` placeholder has been replaced
 * by `versionRegex()`.
 *
 * The pattern is derived from `getAMarkerCommentRegex()`. Its first capture
 * group is intended to match the version number.
 */
final string getAMarkerCommentRegexWithoutPlaceholders() {
  exists(string rawRegex | rawRegex = this.getAMarkerCommentRegex() |
    result = rawRegex.replaceAll("<VERSION>", versionRegex())
  )
}
}
/**
@@ -182,18 +199,64 @@ class FrameworkLibraryInstanceWithMarkerComment extends FrameworkLibraryInstance
override predicate info(FrameworkLibrary fl, string v) { matchMarkerComment(_, this, fl, v) }
}
/**
 * A comment whose text matches the marker-comment regular expression of at
 * least one framework library.
 */
private class MarkerComment extends Comment {
  MarkerComment() {
    /*
     * PERFORMANCE NOTE:
     *
     * Every framework library contributes a regular expression describing its
     * marker comments, and we need to know which comments match which of
     * those regexes. To evaluate a regex match, CodeQL first builds the
     * Cartesian product of candidate receiver strings and candidate regexes,
     * i.e. `num_receivers * num_regexes` tuples.
     *
     * Matching each comment against each framework regex directly would
     * therefore materialise `num_comments * num_frameworks` tuples.
     *
     * To avoid that, the characteristic predicate matches against a single
     * alternation of all framework regexes,
     * `(regex_1)|(regex_2)|...|(regex_n)`,
     * so the product is taken with a singleton set of regexes and only
     * `num_comments * 1` tuples are needed.
     *
     * Working out which framework a comment belongs to, and extracting the
     * version from its capture group, is left to the member predicate
     * `matchesFramework`, which should be used *after* this predicate has
     * been computed.
     */

    exists(string anyFrameworkRegex |
      anyFrameworkRegex =
        concat(FrameworkLibraryWithMarkerComment fl |
          |
          "(" + fl.getAMarkerCommentRegexWithoutPlaceholders() + ")", "|"
        ) and
      this.getText().regexpMatch(anyFrameworkRegex)
    )
  }

  /**
   * Holds if the text of this marker comment matches a marker-comment regex
   * of the framework `fl`, and `version` is the string captured by the first
   * group of that regex.
   */
  predicate matchesFramework(FrameworkLibraryWithMarkerComment fl, string version) {
    version = this.getText().regexpCapture(fl.getAMarkerCommentRegexWithoutPlaceholders(), 1)
  }
}
/**
* Holds if comment `c` in toplevel `tl` matches the marker comment of library
* `fl` at `version`.
*/
cached
private predicate matchMarkerComment(
Comment c, TopLevel tl, FrameworkLibraryWithMarkerComment fl, string version
MarkerComment c, TopLevel tl, FrameworkLibraryWithMarkerComment fl, string version
) {
c.getTopLevel() = tl and
exists(string r | r = fl.getAMarkerCommentRegex().replaceAll("<VERSION>", versionRegex()) |
version = c.getText().regexpCapture(r, 1)
)
c.matchesFramework(fl, version)
}
/**