Compare commits

...

14 Commits

Author SHA1 Message Date
Tiferet Gazit
f28b85c059 Create many features with different scopes.
This will allow us to experiment with feature selection on the modeling side to pick out the best subset of scoped features.
2021-11-22 20:56:41 +00:00
Tiferet Gazit
b493702a5a Bug fix:
When getting the wrapper for the root node, we must give the container of the root node, not the endpoint, because they won't necessarily be the same for functions contained within functions.
2021-11-22 20:15:48 +00:00
Tiferet Gazit
e2ffba2e47 TODO Interim fix:
For now don't allow the neighborhood to go all the way out to the outermostEnclosingFunction, because that causes bugs.

After this change, `enclosingFunctionBodyEndpointNeighborhood` doesn't match `enclosingFunctionBody` even when the function is small, which is undesired behavior. `enclosingFunctionBody` includes the arguments to the function whereas `enclosingFunctionBodyEndpointNeighborhood` does not. We should find a way to fix this.
2021-11-22 20:10:24 +00:00
Tiferet Gazit
a507d6dc1d Formatting fixes 2021-11-22 20:02:28 +00:00
Tiferet Gazit
da5f664265 Create three features with different scopes:
Create three versions of `enclosingFunctionBodyEndpointNeighborhood`, each with a different threshold on the number of descendants.
2021-11-20 02:39:01 +00:00
Tiferet Gazit
d52af5599a Allow the neighborhood to be the entire enclosing function 2021-11-20 02:28:53 +00:00
Anna Railton
05460f6444 Add check for neighborhood node being top-level enclosing function 2021-11-18 16:34:06 +00:00
Anna Railton
c7f96928dc Reformat docstrings 2021-11-18 16:34:03 +00:00
Anna Railton
4351d9e861 Add predicates for the magic numbers in the predicates 2021-11-18 16:26:59 +00:00
Anna Railton
c3766477dc Apply suggestions from code review
Co-authored-by: Tiferet Gazit <tiferet@github.com>
2021-11-18 16:26:59 +00:00
annarailton
b6922813a8 Add neighborhoodBody feature to token features
Co-authored-by: Chris Smowton <smowton@github.com>
2021-11-18 16:26:59 +00:00
annarailton
5de0edc773 Add NeighborhoodBodies module
This provides functionality for getting the token features associated with a
neighborhood around an AST node. It is strongly related to `FunctionBodies`.

Co-authored-by: Chris Smowton <smowton@github.com>
2021-11-18 16:26:59 +00:00
annarailton
a8c1febf88 Add helper types for changes to EndpointFeatures
Co-authored-by: Chris Smowton <smowton@github.com>
2021-11-18 16:26:59 +00:00
annarailton
2b334055db Clean up the astNodes predicate to make it more explicit
Co-authored-by: Chris Smowton <smowton@github.com>
2021-11-18 16:26:59 +00:00
2 changed files with 188 additions and 12 deletions

View File

@@ -4,6 +4,8 @@
* Extracts data about the functions in the database for use in adaptive threat modeling (ATM).
*/
private import EndpointFeatures::NeighborhoodBodies
module Raw {
private import javascript as raw
@@ -30,6 +32,9 @@ module Raw {
entity.getNumBodyStmt() = 0 and not exists(entity.getAReturnedExpr())
}
/**
* Wrapper for `RawAstNode`: the single branch `TAstNode` has one value per raw node.
* (This could be an alias instead of a newtype.)
*/
newtype WrappedAstNode = TAstNode(RawAstNode rawNode)
/**
@@ -44,6 +49,10 @@ module Raw {
/** Gets the `AstNode` wrapping the parent of the underlying raw node. */
AstNode getParentNode() { result = TAstNode(rawNode.getParent()) }
/** Gets the underlying raw node, typed as a `raw::ASTNode`. */
raw::ASTNode getNode() { result = rawNode }
/** Gets the statement container that the underlying raw node reports through `getContainer()`. */
raw::StmtContainer getContainer() { result = rawNode.getContainer() }
/**
* Holds if the AST node has `result` as its `index`th attribute.
*
@@ -128,6 +137,11 @@ module Raw {
}
}
/**
* Gets the `Raw::AstNode` wrapper of `rawNode`, that is, the `TAstNode` value constructed from it.
*/
AstNode astNode(RawAstNode rawNode) { result = TAstNode(rawNode) }
/**
* Holds if `result` is the `index`'th child of the `parent` entity. Such
* a node is a root of an AST associated with this entity.
@@ -324,6 +338,11 @@ module Wrapped {
/** Gets the location reported by the wrapped `rawNode`. */
Raw::Location getLocation() { result = rawNode.getLocation() }
}
/**
* Gets the `Wrapped::AstNode` for `rawNode` in the context of `entity`, that is, the `TAstNode`
* value constructed from the pair (`entity`, `rawNode`).
*/
AstNode astNode(Raw::Entity entity, Raw::AstNode rawNode) { result = TAstNode(entity, rawNode) }
/**
* A synthetic AST node, created to be a leaf for an otherwise non-leaf attribute.
*/
@@ -383,6 +402,8 @@ module DatabaseFeatures {
/** Gets the location reported by the wrapped `entity`. */
override Location getLocation() { result = entity.getLocation() }
/** Gets the function that the wrapped `entity` reports through `getDefinedFunction()`. */
UnderlyingFunction getDefinedFunction() { result = entity.getDefinedFunction() }
/** Gets the `Wrapped::Entity` that this entity is built on. */
Wrapped::Entity getWrappedEntity() { result = entity }
}
class AstNode extends EntityOrAstNode, TAstNode {
@@ -392,8 +413,12 @@ module DatabaseFeatures {
/** Gets the node wrapping the `index`th child of the underlying `rawNode`. */
AstNode getChild(int index) { result = TAstNode(rawNode.getChild(index)) }
/** Gets a child of this node, at any index. */
AstNode getAChild() { result = this.getChild(_) }
/** Gets the `index`th attribute of this node, as reported by the underlying `rawNode`. */
string getAttribute(int index) { result = rawNode.getAttribute(index) }
/** Gets the `Wrapped::AstNode` that this node is built on. */
Wrapped::AstNode getRawNode() { result = rawNode }
/** Gets the type of this node, as reported by the underlying `rawNode`. */
override string getType() { result = rawNode.getType() }
/** Gets a textual representation of this node, which is its type. */
override string toString() { result = this.getType() }
@@ -401,6 +426,9 @@ module DatabaseFeatures {
/** Gets the location reported by the underlying `rawNode`. */
override Location getLocation() { result = rawNode.getLocation() }
}
/**
* Gets the `DatabaseFeatures::AstNode` that wraps `wrapped`, that is, the node whose
* `getRawNode()` is `wrapped`.
*/
AstNode astNode(Wrapped::AstNode wrapped) { result.getRawNode() = wrapped }
/** Consistency checks: these predicates should each have no results */
module Consistency {
query predicate nonLeafAttribute(AstNode node, int index, string attribute) {
@@ -423,15 +451,15 @@ module DatabaseFeatures {
}
query predicate astNodes(
Entity enclosingEntity, EntityOrAstNode parent, int index, AstNode node, string node_type
Entity enclosingEntity, EntityOrAstNode parent, int index, AstNode child, string childType
) {
node = enclosingEntity.getAstRoot(index) and
child = enclosingEntity.getAstRoot(index) and
parent = enclosingEntity and
node_type = node.getType()
childType = child.getType()
or
astNodes(enclosingEntity, _, _, parent, _) and
node = parent.(AstNode).getChild(index) and
node_type = node.getType()
child = parent.(AstNode).getChild(index) and
childType = child.getType()
}
query predicate nodeAttributes(AstNode node, string attr) {

View File

@@ -8,6 +8,28 @@ import javascript
import CodeToFeatures
import EndpointScoring
/**
 * An upper bound on the size of an AST subtree that may still be treated as the "neighborhood"
 * of a node inside it.
 *
 * Any integer from 1 to 10000 is a valid threshold. The `isN()` predicates pick out the specific
 * values for which features are generated.
 *
 * NOTE(review): only six of the 10000 values are referenced by name here; if evaluation cost
 * becomes a concern, consider narrowing the range after checking for other users.
 */
class DescendantsThreshold extends int {
  DescendantsThreshold() { this in [1 .. 10000] }

  /** Gets the numeric value of this threshold. */
  int getMaxNumDescendants() { result = this }

  /** Holds if this threshold is 32. */
  predicate is32() { this.getMaxNumDescendants() = 32 }

  /** Holds if this threshold is 64. */
  predicate is64() { this.getMaxNumDescendants() = 64 }

  /** Holds if this threshold is 128. */
  predicate is128() { this.getMaxNumDescendants() = 128 }

  /** Holds if this threshold is 256. */
  predicate is256() { this.getMaxNumDescendants() = 256 }

  /** Holds if this threshold is 512. */
  predicate is512() { this.getMaxNumDescendants() = 512 }

  /** Holds if this threshold is 1024. */
  predicate is1024() { this.getMaxNumDescendants() = 1024 }
}
/**
* Gets the maximum number of subtokens (256) that a body token feature may contain. Callers
* compare their token count against this value and produce the empty string when it is exceeded.
*/
private int maxNumBodySubtokens() { result = 256 }
/**
* Gets the value of the token-based feature named `featureName` for the endpoint `endpoint`.
*
@@ -25,6 +47,40 @@ private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
result = unique(string x | x = FunctionBodies::getBodyTokenFeatureForEntity(entity))
)
or
// A feature containing natural language tokens from the neighborhood around the endpoint
// (limited to within the function that encloses the endpoint), in the order they appear
// in the source code.
exists(
Raw::AstNode rootNode, DatabaseFeatures::AstNode rootNodeWrapped, DescendantsThreshold thresh
|
(
featureName = "enclosingFunctionBodyEndpointNeighborhood1024" and
thresh.is1024()
or
featureName = "enclosingFunctionBodyEndpointNeighborhood512" and
thresh.is512()
or
featureName = "enclosingFunctionBodyEndpointNeighborhood256" and
thresh.is256()
or
featureName = "enclosingFunctionBodyEndpointNeighborhood128" and
thresh.is128()
or
featureName = "enclosingFunctionBodyEndpointNeighborhood64" and
thresh.is64()
or
featureName = "enclosingFunctionBodyEndpointNeighborhood32" and
thresh.is32()
) and
rootNode =
NeighborhoodBodies::getNeighborhoodAstNode(Raw::astNode(endpoint.getAstNode()), thresh) and
rootNodeWrapped = DatabaseFeatures::astNode(Wrapped::astNode(rootNode.getContainer(), rootNode)) and
result =
unique(string x |
x = NeighborhoodBodies::getBodyTokenFeatureForNeighborhoodNode(rootNodeWrapped)
)
)
or
exists(getACallBasedTokenFeatureComponent(endpoint, _, featureName)) and
result =
concat(DataFlow::CallNode call, string component |
@@ -121,13 +177,17 @@ module FunctionBodies {
/**
* Gets the body token feature for the specified entity.
*
* This is a string containing natural language tokens in the order that they appear in the source code for the entity.
* This is a string containing natural language tokens in the order that they appear in the source
* code for the entity.
*
* If a function has more than `maxNumBodySubtokens` body subtokens, then featurize it as absent.
* This approximates the behavior of the classifier on non-generic body features where large body
* features are replaced by the absent token.
*/
string getBodyTokenFeatureForEntity(DatabaseFeatures::Entity entity) {
// If a function has more than 256 body subtokens, then featurize it as absent. This
// approximates the behavior of the classifer on non-generic body features where large body
// features are replaced by the absent token.
if count(DatabaseFeatures::AstNode node, string token | bodyTokens(entity, node, token)) > 256
if
count(DatabaseFeatures::AstNode node, string token | bodyTokens(entity, node, token)) >
maxNumBodySubtokens()
then result = ""
else
result =
@@ -147,6 +207,86 @@ module FunctionBodies {
}
}
/**
 * Provides predicates for computing a locally-scoped variant of the `enclosingFunctionBody`
 * feature: the tokens found in a bounded AST neighborhood around a node, rather than in the
 * whole enclosing function body.
 */
module NeighborhoodBodies {
  /**
   * Holds if the upward walk performed by `getNeighborhoodAstNode` stops at `node` for the
   * threshold `limit`: either the subtree rooted at the parent of `node` is larger than `limit`
   * allows, or that parent is an outermost enclosing function of `node`.
   */
  private predicate isNeighborhoodRoot(Raw::AstNode node, DescendantsThreshold limit) {
    getNumDescendents(node.getParentNode()) > limit.getMaxNumDescendants()
    or
    node.getParentNode() = getOutermostEnclosingFunction(node)
  }

  /**
   * Gets the root of the neighborhood of `node`: starting at `node`, walk towards the AST root
   * and return the first node whose parent is either too large for `maxNumDescendants` or is an
   * outermost enclosing function.
   *
   * The size check is applied to the parent only, so when the parent of the starting node is
   * already too large the starting node is returned regardless of its own size. There is no
   * result if the walk reaches a node without a parent before either stopping condition holds.
   *
   * TODO: Maybe instead of a threshold on the number of descendants, we should have a threshold
   * on the number of leaves in the subtree, which is a closer approximation to the number of
   * tokens in the subtree.
   */
  Raw::AstNode getNeighborhoodAstNode(Raw::AstNode node, DescendantsThreshold maxNumDescendants) {
    isNeighborhoodRoot(node, maxNumDescendants) and
    result = node
    or
    not isNeighborhoodRoot(node, maxNumDescendants) and
    result = getNeighborhoodAstNode(node.getParentNode(), maxNumDescendants)
  }

  /**
   * Gets the number of nodes in the AST subtree rooted at `node`.
   *
   * The count is taken over the reflexive transitive closure of `getAChildNode`, so it includes
   * `node` itself in addition to its descendants.
   */
  int getNumDescendents(Raw::AstNode node) { result = count(node.getAChildNode*()) }

  /**
   * Gets the container of `node`. This is the `getContainer()` member written as a plain
   * predicate so that the closure operator `*` can be applied to it.
   */
  private ASTNode getContainer(ASTNode node) { node.getContainer() = result }

  /**
   * Gets the outermost function enclosing `node`, as a `Raw::AstNode`.
   *
   * Concretely, the result wraps either the AST node underlying `node` or one of its transitive
   * containers, and is selected by requiring that its own container is a `TopLevel`.
   */
  Raw::AstNode getOutermostEnclosingFunction(Raw::AstNode node) {
    exists(ASTNode enclosing | enclosing = getContainer*(node.getNode()) |
      result = Raw::astNode(enclosing)
    ) and
    result.getContainer() instanceof TopLevel
  }

  /**
   * Holds if `childNode` is `rootNode` or one of its descendants, and `token` is the unique node
   * attribute of `childNode`. Nodes with no attribute, or with more than one, are not included.
   * Attributes are expected on AST leaves only.
   *
   * TODO we may need to restrict `rootNode` to be a neighborhood root to avoid a potentially big
   * result set.
   */
  private predicate bodyTokens(
    DatabaseFeatures::AstNode rootNode, DatabaseFeatures::AstNode childNode, string token
  ) {
    token = unique(string attr | DatabaseFeatures::nodeAttributes(childNode, attr)) and
    rootNode.getAChild*() = childNode
  }

  /**
   * Gets the body token feature for the part of the function body that lies under `rootNode` in
   * the AST.
   *
   * The result is a string of space-separated tokens taken from the subtree rooted at
   * `rootNode`, ordered by source location (file path, start line, start column, end line, end
   * column) and then by token text. Nodes that carry no token, such as operators, do not
   * contribute.
   *
   * If the subtree has more than `maxNumBodySubtokens` body subtokens, it is featurized as
   * absent (the empty string). This approximates the behavior of the classifier on non-generic
   * body features, where large body features are replaced by the absent token.
   */
  string getBodyTokenFeatureForNeighborhoodNode(DatabaseFeatures::AstNode rootNode) {
    exists(int numTokens |
      numTokens =
        count(DatabaseFeatures::AstNode leaf, string tok | bodyTokens(rootNode, leaf, tok))
    |
      // Too many tokens: report the feature as absent.
      numTokens > maxNumBodySubtokens() and
      result = ""
      or
      // Otherwise join the tokens in source order, separated by single spaces.
      not numTokens > maxNumBodySubtokens() and
      result =
        concat(int position, string orderedToken |
          orderedToken =
            rank[position](DatabaseFeatures::AstNode leaf, string tok, Location loc |
              bodyTokens(rootNode, leaf, tok) and loc = leaf.getLocation()
            |
              tok
              order by
                loc.getFile().getAbsolutePath(), loc.getStartLine(), loc.getStartColumn(),
                loc.getEndLine(), loc.getEndColumn(), tok
            )
        |
          orderedToken, " " order by position
        )
    )
  }
}
/**
* This module provides functionality for getting a representation of the access path of nodes
* within the program.
@@ -191,7 +331,10 @@ private module AccessPaths {
/** Holds for both `true` and `false`, so this class contains every `boolean` value. */
Boolean() { this = true or this = false }
}
/** Get the access path for the node. This includes structural information like `member`, `param`, and `functionalarg` if `includeStructuralInfo` is true. */
/**
* Get the access path for the node. This includes structural information like `member`, `param`,
* and `functionalarg` if `includeStructuralInfo` is true.
*/
predicate accessPaths(
API::Node node, Boolean includeStructuralInfo, string accessPath, string apiName
) {
@@ -269,7 +412,12 @@ private string getASupportedFeatureName() {
result =
[
"enclosingFunctionName", "calleeName", "receiverName", "argumentIndex", "calleeApiName",
"calleeAccessPath", "calleeAccessPathWithStructuralInfo", "enclosingFunctionBody"
"calleeAccessPath", "calleeAccessPathWithStructuralInfo", "enclosingFunctionBody",
"enclosingFunctionBodyEndpointNeighborhood1024",
"enclosingFunctionBodyEndpointNeighborhood512",
"enclosingFunctionBodyEndpointNeighborhood256",
"enclosingFunctionBodyEndpointNeighborhood128", "enclosingFunctionBodyEndpointNeighborhood64",
"enclosingFunctionBodyEndpointNeighborhood32"
]
}