Compare commits

...

8 Commits

Author SHA1 Message Date
Anna Railton
05460f6444 Add check for neighborhood node being top-level enclosing function 2021-11-18 16:34:06 +00:00
Anna Railton
c7f96928dc Reformat docstrings 2021-11-18 16:34:03 +00:00
Anna Railton
4351d9e861 Add predicates for the magic numbers in the predicates 2021-11-18 16:26:59 +00:00
Anna Railton
c3766477dc Apply suggestions from code review
Co-authored-by: Tiferet Gazit <tiferet@github.com>
2021-11-18 16:26:59 +00:00
annarailton
b6922813a8 Add neighborhoodBody feature to token features
Co-authored-by: Chris Smowton <smowton@github.com>
2021-11-18 16:26:59 +00:00
annarailton
5de0edc773 Add NeighborhoodBodies module
This provides functionality for getting the token features associated with a
neighborhood around an AST node. It is strongly related to `FunctionBodies`.

Co-authored-by: Chris Smowton <smowton@github.com>
2021-11-18 16:26:59 +00:00
annarailton
a8c1febf88 Add helper types for changes to EndpointFeatures
Co-authored-by: Chris Smowton <smowton@github.com>
2021-11-18 16:26:59 +00:00
annarailton
2b334055db Clean up to astNodes predicate to make more explicit
Co-authored-by: Chris Smowton <smowton@github.com>
2021-11-18 16:26:59 +00:00
2 changed files with 149 additions and 12 deletions

View File

@@ -4,6 +4,8 @@
* Extracts data about the functions in the database for use in adaptive threat modeling (ATM).
*/
private import EndpointFeatures::NeighborhoodBodies
module Raw {
private import javascript as raw
@@ -30,6 +32,9 @@ module Raw {
entity.getNumBodyStmt() = 0 and not exists(entity.getAReturnedExpr())
}
/**
* Wrapper for RawAstNode (could be an alias instead of a newtype)
*/
newtype WrappedAstNode = TAstNode(RawAstNode rawNode)
/**
@@ -44,6 +49,10 @@ module Raw {
AstNode getParentNode() { result = TAstNode(rawNode.getParent()) }
raw::ASTNode getNode() { result = rawNode }
raw::StmtContainer getContainer() { result = rawNode.getContainer() }
/**
* Holds if the AST node has `result` as its `index`th attribute.
*
@@ -128,6 +137,11 @@ module Raw {
}
}
/**
* Returns the `Raw::AstNode` wrapper of `rawNode`
*/
AstNode astNode(RawAstNode rawNode) { result = TAstNode(rawNode) }
/**
* Holds if `result` is the `index`'th child of the `parent` entity. Such
* a node is a root of an AST associated with this entity.
@@ -324,6 +338,11 @@ module Wrapped {
Raw::Location getLocation() { result = rawNode.getLocation() }
}
/**
* Returns the `Wrapped::AstNode` for a `rawNode` in the context of `entity`
*/
AstNode astNode(Raw::Entity entity, Raw::AstNode rawNode) { result = TAstNode(entity, rawNode) }
/**
* A synthetic AST node, created to be a leaf for an otherwise non-leaf attribute.
*/
@@ -383,6 +402,8 @@ module DatabaseFeatures {
override Location getLocation() { result = entity.getLocation() }
UnderlyingFunction getDefinedFunction() { result = entity.getDefinedFunction() }
Wrapped::Entity getWrappedEntity() { result = entity }
}
class AstNode extends EntityOrAstNode, TAstNode {
@@ -392,8 +413,12 @@ module DatabaseFeatures {
AstNode getChild(int index) { result = TAstNode(rawNode.getChild(index)) }
AstNode getAChild() { result = this.getChild(_) }
string getAttribute(int index) { result = rawNode.getAttribute(index) }
Wrapped::AstNode getRawNode() { result = rawNode }
override string getType() { result = rawNode.getType() }
override string toString() { result = this.getType() }
@@ -401,6 +426,9 @@ module DatabaseFeatures {
override Location getLocation() { result = rawNode.getLocation() }
}
/** Gets the `DatabaseFeatures::AstNode` that wraps `wrapped` */
AstNode astNode(Wrapped::AstNode wrapped) { result.getRawNode() = wrapped }
/** Consistency checks: these predicates should each have no results */
module Consistency {
query predicate nonLeafAttribute(AstNode node, int index, string attribute) {
@@ -423,15 +451,15 @@ module DatabaseFeatures {
}
query predicate astNodes(
Entity enclosingEntity, EntityOrAstNode parent, int index, AstNode node, string node_type
Entity enclosingEntity, EntityOrAstNode parent, int index, AstNode child, string childType
) {
node = enclosingEntity.getAstRoot(index) and
child = enclosingEntity.getAstRoot(index) and
parent = enclosingEntity and
node_type = node.getType()
childType = child.getType()
or
astNodes(enclosingEntity, _, _, parent, _) and
node = parent.(AstNode).getChild(index) and
node_type = node.getType()
child = parent.(AstNode).getChild(index) and
childType = child.getType()
}
query predicate nodeAttributes(AstNode node, string attr) {

View File

@@ -8,6 +8,12 @@ import javascript
import CodeToFeatures
import EndpointScoring
/**
* Maximum number of descendants of an AST node to be considered to be in the "neighborhood" of that node.
*
* `NeighborhoodBodies::getNeighborhoodAstNode` compares this threshold against the count computed
* for a node's parent when deciding whether to keep walking up the AST.
*/
private int maxNumDescendants() { result = 128 }
/**
* Maximum number of subtokens in a function body, or in the AST subtree of a neighborhood node.
*
* Bodies with more subtokens than this are featurized as absent (the empty string) by
* `FunctionBodies::getBodyTokenFeatureForEntity` and
* `NeighborhoodBodies::getBodyTokenFeatureForNeighborhoodNode`.
*/
private int maxNumBodySubtokens() { result = 256 }
/**
* Gets the value of the token-based feature named `featureName` for the endpoint `endpoint`.
*
@@ -25,6 +31,19 @@ private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
result = unique(string x | x = FunctionBodies::getBodyTokenFeatureForEntity(entity))
)
or
// A feature containing natural language tokens from the neighborhood around the endpoint
// (limited to within the function that encloses the endpoint), in the order they appear
// in the source code.
exists(Raw::AstNode rootNode, DatabaseFeatures::AstNode rootNodeWrapped |
featureName = "neighborhoodBody" and
rootNode = NeighborhoodBodies::getNeighborhoodAstNode(Raw::astNode(endpoint.getAstNode())) and
rootNodeWrapped = DatabaseFeatures::astNode(Wrapped::astNode(endpoint.getContainer(), rootNode)) and
result =
unique(string x |
x = NeighborhoodBodies::getBodyTokenFeatureForNeighborhoodNode(rootNodeWrapped)
)
)
or
exists(getACallBasedTokenFeatureComponent(endpoint, _, featureName)) and
result =
concat(DataFlow::CallNode call, string component |
@@ -121,13 +140,17 @@ module FunctionBodies {
/**
* Gets the body token feature for the specified entity.
*
* This is a string containing natural language tokens in the order that they appear in the source code for the entity.
* This is a string containing natural language tokens in the order that they appear in the source
* code for the entity.
*
* If a function has more than `maxNumBodySubtokens` body subtokens, then featurize it as absent.
* This approximates the behavior of the classifier on non-generic body features where large body
* features are replaced by the absent token.
*/
string getBodyTokenFeatureForEntity(DatabaseFeatures::Entity entity) {
// If a function has more than 256 body subtokens, then featurize it as absent. This
// approximates the behavior of the classifier on non-generic body features where large body
// features are replaced by the absent token.
if count(DatabaseFeatures::AstNode node, string token | bodyTokens(entity, node, token)) > 256
if
count(DatabaseFeatures::AstNode node, string token | bodyTokens(entity, node, token)) >
maxNumBodySubtokens()
then result = ""
else
result =
@@ -147,6 +170,88 @@ module FunctionBodies {
}
}
/**
* This module provides functionality for getting the local neighborhood around an AST node within
* its enclosing function body, providing a locally-scoped version of the `enclosingFunctionBody` feature.
*/
module NeighborhoodBodies {
/**
* Return the ancestor of the input AST node that has the largest number of descendants (i.e. the
* node nearest the root) but has no more than `maxNumDescendants` descendants.
*
* The walk towards the root stops at `node` when either its parent is the outermost enclosing
* function of `node`, or the count computed by `getNumDescendents` for its parent exceeds
* `maxNumDescendants`. Otherwise the search continues from the parent.
*
* TODO: Maybe instead of a threshold on number of descendants, we should instead have a threshold
* on the number of leaves in the subtree, which is a closer approximation to the number of tokens
* in the subtree.
*/
Raw::AstNode getNeighborhoodAstNode(Raw::AstNode node) {
if
// `node` will always have a parent as we start at an endpoint
node.getParentNode() = getOutermostEnclosingFunction(node) or
getNumDescendents(node.getParentNode()) > maxNumDescendants()
then result = node
else result = getNeighborhoodAstNode(node.getParentNode())
}
/**
* Count number of descendants of an AST node.
*
* NOTE(review): `getAChildNode*()` is a reflexive transitive closure, so this count presumably
* includes `node` itself - confirm whether that off-by-one is intended.
*/
int getNumDescendents(Raw::AstNode node) { result = count(node.getAChildNode*()) }
/**
* Gets the container of `node`.
*
* This is a named predicate wrapping `node.getContainer()` so that it can be used with the `*`
* closure operator in `getOutermostEnclosingFunction`.
*/
private ASTNode getContainer(ASTNode node) {
result = node.getContainer()
}
/**
* Return the AST node that is the outermost enclosing function of `node` (as an AST node).
*
* This is the node reached from `node` by zero or more `getContainer` steps whose own container
* is a `TopLevel`.
*/
Raw::AstNode getOutermostEnclosingFunction(Raw::AstNode node) {
result = Raw::astNode(getContainer*(node.getNode())) and result.getContainer() instanceof TopLevel
}
/**
* Holds if `childNode` is an AST node under `rootNode` and `token` is a node attribute associated
* with `childNode`. Note that only AST leaves have node attributes.
*
* `childNode` may be `rootNode` itself, and `token` is only bound when `childNode` has exactly one
* node attribute (it is selected with `unique`).
*
* TODO we may need to restrict `rootNode` to be a neighborhood root to avoid a potentially big result set.
*/
private predicate bodyTokens(
DatabaseFeatures::AstNode rootNode, DatabaseFeatures::AstNode childNode, string token
) {
childNode = rootNode.getAChild*() and
token = unique(string t | DatabaseFeatures::nodeAttributes(childNode, t))
}
/**
* Gets the body token feature limited to the part of the function body that lies under `rootNode` in the AST.
*
* This is a string of space-separated natural language tokens (AST leaves) in the order that they
* appear in the source code for the AST subtree rooted at `rootNode`. This is equivalent to the
* portion of the code that falls under the AST subtree rooted at the given node, except that
* non-leaf nodes (such as operators) are excluded.
*
* Tokens are ordered by the location of their node (file path, start line, start column, end
* line, end column) and then by the token text.
*
* If the subtree under `rootNode` has more than `maxNumBodySubtokens` body subtokens, then
* featurize it as absent (the empty string). This approximates the behavior of the classifier on
* non-generic body features where large body features are replaced by the absent token.
*/
string getBodyTokenFeatureForNeighborhoodNode(DatabaseFeatures::AstNode rootNode) {
if
count(DatabaseFeatures::AstNode node, string token | bodyTokens(rootNode, node, token)) >
maxNumBodySubtokens()
then result = ""
else
result =
concat(int i, string rankedToken |
rankedToken =
rank[i](DatabaseFeatures::AstNode node, string token, Location l |
bodyTokens(rootNode, node, token) and l = node.getLocation()
|
token
order by
l.getFile().getAbsolutePath(), l.getStartLine(), l.getStartColumn(), l.getEndLine(),
l.getEndColumn(), token
)
|
rankedToken, " " order by i
)
}
}
/**
* This module provides functionality for getting a representation of the access path of nodes
* within the program.
@@ -191,7 +296,10 @@ private module AccessPaths {
Boolean() { this = true or this = false }
}
/** Get the access path for the node. This includes structural information like `member`, `param`, and `functionalarg` if `includeStructuralInfo` is true. */
/**
* Get the access path for the node. This includes structural information like `member`, `param`,
* and `functionalarg` if `includeStructuralInfo` is true.
*/
predicate accessPaths(
API::Node node, Boolean includeStructuralInfo, string accessPath, string apiName
) {
@@ -269,7 +377,8 @@ private string getASupportedFeatureName() {
result =
[
"enclosingFunctionName", "calleeName", "receiverName", "argumentIndex", "calleeApiName",
"calleeAccessPath", "calleeAccessPathWithStructuralInfo", "enclosingFunctionBody"
"calleeAccessPath", "calleeAccessPathWithStructuralInfo", "enclosingFunctionBody",
"neighborhoodBody"
]
}