Remove CodeToFeatures AST library

This commit is contained in:
Henry Mercer
2022-01-12 11:18:56 +00:00
parent 957e34d8a7
commit 6e37a65e84
2 changed files with 59 additions and 415 deletions

View File

@@ -1,404 +0,0 @@
/*
* For internal use only.
*
* Extracts data about the functions in the database for use in adaptive threat modeling (ATM).
*/
module Raw {
private import javascript as raw
class RawAstNode = raw::ASTNode;
class Entity = raw::Function;
class Location = raw::Location;
/**
* Exposed as a tool for defining anchors for semantic search.
*/
class UnderlyingFunction = raw::Function;
/**
* Determines whether an entity should be omitted from ATM.
*/
predicate isEntityIgnored(Entity entity) {
// Ignore entities which don't have definitions, for example those in TypeScript
// declaration files.
not exists(entity.getBody())
or
// Ignore entities with an empty body, for example the JavaScript function () => {}.
entity.getNumBodyStmt() = 0 and not exists(entity.getAReturnedExpr())
}
newtype WrappedAstNode = TAstNode(RawAstNode rawNode)
/**
* This class represents nodes in the AST.
*/
class AstNode extends TAstNode {
RawAstNode rawNode;
AstNode() { this = TAstNode(rawNode) }
AstNode getAChildNode() { result = TAstNode(rawNode.getAChild()) }
AstNode getParentNode() { result = TAstNode(rawNode.getParent()) }
/**
* Holds if the AST node has `result` as its `index`th attribute.
*
* The index is not intended to mean anything, and is only here for disambiguation.
* There are no guarantees about any particular index being used (or not being used).
*/
string astNodeAttribute(int index) {
(
// NB: Unary and binary operator expressions e.g. -a, a + b and compound
// assignments e.g. a += b can be identified by the expression type.
result = rawNode.(raw::Identifier).getName()
or
// Computed property accesses for which we can predetermine the property being accessed.
// NB: May alias with operators e.g. could have '+' as a property name.
result = rawNode.(raw::IndexExpr).getPropertyName()
or
// We use `getRawValue` to give us distinct representations for `0xa`, `0xA`, and `10`.
result = rawNode.(raw::NumberLiteral).getRawValue()
or
// We use `getValue` rather than `getRawValue` so we assign `"a"` and `'a'` the same representation.
not rawNode instanceof raw::NumberLiteral and
result = rawNode.(raw::Literal).getValue()
or
result = rawNode.(raw::TemplateElement).getRawValue()
) and
index = 0
}
/**
* Returns a string indicating the "type" of the AST node.
*/
string astNodeType() {
// The definition of this method should correspond with that of the `@ast_node` entry in the
// dbscheme.
result = "js_exprs." + any(int kind | exprs(rawNode, kind, _, _, _))
or
result = "js_properties." + any(int kind | properties(rawNode, _, _, kind, _))
or
result = "js_stmts." + any(int kind | stmts(rawNode, kind, _, _, _))
or
result = "js_toplevel" and rawNode instanceof raw::TopLevel
or
result = "js_typeexprs." + any(int kind | typeexprs(rawNode, kind, _, _, _))
}
/**
* Holds if `result` is the `index`'th child of the AST node, for some arbitrary indexing.
* A root of the AST should be its own child, with an arbitrary (though conventionally
* 0) index.
*
* Notably, the order in which child nodes are visited is not required to be meaningful,
* and no particular index is required to be meaningful. However, `(parent, index)`
* should be a keyset.
*/
pragma[nomagic]
AstNode astNodeChild(int index) {
result =
rank[index - 1](AstNode child, raw::Location l |
child = this.getAChildNode() and l = child.getLocation()
|
child
order by
l.getStartLine(), l.getStartColumn(), l.getEndLine(), l.getEndColumn(),
child.astNodeType()
)
or
not exists(result.getParentNode()) and this = result and index = 0
}
raw::Location getLocation() { result = rawNode.getLocation() }
string toString() { result = rawNode.toString() }
predicate isEntityNameNode(Entity entity) {
exists(int index |
TAstNode(entity) = this.getParentNode() and
this = this.getParentNode().astNodeChild(index) and
// An entity name node must be the first child of the entity.
index = min(int otherIndex | exists(this.getParentNode().astNodeChild(otherIndex))) and
entity.getName() = rawNode.(raw::VarDecl).getName()
)
}
}
/**
* Holds if `result` is the `index`'th child of the `parent` entity. Such
* a node is a root of an AST associated with this entity.
*/
AstNode entityChild(AstNode parent, int index) {
// In JavaScript, entities appear in the AST parent/child relationship.
result = parent.astNodeChild(index)
}
/**
* Holds if `node` is contained in `entity`. Note that a single node may be contained
* in multiple entities, if they are nested. An entity, in particular, should be
* reported as contained within itself.
*/
predicate entityContains(Entity entity, AstNode node) {
node.getParentNode*() = TAstNode(entity) and not node.isEntityNameNode(entity)
}
/**
* Get the name of the entity.
*
* We attempt to assign unnamed entities approximate names if they are passed to a likely
* external library function. If we can't assign them an approximate name, we give them the name
* `""`, so that these entities are included in `AdaptiveThreatModeling.qll`.
*
* For entities which have multiple names, we choose the lexically smallest name.
*/
string getEntityName(Entity entity) {
if exists(entity.getName())
then
// https://github.com/github/ml-ql-adaptive-threat-modeling/issues/244 discusses making use
// of all the names during training.
result = min(entity.getName())
else
if exists(getApproximateNameForEntity(entity))
then result = getApproximateNameForEntity(entity)
else result = ""
}
/**
* Holds if the call `call` has `entity` is its `argumentIndex`th argument.
*/
private predicate entityUsedAsArgumentToCall(
Entity entity, raw::DataFlow::CallNode call, int argumentIndex
) {
raw::DataFlow::localFlowStep*(call.getArgument(argumentIndex), entity.flow())
}
/**
* Returns a generated name for the entity. This name is generated such that
* entities with the same names have similar behavior.
*/
private string getApproximateNameForEntity(Entity entity) {
count(raw::DataFlow::CallNode call, int index | entityUsedAsArgumentToCall(entity, call, index)) =
1 and
exists(raw::DataFlow::CallNode call, int index, string basePart |
entityUsedAsArgumentToCall(entity, call, index) and
(
if count(getReceiverName(call)) = 1
then basePart = getReceiverName(call) + "."
else basePart = ""
) and
result = basePart + call.getCalleeName() + "#functionalargument"
)
}
private string getReceiverName(raw::DataFlow::CallNode call) {
result = call.getReceiver().asExpr().(raw::VarAccess).getName()
}
}
module Wrapped {
/*
* We require any node with attributes to be a leaf. Where a non-leaf node
* has an attribute, we instead create a synthetic leaf node that has that
* attribute.
*/
/**
* Holds if the AST node `e` is a leaf node.
*/
private predicate isLeaf(Raw::AstNode e) { not exists(e.astNodeChild(_)) }
newtype WrappedEntity =
TEntity(Raw::Entity entity) {
exists(entity.getLocation().getFile().getRelativePath()) and
Raw::entityContains(entity, _)
}
/**
* A type ranging over the kinds of entities for which we want to consider embeddings.
*/
class Entity extends WrappedEntity {
Raw::Entity rawEntity;
Entity() { this = TEntity(rawEntity) and not Raw::isEntityIgnored(rawEntity) }
string getName() { result = Raw::getEntityName(rawEntity) }
AstNode getAstRoot(int index) {
result = TAstNode(rawEntity, Raw::entityChild(Raw::TAstNode(rawEntity), index))
}
string toString() { result = rawEntity.toString() }
Raw::Location getLocation() { result = rawEntity.getLocation() }
Raw::UnderlyingFunction getDefinedFunction() { result = rawEntity }
}
newtype WrappedAstNode =
TAstNode(Raw::Entity enclosingEntity, Raw::AstNode node) {
Raw::entityContains(enclosingEntity, node)
} or
TSyntheticNode(
Raw::Entity enclosingEntity, Raw::AstNode node, int syntheticChildIndex, int attrIndex
) {
Raw::entityContains(enclosingEntity, node) and
exists(node.astNodeAttribute(attrIndex)) and
not isLeaf(node) and
if exists(node.astNodeChild(_))
then
syntheticChildIndex =
attrIndex - min(int other | exists(node.astNodeAttribute(other))) +
max(int other | exists(node.astNodeChild(other))) + 1
else syntheticChildIndex = attrIndex
}
pragma[nomagic]
private AstNode injectedChild(Raw::Entity enclosingEntity, Raw::AstNode parent, int index) {
result = TAstNode(enclosingEntity, parent.astNodeChild(index)) or
result = TSyntheticNode(enclosingEntity, parent, index, _)
}
/**
* A type ranging over AST nodes. Ultimately, only nodes contained in entities will
* be considered.
*/
class AstNode extends WrappedAstNode {
Raw::Entity enclosingEntity;
Raw::AstNode rawNode;
AstNode() {
(
this = TAstNode(enclosingEntity, rawNode) or
this = TSyntheticNode(enclosingEntity, rawNode, _, _)
) and
not Raw::isEntityIgnored(enclosingEntity)
}
string getAttribute(int index) {
result = rawNode.astNodeAttribute(index) and
not exists(TSyntheticNode(enclosingEntity, rawNode, _, index))
}
string getType() { result = rawNode.astNodeType() }
AstNode getChild(int index) { result = injectedChild(enclosingEntity, rawNode, index) }
string toString() { result = this.getType() }
Raw::Location getLocation() { result = rawNode.getLocation() }
}
/**
* A synthetic AST node, created to be a leaf for an otherwise non-leaf attribute.
*/
class SyntheticAstNode extends AstNode, TSyntheticNode {
int childIndex;
int attributeIndex;
SyntheticAstNode() {
this = TSyntheticNode(enclosingEntity, rawNode, childIndex, attributeIndex)
}
override string getAttribute(int index) {
result = rawNode.astNodeAttribute(attributeIndex) and index = attributeIndex
}
override string getType() {
result = rawNode.astNodeType() + "::<synthetic " + childIndex + ">"
}
override AstNode getChild(int index) { none() }
}
}
module DatabaseFeatures {
/**
* Exposed as a tool for defining anchors for semantic search.
*/
class UnderlyingFunction = Raw::UnderlyingFunction;
private class Location = Raw::Location;
private newtype TEntityOrAstNode =
TEntity(Wrapped::Entity entity) or
TAstNode(Wrapped::AstNode astNode)
class EntityOrAstNode extends TEntityOrAstNode {
abstract string getType();
abstract string toString();
abstract Location getLocation();
}
class Entity extends EntityOrAstNode, TEntity {
Wrapped::Entity entity;
Entity() { this = TEntity(entity) }
string getName() { result = entity.getName() }
AstNode getAstRoot(int index) { result = TAstNode(entity.getAstRoot(index)) }
override string getType() { result = "javascript function" }
override string toString() { result = "Entity: " + this.getName() }
override Location getLocation() { result = entity.getLocation() }
UnderlyingFunction getDefinedFunction() { result = entity.getDefinedFunction() }
}
class AstNode extends EntityOrAstNode, TAstNode {
Wrapped::AstNode rawNode;
AstNode() { this = TAstNode(rawNode) }
AstNode getChild(int index) { result = TAstNode(rawNode.getChild(index)) }
string getAttribute(int index) { result = rawNode.getAttribute(index) }
override string getType() { result = rawNode.getType() }
override string toString() { result = this.getType() }
override Location getLocation() { result = rawNode.getLocation() }
}
query predicate entities(
Entity entity, string entity_name, string entity_type, string path, int startLine,
int startColumn, int endLine, int endColumn, string absolutePath
) {
entity_name = entity.getName() and
entity_type = entity.getType() and
exists(Location l | l = entity.getLocation() |
path = l.getFile().getRelativePath() and
absolutePath = l.getFile().getAbsolutePath() and
l.hasLocationInfo(_, startLine, startColumn, endLine, endColumn)
)
}
query predicate astNodes(
Entity enclosingEntity, EntityOrAstNode parent, int index, AstNode node, string node_type
) {
node = enclosingEntity.getAstRoot(index) and
parent = enclosingEntity and
node_type = node.getType()
or
astNodes(enclosingEntity, _, _, parent, _) and
node = parent.(AstNode).getChild(index) and
node_type = node.getType()
}
query predicate nodeAttributes(AstNode node, string attr) {
// Only get attributes of AST nodes we extract.
// This excludes nodes in standard libraries since the standard library files
// are located outside the source root.
astNodes(_, _, _, node, _) and
attr = node.getAttribute(_)
}
}

View File

@@ -5,7 +5,6 @@
*/
import javascript
private import CodeToFeatures
private import FeaturizationConfig
private import FunctionBodyFeatures as FunctionBodyFeatures
@@ -19,19 +18,18 @@ private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
(
// Features for endpoints that are contained within a function.
exists(DatabaseFeatures::Entity entity |
entity.getDefinedFunction() =
FunctionBodyFeatures::getRepresentativeFunctionForEndpoint(endpoint)
exists(Function function |
function = FunctionBodyFeatures::getRepresentativeFunctionForEndpoint(endpoint)
|
// The name of the function that encloses the endpoint.
featureName = "enclosingFunctionName" and result = entity.getName()
featureName = "enclosingFunctionName" and result = FunctionNames::getNameToFeaturize(function)
or
// A feature containing natural language tokens from the function that encloses the endpoint in
// the order that they appear in the source code.
featureName = "enclosingFunctionBody" and
result =
strictconcat(string token, Location l |
FunctionBodyFeatures::bodyTokens(entity.getDefinedFunction(), l, token)
FunctionBodyFeatures::bodyTokens(function, l, token)
|
token, " "
order by
@@ -84,11 +82,10 @@ private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
*
* This may in general report multiple strings, each containing a space-separated list of tokens.
*
* **Technical details:** This predicate can have multiple values per endpoint and feature name. As a
* result, the results from this predicate must be concatenated together. However concatenating
* other features like the function body tokens is expensive, so we separate out this predicate
* from others like `FunctionBodies::getBodyTokenFeatureForEntity` to avoid having to perform this
* concatenation operation on other features like the function body tokens.
* **Technical details:** This predicate can have multiple values per endpoint and feature name. As
* a result, the results from this predicate must be concatenated together. However concatenating
* other features like the function body tokens is expensive, so for performance reasons we separate
* out this predicate from those other features.
*/
private string getACallBasedTokenFeatureComponent(
DataFlow::Node endpoint, DataFlow::CallNode call, string featureName
@@ -243,6 +240,57 @@ private module AccessPaths {
}
}
private module FunctionNames {
/**
* Get the name of the function.
*
* We attempt to assign unnamed entities approximate names if they are passed to a likely
* external library function. If we can't assign them an approximate name, we give them the name
* `""`, so that these entities are included in `AdaptiveThreatModeling.qll`.
*
* For entities which have multiple names, we choose the lexically smallest name.
*/
string getNameToFeaturize(Function function) {
if exists(function.getName())
then result = min(function.getName())
else
if exists(getApproximateNameForFunction(function))
then result = getApproximateNameForFunction(function)
else result = ""
}
/**
* Holds if the call `call` has `function` is its `argumentIndex`th argument.
*/
private predicate functionUsedAsArgumentToCall(
Function function, DataFlow::CallNode call, int argumentIndex
) {
DataFlow::localFlowStep*(call.getArgument(argumentIndex), function.flow())
}
/**
* Returns a generated name for the function. This name is generated such that
* entities with the same names have similar behaviour.
*/
private string getApproximateNameForFunction(Function function) {
count(DataFlow::CallNode call, int index | functionUsedAsArgumentToCall(function, call, index)) =
1 and
exists(DataFlow::CallNode call, int index, string basePart |
functionUsedAsArgumentToCall(function, call, index) and
(
if count(getReceiverName(call)) = 1
then basePart = getReceiverName(call) + "."
else basePart = ""
) and
result = basePart + call.getCalleeName() + "#functionalargument"
)
}
private string getReceiverName(DataFlow::CallNode call) {
result = call.getReceiver().asExpr().(VarAccess).getName()
}
}
/** Get a name of a supported generic token-based feature. */
private string getASupportedFeatureName() {
result =