Merge branch 'main' into codeql-ci/atm/release-0.4.2

This commit is contained in:
Henry Mercer
2022-11-24 14:41:49 +00:00
1559 changed files with 129828 additions and 82528 deletions

View File

@@ -1,6 +1,8 @@
# [Internal only] Adaptive Threat Modeling for JavaScript
# Adaptive Threat Modeling for JavaScript
This directory contains CodeQL libraries and queries that power adaptive threat modeling for JavaScript.
All APIs are experimental and may change in the future.
These queries can only be run by internal users; for external users they will return no results.
Only internal users can run these queries directly. External users can run these queries when performing
JavaScript analysis on Code Scanning. For more information, see
[Code scanning finds more vulnerabilities using machine learning](https://github.blog/2022-02-17-code-scanning-finds-vulnerabilities-using-machine-learning/).

View File

@@ -1,4 +1,4 @@
/*
/**
* For internal use only.
*
* Configures boosting for adaptive threat modeling (ATM).
@@ -50,7 +50,8 @@ abstract class AtmConfig extends string {
// known sink for the class.
exists(EndpointCharacteristic characteristic |
characteristic.getEndpoints(sink) and
characteristic.getImplications(this.getASinkEndpointType(), true, 1.0)
characteristic
.getImplications(this.getASinkEndpointType(), true, characteristic.maximalConfidence())
)
}

View File

@@ -1,4 +1,4 @@
/*
/**
* For internal use only.
*
* Provides information about the results of boosted queries for use in adaptive threat modeling (ATM).

View File

@@ -1,4 +1,4 @@
/*
/**
* For internal use only.
*
* Provides shared scoring functionality for use in adaptive threat modeling (ATM).

View File

@@ -1,4 +1,4 @@
/*
/**
* For internal use only.
*
* Provides predicates that expose the knowledge of models

View File

@@ -30,18 +30,39 @@ abstract class EndpointCharacteristic extends string {
/**
* This predicate describes what the characteristic tells us about an endpoint.
*
* Params:
* endpointClass: Class 0 is the negative class. Each positive int corresponds to a single sink type.
* isPositiveIndicator: Does this characteristic indicate this endpoint _is_ a member of the class, or that it
* _isn't_ a member of the class?
* confidence: A number in [0, 1], which tells us how strong an indicator this characteristic is for the endpoint
* belonging / not belonging to the given class.
* Params:
* endpointClass: The sink type. Each EndpointType has a predicate getEncoding, which specifies the classifier
* class for this sink type. Class 0 is the negative class (non-sink). Each positive int corresponds to a single
* sink type.
* isPositiveIndicator: If true, this characteristic indicates that this endpoint _is_ a member of the class; if
* false, it indicates that it _isn't_ a member of the class.
* confidence: A float in [0, 1], which tells us how strong an indicator this characteristic is for the endpoint
* belonging / not belonging to the given class. A confidence near zero means this characteristic is a very weak
* indicator of whether or not the endpoint belongs to the class. A confidence of 1 means that all endpoints with
* this characteristic definitively do/don't belong to the class.
*/
abstract predicate getImplications(
EndpointType endpointClass, boolean isPositiveIndicator, float confidence
);
/** Indicators with confidence at or above this threshold are considered to be high-confidence indicators. */
final float getHighConfidenceThreshold() { result = 0.8 }
// The following are some confidence values that are used in practice by the subclasses. They are defined as named
// constants here to make it easier to change them in the future.
final float maximalConfidence() { result = 1.0 }
final float highConfidence() { result = 0.9 }
final float mediumConfidence() { result = 0.6 }
}
/*
* Characteristics that are indicative of a sink.
* NOTE: Initially each sink type has only one characteristic, which is that it's a sink of this type in the standard
* JavaScript libraries.
*/
/**
* Endpoints identified as "DomBasedXssSink" by the standard JavaScript libraries are XSS sinks with maximal confidence.
*/
@@ -53,7 +74,9 @@ private class DomBasedXssSinkCharacteristic extends EndpointCharacteristic {
override predicate getImplications(
EndpointType endpointClass, boolean isPositiveIndicator, float confidence
) {
endpointClass instanceof XssSinkType and isPositiveIndicator = true and confidence = 1.0
endpointClass instanceof XssSinkType and
isPositiveIndicator = true and
confidence = maximalConfidence()
}
}
@@ -69,7 +92,9 @@ private class TaintedPathSinkCharacteristic extends EndpointCharacteristic {
override predicate getImplications(
EndpointType endpointClass, boolean isPositiveIndicator, float confidence
) {
endpointClass instanceof TaintedPathSinkType and isPositiveIndicator = true and confidence = 1.0
endpointClass instanceof TaintedPathSinkType and
isPositiveIndicator = true and
confidence = maximalConfidence()
}
}
@@ -87,7 +112,7 @@ private class SqlInjectionSinkCharacteristic extends EndpointCharacteristic {
) {
endpointClass instanceof SqlInjectionSinkType and
isPositiveIndicator = true and
confidence = 1.0
confidence = maximalConfidence()
}
}
@@ -105,6 +130,315 @@ private class NosqlInjectionSinkCharacteristic extends EndpointCharacteristic {
) {
endpointClass instanceof NosqlInjectionSinkType and
isPositiveIndicator = true and
confidence = 1.0
confidence = maximalConfidence()
}
}
/*
* Characteristics that are indicative of not being a sink of any type.
*/
/**
* A characteristic that is an indicator of not being a sink of any type, because it's an argument to a function of a
* builtin object.
*/
abstract private class ArgumentToBuiltinFunctionCharacteristic extends EndpointCharacteristic {
bindingset[this]
ArgumentToBuiltinFunctionCharacteristic() { any() }
}
/**
* A high-confidence characteristic that indicates that an endpoint is not a sink of any type.
*/
abstract private class NotASinkCharacteristic extends EndpointCharacteristic {
bindingset[this]
NotASinkCharacteristic() { any() }
override predicate getImplications(
EndpointType endpointClass, boolean isPositiveIndicator, float confidence
) {
endpointClass instanceof NegativeType and
isPositiveIndicator = true and
confidence = highConfidence()
}
}
/**
* A medium-confidence characteristic that indicates that an endpoint is not a sink of any type.
*
* TODO: This class is currently not private, because the current extraction logic explicitly avoids including these
* endpoints in the training data. We might want to change this in the future.
*/
abstract class LikelyNotASinkCharacteristic extends EndpointCharacteristic {
bindingset[this]
LikelyNotASinkCharacteristic() { any() }
override predicate getImplications(
EndpointType endpointClass, boolean isPositiveIndicator, float confidence
) {
endpointClass instanceof NegativeType and
isPositiveIndicator = true and
confidence = mediumConfidence()
}
}
private class LodashUnderscore extends NotASinkCharacteristic {
LodashUnderscore() { this = "LodashUnderscoreArgument" }
override predicate getEndpoints(DataFlow::Node n) {
any(LodashUnderscore::Member m).getACall().getAnArgument() = n
}
}
private class JQueryArgumentCharacteristic extends NotASinkCharacteristic {
JQueryArgumentCharacteristic() { this = "JQueryArgument" }
override predicate getEndpoints(DataFlow::Node n) {
any(JQuery::MethodCall m).getAnArgument() = n
}
}
private class ClientRequestCharacteristic extends NotASinkCharacteristic {
ClientRequestCharacteristic() { this = "ClientRequest" }
override predicate getEndpoints(DataFlow::Node n) {
exists(ClientRequest r |
r.getAnArgument() = n or n = r.getUrl() or n = r.getHost() or n = r.getADataNode()
)
}
}
private class PromiseDefinitionCharacteristic extends NotASinkCharacteristic {
PromiseDefinitionCharacteristic() { this = "PromiseDefinition" }
override predicate getEndpoints(DataFlow::Node n) {
exists(PromiseDefinition p |
n = [p.getResolveParameter(), p.getRejectParameter()].getACall().getAnArgument()
)
}
}
private class CryptographicKeyCharacteristic extends NotASinkCharacteristic {
CryptographicKeyCharacteristic() { this = "CryptographicKey" }
override predicate getEndpoints(DataFlow::Node n) { n instanceof CryptographicKey }
}
private class CryptographicOperationFlowCharacteristic extends NotASinkCharacteristic {
CryptographicOperationFlowCharacteristic() { this = "CryptographicOperationFlow" }
override predicate getEndpoints(DataFlow::Node n) {
any(CryptographicOperation op).getInput() = n
}
}
private class LoggerMethodCharacteristic extends NotASinkCharacteristic {
LoggerMethodCharacteristic() { this = "LoggerMethod" }
override predicate getEndpoints(DataFlow::Node n) {
exists(DataFlow::CallNode call | n = call.getAnArgument() |
call.getCalleeName() = getAStandardLoggerMethodName()
)
}
}
private class TimeoutCharacteristic extends NotASinkCharacteristic {
TimeoutCharacteristic() { this = "Timeout" }
override predicate getEndpoints(DataFlow::Node n) {
exists(DataFlow::CallNode call | n = call.getAnArgument() |
call.getCalleeName() = ["setTimeout", "clearTimeout"]
)
}
}
private class ReceiverStorageCharacteristic extends NotASinkCharacteristic {
ReceiverStorageCharacteristic() { this = "ReceiverStorage" }
override predicate getEndpoints(DataFlow::Node n) {
exists(DataFlow::CallNode call | n = call.getAnArgument() |
call.getReceiver() = DataFlow::globalVarRef(["localStorage", "sessionStorage"])
)
}
}
private class StringStartsWithCharacteristic extends NotASinkCharacteristic {
StringStartsWithCharacteristic() { this = "StringStartsWith" }
override predicate getEndpoints(DataFlow::Node n) {
exists(DataFlow::CallNode call | n = call.getAnArgument() |
call instanceof StringOps::StartsWith
)
}
}
private class StringEndsWithCharacteristic extends NotASinkCharacteristic {
StringEndsWithCharacteristic() { this = "StringEndsWith" }
override predicate getEndpoints(DataFlow::Node n) {
exists(DataFlow::CallNode call | n = call.getAnArgument() | call instanceof StringOps::EndsWith)
}
}
private class StringRegExpTestCharacteristic extends NotASinkCharacteristic {
StringRegExpTestCharacteristic() { this = "StringRegExpTest" }
override predicate getEndpoints(DataFlow::Node n) {
exists(DataFlow::CallNode call | n = call.getAnArgument() |
call instanceof StringOps::RegExpTest
)
}
}
private class EventRegistrationCharacteristic extends NotASinkCharacteristic {
EventRegistrationCharacteristic() { this = "EventRegistration" }
override predicate getEndpoints(DataFlow::Node n) {
exists(DataFlow::CallNode call | n = call.getAnArgument() | call instanceof EventRegistration)
}
}
private class EventDispatchCharacteristic extends NotASinkCharacteristic {
EventDispatchCharacteristic() { this = "EventDispatch" }
override predicate getEndpoints(DataFlow::Node n) {
exists(DataFlow::CallNode call | n = call.getAnArgument() | call instanceof EventDispatch)
}
}
private class MembershipCandidateTestCharacteristic extends NotASinkCharacteristic {
MembershipCandidateTestCharacteristic() { this = "MembershipCandidateTest" }
override predicate getEndpoints(DataFlow::Node n) {
exists(DataFlow::CallNode call | n = call.getAnArgument() |
call = any(MembershipCandidate c).getTest()
)
}
}
private class FileSystemAccessCharacteristic extends NotASinkCharacteristic {
FileSystemAccessCharacteristic() { this = "FileSystemAccess" }
override predicate getEndpoints(DataFlow::Node n) {
exists(DataFlow::CallNode call | n = call.getAnArgument() | call instanceof FileSystemAccess)
}
}
private class DatabaseAccessCharacteristic extends NotASinkCharacteristic {
DatabaseAccessCharacteristic() { this = "DatabaseAccess" }
override predicate getEndpoints(DataFlow::Node n) {
// TODO database accesses are less well defined than database query sinks, so this may cover unmodeled sinks on
// existing database models
exists(DataFlow::CallNode call | n = call.getAnArgument() |
[
call, call.getAMethodCall()
/* command pattern where the query is built, and then exec'ed later */ ] instanceof
DatabaseAccess
)
}
}
private class DomCharacteristic extends NotASinkCharacteristic {
DomCharacteristic() { this = "DOM" }
override predicate getEndpoints(DataFlow::Node n) {
exists(DataFlow::CallNode call | n = call.getAnArgument() | call = DOM::domValueRef())
}
}
private class NextFunctionCallCharacteristic extends NotASinkCharacteristic {
NextFunctionCallCharacteristic() { this = "NextFunctionCall" }
override predicate getEndpoints(DataFlow::Node n) {
exists(DataFlow::CallNode call | n = call.getAnArgument() |
call.getCalleeName() = "next" and
exists(DataFlow::FunctionNode f | call = f.getLastParameter().getACall())
)
}
}
private class DojoRequireCharacteristic extends NotASinkCharacteristic {
DojoRequireCharacteristic() { this = "DojoRequire" }
override predicate getEndpoints(DataFlow::Node n) {
exists(DataFlow::CallNode call | n = call.getAnArgument() |
call = DataFlow::globalVarRef("dojo").getAPropertyRead("require").getACall()
)
}
}
private class Base64ManipulationCharacteristic extends NotASinkCharacteristic {
Base64ManipulationCharacteristic() { this = "Base64Manipulation" }
override predicate getEndpoints(DataFlow::Node n) {
exists(Base64::Decode d | n = d.getInput()) or
exists(Base64::Encode d | n = d.getInput())
}
}
private class ArgumentToArrayCharacteristic extends ArgumentToBuiltinFunctionCharacteristic,
LikelyNotASinkCharacteristic {
ArgumentToArrayCharacteristic() { this = "ArgumentToArray" }
override predicate getEndpoints(DataFlow::Node n) {
exists(DataFlow::SourceNode builtin, DataFlow::SourceNode receiver, DataFlow::InvokeNode invk |
builtin instanceof DataFlow::ArrayCreationNode
|
receiver = [builtin.getAnInvocation(), builtin] and
invk = [receiver, receiver.getAPropertyRead()].getAnInvocation() and
invk.getAnArgument() = n
)
}
}
private class ArgumentToBuiltinGlobalVarRefCharacteristic extends ArgumentToBuiltinFunctionCharacteristic,
LikelyNotASinkCharacteristic {
ArgumentToBuiltinGlobalVarRefCharacteristic() { this = "ArgumentToBuiltinGlobalVarRef" }
override predicate getEndpoints(DataFlow::Node n) {
exists(DataFlow::SourceNode builtin, DataFlow::SourceNode receiver, DataFlow::InvokeNode invk |
builtin =
DataFlow::globalVarRef([
"Map", "Set", "WeakMap", "WeakSet", "Number", "Object", "String", "Array", "Error",
"Math", "Boolean"
])
|
receiver = [builtin.getAnInvocation(), builtin] and
invk = [receiver, receiver.getAPropertyRead()].getAnInvocation() and
invk.getAnArgument() = n
)
}
}
private class ConstantReceiverCharacteristic extends ArgumentToBuiltinFunctionCharacteristic,
NotASinkCharacteristic {
ConstantReceiverCharacteristic() { this = "ConstantReceiver" }
override predicate getEndpoints(DataFlow::Node n) {
exists(Expr primitive, MethodCallExpr c |
primitive instanceof ConstantString or
primitive instanceof NumberLiteral or
primitive instanceof BooleanLiteral
|
c.calls(primitive, _) and
c.getAnArgument() = n.asExpr()
)
}
}
private class BuiltinCallNameCharacteristic extends ArgumentToBuiltinFunctionCharacteristic,
NotASinkCharacteristic {
BuiltinCallNameCharacteristic() { this = "BuiltinCallName" }
override predicate getEndpoints(DataFlow::Node n) {
exists(DataFlow::CallNode call |
call.getAnArgument() = n and
call.getCalleeName() =
[
"indexOf", "hasOwnProperty", "substring", "isDecimal", "decode", "encode", "keys",
"shift", "values", "forEach", "toString", "slice", "splice", "push", "isArray", "sort"
]
)
}
}

View File

@@ -1,4 +1,4 @@
/*
/**
* For internal use only.
*
* Extracts data about the database for use in adaptive threat modeling (ATM).

View File

@@ -1,4 +1,4 @@
/*
/**
* For internal use only.
*
* Provides an implementation of scoring alerts for use in adaptive threat modeling (ATM).

View File

@@ -16,6 +16,11 @@ newtype TEndpointType =
abstract class EndpointType extends TEndpointType {
abstract string getDescription();
/**
* Gets the integer representation of this endpoint type. This integer representation specifies the class number
* used by the endpoint scoring model (the classifier) to represent this endpoint type. Class 0 is the negative
* class (non-sink). Each positive int corresponds to a single sink type.
*/
abstract int getEncoding();
string toString() { result = getDescription() }

View File

@@ -1,4 +1,4 @@
/*
/**
* FunctionBodyFeatures.qll
*
* Contains logic relating to the `enclosingFunctionBody` and `enclosingFunctionName` features.

View File

@@ -1,4 +1,5 @@
name: codeql/javascript-experimental-atm-lib
description: CodeQL libraries for the experimental ML-powered queries
version: 0.4.3
extractor: javascript
library: true

View File

@@ -1,4 +1,5 @@
name: codeql/javascript-experimental-atm-model
description: Machine learning model supporting the experimental ML-powered queries
version: 0.3.1
groups:
- javascript

View File

@@ -11,7 +11,7 @@
import javascript
import experimental.adaptivethreatmodeling.ATMConfig
import extraction.ExtractEndpointData
import extraction.ExtractEndpointDataTraining
string getAReasonSinkExcluded(DataFlow::Node sinkCandidate, Query query) {
query instanceof NosqlInjectionQuery and
@@ -33,7 +33,7 @@ string getDescriptionForAlertCandidate(
) {
result = "excluded[reason=" + getAReasonSinkExcluded(sinkCandidate, query) + "]"
or
getAtmCfg(query).isKnownSink(sinkCandidate) and
getDataFlowCfg(query).(AtmConfig).isKnownSink(sinkCandidate) and
result = "excluded[reason=known-sink]"
or
not exists(getAReasonSinkExcluded(sinkCandidate, query)) and

View File

@@ -1,4 +1,4 @@
/*
/**
* For internal use only.
*
*

View File

@@ -1,4 +1,4 @@
/*
/**
* For internal use only.
*
* Defines files that should be excluded from the evaluation of ML models.

View File

@@ -1,11 +0,0 @@
/*
* For internal use only.
*
* Extracts training and evaluation data we can use to train ML models for ML-powered queries.
*/
import ExtractEndpointData as ExtractEndpointData
query predicate endpoints = ExtractEndpointData::endpoints/5;
query predicate tokenFeatures = ExtractEndpointData::tokenFeatures/3;

View File

@@ -1,215 +0,0 @@
/*
* For internal use only.
*
* Library code for training and evaluation data we can use to train ML models for ML-powered
* queries.
*/
import javascript
import Exclusions as Exclusions
import evaluation.EndToEndEvaluation as EndToEndEvaluation
import experimental.adaptivethreatmodeling.ATMConfig
import experimental.adaptivethreatmodeling.CoreKnowledge as CoreKnowledge
import experimental.adaptivethreatmodeling.EndpointFeatures as EndpointFeatures
import experimental.adaptivethreatmodeling.EndpointScoring as EndpointScoring
import experimental.adaptivethreatmodeling.EndpointTypes
import experimental.adaptivethreatmodeling.FilteringReasons
import experimental.adaptivethreatmodeling.NosqlInjectionATM as NosqlInjectionAtm
/** DEPRECATED: Alias for NosqlInjectionAtm */
deprecated module NosqlInjectionATM = NosqlInjectionAtm;
import experimental.adaptivethreatmodeling.SqlInjectionATM as SqlInjectionAtm
/** DEPRECATED: Alias for SqlInjectionAtm */
deprecated module SqlInjectionATM = SqlInjectionAtm;
import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
/** DEPRECATED: Alias for TaintedPathAtm */
deprecated module TaintedPathATM = TaintedPathAtm;
import experimental.adaptivethreatmodeling.XssATM as XssAtm
/** DEPRECATED: Alias for XssAtm */
deprecated module XssATM = XssAtm;
import Labels
import NoFeaturizationRestrictionsConfig
import Queries
/** Gets the ATM configuration object for the specified query. */
AtmConfig getAtmCfg(Query query) {
query instanceof NosqlInjectionQuery and
result instanceof NosqlInjectionAtm::NosqlInjectionAtmConfig
or
query instanceof SqlInjectionQuery and result instanceof SqlInjectionAtm::SqlInjectionAtmConfig
or
query instanceof TaintedPathQuery and result instanceof TaintedPathAtm::TaintedPathAtmConfig
or
query instanceof XssQuery and result instanceof XssAtm::DomBasedXssAtmConfig
}
/** DEPRECATED: Alias for getAtmCfg */
deprecated ATMConfig getATMCfg(Query query) { result = getAtmCfg(query) }
/** Gets the ATM data flow configuration for the specified query. */
DataFlow::Configuration getDataFlowCfg(Query query) {
query instanceof NosqlInjectionQuery and result instanceof NosqlInjectionAtm::Configuration
or
query instanceof SqlInjectionQuery and result instanceof SqlInjectionAtm::Configuration
or
query instanceof TaintedPathQuery and result instanceof TaintedPathAtm::Configuration
or
query instanceof XssQuery and result instanceof XssAtm::Configuration
}
/** Gets a known sink for the specified query. */
private DataFlow::Node getASink(Query query) {
getAtmCfg(query).isKnownSink(result) and
// Only consider the source code for the project being analyzed.
exists(result.getFile().getRelativePath())
}
/** Gets a data flow node that is known not to be a sink for the specified query. */
private DataFlow::Node getANotASink(NotASinkReason reason) {
CoreKnowledge::isOtherModeledArgument(result, reason) and
// Some endpoints can be assigned both a `NotASinkReason` and a `LikelyNotASinkReason`. We
// consider these endpoints to be `LikelyNotASink`, therefore this line excludes them from the
// definition of `NotASink`.
not CoreKnowledge::isOtherModeledArgument(result, any(LikelyNotASinkReason t)) and
not result = getASink(_) and
// Only consider the source code for the project being analyzed.
exists(result.getFile().getRelativePath())
}
/**
* Gets a data flow node whose label is unknown for the specified query.
*
* In other words, this is an endpoint that is not `Sink`, `NotASink`, or `LikelyNotASink` for the
* specified query.
*/
private DataFlow::Node getAnUnknown(Query query) {
getAtmCfg(query).isEffectiveSink(result) and
// Effective sinks should exclude sinks but this is a defensive requirement
not result = getASink(query) and
// Effective sinks should exclude NotASink but for some queries (e.g. Xss) this is currently not always the case and
// so this is a defensive requirement
not result = getANotASink(_) and
// Only consider the source code for the project being analyzed.
exists(result.getFile().getRelativePath())
}
/** Gets the query-specific sink label for the given endpoint, if such a label exists. */
private EndpointLabel getSinkLabelForEndpoint(DataFlow::Node endpoint, Query query) {
endpoint = getASink(query) and result instanceof SinkLabel
or
endpoint = getANotASink(_) and result instanceof NotASinkLabel
or
endpoint = getAnUnknown(query) and result instanceof UnknownLabel
}
/** Gets an endpoint that should be extracted. */
DataFlow::Node getAnEndpoint(Query query) { exists(getSinkLabelForEndpoint(result, query)) }
/**
* Endpoints and associated metadata.
*
* Note that we draw a distinction between _features_, that are provided to the model at training
* and query time, and _metadata_, that is only provided to the model at training time.
*
* Internal: See the design document for
* [extensible extraction queries](https://docs.google.com/document/d/1g3ci2Nf1hGMG6ZUP0Y4PqCy_8elcoC_dhBvgTxdAWpg)
* for technical information about the design of this predicate.
*/
predicate endpoints(
DataFlow::Node endpoint, string queryName, string key, string value, string valueType
) {
exists(Query query |
// Only provide metadata for labelled endpoints, since we do not extract all endpoints.
endpoint = getAnEndpoint(query) and
queryName = query.getName() and
(
// Holds if there is a taint flow path from a known source to the endpoint
key = "hasFlowFromSource" and
(
if FlowFromSource::hasFlowFromSource(endpoint, query)
then value = "true"
else value = "false"
) and
valueType = "boolean"
or
// Constant expressions always evaluate to a constant primitive value. Therefore they can't ever
// appear in an alert, making them less interesting training examples.
key = "isConstantExpression" and
(if endpoint.asExpr() instanceof ConstantExpr then value = "true" else value = "false") and
valueType = "boolean"
or
// Holds if alerts involving the endpoint are excluded from the end-to-end evaluation.
key = "isExcludedFromEndToEndEvaluation" and
(if Exclusions::isFileExcluded(endpoint.getFile()) then value = "true" else value = "false") and
valueType = "boolean"
or
// The label for this query, considering the endpoint as a sink.
key = "sinkLabel" and
value = getSinkLabelForEndpoint(endpoint, query).getEncoding() and
valueType = "string"
or
// The reason, or reasons, why the endpoint was labeled NotASink for this query.
key = "notASinkReason" and
exists(FilteringReason reason |
endpoint = getANotASink(reason) and
value = reason.getDescription()
) and
valueType = "string"
)
)
}
/**
* `EndpointFeatures::tokenFeatures` has no results when `featureName` is absent for the endpoint
* `endpoint`. To preserve compatibility with the data pipeline, this relation will instead set
* `featureValue` to the empty string in this case.
*/
predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string featureValue) {
endpoints(endpoint, _, _, _, _) and
(
EndpointFeatures::tokenFeatures(endpoint, featureName, featureValue)
or
// Performance note: this creates a Cartesian product between `endpoint` and `featureName`.
featureName = EndpointFeatures::getASupportedFeatureName() and
not exists(string value | EndpointFeatures::tokenFeatures(endpoint, featureName, value)) and
featureValue = ""
)
}
module FlowFromSource {
predicate hasFlowFromSource(DataFlow::Node endpoint, Query q) {
exists(Configuration cfg | cfg.getQuery() = q | cfg.hasFlow(_, endpoint))
}
/**
* A data flow configuration that replicates the data flow configuration for a specific query, but
* replaces the set of sinks with the set of endpoints we're extracting.
*
* We use this to find out when there is flow to a particular endpoint from a known source.
*
* This configuration behaves in a very similar way to the `ForwardExploringConfiguration` class
* from the CodeQL standard libraries for JavaScript.
*/
private class Configuration extends DataFlow::Configuration {
Query q;
Configuration() { this = getDataFlowCfg(q) }
Query getQuery() { result = q }
/** Holds if `sink` is an endpoint we're extracting. */
override predicate isSink(DataFlow::Node sink) { sink = getAnEndpoint(q) }
/** Holds if `sink` is an endpoint we're extracting. */
override predicate isSink(DataFlow::Node sink, DataFlow::FlowLabel lbl) {
sink = getAnEndpoint(q) and exists(lbl)
}
}
}

View File

@@ -4,23 +4,8 @@
* Extracts training data we can use to train ML models for ML-powered queries.
*/
import javascript
import ExtractEndpointData as ExtractEndpointData
private import ExtractEndpointDataTraining as ExtractEndpointDataTraining
query predicate endpoints(
DataFlow::Node endpoint, string queryName, string key, string value, string valueType
) {
ExtractEndpointData::endpoints(endpoint, queryName, key, value, valueType) and
// only select endpoints that are either Sink or NotASink
ExtractEndpointData::endpoints(endpoint, queryName, "sinkLabel", ["Sink", "NotASink"], "string") and
// do not select endpoints filtered out by end-to-end evaluation
ExtractEndpointData::endpoints(endpoint, queryName, "isExcludedFromEndToEndEvaluation", "false",
"boolean") and
// only select endpoints that can be part of a tainted flow
ExtractEndpointData::endpoints(endpoint, queryName, "isConstantExpression", "false", "boolean")
}
query predicate endpoints = ExtractEndpointDataTraining::reformattedTrainingEndpoints/5;
query predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string featureValue) {
endpoints(endpoint, _, _, _, _) and
ExtractEndpointData::tokenFeatures(endpoint, featureName, featureValue)
}
query predicate tokenFeatures = ExtractEndpointDataTraining::tokenFeatures/3;

View File

@@ -0,0 +1,238 @@
/**
* For internal use only.
*
* Extracts training data we can use to train ML models for ML-powered queries.
*/
import javascript
import experimental.adaptivethreatmodeling.EndpointCharacteristics
import experimental.adaptivethreatmodeling.EndpointFeatures as EndpointFeatures
import NoFeaturizationRestrictionsConfig
private import Exclusions as Exclusions
import Queries
import experimental.adaptivethreatmodeling.NosqlInjectionATM as NosqlInjectionAtm
import experimental.adaptivethreatmodeling.SqlInjectionATM as SqlInjectionAtm
import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
import experimental.adaptivethreatmodeling.XssATM as XssAtm
/**
* Gets the set of featureName-featureValue pairs for each endpoint in the training set.
*
* `EndpointFeatures::tokenFeatures` has no results when `featureName` is absent for the endpoint
* `endpoint`. To preserve compatibility with the data pipeline, this relation will instead set
* `featureValue` to the empty string in this case.
*/
predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string featureValue) {
trainingEndpoints(endpoint, _, _) and
(
EndpointFeatures::tokenFeatures(endpoint, featureName, featureValue)
or
// Performance note: this creates a Cartesian product between `endpoint` and `featureName`.
featureName = EndpointFeatures::getASupportedFeatureName() and
not exists(string value | EndpointFeatures::tokenFeatures(endpoint, featureName, value)) and
featureValue = ""
)
}
/**
* Holds if the given endpoint should be included in the training set as a sample belonging to endpointClass, and has
* the given characteristic. This query uses the endpoint characteristics to select and label endpoints for the training
* set, and provides a list of characteristics for each endpoint in the training set, which is used in the modeling
* code.
*
* Params:
* endpoint: The endpoint to include / exclude.
* endpointClass: The sink type. See the documentation of EndpointType.getEncoding for details about the relationship
* between an EndpointType and a class in the classifier.
* characteristic: Provides the list of characteristics that apply to the endpoint, which the modeling code currently
* uses for type balancing.
*
* Note: This predicate will produce multiple tuples for endpoints that have multiple characteristics, which we must
* then group together into a list of characteristics.
*/
query predicate trainingEndpoints(
DataFlow::Node endpoint, EndpointType endpointClass, EndpointCharacteristic characteristic
) {
characteristic.getEndpoints(endpoint) and
// Only consider the source code for the project being analyzed.
exists(endpoint.getFile().getRelativePath()) and
// Only select endpoints that can be part of a tainted flow: Constant expressions always evaluate to a constant
// primitive value. Therefore they can't ever appear in an alert, making them less interesting training examples.
// TODO: Experiment with removing this requirement.
not endpoint.asExpr() instanceof ConstantExpr and
// Do not select endpoints filtered out by end-to-end evaluation.
// TODO: Experiment with removing this requirement.
not Exclusions::isFileExcluded(endpoint.getFile()) and
// Filter out negative examples that also have a LikelyNotASinkReason, because this is currently done here
// https://github.com/github/codeql/blob/387e57546bf7352f7c1cfe781daa1a3799b7063e/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/extraction/ExtractEndpointData.qll#L77
// TODO: Experiment with removing this requirement.
not (
endpointClass instanceof NegativeType and
exists(EndpointCharacteristic c |
c.getEndpoints(endpoint) and
c instanceof LikelyNotASinkCharacteristic
)
) and
(
// If the list of characteristics includes positive indicators with high confidence for this class, select this as a
// training sample belonging to the class.
exists(EndpointCharacteristic characteristic2, float confidence |
characteristic2.getEndpoints(endpoint) and
characteristic2.getImplications(endpointClass, true, confidence) and
confidence >= characteristic2.getHighConfidenceThreshold()
) and
(
// Temporarily limit this only to positive classes. For negative classes, additionally select only endpoints that
// have no high confidence indicators that they are sinks, because this is what was previously done.
// TODO: Experiment with removing this requirement, and instead ensuring that an endpoint never has both a high
// confidence indicator that it _is_ a sink and a high confidence indicator that it is _not_ a sink.
not endpointClass instanceof NegativeType
or
not exists(EndpointCharacteristic characteristic3, float confidence3, EndpointType posClass |
characteristic3.getEndpoints(endpoint) and
characteristic3.getImplications(posClass, true, confidence3) and
confidence3 >= characteristic3.getHighConfidenceThreshold() and
not posClass instanceof NegativeType
)
)
or
// If the list of characteristics includes negative indicators with high confidence for all classes other than 0,
// select this as a training sample of class 0 (this means we had query-specific characteristics to decide this
// endpoint isn't a sink for each of our sink types).
endpointClass instanceof NegativeType and
forall(EndpointType otherClass | not otherClass instanceof NegativeType |
exists(EndpointCharacteristic characteristic2, float confidence |
characteristic2.getEndpoints(endpoint) and
characteristic2.getImplications(otherClass, false, confidence) and
confidence >= characteristic2.getHighConfidenceThreshold()
)
)
)
}
/**
* Temporary:
* Reformat the training data that was extracted with the new logic to match the format produced by the old predicate.
* This is the format expected by the endpoint pipeline.
*/
query predicate reformattedTrainingEndpoints(
DataFlow::Node endpoint, string queryName, string key, string value, string valueType
) {
trainingEndpoints(endpoint, _, _) and
exists(Query query |
queryName = query.getName() and
// For sinks, only list that sink type, but for non-sinks, list all sink types.
(
exists(EndpointType endpointClass |
endpointClass.getDescription().matches(queryName + "%") and
not endpointClass instanceof NegativeType and
trainingEndpoints(endpoint, endpointClass, _)
)
or
exists(EndpointType endpointClass |
endpointClass instanceof NegativeType and
trainingEndpoints(endpoint, endpointClass, _)
)
) and
(
// NOTE: We don't use hasFlowFromSource in training, so we could just hardcode it to be false.
key = "hasFlowFromSource" and
(
if FlowFromSource::hasFlowFromSource(endpoint, query)
then value = "true"
else value = "false"
) and
valueType = "boolean"
or
// Constant expressions always evaluate to a constant primitive value. Therefore they can't ever
// appear in an alert, making them less interesting training examples.
key = "isConstantExpression" and
(if endpoint.asExpr() instanceof ConstantExpr then value = "true" else value = "false") and
valueType = "boolean"
or
// Holds if alerts involving the endpoint are excluded from the end-to-end evaluation.
key = "isExcludedFromEndToEndEvaluation" and
(if Exclusions::isFileExcluded(endpoint.getFile()) then value = "true" else value = "false") and
valueType = "boolean"
or
// The label for this query, considering the endpoint as a sink.
key = "sinkLabel" and
valueType = "string" and
value = "Sink" and
exists(EndpointType endpointClass |
endpointClass.getDescription().matches(queryName + "%") and
not endpointClass instanceof NegativeType and
trainingEndpoints(endpoint, endpointClass, _)
)
or
key = "sinkLabel" and
valueType = "string" and
value = "NotASink" and
exists(EndpointType endpointClass |
endpointClass instanceof NegativeType and
trainingEndpoints(endpoint, endpointClass, _)
)
or
// The reason, or reasons, why the endpoint was labeled NotASink for this query, only for negative examples.
key = "notASinkReason" and
exists(EndpointCharacteristic characteristic, EndpointType endpointClass |
characteristic.getEndpoints(endpoint) and
characteristic.getImplications(endpointClass, true, _) and
endpointClass instanceof NegativeType and
value = characteristic
) and
// Don't include a notASinkReason for endpoints that are also known sinks.
not exists(EndpointCharacteristic characteristic3, float confidence3, EndpointType posClass |
characteristic3.getEndpoints(endpoint) and
characteristic3.getImplications(posClass, true, confidence3) and
confidence3 >= characteristic3.getHighConfidenceThreshold() and
not posClass instanceof NegativeType
) and
valueType = "string"
)
)
}
/**
* Gets the ATM data flow configuration for the specified query.
* TODO: Delete this once we are no longer surfacing `hasFlowFromSource`.
*/
DataFlow::Configuration getDataFlowCfg(Query query) {
query instanceof NosqlInjectionQuery and result instanceof NosqlInjectionAtm::Configuration
or
query instanceof SqlInjectionQuery and result instanceof SqlInjectionAtm::Configuration
or
query instanceof TaintedPathQuery and result instanceof TaintedPathAtm::Configuration
or
query instanceof XssQuery and result instanceof XssAtm::Configuration
}
// TODO: Delete this once we are no longer surfacing `hasFlowFromSource`.
private module FlowFromSource {
predicate hasFlowFromSource(DataFlow::Node endpoint, Query q) {
exists(Configuration cfg | cfg.getQuery() = q | cfg.hasFlow(_, endpoint))
}
/**
* A data flow configuration that replicates the data flow configuration for a specific query, but
* replaces the set of sinks with the set of endpoints we're extracting.
*
* We use this to find out when there is flow to a particular endpoint from a known source.
*
* This configuration behaves in a very similar way to the `ForwardExploringConfiguration` class
* from the CodeQL standard libraries for JavaScript.
*/
private class Configuration extends DataFlow::Configuration {
Query q;
Configuration() { this = getDataFlowCfg(q) }
Query getQuery() { result = q }
/** Holds if `sink` is an endpoint we're extracting. */
override predicate isSink(DataFlow::Node sink) { any() }
/** Holds if `sink` is an endpoint we're extracting. */
override predicate isSink(DataFlow::Node sink, DataFlow::FlowLabel lbl) { exists(lbl) }
}
}

View File

@@ -1,4 +1,5 @@
name: codeql/javascript-experimental-atm-model-building
description: CodeQL libraries for building machine learning models for the experimental ML-powered queries
extractor: javascript
library: false
groups:

View File

@@ -1,4 +1,5 @@
name: codeql/javascript-experimental-atm-queries
description: Experimental ML-powered queries for JavaScript
language: javascript
version: 0.4.3
suites: codeql-suites