Compare commits

...

14 Commits

Author SHA1 Message Date
tiferet
2bbd010db1 Start work on extracting the "training" part of the prompt from positive and negative examples in the current repo. 2022-12-01 17:10:34 -08:00
tiferet
413db2b15e Make it clearer to codex which parts of the prompt are code by using `` 2022-12-01 15:48:50 -08:00
tiferet
7eb04f8c32 Bug fix 2022-12-01 15:48:50 -08:00
tiferet
58bd6ac504 Fix implicit uses of this 2022-12-01 15:48:50 -08:00
tiferet
47b8f1420c Tokenize a neighborhood around the endpoint and properly create the last row of the codex prompt 2022-12-01 15:48:50 -08:00
tiferet
486752d19e Code improvements:
- Replace the containment logic with built in `Location` functionality.
- Generalize `tokenize` to output the tokens that fall within any location.
2022-12-01 15:48:50 -08:00
tiferet
a79bdf1cbc Generalize endpoint tokenization to work correctly across multiple lines 2022-12-01 15:48:50 -08:00
tiferet
3be6b42200 Add in Aditya's endpoint tokenization 2022-12-01 15:48:50 -08:00
tiferet
55839c8df2 small update 2022-12-01 15:48:50 -08:00
tiferet
0c87b25698 Typo fix 2022-12-01 15:48:50 -08:00
tiferet
33a8962f5a For now hardcode a training prompt string 2022-12-01 15:48:50 -08:00
tiferet
456aab0497 Make predicates private if they don't need to be public 2022-12-01 15:48:50 -08:00
tiferet
1919206e1e Start adding the codex prompt feature 2022-12-01 15:48:50 -08:00
tiferet
ad13f5585d Extract only a single feature, the codex prompt for the current endpoint. 2022-12-01 15:48:50 -08:00
2 changed files with 233 additions and 1 deletions

View File

@@ -7,6 +7,16 @@
import javascript
private import FeaturizationConfig
private import FunctionBodyFeatures as FunctionBodyFeatures
private import EndpointTypes as EndpointTypes
private import semmle.javascript.security.dataflow.NosqlInjectionCustomizations as NosqlInjectionCustomizations
private import semmle.javascript.security.dataflow.SqlInjectionCustomizations as SqlInjectionCustomizations
private import semmle.javascript.security.dataflow.TaintedPathCustomizations as TaintedPathCustomizations
private import semmle.javascript.security.dataflow.DomBasedXssCustomizations as DomBasedXssCustomizations
private import experimental.adaptivethreatmodeling.NosqlInjectionATM as NosqlInjectionAtm
private import experimental.adaptivethreatmodeling.SqlInjectionATM as SqlInjectionAtm
private import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
private import experimental.adaptivethreatmodeling.XssATM as XssAtm
private import experimental.adaptivethreatmodeling.EndpointCharacteristics as EndpointCharacteristics
/**
* Gets the value of the token-based feature named `featureName` for the endpoint `endpoint`.
@@ -72,7 +82,7 @@ private module FunctionNames {
}
/** Get a name of a supported generic token-based feature. */
string getASupportedFeatureName() { result = any(EndpointFeature f).getName() }
string getASupportedFeatureName() { result = "codexPrompt" }
/**
* Generic token-based features for ATM.
@@ -90,6 +100,7 @@ predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string feat
* See EndpointFeature
*/
private newtype TEndpointFeature =
TCodexPrompt() or
TEnclosingFunctionName() or
TReceiverName() or
TEnclosingFunctionBody() or
@@ -122,6 +133,194 @@ abstract class EndpointFeature extends TEndpointFeature {
string toString() { result = this.getName() }
}
/**
* The codex prompt for this endpoint.
*/
class CodexPrompt extends EndpointFeature, TCodexPrompt {
/** Gets the name of this feature, which is always `codexPrompt`. */
override string getName() { result = "codexPrompt" }
/**
 * Gets the complete prompt for `endpoint`: the training-set part of the prompt followed by the
 * prompt row describing `endpoint` itself.
 */
override string getValue(DataFlow::Node endpoint) {
  exists(string trainingPart, string endpointPart |
    trainingPart = this.getTrainingSetPrompt() and
    endpointPart = this.getCurrentEndpointPrompt(endpoint) and
    result = trainingPart + endpointPart
  )
}
/**
 * Gets the beginning of the prompt: a title line, a table header (`Dataflow node`, `Neighborhood`,
 * `Classification`) and a sequence of labelled example rows.
 *
 * The examples are currently a hard-coded string literal; they are not yet drawn from the current repo.
 * TODO: Build this part from extracted examples (see the commented-out expression after the literal).
 * NOTE(review): the literal contains a raw line break inside the `filename` example row - confirm that
 * this is intended.
 */
private string getTrainingSetPrompt() {
result =
"# Examples of security vulnerability sinks and non-sinks\n|Dataflow node|Neighborhood|Classification|\n|---|---|---|\n|`refStr`|` const refStr = pageRef.gen === 0 ? ${pageRef.num}R : ${pageRef.num}R${pageRef.gen}; this.#pagesRefCache.set(refStr, pageNum); }`|non-sink|\n|`name`|` return res.redirect(back); } const ndb = req.mainClient.client.db(name); ndb.createCollection(delete_me, function (err) {`|sql injection sink|\n|`It took + (t1 - t0) + msec to create + count + +className+ instances programmatically.`|` } var t1 = new Date().getTime(); dojo.byId(results).innerHTML = It took + (t1 - t0) + msec to create + count + +className+ instances programmatically.; } dojo.ready(makeEm);`|non-sink|\n|`contents`|` const contents = fileData.buffer.toString(); res.json({message: contents});});`|non-sink|\n|`{ _id }`|` // If an ObjectID was correctly created from passed id param, try getting the ObjID first else falling back to try getting the string id // If not valid ObjectID created, try getting string id req.collection.findOne({ _id }, function (err, doc) { if (err) { req.session.error = Error: + err;`|nosql injection sink|\n|`bid`|` const body = <a href=https://ampbyexample.com target=_blank> <amp-img alt=AMP Ad height=250 src=//localhost:9876/amp4test/request-bank/${bid}/deposit/image width=300></amp-img> </a> <amp-pixel src=//localhost:9876/amp4test/request-bank/${bid}/deposit/pixel/foo?cid=CLIENT_ID(a)></amp-pixel>`|xss sink|\n|`nick`|` irc.me = nick; irc.nick(nick); irc.user(username, realname);`|non-sink|\n|`{where: {name: req.body.type}}`|` if(req.is(json)) { models.VisualizationType.find({ where: { name: req.body.type } }).then(function(vizType) { if(!vizType) { throw new Error(Unknown Viztype);`|nosql injection sink|\n|`sql`|` var callback = cb; var dbService = this.getService(connectionName); dbService.execute(sql, params, function(err, result) { if (err) { return callback(err);`|sql injection sink|\n|`filename`|` const writeStream = gfs.createWriteStream({ 
_id: newFileID, filename, mode: w, content_type: mimetype,`|path injection sink|\n|`req.url.substr(7)`|` <html style=width:100%; height:100%;> <body style=width:98%; height:98%;> <iframe src=${req.url.substr(7)} style=width:100%; height:100%;> </iframe>`|xss sink|\n|`assets/images/ + req.files.upload_file.name`|` res.send({ success: true, file_path: assets/images/ + req.files.upload_file.name }); });`|path injection sink|\n|`path`|`async function handleListingRequest({query: {path, search}}, res) { try { assert(path); const fileSet = await getListing(root, path);`|non-sink|\n|`{ lastLoginIp: lastLoginIp }`|` } models.User.findByPk(loggedInUser.data.id).then(user => { user.update({ lastLoginIp: lastLoginIp }).then(user => { res.json(user) }).catch(error => {`|non-sink|\n|`uploadId`|` Bucket: config.bucket, Key: key, UploadId: uploadId, MultipartUpload: { Parts: parts,`|non-sink|\n|`hash`|` componentDidMount() { const [, hash] = location.href.split(#) this.setState({ hash }) }`|non-sink|\n"
//hardNegativeExamplesForCodexPrompt() + hardPositiveExamplesForCodexPrompt(2, )
}
/**
 * Gets the last row of the prompt, which describes the current endpoint.
 *
 * The row holds the tokenized endpoint and its tokenized neighborhood (two lines on either side),
 * each wrapped in backticks, and ends with an open cell after the final `|`.
 */
private string getCurrentEndpointPrompt(DataFlow::Node endpoint) {
  exists(string endpointText, string neighborhoodText |
    endpointText = this.tokenizeEndpoint(endpoint) and
    neighborhoodText = this.tokenizeNeighborhood(endpoint, 2) and
    result = "|`" + endpointText + "`|`" + neighborhoodText + "`|"
  )
}
/**
 * Holds if `endpoint` is a known sink for the vulnerability class `sinkType`, and `sinkName` is the
 * label used for that class in the prompt.
 *
 * Four classes are covered: NoSQL injection, SQL injection, tainted path and DOM-based XSS.
 */
private predicate isPositiveExampleFromCurrentRepo(
  DataFlow::Node endpoint, EndpointTypes::EndpointType sinkType, string sinkName
) {
  sinkName = "xss sink" and
  sinkType instanceof EndpointTypes::XssSinkType and
  endpoint instanceof DomBasedXssCustomizations::DomBasedXss::Sink
  or
  sinkName = "path injection sink" and
  sinkType instanceof EndpointTypes::TaintedPathSinkType and
  endpoint instanceof TaintedPathCustomizations::TaintedPath::Sink
  or
  sinkName = "sql injection sink" and
  sinkType instanceof EndpointTypes::SqlInjectionSinkType and
  endpoint instanceof SqlInjectionCustomizations::SqlInjection::Sink
  or
  sinkName = "nosql injection sink" and
  sinkType instanceof EndpointTypes::NosqlInjectionSinkType and
  endpoint instanceof NosqlInjectionCustomizations::NosqlInjection::Sink
}
/**
 * Holds if `endpoint` is not a sink for any type of security vulnerability, for the reason specified by
 * `characteristic`.
 *
 * This requires that `characteristic` applies to `endpoint` and has an implication for the negative
 * class whose confidence is at least `characteristic.highConfidence()`.
 */
private predicate isNegativeExampleFromCurrentRepo(
  DataFlow::Node endpoint, EndpointCharacteristics::EndpointCharacteristic characteristic
) {
  exists(EndpointTypes::NegativeType negativeClass, float confidence |
    characteristic.hasImplications(negativeClass, true, confidence) and
    characteristic.highConfidence() <= confidence
  ) and
  characteristic.appliesToEndpoint(endpoint)
}
/**
 * Holds if `endpoint` is selected by `positiveExamplesForCodexPrompt` with an example count of 2 for
 * the sink type `sinkType` (labelled `sinkName`), and `file` is the file that contains `endpoint`.
 */
private predicate selectTwoPositiveExamples(
  DataFlow::Node endpoint, EndpointTypes::EndpointType sinkType, string sinkName, File file
) {
  endpoint.getFile() = file and
  this.positiveExamplesForCodexPrompt(2, endpoint, sinkType, sinkName)
}
/**
 * Holds if `endpoint` is a positive example of sink type `sinkType` (labelled `sinkName`) that is
 * selected for the codex prompt.
 *
 * As currently written, this selects, for each file and sink type, a positive example with the earliest
 * start position in that file (ordered by start line, then start column), so that examples come from a
 * diverse set of files. `numExamples` is not yet used by the body: limiting the selection to
 * `numExamples` examples per query is still TODO, and the commented-out drafts below are earlier
 * attempts at that.
 */
bindingset[numExamples]
private predicate positiveExamplesForCodexPrompt(
int numExamples, DataFlow::Node endpoint, EndpointTypes::EndpointType sinkType, string sinkName
) {
this.isPositiveExampleFromCurrentRepo(endpoint, sinkType, sinkName) and
// There is no previous positive example of the same type in the same file.
not exists(DataFlow::Node endpoint2 |
this.isPositiveExampleFromCurrentRepo(endpoint2, sinkType, sinkName) and
endpoint.getFile() = endpoint2.getFile() and
(
endpoint.getStartLine() > endpoint2.getStartLine()
or
endpoint.getStartLine() = endpoint2.getStartLine() and
endpoint.getStartColumn() > endpoint2.getStartColumn()
)
)
// and
// exists(int rankIndex, File file |
// rankIndex <= numExamples and rankIndex > 0 and
// file = endpoint.asExpr().getLocation().getFile() and not exists(int lowerRankIndex | | )
// endpoint = rank[rankIndex](string file, int a, int b, int c, int d |
// |
// endpoint order by sinkName, file
// )
// // and
// // r % (1 / rate).ceil() = 0
// )
// exists(int r | r <= numExamples and endpoint = rank[r](string file, int a, int b, int c, int d |
// endpoint.asExpr().getLocation().hasLocationInfo(file, a, b, c, d)
// |
// endpoint order by sinkName, file, a, b, c, d
// )
// // and
// // r % (1 / rate).ceil() = 0
// )
// // select `numExamples` examples for each query
// count( | | 1) <= numExamples and
// // select examples from a diverse set of files
// not exists(string file |
// file = endpoint.getLocation().getFile().getRelativePath() and
// file = any(EndpointTypes::EndpointType t).getAFileWithPositiveExample(t)
// )
// // and
// // TODO
}
/**
 * Gets the negative ("non-sink") examples for the codex prompt, preceded by the title line and the
 * table header.
 *
 * The examples are currently a hard-coded string literal; this predicate takes no parameters.
 * TODO: Select the negative examples from the current repo, drawing from a diverse set of characteristics.
 */
private string hardNegativeExamplesForCodexPrompt() {
result =
"# Examples of security vulnerability sinks and non-sinks\n|Dataflow node|Neighborhood|Classification|\n|---|---|---|\n|`m[9] ? m[10] : null`|` this.authority = m[5] ? m[6] : null; this.path = m[7]; this.query = m[9] ? m[10] : null; this.fragment = m[12] ? m[13] : null; return this;`|non-sink|\n|`this.flowRunId`|` variables: { input: { flow_run_id: this.flowRunId, name: e }`|non-sink|\n|`req.body.firstName`|` res.json({ firstName: req.body.firstName, lastName: req.body.lastName, email: req.body.email`|non-sink|\n|`lang[1]`|` if (lang) { document.getElementsByTagName('html')[0].setAttribute('lang', lang[1]); }`|non-sink|\n|`token`|` }, }); tokenProvider.saveNewToken(token).then(ok => { insights.trackEvent({ name: 'ReposCreateTokenFinish',`|non-sink|\n|`filename`|`function sendFile(filename, response) { response.setHeader('Content-Type', mime.lookup(filename)); response.writeHead(200); const fileStream = createReadStream(filename);`|non-sink|\n|`year`|` postsData = await getPostsDateArchive( postType, !isNaN(parseInt(year, 10)) ? parseInt(year, 10) : null, !isNaN(parseInt(month, 10)) ? parseInt(month, 10) : null, !isNaN(parseInt(day, 10)) ? parseInt(day, 10) : null,`|non-sink|\n|`redirectTo === 'login' ? {redirectTo: to.path,} : to.query`|` return next({ name: redirectTo, query: redirectTo === 'login' ? { redirectTo: to.path, } : to.query, }); }`|non-sink|\n"
}
/**
 * Gets the reconstructed source code text covered by `location`.
 *
 * The text is built from the values of all tokens whose location is loosely contained in `location`,
 * taken in source order and joined by single spaces. There is no result if no token qualifies.
 */
string tokenize(Location location) {
  result =
    strictconcat(Token token, Location tokenLocation |
      tokenLocation = token.getLocation() and
      location.containsLoosely(tokenLocation)
    |
      token.getValue(),
      // A single space is the most likely separator between two tokens. The result is therefore
      // not always an exact reconstruction, e.g. when successive tokens were separated by newlines.
      // TODO: Don't add a space if the current or previous token is a period.
      " "
      order by
        tokenLocation.getStartLine(), tokenLocation.getStartColumn()
    )
}
/**
 * Gets the reconstructed source code text for `node`, i.e. the tokens that fall within the location
 * of the AST node underlying `node`.
 */
string tokenizeEndpoint(DataFlow::Node node) {
  exists(Location nodeLocation |
    nodeLocation = node.getAstNode().getLocation() and
    result = this.tokenize(nodeLocation)
  )
}
/**
 * Gets the reconstructed source code text for the neighborhood around `node`, including `neighborhoodSize` lines
 * before and `neighborhoodSize` lines after `node`.
 *
 * The neighborhood is clamped to the file: it never starts before line 1 and never ends after the
 * last line of the file. Tokens are joined by single spaces in source order, and only tokens that lie
 * entirely within the line window are included. There is no result if the window contains no tokens.
 */
bindingset[neighborhoodSize]
string tokenizeNeighborhood(DataFlow::Node node, int neighborhoodSize) {
  exists(Location nodeLocation, File file, int firstLine, int lastLine |
    nodeLocation = node.getAstNode().getLocation() and
    file = nodeLocation.getFile() and
    // Compute the line window arithmetically. Looking the window up as a `Location` would only
    // succeed if the database happened to contain a location with exactly these bounds.
    firstLine = (nodeLocation.getStartLine() - neighborhoodSize).maximum(1) and
    lastLine = (nodeLocation.getEndLine() + neighborhoodSize).minimum(file.getNumberOfLines())
  |
    result =
      strictconcat(Token token, Location tokenLocation |
        tokenLocation = token.getLocation() and
        tokenLocation.getFile() = file and
        tokenLocation.getStartLine() >= firstLine and
        tokenLocation.getEndLine() <= lastLine
      |
        token.getValue(),
        // Use a space as the separator, matching how single locations are tokenized.
        " "
        order by
          tokenLocation.getStartLine(), tokenLocation.getStartColumn()
      )
  )
}
}
/**
* The feature for the name of the function that encloses the endpoint.
*/

View File

@@ -40,6 +40,18 @@ class Location extends @location {
)
}
/**
 * Holds if this location starts before or at the same place as location `that`.
 *
 * Both locations must be in the same file. Start positions are compared by line first and by column
 * when the lines are equal.
 */
// Inlined, like the sibling `endsAfter` and `endsAfterOrWith` predicates, so that the comparison is
// evaluated in the context of each call rather than as a relation over pairs of locations in a file.
pragma[inline]
predicate startsBeforeOrWith(Location that) {
  exists(File f, int sl1, int sc1, int sl2, int sc2 |
    locations_default(this, f, sl1, sc1, _, _) and
    locations_default(that, f, sl2, sc2, _, _)
  |
    sl1 < sl2
    or
    sl1 = sl2 and sc1 <= sc2
  )
}
/** Holds if this location ends after location `that`. */
pragma[inline]
predicate endsAfter(Location that) {
@@ -53,12 +65,33 @@ class Location extends @location {
)
}
/**
 * Holds if this location ends after or at the same place as location `that`.
 *
 * Both locations must be in the same file. End positions are compared by line first and by column
 * when the lines are equal.
 */
pragma[inline]
predicate endsAfterOrWith(Location that) {
  exists(File file, int thisEndLine, int thisEndColumn, int thatEndLine, int thatEndColumn |
    locations_default(this, file, _, _, thisEndLine, thisEndColumn) and
    locations_default(that, file, _, _, thatEndLine, thatEndColumn) and
    (
      thisEndLine = thatEndLine and thisEndColumn >= thatEndColumn
      or
      thisEndLine > thatEndLine
    )
  )
}
/**
 * Holds if location `that` lies inside this location, that is, this location
 * both starts before `that` and ends after `that`.
 */
predicate contains(Location that) {
  this.endsAfter(that) and
  this.startsBefore(that)
}
/**
 * Holds if location `that` lies inside this location, where either boundary may coincide: this
 * location starts before or at the same place as `that`, and ends after or at the same place as `that`.
 */
predicate containsLoosely(Location that) {
  this.endsAfterOrWith(that) and
  this.startsBeforeOrWith(that)
}
/**
 * Holds if this location is empty, that is, it starts and ends on the same line and its end column
 * is one less than its start column.
 */
predicate isEmpty() {
  exists(int line, int startColumn, int endColumn |
    locations_default(this, _, line, startColumn, line, endColumn) and
    endColumn = startColumn - 1
  )
}