Compare commits

...

16 Commits

Author SHA1 Message Date
tiferet
7f51a1aa52 Document prompt 2022-12-02 17:03:55 -08:00
tiferet
87782387a8 "Training" prompt created from training repos
Replace the previous hardcoded "training" part of the prompt with one generated from examples that come from training repos rather than evaluation repos.

These are diverse examples generated from a random selection of repos in the ATM training set.

Each example is from a different repo. There are two examples of each sink type and eight non-sink examples, each from a different negative endpoint characteristic.
2022-12-02 16:41:57 -08:00
tiferet
d8192ec8ed Delete unused code 2022-12-01 17:28:34 -08:00
tiferet
413db2b15e Make it clearer to codex which parts of the prompt are code by using `` 2022-12-01 15:48:50 -08:00
tiferet
7eb04f8c32 Bug fix 2022-12-01 15:48:50 -08:00
tiferet
58bd6ac504 Fix implicit uses of this 2022-12-01 15:48:50 -08:00
tiferet
47b8f1420c Tokenize a neighborhood around the endpoint and properly create the last row of the codex prompt 2022-12-01 15:48:50 -08:00
tiferet
486752d19e Code improvements:
- Replace the containment logic with built in `Location` functionality.
- Generalize `tokenize` to output the tokens that fall within any location.
2022-12-01 15:48:50 -08:00
tiferet
a79bdf1cbc Generalize endpoint tokenization to work correctly across multiple lines 2022-12-01 15:48:50 -08:00
tiferet
3be6b42200 Add in Aditya's endpoint tokenization 2022-12-01 15:48:50 -08:00
tiferet
55839c8df2 small update 2022-12-01 15:48:50 -08:00
tiferet
0c87b25698 Typo fix 2022-12-01 15:48:50 -08:00
tiferet
33a8962f5a For now hardcode a training prompt string 2022-12-01 15:48:50 -08:00
tiferet
456aab0497 Make predicates private if they don't need to be public 2022-12-01 15:48:50 -08:00
tiferet
1919206e1e Start adding the codex prompt feature 2022-12-01 15:48:50 -08:00
tiferet
ad13f5585d Extract only a single feature, the codex prompt for the current endpoint. 2022-12-01 15:48:50 -08:00
2 changed files with 130 additions and 1 deletions

View File

@@ -7,6 +7,15 @@
import javascript
private import FeaturizationConfig
private import FunctionBodyFeatures as FunctionBodyFeatures
private import EndpointTypes as EndpointTypes
private import semmle.javascript.security.dataflow.NosqlInjectionCustomizations as NosqlInjectionCustomizations
private import semmle.javascript.security.dataflow.SqlInjectionCustomizations as SqlInjectionCustomizations
private import semmle.javascript.security.dataflow.TaintedPathCustomizations as TaintedPathCustomizations
private import semmle.javascript.security.dataflow.DomBasedXssCustomizations as DomBasedXssCustomizations
private import experimental.adaptivethreatmodeling.NosqlInjectionATM as NosqlInjectionAtm
private import experimental.adaptivethreatmodeling.SqlInjectionATM as SqlInjectionAtm
private import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
private import experimental.adaptivethreatmodeling.XssATM as XssAtm
/**
* Gets the value of the token-based feature named `featureName` for the endpoint `endpoint`.
@@ -72,7 +81,7 @@ private module FunctionNames {
}
/** Get a name of a supported generic token-based feature. */
string getASupportedFeatureName() { result = any(EndpointFeature f).getName() }
string getASupportedFeatureName() { result = "codexPrompt" }
/**
* Generic token-based features for ATM.
@@ -90,6 +99,7 @@ predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string feat
* See EndpointFeature
*/
private newtype TEndpointFeature =
TCodexPrompt() or
TEnclosingFunctionName() or
TReceiverName() or
TEnclosingFunctionBody() or
@@ -122,6 +132,92 @@ abstract class EndpointFeature extends TEndpointFeature {
string toString() { result = this.getName() }
}
/**
* The codex prompt for this endpoint.
*/
class CodexPrompt extends EndpointFeature, TCodexPrompt {
/** Gets the name of this feature, `codexPrompt`. */
override string getName() {
  result = "codexPrompt"
}
/**
 * Gets the full prompt for `endpoint`: the fixed table of training examples
 * followed by the row describing `endpoint` itself.
 */
override string getValue(DataFlow::Node endpoint) {
  exists(string trainingPart, string endpointPart |
    trainingPart = this.getTrainingSetPrompt() and
    endpointPart = this.getCurrentEndpointPrompt(endpoint)
  |
    result = trainingPart + endpointPart
  )
}
/**
* Gets the beginning of the prompt, which contains the training examples, shuffled in random order.
* The text is a Markdown heading followed by a table with the columns `Dataflow node`, `Neighborhood` and
* `Classification`, one example per row, every row terminated by `\n`.
* This part of the prompt was generated from examples that come from training repos rather than evaluation repos.
* These are diverse examples generated from a random selection of repos in the ATM training set.
* Each example is from a different repo. There are two examples of each sink type and eight non-sink examples, each
* from a different negative endpoint characteristic.
* The result is a constant: it does not depend on the endpoint being featurized.
*/
private string getTrainingSetPrompt() {
// A single literal: heading, table header, separator row, then the sixteen example rows.
result =
"# Examples of security vulnerability sinks and non-sinks\n|Dataflow node|Neighborhood|Classification|\n|---|---|---|\n|`WPUrls.ajaxurl`|` dataType: json , type: POST , url: WPUrls.ajaxurl, data: data, complete: function( json ) {`|non-sink|\n|`[ handlebars ]`|` use strict ; if (typeof define === function && define.amd) { define([ handlebars ], function(Handlebars) { return factory(Handlebars.default Handlebars); });`|path injection sink|\n|`url`|`} else { var matcher = new RegExp($.map(items.wanikanify_blackList, function(val) { return ( +val+ ) ;}).join( )); return matcher.test(url); } }`|non-sink|\n|`_.bind(connection.createGame, this, socket)`|`var connection = module.exports = function (socket) { socket.on( game:create , _.bind(connection.createGame, this, socket)); socket.on( game:spectate , _.bind(game.spectate, this, socket)); socket.on( register , _.bind(connection.register, this, socket));`|non-sink|\n|`sql`|` if (err) throw err; const sql = UPDATE customers SET address = Canyon 123 WHERE address = Valley 345 ; con.query(sql, function (err, result) { if (err) throw err; console.log(result.affectedRows + record(s) updated );`|sql injection sink|\n|` <style type= text/css id= shapely-style- + sufix + /> `|` if ( ! 
style.length ) { style = $( head ).append( <style type= text/css id= shapely-style- + sufix + /> ).find( #shapely-style- + sufix ); }`|xss sink|\n|`content`|` textBoxEditor(content) { console.log(content); } ngOnInit() {`|non-sink|\n|`imageURL`|` <div id = mypost > <Link to ={ /post?id= + postId}> <img src={imageURL} alt= /> <div className= img_info > <div><i className= fas fa-heart ></i> <span id= likes >{this.state.like}</span></div>`|xss sink|\n|`{ roomId }`|` } const game = await Game.findOne({ roomId }); if (!game) {`|nosql injection sink|\n|` SELECT owner, name, program FROM Programs WHERE name = + data + `|`app.get( /getProgram/:nombre , (request, response) => { var data = request.query.nombre; db.each( SELECT owner, name, program FROM Programs WHERE name = + data + , function(err, row) { response.json(row.program); });`|sql injection sink|\n|`listenToServer`|` processCommand(cmd); } setTimeout(listenToServer, 0); } }`|non-sink|\n|`negativeYearString`|` return Date.prototype.toJSON && new Date(NaN).toJSON() === null && new Date(negativeDate).toJSON().indexOf(negativeYearString) !== -1 && Date.prototype.toJSON.call({ // generic toISOString: function () { return true; }`|non-sink|\n|`__dirname`|`fs .readdirSync(__dirname) .filter(function(file) { return (file.indexOf( . ) !== 0) && (file !== basename);`|path injection sink|\n|`certificateId`|`app.get( /certificate/data/:id , (req, res) => { let certificateId = req.params.id; Certificates.findById(certificateId) .then(obj => { if (obj === null)`|nosql injection sink|\n|`{encoding: utf8 }`|`function updateChangelog() { var filename = path.resolve(__dirname, ../CHANGELOG.md ) , changelog = fs.readFileSync(filename, {encoding: utf8 }) , entry = new RegExp( ### ( + version + )(?: \\((.+?)\\))\\n )`|non-sink|\n|`depth`|` }); const indent = .repeat(depth); let sep = indent; column_sizes.forEach((size) => {`|non-sink|\n"
}
/**
 * Gets the last line of the prompt, containing the current endpoint.
 *
 * The line is an unfinished table row: the tokenized endpoint and its
 * tokenized neighborhood (two lines on either side), each wrapped in
 * backticks, followed by an opening `|` with no classification after it.
 * Has no result if either tokenization has no result.
 */
private string getCurrentEndpointPrompt(DataFlow::Node endpoint) {
  exists(string endpointText, string neighborhoodText |
    endpointText = this.tokenizeEndpoint(endpoint) and
    neighborhoodText = this.tokenizeNeighborhood(endpoint, 2)
  |
    result = "|`" + endpointText + "`|`" + neighborhoodText + "`|"
  )
}
/**
 * Gets an approximate reconstruction of the source text covered by `location`:
 * the values of all tokens lying within `location` (boundaries included),
 * in source order, joined by single spaces.
 *
 * Has no result if no token lies within `location`.
 * TODO: This excludes comments
 * TODO: Don't add a space if the current or previous token is a period.
 */
string tokenize(Location location) {
  result =
    strictconcat(Token tok, Location tokLoc |
      tokLoc = tok.getLocation() and
      location.containsLoosely(tokLoc)
    |
      // A single space is the most likely separator. The output is therefore
      // not always an exact reconstruction, e.g. when successive tokens were
      // separated by newlines in the original code.
      tok.getValue(), " " order by tokLoc.getStartLine(), tokLoc.getStartColumn()
    )
}
/**
 * Gets the reconstructed source code text for `node`, obtained by tokenizing
 * the location of the AST node underlying `node`.
 */
string tokenizeEndpoint(DataFlow::Node node) {
  exists(Location nodeLocation | nodeLocation = node.getAstNode().getLocation() |
    result = this.tokenize(nodeLocation)
  )
}
/**
 * Gets the reconstructed source code text for the neighborhood around `node`, including `neighborhoodSize` lines
 * before and `neighborhoodSize` lines after `node`.
 *
 * The result is the values of all tokens in the file of `node` that lie entirely on the lines from
 * `startLine - neighborhoodSize` to `endLine + neighborhoodSize` of the node's location, in source order,
 * joined by single spaces. A token that spans past either boundary line is left out.
 * Has no result if there is no such token.
 *
 * The tokens are selected by file and line number rather than by looking up a `Location` that covers the
 * neighborhood: `Location` values are database entities, so a location with exactly the wanted start and end
 * coordinates generally does not exist and cannot be constructed here.
 */
bindingset[neighborhoodSize]
string tokenizeNeighborhood(DataFlow::Node node, int neighborhoodSize) {
  exists(Location nodeLoc, int firstLine, int lastLine |
    nodeLoc = node.getAstNode().getLocation() and
    // No clamping to the file bounds is needed: there are no tokens outside the file.
    firstLine = nodeLoc.getStartLine() - neighborhoodSize and
    lastLine = nodeLoc.getEndLine() + neighborhoodSize
  |
    result =
      strictconcat(Token tok, Location tokLoc |
        tokLoc = tok.getLocation() and
        tokLoc.getFile() = nodeLoc.getFile() and
        tokLoc.getStartLine() >= firstLine and
        tokLoc.getEndLine() <= lastLine
      |
        // Use space as the separator, since that is most likely; the text is
        // an approximation of the original source, not an exact copy.
        tok.getValue(), " " order by tokLoc.getStartLine(), tokLoc.getStartColumn()
      )
  )
}
}
/**
* The feature for the name of the function that encloses the endpoint.
*/

View File

@@ -40,6 +40,18 @@ class Location extends @location {
)
}
/**
 * Holds if this location starts before or at the same place as location `that`.
 *
 * Both locations must be in the same file. The comparison is by start line
 * first, then by start column.
 */
pragma[inline]
predicate startsBeforeOrWith(Location that) {
  // Inlined like `endsAfter` and `endsAfterOrWith`, so that the comparison is
  // evaluated in the context of each use instead of as a standalone relation
  // over all pairs of locations in a file.
  exists(File f, int sl1, int sc1, int sl2, int sc2 |
    locations_default(this, f, sl1, sc1, _, _) and
    locations_default(that, f, sl2, sc2, _, _)
  |
    sl1 < sl2
    or
    sl1 = sl2 and sc1 <= sc2
  )
}
/** Holds if this location ends after location `that`. */
pragma[inline]
predicate endsAfter(Location that) {
@@ -53,12 +65,33 @@ class Location extends @location {
)
}
/**
 * Holds if this location ends after location `that`, or at exactly the same
 * line and column. Both locations must be in the same file.
 */
pragma[inline]
predicate endsAfterOrWith(Location that) {
  exists(File file, int thisEndLine, int thisEndCol, int thatEndLine, int thatEndCol |
    locations_default(this, file, _, _, thisEndLine, thisEndCol) and
    locations_default(that, file, _, _, thatEndLine, thatEndCol)
  |
    thisEndLine = thatEndLine and thisEndCol >= thatEndCol
    or
    thisEndLine > thatEndLine
  )
}
/**
 * Holds if location `that` lies strictly inside this location: this location
 * starts before `that` and ends after it.
 */
predicate contains(Location that) {
  this.endsAfter(that) and
  this.startsBefore(that)
}
/**
 * Holds if location `that` lies inside this location, where either boundary
 * may coincide: this location starts before or together with `that`, and
 * ends after or together with it.
 */
predicate containsLoosely(Location that) {
  this.endsAfterOrWith(that) and
  this.startsBeforeOrWith(that)
}
/**
 * Holds if this location is empty, that is, it starts and ends on the same
 * line and its end column is one less than its start column.
 */
predicate isEmpty() {
  exists(int line, int startCol | locations_default(this, _, line, startCol, line, startCol - 1))
}