Pull in the prompt work from branch tiferet/codex-prompt

This commit is contained in:
tiferet
2022-12-06 12:27:51 -08:00
parent 4a2046476a
commit dfbfa5d27d
3 changed files with 114 additions and 22 deletions

View File

@@ -0,0 +1,81 @@
import javascript
module ModelPrompt {
/**
 * Gets the complete prompt for `endpoint`: the fixed block of training examples
 * immediately followed by the table row describing `endpoint` itself.
 */
string getPrompt(DataFlow::Node endpoint) {
  exists(string examples, string endpointRow |
    examples = getTrainingSetPrompt() and
    endpointRow = getCurrentEndpointPrompt(endpoint)
  |
    result = examples + endpointRow
  )
}
/**
 * Gets the beginning of the prompt: a heading line, a three-column markdown table header
 * (`Dataflow node`, `Neighborhood`, `Classification`) and one table row per training example.
 *
 * The examples were shuffled into random order when this text was generated; the result is a
 * constant string, so the order is the same on every evaluation.
 * This part of the prompt was generated from examples that come from training repos rather than evaluation repos.
 * These are diverse examples generated from a random selection of repos in the ATM training set.
 * Each example is from a different repo. There are two examples of each sink type (path injection,
 * sql injection, xss, nosql injection) and eight non-sink examples, each from a different negative
 * endpoint characteristic.
 *
 * The text ends with a newline so that a further table row can be appended directly.
 *
 * NOTE(review): in this view the string literal continues onto a second physical line (after
 * `if ( !`). Confirm that this matches the checked-in file and is not a rendering artifact.
 */
private string getTrainingSetPrompt() {
result =
"# Examples of security vulnerability sinks and non-sinks\n|Dataflow node|Neighborhood|Classification|\n|---|---|---|\n|`WPUrls.ajaxurl`|` dataType: json , type: POST , url: WPUrls.ajaxurl, data: data, complete: function( json ) {`|non-sink|\n|`[ handlebars ]`|` use strict ; if (typeof define === function && define.amd) { define([ handlebars ], function(Handlebars) { return factory(Handlebars.default Handlebars); });`|path injection sink|\n|`url`|`} else { var matcher = new RegExp($.map(items.wanikanify_blackList, function(val) { return ( +val+ ) ;}).join( )); return matcher.test(url); } }`|non-sink|\n|`_.bind(connection.createGame, this, socket)`|`var connection = module.exports = function (socket) { socket.on( game:create , _.bind(connection.createGame, this, socket)); socket.on( game:spectate , _.bind(game.spectate, this, socket)); socket.on( register , _.bind(connection.register, this, socket));`|non-sink|\n|`sql`|` if (err) throw err; const sql = UPDATE customers SET address = Canyon 123 WHERE address = Valley 345 ; con.query(sql, function (err, result) { if (err) throw err; console.log(result.affectedRows + record(s) updated );`|sql injection sink|\n|` <style type= text/css id= shapely-style- + sufix + /> `|` if ( ! 
style.length ) { style = $( head ).append( <style type= text/css id= shapely-style- + sufix + /> ).find( #shapely-style- + sufix ); }`|xss sink|\n|`content`|` textBoxEditor(content) { console.log(content); } ngOnInit() {`|non-sink|\n|`imageURL`|` <div id = mypost > <Link to ={ /post?id= + postId}> <img src={imageURL} alt= /> <div className= img_info > <div><i className= fas fa-heart ></i> <span id= likes >{this.state.like}</span></div>`|xss sink|\n|`{ roomId }`|` } const game = await Game.findOne({ roomId }); if (!game) {`|nosql injection sink|\n|` SELECT owner, name, program FROM Programs WHERE name = + data + `|`app.get( /getProgram/:nombre , (request, response) => { var data = request.query.nombre; db.each( SELECT owner, name, program FROM Programs WHERE name = + data + , function(err, row) { response.json(row.program); });`|sql injection sink|\n|`listenToServer`|` processCommand(cmd); } setTimeout(listenToServer, 0); } }`|non-sink|\n|`negativeYearString`|` return Date.prototype.toJSON && new Date(NaN).toJSON() === null && new Date(negativeDate).toJSON().indexOf(negativeYearString) !== -1 && Date.prototype.toJSON.call({ // generic toISOString: function () { return true; }`|non-sink|\n|`__dirname`|`fs .readdirSync(__dirname) .filter(function(file) { return (file.indexOf( . ) !== 0) && (file !== basename);`|path injection sink|\n|`certificateId`|`app.get( /certificate/data/:id , (req, res) => { let certificateId = req.params.id; Certificates.findById(certificateId) .then(obj => { if (obj === null)`|nosql injection sink|\n|`{encoding: utf8 }`|`function updateChangelog() { var filename = path.resolve(__dirname, ../CHANGELOG.md ) , changelog = fs.readFileSync(filename, {encoding: utf8 }) , entry = new RegExp( ### ( + version + )(?: \\((.+?)\\))\\n )`|non-sink|\n|`depth`|` }); const indent = .repeat(depth); let sep = indent; column_sizes.forEach((size) => {`|non-sink|\n"
}
/**
 * Gets the last line of the prompt: a markdown table row for `endpoint`.
 *
 * The row holds the tokenized text of the endpoint in the first cell and the tokenized
 * text of its neighborhood (two lines either side) in the second cell, each wrapped in
 * backticks. The row stops after the `|` that opens the third cell, leaving that cell empty.
 *
 * Has no result if either piece of text cannot be reconstructed.
 * TODO: (left unspecified by the original author)
 */
private string getCurrentEndpointPrompt(DataFlow::Node endpoint) {
  exists(string nodeText, string neighborhoodText |
    nodeText = tokenizeEndpoint(endpoint) and
    neighborhoodText = tokenizeNeighborhood(endpoint, 2)
  |
    result = "|`" + nodeText + "`|`" + neighborhoodText + "`|"
  )
}
/**
 * Gets an approximate reconstruction of the source text covered by `location`.
 *
 * Every token whose own location is loosely contained in `location` (it may share the
 * start and/or end position) contributes its value; the values are joined in source order.
 * Has no result if `location` loosely contains no token.
 *
 * TODO: This excludes comments
 * TODO: Don't add a space if the current or previous token is a period.
 */
string tokenize(Location location) {
  result =
    strictconcat(Token token, Location tokenLocation |
      tokenLocation = token.getLocation() and
      location.containsLoosely(tokenLocation)
    |
      token.getValue(),
      // A single space is the most likely separator between two tokens.
      // The output is therefore not always an exact copy of the source,
      // e.g. when successive tokens were separated by newlines.
      " "
      order by
        tokenLocation.getStartLine(), tokenLocation.getStartColumn()
    )
}
/**
 * Gets the reconstructed source code text for `node`, i.e. the tokenized text of the
 * location of the AST node underlying `node`.
 */
string tokenizeEndpoint(DataFlow::Node node) {
  exists(Location nodeLocation |
    nodeLocation = node.getAstNode().getLocation() and
    result = tokenize(nodeLocation)
  )
}
/**
 * Gets the reconstructed source code text for the neighborhood around `node`, including `neighborhoodSize` lines
 * before and `neighborhoodSize` lines after `node`.
 *
 * A token is part of the neighborhood if it is in the same file as `node` and lies entirely
 * between the first line of `node` minus `neighborhoodSize` and the last line of `node` plus
 * `neighborhoodSize`. Token values are joined with single spaces in source order.
 * Has no result if the window contains no token.
 *
 * The window is compared against token line numbers directly instead of being looked up as
 * a `Location`: `Location` values are database entities, so a location with exactly the
 * window's coordinates is not guaranteed to exist, and a window clamped to the last line of
 * the file with end column 1 would lose the tokens on that line.
 */
bindingset[neighborhoodSize]
string tokenizeNeighborhood(DataFlow::Node node, int neighborhoodSize) {
  exists(Location nodeLocation, int firstLine, int lastLine |
    nodeLocation = node.getAstNode().getLocation() and
    // No clamping to the bounds of the file is needed: there are no tokens on lines
    // outside the file, so an over-wide window selects the same tokens.
    firstLine = nodeLocation.getStartLine() - neighborhoodSize and
    lastLine = nodeLocation.getEndLine() + neighborhoodSize
  |
    result =
      strictconcat(Token token, Location tokenLocation |
        tokenLocation = token.getLocation() and
        tokenLocation.getFile() = nodeLocation.getFile() and
        tokenLocation.getStartLine() >= firstLine and
        tokenLocation.getEndLine() <= lastLine
      |
        token.getValue(),
        // Space-separated, as in `tokenize`; not an exact reconstruction of the source.
        " "
        order by
          tokenLocation.getStartLine(), tokenLocation.getStartColumn()
      )
  )
}
}

View File

@@ -1,22 +0,0 @@
import javascript
private import BaseScoring
import FeaturizationConfig
/**
 * A featurization configuration that additionally builds a textual prompt for an endpoint.
 */
class PromptConfiguration extends FeaturizationConfig {
  PromptConfiguration() { this = "PromptConfiguration" }

  // abstract predicate getANodeAndPrompt(DataFlow::Node node, string prompt);
  /**
   * Gets the prompt for `node`: a heading, a markdown table header, three hard-coded example
   * rows, and finally the string value of `node` wrapped in backticks.
   *
   * Has no result if `extractString(node)` has no result.
   *
   * NOTE(review): the last example row (`hash`) has no classification of its own; the string
   * value of `node` is appended directly after its neighborhood cell. Confirm whether that
   * row was meant to be a complete example followed by a separate row for `node`.
   */
  string getPrompt(DataFlow::Node node) {
    result =
      // Every table row is terminated by "\n", including the `nick` row.
      "# Examples of security vulnerability sinks and non-sinks\n|Dataflow node|Neighborhood|Classification|\n|---|---|---|\n| `bid` | `const body = <a href=https://ampbyexample.com target=_blank> <amp-img alt=AMP Ad height=250 src=//localhost:9876/amp4test/request-bank/${bid}/deposit/image width=300></amp-img> </a> <amp-pixel src=//localhost:9876/amp4test/request-bank/${bid}/deposit/pixel/foo?cid=CLIENT_ID(a)></amp-pixel>` | xss sink |\n| `nick` | `irc.me = nick; irc.nick(nick); irc.user(username, realname);` | non-sink |\n| `hash` | `componentDidMount() { const [, hash] = location.href.split(#) this.setState({ hash }) }` | `"
        + extractString(node) + "` | "
  }

  /** Gets the text used to represent `node` in the prompt, namely `node.getStringValue()`. */
  string extractString(DataFlow::Node node) { result = node.getStringValue() }

  /**
   * Gets an endpoint to featurize: an effective source of the configuration returned by
   * `getCfg()` that flows somewhere under any data flow configuration, or an effective sink
   * that something flows to under any data flow configuration.
   */
  override DataFlow::Node getAnEndpointToFeaturize() {
    getCfg().isEffectiveSource(result) and any(DataFlow::Configuration cfg).hasFlow(result, _)
    or
    getCfg().isEffectiveSink(result) and any(DataFlow::Configuration cfg).hasFlow(_, result)
  }
}

View File

@@ -40,6 +40,18 @@ class Location extends @location {
)
}
/**
 * Holds if this location starts before or at the same place as location `that`.
 *
 * Both locations must be in the same file. Start positions are compared by line first
 * and by column when the lines are equal.
 */
// Inlined like `endsAfterOrWith`, so that this pairwise relation over all locations
// of a file is evaluated at its call sites rather than computed as a whole.
pragma[inline]
predicate startsBeforeOrWith(Location that) {
  exists(File f, int sl1, int sc1, int sl2, int sc2 |
    locations_default(this, f, sl1, sc1, _, _) and
    locations_default(that, f, sl2, sc2, _, _)
  |
    sl1 < sl2
    or
    sl1 = sl2 and sc1 <= sc2
  )
}
/** Holds if this location ends after location `that`. */
pragma[inline]
predicate endsAfter(Location that) {
@@ -53,12 +65,33 @@ class Location extends @location {
)
}
/**
 * Holds if this location ends after or at the same place as location `that`.
 *
 * Both locations must be in the same file. End positions are compared by line first
 * and by column when the lines are equal.
 */
pragma[inline]
predicate endsAfterOrWith(Location that) {
  exists(File file, int thisEndLine, int thisEndColumn, int thatEndLine, int thatEndColumn |
    locations_default(this, file, _, _, thisEndLine, thisEndColumn) and
    locations_default(that, file, _, _, thatEndLine, thatEndColumn)
  |
    thisEndLine = thatEndLine and thisEndColumn >= thatEndColumn
    or
    thisEndLine > thatEndLine
  )
}
/**
 * Holds if this location contains location `that`, that is, `that` ends before
 * this location ends and starts after this location starts.
 */
predicate contains(Location that) {
  this.endsAfter(that) and
  this.startsBefore(that)
}
/**
 * Holds if this location loosely contains location `that`: this location starts before
 * `that` or at the same place, and ends after `that` or at the same place.
 * Unlike `contains`, the start and end positions may coincide.
 */
predicate containsLoosely(Location that) {
  this.endsAfterOrWith(that) and
  this.startsBeforeOrWith(that)
}
/**
 * Holds if this location is empty, that is, it starts and ends on the same line
 * and its end column is one less than its start column.
 */
predicate isEmpty() {
  exists(int line, int startColumn |
    locations_default(this, _, line, startColumn, line, startColumn - 1)
  )
}