Generalize endpoint tokenization to work correctly across multiple lines

This commit is contained in:
tiferet
2022-11-30 11:57:07 -08:00
parent 3be6b42200
commit a79bdf1cbc

View File

@@ -226,24 +226,38 @@ class CodexPrompt extends EndpointFeature, TCodexPrompt {
}
/**
 * Holds if the location of `node` contains the location of `token`.
 *
 * Both locations must be in the same file. Containment is decided by comparing
 * (line, column) positions, so `node` and `token` may each span several lines:
 * `node` must start at or before the start of `token`, and end at or after the
 * end of `token`.
 */
cached
predicate containsToken(AstNode node, Token token) {
  exists(
    string file, int nodeStartLine, int nodeStartColumn, int nodeEndLine, int nodeEndColumn,
    int tokenStartLine, int tokenStartColumn, int tokenEndLine, int tokenEndColumn
  |
    node.getLocation()
        .hasLocationInfo(file, nodeStartLine, nodeStartColumn, nodeEndLine, nodeEndColumn) and
    token
        .getLocation()
        .hasLocationInfo(file, tokenStartLine, tokenStartColumn, tokenEndLine, tokenEndColumn) and
    // `node` starts at or before the start of `token`: either on an earlier line,
    // or on the same line at an equal or smaller column.
    (
      nodeStartLine < tokenStartLine
      or
      nodeStartLine = tokenStartLine and
      nodeStartColumn <= tokenStartColumn
    ) and
    // `node` ends at or after the end of `token`: either on a later line,
    // or on the same line at an equal or larger column.
    (
      nodeEndLine > tokenEndLine
      or
      nodeEndLine = tokenEndLine and
      nodeEndColumn >= tokenEndColumn
    )
  )
}
/**
* Gets the reconstructed source code text for `node`,
* which may span multiple lines of code.
*/
string tokenise(DataFlow::Node node) {
result =
@@ -254,6 +268,7 @@ class CodexPrompt extends EndpointFeature, TCodexPrompt {
// Use space as the separator, since that is most likely.
// May not be an exact reconstruction, e.g. if the code
// had newlines between successive tokens.
// TODO: Don't add a space if the current or previous token is a period.
" "
order by
token.getLocation().getStartLine(), token.getLocation().getStartColumn()