Generalize endpoint tokenization to work correctly across multiple lines

2026-06-03 04:40:14 +02:00 · 2022-11-30 11:57:07 -08:00
parent 3be6b42200
commit a79bdf1cbc
1 changed files with 26 additions and 11 deletions
--- a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointFeatures.qll
+++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointFeatures.qll
@@ -226,24 +226,38 @@ class CodexPrompt extends EndpointFeature, TCodexPrompt {
  }

  /**
-   * Holds if the location of `node` contains the location of `token`:
-   * both are on the same single line of code and
-   * the column range of `node` equals or contains
-   * the column range of `token`.
+   * Holds if the location of `node` contains that of `token` (same file, inclusive bounds).
   */
  cached
  predicate containsToken(AstNode node, Token token) {
-    exists(string file, int line, int sc, int ec, int tsc, int tec |
-      node.getLocation().hasLocationInfo(file, line, sc, line, ec) and
-      token.getLocation().hasLocationInfo(file, line, tsc, line, tec) and
-      sc <= tsc and
-      tec <= ec
+    exists(
+      string file, int node_start_line, int node_start_column, int node_end_line,
+      int node_end_column, int token_start_line, int token_start_column, int token_end_line,
+      int token_end_column
+    |
+      node.getLocation()
+          .hasLocationInfo(file, node_start_line, node_start_column, node_end_line, node_end_column) and
+      token
+          .getLocation()
+          .hasLocationInfo(file, token_start_line, token_start_column, token_end_line,
+            token_end_column) and
+      (
+        node_start_line < token_start_line
+        or
+        node_start_line = token_start_line and
+        node_start_column <= token_start_column
+      ) and
+      (
+        node_end_line > token_end_line
+        or
+        node_end_line = token_end_line and
+        node_end_column >= token_end_column
+      )
    )
  }

  /**
-   * Gets the reconstructed source code text for `node`,
-   * assuming it is on a single line of code.
+   * Gets the reconstructed source code text for `node`.
   */
  string tokenise(DataFlow::Node node) {
    result =
@@ -254,6 +268,7 @@ class CodexPrompt extends EndpointFeature, TCodexPrompt {
          // Use space as the separator, since that is most likely.
          // May not be an exact reconstruction, e.g. if the code
          // had newlines between successive tokens.
+          // TODO: Don't add a space if the current or previous token is a period.
          " "
        order by
          token.getLocation().getStartLine(), token.getLocation().getStartColumn()