Merge pull request #17873 from github/tausbn/python-fix-generator-expression-locations

Python: Even more parser fixes
2026-05-05 21:55:19 +02:00 · 2024-11-01 12:47:19 +01:00
parent 03ffaac87a 5d6600e61f
commit 2892f0ff48
11 changed files with 48516 additions and 47821 deletions
--- a/python/extractor/tsg-python/python.tsg
+++ b/python/extractor/tsg-python/python.tsg
@@ -404,7 +404,7 @@

 ;;; GeneratorExp

-(generator_expression . "(" . (comment)* . (_) @start (_) @end . (comment)* . ")" .) @generatorexp
+(generator_expression . "(" . (comment)* . (expression) @start [(for_in_clause) (if_clause)] @end . (comment)* . ")" .) @generatorexp
 {
    attr (@generatorexp.node) _location_start = (location-start @start)
    attr (@generatorexp.node) _location_end = (location-end @end)
@@ -416,13 +416,13 @@
    attr (@if.node) _location_end = (location-end @expr)
 }

-(generator_expression . "(" . (comment)* . (_) @start (for_in_clause) @child (_) @end . (comment)* . ")" .) @genexpr
+(generator_expression . "(" . (comment)* . (expression) @start (for_in_clause) @child [(for_in_clause) (if_clause)] @end . (comment)* . ")" .) @genexpr
 {
    attr (@child.node) _location_start = (location-start @start)
    attr (@child.node) _location_end = (location-end @end)
 }

-(generator_expression . "(" . (comment)* . (_) @start (for_in_clause) @end . (comment)* . ")" .) @genexpr
+(generator_expression . "(" . (comment)* . (expression) @start (for_in_clause) @end . (comment)* . ")" .) @genexpr
 {
    attr (@end.node) _location_start = (location-start @start)
    attr (@end.node) _location_end = (location-end @end)
@@ -863,7 +863,7 @@
 ; information for the entire generator expression (yes, it is a wide parameter!) and so we must recreate the logic for
 ; setting this location information correctly.

-(generator_expression . "(" . (comment)* . (_) @start  (_) @end . (comment)* . ")" .) @genexpr
+(generator_expression . "(" . (comment)* . (expression) @start  [(for_in_clause) (if_clause)] @end . (comment)* . ")" .) @genexpr
 {
    ; Synthesize the `genexpr` function
    let @genexpr.fun = (ast-node @genexpr "Function")
@@ -2661,6 +2661,14 @@
    let @with.first = @first.node
 }

+; Async status
+; NOTE: We only set the `is_async` field on the _first_ item (`with_item`) of the `with` statement,
+; as this is the behaviour of the old parser.
+(with_statement "async" "with" @with_keyword (with_clause . (with_item) @with))
+{
+    attr (@with.node) is_async = #true
+}
+
 (with_item
    value: (_) @value
 ) @with
@@ -3264,6 +3272,16 @@
    }
 }

+; Async status. Also sets the start location of the node, function and funcexpr to that of the `def` keyword.
+(function_definition "async" "def" @def_keyword) @funcdef
+{
+    let start = (location-start @def_keyword)
+    attr (@funcdef.function) is_async = #true
+    attr (@funcdef.node) _location_start = start
+    attr (@funcdef.function) _location_start = start
+    attr (@funcdef.funcexpr) _location_start = start
+}
+
 ;;; Decorators

 (decorated_definition
@@ -3478,5 +3496,9 @@

 [(tuple element: (_)) (tuple_pattern)] @tup
 {
-    attr (@tup.node) parenthesised = #true
+    ; In order to avoid writing to the `parenthesised` attribute twice, we only set it here
+    ; if the surrounding expression is not a `parenthesized_expression`.
+    if (not (instance-of (get-parent @tup) "parenthesized_expression")) {
+        attr (@tup.node) parenthesised = #true
+    }
 }
--- a/python/extractor/tsg-python/tsp/grammar.js
+++ b/python/extractor/tsg-python/tsp/grammar.js
@@ -751,7 +751,6 @@ module.exports = grammar({
      $.comparison_operator,
      $.not_operator,
      $.boolean_operator,
-      $.await,
      $.lambda,
      $.primary_expression,
      $.conditional_expression,
@@ -759,6 +758,7 @@ module.exports = grammar({
    ),

    primary_expression: $ => choice(
+      $.await,
      $.binary_operator,
      $.identifier,
      $.keyword_identifier,
@@ -1202,7 +1202,7 @@ module.exports = grammar({

    await: $ => prec(PREC.unary, seq(
      'await',
-      $.expression
+      $.primary_expression
    )),

    comment: $ => token(seq('#', /.*/)),
--- a/python/extractor/tsg-python/tsp/src/grammar.json
+++ b/python/extractor/tsg-python/tsp/src/grammar.json
@@ -1,5 +1,4 @@
 {
-  "$schema": "https://tree-sitter.github.io/tree-sitter/assets/schemas/grammar.schema.json",
  "name": "python",
  "word": "identifier",
  "rules": {
@@ -3843,10 +3842,6 @@
          "type": "SYMBOL",
          "name": "boolean_operator"
        },
-        {
-          "type": "SYMBOL",
-          "name": "await"
-        },
        {
          "type": "SYMBOL",
          "name": "lambda"
@@ -3868,6 +3863,10 @@
    "primary_expression": {
      "type": "CHOICE",
      "members": [
+        {
+          "type": "SYMBOL",
+          "name": "await"
+        },
        {
          "type": "SYMBOL",
          "name": "binary_operator"
@@ -6586,7 +6585,7 @@
          },
          {
            "type": "SYMBOL",
-            "name": "expression"
+            "name": "primary_expression"
          }
        ]
      }
@@ -6696,3 +6695,4 @@
    "parameter"
  ]
 }
+
--- a/python/extractor/tsg-python/tsp/src/node-types.json
+++ b/python/extractor/tsg-python/tsp/src/node-types.json
@@ -115,10 +115,6 @@
    "type": "expression",
    "named": true,
    "subtypes": [
-      {
-        "type": "await",
-        "named": true
-      },
      {
        "type": "boolean_operator",
        "named": true
@@ -229,6 +225,10 @@
        "type": "attribute",
        "named": true
      },
+      {
+        "type": "await",
+        "named": true
+      },
      {
        "type": "binary_operator",
        "named": true
@@ -587,7 +587,7 @@
      "required": true,
      "types": [
        {
-          "type": "expression",
+          "type": "primary_expression",
          "named": true
        }
      ]
@@ -2691,7 +2691,6 @@
  {
    "type": "module",
    "named": true,
-    "root": true,
    "fields": {},
    "children": {
      "multiple": true,
@@ -3816,10 +3815,6 @@
    "type": ":=",
    "named": false
  },
-  {
-    "type": ";",
-    "named": false
-  },
  {
    "type": "<",
    "named": false
@@ -3876,10 +3871,6 @@
    "type": "[",
    "named": false
  },
-  {
-    "type": "\\",
-    "named": false
-  },
  {
    "type": "]",
    "named": false
--- a/python/extractor/tsg-python/tsp/src/parser.c
+++ b/python/extractor/tsg-python/tsp/src/parser.c
--- a/python/extractor/tsg-python/tsp/src/scanner.cc
+++ b/python/extractor/tsg-python/tsp/src/scanner.cc
@@ -161,6 +161,22 @@ struct Scanner {
        } else if (lexer->lookahead == '\\') {
          if (delimiter.is_raw()) {
            lexer->advance(lexer, false);
+            // In raw strings, backslashes _can_ escape the same kind of quotes as the outer
+            // string, so we must take care to traverse any such escaped quotes now. If we don't do
+            // this, we will mistakenly consider the string to end at that escaped quote.
+            // Likewise, this also extends to escaped backslashes.
+            if (lexer->lookahead == end_character || lexer->lookahead == '\\') {
+              lexer->advance(lexer, false);
+            }
+            // Line breaks (\r\n, \r, \n) after backslashes also cause issues, so we step over them here.
+            if (lexer->lookahead == '\r') {
+                lexer->advance(lexer, false);
+                if (lexer->lookahead == '\n') {
+                    lexer->advance(lexer, false);
+                }
+            } else if (lexer->lookahead == '\n') {
+                lexer->advance(lexer, false);
+            }
            continue;
          } else if (delimiter.is_bytes()) {
              lexer->mark_end(lexer);
--- a/python/extractor/tsg-python/tsp/src/tree_sitter/parser.h
+++ b/python/extractor/tsg-python/tsp/src/tree_sitter/parser.h
@@ -13,8 +13,9 @@ extern "C" {
 #define ts_builtin_sym_end 0
 #define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024

-#ifndef TREE_SITTER_API_H_
 typedef uint16_t TSStateId;
+
+#ifndef TREE_SITTER_API_H_
 typedef uint16_t TSSymbol;
 typedef uint16_t TSFieldId;
 typedef struct TSLanguage TSLanguage;
@@ -47,7 +48,6 @@ struct TSLexer {
  uint32_t (*get_column)(TSLexer *);
  bool (*is_at_included_range_start)(const TSLexer *);
  bool (*eof)(const TSLexer *);
-  void (*log)(const TSLexer *, const char *, ...);
 };

 typedef enum {
@@ -87,11 +87,6 @@ typedef union {
  } entry;
 } TSParseActionEntry;

-typedef struct {
-  int32_t start;
-  int32_t end;
-} TSCharacterRange;
-
 struct TSLanguage {
  uint32_t version;
  uint32_t symbol_count;
@@ -128,41 +123,15 @@ struct TSLanguage {
    unsigned (*serialize)(void *, char *);
    void (*deserialize)(void *, const char *, unsigned);
  } external_scanner;
-  const TSStateId *primary_state_ids;
 };

-static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t lookahead) {
-  uint32_t index = 0;
-  uint32_t size = len - index;
-  while (size > 1) {
-    uint32_t half_size = size / 2;
-    uint32_t mid_index = index + half_size;
-    TSCharacterRange *range = &ranges[mid_index];
-    if (lookahead >= range->start && lookahead <= range->end) {
-      return true;
-    } else if (lookahead > range->end) {
-      index = mid_index;
-    }
-    size -= half_size;
-  }
-  TSCharacterRange *range = &ranges[index];
-  return (lookahead >= range->start && lookahead <= range->end);
-}
-
 /*
 *  Lexer Macros
 */

-#ifdef _MSC_VER
-#define UNUSED __pragma(warning(suppress : 4101))
-#else
-#define UNUSED __attribute__((unused))
-#endif
-
 #define START_LEXER()           \
  bool result = false;          \
  bool skip = false;            \
-  UNUSED                        \
  bool eof = false;             \
  int32_t lookahead;            \
  goto start;                   \
@@ -178,17 +147,6 @@ static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t
    goto next_state;         \
  }

-#define ADVANCE_MAP(...)                                              \
-  {                                                                   \
-    static const uint16_t map[] = { __VA_ARGS__ };                    \
-    for (uint32_t i = 0; i < sizeof(map) / sizeof(map[0]); i += 2) {  \
-      if (map[i] == lookahead) {                                      \
-        state = map[i + 1];                                           \
-        goto next_state;                                              \
-      }                                                               \
-    }                                                                 \
-  }
-
 #define SKIP(state_value) \
  {                       \
    skip = true;          \
@@ -207,7 +165,7 @@ static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t
 *  Parse Table Macros
 */

-#define SMALL_STATE(id) ((id) - LARGE_STATE_COUNT)
+#define SMALL_STATE(id) id - LARGE_STATE_COUNT

 #define STATE(id) id

@@ -217,7 +175,7 @@ static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t
  {{                                  \
    .shift = {                        \
      .type = TSParseActionTypeShift, \
-      .state = (state_value)          \
+      .state = state_value            \
    }                                 \
  }}

@@ -225,7 +183,7 @@ static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t
  {{                                  \
    .shift = {                        \
      .type = TSParseActionTypeShift, \
-      .state = (state_value),         \
+      .state = state_value,           \
      .repetition = true              \
    }                                 \
  }}
@@ -238,15 +196,14 @@ static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t
    }                                 \
  }}

-#define REDUCE(symbol_name, children, precedence, prod_id) \
-  {{                                                       \
-    .reduce = {                                            \
-      .type = TSParseActionTypeReduce,                     \
-      .symbol = symbol_name,                               \
-      .child_count = children,                             \
-      .dynamic_precedence = precedence,                    \
-      .production_id = prod_id                             \
-    },                                                     \
+#define REDUCE(symbol_val, child_count_val, ...) \
+  {{                                             \
+    .reduce = {                                  \
+      .type = TSParseActionTypeReduce,           \
+      .symbol = symbol_val,                      \
+      .child_count = child_count_val,            \
+      __VA_ARGS__                                \
+    },                                           \
  }}

 #define RECOVER()                    \