Python: Copy Python extractor to codeql repo

2026-04-27 17:55:19 +02:00 · 2024-02-28 15:15:21 +00:00
parent 297a17975d
commit 6dec323cfc
369 changed files with 165346 additions and 0 deletions
--- a/python/extractor/tests/tokenizer/basic.py
+++ b/python/extractor/tests/tokenizer/basic.py
@@ -0,0 +1,134 @@
+
+#AST nodes: Classes, Functions, Modules, expr, stmts
+
+class C:
+
+    def stmts(p0, p1):
+        global x
+        assert x == 2
+        y = 3
+        y += 4
+        while True:
+            break
+        while x > 0:
+            x -= 1
+            continue
+
+        f()
+        for x in y:
+            pass
+        if x:
+            print(y)
+        import a
+        import a.b as c
+        import a as b
+        from a.b import c
+
+
+        with open("file") as f:
+            pass
+        try:
+            1/0
+        except Exception as ex:
+            del y
+        finally:
+            del x
+        if x:
+            raise Exception()
+        else:
+            return
+
+    def exprs(p2, p3):
+        p2.x = 2
+        a = p3.y
+        x = 1 + 2
+        y = b'h4tpvhsa'
+        call(arg0, arg1, name0="Hi", name1=y, *(), **{})
+        x < y
+        {1:1, 2: 2}
+
+        x[a, 7]
+        (x for x in y)
+        17 if x < y else 16
+        lambda x : x * y
+        [ 1, 2, a, x.b, p1.c ]
+        [ a + "Hi" for a in str(y) ]
+
+
+
+        #a, *b = y
+        u"Hi"
+        x[0]
+        x[y[0]]
+        (p2, p3, 7)
+
+#Some multiline strings
+'''
+Single quotes string'''
+
+"""
+Double-quotes
+string"""
+
+r'''
+Bytes
+'''
+
+U"""
+Raw
+Unicode
+"""
+
+#Decorated function
+@deco
+def f():
+    pass
+
+#Inner function (see ODASA-1774)
+def outer():
+    def inner():
+        pass
+
+#Oddly laid out comprehension
+[[
+  x for x in y
+  ]
+  
+  for a in b
+]
+
+#Nested binary operations
+"Hello" + " " + "world"
+1+2+f()
+1+(2+3)
+
+# operations
+a|b&c+d-e
+x*f%g^h@j**k
+
+#Augmented assigns
+a @= b
+a |= b
+a *= b
+
+~a
+
+#Comparisons
+<
+>
+<=
+>=
+!=
+==
+is
+is not
+
+("""
+""")
+del x
+
+`backticks`
+
+x := y
+
+1 <> 2
--- a/python/extractor/tests/tokenizer/basic.tokens
+++ b/python/extractor/tests/tokenizer/basic.tokens
@@ -0,0 +1,472 @@
+2,0-2,52:           COMMENT        '#AST nodes: Classes, Functions, Modules, expr, stmts'
+4,0-4,5:            NAME           'class'
+4,6-4,7:            NAME           'C'
+4,7-4,8:            COLON          ':'
+4,8-4,9:            NEWLINE        '\n'
+6,0-6,4:            INDENT         '    '
+6,4-6,7:            NAME           'def'
+6,8-6,13:           NAME           'stmts'
+6,13-6,14:          LPAR           '('
+6,14-6,16:          NAME           'p0'
+6,16-6,17:          COMMA          ','
+6,18-6,20:          NAME           'p1'
+6,20-6,21:          RPAR           ')'
+6,21-6,22:          COLON          ':'
+6,22-6,23:          NEWLINE        '\n'
+7,0-7,8:            INDENT         '        '
+7,8-7,14:           NAME           'global'
+7,15-7,16:          NAME           'x'
+7,16-7,17:          NEWLINE        '\n'
+8,8-8,14:           NAME           'assert'
+8,15-8,16:          NAME           'x'
+8,17-8,19:          OP             '=='
+8,20-8,21:          NUMBER         '2'
+8,21-8,22:          NEWLINE        '\n'
+9,8-9,9:            NAME           'y'
+9,10-9,11:          OP             '='
+9,12-9,13:          NUMBER         '3'
+9,13-9,14:          NEWLINE        '\n'
+10,8-10,9:          NAME           'y'
+10,10-10,12:        OP             '+='
+10,13-10,14:        NUMBER         '4'
+10,14-10,15:        NEWLINE        '\n'
+11,8-11,13:         NAME           'while'
+11,14-11,18:        NAME           'True'
+11,18-11,19:        COLON          ':'
+11,19-11,20:        NEWLINE        '\n'
+12,0-12,12:         INDENT         '            '
+12,12-12,17:        NAME           'break'
+12,17-12,18:        NEWLINE        '\n'
+13,8-13,8:          DEDENT         ''
+13,8-13,13:         NAME           'while'
+13,14-13,15:        NAME           'x'
+13,16-13,17:        OP             '>'
+13,18-13,19:        NUMBER         '0'
+13,19-13,20:        COLON          ':'
+13,20-13,21:        NEWLINE        '\n'
+14,0-14,12:         INDENT         '            '
+14,12-14,13:        NAME           'x'
+14,14-14,16:        OP             '-='
+14,17-14,18:        NUMBER         '1'
+14,18-14,19:        NEWLINE        '\n'
+15,12-15,20:        NAME           'continue'
+15,20-15,21:        NEWLINE        '\n'
+17,8-17,8:          DEDENT         ''
+17,8-17,9:          NAME           'f'
+17,9-17,10:         LPAR           '('
+17,10-17,11:        RPAR           ')'
+17,11-17,12:        NEWLINE        '\n'
+18,8-18,11:         NAME           'for'
+18,12-18,13:        NAME           'x'
+18,14-18,16:        NAME           'in'
+18,17-18,18:        NAME           'y'
+18,18-18,19:        COLON          ':'
+18,19-18,20:        NEWLINE        '\n'
+19,0-19,12:         INDENT         '            '
+19,12-19,16:        NAME           'pass'
+19,16-19,17:        NEWLINE        '\n'
+20,8-20,8:          DEDENT         ''
+20,8-20,10:         NAME           'if'
+20,11-20,12:        NAME           'x'
+20,12-20,13:        COLON          ':'
+20,13-20,14:        NEWLINE        '\n'
+21,0-21,12:         INDENT         '            '
+21,12-21,17:        NAME           'print'
+21,17-21,18:        LPAR           '('
+21,18-21,19:        NAME           'y'
+21,19-21,20:        RPAR           ')'
+21,20-21,21:        NEWLINE        '\n'
+22,8-22,8:          DEDENT         ''
+22,8-22,14:         NAME           'import'
+22,15-22,16:        NAME           'a'
+22,16-22,17:        NEWLINE        '\n'
+23,8-23,14:         NAME           'import'
+23,15-23,16:        NAME           'a'
+23,16-23,17:        DOT            '.'
+23,17-23,18:        NAME           'b'
+23,19-23,21:        NAME           'as'
+23,22-23,23:        NAME           'c'
+23,23-23,24:        NEWLINE        '\n'
+24,8-24,14:         NAME           'import'
+24,15-24,16:        NAME           'a'
+24,17-24,19:        NAME           'as'
+24,20-24,21:        NAME           'b'
+24,21-24,22:        NEWLINE        '\n'
+25,8-25,12:         NAME           'from'
+25,13-25,14:        NAME           'a'
+25,14-25,15:        DOT            '.'
+25,15-25,16:        NAME           'b'
+25,17-25,23:        NAME           'import'
+25,24-25,25:        NAME           'c'
+25,25-25,26:        NEWLINE        '\n'
+28,8-28,12:         NAME           'with'
+28,13-28,17:        NAME           'open'
+28,17-28,18:        LPAR           '('
+28,18-28,24:        STRING         '"file"'
+28,24-28,25:        RPAR           ')'
+28,26-28,28:        NAME           'as'
+28,29-28,30:        NAME           'f'
+28,30-28,31:        COLON          ':'
+28,31-28,32:        NEWLINE        '\n'
+29,0-29,12:         INDENT         '            '
+29,12-29,16:        NAME           'pass'
+29,16-29,17:        NEWLINE        '\n'
+30,8-30,8:          DEDENT         ''
+30,8-30,11:         NAME           'try'
+30,11-30,12:        COLON          ':'
+30,12-30,13:        NEWLINE        '\n'
+31,0-31,12:         INDENT         '            '
+31,12-31,13:        NUMBER         '1'
+31,13-31,14:        OP             '/'
+31,14-31,15:        NUMBER         '0'
+31,15-31,16:        NEWLINE        '\n'
+32,8-32,8:          DEDENT         ''
+32,8-32,14:         NAME           'except'
+32,15-32,24:        NAME           'Exception'
+32,25-32,27:        NAME           'as'
+32,28-32,30:        NAME           'ex'
+32,30-32,31:        COLON          ':'
+32,31-32,32:        NEWLINE        '\n'
+33,0-33,12:         INDENT         '            '
+33,12-33,15:        NAME           'del'
+33,16-33,17:        NAME           'y'
+33,17-33,18:        NEWLINE        '\n'
+34,8-34,8:          DEDENT         ''
+34,8-34,15:         NAME           'finally'
+34,15-34,16:        COLON          ':'
+34,16-34,17:        NEWLINE        '\n'
+35,0-35,12:         INDENT         '            '
+35,12-35,15:        NAME           'del'
+35,16-35,17:        NAME           'x'
+35,17-35,18:        NEWLINE        '\n'
+36,8-36,8:          DEDENT         ''
+36,8-36,10:         NAME           'if'
+36,11-36,12:        NAME           'x'
+36,12-36,13:        COLON          ':'
+36,13-36,14:        NEWLINE        '\n'
+37,0-37,12:         INDENT         '            '
+37,12-37,17:        NAME           'raise'
+37,18-37,27:        NAME           'Exception'
+37,27-37,28:        LPAR           '('
+37,28-37,29:        RPAR           ')'
+37,29-37,30:        NEWLINE        '\n'
+38,8-38,8:          DEDENT         ''
+38,8-38,12:         NAME           'else'
+38,12-38,13:        COLON          ':'
+38,13-38,14:        NEWLINE        '\n'
+39,0-39,12:         INDENT         '            '
+39,12-39,18:        NAME           'return'
+39,18-39,19:        NEWLINE        '\n'
+41,4-41,4:          DEDENT         ''
+41,4-41,4:          DEDENT         ''
+41,4-41,7:          NAME           'def'
+41,8-41,13:         NAME           'exprs'
+41,13-41,14:        LPAR           '('
+41,14-41,16:        NAME           'p2'
+41,16-41,17:        COMMA          ','
+41,18-41,20:        NAME           'p3'
+41,20-41,21:        RPAR           ')'
+41,21-41,22:        COLON          ':'
+41,22-41,23:        NEWLINE        '\n'
+42,0-42,8:          INDENT         '        '
+42,8-42,10:         NAME           'p2'
+42,10-42,11:        DOT            '.'
+42,11-42,12:        NAME           'x'
+42,13-42,14:        OP             '='
+42,15-42,16:        NUMBER         '2'
+42,16-42,17:        NEWLINE        '\n'
+43,8-43,9:          NAME           'a'
+43,10-43,11:        OP             '='
+43,12-43,14:        NAME           'p3'
+43,14-43,15:        DOT            '.'
+43,15-43,16:        NAME           'y'
+43,16-43,17:        NEWLINE        '\n'
+44,8-44,9:          NAME           'x'
+44,10-44,11:        OP             '='
+44,12-44,13:        NUMBER         '1'
+44,14-44,15:        OP             '+'
+44,16-44,17:        NUMBER         '2'
+44,17-44,18:        NEWLINE        '\n'
+45,8-45,9:          NAME           'y'
+45,10-45,11:        OP             '='
+45,12-45,23:        STRING         'b\'h4tpvhsa\''
+45,23-45,24:        NEWLINE        '\n'
+46,8-46,12:         NAME           'call'
+46,12-46,13:        LPAR           '('
+46,13-46,17:        NAME           'arg0'
+46,17-46,18:        COMMA          ','
+46,19-46,23:        NAME           'arg1'
+46,23-46,24:        COMMA          ','
+46,25-46,30:        NAME           'name0'
+46,30-46,31:        OP             '='
+46,31-46,35:        STRING         '"Hi"'
+46,35-46,36:        COMMA          ','
+46,37-46,42:        NAME           'name1'
+46,42-46,43:        OP             '='
+46,43-46,44:        NAME           'y'
+46,44-46,45:        COMMA          ','
+46,46-46,47:        OP             '*'
+46,47-46,48:        LPAR           '('
+46,48-46,49:        RPAR           ')'
+46,49-46,50:        COMMA          ','
+46,51-46,53:        OP             '**'
+46,53-46,54:        LBRACE         '{'
+46,54-46,55:        RBRACE         '}'
+46,55-46,56:        RPAR           ')'
+46,56-46,57:        NEWLINE        '\n'
+47,8-47,9:          NAME           'x'
+47,10-47,11:        OP             '<'
+47,12-47,13:        NAME           'y'
+47,13-47,14:        NEWLINE        '\n'
+48,8-48,9:          LBRACE         '{'
+48,9-48,10:         NUMBER         '1'
+48,10-48,11:        COLON          ':'
+48,11-48,12:        NUMBER         '1'
+48,12-48,13:        COMMA          ','
+48,14-48,15:        NUMBER         '2'
+48,15-48,16:        COLON          ':'
+48,17-48,18:        NUMBER         '2'
+48,18-48,19:        RBRACE         '}'
+48,19-48,20:        NEWLINE        '\n'
+50,8-50,9:          NAME           'x'
+50,9-50,10:         LSQB           '['
+50,10-50,11:        NAME           'a'
+50,11-50,12:        COMMA          ','
+50,13-50,14:        NUMBER         '7'
+50,14-50,15:        RSQB           ']'
+50,15-50,16:        NEWLINE        '\n'
+51,8-51,9:          LPAR           '('
+51,9-51,10:         NAME           'x'
+51,11-51,14:        NAME           'for'
+51,15-51,16:        NAME           'x'
+51,17-51,19:        NAME           'in'
+51,20-51,21:        NAME           'y'
+51,21-51,22:        RPAR           ')'
+51,22-51,23:        NEWLINE        '\n'
+52,8-52,10:         NUMBER         '17'
+52,11-52,13:        NAME           'if'
+52,14-52,15:        NAME           'x'
+52,16-52,17:        OP             '<'
+52,18-52,19:        NAME           'y'
+52,20-52,24:        NAME           'else'
+52,25-52,27:        NUMBER         '16'
+52,27-52,28:        NEWLINE        '\n'
+53,8-53,14:         NAME           'lambda'
+53,15-53,16:        NAME           'x'
+53,17-53,18:        COLON          ':'
+53,19-53,20:        NAME           'x'
+53,21-53,22:        OP             '*'
+53,23-53,24:        NAME           'y'
+53,24-53,25:        NEWLINE        '\n'
+54,8-54,9:          LSQB           '['
+54,10-54,11:        NUMBER         '1'
+54,11-54,12:        COMMA          ','
+54,13-54,14:        NUMBER         '2'
+54,14-54,15:        COMMA          ','
+54,16-54,17:        NAME           'a'
+54,17-54,18:        COMMA          ','
+54,19-54,20:        NAME           'x'
+54,20-54,21:        DOT            '.'
+54,21-54,22:        NAME           'b'
+54,22-54,23:        COMMA          ','
+54,24-54,26:        NAME           'p1'
+54,26-54,27:        DOT            '.'
+54,27-54,28:        NAME           'c'
+54,29-54,30:        RSQB           ']'
+54,30-54,31:        NEWLINE        '\n'
+55,8-55,9:          LSQB           '['
+55,10-55,11:        NAME           'a'
+55,12-55,13:        OP             '+'
+55,14-55,18:        STRING         '"Hi"'
+55,19-55,22:        NAME           'for'
+55,23-55,24:        NAME           'a'
+55,25-55,27:        NAME           'in'
+55,28-55,31:        NAME           'str'
+55,31-55,32:        LPAR           '('
+55,32-55,33:        NAME           'y'
+55,33-55,34:        RPAR           ')'
+55,35-55,36:        RSQB           ']'
+55,36-55,37:        NEWLINE        '\n'
+59,8-59,18:         COMMENT        '#a, *b = y'
+60,8-60,13:         STRING         'u"Hi"'
+60,13-60,14:        NEWLINE        '\n'
+61,8-61,9:          NAME           'x'
+61,9-61,10:         LSQB           '['
+61,10-61,11:        NUMBER         '0'
+61,11-61,12:        RSQB           ']'
+61,12-61,13:        NEWLINE        '\n'
+62,8-62,9:          NAME           'x'
+62,9-62,10:         LSQB           '['
+62,10-62,11:        NAME           'y'
+62,11-62,12:        LSQB           '['
+62,12-62,13:        NUMBER         '0'
+62,13-62,14:        RSQB           ']'
+62,14-62,15:        RSQB           ']'
+62,15-62,16:        NEWLINE        '\n'
+63,8-63,9:          LPAR           '('
+63,9-63,11:         NAME           'p2'
+63,11-63,12:        COMMA          ','
+63,13-63,15:        NAME           'p3'
+63,15-63,16:        COMMA          ','
+63,17-63,18:        NUMBER         '7'
+63,18-63,19:        RPAR           ')'
+63,19-63,20:        NEWLINE        '\n'
+65,0-65,23:         COMMENT        '#Some multiline strings'
+66,0-66,0:          DEDENT         ''
+66,0-66,0:          DEDENT         ''
+66,0-67,23:         STRING         '\'\'\'\nSingle quotes string\'\'\''
+67,23-67,24:        NEWLINE        '\n'
+69,0-71,9:          STRING         '"""\nDouble-quotes\nstring"""'
+71,9-71,10:         NEWLINE        '\n'
+73,0-75,3:          STRING         'r\'\'\'\nBytes\n\'\'\''
+75,3-75,4:          NEWLINE        '\n'
+77,0-80,3:          STRING         'U"""\nRaw\nUnicode\n"""'
+80,3-80,4:          NEWLINE        '\n'
+82,0-82,19:         COMMENT        '#Decorated function'
+83,0-83,1:          AT             '@'
+83,1-83,5:          NAME           'deco'
+83,5-83,6:          NEWLINE        '\n'
+84,0-84,3:          NAME           'def'
+84,4-84,5:          NAME           'f'
+84,5-84,6:          LPAR           '('
+84,6-84,7:          RPAR           ')'
+84,7-84,8:          COLON          ':'
+84,8-84,9:          NEWLINE        '\n'
+85,0-85,4:          INDENT         '    '
+85,4-85,8:          NAME           'pass'
+85,8-85,9:          NEWLINE        '\n'
+87,0-87,32:         COMMENT        '#Inner function (see ODASA-1774)'
+88,0-88,0:          DEDENT         ''
+88,0-88,3:          NAME           'def'
+88,4-88,9:          NAME           'outer'
+88,9-88,10:         LPAR           '('
+88,10-88,11:        RPAR           ')'
+88,11-88,12:        COLON          ':'
+88,12-88,13:        NEWLINE        '\n'
+89,0-89,4:          INDENT         '    '
+89,4-89,7:          NAME           'def'
+89,8-89,13:         NAME           'inner'
+89,13-89,14:        LPAR           '('
+89,14-89,15:        RPAR           ')'
+89,15-89,16:        COLON          ':'
+89,16-89,17:        NEWLINE        '\n'
+90,0-90,8:          INDENT         '        '
+90,8-90,12:         NAME           'pass'
+90,12-90,13:        NEWLINE        '\n'
+92,0-92,29:         COMMENT        '#Oddly laid out comprehension'
+93,0-93,0:          DEDENT         ''
+93,0-93,0:          DEDENT         ''
+93,0-93,1:          LSQB           '['
+93,1-93,2:          LSQB           '['
+94,2-94,3:          NAME           'x'
+94,4-94,7:          NAME           'for'
+94,8-94,9:          NAME           'x'
+94,10-94,12:        NAME           'in'
+94,13-94,14:        NAME           'y'
+95,2-95,3:          RSQB           ']'
+97,2-97,5:          NAME           'for'
+97,6-97,7:          NAME           'a'
+97,8-97,10:         NAME           'in'
+97,11-97,12:        NAME           'b'
+98,0-98,1:          RSQB           ']'
+98,1-98,2:          NEWLINE        '\n'
+100,0-100,25:       COMMENT        '#Nested binary operations'
+101,0-101,7:        STRING         '"Hello"'
+101,8-101,9:        OP             '+'
+101,10-101,13:      STRING         '" "'
+101,14-101,15:      OP             '+'
+101,16-101,23:      STRING         '"world"'
+101,23-101,24:      NEWLINE        '\n'
+102,0-102,1:        NUMBER         '1'
+102,1-102,2:        OP             '+'
+102,2-102,3:        NUMBER         '2'
+102,3-102,4:        OP             '+'
+102,4-102,5:        NAME           'f'
+102,5-102,6:        LPAR           '('
+102,6-102,7:        RPAR           ')'
+102,7-102,8:        NEWLINE        '\n'
+103,0-103,1:        NUMBER         '1'
+103,1-103,2:        OP             '+'
+103,2-103,3:        LPAR           '('
+103,3-103,4:        NUMBER         '2'
+103,4-103,5:        OP             '+'
+103,5-103,6:        NUMBER         '3'
+103,6-103,7:        RPAR           ')'
+103,7-103,8:        NEWLINE        '\n'
+105,0-105,12:       COMMENT        '# operations'
+106,0-106,1:        NAME           'a'
+106,1-106,2:        OP             '|'
+106,2-106,3:        NAME           'b'
+106,3-106,4:        OP             '&'
+106,4-106,5:        NAME           'c'
+106,5-106,6:        OP             '+'
+106,6-106,7:        NAME           'd'
+106,7-106,8:        OP             '-'
+106,8-106,9:        NAME           'e'
+106,9-106,10:       NEWLINE        '\n'
+107,0-107,1:        NAME           'x'
+107,1-107,2:        OP             '*'
+107,2-107,3:        NAME           'f'
+107,3-107,4:        OP             '%'
+107,4-107,5:        NAME           'g'
+107,5-107,6:        OP             '^'
+107,6-107,7:        NAME           'h'
+107,7-107,8:        AT             '@'
+107,8-107,9:        NAME           'j'
+107,9-107,11:       OP             '**'
+107,11-107,12:      NAME           'k'
+107,12-107,13:      NEWLINE        '\n'
+109,0-109,18:       COMMENT        '#Augmented assigns'
+110,0-110,1:        NAME           'a'
+110,2-110,4:        OP             '@='
+110,5-110,6:        NAME           'b'
+110,6-110,7:        NEWLINE        '\n'
+111,0-111,1:        NAME           'a'
+111,2-111,4:        OP             '|='
+111,5-111,6:        NAME           'b'
+111,6-111,7:        NEWLINE        '\n'
+112,0-112,1:        NAME           'a'
+112,2-112,4:        OP             '*='
+112,5-112,6:        NAME           'b'
+112,6-112,7:        NEWLINE        '\n'
+114,0-114,1:        OP             '~'
+114,1-114,2:        NAME           'a'
+114,2-114,3:        NEWLINE        '\n'
+116,0-116,12:       COMMENT        '#Comparisons'
+117,0-117,1:        OP             '<'
+117,1-117,2:        NEWLINE        '\n'
+118,0-118,1:        OP             '>'
+118,1-118,2:        NEWLINE        '\n'
+119,0-119,2:        OP             '<='
+119,2-119,3:        NEWLINE        '\n'
+120,0-120,2:        OP             '>='
+120,2-120,3:        NEWLINE        '\n'
+121,0-121,2:        OP             '!='
+121,2-121,3:        NEWLINE        '\n'
+122,0-122,2:        OP             '=='
+122,2-122,3:        NEWLINE        '\n'
+123,0-123,2:        NAME           'is'
+123,2-123,3:        NEWLINE        '\n'
+124,0-124,2:        NAME           'is'
+124,3-124,6:        NAME           'not'
+124,6-124,7:        NEWLINE        '\n'
+126,0-126,1:        LPAR           '('
+126,1-127,3:        STRING         '"""\n"""'
+127,3-127,4:        RPAR           ')'
+127,4-127,5:        NEWLINE        '\n'
+128,0-128,3:        NAME           'del'
+128,4-128,5:        NAME           'x'
+128,5-128,6:        NEWLINE        '\n'
+130,0-130,1:        BACKQUOTE      '`'
+130,1-130,10:       NAME           'backticks'
+130,10-130,11:      BACKQUOTE      '`'
+130,11-130,12:      NEWLINE        '\n'
+132,0-132,1:        NAME           'x'
+132,3-132,4:        COLONEQUAL     ':='
+132,5-132,6:        NAME           'y'
+132,6-132,7:        NEWLINE        '\n'
+134,0-134,1:        NUMBER         '1'
+134,2-134,4:        OP             '<>'
+134,5-134,6:        NUMBER         '2'
+134,6-134,7:        NEWLINE        '\n'
+135,0-135,0:        ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/close_brace.py
+++ b/python/extractor/tests/tokenizer/close_brace.py
@@ -0,0 +1,3 @@
+}
+)
+]
--- a/python/extractor/tests/tokenizer/close_brace.tokens
+++ b/python/extractor/tests/tokenizer/close_brace.tokens
@@ -0,0 +1,7 @@
+1,0-1,1:            RBRACE         '}'
+1,1-1,2:            NEWLINE        '\n'
+2,0-2,1:            RPAR           ')'
+2,1-2,2:            NEWLINE        '\n'
+3,0-3,1:            RSQB           ']'
+3,1-3,2:            NEWLINE        '\n'
+4,0-4,0:            ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/comments.py
+++ b/python/extractor/tests/tokenizer/comments.py
@@ -0,0 +1,13 @@
+
+import sys
+
+def f():
+    code-here # Line end comment
+    #Indented comment
+#Unindented comment
+    return 1
+
+def g(arg):
+    return arg
+
+x = g(f())
--- a/python/extractor/tests/tokenizer/comments.tokens
+++ b/python/extractor/tests/tokenizer/comments.tokens
@@ -0,0 +1,43 @@
+2,0-2,6:            NAME           'import'
+2,7-2,10:           NAME           'sys'
+2,10-2,11:          NEWLINE        '\n'
+4,0-4,3:            NAME           'def'
+4,4-4,5:            NAME           'f'
+4,5-4,6:            LPAR           '('
+4,6-4,7:            RPAR           ')'
+4,7-4,8:            COLON          ':'
+4,8-4,9:            NEWLINE        '\n'
+5,0-5,4:            INDENT         '    '
+5,4-5,8:            NAME           'code'
+5,8-5,9:            OP             '-'
+5,9-5,13:           NAME           'here'
+5,14-5,32:          COMMENT        '# Line end comment'
+5,32-5,33:          NEWLINE        '\n'
+6,4-6,21:           COMMENT        '#Indented comment'
+7,0-7,19:           COMMENT        '#Unindented comment'
+8,4-8,10:           NAME           'return'
+8,11-8,12:          NUMBER         '1'
+8,12-8,13:          NEWLINE        '\n'
+10,0-10,0:          DEDENT         ''
+10,0-10,3:          NAME           'def'
+10,4-10,5:          NAME           'g'
+10,5-10,6:          LPAR           '('
+10,6-10,9:          NAME           'arg'
+10,9-10,10:         RPAR           ')'
+10,10-10,11:        COLON          ':'
+10,11-10,12:        NEWLINE        '\n'
+11,0-11,4:          INDENT         '    '
+11,4-11,10:         NAME           'return'
+11,11-11,14:        NAME           'arg'
+11,14-11,15:        NEWLINE        '\n'
+13,0-13,0:          DEDENT         ''
+13,0-13,1:          NAME           'x'
+13,2-13,3:          OP             '='
+13,4-13,5:          NAME           'g'
+13,5-13,6:          LPAR           '('
+13,6-13,7:          NAME           'f'
+13,7-13,8:          LPAR           '('
+13,8-13,9:          RPAR           ')'
+13,9-13,10:         RPAR           ')'
+13,10-13,11:        NEWLINE        '\n'
+14,0-14,0:          ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/continuation.py
+++ b/python/extractor/tests/tokenizer/continuation.py
@@ -0,0 +1,5 @@
+def foo():
+    pass \
+\
+\
+
--- a/python/extractor/tests/tokenizer/continuation.tokens
+++ b/python/extractor/tests/tokenizer/continuation.tokens
@@ -0,0 +1,11 @@
+1,0-1,3:            NAME           'def'
+1,4-1,7:            NAME           'foo'
+1,7-1,8:            LPAR           '('
+1,8-1,9:            RPAR           ')'
+1,9-1,10:           COLON          ':'
+1,10-1,11:          NEWLINE        '\n'
+2,0-2,4:            INDENT         '    '
+2,4-2,8:            NAME           'pass'
+5,0-5,1:            NEWLINE        '\n'
+6,0-6,0:            DEDENT         ''
+6,0-6,0:            ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/dollar.py
+++ b/python/extractor/tests/tokenizer/dollar.py
@@ -0,0 +1,2 @@
+$name
+$ßðđ0
--- a/python/extractor/tests/tokenizer/dollar.tokens
+++ b/python/extractor/tests/tokenizer/dollar.tokens
@@ -0,0 +1,5 @@
+1,0-1,5:            DOLLARNAME     '$name'
+1,5-1,6:            NEWLINE        '\n'
+2,0-2,5:            DOLLARNAME     '$ßðđ0'
+2,5-2,6:            NEWLINE        '\n'
+3,0-3,0:            ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/dots.py
+++ b/python/extractor/tests/tokenizer/dots.py
@@ -0,0 +1,4 @@
+.
+..
+...
+....
--- a/python/extractor/tests/tokenizer/dots.tokens
+++ b/python/extractor/tests/tokenizer/dots.tokens
@@ -0,0 +1,15 @@
+1,0-1,1:            DOT            '.'
+1,1-1,2:            NEWLINE        '\n'
+2,0-2,1:            DOT            '.'
+2,1-2,2:            DOT            '.'
+2,2-2,3:            NEWLINE        '\n'
+3,0-3,1:            DOT            '.'
+3,1-3,2:            DOT            '.'
+3,2-3,3:            DOT            '.'
+3,3-3,4:            NEWLINE        '\n'
+4,0-4,1:            DOT            '.'
+4,1-4,2:            DOT            '.'
+4,2-4,3:            DOT            '.'
+4,3-4,4:            DOT            '.'
+4,4-4,5:            NEWLINE        '\n'
+5,0-5,0:            ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/emoji.py
+++ b/python/extractor/tests/tokenizer/emoji.py
@@ -0,0 +1,2 @@
+"👦👦🏻👦🏼👦🏽👦🏾👦🏿👧👧🏻👧🏼👧🏽👧🏾👧🏿"
+"😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏"
--- a/python/extractor/tests/tokenizer/emoji.tokens
+++ b/python/extractor/tests/tokenizer/emoji.tokens
@@ -0,0 +1,5 @@
+1,0-1,24:           STRING         '"👦👦🏻👦🏼👦🏽👦🏾👦🏿👧👧🏻👧🏼👧🏽👧🏾👧🏿"'
+1,24-1,25:          NEWLINE        '\n'
+2,0-2,18:           STRING         '"😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏"'
+2,18-2,19:          NEWLINE        '\n'
+3,0-3,0:            ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/feeds.py
+++ b/python/extractor/tests/tokenizer/feeds.py
@@ -0,0 +1,4 @@
+
+
+
+name
--- a/python/extractor/tests/tokenizer/feeds.tokens
+++ b/python/extractor/tests/tokenizer/feeds.tokens
@@ -0,0 +1,3 @@
+4,0-4,4:            NAME           'name'
+4,4-4,5:            NEWLINE        '\n'
+5,0-5,0:            ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/gen_tokens.py
+++ b/python/extractor/tests/tokenizer/gen_tokens.py
@@ -0,0 +1,38 @@
+import sys
+import tokenize
+import token
+
+def printtoken(type, token, start, end, _): 
+    # Use Python 3 tokenize style output, regardless of version
+    if tokenize.tok_name[type] not in ("ENCODING", "NL"):
+        token_range = "%d,%d-%d,%d:" % (start + end)
+        print("%-20s%-15s%r" %
+            (token_range, tokenize.tok_name[type], token)
+        )
+
+OP_TYPES = {
+    "(" : token.LPAR,
+    ")" : token.RPAR,
+    "[" : token.LSQB,
+    "]" : token.RSQB,
+    "{" : token.LBRACE,
+    "}" : token.RBRACE,
+    ":" : token.COLON,
+    "," : token.COMMA,
+    "." : token.DOT,
+    "@" : token.AT,
+    }
+
+def main():
+    readline = open(sys.argv[1], "rb").readline
+    if sys.version < "3":
+        tokenize.tokenize(readline, printtoken)
+    else:
+        for type, token, start, end, _ in tokenize.tokenize(readline):
+            if tokenize.tok_name[type] == "OP":
+                type = OP_TYPES.get(token, type)
+            if tokenize.tok_name[type] not in ("ENCODING", "NL"):
+                printtoken(type, token, start, end, _)
+
+if __name__ == "__main__":
+    main()
--- a/python/extractor/tests/tokenizer/gen_tokens.tokens
+++ b/python/extractor/tests/tokenizer/gen_tokens.tokens
@@ -0,0 +1,275 @@
+1,0-1,6:            NAME           'import'
+1,7-1,10:           NAME           'sys'
+1,10-1,11:          NEWLINE        '\n'
+2,0-2,6:            NAME           'import'
+2,7-2,15:           NAME           'tokenize'
+2,15-2,16:          NEWLINE        '\n'
+3,0-3,6:            NAME           'import'
+3,7-3,12:           NAME           'token'
+3,12-3,13:          NEWLINE        '\n'
+5,0-5,3:            NAME           'def'
+5,4-5,14:           NAME           'printtoken'
+5,14-5,15:          LPAR           '('
+5,15-5,19:          NAME           'type'
+5,19-5,20:          COMMA          ','
+5,21-5,26:          NAME           'token'
+5,26-5,27:          COMMA          ','
+5,28-5,33:          NAME           'start'
+5,33-5,34:          COMMA          ','
+5,35-5,38:          NAME           'end'
+5,38-5,39:          COMMA          ','
+5,40-5,41:          NAME           '_'
+5,41-5,42:          RPAR           ')'
+5,42-5,43:          COLON          ':'
+5,44-5,45:          NEWLINE        '\n'
+6,4-6,63:           COMMENT        '# Use Python 3 tokenize style output, regardless of version'
+7,0-7,4:            INDENT         '    '
+7,4-7,6:            NAME           'if'
+7,7-7,15:           NAME           'tokenize'
+7,15-7,16:          DOT            '.'
+7,16-7,24:          NAME           'tok_name'
+7,24-7,25:          LSQB           '['
+7,25-7,29:          NAME           'type'
+7,29-7,30:          RSQB           ']'
+7,31-7,34:          NAME           'not'
+7,35-7,37:          NAME           'in'
+7,38-7,39:          LPAR           '('
+7,39-7,49:          STRING         '"ENCODING"'
+7,49-7,50:          COMMA          ','
+7,51-7,55:          STRING         '"NL"'
+7,55-7,56:          RPAR           ')'
+7,56-7,57:          COLON          ':'
+7,57-7,58:          NEWLINE        '\n'
+8,0-8,8:            INDENT         '        '
+8,8-8,19:           NAME           'token_range'
+8,20-8,21:          OP             '='
+8,22-8,36:          STRING         '"%d,%d-%d,%d:"'
+8,37-8,38:          OP             '%'
+8,39-8,40:          LPAR           '('
+8,40-8,45:          NAME           'start'
+8,46-8,47:          OP             '+'
+8,48-8,51:          NAME           'end'
+8,51-8,52:          RPAR           ')'
+8,52-8,53:          NEWLINE        '\n'
+9,8-9,13:           NAME           'print'
+9,13-9,14:          LPAR           '('
+9,14-9,28:          STRING         '"%-20s%-15s%r"'
+9,29-9,30:          OP             '%'
+10,12-10,13:        LPAR           '('
+10,13-10,24:        NAME           'token_range'
+10,24-10,25:        COMMA          ','
+10,26-10,34:        NAME           'tokenize'
+10,34-10,35:        DOT            '.'
+10,35-10,43:        NAME           'tok_name'
+10,43-10,44:        LSQB           '['
+10,44-10,48:        NAME           'type'
+10,48-10,49:        RSQB           ']'
+10,49-10,50:        COMMA          ','
+10,51-10,56:        NAME           'token'
+10,56-10,57:        RPAR           ')'
+11,8-11,9:          RPAR           ')'
+11,9-11,10:         NEWLINE        '\n'
+13,0-13,0:          DEDENT         ''
+13,0-13,0:          DEDENT         ''
+13,0-13,8:          NAME           'OP_TYPES'
+13,9-13,10:         OP             '='
+13,11-13,12:        LBRACE         '{'
+14,4-14,7:          STRING         '"("'
+14,8-14,9:          COLON          ':'
+14,10-14,15:        NAME           'token'
+14,15-14,16:        DOT            '.'
+14,16-14,20:        NAME           'LPAR'
+14,20-14,21:        COMMA          ','
+15,4-15,7:          STRING         '")"'
+15,8-15,9:          COLON          ':'
+15,10-15,15:        NAME           'token'
+15,15-15,16:        DOT            '.'
+15,16-15,20:        NAME           'RPAR'
+15,20-15,21:        COMMA          ','
+16,4-16,7:          STRING         '"["'
+16,8-16,9:          COLON          ':'
+16,10-16,15:        NAME           'token'
+16,15-16,16:        DOT            '.'
+16,16-16,20:        NAME           'LSQB'
+16,20-16,21:        COMMA          ','
+17,4-17,7:          STRING         '"]"'
+17,8-17,9:          COLON          ':'
+17,10-17,15:        NAME           'token'
+17,15-17,16:        DOT            '.'
+17,16-17,20:        NAME           'RSQB'
+17,20-17,21:        COMMA          ','
+18,4-18,7:          STRING         '"{"'
+18,8-18,9:          COLON          ':'
+18,10-18,15:        NAME           'token'
+18,15-18,16:        DOT            '.'
+18,16-18,22:        NAME           'LBRACE'
+18,22-18,23:        COMMA          ','
+19,4-19,7:          STRING         '"}"'
+19,8-19,9:          COLON          ':'
+19,10-19,15:        NAME           'token'
+19,15-19,16:        DOT            '.'
+19,16-19,22:        NAME           'RBRACE'
+19,22-19,23:        COMMA          ','
+20,4-20,7:          STRING         '":"'
+20,8-20,9:          COLON          ':'
+20,10-20,15:        NAME           'token'
+20,15-20,16:        DOT            '.'
+20,16-20,21:        NAME           'COLON'
+20,21-20,22:        COMMA          ','
+21,4-21,7:          STRING         '","'
+21,8-21,9:          COLON          ':'
+21,10-21,15:        NAME           'token'
+21,15-21,16:        DOT            '.'
+21,16-21,21:        NAME           'COMMA'
+21,21-21,22:        COMMA          ','
+22,4-22,7:          STRING         '"."'
+22,8-22,9:          COLON          ':'
+22,10-22,15:        NAME           'token'
+22,15-22,16:        DOT            '.'
+22,16-22,19:        NAME           'DOT'
+22,19-22,20:        COMMA          ','
+23,4-23,7:          STRING         '"@"'
+23,8-23,9:          COLON          ':'
+23,10-23,15:        NAME           'token'
+23,15-23,16:        DOT            '.'
+23,16-23,18:        NAME           'AT'
+23,18-23,19:        COMMA          ','
+24,4-24,5:          RBRACE         '}'
+24,5-24,6:          NEWLINE        '\n'
+26,0-26,3:          NAME           'def'
+26,4-26,8:          NAME           'main'
+26,8-26,9:          LPAR           '('
+26,9-26,10:         RPAR           ')'
+26,10-26,11:        COLON          ':'
+26,11-26,12:        NEWLINE        '\n'
+27,0-27,4:          INDENT         '    '
+27,4-27,12:         NAME           'readline'
+27,13-27,14:        OP             '='
+27,15-27,19:        NAME           'open'
+27,19-27,20:        LPAR           '('
+27,20-27,23:        NAME           'sys'
+27,23-27,24:        DOT            '.'
+27,24-27,28:        NAME           'argv'
+27,28-27,29:        LSQB           '['
+27,29-27,30:        NUMBER         '1'
+27,30-27,31:        RSQB           ']'
+27,31-27,32:        COMMA          ','
+27,33-27,37:        STRING         '"rb"'
+27,37-27,38:        RPAR           ')'
+27,38-27,39:        DOT            '.'
+27,39-27,47:        NAME           'readline'
+27,47-27,48:        NEWLINE        '\n'
+28,4-28,6:          NAME           'if'
+28,7-28,10:         NAME           'sys'
+28,10-28,11:        DOT            '.'
+28,11-28,18:        NAME           'version'
+28,19-28,20:        OP             '<'
+28,21-28,24:        STRING         '"3"'
+28,24-28,25:        COLON          ':'
+28,25-28,26:        NEWLINE        '\n'
+29,0-29,8:          INDENT         '        '
+29,8-29,16:         NAME           'tokenize'
+29,16-29,17:        DOT            '.'
+29,17-29,25:        NAME           'tokenize'
+29,25-29,26:        LPAR           '('
+29,26-29,34:        NAME           'readline'
+29,34-29,35:        COMMA          ','
+29,36-29,46:        NAME           'printtoken'
+29,46-29,47:        RPAR           ')'
+29,47-29,48:        NEWLINE        '\n'
+30,4-30,4:          DEDENT         ''
+30,4-30,8:          NAME           'else'
+30,8-30,9:          COLON          ':'
+30,9-30,10:         NEWLINE        '\n'
+31,0-31,8:          INDENT         '        '
+31,8-31,11:         NAME           'for'
+31,12-31,16:        NAME           'type'
+31,16-31,17:        COMMA          ','
+31,18-31,23:        NAME           'token'
+31,23-31,24:        COMMA          ','
+31,25-31,30:        NAME           'start'
+31,30-31,31:        COMMA          ','
+31,32-31,35:        NAME           'end'
+31,35-31,36:        COMMA          ','
+31,37-31,38:        NAME           '_'
+31,39-31,41:        NAME           'in'
+31,42-31,50:        NAME           'tokenize'
+31,50-31,51:        DOT            '.'
+31,51-31,59:        NAME           'tokenize'
+31,59-31,60:        LPAR           '('
+31,60-31,68:        NAME           'readline'
+31,68-31,69:        RPAR           ')'
+31,69-31,70:        COLON          ':'
+31,70-31,71:        NEWLINE        '\n'
+32,0-32,12:         INDENT         '            '
+32,12-32,14:        NAME           'if'
+32,15-32,23:        NAME           'tokenize'
+32,23-32,24:        DOT            '.'
+32,24-32,32:        NAME           'tok_name'
+32,32-32,33:        LSQB           '['
+32,33-32,37:        NAME           'type'
+32,37-32,38:        RSQB           ']'
+32,39-32,41:        OP             '=='
+32,42-32,46:        STRING         '"OP"'
+32,46-32,47:        COLON          ':'
+32,47-32,48:        NEWLINE        '\n'
+33,0-33,16:         INDENT         '                '
+33,16-33,20:        NAME           'type'
+33,21-33,22:        OP             '='
+33,23-33,31:        NAME           'OP_TYPES'
+33,31-33,32:        DOT            '.'
+33,32-33,35:        NAME           'get'
+33,35-33,36:        LPAR           '('
+33,36-33,41:        NAME           'token'
+33,41-33,42:        COMMA          ','
+33,43-33,47:        NAME           'type'
+33,47-33,48:        RPAR           ')'
+33,48-33,49:        NEWLINE        '\n'
+34,12-34,12:        DEDENT         ''
+34,12-34,14:        NAME           'if'
+34,15-34,23:        NAME           'tokenize'
+34,23-34,24:        DOT            '.'
+34,24-34,32:        NAME           'tok_name'
+34,32-34,33:        LSQB           '['
+34,33-34,37:        NAME           'type'
+34,37-34,38:        RSQB           ']'
+34,39-34,42:        NAME           'not'
+34,43-34,45:        NAME           'in'
+34,46-34,47:        LPAR           '('
+34,47-34,57:        STRING         '"ENCODING"'
+34,57-34,58:        COMMA          ','
+34,59-34,63:        STRING         '"NL"'
+34,63-34,64:        RPAR           ')'
+34,64-34,65:        COLON          ':'
+34,65-34,66:        NEWLINE        '\n'
+35,0-35,16:         INDENT         '                '
+35,16-35,26:        NAME           'printtoken'
+35,26-35,27:        LPAR           '('
+35,27-35,31:        NAME           'type'
+35,31-35,32:        COMMA          ','
+35,33-35,38:        NAME           'token'
+35,38-35,39:        COMMA          ','
+35,40-35,45:        NAME           'start'
+35,45-35,46:        COMMA          ','
+35,47-35,50:        NAME           'end'
+35,50-35,51:        COMMA          ','
+35,52-35,53:        NAME           '_'
+35,53-35,54:        RPAR           ')'
+35,54-35,55:        NEWLINE        '\n'
+37,0-37,0:          DEDENT         ''
+37,0-37,0:          DEDENT         ''
+37,0-37,0:          DEDENT         ''
+37,0-37,0:          DEDENT         ''
+37,0-37,2:          NAME           'if'
+37,3-37,11:         NAME           '__name__'
+37,12-37,14:        OP             '=='
+37,15-37,25:        STRING         '"__main__"'
+37,25-37,26:        COLON          ':'
+37,26-37,27:        NEWLINE        '\n'
+38,0-38,4:          INDENT         '    '
+38,4-38,8:          NAME           'main'
+38,8-38,9:          LPAR           '('
+38,9-38,10:         RPAR           ')'
+38,10-38,11:        NEWLINE        '\n'
+39,0-39,0:          DEDENT         ''
+39,0-39,0:          ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/illegal_indentation.py
+++ b/python/extractor/tests/tokenizer/illegal_indentation.py
@@ -0,0 +1,4 @@
+def foo(seq):
+    for var in seq:
+        body
+      illegal-dedent
--- a/python/extractor/tests/tokenizer/illegal_indentation.tokens
+++ b/python/extractor/tests/tokenizer/illegal_indentation.tokens
@@ -0,0 +1,24 @@
+1,0-1,3:            NAME           'def'
+1,4-1,7:            NAME           'foo'
+1,7-1,8:            LPAR           '('
+1,8-1,11:           NAME           'seq'
+1,11-1,12:          RPAR           ')'
+1,12-1,13:          COLON          ':'
+1,13-1,14:          NEWLINE        '\n'
+2,0-2,4:            INDENT         '    '
+2,4-2,7:            NAME           'for'
+2,8-2,11:           NAME           'var'
+2,12-2,14:          NAME           'in'
+2,15-2,18:          NAME           'seq'
+2,18-2,19:          COLON          ':'
+2,19-2,20:          NEWLINE        '\n'
+3,0-3,8:            INDENT         '        '
+3,8-3,12:           NAME           'body'
+3,12-3,13:          NEWLINE        '\n'
+4,6-4,6:            ILLEGALINDENT  ''
+4,6-4,13:           NAME           'illegal'
+4,13-4,14:          OP             '-'
+4,14-4,20:          NAME           'dedent'
+4,20-4,21:          NEWLINE        '\n'
+5,0-5,0:            DEDENT         ''
+5,0-5,0:            ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/illegal_indentation2.py
+++ b/python/extractor/tests/tokenizer/illegal_indentation2.py
@@ -0,0 +1,7 @@
+class C:
+    def foo(seq):
+        for var in seq:
+            body
+      illegal
+     dedent
+    sequence
--- a/python/extractor/tests/tokenizer/illegal_indentation2.tokens
+++ b/python/extractor/tests/tokenizer/illegal_indentation2.tokens
@@ -0,0 +1,34 @@
+1,0-1,5:            NAME           'class'
+1,6-1,7:            NAME           'C'
+1,7-1,8:            COLON          ':'
+1,8-1,9:            NEWLINE        '\n'
+2,0-2,4:            INDENT         '    '
+2,4-2,7:            NAME           'def'
+2,8-2,11:           NAME           'foo'
+2,11-2,12:          LPAR           '('
+2,12-2,15:          NAME           'seq'
+2,15-2,16:          RPAR           ')'
+2,16-2,17:          COLON          ':'
+2,17-2,18:          NEWLINE        '\n'
+3,0-3,8:            INDENT         '        '
+3,8-3,11:           NAME           'for'
+3,12-3,15:          NAME           'var'
+3,16-3,18:          NAME           'in'
+3,19-3,22:          NAME           'seq'
+3,22-3,23:          COLON          ':'
+3,23-3,24:          NEWLINE        '\n'
+4,0-4,12:           INDENT         '            '
+4,12-4,16:          NAME           'body'
+4,16-4,17:          NEWLINE        '\n'
+5,6-5,6:            DEDENT         ''
+5,6-5,6:            ILLEGALINDENT  ''
+5,6-5,13:           NAME           'illegal'
+5,13-5,14:          NEWLINE        '\n'
+6,0-6,5:            INDENT         '     '
+6,5-6,11:           NAME           'dedent'
+6,11-6,12:          NEWLINE        '\n'
+7,4-7,4:            DEDENT         ''
+7,4-7,12:           NAME           'sequence'
+7,12-7,13:          NEWLINE        '\n'
+8,0-8,0:            DEDENT         ''
+8,0-8,0:            ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/import.py
+++ b/python/extractor/tests/tokenizer/import.py
@@ -0,0 +1,2 @@
+import a
+import why
--- a/python/extractor/tests/tokenizer/import.tokens
+++ b/python/extractor/tests/tokenizer/import.tokens
@@ -0,0 +1,7 @@
+1,0-1,6:            NAME           'import'
+1,7-1,8:            NAME           'a'
+1,8-1,9:            NEWLINE        '\n'
+2,0-2,6:            NAME           'import'
+2,7-2,10:           NAME           'why'
+2,10-2,11:          NEWLINE        '\n'
+3,0-3,0:            ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/kannada.py
+++ b/python/extractor/tests/tokenizer/kannada.py
@@ -0,0 +1,7 @@
+# -*- coding: utf-8 -*-
+
+def ಏನಾದರೂ_ಮಾಡು():
+    print('ಏನೋ ಮಾಡಿದೆ')
+
+
+ಏನಾದರೂ_ಮಾಡು()
--- a/python/extractor/tests/tokenizer/kannada.tokens
+++ b/python/extractor/tests/tokenizer/kannada.tokens
@@ -0,0 +1,19 @@
+1,0-1,23:           COMMENT        '# -*- coding: utf-8 -*-'
+3,0-3,3:            NAME           'def'
+3,4-3,15:           NAME           'ಏನಾದರೂ_ಮಾಡು'
+3,15-3,16:          LPAR           '('
+3,16-3,17:          RPAR           ')'
+3,17-3,18:          COLON          ':'
+3,18-3,19:          NEWLINE        '\n'
+4,0-4,4:            INDENT         '    '
+4,4-4,9:            NAME           'print'
+4,9-4,10:           LPAR           '('
+4,10-4,22:          STRING         '\'ಏನೋ ಮಾಡಿದೆ\''
+4,22-4,23:          RPAR           ')'
+4,23-4,24:          NEWLINE        '\n'
+7,0-7,0:            DEDENT         ''
+7,0-7,11:           NAME           'ಏನಾದರೂ_ಮಾಡು'
+7,11-7,12:          LPAR           '('
+7,12-7,13:          RPAR           ')'
+7,13-7,14:          NEWLINE        '\n'
+8,0-8,0:            ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/latin.py
+++ b/python/extractor/tests/tokenizer/latin.py
@@ -0,0 +1,4 @@
+"Any old stuff can go here"
+# -*- coding: latin1 -*-
+# G<>nter
+
--- a/python/extractor/tests/tokenizer/latin.tokens
+++ b/python/extractor/tests/tokenizer/latin.tokens
@@ -0,0 +1,5 @@
+1,0-1,27:           STRING         '"Any old stuff can go here"'
+1,27-1,28:          NEWLINE        '\n'
+2,0-2,24:           COMMENT        '# -*- coding: latin1 -*-'
+3,0-3,8:            COMMENT        '# Günter'
+5,0-5,0:            ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/numbers.py
+++ b/python/extractor/tests/tokenizer/numbers.py
@@ -0,0 +1,83 @@
+
+#Some negative numbers
+
+-1
+-10000000000000000
+-1.0
+-3.0e17
+
+-(1)
+-(10000000000000000)
+-(1.0)
+-(3.0e17)
+
+(-1)
+(-10000000000000000)
+(-1.0)
+(-3.0e17)
+
+-1j
+
+-3.7e12j
+
+#Some other numbers
+0.058823529630899429
+
+1e-06
+.9999999
+0xffffff
+1e10
+1.
+2.79252680
+0x0001000
+4987312561856745907287624786230562734672583763984576267
+
+#Octal both styles
+0777
+0o777
+
+#Python2 longs
+0
+0L
+5L
+-2L
+498731256185674590728762478623056L
+
+0xfffffffL
+0xeeeeeeeeeeeeeeeeL
+
+0b00010101011111111111L
+0o77777777777L
+0777777777777L
+0j
+0_0234j
+
+0e0
+
+#Valid uses of underscore:
+
+1_1
+1_2_3.4_5_6e7_8_9
+0b1_1
+0o1_1
+0x1_1
+
+0b_010
+0o_010
+0x_010
+
+#Invalid uses of underscore:
+
+1__3
+2e_5
+2e+_5
+123_
+
+#Valid prefixed zero:
+
+0_0
+009.
+009e005
+00123
+
+1 if 1else 0
--- a/python/extractor/tests/tokenizer/numbers.tokens
+++ b/python/extractor/tests/tokenizer/numbers.tokens
@@ -0,0 +1,156 @@
+2,0-2,22:           COMMENT        '#Some negative numbers'
+4,0-4,1:            OP             '-'
+4,1-4,2:            NUMBER         '1'
+4,2-4,3:            NEWLINE        '\n'
+5,0-5,1:            OP             '-'
+5,1-5,18:           NUMBER         '10000000000000000'
+5,18-5,19:          NEWLINE        '\n'
+6,0-6,1:            OP             '-'
+6,1-6,4:            NUMBER         '1.0'
+6,4-6,5:            NEWLINE        '\n'
+7,0-7,1:            OP             '-'
+7,1-7,7:            NUMBER         '3.0e17'
+7,7-7,8:            NEWLINE        '\n'
+9,0-9,1:            OP             '-'
+9,1-9,2:            LPAR           '('
+9,2-9,3:            NUMBER         '1'
+9,3-9,4:            RPAR           ')'
+9,4-9,5:            NEWLINE        '\n'
+10,0-10,1:          OP             '-'
+10,1-10,2:          LPAR           '('
+10,2-10,19:         NUMBER         '10000000000000000'
+10,19-10,20:        RPAR           ')'
+10,20-10,21:        NEWLINE        '\n'
+11,0-11,1:          OP             '-'
+11,1-11,2:          LPAR           '('
+11,2-11,5:          NUMBER         '1.0'
+11,5-11,6:          RPAR           ')'
+11,6-11,7:          NEWLINE        '\n'
+12,0-12,1:          OP             '-'
+12,1-12,2:          LPAR           '('
+12,2-12,8:          NUMBER         '3.0e17'
+12,8-12,9:          RPAR           ')'
+12,9-12,10:         NEWLINE        '\n'
+14,0-14,1:          LPAR           '('
+14,1-14,2:          OP             '-'
+14,2-14,3:          NUMBER         '1'
+14,3-14,4:          RPAR           ')'
+14,4-14,5:          NEWLINE        '\n'
+15,0-15,1:          LPAR           '('
+15,1-15,2:          OP             '-'
+15,2-15,19:         NUMBER         '10000000000000000'
+15,19-15,20:        RPAR           ')'
+15,20-15,21:        NEWLINE        '\n'
+16,0-16,1:          LPAR           '('
+16,1-16,2:          OP             '-'
+16,2-16,5:          NUMBER         '1.0'
+16,5-16,6:          RPAR           ')'
+16,6-16,7:          NEWLINE        '\n'
+17,0-17,1:          LPAR           '('
+17,1-17,2:          OP             '-'
+17,2-17,8:          NUMBER         '3.0e17'
+17,8-17,9:          RPAR           ')'
+17,9-17,10:         NEWLINE        '\n'
+19,0-19,1:          OP             '-'
+19,1-19,3:          NUMBER         '1j'
+19,3-19,4:          NEWLINE        '\n'
+21,0-21,1:          OP             '-'
+21,1-21,8:          NUMBER         '3.7e12j'
+21,8-21,9:          NEWLINE        '\n'
+23,0-23,19:         COMMENT        '#Some other numbers'
+24,0-24,20:         NUMBER         '0.058823529630899429'
+24,20-24,21:        NEWLINE        '\n'
+26,0-26,5:          NUMBER         '1e-06'
+26,5-26,6:          NEWLINE        '\n'
+27,0-27,8:          NUMBER         '.9999999'
+27,8-27,9:          NEWLINE        '\n'
+28,0-28,8:          NUMBER         '0xffffff'
+28,8-28,9:          NEWLINE        '\n'
+29,0-29,4:          NUMBER         '1e10'
+29,4-29,5:          NEWLINE        '\n'
+30,0-30,2:          NUMBER         '1.'
+30,2-30,3:          NEWLINE        '\n'
+31,0-31,10:         NUMBER         '2.79252680'
+31,10-31,11:        NEWLINE        '\n'
+32,0-32,9:          NUMBER         '0x0001000'
+32,9-32,10:         NEWLINE        '\n'
+33,0-33,55:         NUMBER         '4987312561856745907287624786230562734672583763984576267'
+33,55-33,56:        NEWLINE        '\n'
+35,0-35,18:         COMMENT        '#Octal both styles'
+36,0-36,4:          NUMBER         '0777'
+36,4-36,5:          NEWLINE        '\n'
+37,0-37,5:          NUMBER         '0o777'
+37,5-37,6:          NEWLINE        '\n'
+39,0-39,14:         COMMENT        '#Python2 longs'
+40,0-40,1:          NUMBER         '0'
+40,1-40,2:          NEWLINE        '\n'
+41,0-41,2:          NUMBER         '0L'
+41,2-41,3:          NEWLINE        '\n'
+42,0-42,2:          NUMBER         '5L'
+42,2-42,3:          NEWLINE        '\n'
+43,0-43,1:          OP             '-'
+43,1-43,3:          NUMBER         '2L'
+43,3-43,4:          NEWLINE        '\n'
+44,0-44,34:         NUMBER         '498731256185674590728762478623056L'
+44,34-44,35:        NEWLINE        '\n'
+46,0-46,10:         NUMBER         '0xfffffffL'
+46,10-46,11:        NEWLINE        '\n'
+47,0-47,19:         NUMBER         '0xeeeeeeeeeeeeeeeeL'
+47,19-47,20:        NEWLINE        '\n'
+49,0-49,23:         NUMBER         '0b00010101011111111111L'
+49,23-49,24:        NEWLINE        '\n'
+50,0-50,14:         NUMBER         '0o77777777777L'
+50,14-50,15:        NEWLINE        '\n'
+51,0-51,14:         NUMBER         '0777777777777L'
+51,14-51,15:        NEWLINE        '\n'
+52,0-52,2:          NUMBER         '0j'
+52,2-52,3:          NEWLINE        '\n'
+53,0-53,7:          NUMBER         '0_0234j'
+53,7-53,8:          NEWLINE        '\n'
+55,0-55,3:          NUMBER         '0e0'
+55,3-55,4:          NEWLINE        '\n'
+57,0-57,26:         COMMENT        '#Valid uses of underscore:'
+59,0-59,3:          NUMBER         '1_1'
+59,3-59,4:          NEWLINE        '\n'
+60,0-60,17:         NUMBER         '1_2_3.4_5_6e7_8_9'
+60,17-60,18:        NEWLINE        '\n'
+61,0-61,5:          NUMBER         '0b1_1'
+61,5-61,6:          NEWLINE        '\n'
+62,0-62,5:          NUMBER         '0o1_1'
+62,5-62,6:          NEWLINE        '\n'
+63,0-63,5:          NUMBER         '0x1_1'
+63,5-63,6:          NEWLINE        '\n'
+65,0-65,6:          NUMBER         '0b_010'
+65,6-65,7:          NEWLINE        '\n'
+66,0-66,6:          NUMBER         '0o_010'
+66,6-66,7:          NEWLINE        '\n'
+67,0-67,6:          NUMBER         '0x_010'
+67,6-67,7:          NEWLINE        '\n'
+69,0-69,28:         COMMENT        '#Invalid uses of underscore:'
+71,0-71,3:          ERRORTOKEN     '1__'
+71,3-71,4:          NUMBER         '3'
+71,4-71,5:          NEWLINE        '\n'
+72,0-72,3:          ERRORTOKEN     '2e_'
+72,3-72,4:          NUMBER         '5'
+72,4-72,5:          NEWLINE        '\n'
+73,0-73,4:          ERRORTOKEN     '2e+_'
+73,4-73,5:          NUMBER         '5'
+73,5-73,6:          NEWLINE        '\n'
+74,0-74,5:          ERRORTOKEN     '123_\n'
+74,5-74,6:          NEWLINE        '\n'
+75,0-75,21:         COMMENT        '#Valid prefixed zero:'
+77,0-77,3:          NUMBER         '0_0'
+77,3-77,4:          NEWLINE        '\n'
+78,0-78,4:          NUMBER         '009.'
+78,4-78,5:          NEWLINE        '\n'
+79,0-79,7:          NUMBER         '009e005'
+79,7-79,8:          NEWLINE        '\n'
+80,0-80,5:          NUMBER         '00123'
+80,5-80,6:          NEWLINE        '\n'
+82,0-82,1:          NUMBER         '1'
+82,2-82,4:          NAME           'if'
+82,5-82,6:          NUMBER         '1'
+82,6-82,10:         NAME           'else'
+82,11-82,12:        NUMBER         '0'
+82,12-82,13:        NEWLINE        '\n'
+83,0-83,0:          ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/pep484.py
+++ b/python/extractor/tests/tokenizer/pep484.py
@@ -0,0 +1,19 @@
+#PEP 484 style annotations.
+
+def func(callee_type: CallableType,
+         formal_to_actual: List[List[int]],
+         strict: bool = True) -> List[Type]:
+    pass
+
+
+def func(self,
+         name: str,
+         args: List[str],
+         *,
+         cwd: str = None,
+         env: Dict[str, str] = None) -> None:
+    pass
+
+def specials(self, *varargs: vanno, **kwargs: kwanno):
+    pass
+
--- a/python/extractor/tests/tokenizer/pep484.tokens
+++ b/python/extractor/tests/tokenizer/pep484.tokens
@@ -0,0 +1,100 @@
+1,0-1,27:           COMMENT        '#PEP 484 style annotations.'
+3,0-3,3:            NAME           'def'
+3,4-3,8:            NAME           'func'
+3,8-3,9:            LPAR           '('
+3,9-3,20:           NAME           'callee_type'
+3,20-3,21:          COLON          ':'
+3,22-3,34:          NAME           'CallableType'
+3,34-3,35:          COMMA          ','
+4,9-4,25:           NAME           'formal_to_actual'
+4,25-4,26:          COLON          ':'
+4,27-4,31:          NAME           'List'
+4,31-4,32:          LSQB           '['
+4,32-4,36:          NAME           'List'
+4,36-4,37:          LSQB           '['
+4,37-4,40:          NAME           'int'
+4,40-4,41:          RSQB           ']'
+4,41-4,42:          RSQB           ']'
+4,42-4,43:          COMMA          ','
+5,9-5,15:           NAME           'strict'
+5,15-5,16:          COLON          ':'
+5,17-5,21:          NAME           'bool'
+5,22-5,23:          OP             '='
+5,24-5,28:          NAME           'True'
+5,28-5,29:          RPAR           ')'
+5,30-5,32:          RARROW         '->'
+5,33-5,37:          NAME           'List'
+5,37-5,38:          LSQB           '['
+5,38-5,42:          NAME           'Type'
+5,42-5,43:          RSQB           ']'
+5,43-5,44:          COLON          ':'
+5,44-5,45:          NEWLINE        '\n'
+6,0-6,4:            INDENT         '    '
+6,4-6,8:            NAME           'pass'
+6,8-6,9:            NEWLINE        '\n'
+9,0-9,0:            DEDENT         ''
+9,0-9,3:            NAME           'def'
+9,4-9,8:            NAME           'func'
+9,8-9,9:            LPAR           '('
+9,9-9,13:           NAME           'self'
+9,13-9,14:          COMMA          ','
+10,9-10,13:         NAME           'name'
+10,13-10,14:        COLON          ':'
+10,15-10,18:        NAME           'str'
+10,18-10,19:        COMMA          ','
+11,9-11,13:         NAME           'args'
+11,13-11,14:        COLON          ':'
+11,15-11,19:        NAME           'List'
+11,19-11,20:        LSQB           '['
+11,20-11,23:        NAME           'str'
+11,23-11,24:        RSQB           ']'
+11,24-11,25:        COMMA          ','
+12,9-12,10:         OP             '*'
+12,10-12,11:        COMMA          ','
+13,9-13,12:         NAME           'cwd'
+13,12-13,13:        COLON          ':'
+13,14-13,17:        NAME           'str'
+13,18-13,19:        OP             '='
+13,20-13,24:        NAME           'None'
+13,24-13,25:        COMMA          ','
+14,9-14,12:         NAME           'env'
+14,12-14,13:        COLON          ':'
+14,14-14,18:        NAME           'Dict'
+14,18-14,19:        LSQB           '['
+14,19-14,22:        NAME           'str'
+14,22-14,23:        COMMA          ','
+14,24-14,27:        NAME           'str'
+14,27-14,28:        RSQB           ']'
+14,29-14,30:        OP             '='
+14,31-14,35:        NAME           'None'
+14,35-14,36:        RPAR           ')'
+14,37-14,39:        RARROW         '->'
+14,40-14,44:        NAME           'None'
+14,44-14,45:        COLON          ':'
+14,45-14,46:        NEWLINE        '\n'
+15,0-15,4:          INDENT         '    '
+15,4-15,8:          NAME           'pass'
+15,8-15,9:          NEWLINE        '\n'
+17,0-17,0:          DEDENT         ''
+17,0-17,3:          NAME           'def'
+17,4-17,12:         NAME           'specials'
+17,12-17,13:        LPAR           '('
+17,13-17,17:        NAME           'self'
+17,17-17,18:        COMMA          ','
+17,19-17,20:        OP             '*'
+17,20-17,27:        NAME           'varargs'
+17,27-17,28:        COLON          ':'
+17,29-17,34:        NAME           'vanno'
+17,34-17,35:        COMMA          ','
+17,36-17,38:        OP             '**'
+17,38-17,44:        NAME           'kwargs'
+17,44-17,45:        COLON          ':'
+17,46-17,52:        NAME           'kwanno'
+17,52-17,53:        RPAR           ')'
+17,53-17,54:        COLON          ':'
+17,54-17,55:        NEWLINE        '\n'
+18,0-18,4:          INDENT         '    '
+18,4-18,8:          NAME           'pass'
+18,8-18,9:          NEWLINE        '\n'
+20,0-20,0:          DEDENT         ''
+20,0-20,0:          ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/shift_jis.py
+++ b/python/extractor/tests/tokenizer/shift_jis.py
@@ -0,0 +1,11 @@
+# encoding:shift-jis
+
+#This is copied from the Python test library copyright PSF.
+
+"""
+Python <20>̊J<CC8A><4A><EFBFBD>́A1990 <20>N<EFBFBD><4E><EFBFBD>납<EFBFBD><EB82A9><EFBFBD>J<EFBFBD>n<EFBFBD><6E><EFBFBD><EFBFBD><EFBFBD>Ă<EFBFBD><C482>܂<EFBFBD><DC82>B
+<EFBFBD>J<EFBFBD><EFBFBD><EFBFBD>҂<EFBFBD> Guido van Rossum <20>͋<EFBFBD><CD8B><EFBFBD><EFBFBD>p<EFBFBD>̃v<CC83><76><EFBFBD>O<EFBFBD><4F><EFBFBD>~<7E><><EFBFBD>O<EFBFBD><4F><EFBFBD><EFBFBD><EFBFBD>uABC<42>v<EFBFBD>̊J<CC8A><4A><EFBFBD>ɎQ<C98E><51><EFBFBD><EFBFBD><EFBFBD>Ă<EFBFBD><C482>܂<EFBFBD><DC82><EFBFBD><EFBFBD><EFBFBD><EFBFBD>AABC <20>͎<EFBFBD><CD8E>p<EFBFBD><70><EFBFBD>̖ړI<DA93>ɂ͂<C982><CD82>܂<EFBFBD><DC82>K<EFBFBD><4B><EFBFBD>Ă<EFBFBD><C482>܂<EFBFBD><DC82><EFBFBD><EFBFBD>ł<EFBFBD><C582><EFBFBD><EFBFBD>B
+<EFBFBD><EFBFBD><EFBFBD>̂悤<EFBFBD>Ȕw<EFBFBD>i<EFBFBD><EFBFBD><EFBFBD>琶<EFBFBD>܂ꂽ Python <20>̌<EFBFBD><CC8C><EFBFBD><EFBFBD>݌v<DD8C>́A<CD81>u<EFBFBD>V<EFBFBD><56><EFBFBD>v<EFBFBD><76><EFBFBD>v<EFBFBD>Łu<C581>K<EFBFBD><4B><EFBFBD><EFBFBD><EFBFBD>e<EFBFBD>Ձv<D581>Ƃ<EFBFBD><C682><EFBFBD><EFBFBD>ڕW<DA95>ɏd<C98F>_<EFBFBD><5F><EFBFBD>u<EFBFBD><75><EFBFBD><EFBFBD><EFBFBD>Ă<EFBFBD><C482>܂<EFBFBD><DC82>B
+<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>̃X<EFBFBD>N<EFBFBD><EFBFBD><EFBFBD>v<EFBFBD>g<EFBFBD>n<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ł̓<EFBFBD><EFBFBD>[<5B>U<EFBFBD>̖ڐ<CC96><DA90>̗<EFBFBD><CC97>֐<EFBFBD><D690><EFBFBD><EFBFBD>D<EFBFBD>悵<EFBFBD>ĐF<C490>X<EFBFBD>ȋ@<40>\<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>v<EFBFBD>f<EFBFBD>Ƃ<EFBFBD><EFBFBD>Ď<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ꍇ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>̂ł<EFBFBD><EFBFBD><EFBFBD><EFBFBD>APython <20>ł͂<C582><CD82><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>׍H<D78D><48><EFBFBD>ǉ<EFBFBD><C789><EFBFBD><EFBFBD><EFBFBD><EFBFBD>邱<EFBFBD>Ƃ͂<C682><CD82>܂肠<DC82><E882A0><EFBFBD>܂<EFBFBD><DC82><EFBFBD><EFBFBD>B
+<EFBFBD><EFBFBD><EFBFBD>ꎩ<EFBFBD>̂̋@<40>\<EFBFBD>͍ŏ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ɉ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>A<EFBFBD>K<EFBFBD>v<EFBFBD>ȋ@<40>\<EFBFBD>͊g<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>W<EFBFBD><EFBFBD><EFBFBD>[<5B><><EFBFBD>Ƃ<EFBFBD><C682>Ēǉ<C492><C789><EFBFBD><EFBFBD><EFBFBD><EFBFBD>A<EFBFBD>Ƃ<EFBFBD><C682><EFBFBD><EFBFBD>̂<EFBFBD> Python <20>̃|<7C><><EFBFBD>V<EFBFBD>[<5B>ł<EFBFBD><C582>B
+"""
--- a/python/extractor/tests/tokenizer/shift_jis.tokens
+++ b/python/extractor/tests/tokenizer/shift_jis.tokens
@@ -0,0 +1,5 @@
+1,0-1,20:           COMMENT        '# encoding:shift-jis'
+3,0-3,59:           COMMENT        '#This is copied from the Python test library copyright PSF.'
+5,0-11,3:           STRING         '"""\nPython の開発は、1990 年ごろから開始されています。\n開発者の Guido van Rossum は教育用のプログラミング言語「ABC」の開発に参加していましたが、ABC は実用上の目的にはあまり適していませんでした。\nこのような背景から生まれた Python の言語設計は、「シンプル」で「習得が容易」という目標に重点が置かれています。\n多くのスクリプト系言語ではユーザの目先の利便性を優先して色々な機能を言語要素として取り入れる場合が多いのですが、Python ではそういった小細工が追加されることはあまりありません。\n言語自体の機能は最小限に押さえ、必要な機能は拡張モジュールとして追加する、というのが Python のポリシーです。\n"""'
+11,3-11,4:          NEWLINE        '\n'
+12,0-12,0:          ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/strings.py
+++ b/python/extractor/tests/tokenizer/strings.py
@@ -0,0 +1,112 @@
+
+
+#Raw
+
+r'012345678'
+r'(\033|~{)'
+r'\A[+-]?\d+'
+r'(?P<name>[\w]+)|'
+r'\|\[\][123]|\{\}'
+r'^.$'
+r'[^A-Z]'
+
+# With escapes
+
+'\n'
+"\'"
+'\''
+"\""
+"\t\l\b"
+
+
+#F-strings
+
+f''
+rf'hello'
+fr'hello'
+f'a{1+1}b'
+f'{x}{y}a{z}'
+#This is not legal in CPython, but we tokenize it anyway.
+f'a{'x'+"y"}b'
+
+#Multiline f-string
+f'''
+    In f-string expressions act as if parenthesised
+{
+    x +
+    y &
+      z
+}
+end
+'''
+
+#Multi-line
+
+
+r""" Single quotation character with multi-line
+
+"a", 'b', "", ''
+....
+"""
+
+r''' Single quotation character with multi-line
+
+"a", 'b', "", ''
+....
+'''
+
+#f-string conversions
+!a
+!s
+!r
+
+f"{k}={v!r}"
+
+#Implicit concatenation
+(f'{expr} text '
+    'continuation'
+    f'and{v}'
+)
+
+#prefixes
+
+u'{}\r{}{:<{width}}'
+u'{}\r{}{:<{}}'
+
+#f-strings with format specifier
+f'result: {value:0.2f}'
+f'result: {value:{width}.{precision}}'
+
+
+f"Too {'many' if alen > elen else 'few'} parameters for {cls};"
+
+# f-strings have special escaping rules for curly-brackets
+f'This should work \{foo}'
+rf'This should work \{foo}'
+
+f'\}' # syntax error (we currently don't report this)
+f'\}}' # ok
+
+
+# f-strings with unicode literals of the form `\N{...}`
+f'{degrees:0.0f}\N{DEGREE SIGN}'
+f"{degrees:0.0f}\N{DEGREE SIGN}"
+f'''{degrees:0.0f}\N{DEGREE SIGN}'''
+f"""{degrees:0.0f}\N{DEGREE SIGN}"""
+
+# double curlies in f-strings with various kinds of quoting
+f'{{ {foo} }}'
+f"{{ {foo} }}"
+f'''{{ {foo} }}'''
+f"""{{ {foo} }}"""
+
+# Empty f-strings
+f''
+f""
+f''''''
+f""""""
+
+
+r'\NUL' # _Not_ a named unicode escape (`\N{...}`)
+
+f'res: {val:{width:0}.{prec:1}}'
--- a/python/extractor/tests/tokenizer/strings.tokens
+++ b/python/extractor/tests/tokenizer/strings.tokens
@@ -0,0 +1,211 @@
+3,0-3,4:            COMMENT        '#Raw'
+5,0-5,12:           STRING         'r\'012345678\''
+5,12-5,13:          NEWLINE        '\n'
+6,0-6,12:           STRING         'r\'(\\033|~{)\''
+6,12-6,13:          NEWLINE        '\n'
+7,0-7,13:           STRING         'r\'\\A[+-]?\\d+\''
+7,13-7,14:          NEWLINE        '\n'
+8,0-8,19:           STRING         'r\'(?P<name>[\\w]+)|\''
+8,19-8,20:          NEWLINE        '\n'
+9,0-9,19:           STRING         'r\'\\|\\[\\][123]|\\{\\}\''
+9,19-9,20:          NEWLINE        '\n'
+10,0-10,6:          STRING         'r\'^.$\''
+10,6-10,7:          NEWLINE        '\n'
+11,0-11,9:          STRING         'r\'[^A-Z]\''
+11,9-11,10:         NEWLINE        '\n'
+13,0-13,14:         COMMENT        '# With escapes'
+15,0-15,4:          STRING         '\'\\n\''
+15,4-15,5:          NEWLINE        '\n'
+16,0-16,4:          STRING         '"\\\'"'
+16,4-16,5:          NEWLINE        '\n'
+17,0-17,4:          STRING         '\'\\\'\''
+17,4-17,5:          NEWLINE        '\n'
+18,0-18,4:          STRING         '"\\""'
+18,4-18,5:          NEWLINE        '\n'
+19,0-19,8:          STRING         '"\\t\\l\\b"'
+19,8-19,9:          NEWLINE        '\n'
+22,0-22,10:         COMMENT        '#F-strings'
+24,0-24,3:          STRING         'f\'\''
+24,3-24,4:          NEWLINE        '\n'
+25,0-25,9:          STRING         'rf\'hello\''
+25,9-25,10:         NEWLINE        '\n'
+26,0-26,9:          STRING         'fr\'hello\''
+26,9-26,10:         NEWLINE        '\n'
+27,0-27,4:          FSTRING_START  'f\'a{'
+27,4-27,5:          NUMBER         '1'
+27,5-27,6:          OP             '+'
+27,6-27,7:          NUMBER         '1'
+27,7-27,10:         FSTRING_END    '}b\''
+27,10-27,11:        NEWLINE        '\n'
+28,0-28,3:          FSTRING_START  'f\'{'
+28,3-28,4:          NAME           'x'
+28,4-28,6:          FSTRING_MID    '}{'
+28,6-28,7:          NAME           'y'
+28,7-28,10:         FSTRING_MID    '}a{'
+28,10-28,11:        NAME           'z'
+28,11-28,13:        FSTRING_END    '}\''
+28,13-28,14:        NEWLINE        '\n'
+29,0-29,57:         COMMENT        '#This is not legal in CPython, but we tokenize it anyway.'
+30,0-30,4:          FSTRING_START  'f\'a{'
+30,4-30,7:          STRING         '\'x\''
+30,7-30,8:          OP             '+'
+30,8-30,11:         STRING         '"y"'
+30,11-30,14:        FSTRING_END    '}b\''
+30,14-30,15:        NEWLINE        '\n'
+32,0-32,19:         COMMENT        '#Multiline f-string'
+33,0-35,1:          FSTRING_START  'f\'\'\'\n    In f-string expressions act as if parenthesised\n{'
+36,4-36,5:          NAME           'x'
+36,6-36,7:          OP             '+'
+37,4-37,5:          NAME           'y'
+37,6-37,7:          OP             '&'
+38,6-38,7:          NAME           'z'
+39,0-41,3:          FSTRING_END    '}\nend\n\'\'\''
+41,3-41,4:          NEWLINE        '\n'
+43,0-43,11:         COMMENT        '#Multi-line'
+46,0-50,3:          STRING         'r""" Single quotation character with multi-line\n\n"a", \'b\', "", \'\'\n....\n"""'
+50,3-50,4:          NEWLINE        '\n'
+52,0-56,3:          STRING         'r\'\'\' Single quotation character with multi-line\n\n"a", \'b\', "", \'\'\n....\n\'\'\''
+56,3-56,4:          NEWLINE        '\n'
+58,0-58,21:         COMMENT        '#f-string conversions'
+59,0-59,2:          CONVERSION     '!a'
+59,2-59,3:          NEWLINE        '\n'
+60,0-60,2:          CONVERSION     '!s'
+60,2-60,3:          NEWLINE        '\n'
+61,0-61,2:          CONVERSION     '!r'
+61,2-61,3:          NEWLINE        '\n'
+63,0-63,3:          FSTRING_START  'f"{'
+63,3-63,4:          NAME           'k'
+63,4-63,7:          FSTRING_MID    '}={'
+63,7-63,8:          NAME           'v'
+63,8-63,10:         CONVERSION     '!r'
+63,10-63,12:        FSTRING_END    '}"'
+63,12-63,13:        NEWLINE        '\n'
+65,0-65,23:         COMMENT        '#Implicit concatenation'
+66,0-66,1:          LPAR           '('
+66,1-66,4:          FSTRING_START  'f\'{'
+66,4-66,8:          NAME           'expr'
+66,8-66,16:         FSTRING_END    '} text \''
+67,4-67,18:         STRING         '\'continuation\''
+68,4-68,10:         FSTRING_START  'f\'and{'
+68,10-68,11:        NAME           'v'
+68,11-68,13:        FSTRING_END    '}\''
+69,0-69,1:          RPAR           ')'
+69,1-69,2:          NEWLINE        '\n'
+71,0-71,9:          COMMENT        '#prefixes'
+73,0-73,20:         STRING         'u\'{}\\r{}{:<{width}}\''
+73,20-73,21:        NEWLINE        '\n'
+74,0-74,15:         STRING         'u\'{}\\r{}{:<{}}\''
+74,15-74,16:        NEWLINE        '\n'
+76,0-76,32:         COMMENT        '#f-strings with format specifier'
+77,0-77,11:         FSTRING_START  'f\'result: {'
+77,11-77,16:        NAME           'value'
+77,16-77,17:        COLON          ':'
+77,17-77,21:        FSTRING_SPEC   '0.2f'
+77,21-77,23:        FSTRING_END    '}\''
+77,23-77,24:        NEWLINE        '\n'
+78,0-78,11:         FSTRING_START  'f\'result: {'
+78,11-78,16:        NAME           'value'
+78,16-78,17:        COLON          ':'
+78,17-78,18:        FSTRING_SPEC   '{'
+78,18-78,23:        NAME           'width'
+78,23-78,26:        FSTRING_SPEC   '}.{'
+78,26-78,35:        NAME           'precision'
+78,35-78,36:        FSTRING_SPEC   '}'
+78,36-78,38:        FSTRING_END    '}\''
+78,38-78,39:        NEWLINE        '\n'
+81,0-81,7:          FSTRING_START  'f"Too {'
+81,7-81,13:         STRING         '\'many\''
+81,14-81,16:        NAME           'if'
+81,17-81,21:        NAME           'alen'
+81,22-81,23:        OP             '>'
+81,24-81,28:        NAME           'elen'
+81,29-81,33:        NAME           'else'
+81,34-81,39:        STRING         '\'few\''
+81,39-81,57:        FSTRING_MID    '} parameters for {'
+81,57-81,60:        NAME           'cls'
+81,60-81,63:        FSTRING_END    '};"'
+81,63-81,64:        NEWLINE        '\n'
+83,0-83,58:         COMMENT        '# f-strings have special escaping rules for curly-brackets'
+84,0-84,21:         FSTRING_START  'f\'This should work \\{'
+84,21-84,24:        NAME           'foo'
+84,24-84,26:        FSTRING_END    '}\''
+84,26-84,27:        NEWLINE        '\n'
+85,0-85,22:         FSTRING_START  'rf\'This should work \\{'
+85,22-85,25:        NAME           'foo'
+85,25-85,27:        FSTRING_END    '}\''
+85,27-85,28:        NEWLINE        '\n'
+87,0-87,5:          STRING         'f\'\\}\''
+87,6-87,53:         COMMENT        '# syntax error (we currently don\'t report this)'
+87,53-87,54:        NEWLINE        '\n'
+88,0-88,6:          STRING         'f\'\\}}\''
+88,7-88,11:         COMMENT        '# ok'
+88,11-88,12:        NEWLINE        '\n'
+91,0-91,55:         COMMENT        '# f-strings with unicode literals of the form `\\N{...}`'
+92,0-92,3:          FSTRING_START  'f\'{'
+92,3-92,10:         NAME           'degrees'
+92,10-92,11:        COLON          ':'
+92,11-92,15:        FSTRING_SPEC   '0.0f'
+92,15-92,32:        FSTRING_END    '}\\N{DEGREE SIGN}\''
+92,32-92,33:        NEWLINE        '\n'
+93,0-93,3:          FSTRING_START  'f"{'
+93,3-93,10:         NAME           'degrees'
+93,10-93,11:        COLON          ':'
+93,11-93,15:        FSTRING_SPEC   '0.0f'
+93,15-93,32:        FSTRING_END    '}\\N{DEGREE SIGN}"'
+93,32-93,33:        NEWLINE        '\n'
+94,0-94,5:          FSTRING_START  'f\'\'\'{'
+94,5-94,12:         NAME           'degrees'
+94,12-94,13:        COLON          ':'
+94,13-94,17:        FSTRING_SPEC   '0.0f'
+94,17-94,36:        FSTRING_END    '}\\N{DEGREE SIGN}\'\'\''
+94,36-94,37:        NEWLINE        '\n'
+95,0-95,5:          FSTRING_START  'f"""{'
+95,5-95,12:         NAME           'degrees'
+95,12-95,13:        COLON          ':'
+95,13-95,17:        FSTRING_SPEC   '0.0f'
+95,17-95,36:        FSTRING_END    '}\\N{DEGREE SIGN}"""'
+95,36-95,37:        NEWLINE        '\n'
+97,0-97,59:         COMMENT        '# double curlies in f-strings with various kinds of quoting'
+98,0-98,6:          FSTRING_START  'f\'{{ {'
+98,6-98,9:          NAME           'foo'
+98,9-98,14:         FSTRING_END    '} }}\''
+98,14-98,15:        NEWLINE        '\n'
+99,0-99,6:          FSTRING_START  'f"{{ {'
+99,6-99,9:          NAME           'foo'
+99,9-99,14:         FSTRING_END    '} }}"'
+99,14-99,15:        NEWLINE        '\n'
+100,0-100,8:        FSTRING_START  'f\'\'\'{{ {'
+100,8-100,11:       NAME           'foo'
+100,11-100,18:      FSTRING_END    '} }}\'\'\''
+100,18-100,19:      NEWLINE        '\n'
+101,0-101,8:        FSTRING_START  'f"""{{ {'
+101,8-101,11:       NAME           'foo'
+101,11-101,18:      FSTRING_END    '} }}"""'
+101,18-101,19:      NEWLINE        '\n'
+103,0-103,17:       COMMENT        '# Empty f-strings'
+104,0-104,3:        STRING         'f\'\''
+104,3-104,4:        NEWLINE        '\n'
+105,0-105,3:        STRING         'f""'
+105,3-105,4:        NEWLINE        '\n'
+106,0-106,7:        STRING         'f\'\'\'\'\'\''
+106,7-106,8:        NEWLINE        '\n'
+107,0-107,7:        STRING         'f""""""'
+107,7-107,8:        NEWLINE        '\n'
+110,0-110,7:        STRING         'r\'\\NUL\''
+110,8-110,50:       COMMENT        '# _Not_ a named unicode escape (`\\N{...}`)'
+110,50-110,51:      NEWLINE        '\n'
+112,0-112,8:        FSTRING_START  'f\'res: {'
+112,8-112,11:       NAME           'val'
+112,11-112,12:      COLON          ':'
+112,12-112,13:      FSTRING_SPEC   '{'
+112,13-112,18:      NAME           'width'
+112,18-112,19:      COLON          ':'
+112,19-112,20:      NUMBER         '0'
+112,20-112,23:      FSTRING_SPEC   '}.{'
+112,23-112,27:      NAME           'prec'
+112,27-112,28:      COLON          ':'
+112,28-112,29:      NUMBER         '1'
+112,29-112,30:      FSTRING_SPEC   '}'
+112,30-112,32:      FSTRING_END    '}\''
+112,32-112,33:      NEWLINE        '\n'
+113,0-113,0:        ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/tab.py
+++ b/python/extractor/tests/tokenizer/tab.py
@@ -0,0 +1,3 @@
+
+class C(object):
+	pass
--- a/python/extractor/tests/tokenizer/tab.tokens
+++ b/python/extractor/tests/tokenizer/tab.tokens
@@ -0,0 +1,12 @@
+2,0-2,5:            NAME           'class'
+2,6-2,7:            NAME           'C'
+2,7-2,8:            LPAR           '('
+2,8-2,14:           NAME           'object'
+2,14-2,15:          RPAR           ')'
+2,15-2,16:          COLON          ':'
+2,16-2,17:          NEWLINE        '\n'
+3,0-3,1:            INDENT         '\t'
+3,1-3,5:            NAME           'pass'
+3,5-3,6:            NEWLINE        '\n'
+4,0-4,0:            DEDENT         ''
+4,0-4,0:            ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/temp.tokens
+++ b/python/extractor/tests/tokenizer/temp.tokens
@@ -0,0 +1,84 @@
+2,0-2,22:           COMMENT        '#Some negative numbers'
+4,0-4,1:            OP             '-'
+4,1-4,2:            NUMBER         '1'
+4,2-4,3:            NEWLINE        '\n'
+5,0-5,1:            OP             '-'
+5,1-5,18:           NUMBER         '10000000000000000'
+5,18-5,19:          NEWLINE        '\n'
+6,0-6,1:            OP             '-'
+6,1-6,4:            NUMBER         '1.0'
+6,4-6,5:            NEWLINE        '\n'
+7,0-7,1:            OP             '-'
+7,1-7,7:            NUMBER         '3.0e17'
+7,7-7,8:            NEWLINE        '\n'
+9,0-9,1:            OP             '-'
+9,1-9,2:            LPAR           '('
+9,2-9,3:            NUMBER         '1'
+9,3-9,4:            RPAR           ')'
+9,4-9,5:            NEWLINE        '\n'
+10,0-10,1:          OP             '-'
+10,1-10,2:          LPAR           '('
+10,2-10,19:         NUMBER         '10000000000000000'
+10,19-10,20:        RPAR           ')'
+10,20-10,21:        NEWLINE        '\n'
+11,0-11,1:          OP             '-'
+11,1-11,2:          LPAR           '('
+11,2-11,5:          NUMBER         '1.0'
+11,5-11,6:          RPAR           ')'
+11,6-11,7:          NEWLINE        '\n'
+12,0-12,1:          OP             '-'
+12,1-12,2:          LPAR           '('
+12,2-12,8:          NUMBER         '3.0e17'
+12,8-12,9:          RPAR           ')'
+12,9-12,10:         NEWLINE        '\n'
+14,0-14,1:          LPAR           '('
+14,1-14,2:          OP             '-'
+14,2-14,3:          NUMBER         '1'
+14,3-14,4:          RPAR           ')'
+14,4-14,5:          NEWLINE        '\n'
+15,0-15,1:          LPAR           '('
+15,1-15,2:          OP             '-'
+15,2-15,19:         NUMBER         '10000000000000000'
+15,19-15,20:        RPAR           ')'
+15,20-15,21:        NEWLINE        '\n'
+16,0-16,1:          LPAR           '('
+16,1-16,2:          OP             '-'
+16,2-16,5:          NUMBER         '1.0'
+16,5-16,6:          RPAR           ')'
+16,6-16,7:          NEWLINE        '\n'
+17,0-17,1:          LPAR           '('
+17,1-17,2:          OP             '-'
+17,2-17,8:          NUMBER         '3.0e17'
+17,8-17,9:          RPAR           ')'
+17,9-17,10:         NEWLINE        '\n'
+19,0-19,1:          OP             '-'
+19,1-19,3:          NUMBER         '1j'
+19,3-19,4:          NEWLINE        '\n'
+21,0-21,1:          OP             '-'
+21,1-21,8:          NUMBER         '3.7e12j'
+21,8-21,9:          NEWLINE        '\n'
+23,0-23,19:         COMMENT        '#Some other numbers'
+24,0-24,20:         NUMBER         '0.058823529630899429'
+24,20-24,21:        NEWLINE        '\n'
+26,0-26,5:          NUMBER         '1e-06'
+26,5-26,6:          NEWLINE        '\n'
+27,0-27,8:          NUMBER         '.9999999'
+27,8-27,9:          NEWLINE        '\n'
+28,0-28,8:          NUMBER         '0xffffff'
+28,8-28,9:          NEWLINE        '\n'
+29,0-29,4:          NUMBER         '1e10'
+29,4-29,5:          NEWLINE        '\n'
+30,0-30,2:          NUMBER         '1.'
+30,2-30,3:          NEWLINE        '\n'
+31,0-31,10:         NUMBER         '2.79252680'
+31,10-31,11:        NEWLINE        '\n'
+32,0-32,9:          NUMBER         '0x0001000'
+32,9-32,10:         NEWLINE        '\n'
+33,0-33,55:         NUMBER         '4987312561856745907287624786230562734672583763984576267'
+33,55-33,56:        NEWLINE        '\n'
+35,0-35,18:         COMMENT        '#Octal both styles'
+36,0-36,4:          NUMBER         '0777'
+36,4-36,5:          NEWLINE        '\n'
+37,0-37,5:          NUMBER         '0o777'
+37,5-37,6:          NEWLINE        '\n'
+39,0-39,0:          ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/utf8.py
+++ b/python/extractor/tests/tokenizer/utf8.py
@@ -0,0 +1,2 @@
+# Some abitrary prefix with no space beforecoding: utf-8 -*-
+# €€€€
--- a/python/extractor/tests/tokenizer/utf8.tokens
+++ b/python/extractor/tests/tokenizer/utf8.tokens
@@ -0,0 +1,3 @@
+1,0-1,60:           COMMENT        '# Some abitrary prefix with no space beforecoding: utf-8 -*-'
+2,0-2,6:            COMMENT        '# €€€€'
+3,0-3,0:            ENDMARKER      ''
--- a/python/extractor/tests/tokenizer/utf8_bom.py
+++ b/python/extractor/tests/tokenizer/utf8_bom.py
@@ -0,0 +1 @@
+#Starts with a BOM
--- a/python/extractor/tests/tokenizer/utf8_bom.tokens
+++ b/python/extractor/tests/tokenizer/utf8_bom.tokens
@@ -0,0 +1,2 @@
+1,0-1,18:           COMMENT        '#Starts with a BOM'
+2,0-2,0:            ENDMARKER      ''