diff --git a/javascript/extractor/lib/typescript-go/internal/astconv/childprops.go b/javascript/extractor/lib/typescript-go/internal/astconv/childprops.go new file mode 100644 index 00000000000..ac820b88112 --- /dev/null +++ b/javascript/extractor/lib/typescript-go/internal/astconv/childprops.go @@ -0,0 +1,210 @@ +package astconv + +// childProps maps SyntaxKind string names to ordered lists of child property names. +// The order corresponds to the bitmask order in the binary encoder. When a node +// uses the Children data type (top 2 bits = 0b00), the low byte is a bitmask +// indicating which of these properties are present. Children are consumed in order. +// +// These names must match the property names expected by the Java extractor. +// Derived from microsoft/typescript-go/internal/api/encoder/encoder.go. +var childProps = map[string][]string{ + // Multi-child nodes with property mask + "QualifiedName": {"left", "right"}, + "TypeParameter": {"modifiers", "name", "constraint", "default"}, + "IfStatement": {"expression", "thenStatement", "elseStatement"}, + "DoStatement": {"statement", "expression"}, + "WhileStatement": {"expression", "statement"}, + "ForStatement": {"initializer", "condition", "incrementor", "statement"}, + "ForInStatement": {"awaitModifier", "initializer", "expression", "statement"}, + "ForOfStatement": {"awaitModifier", "initializer", "expression", "statement"}, + "WithStatement": {"expression", "statement"}, + "SwitchStatement": {"expression", "caseBlock"}, + "CaseClause": {"expression", "statements"}, + "DefaultClause": {"expression", "statements"}, + "TryStatement": {"tryBlock", "catchClause", "finallyBlock"}, + "CatchClause": {"variableDeclaration", "block"}, + "LabeledStatement": {"label", "statement"}, + "VariableStatement": {"modifiers", "declarationList"}, + "VariableDeclarationList": {"declarations"}, + "VariableDeclaration": {"name", "exclamationToken", "type", "initializer"}, + "Parameter": {"modifiers", "dotDotDotToken", "name", 
"questionToken", "type", "initializer"}, + "BindingElement": {"dotDotDotToken", "propertyName", "name", "initializer"}, + "FunctionDeclaration": {"modifiers", "asteriskToken", "name", "typeParameters", "parameters", "type", "body"}, + "InterfaceDeclaration": {"modifiers", "name", "typeParameters", "heritageClauses", "members"}, + "TypeAliasDeclaration": {"modifiers", "name", "typeParameters", "type"}, + "EnumMember": {"name", "initializer"}, + "EnumDeclaration": {"modifiers", "name", "members"}, + "ModuleDeclaration": {"modifiers", "name", "body"}, + "ImportEqualsDeclaration": {"modifiers", "name", "moduleReference"}, + "ImportDeclaration": {"modifiers", "importClause", "moduleSpecifier", "attributes"}, + "JSImportDeclaration": {"modifiers", "importClause", "moduleSpecifier", "attributes"}, + "ImportSpecifier": {"propertyName", "name"}, + "ImportClause": {"name", "namedBindings"}, + "ExportAssignment": {"modifiers", "expression"}, + "JSExportAssignment": {"modifiers", "expression"}, + "NamespaceExportDeclaration": {"modifiers", "name"}, + "ExportDeclaration": {"modifiers", "exportClause", "moduleSpecifier", "attributes"}, + "ExportSpecifier": {"propertyName", "name"}, + "CallSignature": {"typeParameters", "parameters", "type"}, + "ConstructSignature": {"typeParameters", "parameters", "type"}, + "Constructor": {"modifiers", "typeParameters", "parameters", "type", "body"}, + "GetAccessor": {"modifiers", "name", "typeParameters", "parameters", "type", "body"}, + "SetAccessor": {"modifiers", "name", "typeParameters", "parameters", "type", "body"}, + "IndexSignature": {"modifiers", "parameters", "type"}, + "MethodSignature": {"modifiers", "name", "questionToken", "typeParameters", "parameters", "type"}, + "MethodDeclaration": {"modifiers", "asteriskToken", "name", "questionToken", "typeParameters", "parameters", "type", "body"}, + "PropertySignature": {"modifiers", "name", "questionToken", "type", "initializer"}, + "PropertyDeclaration": {"modifiers", "name", 
"questionToken", "type", "initializer"}, + "BinaryExpression": {"left", "operatorToken", "right"}, + "YieldExpression": {"asteriskToken", "expression"}, + "ArrowFunction": {"modifiers", "typeParameters", "parameters", "type", "equalsGreaterThanToken", "body"}, + "FunctionExpression": {"modifiers", "asteriskToken", "name", "typeParameters", "parameters", "type", "body"}, + "AsExpression": {"expression", "type"}, + "SatisfiesExpression": {"expression", "type"}, + "ConditionalExpression": {"condition", "questionToken", "whenTrue", "colonToken", "whenFalse"}, + "PropertyAccessExpression": {"expression", "questionDotToken", "name"}, + "ElementAccessExpression": {"expression", "questionDotToken", "argumentExpression"}, + "CallExpression": {"expression", "questionDotToken", "typeArguments", "arguments"}, + "NewExpression": {"expression", "typeArguments", "arguments"}, + "TemplateExpression": {"head", "templateSpans"}, + "TemplateSpan": {"expression", "literal"}, + "TaggedTemplateExpression": {"tag", "questionDotToken", "typeArguments", "template"}, + "PropertyAssignment": {"modifiers", "name", "questionToken", "initializer"}, + "ShorthandPropertyAssignment": {"modifiers", "name", "questionToken", "equalsToken", "objectAssignmentInitializer"}, + "TypeAssertionExpression": {"type", "expression"}, + "ConditionalType": {"checkType", "extendsType", "trueType", "falseType"}, + "IndexedAccessType": {"objectType", "indexType"}, + "TypeReference": {"typeName", "typeArguments"}, + "ExpressionWithTypeArguments": {"expression", "typeArguments"}, + "TypePredicate": {"assertsModifier", "parameterName", "type"}, + "ImportType": {"argument", "attributes", "qualifier", "typeArguments"}, + "ImportAttribute": {"name", "value"}, + "TypeQuery": {"exprName", "typeArguments"}, + "MappedType": {"readonlyToken", "typeParameter", "nameType", "questionToken", "type", "members"}, + "NamedTupleMember": {"dotDotDotToken", "name", "questionToken", "type"}, + "FunctionType": {"typeParameters", 
"parameters", "type"}, + "ConstructorType": {"modifiers", "typeParameters", "parameters", "type"}, + "TemplateLiteralType": {"head", "templateSpans"}, + "TemplateLiteralTypeSpan": {"type", "literal"}, + "JsxElement": {"openingElement", "children", "closingElement"}, + "JsxNamespacedName": {"name", "namespace"}, + "JsxOpeningElement": {"tagName", "typeArguments", "attributes"}, + "JsxSelfClosingElement": {"tagName", "typeArguments", "attributes"}, + "JsxFragment": {"openingFragment", "children", "closingFragment"}, + "JsxAttribute": {"name", "initializer"}, + "JsxExpression": {"dotDotDotToken", "expression"}, + "JSDoc": {"comment", "tags"}, + "JSDocTypeTag": {"tagName", "typeExpression", "comment"}, + "JSDocTag": {"tagName", "comment"}, + "JSDocTemplateTag": {"tagName", "constraint", "typeParameters", "comment"}, + "JSDocReturnTag": {"tagName", "typeExpression", "comment"}, + "JSDocPublicTag": {"tagName", "comment"}, + "JSDocPrivateTag": {"tagName", "comment"}, + "JSDocProtectedTag": {"tagName", "comment"}, + "JSDocReadonlyTag": {"tagName", "comment"}, + "JSDocOverrideTag": {"tagName", "comment"}, + "JSDocDeprecatedTag": {"tagName", "comment"}, + "JSDocSeeTag": {"tagName", "nameExpression", "comment"}, + "JSDocImplementsTag": {"tagName", "className", "comment"}, + "JSDocAugmentsTag": {"tagName", "className", "comment"}, + "JSDocSatisfiesTag": {"tagName", "typeExpression", "comment"}, + "JSDocThrowsTag": {"tagName", "typeExpression", "comment"}, + "JSDocThisTag": {"tagName", "typeExpression", "comment"}, + "JSDocImportTag": {"tagName", "importClause", "moduleSpecifier", "attributes", "comment"}, + "JSDocCallbackTag": {"tagName", "typeExpression", "fullName", "comment"}, + "JSDocOverloadTag": {"tagName", "typeExpression", "comment"}, + "JSDocTypedefTag": {"tagName", "typeExpression", "name", "comment"}, + "JSDocSignature": {"typeParameters", "parameters", "type"}, + "ClassStaticBlockDeclaration": {"modifiers", "body"}, + "ClassDeclaration": {"modifiers", "name", 
"typeParameters", "heritageClauses", "members"}, + "ClassExpression": {"modifiers", "name", "typeParameters", "heritageClauses", "members"}, + + // JSDocParameterTag and JSDocPropertyTag have order-dependent children + // (handled specially in the converter based on isNameFirst defined bit). + // Default order (isNameFirst=false): + "JSDocParameterTag": {"tagName", "typeExpression", "name", "comment"}, + "JSDocPropertyTag": {"tagName", "typeExpression", "name", "comment"}, +} + +// singleChildProp maps node kinds that have exactly one Node child to +// the property name for that child. +var singleChildProp = map[string]string{ + "ReturnStatement": "expression", + "ThrowStatement": "expression", + "ExpressionStatement": "expression", + "BreakStatement": "label", + "ContinueStatement": "label", + "ParenthesizedExpression": "expression", + "ComputedPropertyName": "expression", + "Decorator": "expression", + "SpreadElement": "expression", + "SpreadAssignment": "expression", + "DeleteExpression": "expression", + "TypeOfExpression": "expression", + "VoidExpression": "expression", + "AwaitExpression": "expression", + "NonNullExpression": "expression", + "ExternalModuleReference": "expression", + "NamespaceImport": "name", + "NamespaceExport": "name", + "JsxClosingElement": "tagName", + "ArrayType": "elementType", + "LiteralType": "literal", + "InferType": "typeParameter", + "OptionalType": "type", + "RestType": "type", + "ParenthesizedType": "type", + "JSDocTypeExpression": "type", + "JSDocNonNullableType": "type", + "JSDocNullableType": "type", + "JSDocVariadicType": "type", + "JSDocOptionalType": "type", + "JSDocNameReference": "name", +} + +// singleNodeListProp maps node kinds that have exactly one NodeList child +// to the property name for that child. 
+var singleNodeListProp = map[string]string{ + "Block": "statements", + "ArrayLiteralExpression": "elements", + "ObjectLiteralExpression": "properties", + "UnionType": "types", + "IntersectionType": "types", + "TupleType": "elements", + "NamedImports": "elements", + "NamedExports": "elements", + "ModuleBlock": "statements", + "CaseBlock": "clauses", + "TypeLiteral": "members", + "JsxAttributes": "properties", + "ArrayBindingPattern": "elements", + "ObjectBindingPattern": "elements", + "HeritageClause": "types", + "JSDocTypeLiteral": "jsDocPropertyTags", +} + +// operandKinds are node kinds where the single child is called "operand" +// and the operator is encoded in the defined bits. +var operandKinds = map[string]bool{ + "PrefixUnaryExpression": true, + "PostfixUnaryExpression": true, +} + +// GetChildProperties returns the ordered child property names for the given +// SyntaxKind name. Returns nil if the kind has no registered child properties +// (leaf node, single-child, or NodeList-child). +func GetChildProperties(kindName string) []string { + return childProps[kindName] +} + +// GetSingleChildProperty returns the property name for a single-child node. +// Returns "" if the kind is not a single-child node. +func GetSingleChildProperty(kindName string) string { + return singleChildProp[kindName] +} + +// GetSingleNodeListProperty returns the property name for a single-NodeList-child node. +// Returns "" if the kind is not a single-NodeList-child node. 
+func GetSingleNodeListProperty(kindName string) string { + return singleNodeListProp[kindName] +} diff --git a/javascript/extractor/lib/typescript-go/internal/astconv/converter.go b/javascript/extractor/lib/typescript-go/internal/astconv/converter.go new file mode 100644 index 00000000000..363c0d2fe26 --- /dev/null +++ b/javascript/extractor/lib/typescript-go/internal/astconv/converter.go @@ -0,0 +1,652 @@ +package astconv + +import ( + "encoding/json" + "fmt" + "strings" +) + +// Converter transforms a BinaryAST into the JSON format expected by the +// Java extractor. +type Converter struct { + ast *BinaryAST + kindNames map[uint32]string // numeric kind → string name + sourceText string // source file text for $lineStarts / $pos augmentation +} + +// NewConverter creates a Converter for the given binary AST. +// kindToName maps numeric SyntaxKind values to their string names. +func NewConverter(ast *BinaryAST, kindToName map[uint32]string) *Converter { + return &Converter{ + ast: ast, + kindNames: kindToName, + sourceText: ast.SourceText(), + } +} + +// Convert transforms the binary AST into a JSON-serializable map. +// The root node is at index 1. +func (c *Converter) Convert() (map[string]interface{}, error) { + if c.ast.NodeCount() < 2 { + return nil, fmt.Errorf("no nodes to convert") + } + return c.convertNode(1) +} + +// ConvertJSON is a convenience method that converts to JSON bytes. 
+func (c *Converter) ConvertJSON() (json.RawMessage, error) { + obj, err := c.Convert() + if err != nil { + return nil, err + } + return json.Marshal(obj) +} + +func (c *Converter) convertNode(i int) (map[string]interface{}, error) { + kind := c.ast.Kind(i) + kindName := c.kindNames[kind] + if kindName == "" { + kindName = fmt.Sprintf("Unknown_%d", kind) + } + + node := map[string]interface{}{ + "kind": int(kind), + "flags": int(c.ast.Flags(i)), + "$pos": c.augmentPos(int(c.ast.Pos(i)), true), + "$end": int(c.ast.End(i)), + } + + dataType := c.ast.DataType(i) + + switch dataType { + case nodeDataTypeString: + c.handleStringNode(i, kindName, node) + + case nodeDataTypeExtended: + if err := c.handleExtendedNode(i, kindName, node); err != nil { + return nil, err + } + + default: // nodeDataTypeChildren + if err := c.handleChildrenNode(i, kindName, node); err != nil { + return nil, err + } + } + + // Add defined-bits-based properties + c.addDefinedBitProperties(i, kindName, node) + + return node, nil +} + +// handleStringNode handles nodes with a string property (Identifier, StringLiteral, etc.) +func (c *Converter) handleStringNode(i int, kindName string, node map[string]interface{}) { + strIdx := c.ast.StringIndex(i) + text := c.ast.GetString(strIdx) + + switch kindName { + case "Identifier", "PrivateIdentifier": + node["escapedText"] = text + default: + node["text"] = text + } +} + +// handleExtendedNode handles SourceFile and template literal nodes. +func (c *Converter) handleExtendedNode(i int, kindName string, node map[string]interface{}) error { + extOff := c.ast.ExtOffset(i) + + switch kindName { + case "SourceFile": + return c.handleSourceFile(i, extOff, node) + case "TemplateHead", "TemplateMiddle", "TemplateTail": + c.handleTemplateLiteral(extOff, node) + return nil + default: + return fmt.Errorf("unknown extended data node kind: %s", kindName) + } +} + +// handleSourceFile extracts SourceFile-specific data from extended data. 
+func (c *Converter) handleSourceFile(i int, extOff uint32, node map[string]interface{}) error { + // SourceFile extended data layout: + // [0-4] textIdx, [4-8] fileNameIdx, [8-12] pathIdx, + // [12-16] languageVariant, [16-20] scriptKind, + // [20-24] referencedFiles, [24-28] typeReferenceDirectives, [28-32] libReferenceDirectives + // [32-36] imports, [36-40] moduleAugmentations, [40-44] ambientModuleNames + // [44-48] externalModuleIndicator + + fileNameIdx := c.ast.ExtUint32(extOff + 4) + node["fileName"] = c.ast.GetString(fileNameIdx) + + // Add source text + if c.sourceText != "" { + node["text"] = c.sourceText + node["$lineStarts"] = computeLineStarts(c.sourceText) + } + + // Add empty parseDiagnostics array (expected by Java extractor) + node["parseDiagnostics"] = []interface{}{} + + // Add children (statements + EndOfFile) + children := c.ast.Children(i) + for _, ci := range children { + if c.ast.IsNodeList(ci) { + arr, err := c.convertNodeList(ci) + if err != nil { + return err + } + node["statements"] = arr + } + // Skip EndOfFile token — the Java extractor doesn't use it + } + + // Generate $tokens by scanning the source text. + if c.sourceText != "" { + events := c.collectRescanEvents(i) + scanner := NewScanner(c.sourceText, events) + rawTokens := scanner.ScanAll() + tokenArr := make([]interface{}, len(rawTokens)) + for ti, tok := range rawTokens { + tokenArr[ti] = map[string]interface{}{ + "kind": tok.Kind, + "tokenPos": c.augmentPos(tok.TokenPos, false), + "text": tok.Text, + } + } + node["$tokens"] = tokenArr + } + + return nil +} + +// handleTemplateLiteral extracts template literal data from extended data. 
+func (c *Converter) handleTemplateLiteral(extOff uint32, node map[string]interface{}) { + textIdx := c.ast.ExtUint32(extOff) + rawTextIdx := c.ast.ExtUint32(extOff + 4) + node["text"] = c.ast.GetString(textIdx) + node["rawText"] = c.ast.GetString(rawTextIdx) +} + +// handleChildrenNode handles nodes with child properties determined by a bitmask. +func (c *Converter) handleChildrenNode(i int, kindName string, node map[string]interface{}) error { + children := c.ast.Children(i) + + // Check for single-child nodes + if prop := GetSingleChildProperty(kindName); prop != "" { + if len(children) > 0 { + child, err := c.convertNode(children[0]) + if err != nil { + return err + } + node[prop] = child + } + return nil + } + + // Check for single NodeList child nodes + if prop := GetSingleNodeListProperty(kindName); prop != "" { + if len(children) > 0 && c.ast.IsNodeList(children[0]) { + arr, err := c.convertNodeList(children[0]) + if err != nil { + return err + } + node[prop] = arr + } else if len(children) > 0 { + // Some single-NodeList nodes may not have a NodeList child + // (e.g., JSDocTypeLiteral). Fall through to multi-child handling. 
+ } else { + node[prop] = []interface{}{} + return nil + } + return nil + } + + // Check for operator-in-definedBits nodes (PrefixUnaryExpression, PostfixUnaryExpression) + if operandKinds[kindName] { + if len(children) > 0 { + child, err := c.convertNode(children[0]) + if err != nil { + return err + } + node["operand"] = child + } + node["operator"] = int(c.ast.DefinedBits(i)) + return nil + } + + // Multi-child nodes with property mask + props := GetChildProperties(kindName) + if props != nil { + return c.assignChildProperties(i, kindName, props, children, node) + } + + // Token/keyword nodes with no children — nothing to add + if len(children) == 0 { + return nil + } + + // MetaProperty: keywordToken + name + if kindName == "MetaProperty" { + if len(children) > 0 { + child, err := c.convertNode(children[0]) + if err != nil { + return err + } + node["name"] = child + } + return nil + } + + // TypeOperator: operator keyword kind inferred from source text + type child + if kindName == "TypeOperator" { + // Operator (keyof/unique/readonly) is not in the binary encoding. 
+ pos := int(c.ast.Pos(i)) + if c.sourceText != "" && pos < len(c.sourceText) { + text := c.sourceText[pos:] + // Skip leading trivia + for len(text) > 0 && (text[0] == ' ' || text[0] == '\t' || text[0] == '\n' || text[0] == '\r') { + text = text[1:] + } + if len(text) >= 5 && text[:5] == "keyof" { + node["operator"] = int(c.kindForName("KeyOfKeyword")) + } else if len(text) >= 6 && text[:6] == "unique" { + node["operator"] = int(c.kindForName("UniqueKeyword")) + } else if len(text) >= 8 && text[:8] == "readonly" { + node["operator"] = int(c.kindForName("ReadonlyKeyword")) + } + } + if len(children) > 0 { + child, err := c.convertNode(children[0]) + if err != nil { + return err + } + node["type"] = child + } + return nil + } + + // MissingDeclaration: optional modifiers child + if kindName == "MissingDeclaration" { + if len(children) > 0 && c.ast.IsNodeList(children[0]) { + arr, err := c.convertNodeList(children[0]) + if err != nil { + return err + } + node["modifiers"] = arr + } + return nil + } + + // Unknown node kind with children — emit them as a generic "children" array + arr := make([]interface{}, 0, len(children)) + for _, ci := range children { + if c.ast.IsNodeList(ci) { + nlArr, err := c.convertNodeList(ci) + if err != nil { + return err + } + for _, item := range nlArr { + arr = append(arr, item) + } + } else { + child, err := c.convertNode(ci) + if err != nil { + return err + } + arr = append(arr, child) + } + } + if len(arr) > 0 { + node["children"] = arr + } + + return nil +} + +// assignChildProperties distributes children to named properties based on +// the bitmask in the node's data field. 
+func (c *Converter) assignChildProperties(nodeIdx int, kindName string, props []string, children []int, node map[string]interface{}) error { + mask := c.ast.ChildMask(nodeIdx) + definedBits := c.ast.DefinedBits(nodeIdx) + + // Special handling for JSDocParameterTag/JSDocPropertyTag where + // child order depends on isNameFirst + if (kindName == "JSDocParameterTag" || kindName == "JSDocPropertyTag") && definedBits&2 != 0 { + // isNameFirst=true: order is tagName, name, typeExpression, comment + props = []string{"tagName", "name", "typeExpression", "comment"} + } + + childIdx := 0 + for bit, prop := range props { + if bit < 8 && mask != 0 && mask&(1< 0 && childIdx >= len(children) { + break + } + if childIdx >= len(children) { + break + } + + ci := children[childIdx] + childIdx++ + + if c.ast.IsNodeList(ci) { + arr, err := c.convertNodeList(ci) + if err != nil { + return err + } + node[prop] = arr + } else { + child, err := c.convertNode(ci) + if err != nil { + return err + } + // Remap TS7 "postfixToken" (questionToken property) to the correct name + // based on the actual token kind. TS7 uses a single PostfixToken + // for what TS5 had as separate questionToken/exclamationToken. + if prop == "questionToken" { + childKind := c.ast.Kind(ci) + exclamationKind := c.kindForName("ExclamationToken") + if exclamationKind != 0 && childKind == exclamationKind { + prop = "exclamationToken" + } + } + node[prop] = child + } + } + + return nil +} + +// isArrayProperty returns true for property names that should be empty arrays +// (not omitted) when absent in the binary AST. +func isArrayProperty(prop string) bool { + return arrayProperties[prop] +} + +var arrayProperties = map[string]bool{ + "arguments": true, + "elements": true, + "properties": true, + "members": true, +} + +// convertNodeList converts a NodeList into a JSON array. 
+func (c *Converter) convertNodeList(i int) ([]interface{}, error) { + children := c.ast.Children(i) + arr := make([]interface{}, 0, len(children)) + for _, ci := range children { + child, err := c.convertNode(ci) + if err != nil { + return nil, err + } + arr = append(arr, child) + } + return arr, nil +} + +// addDefinedBitProperties adds properties derived from the defined bits +// (bits 24-29 of the data field) that aren't part of the child tree. +func (c *Converter) addDefinedBitProperties(i int, kindName string, node map[string]interface{}) { + definedBits := c.ast.DefinedBits(i) + + switch kindName { + case "ImportSpecifier", "ImportEqualsDeclaration", "ExportSpecifier", "ExportDeclaration": + node["isTypeOnly"] = definedBits&1 != 0 + case "ImportClause": + node["isTypeOnly"] = definedBits&1 != 0 + if definedBits&2 != 0 { + node["phaseModifier"] = "defer" + } + case "ImportType": + if definedBits&1 != 0 { + node["isTypeOf"] = true + } + case "ExportAssignment", "JSExportAssignment": + if definedBits&1 != 0 { + node["isExportEquals"] = true + } + case "VariableDeclarationList": + // Determine $declarationKind from defined bits + if definedBits&2 != 0 { + node["$declarationKind"] = "const" + } else if definedBits&1 != 0 { + node["$declarationKind"] = "let" + } else { + node["$declarationKind"] = "var" + } + case "ImportAttributes": + if definedBits&2 != 0 { + node["token"] = c.kindForName("AssertKeyword") + } else { + node["token"] = c.kindForName("WithKeyword") + } + case "HeritageClause": + // Token (extends/implements) is not in the binary encoding. + // Infer from source text, skipping leading trivia. 
+ pos := int(c.ast.Pos(i)) + if c.sourceText != "" && pos < len(c.sourceText) { + text := c.sourceText[pos:] + // Skip whitespace/newlines + for len(text) > 0 && (text[0] == ' ' || text[0] == '\t' || text[0] == '\n' || text[0] == '\r') { + text = text[1:] + } + if len(text) >= 10 && text[:10] == "implements" { + node["token"] = int(c.kindForName("ImplementsKeyword")) + } else { + node["token"] = int(c.kindForName("ExtendsKeyword")) + } + } + case "JSDocParameterTag", "JSDocPropertyTag": + if definedBits&1 != 0 { + node["isBracketed"] = true + } + if definedBits&2 != 0 { + node["isNameFirst"] = true + } + } +} + +// augmentPos replicates the Node.js wrapper's $pos augmentation: +// if skipTrivia is true, advances past leading whitespace and comments. +func (c *Converter) augmentPos(pos int, skipTrivia bool) int { + if !skipTrivia || c.sourceText == "" || pos >= len(c.sourceText) { + return pos + } + // Skip whitespace and comments (matching the regex /(?:\s|\/\/.*|\/\*[^]*?\*\/)*/g) + i := pos + n := len(c.sourceText) + for i < n { + ch := c.sourceText[i] + if ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n' || ch == '\f' || ch == '\v' { + i++ + continue + } + if ch == '/' && i+1 < n { + next := c.sourceText[i+1] + if next == '/' { + // Single-line comment — skip to end of line + i += 2 + for i < n && c.sourceText[i] != '\n' { + i++ + } + continue + } + if next == '*' { + // Multi-line comment — skip to */ + i += 2 + for i+1 < n { + if c.sourceText[i] == '*' && c.sourceText[i+1] == '/' { + i += 2 + break + } + i++ + } + continue + } + } + break + } + return i +} + +// computeLineStarts returns an array of byte offsets where each line starts. 
+func computeLineStarts(text string) []int { + starts := []int{0} + for i := 0; i < len(text); i++ { + ch := text[i] + if ch == '\n' { + starts = append(starts, i+1) + } else if ch == '\r' { + if i+1 < len(text) && text[i+1] == '\n' { + i++ + } + starts = append(starts, i+1) + } + } + return starts +} + +// kindForName returns the numeric kind for a given string name. +// This is the reverse of kindNames. Returns 0 if not found. +func (c *Converter) kindForName(name string) uint32 { + for k, v := range c.kindNames { + if v == name { + return k + } + } + return 0 +} + +// collectRescanEvents walks the AST to find positions that need rescanning. +// This matches the Node.js wrapper's rescan logic in ast_extractor.ts. +func (c *Converter) collectRescanEvents(root int) []RescanEvent { + var events []RescanEvent + c.walkForRescan(root, &events) + // Sort by position + sortRescanEvents(events) + return events +} + +func (c *Converter) walkForRescan(i int, events *[]RescanEvent) { + if i <= 0 || i >= c.ast.NodeCount() { + return + } + if c.ast.IsNodeList(i) { + for _, ci := range c.ast.Children(i) { + c.walkForRescan(ci, events) + } + return + } + + kind := c.ast.Kind(i) + kindName := c.kindNames[kind] + + // RegularExpressionLiteral needs rescan (scanner sees / as SlashToken) + if kindName == "RegularExpressionLiteral" { + pos := c.augmentPos(int(c.ast.Pos(i)), true) + *events = append(*events, RescanEvent{Pos: pos, Kind: "regex"}) + } + + // TemplateMiddle and TemplateTail need rescan (scanner sees } as CloseBraceToken) + if kindName == "TemplateMiddle" || kindName == "TemplateTail" { + pos := c.augmentPos(int(c.ast.Pos(i)), true) + *events = append(*events, RescanEvent{Pos: pos, Kind: "template"}) + } + + // BinaryExpression with >>= or >>> etc. 
needs rescan (scanner may see > separately) + if kindName == "BinaryExpression" { + children := c.ast.Children(i) + if len(children) >= 3 { + // BinaryExpression children: left, operatorToken, right + opKind := c.kindNames[c.ast.Kind(children[1])] + switch opKind { + case "GreaterThanEqualsToken", "GreaterThanGreaterThanEqualsToken", + "GreaterThanGreaterThanGreaterThanEqualsToken", + "GreaterThanGreaterThanGreaterThanToken", "GreaterThanGreaterThanToken": + pos := c.augmentPos(int(c.ast.Pos(children[1])), true) + *events = append(*events, RescanEvent{Pos: pos, Kind: "greater"}) + } + } + } + + // Recurse into children + for _, ci := range c.ast.Children(i) { + c.walkForRescan(ci, events) + } +} + +func sortRescanEvents(events []RescanEvent) { + // Simple insertion sort — events are typically few + for i := 1; i < len(events); i++ { + key := events[i] + j := i - 1 + for j >= 0 && events[j].Pos > key.Pos { + events[j+1] = events[j] + j-- + } + events[j+1] = key + } +} + +// FilterWhitelist removes properties from the converted AST that are not +// in the property whitelist. This is applied recursively. +func FilterWhitelist(obj map[string]interface{}) map[string]interface{} { + result := make(map[string]interface{}, len(obj)) + for k, v := range obj { + if !IsAllowedProperty(k) { + continue + } + switch val := v.(type) { + case map[string]interface{}: + result[k] = FilterWhitelist(val) + case []interface{}: + result[k] = filterWhitelistArray(val) + default: + result[k] = v + } + } + return result +} + +func filterWhitelistArray(arr []interface{}) []interface{} { + result := make([]interface{}, len(arr)) + for i, v := range arr { + if obj, ok := v.(map[string]interface{}); ok { + result[i] = FilterWhitelist(obj) + } else { + result[i] = v + } + } + return result +} + +// BuildKindToNameMap builds a reverse mapping from numeric kind to string name +// from a SyntaxKinds metadata map (name → number). 
+func BuildKindToNameMap(syntaxKinds map[string]int) map[uint32]string { + result := make(map[uint32]string, len(syntaxKinds)) + for name, num := range syntaxKinds { + key := uint32(num) + // In case of collisions, prefer shorter/simpler names + if existing, ok := result[key]; !ok || len(name) < len(existing) { + result[key] = name + } + } + return result +} + +// StripKindPrefix removes "Kind" prefix from names if present (for TS7 Go-style names). +func StripKindPrefix(name string) string { + if strings.HasPrefix(name, "Kind") { + return name[4:] + } + return name +} diff --git a/javascript/extractor/lib/typescript-go/internal/astconv/decoder.go b/javascript/extractor/lib/typescript-go/internal/astconv/decoder.go new file mode 100644 index 00000000000..5224b494371 --- /dev/null +++ b/javascript/extractor/lib/typescript-go/internal/astconv/decoder.go @@ -0,0 +1,221 @@ +// Package astconv decodes the binary AST format produced by the tsgo API +// and converts it to the JSON format expected by the Java extractor. +// +// The binary format is documented in microsoft/typescript-go/internal/api/encoder/encoder.go. +// Each source file is encoded as: +// +// Header (44 bytes) | String offsets | String data | Extended data | Structured data | Nodes (28 bytes each) +// +// Nodes are in a flat array with parent/next-sibling indices. The first node (index 0) +// is a nil sentinel. The root node is at index 1. +package astconv + +import ( + "encoding/base64" + "encoding/binary" + "fmt" +) + +// Binary format constants matching microsoft/typescript-go/internal/api/encoder. 
const (
	// Byte offsets of the seven uint32 fields inside one node record, in
	// storage order. nodeSize closes the sequence and is therefore the
	// length of a record: 7 × 4 = 28 bytes.
	nodeOffsetKind = 4 * iota
	nodeOffsetPos
	nodeOffsetEnd
	nodeOffsetNext
	nodeOffsetParent
	nodeOffsetData
	nodeOffsetFlags
	nodeSize
)

const (
	// Header length and the byte offsets of the header fields read by the
	// decoder.
	headerSize             = 44
	headerOffsetMetadata   = 0
	headerOffsetStringOff  = 24
	headerOffsetStringData = 28
	headerOffsetExtData    = 32
	headerOffsetStructData = 36
	headerOffsetNodes      = 40
)

// protocolVersion is the only encoding version this decoder accepts.
const protocolVersion uint8 = 5

const (
	// The top two bits of a node's data field select how the rest of the
	// field is interpreted.
	nodeDataTypeChildren uint32 = 0 << 30
	nodeDataTypeString   uint32 = 1 << 30
	nodeDataTypeExtended uint32 = 2 << 30
	nodeDataTypeMask     uint32 = 3 << 30

	// Payload masks: the low byte is the child property bitmask, the low
	// 24 bits are a string-table index or extended-data offset.
	nodeDataChildMask  uint32 = 1<<8 - 1
	nodeDataStringMask uint32 = 1<<24 - 1
)

// SyntaxKindNodeList is the special kind value used for NodeList nodes.
const SyntaxKindNodeList uint32 = 1<<32 - 1

// BinaryAST provides random access to nodes in a binary-encoded TypeScript AST.
// The section offsets are byte offsets into raw, as read from the header.
type BinaryAST struct {
	raw       []byte // the complete encoded buffer
	strOff    uint32 // byte offset to string offset pairs
	strData   uint32 // byte offset to string data
	extData   uint32 // byte offset to extended node data
	structOff uint32 // byte offset to structured data
	nodeOff   uint32 // byte offset to nodes section
	nodeCount int    // number of node records, including the nil sentinel

	// Single Go string covering all data from strData onward.
	// String offsets index into this, so substrings are zero-alloc.
	allStrData string
}

// DecodeBinaryAST parses the binary header and returns a BinaryAST for
// random-access to nodes and strings.
+func DecodeBinaryAST(data []byte) (*BinaryAST, error) { + if len(data) < headerSize { + return nil, fmt.Errorf("data too short: %d bytes (need %d)", len(data), headerSize) + } + + version := data[headerOffsetMetadata+3] + if version != protocolVersion { + return nil, fmt.Errorf("unsupported protocol version %d (expected %d)", version, protocolVersion) + } + + b := &BinaryAST{ + raw: data, + strOff: le32(data, headerOffsetStringOff), + strData: le32(data, headerOffsetStringData), + extData: le32(data, headerOffsetExtData), + structOff: le32(data, headerOffsetStructData), + nodeOff: le32(data, headerOffsetNodes), + } + + dataLen := uint32(len(data)) + if b.strOff > dataLen || b.strData > dataLen || b.extData > dataLen || b.nodeOff > dataLen { + return nil, fmt.Errorf("invalid header offsets exceed data length %d", dataLen) + } + + b.nodeCount = (len(data) - int(b.nodeOff)) / nodeSize + if b.nodeCount < 2 { + return nil, fmt.Errorf("no nodes in AST (count=%d, need at least 2)", b.nodeCount) + } + + // The official decoder uses data[strData:] for zero-alloc substring slicing. + b.allStrData = string(data[b.strData:]) + + return b, nil +} + +// DecodeBinaryASTFromBase64 decodes a base64-encoded binary AST, as returned +// by tsgo's getSourceFile API in JSON ({"data":""}). +func DecodeBinaryASTFromBase64(b64 string) (*BinaryAST, error) { + data, err := base64.StdEncoding.DecodeString(b64) + if err != nil { + return nil, fmt.Errorf("base64 decode failed: %w", err) + } + return DecodeBinaryAST(data) +} + +// NodeCount returns the total number of nodes (including the nil sentinel at index 0). +func (b *BinaryAST) NodeCount() int { return b.nodeCount } + +// Node field accessors — all read uint32 from the nodes section. + +func (b *BinaryAST) nf(i, offset int) uint32 { + return le32(b.raw, int(b.nodeOff)+i*nodeSize+offset) +} + +// Kind returns the SyntaxKind of node i. 
+func (b *BinaryAST) Kind(i int) uint32 { return b.nf(i, nodeOffsetKind) } + +// Pos returns the start position (UTF-16 offset) of node i. +func (b *BinaryAST) Pos(i int) uint32 { return b.nf(i, nodeOffsetPos) } + +// End returns the end position (UTF-16 offset) of node i. +func (b *BinaryAST) End(i int) uint32 { return b.nf(i, nodeOffsetEnd) } + +// Next returns the index of the next sibling of node i, or 0 if none. +func (b *BinaryAST) Next(i int) uint32 { return b.nf(i, nodeOffsetNext) } + +// Parent returns the index of the parent of node i, or 0 if none. +func (b *BinaryAST) Parent(i int) uint32 { return b.nf(i, nodeOffsetParent) } + +// Data returns the raw 32-bit data field of node i. +func (b *BinaryAST) Data(i int) uint32 { return b.nf(i, nodeOffsetData) } + +// Flags returns the NodeFlags of node i. +func (b *BinaryAST) Flags(i int) uint32 { return b.nf(i, nodeOffsetFlags) } + +// DataType returns the top 2 bits of the data field (Children, String, or Extended). +func (b *BinaryAST) DataType(i int) uint32 { return b.Data(i) & nodeDataTypeMask } + +// DefinedBits returns bits 24-29 of the data field (6 bits of per-node-type flags). +func (b *BinaryAST) DefinedBits(i int) uint8 { return uint8((b.Data(i) >> 24) & 0x3F) } + +// ChildMask returns the low byte of the data field (child property bitmask). +func (b *BinaryAST) ChildMask(i int) uint8 { return uint8(b.Data(i) & nodeDataChildMask) } + +// StringIndex returns the 24-bit string table index from the data field. +func (b *BinaryAST) StringIndex(i int) uint32 { return b.Data(i) & nodeDataStringMask } + +// ExtOffset returns the 24-bit offset into the extended data section from the data field. +func (b *BinaryAST) ExtOffset(i int) uint32 { return b.Data(i) & nodeDataStringMask } + +// NodeListLen returns the number of children for a NodeList node (stored in data field). +func (b *BinaryAST) NodeListLen(i int) uint32 { return b.Data(i) } + +// IsNodeList returns true if node i is a NodeList. 
+func (b *BinaryAST) IsNodeList(i int) bool { return b.Kind(i) == SyntaxKindNodeList } + +// GetString reads a string from the string table at the given offset index. +// The index comes from a String-type node's data field (24-bit value). +func (b *BinaryAST) GetString(idx uint32) string { + // Each string entry is two uint32 values (start, end) in the string offsets section. + offBase := int(b.strOff) + int(idx)*4 + start := le32(b.raw, offBase) + end := le32(b.raw, offBase+4) + return b.allStrData[start:end] +} + +// ExtUint32 reads a uint32 from the extended data section at the given byte offset. +func (b *BinaryAST) ExtUint32(off uint32) uint32 { + return le32(b.raw, int(b.extData)+int(off)) +} + +// Children returns the indices of all direct children of node i. +// Children are identified by having parent == i. The first child is at i+1 +// (if its parent is i), and subsequent children are found via Next pointers. +func (b *BinaryAST) Children(i int) []int { + if i+1 >= b.nodeCount { + return nil + } + firstChild := i + 1 + if b.Parent(firstChild) != uint32(i) { + return nil + } + children := []int{firstChild} + next := int(b.Next(firstChild)) + for next != 0 { + children = append(children, next) + next = int(b.Next(next)) + } + return children +} + +// SourceText returns the source file text, extracted from the SourceFile's +// extended data. Returns "" if the root node is not a SourceFile or if +// the extended data is missing. +func (b *BinaryAST) SourceText() string { + if b.nodeCount < 2 { + return "" + } + // Root is at index 1. Check if it has extended data type. 
+ if b.DataType(1)&nodeDataTypeMask != nodeDataTypeExtended { + return "" + } + extOff := b.ExtOffset(1) + textIdx := b.ExtUint32(extOff) + return b.GetString(textIdx) +} + +func le32(data []byte, offset int) uint32 { + if offset < 0 || offset+4 > len(data) { + return 0 + } + return binary.LittleEndian.Uint32(data[offset : offset+4]) +} diff --git a/javascript/extractor/lib/typescript-go/internal/astconv/scanner.go b/javascript/extractor/lib/typescript-go/internal/astconv/scanner.go new file mode 100644 index 00000000000..8176a589d9b --- /dev/null +++ b/javascript/extractor/lib/typescript-go/internal/astconv/scanner.go @@ -0,0 +1,842 @@ +package astconv + +import ( + "unicode" + "unicode/utf8" +) + +// TS7 SyntaxKind values for tokens (from microsoft/typescript-go internal/ast/kind.go). +const ( + KindUnknown = 0 + KindEndOfFile = 1 + KindSingleLineCommentTrivia = 2 + KindMultiLineCommentTrivia = 3 + KindNewLineTrivia = 4 + KindWhitespaceTrivia = 5 + KindConflictMarkerTrivia = 6 + KindNumericLiteral = 8 + KindBigIntLiteral = 9 + KindStringLiteral = 10 + KindRegularExpressionLiteral = 13 + KindNoSubstitutionTemplateLiteral = 14 + KindTemplateHead = 15 + KindTemplateMiddle = 16 + KindTemplateTail = 17 + KindOpenBraceToken = 18 + KindCloseBraceToken = 19 + KindOpenParenToken = 20 + KindCloseParenToken = 21 + KindOpenBracketToken = 22 + KindCloseBracketToken = 23 + KindDotToken = 24 + KindDotDotDotToken = 25 + KindSemicolonToken = 26 + KindCommaToken = 27 + KindQuestionDotToken = 28 + KindLessThanToken = 29 + KindLessThanSlashToken = 30 + KindGreaterThanToken = 31 + KindLessThanEqualsToken = 32 + KindGreaterThanEqualsToken = 33 + KindEqualsEqualsToken = 34 + KindExclamationEqualsToken = 35 + KindEqualsEqualsEqualsToken = 36 + KindExclamationEqualsEqualsToken = 37 + KindEqualsGreaterThanToken = 38 + KindPlusToken = 39 + KindMinusToken = 40 + KindAsteriskToken = 41 + KindAsteriskAsteriskToken = 42 + KindSlashToken = 43 + KindPercentToken = 44 + KindPlusPlusToken = 45 + 
KindMinusMinusToken = 46 + KindLessThanLessThanToken = 47 + KindGreaterThanGreaterThanToken = 48 + KindGreaterThanGreaterThanGreaterThanToken = 49 + KindAmpersandToken = 50 + KindBarToken = 51 + KindCaretToken = 52 + KindExclamationToken = 53 + KindTildeToken = 54 + KindAmpersandAmpersandToken = 55 + KindBarBarToken = 56 + KindQuestionToken = 57 + KindColonToken = 58 + KindAtToken = 59 + KindQuestionQuestionToken = 60 + KindHashToken = 62 + KindEqualsToken = 63 + KindPlusEqualsToken = 64 + KindMinusEqualsToken = 65 + KindAsteriskEqualsToken = 66 + KindAsteriskAsteriskEqualsToken = 67 + KindSlashEqualsToken = 68 + KindPercentEqualsToken = 69 + KindLessThanLessThanEqualsToken = 70 + KindGreaterThanGreaterThanEqualsToken = 71 + KindGreaterThanGreaterThanGreaterThanEqualsToken = 72 + KindAmpersandEqualsToken = 73 + KindBarEqualsToken = 74 + KindBarBarEqualsToken = 75 + KindAmpersandAmpersandEqualsToken = 76 + KindQuestionQuestionEqualsToken = 77 + KindCaretEqualsToken = 78 + KindIdentifier = 79 + KindPrivateIdentifier = 80 +) + +// Token represents a single token from the scanner. +type Token struct { + Kind int `json:"kind"` + TokenPos int `json:"tokenPos"` + Text string `json:"text"` +} + +// RescanEvent tells the scanner to rescan at a given position. +type RescanEvent struct { + Pos int + Kind string // "regex", "template", "greater" +} + +// Scanner tokenizes TypeScript source text. +type Scanner struct { + text string + pos int + events []RescanEvent + evIdx int +} + +// NewScanner creates a scanner for the given source text. +// rescanEvents should be sorted by position. They inform the scanner +// about positions where regex literals, template tokens, or greater-than +// rescanning is needed (matching the Node.js wrapper behavior). +func NewScanner(text string, rescanEvents []RescanEvent) *Scanner { + return &Scanner{ + text: text, + pos: 0, + events: rescanEvents, + evIdx: 0, + } +} + +// ScanAll produces all tokens from the source text. 
+func (s *Scanner) ScanAll() []Token { + var tokens []Token + for { + tok := s.scan() + tokens = append(tokens, tok) + if tok.Kind == KindEndOfFile { + break + } + } + return tokens +} + +func (s *Scanner) peek() byte { + if s.pos >= len(s.text) { + return 0 + } + return s.text[s.pos] +} + +func (s *Scanner) peekAt(offset int) byte { + p := s.pos + offset + if p >= len(s.text) { + return 0 + } + return s.text[p] +} + +func (s *Scanner) advance() { + s.pos++ +} + +func (s *Scanner) nextRescanPos() int { + if s.evIdx < len(s.events) { + return s.events[s.evIdx].Pos + } + return int(^uint(0) >> 1) // MaxInt +} + +func (s *Scanner) nextRescanKind() string { + if s.evIdx < len(s.events) { + return s.events[s.evIdx].Kind + } + return "" +} + +func (s *Scanner) consumeRescan() { + if s.evIdx < len(s.events) { + s.evIdx++ + } +} + +func (s *Scanner) scan() Token { + if s.pos >= len(s.text) { + return Token{Kind: KindEndOfFile, TokenPos: s.pos, Text: ""} + } + + tokenPos := s.pos + ch := s.peek() + + // Whitespace (not newlines) + if ch == ' ' || ch == '\t' || ch == '\f' || ch == '\v' { + for s.pos < len(s.text) { + c := s.text[s.pos] + if c == ' ' || c == '\t' || c == '\f' || c == '\v' { + s.pos++ + } else { + break + } + } + return Token{Kind: KindWhitespaceTrivia, TokenPos: tokenPos, Text: s.text[tokenPos:s.pos]} + } + + // Newlines + if ch == '\n' { + s.advance() + return Token{Kind: KindNewLineTrivia, TokenPos: tokenPos, Text: "\n"} + } + if ch == '\r' { + s.advance() + if s.peek() == '\n' { + s.advance() + } + return Token{Kind: KindNewLineTrivia, TokenPos: tokenPos, Text: s.text[tokenPos:s.pos]} + } + + // Check for rescan event at this position + if tokenPos == s.nextRescanPos() { + kind := s.nextRescanKind() + s.consumeRescan() + switch kind { + case "regex": + return s.scanRegExp(tokenPos) + case "template": + return s.scanTemplatePart(tokenPos, true) + case "greater": + return s.scanGreater(tokenPos) + } + } + + switch ch { + case '/': + next := s.peekAt(1) + if 
next == '/' { + return s.scanSingleLineComment(tokenPos) + } + if next == '*' { + return s.scanMultiLineComment(tokenPos) + } + if next == '=' { + s.pos += 2 + return Token{Kind: KindSlashEqualsToken, TokenPos: tokenPos, Text: "/="} + } + s.advance() + return Token{Kind: KindSlashToken, TokenPos: tokenPos, Text: "/"} + + case '\'', '"': + return s.scanString(tokenPos, ch) + + case '`': + return s.scanTemplatePart(tokenPos, false) + + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + return s.scanNumber(tokenPos) + + case '{': + s.advance() + return Token{Kind: KindOpenBraceToken, TokenPos: tokenPos, Text: "{"} + case '}': + s.advance() + return Token{Kind: KindCloseBraceToken, TokenPos: tokenPos, Text: "}"} + case '(': + s.advance() + return Token{Kind: KindOpenParenToken, TokenPos: tokenPos, Text: "("} + case ')': + s.advance() + return Token{Kind: KindCloseParenToken, TokenPos: tokenPos, Text: ")"} + case '[': + s.advance() + return Token{Kind: KindOpenBracketToken, TokenPos: tokenPos, Text: "["} + case ']': + s.advance() + return Token{Kind: KindCloseBracketToken, TokenPos: tokenPos, Text: "]"} + case ';': + s.advance() + return Token{Kind: KindSemicolonToken, TokenPos: tokenPos, Text: ";"} + case ',': + s.advance() + return Token{Kind: KindCommaToken, TokenPos: tokenPos, Text: ","} + case '~': + s.advance() + return Token{Kind: KindTildeToken, TokenPos: tokenPos, Text: "~"} + case '@': + s.advance() + return Token{Kind: KindAtToken, TokenPos: tokenPos, Text: "@"} + + case '.': + if s.peekAt(1) == '.' && s.peekAt(2) == '.' { + s.pos += 3 + return Token{Kind: KindDotDotDotToken, TokenPos: tokenPos, Text: "..."} + } + // .123 numeric literal + if s.peekAt(1) >= '0' && s.peekAt(1) <= '9' { + return s.scanNumber(tokenPos) + } + s.advance() + return Token{Kind: KindDotToken, TokenPos: tokenPos, Text: "."} + + case ':': + s.advance() + return Token{Kind: KindColonToken, TokenPos: tokenPos, Text: ":"} + + case '?': + if s.peekAt(1) == '.' 
&& !(s.peekAt(2) >= '0' && s.peekAt(2) <= '9') { + s.pos += 2 + return Token{Kind: KindQuestionDotToken, TokenPos: tokenPos, Text: "?."} + } + if s.peekAt(1) == '?' { + if s.peekAt(2) == '=' { + s.pos += 3 + return Token{Kind: KindQuestionQuestionEqualsToken, TokenPos: tokenPos, Text: "??="} + } + s.pos += 2 + return Token{Kind: KindQuestionQuestionToken, TokenPos: tokenPos, Text: "??"} + } + s.advance() + return Token{Kind: KindQuestionToken, TokenPos: tokenPos, Text: "?"} + + case '!': + if s.peekAt(1) == '=' { + if s.peekAt(2) == '=' { + s.pos += 3 + return Token{Kind: KindExclamationEqualsEqualsToken, TokenPos: tokenPos, Text: "!=="} + } + s.pos += 2 + return Token{Kind: KindExclamationEqualsToken, TokenPos: tokenPos, Text: "!="} + } + s.advance() + return Token{Kind: KindExclamationToken, TokenPos: tokenPos, Text: "!"} + + case '=': + if s.peekAt(1) == '=' { + if s.peekAt(2) == '=' { + s.pos += 3 + return Token{Kind: KindEqualsEqualsEqualsToken, TokenPos: tokenPos, Text: "==="} + } + s.pos += 2 + return Token{Kind: KindEqualsEqualsToken, TokenPos: tokenPos, Text: "=="} + } + if s.peekAt(1) == '>' { + s.pos += 2 + return Token{Kind: KindEqualsGreaterThanToken, TokenPos: tokenPos, Text: "=>"} + } + s.advance() + return Token{Kind: KindEqualsToken, TokenPos: tokenPos, Text: "="} + + case '+': + if s.peekAt(1) == '+' { + s.pos += 2 + return Token{Kind: KindPlusPlusToken, TokenPos: tokenPos, Text: "++"} + } + if s.peekAt(1) == '=' { + s.pos += 2 + return Token{Kind: KindPlusEqualsToken, TokenPos: tokenPos, Text: "+="} + } + s.advance() + return Token{Kind: KindPlusToken, TokenPos: tokenPos, Text: "+"} + + case '-': + if s.peekAt(1) == '-' { + s.pos += 2 + return Token{Kind: KindMinusMinusToken, TokenPos: tokenPos, Text: "--"} + } + if s.peekAt(1) == '=' { + s.pos += 2 + return Token{Kind: KindMinusEqualsToken, TokenPos: tokenPos, Text: "-="} + } + s.advance() + return Token{Kind: KindMinusToken, TokenPos: tokenPos, Text: "-"} + + case '*': + if s.peekAt(1) == '*' { 
+ if s.peekAt(2) == '=' { + s.pos += 3 + return Token{Kind: KindAsteriskAsteriskEqualsToken, TokenPos: tokenPos, Text: "**="} + } + s.pos += 2 + return Token{Kind: KindAsteriskAsteriskToken, TokenPos: tokenPos, Text: "**"} + } + if s.peekAt(1) == '=' { + s.pos += 2 + return Token{Kind: KindAsteriskEqualsToken, TokenPos: tokenPos, Text: "*="} + } + s.advance() + return Token{Kind: KindAsteriskToken, TokenPos: tokenPos, Text: "*"} + + case '%': + if s.peekAt(1) == '=' { + s.pos += 2 + return Token{Kind: KindPercentEqualsToken, TokenPos: tokenPos, Text: "%="} + } + s.advance() + return Token{Kind: KindPercentToken, TokenPos: tokenPos, Text: "%"} + + case '<': + if s.peekAt(1) == '<' { + if s.peekAt(2) == '=' { + s.pos += 3 + return Token{Kind: KindLessThanLessThanEqualsToken, TokenPos: tokenPos, Text: "<<="} + } + s.pos += 2 + return Token{Kind: KindLessThanLessThanToken, TokenPos: tokenPos, Text: "<<"} + } + if s.peekAt(1) == '/' { + s.pos += 2 + return Token{Kind: KindLessThanSlashToken, TokenPos: tokenPos, Text: "': + return s.scanGreater(tokenPos) + + case '&': + if s.peekAt(1) == '&' { + if s.peekAt(2) == '=' { + s.pos += 3 + return Token{Kind: KindAmpersandAmpersandEqualsToken, TokenPos: tokenPos, Text: "&&="} + } + s.pos += 2 + return Token{Kind: KindAmpersandAmpersandToken, TokenPos: tokenPos, Text: "&&"} + } + if s.peekAt(1) == '=' { + s.pos += 2 + return Token{Kind: KindAmpersandEqualsToken, TokenPos: tokenPos, Text: "&="} + } + s.advance() + return Token{Kind: KindAmpersandToken, TokenPos: tokenPos, Text: "&"} + + case '|': + if s.peekAt(1) == '|' { + if s.peekAt(2) == '=' { + s.pos += 3 + return Token{Kind: KindBarBarEqualsToken, TokenPos: tokenPos, Text: "||="} + } + s.pos += 2 + return Token{Kind: KindBarBarToken, TokenPos: tokenPos, Text: "||"} + } + if s.peekAt(1) == '=' { + s.pos += 2 + return Token{Kind: KindBarEqualsToken, TokenPos: tokenPos, Text: "|="} + } + s.advance() + return Token{Kind: KindBarToken, TokenPos: tokenPos, Text: "|"} + + case 
'^': + if s.peekAt(1) == '=' { + s.pos += 2 + return Token{Kind: KindCaretEqualsToken, TokenPos: tokenPos, Text: "^="} + } + s.advance() + return Token{Kind: KindCaretToken, TokenPos: tokenPos, Text: "^"} + + case '#': + // Could be private identifier + if s.peekAt(1) == '!' && tokenPos == 0 { + // Shebang — scan to end of line + return s.scanSingleLineComment(tokenPos) + } + if isIdentStart(s.peekAt(1)) { + return s.scanPrivateIdentifier(tokenPos) + } + s.advance() + return Token{Kind: KindHashToken, TokenPos: tokenPos, Text: "#"} + } + + // Identifier or keyword + if isIdentStartByte(ch) { + return s.scanIdentifierOrKeyword(tokenPos) + } + + // Handle multi-byte Unicode identifier starts + r, size := utf8.DecodeRuneInString(s.text[s.pos:]) + if r != utf8.RuneError && isIdentStartRune(r) { + return s.scanIdentifierOrKeyword(tokenPos) + } + + // Unknown character + s.pos += size + return Token{Kind: KindUnknown, TokenPos: tokenPos, Text: s.text[tokenPos:s.pos]} +} + +func (s *Scanner) scanSingleLineComment(start int) Token { + s.pos += 2 // skip // + for s.pos < len(s.text) && s.text[s.pos] != '\n' && s.text[s.pos] != '\r' { + s.pos++ + } + return Token{Kind: KindSingleLineCommentTrivia, TokenPos: start, Text: s.text[start:s.pos]} +} + +func (s *Scanner) scanMultiLineComment(start int) Token { + s.pos += 2 // skip /* + for s.pos < len(s.text)-1 { + if s.text[s.pos] == '*' && s.text[s.pos+1] == '/' { + s.pos += 2 + return Token{Kind: KindMultiLineCommentTrivia, TokenPos: start, Text: s.text[start:s.pos]} + } + s.pos++ + } + // Unterminated + s.pos = len(s.text) + return Token{Kind: KindMultiLineCommentTrivia, TokenPos: start, Text: s.text[start:s.pos]} +} + +func (s *Scanner) scanString(start int, quote byte) Token { + s.advance() // skip opening quote + for s.pos < len(s.text) { + ch := s.text[s.pos] + if ch == '\\' { + s.pos += 2 + continue + } + if ch == quote { + s.advance() + return Token{Kind: KindStringLiteral, TokenPos: start, Text: s.text[start:s.pos]} + } 
+ if ch == '\n' || ch == '\r' { + // Unterminated string + break + } + s.pos++ + } + return Token{Kind: KindStringLiteral, TokenPos: start, Text: s.text[start:s.pos]} +} + +func (s *Scanner) scanTemplatePart(start int, isRescan bool) Token { + if isRescan { + // We're at a '}' that needs to be rescanned as TemplateMiddle or TemplateTail + s.advance() // skip } + } else { + s.advance() // skip ` + } + for s.pos < len(s.text) { + ch := s.text[s.pos] + if ch == '\\' { + s.pos += 2 + continue + } + if ch == '`' { + s.advance() + if isRescan { + return Token{Kind: KindTemplateTail, TokenPos: start, Text: s.text[start:s.pos]} + } + return Token{Kind: KindNoSubstitutionTemplateLiteral, TokenPos: start, Text: s.text[start:s.pos]} + } + if ch == '$' && s.peekAt(1) == '{' { + s.pos += 2 + if isRescan { + return Token{Kind: KindTemplateMiddle, TokenPos: start, Text: s.text[start:s.pos]} + } + return Token{Kind: KindTemplateHead, TokenPos: start, Text: s.text[start:s.pos]} + } + s.pos++ + } + // Unterminated + if isRescan { + return Token{Kind: KindTemplateTail, TokenPos: start, Text: s.text[start:s.pos]} + } + return Token{Kind: KindNoSubstitutionTemplateLiteral, TokenPos: start, Text: s.text[start:s.pos]} +} + +func (s *Scanner) scanRegExp(start int) Token { + s.advance() // skip / + inCharClass := false + for s.pos < len(s.text) { + ch := s.text[s.pos] + if ch == '\\' { + s.pos += 2 + continue + } + if ch == '[' { + inCharClass = true + s.pos++ + continue + } + if ch == ']' { + inCharClass = false + s.pos++ + continue + } + if ch == '/' && !inCharClass { + s.advance() // skip closing / + // Scan flags + for s.pos < len(s.text) && isIdentChar(s.text[s.pos]) { + s.pos++ + } + return Token{Kind: KindRegularExpressionLiteral, TokenPos: start, Text: s.text[start:s.pos]} + } + if ch == '\n' || ch == '\r' { + break + } + s.pos++ + } + return Token{Kind: KindRegularExpressionLiteral, TokenPos: start, Text: s.text[start:s.pos]} +} + +func (s *Scanner) scanGreater(start int) Token { 
+ s.advance() // skip > + if s.peek() == '>' { + s.advance() + if s.peek() == '>' { + s.advance() + if s.peek() == '=' { + s.advance() + return Token{Kind: KindGreaterThanGreaterThanGreaterThanEqualsToken, TokenPos: start, Text: ">>>="} + } + return Token{Kind: KindGreaterThanGreaterThanGreaterThanToken, TokenPos: start, Text: ">>>"} + } + if s.peek() == '=' { + s.advance() + return Token{Kind: KindGreaterThanGreaterThanEqualsToken, TokenPos: start, Text: ">>="} + } + return Token{Kind: KindGreaterThanGreaterThanToken, TokenPos: start, Text: ">>"} + } + if s.peek() == '=' { + s.advance() + return Token{Kind: KindGreaterThanEqualsToken, TokenPos: start, Text: ">="} + } + return Token{Kind: KindGreaterThanToken, TokenPos: start, Text: ">"} +} + +func (s *Scanner) scanNumber(start int) Token { + if s.peek() == '0' { + next := s.peekAt(1) + if next == 'x' || next == 'X' { + s.pos += 2 + s.scanHexDigits() + return s.finishBigIntOrNumber(start) + } + if next == 'b' || next == 'B' { + s.pos += 2 + s.scanBinaryDigits() + return s.finishBigIntOrNumber(start) + } + if next == 'o' || next == 'O' { + s.pos += 2 + s.scanOctalDigits() + return s.finishBigIntOrNumber(start) + } + } + + s.scanDecimalDigits() + if s.peek() == '.' 
{ + s.advance() + s.scanDecimalDigits() + } + if s.peek() == 'e' || s.peek() == 'E' { + s.advance() + if s.peek() == '+' || s.peek() == '-' { + s.advance() + } + s.scanDecimalDigits() + } + return s.finishBigIntOrNumber(start) +} + +func (s *Scanner) finishBigIntOrNumber(start int) Token { + if s.peek() == 'n' { + s.advance() + return Token{Kind: KindBigIntLiteral, TokenPos: start, Text: s.text[start:s.pos]} + } + return Token{Kind: KindNumericLiteral, TokenPos: start, Text: s.text[start:s.pos]} +} + +func (s *Scanner) scanDecimalDigits() { + for s.pos < len(s.text) { + ch := s.text[s.pos] + if (ch >= '0' && ch <= '9') || ch == '_' { + s.pos++ + } else { + break + } + } +} + +func (s *Scanner) scanHexDigits() { + for s.pos < len(s.text) { + ch := s.text[s.pos] + if (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F') || ch == '_' { + s.pos++ + } else { + break + } + } +} + +func (s *Scanner) scanBinaryDigits() { + for s.pos < len(s.text) { + ch := s.text[s.pos] + if ch == '0' || ch == '1' || ch == '_' { + s.pos++ + } else { + break + } + } +} + +func (s *Scanner) scanOctalDigits() { + for s.pos < len(s.text) { + ch := s.text[s.pos] + if (ch >= '0' && ch <= '7') || ch == '_' { + s.pos++ + } else { + break + } + } +} + +func (s *Scanner) scanIdentifierOrKeyword(start int) Token { + for s.pos < len(s.text) { + ch := s.text[s.pos] + if isIdentChar(ch) { + s.pos++ + } else if ch >= 0x80 { + r, size := utf8.DecodeRuneInString(s.text[s.pos:]) + if r != utf8.RuneError && (unicode.IsLetter(r) || unicode.IsDigit(r) || r == '\u200C' || r == '\u200D') { + s.pos += size + } else { + break + } + } else { + break + } + } + text := s.text[start:s.pos] + if kind, ok := keywordKinds[text]; ok { + return Token{Kind: kind, TokenPos: start, Text: text} + } + return Token{Kind: KindIdentifier, TokenPos: start, Text: text} +} + +func (s *Scanner) scanPrivateIdentifier(start int) Token { + s.advance() // skip # + for s.pos < len(s.text) && 
isIdentChar(s.text[s.pos]) { + s.pos++ + } + return Token{Kind: KindPrivateIdentifier, TokenPos: start, Text: s.text[start:s.pos]} +} + +func isIdentStartByte(ch byte) bool { + return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_' || ch == '$' +} + +func isIdentStart(ch byte) bool { + return isIdentStartByte(ch) +} + +func isIdentStartRune(r rune) bool { + return unicode.IsLetter(r) || r == '_' || r == '$' +} + +func isIdentChar(ch byte) bool { + return isIdentStartByte(ch) || (ch >= '0' && ch <= '9') +} + +// keywordKinds maps keyword text to TS7 SyntaxKind values. +// These start at KindBreakKeyword = 82. +var keywordKinds = map[string]int{ + "break": 82, + "case": 83, + "catch": 84, + "class": 85, + "const": 86, + "continue": 87, + "debugger": 88, + "default": 89, + "delete": 90, + "do": 91, + "else": 92, + "enum": 93, + "export": 94, + "extends": 95, + "false": 96, + "finally": 97, + "for": 98, + "function": 99, + "if": 100, + "import": 101, + "in": 102, + "instanceof": 103, + "new": 104, + "null": 105, + "return": 106, + "super": 107, + "switch": 108, + "this": 109, + "throw": 110, + "true": 111, + "try": 112, + "typeof": 113, + "var": 114, + "void": 115, + "while": 116, + "with": 117, + // Strict mode reserved words + "implements": 118, + "interface": 119, + "let": 120, + "package": 121, + "private": 122, + "protected": 123, + "public": 124, + "static": 125, + "yield": 126, + // Contextual keywords + "abstract": 127, + "accessor": 128, + "as": 129, + "asserts": 130, + "assert": 131, + "any": 132, + "async": 133, + "await": 134, + "boolean": 135, + "constructor": 136, + "declare": 137, + "get": 138, + "immediate": 139, + "infer": 140, + "intrinsic": 141, + "is": 142, + "keyof": 143, + "module": 144, + "namespace": 145, + "never": 146, + "out": 147, + "readonly": 148, + "require": 149, + "number": 150, + "object": 151, + "satisfies": 152, + "set": 153, + "string": 154, + "symbol": 155, + "type": 156, + "undefined": 157, + "unique": 158, + 
"unknown": 159, + "using": 160, + "from": 161, + "global": 162, + "bigint": 163, + "override": 164, + "of": 165, + "defer": 166, +} diff --git a/javascript/extractor/lib/typescript-go/internal/tsparser/metadata.go b/javascript/extractor/lib/typescript-go/internal/tsparser/metadata.go index d028dfdce8f..c92afa234d0 100644 --- a/javascript/extractor/lib/typescript-go/internal/tsparser/metadata.go +++ b/javascript/extractor/lib/typescript-go/internal/tsparser/metadata.go @@ -1,279 +1,335 @@ package tsparser -// getStaticTS7Metadata returns hardcoded metadata for TypeScript 7. +// GetStaticTS7Metadata returns hardcoded metadata for TypeScript 7. // This must be kept in sync with the TypeScript compiler's SyntaxKind and -// NodeFlags enums. Eventually this should be obtained dynamically from -// the tsgo API. +// NodeFlags enums. // // The SyntaxKind values here correspond to the TypeScript 7 (Go port) // compiler. The Java extractor uses the string names (not numeric IDs) // to identify node kinds, so the exact numeric values only matter for // the metadata response. -func getStaticTS7Metadata() *Metadata { +func GetStaticTS7Metadata() *Metadata { return &Metadata{ SyntaxKinds: syntaxKinds, NodeFlags: nodeFlags, } } +// GetSyntaxKinds returns the raw SyntaxKind name→number map. +func GetSyntaxKinds() map[string]int { + return syntaxKinds +} + +// BuildKindToNameMap returns a number→name reverse map for SyntaxKinds. +func BuildKindToNameMap() map[uint32]string { + m := make(map[uint32]string, len(syntaxKinds)) + for name, num := range syntaxKinds { + key := uint32(num) + if existing, ok := m[key]; !ok || len(name) < len(existing) { + m[key] = name + } + } + return m +} + // syntaxKinds maps SyntaxKind names to their numeric values in TypeScript 7. -// This is a subset covering the kinds most commonly seen in parsed ASTs. -// The full set should be generated from the TypeScript source. +// Generated from microsoft/typescript-go/internal/ast/kind.go (iota enum). 
var syntaxKinds = map[string]int{
	// Sentinels.
	"Unknown": 0,
	"EndOfFile": 1,

	// Literals and template parts.
	"NumericLiteral": 8,
	"BigIntLiteral": 9,
	"StringLiteral": 10,
	"JsxText": 11,
	"JsxTextAllWhiteSpaces": 12,
	"RegularExpressionLiteral": 13,
	"NoSubstitutionTemplateLiteral": 14,
	"TemplateHead": 15,
	"TemplateMiddle": 16,
	"TemplateTail": 17,

	// Punctuation and operator tokens.
	"OpenBraceToken": 18,
	"CloseBraceToken": 19,
	"OpenParenToken": 20,
	"CloseParenToken": 21,
	"OpenBracketToken": 22,
	"CloseBracketToken": 23,
	"DotToken": 24,
	"DotDotDotToken": 25,
	"SemicolonToken": 26,
	"CommaToken": 27,
	"QuestionDotToken": 28,
	"LessThanToken": 29,
	"GreaterThanToken": 31,
	"EqualsGreaterThanToken": 38,
	"PlusToken": 39,
	"MinusToken": 40,
	"AsteriskToken": 41,
	"SlashToken": 43,
	"PlusPlusToken": 45,
	"MinusMinusToken": 46,
	"ExclamationToken": 53,
	"TildeToken": 54,
	"QuestionToken": 57,
	"ColonToken": 58,
	"AtToken": 59,
	"EqualsToken": 63,

	// Identifiers.
	"Identifier": 79,
	"PrivateIdentifier": 80,

	// Reserved words.
	"BreakKeyword": 82,
	"CaseKeyword": 83,
	"CatchKeyword": 84,
	"ClassKeyword": 85,
	"ConstKeyword": 86,
	"ContinueKeyword": 87,
	"DebuggerKeyword": 88,
	"DefaultKeyword": 89,
	"DeleteKeyword": 90,
	"DoKeyword": 91,
	"ElseKeyword": 92,
	"EnumKeyword": 93,
	"ExportKeyword": 94,
	"ExtendsKeyword": 95,
	"FalseKeyword": 96,
	"FinallyKeyword": 97,
	"ForKeyword": 98,
	"FunctionKeyword": 99,
	"IfKeyword": 100,
	"ImportKeyword": 101,
	"InKeyword": 102,
	"InstanceOfKeyword": 103,
	"NewKeyword": 104,
	"NullKeyword": 105,
	"ReturnKeyword": 106,
	"SuperKeyword": 107,
	"SwitchKeyword": 108,
	"ThisKeyword": 109,
	"ThrowKeyword": 110,
	"TrueKeyword": 111,
	"TryKeyword": 112,
	"TypeOfKeyword": 113,
	"VarKeyword": 114,
	"VoidKeyword": 115,
	"WhileKeyword": 116,
	"WithKeyword": 117,

	// Strict-mode reserved words.
	"ImplementsKeyword": 118,
	"InterfaceKeyword": 119,
	"LetKeyword": 120,
	"PackageKeyword": 121,
	"PrivateKeyword": 122,
	"ProtectedKeyword": 123,
	"PublicKeyword": 124,
	"StaticKeyword": 125,
	"YieldKeyword": 126,

	// Contextual keywords.
	//
	// ImmediateKeyword, OutKeyword, SatisfiesKeyword, UsingKeyword and
	// GlobalKeyword fill what used to be numbering gaps at 139, 147, 152, 160
	// and 162, so that a number→name reverse lookup can resolve them. The
	// values mirror the scanner's keyword table introduced in the same change.
	// NOTE(review): confirm the five names and values against
	// microsoft/typescript-go/internal/ast/kind.go.
	"AbstractKeyword": 127,
	"AccessorKeyword": 128,
	"AsKeyword": 129,
	"AssertsKeyword": 130,
	"AssertKeyword": 131,
	"AnyKeyword": 132,
	"AsyncKeyword": 133,
	"AwaitKeyword": 134,
	"BooleanKeyword": 135,
	"ConstructorKeyword": 136,
	"DeclareKeyword": 137,
	"GetKeyword": 138,
	"ImmediateKeyword": 139,
	"InferKeyword": 140,
	"IntrinsicKeyword": 141,
	"IsKeyword": 142,
	"KeyOfKeyword": 143,
	"ModuleKeyword": 144,
	"NamespaceKeyword": 145,
	"NeverKeyword": 146,
	"OutKeyword": 147,
	"ReadonlyKeyword": 148,
	"RequireKeyword": 149,
	"NumberKeyword": 150,
	"ObjectKeyword": 151,
	"SatisfiesKeyword": 152,
	"SetKeyword": 153,
	"StringKeyword": 154,
	"SymbolKeyword": 155,
	"TypeKeyword": 156,
	"UndefinedKeyword": 157,
	"UniqueKeyword": 158,
	"UnknownKeyword": 159,
	"UsingKeyword": 160,
	"FromKeyword": 161,
	"GlobalKeyword": 162,
	"BigIntKeyword": 163,
	"OverrideKeyword": 164,
	"OfKeyword": 165,
	"DeferKeyword": 166,

	// Names, signatures and class/interface members.
	"QualifiedName": 167,
	"ComputedPropertyName": 168,
	"TypeParameter": 169,
	"Parameter": 170,
	"Decorator": 171,
	"PropertySignature": 172,
	"PropertyDeclaration": 173,
	"MethodSignature": 174,
	"MethodDeclaration": 175,
	"ClassStaticBlockDeclaration": 176,
	"Constructor": 177,
	"GetAccessor": 178,
	"SetAccessor": 179,
	"CallSignature": 180,
	"ConstructSignature": 181,
	"IndexSignature": 182,

	// Type nodes.
	"TypePredicate": 183,
	"TypeReference": 184,
	"FunctionType": 185,
	"ConstructorType": 186,
	"TypeQuery": 187,
	"TypeLiteral": 188,
	"ArrayType": 189,
	"TupleType": 190,
	"OptionalType": 191,
	"RestType": 192,
	"UnionType": 193,
	"IntersectionType": 194,
	"ConditionalType": 195,
	"InferType": 196,
	"ParenthesizedType": 197,
	"ThisType": 198,
	"TypeOperator": 199,
	"IndexedAccessType": 200,
	"MappedType": 201,
	"LiteralType": 202,
	"NamedTupleMember": 203,
	"TemplateLiteralType": 204,
	"TemplateLiteralTypeSpan": 205,
	"ImportType": 206,

	// Binding patterns.
	"ObjectBindingPattern": 207,
	"ArrayBindingPattern": 208,
	"BindingElement": 209,

	// Expressions.
	"ArrayLiteralExpression": 210,
	"ObjectLiteralExpression": 211,
	"PropertyAccessExpression": 212,
	"ElementAccessExpression": 213,
	"CallExpression": 214,
	"NewExpression": 215,
	"TaggedTemplateExpression": 216,
	"TypeAssertionExpression": 217,
	"ParenthesizedExpression": 218,
	"FunctionExpression": 219,
	"ArrowFunction": 220,
	"DeleteExpression": 221,
	"TypeOfExpression": 222,
	"VoidExpression": 223,
	"AwaitExpression": 224,
	"PrefixUnaryExpression": 225,
	"PostfixUnaryExpression": 226,
	"BinaryExpression": 227,
	"ConditionalExpression": 228,
	"TemplateExpression": 229,
	"YieldExpression": 230,
	"SpreadElement": 231,
	"ClassExpression": 232,
	"OmittedExpression": 233,
	"ExpressionWithTypeArguments": 234,
	"AsExpression": 235,
	"NonNullExpression": 236,
	"MetaProperty": 237,
	"SatisfiesExpression": 239,
	"TemplateSpan": 240,
	"SemicolonClassElement": 241,

	// Statements.
	"Block": 242,
	"EmptyStatement": 243,
	"VariableStatement": 244,
	"ExpressionStatement": 245,
	"IfStatement": 246,
	"DoStatement": 247,
	"WhileStatement": 248,
	"ForStatement": 249,
	"ForInStatement": 250,
	"ForOfStatement": 251,
	"ContinueStatement": 252,
	"BreakStatement": 253,
	"ReturnStatement": 254,
	"WithStatement": 255,
	"SwitchStatement": 256,
	"LabeledStatement": 257,
	"ThrowStatement": 258,
	"TryStatement": 259,
	"DebuggerStatement": 260,

	// Declarations, imports and exports.
	"VariableDeclaration": 261,
	"VariableDeclarationList": 262,
	"FunctionDeclaration": 263,
	"ClassDeclaration": 264,
	"InterfaceDeclaration": 265,
	"TypeAliasDeclaration": 266,
	"EnumDeclaration": 267,
	"ModuleDeclaration": 268,
	"ModuleBlock": 269,
	"CaseBlock": 270,
	"NamespaceExportDeclaration": 271,
	"ImportEqualsDeclaration": 272,
	"ImportDeclaration": 273,
	"ImportClause": 274,
	"NamespaceImport": 275,
	"NamedImports": 276,
	"ImportSpecifier": 277,
	"ExportAssignment": 278,
	"ExportDeclaration": 279,
	"NamedExports": 280,
	"NamespaceExport": 281,
	"ExportSpecifier": 282,
	"MissingDeclaration": 283,
	"ExternalModuleReference": 284,

	// JSX.
	"JsxElement": 285,
	"JsxSelfClosingElement": 286,
	"JsxOpeningElement": 287,
	"JsxClosingElement": 288,
	"JsxFragment": 289,
	"JsxOpeningFragment": 290,
	"JsxClosingFragment": 291,
	"JsxAttribute": 292,
	"JsxAttributes": 293,
	"JsxSpreadAttribute": 294,
	"JsxExpression": 295,
	"JsxNamespacedName": 296,

	// Clauses, object-literal members, enum members and the file root.
	"CaseClause": 297,
	"DefaultClause": 298,
	"HeritageClause": 299,
	"CatchClause": 300,
	"ImportAttributes": 301,
	"ImportAttribute": 302,
	"PropertyAssignment": 303,
	"ShorthandPropertyAssignment": 304,
	"SpreadAssignment": 305,
	"EnumMember": 306,
	"SourceFile": 307,

	// JSDoc.
	"JSDocTypeExpression": 308,
	"JSDocNameReference": 309,
	"JSDocNullableType": 312,
	"JSDocNonNullableType": 313,
	"JSDocOptionalType": 314,
	"JSDocVariadicType": 315,
	"JSDoc": 316,
	"JSDocText": 317,
	"JSDocTypeLiteral": 318,
	"JSDocSignature": 319,
	"JSDocLink": 320,
	"JSDocLinkCode": 321,
	"JSDocLinkPlain": 322,
	"JSDocTag": 323,
	"JSDocAugmentsTag": 324,
	"JSDocImplementsTag": 325,
	"JSDocDeprecatedTag": 326,
	"JSDocPublicTag": 327,
	"JSDocPrivateTag": 328,
	"JSDocProtectedTag": 329,
	"JSDocReadonlyTag": 330,
	"JSDocOverrideTag": 331,
	"JSDocCallbackTag": 332,
	"JSDocOverloadTag": 333,
	"JSDocParameterTag": 334,
	"JSDocReturnTag": 335,
	"JSDocThisTag": 336,
	"JSDocTypeTag": 337,
	"JSDocTemplateTag": 338,
	"JSDocTypedefTag": 339,
	"JSDocSeeTag": 340,
	"JSDocPropertyTag": 341,
	"JSDocThrowsTag": 342,
	"JSDocSatisfiesTag": 343,
	"JSDocImportTag": 344,
}

// nodeFlags maps NodeFlags names to their numeric values.
diff --git a/javascript/extractor/lib/typescript-go/internal/tsparser/standalone.go b/javascript/extractor/lib/typescript-go/internal/tsparser/standalone.go index 62918d6084a..e8e180fe177 100644 --- a/javascript/extractor/lib/typescript-go/internal/tsparser/standalone.go +++ b/javascript/extractor/lib/typescript-go/internal/tsparser/standalone.go @@ -100,7 +100,7 @@ func (p *StandaloneParser) Parse(filename string) (*ParseResult, error) { // GetMetadata returns static TS7 metadata. func (p *StandaloneParser) GetMetadata() (*Metadata, error) { - return getStaticTS7Metadata(), nil + return GetStaticTS7Metadata(), nil } // Reset is a no-op for the standalone parser. diff --git a/javascript/extractor/lib/typescript-go/internal/tsparser/tsgo.go b/javascript/extractor/lib/typescript-go/internal/tsparser/tsgo.go index 604a703d65f..6c2b70631e3 100644 --- a/javascript/extractor/lib/typescript-go/internal/tsparser/tsgo.go +++ b/javascript/extractor/lib/typescript-go/internal/tsparser/tsgo.go @@ -11,6 +11,8 @@ import ( "path/filepath" "strconv" "sync" + + "github.com/github/codeql/javascript/extractor/lib/typescript-go/internal/astconv" ) // TsgoParser implements the Parser interface by running the tsgo binary @@ -186,26 +188,36 @@ func (p *TsgoParser) sendRequest(method string, params interface{}) (json.RawMes return nil, fmt.Errorf("failed to marshal request: %w", err) } + fmt.Fprintf(os.Stderr, "[tsgo] >>> %s id=%d\n", method, id) + if err := p.writeMessage(data); err != nil { return nil, fmt.Errorf("failed to write request: %w", err) } - // Read the response - respData, err := p.readMessage() - if err != nil { - return nil, fmt.Errorf("failed to read response: %w", err) - } + // Read responses, skipping notifications (messages without a matching id). + // In --async mode, tsgo may send diagnostic notifications between responses. 
+ for { + respData, err := p.readMessage() + if err != nil { + return nil, fmt.Errorf("failed to read response: %w", err) + } - var resp jsonRPCResponse - if err := json.Unmarshal(respData, &resp); err != nil { - return nil, fmt.Errorf("failed to parse response: %w", err) - } + var resp jsonRPCResponse + if err := json.Unmarshal(respData, &resp); err != nil { + return nil, fmt.Errorf("failed to parse response: %w", err) + } - if resp.Error != nil { - return nil, fmt.Errorf("tsgo API error %d: %s", resp.Error.Code, resp.Error.Message) - } + // Skip notifications (id=0 means no id field was present in JSON) + if resp.ID != id { + continue + } - return resp.Result, nil + if resp.Error != nil { + return nil, fmt.Errorf("tsgo API error %d: %s", resp.Error.Code, resp.Error.Message) + } + + return resp.Result, nil + } } // call sends a request with proper locking and initialization. @@ -229,30 +241,48 @@ type updateSnapshotResponse struct { } `json:"projects"` } -// ensureProjectOpen opens a project for the given file's directory using -// a temporary tsconfig, or uses the existing snapshot if already open. +// ensureProjectOpen opens a project for the given file. +// The tsgo API requires a tsconfig for project opening, so if none exists +// in the file's directory, we create a temporary one. func (p *TsgoParser) ensureProjectOpen(filename string) error { if p.snapshotHandle != "" && p.projectHandle != "" { return nil } - // Create a snapshot by opening a project. - // For single-file parsing without a tsconfig, we ask tsgo to open - // the file's directory as a project. The tsgo API requires a - // tsconfig path for OpenProject. 
dir := filepath.Dir(filename) + base := filepath.Base(filename) tsconfigPath := filepath.Join(dir, "tsconfig.json") - // First try: updateSnapshot with the file's directory tsconfig + // If no tsconfig exists, create a temporary one + createdTsconfig := false + if _, err := os.Stat(tsconfigPath); os.IsNotExist(err) { + tsconfig := fmt.Sprintf(`{ + "compilerOptions": { + "target": "esnext", + "module": "esnext", + "noEmit": true, + "strict": false, + "allowJs": true + }, + "files": [%q] +}`, base) + if err := os.WriteFile(tsconfigPath, []byte(tsconfig), 0644); err != nil { + return fmt.Errorf("failed to create temporary tsconfig: %w", err) + } + createdTsconfig = true + } + result, err := p.sendRequest("updateSnapshot", map[string]interface{}{ "openProject": tsconfigPath, }) + + // Clean up temporary tsconfig + if createdTsconfig { + os.Remove(tsconfigPath) + } + if err != nil { - // If no tsconfig exists, try without a project - result, err = p.sendRequest("updateSnapshot", map[string]interface{}{}) - if err != nil { - return fmt.Errorf("failed to create snapshot: %w", err) - } + return fmt.Errorf("failed to open project: %w", err) } var resp updateSnapshotResponse @@ -303,18 +333,37 @@ func (p *TsgoParser) Parse(filename string) (*ParseResult, error) { return nil, fmt.Errorf("parse %s: %w", filename, err) } - // The result is the binary-encoded source file data (base64 when - // using JSON protocol). For now, store the raw response. - // TODO: Decode the binary format into a JSON AST. + // The result is {"data":""} containing a binary-encoded AST. 
+ var dataResp struct { + Data string `json:"data"` + } + if err := json.Unmarshal(result, &dataResp); err != nil { + return nil, fmt.Errorf("parse %s: failed to parse getSourceFile response: %w", filename, err) + } + + binaryAST, err := astconv.DecodeBinaryASTFromBase64(dataResp.Data) + if err != nil { + return nil, fmt.Errorf("parse %s: failed to decode binary AST: %w", filename, err) + } + + kindToName := BuildKindToNameMap() + converter := astconv.NewConverter(binaryAST, kindToName) + astObj, err := converter.Convert() + if err != nil { + return nil, fmt.Errorf("parse %s: failed to convert AST: %w", filename, err) + } + + filtered := astconv.FilterWhitelist(astObj) + return &ParseResult{ - AST: result, - RawData: []byte(result), + AST: filtered, + RawData: []byte(dataResp.Data), }, nil } // GetMetadata returns the syntax kinds and node flags. func (p *TsgoParser) GetMetadata() (*Metadata, error) { - return getStaticTS7Metadata(), nil + return GetStaticTS7Metadata(), nil } // Reset resets the parser state, killing and restarting the subprocess. 
diff --git a/javascript/extractor/lib/typescript-go/internal/tsparser/tsgo_test.go b/javascript/extractor/lib/typescript-go/internal/tsparser/tsgo_test.go index 4a889806244..ec9b3850687 100644 --- a/javascript/extractor/lib/typescript-go/internal/tsparser/tsgo_test.go +++ b/javascript/extractor/lib/typescript-go/internal/tsparser/tsgo_test.go @@ -193,7 +193,7 @@ func TestTsgoGetMetadata(t *testing.T) { } func TestStaticMetadata(t *testing.T) { - meta := getStaticTS7Metadata() + meta := GetStaticTS7Metadata() required := []string{"SourceFile", "Identifier", "Block", "VariableStatement", "FunctionDeclaration", "ClassDeclaration", "InterfaceDeclaration"} @@ -235,3 +235,68 @@ func min(a, b int) int { } return b } + +func TestTsgoParse(t *testing.T) { + if _, err := exec.LookPath("tsgo"); err != nil { + t.Skip("tsgo not found on PATH") + } + + sampleFile := findTestFile(t) + parser := NewTsgoParser(Config{Stderr: os.Stderr}) + defer parser.Close() + + result, err := parser.Parse(sampleFile) + if err != nil { + t.Fatalf("Parse failed: %v", err) + } + + ast, ok := result.AST.(map[string]interface{}) + if !ok { + t.Fatalf("Expected AST to be map[string]interface{}, got %T", result.AST) + } + + // Verify the root is a SourceFile + kindVal, ok := ast["kind"] + if !ok { + t.Fatal("Missing 'kind' property on root node") + } + kindNum, ok := kindVal.(int) + if !ok { + t.Fatalf("Expected 'kind' to be int, got %T", kindVal) + } + if kindNum != 307 { // SourceFile = 307 in TS7 + t.Errorf("Expected root kind=307 (SourceFile), got %d", kindNum) + } + + // Verify $pos and $end + if _, ok := ast["$pos"]; !ok { + t.Error("Missing '$pos' property") + } + if _, ok := ast["$end"]; !ok { + t.Error("Missing '$end' property") + } + + // Verify statements array + stmts, ok := ast["statements"] + if !ok { + t.Fatal("Missing 'statements' property") + } + stmtsArr, ok := stmts.([]interface{}) + if !ok { + t.Fatalf("Expected statements to be array, got %T", stmts) + } + if len(stmtsArr) == 0 { + 
t.Error("Expected non-empty statements array") + } + + // Print a nicely indented snippet for debug + jsonBytes, err := json.MarshalIndent(ast, "", " ") + if err != nil { + t.Fatalf("Failed to marshal AST: %v", err) + } + snippet := string(jsonBytes) + if len(snippet) > 2000 { + snippet = snippet[:2000] + "\n... (truncated)" + } + t.Logf("Parsed AST (first 2000 chars):\n%s", snippet) +} diff --git a/javascript/extractor/lib/typescript-go/internal/validation/validation_test.go b/javascript/extractor/lib/typescript-go/internal/validation/validation_test.go index 13f6dd4e6af..bef14429932 100644 --- a/javascript/extractor/lib/typescript-go/internal/validation/validation_test.go +++ b/javascript/extractor/lib/typescript-go/internal/validation/validation_test.go @@ -297,10 +297,20 @@ func TestCompareOutputs(t *testing.T) { os.WriteFile(filepath.Join(outDir, basename+".nodejs.json"), nodejsNorm, 0644) os.WriteFile(filepath.Join(outDir, basename+".go.json"), goNorm, 0644) - t.Errorf("Output mismatch for %s\n"+ - " Node.js output saved to: validation-output/%s.nodejs.json\n"+ - " Go output saved to: validation-output/%s.go.json", - basename, basename, basename) + // Parse both outputs and check for structural diffs (ignoring expected kind/flags differences) + var nodejsObj, goObj map[string]interface{} + json.Unmarshal(nodejsNorm, &nodejsObj) + json.Unmarshal(goNorm, &goObj) + + structural := countStructuralDiffs(nodejsObj["ast"], goObj["ast"], "root") + if structural > 0 { + t.Errorf("Output has %d structural diff(s) for %s (beyond expected kind/flags diffs)\n"+ + " Node.js output saved to: validation-output/%s.nodejs.json\n"+ + " Go output saved to: validation-output/%s.go.json", + structural, basename, basename, basename) + } else { + t.Logf("Output for %s differs only in expected kind/flags/token numeric values (TS5 vs TS7)", basename) + } } }) } @@ -324,3 +334,76 @@ func TestNormalizeJSON(t *testing.T) { t.Errorf("got:\n%s\nexpected:\n%s", string(result), expected) } } 

// numericValueKeys lists the JSON object keys whose numeric values are allowed
// to differ between the TS5 and TS7 outputs, because they hold SyntaxKind or
// NodeFlags numbers.
var numericValueKeys = map[string]bool{
	"kind":     true,
	"flags":    true,
	"token":    true,
	"operator": true,
}

// countStructuralDiffs walks two decoded JSON values in lockstep and returns
// how many differences it finds, leaving out number-vs-number mismatches that
// sit directly under one of the numericValueKeys.
//
// path describes where a and b sit in the document ("root.statements[0].kind");
// it is only used to recover the name of the enclosing object key.
//
// A container compared against a value of another type, or two arrays of
// different length, count as a single difference and are not descended into.
func countStructuralDiffs(a, b interface{}, path string) int {
	switch left := a.(type) {
	case map[string]interface{}:
		right, isObject := b.(map[string]interface{})
		if !isObject {
			return 1
		}
		total := 0
		// Keys of the left object: either absent on the right, or compared recursively.
		for key, leftVal := range left {
			rightVal, present := right[key]
			if !present {
				total++
				continue
			}
			total += countStructuralDiffs(leftVal, rightVal, path+"."+key)
		}
		// Keys that exist only on the right.
		for key := range right {
			if _, present := left[key]; !present {
				total++
			}
		}
		return total

	case []interface{}:
		right, isArray := b.([]interface{})
		if !isArray || len(left) != len(right) {
			return 1
		}
		total := 0
		for idx, leftVal := range left {
			total += countStructuralDiffs(leftVal, right[idx], fmt.Sprintf("%s[%d]", path, idx))
		}
		return total
	}

	// Scalars (and null).
	if a == b {
		return 0
	}
	if numericValueKeys[lastPathComponent(path)] {
		// Only a number-vs-number mismatch is the expected TS5↔TS7 renumbering.
		_, leftIsNum := a.(float64)
		_, rightIsNum := b.(float64)
		if leftIsNum && rightIsNum {
			return 0
		}
	}
	return 1
}

// lastPathComponent returns the part of path after its final '.', or the whole
// path when it contains no '.'.
func lastPathComponent(path string) string {
	cut := len(path) - 1
	for cut >= 0 && path[cut] != '.' {
		cut--
	}
	// cut is -1 when no dot was found, which makes the slice start at 0.
	return path[cut+1:]
}