Python: Copy Python extractor to codeql repo

This commit is contained in:
Taus
2024-02-28 15:15:21 +00:00
parent 297a17975d
commit 6dec323cfc
369 changed files with 165346 additions and 0 deletions

View File

@@ -0,0 +1,34 @@
import os.path
from .parse import Parser
from .emit import Emitter
class Extractor(object):
    """Drive extraction of ``.thrift`` source files.

    Every file is read as bytes, decoded as UTF-8, parsed with ``Parser``
    and passed to ``Emitter``.  When a source archive object was supplied,
    the raw bytes of the file are also written to it.
    """

    def __init__(self, trap_folder, src_archive=None):
        """Create the parser and the emitter writing into *trap_folder*.

        *src_archive* is optional; if truthy, its ``write(file, bytes)``
        method is called for each extracted file.
        """
        self.src_archive = src_archive
        self.parser = Parser()
        self.emitter = Emitter(trap_folder)

    def _walk(self, path):
        """Yield the path of each ``*.thrift`` file found below *path*."""
        for folder, _subdirs, names in os.walk(path):
            for name in names:
                if not name.endswith(".thrift"):
                    continue
                yield os.path.join(folder, name)

    def extract_files(self, files):
        """Extract each path in the iterable *files*, in order."""
        for source in files:
            self.extract_file(source)

    def extract_folder(self, path):
        """Extract every ``.thrift`` file found recursively under *path*."""
        for source in self._walk(path):
            self.extract_file(source)

    def extract_file(self, file):
        """Parse one file, emit its tree and optionally archive its bytes."""
        with open(file, "rb") as stream:
            raw = stream.read()
        # The raw bytes are kept so the archive receives the file unmodified.
        tree = self.parser.parse(raw.decode('utf-8'))
        self.emitter.emit(file, tree)
        archive = self.src_archive
        if archive:
            archive.write(file, raw)

View File

@@ -0,0 +1,52 @@
import os.path
import csv
import re
import semmle.util
# Rule names whose subtrees are not emitted.  The pattern is applied with
# ``re.match``, so it matches any rule name *starting* with one of these.
IGNORE = re.compile("namespace|fieldreq")


class Emitter(object):
    """Write a parse tree out as ``externalData`` tuples in a TRAP file."""

    def __init__(self, trap_folder):
        """Remember the output folder and initialise the naming state."""
        self.trap_folder = trap_folder
        # Number of columns first seen for each row kind; see ``emitrow``.
        self.lengths = {}
        # Counter bumped once per visited node; part of every node name.
        self.next_id = 0
        self.uuid = semmle.util.uuid('thrift')

    def emit(self, file, tree):
        """Emit *tree* (parsed from *file*) and store the compressed TRAP."""
        writer = semmle.util.TrapWriter()
        virtual_path = self.trap_folder.get_virtual_path(file)
        self.emit_recursive(writer, virtual_path, tree, None, None)
        writer.write_file(virtual_path)
        compressed = writer.get_compressed()
        self.trap_folder.write_trap("thrift", file, compressed)

    def emitrow(self, trapwriter, kind, *args):
        """Write one row of *kind*, one ``externalData`` tuple per value.

        The first row seen for a kind fixes its column count; a later row
        of the same kind with a different count raises ``Exception``.
        """
        expected = self.lengths.setdefault(kind, len(args))
        if len(args) != expected:
            raise Exception("Inconsistent row for '%s': %s, expecting %d" % (kind, args, expected))
        table = "thrift-" + kind
        row_id = trapwriter.get_unique_id()
        for column, value in enumerate(args):
            trapwriter.write_tuple("externalData", "rsds", row_id, table, column, value)

    def emit_recursive(self, trapwriter, file, node, index, parent):
        """Emit *node* and, for non-leaf nodes, all of its children.

        A node with a ``type`` attribute is handled as a leaf (presumably a
        Lark token): its row carries value, file, line and column.  Any
        other node is handled as a tree with ``data`` and ``children``; its
        children are emitted first, then its own row.  *index* and *parent*
        are ``None`` for the root, which gets a row containing only its name.
        """
        self.next_id += 1

        if hasattr(node, "type"):
            tag = node.type
            assert index >= 0
            name = "%s-%s-%s" % (tag, self.next_id, self.uuid)
            self.emitrow(trapwriter, tag, name, index, parent, node.value, file, node.line, node.column)
            return

        tag = node.data
        if IGNORE.match(tag):
            # Ignored rules still consume an id, but emit nothing.
            return
        name = "%s-%s-%s" % (tag, self.next_id, self.uuid)
        for position, child in enumerate(node.children):
            self.emit_recursive(trapwriter, file, child, position, name)
        row = (name,) if index is None else (name, index, parent)
        self.emitrow(trapwriter, tag, *row)

View File

@@ -0,0 +1,83 @@
# This grammar is based on https://github.com/apache/thrift/blob/master/doc/specs/idl.md
#
# Lark grammar for Thrift IDL files; it is the text handed to ``Lark`` by the
# ``Parser`` class in this module.
#
# Reading notes (rule-prefix meanings follow Lark's grammar syntax):
#   * ``start`` is the entry rule and wraps a single ``document``.
#   * Rules prefixed with ``!`` keep their literal tokens in the tree, and the
#     ``?`` prefix inlines a rule that has a single child.
#   * ``_listseparator`` (an optional "," or ";") starts with an underscore,
#     so it is filtered out of the resulting tree.
#   * WHITESPACE and COMMENT are dropped via the ``%ignore`` directives.
#   * NOTE(review): both line-comment forms ("//" and "#") require a trailing
#     NEWLINE, so a line comment on the last line of a file that lacks a final
#     newline presumably does not lex as COMMENT -- confirm.
grammar = r"""
start : document
document : header* definition*
header : include | cppinclude | namespace
include : "include" LITERAL
cppinclude : "cpp_include" LITERAL
namespace : ( "namespace" ( namespacescope name )
| ( "smalltalk.prefix" name ) )
| ( "php_namespace" LITERAL )
| ( "xsd_namespace" LITERAL )
!namespacescope : "*" | IDENTIFIER
definition : const | typedef | enum | senum | struct | union | exception | service
const : "const" fieldtype name "=" constvalue _listseparator?
typedef : "typedef" definitiontype type_annotations name type_annotations _listseparator?
enum : "enum" name "{" enumfield* "}" type_annotations
enumfield : name enumvalue type_annotations _listseparator?
enumvalue : ("=" INTCONSTANT)?
senum : "senum" name "{" senumfield* "}"
senumfield : LITERAL _listseparator?
struct : "struct" name "xsd_all"? "{" field* "}" type_annotations
name : IDENTIFIER
union : "union" name "xsd_all"? "{" field* "}"
exception : "exception" name "{" field* "}"
service : "service" name extends? "{" function* "}" type_annotations
extends : "extends" IDENTIFIER
field : fieldid fieldreq fieldtype type_annotations IDENTIFIER fieldvalue xsdfieldoptions type_annotations _listseparator?
fieldvalue : ("=" constvalue)?
fieldid : (INTCONSTANT ":")?
fieldreq : ("required" | "optional")?
?xsdfieldoptions: "xsd_optional"? "xsd_nillable"? xsdattrs?
xsdattrs : "xsd_attrs" "{" field* "}"
function : oneway functiontype name "(" field* ")" throws type_annotations _listseparator?
oneway : ("oneway")?
!functiontype : fieldtype | "void"
throws : ( "throws" "(" field* ")" )?
fieldtype : IDENTIFIER | basetype | containertype
definitiontype : IDENTIFIER | basetype | containertype
!basetype : "bool" | "byte" | "i8" | "i16" | "i32" | "i64" | "double" | "string" | "binary" | "slist"
containertype : maptype | settype | listtype
maptype : "map" cpptype? "<" fieldtype "," fieldtype ">"
settype : "set" cpptype? "<" fieldtype ">"
listtype : "list" "<" fieldtype ">" cpptype?
cpptype : "cpp_type" LITERAL
!constvalue : INTCONSTANT | DOUBLECONSTANT | LITERAL | IDENTIFIER | constlist | constmap
INTCONSTANT : ("+" | "-")? DIGIT+
DOUBLECONSTANT : ("+" | "-")? DIGIT* "." DIGIT+ ( ("E" | "e") INTCONSTANT )?
constlist : "[" constlistelt* "]"
constlistelt : constvalue _listseparator?
constmap : "{" constmapelt* "}"
constmapelt : constvalue ":" constvalue _listseparator?
type_annotations : ( "(" type_annotation* ")" )?
type_annotation : name "=" constvalue _listseparator?
LITERAL : ("\"" /[^"]/* "\"") | ("'" /[^']/* "'")
IDENTIFIER : ( LETTER | "_" ) ( LETTER | DIGIT | "." | "_" )*
_listseparator : "," | ";"
LETTER : "A".."Z" | "a".."z"
DIGIT : "0".."9"
WHITESPACE : (" " | "\t" | "\r" | "\n")+
%import common.NEWLINE
COMMENT : "/*" /(.|\n|\r)*?/ "*/"
| "//" /(.)*/ NEWLINE
| "#" /(.)*/ NEWLINE
%ignore WHITESPACE
%ignore COMMENT
"""
from lark import Lark
class Parser(Lark):
    """``Lark`` parser preconfigured with the Thrift ``grammar`` above."""

    def __init__(self):
        """Build the parser using the Earley algorithm and the standard lexer."""
        super(Parser, self).__init__(grammar, parser="earley", lexer="standard")
# Shared parser used by ``parse``.  It is created on first use because
# constructing a ``Parser`` compiles the whole grammar.
parser = None


def parse(src):
    """Parse Thrift IDL text and return the resulting parse tree.

    The original implementation read a module-level ``parser`` that was
    never defined, so every call raised ``NameError``.  A single ``Parser``
    is now created lazily under that same global name and reused.

    Args:
        src: The Thrift source text to parse.

    Returns:
        Whatever ``Parser.parse`` returns for *src*.
    """
    global parser
    if parser is None:
        parser = Parser()
    return parser.parse(src)