mirror of
https://github.com/github/codeql.git
synced 2026-04-30 03:05:15 +02:00
Python: Copy Python extractor to codeql repo
This commit is contained in:
34
python/extractor/semmle/thrift/__init__.py
Normal file
34
python/extractor/semmle/thrift/__init__.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import os.path
|
||||
|
||||
from .parse import Parser
|
||||
from .emit import Emitter
|
||||
|
||||
class Extractor(object):
    """Parse ``.thrift`` sources and hand the resulting trees to an ``Emitter``.

    When a source archive is supplied, the raw bytes of every extracted file
    are written to it as well.
    """

    def __init__(self, trap_folder, src_archive=None):
        """Create the parser and the emitter used for every extraction.

        ``trap_folder`` is passed straight to ``Emitter``. ``src_archive`` is
        optional; when truthy it receives a copy of each extracted file.
        """
        self.parser = Parser()
        self.emitter = Emitter(trap_folder)
        self.src_archive = src_archive

    def _walk(self, path):
        """Yield the path of every ``*.thrift`` file found beneath ``path``."""
        for folder, _subdirs, names in os.walk(path):
            for entry in names:
                if not entry.endswith(".thrift"):
                    continue
                yield os.path.join(folder, entry)

    def extract_files(self, files):
        """Extract each path of the iterable ``files``, in iteration order."""
        for source_path in files:
            self.extract_file(source_path)

    def extract_folder(self, path):
        """Extract every ``.thrift`` file located under the directory ``path``."""
        for source_path in self._walk(path):
            self.extract_file(source_path)

    def extract_file(self, file):
        """Parse one file, emit its tree and optionally archive its source.

        The file is read as bytes and decoded as UTF-8 for parsing; the
        undecoded bytes are what is written to the source archive.
        """
        with open(file, "rb") as stream:
            raw = stream.read()
        text = raw.decode('utf-8')
        tree = self.parser.parse(text)
        self.emitter.emit(file, tree)
        archive = self.src_archive
        if archive:
            archive.write(file, raw)
|
||||
52
python/extractor/semmle/thrift/emit.py
Normal file
52
python/extractor/semmle/thrift/emit.py
Normal file
@@ -0,0 +1,52 @@
|
||||
|
||||
import os.path
|
||||
import csv
|
||||
import re
|
||||
import semmle.util
|
||||
|
||||
# Tree node tags for which ``IGNORE.match`` succeeds (i.e. tags beginning with
# "namespace" or "fieldreq") are skipped by ``Emitter.emit_recursive``, which
# returns before visiting any of their children.
IGNORE = re.compile("namespace|fieldreq")
|
||||
|
||||
class Emitter(object):
    """Serialise a parsed Thrift tree into TRAP ``externalData`` tuples."""

    def __init__(self, trap_folder):
        """Remember the output folder and initialise the bookkeeping state."""
        self.trap_folder = trap_folder
        # Column count first seen for each row kind; enforced by ``emitrow``.
        self.lengths = {}
        # Incremented once per visited node and embedded in the node's name.
        self.next_id = 0
        self.uuid = semmle.util.uuid('thrift')

    def emit(self, file, tree):
        """Write the TRAP for ``tree``, which was parsed from ``file``.

        Rows are collected in a fresh ``TrapWriter`` and the compressed result
        is handed to the trap folder under the "thrift" namespace.
        """
        trapwriter = semmle.util.TrapWriter()
        vpath = self.trap_folder.get_virtual_path(file)
        self.emit_recursive(trapwriter, vpath, tree, None, None)
        trapwriter.write_file(vpath)
        self.trap_folder.write_trap("thrift", file, trapwriter.get_compressed())

    def emitrow(self, trapwriter, kind, *args):
        """Emit one logical row as a series of ``externalData`` tuples.

        Each value in ``args`` becomes its own tuple carrying the shared row
        id, the path ``"thrift-" + kind`` and the value's column index.

        Raises ``Exception`` when ``kind`` was previously emitted with a
        different number of values.
        """
        arity = len(args)
        # The first row of a kind fixes the arity all later rows must match.
        expected = self.lengths.setdefault(kind, arity)
        if arity != expected:
            raise Exception("Inconsistent row for '%s': %s, expecting %d" % (kind, args, expected))
        qpath = "thrift-"+kind
        row_id = trapwriter.get_unique_id()
        for column, cell in enumerate(args):
            trapwriter.write_tuple("externalData", "rsds", row_id, qpath, column, cell)

    def emit_recursive(self, trapwriter, file, node, index, parent):
        """Emit ``node`` and, for tree nodes, all of its descendants.

        ``index`` is the node's position among its siblings and ``parent`` the
        generated name of its parent; both are ``None`` for the root.
        """
        self.next_id += 1
        if hasattr(node, "type"):
            # Leaf carrying type/value/line/column (presumably a Lark token).
            tag = node.type
            assert index >= 0
            name = "%s-%s-%s" % (tag, self.next_id, self.uuid)
            self.emitrow(trapwriter, tag, name, index, parent, node.value, file, node.line, node.column)
            return
        tag = node.data
        if IGNORE.match(tag):
            return
        # The name must be fixed before recursing: children bump ``next_id``.
        name = "%s-%s-%s" % (tag, self.next_id, self.uuid)
        for position, child in enumerate(node.children):
            self.emit_recursive(trapwriter, file, child, position, name)
        # The root row has no position/parent columns.
        location = () if index is None else (index, parent)
        self.emitrow(trapwriter, tag, name, *location)
|
||||
83
python/extractor/semmle/thrift/parse.py
Normal file
83
python/extractor/semmle/thrift/parse.py
Normal file
@@ -0,0 +1,83 @@
|
||||
|
||||
# Lark grammar for Thrift IDL files.
# This grammar is based on https://github.com/apache/thrift/blob/master/doc/specs/idl.md
#
# Lark notation used below: rules prefixed with "!" keep their keyword and
# punctuation tokens in the tree, rules prefixed with "?" are inlined when they
# have a single child, and names starting with "_" do not produce tree nodes.
grammar = r"""
start : document
document : header* definition*
header : include | cppinclude | namespace
include : "include" LITERAL
cppinclude : "cpp_include" LITERAL
namespace : ( "namespace" ( namespacescope name )
            | ( "smalltalk.prefix" name ) )
          | ( "php_namespace" LITERAL )
          | ( "xsd_namespace" LITERAL )
!namespacescope : "*" | IDENTIFIER
definition : const | typedef | enum | senum | struct | union | exception | service
const : "const" fieldtype name "=" constvalue _listseparator?
typedef : "typedef" definitiontype type_annotations name type_annotations _listseparator?
enum : "enum" name "{" enumfield* "}" type_annotations
enumfield : name enumvalue type_annotations _listseparator?
enumvalue : ("=" INTCONSTANT)?
senum : "senum" name "{" senumfield* "}"
senumfield : LITERAL _listseparator?
struct : "struct" name "xsd_all"? "{" field* "}" type_annotations
name : IDENTIFIER
union : "union" name "xsd_all"? "{" field* "}"
exception : "exception" name "{" field* "}"
service : "service" name extends? "{" function* "}" type_annotations
extends : "extends" IDENTIFIER
field : fieldid fieldreq fieldtype type_annotations IDENTIFIER fieldvalue xsdfieldoptions type_annotations _listseparator?
fieldvalue : ("=" constvalue)?
fieldid : (INTCONSTANT ":")?
fieldreq : ("required" | "optional")?
?xsdfieldoptions: "xsd_optional"? "xsd_nillable"? xsdattrs?
xsdattrs : "xsd_attrs" "{" field* "}"
function : oneway functiontype name "(" field* ")" throws type_annotations _listseparator?
oneway : ("oneway")?
!functiontype : fieldtype | "void"
throws : ( "throws" "(" field* ")" )?
fieldtype : IDENTIFIER | basetype | containertype
definitiontype : IDENTIFIER | basetype | containertype
!basetype : "bool" | "byte" | "i8" | "i16" | "i32" | "i64" | "double" | "string" | "binary" | "slist"
containertype : maptype | settype | listtype
maptype : "map" cpptype? "<" fieldtype "," fieldtype ">"
settype : "set" cpptype? "<" fieldtype ">"
listtype : "list" "<" fieldtype ">" cpptype?
cpptype : "cpp_type" LITERAL
!constvalue : INTCONSTANT | DOUBLECONSTANT | LITERAL | IDENTIFIER | constlist | constmap
INTCONSTANT : ("+" | "-")? DIGIT+
DOUBLECONSTANT : ("+" | "-")? DIGIT* "." DIGIT+ ( ("E" | "e") INTCONSTANT )?
constlist : "[" constlistelt* "]"
constlistelt : constvalue _listseparator?
constmap : "{" constmapelt* "}"
constmapelt : constvalue ":" constvalue _listseparator?

type_annotations : ( "(" type_annotation* ")" )?
type_annotation : name "=" constvalue _listseparator?

LITERAL : ("\"" /[^"]/* "\"") | ("'" /[^']/* "'")
IDENTIFIER : ( LETTER | "_" ) ( LETTER | DIGIT | "." | "_" )*
_listseparator : "," | ";"
LETTER : "A".."Z" | "a".."z"
DIGIT : "0".."9"
WHITESPACE : (" " | "\t" | "\r" | "\n")+

%import common.NEWLINE
COMMENT : "/*" /(.|\n|\r)*?/ "*/"
        | "//" /(.)*/ NEWLINE
        | "#" /(.)*/ NEWLINE
%ignore WHITESPACE
%ignore COMMENT
"""
|
||||
|
||||
|
||||
|
||||
|
||||
from lark import Lark
|
||||
|
||||
class Parser(Lark):
    """Lark parser preconfigured with this module's Thrift ``grammar``."""

    def __init__(self):
        """Initialise Lark with the Earley parser and the "standard" lexer."""
        options = {"parser": "earley", "lexer": "standard"}
        Lark.__init__(self, grammar, **options)
|
||||
|
||||
def parse(src):
    """Parse Thrift IDL text ``src`` and return the resulting parse tree.

    A single ``Parser`` is created on first use and stored in the module
    global ``parser`` for reuse by later calls. A ``parser`` object that the
    caller has already placed in the module namespace is used as-is.
    """
    module_scope = globals()
    shared = module_scope.get("parser")
    if shared is None:
        # The global was never defined at import time, so build it lazily
        # instead of failing with NameError on the first call.
        shared = Parser()
        module_scope["parser"] = shared
    return shared.parse(src)
|
||||
Reference in New Issue
Block a user