mirror of
https://github.com/github/codeql.git
synced 2026-01-16 07:54:52 +01:00
381 lines
14 KiB
Python
381 lines
14 KiB
Python
|
|
import ast
|
|
import sys
|
|
from types import ModuleType, GetSetDescriptorType
|
|
import hashlib
|
|
import os
|
|
|
|
from semmle.python import ast
|
|
from semmle.python.passes._pass import Pass
|
|
from semmle.util import get_analysis_major_version
|
|
from semmle.python.passes.ast_pass import iter_fields
|
|
from semmle.cmdline import is_legal_module_name
|
|
|
|
|
|
'''
|
|
The QL library depends on a reasonable one-to-one correspondence
|
|
between DB entities and Python objects. However, since QL has only
|
|
one notion of equality, but Python has two (`__eq__` and `is`) we need to be careful.
|
|
What we want to do is to treat objects like builtin functions and classes as using
|
|
reference equality and numbers and strings as using value equality.
|
|
|
|
In practice this is impossible as we want to distinguish `True` from `1` from `1.0`
|
|
even though all these values are equal. However, we want to get as close as possible.
|
|
|
|
'''
|
|
|
|
|
|
__all__ = ['ObjectPass']

# AST node classes gathered in one set. DictComp, SetComp and Set are the
# types from Python 2.7 onwards; they are listed inline here.
OBJECT_TYPES = {
    ast.ClassExpr, ast.Call,
    ast.FunctionExpr, ast.Tuple,
    ast.Str, ast.Num, ast.List, ast.ListComp, ast.Module,
    ast.Dict, ast.Ellipsis, ast.Lambda,
    ast.DictComp,
    ast.SetComp,
    ast.Set,
}

# Exact Python types whose values are written out as numeric values.
NUMERIC_TYPES = {int, float, bool}

# Name of the module holding the builtin objects.
BUILTINS_NAME = 'builtins'

# AST node classes treated as literals when walking Python source.
LITERALS = (ast.Num, ast.Str)
|
|
|
|
class _CObject(object):
|
|
'''Utility class to wrap arbitrary C objects.
|
|
Treat all objects as unique. Rely on naming in the
|
|
trap files to merge the objects that we want merged.
|
|
'''
|
|
__slots__ = ['obj']
|
|
|
|
def __init__(self, obj):
|
|
self.obj = obj
|
|
|
|
def __eq__(self, other):
|
|
if isinstance(other, _CObject):
|
|
return self.obj is other.obj
|
|
else:
|
|
return False
|
|
|
|
def __ne__(self, other):
|
|
return not self.__eq__(other)
|
|
|
|
def __hash__(self):
|
|
return id(self.obj)
|
|
|
|
class ObjectPass(Pass):
|
|
'''Generates relations for objects. This includes information about
|
|
builtin objects, including their types and members.
|
|
It also generates objects for all literal values present in the Python source.'''
|
|
|
|
def extract(self, ast, path, writer):
    '''Write the objects for one Python source module to `writer`.

    Walks `ast` to emit objects for the numeric and string literals it
    contains, then emits the candidate module and package names derived
    from `path`. `self.writer` is only set for the duration of the call.
    '''
    self.writer = writer
    try:
        self._extract_py(ast)
        self._extract_possible_module_names(path)
    finally:
        # Always drop the writer reference, even when extraction fails.
        self.writer = None
|
|
|
|
def _extract_possible_module_names(self, path):
    '''Emit every module name that the file at `path` could be known by.

    The extension is dropped and path separators become dots. Only the
    last four dotted components are kept; every dotted suffix of that
    name which is a legal module name is written out together with its
    enclosing package names.
    '''
    stem = os.path.splitext(path)[0]
    # Keep at most three dots, i.e. the four trailing components.
    components = stem.replace(os.sep, ".").split(".")[-4:]
    while components:
        candidate = ".".join(components)
        if is_legal_module_name(candidate):
            self._write_module_and_package_names(candidate)
        # Drop the leading component and try the next, shorter suffix.
        components = components[1:]
|
|
|
|
def _write_module_and_package_names(self, module_name):
|
|
self._write_c_object(module_name, None, False)
|
|
while "." in module_name:
|
|
module_name, _ = module_name.rsplit(".", 1)
|
|
self._write_c_object(module_name, None, False)
|
|
|
|
def extract_builtin(self, module, writer):
    '''Write `module`, and the objects written along with it, to `writer`.

    `self.writer` is only set while the module is being written.
    '''
    self.writer = writer
    try:
        self._extract_c(module)
    finally:
        # Always drop the writer reference, even when extraction fails.
        self.writer = None
|
|
|
|
def _extract_c(self, mod):
|
|
self.next_address_label = 0
|
|
self.address_labels = {}
|
|
self._write_c_object(mod, None, False)
|
|
self.address_labels = None
|
|
|
|
def _write_str(self, s):
    '''Write the string `s` as an object; `s` must be exactly of type `str`.'''
    # Exact type check (subclasses of str are rejected); only enforced
    # when assertions are enabled.
    assert type(s) is str
    self._write_c_object(s, None, False)
|
|
|
|
def _write_c_object(self, obj, label, write_special, string_prefix=""):
    '''Write `obj` to the current writer and return its id.

    Besides the object itself this writes its type, its name and,
    recursively, the members of modules and classes, the items of lists
    and tuples, and the attributes of getset descriptors.

    obj            -- the object to write.
    label          -- default label, handed to `get_label_for_object` for
                      objects that have no label of their own.
    write_special  -- when false, objects found in SPECIAL_OBJECTS only get
                      an id; none of their tuples are written.
    string_prefix  -- prefix of the string literal `obj` came from (if any);
                      it decides whether a `str` is treated as bytes or unicode.
    '''
    ANALYSIS_MAJOR_VERSION = get_analysis_major_version()
    # If we're extracting Python 2 code using Python 3, we want to treat `str` as `bytes` for
    # the purposes of determining the type, but we still want to treat the _value_ as if it's a `str`.
    obj_type = type(obj)
    if obj_type == str and ANALYSIS_MAJOR_VERSION == 2 and 'u' not in string_prefix:
        obj_type = bytes

    # Wrap the object so that the writer keys it by identity, not by value.
    cobj = _CObject(obj)
    if self.writer.has_written(cobj):
        return self.writer.get_node_id(cobj)
    obj_label = self.get_label_for_object(obj, label, obj_type)
    obj_id = self.writer.get_labelled_id(cobj, obj_label)
    #Avoid writing out all the basic types for every C module.
    if not write_special and cobj in SPECIAL_OBJECTS:
        return obj_id
    # The type is itself written as an object (recursively).
    type_id = self._write_c_object(obj_type, None, write_special)
    self.writer.write_tuple(u'py_cobjects', 'r', obj_id)
    self.writer.write_tuple(u'py_cobjecttypes', 'rr', obj_id, type_id)
    self.writer.write_tuple(u'py_cobject_sources', 'rd', obj_id, 0)
    if isinstance(obj, ModuleType) or isinstance(obj, type):
        # Members of modules and classes, in sorted order. Each member's
        # default label is derived from the owner's label, the analysis
        # major version and the member name.
        for name, value in sorted(obj.__dict__.items()):
            if (obj, name) in SKIPLIST:
                continue
            val_id = self._write_c_object(value, obj_label + u'$%d' % ANALYSIS_MAJOR_VERSION + name, write_special)
            self.writer.write_tuple(u'py_cmembers_versioned', 'rsrs',
                                    obj_id, name, val_id, ANALYSIS_MAJOR_VERSION)
        if isinstance(obj, type) and obj is not object:
            # Record the next class in the MRO under the pseudo-member ".super.".
            super_id = self._write_c_object(obj.__mro__[1], None, write_special)
            self.writer.write_tuple(u'py_cmembers_versioned', 'rsrs',
                                    obj_id, u".super.", super_id, ANALYSIS_MAJOR_VERSION)
    if isinstance(obj, (list, tuple)):
        # Items of sequences, labelled by the owner's label and their index.
        for index, item in enumerate(obj):
            item_id = self._write_c_object(item, obj_label + u'$' + str(index), write_special)
            self.writer.write_tuple(u'py_citems', 'rdr',
                                    obj_id, index, item_id)
    if type(obj) is GetSetDescriptorType:
        # Attributes of getset descriptors, except `__name__` and any
        # attribute the descriptor does not actually expose.
        for name in type(obj).__dict__:
            if name == '__name__' or not hasattr(obj, name):
                continue
            val_id = self._write_c_object(getattr(obj, name), obj_label + u'$%d' % ANALYSIS_MAJOR_VERSION + name, write_special)
            self.writer.write_tuple(u'py_cmembers_versioned', 'rsrs',
                                    obj_id, name, val_id, ANALYSIS_MAJOR_VERSION)
    # Finally write the object's name; exactly one branch below applies.
    if hasattr(obj, '__name__'):
        #Use qualified names for classes.
        if isinstance(obj, type):
            name = qualified_type_name(obj)
        # https://bugs.python.org/issue18602
        elif isinstance(obj, ModuleType) and obj.__name__ == "io":
            name = "_io"
        elif obj is EXEC:
            name = "exec"
        else:
            name = obj.__name__
        self.writer.write_tuple(u'py_cobjectnames', 'rs',
                                obj_id, name)
    elif type(obj) in NUMERIC_TYPES:
        # Numbers are written as the value itself.
        self.writer.write_tuple(u'py_cobjectnames', 'rq',
                                obj_id, obj)
    elif type(obj) is str:
        # An explicit b/u in the literal's prefix wins; otherwise the
        # analysis major version decides (2: bytes, else: unicode).
        if 'b' in string_prefix:
            prefix = u"b"
        elif 'u' in string_prefix:
            prefix = u"u"
        else:
            if ANALYSIS_MAJOR_VERSION == 2:
                prefix = u"b"
            else:
                prefix = u"u"
        self.writer.write_tuple(u'py_cobjectnames', 'rs',
                                obj_id, prefix + u"'" + obj + u"'")
    elif type(obj) is bytes:
        #Convert bytes to a unicode characters one-to-one.
        obj_string = u"b'" + obj.decode("latin-1") + u"'"
        self.writer.write_tuple(u'py_cobjectnames', 'rs',
                                obj_id, obj_string)
    elif type(obj) is type(None):
        self.writer.write_tuple(u'py_cobjectnames', 'rs',
                                obj_id, u'None')
    else:
        # Anything else gets the generic name 'object'.
        self.writer.write_tuple(u'py_cobjectnames', 'rs',
                                obj_id, u'object')
    return obj_id
|
|
|
|
def write_special_objects(self, writer):
    '''Write important builtin objects to the trap file.

    Every entry of SPECIAL_OBJECTS is written in full (with
    `write_special` set) and tagged with its name in `py_special_objects`,
    followed by two artificial objects used by the type-inferencer.

    `self.writer` and `self.address_labels` are reset when the method
    finishes, whether it succeeds or raises, matching the try/finally
    handling in `extract` and `extract_builtin`.
    '''
    self.writer = writer
    self.next_address_label = 0
    self.address_labels = {}

    def write(obj, name, label=None):
        # Write the object in full, then record it as special under `name`.
        obj_id = self._write_c_object(obj, label, True)
        self.writer.write_tuple(u'py_special_objects', 'rs', obj_id, name)

    try:
        # Keys of SPECIAL_OBJECTS are _CObject wrappers; unwrap them.
        for obj, name in SPECIAL_OBJECTS.items():
            write(obj.obj, name)

        ###Artificial objects for use by the type-inferencer - Make sure that they are unique.
        write(object(), u"_semmle_unknown_type", u"$_semmle_unknown_type")
        write(object(), u"_semmle_undefined_value", u"$_semmle_undefined_value")
    finally:
        # Never leave a stale writer or label table behind on failure.
        self.writer = None
        self.address_labels = None
|
|
|
|
def get_label_for_object(self, obj, default_label, obj_type):
    """Gets a label for an object. Attempt to make this as universal as possible.

    The object graph in the database should reflect the real object graph,
    only rarely diverging. This should hold even in highly parallel
    environments, including cases where trap files may be overwritten.
    Proviso: distinct immutable primitive objects may be merged (which
    should be benign).

    `obj_type` selects the unicode or bytes label prefix for string and
    bytes values. For objects without an unambiguous global name,
    `default_label` is returned.
    """
    # This code must be robust against (possibly intentionally) incorrect
    # implementations of the object model.
    if obj is None:
        return u"C_None"
    kind = type(obj)
    kind_name = kind.__name__
    if kind is tuple and len(obj) == 0:
        return u"C_EmptyTuple"

    if kind is str or kind is bytes:
        # Strings and bytes are labelled by the SHA-1 of their bytes; text
        # is encoded as UTF-8 first, replacing anything unencodable.
        text_prefix = u"C_unicode$" if obj_type is str else u"C_bytes$"
        payload = obj.encode("utf8", errors='replace') if kind is str else obj
        return text_prefix + hashlib.sha1(payload).hexdigest()
    if kind in NUMERIC_TYPES:
        return u"C_" + kind_name + u"$" + repr(obj)
    try:
        if isinstance(obj, type):
            return u"C_" + kind_name + u"$" + qualified_type_name(obj)
    except Exception:
        #Misbehaved object.
        return default_label
    if kind is ModuleType:
        return u"C_" + kind_name + u"$" + obj.__name__
    if kind is not type(len):
        return default_label
    # Builtin functions are labelled by their module and name.
    module_name = obj.__module__
    if not isinstance(module_name, str):
        return default_label
    if module_name == BUILTINS_NAME:
        module_name = "builtins"
    return u"C_" + kind_name + u"$" + module_name + "." + obj.__name__
|
|
|
|
# Python files -- Extract objects for all numeric and string values.
|
|
|
|
def _extract_py(self, ast):
    '''Walk the tree `ast`, writing an object for each literal node found.'''
    self._walk_py(ast)
|
|
|
|
def _write_literal(self, node):
    '''Write the object(s) that the literal `node` may evaluate to.

    A number is written as its value. A string is written as a bytes
    object and/or a unicode object, depending on its prefix: strings
    without a prefix can be either, so both are written when possible.
    '''
    if isinstance(node, ast.Num):
        self._write_c_object(node.n, None, False)
        return
    literal_prefix = getattr(node, "prefix", "")
    may_be_bytes = 'u' not in literal_prefix
    may_be_unicode = 'b' not in literal_prefix
    if may_be_bytes:
        try:
            self._write_c_object(node.s.encode("latin-1"), None, False, string_prefix=literal_prefix)
        except UnicodeEncodeError:
            # Not encodable as latin-1, so it cannot be a bytes object.
            pass
    if may_be_unicode:
        self._write_c_object(node.s, None, False, string_prefix=literal_prefix)
|
|
|
|
def _walk_py(self, node):
    '''Recursively visit `node`, writing every literal node encountered.

    AST nodes are searched through their fields and lists element by
    element; any other value is ignored. Literal nodes are written and
    not descended into.
    '''
    if not isinstance(node, ast.AstBase):
        if isinstance(node, list):
            for element in node:
                self._walk_py(element)
        return
    if isinstance(node, LITERALS):
        self._write_literal(node)
        return
    for _, _, child in iter_fields(node):
        self._walk_py(child)
|
|
|
|
def a_function():
    '''Do nothing; exists so that `type(a_function)` yields the function type.'''
    pass
|
|
|
|
def a_generator_function():
    '''Yield a single None; exists so that calling it yields a generator whose type can be taken.'''
    yield None
|
|
|
|
class C(object):
    '''Minimal class; exists so that `type(C().meth)` yields the bound-method type.'''

    def meth(self):
        '''Do nothing.'''
        pass
|
|
|
|
# Create an object to stand in for 'exec', as the parser no longer treats
# 'exec' as a statement.
# `[].append` is used because it has the same type as `exec`.
EXEC = [].append
|
|
|
|
# Important builtin objects and the names they are recorded under.
# The mapping is first built keyed by the objects themselves, then re-keyed
# by _CObject wrappers below.
SPECIAL_OBJECTS = {
    type(a_function): u"FunctionType",
    type(len): u"BuiltinFunctionType",
    classmethod: u"ClassMethod",
    staticmethod: u"StaticMethod",
    type(sys): u"ModuleType",
    type(a_generator_function()): u"generator",
    None: u"None",
    type(None): u"NoneType",
    True: u"True",
    False: u"False",
    bool: u"bool",
    sys: u"sys",
    Exception: u"Exception",
    BaseException: u"BaseException",
    TypeError: u"TypeError",
    AttributeError: u"AttributeError",
    KeyError: u"KeyError",
    int: u"int",
    float: u"float",
    object: u"object",
    type: u"type",
    tuple: u"tuple",
    dict: u"dict",
    list: u"list",
    set: u"set",
    locals: u"locals",
    globals: u"globals",
    property: u"property",
    type(list.append): u"MethodDescriptorType",
    super: u"super",
    type(C().meth): u"MethodType",
    #For future enhancements
    object(): u"_1",
    object(): u"_2",
    #Make sure we have all version numbers as single character strings.
    b'2': u'b2',
    b'3': u'b3',
    u'2': u'u2',
    u'3': u'u3',
}

SPECIAL_OBJECTS[__import__(BUILTINS_NAME)] = u"builtin_module"
SPECIAL_OBJECTS[str] = u"unicode"
SPECIAL_OBJECTS[bytes] = u"bytes"

# Store wrapped versions of special objects, so that they compare correctly
# (by identity) against the _CObject wrappers used when writing objects.
# A comprehension keeps the entry order and, unlike a module-level loop,
# leaves no loop variables or temporary dict behind in the module namespace.
SPECIAL_OBJECTS = {
    _CObject(special): special_name
    for special, special_name in SPECIAL_OBJECTS.items()
}
|
|
|
|
# (owner, attribute name) pairs for VM implementation details that are
# skipped when writing out the members of an object.
SKIPLIST = {
    (sys, "exc_value"),
    (sys, "exc_type"),
    (sys, "exc_traceback"),
    (__import__(BUILTINS_NAME), "_"),
}
|
|
|
|
def qualified_type_name(cls):
    '''Return the name under which the class `cls` is recorded.

    `bytes` and `str` are special-cased as "bytes" and "unicode" so that
    they share names across versions. Classes from the builtins module
    (or from "exceptions") use their bare name; every other class is
    qualified with its module name.
    '''
    if cls is bytes:
        return u"bytes"
    if cls is str:
        return u"unicode"
    module_name = cls.__module__
    if module_name == BUILTINS_NAME or module_name == "exceptions":
        return cls.__name__
    return module_name + "." + cls.__name__
|