import ast
import sys
from types import ModuleType, GetSetDescriptorType
import hashlib
import os
# NOTE: this import rebinds `ast`; every `ast.X` below refers to semmle.python.ast,
# not to the standard-library module imported above.
from semmle.python import ast
from semmle.python.passes._pass import Pass
from semmle.util import get_analysis_major_version
from semmle.python.passes.ast_pass import iter_fields
from semmle.cmdline import is_legal_module_name

'''
The QL library depends on a reasonable one-to-one correspondence
between DB entities and Python objects. However, since QL has only
one notion of equality, but Python has two (`__eq__` and `is`) we need to be careful.
What we want to do is to treat objects like builtin functions and classes as using
reference equality and numbers and strings as using value equality.

In practice this is impossible as we want to distinguish `True` from `1` from `1.0`
even though all these values are equal. However, we want to get as close as possible.
'''

__all__ = [ 'ObjectPass' ]

OBJECT_TYPES = set([ ast.ClassExpr, ast.Call, ast.FunctionExpr, ast.Tuple,
                     ast.Str, ast.Num, ast.List, ast.ListComp, ast.Module,
                     ast.Dict, ast.Ellipsis, ast.Lambda])

# Types from Python 2.7 onwards
OBJECT_TYPES.add(ast.DictComp)
OBJECT_TYPES.add(ast.SetComp)
OBJECT_TYPES.add(ast.Set)

NUMERIC_TYPES = set([int, float, bool])

BUILTINS_NAME = 'builtins'

# AST node classes whose values are written out as objects by `ObjectPass._walk_py`.
LITERALS = (ast.Num, ast.Str)


class _CObject(object):
    '''Utility class to wrap arbitrary C objects.
    Treat all objects as unique. Rely on naming in the trap
    files to merge the objects that we want merged.
    '''
    __slots__ = ['obj']

    def __init__(self, obj):
        self.obj = obj

    def __eq__(self, other):
        # Two wrappers are equal only when they wrap the very same object.
        if isinstance(other, _CObject):
            return self.obj is other.obj
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Identity-based hash, consistent with the identity-based __eq__.
        return id(self.obj)


class ObjectPass(Pass):
    '''Generates relations for objects. This includes information about
    builtin objects, including their types and members.
    It also generates objects for all literal values present in the Python source.'''

    def extract(self, ast, path, writer):
        '''Write objects for the literals found in `ast` and for the module
        names that `path` could correspond to.

        `self.writer` is only set for the duration of the call; it is reset
        to None afterwards, even if extraction raises.
        '''
        self.writer = writer
        try:
            self._extract_py(ast)
            self._extract_possible_module_names(path)
        finally:
            self.writer = None

    def _extract_possible_module_names(self, path):
        '''Write string objects for every dotted name that `path` might denote.

        The extension is dropped and path separators become dots. Leading
        components are then discarded until at most three dots remain, and each
        remaining suffix that `is_legal_module_name` accepts is written
        together with its enclosing package names.
        '''
        maybe_name, _ = os.path.splitext(path)
        maybe_name = maybe_name.replace(os.sep, ".")
        while maybe_name.count(".") > 3:
            _, maybe_name = maybe_name.split(".", 1)
        while True:
            if is_legal_module_name(maybe_name):
                self._write_module_and_package_names(maybe_name)
            if "." not in maybe_name:
                return
            # Drop the leading component and try the shorter suffix.
            _, maybe_name = maybe_name.split(".", 1)

    def _write_module_and_package_names(self, module_name):
        '''Write `module_name` and each of its dotted prefixes as string objects.'''
        self._write_c_object(module_name, None, False)
        while "." in module_name:
            module_name, _ = module_name.rsplit(".", 1)
            self._write_c_object(module_name, None, False)

    def extract_builtin(self, module, writer):
        '''Write the object graph reachable from the builtin (C) `module`.

        `self.writer` is reset to None afterwards, even if extraction raises.
        '''
        self.writer = writer
        try:
            self._extract_c(module)
        finally:
            self.writer = None

    def _extract_c(self, mod):
        '''Write `mod` and everything `_write_c_object` reaches from it.'''
        self.next_address_label = 0
        self.address_labels = {}
        try:
            self._write_c_object(mod, None, False)
        finally:
            # Reset even on failure, mirroring how `extract_builtin` resets the writer.
            self.address_labels = None

    def _write_str(self, s):
        '''Write the string `s` as an object. `s` must be exactly a `str`.'''
        assert type(s) is str
        self._write_c_object(s, None, False)

    def _write_c_object(self, obj, label, write_special, string_prefix=""):
        '''Write `obj` (and, recursively, its type, members and items) and return its ID.

        obj: the object to write.
        label: fallback label, used when `get_label_for_object` cannot derive
            a global one for `obj`.
        write_special: when false, objects listed in SPECIAL_OBJECTS only get an
            ID; none of their tuples are written.
        string_prefix: prefix of the source string literal (if any); it selects
            between the bytes and unicode interpretation of a `str` value.

        If the writer reports the object as already written, its existing node
        ID is returned and nothing else is emitted.
        '''
        ANALYSIS_MAJOR_VERSION = get_analysis_major_version()
        # If we're extracting Python 2 code using Python 3, we want to treat `str` as `bytes` for
        # the purposes of determining the type, but we still want to treat the _value_ as if it's a `str`.
        obj_type = type(obj)
        if obj_type == str and ANALYSIS_MAJOR_VERSION == 2 and 'u' not in string_prefix:
            obj_type = bytes

        cobj = _CObject(obj)
        if self.writer.has_written(cobj):
            return self.writer.get_node_id(cobj)
        obj_label = self.get_label_for_object(obj, label, obj_type)
        obj_id = self.writer.get_labelled_id(cobj, obj_label)
        #Avoid writing out all the basic types for every C module.
        if not write_special and cobj in SPECIAL_OBJECTS:
            return obj_id

        type_id = self._write_c_object(obj_type, None, write_special)
        self.writer.write_tuple(u'py_cobjects', 'r', obj_id)
        self.writer.write_tuple(u'py_cobjecttypes', 'rr', obj_id, type_id)
        self.writer.write_tuple(u'py_cobject_sources', 'rd', obj_id, 0)

        # Members of modules and classes, visited in sorted name order.
        if isinstance(obj, ModuleType) or isinstance(obj, type):
            for name, value in sorted(obj.__dict__.items()):
                if (obj, name) in SKIPLIST:
                    continue
                val_id = self._write_c_object(value, obj_label + u'$%d' % ANALYSIS_MAJOR_VERSION + name, write_special)
                self.writer.write_tuple(u'py_cmembers_versioned', 'rsrs', obj_id, name, val_id, ANALYSIS_MAJOR_VERSION)
            if isinstance(obj, type) and obj is not object:
                # Record the next class in the MRO under the pseudo-member ".super.".
                super_id = self._write_c_object(obj.__mro__[1], None, write_special)
                self.writer.write_tuple(u'py_cmembers_versioned', 'rsrs', obj_id, u".super.", super_id, ANALYSIS_MAJOR_VERSION)

        # Items of lists and tuples, by index.
        if isinstance(obj, (list, tuple)):
            for index, item in enumerate(obj):
                item_id = self._write_c_object(item, obj_label + u'$' + str(index), write_special)
                self.writer.write_tuple(u'py_citems', 'rdr', obj_id, index, item_id)

        # Attributes of getset descriptors, other than `__name__`.
        if type(obj) is GetSetDescriptorType:
            for name in type(obj).__dict__:
                if name == '__name__' or not hasattr(obj, name):
                    continue
                val_id = self._write_c_object(getattr(obj, name), obj_label + u'$%d' % ANALYSIS_MAJOR_VERSION + name, write_special)
                self.writer.write_tuple(u'py_cmembers_versioned', 'rsrs', obj_id, name, val_id, ANALYSIS_MAJOR_VERSION)

        # Finally the object's name (or a printable stand-in for it).
        if hasattr(obj, '__name__'):
            #Use qualified names for classes.
            if isinstance(obj, type):
                name = qualified_type_name(obj)
            # https://bugs.python.org/issue18602
            elif isinstance(obj, ModuleType) and obj.__name__ == "io":
                name = "_io"
            elif obj is EXEC:
                name = "exec"
            else:
                name = obj.__name__
            self.writer.write_tuple(u'py_cobjectnames', 'rs', obj_id, name)
        elif type(obj) in NUMERIC_TYPES:
            self.writer.write_tuple(u'py_cobjectnames', 'rq', obj_id, obj)
        elif type(obj) is str:
            if 'b' in string_prefix:
                prefix = u"b"
            elif 'u' in string_prefix:
                prefix = u"u"
            else:
                # Unprefixed literal: its kind depends on the version being analysed.
                if ANALYSIS_MAJOR_VERSION == 2:
                    prefix = u"b"
                else:
                    prefix = u"u"
            self.writer.write_tuple(u'py_cobjectnames', 'rs', obj_id, prefix + u"'" + obj + u"'")
        elif type(obj) is bytes:
            #Convert bytes to a unicode characters one-to-one.
            obj_string = u"b'" + obj.decode("latin-1") + u"'"
            self.writer.write_tuple(u'py_cobjectnames', 'rs', obj_id, obj_string)
        elif type(obj) is type(None):
            self.writer.write_tuple(u'py_cobjectnames', 'rs', obj_id, u'None')
        else:
            self.writer.write_tuple(u'py_cobjectnames', 'rs', obj_id, u'object')
        return obj_id

    def write_special_objects(self, writer):
        '''Write important builtin objects to the trap file'''
        self.writer = writer
        self.next_address_label = 0
        self.address_labels = {}

        def write(obj, name, label=None):
            obj_id = self._write_c_object(obj, label, True)
            self.writer.write_tuple(u'py_special_objects', 'rs', obj_id, name)

        try:
            for obj, name in SPECIAL_OBJECTS.items():
                # Keys of SPECIAL_OBJECTS are _CObject wrappers; unwrap before writing.
                write(obj.obj, name)

            ###Artificial objects for use by the type-inferencer - Make sure that they are unique.
            write(object(), u"_semmle_unknown_type", u"$_semmle_unknown_type")
            write(object(), u"_semmle_undefined_value", u"$_semmle_undefined_value")
        finally:
            # Always drop the per-call state, as `extract` and `extract_builtin` do,
            # so a failure cannot leave a stale writer attached to the pass.
            self.writer = None
            self.address_labels = None

    def get_label_for_object(self, obj, default_label, obj_type):
        """Gets a label for an object. Attempt to make this as universal as possible.
        The object graph in the database should reflect the real object graph,
        only rarely diverging. This should be true even in highly parallel environments
        including cases where trap files may be overwritten.
        Proviso: Distinct immutable primitive objects may be merged (which should be benign)
        For objects without a unambiguous global name, 'default_label' is used.

        `obj_type` is the type the caller wants `obj` treated as; here it only
        selects between the unicode and bytes label prefix for string values.
        """
        #This code must be robust against (possibly intentionally) incorrect implementations
        #of the object model.
        if obj is None:
            return u"C_None"
        t = type(obj)
        t_name = t.__name__
        if t is tuple and len(obj) == 0:
            return u"C_EmptyTuple"

        if obj_type is str:
            prefix = u"C_unicode$"
        else:
            prefix = u"C_bytes$"
        # Strings and bytes are labelled by a hash of their content.
        if t is str:
            obj = obj.encode("utf8", errors='replace')
            return prefix + hashlib.sha1(obj).hexdigest()
        if t is bytes:
            return prefix + hashlib.sha1(obj).hexdigest()
        if t in NUMERIC_TYPES:
            return u"C_" + t_name + u"$" + repr(obj)
        try:
            if isinstance(obj, type):
                return u"C_" + t_name + u"$" + qualified_type_name(obj)
        except Exception:
            #Misbehaved object.
            return default_label
        if t is ModuleType:
            return u"C_" + t_name + u"$" + obj.__name__
        if t is type(len):
            mod_name = obj.__module__
            if isinstance(mod_name, str):
                # No-op while BUILTINS_NAME is 'builtins'; it keeps the label
                # normalised should the constant ever differ.
                if mod_name == BUILTINS_NAME:
                    mod_name = "builtins"
                return u"C_" + t_name + u"$" + mod_name + "." + obj.__name__
        return default_label

    # Python files -- Extract objects for all numeric and string values.

    def _extract_py(self, ast):
        '''Write objects for every literal in the tree rooted at `ast`.'''
        self._walk_py(ast)

    def _write_literal(self, node):
        '''Write the object(s) for a numeric or string literal node.'''
        if isinstance(node, ast.Num):
            self._write_c_object(node.n, None, False)
        else:
            prefix = getattr(node, "prefix", "")
            # Output both byte and unicode objects if the relevant objects could exist
            # Non-prefixed strings can be either bytes or unicode.
            if 'u' not in prefix:
                try:
                    self._write_c_object(node.s.encode("latin-1"), None, False, string_prefix=prefix)
                except UnicodeEncodeError:
                    #If not encodeable as latin-1 then it cannot be bytes
                    pass
            if 'b' not in prefix:
                self._write_c_object(node.s, None, False, string_prefix=prefix)

    def _walk_py(self, node):
        '''Recursively visit `node`, writing every literal that is found.

        Literal nodes are written and not descended into; other AST nodes are
        traversed through `iter_fields`, and lists element by element.
        Anything else is ignored.
        '''
        if isinstance(node, ast.AstBase):
            if isinstance(node, LITERALS):
                self._write_literal(node)
            else:
                for _, _, child_node in iter_fields(node):
                    self._walk_py(child_node)
        elif isinstance(node, list):
            for n in node:
                self._walk_py(n)


# Throwaway definitions; they exist only so that their types can be looked up below.
def a_function():
    pass

def a_generator_function():
    yield None

class C(object):
    def meth(self):
        pass

#Create an object for 'exec', as parser no longer treats it as statement.
# Use `[].append` as it has the same type as `exec`.
EXEC = [].append

SPECIAL_OBJECTS = {
    type(a_function): u"FunctionType",
    type(len): u"BuiltinFunctionType",
    classmethod: u"ClassMethod",
    staticmethod: u"StaticMethod",
    type(sys): u"ModuleType",
    type(a_generator_function()): u"generator",
    None: u"None",
    type(None): u"NoneType",
    True: u"True",
    False: u"False",
    bool: u"bool",
    sys: u"sys",
    Exception: u"Exception",
    BaseException: u"BaseException",
    TypeError: u"TypeError",
    AttributeError: u"AttributeError",
    KeyError: u"KeyError",
    int: u"int",
    float: u"float",
    object: u"object",
    type: u"type",
    tuple: u"tuple",
    dict: u"dict",
    list: u"list",
    set: u"set",
    locals: u"locals",
    globals: u"globals",
    property: u"property",
    type(list.append): u"MethodDescriptorType",
    super: u"super",
    type(C().meth): u"MethodType",
    #For future enhancements
    object(): u"_1",
    object(): u"_2",
    #Make sure we have all version numbers as single character strings.
    b'2': u'b2',
    b'3': u'b3',
    u'2': u'u2',
    u'3': u'u3',
}

SPECIAL_OBJECTS[__import__(BUILTINS_NAME)] = u"builtin_module"
SPECIAL_OBJECTS[str] = u"unicode"
SPECIAL_OBJECTS[bytes] = u"bytes"

#Store wrapped versions of special objects, so that they compare correctly.
# _CObject compares and hashes by identity, so re-keying keeps equal-but-distinct
# objects apart when SPECIAL_OBJECTS is probed with a wrapped object.
_wrapped_specials = {}
for obj, name in SPECIAL_OBJECTS.items():
    _wrapped_specials[_CObject(obj)] = name
SPECIAL_OBJECTS = _wrapped_specials
del _wrapped_specials

#List of various attributes VM implementation details we want to skip.
SKIPLIST = {
    (sys, "exc_value"),
    (sys, "exc_type"),
    (sys, "exc_traceback"),
    (__import__(BUILTINS_NAME), "_"),
}


def qualified_type_name(cls):
    '''Return the name under which the class `cls` is recorded.

    `bytes` and `str` always map to "bytes" and "unicode". Classes whose
    module is the builtins module or "exceptions" use their bare name.
    Every other class is qualified as "<module>.<name>".
    '''
    #Special case bytes/str/unicode to make sure they share names across versions
    if cls is bytes:
        return u"bytes"
    if cls is str:
        return u"unicode"
    module_name = cls.__module__
    if module_name in (BUILTINS_NAME, "exceptions"):
        return cls.__name__
    return module_name + "." + cls.__name__