codeql/python/extractor/semmle/python/passes/objects.py


import ast
import sys
from types import ModuleType, GetSetDescriptorType
import hashlib
import os

from semmle.python import ast
from semmle.python.passes._pass import Pass
from semmle.util import get_analysis_major_version
from semmle.python.passes.ast_pass import iter_fields
from semmle.cmdline import is_legal_module_name


'''
The QL library depends on a reasonable one-to-one correspondence
between DB entities and Python objects. However, since QL has only
one notion of equality, but Python has two (`__eq__` and `is`) we need to be careful.
What we want to do is to treat objects like builtin functions and classes as using
reference equality and numbers and strings as using value equality.

In practice this is impossible as we want to distinguish `True` from `1` from `1.0`
even though all these values are equal. However, we want to get as close as possible.

'''


__all__ = ['ObjectPass']

# AST node classes collected as "object" types.
# NOTE(review): not referenced by the code visible in this file - presumably
# consumed elsewhere; confirm before changing.
OBJECT_TYPES = {
    ast.ClassExpr, ast.Call,
    ast.FunctionExpr, ast.Tuple,
    ast.Str, ast.Num, ast.List, ast.ListComp, ast.Module,
    ast.Dict, ast.Ellipsis, ast.Lambda,
    # Types from Python 2.7 onwards
    ast.DictComp,
    ast.SetComp,
    ast.Set,
}

# Types whose instances are labelled and named from their value.
NUMERIC_TYPES = {int, float, bool}

# Name of the module holding the builtins.
BUILTINS_NAME = 'builtins'

# AST node classes treated as literals when walking a Python AST.
LITERALS = (ast.Num, ast.Str)

class _CObject(object):
    '''Utility class to wrap arbitrary C objects.
    Treat all objects as unique. Rely on naming in the
    trap files to merge the objects that we want merged.
    '''
    __slots__ = ['obj']

    def __init__(self, obj):
        self.obj = obj

    def __eq__(self, other):
        if isinstance(other, _CObject):
            return self.obj is other.obj
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return id(self.obj)

class ObjectPass(Pass):
    '''Generates relations for objects. This includes information about
    builtin objects, including their types and members.
    It also generates objects for all literal values present in the Python source.'''

    def extract(self, ast, path, writer):
        '''Write the objects for one Python source file using `writer`.

        Walks `ast` and writes an object for each numeric and string literal,
        then writes objects for the module names that `path` could denote.
        `self.writer` is set for the duration of the call and reset to None
        afterwards, even if extraction raises.

        Note that the `ast` parameter shadows the module-level `ast` import
        inside this method.
        '''
        self.writer = writer
        try:
            self._extract_py(ast)
            self._extract_possible_module_names(path)
        finally:
            # Never keep a reference to the writer once the file is done.
            self.writer = None

    def _extract_possible_module_names(self, path):
        '''Write objects for every module name that `path` might denote.

        The file extension is dropped and path separators become dots; only
        the last four dotted components are kept. Every suffix of what
        remains (longest first) that `is_legal_module_name` accepts is
        written, together with its enclosing package names.
        '''
        dotted = os.path.splitext(path)[0].replace(os.sep, ".")
        # At most three dots, i.e. the four trailing components, are considered.
        components = dotted.split(".")[-4:]
        for start in range(len(components)):
            candidate = ".".join(components[start:])
            if is_legal_module_name(candidate):
                self._write_module_and_package_names(candidate)

    def _write_module_and_package_names(self, module_name):
        '''Write a string object for `module_name` and for each package
        prefix of it, e.g. "a.b.c", then "a.b", then "a".'''
        self._write_c_object(module_name, None, False)
        package, dot, _ = module_name.rpartition(".")
        # `dot` is empty once there is no further enclosing package.
        while dot:
            self._write_c_object(package, None, False)
            package, dot, _ = package.rpartition(".")

    def extract_builtin(self, module, writer):
        '''Write `module` and the objects reachable from it using `writer`.

        `self.writer` is set for the duration of the call and reset to None
        afterwards, even if extraction raises.
        '''
        self.writer = writer
        try:
            self._extract_c(module)
        finally:
            # Never keep a reference to the writer once the module is done.
            self.writer = None

    def _extract_c(self, mod):
        '''Write `mod` (and, recursively, its type and members) via
        `_write_c_object`, without writing out special objects in full.'''
        # NOTE(review): next_address_label / address_labels are set up and torn
        # down here but are not read by any code visible in this file -
        # presumably used elsewhere or vestigial; confirm.
        self.next_address_label = 0
        self.address_labels = {}
        self._write_c_object(mod, None, False)
        self.address_labels = None

    def _write_str(self, s):
        '''Write an object for the string `s`, which must be exactly a `str`.'''
        # Sanity check only: asserts are stripped when Python runs with -O.
        assert type(s) is str
        self._write_c_object(s, None, False)

    def _write_c_object(self, obj, label, write_special, string_prefix=""):
        '''Write `obj`, and the objects reachable from it, using `self.writer`.

        Emits `py_cobjects`, `py_cobjecttypes`, `py_cobject_sources` and
        `py_cobjectnames` tuples for `obj`, recursing into its type, its
        members (modules, classes and getset descriptors) and its items
        (lists and tuples).

        obj: the object to write.
        label: fallback label passed to `get_label_for_object` for objects
            that have no global label of their own; may be None.
        write_special: when false, objects listed in SPECIAL_OBJECTS only
            get an id and are not written out in full.
        string_prefix: prefix of the string literal `obj` came from, if any;
            selects between the bytes and unicode flavours of a string.

        Returns the id the writer assigned to `obj`.
        '''
        ANALYSIS_MAJOR_VERSION = get_analysis_major_version()
        # If we're extracting Python 2 code using Python 3, we want to treat `str` as `bytes` for
        # the purposes of determining the type, but we still want to treat the _value_ as if it's a `str`.
        obj_type = type(obj)
        if obj_type == str and ANALYSIS_MAJOR_VERSION == 2 and 'u' not in string_prefix:
            obj_type = bytes

        # Wrap the object so that the writer tracks it by identity, not equality.
        cobj = _CObject(obj)
        # An object the writer has already seen is not written again; this
        # also terminates the recursion on cyclic references (e.g. `type`).
        if self.writer.has_written(cobj):
            return self.writer.get_node_id(cobj)
        obj_label = self.get_label_for_object(obj, label, obj_type)
        obj_id = self.writer.get_labelled_id(cobj, obj_label)
        #Avoid writing out all the basic types for every C module.
        if not write_special and cobj in SPECIAL_OBJECTS:
            return obj_id
        type_id = self._write_c_object(obj_type, None, write_special)
        self.writer.write_tuple(u'py_cobjects', 'r', obj_id)
        self.writer.write_tuple(u'py_cobjecttypes', 'rr', obj_id, type_id)
        # NOTE(review): the meaning of the constant 0 is not visible here -
        # presumably a source-kind code; confirm against the database schema.
        self.writer.write_tuple(u'py_cobject_sources', 'rd', obj_id, 0)
        # NOTE(review): obj_label is None when get_label_for_object falls back
        # to a None `label`; the label concatenations below would then raise
        # TypeError. Confirm callers never reach them with such an object.
        if isinstance(obj, ModuleType) or isinstance(obj, type):
            # Members are visited in name order so the output is deterministic.
            for name, value in sorted(obj.__dict__.items()):
                if (obj, name) in SKIPLIST:
                    continue
                # The member's fallback label is derived from the owner's
                # label, the analysis version and the member name.
                val_id = self._write_c_object(value, obj_label + u'$%d' % ANALYSIS_MAJOR_VERSION + name, write_special)
                self.writer.write_tuple(u'py_cmembers_versioned', 'rsrs',
                                        obj_id, name, val_id, ANALYSIS_MAJOR_VERSION)
            # Record the next class in the MRO as the pseudo-member ".super.";
            # `object` is excluded as its MRO has no second entry.
            if isinstance(obj, type) and obj is not object:
                super_id = self._write_c_object(obj.__mro__[1], None, write_special)
                self.writer.write_tuple(u'py_cmembers_versioned', 'rsrs',
                                        obj_id, u".super.", super_id, ANALYSIS_MAJOR_VERSION)
        if isinstance(obj, (list, tuple)):
            # Each item's fallback label is the container's label plus its index.
            for index, item in enumerate(obj):
                item_id = self._write_c_object(item, obj_label + u'$' + str(index), write_special)
                self.writer.write_tuple(u'py_citems', 'rdr',
                                        obj_id, index, item_id)
        if type(obj) is GetSetDescriptorType:
            # Write the attributes declared on the descriptor type that this
            # descriptor actually has, except `__name__`.
            for name in type(obj).__dict__:
                if name == '__name__' or not hasattr(obj, name):
                    continue
                val_id = self._write_c_object(getattr(obj, name), obj_label + u'$%d' % ANALYSIS_MAJOR_VERSION + name, write_special)
                self.writer.write_tuple(u'py_cmembers_versioned', 'rsrs',
                                        obj_id, name, val_id, ANALYSIS_MAJOR_VERSION)
        # Finally write exactly one `py_cobjectnames` tuple for the object.
        if hasattr(obj, '__name__'):
            #Use qualified names for classes.
            if isinstance(obj, type):
                name = qualified_type_name(obj)
            # https://bugs.python.org/issue18602
            elif isinstance(obj, ModuleType) and obj.__name__ == "io":
                name = "_io"
            # EXEC is a stand-in object for `exec`, so it is named accordingly.
            elif obj is EXEC:
                name = "exec"
            else:
                name = obj.__name__
            self.writer.write_tuple(u'py_cobjectnames', 'rs',
                                    obj_id, name)
        elif type(obj) in NUMERIC_TYPES:
            # Numbers are named by their value.
            self.writer.write_tuple(u'py_cobjectnames', 'rq',
                                    obj_id, obj)
        elif type(obj) is str:
            # An explicit prefix wins; an unprefixed string is bytes when
            # analysing Python 2 and unicode otherwise.
            if 'b' in string_prefix:
                prefix = u"b"
            elif 'u' in string_prefix:
                prefix = u"u"
            else:
                if ANALYSIS_MAJOR_VERSION == 2:
                    prefix = u"b"
                else:
                    prefix = u"u"
            self.writer.write_tuple(u'py_cobjectnames', 'rs',
                                    obj_id, prefix + u"'" + obj + u"'")
        elif type(obj) is bytes:
            #Convert bytes to a unicode characters one-to-one.
            obj_string = u"b'" + obj.decode("latin-1") + u"'"
            self.writer.write_tuple(u'py_cobjectnames', 'rs',
                                    obj_id, obj_string)
        elif type(obj) is type(None):
            self.writer.write_tuple(u'py_cobjectnames', 'rs',
                                    obj_id, u'None')
        else:
            # Anything else gets the generic name.
            self.writer.write_tuple(u'py_cobjectnames', 'rs',
                                    obj_id, u'object')
        return obj_id

    def write_special_objects(self, writer):
        '''Write important builtin objects to the trap file.

        Every entry of SPECIAL_OBJECTS is written out in full, followed by two
        artificial objects, and each gets a `py_special_objects` tuple giving
        its special name.

        `self.writer` and `self.address_labels` are reset to None afterwards,
        even if writing raises, matching `extract` and `extract_builtin`.
        '''
        self.writer = writer
        self.next_address_label = 0
        self.address_labels = {}

        def write(obj, name, label=None):
            # write_special=True so that special objects are written in full.
            obj_id = self._write_c_object(obj, label, True)
            self.writer.write_tuple(u'py_special_objects', 'rs', obj_id, name)

        try:
            # Keys of SPECIAL_OBJECTS are _CObject wrappers; unwrap them.
            for obj, name in SPECIAL_OBJECTS.items():
                write(obj.obj, name)

            ###Artificial objects for use by the type-inferencer - Make sure that they are unique.
            write(object(), u"_semmle_unknown_type", u"$_semmle_unknown_type")
            write(object(), u"_semmle_undefined_value", u"$_semmle_undefined_value")
        finally:
            # Do not leave a stale writer on the pass if writing fails.
            self.writer = None
            self.address_labels = None

    def get_label_for_object(self, obj, default_label, obj_type):
        """Gets a label for an object. Attempt to make this as universal as possible.
        The object graph in the database should reflect the real object graph,
        only rarely diverging. This should be true even in highly parallel environments
        including cases where trap files may be overwritten.
        Proviso: Distinct immutable primitive objects may be merged (which should be benign)
        For objects without an unambiguous global name, 'default_label' is used.

        obj: the object to label.
        default_label: label returned when no global label can be derived;
            may be None.
        obj_type: the type to treat `obj` as; for strings this selects the
            `C_unicode$` or `C_bytes$` label prefix.

        Returns the label, or `default_label`.
        """
        #This code must be robust against (possibly intentionally) incorrect implementations
        #of the object model.
        if obj is None:
            return u"C_None"
        t = type(obj)
        t_name = t.__name__
        if t is tuple and len(obj) == 0:
            return u"C_EmptyTuple"

        if obj_type is str:
            prefix = u"C_unicode$"
        else:
            prefix = u"C_bytes$"
        # Strings and bytes are labelled by a hash of their contents, so equal
        # values share a label.
        if t is str:
            obj = obj.encode("utf8", errors='replace')
            return prefix + hashlib.sha1(obj).hexdigest()
        if t is bytes:
            return prefix + hashlib.sha1(obj).hexdigest()
        if t in NUMERIC_TYPES:
            return u"C_" + t_name + u"$" + repr(obj)
        try:
            if isinstance(obj, type):
                return u"C_" + t_name + u"$" + qualified_type_name(obj)
        except Exception:
            #Misbehaved object.
            return default_label
        if t is ModuleType:
            return u"C_" + t_name + u"$" + obj.__name__
        if t is type(len):
            # Builtin (C) functions are globally named by module and name.
            # Those without a string `__module__` fall through to the default.
            mod_name = obj.__module__
            if isinstance(mod_name, str):
                if mod_name == BUILTINS_NAME:
                    mod_name = "builtins"
                # This return applies to every module, not only builtins;
                # otherwise functions such as `math.sin` get no global label.
                return u"C_" + t_name + u"$" + mod_name + "." + obj.__name__
        return default_label

    # Python files -- Extract objects for all numeric and string values.

    def _extract_py(self, ast):
        '''Write objects for the literals found in the tree `ast`.

        The `ast` parameter shadows the module-level `ast` import here.
        '''
        self._walk_py(ast)

    def _write_literal(self, node):
        '''Write the object(s) for a numeric or string literal node.

        A number is written as its value. A string is written as a bytes
        object unless it carries a `u` prefix, and as a unicode object
        unless it carries a `b` prefix, so an unprefixed string is written
        as both.
        '''
        if isinstance(node, ast.Num):
            self._write_c_object(node.n, None, False)
            return

        prefix = getattr(node, "prefix", "")
        # Output both byte and unicode objects if the relevant objects could exist
        # Non-prefixed strings can be either bytes or unicode.
        may_be_bytes = 'u' not in prefix
        may_be_unicode = 'b' not in prefix
        if may_be_bytes:
            try:
                self._write_c_object(node.s.encode("latin-1"), None, False, string_prefix=prefix)
            except UnicodeEncodeError:
                #If not encodeable as latin-1 then it cannot be bytes
                pass
        if may_be_unicode:
            self._write_c_object(node.s, None, False, string_prefix=prefix)

    def _walk_py(self, node):
        '''Recursively visit `node`, writing an object for each literal.

        AST nodes that are literals are written and not descended into;
        other AST nodes have every field visited; lists have every element
        visited; anything else is ignored.
        '''
        if not isinstance(node, ast.AstBase):
            if isinstance(node, list):
                for element in node:
                    self._walk_py(element)
            return
        if isinstance(node, LITERALS):
            self._write_literal(node)
            return
        for _name, _unused, child in iter_fields(node):
            self._walk_py(child)

def a_function():
    '''Do nothing; this function exists only so that its type can be taken.'''
    return None

def a_generator_function():
    '''Yield a single None; exists only so that the generator type can be taken.'''
    yield

class C(object):
    '''Throwaway class whose bound method is used to obtain the method type.'''

    def meth(self):
        '''Do nothing.'''
        return None

# Create an object for 'exec', as the parser no longer treats it as a statement.
# Use `[].append` as it has the same type as `exec`.
# ObjectPass._write_c_object gives this object the name "exec".
EXEC = [].append

# Maps each special object to the name written for it in the
# `py_special_objects` relation by ObjectPass.write_special_objects.
# Entries are written in insertion order.
SPECIAL_OBJECTS = {
    type(a_function): u"FunctionType",
    type(len): u"BuiltinFunctionType",
    classmethod: u"ClassMethod",
    staticmethod: u"StaticMethod",
    type(sys): u"ModuleType",
    type(a_generator_function()): u"generator",
    None: u"None",
    type(None): u"NoneType",
    True: u"True",
    False: u"False",
    bool: u"bool",
    sys: u"sys",
    Exception: u"Exception",
    BaseException: u"BaseException",
    TypeError: u"TypeError",
    AttributeError: u"AttributeError",
    KeyError: u"KeyError",
    int: u"int",
    float: u"float",
    object: u"object",
    type: u"type",
    tuple: u"tuple",
    dict: u"dict",
    list: u"list",
    set: u"set",
    locals: u"locals",
    globals: u"globals",
    property: u"property",
    type(list.append): u"MethodDescriptorType",
    super: u"super",
    type(C().meth): u"MethodType",
    # Two fresh placeholder objects, reserved for future enhancements.
    object(): u"_1",
    object(): u"_2",
    # Make sure we have all version numbers as single character strings.
    b'2': u'b2',
    b'3': u'b3',
    u'2': u'u2',
    u'3': u'u3',
}

SPECIAL_OBJECTS[__import__(BUILTINS_NAME)] = u"builtin_module"
SPECIAL_OBJECTS[str] = u"unicode"
SPECIAL_OBJECTS[bytes] = u"bytes"

# Store wrapped versions of special objects, so that they compare correctly:
# _CObject compares by identity, so a lookup matches only the exact objects
# listed above and never a merely equal one (e.g. `1` does not match `True`).
# Note that the loop variables `obj` and `name` remain defined at module level.
tmp = {}
for obj, name in SPECIAL_OBJECTS.items():
    tmp[_CObject(obj)] = name
SPECIAL_OBJECTS = tmp
del tmp

# (owner, attribute name) pairs for VM implementation details that are
# skipped when the members of a module or class are written out.
SKIPLIST = {
    (sys, "exc_value"),
    (sys, "exc_type"),
    (sys, "exc_traceback"),
    (__import__(BUILTINS_NAME), "_"),
}

def qualified_type_name(cls):
    '''Return the name used for the class `cls`.

    `bytes` and `str` always map to "bytes" and "unicode" so that they
    share names across versions. Classes from the builtins module (or from
    a module called "exceptions") use their bare name; every other class is
    qualified as "<module>.<name>".
    '''
    if cls is bytes:
        return u"bytes"
    if cls is str:
        return u"unicode"
    is_builtin = cls.__module__ == BUILTINS_NAME or cls.__module__ == "exceptions"
    return cls.__name__ if is_builtin else cls.__module__ + "." + cls.__name__