# codeql/python/extractor/semmle/util.py

import sys
import codecs
import gzip
import re
import os.path
import random
import base64
import hashlib
from io import BytesIO

#Semantic version of extractor.
#Update this if any changes are made
VERSION = "7.1.6"

# Tuple of file-name extensions; presumably those treated as Python
# source files by the extractor (usage is not visible in this module).
PY_EXTENSIONS = ".py", ".pyw"

# Directory that contains the `os` module of the running interpreter.
STDLIB_PATH = os.path.dirname(os.__file__)

def get_analysis_version():
    'Return the version string most recently passed to update_analysis_version().'
    return PYTHON_ANALYSIS_VERSION

def get_analysis_major_version():
    'Return the major version (2 or 3) derived by update_analysis_version().'
    return PYTHON_ANALYSIS_MAJOR_VERSION

def update_analysis_version(version):
    '''Record `version` as the Python version being analysed.

    Sets the module globals PYTHON_ANALYSIS_VERSION (the string as given)
    and PYTHON_ANALYSIS_MAJOR_VERSION (2 if the string starts with "2",
    otherwise 3).
    '''
    global PYTHON_ANALYSIS_VERSION, PYTHON_ANALYSIS_MAJOR_VERSION
    PYTHON_ANALYSIS_VERSION = version
    if version.startswith("2"):
        PYTHON_ANALYSIS_MAJOR_VERSION = 2
    else:
        PYTHON_ANALYSIS_MAJOR_VERSION = 3

# Initialise from the environment at import time, defaulting to Python 3.
update_analysis_version(os.environ.get("CODEQL_EXTRACTOR_PYTHON_ANALYSIS_VERSION", "3"))

#Flow graph labels:
#These should be powers of two, to allow use of bitsets
#(each label occupies its own bit, so labels can be OR-ed together).
NORMAL_EDGE = 1
FALSE_EDGE = 2
TRUE_EDGE = 4
EXCEPTIONAL_EDGE = 8
EXHAUSTED_EDGE = 16

class SemmleError(Exception):
    """Custom Error class, for reporting errors."""

#A printf of our own sidesteps the print statement/function split between Python 2 and 3.
def printf(fmt, *args):
    '''Apply the % operator to `fmt` and `args`, then write the result to sys.stdout.
    No newline is appended.'''
    text = fmt % args
    sys.stdout.write(text)

def fprintf(fout, fmt, *args):
    '''Apply the % operator to `fmt` and `args`, then write the result to `fout`.
    No newline is appended.'''
    text = fmt % args
    fout.write(text)

def safe_string(txt):
    '''Return `txt` with double quotes doubled, truncated for output.

    Text longer than 10000 characters is cut to its first 10000 characters
    followed by "..." before the quotes are doubled.
    '''
    limit = 10000
    clipped = txt if len(txt) <= limit else txt[:limit] + u"..."
    return clipped.replace(u'"', u'""')

def escaped_string(txt):
    'Return `txt` with every double quote doubled; the length is not limited.'
    pieces = txt.split(u'"')
    return u'""'.join(pieces)


if os.name == 'nt':

    # Prefix prepended to absolute paths by safe_path() on Windows.
    MAGIC_PREFIX = u"\\\\?\\"

    def safe_path(path):
        'Returns an absolute path, safe for use on all OSes regardless of length.'
        if path.startswith(MAGIC_PREFIX):
            return path
        return MAGIC_PREFIX + os.path.abspath(path)

    # Keep a handle on the builtin before shadowing it below.
    _open = open

    def open(path, *args, **kwargs):
        '''Replacement for the builtin `open` that insists on a safe_path()-ed path.

        All further positional and keyword arguments (mode, encoding, ...)
        are forwarded unchanged to the builtin.
        '''
        # Debug-time invariant: callers must already have applied safe_path().
        assert safe_path(path) == path
        return _open(path, *args, **kwargs)

else:

    def safe_path(path):
        'Returns an absolute path, safe for use on all OSes regardless of length.'
        # Already-absolute paths are returned untouched (not normalised).
        if os.path.isabs(path):
            return path
        return os.path.abspath(path)

# Marker line text; presumably separates hand-written from generated content
# in some output file -- usage is not visible in this module, confirm at the callers.
AUTO_GEN_STRING = "/* AUTO GENERATED PART STARTS HERE */\n"

def folder_tag(name):
    'Return `name` with the ";folder" suffix appended.'
    suffix = ';folder'
    return name + suffix

def trap_id_escape(s):
    """Escapes characters that are interpreted specially in TRAP IDs"""
    # '&' has to be replaced first, otherwise the '&' of the entities
    # introduced by the later replacements would itself be escaped.
    replacements = (
        ("&", "&amp;"),
        ("{", "&lbrace;"),
        ("}", "&rbrace;"),
        ('"', "&quot;"),
        ('@', "&commat;"),
        ('#', "&num;"),
    )
    for special, entity in replacements:
        s = s.replace(special, entity)
    return s

def generate_formatting_function(fmt):
    '''Generate a new function that writes its arguments with the given format.

    `fmt` is a string of format characters, one per argument; each character
    must be a key of `_formatting_functions` (otherwise KeyError is raised).
    For example, for the format string "dd", this function will create the
    following function:
         def format_dd(self, name, arg0, arg1):
             self.out.write(u'%s(%s, %s)\\n' % (name, repr(arg0), repr(arg1)))

    The generated function resolves names such as `safe_string` through this
    module's globals, but is itself defined in a throw-away namespace, so the
    module globals are never modified.
    '''
    func_name = 'format_' + fmt
    arg_names = ['arg%d' % i for i in range(len(fmt))]
    params = ['self', 'name'] + arg_names
    # One source-code expression per argument, chosen by its format character.
    values = [_formatting_functions[f](a) for f, a in zip(fmt, arg_names)]
    format_string = "u'%s(" + ', '.join(['%s'] * len(fmt)) + ")\\n'"
    defn = 'def %s(%s):\n' % (func_name, ', '.join(params))
    body = '    self.out.write(%s %% (%s))\n' % (format_string, ',\n'.join(['name'] + values))
    # Separate locals dict: the `def` binds its name there instead of in the
    # module globals, so nothing is clobbered and no cleanup is required.
    scope = {}
    exec(defn + body, globals(), scope)
    return scope[func_name]

def _format_d(val):
    return 'repr(%s)' % val

def _format_g(val):
    return 'self.pool.get(%s, %s)' % (val, val)

def _format_n(val):
    return '''self.pool.get(%s, %s.trap_name) if hasattr(%s, 'trap_name') else self.pool.get(%s)''' % (val, val, val, val)

def _format_r(val):
    return val

def _format_u(val):
    return '''_INVALID_RE.sub(u'\uFFFD', u'"%%s"' %% safe_string(%s))''' % val

def _format_b(val):
    return '''u'"%%s"' %% safe_string(%s.decode("latin-1"))''' % val

def _format_s(val):
    return '''%s if isinstance(%s, bytes) else _INVALID_RE.sub(u'\uFFFD', u'"%%s"' %% safe_string(str(%s)))''' % (_format_b(val), val, val)

def _format_B(val):
    return '''u'"%%s"' %% escaped_string(%s.decode("latin-1"))''' % val

def _format_S(val):
    return '''%s if isinstance(%s, bytes) else _INVALID_RE.sub(u'\uFFFD', u'"%%s"' %% escaped_string(str(%s)))''' % (_format_B(val), val, val)

def _format_x(val):
    return '''(u"false", u"true")[%s]''' % val

def _format_q(val):
    return 'format_numeric_literal(%s)' % val

_formatting_functions = {
    'b' : _format_b,
    'd' : _format_d,
    'g' : _format_g,
    'n' : _format_n,
    'r' : _format_r,
    's' : _format_s,
    'u' : _format_u,
    'x' : _format_x,
    'q' : _format_q,
    'B' : _format_B,
    'S' : _format_S,
}


def format_numeric_literal(val):
    'Return the repr() of `val` wrapped in double quotes.'
    quote = u'"'
    return quote + repr(val) + quote

class Buffer(object):
    '''Write-combining wrapper around an output stream.

    Pieces are collected in a list and handed to the wrapped stream as one
    joined string, either on flush()/close() or once more than 10000 pieces
    have accumulated.
    '''

    def __init__(self, out):
        self.out = out
        self.buf = []

    def write(self, content):
        'Queue `content`; flush when the queue holds more than 10000 pieces.'
        pending = self.buf
        pending.append(content)
        if len(pending) > 10000:
            self.flush()

    def flush(self):
        'Write all queued pieces to the wrapped stream as a single string.'
        joined = u''.join(self.buf)
        self.out.write(joined)
        self.buf = []

    def close(self):
        'Flush whatever is queued, then close the wrapped stream.'
        self.flush()
        self.out.close()

class Utf8Zip(object):
    '''In-memory sink that UTF-8 encodes and gzip-compresses the text written to it.

    Characters that cannot be encoded are written using backslash escapes.
    '''

    def __init__(self):
        self.raw = BytesIO()
        compressor = gzip.GzipFile(filename='', mode='wb', compresslevel=5, fileobj=self.raw)
        writer_factory = codecs.getwriter('utf-8')
        self.out = writer_factory(compressor, errors='backslashreplace')

    def write(self, data):
        'Encode `data` and feed it to the compressor.'
        self.out.write(data)

    def close(self):
        'Close the encoding writer.'
        self.out.close()

    def getvalue(self):
        'Return the bytes accumulated so far in the underlying buffer.'
        return self.raw.getvalue()


class TrapWriter(object):
    '''Builds the contents of a TRAP file in memory.

    Tuples are formatted as text and pushed through a Buffer into a Utf8Zip,
    i.e. they end up UTF-8 encoded and gzip-compressed. IDs are handed out by
    an IDPool that writes its ID definitions to the same output.
    '''

    # Cache of generated formatting functions, keyed by format string.
    # Class attribute: shared by every TrapWriter instance.
    _format_functions = {}

    def __init__(self):
        self.zip = Utf8Zip()
        self.out = Buffer(self.zip)
        self.pool = IDPool(self.out)
        # Maps container path -> tag for every container already written.
        self.written_containers = {}

    def write_tuple(self, name, fmt, *args):
        '''Write tuple accepts the following format characters:
           'b' : A bytes object. Limits the resulting string to ~10k.
           'd' : An integer
           'g' : A unicode object, as a globally shared object
           'n' : A node object (any AST, flow or variable node)
           'q' : A numeric literal, written as the quoted repr() of the value
           'r' : "Raw", a precomputed id or similar.
           's' : Any object to be written as a unicode string. Limits the string to ~10k.
           'u' : A unicode object, as a string
           'x' : A boolean
           'B' : Like 'b' but not limited to 10k
           'S' : Like 's' but not limited to 10k

           `fmt` holds one format character per entry of `args`.
        '''
        func = self._format_functions.get(fmt)
        if func is None:
            # First use of this format string: generate and cache its writer.
            func = generate_formatting_function(fmt)
            self._format_functions[fmt] = func
        return func(self, name, *args)

    def get_node_id(self, node):
        'Return the ID for `node`, labelled with its `trap_name` if it has one.'
        if hasattr(node, 'trap_name'):
            return self.pool.get(node, node.trap_name)
        else:
            return self.pool.get(node)

    def has_written(self, node):
        'Return True if an ID has already been allocated for `node`.'
        # Lists are unhashable; IDPool.get() stores them wrapped in a
        # _HashableList, so the lookup has to wrap them the same way.
        if type(node) is list:
            node = _HashableList(node)
        return node in self.pool.pool

    def get_unique_id(self):
        'Return a fresh ID that is not associated with any object.'
        return self.pool.get_unique_id()

    def get_labelled_id(self, obj, label):
        '''Return an id that is shared across trap files,
           whenever the label is used'''
        return self.pool.get(obj, label)

    def write_container(self, fullpath, is_file):
        '''Write the `files` or `folders` tuple for `fullpath`, followed by the
        tuples for its parent folders and the `containerparent` links.
        Returns the tag of `fullpath`; containers already written are skipped.
        '''
        if fullpath in self.written_containers:
            return self.written_containers[fullpath]
        folder, filename = os.path.split(fullpath)
        if is_file:
            tag = get_source_file_tag(fullpath)
            self.write_tuple(u'files', 'gs', tag, fullpath)
        else:
            tag = get_folder_tag(fullpath)
            self.write_tuple(u'folders', 'gs', tag, fullpath)
        self.written_containers[fullpath] = tag
        # An empty head or tail means there is no further parent to record.
        if folder and filename:
            parent_tag = self.write_container(folder, False)
            self.write_tuple(u'containerparent', 'gg', parent_tag, tag)
        return tag

    def write_file(self, fullpath):
        '''Writes `files` tuple plus all container tuples, up to the root.
        Returns the tag.
        Records tuples written to avoid duplication.
        '''
        return self.write_container(fullpath, True)

    def write_folder(self, fullpath):
        '''Writes `folders` tuple plus all container tuples, up to the root.
        Returns the tag.
        Records tuples written to avoid duplication.
        '''
        return self.write_container(fullpath, False)

    def get_compressed(self):
        '''Returns the gzipped compressed, utf-8 encoded contents of this trap file.
        Closes the underlying zip stream, which means that no more tuples can be added.'''
        self.out.close()
        return self.zip.getvalue()

    def write_comment(self, text):
        'Write `text` as a `//` comment line.'
        self.out.write(u'// %s\n' % text)

# RegEx to find invalid characters: matches any single character outside
# U+0000-U+D7FF and U+E000-U+FFFF, i.e. surrogate code points
# (U+D800-U+DFFF) and everything above U+FFFF.
_INVALID_RE = re.compile(u'[^\u0000-\uD7FF\uE000-\uFFFF]', re.UNICODE)

class _HashableList(object):
    'Utility class for handling lists in the IDPool'

    def __init__(self, items):
        self.items = items

    def __eq__(self, other):
        if not isinstance(other, _HashableList):
            return False
        return self.items is other.items

    def __ne__(self, other):
        if not isinstance(other, _HashableList):
            return True
        return self.items is not other.items

    def __hash__(self):
        return id(self.items)

class IDPool(object):
    '''Hands out `#<number>` IDs and writes their definitions to `out`.

    `pool` maps each object seen by get() to the ID allocated for it.
    Numbering starts at `init_id` and grows by one per allocated ID.
    '''

    def __init__(self, out, init_id = 10000):
        self.out = out
        self.pool = {}
        self.next_id = init_id

    def get_unique_id(self):
        'Allocate an ID that is not tied to any object and write it as a fresh (`*`) ID.'
        fresh = u'#' + str(self.next_id)
        self.out.write(fresh + u' = *\n')
        self.next_id += 1
        return fresh

    def get(self, node, name=None):
        """Gets the ID for the given node, creating a new one if necessary.
        Inside name (if supplied), the characters &, {, }, ", @, and # will be escaped,
        as these have special meaning in TRAP IDs
        """
        # Lists are unhashable, so they are keyed by identity via a wrapper.
        key = _HashableList(node) if type(node) is list else node
        try:
            return self.pool[key]
        except KeyError:
            pass
        label = u'#' + str(self.next_id)
        if name is None:
            definition = u'*'
        else:
            definition = u'@"%s"' % safe_string(trap_id_escape(str(name)))
        self.out.write(u"%s = %s\n" % (label, definition))
        self.pool[key] = label
        self.next_id += 1
        return label


def get_folder_tag(folder):
    'Return the tag for `folder`: the path with os.path.sep turned into "/", plus ";folder".'
    normalised = folder.replace(os.path.sep, '/')
    return normalised + ';folder'


def get_source_file_tag(fullpath):
    '''Return the tag for the source file `fullpath`: the path followed by
    ";sourcefile". The result is a single string.
    '''
    return fullpath + u';sourcefile'

def makedirs(path):
    '''Create the directory `path` including missing parents.

    An OSError is ignored if `path` is a directory afterwards (for instance
    because it already existed); otherwise the error is re-raised.
    '''
    try:
        os.makedirs(path)
    except OSError:
        if os.path.isdir(path):
            return
        #The directory is not there, so the failure was a genuine one.
        raise

def clean_cache(subdir, suffix, verbose):
    '''Delete every file in `subdir` whose name ends with `suffix`.

    If `subdir` does not exist it is created instead. Failures to delete a
    file are ignored (and reported when `verbose` is true).
    '''
    if not os.path.exists(subdir):
        makedirs(subdir)
        return
    #Any cached files already present are out of date by now.
    stale_names = (entry for entry in os.listdir(subdir) if entry.endswith(suffix))
    for entry in stale_names:
        target = os.path.join(subdir, entry)
        try:
            if verbose:
                print("Deleting stale trap file: " + target)
            os.remove(target)
        except Exception as ex:
            # Best effort: a file that cannot be removed is left in place.
            if verbose:
                print("Failed to remove stale trap file %s due to %s" % (target, repr(ex)))

if os.name == 'nt':

    def _long_path(path):
        # Paths of more than 240 characters get the \\?\ prefix; shorter
        # paths are passed through unchanged.
        return "\\\\?\\" + path if len(path) > 240 else path

    def storage_path(container, path):
        ''' Returns a path in a source archive, trap-output or trap-cache.'''
        # Colons (drive letters) cannot appear inside the container.
        relative = path.replace(":", "_")
        if os.path.isabs(relative):
            relative = relative[1:]
        return safe_path(os.path.join(container, relative))

    def isdir(path):
        return os.path.isdir(_long_path(path))

    def islink(path):
        return os.path.islink(_long_path(path))

    def listdir(path):
        return os.listdir(_long_path(path))

else:

    def storage_path(container, path):
        ''' Returns a path in a source archive, trap-output or trap-cache.'''
        # Drop the leading separator so that join() keeps the container.
        relative = path[1:] if os.path.isabs(path) else path
        return safe_path(os.path.join(container, relative))

    isdir, islink, listdir = os.path.isdir, os.path.islink, os.listdir


# Codec entries for latin-1 and utf-8, looked up once at import time.
LATIN1 = codecs.lookup("latin-1")
UTF8 = codecs.lookup("utf-8")

def was_interned_ascii_bytes(txt):
    'Return True if `txt` is the very object that sys.intern() yields for its value.'
    canonical = sys.intern(txt[:])
    return canonical is txt

def is_a_number(txt):
    'Return True if float() accepts `txt`, False if it raises ValueError.'
    try:
        float(txt)
    except ValueError:
        return False
    else:
        return True


#Should only be set to True for debugging and testing.
#Read by change_default_encoding() to choose between the strict codec
#functions (which raise SemmleError) and the tolerant ones.
USE_INTOLERANT_ENCODING = False

def change_default_encoding():
    '''Register a codec called "safe" and install it as the default encoding.

    The codec decodes bytes as UTF-8, falling back to latin-1, and encodes
    text as UTF-8 with backslash escapes for unencodable characters. When
    USE_INTOLERANT_ENCODING is set, implicit conversions triggered from a
    file whose name contains "semmle" raise SemmleError instead.

    NOTE(review): sys.setdefaultencoding is a Python 2 API, so this function
    is expected to fail with AttributeError on Python 3 -- confirm whether
    any caller still reaches it.
    '''

    if USE_INTOLERANT_ENCODING:

        def _decode(input, errors=None):
            '''If the input is interned (program source) or a number, then it is safe to implicitly convert it.
            Otherwise it may not be, so raise an exception'''
            if not was_interned_ascii_bytes(input) and not is_a_number(input):
                # Frame of whoever triggered the implicit conversion.
                f = sys._getframe(1)
                if "semmle" in f.f_code.co_filename:
                    # NOTE(review): bytes %-formatting with a str filename
                    # looks like it raises TypeError on Python 3 -- confirm.
                    raise SemmleError(b"Implicit decode of '%s' at %s:%d" % (input, f.f_code.co_filename, f.f_lineno))
            try:
                return UTF8.decode(input)
            except UnicodeDecodeError:
                # latin-1 is the fallback when the input is not valid UTF-8.
                return LATIN1.decode(input)

        def _encode(input, errors=None):
            # Implicit encodes triggered from a "semmle" file are rejected.
            f = sys._getframe(1)
            if "semmle" in f.f_code.co_filename:
                raise SemmleError("Implicit encode of '%s' at %s:%d" % (UTF8.encode(input), f.f_code.co_filename, f.f_lineno))
            return UTF8.encode(input, "backslashreplace")

    else:

        def _decode(input, errors=None):
            '''Convert bytes to unicode without failing.'''
            try:
                return UTF8.decode(input)
            except UnicodeDecodeError:
                return LATIN1.decode(input)

        def _encode(input, errors=None):
            '''Convert unicode to bytes without failing.'''
            return UTF8.encode(input, "backslashreplace")

    def search(name):
        # Codec search function: only answers for the name "safe".
        if name != "safe":
            return None
        return codecs.CodecInfo(_encode, _decode, name="safe")
    codecs.register(search)
    from importlib import reload
    # Reload sys before calling setdefaultencoding; presumably this restores
    # the attribute after interpreter start-up removed it (Python 2) -- TODO confirm.
    reload(sys)
    sys.setdefaultencoding("safe")
    del sys.setdefaultencoding

# Random source backed by random.SystemRandom.
_sys_rand = random.SystemRandom()

def uuid(local_name):
    '''Return a randomised string to use as a UUID.
    Do not use the uuid module as it calls out to ldconfig,
    which is prohibited in some sandboxed environments.

    The result is a random number below 2**256 in lowercase hexadecimal
    (without the "0x" prefix), then "-", then `local_name`.
    '''
    value = _sys_rand.randrange(1 << 256)
    return format(value, "x") + "-" + local_name


class Extractable(object):
    '''Base class for a unit of extraction.
    Typically a file, but may be other things like a built-in Python module.
    '''

    def __ne__(self, other):
        return not self == other

    @staticmethod
    def from_path(path):
        '''Return a FolderExtractable or FileExtractable for `path`.

        Raises IOError if `path` is neither an existing directory nor an
        existing file.
        '''
        if os.path.isdir(path):
            return FolderExtractable(path)
        if os.path.isfile(path):
            return FileExtractable(path)
        raise IOError("%s does not exist" % path)

class PathExtractable(Extractable):
    '''An Extractable identified by a file-system path.

    Two instances are equal when `other` is an instance of this object's
    class and both hold the same path.
    '''

    # Mixed into the hash; subclasses override it with their own value.
    PATTERN = 421706893

    __slots__ = [ 'path' ]

    def __init__(self, path):
        assert "<compiled code>" not in path
        self.path = path

    def __hash__(self):
        return self.PATTERN ^ hash(self.path)

    def __eq__(self, other):
        if not isinstance(other, type(self)):
            return False
        return self.path == other.path

class FileExtractable(PathExtractable):
    'A PathExtractable describing a file.'

    PATTERN = 1903946595

    __slots__ = [ 'path' ]

    def __repr__(self):
        template = "FileExtractable(%r)"
        return template % self.path

    def __str__(self):
        prefix = "file "
        return prefix + self.path


class FolderExtractable(PathExtractable):
    'A PathExtractable describing a folder.'

    PATTERN = 712343093

    __slots__ = [ 'path' ]

    def __repr__(self):
        template = "FolderExtractable(%r)"
        return template % self.path

    def __str__(self):
        prefix = "folder "
        return prefix + self.path

class BuiltinModuleExtractable(Extractable):
    'An Extractable identified by a module name rather than by a path.'

    __slots__ = [ 'name' ]

    def __init__(self, name):
        self.name = name

    def __hash__(self):
        return 82753421 ^ hash(self.name)

    def __eq__(self, other):
        if not isinstance(other, BuiltinModuleExtractable):
            return False
        return self.name == other.name

    def __repr__(self):
        template = "BuiltinModuleExtractable(%r)"
        return template % self.name

    def __str__(self):
        prefix = "module "
        return prefix + self.name

def base64digest(code):
    '''Return the SHA-1 digest of `code` (encoded as UTF-8) as base64 text,
    using "_" and "-" in place of "+" and "/".'''
    sha = hashlib.sha1()
    sha.update(code.encode("utf8"))
    encoded = base64.b64encode(sha.digest(), altchars=b"_-")
    return encoded.decode("ascii")