codeql/python/tools/recorded-call-graph-metrics/cg_trace/tracer.py

import dataclasses
import dis
import os
import sys
from typing import Optional

# Copy-paste the following line to drop into an interactive IPython session:
# import IPython; sys.stdout = sys.__stdout__; IPython.embed(); sys.exit()


def debug_print(*args, **kwargs):
    """Debugging hook that is a no-op by default.

    Accepts any positional and keyword arguments and discards them. To get actual
    output while debugging, enable the `print` call below.
    """
    # print(*args, **kwargs, file=sys.__stderr__)
    return None


# Memo of raw filename -> canonical filename, so that each distinct filename only goes
# through path normalization once.
_canonic_filename_cache = {}


def canonic_filename(filename):
    """Canonicalize `filename` in the same way as `Bdb.canonic`.

    Pseudo-filenames wrapped in angle brackets (such as "<stdin>", as generated in
    interactive mode) are handed back untouched. Any other filename is made absolute
    and passed through `os.path.normcase` (which case-normalizes on case insensitive
    filesystems). Results are memoized in `_canonic_filename_cache`.
    """
    bracketed = "<" + filename[1:-1] + ">"
    if filename == bracketed:
        return filename

    cached = _canonic_filename_cache.get(filename)
    if cached:
        return cached

    resolved = os.path.normcase(os.path.abspath(filename))
    _canonic_filename_cache[filename] = resolved
    return resolved


@dataclasses.dataclass(frozen=True, eq=True, order=True)
class Call:
    """A call site, identified by file, line number and bytecode instruction index.

    Instances are immutable, hashable and orderable (field by field, in declaration
    order), so they can be stored in sets and sorted.
    """

    # Canonical filename (see `canonic_filename`) of the code performing the call
    filename: str
    # Line number the frame was at when it was captured (`frame.f_lineno`)
    linenum: int
    # Index of the last attempted bytecode instruction in the frame (`frame.f_lasti`)
    inst_index: int

    @classmethod
    def from_frame(cls, frame):
        """Build a `Call` describing where `frame` is currently executing.

        `frame` is the frame in which the call happens (not the frame of the callee).
        """
        code = frame.f_code

        # This runs for every recorded call, so the bytecode is NOT disassembled by
        # default. To see the bytecode around the call while debugging, uncomment the
        # following line (and enable `debug_print`):
        # debug_print(dis.Bytecode(code, current_offset=frame.f_lasti).dis())

        return cls(
            filename=canonic_filename(code.co_filename),
            linenum=frame.f_lineno,
            inst_index=frame.f_lasti,
        )


def better_compare_for_dataclass(cls):
    """Class decorator making the ordering methods of a dataclass work across classes.

    When dataclass is used with `order=True`, the comparison methods are only
    implemented for objects of the same class. This decorator wraps `__lt__`, `__le__`,
    `__gt__` and `__ge__` so that:

    - objects of the exact same type are compared with the original method, and
    - objects of different types are ordered by comparing their class names (as
      strings) with the same operator.

    The class is modified in place and returned.
    """

    def make_comparison(op, same_class_compare):
        # Each wrapper needs its OWN `op` and original method. Defining the wrapper
        # directly inside the loop below would make every wrapper share the loop
        # variables, and thereby all behave like the last operator in the list.
        def compare(self, other):
            if type(self) is type(other):
                return same_class_compare(self, other)
            return getattr(str, op)(self.__class__.__name__, other.__class__.__name__)

        compare.__name__ = op
        return compare

    for op in ("__lt__", "__le__", "__gt__", "__ge__"):
        setattr(cls, op, make_comparison(op, getattr(cls, op)))
    return cls


@dataclasses.dataclass(frozen=True, eq=True, order=True)
class Callee:
    """Common base class for the different kinds of callees; declares no fields."""


# The type shared by builtin functions and methods, obtained from a builtin function
BUILTIN_FUNCTION_OR_METHOD = type(len)


@better_compare_for_dataclass
@dataclasses.dataclass(frozen=True, eq=True, order=True)
class ExternalCallee(Callee):
    """A callee described by attributes of the called function object itself, rather
    than by a Python frame.
    """

    # `__module__` of the function object. This can be None: for example,
    # `list().append.__module__ is None`
    module: Optional[str]
    # `__qualname__` of the function object
    qualname: str
    # Whether the type of the function object is exactly BUILTIN_FUNCTION_OR_METHOD
    is_builtin: bool

    @classmethod
    def from_arg(cls, func):
        """Build an `ExternalCallee` from the function object `func`."""
        module_name = func.__module__
        qualified_name = func.__qualname__
        exact_type = type(func)
        return cls(
            module_name,
            qualified_name,
            exact_type is BUILTIN_FUNCTION_OR_METHOD,
        )


@better_compare_for_dataclass
@dataclasses.dataclass(frozen=True, eq=True, order=True)
class PythonCallee(Callee):
    """A callee implemented in Python (Function/Lambda/???)

    The function name together with the location (filename + line number) should
    (hopefully) be enough to identify it uniquely.
    """

    # Canonical filename (see `canonic_filename`) of the callee's code object
    filename: str
    # Line number of the callee's frame at the time it was captured
    linenum: int
    # Name of the callee's code object (`co_name`)
    funcname: str

    @classmethod
    def from_frame(cls, frame):
        """Build a `PythonCallee` from `frame`, the frame executing the callee."""
        callee_code = frame.f_code
        location = canonic_filename(callee_code.co_filename)
        return cls(location, frame.f_lineno, callee_code.co_name)


class CallGraphTracer:
    """Tracer that records calls being made

    It would seem obvious that this should have extended `trace` library
    (https://docs.python.org/3/library/trace.html), but that part is not extensible.

    You might think that we can just use `sys.settrace`
    (https://docs.python.org/3.8/library/sys.html#sys.settrace) like the basic debugger
    (bdb) does, but that isn't invoked on calls to C code, which we need in general, and
    need for handling builtins specifically.

    Luckily, `sys.setprofile`
    (https://docs.python.org/3.8/library/sys.html#sys.setprofile) provides all that we
    need. You might be scared by reading the following bit of the documentation

    > The function is thread-specific, but there is no way for the profiler to know about
    > context switches between threads, so it does not make sense to use this in the
    > presence of multiple threads.

    but that is to be understood in the context of making a profiler (you can't reliably
    measure function execution time if you don't know about context switches). For our
    use-case, this is not a problem.

    After `run`, the `recorded_calls` attribute holds a set of `(call, callee)` pairs,
    where `call` is a `Call` and `callee` is a `PythonCallee` or an `ExternalCallee`.
    Calls accumulate over multiple invocations of `run`.
    """

    recorded_calls: set

    def __init__(self):
        self.recorded_calls = set()
        # Bookkeeping for `profilefunc`; both flags are reset at the start of `run`.
        self.exec_call_seen = False
        self.ignore_rest = False

    def run(self, code, globals, locals):
        """Execute `code` with `exec(code, globals, locals)` while recording calls.

        Any exception raised by the executed code propagates to the caller. The
        profile function is always removed again, whether or not `code` raised.
        """
        self.exec_call_seen = False
        self.ignore_rest = False
        try:
            sys.setprofile(self.profilefunc)
            exec(code, globals, locals)
        finally:
            sys.setprofile(None)

    def profilefunc(self, frame, event, arg):
        """Profile function installed with `sys.setprofile` by `run`.

        Records a `(call, callee)` pair in `recorded_calls` for every "call" and
        "c_call" event that happens inside the code executed by `run`.
        """
        # ignore everything until the first call, since that is `exec` from the `run`
        # method above
        if not self.exec_call_seen:
            if event == "call":
                self.exec_call_seen = True
            return

        if self.ignore_rest:
            return

        # if we're going out of the exec, we should ignore anything else (for example the
        # call to `sys.setprofile(None)`). When the executed code raises, leaving `exec`
        # is reported as "c_exception" instead of "c_return", so both are handled.
        if event in ("c_return", "c_exception"):
            if arg is exec and frame.f_code.co_filename == __file__:
                self.ignore_rest = True
            return

        if event == "call":
            # in call, the `frame` argument is the new frame for entering the callee
            call = Call.from_frame(frame.f_back)
            callee = PythonCallee.from_frame(frame)
        elif event == "c_call":
            # in c_call, the `frame` argument is frame where the call happens, and the
            # `arg` argument is the C function object.
            call = Call.from_frame(frame)
            callee = ExternalCallee.from_arg(arg)
        else:
            return

        debug_print(f"profilefunc {event=}")
        debug_print(f"{call} --> {callee}")
        debug_print("\n" * 5)
        self.recorded_calls.add((call, callee))