Files
codeql/python/tools/recorded-call-graph-metrics/cg_trace/tracer.py
2020-07-17 14:40:54 +02:00

207 lines
6.0 KiB
Python

import dataclasses
import dis
import os
import sys
from typing import Optional
# Copy-paste for interactive IPython sessions
# import IPython; sys.stdout = sys.__stdout__; IPython.embed(); sys.exit()
def debug_print(*args, **kwargs):
    """Debug output hook; accepts the same arguments as `print` and discards them.

    To get tracing output, enable the commented-out `print` call below, which
    writes to the real stderr.
    """
    # print(*args, **kwargs, file=sys.__stderr__)
    return None
# Maps a filename as given to its canonical form, so each one is resolved once.
_canonic_filename_cache = {}


def canonic_filename(filename):
    """Return the canonical form of `filename` (mirrors `Bdb.canonic`).

    A real filename is turned into an absolute, case-normalized path (case
    normalization only has an effect on case insensitive filesystems). Pseudo
    filenames wrapped in angle brackets, such as "<stdin>" from interactive mode,
    are handed back untouched and are never cached.
    """
    is_pseudo_filename = filename == "<" + filename[1:-1] + ">"
    if is_pseudo_filename:
        return filename

    cached = _canonic_filename_cache.get(filename)
    if cached:
        return cached

    resolved = os.path.normcase(os.path.abspath(filename))
    _canonic_filename_cache[filename] = resolved
    return resolved
@dataclasses.dataclass(frozen=True, eq=True, order=True)
class Call:
    """A call site, identified by where in the calling code the call happens.

    Attributes:
        filename: Canonical filename of the code object making the call.
        linenum: Line number the calling frame is at (`frame.f_lineno`).
        inst_index: Bytecode instruction index of the calling frame
            (`frame.f_lasti`).
    """

    filename: str
    linenum: int
    inst_index: int

    @classmethod
    def from_frame(cls, frame):
        """Build a `Call` from the frame in which the call is being made."""
        code = frame.f_code

        # This runs for every recorded call, so the (costly) disassembly of the
        # calling code object is opt-in rather than done unconditionally.
        # Uncomment to see the bytecode:
        # b = dis.Bytecode(code, current_offset=frame.f_lasti)
        # debug_print(b.dis())

        return cls(
            filename=canonic_filename(code.co_filename),
            linenum=frame.f_lineno,
            inst_index=frame.f_lasti,
        )
def better_compare_for_dataclass(cls):
    """Class decorator that lets an ordered dataclass be compared with other types.

    When a dataclass is created with `order=True`, the generated comparison methods
    are only implemented for objects of the same class. This decorator wraps
    `__lt__`, `__le__`, `__gt__` and `__ge__` so that:

    * operands of exactly the same type still use the original dataclass comparison;
    * operands of different types are ordered by comparing their class names.

    Returns the same class object, modified in place.
    """

    def make_comparison(op_name, same_class_compare):
        # A factory is needed so every wrapper closes over its *own* operator name
        # and original method. Closing over the loop variables directly would make
        # all four wrappers use whatever the last loop iteration assigned.
        def compare(self, other):
            if type(self) is type(other):
                return same_class_compare(self, other)
            return getattr(str, op_name)(
                self.__class__.__name__, other.__class__.__name__
            )

        compare.__name__ = op_name
        return compare

    for op in ("__lt__", "__le__", "__gt__", "__ge__"):
        setattr(cls, op, make_comparison(op, getattr(cls, op)))

    return cls
@dataclasses.dataclass(frozen=True, eq=True, order=True)
class Callee:
    """Common base class for the kinds of call targets; declares no fields."""
# The type shared by builtin functions and methods, obtained from `print`.
BUILTIN_FUNCTION_OR_METHOD = type(print)


@better_compare_for_dataclass
@dataclasses.dataclass(frozen=True, eq=True, order=True)
class ExternalCallee(Callee):
    """A call target described by the function object itself rather than a frame.

    Identified by the module and qualified name reported by the function object.
    """

    # Some bound methods might not have __module__ attribute: for example,
    # `list().append.__module__ is None`
    # NOTE(review): with `order=True`, ordering two instances where only one has
    # `module=None` compares None with a str -- confirm this cannot happen when
    # results are sorted.
    module: Optional[str]
    qualname: str
    # Whether the function object's type is exactly BUILTIN_FUNCTION_OR_METHOD.
    is_builtin: bool

    @classmethod
    def from_arg(cls, func):
        """Build an `ExternalCallee` from the function object `func`."""
        func_is_builtin = type(func) is BUILTIN_FUNCTION_OR_METHOD
        return cls(
            module=func.__module__,
            qualname=func.__qualname__,
            is_builtin=func_is_builtin,
        )
@better_compare_for_dataclass
@dataclasses.dataclass(frozen=True, eq=True, order=True)
class PythonCallee(Callee):
    """A call target implemented in Python (Function/Lambda/???).

    The intent is that the callee is (hopefully) uniquely identified by its name
    together with its location (filename + line number).
    """

    filename: str
    linenum: int
    funcname: str

    @classmethod
    def from_frame(cls, frame):
        """Build a `PythonCallee` from the frame that was created for the callee."""
        callee_code = frame.f_code
        # NOTE(review): `f_lineno` is read from the frame at the time of the event;
        # presumably this is the definition line for a fresh call -- confirm for
        # frames that are re-entered (e.g. generators).
        location = canonic_filename(callee_code.co_filename)
        return cls(
            filename=location,
            linenum=frame.f_lineno,
            funcname=callee_code.co_name,
        )
class CallGraphTracer:
    """Tracer that records the calls made while running a piece of code.

    Extending the `trace` library (https://docs.python.org/3/library/trace.html)
    would seem like the obvious approach, but that part is not extensible.

    Using `sys.settrace`
    (https://docs.python.org/3.8/library/sys.html#sys.settrace), like the basic
    debugger (bdb) does, is not enough either: it isn't invoked on calls to C code,
    which we need in general, and need for handling builtins specifically.

    Luckily, `sys.setprofile`
    (https://docs.python.org/3.8/library/sys.html#sys.setprofile) provides all that
    we need. Its documentation contains this scary-looking remark:

    > The function is thread-specific, but there is no way for the profiler to know
    > about context switches between threads, so it does not make sense to use this
    > in the presence of multiple threads.

    That remark is about writing an actual profiler (function execution time can't
    be measured reliably without knowing about context switches). For our use-case,
    this is not a problem.
    """

    # Set of `(call, callee)` pairs collected by `profilefunc`.
    recorded_calls: set

    def __init__(self):
        self.recorded_calls = set()

    def run(self, code, globals, locals):
        """Execute `code` with `exec` while recording calls into `recorded_calls`.

        The profile function is installed right before the `exec` and is always
        removed again afterwards, even if the executed code raises.
        """
        # Reset the state `profilefunc` uses to find the start and end of the exec.
        self.exec_call_seen = False
        self.ignore_rest = False
        try:
            sys.setprofile(self.profilefunc)
            exec(code, globals, locals)
        # TODO: exception handling?
        finally:
            sys.setprofile(None)

    def profilefunc(self, frame, event, arg):
        """Profile hook for `sys.setprofile`; records "call" and "c_call" events."""
        # Everything up to and including the first "call" event is skipped, since
        # that is the `exec` from the `run` method above.
        if not self.exec_call_seen:
            if event == "call":
                self.exec_call_seen = True
            return

        # Once we are going out of the exec, anything else should be ignored (for
        # example the call to `sys.setprofile(None)`).
        returning_from_exec = (
            event == "c_return"
            and arg == exec
            and frame.f_code.co_filename == __file__
        )
        if returning_from_exec:
            self.ignore_rest = True

        if self.ignore_rest or event not in ("call", "c_call"):
            return

        debug_print(f"profilefunc {event=}")

        if event == "call":
            # For "call", `frame` is the new frame for entering the callee, so the
            # call site is found in its parent frame.
            call = Call.from_frame(frame.f_back)
            callee = PythonCallee.from_frame(frame)
        else:
            # For "c_call", `frame` is the frame where the call happens, and `arg`
            # is the C function object.
            call = Call.from_frame(frame)
            callee = ExternalCallee.from_arg(arg)

        debug_print(f"{call} --> {callee}")
        debug_print("\n" * 5)

        self.recorded_calls.add((call, callee))