Files
codeql/python/tools/recorded-call-graph-metrics/cg_trace.py
2020-07-07 11:05:03 +02:00

225 lines
6.5 KiB
Python
Executable File

#!/usr/bin/env python3
"""Call Graph tracing.
Execute a python program and for each call being made, record the call and callable. This
allows us to compare call graph resolution from static analysis with actual data -- that
is, can we statically determine the target of each actual call correctly.
If there is 100% code coverage from the Python execution, it would also be possible to
look at the precision of the call graph resolutions -- that is, do we expect a function to
be able to be called in a place where it is not? Currently not something we're looking at.
"""
# read: https://eli.thegreenplace.net/2012/03/23/python-internals-how-callables-work/
# TODO: Know that a call to a C-function was made. See
# https://docs.python.org/3/library/bdb.html#bdb.Bdb.trace_dispatch. Maybe use `lxml` as
# test
# For inspiration, look at these projects:
# - https://github.com/joerick/pyinstrument (capture call-stack every <n> ms for profiling)
# - https://github.com/gak/pycallgraph (display call-graph with graphviz after python execution)
import argparse
import bdb
from io import StringIO
import sys
import os
import dis
import dataclasses
import csv
import xml.etree.ElementTree as ET
# Copy-Paste and uncomment for interactive ipython sessions
# import IPython; IPython.embed(); sys.exit()
@dataclasses.dataclass(frozen=True)
class Call:
    """The site of a call: where in the program a callable was invoked.

    Instances are immutable and hashable, so they can be stored in a set.
    """

    # Canonical (as produced by `bdb.Bdb.canonic`) name of the calling file.
    filename: str
    # Line number the calling frame was executing when the call happened.
    linenum: int
    # Bytecode offset (`f_lasti`) of the calling frame at the time of the call.
    inst_index: int

    @classmethod
    def from_frame(cls, frame, debugger: bdb.Bdb):
        """Build a `Call` from the frame that performed the call.

        `frame` is the caller's frame; `debugger` is only used to canonicalize
        the filename of the frame's code object.
        """
        # Debugging aid -- dump the caller's bytecode with the current offset marked:
        #   print(dis.Bytecode(frame.f_code, current_offset=frame.f_lasti).dis(),
        #         file=sys.__stderr__)
        canonical_name = debugger.canonic(frame.f_code.co_filename)
        return cls(canonical_name, frame.f_lineno, frame.f_lasti)
@dataclasses.dataclass(frozen=True)
class Callable():
    """A callable (Function/Lambda) should (hopefully) be uniquely identified by its name and
    location (filename+line number)

    TODO: Callable is maybe not a good name, since classes with __call__ will return true
    for the python code `callable(cls)` -- will have to consider how __call__ is handled
    """

    # Name of the code object being entered (`co_name`).
    funcname: str
    # Canonical (as produced by `bdb.Bdb.canonic`) name of the defining file.
    filename: str
    # First line of the callable's definition (`co_firstlineno`).
    linenum: int

    @classmethod
    def from_frame(cls, frame, debugger: bdb.Bdb):
        """Build a `Callable` from the frame of the callable being entered.

        `frame` is the callee's frame; `debugger` is only used to canonicalize
        the filename of the frame's code object.
        """
        code = frame.f_code
        return cls(
            funcname=code.co_name,
            filename=debugger.canonic(code.co_filename),
            # Use the definition line of the code object rather than
            # `frame.f_lineno`: the latter is the line currently executing,
            # which for a resumed generator/coroutine frame is the line of the
            # `yield`/`await`, so the same callable would otherwise be recorded
            # under several different line numbers.
            linenum=code.co_firstlineno,
        )
class CallGraphTracer(bdb.Bdb):
    """Debugger subclass that collects every (call site, callee) pair it sees.

    The standard `trace` library (https://docs.python.org/3/library/trace.html)
    looks like the natural base for this, but it is not extensible. The basic
    debugger (`bdb`) is, and it offers a bit more help than using
    `sys.settrace` directly.
    """

    # Set of `(Call, Callable)` tuples observed so far.
    recorded_calls: set

    def __init__(self):
        super().__init__()
        self.recorded_calls = set()

    def user_call(self, frame, argument_list):
        """Record the call that created `frame`.

        `frame` is the frame of the callable being entered; its `f_back` is the
        frame the call was made from. `argument_list` is not used.
        """
        call_site = Call.from_frame(frame.f_back, self)
        target = Callable.from_frame(frame, self)
        # Debugging aid: _print(f'{call_site} -> {target}')
        self.recorded_calls.add((call_site, target))
################################################################################
# Export
################################################################################
class Exporter:
    """Base class for exporters of recorded calls; subclasses override `export`."""

    @staticmethod
    def export(recorded_calls, outfile_path):
        """Write `recorded_calls` to `outfile_path`. Not implemented on the base class."""
        raise NotImplementedError()

    @staticmethod
    def dataclass_to_dict(obj):
        """Return the fields of dataclass instance `obj` as a flat dict.

        Every key is the field name prefixed with the lower-cased class name and
        an underscore, so that the fields of different dataclasses can be merged
        into one dict without clashing.
        """
        fields = dataclasses.asdict(obj)
        class_prefix = type(obj).__name__.lower()
        return {
            "_".join((class_prefix, field_name)): field_value
            for field_name, field_value in fields.items()
        }
class CSVExporter(Exporter):
    """Exporter that writes one CSV row per recorded (call, callable) pair."""

    @staticmethod
    def export(recorded_calls, outfile_path):
        """Write `recorded_calls` as CSV to `outfile_path` and report the path on stdout.

        The header is derived from the first row, so nothing (not even a header)
        is written when `recorded_calls` is empty.
        """
        rows = (
            {
                **Exporter.dataclass_to_dict(call_site),
                **Exporter.dataclass_to_dict(target),
            }
            for call_site, target in recorded_calls
        )

        with open(outfile_path, 'w', newline='') as csv_file:
            first_row = next(rows, None)
            if first_row is not None:
                writer = csv.DictWriter(csv_file, fieldnames=list(first_row))
                writer.writeheader()
                writer.writerow(first_row)
                # the generator has been advanced past the first row already
                writer.writerows(rows)

        print(f'output written to {outfile_path}')
class XMLExporter(Exporter):
    """Exporter that writes one `<recorded_call>` element per (call, callable) pair."""

    @staticmethod
    def export(recorded_calls, outfile_path):
        """Write `recorded_calls` as UTF-8 encoded XML to `outfile_path`.

        All fields of the call and the callable become attributes of a
        `recorded_call` element underneath a single `root` element.
        """
        root = ET.Element('root')

        for call_site, target in recorded_calls:
            fields = dict(Exporter.dataclass_to_dict(call_site))
            fields.update(Exporter.dataclass_to_dict(target))
            # ElementTree can only serialize attribute values that are strings
            string_attrs = {name: str(value) for name, value in fields.items()}
            ET.SubElement(root, 'recorded_call', string_attrs)

        ET.ElementTree(root).write(outfile_path, encoding='utf-8')
################################################################################
# __main__
################################################################################
if __name__ == "__main__":
    # Command line: [--csv FILE | --xml FILE] progname [arguments ...]
    parser = argparse.ArgumentParser()
    parser.add_argument('--csv')
    parser.add_argument('--xml')
    parser.add_argument('progname', help='file to run as main program')
    parser.add_argument('arguments', nargs=argparse.REMAINDER,
                        help='arguments to the program')
    opts = parser.parse_args()

    # The details of setting up the program to be run are very much inspired by
    # `trace` from the standard library.
    # Make the traced program see itself as the script that was invoked.
    sys.argv = [opts.progname, *opts.arguments]
    sys.path[0] = os.path.dirname(opts.progname)

    # Read the source as bytes so that `compile` applies Python's own source
    # decoding rules (encoding declaration, UTF-8 default) instead of the
    # locale's preferred text encoding.
    with open(opts.progname, 'rb') as fp:
        code = compile(fp.read(), opts.progname, 'exec')

    # try to emulate __main__ namespace as much as possible
    globs = {
        '__file__': opts.progname,
        '__name__': '__main__',
        '__package__': None,
        '__cached__': None,
    }

    # Capture whatever the traced program prints, so it does not get mixed up
    # with our own output; it is replayed at the very end.
    real_stdout = sys.stdout
    captured_stdout = StringIO()
    cgt = CallGraphTracer()

    sys.stdout = captured_stdout
    try:
        cgt.run(code, globs, globs)
    finally:
        # Always restore stdout, also when the traced program raises.
        sys.stdout = real_stdout

    # `--csv` takes precedence over `--xml`; without either option the
    # recorded calls are printed.
    if opts.csv:
        CSVExporter.export(cgt.recorded_calls, opts.csv)
    elif opts.xml:
        XMLExporter.export(cgt.recorded_calls, opts.xml)
    else:
        for (call, callable) in cgt.recorded_calls:
            print(f'{call} -> {callable}')

    print('--- captured stdout ---')
    print(captured_stdout.getvalue(), end='')