diff --git a/python/tools/recorded-call-graph-metrics/README.md b/python/tools/recorded-call-graph-metrics/README.md
new file mode 100644
index 00000000000..a249dce5a84
--- /dev/null
+++ b/python/tools/recorded-call-graph-metrics/README.md
@@ -0,0 +1,17 @@
+# Recorded Call Graph Metrics
+
+Also known as _call graph tracing_.
+
+Execute a Python program and, for each call being made, record the call and callable. This allows us to compare call graph resolution from static analysis with actual data -- that is, can we statically determine the target of each actual call correctly?
+
+This is still in the early stages, and currently only supports a very minimal working example (to show that this approach might work).
+
+The next hurdle is being able to handle multiple calls on the same line, such as
+
+- `foo(); bar()`
+- `foo(bar())`
+- `foo().bar()`
+
+## How do I give it a spin?
+
+Run the `recreate-db.sh` script to create the database `cg-trace-example-db`, which will include the `example/simple.xml` trace from executing the `example/simple.py` code. Then run the queries inside the `ql/` directory.
diff --git a/python/tools/recorded-call-graph-metrics/cg_trace.py b/python/tools/recorded-call-graph-metrics/cg_trace.py
new file mode 100755
index 00000000000..67256e47b3e
--- /dev/null
+++ b/python/tools/recorded-call-graph-metrics/cg_trace.py
@@ -0,0 +1,224 @@
+#!/usr/bin/env python3
+
+"""Call Graph tracing.
+
+Execute a Python program and, for each call being made, record the call and callable. This
+allows us to compare call graph resolution from static analysis with actual data -- that
+is, can we statically determine the target of each actual call correctly?
+
+If there is 100% code coverage from the Python execution, it would also be possible to
+look at the precision of the call graph resolutions -- that is, do we expect a function to
+be able to be called in a place where it is not? Currently not something we're looking at.
+"""
+
+# read: https://eli.thegreenplace.net/2012/03/23/python-internals-how-callables-work/
+
+# TODO: Know that a call to a C-function was made. See
+# https://docs.python.org/3/library/bdb.html#bdb.Bdb.trace_dispatch. Maybe use `lxml` as
+# test
+
+# For inspiration, look at these projects:
+# - https://github.com/joerick/pyinstrument (capture call-stack every ms for profiling)
+# - https://github.com/gak/pycallgraph (display call-graph with graphviz after python execution)
+
+import argparse
+import bdb
+import contextlib
+from io import StringIO
+import sys
+import os
+import dis
+import dataclasses
+import csv
+import xml.etree.ElementTree as ET
+
+# Copy-Paste and uncomment for interactive ipython sessions
+# import IPython; IPython.embed(); sys.exit()
+
+
+@dataclasses.dataclass(frozen=True)
+class Call:
+    """A call site: where, in the calling code, a call was made from."""
+
+    # canonical (via `bdb.Bdb.canonic`) filename of the code object doing the call
+    filename: str
+    # `f_lineno` of the calling frame at the time of the call
+    linenum: int
+    # `f_lasti` of the calling frame, i.e. index of the last attempted bytecode instruction
+    inst_index: int
+
+    @classmethod
+    def from_frame(cls, frame, debugger: bdb.Bdb):
+        """Build a `Call` from `frame`, the frame that performs the call (the caller).
+
+        `debugger` is only used to canonicalize the filename.
+        """
+        code = frame.f_code
+
+        # Uncomment to see the bytecode
+        # b = dis.Bytecode(frame.f_code, current_offset=frame.f_lasti)
+        # print(b.dis(), file=sys.__stderr__)
+
+        return cls(
+            filename=debugger.canonic(code.co_filename),
+            linenum=frame.f_lineno,
+            inst_index=frame.f_lasti,
+        )
+
+
+@dataclasses.dataclass(frozen=True)
+class Callable:
+    """A callable (Function/Lambda) should (hopefully) be uniquely identified by its name and
+    location (filename+line number)
+
+    TODO: Callable is maybe not a good name, since classes with __call__ will return true
+    for the python code `callable(cls)` -- will have to consider how __call__ is handled
+    """
+
+    # `co_name` of the code object being entered
+    funcname: str
+    # canonical (via `bdb.Bdb.canonic`) filename of the code object being entered
+    filename: str
+    # `f_lineno` of the entered frame when the call event fires
+    # NOTE(review): when a generator is resumed this is presumably the line it resumes
+    # at rather than its `def` line -- confirm before relying on it for identity
+    linenum: int
+
+    @classmethod
+    def from_frame(cls, frame, debugger: bdb.Bdb):
+        """Build a `Callable` from `frame`, the frame that was just entered (the callee).
+
+        `debugger` is only used to canonicalize the filename.
+        """
+        code = frame.f_code
+        return cls(
+            funcname=code.co_name,
+            filename=debugger.canonic(code.co_filename),
+            linenum=frame.f_lineno,
+        )
+
+
+class CallGraphTracer(bdb.Bdb):
+    """Tracer that records calls being made
+
+    It would seem obvious that this should have extended `trace` library
+    (https://docs.python.org/3/library/trace.html), but that part is not extensible --
+    however, the basic debugger (bdb) is, and provides maybe a bit more help than just
+    using `sys.settrace` directly.
+    """
+
+    # set of `(Call, Callable)` pairs; a pair seen several times is stored once
+    recorded_calls: set
+
+    def __init__(self):
+        self.recorded_calls = set()
+        super().__init__()
+
+    def user_call(self, frame, argument_list):
+        """Record the (call site, callable) pair for the frame that was just entered."""
+        caller_frame = frame.f_back
+        if caller_frame is None:
+            # without a calling frame there is no call site to attribute this call to
+            return
+
+        call = Call.from_frame(caller_frame, self)
+        callee = Callable.from_frame(frame, self)
+
+        self.recorded_calls.add((call, callee))
+
+
+################################################################################
+# Export
+################################################################################
+
+
+class Exporter:
+    """Base class for exporters, plus helpers shared by the concrete exporters."""
+
+    @staticmethod
+    def export(recorded_calls, outfile_path):
+        """Write `recorded_calls` (iterable of `(Call, Callable)`) to `outfile_path`."""
+        raise NotImplementedError()
+
+    @staticmethod
+    def dataclass_to_dict(obj):
+        """Return the fields of dataclass instance `obj` as a dict whose keys are
+        prefixed with the lowercased class name, e.g. `call_filename`.
+        """
+        prefix = obj.__class__.__name__.lower()
+        return {f"{prefix}_{key}": val for key, val in dataclasses.asdict(obj).items()}
+
+    @staticmethod
+    def to_row(call, callee):
+        """Merge a `(Call, Callable)` pair into a single flat dict (call fields first)."""
+        return {
+            **Exporter.dataclass_to_dict(call),
+            **Exporter.dataclass_to_dict(callee),
+        }
+
+    @staticmethod
+    def sorted_calls(recorded_calls):
+        """Return the recorded pairs as a list sorted by their field values.
+
+        The tracer stores pairs in a `set`, whose iteration order can differ between
+        runs; sorting makes the exported files reproducible.
+        """
+        return sorted(
+            recorded_calls,
+            key=lambda pair: tuple(map(dataclasses.astuple, pair)),
+        )
+
+
+class CSVExporter(Exporter):
+    """Write recorded calls as CSV, one row per `(Call, Callable)` pair."""
+
+    @staticmethod
+    def export(recorded_calls, outfile_path):
+        # Derive the header from the dataclass definitions rather than from the first
+        # row, so that a header is written even when no calls were recorded.
+        fieldnames = [
+            f"{cls.__name__.lower()}_{field.name}"
+            for cls in (Call, Callable)
+            for field in dataclasses.fields(cls)
+        ]
+
+        with open(outfile_path, 'w', newline='', encoding='utf-8') as csv_file:
+            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
+            writer.writeheader()
+            for call, callee in Exporter.sorted_calls(recorded_calls):
+                writer.writerow(Exporter.to_row(call, callee))
+
+        print(f'output written to {outfile_path}')
+
+
+class XMLExporter(Exporter):
+    """Write recorded calls as `recorded_call` elements below a single `root` element."""
+
+    @staticmethod
+    def export(recorded_calls, outfile_path):
+        root = ET.Element('root')
+
+        for call, callee in Exporter.sorted_calls(recorded_calls):
+            # this xml library only supports serializing attributes that have string values
+            attrib = {k: str(v) for k, v in Exporter.to_row(call, callee).items()}
+            ET.SubElement(root, 'recorded_call', attrib)
+
+        tree = ET.ElementTree(root)
+        tree.write(outfile_path, encoding='utf-8')
+
+
+################################################################################
+# __main__
+################################################################################
+
+
+def main():
+    """Trace the program named on the command line and export/print the recorded calls."""
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--csv')
+    parser.add_argument('--xml')
+
+    parser.add_argument('progname', help='file to run as main program')
+    parser.add_argument('arguments', nargs=argparse.REMAINDER,
+                        help='arguments to the program')
+
+    opts = parser.parse_args()
+
+    # These details of setting up the program to be run is very much inspired by `trace`
+    # from the standard library
+    sys.argv = [opts.progname, *opts.arguments]
+    sys.path[0] = os.path.dirname(opts.progname)
+
+    # Read the source as bytes so that `compile` decodes it (honoring an encoding
+    # declaration in the file) instead of decoding with the locale's default encoding.
+    with open(opts.progname, 'rb') as fp:
+        code = compile(fp.read(), opts.progname, 'exec')
+
+    # try to emulate __main__ namespace as much as possible
+    globs = {
+        '__file__': opts.progname,
+        '__name__': '__main__',
+        '__package__': None,
+        '__cached__': None,
+    }
+
+    captured_stdout = StringIO()
+    cgt = CallGraphTracer()
+
+    # `redirect_stdout` puts the real stdout back even if the traced program raises
+    with contextlib.redirect_stdout(captured_stdout):
+        cgt.run(code, globs, globs)
+
+    if opts.csv:
+        CSVExporter.export(cgt.recorded_calls, opts.csv)
+    elif opts.xml:
+        XMLExporter.export(cgt.recorded_calls, opts.xml)
+    else:
+        for call, callee in Exporter.sorted_calls(cgt.recorded_calls):
+            print(f'{call} -> {callee}')
+
+    print('--- captured stdout ---')
+    print(captured_stdout.getvalue(), end='')
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/tools/recorded-call-graph-metrics/example/simple.py b/python/tools/recorded-call-graph-metrics/example/simple.py
new file mode 100644
index 00000000000..626d402cb20
--- /dev/null
+++ b/python/tools/recorded-call-graph-metrics/example/simple.py
@@ -0,0 +1,10 @@
+def foo():
+    print('foo')
+
+def bar():
+    print('bar')
+
+foo()
+bar()
+
+foo(); bar()
diff --git 
a/python/tools/recorded-call-graph-metrics/example/simple.xml b/python/tools/recorded-call-graph-metrics/example/simple.xml
new file mode 100644
index 00000000000..94ceb3a7923
--- /dev/null
+++ b/python/tools/recorded-call-graph-metrics/example/simple.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff --git a/python/tools/recorded-call-graph-metrics/ql/PointsToFound.ql b/python/tools/recorded-call-graph-metrics/ql/PointsToFound.ql
new file mode 100644
index 00000000000..9510aec598e
--- /dev/null
+++ b/python/tools/recorded-call-graph-metrics/ql/PointsToFound.ql
@@ -0,0 +1,9 @@
+import RecordedCalls
+
+from ValidRecordedCall rc, Call call, Function callable, CallableValue callableValue
+where
+  call = rc.getCall() and
+  callable = rc.getCallable() and
+  callableValue.getScope() = callable and
+  callableValue.getACall() = call.getAFlowNode()
+select call, "-->", callable
diff --git a/python/tools/recorded-call-graph-metrics/ql/RecordedCalls.qll b/python/tools/recorded-call-graph-metrics/ql/RecordedCalls.qll
new file mode 100644
index 00000000000..01b6bf82f2e
--- /dev/null
+++ b/python/tools/recorded-call-graph-metrics/ql/RecordedCalls.qll
@@ -0,0 +1,47 @@
+import python
+
+/** An XML element named `recorded_call`, as written by the XML exporter of `cg_trace.py`. */
+class RecordedCall extends XMLElement {
+  RecordedCall() {
+    this.hasName("recorded_call")
+  }
+
+  /** Gets the value of the `call_filename` attribute. */
+  string call_filename() { result = this.getAttributeValue("call_filename") }
+
+  /** Gets the value of the `call_linenum` attribute, as an integer. */
+  int call_linenum() { result = this.getAttributeValue("call_linenum").toInt() }
+
+  /** Gets the value of the `call_inst_index` attribute, as an integer. */
+  int call_inst_index() { result = this.getAttributeValue("call_inst_index").toInt() }
+
+  /** Gets a `Call` whose location is in the recorded call file and starts on the recorded call line. */
+  Call getCall() {
+    // TODO: handle calls spanning multiple lines
+    result.getLocation().hasLocationInfo(this.call_filename(), this.call_linenum(), _, _, _)
+  }
+
+  /** Gets the value of the `callable_filename` attribute. */
+  string callable_filename() { result = this.getAttributeValue("callable_filename") }
+
+  /** Gets the value of the `callable_linenum` attribute, as an integer. */
+  int callable_linenum() { result = this.getAttributeValue("callable_linenum").toInt() }
+
+  /** Gets the value of the `callable_funcname` attribute. */
+  string callable_funcname() { result = this.getAttributeValue("callable_funcname") }
+
+  /** Gets a `Function` whose location is in the recorded callable file and starts on the recorded callable line. */
+  Function getCallable() {
+    result.getLocation().hasLocationInfo(this.callable_filename(), this.callable_linenum(), _, _, _)
+  }
+}
+
+/**
+ * Class of recorded calls where we can uniquely identify both the `call` and the `callable`.
+ */
+class ValidRecordedCall extends RecordedCall {
+  ValidRecordedCall() {
+    strictcount(this.getCall()) = 1 and
+    strictcount(this.getCallable()) = 1
+  }
+}
diff --git a/python/tools/recorded-call-graph-metrics/ql/UnidentifiedRecordedCalls.ql b/python/tools/recorded-call-graph-metrics/ql/UnidentifiedRecordedCalls.ql
new file mode 100644
index 00000000000..b2f85832f8c
--- /dev/null
+++ b/python/tools/recorded-call-graph-metrics/ql/UnidentifiedRecordedCalls.ql
@@ -0,0 +1,7 @@
+import RecordedCalls
+
+from RecordedCall rc
+where not rc instanceof ValidRecordedCall
+select "Could not uniquely identify this recorded call (either call or callable was not uniquely identified)",
+  rc.call_filename(), rc.call_linenum(), rc.call_inst_index(), "-->", rc.callable_filename(),
+  rc.callable_linenum(), rc.callable_funcname()
diff --git a/python/tools/recorded-call-graph-metrics/ql/qlpack.yml b/python/tools/recorded-call-graph-metrics/ql/qlpack.yml
new file mode 100644
index 00000000000..1b4b6c0ca8c
--- /dev/null
+++ b/python/tools/recorded-call-graph-metrics/ql/qlpack.yml
@@ -0,0 +1,4 @@
+name: codeql-python-recorded-call-graph-metrics
+version: 0.0.1
+libraryPathDependencies: codeql-python
+extractor: python
diff --git a/python/tools/recorded-call-graph-metrics/recreate-db.sh b/python/tools/recorded-call-graph-metrics/recreate-db.sh
new file mode 100755
index 00000000000..8c41f51e9ba
--- /dev/null
+++ b/python/tools/recorded-call-graph-metrics/recreate-db.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+#
+# Re-create the example database: trace example/simple.py with cg_trace.py, then
+# build a CodeQL database from the Python sources and the resulting XML trace.
+
+set -e
+set -x
+
+# Every path below is relative to the directory this script lives in, so switch to
+# it first; otherwise the script only works when invoked from that directory.
+cd "$(dirname "$0")"
+
+DB="cg-trace-example-db"
+SRC="example/"
+XMLDIR="$SRC"
+PYTHON_EXTRACTOR=$(codeql resolve extractor --language=python)
+
+
+./cg_trace.py --xml example/simple.xml example/simple.py
+
+rm -rf "$DB"
+
+
+codeql database init --source-root="$SRC" --language=python "$DB"
+codeql database trace-command --working-dir="$SRC" "$DB" "$PYTHON_EXTRACTOR/tools/autobuild.sh"
+codeql database index-files --language xml --include-extension .xml --working-dir="$XMLDIR" "$DB"
+codeql database finalize "$DB"
+
+set +x
+echo "Created database '$DB'"