From c2748bf7cfbd599a7dd4e3bd0eea04cc6840c2be Mon Sep 17 00:00:00 2001 From: Rasmus Wriedt Larsen Date: Mon, 20 Jul 2020 11:28:05 +0200 Subject: [PATCH] Python: CG trace: reconstruct call expr from bytecode So we can differentiate multiple calls in one line. --- .../example-traces/builtins.xml | 107 ++++++++++++- .../example-traces/simple.xml | 141 +++++++++++++++++- .../recorded-call-graph-metrics/setup.py | 2 +- .../src/cg_trace/__init__.py | 2 +- .../src/cg_trace/bytecode_reconstructor.py | 135 +++++++++++++++++ .../src/cg_trace/cmdline.py | 1 - .../src/cg_trace/exporter.py | 71 ++++----- .../src/cg_trace/main.py | 41 +++-- .../src/cg_trace/tracer.py | 18 ++- 9 files changed, 447 insertions(+), 71 deletions(-) create mode 100644 python/tools/recorded-call-graph-metrics/src/cg_trace/bytecode_reconstructor.py diff --git a/python/tools/recorded-call-graph-metrics/example-traces/builtins.xml b/python/tools/recorded-call-graph-metrics/example-traces/builtins.xml index 60703c50cf2..c9dacc3159d 100644 --- a/python/tools/recorded-call-graph-metrics/example-traces/builtins.xml +++ b/python/tools/recorded-call-graph-metrics/example-traces/builtins.xml @@ -1,6 +1,105 @@ - - - - + + 0.0.2 + --xml example-traces/builtins.xml example/builtins.py + completed + 0.00 seconds + 2020-07-20T09:27:00 + + + + + /home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py + 1 + 4 + + + + + print + + + + + + + builtins + print + True + + + + + /home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py + 2 + 12 + + + + + len + + + + + + + builtins + len + True + + + + + /home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py + 4 + 28 + + + + + append + + + l + + + + + + + + + None + list.append + True + + + + + /home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py + 7 + 44 + + + + + getdefaultencoding + + + sys + + + + + + + + + sys + getdefaultencoding + True + + + diff --git a/python/tools/recorded-call-graph-metrics/example-traces/simple.xml b/python/tools/recorded-call-graph-metrics/example-traces/simple.xml index 25f494aec86..7aa14f62728 100644 --- a/python/tools/recorded-call-graph-metrics/example-traces/simple.xml +++ b/python/tools/recorded-call-graph-metrics/example-traces/simple.xml @@ -1,8 +1,137 @@ - - - - - - + + 0.0.2 + --xml example-traces/simple.xml example/simple.py + completed + 0.00 seconds + 2020-07-20T09:27:01 + + + + + /home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py + 2 + 4 + + + + + print + + + + + + + builtins + print + True + + + + + /home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py + 5 + 4 + + + + + print + + + + + + + builtins + print + True + + + + + /home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py + 7 + 18 + + + + + foo + + + + + + + /home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py + 1 + foo + + + + + /home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py + 8 + 24 + + + + + bar + + + + + + + /home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py + 4 + bar + + + + + /home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py + 10 + 30 + + + + + foo + + + + + + + /home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py + 1 + foo + + + + + /home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py + 10 + 36 + + + + + bar + + + + + + + /home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py + 4 + bar + + + diff --git a/python/tools/recorded-call-graph-metrics/setup.py b/python/tools/recorded-call-graph-metrics/setup.py index 9e41f1e0e4c..ef212fac8f1 100644 --- a/python/tools/recorded-call-graph-metrics/setup.py +++ b/python/tools/recorded-call-graph-metrics/setup.py @@ -4,7 +4,7 @@ from setuptools import find_packages, setup setup( name="cg_trace", - version="0.0.1", + version="0.0.2", # Remember to update src/cg_trace/__init__.py description="Call graph tracing", packages=find_packages("src"), package_dir={"": "src"}, diff --git a/python/tools/recorded-call-graph-metrics/src/cg_trace/__init__.py b/python/tools/recorded-call-graph-metrics/src/cg_trace/__init__.py index 9432512b4e7..582f75fa50d 100644 --- a/python/tools/recorded-call-graph-metrics/src/cg_trace/__init__.py +++ b/python/tools/recorded-call-graph-metrics/src/cg_trace/__init__.py @@ -1,6 +1,6 @@ import sys -__version__ = "0.0.1" +__version__ = "0.0.2" # remember to update setup.py # Since the virtual machine opcodes changed in 3.6, not going to attempt to support # anything before that. Using dataclasses, which is a new feature in Python 3.7 diff --git a/python/tools/recorded-call-graph-metrics/src/cg_trace/bytecode_reconstructor.py b/python/tools/recorded-call-graph-metrics/src/cg_trace/bytecode_reconstructor.py new file mode 100644 index 00000000000..9ab26bb8bfb --- /dev/null +++ b/python/tools/recorded-call-graph-metrics/src/cg_trace/bytecode_reconstructor.py @@ -0,0 +1,135 @@ +import dataclasses +import dis +import logging +from dis import Instruction +from types import FrameType +from typing import List + +LOGGER = logging.getLogger(__name__) + +# See https://docs.python.org/3/library/dis.html#python-bytecode-instructions for +# details on the bytecode instructions + +# TODO: read https://opensource.com/article/18/4/introduction-python-bytecode + + +class BytecodeExpr: + """An expression reconstructed from Python bytecode + """ + + +@dataclasses.dataclass(frozen=True, eq=True, order=True) +class BytecodeVariableName(BytecodeExpr): + name: str + + def __str__(self): + return self.name + + +@dataclasses.dataclass(frozen=True, eq=True, order=True) +class BytecodeAttribute(BytecodeExpr): + attr_name: str + object: BytecodeExpr + + def __str__(self): + return f"{self.object}.{self.attr_name}" + + +@dataclasses.dataclass(frozen=True, eq=True, order=True) +class BytecodeCall(BytecodeExpr): + function: BytecodeExpr + + def __str__(self): + return f"{self.function}()" + + +@dataclasses.dataclass(frozen=True, eq=True, order=True) +class BytecodeUnknown(BytecodeExpr): + opname: str + + def __str__(self): + return f"<{self.opname}>" + + +def find_inst_that_added_elem_to_stack( + instructions: List[Instruction], start_index: int, stack_pos: int +): + """Backwards traverse instructions + + Backwards traverse the instructions starting at `start_index` until we find the + instruction that added the element at stack position `stack_pos` (where 0 means top + of stack). For example, if the instructions are: + + ``` + 0: LOAD_GLOBAL 0 (func) + 1: LOAD_CONST 1 (42) + 2: CALL_FUNCTION 1 + ``` + + We can look for the function that is called by invoking this function with + `start_index = 1` and `stack_pos = 1`. It will see that `LOAD_CONST` added the top + element to the stack, and find that `LOAD_GLOBAL` was the instruction to add element + in stack position 1 to the stack -- so the index 0 is returned. + + It is assumed that if `stack_pos == 0` then the instruction you are looking for is + the one at `instructions[start_index]`. This might not hold, in case of using `NOP` + instructions. + """ + LOGGER.debug(f"find_inst_that_added_elem_to_stack {start_index=} {stack_pos=}") + assert stack_pos >= 0 + for inst in reversed(instructions[: start_index + 1]): + if stack_pos == 0: + LOGGER.debug(f"Found it: {inst}") + return instructions.index(inst) + LOGGER.debug(f"Skipping {inst}") + stack_pos -= dis.stack_effect(inst.opcode, inst.arg) + + raise Exception("inst_index_for_stack_diff failed") + + +def expr_from_instruction(instructions: List[Instruction], index: int) -> BytecodeExpr: + inst = instructions[index] + + LOGGER.debug(f"expr_from_instruction: {inst} {index=}") + + if inst.opname in ["LOAD_GLOBAL", "LOAD_FAST", "LOAD_NAME"]: + return BytecodeVariableName(inst.argval) + + # https://docs.python.org/3/library/dis.html#opcode-LOAD_METHOD + # https://docs.python.org/3/library/dis.html#opcode-LOAD_ATTR + elif inst.opname in ["LOAD_METHOD", "LOAD_ATTR"]: + attr_name = inst.argval + obj_index = find_inst_that_added_elem_to_stack(instructions, index - 1, 0) + obj_expr = expr_from_instruction(instructions, obj_index) + return BytecodeAttribute(attr_name=attr_name, object=obj_expr) + + # https://docs.python.org/3/library/dis.html#opcode-CALL_FUNCTION + elif inst.opname in ["CALL_FUNCTION", "CALL_METHOD", "CALL_FUNCTION_KW"]: + assert index > 0 + assert isinstance(inst.arg, int) + if inst.opname in ["CALL_FUNCTION", "CALL_METHOD"]: + num_stack_elems = inst.arg + elif inst.opname == "CALL_FUNCTION_KW": + num_stack_elems = inst.arg + 1 + + func_index = find_inst_that_added_elem_to_stack( + instructions, index - 1, num_stack_elems + ) + func_expr = expr_from_instruction(instructions, func_index) + return BytecodeCall(function=func_expr) + + else: + # LOAD_BUILD_CLASS is included here intentionally for now, since I don't really + # know what to do about it. + LOGGER.warning(f"Don't know how to handle this type of instruction: {inst}") + return BytecodeUnknown(inst.opname) + + +def expr_from_frame(frame: FrameType) -> BytecodeExpr: + bytecode = dis.Bytecode(frame.f_code, current_offset=frame.f_lasti) + + LOGGER.debug(f"bytecode: \n{bytecode.dis()}") + + instructions = list(iter(bytecode)) + last_instruction_index = [inst.offset for inst in instructions].index(frame.f_lasti) + return expr_from_instruction(instructions, last_instruction_index) diff --git a/python/tools/recorded-call-graph-metrics/src/cg_trace/cmdline.py b/python/tools/recorded-call-graph-metrics/src/cg_trace/cmdline.py index aaf047315d3..ddbeeaf50dc 100644 --- a/python/tools/recorded-call-graph-metrics/src/cg_trace/cmdline.py +++ b/python/tools/recorded-call-graph-metrics/src/cg_trace/cmdline.py @@ -4,7 +4,6 @@ import argparse def parse(args): parser = argparse.ArgumentParser() - parser.add_argument("--csv") parser.add_argument("--xml") parser.add_argument("progname", help="file to run as main program") diff --git a/python/tools/recorded-call-graph-metrics/src/cg_trace/exporter.py b/python/tools/recorded-call-graph-metrics/src/cg_trace/exporter.py index bee0d5dc0f6..b6051a01a16 100644 --- a/python/tools/recorded-call-graph-metrics/src/cg_trace/exporter.py +++ b/python/tools/recorded-call-graph-metrics/src/cg_trace/exporter.py @@ -1,57 +1,40 @@ -import csv import dataclasses +from typing import Dict from lxml import etree -class Exporter: +def dataclass_to_xml(obj, parent): + obj_elem = etree.SubElement(parent, obj.__class__.__name__) + for field in dataclasses.fields(obj): + field_elem = etree.SubElement(obj_elem, field.name) + value = getattr(obj, field.name) + if isinstance(value, (str, int)) or value is None: + field_elem.text = str(value) + elif dataclasses.is_dataclass(value): + dataclass_to_xml(value, field_elem) + else: + raise ValueError( + f"Can't export key {field.name!r} with value {value!r} (type {type(value)}" + ) + + +class XMLExporter: @staticmethod - def export(recorded_calls, outfile_path): - raise NotImplementedError() - - @staticmethod - def dataclass_to_dict(obj): - d = dataclasses.asdict(obj) - prefix = obj.__class__.__name__.lower() - return {f"{prefix}_{key}": val for (key, val) in d.items()} - - -class CSVExporter(Exporter): - @staticmethod - def export(recorded_calls, outfile_path): - with open(outfile_path, "w", newline="") as csv_file: - writer = None - for (call, callee) in sorted(recorded_calls): - data = { - **Exporter.dataclass_to_dict(call), - **Exporter.dataclass_to_dict(callee), - } - - if writer is None: - writer = csv.DictWriter(csv_file, fieldnames=data.keys()) - writer.writeheader() - - writer.writerow(data) - - print(f"output written to {outfile_path}") - - -class XMLExporter(Exporter): - @staticmethod - def export(recorded_calls, outfile_path): + def export(outfile_path, recorded_calls, info: Dict[str, str]): root = etree.Element("root") - for (call, callee) in sorted(recorded_calls): - data = { - **Exporter.dataclass_to_dict(call), - **Exporter.dataclass_to_dict(callee), - } + info_elem = etree.SubElement(root, "info") + for k, v in info.items(): + etree.SubElement(info_elem, k).text = v - rc = etree.SubElement(root, "recorded_call") - for k, v in data.items(): - # xml library only supports serializing attributes that have string values - rc.set(k, str(v)) + rcs = etree.SubElement(root, "recorded_calls") + + for (call, callee) in sorted(recorded_calls): + rc = etree.SubElement(rcs, "recorded_call") + dataclass_to_xml(call, rc) + dataclass_to_xml(callee, rc) tree = etree.ElementTree(root) tree.write(outfile_path, encoding="utf-8", pretty_print=True) diff --git a/python/tools/recorded-call-graph-metrics/src/cg_trace/main.py b/python/tools/recorded-call-graph-metrics/src/cg_trace/main.py index 77aed2146c9..3691fe4149b 100644 --- a/python/tools/recorded-call-graph-metrics/src/cg_trace/main.py +++ b/python/tools/recorded-call-graph-metrics/src/cg_trace/main.py @@ -1,10 +1,12 @@ import logging import os import sys +import time +from datetime import datetime from io import StringIO -from cg_trace import cmdline, tracer -from cg_trace.exporter import CSVExporter, XMLExporter +from cg_trace import __version__, cmdline, tracer +from cg_trace.exporter import XMLExporter def record_calls(code, globals): @@ -17,15 +19,19 @@ def record_calls(code, globals): sys.stderr = captured_stderr cgt = tracer.CallGraphTracer() - cgt.run(code, globals, globals) + exit_status = cgt.run(code, globals, globals) sys.stdout = real_stdout sys.stderr = real_stderr - return sorted(cgt.recorded_calls), captured_stdout, captured_stderr + return sorted(cgt.recorded_calls), captured_stdout, captured_stderr, exit_status def main(args=None) -> int: - logging.basicConfig(stream=sys.stderr, level=logging.DEBUG) + logging.basicConfig(stream=sys.stderr, level=logging.INFO) + + from . import bytecode_reconstructor + + logging.getLogger(bytecode_reconstructor.__name__).setLevel(logging.INFO) if args is None: # first element in argv is program name @@ -49,14 +55,27 @@ def main(args=None) -> int: "__cached__": None, } - recorded_calls, captured_stdout, captured_stderr = record_calls(code, globs) + start = time.time() + recorded_calls, captured_stdout, captured_stderr, exit_status = record_calls( + code, globs + ) + end = time.time() + elapsed_formatted = f"{end-start:.2f} seconds" - if opts.csv: - CSVExporter.export(recorded_calls, opts.csv) - elif opts.xml: - XMLExporter.export(recorded_calls, opts.xml) + if opts.xml: + XMLExporter.export( + opts.xml, + recorded_calls, + info={ + "cg_trace_version": __version__, + "args": " ".join(args), + "exit_status": exit_status, + "elapsed": elapsed_formatted, + "utctimestamp": datetime.utcnow().replace(microsecond=0).isoformat(), + }, + ) else: - print("Recorded calls:") + print(f"--- Recorded calls (in {elapsed_formatted}) ---") for (call, callee) in recorded_calls: print(f"{call} --> {callee}") diff --git a/python/tools/recorded-call-graph-metrics/src/cg_trace/tracer.py b/python/tools/recorded-call-graph-metrics/src/cg_trace/tracer.py index 87835c4bf9e..25a49943214 100644 --- a/python/tools/recorded-call-graph-metrics/src/cg_trace/tracer.py +++ b/python/tools/recorded-call-graph-metrics/src/cg_trace/tracer.py @@ -1,11 +1,12 @@ import dataclasses -import dis import logging import os import sys from types import FrameType from typing import Optional +from cg_trace.bytecode_reconstructor import BytecodeExpr, expr_from_frame + LOGGER = logging.getLogger(__name__) @@ -42,18 +43,26 @@ class Call: filename: str linenum: int inst_index: int + bytecode_expr: BytecodeExpr + + def __str__(self): + d = dataclasses.asdict(self) + del d["bytecode_expr"] + normal_fields = ", ".join(f"{k}={v!r}" for k, v in d.items()) + + return f"{type(self).__name__}({normal_fields}, bytecode_expr≈{repr(str(self.bytecode_expr))})" @classmethod def from_frame(cls, frame: FrameType): code = frame.f_code - b = dis.Bytecode(frame.f_code, current_offset=frame.f_lasti) - LOGGER.debug(f"bytecode: \n{b.dis()}") + bytecode_expr = expr_from_frame(frame) return cls( filename=canonic_filename(code.co_filename), linenum=frame.f_lineno, inst_index=frame.f_lasti, + bytecode_expr=bytecode_expr, ) @@ -165,9 +174,11 @@ class CallGraphTracer: sys.setprofile(self.profilefunc) exec(code, globals, locals) sys.setprofile(None) + return "completed" except Exception: sys.setprofile(None) LOGGER.info("Exception occurred while running program:", exc_info=True) + return "exception occurred" def profilefunc(self, frame: FrameType, event: str, arg): # ignore everything until the first call, since that is `exec` from the `run` @@ -191,6 +202,7 @@ class CallGraphTracer: LOGGER.debug(f"profilefunc {event=}") if event == "call": + assert frame.f_back is not None # in call, the `frame` argument is new the frame for entering the callee call = Call.from_frame(frame.f_back) callee = PythonCallee.from_frame(frame)