Python: CG trace: reconstruct call expr from bytecode

So we can differentiate multiple calls in one line.
This commit is contained in:
Rasmus Wriedt Larsen
2020-07-20 11:28:05 +02:00
parent d46b410111
commit c2748bf7cf
9 changed files with 447 additions and 71 deletions

View File

@@ -1,6 +1,105 @@
<root> <root>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py" call_linenum="1" call_inst_index="4" externalcallee_module="builtins" externalcallee_qualname="print" externalcallee_is_builtin="True"/> <info>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py" call_linenum="2" call_inst_index="12" externalcallee_module="builtins" externalcallee_qualname="len" externalcallee_is_builtin="True"/> <cg_trace_version>0.0.2</cg_trace_version>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py" call_linenum="4" call_inst_index="28" externalcallee_module="None" externalcallee_qualname="list.append" externalcallee_is_builtin="True"/> <args>--xml example-traces/builtins.xml example/builtins.py</args>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py" call_linenum="7" call_inst_index="44" externalcallee_module="sys" externalcallee_qualname="getdefaultencoding" externalcallee_is_builtin="True"/> <exit_status>completed</exit_status>
<elapsed>0.00 seconds</elapsed>
<utctimestamp>2020-07-20T09:27:00</utctimestamp>
</info>
<recorded_calls>
<recorded_call>
<Call>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py</filename>
<linenum>1</linenum>
<inst_index>4</inst_index>
<bytecode_expr>
<BytecodeCall>
<function>
<BytecodeVariableName>
<name>print</name>
</BytecodeVariableName>
</function>
</BytecodeCall>
</bytecode_expr>
</Call>
<ExternalCallee>
<module>builtins</module>
<qualname>print</qualname>
<is_builtin>True</is_builtin>
</ExternalCallee>
</recorded_call>
<recorded_call>
<Call>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py</filename>
<linenum>2</linenum>
<inst_index>12</inst_index>
<bytecode_expr>
<BytecodeCall>
<function>
<BytecodeVariableName>
<name>len</name>
</BytecodeVariableName>
</function>
</BytecodeCall>
</bytecode_expr>
</Call>
<ExternalCallee>
<module>builtins</module>
<qualname>len</qualname>
<is_builtin>True</is_builtin>
</ExternalCallee>
</recorded_call>
<recorded_call>
<Call>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py</filename>
<linenum>4</linenum>
<inst_index>28</inst_index>
<bytecode_expr>
<BytecodeCall>
<function>
<BytecodeAttribute>
<attr_name>append</attr_name>
<object>
<BytecodeVariableName>
<name>l</name>
</BytecodeVariableName>
</object>
</BytecodeAttribute>
</function>
</BytecodeCall>
</bytecode_expr>
</Call>
<ExternalCallee>
<module>None</module>
<qualname>list.append</qualname>
<is_builtin>True</is_builtin>
</ExternalCallee>
</recorded_call>
<recorded_call>
<Call>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py</filename>
<linenum>7</linenum>
<inst_index>44</inst_index>
<bytecode_expr>
<BytecodeCall>
<function>
<BytecodeAttribute>
<attr_name>getdefaultencoding</attr_name>
<object>
<BytecodeVariableName>
<name>sys</name>
</BytecodeVariableName>
</object>
</BytecodeAttribute>
</function>
</BytecodeCall>
</bytecode_expr>
</Call>
<ExternalCallee>
<module>sys</module>
<qualname>getdefaultencoding</qualname>
<is_builtin>True</is_builtin>
</ExternalCallee>
</recorded_call>
</recorded_calls>
</root> </root>

View File

@@ -1,8 +1,137 @@
<root> <root>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="2" call_inst_index="4" externalcallee_module="builtins" externalcallee_qualname="print" externalcallee_is_builtin="True"/> <info>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="5" call_inst_index="4" externalcallee_module="builtins" externalcallee_qualname="print" externalcallee_is_builtin="True"/> <cg_trace_version>0.0.2</cg_trace_version>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="7" call_inst_index="18" pythoncallee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" pythoncallee_linenum="1" pythoncallee_funcname="foo"/> <args>--xml example-traces/simple.xml example/simple.py</args>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="8" call_inst_index="24" pythoncallee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" pythoncallee_linenum="4" pythoncallee_funcname="bar"/> <exit_status>completed</exit_status>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="30" pythoncallee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" pythoncallee_linenum="1" pythoncallee_funcname="foo"/> <elapsed>0.00 seconds</elapsed>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="36" pythoncallee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" pythoncallee_linenum="4" pythoncallee_funcname="bar"/> <utctimestamp>2020-07-20T09:27:01</utctimestamp>
</info>
<recorded_calls>
<recorded_call>
<Call>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py</filename>
<linenum>2</linenum>
<inst_index>4</inst_index>
<bytecode_expr>
<BytecodeCall>
<function>
<BytecodeVariableName>
<name>print</name>
</BytecodeVariableName>
</function>
</BytecodeCall>
</bytecode_expr>
</Call>
<ExternalCallee>
<module>builtins</module>
<qualname>print</qualname>
<is_builtin>True</is_builtin>
</ExternalCallee>
</recorded_call>
<recorded_call>
<Call>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py</filename>
<linenum>5</linenum>
<inst_index>4</inst_index>
<bytecode_expr>
<BytecodeCall>
<function>
<BytecodeVariableName>
<name>print</name>
</BytecodeVariableName>
</function>
</BytecodeCall>
</bytecode_expr>
</Call>
<ExternalCallee>
<module>builtins</module>
<qualname>print</qualname>
<is_builtin>True</is_builtin>
</ExternalCallee>
</recorded_call>
<recorded_call>
<Call>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py</filename>
<linenum>7</linenum>
<inst_index>18</inst_index>
<bytecode_expr>
<BytecodeCall>
<function>
<BytecodeVariableName>
<name>foo</name>
</BytecodeVariableName>
</function>
</BytecodeCall>
</bytecode_expr>
</Call>
<PythonCallee>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py</filename>
<linenum>1</linenum>
<funcname>foo</funcname>
</PythonCallee>
</recorded_call>
<recorded_call>
<Call>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py</filename>
<linenum>8</linenum>
<inst_index>24</inst_index>
<bytecode_expr>
<BytecodeCall>
<function>
<BytecodeVariableName>
<name>bar</name>
</BytecodeVariableName>
</function>
</BytecodeCall>
</bytecode_expr>
</Call>
<PythonCallee>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py</filename>
<linenum>4</linenum>
<funcname>bar</funcname>
</PythonCallee>
</recorded_call>
<recorded_call>
<Call>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py</filename>
<linenum>10</linenum>
<inst_index>30</inst_index>
<bytecode_expr>
<BytecodeCall>
<function>
<BytecodeVariableName>
<name>foo</name>
</BytecodeVariableName>
</function>
</BytecodeCall>
</bytecode_expr>
</Call>
<PythonCallee>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py</filename>
<linenum>1</linenum>
<funcname>foo</funcname>
</PythonCallee>
</recorded_call>
<recorded_call>
<Call>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py</filename>
<linenum>10</linenum>
<inst_index>36</inst_index>
<bytecode_expr>
<BytecodeCall>
<function>
<BytecodeVariableName>
<name>bar</name>
</BytecodeVariableName>
</function>
</BytecodeCall>
</bytecode_expr>
</Call>
<PythonCallee>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py</filename>
<linenum>4</linenum>
<funcname>bar</funcname>
</PythonCallee>
</recorded_call>
</recorded_calls>
</root> </root>

View File

@@ -4,7 +4,7 @@ from setuptools import find_packages, setup
setup( setup(
name="cg_trace", name="cg_trace",
version="0.0.1", version="0.0.2", # Remember to update src/cg_trace/__init__.py
description="Call graph tracing", description="Call graph tracing",
packages=find_packages("src"), packages=find_packages("src"),
package_dir={"": "src"}, package_dir={"": "src"},

View File

@@ -1,6 +1,6 @@
import sys import sys
__version__ = "0.0.1" __version__ = "0.0.2" # remember to update setup.py
# Since the virtual machine opcodes changed in 3.6, not going to attempt to support # Since the virtual machine opcodes changed in 3.6, not going to attempt to support
# anything before that. Using dataclasses, which is a new feature in Python 3.7 # anything before that. Using dataclasses, which is a new feature in Python 3.7

View File

@@ -0,0 +1,135 @@
import dataclasses
import dis
import logging
from dis import Instruction
from types import FrameType
from typing import List
LOGGER = logging.getLogger(__name__)
# See https://docs.python.org/3/library/dis.html#python-bytecode-instructions for
# details on the bytecode instructions
# TODO: read https://opensource.com/article/18/4/introduction-python-bytecode
class BytecodeExpr:
"""An expression reconstructed from Python bytecode
"""
@dataclasses.dataclass(frozen=True, eq=True, order=True)
class BytecodeVariableName(BytecodeExpr):
name: str
def __str__(self):
return self.name
@dataclasses.dataclass(frozen=True, eq=True, order=True)
class BytecodeAttribute(BytecodeExpr):
attr_name: str
object: BytecodeExpr
def __str__(self):
return f"{self.object}.{self.attr_name}"
@dataclasses.dataclass(frozen=True, eq=True, order=True)
class BytecodeCall(BytecodeExpr):
function: BytecodeExpr
def __str__(self):
return f"{self.function}()"
@dataclasses.dataclass(frozen=True, eq=True, order=True)
class BytecodeUnknown(BytecodeExpr):
opname: str
def __str__(self):
return f"<{self.opname}>"
def find_inst_that_added_elem_to_stack(
instructions: List[Instruction], start_index: int, stack_pos: int
):
"""Backwards traverse instructions
Backwards traverse the instructions starting at `start_index` until we find the
instruction that added the element at stack position `stack_pos` (where 0 means top
of stack). For example, if the instructions are:
```
0: LOAD_GLOBAL 0 (func)
1: LOAD_CONST 1 (42)
2: CALL_FUNCTION 1
```
We can look for the function that is called by invoking this function with
`start_index = 1` and `stack_pos = 1`. It will see that `LOAD_CONST` added the top
element to the stack, and find that `LOAD_GLOBAL` was the instruction to add element
in stack position 1 to the stack -- so the index 0 is returned.
It is assumed that if `stack_pos == 0` then the instruction you are looking for is
the one at `instructions[start_index]`. This might not hold, in case of using `NOP`
instructions.
"""
LOGGER.debug(f"find_inst_that_added_elem_to_stack {start_index=} {stack_pos=}")
assert stack_pos >= 0
for inst in reversed(instructions[: start_index + 1]):
if stack_pos == 0:
LOGGER.debug(f"Found it: {inst}")
return instructions.index(inst)
LOGGER.debug(f"Skipping {inst}")
stack_pos -= dis.stack_effect(inst.opcode, inst.arg)
raise Exception("inst_index_for_stack_diff failed")
def expr_from_instruction(instructions: List[Instruction], index: int) -> BytecodeExpr:
inst = instructions[index]
LOGGER.debug(f"expr_from_instruction: {inst} {index=}")
if inst.opname in ["LOAD_GLOBAL", "LOAD_FAST", "LOAD_NAME"]:
return BytecodeVariableName(inst.argval)
# https://docs.python.org/3/library/dis.html#opcode-LOAD_METHOD
# https://docs.python.org/3/library/dis.html#opcode-LOAD_ATTR
elif inst.opname in ["LOAD_METHOD", "LOAD_ATTR"]:
attr_name = inst.argval
obj_index = find_inst_that_added_elem_to_stack(instructions, index - 1, 0)
obj_expr = expr_from_instruction(instructions, obj_index)
return BytecodeAttribute(attr_name=attr_name, object=obj_expr)
# https://docs.python.org/3/library/dis.html#opcode-CALL_FUNCTION
elif inst.opname in ["CALL_FUNCTION", "CALL_METHOD", "CALL_FUNCTION_KW"]:
assert index > 0
assert isinstance(inst.arg, int)
if inst.opname in ["CALL_FUNCTION", "CALL_METHOD"]:
num_stack_elems = inst.arg
elif inst.opname == "CALL_FUNCTION_KW":
num_stack_elems = inst.arg + 1
func_index = find_inst_that_added_elem_to_stack(
instructions, index - 1, num_stack_elems
)
func_expr = expr_from_instruction(instructions, func_index)
return BytecodeCall(function=func_expr)
else:
# LOAD_BUILD_CLASS is included here intentionally for now, since I don't really
# know what to do about it.
LOGGER.warning(f"Don't know how to handle this type of instruction: {inst}")
return BytecodeUnknown(inst.opname)
def expr_from_frame(frame: FrameType) -> BytecodeExpr:
bytecode = dis.Bytecode(frame.f_code, current_offset=frame.f_lasti)
LOGGER.debug(f"bytecode: \n{bytecode.dis()}")
instructions = list(iter(bytecode))
last_instruction_index = [inst.offset for inst in instructions].index(frame.f_lasti)
return expr_from_instruction(instructions, last_instruction_index)

View File

@@ -4,7 +4,6 @@ import argparse
def parse(args): def parse(args):
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--csv")
parser.add_argument("--xml") parser.add_argument("--xml")
parser.add_argument("progname", help="file to run as main program") parser.add_argument("progname", help="file to run as main program")

View File

@@ -1,57 +1,40 @@
import csv
import dataclasses import dataclasses
from typing import Dict
from lxml import etree from lxml import etree
class Exporter: def dataclass_to_xml(obj, parent):
obj_elem = etree.SubElement(parent, obj.__class__.__name__)
for field in dataclasses.fields(obj):
field_elem = etree.SubElement(obj_elem, field.name)
value = getattr(obj, field.name)
if isinstance(value, (str, int)) or value is None:
field_elem.text = str(value)
elif dataclasses.is_dataclass(value):
dataclass_to_xml(value, field_elem)
else:
raise ValueError(
f"Can't export key {field.name!r} with value {value!r} (type {type(value)}"
)
class XMLExporter:
@staticmethod @staticmethod
def export(recorded_calls, outfile_path): def export(outfile_path, recorded_calls, info: Dict[str, str]):
raise NotImplementedError()
@staticmethod
def dataclass_to_dict(obj):
d = dataclasses.asdict(obj)
prefix = obj.__class__.__name__.lower()
return {f"{prefix}_{key}": val for (key, val) in d.items()}
class CSVExporter(Exporter):
@staticmethod
def export(recorded_calls, outfile_path):
with open(outfile_path, "w", newline="") as csv_file:
writer = None
for (call, callee) in sorted(recorded_calls):
data = {
**Exporter.dataclass_to_dict(call),
**Exporter.dataclass_to_dict(callee),
}
if writer is None:
writer = csv.DictWriter(csv_file, fieldnames=data.keys())
writer.writeheader()
writer.writerow(data)
print(f"output written to {outfile_path}")
class XMLExporter(Exporter):
@staticmethod
def export(recorded_calls, outfile_path):
root = etree.Element("root") root = etree.Element("root")
for (call, callee) in sorted(recorded_calls): info_elem = etree.SubElement(root, "info")
data = { for k, v in info.items():
**Exporter.dataclass_to_dict(call), etree.SubElement(info_elem, k).text = v
**Exporter.dataclass_to_dict(callee),
}
rc = etree.SubElement(root, "recorded_call") rcs = etree.SubElement(root, "recorded_calls")
for k, v in data.items():
# xml library only supports serializing attributes that have string values for (call, callee) in sorted(recorded_calls):
rc.set(k, str(v)) rc = etree.SubElement(rcs, "recorded_call")
dataclass_to_xml(call, rc)
dataclass_to_xml(callee, rc)
tree = etree.ElementTree(root) tree = etree.ElementTree(root)
tree.write(outfile_path, encoding="utf-8", pretty_print=True) tree.write(outfile_path, encoding="utf-8", pretty_print=True)

View File

@@ -1,10 +1,12 @@
import logging import logging
import os import os
import sys import sys
import time
from datetime import datetime
from io import StringIO from io import StringIO
from cg_trace import cmdline, tracer from cg_trace import __version__, cmdline, tracer
from cg_trace.exporter import CSVExporter, XMLExporter from cg_trace.exporter import XMLExporter
def record_calls(code, globals): def record_calls(code, globals):
@@ -17,15 +19,19 @@ def record_calls(code, globals):
sys.stderr = captured_stderr sys.stderr = captured_stderr
cgt = tracer.CallGraphTracer() cgt = tracer.CallGraphTracer()
cgt.run(code, globals, globals) exit_status = cgt.run(code, globals, globals)
sys.stdout = real_stdout sys.stdout = real_stdout
sys.stderr = real_stderr sys.stderr = real_stderr
return sorted(cgt.recorded_calls), captured_stdout, captured_stderr return sorted(cgt.recorded_calls), captured_stdout, captured_stderr, exit_status
def main(args=None) -> int: def main(args=None) -> int:
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG) logging.basicConfig(stream=sys.stderr, level=logging.INFO)
from . import bytecode_reconstructor
logging.getLogger(bytecode_reconstructor.__name__).setLevel(logging.INFO)
if args is None: if args is None:
# first element in argv is program name # first element in argv is program name
@@ -49,14 +55,27 @@ def main(args=None) -> int:
"__cached__": None, "__cached__": None,
} }
recorded_calls, captured_stdout, captured_stderr = record_calls(code, globs) start = time.time()
recorded_calls, captured_stdout, captured_stderr, exit_status = record_calls(
code, globs
)
end = time.time()
elapsed_formatted = f"{end-start:.2f} seconds"
if opts.csv: if opts.xml:
CSVExporter.export(recorded_calls, opts.csv) XMLExporter.export(
elif opts.xml: opts.xml,
XMLExporter.export(recorded_calls, opts.xml) recorded_calls,
info={
"cg_trace_version": __version__,
"args": " ".join(args),
"exit_status": exit_status,
"elapsed": elapsed_formatted,
"utctimestamp": datetime.utcnow().replace(microsecond=0).isoformat(),
},
)
else: else:
print("Recorded calls:") print(f"--- Recorded calls (in {elapsed_formatted}) ---")
for (call, callee) in recorded_calls: for (call, callee) in recorded_calls:
print(f"{call} --> {callee}") print(f"{call} --> {callee}")

View File

@@ -1,11 +1,12 @@
import dataclasses import dataclasses
import dis
import logging import logging
import os import os
import sys import sys
from types import FrameType from types import FrameType
from typing import Optional from typing import Optional
from cg_trace.bytecode_reconstructor import BytecodeExpr, expr_from_frame
LOGGER = logging.getLogger(__name__) LOGGER = logging.getLogger(__name__)
@@ -42,18 +43,26 @@ class Call:
filename: str filename: str
linenum: int linenum: int
inst_index: int inst_index: int
bytecode_expr: BytecodeExpr
def __str__(self):
d = dataclasses.asdict(self)
del d["bytecode_expr"]
normal_fields = ", ".join(f"{k}={v!r}" for k, v in d.items())
return f"{type(self).__name__}({normal_fields}, bytecode_expr≈{repr(str(self.bytecode_expr))})"
@classmethod @classmethod
def from_frame(cls, frame: FrameType): def from_frame(cls, frame: FrameType):
code = frame.f_code code = frame.f_code
b = dis.Bytecode(frame.f_code, current_offset=frame.f_lasti) bytecode_expr = expr_from_frame(frame)
LOGGER.debug(f"bytecode: \n{b.dis()}")
return cls( return cls(
filename=canonic_filename(code.co_filename), filename=canonic_filename(code.co_filename),
linenum=frame.f_lineno, linenum=frame.f_lineno,
inst_index=frame.f_lasti, inst_index=frame.f_lasti,
bytecode_expr=bytecode_expr,
) )
@@ -165,9 +174,11 @@ class CallGraphTracer:
sys.setprofile(self.profilefunc) sys.setprofile(self.profilefunc)
exec(code, globals, locals) exec(code, globals, locals)
sys.setprofile(None) sys.setprofile(None)
return "completed"
except Exception: except Exception:
sys.setprofile(None) sys.setprofile(None)
LOGGER.info("Exception occurred while running program:", exc_info=True) LOGGER.info("Exception occurred while running program:", exc_info=True)
return "exception occurred"
def profilefunc(self, frame: FrameType, event: str, arg): def profilefunc(self, frame: FrameType, event: str, arg):
# ignore everything until the first call, since that is `exec` from the `run` # ignore everything until the first call, since that is `exec` from the `run`
@@ -191,6 +202,7 @@ class CallGraphTracer:
LOGGER.debug(f"profilefunc {event=}") LOGGER.debug(f"profilefunc {event=}")
if event == "call": if event == "call":
assert frame.f_back is not None
# in call, the `frame` argument is new the frame for entering the callee # in call, the `frame` argument is new the frame for entering the callee
call = Call.from_frame(frame.f_back) call = Call.from_frame(frame.f_back)
callee = PythonCallee.from_frame(frame) callee = PythonCallee.from_frame(frame)