Python: CG trace: Handle builtins

This commit is contained in:
Rasmus Wriedt Larsen
2020-07-16 18:04:04 +02:00
parent 92e8e1622c
commit 1c2e259970
9 changed files with 241 additions and 36 deletions

View File

@@ -14,4 +14,30 @@ The next hurdle is being able to handle multiple calls on the same line, such as
## How do I give it a spin?
Run the `recreate-db.sh` script to create the database `cg-trace-example-db`, which will include the `example/simple.xml` trace from executing the `example/simple.py` code. Then run the queries inside the `ql/` directory.
Run the `recreate-db.sh` script to create the database `cg-trace-example-db`. Then run the queries inside the `ql/` directory.
## Limitations
### Code that uses `sys.setprofile`
Since that is our mechanism for recording calls, any code that uses `sys.setprofile` will not work together with the call-graph tracer.
### Class instantiation
Class instantiation does not always fire off an event in the `sys.setprofile` function (nor in `sys.settrace`), so it is not always recorded. Example:
```
r = range(10)
```
when disassembled (`python -m dis <file>`):
```
9 48 LOAD_NAME 7 (range)
50 LOAD_CONST 5 (10)
52 CALL_FUNCTION 1
54 STORE_NAME 8 (r)
```
but no event :disappointed:

View File

@@ -1,5 +1,4 @@
#!/usr/bin/env python3
"""Call Graph tracing.
Execute a python program and for each call being made, record the call and callee. This
@@ -30,10 +29,14 @@ import dis
import dataclasses
import csv
from lxml import etree
from typing import Optional
# Copy-Paste and uncomment for interactive ipython sessions
# import IPython; IPython.embed(); sys.exit()
# copy-paste For interactive ipython sessions
# import IPython; sys.stdout = sys.__stdout__; IPython.embed(); sys.exit()
def debug_print(*args, **kwargs):
    """Debugging hook that accepts `print`-style arguments and does nothing.

    To get debug output on the real stderr, replace the body with:

        print(*args, **kwargs, file=sys.__stderr__)
    """
    return None
_canonic_filename_cache = dict()
def canonic_filename(filename):
@@ -67,8 +70,8 @@ class Call():
code = frame.f_code
# Uncomment to see the bytecode
# b = dis.Bytecode(frame.f_code, current_offset=frame.f_lasti)
# print(b.dis(), file=sys.__stderr__)
b = dis.Bytecode(frame.f_code, current_offset=frame.f_lasti)
debug_print(b.dis())
return cls(
filename = canonic_filename(code.co_filename),
@@ -77,8 +80,53 @@ class Call():
)
def better_compare_for_dataclass(cls):
    """Let the ordering methods of a dataclass cope with objects of other classes.

    When a dataclass is created with `order=True`, the generated comparison
    methods are only implemented for objects of the same class. This decorator
    wraps `__lt__`, `__le__`, `__gt__` and `__ge__` so that a comparison against
    an object of a different class compares the two class names (as strings)
    instead.

    The class is modified in place and returned, so this can be used as a
    class decorator placed above `@dataclasses.dataclass(...)`.
    """

    def make_comparison(op):
        # Bind the operator name and the original method once per operator.
        # Defining the wrapper directly inside the loop below would make every
        # wrapper look up the loop variables when called, i.e. all four
        # operators would behave like the last one (`__ge__`).
        original = getattr(cls, op)
        compare_names = getattr(str, op)

        def comparison(self, other):
            if type(self) == type(other):
                return original(self, other)
            return compare_names(self.__class__.__name__, other.__class__.__name__)

        comparison.__name__ = op
        return comparison

    for op in ("__lt__", "__le__", "__gt__", "__ge__"):
        setattr(cls, op, make_comparison(op))
    return cls
@dataclasses.dataclass(frozen=True, eq=True, order=True)
class Callee():
class Callee:
pass
BUILTIN_FUNCTION_OR_METHOD = type(print)
@better_compare_for_dataclass
@dataclasses.dataclass(frozen=True, eq=True, order=True)
class ExternalCallee(Callee):
    """A callee described by the module and qualified name of a function object."""

    # May be None: some bound methods report no module, for example
    # `list().append.__module__ is None`.
    module: Optional[str]
    qualname: str
    # True when the exact type of the function object is the type of `print`
    # (see BUILTIN_FUNCTION_OR_METHOD).
    is_builtin: bool

    @classmethod
    def from_arg(cls, func):
        """Create an instance from the function object `func`."""
        module_name = func.__module__
        qualified_name = func.__qualname__
        builtin = type(func) == BUILTIN_FUNCTION_OR_METHOD
        return cls(module=module_name, qualname=qualified_name, is_builtin=builtin)
@better_compare_for_dataclass
@dataclasses.dataclass(frozen=True, eq=True, order=True)
class PythonCallee(Callee):
"""A callee (Function/Lambda/???)
should (hopefully) be uniquely identified by its name and location (filename+line
@@ -92,32 +140,84 @@ class Callee():
def from_frame(cls, frame):
code = frame.f_code
return cls(
funcname = code.co_name,
filename = canonic_filename(code.co_filename),
linenum = frame.f_lineno,
funcname = code.co_name,
)
class CallGraphTracer(bdb.Bdb):
class CallGraphTracer:
"""Tracer that records calls being made
It would seem obvious that this should have extended `trace` library
(https://docs.python.org/3/library/trace.html), but that part is not extensible --
however, the basic debugger (bdb) is, and provides maybe a bit more help than just
using `sys.settrace` directly.
(https://docs.python.org/3/library/trace.html), but that part is not extensible.
You might think that we can just use `sys.settrace`
(https://docs.python.org/3.8/library/sys.html#sys.settrace) like the basic debugger
(bdb) does, but that isn't invoked on calls to C code, which we need in general, and
need for handling builtins specifically.
Luckily, `sys.setprofile`
(https://docs.python.org/3.8/library/sys.html#sys.setprofile) provides all that we
need. You might be scared by reading the following bit of the documentation
> The function is thread-specific, but there is no way for the profiler to know about
> context switches between threads, so it does not make sense to use this in the
> presence of multiple threads.
but that is to be understood in the context of making a profiler (you can't reliably
measure function execution time if you don't know about context switches). For our
use-case, this is not a problem.
"""
recorded_calls: set
def __init__(self):
self.recorded_calls = set()
super().__init__()
def user_call(self, frame, argument_list):
call = Call.from_frame(frame.f_back)
callee = Callee.from_frame(frame)
def run(self, code, globals, locals):
    """Execute `code` while recording calls with this tracer.

    Resets the `exec_call_seen` and `ignore_rest` flags read by
    `profilefunc`, installs `self.profilefunc` with `sys.setprofile`, and
    runs `exec(code, globals, locals)`. The profile function is always
    removed again, also when the executed code raises; the exception then
    propagates to the caller.
    """
    self.exec_call_seen = False
    self.ignore_rest = False
    try:
        # Install the bound method of *this* instance; do not rely on a
        # module-level variable that happens to hold the tracer.
        sys.setprofile(self.profilefunc)
        exec(code, globals, locals)
        # TODO: exception handling?
    finally:
        sys.setprofile(None)
# _print(f'{call} -> {callee}')
def profilefunc(self, frame, event, arg):
    """Profile callback (see `sys.setprofile`) that records "call" and "c_call" events.

    Each recorded entry is a `(call, callee)` tuple added to
    `self.recorded_calls`.
    """
    # Nothing is recorded until the first "call" event has been seen, since
    # that one is the `exec` from the `run` method. That event itself is
    # skipped as well.
    if not self.exec_call_seen:
        if event == "call":
            self.exec_call_seen = True
        return

    # Once `exec` returns to this file we are leaving the traced program, so
    # everything from here on is ignored (for example the call to
    # `sys.setprofile(None)`).
    leaving_exec = (
        event == "c_return"
        and arg == exec
        and frame.f_code.co_filename == __file__
    )
    if leaving_exec:
        self.ignore_rest = True
    if self.ignore_rest:
        return

    if event not in ("call", "c_call"):
        return

    debug_print(f"profilefunc {event=}")

    if event == "call":
        # For "call", `frame` is the new frame for entering the callee, so
        # the call site is found in the parent frame.
        call = Call.from_frame(frame.f_back)
        callee = PythonCallee.from_frame(frame)
    else:
        # For "c_call", `frame` is the frame where the call happens, and
        # `arg` is the C function object.
        call = Call.from_frame(frame)
        callee = ExternalCallee.from_arg(arg)

    debug_print(f'{call} --> {callee}')
    debug_print('\n'*5)

    self.recorded_calls.add((call, callee))
@@ -191,8 +291,6 @@ class XMLExporter(Exporter):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
@@ -236,7 +334,7 @@ if __name__ == "__main__":
XMLExporter.export(cgt.recorded_calls, opts.xml)
else:
for (call, callee) in sorted(cgt.recorded_calls):
print(f'{call} -> {callee}')
print(f'{call} --> {callee}')
print('--- captured stdout ---')
print(captured_stdout.getvalue(), end='')

View File

@@ -0,0 +1,6 @@
<root>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py" call_linenum="1" call_inst_index="4" externalcallee_module="builtins" externalcallee_qualname="print" externalcallee_is_builtin="True"/>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py" call_linenum="2" call_inst_index="12" externalcallee_module="builtins" externalcallee_qualname="len" externalcallee_is_builtin="True"/>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py" call_linenum="4" call_inst_index="28" externalcallee_module="None" externalcallee_qualname="list.append" externalcallee_is_builtin="True"/>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py" call_linenum="7" call_inst_index="44" externalcallee_module="sys" externalcallee_qualname="getdefaultencoding" externalcallee_is_builtin="True"/>
</root>

View File

@@ -1,6 +1,8 @@
<root>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="7" call_inst_index="18" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="1" callee_funcname="foo"/>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="8" call_inst_index="24" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="4" callee_funcname="bar"/>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="30" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="1" callee_funcname="foo"/>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="36" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="4" callee_funcname="bar"/>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="2" call_inst_index="4" externalcallee_module="builtins" externalcallee_qualname="print" externalcallee_is_builtin="True"/>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="5" call_inst_index="4" externalcallee_module="builtins" externalcallee_qualname="print" externalcallee_is_builtin="True"/>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="7" call_inst_index="18" pythoncallee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" pythoncallee_linenum="1" pythoncallee_funcname="foo"/>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="8" call_inst_index="24" pythoncallee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" pythoncallee_linenum="4" pythoncallee_funcname="bar"/>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="30" pythoncallee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" pythoncallee_linenum="1" pythoncallee_funcname="foo"/>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="36" pythoncallee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" pythoncallee_linenum="4" pythoncallee_funcname="bar"/>
</root>

View File

@@ -0,0 +1,9 @@
print("builtins test")
len("bar")
l = list()
l.append(42)
import sys
sys.getdefaultencoding()
r = range(10)

View File

@@ -1,9 +1,19 @@
import RecordedCalls
from ValidRecordedCall rc, Call call, Function callee, CallableValue calleeValue
import semmle.python.objects.Callables
from ValidRecordedCall rc, Call call, Value calleeValue
where
call = rc.getCall() and
callee = rc.getCallee() and
calleeValue.getScope() = callee and
calleeValue.getACall() = call.getAFlowNode()
select call, "-->", callee
calleeValue.getACall() = call.getAFlowNode() and
(
rc instanceof RecordedPythonCall and
calleeValue.(PythonFunctionValue).getScope() = rc.(RecordedPythonCall).getCallee()
or
rc instanceof RecordedBuiltinCall and
calleeValue.(BuiltinFunctionObjectInternal).getBuiltin() = rc.(RecordedBuiltinCall).getCallee()
or
rc instanceof RecordedBuiltinCall and
calleeValue.(BuiltinMethodObjectInternal).getBuiltin() = rc.(RecordedBuiltinCall).getCallee()
)
select call, "-->", calleeValue

View File

@@ -1,5 +1,7 @@
import python
import semmle.python.types.Builtins
class RecordedCall extends XMLElement {
RecordedCall() { this.hasName("recorded_call") }
@@ -13,24 +15,76 @@ class RecordedCall extends XMLElement {
// TODO: handle calls spanning multiple lines
result.getLocation().hasLocationInfo(this.call_filename(), this.call_linenum(), _, _, _)
}
}
string callee_filename() { result = this.getAttributeValue("callee_filename") }
class RecordedPythonCall extends RecordedCall {
RecordedPythonCall() {
this.hasAttribute("pythoncallee_filename") and
this.hasAttribute("pythoncallee_linenum") and
this.hasAttribute("pythoncallee_funcname")
}
int callee_linenum() { result = this.getAttributeValue("callee_linenum").toInt() }
string pythoncallee_filename() { result = this.getAttributeValue("pythoncallee_filename") }
string callee_funcname() { result = this.getAttributeValue("callee_funcname") }
int pythoncallee_linenum() { result = this.getAttributeValue("pythoncallee_linenum").toInt() }
string pythoncallee_funcname() { result = this.getAttributeValue("pythoncallee_funcname") }
Function getCallee() {
result.getLocation().hasLocationInfo(this.callee_filename(), this.callee_linenum(), _, _, _)
result.getLocation().hasLocationInfo(this.pythoncallee_filename(), this.pythoncallee_linenum(), _, _, _)
}
}
/**
 * A recorded call that carries the `externalcallee_module` and
 * `externalcallee_qualname` attributes and whose `externalcallee_is_builtin`
 * attribute is `"True"`.
 */
class RecordedBuiltinCall extends RecordedCall {
RecordedBuiltinCall() {
this.hasAttribute("externalcallee_module") and
this.hasAttribute("externalcallee_qualname") and
this.getAttributeValue("externalcallee_is_builtin") = "True"
}
/** Gets the value of the `externalcallee_module` attribute (the string `"None"` when the tracer recorded no module). */
string externalcallee_module() { result = this.getAttributeValue("externalcallee_module") }
/** Gets the value of the `externalcallee_qualname` attribute, for example `print` or `list.append`. */
string externalcallee_qualname() { result = this.getAttributeValue("externalcallee_qualname") }
/**
 * Gets the builtin found by resolving the recorded qualified name with
 * `traverse_qualname`, starting from the recorded module.
 */
Builtin getCallee() {
exists(Builtin mod |
// a module was recorded: start from the builtin module object with that name
not externalcallee_module() = "None" and
mod.isModule() and
mod.getName() = this.externalcallee_module()
or
// no module was recorded (the attribute is the text "None"): start from `Builtin::builtinModule()`
externalcallee_module() = "None" and
mod = Builtin::builtinModule()
|
result = traverse_qualname(mod, this.externalcallee_qualname())
)
}
}
/**
 * Gets the builtin reached from `parent` by following the dot-separated names
 * in `qualname` one member lookup at a time.
 *
 * A name equal to `__objclass__` is never followed.
 */
Builtin traverse_qualname(Builtin parent, string qualname) {
// base case: no dot in the name, so it is a direct member of `parent`
not qualname = "__objclass__" and
not qualname.matches("%.%") and
result = parent.getMember(qualname)
or
// recursive case: split at a dot, resolve the part before it as a member of
// `parent`, and continue with the remainder from there
qualname.matches("%.%") and
exists(string before_dot, string after_dot, Builtin intermediate_parent |
qualname = before_dot + "." + after_dot and
not before_dot = "__objclass__" and
intermediate_parent = parent.getMember(before_dot) and
result = traverse_qualname(intermediate_parent, after_dot)
)
}
/**
 * Class of recorded calls where we can uniquely identify both the `call` and the `callee`.
 */
class ValidRecordedCall extends RecordedCall {
  ValidRecordedCall() {
    // exactly one call in the database matches the recorded call site ...
    strictcount(this.getCall()) = 1 and
    // ... and exactly one callee matches. `getCallee` is declared on the
    // subclasses (with different result types), so each is checked through a cast.
    (
      strictcount(this.(RecordedPythonCall).getCallee()) = 1
      or
      strictcount(this.(RecordedBuiltinCall).getCallee()) = 1
    )
  }
}

View File

@@ -3,5 +3,4 @@ import RecordedCalls
from RecordedCall rc
where not rc instanceof ValidRecordedCall
select "Could not uniquely identify this recorded call (either call or callee was not uniquely identified)",
rc.call_filename(), rc.call_linenum(), rc.call_inst_index(), "-->", rc.callee_filename(),
rc.callee_linenum(), rc.callee_funcname()
rc, rc.call_filename(), rc.call_linenum(), rc.call_inst_index()

View File

@@ -10,6 +10,7 @@ PYTHON_EXTRACTOR=$(codeql resolve extractor --language=python)
./cg_trace.py --xml "$XMLDIR"/simple.xml example/simple.py
./cg_trace.py --xml "$XMLDIR"/builtins.xml example/builtins.py
rm -rf "$DB"