mirror of
https://github.com/github/codeql.git
synced 2025-12-16 16:53:25 +01:00
Python: CG trace: Handle builtins
This commit is contained in:
@@ -14,4 +14,30 @@ The next hurdle is being able to handle multiple calls on the same line, such as
|
||||
|
||||
## How do I give it a spin?
|
||||
|
||||
Run the `recreate-db.sh` script to create the database `cg-trace-example-db`, which will include the `example/simple.xml` trace from executing the `example/simple.py` code. Then run the queries inside the `ql/` directory.
|
||||
Run the `recreate-db.sh` script to create the database `cg-trace-example-db`. Then run the queries inside the `ql/` directory.
|
||||
|
||||
|
||||
## Limitations
|
||||
|
||||
### Code that uses `sys.setprofile`
|
||||
|
||||
Since that is our mechanism for recording calls, any code that uses `sys.setprofile` will not work together with the call-graph tracer.
|
||||
|
||||
### Class instantiation
|
||||
|
||||
Does not always fire off an event in the `sys.setprofile` function (neither in `sys.settrace`), so is not recorded. Example:
|
||||
|
||||
```
|
||||
r = range(10)
|
||||
```
|
||||
|
||||
when disassembled (`python -m dis <file>`):
|
||||
|
||||
```
|
||||
9 48 LOAD_NAME 7 (range)
|
||||
50 LOAD_CONST 5 (10)
|
||||
52 CALL_FUNCTION 1
|
||||
54 STORE_NAME 8 (r)
|
||||
```
|
||||
|
||||
but no event :disappointed:
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""Call Graph tracing.
|
||||
|
||||
Execute a python program and for each call being made, record the call and callee. This
|
||||
@@ -30,10 +29,14 @@ import dis
|
||||
import dataclasses
|
||||
import csv
|
||||
from lxml import etree
|
||||
from typing import Optional
|
||||
|
||||
# Copy-Paste and uncomment for interactive ipython sessions
|
||||
# import IPython; IPython.embed(); sys.exit()
|
||||
# copy-paste For interactive ipython sessions
|
||||
# import IPython; sys.stdout = sys.__stdout__; IPython.embed(); sys.exit()
|
||||
|
||||
def debug_print(*args, **kwargs):
|
||||
# print(*args, **kwargs, file=sys.__stderr__)
|
||||
pass
|
||||
|
||||
_canonic_filename_cache = dict()
|
||||
def canonic_filename(filename):
|
||||
@@ -67,8 +70,8 @@ class Call():
|
||||
code = frame.f_code
|
||||
|
||||
# Uncomment to see the bytecode
|
||||
# b = dis.Bytecode(frame.f_code, current_offset=frame.f_lasti)
|
||||
# print(b.dis(), file=sys.__stderr__)
|
||||
b = dis.Bytecode(frame.f_code, current_offset=frame.f_lasti)
|
||||
debug_print(b.dis())
|
||||
|
||||
return cls(
|
||||
filename = canonic_filename(code.co_filename),
|
||||
@@ -77,8 +80,53 @@ class Call():
|
||||
)
|
||||
|
||||
|
||||
def better_compare_for_dataclass(cls):
|
||||
"""When dataclass is used with `order=True`, the comparison methods is only implemented for
|
||||
objects of the same class. This decorator extends the functionality to compare class
|
||||
name if used against other objects.
|
||||
"""
|
||||
for op in ['__lt__', '__le__', '__gt__', '__ge__',]:
|
||||
old = getattr(cls, op)
|
||||
def new(self, other):
|
||||
if type(self) == type(other):
|
||||
return old(self, other)
|
||||
return getattr(str, op)(self.__class__.__name__, other.__class__.__name__)
|
||||
setattr(cls, op, new)
|
||||
return cls
|
||||
|
||||
@dataclasses.dataclass(frozen=True, eq=True, order=True)
|
||||
class Callee():
|
||||
class Callee:
|
||||
pass
|
||||
|
||||
|
||||
BUILTIN_FUNCTION_OR_METHOD = type(print)
|
||||
|
||||
|
||||
@better_compare_for_dataclass
|
||||
@dataclasses.dataclass(frozen=True, eq=True, order=True)
|
||||
class ExternalCallee(Callee):
|
||||
# Some bound methods might not have __module__ attribute: for example,
|
||||
# `list().append.__module__ is None`
|
||||
module: Optional[str]
|
||||
qualname: str
|
||||
#
|
||||
is_builtin: bool
|
||||
|
||||
@classmethod
|
||||
def from_arg(cls, func):
|
||||
# if func.__name__ == "append":
|
||||
# import IPython; sys.stdout = sys.__stdout__; IPython.embed(); sys.exit()
|
||||
|
||||
return cls(
|
||||
module=func.__module__,
|
||||
qualname=func.__qualname__,
|
||||
is_builtin=type(func) == BUILTIN_FUNCTION_OR_METHOD
|
||||
)
|
||||
|
||||
|
||||
@better_compare_for_dataclass
|
||||
@dataclasses.dataclass(frozen=True, eq=True, order=True)
|
||||
class PythonCallee(Callee):
|
||||
"""A callee (Function/Lambda/???)
|
||||
|
||||
should (hopefully) be uniquely identified by its name and location (filename+line
|
||||
@@ -92,32 +140,84 @@ class Callee():
|
||||
def from_frame(cls, frame):
|
||||
code = frame.f_code
|
||||
return cls(
|
||||
funcname = code.co_name,
|
||||
filename = canonic_filename(code.co_filename),
|
||||
linenum = frame.f_lineno,
|
||||
funcname = code.co_name,
|
||||
)
|
||||
|
||||
|
||||
class CallGraphTracer(bdb.Bdb):
|
||||
class CallGraphTracer:
|
||||
"""Tracer that records calls being made
|
||||
|
||||
It would seem obvious that this should have extended `trace` library
|
||||
(https://docs.python.org/3/library/trace.html), but that part is not extensible --
|
||||
however, the basic debugger (bdb) is, and provides maybe a bit more help than just
|
||||
using `sys.settrace` directly.
|
||||
(https://docs.python.org/3/library/trace.html), but that part is not extensible.
|
||||
|
||||
You might think that we can just use `sys.settrace`
|
||||
(https://docs.python.org/3.8/library/sys.html#sys.settrace) like the basic debugger
|
||||
(bdb) does, but that isn't invoked on calls to C code, which we need in general, and
|
||||
need for handling builtins specifically.
|
||||
|
||||
Luckily, `sys.setprofile`
|
||||
(https://docs.python.org/3.8/library/sys.html#sys.setprofile) provides all that we
|
||||
need. You might be scared by reading the following bit of the documentation
|
||||
|
||||
> The function is thread-specific, but there is no way for the profiler to know about
|
||||
> context switches between threads, so it does not make sense to use this in the
|
||||
> presence of multiple threads.
|
||||
|
||||
but that is to be understood in the context of making a profiler (you can't reliably
|
||||
measure function execution time if you don't know about context switches). For our
|
||||
use-case, this is not a problem.
|
||||
"""
|
||||
|
||||
recorded_calls: set
|
||||
|
||||
def __init__(self):
|
||||
self.recorded_calls = set()
|
||||
super().__init__()
|
||||
|
||||
def user_call(self, frame, argument_list):
|
||||
call = Call.from_frame(frame.f_back)
|
||||
callee = Callee.from_frame(frame)
|
||||
def run(self, code, globals, locals):
|
||||
self.exec_call_seen = False
|
||||
self.ignore_rest = False
|
||||
try:
|
||||
sys.setprofile(cgt.profilefunc)
|
||||
exec(code, globals, locals)
|
||||
# TODO: exception handling?
|
||||
finally:
|
||||
sys.setprofile(None)
|
||||
|
||||
# _print(f'{call} -> {callee}')
|
||||
def profilefunc(self, frame, event, arg):
|
||||
# ignore everything until the first call, since that is `exec` from the `run` method above
|
||||
if not self.exec_call_seen:
|
||||
if event == "call":
|
||||
self.exec_call_seen = True
|
||||
return
|
||||
|
||||
# if we're going out of the exec, we should ignore anything else (for example the
|
||||
# call to `sys.setprofile(None)`)
|
||||
if event == "c_return":
|
||||
if arg == exec and frame.f_code.co_filename == __file__:
|
||||
self.ignore_rest = True
|
||||
|
||||
if self.ignore_rest:
|
||||
return
|
||||
|
||||
if event not in ["call", "c_call"]:
|
||||
return
|
||||
|
||||
debug_print(f"profilefunc {event=}")
|
||||
if event == "call":
|
||||
# in call, the `frame` argument is new the frame for entering the callee
|
||||
call = Call.from_frame(frame.f_back)
|
||||
callee = PythonCallee.from_frame(frame)
|
||||
|
||||
if event == "c_call":
|
||||
# in c_call, the `frame` argument is frame where the call happens, and the `arg` argument
|
||||
# is the C function object.
|
||||
call = Call.from_frame(frame)
|
||||
callee = ExternalCallee.from_arg(arg)
|
||||
|
||||
debug_print(f'{call} --> {callee}')
|
||||
debug_print('\n'*5)
|
||||
self.recorded_calls.add((call, callee))
|
||||
|
||||
|
||||
@@ -191,8 +291,6 @@ class XMLExporter(Exporter):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
|
||||
@@ -236,7 +334,7 @@ if __name__ == "__main__":
|
||||
XMLExporter.export(cgt.recorded_calls, opts.xml)
|
||||
else:
|
||||
for (call, callee) in sorted(cgt.recorded_calls):
|
||||
print(f'{call} -> {callee}')
|
||||
print(f'{call} --> {callee}')
|
||||
|
||||
print('--- captured stdout ---')
|
||||
print(captured_stdout.getvalue(), end='')
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
<root>
|
||||
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py" call_linenum="1" call_inst_index="4" externalcallee_module="builtins" externalcallee_qualname="print" externalcallee_is_builtin="True"/>
|
||||
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py" call_linenum="2" call_inst_index="12" externalcallee_module="builtins" externalcallee_qualname="len" externalcallee_is_builtin="True"/>
|
||||
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py" call_linenum="4" call_inst_index="28" externalcallee_module="None" externalcallee_qualname="list.append" externalcallee_is_builtin="True"/>
|
||||
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py" call_linenum="7" call_inst_index="44" externalcallee_module="sys" externalcallee_qualname="getdefaultencoding" externalcallee_is_builtin="True"/>
|
||||
</root>
|
||||
@@ -1,6 +1,8 @@
|
||||
<root>
|
||||
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="7" call_inst_index="18" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="1" callee_funcname="foo"/>
|
||||
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="8" call_inst_index="24" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="4" callee_funcname="bar"/>
|
||||
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="30" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="1" callee_funcname="foo"/>
|
||||
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="36" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="4" callee_funcname="bar"/>
|
||||
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="2" call_inst_index="4" externalcallee_module="builtins" externalcallee_qualname="print" externalcallee_is_builtin="True"/>
|
||||
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="5" call_inst_index="4" externalcallee_module="builtins" externalcallee_qualname="print" externalcallee_is_builtin="True"/>
|
||||
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="7" call_inst_index="18" pythoncallee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" pythoncallee_linenum="1" pythoncallee_funcname="foo"/>
|
||||
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="8" call_inst_index="24" pythoncallee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" pythoncallee_linenum="4" pythoncallee_funcname="bar"/>
|
||||
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="30" pythoncallee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" pythoncallee_linenum="1" pythoncallee_funcname="foo"/>
|
||||
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="36" pythoncallee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" pythoncallee_linenum="4" pythoncallee_funcname="bar"/>
|
||||
</root>
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
print("builtins test")
|
||||
len("bar")
|
||||
l = list()
|
||||
l.append(42)
|
||||
|
||||
import sys
|
||||
sys.getdefaultencoding()
|
||||
|
||||
r = range(10)
|
||||
@@ -1,9 +1,19 @@
|
||||
import RecordedCalls
|
||||
|
||||
from ValidRecordedCall rc, Call call, Function callee, CallableValue calleeValue
|
||||
import semmle.python.objects.Callables
|
||||
|
||||
from ValidRecordedCall rc, Call call, Value calleeValue
|
||||
where
|
||||
call = rc.getCall() and
|
||||
callee = rc.getCallee() and
|
||||
calleeValue.getScope() = callee and
|
||||
calleeValue.getACall() = call.getAFlowNode()
|
||||
select call, "-->", callee
|
||||
calleeValue.getACall() = call.getAFlowNode() and
|
||||
(
|
||||
rc instanceof RecordedPythonCall and
|
||||
calleeValue.(PythonFunctionValue).getScope() = rc.(RecordedPythonCall).getCallee()
|
||||
or
|
||||
rc instanceof RecordedBuiltinCall and
|
||||
calleeValue.(BuiltinFunctionObjectInternal).getBuiltin() = rc.(RecordedBuiltinCall).getCallee()
|
||||
or
|
||||
rc instanceof RecordedBuiltinCall and
|
||||
calleeValue.(BuiltinMethodObjectInternal).getBuiltin() = rc.(RecordedBuiltinCall).getCallee()
|
||||
)
|
||||
select call, "-->", calleeValue
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import python
|
||||
|
||||
import semmle.python.types.Builtins
|
||||
|
||||
class RecordedCall extends XMLElement {
|
||||
RecordedCall() { this.hasName("recorded_call") }
|
||||
|
||||
@@ -13,24 +15,76 @@ class RecordedCall extends XMLElement {
|
||||
// TODO: handle calls spanning multiple lines
|
||||
result.getLocation().hasLocationInfo(this.call_filename(), this.call_linenum(), _, _, _)
|
||||
}
|
||||
}
|
||||
|
||||
string callee_filename() { result = this.getAttributeValue("callee_filename") }
|
||||
class RecordedPythonCall extends RecordedCall {
|
||||
RecordedPythonCall() {
|
||||
this.hasAttribute("pythoncallee_filename") and
|
||||
this.hasAttribute("pythoncallee_linenum") and
|
||||
this.hasAttribute("pythoncallee_funcname")
|
||||
}
|
||||
|
||||
int callee_linenum() { result = this.getAttributeValue("callee_linenum").toInt() }
|
||||
string pythoncallee_filename() { result = this.getAttributeValue("pythoncallee_filename") }
|
||||
|
||||
string callee_funcname() { result = this.getAttributeValue("callee_funcname") }
|
||||
int pythoncallee_linenum() { result = this.getAttributeValue("pythoncallee_linenum").toInt() }
|
||||
|
||||
string pythoncallee_funcname() { result = this.getAttributeValue("pythoncallee_funcname") }
|
||||
|
||||
Function getCallee() {
|
||||
result.getLocation().hasLocationInfo(this.callee_filename(), this.callee_linenum(), _, _, _)
|
||||
result.getLocation().hasLocationInfo(this.pythoncallee_filename(), this.pythoncallee_linenum(), _, _, _)
|
||||
}
|
||||
}
|
||||
|
||||
class RecordedBuiltinCall extends RecordedCall {
|
||||
RecordedBuiltinCall() {
|
||||
this.hasAttribute("externalcallee_module") and
|
||||
this.hasAttribute("externalcallee_qualname") and
|
||||
this.getAttributeValue("externalcallee_is_builtin") = "True"
|
||||
}
|
||||
|
||||
string externalcallee_module() { result = this.getAttributeValue("externalcallee_module") }
|
||||
|
||||
string externalcallee_qualname() { result = this.getAttributeValue("externalcallee_qualname") }
|
||||
|
||||
Builtin getCallee() {
|
||||
exists(Builtin mod |
|
||||
not externalcallee_module() = "None" and
|
||||
mod.isModule() and
|
||||
mod.getName() = this.externalcallee_module()
|
||||
or
|
||||
externalcallee_module() = "None" and
|
||||
mod = Builtin::builtinModule()
|
||||
|
|
||||
result = traverse_qualname(mod, this.externalcallee_qualname())
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
Builtin traverse_qualname(Builtin parent, string qualname) {
|
||||
not qualname = "__objclass__" and
|
||||
not qualname.matches("%.%") and
|
||||
result = parent.getMember(qualname)
|
||||
or
|
||||
qualname.matches("%.%") and
|
||||
exists(string before_dot, string after_dot, Builtin intermediate_parent |
|
||||
qualname = before_dot + "." + after_dot and
|
||||
not before_dot = "__objclass__" and
|
||||
intermediate_parent = parent.getMember(before_dot) and
|
||||
result = traverse_qualname(intermediate_parent, after_dot)
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Class of recorded calls where we can uniquely identify both the `call` and the `callee`.
|
||||
*/
|
||||
class ValidRecordedCall extends RecordedCall {
|
||||
ValidRecordedCall() {
|
||||
strictcount(this.getCall()) = 1 and
|
||||
strictcount(this.getCallee()) = 1
|
||||
(
|
||||
strictcount(this.(RecordedPythonCall).getCall()) = 1
|
||||
or
|
||||
strictcount(this.(RecordedBuiltinCall).getCall()) = 1
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,5 +3,4 @@ import RecordedCalls
|
||||
from RecordedCall rc
|
||||
where not rc instanceof ValidRecordedCall
|
||||
select "Could not uniquely identify this recorded call (either call or callee was not uniquely identified)",
|
||||
rc.call_filename(), rc.call_linenum(), rc.call_inst_index(), "-->", rc.callee_filename(),
|
||||
rc.callee_linenum(), rc.callee_funcname()
|
||||
rc, rc.call_filename(), rc.call_linenum(), rc.call_inst_index()
|
||||
|
||||
@@ -10,6 +10,7 @@ PYTHON_EXTRACTOR=$(codeql resolve extractor --language=python)
|
||||
|
||||
|
||||
./cg_trace.py --xml "$XMLDIR"/simple.xml example/simple.py
|
||||
./cg_trace.py --xml "$XMLDIR"/builtins.xml example/builtins.py
|
||||
|
||||
rm -rf "$DB"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user