diff --git a/python/tools/recorded-call-graph-metrics/README.md b/python/tools/recorded-call-graph-metrics/README.md index 4e257c0c34d..48756ebe531 100644 --- a/python/tools/recorded-call-graph-metrics/README.md +++ b/python/tools/recorded-call-graph-metrics/README.md @@ -14,4 +14,30 @@ The next hurdle is being able to handle multiple calls on the same line, such as ## How do I give it a spin? -Run the `recreate-db.sh` script to create the database `cg-trace-example-db`, which will include the `example/simple.xml` trace from executing the `example/simple.py` code. Then run the queries inside the `ql/` directory. +Run the `recreate-db.sh` script to create the database `cg-trace-example-db`. Then run the queries inside the `ql/` directory. + + +## Limitations + +### Code that uses `sys.setprofile` + +Since that is our mechanism for recording calls, any code that uses `sys.setprofile` will not work together with the call-graph tracer. + +### Class instantiation + +Does not always fire off an event in the `sys.setprofile` function (neither in `sys.settrace`), so is not recorded. Example: + +``` +r = range(10) +``` + +when disassembled (`python -m dis `): + +``` + 9 48 LOAD_NAME 7 (range) + 50 LOAD_CONST 5 (10) + 52 CALL_FUNCTION 1 + 54 STORE_NAME 8 (r) +``` + +but no event :disappointed: diff --git a/python/tools/recorded-call-graph-metrics/cg_trace.py b/python/tools/recorded-call-graph-metrics/cg_trace.py index 4b1e7d43867..55d6f6e4665 100755 --- a/python/tools/recorded-call-graph-metrics/cg_trace.py +++ b/python/tools/recorded-call-graph-metrics/cg_trace.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """Call Graph tracing. Execute a python program and for each call being made, record the call and callee. This @@ -30,10 +29,14 @@ import dis import dataclasses import csv from lxml import etree +from typing import Optional -# Copy-Paste and uncomment for interactive ipython sessions -# import IPython; IPython.embed(); sys.exit() +# copy-paste For interactive ipython sessions +# import IPython; sys.stdout = sys.__stdout__; IPython.embed(); sys.exit() +def debug_print(*args, **kwargs): + # print(*args, **kwargs, file=sys.__stderr__) + pass _canonic_filename_cache = dict() def canonic_filename(filename): @@ -67,8 +70,8 @@ class Call(): code = frame.f_code # Uncomment to see the bytecode - # b = dis.Bytecode(frame.f_code, current_offset=frame.f_lasti) - # print(b.dis(), file=sys.__stderr__) + b = dis.Bytecode(frame.f_code, current_offset=frame.f_lasti) + debug_print(b.dis()) return cls( filename = canonic_filename(code.co_filename), @@ -77,8 +80,53 @@ class Call(): ) +def better_compare_for_dataclass(cls): + """When dataclass is used with `order=True`, the comparison methods is only implemented for + objects of the same class. This decorator extends the functionality to compare class + name if used against other objects. + """ + for op in ['__lt__', '__le__', '__gt__', '__ge__',]: + old = getattr(cls, op) + def new(self, other): + if type(self) == type(other): + return old(self, other) + return getattr(str, op)(self.__class__.__name__, other.__class__.__name__) + setattr(cls, op, new) + return cls + @dataclasses.dataclass(frozen=True, eq=True, order=True) -class Callee(): +class Callee: + pass + + +BUILTIN_FUNCTION_OR_METHOD = type(print) + + +@better_compare_for_dataclass +@dataclasses.dataclass(frozen=True, eq=True, order=True) +class ExternalCallee(Callee): + # Some bound methods might not have __module__ attribute: for example, + # `list().append.__module__ is None` + module: Optional[str] + qualname: str + # + is_builtin: bool + + @classmethod + def from_arg(cls, func): + # if func.__name__ == "append": + # import IPython; sys.stdout = sys.__stdout__; IPython.embed(); sys.exit() + + return cls( + module=func.__module__, + qualname=func.__qualname__, + is_builtin=type(func) == BUILTIN_FUNCTION_OR_METHOD + ) + + +@better_compare_for_dataclass +@dataclasses.dataclass(frozen=True, eq=True, order=True) +class PythonCallee(Callee): """A callee (Function/Lambda/???) should (hopefully) be uniquely identified by its name and location (filename+line @@ -92,32 +140,84 @@ class Callee(): def from_frame(cls, frame): code = frame.f_code return cls( - funcname = code.co_name, filename = canonic_filename(code.co_filename), linenum = frame.f_lineno, + funcname = code.co_name, ) -class CallGraphTracer(bdb.Bdb): +class CallGraphTracer: """Tracer that records calls being made It would seem obvious that this should have extended `trace` library - (https://docs.python.org/3/library/trace.html), but that part is not extensible -- - however, the basic debugger (bdb) is, and provides maybe a bit more help than just - using `sys.settrace` directly. + (https://docs.python.org/3/library/trace.html), but that part is not extensible. + + You might think that we can just use `sys.settrace` + (https://docs.python.org/3.8/library/sys.html#sys.settrace) like the basic debugger + (bdb) does, but that isn't invoked on calls to C code, which we need in general, and + need for handling builtins specifically. + + Luckily, `sys.setprofile` + (https://docs.python.org/3.8/library/sys.html#sys.setprofile) provides all that we + need. You might be scared by reading the following bit of the documentation + + > The function is thread-specific, but there is no way for the profiler to know about + > context switches between threads, so it does not make sense to use this in the + > presence of multiple threads. + + but that is to be understood in the context of making a profiler (you can't reliably + measure function execution time if you don't know about context switches). For our + use-case, this is not a problem. """ recorded_calls: set def __init__(self): self.recorded_calls = set() - super().__init__() - def user_call(self, frame, argument_list): - call = Call.from_frame(frame.f_back) - callee = Callee.from_frame(frame) + def run(self, code, globals, locals): + self.exec_call_seen = False + self.ignore_rest = False + try: + sys.setprofile(cgt.profilefunc) + exec(code, globals, locals) + # TODO: exception handling? + finally: + sys.setprofile(None) - # _print(f'{call} -> {callee}') + def profilefunc(self, frame, event, arg): + # ignore everything until the first call, since that is `exec` from the `run` method above + if not self.exec_call_seen: + if event == "call": + self.exec_call_seen = True + return + + # if we're going out of the exec, we should ignore anything else (for example the + # call to `sys.setprofile(None)`) + if event == "c_return": + if arg == exec and frame.f_code.co_filename == __file__: + self.ignore_rest = True + + if self.ignore_rest: + return + + if event not in ["call", "c_call"]: + return + + debug_print(f"profilefunc {event=}") + if event == "call": + # in call, the `frame` argument is new the frame for entering the callee + call = Call.from_frame(frame.f_back) + callee = PythonCallee.from_frame(frame) + + if event == "c_call": + # in c_call, the `frame` argument is frame where the call happens, and the `arg` argument + # is the C function object. + call = Call.from_frame(frame) + callee = ExternalCallee.from_arg(arg) + + debug_print(f'{call} --> {callee}') + debug_print('\n'*5) self.recorded_calls.add((call, callee)) @@ -191,8 +291,6 @@ class XMLExporter(Exporter): if __name__ == "__main__": - - parser = argparse.ArgumentParser() @@ -236,7 +334,7 @@ if __name__ == "__main__": XMLExporter.export(cgt.recorded_calls, opts.xml) else: for (call, callee) in sorted(cgt.recorded_calls): - print(f'{call} -> {callee}') + print(f'{call} --> {callee}') print('--- captured stdout ---') print(captured_stdout.getvalue(), end='') diff --git a/python/tools/recorded-call-graph-metrics/example-traces/builtins.xml b/python/tools/recorded-call-graph-metrics/example-traces/builtins.xml new file mode 100644 index 00000000000..60703c50cf2 --- /dev/null +++ b/python/tools/recorded-call-graph-metrics/example-traces/builtins.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/python/tools/recorded-call-graph-metrics/example-traces/simple.xml b/python/tools/recorded-call-graph-metrics/example-traces/simple.xml index d452f775376..25f494aec86 100644 --- a/python/tools/recorded-call-graph-metrics/example-traces/simple.xml +++ b/python/tools/recorded-call-graph-metrics/example-traces/simple.xml @@ -1,6 +1,8 @@ - - - - + + + + + + diff --git a/python/tools/recorded-call-graph-metrics/example/builtins.py b/python/tools/recorded-call-graph-metrics/example/builtins.py new file mode 100644 index 00000000000..7dde7177331 --- /dev/null +++ b/python/tools/recorded-call-graph-metrics/example/builtins.py @@ -0,0 +1,9 @@ +print("builtins test") +len("bar") +l = list() +l.append(42) + +import sys +sys.getdefaultencoding() + +r = range(10) diff --git a/python/tools/recorded-call-graph-metrics/ql/PointsToFound.ql b/python/tools/recorded-call-graph-metrics/ql/PointsToFound.ql index a22a2f3a597..ca8c0c54879 100644 --- a/python/tools/recorded-call-graph-metrics/ql/PointsToFound.ql +++ b/python/tools/recorded-call-graph-metrics/ql/PointsToFound.ql @@ -1,9 +1,19 @@ import RecordedCalls -from ValidRecordedCall rc, Call call, Function callee, CallableValue calleeValue +import semmle.python.objects.Callables + +from ValidRecordedCall rc, Call call, Value calleeValue where call = rc.getCall() and - callee = rc.getCallee() and - calleeValue.getScope() = callee and - calleeValue.getACall() = call.getAFlowNode() -select call, "-->", callee + calleeValue.getACall() = call.getAFlowNode() and + ( + rc instanceof RecordedPythonCall and + calleeValue.(PythonFunctionValue).getScope() = rc.(RecordedPythonCall).getCallee() + or + rc instanceof RecordedBuiltinCall and + calleeValue.(BuiltinFunctionObjectInternal).getBuiltin() = rc.(RecordedBuiltinCall).getCallee() + or + rc instanceof RecordedBuiltinCall and + calleeValue.(BuiltinMethodObjectInternal).getBuiltin() = rc.(RecordedBuiltinCall).getCallee() + ) +select call, "-->", calleeValue diff --git a/python/tools/recorded-call-graph-metrics/ql/RecordedCalls.qll b/python/tools/recorded-call-graph-metrics/ql/RecordedCalls.qll index e3c4bede44d..cfc6504f40d 100644 --- a/python/tools/recorded-call-graph-metrics/ql/RecordedCalls.qll +++ b/python/tools/recorded-call-graph-metrics/ql/RecordedCalls.qll @@ -1,5 +1,7 @@ import python +import semmle.python.types.Builtins + class RecordedCall extends XMLElement { RecordedCall() { this.hasName("recorded_call") } @@ -13,24 +15,76 @@ class RecordedCall extends XMLElement { // TODO: handle calls spanning multiple lines result.getLocation().hasLocationInfo(this.call_filename(), this.call_linenum(), _, _, _) } +} - string callee_filename() { result = this.getAttributeValue("callee_filename") } +class RecordedPythonCall extends RecordedCall { + RecordedPythonCall() { + this.hasAttribute("pythoncallee_filename") and + this.hasAttribute("pythoncallee_linenum") and + this.hasAttribute("pythoncallee_funcname") + } - int callee_linenum() { result = this.getAttributeValue("callee_linenum").toInt() } + string pythoncallee_filename() { result = this.getAttributeValue("pythoncallee_filename") } - string callee_funcname() { result = this.getAttributeValue("callee_funcname") } + int pythoncallee_linenum() { result = this.getAttributeValue("pythoncallee_linenum").toInt() } + + string pythoncallee_funcname() { result = this.getAttributeValue("pythoncallee_funcname") } Function getCallee() { - result.getLocation().hasLocationInfo(this.callee_filename(), this.callee_linenum(), _, _, _) + result.getLocation().hasLocationInfo(this.pythoncallee_filename(), this.pythoncallee_linenum(), _, _, _) } } +class RecordedBuiltinCall extends RecordedCall { + RecordedBuiltinCall() { + this.hasAttribute("externalcallee_module") and + this.hasAttribute("externalcallee_qualname") and + this.getAttributeValue("externalcallee_is_builtin") = "True" + } + + string externalcallee_module() { result = this.getAttributeValue("externalcallee_module") } + + string externalcallee_qualname() { result = this.getAttributeValue("externalcallee_qualname") } + + Builtin getCallee() { + exists(Builtin mod | + not externalcallee_module() = "None" and + mod.isModule() and + mod.getName() = this.externalcallee_module() + or + externalcallee_module() = "None" and + mod = Builtin::builtinModule() + | + result = traverse_qualname(mod, this.externalcallee_qualname()) + ) + } +} + +Builtin traverse_qualname(Builtin parent, string qualname) { + not qualname = "__objclass__" and + not qualname.matches("%.%") and + result = parent.getMember(qualname) + or + qualname.matches("%.%") and + exists(string before_dot, string after_dot, Builtin intermediate_parent | + qualname = before_dot + "." + after_dot and + not before_dot = "__objclass__" and + intermediate_parent = parent.getMember(before_dot) and + result = traverse_qualname(intermediate_parent, after_dot) + ) +} + + /** * Class of recorded calls where we can uniquely identify both the `call` and the `callee`. */ class ValidRecordedCall extends RecordedCall { ValidRecordedCall() { strictcount(this.getCall()) = 1 and - strictcount(this.getCallee()) = 1 + ( + strictcount(this.(RecordedPythonCall).getCall()) = 1 + or + strictcount(this.(RecordedBuiltinCall).getCall()) = 1 + ) } } diff --git a/python/tools/recorded-call-graph-metrics/ql/UnidentifiedRecordedCalls.ql b/python/tools/recorded-call-graph-metrics/ql/UnidentifiedRecordedCalls.ql index 414c24a9fd6..8e98531f012 100644 --- a/python/tools/recorded-call-graph-metrics/ql/UnidentifiedRecordedCalls.ql +++ b/python/tools/recorded-call-graph-metrics/ql/UnidentifiedRecordedCalls.ql @@ -3,5 +3,4 @@ import RecordedCalls from RecordedCall rc where not rc instanceof ValidRecordedCall select "Could not uniquely identify this recorded call (either call or callee was not uniquely identified)", - rc.call_filename(), rc.call_linenum(), rc.call_inst_index(), "-->", rc.callee_filename(), - rc.callee_linenum(), rc.callee_funcname() + rc, rc.call_filename(), rc.call_linenum(), rc.call_inst_index() diff --git a/python/tools/recorded-call-graph-metrics/recreate-db.sh b/python/tools/recorded-call-graph-metrics/recreate-db.sh index 6f1f39cb05d..7ac42c4f1e3 100755 --- a/python/tools/recorded-call-graph-metrics/recreate-db.sh +++ b/python/tools/recorded-call-graph-metrics/recreate-db.sh @@ -10,6 +10,7 @@ PYTHON_EXTRACTOR=$(codeql resolve extractor --language=python) ./cg_trace.py --xml "$XMLDIR"/simple.xml example/simple.py +./cg_trace.py --xml "$XMLDIR"/builtins.xml example/builtins.py rm -rf "$DB"