mirror of
https://github.com/github/codeql.git
synced 2025-12-16 16:53:25 +01:00
Python: CG trace: Make code modular
This commit is contained in:
7
python/tools/recorded-call-graph-metrics/.gitignore
vendored
Normal file
7
python/tools/recorded-call-graph-metrics/.gitignore
vendored
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
# Example DB
|
||||||
|
cg-trace-example-db/
|
||||||
|
|
||||||
|
# Artifact from building `pip install -e .`
|
||||||
|
cg_trace.egg-info/
|
||||||
|
|
||||||
|
venv/
|
||||||
6
python/tools/recorded-call-graph-metrics/.isort.cfg
Normal file
6
python/tools/recorded-call-graph-metrics/.isort.cfg
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
[settings]
|
||||||
|
multi_line_output = 3
|
||||||
|
include_trailing_comma = True
|
||||||
|
force_grid_wrap = 0
|
||||||
|
use_parentheses = True
|
||||||
|
line_length = 88
|
||||||
@@ -14,10 +14,40 @@ The next hurdle is being able to handle multiple calls on the same line, such as
|
|||||||
|
|
||||||
## How do I give it a spin?
|
## How do I give it a spin?
|
||||||
|
|
||||||
Run the `recreate-db.sh` script to create the database `cg-trace-example-db`. Then run the queries inside the `ql/` directory.
|
After following the setup instructions below, run the `recreate-db.sh` script to create the database `cg-trace-example-db`. Then run the queries inside the `ql/` directory.
|
||||||
|
|
||||||
|
|
||||||
## Limitations
|
## Setup
|
||||||
|
|
||||||
|
1. Ensure you have at least Python 3.6
|
||||||
|
|
||||||
|
2. Create virtual environment `python3 -m venv venv` and activate it
|
||||||
|
|
||||||
|
3. Install dependencies `pip install --upgrade -r requirements.txt`
|
||||||
|
|
||||||
|
4. Install this codebase as an editable package `pip install -e .`
|
||||||
|
|
||||||
|
5. Set up your editor. If you're using VS Code, create a new project for this folder, and
|
||||||
|
use these settings for correct autoformatting of code on save:
|
||||||
|
```
|
||||||
|
{
|
||||||
|
"python.pythonPath": "venv/bin/python",
|
||||||
|
"python.linting.enabled": true,
|
||||||
|
"python.linting.flake8Enabled": true,
|
||||||
|
"python.formatting.provider": "black",
|
||||||
|
"editor.formatOnSave": true,
|
||||||
|
"[python]": {
|
||||||
|
"editor.codeActionsOnSave": {
|
||||||
|
"source.organizeImports": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
6. Enjoy writing code, and being able to run `cg-trace` on your command line :tada:
|
||||||
|
|
||||||
|
|
||||||
|
## Tracing Limitations
|
||||||
|
|
||||||
### Code that uses `sys.setprofile`
|
### Code that uses `sys.setprofile`
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,15 @@
|
|||||||
|
import sys
|
||||||
|
|
||||||
|
__version__ = "0.0.1"

# The virtual machine opcodes changed in Python 3.6, so nothing older than that is
# going to be supported.
MIN_PYTHON_VERSION = (3, 6)
MIN_PYTHON_VERSION_FORMATTED = ".".join(map(str, MIN_PYTHON_VERSION))

# Abort right away on an interpreter that is too old. `str.format` is used on purpose
# instead of an f-string, so that this file still parses on such an interpreter and the
# message below can actually be shown.
if sys.version_info[:2] < MIN_PYTHON_VERSION:
    sys.exit(
        "You need at least Python {} to use 'cg_trace'".format(
            MIN_PYTHON_VERSION_FORMATTED
        )
    )
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
import sys
|
||||||
|
|
||||||
|
from cg_trace.main import main
|
||||||
|
|
||||||
|
# Run the command line interface and use its return value as the exit status of
# the process.
exit_status = main()
sys.exit(exit_status)
|
||||||
15
python/tools/recorded-call-graph-metrics/cg_trace/cmdline.py
Normal file
15
python/tools/recorded-call-graph-metrics/cg_trace/cmdline.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
|
def parse(args):
    """Parse the command line arguments in `args`.

    `args` is a list of argument strings. The returned namespace has the attributes
    `csv` and `xml` (`None` when the option is not given), `progname` (the file to run
    as main program) and `arguments` (a list with everything following `progname`).
    """
    arg_parser = argparse.ArgumentParser()

    # optional arguments selecting where to export the result to
    for output_flag in ("--csv", "--xml"):
        arg_parser.add_argument(output_flag)

    arg_parser.add_argument("progname", help="file to run as main program")
    # REMAINDER gathers all remaining arguments, including ones that look like
    # options, so they can be handed to the program being run.
    arg_parser.add_argument(
        "arguments", nargs=argparse.REMAINDER, help="arguments to the program"
    )

    options = arg_parser.parse_args(args)
    return options
|
||||||
@@ -0,0 +1,57 @@
|
|||||||
|
import csv
|
||||||
|
import dataclasses
|
||||||
|
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
|
|
||||||
|
class Exporter:
    """Base class of the exporters that write recorded calls to a file."""

    @staticmethod
    def export(recorded_calls, outfile_path):
        """Write `recorded_calls` to `outfile_path`. Must be provided by subclasses."""
        raise NotImplementedError()

    @staticmethod
    def dataclass_to_dict(obj):
        """Return the fields of the dataclass instance `obj` as a dictionary.

        Each key is the field name prefixed with the lower-cased class name of `obj`
        and an underscore, so the dictionaries of objects of different classes can be
        merged without their keys clashing.
        """
        class_prefix = type(obj).__name__.lower()
        fields = dataclasses.asdict(obj)
        return {
            "{}_{}".format(class_prefix, name): value
            for name, value in fields.items()
        }
|
||||||
|
|
||||||
|
|
||||||
|
class CSVExporter(Exporter):
    """Exporter that writes one CSV row for each recorded (call, callee) pair."""

    @staticmethod
    def export(recorded_calls, outfile_path):
        """Write `recorded_calls`, in sorted order, as CSV to `outfile_path`.

        The header is taken from the keys of the first row, so nothing at all is
        written to the file when there are no recorded calls.
        """
        with open(outfile_path, "w", newline="") as csv_file:
            writer = None
            for call, callee in sorted(recorded_calls):
                # one flat row holding the fields of both the call and the callee
                row = dict(Exporter.dataclass_to_dict(call))
                row.update(Exporter.dataclass_to_dict(callee))

                # the column names are only known once the first row has been built
                if writer is None:
                    writer = csv.DictWriter(csv_file, fieldnames=row.keys())
                    writer.writeheader()

                writer.writerow(row)

        print(f"output written to {outfile_path}")
|
||||||
|
|
||||||
|
|
||||||
|
class XMLExporter(Exporter):
    """Exporter that writes one XML element for each recorded (call, callee) pair."""

    @staticmethod
    def export(recorded_calls, outfile_path):
        """Write `recorded_calls`, in sorted order, as XML to `outfile_path`.

        The document consists of a `root` element with a `recorded_call` child for
        each pair; the fields of the call and the callee are stored as attributes.
        """
        root = etree.Element("root")

        for call, callee in sorted(recorded_calls):
            attributes = dict(Exporter.dataclass_to_dict(call))
            attributes.update(Exporter.dataclass_to_dict(callee))

            element = etree.SubElement(root, "recorded_call")
            for name, value in attributes.items():
                # xml library only supports serializing attributes that have string
                # values
                element.set(name, str(value))

        document = etree.ElementTree(root)
        document.write(outfile_path, encoding="utf-8", pretty_print=True)
|
||||||
64
python/tools/recorded-call-graph-metrics/cg_trace/main.py
Normal file
64
python/tools/recorded-call-graph-metrics/cg_trace/main.py
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from io import StringIO
|
||||||
|
|
||||||
|
from cg_trace import cmdline, tracer
|
||||||
|
from cg_trace.exporter import CSVExporter, XMLExporter
|
||||||
|
|
||||||
|
|
||||||
|
def record_calls(code, globals):
    """Execute `code` under the call graph tracer while capturing its output.

    `code` is executed with `globals` used as both its global and its local namespace.
    While it runs, `sys.stdout` and `sys.stderr` are replaced by in-memory buffers.

    Returns a tuple `(recorded_calls, captured_stdout, captured_stderr)`: the sorted
    list of recorded (call, callee) pairs, and the two `StringIO` buffers holding what
    was written to stdout and stderr during the execution.

    Any exception raised by the execution propagates to the caller, after the real
    stdout and stderr have been put back in place.
    """
    real_stdout = sys.stdout
    real_stderr = sys.stderr
    captured_stdout = StringIO()
    captured_stderr = StringIO()

    sys.stdout = captured_stdout
    sys.stderr = captured_stderr
    try:
        cgt = tracer.CallGraphTracer()
        cgt.run(code, globals, globals)
    finally:
        # Always restore the real streams. Otherwise a failing program would leave
        # stdout/stderr redirected, and the traceback as well as all later output
        # would silently end up in the in-memory buffers.
        sys.stdout = real_stdout
        sys.stderr = real_stderr

    return sorted(cgt.recorded_calls), captured_stdout, captured_stderr
|
||||||
|
|
||||||
|
|
||||||
|
def main(args=None) -> int:
    """Trace the calls made by a Python program and report them.

    `args` is the list of command line arguments, without the program name; when it is
    `None`, `sys.argv[1:]` is used. The recorded calls are exported to the file given
    with `--csv` or `--xml` (`--csv` wins if both are given), or printed to stdout when
    neither is given. Afterwards the captured stdout and stderr of the program are
    printed.

    Returns 0, to be used as exit status.
    """
    if args is None:
        # first element in argv is program name
        args = sys.argv[1:]

    opts = cmdline.parse(args)

    # The details of setting up the program to be run are very much inspired by `trace`
    # from the standard library
    sys.argv = [opts.progname, *opts.arguments]
    sys.path[0] = os.path.dirname(opts.progname)

    # Read the source as bytes, so `compile` determines the source encoding itself
    # (encoding declaration or BOM, UTF-8 otherwise) instead of the file being decoded
    # with whatever the default encoding of the current locale is.
    with open(opts.progname, "rb") as fp:
        code = compile(fp.read(), opts.progname, "exec")

    # try to emulate __main__ namespace as much as possible
    globs = {
        "__file__": opts.progname,
        "__name__": "__main__",
        "__package__": None,
        "__cached__": None,
    }

    recorded_calls, captured_stdout, captured_stderr = record_calls(code, globs)

    if opts.csv:
        CSVExporter.export(recorded_calls, opts.csv)
    elif opts.xml:
        XMLExporter.export(recorded_calls, opts.xml)
    else:
        for (call, callee) in recorded_calls:
            print(f"{call} --> {callee}")

    # `end=""` since the captured output carries its own newlines
    print("--- captured stdout ---")
    print(captured_stdout.getvalue(), end="")
    print("--- captured stderr ---")
    print(captured_stderr.getvalue(), end="")

    return 0
|
||||||
152
python/tools/recorded-call-graph-metrics/cg_trace.py → python/tools/recorded-call-graph-metrics/cg_trace/tracer.py
Executable file → Normal file
152
python/tools/recorded-call-graph-metrics/cg_trace.py → python/tools/recorded-call-graph-metrics/cg_trace/tracer.py
Executable file → Normal file
@@ -1,33 +1,7 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""Call Graph tracing.
|
|
||||||
|
|
||||||
Execute a python program and for each call being made, record the call and callee. This
|
|
||||||
allows us to compare call graph resolution from static analysis with actual data -- that
|
|
||||||
is, can we statically determine the target of each actual call correctly.
|
|
||||||
|
|
||||||
If there is 100% code coverage from the Python execution, it would also be possible to
|
|
||||||
look at the precision of the call graph resolutions -- that is, do we expect a function to
|
|
||||||
be able to be called in a place where it is not? Currently not something we're looking at.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# read: https://eli.thegreenplace.net/2012/03/23/python-internals-how-callables-work/
|
|
||||||
|
|
||||||
# TODO: Know that a call to a C-function was made. See
|
|
||||||
# https://docs.python.org/3/library/bdb.html#bdb.Bdb.trace_dispatch. Maybe use `lxml` as
|
|
||||||
# test
|
|
||||||
|
|
||||||
# For inspiration, look at these projects:
|
|
||||||
# - https://github.com/joerick/pyinstrument (capture call-stack every <n> ms for profiling)
|
|
||||||
# - https://github.com/gak/pycallgraph (display call-graph with graphviz after python execution)
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
from io import StringIO
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
import dis
|
|
||||||
import dataclasses
|
import dataclasses
|
||||||
import csv
|
import dis
|
||||||
from lxml import etree
|
import os
|
||||||
|
import sys
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
# copy-paste For interactive ipython sessions
|
# copy-paste For interactive ipython sessions
|
||||||
@@ -126,9 +100,6 @@ class ExternalCallee(Callee):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_arg(cls, func):
|
def from_arg(cls, func):
|
||||||
# if func.__name__ == "append":
|
|
||||||
# import IPython; sys.stdout = sys.__stdout__; IPython.embed(); sys.exit()
|
|
||||||
|
|
||||||
return cls(
|
return cls(
|
||||||
module=func.__module__,
|
module=func.__module__,
|
||||||
qualname=func.__qualname__,
|
qualname=func.__qualname__,
|
||||||
@@ -192,7 +163,7 @@ class CallGraphTracer:
|
|||||||
self.exec_call_seen = False
|
self.exec_call_seen = False
|
||||||
self.ignore_rest = False
|
self.ignore_rest = False
|
||||||
try:
|
try:
|
||||||
sys.setprofile(cgt.profilefunc)
|
sys.setprofile(self.profilefunc)
|
||||||
exec(code, globals, locals)
|
exec(code, globals, locals)
|
||||||
# TODO: exception handling?
|
# TODO: exception handling?
|
||||||
finally:
|
finally:
|
||||||
@@ -233,118 +204,3 @@ class CallGraphTracer:
|
|||||||
debug_print(f"{call} --> {callee}")
|
debug_print(f"{call} --> {callee}")
|
||||||
debug_print("\n" * 5)
|
debug_print("\n" * 5)
|
||||||
self.recorded_calls.add((call, callee))
|
self.recorded_calls.add((call, callee))
|
||||||
|
|
||||||
|
|
||||||
################################################################################
|
|
||||||
# Export
|
|
||||||
################################################################################
|
|
||||||
|
|
||||||
|
|
||||||
class Exporter:
|
|
||||||
@staticmethod
|
|
||||||
def export(recorded_calls, outfile_path):
|
|
||||||
raise NotImplementedError()
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def dataclass_to_dict(obj):
|
|
||||||
d = dataclasses.asdict(obj)
|
|
||||||
prefix = obj.__class__.__name__.lower()
|
|
||||||
return {f"{prefix}_{key}": val for (key, val) in d.items()}
|
|
||||||
|
|
||||||
|
|
||||||
class CSVExporter(Exporter):
|
|
||||||
@staticmethod
|
|
||||||
def export(recorded_calls, outfile_path):
|
|
||||||
with open(outfile_path, "w", newline="") as csv_file:
|
|
||||||
writer = None
|
|
||||||
for (call, callee) in sorted(recorded_calls):
|
|
||||||
data = {
|
|
||||||
**Exporter.dataclass_to_dict(call),
|
|
||||||
**Exporter.dataclass_to_dict(callee),
|
|
||||||
}
|
|
||||||
|
|
||||||
if writer is None:
|
|
||||||
writer = csv.DictWriter(csv_file, fieldnames=data.keys())
|
|
||||||
writer.writeheader()
|
|
||||||
|
|
||||||
writer.writerow(data)
|
|
||||||
|
|
||||||
print(f"output written to {outfile_path}")
|
|
||||||
|
|
||||||
# embed(); sys.exit()
|
|
||||||
|
|
||||||
|
|
||||||
class XMLExporter(Exporter):
|
|
||||||
@staticmethod
|
|
||||||
def export(recorded_calls, outfile_path):
|
|
||||||
|
|
||||||
root = etree.Element("root")
|
|
||||||
|
|
||||||
for (call, callee) in sorted(recorded_calls):
|
|
||||||
data = {
|
|
||||||
**Exporter.dataclass_to_dict(call),
|
|
||||||
**Exporter.dataclass_to_dict(callee),
|
|
||||||
}
|
|
||||||
|
|
||||||
rc = etree.SubElement(root, "recorded_call")
|
|
||||||
for k, v in data.items():
|
|
||||||
# xml library only supports serializing attributes that have string values
|
|
||||||
rc.set(k, str(v))
|
|
||||||
|
|
||||||
tree = etree.ElementTree(root)
|
|
||||||
tree.write(outfile_path, encoding="utf-8", pretty_print=True)
|
|
||||||
|
|
||||||
|
|
||||||
################################################################################
|
|
||||||
# __main__
|
|
||||||
################################################################################
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
|
|
||||||
parser.add_argument("--csv")
|
|
||||||
parser.add_argument("--xml")
|
|
||||||
|
|
||||||
parser.add_argument("progname", help="file to run as main program")
|
|
||||||
parser.add_argument(
|
|
||||||
"arguments", nargs=argparse.REMAINDER, help="arguments to the program"
|
|
||||||
)
|
|
||||||
|
|
||||||
opts = parser.parse_args()
|
|
||||||
|
|
||||||
# These details of setting up the program to be run is very much inspired by `trace`
|
|
||||||
# from the standard library
|
|
||||||
sys.argv = [opts.progname, *opts.arguments]
|
|
||||||
sys.path[0] = os.path.dirname(opts.progname)
|
|
||||||
|
|
||||||
with open(opts.progname) as fp:
|
|
||||||
code = compile(fp.read(), opts.progname, "exec")
|
|
||||||
|
|
||||||
# try to emulate __main__ namespace as much as possible
|
|
||||||
globs = {
|
|
||||||
"__file__": opts.progname,
|
|
||||||
"__name__": "__main__",
|
|
||||||
"__package__": None,
|
|
||||||
"__cached__": None,
|
|
||||||
}
|
|
||||||
|
|
||||||
real_stdout = sys.stdout
|
|
||||||
real_stderr = sys.stderr
|
|
||||||
captured_stdout = StringIO()
|
|
||||||
|
|
||||||
sys.stdout = captured_stdout
|
|
||||||
cgt = CallGraphTracer()
|
|
||||||
cgt.run(code, globs, globs)
|
|
||||||
sys.stdout = real_stdout
|
|
||||||
|
|
||||||
if opts.csv:
|
|
||||||
CSVExporter.export(cgt.recorded_calls, opts.csv)
|
|
||||||
elif opts.xml:
|
|
||||||
XMLExporter.export(cgt.recorded_calls, opts.xml)
|
|
||||||
else:
|
|
||||||
for (call, callee) in sorted(cgt.recorded_calls):
|
|
||||||
print(f"{call} --> {callee}")
|
|
||||||
|
|
||||||
print("--- captured stdout ---")
|
|
||||||
print(captured_stdout.getvalue(), end="")
|
|
||||||
@@ -3,14 +3,19 @@
|
|||||||
set -e
|
set -e
|
||||||
set -x
|
set -x
|
||||||
|
|
||||||
|
if ! pip show cg_trace; then
|
||||||
|
echo "You need to follow setup instructions in README"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
DB="cg-trace-example-db"
|
DB="cg-trace-example-db"
|
||||||
SRC="example/"
|
SRC="example/"
|
||||||
XMLDIR="example-traces/"
|
XMLDIR="example-traces/"
|
||||||
PYTHON_EXTRACTOR=$(codeql resolve extractor --language=python)
|
PYTHON_EXTRACTOR=$(codeql resolve extractor --language=python)
|
||||||
|
|
||||||
|
|
||||||
./cg_trace.py --xml "$XMLDIR"/simple.xml example/simple.py
|
cg-trace --xml "$XMLDIR"/simple.xml example/simple.py
|
||||||
./cg_trace.py --xml "$XMLDIR"/builtins.xml example/builtins.py
|
cg-trace --xml "$XMLDIR"/builtins.xml example/builtins.py
|
||||||
|
|
||||||
rm -rf "$DB"
|
rm -rf "$DB"
|
||||||
|
|
||||||
|
|||||||
16
python/tools/recorded-call-graph-metrics/setup.py
Normal file
16
python/tools/recorded-call-graph-metrics/setup.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
from setuptools import find_packages, setup
|
||||||
|
|
||||||
|
from cg_trace import MIN_PYTHON_VERSION_FORMATTED, __version__
|
||||||
|
|
||||||
|
# TODO: There was some benefit of structuring your code as `src/yourpackage/code.py`
|
||||||
|
# instead of `yourpackage/code.py` concerning imports, but I don't recall the details
|
||||||
|
|
||||||
|
# Keyword arguments for `setup`, collected up front to keep the call itself short.
package_metadata = dict(
    name="cg_trace",
    version=__version__,
    description="Call graph tracing",
    packages=find_packages(),
    install_requires=["lxml"],
    # makes the `cg-trace` command available, which runs `cg_trace.main.main`
    entry_points={"console_scripts": ["cg-trace = cg_trace.main:main"]},
    python_requires=">=" + MIN_PYTHON_VERSION_FORMATTED,
)

setup(**package_metadata)
|
||||||
Reference in New Issue
Block a user