Python: Improve parser logging/timing/customisability

Does a bunch of things, unfortunately all in the same place, so my
apologies in advance for a slightly complicated commit.

As for the changes themselves, this commit

- Adds timers for the old and new parsers. This means we get the overall
time spent on these parts of the extractor if the extractor is run with
`DEBUG` output shown.
- Adds logging information (at the `DEBUG` level) to show which
invocations of the parsers happen when, and whether they succeed or not.
- Adds support for using an environment variable named
`CODEQL_PYTHON_DISABLE_OLD_PARSER` to disable using the old parser
entirely. This makes it easier to test the new parser in isolation.
- Fixes a bug where we did not check whether a parse with the new parser
had already succeeded, and so would do a superfluous second parse.
This commit is contained in:
Taus
2024-10-30 13:58:46 +00:00
parent f75615b913
commit 2ef3ae9860
4 changed files with 24 additions and 6 deletions

View File

@@ -98,6 +98,7 @@ class ModuleImporter(object):
def get_import_nodes(self, loaded_module):
'Return list of AST nodes representing imports'
try:
self.logger.debug("Looking for imports in %s", loaded_module.path)
return imports_from_ast(loaded_module.py_ast)
except Exception as ex:
if isinstance(ex, SyntaxError):

View File

@@ -109,14 +109,25 @@ class PythonSourceModule(object):
def old_py_ast(self):
# The py_ast is the raw ast from the Python parser.
if self._py_ast is None:
self._py_ast = semmle.python.parser.parse(self.tokens, self.logger)
with timers["old_py_ast"]:
self.logger.debug("Trying old parser on %s", self.path)
self._py_ast = semmle.python.parser.parse(self.tokens, self.logger)
self.logger.debug("Old parser successful on %s", self.path)
else:
self.logger.debug("Found (during old_py_ast) parse tree for %s in cache", self.path)
return self._py_ast
@property
def py_ast(self):
try:
# First, try to parse the source with the old Python parser.
return self.old_py_ast
# If the `CODEQL_PYTHON_DISABLE_OLD_PARSER` flag is present, we do not try to use the
# old parser, and instead jump straight to the exception handler.
if os.environ.get("CODEQL_PYTHON_DISABLE_OLD_PARSER"):
self.logger.debug("Old parser disabled, skipping old parse attempt for %s", self.path)
raise Exception("Skipping old parser")
# Otherwise, we first try to parse the source with the old Python parser.
self._py_ast = self.old_py_ast
return self._py_ast
except Exception as ex:
# If that fails, try to parse the source with the new Python parser (unless it has been
# explicitly disabled).
@@ -131,7 +142,13 @@ class PythonSourceModule(object):
raise SyntaxError("Exception %s while parsing %s" % (ex, self.path))
else:
try:
self._py_ast = semmle.python.parser.tsg_parser.parse(self.path, self.logger)
with timers["tsg_py_ast"]:
if self._py_ast is None:
self.logger.debug("Trying tsg-python on %s", self.path)
self._py_ast = semmle.python.parser.tsg_parser.parse(self.path, self.logger)
self.logger.debug("tsg-python successful on %s", self.path)
else:
self.logger.debug("Found (during py_ast) parse tree for %s in cache", self.path)
return self._py_ast
except SyntaxError as ex:
raise ex

View File

@@ -168,7 +168,7 @@ def read_tsg_python_output(path, logger):
p.stdout.close()
p.terminate()
p.wait()
logger.info("Read {} nodes and {} edges from TSG output".format(len(node_attr), len(edge_attr)))
logger.debug("Read {} nodes and {} edges from TSG output".format(len(node_attr), len(edge_attr)))
return node_attr, edge_attr
def evaluate_string(s):

View File

@@ -10,7 +10,7 @@ from io import BytesIO
#Semantic version of extractor.
#Update this if any changes are made
VERSION = "7.1.0"
VERSION = "7.1.1"
PY_EXTENSIONS = ".py", ".pyw"