Python: Copy Python extractor to codeql repo

This commit is contained in:
Taus
2024-02-28 15:15:21 +00:00
parent 297a17975d
commit 6dec323cfc
369 changed files with 165346 additions and 0 deletions

View File

@@ -0,0 +1,284 @@
import sys
import os
import inspect
import pkgutil
from semmle.python import ast
from semmle.python.passes.exports import ExportsPass
from semmle.python.passes.lexical import LexicalPass
from semmle.python.passes.flow import FlowPass
from semmle.python.passes.ast_pass import ASTPass
from semmle.python.passes.objects import ObjectPass
from semmle.util import VERSION, uuid, get_analysis_version, get_analysis_major_version
from semmle.util import makedirs, get_source_file_tag, TrapWriter, base64digest
from semmle.cache import Cache
from semmle.logging import WARN, syntax_error_message, Logger
from semmle.profiling import timers
# Cache-key salt, qualified by the extractor version so that trap files
# cached by one extractor version are never reused by another
# (see CachingExtractor.get_cache_key).
UTRAP_KEY = 'utrap%s' % VERSION
# Public API of this module.
__all__ = [ 'Extractor', 'CachingExtractor' ]
# Attribute value types that are recorded as interpreter flags by
# Extractor.write_interpreter_data (anything else is skipped).
FLAG_SAVE_TYPES = float, complex, bool, int, bytes, str
class Extractor(object):
    '''The extractor controls the execution of all the specialised passes.

    It drives the per-module passes (AST, exports, flow, lexical, object),
    writes trap files via the trap folder, copies sources into the source
    archive, and records interpreter/option flags for the database.
    '''

    def __init__(self, trap_folder, src_archive, options, logger: Logger, diagnostics_writer):
        assert trap_folder
        self.trap_folder = trap_folder
        self.src_archive = src_archive
        self.object_pass = ObjectPass()
        self.passes = [
            ASTPass(),
            ExportsPass(),
            FlowPass(options.split, options.prune, options.unroll, logger)
        ]
        self.lexical = LexicalPass()
        # Maps module trap-name tags to their virtual source-archive paths;
        # flushed to a 'folders' trap file in close().
        self.files = {}
        self.options = options
        self.handle_syntax_errors = not options.no_syntax_errors
        self.logger = logger
        self.diagnostics_writer = diagnostics_writer

    def _handle_syntax_error(self, module, ex):
        '''Record a syntax error for `module` and return an empty AST.

        Writes a diagnostics entry, emits a "syntax-error" trap file with the
        error location and message, and returns an AST equivalent to an empty
        file so the remaining passes produce consistent output.
        '''
        # Write out diagnostics for the syntax error.
        error = syntax_error_message(ex, module)
        self.diagnostics_writer.write(error)
        # Emit trap for the syntax error
        self.logger.debug("Emitting trap for syntax error in %s", module.path)
        writer = TrapWriter()
        module_id = writer.get_node_id(module)
        # Report syntax error as an alert.
        # Ensure line and col are ints (not None).
        line = ex.lineno if ex.lineno else 0
        if line > len(module.lines):
            # Reported line is past the end of the file: clamp to the last
            # line and point at its final column.
            # NOTE(review): assumes module.lines is non-empty here — confirm.
            line = len(module.lines)
            col = len(module.lines[-1])-1
        else:
            col = ex.offset if ex.offset else 0
        loc_id = writer.get_unique_id()
        # Dummy whole-module location (all zeros) for the module itself.
        writer.write_tuple(u'locations_ast', 'rrdddd',
                           loc_id, module_id, 0, 0, 0, 0)
        syntax_id = u'syntax%d:%d' % (line, col)
        # Point location (1-based column) for the error itself.
        writer.write_tuple(u'locations_ast', 'nrdddd',
                           syntax_id, module_id, line, col+1, line, col+1)
        writer.write_tuple(u'py_syntax_error_versioned', 'nss', syntax_id, ex.msg, get_analysis_major_version())
        trap = writer.get_compressed()
        self.trap_folder.write_trap("syntax-error", module.path, trap)
        #Create an AST equivalent to an empty file, so that the other passes produce consistent output.
        return ast.Module([])

    def _extract_trap_file(self, ast, comments, path):
        '''Run all passes over `ast`, returning compressed trap data.

        Returns None (after logging) if any pass raises.
        NOTE: the `ast` parameter shadows the module-level `ast` import.
        '''
        writer = TrapWriter()
        file_tag = get_source_file_tag(self.src_archive.get_virtual_path(path))
        writer.write_tuple(u'py_Modules', 'g', ast.trap_name)
        writer.write_tuple(u'py_module_path', 'gg', ast.trap_name, file_tag)
        try:
            for ex in self.passes:
                with timers[ex.name]:
                    # The flow pass needs the filename for its diagnostics.
                    if isinstance(ex, FlowPass):
                        ex.set_filename(path)
                    ex.extract(ast, writer)
            with timers['lexical']:
                self.lexical.extract(ast, comments, writer)
            with timers['object']:
                self.object_pass.extract(ast, path, writer)
        except Exception as ex:
            # A failing pass aborts extraction of this module only.
            self.logger.error("Exception extracting module %s: %s", path, ex)
            self.logger.traceback(WARN)
            return None
        return writer.get_compressed()

    def process_source_module(self, module):
        '''Process a Python source module. Checks that module has valid syntax,
        then passes ast, source, etc. to `process_module`.

        Returns the trap data, or None if extraction failed or was skipped.
        '''
        try:
            #Ensure that module does not have invalid syntax before extracting it.
            ast = module.ast
        except SyntaxError as ex:
            self.logger.debug("handle syntax errors is %s", self.handle_syntax_errors)
            if self.handle_syntax_errors:
                ast = self._handle_syntax_error(module, ex)
            else:
                return None
        ast.name = module.name
        ast.kind = module.kind
        ast.trap_name = module.trap_name
        return self.process_module(ast, module.trap_name, module.bytes_source,
                                   module.path, module.comments)

    def process_module(self, ast, module_tag, bytes_source, path, comments):
        '''Process a module, generating the trap file for that module.

        Returns the compressed trap data, or None on failure.
        '''
        self.logger.debug(u"Populating trap file for %s", path)
        ast.trap_name = module_tag
        trap = self._extract_trap_file(ast, comments, path)
        if trap is None:
            return None
        with timers['trap']:
            self.trap_folder.write_trap("python", path, trap)
        try:
            with timers['archive']:
                self.copy_source(bytes_source, module_tag, path)
        except Exception:
            # Archiving the source is best-effort; report and carry on.
            import traceback
            traceback.print_exc()
        return trap

    def copy_source(self, bytes_source, module_tag, path):
        '''Copy a module's source into the archive and remember its path.'''
        if bytes_source is None:
            return
        self.files[module_tag] = self.src_archive.get_virtual_path(path)
        self.src_archive.write(path, bytes_source)

    def write_interpreter_data(self, options):
        '''Write interpreter data, such as version numbers and flags.'''
        def write_flag(name, value):
            # Flags are keyed by the analysis major version so that
            # analyses for different Python versions can coexist.
            writer.write_tuple(u'py_flags_versioned', 'uus', name, value, get_analysis_major_version())
        def write_flags(obj, prefix):
            # Record every public attribute of obj whose value has a
            # directly serialisable type (see FLAG_SAVE_TYPES).
            pre = prefix + u"."
            for name, value in inspect.getmembers(obj):
                if name[0] == "_":
                    continue
                if type(value) in FLAG_SAVE_TYPES:
                    write_flag(pre + name, str(value))
        writer = TrapWriter()
        # Version of the Python interpreter running this extractor.
        for index, name in enumerate((u'major', u'minor', u'micro', u'releaselevel', u'serial')):
            writer.write_tuple(u'py_flags_versioned', 'sss', u'extractor_python_version.' + name, str(sys.version_info[index]), get_analysis_major_version())
        write_flags(sys.flags, u'flags')
        write_flags(sys.float_info, u'float')
        write_flags(self.options, u'options')
        write_flag(u'sys.prefix', sys.prefix)
        path = os.pathsep.join(os.path.abspath(p) for p in options.sys_path)
        write_flag(u'sys.path', path)
        # `path` is reused below for the extractor search path, mapped into
        # the source archive's virtual path space.
        if options.path is None:
            path = ''
        else:
            path = os.pathsep.join(self.src_archive.get_virtual_path(p) for p in options.path)
        if options.language_version:
            write_flag(u'language.version', options.language_version[-1])
        else:
            write_flag(u'language.version', get_analysis_version())
        write_flag(u'extractor.path', path)
        write_flag(u'sys.platform', sys.platform)
        write_flag(u'os.sep', os.sep)
        write_flag(u'os.pathsep', os.pathsep)
        write_flag(u'extractor.version', VERSION)
        if options.context_cost is not None:
            write_flag(u'context.cost', options.context_cost)
        self.trap_folder.write_trap("flags", "$flags", writer.get_compressed())
        if get_analysis_major_version() == 2:
            # Copy the pre-extracted builtins trap
            builtins_trap_data = pkgutil.get_data('semmle.data', 'interpreter2.trap')
            self.trap_folder.write_trap("interpreter", '$interpreter2', builtins_trap_data, extension=".trap")
        else:
            writer = TrapWriter()
            self.object_pass.write_special_objects(writer)
            self.trap_folder.write_trap("interpreter", '$interpreter3', writer.get_compressed())
        # Copy stdlib trap
        if get_analysis_major_version() == 2:
            stdlib_trap_name = '$stdlib_27.trap'
        else:
            stdlib_trap_name = '$stdlib_33.trap'
        stdlib_trap_data = pkgutil.get_data('semmle.data', stdlib_trap_name)
        self.trap_folder.write_trap("stdlib", stdlib_trap_name[:-5], stdlib_trap_data, extension=".trap")

    @staticmethod
    def from_options(options, trap_dir, archive, logger: Logger, diagnostics_writer):
        '''Convenience method to create extractor from options.

        Returns a CachingExtractor wrapping a plain Extractor when a trap
        cache is configured and usable, otherwise the plain Extractor.
        '''
        # Pre-bind so the except clause below cannot hit a NameError when
        # reading options.trap_cache itself raises.
        trap_copy_dir = None
        try:
            trap_copy_dir = options.trap_cache
            caching_extractor = CachingExtractor(trap_copy_dir, options, logger)
        except Exception as ex:
            if options.verbose and trap_copy_dir is not None:
                print ("Failed to create caching extractor: " + str(ex))
            caching_extractor = None
        worker = Extractor(trap_dir, archive, options, logger, diagnostics_writer)
        if caching_extractor:
            caching_extractor.set_worker(worker)
            return caching_extractor
        else:
            return worker

    def stop(self):
        '''No-op; present for interface parity with CachingExtractor.'''
        pass

    def close(self):
        'close() must be called, or some information will not be written'
        #Add name tag to file name, so that multiple extractors do not overwrite each other
        if self.files:
            trapwriter = TrapWriter()
            for filepath in self.files.values():
                trapwriter.write_file(filepath)
            self.trap_folder.write_trap('folders', uuid('python') + '/$files', trapwriter.get_compressed())
            # Reset to an empty dict — copy_source stores into self.files as a
            # mapping, so resetting to set() (as before) would break reuse.
            self.files = {}
        for name, timer in sorted(timers.items()):
            self.logger.debug("Total time for pass '%s': %0.0fms", name, timer.elapsed)
def hash_combine(x, y):
    '''Return a stable digest combining the two strings `x` and `y`.'''
    separator = u":"
    return base64digest(separator.join((x, y)))
class CachingExtractor(object):
    '''The caching extractor has a two stage initialization process.
    After creating the extractor (which will check that the cachedir is valid)
    set_worker(worker) must be called before the CachingExtractor is valid.
    '''

    def __init__(self, cachedir, options, logger: Logger):
        # Fail early when no cache directory is configured; from_options
        # treats this as "caching unavailable" and uses the plain Extractor.
        if cachedir is None:
            raise IOError("No cache directory")
        makedirs(cachedir)
        self.worker = None
        self.cache = Cache.for_directory(cachedir, options.verbose)
        self.logger = logger
        self.split = options.split

    def set_worker(self, worker):
        '''Attach the underlying Extractor used for cache misses.'''
        self.worker = worker

    def get_cache_key(self, module):
        '''Compute the cache key for `module` from its path and source.'''
        key = hash_combine(module.path, module.source)
        if not self.split:
            #Use different key, as not splitting will modify the trap file.
            key = hash_combine(UTRAP_KEY, key)
        return hash_combine(key, module.source)

    def process_source_module(self, module):
        '''Process a Python source module. First look up trap file in cache.
        If no cached trap file is found, then delegate to normal extractor.

        Returns the trap data, or None if extraction failed.
        '''
        if self.worker is None:
            raise Exception("worker is not set")
        key = self.get_cache_key(module)
        trap = self.cache.get(key)
        if trap is None:
            # Cache miss: extract normally and populate the cache.
            trap = self.worker.process_source_module(module)
            if trap is not None:
                self.cache.set(key, trap)
        else:
            self.logger.debug(u"Found cached trap file for %s", module.path)
            self.worker.trap_folder.write_trap("python", module.path, trap)
            try:
                # Source copying is best-effort; log and continue on failure.
                self.worker.copy_source(module.bytes_source, module.trap_name, module.path)
            except Exception:
                self.logger.traceback(WARN)
        return trap

    def process_module(self, ast, module_tag, source_code, path, comments):
        '''Delegate to the worker, returning the trap data.

        The return statement matches Extractor.process_module; previously the
        worker's result was silently dropped.
        '''
        return self.worker.process_module(ast, module_tag, source_code, path, comments)

    def close(self):
        '''Delegate to the underlying extractor; must be called before exit.'''
        self.worker.close()

    def write_interpreter_data(self, sys_path):
        # NOTE(review): despite the parameter name, this appears to receive
        # the full options object (Extractor.write_interpreter_data reads
        # options.sys_path, options.path, etc.) — confirm against callers.
        self.worker.write_interpreter_data(sys_path)

    def stop(self):
        self.worker.stop()