Python: Copy Python extractor to codeql repo

This commit is contained in:
Taus
2024-02-28 15:15:21 +00:00
parent 297a17975d
commit 6dec323cfc
369 changed files with 165346 additions and 0 deletions

View File

@@ -0,0 +1,5 @@
from .super_extractor import SuperExtractor
from .py_extractor import PythonExtractor
from .builtin_extractor import BuiltinExtractor, SkippedBuiltin
from .module_printer import ModulePrinter

View File

@@ -0,0 +1,14 @@
from semmle.logging import Logger
class BaseExtractor:
    '''Common base class for extractors.

    Stores the collaborators handed to the constructor and leaves the
    actual extraction logic (`process`) to subclasses.
    '''

    def __init__(self, options, trap_folder, src_archive, logger: Logger):
        '''Remember the options, TRAP folder, source archive and logger.'''
        self.logger = logger
        self.src_archive = src_archive
        self.trap_folder = trap_folder
        self.options = options

    def process(self, unit):
        '''Extract `unit`; this base implementation always raises
        `NotImplementedError`, so subclasses must override it.'''
        raise NotImplementedError

View File

@@ -0,0 +1,41 @@
import sys
from semmle import util
from semmle.python.passes.objects import ObjectPass
from semmle.extractors.base import BaseExtractor
# Sentinel returned by `BuiltinExtractor.process` for a built-in that should be skipped.
# Unlike returning `NotImplemented`, this prevents other extractors from
# attempting to extract the same file/module and/or reporting an extraction error.
# It is a unique `object()` instance.
SkippedBuiltin = object()
class BuiltinExtractor(BaseExtractor):
    '''Extractor that can extract built-in Python modules, such as the `sys` module.'''

    name = "built-in extractor"

    def process(self, unit):
        '''Extract `unit` if it is a built-in module.

        Returns `SkippedBuiltin` for standard-library files when the
        `extract_stdlib` option is off, `NotImplemented` when `unit` is not a
        built-in module (or its shared object cannot be imported), and an
        empty tuple once the module's TRAP output has been written.

        Raises `ImportError` when the import fails for any reason other than
        a broken `.so` file.
        '''
        # Modules in the standard library (e.g. `os`)
        if not self.options.extract_stdlib and \
                isinstance(unit, util.FileExtractable) and \
                unit.path.startswith(util.STDLIB_PATH):
            return SkippedBuiltin
        if not isinstance(unit, util.BuiltinModuleExtractable):
            return NotImplemented
        name = unit.name
        # If a Shared Object file fails to import, we want to prevent the `ImportError` from
        # propagating further up. Instead, we simply behave as if the module is not extractable.
        try:
            module = __import__(name)
        except ImportError as e:
            # `ImportError.path` is None unless the failure is tied to a
            # specific file, so it must be checked before calling `endswith`;
            # otherwise an `AttributeError` would mask the real import error.
            if e.path is not None and e.path.endswith(".so"):
                return NotImplemented
            # Bare `raise` keeps the original traceback intact.
            raise
        writer = util.TrapWriter()
        ObjectPass().extract_builtin(module, writer)
        output = writer.get_compressed()
        self.trap_folder.write_trap("builtin", name, output)
        return ()

    def close(self):
        '''Nothing to release for this extractor.'''
        pass

View File

@@ -0,0 +1,33 @@
from semmle import util
from semmle.extractors.base import BaseExtractor
# Size threshold of half a mebibyte (524288); larger files are skipped.
HALF_MB = 512 * 1024
class FileExtractor(BaseExtractor):
    '''Extractor for extracting arbitrary 'text' files.'''

    name = "file extractor"

    def process(self, unit):
        '''Store the contents of a plain file in the TRAP folder and source archive.

        Returns `NotImplemented` when `unit` is not a file extractable or its
        path is a directory, and an empty tuple otherwise (including when the
        file is skipped for being larger than `HALF_MB`).
        '''
        if not isinstance(unit, util.FileExtractable):
            return NotImplemented
        if util.isdir(unit.path):
            return NotImplemented
        with open(unit.path, "rb") as fd:
            # Read at most one byte past the limit: enough to detect an
            # oversized file without loading all of it into memory.
            data = fd.read(HALF_MB + 1)
        # latin-1 maps every byte to exactly one character, so the byte count
        # equals the length of the decoded text; check before decoding.
        if len(data) > HALF_MB:
            self.logger.info("Skipping overly large file: '%s'", unit.path)
            return ()
        source = data.decode("latin-1")
        file_tag = util.get_source_file_tag(unit.path)
        writer = util.TrapWriter()
        writer.write_tuple("file_contents", "gS", file_tag, source)
        writer.write_file(unit.path)
        output = writer.get_compressed()
        self.trap_folder.write_trap("file", unit.path, output)
        self.src_archive.write(unit.path, data)
        return ()

    def close(self):
        '''Nothing to release for this extractor.'''
        pass

View File

@@ -0,0 +1,31 @@
import sys
from semmle import util
from .py_extractor import PythonExtractor
class ModulePrinter(object):
    '''Pseudo-extractor that only logs what it is given.

    For files it also returns the imports computed by a `PythonExtractor`,
    without extracting anything itself.
    '''

    name = "module printer"

    def __init__(self, options, trap_folder, src_archive, renamer, logger, diagnostics_writer=None):
        '''Create the printer.

        `renamer` is accepted but not used by this class. `diagnostics_writer`
        is optional and is forwarded to `PythonExtractor`, whose constructor
        requires it as its fifth argument.
        '''
        self.logger = logger
        # NOTE(review): this class only calls `_get_module_and_imports`, which
        # does not touch the diagnostics writer in the code visible here, so
        # `None` is presumed to be an acceptable default - confirm against
        # `extractor.Extractor.from_options`.
        self.py_extractor = PythonExtractor(options, trap_folder, src_archive, logger, diagnostics_writer)

    def process(self, unit):
        '''Log the kind of `unit` and return its imports.

        The result is an empty tuple unless `unit` is a file extractable, in
        which case it is whatever `_get_module_and_imports` reports.
        '''
        imports = ()
        if isinstance(unit, util.BuiltinModuleExtractable):
            self.logger.info("Found builtin module '%s'", unit.name)
        elif isinstance(unit, util.FileExtractable):
            self.logger.info("Found file '%s'", unit.path)
            _, imports = self.py_extractor._get_module_and_imports(unit)
        elif isinstance(unit, util.FolderExtractable):
            self.logger.info("Found folder '%s'", unit.path)
        else:
            self.logger.error("Unexpected object: %s", unit)
        return imports

    def close(self):
        '''Nothing to release.'''
        pass

    def write_global_data(self):
        '''No global data is written by the printer.'''
        pass

View File

@@ -0,0 +1,102 @@
import os.path
from semmle import util
from semmle.python import extractor, finder, imports
import re
from semmle.extractors.base import BaseExtractor
from semmle.logging import Logger
class PythonExtractor(BaseExtractor):
    '''Extractor for Python source files.'''

    name = "Python extractor"

    def __init__(self, options, trap_folder, src_archive, logger: Logger, diagnostics_writer):
        '''Build the module extractor, module finder and importer from `options`.'''
        super().__init__(options, trap_folder, src_archive, logger)
        self.module_extractor = extractor.Extractor.from_options(
            options, trap_folder, src_archive, logger, diagnostics_writer)
        self.finder = finder.Finder.from_options_and_env(options, logger)
        self.importer = imports.importer_from_options(options, self.finder, logger)

    def _get_module_and_imports(self, unit):
        '''Return `(py_module, imports)` for `unit`.

        The result is `(None, ())` when `unit` is not a file extractable, when
        the finder cannot map it to a module, or when the module fails to
        load. Otherwise `imports` is a set holding the extractables of the
        imported modules plus those of the enclosing packages.
        '''
        if not isinstance(unit, util.FileExtractable):
            return None, ()
        # Convert unit to module, then load it.
        module = self.finder.from_extractable(unit)
        py_module = None if module is None else module.load(self.logger)
        if py_module is None:
            return None, ()
        required = {
            imported.get_extractable()
            for imported in self.importer.get_imports(module, py_module)
        }
        for item in required:
            self.logger.trace("%s imports %s", module, item)
        # Walk outwards through the enclosing packages, stopping at the
        # first one that has no extractable.
        enclosing = module.package
        while enclosing:
            package_extractable = enclosing.get_extractable()
            if package_extractable is None:
                break
            self.logger.debug("Requiring package %s", package_extractable)
            required.add(package_extractable)
            enclosing = enclosing.package
        return py_module, required

    def process(self, unit):
        '''Extract `unit` as a Python module and return its imports, or
        return `NotImplemented` when no module could be obtained for it.'''
        py_module, required = self._get_module_and_imports(unit)
        if py_module is not None:
            self.module_extractor.process_source_module(py_module)
            return required
        return NotImplemented

    def close(self):
        '''Close the underlying module extractor.'''
        self.module_extractor.close()

    def write_interpreter_data(self, options):
        '''Delegate writing of interpreter data to the module extractor.'''
        self.module_extractor.write_interpreter_data(options)
# Folder names accepted as package names: a word character that is not a
# digit, followed by any number of word characters. `\w*` (rather than `\w+`)
# is needed so that one-character names such as `a` are accepted too.
LEGAL_NAME = re.compile(r"[^\W0-9]\w*$")
class PackageExtractor(object):
    '''Extractor that treats folders as Python packages.'''

    name = "package extractor"

    def __init__(self, options, trap_folder, src_archive, logger):
        '''Keep the output sinks and logger; only `respect_init` is read from `options`.'''
        self.trap_folder = trap_folder
        self.src_archive = src_archive
        self.logger = logger
        self.respect_init = options.respect_init

    def process(self, unit):
        '''Write the TRAP data describing the folder `unit` as a package.

        Returns `NotImplemented` for anything that is not a folder
        extractable, an empty tuple for folders that are ignored or have no
        `__init__.py`, and otherwise a one-element tuple holding the
        extractable for the package's `__init__.py`.
        '''
        if not isinstance(unit, util.FolderExtractable):
            return NotImplemented
        folder = unit.path
        name = os.path.basename(folder)
        init_path = os.path.join(folder, "__init__.py")
        missing_init = self.respect_init and not os.path.exists(init_path)
        if missing_init or not LEGAL_NAME.match(name):
            self.logger.debug("Ignoring non-package folder %s", folder)
            return ()

        writer = util.TrapWriter()
        trap_name = u'py-package:' + folder
        folder_tag = writer.write_folder(self.src_archive.get_virtual_path(folder))
        writer.write_tuple(u'py_Modules', 'g', trap_name)
        writer.write_tuple(u'py_module_path', 'gg', trap_name, folder_tag)

        # Add fake CFG entry node to represent the PackageObject.
        # The placeholder objects stay bound to locals until the method returns.
        fake_entry = object()
        entry_tag = writer.get_labelled_id(fake_entry, trap_name + ":entry-point")
        writer.write_tuple(u'py_flow_bb_node', 'rgrd', entry_tag, trap_name, entry_tag, 0)
        writer.write_tuple(u'py_scope_flow', 'rgd', entry_tag, trap_name, -1)

        # Add dummy location
        fake_location = object()
        loc_tag = writer.get_labelled_id(fake_location, trap_name + ":location")
        writer.write_tuple(u'locations_ast', 'rgdddd', loc_tag, trap_name, 0, 0, 0, 0)

        self.trap_folder.write_trap('$package', folder, writer.get_compressed())
        if not os.path.exists(init_path):
            return ()
        return (util.FileExtractable(init_path),)

    def close(self):
        '''Nothing to release for this extractor.'''
        pass

View File

@@ -0,0 +1,69 @@
from .builtin_extractor import BuiltinExtractor
from .py_extractor import PythonExtractor
from .py_extractor import PackageExtractor
from .file_extractor import FileExtractor
from .thrift_extractor import ThriftExtractor
from semmle.files import TrapFolder, SourceArchive, NullArchive
from semmle.profiling import MillisecondTimer
from semmle.logging import DEBUG, Logger
class SuperExtractor(object):
    '''Extractor that can extract any 'extractable'.
    Delegates to the relevant extractor.'''

    def __init__(self, options, trap_dir, archive, renamer, logger: Logger, diagnostics_writer):
        '''Create the individual extractors and the chain they are tried in.

        When `archive` is None a `NullArchive` is used in place of a
        `SourceArchive`. At DEBUG level or above every extractor is wrapped
        in a `TimingExtractor`.
        '''
        trap_folder = TrapFolder(trap_dir, renamer, logger)
        src_archive = (
            NullArchive(renamer) if archive is None
            else SourceArchive(archive, renamer, logger)
        )
        shared = (options, trap_folder, src_archive, logger)
        builtin = BuiltinExtractor(*shared)
        package = PackageExtractor(*shared)
        generic = FileExtractor(*shared)
        thrift = ThriftExtractor(*shared)
        self.py_extractor = PythonExtractor(options, trap_folder, src_archive, logger, diagnostics_writer)
        # The order of this list is the order in which extractors are tried.
        chain = [builtin, thrift, self.py_extractor, package, generic]
        if logger.level >= DEBUG:
            chain = [TimingExtractor(member, logger) for member in chain]
        self.extractors = chain
        self.logger = logger
        self.options = options

    def process(self, unit):
        '''Return the result of the first extractor that does not answer
        `NotImplemented`; log an error and return `()` when none accepts `unit`.'''
        for candidate in self.extractors:
            self.logger.debug("Trying %s on %s",candidate.name, unit)
            outcome = candidate.process(unit)
            if outcome is NotImplemented:
                continue
            self.logger.debug("%s extracted by the %s.", unit, candidate.name)
            return outcome
        self.logger.error("Could not extract %s", unit)
        return ()

    def add_extractor(self, extractor):
        '''Register an additional extractor.'''
        # Insert after built-in extractor
        self.extractors[1:1] = [extractor]

    def close(self):
        '''Close every extractor in the chain, in order.'''
        for member in self.extractors:
            member.close()

    def write_global_data(self):
        '''Have the Python extractor write its interpreter data.'''
        self.py_extractor.write_interpreter_data(self.options)
class TimingExtractor(object):
    '''Wrapper that times the `process` calls of another extractor.

    Exposes the wrapped extractor's `name`, and logs the accumulated time
    when closed.
    '''

    def __init__(self, extractor, logger):
        '''Wrap `extractor`; timing is reported through `logger`.'''
        self.timer = MillisecondTimer()
        self.extractor = extractor
        self.logger = logger
        self.name = self.extractor.name

    def process(self, unit):
        '''Run the wrapped extractor on `unit` inside the timer and return its result.'''
        with self.timer:
            return self.extractor.process(unit)

    def close(self):
        '''Log the elapsed time, then close the wrapped extractor.'''
        # The name is passed as an argument instead of being concatenated
        # into the format string, so a `%` in an extractor name cannot
        # corrupt the formatting. `elapsed` is divided by 1000 and shown with
        # an `s` suffix (presumably milliseconds converted to seconds).
        self.logger.debug("%s time %0.1fs", self.name, self.timer.elapsed/1000)
        self.extractor.close()

View File

@@ -0,0 +1,28 @@
import os.path
import semmle.thrift
import semmle.util
from semmle.extractors.base import BaseExtractor
class ThriftExtractor(BaseExtractor):
    '''Extractor for Apache thrift IDL files.'''

    name = "thrift extractor"

    def __init__(self, options, trap_folder, src_archive, logger):
        '''Set up the base extractor and the underlying thrift extractor.'''
        super().__init__(options, trap_folder, src_archive, logger)
        self.thrift_extractor = semmle.thrift.Extractor(trap_folder, src_archive)

    def process(self, unit):
        '''Extract `unit` when it is a non-directory file whose path ends in
        `.thrift` and return `()`; otherwise return `NotImplemented`.'''
        is_thrift_file = (
            isinstance(unit, semmle.util.FileExtractable)
            and not semmle.util.isdir(unit.path)
            and unit.path.endswith(".thrift")
        )
        if not is_thrift_file:
            return NotImplemented
        self.thrift_extractor.extract_file(unit.path)
        return ()

    def close(self):
        '''Nothing to release for this extractor.'''
        pass