mirror of
https://github.com/github/codeql.git
synced 2026-04-30 11:15:13 +02:00
Python: Copy Python extractor to codeql repo
This commit is contained in:
5
python/extractor/semmle/extractors/__init__.py
Normal file
5
python/extractor/semmle/extractors/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
|
||||
from .super_extractor import SuperExtractor
|
||||
from .py_extractor import PythonExtractor
|
||||
from .builtin_extractor import BuiltinExtractor, SkippedBuiltin
|
||||
from .module_printer import ModulePrinter
|
||||
14
python/extractor/semmle/extractors/base.py
Normal file
14
python/extractor/semmle/extractors/base.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from semmle.logging import Logger
|
||||
|
||||
|
||||
class BaseExtractor(object):
    """Common ancestor of the concrete extractors.

    It only stores the collaborators handed to the constructor; the actual
    extraction logic is supplied by subclasses through `process`.
    """

    def __init__(self, options, trap_folder, src_archive, logger: Logger):
        """Remember the four collaborators as same-named attributes."""
        self.logger = logger
        self.src_archive = src_archive
        self.trap_folder = trap_folder
        self.options = options

    def process(self, unit):
        """Handle `unit`; always raises here, subclasses must override.

        Raises:
            NotImplementedError: unconditionally in this base class.
        """
        raise NotImplementedError
|
||||
41
python/extractor/semmle/extractors/builtin_extractor.py
Normal file
41
python/extractor/semmle/extractors/builtin_extractor.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import sys
|
||||
from semmle import util
|
||||
from semmle.python.passes.objects import ObjectPass
|
||||
from semmle.extractors.base import BaseExtractor
|
||||
|
||||
# A sentinel object representing a built-in that should be skipped.
# Unlike returning `NotImplemented`, this prevents other extractors from
# attempting to extract the same file/module and/or reporting an extraction error.
SkippedBuiltin = object()


class BuiltinExtractor(BaseExtractor):
    '''Extractor that can extract built-in Python modules, such as the `sys` module.'''

    name = "built-in extractor"

    def process(self, unit):
        '''Extract `unit` if it is a built-in module.

        Returns:
            `SkippedBuiltin` when `unit` is a file under `util.STDLIB_PATH`
            and the `extract_stdlib` option is off;
            `NotImplemented` when `unit` is not a
            `util.BuiltinModuleExtractable`, or when importing it fails
            because of a shared object (`.so`) file;
            `()` after the module's TRAP has been written.

        Raises:
            ImportError: when the import fails for any reason other than a
                broken `.so` file.
        '''
        # Modules in the standard library (e.g. `os`)
        if not self.options.extract_stdlib and \
           isinstance(unit, util.FileExtractable) and \
           unit.path.startswith(util.STDLIB_PATH):
            return SkippedBuiltin
        if not isinstance(unit, util.BuiltinModuleExtractable):
            return NotImplemented
        name = unit.name
        # If a Shared Object file fails to import, we want to prevent the `ImportError` from
        # propagating further up. Instead, we simply behave as if the module is not extractable.
        try:
            module = __import__(name)
        except ImportError as e:
            # `ImportError.path` defaults to None (e.g. when the module simply
            # cannot be found), so guard before calling a string method on it;
            # otherwise the real error is masked by an AttributeError.
            if e.path is not None and e.path.endswith(".so"):
                return NotImplemented
            # Bare `raise` re-raises the same exception with its traceback intact.
            raise
        writer = util.TrapWriter()
        ObjectPass().extract_builtin(module, writer)
        output = writer.get_compressed()
        self.trap_folder.write_trap("builtin", name, output)
        return ()

    def close(self):
        '''Nothing to release for this extractor.'''
        pass
|
||||
33
python/extractor/semmle/extractors/file_extractor.py
Normal file
33
python/extractor/semmle/extractors/file_extractor.py
Normal file
@@ -0,0 +1,33 @@
|
||||
|
||||
from semmle import util
|
||||
from semmle.extractors.base import BaseExtractor
|
||||
|
||||
# 512 KiB (2**19 bytes): files larger than this are skipped by FileExtractor.
HALF_MB = 1 << 19


class FileExtractor(BaseExtractor):
    '''Extractor for extracting arbitrary 'text' files.'''

    name = "file extractor"

    def process(self, unit):
        '''Store the contents of a plain file in the TRAP output and source archive.

        Returns:
            `NotImplemented` when `unit` is not a `util.FileExtractable` or
            its path is a directory; `()` otherwise, including when the file
            is skipped for being larger than `HALF_MB`.
        '''
        if not isinstance(unit, util.FileExtractable):
            return NotImplemented
        if util.isdir(unit.path):
            return NotImplemented
        with open(unit.path, "rb") as fd:
            # Read one byte past the limit at most: that is enough to tell
            # whether the file is too large without loading all of it.
            data = fd.read(HALF_MB + 1)
        # latin-1 maps every byte to exactly one character, so the byte count
        # equals the decoded length and can be checked before decoding.
        if len(data) > HALF_MB:
            self.logger.info("Skipping overly large file: '%s'", unit.path)
            return ()
        source = data.decode("latin-1")
        file_tag = util.get_source_file_tag(unit.path)
        writer = util.TrapWriter()
        writer.write_tuple("file_contents", "gS", file_tag, source)
        writer.write_file(unit.path)
        output = writer.get_compressed()
        self.trap_folder.write_trap("file", unit.path, output)
        self.src_archive.write(unit.path, data)
        return ()

    def close(self):
        '''Nothing to release for this extractor.'''
        pass
|
||||
31
python/extractor/semmle/extractors/module_printer.py
Normal file
31
python/extractor/semmle/extractors/module_printer.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import sys
|
||||
from semmle import util
|
||||
from .py_extractor import PythonExtractor
|
||||
|
||||
class ModulePrinter(object):
    '''Pseudo-extractor that logs what it is given instead of extracting it.

    For files it still resolves the imports (via a `PythonExtractor`) so that
    the caller can keep following the import graph.
    '''

    name = "module printer"

    def __init__(self, options, trap_folder, src_archive, renamer, logger,
                 diagnostics_writer=None):
        '''Create the printer and the `PythonExtractor` used to find imports.

        `renamer` is accepted but not used here. `diagnostics_writer` is
        optional and forwarded to `PythonExtractor`, whose constructor
        requires it; omitting it there raised a TypeError.
        NOTE(review): assumes `None` is an acceptable diagnostics writer
        downstream -- confirm against `extractor.Extractor.from_options`.
        '''
        self.logger = logger
        self.py_extractor = PythonExtractor(
            options, trap_folder, src_archive, logger, diagnostics_writer)

    def process(self, unit):
        '''Log the kind of `unit` and return its imports.

        Only file units can yield imports; every other kind returns `()`.
        '''
        imports = ()
        if isinstance(unit, util.BuiltinModuleExtractable):
            self.logger.info("Found builtin module '%s'", unit.name)
        elif isinstance(unit, util.FileExtractable):
            self.logger.info("Found file '%s'", unit.path)
            # Only the imports are of interest; the loaded module is dropped.
            _, imports = self.py_extractor._get_module_and_imports(unit)
        elif isinstance(unit, util.FolderExtractable):
            self.logger.info("Found folder '%s'", unit.path)
        else:
            self.logger.error("Unexpected object: %s", unit)
        return imports

    def close(self):
        '''No-op.'''
        pass

    def write_global_data(self):
        '''No-op: the printer writes no global data.'''
        pass
|
||||
102
python/extractor/semmle/extractors/py_extractor.py
Normal file
102
python/extractor/semmle/extractors/py_extractor.py
Normal file
@@ -0,0 +1,102 @@
|
||||
import os.path
|
||||
|
||||
from semmle import util
|
||||
from semmle.python import extractor, finder, imports
|
||||
import re
|
||||
from semmle.extractors.base import BaseExtractor
|
||||
from semmle.logging import Logger
|
||||
|
||||
class PythonExtractor(BaseExtractor):
    '''Extractor for Python source files.'''

    name = "Python extractor"

    def __init__(self, options, trap_folder, src_archive, logger: Logger, diagnostics_writer):
        '''Build the module extractor, module finder and importer from `options`.'''
        super().__init__(options, trap_folder, src_archive, logger)
        self.module_extractor = extractor.Extractor.from_options(
            options, trap_folder, src_archive, logger, diagnostics_writer)
        self.finder = finder.Finder.from_options_and_env(options, logger)
        self.importer = imports.importer_from_options(options, self.finder, logger)

    def _get_module_and_imports(self, unit):
        '''Load `unit` as a Python module and collect what it depends on.

        Returns a pair `(py_module, dependencies)`. The pair is `(None, ())`
        when `unit` is not a file, cannot be mapped to a module, or fails to
        load. Otherwise `dependencies` is a set holding the extractables of
        the module's imports plus those of its enclosing packages.
        '''
        if not isinstance(unit, util.FileExtractable):
            return None, ()
        # Convert unit to module, then load it (skipped if there is no module).
        module = self.finder.from_extractable(unit)
        py_module = None if module is None else module.load(self.logger)
        if py_module is None:
            return None, ()
        required = {
            dep.get_extractable()
            for dep in self.importer.get_imports(module, py_module)
        }
        for dep in required:
            self.logger.trace("%s imports %s", module, dep)
        # Walk up the chain of enclosing packages, stopping at the first one
        # that has no extractable.
        pkg = module.package
        while pkg:
            pkg_unit = pkg.get_extractable()
            if pkg_unit is None:
                break
            self.logger.debug("Requiring package %s", pkg_unit)
            required.add(pkg_unit)
            pkg = pkg.package
        return py_module, required

    def process(self, unit):
        '''Extract `unit`; return its dependencies, or `NotImplemented` if it did not load.'''
        py_module, required = self._get_module_and_imports(unit)
        if py_module is not None:
            self.module_extractor.process_source_module(py_module)
            return required
        return NotImplemented

    def close(self):
        '''Close the underlying module extractor.'''
        self.module_extractor.close()

    def write_interpreter_data(self, options):
        '''Delegate writing of interpreter data to the module extractor.'''
        self.module_extractor.write_interpreter_data(options)
|
||||
|
||||
# Matches a folder name that is usable as a Python package name: a letter or
# underscore followed by any number of word characters. `\w*` (not `\w+`) so
# that single-character names such as `a` or `_` are accepted as well.
LEGAL_NAME = re.compile(r"[^\W0-9]\w*$")
|
||||
|
||||
class PackageExtractor(object):
    '''Extractor that turns a folder into a Python package entry.'''

    name = "package extractor"

    def __init__(self, options, trap_folder, src_archive, logger):
        '''Keep the output sinks and logger, and read `respect_init` from `options`.'''
        self.respect_init = options.respect_init
        self.logger = logger
        self.src_archive = src_archive
        self.trap_folder = trap_folder

    def process(self, unit):
        '''Write the TRAP describing the package rooted at `unit.path`.

        Returns `NotImplemented` when `unit` is not a folder, `()` when the
        folder is ignored or has no `__init__.py`, and otherwise a 1-tuple
        holding the `__init__.py` file to extract next.
        '''
        if not isinstance(unit, util.FolderExtractable):
            return NotImplemented
        folder = unit.path
        name = os.path.split(folder)[1]
        init_path = os.path.join(folder, "__init__.py")
        # The existence check only runs when `respect_init` is set.
        missing_init = self.respect_init and not os.path.exists(init_path)
        if missing_init or not LEGAL_NAME.match(name):
            self.logger.debug("Ignoring non-package folder %s", folder)
            return ()

        writer = util.TrapWriter()
        trap_name = u'py-package:' + folder
        folder_tag = writer.write_folder(self.src_archive.get_virtual_path(folder))
        writer.write_tuple(u'py_Modules', 'g', trap_name)
        writer.write_tuple(u'py_module_path', 'gg', trap_name, folder_tag)

        # Add fake CFG entry node to represent the PackageObject.
        entry_tag = writer.get_labelled_id(object(), trap_name + ":entry-point")
        writer.write_tuple(u'py_flow_bb_node', 'rgrd', entry_tag, trap_name, entry_tag, 0)
        writer.write_tuple(u'py_scope_flow', 'rgd', entry_tag, trap_name, -1)

        # Add dummy location.
        loc_tag = writer.get_labelled_id(object(), trap_name + ":location")
        writer.write_tuple(u'locations_ast', 'rgdddd', loc_tag, trap_name, 0, 0, 0, 0)

        self.trap_folder.write_trap('$package', folder, writer.get_compressed())
        return (util.FileExtractable(init_path),) if os.path.exists(init_path) else ()

    def close(self):
        '''Nothing to release for this extractor.'''
        pass
|
||||
69
python/extractor/semmle/extractors/super_extractor.py
Normal file
69
python/extractor/semmle/extractors/super_extractor.py
Normal file
@@ -0,0 +1,69 @@
|
||||
from .builtin_extractor import BuiltinExtractor
|
||||
from .py_extractor import PythonExtractor
|
||||
from .py_extractor import PackageExtractor
|
||||
from .file_extractor import FileExtractor
|
||||
from .thrift_extractor import ThriftExtractor
|
||||
from semmle.files import TrapFolder, SourceArchive, NullArchive
|
||||
from semmle.profiling import MillisecondTimer
|
||||
from semmle.logging import DEBUG, Logger
|
||||
|
||||
class SuperExtractor(object):
    '''Front-end extractor accepting any 'extractable'.

    Each unit is offered to a chain of specialised extractors in turn; the
    first one that does not answer `NotImplemented` handles it.'''

    def __init__(self, options, trap_dir, archive, renamer, logger: Logger, diagnostics_writer):
        '''Create the output sinks and the ordered chain of extractors.'''
        trap_folder = TrapFolder(trap_dir, renamer, logger)
        # Without an archive location, use the null archive.
        src_archive = (
            NullArchive(renamer) if archive is None
            else SourceArchive(archive, renamer, logger)
        )
        shared = (options, trap_folder, src_archive, logger)
        builtin = BuiltinExtractor(*shared)
        package = PackageExtractor(*shared)
        generic = FileExtractor(*shared)
        thrift = ThriftExtractor(*shared)
        self.py_extractor = PythonExtractor(*shared, diagnostics_writer)
        # Order matters: earlier extractors get the first chance at a unit.
        chain = [builtin, thrift, self.py_extractor, package, generic]
        if logger.level >= DEBUG:
            # Wrap every extractor so its cumulative time is logged on close.
            chain = [TimingExtractor(member, logger) for member in chain]
        self.extractors = chain
        self.logger = logger
        self.options = options

    def process(self, unit):
        '''Return the result of the first extractor that handles `unit`.

        If every extractor answers `NotImplemented`, an error is logged and
        `()` is returned.
        '''
        for candidate in self.extractors:
            self.logger.debug("Trying %s on %s",candidate.name, unit)
            outcome = candidate.process(unit)
            if outcome is not NotImplemented:
                self.logger.debug("%s extracted by the %s.", unit, candidate.name)
                return outcome
        self.logger.error("Could not extract %s", unit)
        return ()

    def add_extractor(self, extractor):
        '''Register an extra extractor at position 1 of the chain.'''
        # Insert after built-in extractor
        self.extractors.insert(1, extractor)

    def close(self):
        '''Close every extractor in the chain, in order.'''
        for member in self.extractors:
            member.close()

    def write_global_data(self):
        '''Have the Python extractor write the interpreter data.'''
        self.py_extractor.write_interpreter_data(self.options)
|
||||
|
||||
|
||||
class TimingExtractor(object):
    '''Wrapper that runs another extractor's `process` under a timer.

    The timer's `elapsed` value is logged (divided by 1000) when the wrapper
    is closed.
    '''

    def __init__(self, extractor, logger):
        '''Wrap `extractor`, mirroring its `name`, with a fresh timer.'''
        self.timer = MillisecondTimer()
        self.logger = logger
        self.extractor = extractor
        self.name = extractor.name

    def process(self, unit):
        '''Forward to the wrapped extractor while the timer is active.'''
        with self.timer:
            result = self.extractor.process(unit)
        return result

    def close(self):
        '''Log the elapsed time, then close the wrapped extractor.'''
        seconds = self.timer.elapsed/1000
        self.logger.debug(self.name + " time %0.1fs", seconds)
        self.extractor.close()
|
||||
28
python/extractor/semmle/extractors/thrift_extractor.py
Normal file
28
python/extractor/semmle/extractors/thrift_extractor.py
Normal file
@@ -0,0 +1,28 @@
|
||||
|
||||
|
||||
import os.path
|
||||
import semmle.thrift
|
||||
import semmle.util
|
||||
from semmle.extractors.base import BaseExtractor
|
||||
|
||||
class ThriftExtractor(BaseExtractor):
    '''Extractor for Apache thrift IDL (`.thrift`) files.'''

    name = "thrift extractor"

    def __init__(self, options, trap_folder, src_archive, logger):
        '''Set up the base attributes and the underlying thrift extractor.'''
        super().__init__(options, trap_folder, src_archive, logger)
        self.thrift_extractor = semmle.thrift.Extractor(trap_folder, src_archive)

    def process(self, unit):
        '''Extract `unit` if it is a non-directory file ending in `.thrift`.

        Returns `()` after extraction, `NotImplemented` for anything else.
        '''
        # Checks short-circuit in order: file unit, not a directory, suffix.
        is_thrift_file = (
            isinstance(unit, semmle.util.FileExtractable)
            and not semmle.util.isdir(unit.path)
            and unit.path.endswith(".thrift")
        )
        if not is_thrift_file:
            return NotImplemented
        self.thrift_extractor.extract_file(unit.path)
        return ()

    def close(self):
        '''Nothing to release for this extractor.'''
        pass
|
||||
Reference in New Issue
Block a user