codeql/python/extractor/semmle/cmdline.py

from optparse import OptionParser, OptionGroup, HelpFormatter
import shlex
import sys
import os
import re

from semmle import logging
from semmle.util import VERSION


def make_parser():
    '''Build and return the OptionParser describing the extractor's command-line options.

    No arguments are parsed here; the caller invokes `parse_args` on the
    returned parser. Options are organised into groups (import following,
    module selection, configuration, debug/information, sub-languages and
    advanced). Some options additionally get a `long_help` attribute; this is
    not an optparse feature but a custom attribute that `parse` appends to the
    option's help text when help is requested together with -v.
    '''
    # The built-in help option is disabled; -h/--help is registered below as a
    # plain store_true flag in the debug group.
    parser = OptionParser(add_help_option=False, version='%s' % VERSION)

    import_options = OptionGroup(parser, "Import following options",
        description="Note that -a -n -g and -t are included for backwards compatibility. They are ignored")
    import_options.add_option("--max-import-depth", dest="max_import_depth",
                      help="The maximum depth of imports to follow before halting.",
                      default=None)
    import_options.add_option("-p", "--path", dest="path", default=[], action="append",
                      help="Search path for python modules.")
    import_options.get_option("-p").long_help = (
        "This is the path that the extractor uses when searching for imports. This path is searched before sys.path. "+
        "If the search path (sys.path) during program execution includes any paths that are not in 'sys.path' during extraction, " +
        "then those paths need to be included using this flag.")
    import_options.add_option("-x", "--excludepath", dest="exclude", default=[], action="append",
                      help="Exclude from search path for importing modules.")
    import_options.get_option("-x").long_help = (
        "Excludes this path and all its sub-paths when searching for imports. " +
        "Useful for excluding sub folders of paths specified with the '-p' option, or for excluding items in the 'sys.path' list.")
    # The next four flags are accepted but ignored (see the group description).
    import_options.add_option("-a", "--all-imports", dest="all",
                      help="Ignored", default=False, action="store_true")
    import_options.add_option("-n", "--no-imports", dest="none",
                      help="Ignored", default=False, action="store_true")
    import_options.add_option("-g", "--guess-imports", dest="guess",
                          help="Ignored", default=False, action="store_true")
    import_options.add_option("-t", "--top-imports", dest="top",
                      help="Ignored", default=False, action="store_true")
    parser.add_option_group(import_options)

    module_options = OptionGroup(parser, "Options to determine which modules are to be extracted",
        description="When specifying a list of values, individual values should be separated by the OS path separator for paths, and by commas for names.")
    module_options.add_option("-m", "--main", dest="main",
                      help="A list of files which can be run as the main (or application) script.",
                      default=[], action="append")
    module_options.get_option("-m").long_help = (
        "Files included in the database as 'main' modules will have the name '__main__' rather than a name derived from the path. " +
        "It is perfectly legal to have several '__main__' modules in the database.")
    module_options.add_option("-r", "--recurse-package", dest="recursive", default=[], action="append",
                      help="DEPRECATED. Analyze all modules in this comma-separated list of packages (recursively).")
    module_options.add_option("-y", "--exclude-package", dest="exclude_package", default=[], action="append",
                      help="IGNORED.")
    module_options.add_option("-Y", "--exclude-file", dest="exclude_file", default=[], action="append",
                      help="Exclude file from recursive search of files. Will not affect recursive search by package.")
    module_options.add_option("--filter", dest="path_filter", default=[], action="append",
                      help="""Filter to apply to files from recursive search of files. Will not affect recursive search by package.
                      Filters are of the form [include|exclude]:GLOB_PATTERN""")
    module_options.add_option("--exclude-pattern", dest="exclude_pattern",
                      help = """Exclude any modules matching this regular expression.""",
                      default=None)
    # Kept as a string (or None) here; `parse` converts it to a boolean.
    module_options.add_option("--respect-init", dest="respect_init",
                      help="Respect the presence of '__init__.py' files when considering whether a folder is "
                        "a package. Defaults to True for Python 2 and False for Python 3. "
                        "Legal values are 'True' or 'False' (case-insensitive).",
                      default = None)
    module_options.add_option("-F", "--files", dest="files", default=[], action="append",
                      help = """Treat the paths in this list as source files for modules. Compute the module name from given paths.""")
    module_options.add_option("-R", "--recurse-files", dest="recurse_files", default=[], action="append",
                      help = """Treat the paths in this list as paths for packages, then recurse. Compute the package name from given paths.""")
    parser.add_option_group(module_options)

    config_options = OptionGroup(parser, "Configuration options")
    config_options.add_option("-f","--file", dest="file", default=None,
                      help="File to read options from")
    config_options.add_option("-c", "--trap-cache", dest="trap_cache",
                      help="Directory in which to cache trap files.",
                      default=None)
    config_options.add_option("-z", "--max-procs", dest="max_procs", default=None,
                      help="Maximum number of processes, legal options are "
                      "'all', 'half'(the default) or any positive integer.")
    config_options.add_option("-j", "--introspect-c", dest="introspect_c",
                      help="Option is ignored (retained for backwards compatibility)",
                      default=False, action="store_true")
    config_options.add_option("--ignore-missing-modules", dest="ignore_missing_modules", default=False, action="store_true",
                      help = """Ignore any module specified on the command line that cannot be found. Defaults to false.""")
    config_options.add_option("-u", "--no-symlinks", dest="no_symlinks",
                      help="Do not follow sym-links when normalizing paths",
                      default=False, action="store_true")
    config_options.add_option("-e", "--renamer", dest="renamer",
                      help="""Module containing get_renamer() function which returns
                              a renaming function to be used when normalizing paths.""",
                      default=None)
    config_options.add_option("-o", "--outdir", dest="outdir",
                      help="Output directory for writing trap files.")
    config_options.add_option("--omit-syntax-errors", dest="no_syntax_errors",
                      help="Do not emit trap files or copy source for those files containing syntax errors",
                      default=False, action="store_true")
    config_options.get_option("-o").long_help = " Only useful when running the extractor independently of Semmle's toolchain."
    config_options.add_option("--max-context-cost", dest="context_cost", default=None,
                      help="""Specify the maximum cost of contexts in the points-to analysis.
                      WARNING: Setting this option may cause the analysis to consume a lot more time and memory than normal""")
    config_options.add_option("--colorize", dest="colorize", default=False, action="store_true",
                      help = """Colorize the logging output.""")

    # Both flags write to the same destination; the default (False) is supplied
    # by --extract-stdlib.
    config_options.add_option("--dont-extract-stdlib", dest="extract_stdlib", action="store_false",
        help="This flag is deprecated; not extracting the standard library is now the default.")
    config_options.add_option("--extract-stdlib", dest="extract_stdlib", default=False, action="store_true",
        help="Extract the standard library.")

    parser.add_option_group(config_options)

    debug_options = OptionGroup(parser, "Debug and information options")
    debug_options.add_option("-h", "--help", default=False, action="store_true",
                      help="show this help message and exit. Combine with -v for more details.")
    debug_options.add_option("-v", "--verbose", dest="verbose", help="Verbose output",
                      default=0, action="count")
    debug_options.add_option("--verbosity", dest="verbosity", help="Verbosity of output",
                      default=None)
    debug_options.add_option("--quiet", dest="quiet", help="Quiet output, only report errors or worse.",
                      default=0, action="count")
    debug_options.add_option("-q", "--trace-only", dest="trace_only",
                      help="Trace only, printing modules found. Do not create trap files.",
                      default=False, action="store_true")
    debug_options.add_option("--profile-out", dest="profile_out", default=None,
                      help="Write profiling information to the given file.")
    parser.add_option_group(debug_options)

    lang_options = OptionGroup(parser, "Options for handling sub-languages and extensions")

    # This is a temporary feature until we have full, transparent support for combined 2/3 analysis.
    # Slated to be removed before 1.12 so it should not be documented.
    lang_options.add_option("-l", "--lang", dest="language_version", default=[], action="append",
                              help="Override automatic language version detection and use specified versions(s)")

    parser.add_option_group(lang_options)

    advanced_options = OptionGroup(parser, "Advanced options: For running the extractor in unusual environments.")
    # NOTE: `split` is the only option in this parser whose default is a true value.
    advanced_options.add_option("--dont-split-graph", dest="split", default=True, action="store_false",
                      help = """Do not perform splitting on the flow graph, this will result in increased performance,
                      but at the cost of decreased accuracy in the resulting database. Defaults to false.""")
    # Both unroll flags write to the same destination; the default (False) is
    # supplied by --unroll-graph.
    advanced_options.add_option("--dont-unroll-graph", dest="unroll", action="store_false",
                      help = """DEPRECATED. Do not use.
                      Do not perform selective loop unrolling on the flow graph. This will result in increased performance,
                      but at the cost of decreased accuracy in the resulting database. Defaults to true.""")
    advanced_options.add_option("--unroll-graph", dest="unroll", default=False, action="store_true",
                      help = """Perform selective loop unrolling on the flow graph. This may result in increased accuracy,
                      but at the cost of decreased performance in the resulting database. Defaults to false.""")

    parser.add_option_group(advanced_options)
    return parser

def strip_trailing_slash(path):
    '''Return `path` without trailing OS path separators.

    A path made up solely of separators collapses to a single separator, so
    the root directory is preserved. The empty string is returned unchanged.
    '''
    trimmed = path.rstrip(os.sep)
    if path and not trimmed:
        # Everything was a separator: keep exactly one.
        return os.sep
    return trimmed

def parse(command_line):
    '''Parse `command_line` and return the normalised `(options, args)` pair.

    Processing steps, in order:

    * Options files named with -f/--file are read, split with shell-like
      rules and parsed; values they set replace the current ones and their
      positional arguments are appended to `args`. A file may itself name a
      further options file.
    * If --help was given, the help text is printed (with extended help when
      combined with -v) and the process exits with status 0.
    * `respect_init` becomes a boolean; list-valued options are split on the
      OS path separator (paths) or commas (names); module names given via
      -r, -y or as positional arguments are validated.
    * `verbosity` becomes an integer clamped to [logging.OFF, logging.TRACE].
    * `max_import_depth` becomes an integer, or infinity when unset or negative.
    * `extract_stdlib` may be overridden by environment variables and
      `prune` is set to True.

    Exits the process (via `sys.exit`) on an invalid module name, verbosity
    or maximum import depth.
    '''
    parser = make_parser()
    options, args = parser.parse_args(command_line)
    while options.file:
        with open(options.file) as opt_file:
            file_opts = shlex.split(opt_file.read())
        extra_options, extra_args = parser.parse_args(file_opts)
        options.file = None
        # The optparse.Values class does not provide a public method for updating,
        # so copy over every value that the options file changed from the
        # parser's default. Testing for a difference from the default (rather
        # than for truthiness) matters for `split`, whose default is True: a
        # truthiness test drops `--dont-split-graph` given in the file and
        # resets a `--dont-split-graph` given on the command line.
        # NOTE(review): list options using action="append" appear to be
        # extended in place on the parser's default list, which `options`
        # already references, so skipping them here loses nothing.
        for attr, dval in vars(extra_options).items():
            if dval != parser.defaults.get(attr):
                setattr(options, attr, dval)
        args.extend(extra_args)
    del options.file
    if options.help:
        if options.verbose:
            # Fold the custom `long_help` text into the regular help text.
            for opt in parser._get_all_options():
                if hasattr(opt, "long_help"):
                    if opt.long_help.endswith("."):
                        opt.help += " " + opt.long_help
                    else:
                        opt.help += ". " + opt.long_help
        parser.print_help()
        if options.verbose:
            print(EXTRA_HELP)
        sys.exit(0)
    if options.respect_init is None:
        # In this case we cannot use `util.get_analysis_major_version` because it will only be
        # populated _after_ we've parsed the options.
        options.respect_init = any(version.startswith('2') for version in options.language_version)
    else:
        options.respect_init = options.respect_init.lower() == "true"
    options.main = split_and_flatten(options.main, os.pathsep)
    options.exclude = split_and_flatten(options.exclude, os.pathsep)
    options.recursive = split_and_flatten(options.recursive, ",")
    options.exclude_package = split_and_flatten(options.exclude_package, ",")
    options.files = split_and_flatten(options.files, os.pathsep)
    options.recurse_files = split_and_flatten(options.recurse_files, os.pathsep)
    options.path = split_and_flatten(options.path, os.pathsep)
    options.path = [strip_trailing_slash(item) for item in options.path]
    for name in options.recursive:
        verify_module_name(name)
    for name in options.exclude_package:
        verify_module_name(name)
    for name in args:
        verify_module_name(name)
    if options.verbosity is not None:
        try:
            options.verbosity = int(options.verbosity)
        except ValueError:
            print(options.verbosity + " is not a valid verbosity level.")
            sys.exit(1)
    else:
        options.verbosity = logging.WARN # default logging level
        options.verbosity -= options.quiet
        options.verbosity += options.verbose
    # Clamp to the supported range of logging levels.
    if options.verbosity > logging.TRACE:
        options.verbosity = logging.TRACE
    if options.verbosity < logging.OFF:
        options.verbosity = logging.OFF
    if options.max_import_depth is None:
        max_import_depth = float('inf')
    else:
        # Report a malformed value the same way as an invalid --verbosity
        # instead of letting the ValueError escape as a traceback.
        try:
            max_import_depth = int(options.max_import_depth)
        except ValueError:
            print(options.max_import_depth + " is not a valid maximum import depth.")
            sys.exit(1)
    if max_import_depth < 0:
        # A negative depth means "no limit".
        max_import_depth = float('inf')
    options.max_import_depth = max_import_depth

    if 'CODEQL_EXTRACTOR_PYTHON_DONT_EXTRACT_STDLIB' in os.environ:
        options.extract_stdlib = False
        print("Warning: CODEQL_EXTRACTOR_PYTHON_DONT_EXTRACT_STDLIB is deprecated; the default is now to not extract the standard library.")

    # Checked second, so it wins when both environment variables are set.
    if 'CODEQL_EXTRACTOR_PYTHON_EXTRACT_STDLIB' in os.environ:
        options.extract_stdlib = True

    options.prune = True

    if options.extract_stdlib:
        print("Warning: The analysis will extract the standard library. This behavior is deprecated and will be removed in a future release. We expect it to be gone in CLI version 2.20.0.")

    return options, args

def split_and_flatten(options_list, div):
    '''Split every string in `options_list` on `div` and return all the pieces as one flat list.'''
    return [piece for entry in options_list for piece in entry.split(div)]

def is_legal_module_name(name):
    '''Return True if every dot-separated component of `name` is a valid identifier.'''
    return all(part.isidentifier() for part in name.split("."))

def verify_module_name(name):
    '''Exit the process with an error message unless `name` is a legal module name.'''
    if is_legal_module_name(name):
        return
    message = "'%s' is not a legal module name" % name
    sys.exit(message)

# Additional help text; `parse` prints it after the regular option help when
# --help is combined with -v.
EXTRA_HELP = '''
When combining explicitly listed modules, or any options to include modules, with any option to exclude modules, the exclude options act as filters on the included modules.
Therefore if any module is both excluded and included by a command line option, then it will not be included in the database.
Note that exclusion of a module does not necessarily exclude the modules that are imported by that module.

For example, if module 'a' imports module 'b' and module 'c' also imports module 'b' and the extractor is called with "-y c a",
then 'c' will be excluded but 'b' will be included as it is imported by 'a'.

Exit codes:
    0. OK, finished normally
    1. Failed to extract one or more files.
    2. Interrupted (by ctrl-C or a signal)
    3. Other error.
'''

def output_dir_from_options_and_env(options):
    '''Return the directory for trap files, creating it if it does not exist.

    `options.outdir` is used when it is not None. Otherwise the environment
    variables CODEQL_EXTRACTOR_PYTHON_TRAP_DIR and TRAP_FOLDER are consulted,
    in that order. Raises IOError when none of these supplies a directory.
    '''
    trap_dir = options.outdir
    if trap_dir is None:
        for variable in ('CODEQL_EXTRACTOR_PYTHON_TRAP_DIR', 'TRAP_FOLDER'):
            if variable in os.environ:
                trap_dir = os.environ[variable]
                break
        else:
            # Neither environment variable is set.
            raise IOError(
                "Cannot find trap folder. CODEQL_EXTRACTOR_PYTHON_TRAP_DIR is not set.")
    if not os.path.exists(trap_dir):
        os.makedirs(trap_dir)
    return trap_dir


class MarkdownFormatter (HelpFormatter):
    """Format optparse help as Markdown.

    Headings are emitted as Markdown headings and the options that follow a
    heading are rendered as rows of a two-column "Flags | Description" table.
    """

    def __init__(self,
                 indent_increment=0,
                 max_help_position=40,
                 width=1000,
                 short_first=0):
        HelpFormatter.__init__(
            self, indent_increment, max_help_position, width, short_first)
        # Set by format_heading(); tells format_option() to emit the table
        # header before the first option that follows a heading.
        self.needs_table_heading = False

    def format_usage(self, usage):
        """Return a "Usage" heading followed by the usage string."""
        # `_` is not defined at module level in this file, so bind it here;
        # without this the call below raises NameError.
        from gettext import gettext as _
        return "%s  %s\n" % (self.format_heading(_("Usage")), usage)

    def format_heading(self, heading):
        """Return `heading` as a Markdown heading of depth `self.level + 3`."""
        self.needs_table_heading = True
        return '%s %s\n' % ('#' * (self.level +3), heading)

    def format_description(self, description):
        """Return the description followed by a newline, without re-wrapping."""
        return description + "\n"

    def format_option(self, option):
        """Return one table row for `option`, preceded by the table header if
        this is the first option since the last heading."""
        if self.needs_table_heading:
            self.needs_table_heading = False
            header = "Flags | Description\n------|---------\n"
        else:
            header = ''
        opts = self.option_strings[option]
        # Help text is flattened onto one line so it stays inside its table
        # cell; an option declared without help yields an empty description.
        description = (option.help or "").replace("\n", " ")
        return header + opts + " | " + description + "\n"

def _is_help_line(lines, index, pos):
    '''Return True when `lines[index]` is not the last line, is longer than
    `pos` characters and does not start with "#".'''
    return (
        index + 1 < len(lines)
        and len(lines[index]) > pos
        and not lines[index].startswith("#")
    )

def _format_parser_options():
    '''Return the help text of a freshly built parser, formatted with MarkdownFormatter.'''
    return make_parser().format_help(MarkdownFormatter())


# When run as a script, print the command-line options formatted as Markdown.
if __name__ == "__main__":
    print(_format_parser_options())