mirror of
https://github.com/github/codeql.git
synced 2026-04-28 18:25:24 +02:00
Python: Copy Python extractor to codeql repo
This commit is contained in:
364
python/extractor/semmle/projectlayout.py
Normal file
364
python/extractor/semmle/projectlayout.py
Normal file
@@ -0,0 +1,364 @@
|
||||
#
|
||||
# This is a port of com.semmle.extractor.projectstructure.ProjectLayout
|
||||
# and must be kept in sync
|
||||
#
|
||||
|
||||
"""Project-layout files are used to transform or exclude paths. The format
|
||||
is described at https://semmle.com/wiki/display/SD/project-layout+format"""
|
||||
|
||||
# NOTE(review): Python only honours the lowercase name `__all__` when
# resolving `from module import *`; this upper-case `__ALL__` is an ordinary
# module variable and does not restrict what a star-import exposes.
# Confirm whether `__all__` was intended before renaming it, since that
# would change what star-imports of this module see.
__ALL__ = [ 'load', 'ProjectLayout' ]
|
||||
|
||||
import collections
|
||||
import re
|
||||
from functools import total_ordering
|
||||
import sys
|
||||
|
||||
def get_renamer(filename):
    """Build a path-renaming function from the project-layout in 'filename'.

    The returned callable maps a path to its artificial path according to
    the loaded layout; when the layout yields no artificial path (None),
    the original path is returned unchanged."""

    project_layout = load(filename)

    def rename(path):
        mapped = project_layout.artificial_path(path)
        if mapped is None:
            return path
        return mapped

    return rename
|
||||
|
||||
def load(filename):
    """Load a project-layout file from 'filename'.

    The file is read as bytes and decoded as UTF-8; it is then split on
    newline characters and each line is stripped of surrounding whitespace
    before being handed to ProjectLayout."""
    with open(filename, 'rb') as layout_file:
        raw = layout_file.read()
    text = raw.decode('utf-8')
    stripped_lines = []
    for raw_line in text.split('\n'):
        stripped_lines.append(raw_line.strip())
    return ProjectLayout(stripped_lines)
|
||||
|
||||
def _escape_string_literal_for_regexp(literal, preserve):
|
||||
ESCAPE = u"(){}[].^$+\\*?"
|
||||
def escape(char):
|
||||
if char in ESCAPE and not char in preserve:
|
||||
return u"\\" + char
|
||||
else:
|
||||
return char
|
||||
return u"".join(escape(c) for c in literal)
|
||||
|
||||
|
||||
class ProjectLayout(object):
    """ A project-layout file optionally begins with an '@'
    followed by the name the project should be renamed to.
    Optionally, it can then be followed by a list of
    include/exclude patterns (see below) which are kept
    as untransformed paths. This is followed by one or
    more clauses. Each clause has the following form:

        #virtual-path
        path/to/include
        another/path/to/include
        -/path/to/include/except/this

    i.e. one or more paths (to include) and zero or more paths
    prefixed by minus-signs (to exclude)."""

    def __init__(self, lines):
        """Construct a project-layout object from an array of strings, each
        corresponding to one line of the project-layout. This constructor is
        for testing. Usually, use the 'load' function.

        Empty lines are skipped. An exception is raised (with the 1-based
        line number) if more than one '@' line is present or if a '#'
        virtual path prefix is repeated."""

        self._project = None
        # Map from virtual path prefixes (following the '#' in the
        # project-layout) to the sequence of patterns that fall into that
        # section. Declared as an OrderedDict since iteration order matters --
        # the blocks are processed in the same order as they occur in the
        # project-layout.
        self._rewrites = collections.OrderedDict()
        # Patterns that appear before the first '#' line go into an initial
        # section whose virtual prefix is the empty string.
        virtual = u""
        section = _Section()
        self._rewrites[virtual] = section
        for num, line in enumerate(lines, 1):
            if not line:
                continue
            marker = line[0]
            if marker == u'@':
                if self._project is not None:
                    raise _error(u"Multiple project names in project-layout", num)
                self._project = self._tail(line)
            elif marker == u'#':
                virtual = self._tail(line)
                if virtual in self._rewrites:
                    raise _error(u"Duplicate virtual path prefix " + virtual, num)
                section = _Section(virtual)
                self._rewrites[virtual] = section
            elif marker == u'-':
                # Exclude rule: constructed without a virtual prefix.
                section.add(_Rewrite(self._tail(line), num))
            else:
                # Include rule: mapped below the current virtual prefix.
                section.add(_Rewrite(line, num, virtual))

    @classmethod
    def _tail(cls, line):
        """Return 'line' without its first character, stripped of
        surrounding whitespace."""
        return line[1:].strip()

    def project_name(self, default=None):
        """ Get the project name, if specified by the project-layout.
        If default is specified, it will be returned if no project name
        is specified. Otherwise, an exception is thrown."""

        if self._project is not None:
            return self._project
        if default is not None:
            return default
        raise Exception(u"Project specificatino does not define a project name.")

    def sections(self):
        """return the section headings (aka virtual paths)"""
        return self._rewrites.keys()

    def section_is_empty(self, section):
        """Determine whether or not a particular section in this
        project-layout is empty (has no include/exclude patterns).

        Raises an exception if no section with that heading exists."""

        if section in self._rewrites:
            return self._rewrites[section].is_empty()
        raise Exception(u"Section does not exist: " + section)

    def rename_section(self, old, new):
        """Rename a section in this project-layout.

        Raises an exception if no section named 'old' exists. The section
        is removed from the mapping and stored again under 'new'."""

        if old not in self._rewrites:
            raise Exception(u"Section does not exist: " + old)
        section = self._rewrites.pop(old)
        section.rename(new)
        self._rewrites[new] = section

    def sub_layout(self, section_name):
        """Return a project-layout file for just one of the sections in this
        project-layout. This is done by copying all the rules from the
        section, and changing the section heading (beginning with '#')
        to a project name (beginning with '@').

        Raises an exception if no section named 'section_name' exists."""

        section = self._rewrites.get(section_name, None)
        if section is None:
            # Report the requested name; the lookup result is None here and
            # cannot be concatenated into the message.
            raise Exception(u"Section does not exist: " + section_name)
        return section.to_layout()

    def artificial_path(self, path):
        """Maps a path to its corresponding artificial path according to the
        rules in this project-layout. If the path is excluded (either
        explicitly, or because it is not mentioned in the project-layout)
        then None is returned.

        Paths should start with a leading forward-slash."""

        prefixes = _Section.prefixes(path)
        # Sections are consulted in project-layout order; the first section
        # whose best-matching rule yields a rewritten path wins.
        for section in self._rewrites.values():
            rewrite = section.match(prefixes)
            if rewrite is None:
                continue
            rewritten = rewrite.rewrite(path)
            if rewritten is not None:
                return rewritten
        return None

    def include_file(self, path):
        """Checks whether a path should be included in the project specified by
        this file. A file is included if it is mapped to some location.

        Paths should start with a leading forward-slash."""

        return self.artificial_path(path) is not None
|
||||
|
||||
|
||||
class _Section(object):
|
||||
"""Each section corresponds to a block beginning with '#some/path'. There
|
||||
is also an initial section for any include/exclude patterns before the
|
||||
first '#'."""
|
||||
|
||||
def __init__(self, virtual=u""):
|
||||
self._virtual = virtual
|
||||
self._simple_rewrites = collections.OrderedDict()
|
||||
self._complex_rewrites = []
|
||||
|
||||
def to_layout(self):
|
||||
result = []
|
||||
rewrites = []
|
||||
rewrites.extend(self._simple_rewrites.values())
|
||||
rewrites.extend(self._complex_rewrites)
|
||||
rewrites.sort()
|
||||
|
||||
result.append(u'@' + self._virtual)
|
||||
for rewrite in rewrites:
|
||||
result.append(str(rewrite))
|
||||
result.append(u'')
|
||||
return u'\n'.join(result)
|
||||
|
||||
def rename(self, new):
|
||||
self._virtual = new
|
||||
for rewrite in self._simple_rewrites.values():
|
||||
rewrite.virtual = new
|
||||
for rewrite in self._complex_rewrites:
|
||||
rewrite.virtual = new
|
||||
|
||||
def add(self, rewrite):
|
||||
if rewrite.is_simple():
|
||||
self._simple_rewrites[rewrite.simple_prefix()] = rewrite
|
||||
else:
|
||||
self._complex_rewrites.append(rewrite)
|
||||
|
||||
def is_empty(self):
|
||||
return not self._simple_rewrites and not self._complex_rewrites
|
||||
|
||||
@classmethod
|
||||
def prefixes(cls, path):
|
||||
result = [path]
|
||||
i = len(path)
|
||||
while (i > 1):
|
||||
i = path.rfind(u'/', 0, i)
|
||||
result.append(path[:i])
|
||||
result.append(u"/")
|
||||
return result;
|
||||
|
||||
def match(self, prefixes):
|
||||
best = None
|
||||
for prefix in prefixes:
|
||||
match = self._simple_rewrites.get(prefix, None)
|
||||
if match is not None:
|
||||
if best is None or best._line < match._line:
|
||||
best = match;
|
||||
# Last matching rewrite 'wins'
|
||||
for rewrite in reversed(self._complex_rewrites):
|
||||
if rewrite.matches(prefixes[0]):
|
||||
if best is None or best._line < rewrite._line:
|
||||
best = rewrite;
|
||||
# no point continuing
|
||||
break;
|
||||
return best;
|
||||
|
||||
@total_ordering
|
||||
class _Rewrite(object):
|
||||
"""Each Rewrite corresponds to a single include or exclude line in the
|
||||
project-layout. For example, for following clause there would be three
|
||||
Rewrite objects:
|
||||
|
||||
#Source
|
||||
/src
|
||||
/lib
|
||||
-/src/tests
|
||||
|
||||
For includes use the two-argument constructor; for excludes the
|
||||
one-argument constructor."""
|
||||
|
||||
# The intention is to allow the ** wildcard when followed by a slash only. The
|
||||
# following should be invalid:
|
||||
# - a / *** / b (too many stars)
|
||||
# - a / ** (** at the end should be omitted)
|
||||
# - a / **b (illegal)
|
||||
# - a / b** (illegal)
|
||||
# - ** (the same as a singleton '/')
|
||||
# This regular expression matches ** when followed by a non-/ character,
|
||||
# or the end of string.
|
||||
_verify_stars = re.compile(u".*(?:\\*\\*[^/].*|\\*\\*$|[^/]\\*\\*.*)")
|
||||
|
||||
def __init__(self, path, line, virtual=None):
|
||||
if virtual is None:
|
||||
exclude = path
|
||||
self._line = line;
|
||||
self._original = u'-' + exclude;
|
||||
if not exclude.startswith(u"/"):
|
||||
exclude = u'/' + exclude
|
||||
if exclude.find(u"//") != -1:
|
||||
raise _error(u"Illegal '//' in exclude path", line)
|
||||
if self._verify_stars.match(exclude):
|
||||
raise _error(u"Illegal use of '**' in exclude path", line)
|
||||
if exclude.endswith(u"/"):
|
||||
exclude = exclude[0 : -1]
|
||||
self._pattern = self._compile_prefix(exclude);
|
||||
exclude = exclude.replace(u"//", u"/")
|
||||
if len(exclude) > 1 and exclude.endswith(u"/"):
|
||||
exclude = exclude[0 : -1]
|
||||
self._simple = None if exclude.find(u"*") != -1 else exclude
|
||||
else:
|
||||
include = path
|
||||
self._line = line;
|
||||
self._original = include;
|
||||
if not include.startswith(u"/"):
|
||||
include = u'/' + include
|
||||
doubleslash = include.find(u"//")
|
||||
if doubleslash != include.find(u"//"):
|
||||
raise _error(u"More than one '//' in include path (project-layout)", line)
|
||||
if self._verify_stars.match(include):
|
||||
raise _error(u"Illegal use of '**' in include path (project-layout)", line)
|
||||
if not virtual.startswith(u"/"):
|
||||
virtual = u"/" + virtual
|
||||
if virtual.endswith(u"/"):
|
||||
virtual = virtual[0 : -1]
|
||||
self._pattern = self._compile_prefix(include);
|
||||
include = include.replace(u"//", u"/");
|
||||
if len(include) > 1 and include.endswith(u"/"):
|
||||
include = include[0 : -1]
|
||||
self._simple = None if include.find(u"*") != -1 else include
|
||||
self._virtual = virtual;
|
||||
|
||||
@classmethod
|
||||
def _compile_prefix(cls, pattern):
|
||||
"""
|
||||
Patterns are matched by translation to regex. The following invariants
|
||||
are assumed to hold:
|
||||
|
||||
- The pattern starts with a '/'.
|
||||
- There are no occurrences of '**' that is not surrounded by slashes
|
||||
(unless it is at the start of a pattern).
|
||||
- There is at most one double slash.
|
||||
|
||||
The result of the translation has precisely one capture group, which
|
||||
(after successful matching) will contain the part of the path that
|
||||
should be glued to the virtual prefix.
|
||||
|
||||
It proceeds by starting the capture group either after the double
|
||||
slash or at the start of the pattern, and then replacing '*' with
|
||||
'[^/]*' (meaning any number of non-slash characters) and '/**' with
|
||||
'(?:|/.*)' (meaning empty string or a slash followed by any number of
|
||||
characters including '/').
|
||||
|
||||
The pattern is terminated by the term '(?:/.*|$)', saying 'either the
|
||||
next character is a '/' or the string ends' -- this avoids accidental
|
||||
matching of partial directory/file names.
|
||||
|
||||
IMPORTANT: Run the ProjectLayoutTests when changing this!
|
||||
"""
|
||||
|
||||
pattern = _escape_string_literal_for_regexp(pattern, u"*")
|
||||
if pattern.find(u"//") != -1:
|
||||
pattern = pattern.replace(u"//", u"(/")
|
||||
else:
|
||||
pattern = u"(" + pattern
|
||||
if pattern.endswith(u"/"):
|
||||
pattern = pattern[0 : -1]
|
||||
pattern = pattern.replace(u"/**", u"-///-")
|
||||
pattern = pattern.replace(u"*", u"[^/]*")
|
||||
pattern = pattern.replace(u"-///-", u"(?:|/.*)")
|
||||
return re.compile(pattern + u"(?:/.*|$))")
|
||||
|
||||
def is_simple(self):
|
||||
return self._simple is not None
|
||||
|
||||
def simple_prefix(self):
|
||||
"""Returns the path included/excluded by this rewrite, if it is
|
||||
simple, or <code>null</code> if it is not."""
|
||||
|
||||
return self._simple
|
||||
|
||||
def matches(self, path):
|
||||
return bool(self._pattern.match(path))
|
||||
|
||||
def rewrite(self, path):
|
||||
if self._virtual is None:
|
||||
return None
|
||||
matcher = self._pattern.match(path)
|
||||
if not matcher:
|
||||
return None
|
||||
return self._virtual + matcher.group(1);
|
||||
|
||||
def __unicode__(self):
|
||||
return self._original
|
||||
|
||||
def __lt__(self, other):
|
||||
return self._line < other._line
|
||||
|
||||
__hash__ = None
|
||||
|
||||
def _error(message, line):
|
||||
raise Exception(u"%s (line %d)" % (message, line))
|
||||
Reference in New Issue
Block a user