Initial commit of Python queries and QL libraries.

This commit is contained in:
Mark Shannon
2018-11-19 13:13:39 +00:00
committed by Mark Shannon
parent 90c75cd362
commit 5f58824d1b
725 changed files with 63520 additions and 0 deletions

View File

@@ -0,0 +1,5 @@
import re
# Deliberately uses \b twice: as a word boundary and, inside the class, as backspace.
matcher = re.compile(r"\b[\t\b]")


def match_data(data):
    """Return True when `matcher` matches at the start of `data`, else False."""
    return matcher.match(data) is not None

View File

@@ -0,0 +1,40 @@
<!DOCTYPE qhelp PUBLIC
"-//Semmle//qhelp//EN"
"qhelp.dtd">
<qhelp>
<overview>
<p>
The meaning of the <code>\b</code> escape sequence inside a regular expression depends on its
syntactic context: inside a character class, it matches the backspace character; outside of a
character class, it matches a word boundary. This context dependency makes regular expressions
hard to read, so the <code>\b</code> escape sequence should not be used inside character classes.
</p>
</overview>
<recommendation>
<p>
Replace <code>\b</code> in character classes with the semantically identical escape sequence <code>\x08</code>.
</p>
</recommendation>
<example>
<p>
In the following example, the regular expression contains two uses of <code>\b</code>: in the
first case, it matches a word boundary, in the second case it matches a backspace character.
</p>
<sample src="BackspaceEscape.py" />
<p>
You can make the regular expression easier for other developers to interpret by rewriting it as <code>r"\b[\t\x08]"</code>.
</p>
</example>
<references>
<li>Python Standard Library: <a href="https://docs.python.org/library/re.html">Regular expression operations</a>.</li>
</references>
</qhelp>

View File

@@ -0,0 +1,22 @@
/**
* @name Backspace escape in regular expression
* @description Using '\b' to escape the backspace character in a regular expression is confusing
* since it could be mistaken for a word boundary assertion.
* @kind problem
* @tags maintainability
* @problem.severity recommendation
* @sub-severity high
* @precision very-high
* @id py/regex/backspace-escape
*/
import python
import semmle.python.regex
/**
 * Holds if `r` has an escaping character at `offset` that is immediately
 * followed by `b`, and `offset` lies strictly between the bounds of some
 * character set `r.charSet(start, end)`.
 */
predicate backspace_in_char_set(Regex r, int offset) {
  r.escapingChar(offset) and
  r.getChar(offset + 1) = "b" and
  exists(int start, int end | r.charSet(start, end) and start < offset and offset < end)
}

from Regex r, int offset
where backspace_in_char_set(r, offset)
select r, "Backspace escape in regular expression at offset " + offset + "."

View File

@@ -0,0 +1,6 @@
import re
# Deliberately a character class (square brackets), not a group: this is the flagged mistake.
matcher = re.compile(r"[password|pwd]")


def find_password(data):
    """Print a notice when `matcher` matches at the start of `data`."""
    if matcher.match(data) is None:
        return
    print("Found password!")

View File

@@ -0,0 +1,44 @@
<!DOCTYPE qhelp PUBLIC
"-//Semmle//qhelp//EN"
"qhelp.dtd">
<qhelp>
<overview>
<p>
Character classes in regular expressions represent sets of characters, so there is no need to specify
the same character twice in one character class. Duplicate characters in character classes are at best
useless, and may even indicate a latent bug.
</p>
</overview>
<recommendation>
<p>Determine whether a character is simply duplicated or whether the character class was in fact meant as a group.
If it is just a duplicate, then remove the duplicate character.
If it was supposed to be a group, then replace the square brackets with parentheses.
</p>
</recommendation>
<example>
<p>
In the following example, the character class <code>[password|pwd]</code> contains two instances each
of the characters <code>d</code>, <code>p</code>, <code>s</code>, and <code>w</code>. The programmer most likely meant
to write <code>(password|pwd)</code> (a pattern that matches either the string <code>"password"</code>
or the string <code>"pwd"</code>), and accidentally mistyped the enclosing brackets.
</p>
<sample src="DuplicateCharacterInSet.py" />
<p>
To fix this problem, the regular expression should be rewritten to <code>r"(password|pwd)"</code>.
</p>
</example>
<references>
<li>Python Standard Library: <a href="https://docs.python.org/library/re.html">Regular expression operations</a>.</li>
<li>Regular-Expressions.info: <a href="http://www.regular-expressions.info/charclass.html">Character Classes or Character Sets</a>.</li>
</references>
</qhelp>

View File

@@ -0,0 +1,34 @@
/**
* @name Duplication in regular expression character class
* @description Duplicate characters in a class have no effect and may indicate an error in the regular expression.
* @kind problem
* @tags reliability
* readability
* @problem.severity warning
* @sub-severity low
* @precision very-high
* @id py/regex/duplicate-in-character-class
*/
import python
import semmle.python.regex
predicate duplicate_char_in_class(Regex r, string char) {
exists(int i, int j, int x, int y, int start, int end |
i != x and j != y and
start < i and j < end and
start < x and y < end and
r.character(i, j) and char = r.getText().substring(i, j) and
r.character(x, y) and char = r.getText().substring(x, y) and
r.charSet(start, end)
) and
/* Exclude <20> as we use it for any unencodable character */
char != "<22>" and
//Ignore whitespace in verbose mode
not (r.getAMode() = "VERBOSE" and (char = " " or char = "\t" or char = "\r" or char = "\n"))
}
// Report each regex once per character that `duplicate_char_in_class` finds repeated.
from Regex r, string char, string message
where
  duplicate_char_in_class(r, char) and
  message = "This regular expression includes duplicate character '" + char + "' in a set of characters."
select r, message

View File

@@ -0,0 +1,10 @@
import re
# Deliberately missing the '?' after '(': the group is a plain group, not a named one.
matcher = re.compile(r'(P<name>[\w]+)')


def only_letters(text):
    """Print the text of group 'name' when `matcher` matches at the start of `text`."""
    found = matcher.match(text)
    if found is None:
        return
    print("Letters are: " + found.group('name'))


#Fix the pattern by adding the missing '?'
fixed_matcher = re.compile(r'(?P<name>[\w]+)')

View File

@@ -0,0 +1,37 @@
<!DOCTYPE qhelp PUBLIC
"-//Semmle//qhelp//EN"
"qhelp.dtd">
<qhelp>
<overview>
<p>
One of the problems with using regular expressions is that almost any sequence of characters is a valid pattern.
This means that it is easy to omit a necessary character and still have a valid regular expression.
Omitting a character in a named capturing group is a specific case which can dramatically change the meaning of a regular expression.
</p>
</overview>
<recommendation>
<p>
Examine the regular expression to find and correct any typos.
</p>
</recommendation>
<example>
<p>
In the following example, the regular expression for <code>matcher</code>, <code>r"(P&lt;name&gt;[\w]+)"</code>, is missing a "?" and will
match only strings of letters that start with "P&lt;name&gt;", instead of matching any sequence of letters
and placing the result in a named group.
The fixed version, <code>fixed_matcher</code>, includes the "?" and will work as expected.
</p>
<sample src="MissingPartSpecialGroup.py" />
</example>
<references>
<li>Python Standard Library: <a href="https://docs.python.org/library/re.html">Regular expression operations</a>.</li>
<li>Regular-Expressions.info: <a href="http://www.regular-expressions.info/named.html">Named Capturing Groups</a>.</li>
</references>
</qhelp>

View File

@@ -0,0 +1,20 @@
/**
* @name Missing part of special group in regular expression
* @description Incomplete special groups are parsed as normal groups and are unlikely to match the intended strings.
* @kind problem
* @tags reliability
* correctness
* @problem.severity warning
* @sub-severity high
* @precision high
* @id py/regex/incomplete-special-group
*/
import python
import semmle.python.regex
// Flag regexes whose text contains "(P<word>", i.e. a named group written without its '?'.
// `missing` and `part` only parameterise the alert message.
from Regex r, string missing, string part
where
  missing = "?" and
  part = "named group" and
  r.getText().regexpMatch(".*\\(P<\\w+>.*")
select r, "Regular expression is missing '" + missing + "' in " + part + "."

View File

@@ -0,0 +1,11 @@
import re
#Regular expression includes a caret, but not at the start.
matcher = re.compile(r"\[^.]*\.css")


def find_css(filename):
    """Print a notice when `matcher` matches at the start of `filename`."""
    if matcher.match(filename) is None:
        return
    print("Found it!")


#Regular expression for a css file name
fixed_matcher_css = re.compile(r"[^.]*\.css")

View File

@@ -0,0 +1,40 @@
<!DOCTYPE qhelp PUBLIC
"-//Semmle//qhelp//EN"
"qhelp.dtd">
<qhelp>
<overview>
<p>
The caret character <code>^</code> anchors a regular expression to the beginning of the input, or
(for multi-line regular expressions) to the beginning of a line.
If it is preceded by a pattern that must match a non-empty sequence of (non-newline) input characters,
then the entire regular expression cannot match anything.
</p>
</overview>
<recommendation>
<p>
Examine the regular expression to find and correct any typos.
</p>
</recommendation>
<example>
<p>
In the following example, the regular expression <code>r"\[^.]*\.css"</code> cannot match any
string, since it contains a caret assertion preceded by an escape sequence that matches an
opening bracket.
</p>
<p>
In the second regular expression, <code>r"[^.]*\.css"</code>, the caret is part of a character class, and will not match the start of the string.
</p>
<sample src="UnmatchableCaret.py" />
</example>
<references>
<li>Python Standard Library: <a href="https://docs.python.org/library/re.html">Regular expression operations</a>.</li>
<li>Regular-Expressions.info: <a href="http://www.regular-expressions.info/anchors.html">Start of String and End of String Anchors</a>.</li>
</references>
</qhelp>

View File

@@ -0,0 +1,25 @@
/**
* @name Unmatchable caret in regular expression
* @description Regular expressions containing a caret '^' in the middle cannot be matched, whatever the input.
* @kind problem
* @tags reliability
* correctness
* @problem.severity error
* @sub-severity low
* @precision high
* @id py/regex/unmatchable-caret
*/
import python
import semmle.python.regex
/**
 * Holds if `r` has a `^` special character spanning `start` to `start + 1`
 * for which `r.firstItem` does not hold, and none of the modes of `r` is
 * MULTILINE or VERBOSE.
 */
predicate unmatchable_caret(Regex r, int start) {
  r.specialCharacter(start, start + 1, "^") and
  not r.firstItem(start, start + 1) and
  // Regexes with either of these modes are never reported.
  not (r.getAMode() = "MULTILINE" or r.getAMode() = "VERBOSE")
}
// Report each regex once per offset at which `unmatchable_caret` holds.
from Regex r, int offset, string message
where
  unmatchable_caret(r, offset) and
  message = "This regular expression includes an unmatchable caret at offset " + offset.toString() + "."
select r, message

View File

@@ -0,0 +1,10 @@
import re
#Regular expression that includes a dollar, but not at the end.
matcher = re.compile(r"\.\(\w+$\)")


def find_it(filename):
    """Print a notice when `matcher` matches at the start of `filename`."""
    if matcher.match(filename) is None:
        return
    print("Found it!")


#Regular expression anchored to end of input.
fixed_matcher = re.compile(r"\.\(\w+\)$")

View File

@@ -0,0 +1,41 @@
<!DOCTYPE qhelp PUBLIC
"-//Semmle//qhelp//EN"
"qhelp.dtd">
<qhelp>
<overview>
<p>
A dollar assertion <code>$</code> in a regular expression only matches at the end of the input, or
(for multi-line regular expressions) at the end of a line. If it is followed by a pattern
that must match a non-empty sequence of (non-newline) input characters, it cannot possibly match,
rendering the entire regular expression unmatchable.
</p>
</overview>
<recommendation>
<p>
Examine the regular expression to find and correct any typos.
</p>
</recommendation>
<example>
<p>
In the following example, the regular expression <code>r"\.\(\w+$\)"</code> cannot match any
string, since it contains a dollar assertion followed by an escape sequence that matches a
closing parenthesis.
</p>
<p>
The second regular expression, <code>r"\.\(\w+\)$"</code>, has the dollar at the end and will work as expected.
</p>
<sample src="UnmatchableDollar.py" />
</example>
<references>
<li>Python Standard Library: <a href="https://docs.python.org/library/re.html">Regular expression operations</a>.</li>
<li>Regular-Expressions.info: <a href="http://www.regular-expressions.info/anchors.html">Start of String and End of String Anchors</a>.</li>
</references>
</qhelp>

View File

@@ -0,0 +1,26 @@
/**
* @name Unmatchable dollar in regular expression
* @description Regular expressions containing a dollar '$' in the middle cannot be matched, whatever the input.
* @kind problem
* @tags reliability
* correctness
* @problem.severity error
* @sub-severity low
* @precision high
* @id py/regex/unmatchable-dollar
*/
import python
import semmle.python.regex
/**
 * Holds if `r` has a `$` special character spanning `start` to `start + 1`
 * for which `r.lastItem` does not hold, and none of the modes of `r` is
 * MULTILINE or VERBOSE.
 */
predicate unmatchable_dollar(Regex r, int start) {
  r.specialCharacter(start, start + 1, "$") and
  not r.lastItem(start, start + 1) and
  // Regexes with either of these modes are never reported.
  not (r.getAMode() = "MULTILINE" or r.getAMode() = "VERBOSE")
}
// Report each regex once per offset at which `unmatchable_dollar` holds.
from Regex r, int offset, string message
where
  unmatchable_dollar(r, offset) and
  message = "This regular expression includes an unmatchable dollar at offset " + offset.toString() + "."
select r, message