mirror of
https://github.com/github/codeql.git
synced 2026-04-27 17:55:19 +02:00
Initial commit of Python queries and QL libraries.
This commit is contained in:
committed by
Mark Shannon
parent
90c75cd362
commit
5f58824d1b
5
python/ql/src/Expressions/Regex/BackspaceEscape.py
Normal file
5
python/ql/src/Expressions/Regex/BackspaceEscape.py
Normal file
@@ -0,0 +1,5 @@
|
||||
import re

# Sample flagged by the BackspaceEscape query: the first \b sits outside a
# character class (word boundary), the second sits inside one (backspace).
matcher = re.compile(r"\b[\t\b]")


def match_data(data):
    """Return True if ``matcher`` matches at the start of ``data``, else False."""
    return bool(matcher.match(data))
|
||||
40
python/ql/src/Expressions/Regex/BackspaceEscape.qhelp
Normal file
40
python/ql/src/Expressions/Regex/BackspaceEscape.qhelp
Normal file
@@ -0,0 +1,40 @@
|
||||
<!DOCTYPE qhelp PUBLIC
|
||||
"-//Semmle//qhelp//EN"
|
||||
"qhelp.dtd">
|
||||
<qhelp>
|
||||
|
||||
<overview>
|
||||
<p>
|
||||
The meaning of the <code>\b</code> escape sequence inside a regular expression depends on its
|
||||
syntactic context: inside a character class, it matches the backspace character; outside of a
|
||||
character class, it matches a word boundary. This context dependency makes regular expressions
|
||||
hard to read, so the <code>\b</code> escape sequence should not be used inside character classes.
|
||||
</p>
|
||||
|
||||
</overview>
|
||||
<recommendation>
|
||||
|
||||
<p>
|
||||
Replace <code>\b</code> in character classes with the semantically identical escape sequence <code>\x08</code>.
|
||||
</p>
|
||||
|
||||
</recommendation>
|
||||
<example>
|
||||
<p>
|
||||
In the following example, the regular expression contains two uses of <code>\b</code>: in the
|
||||
first case, it matches a word boundary; in the second case, it matches a backspace character.
|
||||
</p>
|
||||
|
||||
<sample src="BackspaceEscape.py" />
|
||||
|
||||
<p>
|
||||
You can make the regular expression easier for other developers to interpret by rewriting it as <code>r"\b[\t\x08]"</code>.
|
||||
</p>
|
||||
|
||||
</example>
|
||||
<references>
|
||||
|
||||
<li>Python Standard Library: <a href="https://docs.python.org/library/re.html">Regular expression operations</a>.</li>
|
||||
|
||||
</references>
|
||||
</qhelp>
|
||||
22
python/ql/src/Expressions/Regex/BackspaceEscape.ql
Normal file
22
python/ql/src/Expressions/Regex/BackspaceEscape.ql
Normal file
@@ -0,0 +1,22 @@
|
||||
/**
|
||||
* @name Backspace escape in regular expression
|
||||
* @description Using '\b' to escape the backspace character in a regular expression is confusing
|
||||
* since it could be mistaken for a word boundary assertion.
|
||||
* @kind problem
|
||||
* @tags maintainability
|
||||
* @problem.severity recommendation
|
||||
* @sub-severity high
|
||||
* @precision very-high
|
||||
* @id py/regex/backspace-escape
|
||||
*/
|
||||
|
||||
import python
|
||||
import semmle.python.regex
|
||||
|
||||
// Report every escape followed by "b" whose position lies strictly inside a
// character set of the regular expression.
from Regex r, int offset
where
  // The character right after `offset` is "b", and `offset` itself satisfies
  // `escapingChar` (presumably the backslash; see semmle.python.regex).
  r.getChar(offset + 1) = "b" and
  r.escapingChar(offset) and
  // Some character set of `r` begins before `offset` and ends after it.
  exists(int setStart, int setEnd |
    r.charSet(setStart, setEnd) and
    setStart < offset and
    offset < setEnd
  )
select r, "Backspace escape in regular expression at offset " + offset + "."
|
||||
@@ -0,0 +1,6 @@
|
||||
import re

# Sample flagged by the DuplicateCharacterInSet query: the square brackets make
# this a character class (a set of single characters), not an alternation
# between "password" and "pwd".
matcher = re.compile(r"[password|pwd]")


def find_password(data):
    """Print a message if ``matcher`` matches at the start of ``data``."""
    if matcher.match(data):
        print("Found password!")
|
||||
@@ -0,0 +1,44 @@
|
||||
<!DOCTYPE qhelp PUBLIC
|
||||
"-//Semmle//qhelp//EN"
|
||||
"qhelp.dtd">
|
||||
<qhelp>
|
||||
|
||||
<overview>
|
||||
<p>
|
||||
Character classes in regular expressions represent sets of characters, so there is no need to specify
|
||||
the same character twice in one character class. Duplicate characters in character classes are at best
|
||||
useless, and may even indicate a latent bug.
|
||||
</p>
|
||||
|
||||
</overview>
|
||||
<recommendation>
|
||||
|
||||
<p>Determine whether a character is simply duplicated or whether the character class was in fact meant as a group.
|
||||
If it is just a duplicate, then remove the duplicate character.
|
||||
If it was supposed to be a group, then replace the square brackets with parentheses.
|
||||
</p>
|
||||
|
||||
|
||||
</recommendation>
|
||||
<example>
|
||||
<p>
|
||||
In the following example, the character class <code>[password|pwd]</code> contains two instances each
|
||||
of the characters <code>d</code>, <code>p</code>, <code>s</code>, and <code>w</code>. The programmer most likely meant
|
||||
to write <code>(password|pwd)</code> (a pattern that matches either the string <code>"password"</code>
|
||||
or the string <code>"pwd"</code>), and accidentally mistyped the enclosing brackets.
|
||||
</p>
|
||||
|
||||
<sample src="DuplicateCharacterInSet.py" />
|
||||
|
||||
<p>
|
||||
To fix this problem, the regular expression should be rewritten to <code>r"(password|pwd)"</code>.
|
||||
</p>
|
||||
|
||||
</example>
|
||||
<references>
|
||||
|
||||
<li>Python Standard Library: <a href="https://docs.python.org/library/re.html">Regular expression operations</a>.</li>
|
||||
<li>Regular-Expressions.info: <a href="http://www.regular-expressions.info/charclass.html">Character Classes or Character Sets</a>.</li>
|
||||
|
||||
</references>
|
||||
</qhelp>
|
||||
34
python/ql/src/Expressions/Regex/DuplicateCharacterInSet.ql
Normal file
34
python/ql/src/Expressions/Regex/DuplicateCharacterInSet.ql
Normal file
@@ -0,0 +1,34 @@
|
||||
/**
|
||||
* @name Duplication in regular expression character class
|
||||
* @description Duplicate characters in a class have no effect and may indicate an error in the regular expression.
|
||||
* @kind problem
|
||||
* @tags reliability
|
||||
* readability
|
||||
* @problem.severity warning
|
||||
* @sub-severity low
|
||||
* @precision very-high
|
||||
* @id py/regex/duplicate-in-character-class
|
||||
*/
|
||||
|
||||
import python
|
||||
import semmle.python.regex
|
||||
|
||||
predicate duplicate_char_in_class(Regex r, string char) {
|
||||
exists(int i, int j, int x, int y, int start, int end |
|
||||
i != x and j != y and
|
||||
start < i and j < end and
|
||||
start < x and y < end and
|
||||
r.character(i, j) and char = r.getText().substring(i, j) and
|
||||
r.character(x, y) and char = r.getText().substring(x, y) and
|
||||
r.charSet(start, end)
|
||||
) and
|
||||
/* Exclude <20> as we use it for any unencodable character */
|
||||
char != "<22>" and
|
||||
//Ignore whitespace in verbose mode
|
||||
not (r.getAMode() = "VERBOSE" and (char = " " or char = "\t" or char = "\r" or char = "\n"))
|
||||
}
|
||||
|
||||
// One result per regular expression and duplicated character.
from Regex r, string char
where
  duplicate_char_in_class(r, char)
select r,
  "This regular expression includes duplicate character '" + char + "' in a set of characters."
|
||||
|
||||
10
python/ql/src/Expressions/Regex/MissingPartSpecialGroup.py
Normal file
10
python/ql/src/Expressions/Regex/MissingPartSpecialGroup.py
Normal file
@@ -0,0 +1,10 @@
|
||||
import re

# Sample flagged by the MissingPartSpecialGroup query: without the '?', the
# text "P<name>" is matched literally and the group is an ordinary, unnamed
# capturing group.
matcher = re.compile(r'(P<name>[\w]+)')


def only_letters(text):
    """Print the captured text if ``matcher`` matches at the start of ``text``.

    NOTE: ``matcher`` defines no group called 'name', so whenever it does
    match, ``m.group('name')`` has no such group to look up.
    """
    m = matcher.match(text)
    if m:
        print("Letters are: " + m.group('name'))


#Fix the pattern by adding the missing '?'
fixed_matcher = re.compile(r'(?P<name>[\w]+)')
|
||||
@@ -0,0 +1,37 @@
|
||||
<!DOCTYPE qhelp PUBLIC
|
||||
"-//Semmle//qhelp//EN"
|
||||
"qhelp.dtd">
|
||||
<qhelp>
|
||||
<overview>
|
||||
<p>
|
||||
One of the problems with using regular expressions is that almost any sequence of characters is a valid pattern.
|
||||
This means that it is easy to omit a necessary character and still have a valid regular expression.
|
||||
Omitting a character in a named capturing group is a specific case which can dramatically change the meaning of a regular expression.
|
||||
</p>
|
||||
|
||||
</overview>
|
||||
<recommendation>
|
||||
|
||||
<p>
|
||||
Examine the regular expression to find and correct any typos.
|
||||
</p>
|
||||
|
||||
</recommendation>
|
||||
<example>
|
||||
<p>
|
||||
In the following example, the regular expression for <code>matcher</code>, <code>r"(P&lt;name&gt;[\w]+)"</code>, is missing a "?" and will
|
||||
match only strings of letters that start with "P&lt;name&gt;", instead of matching any sequence of letters
|
||||
and placing the result in a named group.
|
||||
The fixed version, <code>fixed_matcher</code>, includes the "?" and will work as expected.
|
||||
</p>
|
||||
|
||||
<sample src="MissingPartSpecialGroup.py" />
|
||||
|
||||
</example>
|
||||
<references>
|
||||
|
||||
<li>Python Standard Library: <a href="https://docs.python.org/library/re.html">Regular expression operations</a>.</li>
|
||||
<li>Regular-Expressions.info: <a href="http://www.regular-expressions.info/named.html">Named Capturing Groups</a>.</li>
|
||||
|
||||
</references>
|
||||
</qhelp>
|
||||
20
python/ql/src/Expressions/Regex/MissingPartSpecialGroup.ql
Normal file
20
python/ql/src/Expressions/Regex/MissingPartSpecialGroup.ql
Normal file
@@ -0,0 +1,20 @@
|
||||
/**
|
||||
* @name Missing part of special group in regular expression
|
||||
* @description Incomplete special groups are parsed as normal groups and are unlikely to match the intended strings.
|
||||
* @kind problem
|
||||
* @tags reliability
|
||||
* correctness
|
||||
* @problem.severity warning
|
||||
* @sub-severity high
|
||||
* @precision high
|
||||
* @id py/regex/incomplete-special-group
|
||||
*/
|
||||
|
||||
import python
|
||||
import semmle.python.regex
|
||||
|
||||
// Report regular expressions whose text contains "(P<name>", i.e. a named
// group written without the "?" after the opening parenthesis.
from Regex r, string missing, string part
where
  // The message fragments are constants; only one kind of special group is checked.
  missing = "?" and
  part = "named group" and
  r.getText().regexpMatch(".*\\(P<\\w+>.*")
select r, "Regular expression is missing '" + missing + "' in " + part + "."
|
||||
|
||||
|
||||
11
python/ql/src/Expressions/Regex/UnmatchableCaret.py
Normal file
11
python/ql/src/Expressions/Regex/UnmatchableCaret.py
Normal file
@@ -0,0 +1,11 @@
|
||||
import re

#Regular expression includes a caret, but not at the start.
matcher = re.compile(r"\[^.]*\.css")


def find_css(filename):
    """Print a message if ``matcher`` matches at the start of ``filename``."""
    if matcher.match(filename):
        print("Found it!")


#Regular expression for a css file name
fixed_matcher_css = re.compile(r"[^.]*\.css")
|
||||
|
||||
40
python/ql/src/Expressions/Regex/UnmatchableCaret.qhelp
Normal file
40
python/ql/src/Expressions/Regex/UnmatchableCaret.qhelp
Normal file
@@ -0,0 +1,40 @@
|
||||
<!DOCTYPE qhelp PUBLIC
|
||||
"-//Semmle//qhelp//EN"
|
||||
"qhelp.dtd">
|
||||
<qhelp>
|
||||
<overview>
|
||||
<p>
|
||||
The caret character <code>^</code> anchors a regular expression to the beginning of the input, or
|
||||
(for multi-line regular expressions) to the beginning of a line.
|
||||
If it is preceded by a pattern that must match a non-empty sequence of (non-newline) input characters,
|
||||
then the entire regular expression cannot match anything.
|
||||
</p>
|
||||
|
||||
</overview>
|
||||
<recommendation>
|
||||
|
||||
<p>
|
||||
Examine the regular expression to find and correct any typos.
|
||||
</p>
|
||||
|
||||
</recommendation>
|
||||
<example>
|
||||
<p>
|
||||
In the following example, the regular expression <code>r"\[^.]*\.css"</code> cannot match any
|
||||
string, since it contains a caret assertion preceded by an escape sequence that matches an
|
||||
opening bracket.
|
||||
</p>
|
||||
<p>
|
||||
In the second regular expression, <code>r"[^.]*\.css"</code>, the caret is the first character of a character class, where it negates the class instead of anchoring the match to the start of the string.
|
||||
</p>
|
||||
|
||||
<sample src="UnmatchableCaret.py" />
|
||||
|
||||
</example>
|
||||
<references>
|
||||
|
||||
<li>Python Standard Library: <a href="https://docs.python.org/library/re.html">Regular expression operations</a>.</li>
|
||||
<li>Regular-Expressions.info: <a href="http://www.regular-expressions.info/anchors.html">Start of String and End of String Anchors</a>.</li>
|
||||
|
||||
</references>
|
||||
</qhelp>
|
||||
25
python/ql/src/Expressions/Regex/UnmatchableCaret.ql
Normal file
25
python/ql/src/Expressions/Regex/UnmatchableCaret.ql
Normal file
@@ -0,0 +1,25 @@
|
||||
/**
|
||||
* @name Unmatchable caret in regular expression
|
||||
* @description Regular expressions containing a caret '^' in the middle cannot be matched, whatever the input.
|
||||
* @kind problem
|
||||
* @tags reliability
|
||||
* correctness
|
||||
* @problem.severity error
|
||||
* @sub-severity low
|
||||
* @precision high
|
||||
* @id py/regex/unmatchable-caret
|
||||
*/
|
||||
|
||||
import python
|
||||
import semmle.python.regex
|
||||
|
||||
/**
 * Holds if `r` has the special character "^" at `start`, that position is not
 * reported by `firstItem`, and `r` has neither the MULTILINE nor the VERBOSE mode.
 */
predicate unmatchable_caret(Regex r, int start) {
  r.specialCharacter(start, start + 1, "^") and
  not r.firstItem(start, start + 1) and
  // Regular expressions in these modes are not reported.
  not r.getAMode() = "VERBOSE" and
  not r.getAMode() = "MULTILINE"
}
|
||||
|
||||
// One result per regular expression and offending caret offset.
from Regex r, int offset
where
  unmatchable_caret(r, offset)
select r,
  "This regular expression includes an unmatchable caret at offset " + offset.toString() + "."
|
||||
10
python/ql/src/Expressions/Regex/UnmatchableDollar.py
Normal file
10
python/ql/src/Expressions/Regex/UnmatchableDollar.py
Normal file
@@ -0,0 +1,10 @@
|
||||
import re

#Regular expression that includes a dollar, but not at the end.
matcher = re.compile(r"\.\(\w+$\)")


def find_it(filename):
    """Print a message if ``matcher`` matches at the start of ``filename``."""
    if matcher.match(filename):
        print("Found it!")


#Regular expression anchored to end of input.
fixed_matcher = re.compile(r"\.\(\w+\)$")
|
||||
41
python/ql/src/Expressions/Regex/UnmatchableDollar.qhelp
Normal file
41
python/ql/src/Expressions/Regex/UnmatchableDollar.qhelp
Normal file
@@ -0,0 +1,41 @@
|
||||
<!DOCTYPE qhelp PUBLIC
|
||||
"-//Semmle//qhelp//EN"
|
||||
"qhelp.dtd">
|
||||
<qhelp>
|
||||
<overview>
|
||||
<p>
|
||||
A dollar assertion <code>$</code> in a regular expression only matches at the end of the input, or
|
||||
(for multi-line regular expressions) at the end of a line. If it is followed by a pattern
|
||||
that must match a non-empty sequence of (non-newline) input characters, it cannot possibly match,
|
||||
rendering the entire regular expression unmatchable.
|
||||
</p>
|
||||
|
||||
</overview>
|
||||
<recommendation>
|
||||
|
||||
<p>
|
||||
Examine the regular expression to find and correct any typos.
|
||||
</p>
|
||||
|
||||
</recommendation>
|
||||
<example>
|
||||
<p>
|
||||
In the following example, the regular expression <code>r"\.\(\w+$\)"</code> cannot match any
|
||||
string, since it contains a dollar assertion followed by an escape sequence that matches a
|
||||
closing parenthesis.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
The second regular expression, <code>r"\.\(\w+\)$"</code>, has the dollar at the end and will work as expected.
|
||||
</p>
|
||||
|
||||
<sample src="UnmatchableDollar.py" />
|
||||
|
||||
</example>
|
||||
<references>
|
||||
|
||||
<li>Python Standard Library: <a href="https://docs.python.org/library/re.html">Regular expression operations</a>.</li>
|
||||
<li>Regular-Expressions.info: <a href="http://www.regular-expressions.info/anchors.html">Start of String and End of String Anchors</a>.</li>
|
||||
|
||||
</references>
|
||||
</qhelp>
|
||||
26
python/ql/src/Expressions/Regex/UnmatchableDollar.ql
Normal file
26
python/ql/src/Expressions/Regex/UnmatchableDollar.ql
Normal file
@@ -0,0 +1,26 @@
|
||||
/**
|
||||
* @name Unmatchable dollar in regular expression
|
||||
* @description Regular expressions containing a dollar '$' in the middle cannot be matched, whatever the input.
|
||||
* @kind problem
|
||||
* @tags reliability
|
||||
* correctness
|
||||
* @problem.severity error
|
||||
* @sub-severity low
|
||||
* @precision high
|
||||
* @id py/regex/unmatchable-dollar
|
||||
*/
|
||||
|
||||
import python
|
||||
import semmle.python.regex
|
||||
|
||||
/**
 * Holds if `r` has the special character "$" at `start`, that position is not
 * reported by `lastItem`, and `r` has neither the MULTILINE nor the VERBOSE mode.
 */
predicate unmatchable_dollar(Regex r, int start) {
  r.specialCharacter(start, start + 1, "$") and
  not r.lastItem(start, start + 1) and
  // Regular expressions in these modes are not reported.
  not r.getAMode() = "VERBOSE" and
  not r.getAMode() = "MULTILINE"
}
|
||||
|
||||
// One result per regular expression and offending dollar offset.
from Regex r, int offset
where
  unmatchable_dollar(r, offset)
select r,
  "This regular expression includes an unmatchable dollar at offset " + offset.toString() + "."
|
||||
Reference in New Issue
Block a user