Initial commit of Python queries and QL libraries.

2026-04-27 17:55:19 +02:00 · 2018-11-19 13:13:39 +00:00
parent 90c75cd362
commit 5f58824d1b
725 changed files with 63520 additions and 0 deletions
--- a/python/ql/src/Expressions/Regex/BackspaceEscape.py
+++ b/python/ql/src/Expressions/Regex/BackspaceEscape.py
@@ -0,0 +1,5 @@
+import re
+matcher = re.compile(r"\b[\t\b]")
+
+def match_data(data):
+    return bool(matcher.match(data))
--- a/python/ql/src/Expressions/Regex/BackspaceEscape.qhelp
+++ b/python/ql/src/Expressions/Regex/BackspaceEscape.qhelp
@@ -0,0 +1,40 @@
+<!DOCTYPE qhelp PUBLIC
+  "-//Semmle//qhelp//EN"
+  "qhelp.dtd">
+<qhelp>
+
+<overview>
+<p>
+The meaning of the <code>\b</code> escape sequence inside a regular expression depends on its
+syntactic context: inside a character class, it matches the backspace character; outside of a
+character class, it matches a word boundary. This context dependency makes regular expressions
+hard to read, so the <code>\b</code> escape sequence should not be used inside character classes.
+</p>
+
+</overview>
+<recommendation>
+
+<p>
+Replace <code>\b</code> in character classes with the semantically identical escape sequence <code>\x08</code>.
+</p>
+
+</recommendation>
+<example>
+<p>
+In the following example, the regular expression contains two uses of <code>\b</code>: in the
+first case, it matches a word boundary, in the second case it matches a backspace character.
+</p>
+
+<sample src="BackspaceEscape.py" />
+
+<p>
+You can make the regular expression easier for other developers to interpret, by rewriting it as <code>r"\b[\t\x08]"</code>.
+</p>
+
+</example>
+<references>
+
+<li>Python Standard Library: <a href="https://docs.python.org/library/re.html">Regular expression operations</a>.</li>
+
+</references>
+</qhelp>
--- a/python/ql/src/Expressions/Regex/BackspaceEscape.ql
+++ b/python/ql/src/Expressions/Regex/BackspaceEscape.ql
@@ -0,0 +1,22 @@
+/**
+ * @name Backspace escape in regular expression
+ * @description Using '\b' to escape the backspace character in a regular expression is confusing
+ *              since it could be mistaken for a word boundary assertion.
+ * @kind problem
+ * @tags maintainability
+ * @problem.severity recommendation
+ * @sub-severity high
+ * @precision very-high
+ * @id py/regex/backspace-escape
+ */
+
+import python
+import semmle.python.regex
+
+// Flags an escaped 'b' whose offset lies between the bounds of a character set.
+from Regex regex, int offset
+where
+    regex.escapingChar(offset) and regex.getChar(offset + 1) = "b" and
+    exists(int setStart, int setEnd | regex.charSet(setStart, setEnd) |
+        setStart < offset and offset < setEnd)
+select regex, "Backspace escape in regular expression at offset " + offset + "."
--- a/python/ql/src/Expressions/Regex/DuplicateCharacterInSet.py
+++ b/python/ql/src/Expressions/Regex/DuplicateCharacterInSet.py
@@ -0,0 +1,6 @@
+import re
+matcher = re.compile(r"[password|pwd]")
+
+def find_password(data):
+    if matcher.match(data):
+        print("Found password!")
--- a/python/ql/src/Expressions/Regex/DuplicateCharacterInSet.qhelp
+++ b/python/ql/src/Expressions/Regex/DuplicateCharacterInSet.qhelp
@@ -0,0 +1,44 @@
+<!DOCTYPE qhelp PUBLIC
+  "-//Semmle//qhelp//EN"
+  "qhelp.dtd">
+<qhelp>
+
+<overview>
+<p>
+Character classes in regular expressions represent sets of characters, so there is no need to specify
+the same character twice in one character class. Duplicate characters in character classes are at best
+useless, and may even indicate a latent bug.
+</p>
+
+</overview>
+<recommendation>
+
+<p>Determine whether a character is simply duplicated or whether the character class was in fact meant as a group.
+If it is just a duplicate, then remove the duplicate character.
+If it was supposed to be a group, then replace the square brackets with parentheses.
+</p>
+
+
+</recommendation>
+<example>
+<p>
+In the following example, the character class <code>[password|pwd]</code> contains two instances each
+of the characters <code>d</code>, <code>p</code>, <code>s</code>, and <code>w</code>. The programmer most likely meant
+to write <code>(password|pwd)</code> (a pattern that matches either the string <code>"password"</code>
+or the string <code>"pwd"</code>), and accidentally mistyped the enclosing brackets.
+</p>
+
+<sample src="DuplicateCharacterInSet.py" />
+
+<p>
+To fix this problem, the regular expression should be rewritten to <code>r"(password|pwd)"</code>.
+</p>
+
+</example>
+<references>
+
+<li>Python Standard Library: <a href="https://docs.python.org/library/re.html">Regular expression operations</a>.</li>
+<li>Regular-Expressions.info: <a href="http://www.regular-expressions.info/charclass.html">Character Classes or Character Sets</a>.</li>
+
+</references>
+</qhelp>
--- a/python/ql/src/Expressions/Regex/DuplicateCharacterInSet.ql
+++ b/python/ql/src/Expressions/Regex/DuplicateCharacterInSet.ql
@@ -0,0 +1,34 @@
+/**
+ * @name Duplication in regular expression character class
+ * @description Duplicate characters in a class have no effect and may indicate an error in the regular expression.
+ * @kind problem
+ * @tags reliability
+ *       readability
+ * @problem.severity warning
+ * @sub-severity low
+ * @precision very-high
+ * @id py/regex/duplicate-in-character-class
+ */
+
+import python
+import semmle.python.regex
+
+predicate duplicate_char_in_class(Regex r, string char) {
+    exists(int i, int j, int x, int y, int start, int end |
+        i != x and j != y and
+        start < i and j < end and
+        start < x and y  < end and
+        r.character(i, j) and char = r.getText().substring(i, j) and
+        r.character(x, y) and char = r.getText().substring(x, y) and
+        r.charSet(start, end)
+    ) and
+    /* Exclude <20> as we use it for any unencodable character */
+    char != "<22>" and
+    //Ignore whitespace in verbose mode
+    not (r.getAMode() = "VERBOSE" and (char = " " or char = "\t" or char = "\r" or char = "\n"))
+}
+
+// Report each regex together with a character for which duplicate_char_in_class holds.
+from Regex regex, string dup where duplicate_char_in_class(regex, dup)
+select regex, "This regular expression includes duplicate character '" + dup + "' in a set of characters."
+
--- a/python/ql/src/Expressions/Regex/MissingPartSpecialGroup.py
+++ b/python/ql/src/Expressions/Regex/MissingPartSpecialGroup.py
@@ -0,0 +1,10 @@
+import re
+matcher = re.compile(r'(P<name>[\w]+)')
+
+def only_letters(text):
+    m = matcher.match(text)
+    if m:
+        print("Letters are: " + m.group('name'))
+
+#Fix the pattern by adding the missing '?'
+fixed_matcher = re.compile(r'(?P<name>[\w]+)')
--- a/python/ql/src/Expressions/Regex/MissingPartSpecialGroup.qhelp
+++ b/python/ql/src/Expressions/Regex/MissingPartSpecialGroup.qhelp
@@ -0,0 +1,37 @@
+<!DOCTYPE qhelp PUBLIC
+  "-//Semmle//qhelp//EN"
+  "qhelp.dtd">
+<qhelp>
+<overview>
+<p>
+One of the problems with using regular expressions is that almost any sequence of characters is a valid pattern.
+This means that it is easy to omit a necessary character and still have a valid regular expression.
+Omitting a character in a named capturing group is a specific case which can dramatically change the meaning of a regular expression.
+</p>
+
+</overview>
+<recommendation>
+
+<p>
+Examine the regular expression to find and correct any typos.
+</p>
+
+</recommendation>
+<example>
+<p>
+In the following example, the regular expression for <code>matcher</code>, <code>r"(P&lt;name&gt;[\w]+)"</code>, is missing a "?" and will
+match only strings that start with the literal text "P&lt;name&gt;" followed by word characters, instead of matching any
+sequence of word characters and placing the result in a named group.
+The fixed version, <code>fixed_matcher</code>, includes the "?" and will work as expected.
+</p>
+
+<sample src="MissingPartSpecialGroup.py" />
+
+</example>
+<references>
+
+<li>Python Standard Library: <a href="https://docs.python.org/library/re.html">Regular expression operations</a>.</li>
+<li>Regular-Expressions.info: <a href="http://www.regular-expressions.info/named.html">Named Capturing Groups</a>.</li>
+
+</references>
+</qhelp>
--- a/python/ql/src/Expressions/Regex/MissingPartSpecialGroup.ql
+++ b/python/ql/src/Expressions/Regex/MissingPartSpecialGroup.ql
@@ -0,0 +1,20 @@
+/**
+ * @name Missing part of special group in regular expression
+ * @description Incomplete special groups are parsed as normal groups and are unlikely to match the intended strings.
+ * @kind problem
+ * @tags reliability
+ *       correctness
+ * @problem.severity warning
+ * @sub-severity high
+ * @precision high
+ * @id py/regex/incomplete-special-group
+ */
+
+import python
+import semmle.python.regex
+
+from Regex regex, string missing, string part  // flags text containing "(P<name>", i.e. a named group without its '?'
+where missing = "?" and part = "named group" and regex.getText().regexpMatch(".*\\(P<\\w+>.*")
+select regex, "Regular expression is missing '" + missing + "' in " + part + "."
+
+
--- a/python/ql/src/Expressions/Regex/UnmatchableCaret.py
+++ b/python/ql/src/Expressions/Regex/UnmatchableCaret.py
@@ -0,0 +1,11 @@
+import re
+#Regular expression includes a caret, but not at the start.
+matcher = re.compile(r"\[^.]*\.css")
+
+def find_css(filename):
+    if matcher.match(filename):
+        print("Found it!")
+        
+#Regular expression for a css file name
+fixed_matcher_css = re.compile(r"[^.]*\.css")
+
--- a/python/ql/src/Expressions/Regex/UnmatchableCaret.qhelp
+++ b/python/ql/src/Expressions/Regex/UnmatchableCaret.qhelp
@@ -0,0 +1,40 @@
+<!DOCTYPE qhelp PUBLIC
+  "-//Semmle//qhelp//EN"
+  "qhelp.dtd">
+<qhelp>
+<overview>
+<p>
+The caret character <code>^</code> anchors a regular expression to the beginning of the input, or
+(for multi-line regular expressions) to the beginning of a line.
+If it is preceded by a pattern that must match a non-empty sequence of (non-newline) input characters,
+then the entire regular expression cannot match anything.
+</p>
+
+</overview>
+<recommendation>
+
+<p>
+Examine the regular expression to find and correct any typos.
+</p>
+
+</recommendation>
+<example>
+<p>
+In the following example, the regular expression <code>r"\[^.]*\.css"</code> cannot match any
+string, since it contains a caret assertion preceded by an escape sequence that matches an
+opening bracket.
+</p>
+<p>
+In the second regular expression, <code>r"[^.]*\.css"</code>, the caret is part of a character class, where it negates the class instead of anchoring the match to the start of the string.
+</p>
+
+<sample src="UnmatchableCaret.py" />
+
+</example>
+<references>
+
+<li>Python Standard Library: <a href="https://docs.python.org/library/re.html">Regular expression operations</a>.</li>
+<li>Regular-Expressions.info: <a href="http://www.regular-expressions.info/anchors.html">Start of String and End of String Anchors</a>.</li>
+
+</references>
+</qhelp>
--- a/python/ql/src/Expressions/Regex/UnmatchableCaret.ql
+++ b/python/ql/src/Expressions/Regex/UnmatchableCaret.ql
@@ -0,0 +1,25 @@
+/**
+ * @name Unmatchable caret in regular expression
+ * @description Regular expressions containing a caret '^' in the middle cannot be matched, whatever the input.
+ * @kind problem
+ * @tags reliability
+ *       correctness
+ * @problem.severity error
+ * @sub-severity low
+ * @precision high
+ * @id py/regex/unmatchable-caret
+ */
+
+import python
+import semmle.python.regex
+
+/** Holds if `r` has a `^` special character at `start` that is not a first item, and `r` has neither MULTILINE nor VERBOSE mode. */
+predicate unmatchable_caret(Regex r, int start) {
+    r.specialCharacter(start, start + 1, "^") and
+    not r.firstItem(start, start + 1) and
+    not exists(string mode | mode = r.getAMode() | mode = "MULTILINE" or mode = "VERBOSE")
+}
+
+// Report every caret position for which unmatchable_caret holds.
+from Regex regex, int pos where unmatchable_caret(regex, pos)
+select regex, "This regular expression includes an unmatchable caret at offset " + pos.toString() + "."
--- a/python/ql/src/Expressions/Regex/UnmatchableDollar.py
+++ b/python/ql/src/Expressions/Regex/UnmatchableDollar.py
@@ -0,0 +1,10 @@
+import re
+#Regular expression that includes a dollar, but not at the end.
+matcher = re.compile(r"\.\(\w+$\)")
+
+def find_it(filename):
+    if matcher.match(filename):
+        print("Found it!")
+
+#Regular expression anchored to end of input.
+fixed_matcher = re.compile(r"\.\(\w+\)$")
--- a/python/ql/src/Expressions/Regex/UnmatchableDollar.qhelp
+++ b/python/ql/src/Expressions/Regex/UnmatchableDollar.qhelp
@@ -0,0 +1,41 @@
+<!DOCTYPE qhelp PUBLIC
+  "-//Semmle//qhelp//EN"
+  "qhelp.dtd">
+<qhelp>
+<overview>
+<p>
+A dollar assertion <code>$</code> in a regular expression only matches at the end of the input, or
+(for multi-line regular expressions) at the end of a line. If it is followed by a pattern
+that must match a non-empty sequence of (non-newline) input characters, it cannot possibly match,
+rendering the entire regular expression unmatchable.
+</p>
+
+</overview>
+<recommendation>
+
+<p>
+Examine the regular expression to find and correct any typos.
+</p>
+
+</recommendation>
+<example>
+<p>
+In the following example, the regular expression <code>r"\.\(\w+$\)"</code> cannot match any
+string, since it contains a dollar assertion followed by an escape sequence that matches a
+closing parenthesis.
+</p>
+
+<p>
+The second regular expression, <code>r"\.\(\w+\)$"</code>, has the dollar at the end and will work as expected.
+</p>
+
+<sample src="UnmatchableDollar.py" />
+
+</example>
+<references>
+
+<li>Python Standard Library: <a href="https://docs.python.org/library/re.html">Regular expression operations</a>.</li>
+<li>Regular-Expressions.info: <a href="http://www.regular-expressions.info/anchors.html">Start of String and End of String Anchors</a>.</li>
+
+</references>
+</qhelp>
--- a/python/ql/src/Expressions/Regex/UnmatchableDollar.ql
+++ b/python/ql/src/Expressions/Regex/UnmatchableDollar.ql
@@ -0,0 +1,26 @@
+/**
+ * @name Unmatchable dollar in regular expression
+ * @description Regular expressions containing a dollar '$' in the middle cannot be matched, whatever the input.
+ * @kind problem
+ * @tags reliability
+ *       correctness
+ * @problem.severity error
+ * @sub-severity low
+ * @precision high
+ * @id py/regex/unmatchable-dollar
+ */
+
+import python
+import semmle.python.regex
+
+/** Holds if `r` has a `$` special character at `start` that is not a last item, and `r` has neither MULTILINE nor VERBOSE mode. */
+predicate unmatchable_dollar(Regex r, int start) {
+    exists(int end | end = start + 1 |
+        r.specialCharacter(start, end, "$") and not r.lastItem(start, end)
+    ) and
+    not exists(string mode | mode = r.getAMode() | mode = "MULTILINE" or mode = "VERBOSE")
+}
+
+// Report every dollar position for which unmatchable_dollar holds.
+from Regex regex, int pos where unmatchable_dollar(regex, pos)
+select regex, "This regular expression includes an unmatchable dollar at offset " + pos.toString() + "."