Merge remote-tracking branch 'origin/main' into jorgectf/python/headerInjection

2026-01-30 06:42:57 +01:00 · 2021-09-07 19:54:58 +02:00
parent 352eab0eca 0e9d36b922
commit eee9b3f39e
2262 changed files with 206632 additions and 30291 deletions
--- a/python/ql/src/experimental/Security/CWE-287/ImproperLdapAuth.qhelp
+++ b/python/ql/src/experimental/Security/CWE-287/ImproperLdapAuth.qhelp
@@ -0,0 +1,31 @@
+<!DOCTYPE qhelp PUBLIC
+  "-//Semmle//qhelp//EN"
+  "qhelp.dtd">
+<qhelp>
+<overview>
+<p>If an LDAP query doesn't carry any kind of authentication, anonymous binds cause an empty or
+<code>None</code> password to result in a successful authentication.</p>
+</overview>
+
+<recommendation>
+<p>Use a non-empty password while establishing an LDAP connection.</p>
+</recommendation>
+
+<example>
+<p>In the following examples, the code builds an LDAP query whose execution carries no authentication or binds anonymously.</p>
+
+<sample src="examples/auth_bad_2.py" />
+<sample src="examples/auth_bad_3.py" />
+
+<p>In the third and fourth examples, the authentication is established using a password.</p>
+
+<sample src="examples/auth_good_2.py" />
+<sample src="examples/auth_good_3.py" />
+</example>
+
+<references>
+<li>SonarSource: <a href="https://rules.sonarsource.com/python/type/Vulnerability/RSPEC-4433">RSPEC-4433</a>.</li>
+<li>python-ldap: <a href="https://www.python-ldap.org/en/python-ldap-3.3.0/reference/ldap.html">LDAP Documentation</a>.</li>
+<li>ldap3: <a href="https://ldap3.readthedocs.io/en/latest/">LDAP Documentation</a>.</li>
+</references>
+</qhelp>
--- a/python/ql/src/experimental/Security/CWE-287/ImproperLdapAuth.ql
+++ b/python/ql/src/experimental/Security/CWE-287/ImproperLdapAuth.ql
@@ -0,0 +1,31 @@
+/**
+ * @name Improper LDAP Authentication
+ * @description An LDAP bind operation with a missing, None or empty password is executed without authentication.
+ * @kind problem
+ * @problem.severity warning
+ * @id py/improper-ldap-auth
+ * @tags experimental
+ *       security
+ *       external/cwe/cwe-287
+ */
+
+// TODO: determine the @precision value for the query metadata above
+import python
+import experimental.semmle.python.Concepts
+import semmle.python.dataflow.new.DataFlow
+
+/**
+ * Holds if `ldapBind` authenticates improperly: either it has no password,
+ * or a `None` literal or an empty string constant reaches its password
+ * through local data flow.
+ */
+predicate authenticatesImproperly(LDAPBind ldapBind) {
+  (
+    // A `None` literal flows locally to the password ...
+    DataFlow::localFlow(DataFlow::exprNode(any(None noneName)), ldapBind.getPassword()) or
+    // ... or the bind operation has no password at all.
+    not exists(ldapBind.getPassword())
+  )
+  or
+  // An empty string constant flows locally to the password.
+  exists(StrConst emptyString |
+    emptyString.getText() = "" and
+    DataFlow::localFlow(DataFlow::exprNode(emptyString), ldapBind.getPassword())
+  )
+}
+
+// Report every LDAP bind operation for which `authenticatesImproperly` holds.
+from LDAPBind ldapBind
+where authenticatesImproperly(ldapBind)
+select ldapBind, "The following LDAP bind operation is executed without authentication"
--- a/python/ql/src/experimental/Security/CWE-287/examples/auth_bad_2.py
+++ b/python/ql/src/experimental/Security/CWE-287/examples/auth_bad_2.py
@@ -0,0 +1,14 @@
+from flask import request, Flask
+import ldap
+import ldap.filter
+import ldap.dn
+
+
+@app.route("/bind_example")
+def bind_example():
+    dn = "dc={}".format(ldap.dn.escape_dn_chars(request.args['dc']))
+    search_filter = "(user={})".format(ldap.filter.escape_filter_chars(request.args['search']))
+
+    ldap_connection = ldap.initialize("ldap://127.0.0.1:1337")
+    ldap_connection.bind('cn=root', "")
+    user = ldap_connection.search_s(dn, ldap.SCOPE_SUBTREE, search_filter)
--- a/python/ql/src/experimental/Security/CWE-287/examples/auth_bad_3.py
+++ b/python/ql/src/experimental/Security/CWE-287/examples/auth_bad_3.py
@@ -0,0 +1,13 @@
+from ldap3 import Server, Connection, ALL
+from flask import request, Flask
+from ldap3.utils.dn import escape_rdn
+from ldap3.utils.conv import escape_filter_chars
+
+@app.route("/passwordNone")
+def passwordNone():
+    dn = "dc={}".format(escape_rdn(request.args['dc']))
+    search_filter = "(user={})".format(escape_filter_chars(request.args['search']))
+
+    srv = Server('servername', get_info=ALL)
+    conn = Connection(srv, user='user_dn', password=None)
+    status, result, response, _ = conn.search(dn, search_filter)
--- a/python/ql/src/experimental/Security/CWE-287/examples/auth_good_2.py
+++ b/python/ql/src/experimental/Security/CWE-287/examples/auth_good_2.py
@@ -0,0 +1,14 @@
+from flask import request, Flask
+import ldap
+import ldap.filter
+import ldap.dn
+
+
+@app.route("/bind_example")
+def bind_example():
+    dn = "dc={}".format(ldap.dn.escape_dn_chars(request.args['dc']))
+    search_filter = "(user={})".format(ldap.filter.escape_filter_chars(request.args['search']))
+
+    ldap_connection = ldap.initialize("ldap://127.0.0.1:1337")
+    ldap_connection.bind('cn=root', "SecurePa$$!")
+    user = ldap_connection.search_s(dn, ldap.SCOPE_SUBTREE, search_filter)
--- a/python/ql/src/experimental/Security/CWE-287/examples/auth_good_3.py
+++ b/python/ql/src/experimental/Security/CWE-287/examples/auth_good_3.py
@@ -0,0 +1,14 @@
+from ldap3 import Server, Connection, ALL
+from flask import request, Flask
+from ldap3.utils.dn import escape_rdn
+from ldap3.utils.conv import escape_filter_chars
+
+@app.route("/passwordFromEnv")
+def passwordFromEnv():
+    dn = "dc={}".format(escape_rdn(request.args['dc']))
+    search_filter = "(user={})".format(escape_filter_chars(request.args['search']))
+
+    srv = Server('servername', get_info=ALL)
+    conn = Connection(srv, user='user_dn',
+                      password="SecurePa$$!")
+    status, result, response, _ = conn.search(dn, search_filter)
--- a/python/ql/src/experimental/Security/CWE-730/PolynomialBackTracking.ql
+++ b/python/ql/src/experimental/Security/CWE-730/PolynomialBackTracking.ql
@@ -0,0 +1,6 @@
+import python
+import semmle.python.security.performance.SuperlinearBackTracking
+
+// Lists each polynomial backtracking term together with its regex and the reason it was reported.
+// NOTE(review): results are restricted to files named "KnownCVEs.py" -- presumably a
+// development aid rather than a general-purpose query; confirm before shipping.
+from PolynomialBackTrackingTerm t
+where t.getLocation().getFile().getBaseName() = "KnownCVEs.py"
+select t.getRegex(), t, t.getReason()
--- a/python/ql/src/experimental/Security/CWE-730/PolynomialReDoS.qhelp
+++ b/python/ql/src/experimental/Security/CWE-730/PolynomialReDoS.qhelp
@@ -0,0 +1,108 @@
+<!DOCTYPE qhelp PUBLIC
+"-//Semmle//qhelp//EN"
+"qhelp.dtd">
+
+<qhelp>
+
+	<include src="ReDoSIntroduction.inc.qhelp" />
+
+	<example>
+		<p>
+
+			Consider this use of a regular expression, which removes
+			all leading and trailing whitespace in a string:
+
+		</p>
+
+		<sample language="python">
+			re.sub(r"^\s+|\s+$", "", text) # BAD
+		</sample>
+
+		<p>
+
+			The sub-expression <code>"\s+$"</code> will match the
+			whitespace characters in <code>text</code> from left to right, but it
+			can start matching anywhere within a whitespace sequence. This is
+			problematic for strings that do <strong>not</strong> end with a whitespace
+			character. Such a string will force the regular expression engine to
+			process each whitespace sequence once per whitespace character in the
+			sequence.
+
+		</p>
+
+		<p>
+
+			This ultimately means that the time cost of trimming a
+			string is quadratic in the length of the string. So a string like
+			<code>"a b"</code> will take milliseconds to process, but a similar
+			string with a million spaces instead of just one will take several
+			minutes.
+
+		</p>
+
+		<p>
+
+			Avoid this problem by rewriting the regular expression to
+			not contain the ambiguity about when to start matching whitespace
+			sequences. For instance, by using a negative look-behind
+			(<code>^\s+|(?&lt;!\s)\s+$</code>), or just by using the built-in strip
+			method (<code>text.strip()</code>).
+
+		</p>
+
+		<p>
+
+			Note that the sub-expression <code>"^\s+"</code> is
+			<strong>not</strong> problematic as the <code>^</code> anchor restricts
+			when that sub-expression can start matching, and as the regular
+			expression engine matches from left to right.
+
+		</p>
+
+	</example>
+
+	<example>
+
+		<p>
+
+			As a similar, but slightly subtler problem, consider the
+			regular expression that matches lines with numbers, possibly written
+			using scientific notation:
+		</p>
+
+		<sample language="python">
+			^0\.\d+E?\d+$ # BAD
+		</sample>
+
+		<p>
+
+			The problem with this regular expression is in the
+			sub-expression <code>\d+E?\d+</code> because the second
+			<code>\d+</code> can start matching digits anywhere after the first
+			match of the first <code>\d+</code> if there is no <code>E</code> in
+			the input string.
+
+		</p>
+
+		<p>
+
+			This is problematic for strings that do <strong>not</strong>
+			end with a digit. Such a string will force the regular expression
+			engine to process each digit sequence once per digit in the sequence,
+			again leading to a quadratic time complexity.
+
+		</p>
+
+		<p>
+
+			To make the processing faster, the regular expression
+			should be rewritten such that the two <code>\d+</code> sub-expressions
+			do not have overlapping matches: <code>^0\.\d+(E\d+)?$</code>.
+
+		</p>
+
+	</example>
+
+	<include src="ReDoSReferences.inc.qhelp"/>
+
+</qhelp>
--- a/python/ql/src/experimental/Security/CWE-730/PolynomialReDoS.ql
+++ b/python/ql/src/experimental/Security/CWE-730/PolynomialReDoS.ql
@@ -0,0 +1,33 @@
+/**
+ * @name Polynomial regular expression used on uncontrolled data
+ * @description A regular expression that can require polynomial time
+ *              to match may be vulnerable to denial-of-service attacks.
+ * @kind path-problem
+ * @problem.severity warning
+ * @precision high
+ * @id py/polynomial-redos
+ * @tags security
+ *       external/cwe/cwe-730
+ *       external/cwe/cwe-400
+ */
+
+import python
+import semmle.python.security.performance.SuperlinearBackTracking
+import semmle.python.security.dataflow.PolynomialReDoS
+import DataFlow::PathGraph
+
+from
+  PolynomialReDoS::Configuration config, DataFlow::PathNode source, DataFlow::PathNode sink,
+  PolynomialReDoS::Sink sinkNode, PolynomialBackTrackingTerm regexp
+where
+  // `config` finds a flow path from `source` to `sink` ...
+  config.hasFlowPath(source, sink) and
+  sinkNode = sink.getNode() and
+  // ... and `regexp` is a polynomial backtracking term rooted in the regex used at that sink.
+  regexp.getRootTerm() = sinkNode.getRegExp()
+// NOTE(review): the exclusion below is disabled; either restore it or delete it.
+//   not (
+//     source.getNode().(Source).getKind() = "url" and
+//     regexp.isAtEndLine()
+//   )
+select sinkNode.getHighlight(), source, sink,
+  "This $@ that depends on $@ may run slow on strings " + regexp.getPrefixMessage() +
+    "with many repetitions of '" + regexp.getPumpString() + "'.", regexp, "regular expression",
+  source.getNode(), "a user-provided value"
--- a/python/ql/src/experimental/Security/CWE-730/ReDoS.qhelp
+++ b/python/ql/src/experimental/Security/CWE-730/ReDoS.qhelp
@@ -0,0 +1,34 @@
+<!DOCTYPE qhelp PUBLIC
+"-//Semmle//qhelp//EN"
+"qhelp.dtd">
+
+<qhelp>
+
+	<include src="ReDoSIntroduction.inc.qhelp" />
+
+	<example>
+		<p>
+			Consider this regular expression:
+		</p>
+		<sample language="python">
+			^_(__|.)+_$
+		</sample>
+		<p>
+			Its sub-expression <code>"(__|.)+"</code> can match the string <code>"__"</code> either by the
+			first alternative <code>"__"</code> to the left of the <code>"|"</code> operator, or by two
+			repetitions of the second alternative <code>"."</code> to the right. Thus, a string consisting
+			of an odd number of underscores followed by some other character will cause the regular
+			expression engine to run for an exponential amount of time before rejecting the input.
+		</p>
+		<p>
+			This problem can be avoided by rewriting the regular expression to remove the ambiguity between
+			the two branches of the alternative inside the repetition:
+		</p>
+		<sample language="python">
+			^_(__|[^_])+_$
+		</sample>
+	</example>
+
+	<include src="ReDoSReferences.inc.qhelp"/>
+
+</qhelp>
--- a/python/ql/src/experimental/Security/CWE-730/ReDoS.ql
+++ b/python/ql/src/experimental/Security/CWE-730/ReDoS.ql
@@ -0,0 +1,25 @@
+/**
+ * @name Inefficient regular expression
+ * @description A regular expression that requires exponential time to match certain inputs
+ *              can be a performance bottleneck, and may be vulnerable to denial-of-service
+ *              attacks.
+ * @kind problem
+ * @problem.severity error
+ * @precision high
+ * @id py/redos
+ * @tags security
+ *       external/cwe/cwe-730
+ *       external/cwe/cwe-400
+ */
+
+import python
+import semmle.python.security.performance.ExponentialBackTracking
+
+// Report each regex term `t` for which `hasReDoSResult` holds, quoting the
+// pump string and prefix message it provides in the alert text.
+from RegExpTerm t, string pump, State s, string prefixMsg
+where
+  hasReDoSResult(t, pump, s, prefixMsg) and
+  // exclude verbose mode regexes for now
+  not t.getRegex().getAMode() = "VERBOSE"
+select t,
+  "This part of the regular expression may cause exponential backtracking on strings " + prefixMsg +
+    "containing many repetitions of '" + pump + "'."
--- a/python/ql/src/experimental/Security/CWE-730/ReDoSIntroduction.inc.qhelp
+++ b/python/ql/src/experimental/Security/CWE-730/ReDoSIntroduction.inc.qhelp
@@ -0,0 +1,54 @@
+<!DOCTYPE qhelp PUBLIC
+"-//Semmle//qhelp//EN"
+"qhelp.dtd">
+<qhelp>
+	<overview>
+		<p>
+
+			Some regular expressions take a long time to match certain
+			input strings to the point where the time it takes to match a string
+			of length <i>n</i> is proportional to <i>n<sup>k</sup></i> or even
+			<i>2<sup>n</sup></i>.  Such regular expressions can negatively affect
+			performance, or even allow a malicious user to perform a Denial of
+			Service ("DoS") attack by crafting an expensive input string for the
+			regular expression to match.
+
+		</p>
+
+		<p>
+
+			The regular expression engine provided by Python uses a backtracking non-deterministic finite
+			automaton to implement regular expression matching. While this approach
+			is space-efficient and allows supporting advanced features like
+			capture groups, it is not time-efficient in general. The worst-case
+			time complexity of such an automaton can be polynomial or even
+			exponential, meaning that for strings of a certain shape, increasing
+			the input length by ten characters may make the automaton about 1000
+			times slower.
+
+		</p>
+
+		<p>
+
+			Typically, a regular expression is affected by this
+			problem if it contains a repetition of the form <code>r*</code> or
+			<code>r+</code> where the sub-expression <code>r</code> is ambiguous
+			in the sense that it can match some string in multiple ways. More
+			information about the precise circumstances can be found in the
+			references.
+
+		</p>
+	</overview>
+
+	<recommendation>
+
+		<p>
+
+			Modify the regular expression to remove the ambiguity, or
+			ensure that the strings matched with the regular expression are short
+			enough that the time-complexity does not matter.
+
+		</p>
+
+	</recommendation>
+</qhelp>
--- a/python/ql/src/experimental/Security/CWE-730/ReDoSReferences.inc.qhelp
+++ b/python/ql/src/experimental/Security/CWE-730/ReDoSReferences.inc.qhelp
@@ -0,0 +1,16 @@
+<!DOCTYPE qhelp PUBLIC
+"-//Semmle//qhelp//EN"
+"qhelp.dtd">
+<qhelp>
+	<references>
+		<li>
+			OWASP:
+			<a href="https://www.owasp.org/index.php/Regular_expression_Denial_of_Service_-_ReDoS">Regular expression Denial of Service - ReDoS</a>.
+		</li>
+		<li>Wikipedia: <a href="https://en.wikipedia.org/wiki/ReDoS">ReDoS</a>.</li>
+		<li>Wikipedia: <a href="https://en.wikipedia.org/wiki/Time_complexity">Time complexity</a>.</li>
+		<li>James Kirrage, Asiri Rathnayake, Hayo Thielecke:
+		<a href="http://www.cs.bham.ac.uk/~hxt/research/reg-exp-sec.pdf">Static Analysis for Regular Expression Denial-of-Service Attack</a>.
+		</li>
+	</references>
+</qhelp>