Python: Two new queries for URL and hostname sanitization (CWE-020).

This commit is contained in:
Mark Shannon
2019-01-24 12:57:14 +00:00
parent ffa8b12d48
commit 88d8cb514c
13 changed files with 385 additions and 0 deletions

View File

@@ -0,0 +1,70 @@
<!DOCTYPE qhelp PUBLIC
"-//Semmle//qhelp//EN"
"qhelp.dtd">
<qhelp>
<overview>
<p>
Sanitizing untrusted URLs is an important technique for
preventing attacks such as request forgeries and malicious
redirections. Often, this is done by checking that the host of a URL
is in a set of allowed hosts.
</p>
<p>
If a regular expression implements such a check, it is
easy to accidentally make the check too permissive by not escaping the
<code>.</code> meta-characters appropriately.
Even if the check is not used in a security-critical
context, the incomplete check may still cause undesirable behaviors
when it accidentally succeeds.
</p>
</overview>
<recommendation>
<p>
Escape all meta-characters appropriately when constructing
regular expressions for security checks, paying special attention to the
<code>.</code> meta-character.
</p>
</recommendation>
<example>
<p>
The following example code checks that a URL redirection
will reach the <code>example.com</code> domain, or one of its
subdomains.
</p>
<sample src="examples/IncompleteHostnameRegExp.py"/>
<p>
The <code>unsafe</code> check is easy to bypass because the unescaped
<code>.</code> allows for any character before
<code>example.com</code>, effectively allowing the redirect to go to
an attacker-controlled domain such as <code>wwwXexample.com</code>.
</p>
<p>
This vulnerability is addressed in the <code>safe</code> check, which
escapes the <code>.</code> and will reject <code>wwwXexample.com</code>.
</p>
</example>
<references>
<li>OWASP: <a href="https://www.owasp.org/index.php/Server_Side_Request_Forgery">SSRF</a>.</li>
<li>OWASP: <a href="https://www.owasp.org/index.php/Unvalidated_Redirects_and_Forwards_Cheat_Sheet">Unvalidated Redirects and Forwards Cheat Sheet</a>.</li>
</references>
</qhelp>

View File

@@ -0,0 +1,44 @@
/**
* @name Incomplete regular expression for hostnames
* @description Matching a URL or hostname against a regular expression that contains an unescaped dot as part of the hostname might match more hostnames than expected.
* @kind problem
* @problem.severity warning
* @precision high
* @id py/incomplete-hostname-regexp
* @tags correctness
* security
* external/cwe/cwe-20
*/
import python
import semmle.python.regex
/**
 * Gets the body of a regular-expression alternation listing some common
 * top-level domains. It has no surrounding parentheses; callers add their own.
 */
private string commonTopLevelDomainRegex() {
result = "com|org|edu|gov|uk|net|io"
}
/**
 * Holds if `pattern` contains an unescaped `.` directly in front of text that looks
 * like a host name ending in a common top-level domain, and `hostPart` is that text.
 *
 * An unescaped `.` matches any character, so such a pattern accepts more hosts
 * than the one it spells out.
 */
bindingset[pattern]
predicate isIncompleteHostNameRegExpPattern(string pattern, string hostPart) {
  exists(string detector |
    detector =
      "(?i).*" +
        // a `.` that is not preceded by a backslash
        "(?<!\\\\)[.]" +
        // captured: host labels (possibly interleaved with regex syntax), an optionally escaped `.`, then a known TLD
        "([():|?a-z0-9-]+(\\\\)?[.](" + commonTopLevelDomainRegex() + "))" + ".*" and
    hostPart = pattern.regexpCapture(detector, 1)
  )
}
from Regex r, string pattern, string hostPart
where
  pattern = r.getText() and
  isIncompleteHostNameRegExpPattern(pattern, hostPart) and
  // skip patterns that have a non-capturing group `(?:...)` somewhere after the TLD
  not pattern.regexpMatch("(?i).*[.](" + commonTopLevelDomainRegex() + ").*[(][?]:.*[)].*")
select r,
  "This regular expression has an unescaped '.' before '" + hostPart +
    "', so it might match more hosts than expected."

View File

@@ -0,0 +1,86 @@
<!DOCTYPE qhelp PUBLIC
"-//Semmle//qhelp//EN"
"qhelp.dtd">
<qhelp>
<overview>
<p>
Sanitizing untrusted URLs is an important technique for
preventing attacks such as request forgeries and malicious
redirections. Usually, this is done by checking that the host of a URL
is in a set of allowed hosts.
</p>
<p>
However, it is notoriously error-prone to treat the URL as
a string and check if one of the allowed hosts is a substring of the
URL. Malicious URLs can bypass such security checks by embedding one
of the allowed hosts in an unexpected location.
</p>
<p>
Even if the substring check is not used in a
security-critical context, the incomplete check may still cause
undesirable behaviors when the check succeeds accidentally.
</p>
</overview>
<recommendation>
<p>
Parse a URL before performing a check on its host value,
and ensure that the check handles arbitrary subdomain sequences
correctly.
</p>
</recommendation>
<example>
<p>
The following example code checks that a URL redirection
will reach the <code>example.com</code> domain.
</p>
<sample src="examples/IncompleteUrlSubstringSanitization.py"/>
<p>
The first two examples show unsafe checks that are easily bypassed.
In <code>unsafe1</code> the attacker can simply add
<code>example.com</code> anywhere in the URL. For example,
<code>http://evil-example.net/example.com</code>.
</p>
<p>
In <code>unsafe2</code> the attacker must use a hostname ending in
<code>example.com</code>, but that is easy to do. For example,
<code>http://benign-looking-prefix-example.com</code>.
</p>
<p>
The last two examples show safe checks.
In <code>safe1</code>, a whitelist is used. Although fairly inflexible,
this is easy to get right and is most likely to be safe.
</p>
<p>
In <code>safe2</code>, <code>urlparse</code> is used to parse the URL,
then the hostname is checked to make sure it ends with <code>.example.com</code>.
</p>
</example>
<references>
<li>OWASP: <a href="https://www.owasp.org/index.php/Server_Side_Request_Forgery">SSRF</a>.</li>
<li>OWASP: <a href="https://www.owasp.org/index.php/Unvalidated_Redirects_and_Forwards_Cheat_Sheet">Unvalidated Redirects and Forwards Cheat Sheet</a>.</li>
</references>
</qhelp>

View File

@@ -0,0 +1,58 @@
/**
* @name Incomplete URL substring sanitization
* @description Security checks on the substrings of an unparsed URL are often vulnerable to bypassing.
* @kind problem
* @problem.severity warning
* @precision high
* @id py/incomplete-url-substring-sanitization
* @tags correctness
* security
* external/cwe/cwe-20
*/
import python
import semmle.python.regex
/**
 * Gets the body of a regular-expression alternation listing some common
 * top-level domains. It has no surrounding parentheses; callers add their own.
 */
private string commonTopLevelDomainRegex() {
result = "com|org|edu|gov|uk|net|io"
}
/**
 * Holds if the text of the string constant `s` looks like a URL or a host name.
 *
 * Two shapes are accepted, each with an optional port and an optional trailing `/`:
 * a host on one of the common top-level domains (scheme optional), or an
 * `http`/`https` URL whose host is on any alphabetic top-level domain.
 */
predicate looksLikeUrl(StrConst s) {
  // host on a common TLD, with or without a scheme
  s.getText()
      .regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+(" + commonTopLevelDomainRegex() +
          ")(:[0-9]+)?/?")
  or
  // HTTP(S) URL to a domain on any TLD
  s.getText().regexpMatch("(?i)https?://([a-z0-9-]+\\.)+([a-z]+)(:[0-9]+)?/?")
}
/**
 * Holds if `sanitizer` checks a value against the URL-like string constant `url`
 * in one of three ways: a `startswith` call, an `endswith` call that
 * `unsafe_call_to_endswith` accepts, or an `in` comparison.
 */
predicate incomplete_sanitization(Expr sanitizer, StrConst url) {
  looksLikeUrl(url) and
  (
    call_to_startswith(sanitizer, url)
    or
    unsafe_call_to_endswith(sanitizer, url)
    or
    // `url in <something>`
    exists(In containment | sanitizer.(Compare).compares(url, containment, _))
  )
}
/**
 * Holds if `sanitizer` is a call to a `startswith` method whose first argument is
 * the string constant `url`, and the prefix does not pin down the host.
 *
 * A prefix of the form `http(s)://host/...` is not reported: the `/` after the host
 * terminates the host name, so no string with that prefix can have a different host.
 * A prefix without that `/`, such as `https://example.com`, is still reported,
 * because `https://example.com.evil.net` starts with it.
 */
predicate call_to_startswith(Call sanitizer, StrConst url) {
  sanitizer.getFunc().(Attribute).getName() = "startswith" and
  sanitizer.getArg(0) = url and
  // a scheme followed by a complete, `/`-terminated host is a sound prefix check
  not url.getText().regexpMatch("(?i)https?://[\\.a-z0-9-]+/.*")
}
/**
 * Holds if `sanitizer` is a call to an `endswith` method with the string constant
 * `url` as its first argument, unless `url` is a `.` followed by at least two
 * dot-separated labels (for example `.example.com`).
 */
predicate unsafe_call_to_endswith(Call sanitizer, StrConst url) {
  exists(Attribute method |
    method = sanitizer.getFunc() and
    method.getName() = "endswith"
  ) and
  url = sanitizer.getArg(0) and
  // a suffix with a leading `.` forces a label boundary, so it is not reported
  not exists(string suffix |
    suffix = url.getText() and
    suffix.regexpMatch("(?i)\\.([a-z0-9-]+)(\\.[a-z0-9-]+)+")
  )
}
// Report every check that `incomplete_sanitization` accepts. The `$@` placeholder
// in the message is paired with the trailing `url, url.getText()` columns.
from Expr sanitizer, StrConst url
where incomplete_sanitization(sanitizer, url)
select sanitizer, "'$@' may be at an arbitrary position in the sanitized URL.", url, url.getText()

View File

@@ -0,0 +1,19 @@
from flask import Flask, request, redirect
import re
# Flask application that the example routes below are registered on.
app = Flask(__name__)
# The '.' characters are unescaped, so each one matches any character.
UNSAFE_REGEX = re.compile("(www|beta).example.com/")
# Every '.' is escaped, so only a literal dot is accepted.
SAFE_REGEX = re.compile(r"(www|beta)\.example\.com/")
@app.route('/some/path/bad')
def unsafe(request):
    """Redirect to the `target` query argument when UNSAFE_REGEX matches it.

    Returns None when the pattern does not match.
    """
    # NOTE(review): `request` is this function's parameter and shadows the
    # `request` imported from flask -- confirm that is intended.
    destination = request.args.get('target', '')
    matched = UNSAFE_REGEX.match(destination)
    if matched is None:
        return None
    return redirect(destination)
@app.route('/some/path/good')
def safe(request):
    """Redirect to the `target` query argument when SAFE_REGEX matches it.

    Returns None when the pattern does not match.
    """
    # NOTE(review): `request` is this function's parameter and shadows the
    # `request` imported from flask -- confirm that is intended.
    destination = request.args.get('target', '')
    matched = SAFE_REGEX.match(destination)
    if matched is None:
        return None
    return redirect(destination)

View File

@@ -0,0 +1,41 @@
from flask import Flask, request, redirect
from urllib.parse import urlparse
# Flask application that the example routes below are registered on.
app = Flask(__name__)
@app.route('/some/path/bad1')
def unsafe1(request):
    """Redirect to the `target` query argument if it contains the allowed host.

    The substring test accepts the host name at any position in the URL.
    Returns None when the test fails.
    """
    # NOTE(review): `request` is this function's parameter and shadows the
    # `request` imported from flask -- confirm that is intended.
    destination = request.args.get('target', '')
    mentions_host = "example.com" in destination
    if mentions_host:
        return redirect(destination)
    return None
@app.route('/some/path/bad2')
def unsafe2(request):
    """Redirect to the `target` query argument if it ends with the allowed host.

    The suffix has no leading '.', so any string ending in it passes.
    Returns None when the test fails.
    """
    # NOTE(review): `request` is this function's parameter and shadows the
    # `request` imported from flask -- confirm that is intended.
    destination = request.args.get('target', '')
    has_suffix = destination.endswith("example.com")
    if has_suffix:
        return redirect(destination)
    return None
# The simplest and safest approach is an explicit whitelist.
@app.route('/some/path/good1')
def safe1(request):
    """Redirect to the `target` query argument only if it is a whitelisted value.

    Returns None for any other value.
    """
    # NOTE(review): `request` is this function's parameter and shadows the
    # `request` imported from flask -- confirm that is intended.
    destination = request.args.get('target', '')
    whitelist = ["example.com/home", "example.com/login"]
    if target_allowed := (destination in whitelist):
        return redirect(destination)
    return None
# A more complex example that allows sub-domains.
@app.route('/some/path/good2')
def safe2(request):
    """Redirect to the `target` query argument if its parsed host is a sub-domain.

    The URL is parsed first and only the host name is checked.
    Returns None when there is no host or the suffix test fails.
    """
    # NOTE(review): `request` is this function's parameter and shadows the
    # `request` imported from flask -- confirm that is intended.
    destination = request.args.get('target', '')
    parsed_host = urlparse(destination).hostname
    if not parsed_host:
        return None
    # The leading '.' forces a label boundary in front of example.com.
    if parsed_host.endswith(".example.com"):
        return redirect(destination)
    return None