mirror of
https://github.com/github/codeql.git
synced 2026-04-30 19:26:02 +02:00
Python: Two new queries for URL and hostname sanitization (CWE-020).
This commit is contained in:
@@ -0,0 +1,70 @@
|
||||
<!DOCTYPE qhelp PUBLIC
|
||||
"-//Semmle//qhelp//EN"
|
||||
"qhelp.dtd">
|
||||
<qhelp>
|
||||
|
||||
<overview>
|
||||
<p>
|
||||
|
||||
Sanitizing untrusted URLs is an important technique for
|
||||
preventing attacks such as request forgeries and malicious
|
||||
redirections. Often, this is done by checking that the host of a URL
|
||||
is in a set of allowed hosts.
|
||||
|
||||
</p>
|
||||
|
||||
<p>
|
||||
|
||||
If a regular expression implements such a check, it is
|
||||
easy to accidentally make the check too permissive by not escaping the
|
||||
<code>.</code> meta-character appropriately.
|
||||
|
||||
Even if the check is not used in a security-critical
|
||||
context, the incomplete check may still cause undesirable behaviors
|
||||
when it accidentally succeeds.
|
||||
|
||||
</p>
|
||||
</overview>
|
||||
|
||||
<recommendation>
|
||||
<p>
|
||||
|
||||
Escape all meta-characters appropriately when constructing
|
||||
regular expressions for security checks, paying special attention to the
|
||||
<code>.</code> meta-character.
|
||||
|
||||
</p>
|
||||
</recommendation>
|
||||
|
||||
<example>
|
||||
|
||||
<p>
|
||||
|
||||
The following example code checks that a URL redirection
|
||||
will reach the <code>example.com</code> domain, or one of its
|
||||
subdomains.
|
||||
|
||||
</p>
|
||||
|
||||
<sample src="examples/IncompleteHostnameRegExp.py"/>
|
||||
|
||||
<p>
|
||||
The <code>unsafe</code> check is easy to bypass because the unescaped
|
||||
<code>.</code> allows for any character before
|
||||
<code>example.com</code>, effectively allowing the redirect to go to
|
||||
an attacker-controlled domain such as <code>wwwXexample.com</code>.
|
||||
|
||||
</p>
|
||||
<p>
|
||||
This vulnerability is addressed in the <code>safe</code> check, which
|
||||
escapes the <code>.</code> and will reject <code>wwwXexample.com</code>.
|
||||
|
||||
</p>
|
||||
|
||||
</example>
|
||||
|
||||
<references>
|
||||
<li>OWASP: <a href="https://www.owasp.org/index.php/Server_Side_Request_Forgery">SSRF</a></li>
|
||||
<li>OWASP: <a href="https://www.owasp.org/index.php/Unvalidated_Redirects_and_Forwards_Cheat_Sheet">Unvalidated Redirects and Forwards Cheat Sheet</a>.</li>
|
||||
</references>
|
||||
</qhelp>
|
||||
44
python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
Normal file
44
python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
Normal file
@@ -0,0 +1,44 @@
|
||||
/**
|
||||
* @name Incomplete regular expression for hostnames
|
||||
* @description Matching a URL or hostname against a regular expression that contains an unescaped dot as part of the hostname might match more hostnames than expected.
|
||||
* @kind problem
|
||||
* @problem.severity warning
|
||||
* @precision high
|
||||
* @id py/incomplete-hostname-regexp
|
||||
* @tags correctness
|
||||
* security
|
||||
* external/cwe/cwe-20
|
||||
*/
|
||||
|
||||
import python
|
||||
import semmle.python.regex
|
||||
|
||||
/**
 * Gets a regular-expression alternation (without enclosing parentheses)
 * listing the top-level domains this query treats as common.
 */
private string commonTopLevelDomainRegex() { result = "com|org|edu|gov|uk|net|io" }
|
||||
|
||||
/**
 * Holds if the regular expression source `pattern` contains an unescaped `.`
 * directly in front of something that looks like a host name ending in a
 * common top-level domain, so that the pattern can match unexpected hosts.
 * `hostPart` is the host-name-like text that follows the unescaped `.`.
 */
bindingset[pattern]
predicate isIncompleteHostNameRegExpPattern(string pattern, string hostPart) {
  exists(string unescapedDot, string hostAndTld |
    // a single `.` that is not preceded by a backslash
    unescapedDot = "(?<!\\\\)[.]" and
    // capture group 1: a run of host-name characters, possibly with some regex
    // syntax mixed in, then an optionally escaped `.` and a known TLD
    hostAndTld = "([():|?a-z0-9-]+(\\\\)?[.](" + commonTopLevelDomainRegex() + "))"
  |
    hostPart = pattern.regexpCapture("(?i).*" + unescapedDot + hostAndTld + ".*", 1)
  )
}
|
||||
|
||||
from Regex r, string pattern, string hostPart
where
  pattern = r.getText() and
  isIncompleteHostNameRegExpPattern(pattern, hostPart) and
  // skip patterns that contain a `(?:...)` group somewhere after a `.` followed by a known TLD
  not pattern.regexpMatch("(?i).*[.](" + commonTopLevelDomainRegex() + ").*[(][?]:.*[)].*")
select r,
  "This regular expression has an unescaped '.' before '" + hostPart +
    "', so it might match more hosts than expected."
|
||||
@@ -0,0 +1,86 @@
|
||||
<!DOCTYPE qhelp PUBLIC
|
||||
"-//Semmle//qhelp//EN"
|
||||
"qhelp.dtd">
|
||||
<qhelp>
|
||||
|
||||
<overview>
|
||||
<p>
|
||||
|
||||
Sanitizing untrusted URLs is an important technique for
|
||||
preventing attacks such as request forgeries and malicious
|
||||
redirections. Usually, this is done by checking that the host of a URL
|
||||
is in a set of allowed hosts.
|
||||
|
||||
</p>
|
||||
|
||||
<p>
|
||||
|
||||
However, it is notoriously error-prone to treat the URL as
|
||||
a string and check if one of the allowed hosts is a substring of the
|
||||
URL. Malicious URLs can bypass such security checks by embedding one
|
||||
of the allowed hosts in an unexpected location.
|
||||
|
||||
</p>
|
||||
|
||||
<p>
|
||||
|
||||
Even if the substring check is not used in a
|
||||
security-critical context, the incomplete check may still cause
|
||||
undesirable behaviors when the check succeeds accidentally.
|
||||
|
||||
</p>
|
||||
</overview>
|
||||
|
||||
<recommendation>
|
||||
<p>
|
||||
|
||||
Parse a URL before performing a check on its host value,
|
||||
and ensure that the check handles arbitrary subdomain sequences
|
||||
correctly.
|
||||
|
||||
</p>
|
||||
</recommendation>
|
||||
|
||||
<example>
|
||||
|
||||
<p>
|
||||
|
||||
The following example code checks that a URL redirection
|
||||
will reach the <code>example.com</code> domain.
|
||||
|
||||
</p>
|
||||
|
||||
<sample src="examples/IncompleteUrlSubstringSanitization.py"/>
|
||||
|
||||
<p>
|
||||
|
||||
The first two examples show unsafe checks that are easily bypassed.
|
||||
In <code>unsafe1</code> the attacker can simply add
|
||||
<code>example.com</code> anywhere in the URL. For example,
|
||||
<code>http://evil-example.net/example.com</code>.
|
||||
</p>
|
||||
<p>
|
||||
In <code>unsafe2</code> the attacker must use a hostname ending in
|
||||
<code>example.com</code>, but that is easy to do. For example,
|
||||
<code>http://benign-looking-prefix-example.com</code>.
|
||||
|
||||
</p>
|
||||
|
||||
<p>
|
||||
|
||||
The last two examples show safe checks.
|
||||
In <code>safe1</code>, a white-list is used. Although fairly inflexible,
|
||||
this is easy to get right and is most likely to be safe.
|
||||
</p>
|
||||
<p>
|
||||
In <code>safe2</code>, <code>urlparse</code> is used to parse the URL,
|
||||
then the hostname is checked to make sure it ends with <code>.example.com</code>.
|
||||
</p>
|
||||
|
||||
</example>
|
||||
|
||||
<references>
|
||||
<li>OWASP: <a href="https://www.owasp.org/index.php/Server_Side_Request_Forgery">SSRF</a></li>
|
||||
<li>OWASP: <a href="https://www.owasp.org/index.php/Unvalidated_Redirects_and_Forwards_Cheat_Sheet">Unvalidated Redirects and Forwards Cheat Sheet</a>.</li>
|
||||
</references>
|
||||
</qhelp>
|
||||
@@ -0,0 +1,58 @@
|
||||
/**
|
||||
* @name Incomplete URL substring sanitization
|
||||
* @description Security checks on the substrings of an unparsed URL are often vulnerable to bypassing.
|
||||
* @kind problem
|
||||
* @problem.severity warning
|
||||
* @precision high
|
||||
* @id py/incomplete-url-substring-sanitization
|
||||
* @tags correctness
|
||||
* security
|
||||
* external/cwe/cwe-20
|
||||
*/
|
||||
|
||||
|
||||
import python
|
||||
import semmle.python.regex
|
||||
|
||||
/**
 * Gets a regular-expression alternation (without enclosing parentheses)
 * listing the top-level domains this query treats as common.
 */
private string commonTopLevelDomainRegex() { result = "com|org|edu|gov|uk|net|io" }
|
||||
|
||||
/**
 * Holds if the text of the string constant `s` looks like a URL or a host name.
 *
 * Two shapes are accepted: a host on one of the common top-level domains,
 * optionally preceded by a scheme-like prefix and/or a leading `.`, or an
 * `http`/`https` URL to a host on any alphabetic top-level domain. Both allow
 * an optional port and an optional trailing `/`.
 */
predicate looksLikeUrl(StrConst s) {
  exists(string urlPattern |
    // host on a common TLD, with optional scheme prefix and leading dot
    urlPattern =
      "(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+(" + commonTopLevelDomainRegex() + ")(:[0-9]+)?/?"
    or
    // target is a HTTP URL to a domain on any TLD
    urlPattern = "(?i)https?://([a-z0-9-]+\\.)+([a-z]+)(:[0-9]+)?/?"
  |
    s.getText().regexpMatch(urlPattern)
  )
}
|
||||
|
||||
/**
 * Holds if `sanitizer` is a check against the URL-like string constant `url`
 * in one of three forms: an `endswith` call whose suffix is not considered
 * safe, a `startswith` call, or an `in` comparison involving `url`.
 */
predicate incomplete_sanitization(Expr sanitizer, StrConst url) {
  (
    unsafe_call_to_endswith(sanitizer, url)
    or
    call_to_startswith(sanitizer, url)
    or
    sanitizer.(Compare).compares(url, any(In i), _)
  ) and
  looksLikeUrl(url)
}
|
||||
|
||||
/**
 * Holds if `sanitizer` calls an attribute named `startswith` and passes the
 * string constant `url` as its first argument.
 */
predicate call_to_startswith(Call sanitizer, StrConst url) {
  url = sanitizer.getArg(0) and
  exists(Attribute method | method = sanitizer.getFunc() | method.getName() = "startswith")
}
|
||||
|
||||
/**
 * Holds if `sanitizer` calls an attribute named `endswith` with the string
 * constant `url` as its first argument, and the text of `url` is not of the
 * form `.label.label...` (a leading `.` followed by at least two labels).
 */
predicate unsafe_call_to_endswith(Call sanitizer, StrConst url) {
  url = sanitizer.getArg(0) and
  exists(Attribute method | method = sanitizer.getFunc() | method.getName() = "endswith") and
  // a suffix that starts with `.` and names at least two labels is not reported
  not url.getText().regexpMatch("(?i)\\.([a-z0-9-]+)(\\.[a-z0-9-]+)+")
}
|
||||
|
||||
// Report every incomplete check, linking the message to the URL-like string constant.
from Expr check, StrConst hostLiteral
where incomplete_sanitization(check, hostLiteral)
select check, "'$@' may be at an arbitrary position in the sanitized URL.", hostLiteral,
  hostLiteral.getText()
|
||||
@@ -0,0 +1,19 @@
|
||||
from flask import Flask, request, redirect
|
||||
import re
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
UNSAFE_REGEX = re.compile("(www|beta).example.com/")
|
||||
SAFE_REGEX = re.compile(r"(www|beta)\.example\.com/")
|
||||
|
||||
@app.route('/some/path/bad')
def unsafe(request):
    """Redirect to ``target`` when it matches the regex with the unescaped dot."""
    # NOTE(review): this parameter shadows the imported flask `request`, and the
    # route declares no URL variables -- confirm how the view is meant to be called.
    destination = request.args.get('target', '')
    if not UNSAFE_REGEX.match(destination):
        return None
    return redirect(destination)
|
||||
|
||||
@app.route('/some/path/good')
def safe(request):
    """Redirect to ``target`` when it matches the regex with escaped dots."""
    destination = request.args.get('target', '')
    if not SAFE_REGEX.match(destination):
        return None
    return redirect(destination)
|
||||
@@ -0,0 +1,41 @@
|
||||
from flask import Flask, request, redirect
|
||||
from urllib.parse import urlparse
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
@app.route('/some/path/bad1')
def unsafe1(request):
    """Redirect whenever the host text appears anywhere in ``target``."""
    # NOTE(review): this parameter shadows the imported flask `request`, and the
    # route declares no URL variables -- confirm how the view is meant to be called.
    destination = request.args.get('target', '')
    # Substring test on the raw, unparsed URL: this is the unsafe pattern.
    if "example.com" in destination:
        return redirect(destination)
    return None
|
||||
|
||||
@app.route('/some/path/bad2')
def unsafe2(request):
    """Redirect whenever the raw ``target`` string ends with the host text."""
    destination = request.args.get('target', '')
    # Suffix test without a leading '.': this is the unsafe pattern.
    if destination.endswith("example.com"):
        return redirect(destination)
    return None
|
||||
|
||||
|
||||
|
||||
#Simplest and safest approach is to use a white-list
|
||||
|
||||
@app.route('/some/path/good1')
def safe1(request):
    """Redirect only when ``target`` is exactly one of the listed destinations."""
    destination = request.args.get('target', '')
    allowed_targets = [
        "example.com/home",
        "example.com/login",
    ]
    if destination in allowed_targets:
        return redirect(destination)
    return None
|
||||
|
||||
#More complex example allowing sub-domains.
|
||||
|
||||
@app.route('/some/path/good2')
def safe2(request):
    """Redirect when the parsed hostname of ``target`` ends with ``.example.com``."""
    destination = request.args.get('target', '')
    hostname = urlparse(destination).hostname
    # Note the '.' preceding example.com
    if hostname and hostname.endswith(".example.com"):
        return redirect(destination)
    return None
|
||||
|
||||
Reference in New Issue
Block a user