Merge remote-tracking branch 'upstream/main' into incomplete-url-string-sanitization

Conflicts:
	config/identical-files.json
	javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.ql
	javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qll
	ruby/ql/src/queries/security/cwe-020/IncompleteUrlSubstringSanitization.qll
This commit is contained in:
Arthur Baars
2022-03-18 16:08:59 +01:00
1308 changed files with 77789 additions and 53601 deletions

View File

@@ -25,7 +25,7 @@ DataFlow::Node relevantTaintSink(string kind) {
or
kind = "CommandInjection" and result instanceof CommandInjection::Sink
or
kind = "XSS" and result instanceof ReflectedXSS::Sink
kind = "XSS" and result instanceof ReflectedXss::Sink
or
kind = "PathInjection" and result instanceof PathInjection::Sink
or

View File

@@ -0,0 +1,202 @@
/**
* Provides predicates for reasoning about regular expressions
* that match URLs and hostname patterns.
*/
private import HostnameRegexpSpecific
/**
 * Holds if the value of the constant `term` contains something that is
 * unlikely to appear in the origin part of a URL.
 */
predicate isConstantInvalidInsideOrigin(RegExpConstant term) {
  exists(string value | value = term.getValue() |
    // The pattern below flags any of the following:
    //  - a character that cannot occur in the origin
    //  - two dashes in a row
    //  - a colon that is not part of a port or scheme separator
    //  - a slash that is not part of a scheme separator
    value.regexpMatch(".*(?:[^a-zA-Z0-9.:/-]|--|:[^0-9/]|(?<![/:]|^)/).*")
  )
}
/**
 * Holds if `term` denotes a literal dot, written either as the escape `\.`
 * or as the single-element, non-inverted character class `[.]`.
 */
predicate isDotConstant(RegExpTerm term) {
  // the `[.]` form: exactly one child, which is the constant `.`
  exists(RegExpCharacterClass dotClass | dotClass = term |
    dotClass.getNumChild() = 1 and
    dotClass.getAChild().(RegExpConstant).getValue() = "." and
    not dotClass.isInverted()
  )
  or
  // the `\.` form
  term.(RegExpCharEscape).getValue() = "."
}
/**
 * Holds if `term` is either a literal `.` character (see `isDotConstant`)
 * or the wildcard `.`.
 */
predicate isDotLike(RegExpTerm term) {
  isDotConstant(term)
  or
  term instanceof RegExpDot
}
/**
 * Holds if `term` can only ever be matched against the beginning of the input.
 *
 * This is the case for the root term, and for terms that start where a parent
 * with this property starts.
 */
predicate matchesBeginningOfString(RegExpTerm term) {
  term.isRootTerm()
  or
  exists(RegExpTerm parent |
    matchesBeginningOfString(parent) and
    (
      // every branch of an alternation, and every child of a group,
      // starts where the parent starts
      term = [parent.(RegExpAlt).getAChild(), parent.(RegExpGroup).getAChild()]
      or
      exists(RegExpSequence parentSeq | parentSeq = parent |
        // the first element of a sequence
        term = parentSeq.getChild(0)
        or
        // the second element, when the first one is a `^` anchor
        parentSeq.getChild(0) instanceof RegExpCaret and
        term = parentSeq.getChild(1)
      )
    )
  )
}
/**
 * Holds if `seq` contains a top-level domain preceded by a dot, such as `.com`,
 * except when that dot/TLD pair sits at the very beginning of the regexp.
 *
 * `i` is bound to the index of the last child in the top-level domain part.
 */
predicate hasTopLevelDomainEnding(RegExpSequence seq, int i) {
  exists(string tldPattern, string constantValue |
    // a common TLD, optionally followed by a port and by a path/query/fragment
    tldPattern = "(?i)" + RegExpPatterns::getACommonTld() + "(:\\d+)?([/?#].*)?" and
    constantValue = seq.getChild(i).(RegExpConstant).getValue() and
    constantValue.regexpMatch(tldPattern)
  ) and
  // the child immediately before the TLD must be a dot (literal or wildcard)
  isDotLike(seq.getChild(i - 1)) and
  (i != 1 or not matchesBeginningOfString(seq))
}
/**
 * Holds if the sequence `seq` contains a top-level domain preceded by a dot,
 * such as `.com`, at some child index.
 */
predicate hasTopLevelDomainEnding(RegExpSequence seq) {
  exists(int i | hasTopLevelDomainEnding(seq, i))
}
/**
 * Holds if `term` will always match a hostname, that is, all disjunctions contain
 * a hostname pattern that isn't inside a quantifier.
 */
predicate alwaysMatchesHostname(RegExpTerm term) {
  hasTopLevelDomainEnding(term, _)
  or
  // `localhost` is considered a hostname pattern, but has no TLD.
  // `regexpMatch` requires the entire string to match, so the pattern is wrapped
  // in `.*` to find `localhost` as a whole word anywhere inside the constant
  // (for example in `http://localhost:8080`), not only when the constant is
  // exactly `localhost`.
  term.(RegExpConstant).getValue().regexpMatch(".*\\blocalhost\\b.*")
  or
  // Any term other than an alternation or a quantifier matches a hostname
  // if one of its children does.
  not term instanceof RegExpAlt and
  not term instanceof RegExpQuantifier and
  alwaysMatchesHostname(term.getAChild())
  or
  // An alternation matches a hostname only if every branch does.
  alwaysMatchesHostnameAlt(term)
}
/**
 * Holds if each child of the alternation `alt` contains a hostname pattern.
 */
predicate alwaysMatchesHostnameAlt(RegExpAlt alt) {
  exists(int lastIndex | lastIndex = alt.getNumChild() - 1 |
    alwaysMatchesHostnameAlt(alt, lastIndex)
  )
}
/**
 * Holds if every child of `alt` at indices `0` through `i` (inclusive)
 * contains a hostname pattern.
 *
 * This is used instead of `forall` to avoid materializing the set of alternatives
 * that don't contain hostnames, which is much larger.
 */
predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) {
// Base case: the child at index 0 matches a hostname.
alwaysMatchesHostname(alt.getChild(0)) and i = 0
or
// Recursive case: children `0 .. i - 1` match, and so does the child at index `i`.
alwaysMatchesHostnameAlt(alt, i - 1) and
alwaysMatchesHostname(alt.getChild(i))
}
/**
 * Holds if `term` is nested, at any depth, inside a quantifier or an
 * alternative (so it cannot be expected to correspond to a unique match),
 * or inside a lookaround assertion (which are rarely used for capture groups).
 */
predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
  // the direct parent is one of the constructs of interest ...
  term.getParent() instanceof RegExpAlt
  or
  term.getParent() instanceof RegExpQuantifier
  or
  term.getParent() instanceof RegExpSubPattern
  or
  // ... or the parent is itself nested inside one
  isInsideChoiceOrSubPattern(term.getParent())
}
/**
 * Holds if `group` is a capturing group that is likely to be used as one,
 * that is, it is not nested inside a choice, quantifier or sub-pattern.
 */
predicate isLikelyCaptureGroup(RegExpGroup group) {
  not isInsideChoiceOrSubPattern(group) and
  group.isCapture()
}
/**
 * Holds if two adjacent children of `seq` are both dots, wildcard or escaped.
 *
 * At least one of these dots is not intended to be a subdomain separator,
 * so we avoid flagging the pattern in this case.
 */
predicate hasConsecutiveDots(RegExpSequence seq) {
  exists(int i, RegExpTerm first, RegExpTerm second |
    first = seq.getChild(i) and
    second = seq.getChild(i + 1)
  |
    isDotLike(first) and
    isDotLike(second)
  )
}
/**
 * Holds if `seq` is a sequence occurring in `regexp` (possibly `regexp` itself)
 * that contains a hostname with a top-level domain ending, and some unescaped `.`
 * occurs before that ending, other than the dot directly in front of the TLD.
 *
 * `msg` is bound to a description of the problem. Its wording depends on
 * whether the parent of the unescaped dot is a quantifier.
 */
predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
seq = regexp.getAChild*() and
exists(RegExpDot unescapedDot, int i, string hostname |
// `i` is the index of the child holding the TLD
hasTopLevelDomainEnding(seq, i) and
// nothing before the TLD may look out of place inside an origin
not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and
// no likely capture group from the TLD onwards
not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and
// the offending dot is somewhere in the children before the TLD
unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and
unescapedDot != seq.getChild(i - 1) and // Should not be the '.' immediately before the TLD
not hasConsecutiveDots(unescapedDot.getParent()) and
// the reported hostname is built from the raw text of the three children ending at the TLD
hostname =
seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() +
seq.getChild(i).getRawValue()
|
if unescapedDot.getParent() instanceof RegExpQuantifier
then
// `.*\.example.com` can match `evil.com/?x=.example.com`
//
// This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin.
// We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`.
// Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
// and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor.
seq.getChild(0) instanceof RegExpCaret and
not seq.getAChild() instanceof RegExpDollar and
seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
msg =
"has an unrestricted wildcard '" + unescapedDot.getParent().(RegExpQuantifier).getRawValue()
+ "' which may cause '" + hostname +
"' to be matched anywhere in the URL, outside the hostname."
else
// a plain unescaped dot, not under a quantifier
msg =
"has an unescaped '.' before '" + hostname +
"', so it might match more hosts than expected."
)
}
/**
 * Holds if `hostSequence` is an incomplete hostname pattern inside the regular
 * expression term of some pattern source.
 *
 * `message` is the alert text, `aux` is the node the message's `$@` placeholder
 * (if any) refers to, and `label` is the link text for that node.
 */
predicate incompleteHostnameRegExp(
  RegExpSequence hostSequence, string message, DataFlow::Node aux, string label
) {
  exists(RegExpPatternSource re, string msg, string kind |
    isIncompleteHostNameRegExpPattern(re.getRegExpTerm(), hostSequence, msg) and
    (
      // the pattern source is parsed somewhere other than at the source itself
      re.getAParse() != re and
      kind = "string, which is used as a regular expression $@," and
      aux = re.getAParse()
      or
      // otherwise the source is reported directly as a regular expression
      not re.getAParse() != re and
      kind = "regular expression" and
      aux = re
    ) and
    message = "This " + kind + " " + msg and
    label = "here"
  )
}

View File

@@ -0,0 +1,2 @@
import codeql.ruby.security.performance.RegExpTreeView
import codeql.ruby.DataFlow

View File

@@ -0,0 +1,72 @@
<!DOCTYPE qhelp PUBLIC
"-//Semmle//qhelp//EN"
"qhelp.dtd">
<qhelp>
<overview>
<p>
Sanitizing untrusted URLs is an important technique for
preventing attacks such as request forgeries and malicious
redirections. Often, this is done by checking that the host of a URL
is in a set of allowed hosts.
</p>
<p>
If a regular expression implements such a check, it is
easy to accidentally make the check too permissive by not escaping the
<code>.</code> meta-characters appropriately.
Even if the check is not used in a security-critical
context, the incomplete check may still cause undesirable behaviors
when it accidentally succeeds.
</p>
</overview>
<recommendation>
<p>
Escape all meta-characters appropriately when constructing
regular expressions for security checks, and pay special attention to the
<code>.</code> meta-character.
</p>
</recommendation>
<example>
<p>
The following example code checks that a URL redirection
will reach the <code>example.com</code> domain, or one of its
subdomains.
</p>
<sample src="examples/IncompleteHostnameRegExp.rb"/>
<p>
The check is, however, easy to bypass because the unescaped
<code>.</code> allows for any character before
<code>example.com</code>, effectively allowing the redirect to go to
an attacker-controlled domain such as <code>wwwXexample.com</code>.
</p>
<p>
Address this vulnerability by escaping <code>.</code>
appropriately: <code>regex = /^((www|beta)\.)?example\.com/</code>.
</p>
</example>
<references>
<li>OWASP: <a href="https://www.owasp.org/index.php/Server_Side_Request_Forgery">SSRF</a></li>
<li>OWASP: <a href="https://cheatsheetseries.owasp.org/cheatsheets/Unvalidated_Redirects_and_Forwards_Cheat_Sheet.html">Unvalidated Redirects and Forwards Cheat Sheet</a></li>
</references>
</qhelp>

View File

@@ -0,0 +1,16 @@
/**
* @name Incomplete regular expression for hostnames
* @description Matching a URL or hostname against a regular expression that contains an unescaped dot as part of the hostname might match more hostnames than expected.
* @kind problem
* @problem.severity warning
* @security-severity 7.8
* @precision high
* @id rb/incomplete-hostname-regexp
* @tags correctness
* security
* external/cwe/cwe-020
*/
import HostnameRegexpShared
// The results of this query are exactly the tuples of the 4-argument
// predicate `incompleteHostnameRegExp`.
query predicate problems = incompleteHostnameRegExp/4;

View File

@@ -31,7 +31,7 @@ query predicate problems(
(
// target contains a domain on a common TLD, and perhaps some other URL components
target
.regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+" + RegExpPatterns::commonTLD() +
.regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+" + RegExpPatterns::getACommonTld() +
"(:[0-9]+)?/?")
or
// target is a HTTP URL to a domain on any TLD

View File

@@ -0,0 +1,13 @@
# Example controller that redirects to a URL taken from the request
# after checking its host against a regular expression.
class AppController < ApplicationController
# Redirects to the `url` request parameter when its host matches `regex`.
def index
# `url` is read directly from the request parameters
url = params[:url]
host = URI(url).host
# BAD: the host of `url` may be controlled by an attacker
# The `.` characters in the pattern below are not escaped, so each one
# matches any character rather than only a literal dot.
regex = /^((www|beta).)?example.com/
if host.match(regex)
redirect_to url
end
end
end

View File

@@ -18,7 +18,7 @@ import codeql.ruby.security.ReflectedXSSQuery
import codeql.ruby.DataFlow
import DataFlow::PathGraph
from ReflectedXSS::Configuration config, DataFlow::PathNode source, DataFlow::PathNode sink
from ReflectedXss::Configuration config, DataFlow::PathNode source, DataFlow::PathNode sink
where config.hasFlowPath(source, sink)
select sink.getNode(), source, sink, "Cross-site scripting vulnerability due to $@.",
source.getNode(), "a user-provided value"

View File

@@ -17,7 +17,7 @@ import codeql.ruby.security.StoredXSSQuery
import codeql.ruby.DataFlow
import DataFlow::PathGraph
from StoredXSS::Configuration config, DataFlow::PathNode source, DataFlow::PathNode sink
from StoredXss::Configuration config, DataFlow::PathNode source, DataFlow::PathNode sink
where config.hasFlowPath(source, sink)
select sink.getNode(), source, sink, "Cross-site scripting vulnerability due to $@",
source.getNode(), "stored value"

View File

@@ -19,8 +19,8 @@ import codeql.ruby.dataflow.RemoteFlowSources
import codeql.ruby.TaintTracking
import DataFlow::PathGraph
class SQLInjectionConfiguration extends TaintTracking::Configuration {
SQLInjectionConfiguration() { this = "SQLInjectionConfiguration" }
class SqlInjectionConfiguration extends TaintTracking::Configuration {
SqlInjectionConfiguration() { this = "SQLInjectionConfiguration" }
override predicate isSource(DataFlow::Node source) { source instanceof RemoteFlowSource }
@@ -32,7 +32,7 @@ class SQLInjectionConfiguration extends TaintTracking::Configuration {
}
}
from SQLInjectionConfiguration config, DataFlow::PathNode source, DataFlow::PathNode sink
from SqlInjectionConfiguration config, DataFlow::PathNode source, DataFlow::PathNode sink
where config.hasFlowPath(source, sink)
select sink.getNode(), source, sink, "This SQL query depends on $@.", source.getNode(),
"a user-provided value"

View File

@@ -16,6 +16,6 @@
import codeql.ruby.security.BadTagFilterQuery
from HTMLMatchingRegExp regexp, string msg
from HtmlMatchingRegExp regexp, string msg
where msg = min(string m | isBadRegexpFilter(regexp, m) | m order by m.length(), m) // there might be multiple, we arbitrarily pick the shortest one
select regexp, msg