Merge pull request #14034 from geoffw0/hostname

Swift: New query: Incomplete regular expression for hostnames
This commit is contained in:
Mathias Vorreiter Pedersen
2023-08-30 11:33:36 +01:00
committed by GitHub
10 changed files with 315 additions and 9 deletions

View File

@@ -8,6 +8,35 @@ private import codeql.swift.dataflow.DataFlow
private import internal.ParseRegex
private import internal.RegexTracking
/**
* A data flow node whose value may flow to a position where it is interpreted
* as a part of a regular expression. For example the string literal
* `"(a|b).*"` in:
* ```
* Regex("(a|b).*").firstMatch(in: myString)
* ```
*/
abstract class RegexPatternSource extends DataFlow::Node {
/**
* Gets a node where the pattern of this node is parsed as a part of
* a regular expression.
*/
abstract DataFlow::Node getAParse();
/**
* Gets the root term of the regular expression parsed from this pattern.
*/
abstract RegExpTerm getRegExpTerm();
}
/**
* For each `RegexPatternSource` data flow node, the corresponding `Expr` is
* a `Regex`. This is a simple wrapper to make that happen.
*/
private class RegexFromRegexPatternSource extends RegExp {
RegexFromRegexPatternSource() { this = any(RegexPatternSource node).asExpr() }
}
/**
* A string literal that is used as a regular expression. For example
* the string literal `"(a|b).*"` in:
@@ -15,16 +44,18 @@ private import internal.RegexTracking
* Regex("(a|b).*").firstMatch(in: myString)
* ```
*/
private class ParsedStringRegex extends RegExp, StringLiteralExpr {
private class ParsedStringRegex extends RegexPatternSource {
StringLiteralExpr expr;
DataFlow::Node use;
ParsedStringRegex() { StringLiteralUseFlow::flow(DataFlow::exprNode(this), use) }
ParsedStringRegex() {
expr = this.asExpr() and
StringLiteralUseFlow::flow(this, use)
}
/**
* Gets a dataflow node where this string literal is used as a regular
* expression.
*/
DataFlow::Node getUse() { result = use }
override DataFlow::Node getAParse() { result = use }
override RegExpTerm getRegExpTerm() { result.getRegExp() = expr }
}
/**
@@ -246,11 +277,11 @@ abstract class RegexEval extends CallExpr {
*/
RegExp getARegex() {
// string literal used directly as a regex
result.(ParsedStringRegex).getUse().asExpr() = this.getRegexInput()
DataFlow::exprNode(result).(ParsedStringRegex).getAParse().asExpr() = this.getRegexInput()
or
// string literal -> regex object -> use
exists(RegexCreation regexCreation |
result.(ParsedStringRegex).getUse() = regexCreation.getStringInput() and
DataFlow::exprNode(result).(ParsedStringRegex).getAParse() = regexCreation.getStringInput() and
RegexUseFlow::flow(regexCreation, DataFlow::exprNode(this.getRegexInput()))
)
}

View File

@@ -0,0 +1,21 @@
/**
* Provides predicates for reasoning about regular expressions
* that match URLs and hostname patterns.
*/
private import swift
private import codeql.swift.dataflow.DataFlow
private import codeql.swift.regex.Regex as Regex
private import codeql.swift.regex.RegexTreeView::RegexTreeView as TreeImpl
private import codeql.regex.HostnameRegexp as Shared
/**
* An implementation of the signature that allows the Hostname analysis to run.
*/
private module Impl implements Shared::HostnameRegexpSig<TreeImpl> {
class DataFlowNode = DataFlow::Node;
class RegExpPatternSource = Regex::RegexPatternSource;
}
import Shared::Make<TreeImpl, Impl>

View File

@@ -0,0 +1,5 @@
---
category: newQuery
---
* Added new query "Incomplete regular expression for hostnames" (`swift/incomplete-hostname-regexp`). This query finds regular expressions matching a URL or hostname that may match more hostnames than expected.

View File

@@ -0,0 +1,74 @@
<!DOCTYPE qhelp PUBLIC
"-//Semmle//qhelp//EN"
"qhelp.dtd">
<qhelp>
<overview>
<p>
Sanitizing untrusted URLs is an important technique for
preventing attacks such as request forgeries and malicious
redirections. Often, this is done by checking that the host of a URL
is in a set of allowed hosts.
</p>
<p>
If a regular expression implements such a check, it is
easy to accidentally make the check too permissive by not escaping the
<code>.</code> meta-characters appropriately.
Even if the check is not used in a security-critical
context, the incomplete check may still cause undesirable behaviors
when it accidentally succeeds.
</p>
</overview>
<recommendation>
<p>
Escape all meta-characters appropriately when constructing
regular expressions for security checks, and pay special attention to the
<code>.</code> meta-character.
</p>
</recommendation>
<example>
<p>
The following example code checks that a URL redirection
will reach the <code>example.com</code> domain, or one of its
subdomains.
</p>
<sample src="IncompleteHostnameRegexBad.swift"/>
<p>
The check is, however, easy to bypass because the unescaped
<code>.</code> allows for any character before
<code>example.com</code>, effectively allowing the redirect to go to
an attacker-controlled domain such as <code>wwwXexample.com</code>.
</p>
<p>
Address this vulnerability by escaping <code>.</code>
to <code>\.</code>:
</p>
<sample src="IncompleteHostnameRegexGood.swift"/>
</example>
<references>
<li>OWASP: <a href="https://www.owasp.org/index.php/Server_Side_Request_Forgery">Server Side Request Forgery</a>.</li>
<li>OWASP: <a href="https://cheatsheetseries.owasp.org/cheatsheets/Unvalidated_Redirects_and_Forwards_Cheat_Sheet.html">Unvalidated Redirects and Forwards Cheat Sheet</a>.</li>
</references>
</qhelp>

View File

@@ -0,0 +1,16 @@
/**
* @name Incomplete regular expression for hostnames
* @description Matching a URL or hostname against a regular expression that contains an unescaped dot as part of the hostname may match more hostnames than expected.
* @kind problem
* @problem.severity warning
* @security-severity 7.8
* @precision high
* @id swift/incomplete-hostname-regexp
* @tags correctness
* security
* external/cwe/cwe-020
*/
private import codeql.swift.security.regex.HostnameRegex as HostnameRegex
query predicate problems = HostnameRegex::incompleteHostnameRegExp/4;

View File

@@ -0,0 +1,12 @@
func handleUrl(_ urlString: String) {
// get the 'url=' parameter from the URL
let components = URLComponents(string: urlString)
let redirectParam = components?.queryItems?.first(where: { $0.name == "url" })
// check we trust the host
let regex = #/^(www|beta).example.com//# // BAD
if let match = redirectParam?.value?.firstMatch(of: regex) {
// ... trust the URL ...
}
}

View File

@@ -0,0 +1,12 @@
func handleUrl(_ urlString: String) {
// get the 'url=' parameter from the URL
let components = URLComponents(string: urlString)
let redirectParam = components?.queryItems?.first(where: { $0.name == "url" })
// check we trust the host
let regex = #/^(www|beta)\.example\.com//# // GOOD
if let match = redirectParam?.value?.firstMatch(of: regex) {
// ... trust the URL ...
}
}

View File

@@ -0,0 +1,21 @@
| test.swift:60:17:60:40 | ^http://test.example.com/ | This regular expression has an unescaped '.' before 'example.com/', so it might match more hosts than expected. | test.swift:60:16:60:16 | ^http://test.example.com/ | here |
| test.swift:63:17:63:40 | ^http://test.example.net/ | This regular expression has an unescaped '.' before 'example.net/', so it might match more hosts than expected. | test.swift:63:16:63:16 | ^http://test.example.net/ | here |
| test.swift:64:17:64:54 | ^http://test.(example-a\|example-b).com/ | This regular expression has an unescaped '.' before '(example-a\|example-b).com/', so it might match more hosts than expected. | test.swift:64:16:64:16 | ^http://test.(example-a\|example-b).com/ | here |
| test.swift:65:17:65:40 | ^http://(.+).example.com/ | This regular expression has an unescaped '.' before 'example.com/', so it might match more hosts than expected. | test.swift:65:16:65:16 | ^http://(.+).example.com/ | here |
| test.swift:65:17:65:40 | ^http://(.+).example.com/ | This regular expression has an unrestricted wildcard '.+' which may cause 'example.com/' to be matched anywhere in the URL, outside the hostname. | test.swift:65:16:65:16 | ^http://(.+).example.com/ | here |
| test.swift:67:17:67:49 | ^http://(?:.+)\\.test\\.example.com/ | This regular expression has an unrestricted wildcard '.+' which may cause 'example.com/' to be matched anywhere in the URL, outside the hostname. | test.swift:67:16:67:16 | ^http://(?:.+)\\.test\\.example.com/ | here |
| test.swift:68:17:68:46 | ^http://test.example.com/(?:.*) | This regular expression has an unescaped '.' before 'example.com/', so it might match more hosts than expected. | test.swift:68:16:68:16 | ^http://test.example.com/(?:.*) | here |
| test.swift:70:17:70:63 | ^(https?:)?//((service\|www).)?example.com(?=$\|/) | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | test.swift:70:16:70:16 | ^(https?:)?//((service\|www).)?example.com(?=$\|/) | here |
| test.swift:71:17:71:51 | ^(http\|https)://www.example.com/p/f/ | This regular expression has an unescaped '.' before 'example.com/p/f/', so it might match more hosts than expected. | test.swift:71:16:71:16 | ^(http\|https)://www.example.com/p/f/ | here |
| test.swift:72:19:72:40 | http://sub.example.com/ | This regular expression has an unescaped '.' before 'example.com/', so it might match more hosts than expected. | test.swift:72:16:72:16 | ^(http://sub.example.com/) | here |
| test.swift:73:17:73:41 | ^https?://api.example.com/ | This regular expression has an unescaped '.' before 'example.com/', so it might match more hosts than expected. | test.swift:73:16:73:16 | ^https?://api.example.com/ | here |
| test.swift:75:17:75:43 | ^https://[a-z]*.example.com$ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | test.swift:75:16:75:16 | ^https://[a-z]*.example.com$ | here |
| test.swift:77:39:77:51 | .+.example.net | This regular expression has an unescaped '.' before 'example.net', so it might match more hosts than expected. | test.swift:77:16:77:16 | ^protos?://(localhost\|.+.example.net\|.+.example-a.com\|.+.example-b.com\|.+.example.internal) | here |
| test.swift:77:54:77:68 | .+.example-a.com | This regular expression has an unescaped '.' before 'example-a.com', so it might match more hosts than expected. | test.swift:77:16:77:16 | ^protos?://(localhost\|.+.example.net\|.+.example-a.com\|.+.example-b.com\|.+.example.internal) | here |
| test.swift:77:71:77:85 | .+.example-b.com | This regular expression has an unescaped '.' before 'example-b.com', so it might match more hosts than expected. | test.swift:77:16:77:16 | ^protos?://(localhost\|.+.example.net\|.+.example-a.com\|.+.example-b.com\|.+.example.internal) | here |
| test.swift:81:19:81:33 | foo.example\\.com | This regular expression has an unescaped '.' before 'example\\.com', so it might match more hosts than expected. | test.swift:81:16:81:16 | ^(foo.example\\.com\|whatever)$ | here |
| test.swift:83:17:83:33 | ^test.example.com$ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | test.swift:83:16:83:16 | ^test.example.com$ | here |
| test.swift:84:17:84:31 | test.example.com | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | test.swift:84:16:84:16 | test.example.com | here |
| test.swift:86:26:86:41 | test.example.com$ | This string, which is used as a regular expression $@, has an unescaped '.' before 'example.com', so it might match more hosts than expected. | test.swift:86:16:86:48 | call to id(_:) | here |
| test.swift:92:21:92:36 | test.example.com$ | This string, which is used as a regular expression $@, has an unescaped '.' before 'example.com', so it might match more hosts than expected. | test.swift:93:16:93:23 | .hostname | here |
| test.swift:98:29:98:44 | test.example.com$ | This string, which is used as a regular expression $@, has an unescaped '.' before 'example.com', so it might match more hosts than expected. | test.swift:96:20:96:27 | .hostname | here |

View File

@@ -0,0 +1 @@
queries/Security/CWE-020/IncompleteHostnameRegex.ql

View File

@@ -0,0 +1,113 @@
// --- stubs ---
struct URL {
init?(string: String) {}
}
extension String {
init(contentsOf: URL) {
let data = ""
self.init(data)
}
}
struct AnyRegexOutput {
}
protocol RegexComponent<RegexOutput> {
associatedtype RegexOutput
}
struct Regex<Output> : RegexComponent {
struct Match {
}
init(_ pattern: String) throws where Output == AnyRegexOutput { }
func firstMatch(in string: String) throws -> Regex<Output>.Match? { return nil}
func wholeMatch(in string: String) throws -> Regex<Output>.Match? { return nil }
typealias RegexOutput = Output
}
extension String : RegexComponent {
typealias Output = Substring
typealias RegexOutput = String.Output
}
// --- tests ---
func id(_ s : String) -> String { return s }
struct MyDomain {
init(_ hostname: String) {
self.hostname = hostname
}
var hostname: String
}
func testHostnames(myUrl: URL) throws {
let tainted = String(contentsOf: myUrl) // tainted
_ = try Regex(#"^http://example\.com/"#).firstMatch(in: tainted) // GOOD
_ = try Regex(#"^http://example.com/"#).firstMatch(in: tainted) // GOOD (only '.' here gives a valid top-level domain)
_ = try Regex(#"^http://example.com"#).firstMatch(in: tainted) // BAD (missing anchor) [NOT DETECTED]
_ = try Regex(#"^http://test\.example\.com/"#).firstMatch(in: tainted) // GOOD
_ = try Regex(#"^http://test\.example.com/"#).firstMatch(in: tainted) // GOOD (only '.' here gives a valid top-level domain)
_ = try Regex(#"^http://test\.example.com"#).firstMatch(in: tainted) // BAD (missing anchor) [NOT DETECTED]
_ = try Regex(#"^http://test.example.com/"#).firstMatch(in: tainted) // BAD (incomplete hostname)
_ = try Regex(#"^http://test[.]example[.]com/"#).firstMatch(in: tainted) // GOOD (alternative method of escaping)
_ = try Regex(#"^http://test.example.net/"#).firstMatch(in: tainted) // BAD (incomplete hostname)
_ = try Regex(#"^http://test.(example-a|example-b).com/"#).firstMatch(in: tainted) // BAD (incomplete hostname)
_ = try Regex(#"^http://(.+).example.com/"#).firstMatch(in: tainted) // BAD (incomplete hostname x 2)
_ = try Regex(#"^http://(\.+)\.example.com/"#).firstMatch(in: tainted) // GOOD
_ = try Regex(#"^http://(?:.+)\.test\.example.com/"#).firstMatch(in: tainted) // BAD (incomplete hostname)
_ = try Regex(#"^http://test.example.com/(?:.*)"#).firstMatch(in: tainted) // BAD (incomplete hostname)
_ = try Regex(#"^(.+\.(?:example-a|example-b)\.com)/"#).firstMatch(in: tainted) // BAD (missing anchor) [NOT DETECTED]
_ = try Regex(#"^(https?:)?//((service|www).)?example.com(?=$|/)"#).firstMatch(in: tainted) // BAD (incomplete hostname)
_ = try Regex(#"^(http|https)://www.example.com/p/f/"#).firstMatch(in: tainted) // BAD (incomplete hostname)
_ = try Regex(#"^(http://sub.example.com/)"#).firstMatch(in: tainted) // BAD (incomplete hostname)
_ = try Regex(#"^https?://api.example.com/"#).firstMatch(in: tainted) // BAD (incomplete hostname)
_ = try Regex(#"^http[s]?://?sub1\.sub2\.example\.com/f/(.+)"#).firstMatch(in: tainted) // GOOD (it has a capture group after the TLD, so should be ignored)
_ = try Regex(#"^https://[a-z]*.example.com$"#).firstMatch(in: tainted) // BAD (incomplete hostname)
_ = try Regex(#"^(example.dev|example.com)"#).firstMatch(in: tainted) // GOOD (any extended hostname wouldn't be included in the capture group)
_ = try Regex(#"^protos?://(localhost|.+.example.net|.+.example-a.com|.+.example-b.com|.+.example.internal)"#).firstMatch(in: tainted) // BAD (incomplete hostname x3, missing anchor x 1)
_ = try Regex(#"^http://(..|...)\.example\.com/index\.html"#).firstMatch(in: tainted) // GOOD (wildcards are intentional)
_ = try Regex(#"^http://.\.example\.com/index\.html"#).firstMatch(in: tainted) // GOOD (the wildcard is intentional)
_ = try Regex(#"^(foo.example\.com|whatever)$"#).firstMatch(in: tainted) // DUBIOUS (one disjunction doesn't even look like a hostname) [DETECTED incomplete hostname]
_ = try Regex(#"^test.example.com$"#).firstMatch(in: tainted) // BAD (incomplete hostname)
_ = try Regex(#"test.example.com"#).wholeMatch(in: tainted) // BAD (incomplete hostname, missing anchor)
_ = try Regex(id(id(id(#"test.example.com$"#)))).firstMatch(in: tainted) // BAD (incomplete hostname)
let hostname = #"test.example.com$"# // BAD (incomplete hostname) [NOT DETECTED]
_ = try Regex("\(hostname)").firstMatch(in: tainted)
var domain = MyDomain("")
domain.hostname = #"test.example.com$"# // BAD (incomplete hostname)
_ = try Regex(domain.hostname).firstMatch(in: tainted)
func convert1(_ domain: MyDomain) throws -> Regex<AnyRegexOutput> {
return try Regex(domain.hostname)
}
_ = try convert1(MyDomain(#"test.example.com$"#)).firstMatch(in: tainted) // BAD (incomplete hostname)
let domains = [ MyDomain(#"test.example.com$"#) ] // BAD (incomplete hostname) [NOT DETECTED]
func convert2(_ domain: MyDomain) throws -> Regex<AnyRegexOutput> {
return try Regex(domain.hostname)
}
_ = try domains.map({ try convert2($0).firstMatch(in: tainted) })
let primary = "example.com$"
_ = try Regex("test." + primary).firstMatch(in: tainted) // BAD (incomplete hostname) [NOT DETECTED]
_ = try Regex("test." + "example.com$").firstMatch(in: tainted) // BAD (incomplete hostname) [NOT DETECTED]
_ = try Regex(#"^http://localhost:8000|" + "^https?://.+\.example\.com/"#).firstMatch(in: tainted) // BAD (incomplete hostname) [NOT DETECTED]
_ = try Regex(#"^http://localhost:8000|" + "^https?://.+.example\.com/"#).firstMatch(in: tainted) // BAD (incomplete hostname) [NOT DETECTED]
let harmless = #"^http://test.example.com"# // GOOD (never used as a regex)
}