add the bad tag filter query to ruby

This commit is contained in:
Erik Krogh Kristensen
2021-10-26 15:18:51 +02:00
parent c15ddf6e92
commit 97264b5dda
9 changed files with 435 additions and 3 deletions

View File

@@ -0,0 +1,306 @@
/**
* Provides precicates for reasoning about bad tag filter vulnerabilities.
*/
import performance.ReDoSUtil
/**
* A module for determining if a regexp matches a given string,
* and reasoning about which capture groups are filled by a given string.
*/
private module RegexpMatching {
/**
* A class to test whether a regular expression matches a string.
* Override this class and extend `test`/`testWithGroups` to configure which strings should be tested for acceptance by this regular expression.
* The result can afterwards be read from the `matches` predicate.
*
* Strings in the `testWithGroups` predicate are also tested for which capture groups are filled by the given string.
* The result is available in the `fillCaptureGroup` predicate.
*/
abstract class MatchedRegExp extends RegExpTerm {
MatchedRegExp() { this.isRootTerm() }
/**
* Holds if it should be tested whether this regular expression matches `str`.
*
* If `ignorePrefix` is true, then a regexp without a start anchor will be treated as if it had a start anchor.
* E.g. a regular expression `/foo$/` will match any string that ends with "foo",
* but if `ignorePrefix` is true, it will only match "foo".
*/
predicate test(string str, boolean ignorePrefix) {
none() // maybe overriden in subclasses
}
/**
* Same as `test(..)`, but where the `fillsCaptureGroup` afterwards tells which capture groups were filled by the given string.
*/
predicate testWithGroups(string str, boolean ignorePrefix) {
none() // maybe overriden in subclasses
}
/**
* Holds if this RegExp matches `str`, where `str` is either in the `test` or `testWithGroups` predicate.
*/
final predicate matches(string str) {
exists(State state | state = getAState(this, str.length() - 1, str, _) |
epsilonSucc*(state) = Accept(_)
)
}
/**
* Holds if matching `str` may fill capture group number `g`.
* Only holds if `str` is in the `testWithGroups` predicate.
*/
final predicate fillsCaptureGroup(string str, int g) {
exists(State s |
s = getAStateThatReachesAccept(this, _, str, _) and
g = group(s.getRepr())
)
}
}
/**
* Gets a state the regular expression `reg` can be in after matching the `i`th char in `str`.
* The regular expression is modelled as a non-determistic finite automaton,
* the regular expression can therefore be in multiple states after matching a character.
*
* It's a forward search to all possible states, and there is thus no guarantee that the state is on a path to an accepting state.
*/
private State getAState(MatchedRegExp reg, int i, string str, boolean ignorePrefix) {
// start state, the -1 position before any chars have been matched
i = -1 and
(
reg.test(str, ignorePrefix)
or
reg.testWithGroups(str, ignorePrefix)
) and
result.getRepr().getRootTerm() = reg and
isStartState(result)
or
// recursive case
result = getAStateAfterMatching(reg, _, str, i, _, ignorePrefix)
}
/**
* Gets the next state after the `prev` state from `reg`.
* `prev` is the state after matching `fromIndex` chars in `str`,
* and the result is the state after matching `toIndex` chars in `str`.
*
* This predicate is used as a step relation in the forwards search (`getAState`),
* and also as a step relation in the later backwards search (`getAStateThatReachesAccept`).
*/
private State getAStateAfterMatching(
MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
) {
// the basic recursive case - outlined into a noopt helper to make performance work out.
result = getAStateAfterMatchingAux(reg, prev, str, toIndex, fromIndex, ignorePrefix)
or
// we can skip past word boundaries if the next char is a non-word char.
fromIndex = toIndex and
prev.getRepr() instanceof RegExpWordBoundary and
prev = getAState(reg, toIndex, str, ignorePrefix) and
after(prev.getRepr()) = result and
str.charAt(toIndex + 1).regexpMatch("\\W") // \W matches any non-word char.
}
pragma[noopt]
private State getAStateAfterMatchingAux(
MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
) {
prev = getAState(reg, fromIndex, str, ignorePrefix) and
fromIndex = toIndex - 1 and
exists(string char | char = str.charAt(toIndex) | specializedDeltaClosed(prev, char, result)) and
not discardedPrefixStep(prev, result, ignorePrefix)
}
/** Holds if a step from `prev` to `next` should be discarded when the `ignorePrefix` flag is set. */
private predicate discardedPrefixStep(State prev, State next, boolean ignorePrefix) {
prev = mkMatch(any(RegExpRoot r)) and
ignorePrefix = true and
next = prev
}
// The `deltaClosed` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
private predicate specializedDeltaClosed(State prev, string char, State next) {
deltaClosed(prev, specializedGetAnInputSymbolMatching(char), next)
}
// The `getAnInputSymbolMatching` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
pragma[noinline]
private InputSymbol specializedGetAnInputSymbolMatching(string char) {
exists(string s, MatchedRegExp r |
r.test(s, _)
or
r.testWithGroups(s, _)
|
char = s.charAt(_)
) and
result = getAnInputSymbolMatching(char)
}
/**
* Gets the `i`th state on a path to the accepting state when `reg` matches `str`.
* Starts with an accepting state as found by `getAState` and searches backwards
* to the start state through the reachable states (as found by `getAState`).
*
* This predicate holds the invariant that the result state can be reached with `i` steps from a start state,
* and an accepting state can be found after (`str.length() - 1 - i`) steps from the result.
* The result state is therefore always on a valid path where `reg` accepts `str`.
*
* This predicate is only used to find which capture groups a regular expression has filled,
* and thus the search is only performed for the strings in the `testWithGroups(..)` predicate.
*/
private State getAStateThatReachesAccept(
MatchedRegExp reg, int i, string str, boolean ignorePrefix
) {
// base case, reaches an accepting state from the last state in `getAState(..)`
reg.testWithGroups(str, ignorePrefix) and
i = str.length() - 1 and
result = getAState(reg, i, str, ignorePrefix) and
epsilonSucc*(result) = Accept(_)
or
// recursive case. `next` is the next state to be matched after matching `prev`.
// this predicate is doing a backwards search, so `prev` is the result we are looking for.
exists(State next, State prev, int fromIndex, int toIndex |
next = getAStateThatReachesAccept(reg, toIndex, str, ignorePrefix) and
next = getAStateAfterMatching(reg, prev, str, toIndex, fromIndex, ignorePrefix) and
i = fromIndex and
result = prev
)
}
/** Gets the capture group number that `term` belongs to. */
private int group(RegExpTerm term) {
exists(RegExpGroup grp | grp.getNumber() = result | term.getParent*() = grp)
}
}
/** A class to test whether a regular expression matches certain HTML tags. */
class HTMLMatchingRegExp extends RegexpMatching::MatchedRegExp {
HTMLMatchingRegExp() {
// the regexp must mention "<" and ">" explicitly.
forall(string angleBracket | angleBracket = ["<", ">"] |
any(RegExpConstant term | term.getValue().matches("%" + angleBracket + "%")).getRootTerm() =
this
)
}
override predicate testWithGroups(string str, boolean ignorePrefix) {
ignorePrefix = true and
str = ["<!-- foo -->", "<!-- foo --!>", "<!- foo ->", "<foo>", "<script>"]
}
override predicate test(string str, boolean ignorePrefix) {
ignorePrefix = true and
str =
[
"<!-- foo -->", "<!- foo ->", "<!-- foo --!>", "<!-- foo\n -->", "<script>foo</script>",
"<script \n>foo</script>", "<script >foo\n</script>", "<foo ></foo>", "<foo>",
"<foo src=\"foo\"></foo>", "<script>", "<script src=\"foo\"></script>",
"<script src='foo'></script>", "<SCRIPT>foo</SCRIPT>", "<script\tsrc=\"foo\"/>",
"<script\tsrc='foo'></script>", "<sCrIpT>foo</ScRiPt>", "<script src=\"foo\">foo</script >",
"<script src=\"foo\">foo</script foo=\"bar\">", "<script src=\"foo\">foo</script\t\n bar>"
]
}
}
/**
* Holds if `regexp` matches some HTML tags, but misses some HTML tags that it should match.
*
* When adding a new case to this predicate, make sure the test string used in `matches(..)` calls are present in `HTMLMatchingRegExp::test` / `HTMLMatchingRegExp::testWithGroups`.
*/
predicate isBadRegexpFilter(HTMLMatchingRegExp regexp, string msg) {
// CVE-2021-33829 - matching both "<!-- foo -->" and "<!-- foo --!>", but in different capture groups
regexp.matches("<!-- foo -->") and
regexp.matches("<!-- foo --!>") and
exists(int a, int b | a != b |
regexp.fillsCaptureGroup("<!-- foo -->", a) and
// <!-- foo --> might be ambigously parsed (matching both capture groups), and that is ok here.
regexp.fillsCaptureGroup("<!-- foo --!>", b) and
not regexp.fillsCaptureGroup("<!-- foo --!>", a) and
msg =
"Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group "
+ a + " and comments ending with --!> are matched with capture group " +
strictconcat(int i | regexp.fillsCaptureGroup("<!-- foo --!>", i) | i.toString(), ", ") +
"."
)
or
// CVE-2020-17480 - matching "<!-- foo -->" and other tags, but not "<!-- foo --!>".
exists(int group, int other |
group != other and
regexp.fillsCaptureGroup("<!-- foo -->", group) and
regexp.fillsCaptureGroup("<foo>", other) and
not regexp.matches("<!-- foo --!>") and
not regexp.fillsCaptureGroup("<!-- foo -->", any(int i | i != group)) and
not regexp.fillsCaptureGroup("<!- foo ->", group) and
not regexp.fillsCaptureGroup("<foo>", group) and
not regexp.fillsCaptureGroup("<script>", group) and
msg =
"This regular expression only parses --> (capture group " + group +
") and not --!> as a HTML comment end tag."
)
or
regexp.matches("<!-- foo -->") and
not regexp.matches("<!-- foo\n -->") and
not regexp.matches("<!- foo ->") and
not regexp.matches("<foo>") and
not regexp.matches("<script>") and
msg = "This regular expression does not match comments containing newlines."
or
regexp.matches("<script>foo</script>") and
regexp.matches("<script src=\"foo\"></script>") and
not regexp.matches("<foo ></foo>") and
(
not regexp.matches("<script \n>foo</script>") and
msg = "This regular expression matches <script></script>, but not <script \\n></script>"
or
not regexp.matches("<script >foo\n</script>") and
msg = "This regular expression matches <script>...</script>, but not <script >...\\n</script>"
)
or
regexp.matches("<script>foo</script>") and
regexp.matches("<script src=\"foo\"></script>") and
not regexp.matches("<script src='foo'></script>") and
not regexp.matches("<foo>") and
msg = "This regular expression does not match script tags where the attribute uses single-quotes."
or
regexp.matches("<script>foo</script>") and
regexp.matches("<script src='foo'></script>") and
not regexp.matches("<script src=\"foo\"></script>") and
not regexp.matches("<foo>") and
msg = "This regular expression does not match script tags where the attribute uses double-quotes."
or
regexp.matches("<script>foo</script>") and
regexp.matches("<script src='foo'></script>") and
not regexp.matches("<script\tsrc='foo'></script>") and
not regexp.matches("<foo>") and
not regexp.matches("<foo src=\"foo\"></foo>") and
msg = "This regular expression does not match script tags where tabs are used between attributes."
or
regexp.matches("<script>foo</script>") and
not RegExpFlags::isIgnoreCase(regexp) and
not regexp.matches("<foo>") and
not regexp.matches("<foo ></foo>") and
(
not regexp.matches("<SCRIPT>foo</SCRIPT>") and
msg = "This regular expression does not match upper case <SCRIPT> tags."
or
not regexp.matches("<sCrIpT>foo</ScRiPt>") and
regexp.matches("<SCRIPT>foo</SCRIPT>") and
msg = "This regular expression does not match mixed case <sCrIpT> tags."
)
or
regexp.matches("<script src=\"foo\"></script>") and
not regexp.matches("<foo>") and
not regexp.matches("<foo ></foo>") and
(
not regexp.matches("<script src=\"foo\">foo</script >") and
msg = "This regular expression does not match script end tags like </script >."
or
not regexp.matches("<script src=\"foo\">foo</script foo=\"bar\">") and
msg = "This regular expression does not match script end tags like </script foo=\"bar\">."
or
not regexp.matches("<script src=\"foo\">foo</script\t\n bar>") and
msg = "This regular expression does not match script end tags like </script\\t\\n bar>."
)
}

View File

@@ -438,6 +438,13 @@ private int toHex(string hex) {
result = 15 and hex = ["f", "F"]
}
/**
* A word boundary, that is, a regular expression term of the form `\b`.
*/
class RegExpWordBoundary extends RegExpEscape {
RegExpWordBoundary() { this.getUnescaped() = "b" }
}
/**
* A character class escape in a regular expression.
* That is, an escaped character that denotes multiple characters.

View File

@@ -0,0 +1,54 @@
<!DOCTYPE qhelp PUBLIC
"-//Semmle//qhelp//EN"
"qhelp.dtd">
<qhelp>
<overview>
<p>
It is possible to match some single HTML tags using regular expressions (parsing general HTML using
regular expressions is impossible). However, if the regular expression is not written well it might
be possible to circumvent it, which can lead to cross-site scripting or other security issues.
</p>
<p>
Some of these mistakes are caused by browsers having very forgiving HTML parsers, and
will often render invalid HTML containing syntax errors.
Regular expressions that attempt to match HTML should also recognize tags containing such syntax errors.
</p>
</overview>
<recommendation>
<p>
Use a well-tested sanitization or parser library if at all possible. These libraries are much more
likely to handle corner cases correctly than a custom implementation.
</p>
</recommendation>
<example>
<p>
The following example attempts to filters out all <code>&lt;script&gt;</code> tags.
</p>
<sample src="examples/BadTagFilter.rb" />
<p>
The above sanitizer does not filter out all <code>&lt;script&gt;</code> tags.
Browsers will not only accept <code>&lt;/script&gt;</code> as script end tags, but also tags such as <code>&lt;/script foo="bar"&gt;</code> even though it is a parser error.
This means that an attack string such as <code>&lt;script&gt;alert(1)&lt;/script foo="bar"&gt;</code> will not be filtered by
the function, and <code>alert(1)</code> will be executed by a browser if the string is rendered as HTML.
</p>
<p>
Other corner cases include that HTML comments can end with <code>--!&gt;</code>,
and that HTML tag names can contain upper case characters.
</p>
</example>
<references>
<li>Securitum: <a href="https://research.securitum.com/the-curious-case-of-copy-paste/">The Curious Case of Copy &amp; Paste</a>.</li>
<li>stackoverflow.com: <a href="https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags#answer-1732454">You can't parse [X]HTML with regex</a>.</li>
<li>HTML Standard: <a href="https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state">Comment end bang state</a>.</li>
<li>stackoverflow.com: <a href="https://stackoverflow.com/questions/25559999/why-arent-browsers-strict-about-html">Why aren't browsers strict about HTML?</a>.</li>
</references>
</qhelp>

View File

@@ -0,0 +1,19 @@
/**
* @name Bad HTML filtering regexp
* @description Matching HTML tags using regular expressions is hard to do right, and can easily lead to security issues.
* @kind problem
* @problem.severity warning
* @security-severity 7.8
* @precision high
* @id rb/bad-tag-filter
* @tags correctness
* security
* external/cwe/cwe-116
* external/cwe/cwe-020
*/
import codeql.ruby.security.BadTagFilterQuery
from HTMLMatchingRegExp regexp, string msg
where msg = min(string m | isBadRegexpFilter(regexp, m) | m order by m.length(), m) // there might be multiple, we arbitrarily pick the shortest one
select regexp, msg

View File

@@ -0,0 +1,8 @@
def filterScripTags(html)
oldHtml = "";
while (html != oldHtml)
oldHtml = html;
html = html.gsub(/<script[^>]*>.*<\/script>/m, "");
end
return html;
end

View File

@@ -0,0 +1,14 @@
| test.rb:2:6:2:29 | <script.*?>.*?<\\/script> | This regular expression does not match script end tags like </script >. |
| test.rb:3:6:3:29 | <script.*?>.*?<\\/script> | This regular expression does not match script end tags like </script >. |
| test.rb:7:6:7:16 | <!--.*--!?> | This regular expression does not match comments containing newlines. |
| test.rb:8:6:8:39 | <script.*?>(.\|\\s)*?<\\/script[^>]*> | This regular expression matches <script></script>, but not <script \\n></script> |
| test.rb:9:6:9:37 | <script[^>]*?>.*?<\\/script[^>]*> | This regular expression matches <script>...</script>, but not <script >...\\n</script> |
| test.rb:10:6:10:44 | <script(\\s\|\\w\|=\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses single-quotes. |
| test.rb:11:6:11:44 | <script(\\s\|\\w\|=\|')*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses double-quotes. |
| test.rb:12:6:12:48 | <script( \|\\n\|\\w\|=\|'\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where tabs are used between attributes. |
| test.rb:13:6:13:34 | <script.*?>.*?<\\/script[^>]*> | This regular expression does not match upper case <SCRIPT> tags. |
| test.rb:14:6:14:52 | <(script\|SCRIPT).*?>.*?<\\/(script\|SCRIPT)[^>]*> | This regular expression does not match mixed case <sCrIpT> tags. |
| test.rb:15:6:15:39 | <script[^>]*?>[\\s\\S]*?<\\/script.*> | This regular expression does not match script end tags like </script\\t\\n bar>. |
| test.rb:17:6:17:40 | <script\\b[^>]*>([\\s\\S]*?)<\\/script> | This regular expression does not match script end tags like </script >. |
| test.rb:18:6:18:48 | <(?:!--([\\S\|\\s]*?)-->)\|([^\\/\\s>]+)[\\S\\s]*?> | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 1 and comments ending with --!> are matched with capture group 2. |
| test.rb:19:6:19:147 | <(?:(?:\\/([^>]+)>)\|(?:!--([\\S\|\\s]*?)-->)\|(?:([^\\/\\s>]+)((?:\\s+[\\w\\-:.]+(?:\\s*=\\s*?(?:(?:"[^"]*")\|(?:'[^']*')\|[^\\s"'\\/>]+))?)*)[\\S\\s]*?(\\/?)>)) | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 2 and comments ending with --!> are matched with capture group 1, 3, 4, 5. |

View File

@@ -0,0 +1 @@
queries/security/cwe-116/BadTagFilter.ql

View File

@@ -0,0 +1,22 @@
filters = [
/<script.*?>.*?<\/script>/i, # NOT OK - doesn't match newlines or `</script >`
/<script.*?>.*?<\/script>/im, # NOT OK - doesn't match `</script >`
/<script.*?>.*?<\/script[^>]*>/im, # OK
/<!--.*-->/im, # OK - we don't care regexps that only match comments
/<!--.*--!?>/im, # OK
/<!--.*--!?>/i, # NOT OK, does not match newlines
/<script.*?>(.|\s)*?<\/script[^>]*>/i, # NOT OK - doesn't match inside the script tag
/<script[^>]*?>.*?<\/script[^>]*>/i, # NOT OK - doesn't match newlines inside the content
/<script(\s|\w|=|")*?>.*?<\/script[^>]*>/im, # NOT OK - does not match single quotes for attribute values
/<script(\s|\w|=|')*?>.*?<\/script[^>]*>/im, # NOT OK - does not match double quotes for attribute values
/<script( |\n|\w|=|'|")*?>.*?<\/script[^>]*>/im, # NOT OK - does not match tabs between attributes
/<script.*?>.*?<\/script[^>]*>/m, # NOT OK - does not match uppercase SCRIPT tags
/<(script|SCRIPT).*?>.*?<\/(script|SCRIPT)[^>]*>/m, # NOT OK - does not match mixed case script tags
/<script[^>]*?>[\s\S]*?<\/script.*>/i, # NOT OK - doesn't match newlines in the end tag
/<script[^>]*?>[\s\S]*?<\/script[^>]*?>/i, # OK
/<script\b[^>]*>([\s\S]*?)<\/script>/gi, # NOT OK - too strict matching on the end tag
/<(?:!--([\S|\s]*?)-->)|([^\/\s>]+)[\S\s]*?>/, # NOT OK - doesn't match comments with the right capture groups
/<(?:(?:\/([^>]+)>)|(?:!--([\S|\s]*?)-->)|(?:([^\/\s>]+)((?:\s+[\w\-:.]+(?:\s*=\s*?(?:(?:"[^"]*")|(?:'[^']*')|[^\s"'\/>]+))?)*)[\S\s]*?(\/?)>))/, # NOT OK - capture groups
]
doFilters(filters)