Merge pull request #6462 from erik-krogh/repeat

JS: support more regular expressions in js/incomplete-multi-character-sanitization
This commit is contained in:
Erik Krogh Kristensen
2021-08-23 15:39:31 +02:00
committed by GitHub
6 changed files with 107 additions and 46 deletions

View File

@@ -56,31 +56,69 @@ DangerousPrefix getADangerousMatchedPrefix(EmptyReplaceRegExpTerm t) {
not exists(EmptyReplaceRegExpTerm pred | pred = t.getPredecessor+() and not pred.isNullable())
}
private import semmle.javascript.security.performance.ReDoSUtil as ReDoSUtil
/**
* Gets a char from a dangerous prefix that is matched by `t`.
*/
pragma[noinline]
DangerousPrefixSubstring getADangerousMatchedChar(EmptyReplaceRegExpTerm t) {
t.isNullable() and result = ""
or
t.getAMatchedString() = result
or
// A substring matched by some character class. This is only used to match the "word" part of a HTML tag (e.g. "iframe" in "<iframe").
exists(ReDoSUtil::CharacterClass cc |
cc = ReDoSUtil::getCanonicalCharClass(t) and
cc.matches(result) and
result.regexpMatch("\\w") and
// excluding character classes that match ">" (e.g. /<[^<]*>/), as these might consume nested HTML tags, and thus prevent the dangerous pattern this query is looking for.
not cc.matches(">")
)
or
t instanceof RegExpDot and
result.length() = 1
or
(
t instanceof RegExpOpt or
t instanceof RegExpStar or
t instanceof RegExpPlus or
t instanceof RegExpGroup or
t instanceof RegExpAlt
) and
result = getADangerousMatchedChar(t.getAChild())
}
/**
* Gets a substring of a dangerous prefix that is in the language starting at `t` (ignoring lookarounds).
*
* Note that the language of `t` is slightly restricted as not all RegExpTerm types are supported.
*/
DangerousPrefixSubstring getADangerousMatchedPrefixSubstring(EmptyReplaceRegExpTerm t) {
exists(string left |
t.isNullable() and left = ""
or
t.getAMatchedString() = left
or
(
t instanceof RegExpOpt or
t instanceof RegExpStar or
t instanceof RegExpPlus or
t instanceof RegExpGroup or
t instanceof RegExpAlt
) and
left = getADangerousMatchedPrefixSubstring(t.getAChild())
|
result = left + getADangerousMatchedPrefixSubstring(t.getSuccessor()) or
result = left
result = getADangerousMatchedChar(t) + getADangerousMatchedPrefixSubstring(t.getSuccessor())
or
result = getADangerousMatchedChar(t)
or
// loop around for repetitions (only considering alphanumeric characters in the repetition)
exists(RepetitionMatcher repetition | t = repetition |
result = getADangerousMatchedPrefixSubstring(repetition) + repetition.getAChar()
)
}
class RepetitionMatcher extends EmptyReplaceRegExpTerm {
string char;
pragma[noinline]
RepetitionMatcher() {
(this instanceof RegExpPlus or this instanceof RegExpStar) and
char = getADangerousMatchedChar(this.getAChild()) and
char.regexpMatch("\\w")
}
pragma[noinline]
string getAChar() { result = char }
}
/**
* Holds if `t` may match the dangerous `prefix` and some suffix, indicating intent to prevent a vulnerablity of kind `kind`.
*/
@@ -151,7 +189,7 @@ where
// skip leading optional elements
not dangerous.isNullable() and
// only warn about the longest match (presumably the most descriptive)
prefix = max(string m | matchesDangerousPrefix(dangerous, m, kind) | m order by m.length()) and
prefix = max(string m | matchesDangerousPrefix(dangerous, m, kind) | m order by m.length(), m) and
// only warn once per kind
not exists(EmptyReplaceRegExpTerm other |
other = dangerous.getAChild+() or other = dangerous.getPredecessor+()

View File

@@ -155,15 +155,22 @@ private class RegexpCharacterConstant extends RegExpConstant {
RegexpCharacterConstant() { this.isCharacter() }
}
/**
* A regexp term that is relevant for this ReDoS analysis.
*/
class RelevantRegExpTerm extends RegExpTerm {
RelevantRegExpTerm() { getRoot(this).isRelevant() }
}
/**
* Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
*
* Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
* The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
*/
private predicate isCanonicalTerm(RegExpTerm term, string str) {
private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
term =
rank[1](RegExpTerm t, Location loc, File file |
min(RelevantRegExpTerm t, Location loc, File file |
loc = t.getLocation() and
file = t.getFile() and
str = t.getRawValue()
@@ -178,15 +185,15 @@ private predicate isCanonicalTerm(RegExpTerm term, string str) {
private newtype TInputSymbol =
/** An input symbol corresponding to character `c`. */
Char(string c) {
c = any(RegexpCharacterConstant cc | getRoot(cc).isRelevant()).getValue().charAt(_)
c = any(RegexpCharacterConstant cc | cc instanceof RelevantRegExpTerm).getValue().charAt(_)
} or
/**
* An input symbol representing all characters matched by
* a (non-universal) character class that has string representation `charClassString`.
*/
CharClass(string charClassString) {
exists(RegExpTerm term | term.getRawValue() = charClassString | getRoot(term).isRelevant()) and
exists(RegExpTerm recc | isCanonicalTerm(recc, charClassString) |
exists(RelevantRegExpTerm term | term.getRawValue() = charClassString) and
exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) |
recc instanceof RegExpCharacterClass and
not recc.(RegExpCharacterClass).isUniversalClass()
or
@@ -251,7 +258,7 @@ class InputSymbol extends TInputSymbol {
/**
* An abstract input symbol that represents a character class.
*/
abstract private class CharacterClass extends InputSymbol {
abstract class CharacterClass extends InputSymbol {
/**
* Gets a character that is relevant for intersection-tests involving this
* character class.
@@ -626,13 +633,10 @@ RegExpRoot getRoot(RegExpTerm term) {
}
private newtype TState =
Match(RegExpTerm t, int i) {
getRoot(t).isRelevant() and
(
i = 0
or
exists(t.(RegexpCharacterConstant).getValue().charAt(i))
)
Match(RelevantRegExpTerm t, int i) {
i = 0
or
exists(t.(RegexpCharacterConstant).getValue().charAt(i))
} or
Accept(RegExpRoot l) { l.isRelevant() } or
AcceptAnySuffix(RegExpRoot l) { l.isRelevant() }

View File

@@ -1,7 +1,7 @@
| tst-multi-character-sanitization.js:3:13:3:57 | content ... gi, "") | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:3:30:3:30 | < | <cript |
| tst-multi-character-sanitization.js:3:13:3:57 | content ... gi, "") | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:3:30:3:30 | < | <script |
| tst-multi-character-sanitization.js:4:13:4:47 | content ... /g, "") | This string may still contain $@, which may cause a HTML attribute injection vulnerability. | tst-multi-character-sanitization.js:4:30:4:40 | on\\w+=".*" | on |
| tst-multi-character-sanitization.js:5:13:5:49 | content ... /g, "") | This string may still contain $@, which may cause a HTML attribute injection vulnerability. | tst-multi-character-sanitization.js:5:30:5:42 | on\\w+=\\'.*\\' | on |
| tst-multi-character-sanitization.js:9:13:9:47 | content ... gi, "") | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:9:30:9:30 | < | <cript |
| tst-multi-character-sanitization.js:9:13:9:47 | content ... gi, "") | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:9:30:9:30 | < | <script |
| tst-multi-character-sanitization.js:10:13:10:49 | content ... /g, "") | This string may still contain $@, which may cause a HTML attribute injection vulnerability. | tst-multi-character-sanitization.js:10:30:10:42 | .on\\w+=.*".*" | on |
| tst-multi-character-sanitization.js:11:13:11:51 | content ... /g, "") | This string may still contain $@, which may cause a HTML attribute injection vulnerability. | tst-multi-character-sanitization.js:11:30:11:44 | .on\\w+=.*\\'.*\\' | on |
| tst-multi-character-sanitization.js:19:3:19:35 | respons ... pt, "") | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:18:18:18:24 | <script | <script |
@@ -31,4 +31,9 @@
| tst-multi-character-sanitization.js:126:7:129:34 | x\\n . ... //, "") | This string may still contain $@, which may cause a path injection vulnerability. | tst-multi-character-sanitization.js:129:21:129:22 | \\/ | /.. |
| tst-multi-character-sanitization.js:135:2:135:44 | content ... gi, "") | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:135:19:135:25 | <script | <script |
| tst-multi-character-sanitization.js:136:2:136:46 | content ... gi, "") | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:136:19:136:19 | < | <script |
| tst-multi-character-sanitization.js:137:2:137:48 | content ... gi, "") | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:137:19:137:20 | .+ | <script |
| tst-multi-character-sanitization.js:138:2:138:48 | content ... gi, "") | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:138:21:138:21 | < | <script |
| tst-multi-character-sanitization.js:142:13:142:62 | content ... gi, "") | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:142:30:142:36 | <script | <script |
| tst-multi-character-sanitization.js:143:13:143:56 | content ... /g, '') | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:143:30:143:30 | < | <script |
| tst-multi-character-sanitization.js:144:13:144:91 | content ... /g, '') | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:144:30:144:30 | < | <script |
| tst-multi-character-sanitization.js:145:13:145:90 | content ... /g, '') | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:145:30:145:30 | < | <script |

View File

@@ -134,6 +134,14 @@
(function (content) {
content.replace(/<script.*\/script>/gi, ""); // NOT OK
content.replace(/<(script).*\/script>/gi, ""); // NOT OK
content.replace(/.+<(script).*\/script>/gi, ""); // OK
content.replace(/.+<(script).*\/script>/gi, ""); // NOT OK
content.replace(/.*<(script).*\/script>/gi, ""); // NOT OK
});
(function (content) {
content = content.replace(/<script[\s\S]*?<\/script>/gi, ""); // NOT OK
content = content.replace(/<[a-zA-Z\/](.|\n)*?>/g, '') || ' '; // NOT OK
content = content.replace(/<(script|iframe|video)[\s\S]*?<\/(script|iframe|video)>/g, '') // NOT OK
content = content.replace(/<(script|iframe|video)(.|\s)*?\/(script|iframe|video)>/g, '') // NOT OK
content = content.replace(/<[^<]*>/g, ""); // OK
});