JS: make RegExpPatterns::commonTLD more robust

This commit is contained in:
Esben Sparre Andreasen
2019-03-14 08:02:29 +01:00
parent 9eb039038e
commit 74144b0271
3 changed files with 6 additions and 6 deletions

View File

@@ -36,7 +36,7 @@ predicate isIncompleteHostNameRegExpPattern(string pattern, string hostPart) {
// an unescaped single `.`
"(?<!\\\\)[.]" +
// immediately followed by a sequence of subdomains, perhaps with some regex characters mixed in, followed by a known TLD
"([():|?a-z0-9-]+(\\\\)?[.](" + RegExpPatterns::commonTLD() + "))" + ".*", 1)
"([():|?a-z0-9-]+(\\\\)?[.]" + RegExpPatterns::commonTLD() + ")" + ".*", 1)
}
from Expr e, string pattern, string hostPart
@@ -51,7 +51,7 @@ where
) and
isIncompleteHostNameRegExpPattern(pattern, hostPart) and
// ignore patterns with a non-capturing group (`(?:...)`) after the TLD
not pattern.regexpMatch("(?i).*[.](" + RegExpPatterns::commonTLD() + ").*[(][?]:.*[)].*")
not pattern.regexpMatch("(?i).*[.]" + RegExpPatterns::commonTLD() + ".*[(][?]:.*[)].*")
select e,
"This regular expression has an unescaped '.' before '" + hostPart +
"', so it might match more hosts than expected."

View File

@@ -38,8 +38,8 @@ where
(
// target contains a domain on a common TLD, and perhaps some other URL components
target
.regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+(" + RegExpPatterns::commonTLD() +
")(:[0-9]+)?/?")
.regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+" + RegExpPatterns::commonTLD() +
"(:[0-9]+)?/?")
or
// target is an HTTP URL to a domain on any TLD
target.regexpMatch("(?i)https?://([a-z0-9-]+\\.)+([a-z]+)(:[0-9]+)?/?")

View File

@@ -397,10 +397,10 @@ predicate isInterpretedAsRegExp(DataFlow::Node source) {
*/
module RegExpPatterns {
/**
* Gets a pattern that matches common top-level domain names.
* Gets a pattern that matches common top-level domain names in lower case.
*/
string commonTLD() {
// according to ranking by http://google.com/search?q=site:.<<TLD>>
result = "com|org|edu|gov|uk|net|io"
result = "(?:com|org|edu|gov|uk|net|io)(?![a-z0-9])"
}
}