JS: make RegExpPatterns::commonTLD more robust

2026-05-03 20:58:03 +02:00 · 2019-03-14 08:02:29 +01:00
parent 9eb039038e
commit 74144b0271
3 changed files with 6 additions and 6 deletions
--- a/javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
+++ b/javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
@@ -36,7 +36,7 @@ predicate isIncompleteHostNameRegExpPattern(string pattern, string hostPart) {
            // an unescaped single `.`
            "(?<!\\\\)[.]" +
            // immediately followed by a sequence of subdomains, perhaps with some regex characters mixed in, followed by a known TLD
-            "([():|?a-z0-9-]+(\\\\)?[.](" + RegExpPatterns::commonTLD() + "))" + ".*", 1)
+            "([():|?a-z0-9-]+(\\\\)?[.]" + RegExpPatterns::commonTLD() + ")" + ".*", 1)
 }

 from Expr e, string pattern, string hostPart
@@ -51,7 +51,7 @@ where
  ) and
  isIncompleteHostNameRegExpPattern(pattern, hostPart) and
  // ignore patterns with capture groups after the TLD
-  not pattern.regexpMatch("(?i).*[.](" + RegExpPatterns::commonTLD() + ").*[(][?]:.*[)].*")
+  not pattern.regexpMatch("(?i).*[.]" + RegExpPatterns::commonTLD() + ".*[(][?]:.*[)].*")
 select e,
  "This regular expression has an unescaped '.' before '" + hostPart +
    "', so it might match more hosts than expected."
--- a/javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.ql
+++ b/javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.ql
@@ -38,8 +38,8 @@ where
  (
    // target contains a domain on a common TLD, and perhaps some other URL components
    target
-        .regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+(" + RegExpPatterns::commonTLD() +
-            ")(:[0-9]+)?/?")
+        .regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+" + RegExpPatterns::commonTLD() +
+            "(:[0-9]+)?/?")
    or
    // target is a HTTP URL to a domain on any TLD
    target.regexpMatch("(?i)https?://([a-z0-9-]+\\.)+([a-z]+)(:[0-9]+)?/?")
--- a/javascript/ql/src/semmle/javascript/Regexp.qll
+++ b/javascript/ql/src/semmle/javascript/Regexp.qll
@@ -397,10 +397,10 @@ predicate isInterpretedAsRegExp(DataFlow::Node source) {
 */
 module RegExpPatterns {
  /**
-   * Gets a pattern that matches common top-level domain names.
+   * Gets a self-contained pattern matching a common lower-case top-level domain name that is not followed by `[a-z0-9]`.
   */
  string commonTLD() {
    // according to ranking by http://google.com/search?q=site:.<<TLD>>
-    result = "com|org|edu|gov|uk|net|io"
+    result = "(?:com|org|edu|gov|uk|net|io)(?![a-z0-9])"
  }
 }