Merge pull request #6561 from erik-krogh/htmlReg

JS/Py/Ruby: add a bad-tag-filter query
2026-04-30 11:15:13 +02:00 · 2021-11-18 09:39:13 +01:00
parent 08b6a17097 b639a8d183
commit 1cca377e7d
45 changed files with 1510 additions and 101 deletions
--- a/javascript/ql/test/query-tests/Security/CWE-116/BadTagFilter/BadTagFilter.expected
+++ b/javascript/ql/test/query-tests/Security/CWE-116/BadTagFilter/BadTagFilter.expected
@@ -0,0 +1,17 @@
+| tst.js:2:6:2:29 | <script.*?>.*?<\\/script> | This regular expression does not match script end tags like </script >. |
+| tst.js:3:6:3:29 | <script.*?>.*?<\\/script> | This regular expression does not match script end tags like </script >. |
+| tst.js:7:6:7:16 | <!--.*--!?> | This regular expression does not match comments containing newlines. |
+| tst.js:8:6:8:39 | <script.*?>(.\|\\s)*?<\\/script[^>]*> | This regular expression matches <script></script>, but not <script \\n></script> |
+| tst.js:9:6:9:37 | <script[^>]*?>.*?<\\/script[^>]*> | This regular expression matches <script>...</script>, but not <script >...\\n</script> |
+| tst.js:10:6:10:44 | <script(\\s\|\\w\|=\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses single-quotes. |
+| tst.js:11:6:11:44 | <script(\\s\|\\w\|=\|')*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses double-quotes. |
+| tst.js:12:6:12:48 | <script( \|\\n\|\\w\|=\|'\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where tabs are used between attributes. |
+| tst.js:13:6:13:34 | <script.*?>.*?<\\/script[^>]*> | This regular expression does not match upper case <SCRIPT> tags. |
+| tst.js:14:6:14:52 | <(script\|SCRIPT).*?>.*?<\\/(script\|SCRIPT)[^>]*> | This regular expression does not match mixed case <sCrIpT> tags. |
+| tst.js:15:6:15:39 | <script[^>]*?>[\\s\\S]*?<\\/script.*> | This regular expression does not match script end tags like </script\\t\\n bar>. |
+| tst.js:17:6:17:40 | <script\\b[^>]*>([\\s\\S]*?)<\\/script> | This regular expression does not match script end tags like </script >. |
+| tst.js:18:6:18:48 | <(?:!--([\\S\|\\s]*?)-->)\|([^\\/\\s>]+)[\\S\\s]*?> | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 1 and comments ending with --!> are matched with capture group 2. |
+| tst.js:19:6:19:147 | <(?:(?:\\/([^>]+)>)\|(?:!--([\\S\|\\s]*?)-->)\|(?:([^\\/\\s>]+)((?:\\s+[\\w\\-:.]+(?:\\s*=\\s*?(?:(?:"[^"]*")\|(?:'[^']*')\|[^\\s"'\\/>]+))?)*)[\\S\\s]*?(\\/?)>)) | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 2 and comments ending with --!> are matched with capture group 3, 4. |
+| tst.js:20:3:20:57 | (<[a-z\\/!$]("[^"]*"\|'[^']*'\|[^'">])*>\|<!(--.*?--\\s*)+>) | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 3 and comments ending with --!> are matched with capture group 1. |
+| tst.js:21:6:21:249 | <(?:(?:!--([\\w\\W]*?)-->)\|(?:!\\[CDATA\\[([\\w\\W]*?)\\]\\]>)\|(?:!DOCTYPE([\\w\\W]*?)>)\|(?:\\?([^\\s\\/<>]+) ?([\\w\\W]*?)[?/]>)\|(?:\\/([A-Za-z][A-Za-z0-9\\-_\\:\\.]*)>)\|(?:([A-Za-z][A-Za-z0-9\\-_\\:\\.]*)((?:\\s+[^"'>]+(?:(?:"[^"]*")\|(?:'[^']*')\|[^>]*))*\|\\/\|\\s+)>)) | This regular expression only parses --> (capture group 1) and not --!> as a HTML comment end tag. |
+| tst.js:22:6:22:33 | <!--([\\w\\W]*?)-->\|<([^>]*?)> | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 1 and comments ending with --!> are matched with capture group 2. |
--- a/javascript/ql/test/query-tests/Security/CWE-116/BadTagFilter/BadTagFilter.qlref
+++ b/javascript/ql/test/query-tests/Security/CWE-116/BadTagFilter/BadTagFilter.qlref
@@ -0,0 +1 @@
+Security/CWE-116/BadTagFilter.ql
--- a/javascript/ql/test/query-tests/Security/CWE-116/BadTagFilter/tst.js
+++ b/javascript/ql/test/query-tests/Security/CWE-116/BadTagFilter/tst.js
@@ -0,0 +1,28 @@
+var filters = [ // test inputs for the BadTagFilter query; BadTagFilter.expected pins findings by line:column, so keep line count and indentation
+    /<script.*?>.*?<\/script>/i, // NOT OK - doesn't match newlines or `</script >`
+    /<script.*?>.*?<\/script>/is, // NOT OK - doesn't match `</script >`
+    /<script.*?>.*?<\/script[^>]*>/is, // OK
+    /<!--.*-->/is, // OK - we don't care about regexps that only match comments
+    /<!--.*--!?>/is, // OK
+    /<!--.*--!?>/i, // NOT OK, does not match newlines
+    /<script.*?>(.|\s)*?<\/script[^>]*>/i, // NOT OK - doesn't match newlines inside the start tag, e.g. `<script \n>`
+    /<script[^>]*?>.*?<\/script[^>]*>/i, // NOT OK - doesn't match newlines inside the content
+    /<script(\s|\w|=|")*?>.*?<\/script[^>]*>/is, // NOT OK - does not match single quotes for attribute values
+    /<script(\s|\w|=|')*?>.*?<\/script[^>]*>/is, // NOT OK - does not match double quotes for attribute values
+    /<script( |\n|\w|=|'|")*?>.*?<\/script[^>]*>/is, // NOT OK - does not match tabs between attributes
+    /<script.*?>.*?<\/script[^>]*>/s, // NOT OK - does not match uppercase SCRIPT tags
+    /<(script|SCRIPT).*?>.*?<\/(script|SCRIPT)[^>]*>/s, // NOT OK - does not match mixed case script tags
+    /<script[^>]*?>[\s\S]*?<\/script.*>/i, // NOT OK - doesn't match newlines in the end tag
+    /<script[^>]*?>[\s\S]*?<\/script[^>]*?>/i, // OK
+    /<script\b[^>]*>([\s\S]*?)<\/script>/gi, // NOT OK - too strict matching on the end tag
+    /<(?:!--([\S|\s]*?)-->)|([^\/\s>]+)[\S\s]*?>/, // NOT OK - doesn't match comments with the right capture groups
+    /<(?:(?:\/([^>]+)>)|(?:!--([\S|\s]*?)-->)|(?:([^\/\s>]+)((?:\s+[\w\-:.]+(?:\s*=\s*?(?:(?:"[^"]*")|(?:'[^']*')|[^\s"'\/>]+))?)*)[\S\s]*?(\/?)>))/, // NOT OK - comments ending with `-->` and `--!>` land in different capture groups
+	/(<[a-z\/!$]("[^"]*"|'[^']*'|[^'">])*>|<!(--.*?--\s*)+>)/gi, // NOT OK - comments ending with `-->` and `--!>` land in different capture groups
+    /<(?:(?:!--([\w\W]*?)-->)|(?:!\[CDATA\[([\w\W]*?)\]\]>)|(?:!DOCTYPE([\w\W]*?)>)|(?:\?([^\s\/<>]+) ?([\w\W]*?)[?/]>)|(?:\/([A-Za-z][A-Za-z0-9\-_\:\.]*)>)|(?:([A-Za-z][A-Za-z0-9\-_\:\.]*)((?:\s+[^"'>]+(?:(?:"[^"]*")|(?:'[^']*')|[^>]*))*|\/|\s+)>))/g, // NOT OK - only `-->` (capture group 1), not `--!>`, is parsed as a comment end tag
+    /<!--([\w\W]*?)-->|<([^>]*?)>/g, // NOT OK - comments ending with `-->` and `--!>` land in different capture groups
+]
+
+doFilters(filters) // doFilters is not defined in this test file; the call only gives the regexps a use site
+
+var strip = '<script([^>]*)>([\\S\\s]*?)<\/script([^>]*)>';  // OK - it's used with the ignorecase flag
+new RegExp(strip, 'gi'); // the 'i' in 'gi' is the ignore-case flag referred to above