diff --git a/config/identical-files.json b/config/identical-files.json index 5dbebe0329a..afef0a923d5 100644 --- a/config/identical-files.json +++ b/config/identical-files.json @@ -531,11 +531,6 @@ "ruby/ql/lib/codeql/ruby/internal/ConceptsShared.qll", "javascript/ql/lib/semmle/javascript/internal/ConceptsShared.qll" ], - "Hostname Regexp queries": [ - "javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll", - "python/ql/src/Security/CWE-020/HostnameRegexpShared.qll", - "ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll" - ], "ApiGraphModels": [ "javascript/ql/lib/semmle/javascript/frameworks/data/internal/ApiGraphModels.qll", "ruby/ql/lib/codeql/ruby/frameworks/data/internal/ApiGraphModels.qll", diff --git a/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll b/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll index 1b01be0377b..d376f7d2388 100644 --- a/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll +++ b/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll @@ -558,6 +558,8 @@ module Impl implements RegexTreeViewSig { } } + class RegExpCharEscape = RegExpEscape; + /** * A word boundary, that is, a regular expression term of the form `\b`. */ @@ -868,6 +870,9 @@ module Impl implements RegexTreeViewSig { predicate isNamedGroupOfLiteral(RegExpLiteral lit, string name) { lit = this.getLiteral() and name = this.getName() } + + /** Holds if this is a capture group. */ + predicate isCapture() { exists(this.getNumber()) } } /** diff --git a/javascript/ql/lib/semmle/javascript/security/regexp/HostnameRegexp.qll b/javascript/ql/lib/semmle/javascript/security/regexp/HostnameRegexp.qll new file mode 100644 index 00000000000..1922ad01bf1 --- /dev/null +++ b/javascript/ql/lib/semmle/javascript/security/regexp/HostnameRegexp.qll @@ -0,0 +1,19 @@ +/** + * Provides predicates for reasoning about regular expressions + * that match URLs and hostname patterns. + */ + +private import javascript as JS +private import semmle.javascript.security.regexp.RegExpTreeView::RegExpTreeView as TreeImpl +private import semmle.javascript.Regexp as RegExp +private import codeql.regex.HostnameRegexp as Shared + +private module Impl implements Shared::HostnameRegexpSig { + class DataFlowNode = JS::DataFlow::Node; + + class RegExpPatternSource = RegExp::RegExpPatternSource; + + string getACommonTld() { result = RegExp::RegExpPatterns::getACommonTld() } +} + +import Shared::Make diff --git a/javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll b/javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll index 5e9bc406512..524be45c653 100644 --- a/javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll +++ b/javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll @@ -3,200 +3,5 @@ * that match URLs and hostname patterns. */ -private import HostnameRegexpSpecific - -/** - * Holds if the given constant is unlikely to occur in the origin part of a URL. - */ -predicate isConstantInvalidInsideOrigin(RegExpConstant term) { - // Look for any of these cases: - // - A character that can't occur in the origin - // - Two dashes in a row - // - A colon that is not part of port or scheme separator - // - A slash that is not part of scheme separator - term.getValue().regexpMatch(".*(?:[^a-zA-Z0-9.:/-]|--|:[^0-9/]|(? { + class DataFlowNode = DataFlow::Node; + + class RegExpPatternSource = Regexp::RegExpPatternSource; + + string getACommonTld() { result = Regexp::RegExpPatterns::getACommonTld() } +} + +import Shared::Make diff --git a/python/ql/src/Security/CWE-020/HostnameRegexpShared.qll b/python/ql/src/Security/CWE-020/HostnameRegexpShared.qll index 5e9bc406512..d15714d406a 100644 --- a/python/ql/src/Security/CWE-020/HostnameRegexpShared.qll +++ b/python/ql/src/Security/CWE-020/HostnameRegexpShared.qll @@ -3,200 +3,6 @@ * that match URLs and hostname patterns. */ -private import HostnameRegexpSpecific - -/** - * Holds if the given constant is unlikely to occur in the origin part of a URL. - */ -predicate isConstantInvalidInsideOrigin(RegExpConstant term) { - // Look for any of these cases: - // - A character that can't occur in the origin - // - Two dashes in a row - // - A colon that is not part of port or scheme separator - // - A slash that is not part of scheme separator - term.getValue().regexpMatch(".*(?:[^a-zA-Z0-9.:/-]|--|:[^0-9/]|(? { + class DataFlowNode = DataFlow::Node; + + class RegExpPatternSource = Regexp::RegExpPatternSource; + + string getACommonTld() { result = Regexp::RegExpPatterns::getACommonTld() } +} + +import Shared::Make diff --git a/ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll b/ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll index 5e9bc406512..dc3ed9aeaf7 100644 --- a/ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll +++ b/ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll @@ -3,200 +3,6 @@ * that match URLs and hostname patterns. */ -private import HostnameRegexpSpecific - -/** - * Holds if the given constant is unlikely to occur in the origin part of a URL. - */ -predicate isConstantInvalidInsideOrigin(RegExpConstant term) { - // Look for any of these cases: - // - A character that can't occur in the origin - // - Two dashes in a row - // - A colon that is not part of port or scheme separator - // - A slash that is not part of scheme separator - term.getValue().regexpMatch(".*(?:[^a-zA-Z0-9.:/-]|--|:[^0-9/]|(? { + /** + * Gets a pattern that matches common top-level domain names in lower case. + */ + string getACommonTld(); + + /** A node in the data-flow graph. */ + class DataFlowNode; + + /** A node in the data-flow graph that represents a regular expression pattern. */ + class RegExpPatternSource extends DataFlowNode { + /** + * Gets the root term of the regular expression parsed from this pattern. + */ + TreeImpl::RegExpTerm getRegExpTerm(); + + /** + * Gets a node where the pattern of this node is parsed as a part of + * a regular expression. + */ + DataFlowNode getAParse(); + } +} + +/** + * Classes and predicates implementing an analysis on regular expressions + * that match URLs and hostname patterns. + */ +module Make Specific> { + private import TreeImpl + + /** + * Holds if the given constant is unlikely to occur in the origin part of a URL. + */ + predicate isConstantInvalidInsideOrigin(RegExpConstant term) { + // Look for any of these cases: + // - A character that can't occur in the origin + // - Two dashes in a row + // - A colon that is not part of port or scheme separator + // - A slash that is not part of scheme separator + term.getValue().regexpMatch(".*(?:[^a-zA-Z0-9.:/-]|--|:[^0-9/]|(?