From b737bdbca037dd178690c3245d1b0d3d5b08ed2e Mon Sep 17 00:00:00 2001 From: erik-krogh Date: Tue, 1 Nov 2022 12:16:28 +0100 Subject: [PATCH] add a Java implementation of `RegexTreeViewSig` --- .../semmle/code/java/regex/RegexTreeView.qll | 91 ++++++++++++++++++- 1 file changed, 87 insertions(+), 4 deletions(-) diff --git a/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll b/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll index ceccd3efd5f..2eecff1e627 100644 --- a/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll +++ b/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll @@ -1,11 +1,19 @@ /** Provides a class hierarchy corresponding to a parse tree of regular expressions. */ -private import java -private import semmle.code.java.regex.regex +private import semmle.code.java.regex.regex as RE // importing under a namespace to avoid naming conflict for `Top`. +private import codeql.regex.nfa.NfaUtils as NfaUtils +// exporting as RegexTreeView, and in the top-level scope. +import Impl as RegexTreeView import Impl /** Gets the parse tree resulting from parsing `re`, if such has been constructed. */ -RegExpTerm getParsedRegExp(StringLiteral re) { result.getRegex() = re and result.isRootTerm() } +RegExpTerm getParsedRegExp(RE::StringLiteral re) { result.getRegex() = re and result.isRootTerm() } + +private class Regex = RE::Regex; + +private class Location = RE::Location; + +private class File = RE::File; /** * An element containing a regular expression term, that is, either @@ -53,7 +61,10 @@ private newtype TRegExpParent = /** A back reference */ TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) } -module Impl { +private import codeql.regex.RegexTreeView + +/** An implementation that satisfies the RegexTreeView signature. 
*/ +module Impl implements RegexTreeViewSig { /** * An element containing a regular expression term, that is, either * a string literal (parsed as a regular expression; the root of the parse tree) @@ -547,6 +558,13 @@ module Impl { } } + /** + * A word boundary, that is, a regular expression term of the form `\b`. + */ + class RegExpWordBoundary extends RegExpSpecialChar { + RegExpWordBoundary() { this.getChar() = "\\b" } + } + /** * Gets the hex number for the `hex` char. */ @@ -1088,4 +1106,69 @@ module Impl { override string getPrimaryQLClass() { result = "RegExpBackRef" } } + + class Top = RegExpParent; + + /** + * Holds if `term` is an escape class representing e.g. `\d`. + * `clazz` is which character class it represents, e.g. "d" for `\d`. + */ + predicate isEscapeClass(RegExpTerm term, string clazz) { + term.(RegExpCharacterClassEscape).getValue() = clazz + or + term.(RegExpNamedProperty).getBackslashEquivalent() = clazz + } + + /** + * Holds if `term` is a possessive quantifier, e.g. `a*+`. + */ + predicate isPossessive(RegExpQuantifier term) { term.isPossessive() } + + /** + * Holds if the regex that `term` is part of is used in a way that ignores any leading prefix of the input it's matched against. + */ + predicate matchesAnyPrefix(RegExpTerm term) { not term.getRegex().matchesFullString() } + + /** + * Holds if the regex that `term` is part of is used in a way that ignores any trailing suffix of the input it's matched against. + */ + predicate matchesAnySuffix(RegExpTerm term) { not term.getRegex().matchesFullString() } + + /** + * Holds if the regular expression should not be considered. + * + * We make the pragmatic performance optimization to ignore regular expressions in files + * that do not belong to the project code (such as installed dependencies). 
+ */ + predicate isExcluded(RegExpParent parent) { + not exists(parent.getRegex().getLocation().getFile().getRelativePath()) + or + // Regexes with many occurrences of ".*" may cause the polynomial ReDoS computation to explode, so + // we explicitly exclude these. + strictcount(int i | exists(parent.getRegex().getText().regexpFind("\\.\\*", i, _)) | i) > 10 + } + + /** + * Holds if `root` has the `i` flag for case-insensitive matching. + */ + predicate isIgnoreCase(RegExpTerm root) { + root.isRootTerm() and + root.getLiteral().isIgnoreCase() + } + + /** + * Gets the flags for `root`, or the empty string if `root` has no flags. + */ + deprecated string getFlags(RegExpTerm root) { + root.isRootTerm() and + result = root.getLiteral().getFlags() + } + + /** + * Holds if `root` has the `s` flag for dot-all matching, where `.` also matches line terminators. + */ + predicate isDotAll(RegExpTerm root) { + root.isRootTerm() and + root.getLiteral().isDotAll() + } }