From 9f4da6503042291b5828a0dfb4fe8be6609605cb Mon Sep 17 00:00:00 2001 From: Joe Farebrother Date: Fri, 11 Feb 2022 17:35:38 +0000 Subject: [PATCH] Improve calculation of locations of regex terms --- .../semmle/code/java/regex/RegexTreeView.qll | 16 ++-- java/ql/lib/semmle/code/java/regex/regex.qll | 68 ++++++++++++-- .../regex/RegexParseTests.expected | 90 ++++++++++--------- java/ql/test/library-tests/regex/Test.java | 5 +- 4 files changed, 123 insertions(+), 56 deletions(-) diff --git a/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll b/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll index fd9b93a142f..c022ba6b2ac 100644 --- a/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll +++ b/java/ql/lib/semmle/code/java/regex/RegexTreeView.qll @@ -188,12 +188,18 @@ class RegExpTerm extends RegExpParent { predicate hasLocationInfo( string filepath, int startline, int startcolumn, int endline, int endcolumn ) { - // This currently gives incorrect results for string literals including backslashes. TODO: fix that. - // There are also more complex cases where it fails. Handling all of them would be difficult for not much gain. - exists(int re_start, int re_end | + /* + * This is an approximation that handles the simple and common case of single, + * normal string literal written in the source, but does not give correct results in more complex cases + * such as compile-time concatenation, or multi-line string literals. + */ + + exists(int re_start, int re_end, int src_start, int src_end | re.getLocation().hasLocationInfo(filepath, startline, re_start, endline, re_end) and - startcolumn = re_start + start + 1 and - endcolumn = re_start + end + re.sourceCharacter(start, src_start, _) and + re.sourceCharacter(end - 1, _, src_end) and + startcolumn = re_start + src_start and + endcolumn = re_start + src_end - 1 ) } diff --git a/java/ql/lib/semmle/code/java/regex/regex.qll b/java/ql/lib/semmle/code/java/regex/regex.qll index 71ec4d1f7d2..9bd49037cdd 100644 --- a/java/ql/lib/semmle/code/java/regex/regex.qll +++ b/java/ql/lib/semmle/code/java/regex/regex.qll @@ -27,7 +27,6 @@ abstract class RegexString extends StringLiteral { * In order to avoid negative recursion, we return a boolean. * This way, we can refer to `escaping(pos - 1).booleanNot()` * rather than to a negated version of `escaping(pos)`. - * Does not take into account escape characters inside quote sequences. */ private boolean escaping(int pos) { pos = -1 and result = false @@ -104,11 +103,10 @@ abstract class RegexString extends StringLiteral { end = start + 3 } - string nonEscapedCharAt(int i) { - result = this.getText().charAt(i) and + private string nonEscapedCharAt(int i) { + result = this.getChar(i) and not exists(int x, int y | this.escapedCharacter(x, y) and i in [x .. y - 1]) and - not exists(int x, int y | this.quote(x, y) and i in [x .. y - 1]) and - not exists(int x, int y | this.controlEscape(x, y) and i in [x .. y - 1]) + not exists(int x, int y | this.quote(x, y) and i in [x .. y - 1]) } /** Holds if a character set starts between `start` and `end`, including any negation character (`^`). */ @@ -822,6 +820,66 @@ abstract class RegexString extends StringLiteral { this.alternation(start, end) and this.subalternation(start, part_end, part_start) } + + /** + * Gets the `i`th character of this literal as it was written in the source code. + */ + string getSourceChar(int i) { result = this.(StringLiteral).getLiteral().charAt(i) } + + /** + * Helper predicate for `sourceEscapingChar` that + * results in a boolean in order to avoid negative recursion. + */ + private boolean sourceEscaping(int pos) { + pos = -1 and result = false + or + this.getSourceChar(pos) = "\\" and + result = this.sourceEscaping(pos - 1).booleanNot() + or + this.getSourceChar(pos) != "\\" and result = false + } + + /** + * Equivalent of `escapingChar` for the literal source rather than the string value. + * Holds if the character at position `pos` in the source literal is a '\' that is + * actually escaping what comes after it. + */ + private predicate sourceEcapingChar(int pos) { this.sourceEscaping(pos) = true } + + /** + * Holds if an escaped character exists between `start` and `end` in the source iteral. + */ + private predicate sourceEscapedCharacter(int start, int end) { + this.sourceEcapingChar(start) and + (if this.getSourceChar(start + 1) = "u" then end = start + 6 else end = start + 2) + } + + private predicate sourceNonEscapedCharacter(int i) { + exists(this.getSourceChar(i)) and + not exists(int x, int y | this.sourceEscapedCharacter(x, y) and i in [x .. y - 1]) + } + + /** + * Holds if a character is represented between `start` and `end` in the source literal. + */ + private predicate sourceCharacter(int start, int end) { + sourceEscapedCharacter(start, end) + or + sourceNonEscapedCharacter(start) and + end = start + 1 + } + + /** + * Holds if the `i`th character of the string is represented between offsets + * `start` (inclusive) and `end` (exclusive) in the source code of this literal. + * This only gives correct results if the literal is written as a normal single-line string literal; + * without compile-time concatenation involved. + */ + predicate sourceCharacter(int pos, int start, int end) { + exists(this.getChar(pos)) and + sourceCharacter(start, end) and + start = rank[pos + 2](int s | sourceCharacter(s, _)) + } } /** A string literal used as a regular expression */ diff --git a/java/ql/test/library-tests/regex/RegexParseTests.expected b/java/ql/test/library-tests/regex/RegexParseTests.expected index ebd0317bcc2..c6d8322e5c1 100644 --- a/java/ql/test/library-tests/regex/RegexParseTests.expected +++ b/java/ql/test/library-tests/regex/RegexParseTests.expected @@ -1,14 +1,14 @@ parseFailures #select -| Test.java:5:10:5:16 | [A-Z\\d] | [RegExpCharacterClass] | -| Test.java:5:10:5:18 | [A-Z\\d]++ | [RegExpPlus] | +| Test.java:5:10:5:17 | [A-Z\\d] | [RegExpCharacterClass] | +| Test.java:5:10:5:19 | [A-Z\\d]++ | [RegExpPlus] | | Test.java:5:11:5:11 | A | [RegExpConstant,RegExpNormalChar] | | Test.java:5:11:5:13 | A-Z | [RegExpCharacterRange] | | Test.java:5:13:5:13 | Z | [RegExpConstant,RegExpNormalChar] | -| Test.java:5:14:5:15 | \\d | [RegExpCharacterClassEscape] | -| Test.java:6:10:6:39 | \\Q hello world [ *** \\Q ) ( \\E | [RegExpConstant,RegExpQuote] | -| Test.java:7:10:7:21 | [\\Q hi ] \\E] | [RegExpCharacterClass] | -| Test.java:7:11:7:20 | \\Q hi ] \\E | [RegExpConstant,RegExpQuote] | +| Test.java:5:14:5:16 | \\d | [RegExpCharacterClassEscape] | +| Test.java:6:10:6:42 | \\Q hello world [ *** \\Q ) ( \\E | [RegExpConstant,RegExpQuote] | +| Test.java:7:10:7:23 | [\\Q hi ] \\E] | [RegExpCharacterClass] | +| Test.java:7:11:7:22 | \\Q hi ] \\E | [RegExpConstant,RegExpQuote] | | Test.java:8:10:8:12 | []] | [RegExpCharacterClass] | | Test.java:8:11:8:11 | ] | [RegExpConstant,RegExpNormalChar] | | Test.java:9:10:9:13 | [^]] | [RegExpCharacterClass] | @@ -23,33 +23,33 @@ parseFailures | Test.java:10:17:10:17 | f | [RegExpConstant,RegExpNormalChar] | | Test.java:10:18:10:18 | g | [RegExpConstant,RegExpNormalChar] | | Test.java:10:19:10:19 | ] | [RegExpConstant,RegExpNormalChar] | -| Test.java:11:10:11:53 | [abc&&[\\W\\p{Lower}\\P{Space}\\N{degree sign}]] | [RegExpCharacterClass] | -| Test.java:11:10:11:62 | [abc&&[\\W\\p{Lower}\\P{Space}\\N{degree sign}]]\\b7\\b{g}8 | [RegExpSequence] | +| Test.java:11:10:11:57 | [abc&&[\\W\\p{Lower}\\P{Space}\\N{degree sign}]] | [RegExpCharacterClass] | +| Test.java:11:10:11:68 | [abc&&[\\W\\p{Lower}\\P{Space}\\N{degree sign}]]\\b7\\b{g}8 | [RegExpSequence] | | Test.java:11:11:11:11 | a | [RegExpConstant,RegExpNormalChar] | | Test.java:11:12:11:12 | b | [RegExpConstant,RegExpNormalChar] | | Test.java:11:13:11:13 | c | [RegExpConstant,RegExpNormalChar] | | Test.java:11:14:11:14 | & | [RegExpConstant,RegExpNormalChar] | | Test.java:11:15:11:15 | & | [RegExpConstant,RegExpNormalChar] | | Test.java:11:16:11:16 | [ | [RegExpConstant,RegExpNormalChar] | -| Test.java:11:17:11:18 | \\W | [RegExpCharacterClassEscape] | -| Test.java:11:19:11:27 | \\p{Lower} | [RegExpCharacterClassEscape] | -| Test.java:11:28:11:36 | \\P{Space} | [RegExpCharacterClassEscape] | -| Test.java:11:37:11:51 | \\N{degree sign} | [RegExpConstant,RegExpEscape] | -| Test.java:11:52:11:52 | ] | [RegExpConstant,RegExpNormalChar] | -| Test.java:11:54:11:55 | \\b | [RegExpConstant,RegExpEscape] | -| Test.java:11:56:11:56 | 7 | [RegExpConstant,RegExpNormalChar] | -| Test.java:11:57:11:61 | \\b{g} | [RegExpConstant,RegExpEscape] | -| Test.java:11:62:11:62 | 8 | [RegExpConstant,RegExpNormalChar] | -| Test.java:12:10:12:12 | \\cA | [RegExpConstant,RegExpEscape] | -| Test.java:13:10:13:12 | \\c( | [RegExpConstant,RegExpEscape] | -| Test.java:14:10:14:12 | \\c\\ | [RegExpConstant,RegExpEscape] | -| Test.java:14:10:14:16 | \\c\\(ab) | [RegExpSequence] | -| Test.java:14:13:14:16 | (ab) | [RegExpGroup] | -| Test.java:14:14:14:14 | a | [RegExpConstant,RegExpNormalChar] | -| Test.java:14:14:14:15 | ab | [RegExpSequence] | -| Test.java:14:15:14:15 | b | [RegExpConstant,RegExpNormalChar] | +| Test.java:11:17:11:19 | \\W | [RegExpCharacterClassEscape] | +| Test.java:11:20:11:29 | \\p{Lower} | [RegExpCharacterClassEscape] | +| Test.java:11:30:11:39 | \\P{Space} | [RegExpCharacterClassEscape] | +| Test.java:11:40:11:55 | \\N{degree sign} | [RegExpConstant,RegExpEscape] | +| Test.java:11:56:11:56 | ] | [RegExpConstant,RegExpNormalChar] | +| Test.java:11:58:11:60 | \\b | [RegExpConstant,RegExpEscape] | +| Test.java:11:61:11:61 | 7 | [RegExpConstant,RegExpNormalChar] | +| Test.java:11:62:11:67 | \\b{g} | [RegExpConstant,RegExpEscape] | +| Test.java:11:68:11:68 | 8 | [RegExpConstant,RegExpNormalChar] | +| Test.java:12:10:12:13 | \\cA | [RegExpConstant,RegExpEscape] | +| Test.java:13:10:13:13 | \\c( | [RegExpConstant,RegExpEscape] | +| Test.java:14:10:14:14 | \\c\\ | [RegExpConstant,RegExpEscape] | +| Test.java:14:10:14:18 | \\c\\(ab) | [RegExpSequence] | +| Test.java:14:15:14:18 | (ab) | [RegExpGroup] | +| Test.java:14:16:14:16 | a | [RegExpConstant,RegExpNormalChar] | +| Test.java:14:16:14:17 | ab | [RegExpSequence] | +| Test.java:14:17:14:17 | b | [RegExpConstant,RegExpNormalChar] | | Test.java:15:10:15:15 | (?>hi) | [RegExpGroup] | -| Test.java:15:10:15:44 | (?>hi)(?hell*?o*+)123\\k | [RegExpSequence] | +| Test.java:15:10:15:45 | (?>hi)(?hell*?o*+)123\\k | [RegExpSequence] | | Test.java:15:13:15:13 | h | [RegExpConstant,RegExpNormalChar] | | Test.java:15:13:15:14 | hi | [RegExpSequence] | | Test.java:15:14:15:14 | i | [RegExpConstant,RegExpNormalChar] | @@ -65,7 +65,7 @@ parseFailures | Test.java:15:34:15:34 | 1 | [RegExpConstant,RegExpNormalChar] | | Test.java:15:35:15:35 | 2 | [RegExpConstant,RegExpNormalChar] | | Test.java:15:36:15:36 | 3 | [RegExpConstant,RegExpNormalChar] | -| Test.java:15:37:15:44 | \\k | [RegExpBackRef] | +| Test.java:15:37:15:45 | \\k | [RegExpBackRef] | | Test.java:16:10:16:10 | a | [RegExpConstant,RegExpNormalChar] | | Test.java:16:10:16:11 | a+ | [RegExpPlus] | | Test.java:16:10:16:108 | a+b*c?d{2}e{3,4}f{,5}g{6,}h+?i*?j??k{7}?l{8,9}?m{,10}?n{11,}?o++p*+q?+r{12}+s{13,14}+t{,15}+u{16,}+ | [RegExpSequence] | @@ -120,20 +120,22 @@ parseFailures | Test.java:17:30:17:35 | (?hi)(?hell*?o*+)123\\k", "a+b*c?d{2}e{3,4}f{,5}g{6,}h+?i*?j??k{7}?l{8,9}?m{,10}?n{11,}?o++p*+q?+r{12}+s{13,14}+t{,15}+u{16,}+", "(?i)(?=a)(?!b)(?<=c)(?