make the character search skip unencodable characters

This commit is contained in:
Erik Krogh Kristensen
2020-11-18 11:55:49 +01:00
parent 55f2f86a26
commit 8270bf5bb9
2 changed files with 23 additions and 8 deletions

View File

@@ -122,6 +122,13 @@ class RegExpRepetition extends RegExpParent {
} }
} }
/**
 * A constant in a regular expression that represents valid Unicode character(s).
 *
 * This is the subset of `RegExpConstant` for which `isCharacter()` holds.
 * NOTE(review): presumably introduced so that the character-based parts of the
 * analysis skip constants whose value cannot be encoded as a string (for
 * example lone surrogate escapes) - confirm against the definition of
 * `isCharacter()`.
 */
class RegexpCharacterConstant extends RegExpConstant {
  // Characteristic predicate: keep only the constants accepted by `isCharacter()`.
  RegexpCharacterConstant() { this.isCharacter() }
}
/** /**
* Gets the root containing the given term, that is, the root of the literal, * Gets the root containing the given term, that is, the root of the literal,
* or a branch of the root disjunction. * or a branch of the root disjunction.
@@ -136,7 +143,9 @@ RegExpRoot getRoot(RegExpTerm term) {
*/ */
newtype TInputSymbol = newtype TInputSymbol =
/** An input symbol corresponding to character `c`. */ /** An input symbol corresponding to character `c`. */
Char(string c) { c = any(RegExpConstant cc | getRoot(cc).isRelevant()).getValue().charAt(_) } or Char(string c) {
c = any(RegexpCharacterConstant cc | getRoot(cc).isRelevant()).getValue().charAt(_)
} or
/** /**
* An input symbol representing all characters matched by * An input symbol representing all characters matched by
* (non-universal) character class `recc`. * (non-universal) character class `recc`.
@@ -173,7 +182,7 @@ private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
*/ */
private predicate belongsTo(TInputSymbol a, RegExpRoot root) { private predicate belongsTo(TInputSymbol a, RegExpRoot root) {
exists(RegExpTerm term | getRoot(term) = root | exists(RegExpTerm term | getRoot(term) = root |
a = Char(term.(RegExpConstant).getValue().charAt(_)) a = Char(term.(RegexpCharacterConstant).getValue().charAt(_))
or or
a = CharClass(term) a = CharClass(term)
) )
@@ -236,7 +245,7 @@ private module CharacterClasses {
predicate hasChildThatMatches(RegExpCharacterClass cc, string char) { predicate hasChildThatMatches(RegExpCharacterClass cc, string char) {
exists(CharClass(cc)) and exists(CharClass(cc)) and
exists(RegExpTerm child | child = cc.getAChild() | exists(RegExpTerm child | child = cc.getAChild() |
char = child.(RegExpConstant).getValue() char = child.(RegexpCharacterConstant).getValue()
or or
rangeMatchesOnLetterOrDigits(child, char) rangeMatchesOnLetterOrDigits(child, char)
or or
@@ -300,7 +309,7 @@ private module CharacterClasses {
private string getARelevantChar() { private string getARelevantChar() {
exists(ascii(result)) exists(ascii(result))
or or
exists(RegExpConstant c | result = c.getValue().charAt(_)) exists(RegexpCharacterConstant c | result = c.getValue().charAt(_))
or or
classEscapeMatches(_, result) classEscapeMatches(_, result)
} }
@@ -310,7 +319,7 @@ private module CharacterClasses {
*/ */
private string getAMentionedChar(RegExpCharacterClass c) { private string getAMentionedChar(RegExpCharacterClass c) {
exists(RegExpTerm child | child = c.getAChild() | exists(RegExpTerm child | child = c.getAChild() |
result = child.(RegExpConstant).getValue() result = child.(RegexpCharacterConstant).getValue()
or or
child.(RegExpCharacterRange).isRange(result, _) child.(RegExpCharacterRange).isRange(result, _)
or or
@@ -439,7 +448,7 @@ newtype TState =
( (
i = 0 i = 0
or or
exists(t.(RegExpConstant).getValue().charAt(i)) exists(t.(RegexpCharacterConstant).getValue().charAt(i))
) )
} or } or
Accept(RegExpRoot l) { l.isRelevant() } Accept(RegExpRoot l) { l.isRelevant() }
@@ -511,7 +520,7 @@ State after(RegExpTerm t) {
* Holds if the NFA has a transition from `q1` to `q2` labelled with `lbl`. * Holds if the NFA has a transition from `q1` to `q2` labelled with `lbl`.
*/ */
predicate delta(State q1, EdgeLabel lbl, State q2) { predicate delta(State q1, EdgeLabel lbl, State q2) {
exists(RegExpConstant s, int i | exists(RegexpCharacterConstant s, int i |
q1 = Match(s, i) and q1 = Match(s, i) and
lbl = Char(s.getValue().charAt(i)) and lbl = Char(s.getValue().charAt(i)) and
( (

View File

@@ -272,4 +272,10 @@ var bad60 = /(.thisisagoddamnlongstringforstresstestingthequery|\sthisisagoddamn
var bad61 = /(thisisagoddamnlongstringforstresstestingthequery|this\w+query)*-/ var bad61 = /(thisisagoddamnlongstringforstresstestingthequery|this\w+query)*-/
// GOOD // GOOD
var good27 = /(thisisagoddamnlongstringforstresstestingthequery|imanotherbutunrelatedstringcomparedtotheotherstring)*-/ var good27 = /(thisisagoddamnlongstringforstresstestingthequery|imanotherbutunrelatedstringcomparedtotheotherstring)*-/
// GOOD
// The escapes \uDC66-\uDC69 are lone low-surrogate code units (U+DC00-U+DFFF),
// i.e. not valid Unicode characters on their own. This case places them inside
// character classes.
// NOTE(review): presumably a regression test for skipping unencodable
// characters in the character search - confirm against the query.
var good28 = /foo([\uDC66\uDC67]|[\uDC68\uDC69])*foo/
// GOOD
// Same lone-surrogate escapes as above, but written as plain constants in
// nested alternations instead of character classes.
var good29 = /foo((\uDC66|\uDC67)|(\uDC68|\uDC69))*foo/