Fix parsing of POSIX bracket expressions.

The docs are misleading: [[:alpha:]] is actually a character class
*containing* a POSIX bracket expression, which means you can also write
expressions like [[:alpha:][:digit:]_?!].
This commit is contained in:
Nick Rolfe
2021-07-28 15:02:13 +01:00
parent 5d336d8e1d
commit 3abe047cac
3 changed files with 220 additions and 110 deletions

View File

@@ -223,252 +223,334 @@ regexp.rb:
# 26| [RegExpConstant, RegExpNormalChar] -
# 29| [RegExpStar] .*
# 29| [RegExpCharacterClass] [[a-f]
#-----| 0 -> [RegExpConstant, RegExpNormalChar] [
#-----| 1 -> [RegExpCharacterRange] a-f
# 29| [RegExpSequence] [[a-f]A-F]
#-----| 0 -> [RegExpCharacterClass] [[a-f]
#-----| 1 -> [RegExpConstant, RegExpNormalChar] A
#-----| 2 -> [RegExpConstant, RegExpNormalChar] -
#-----| 3 -> [RegExpConstant, RegExpNormalChar] F
#-----| 4 -> [RegExpConstant, RegExpNormalChar] ]
# 29| [RegExpConstant, RegExpNormalChar] [
# 29| [RegExpCharacterRange] a-f
#-----| 0 -> [RegExpConstant, RegExpNormalChar] a
#-----| 1 -> [RegExpConstant, RegExpNormalChar] f
# 29| [RegExpConstant, RegExpNormalChar] a
# 29| [RegExpConstant, RegExpNormalChar] f
# 29| [RegExpConstant, RegExpNormalChar] A
# 29| [RegExpConstant, RegExpNormalChar] -
# 29| [RegExpConstant, RegExpNormalChar] F
# 29| [RegExpConstant, RegExpNormalChar] ]
# 32| [RegExpStar] .*
#-----| 0 -> [RegExpDot] .
# 29| [RegExpDot] .
# 32| [RegExpDot] .
# 30| [RegExpStar] .*
# 33| [RegExpStar] .*
#-----| 0 -> [RegExpDot] .
# 30| [RegExpDot] .
# 33| [RegExpDot] .
# 31| [RegExpPlus] \w+
# 34| [RegExpPlus] \w+
#-----| 0 -> [RegExpCharacterClassEscape] \w
# 31| [RegExpCharacterClassEscape] \w
# 34| [RegExpCharacterClassEscape] \w
# 31| [RegExpSequence] \w+\W
# 34| [RegExpSequence] \w+\W
#-----| 0 -> [RegExpPlus] \w+
#-----| 1 -> [RegExpCharacterClassEscape] \W
# 31| [RegExpCharacterClassEscape] \W
# 34| [RegExpCharacterClassEscape] \W
# 32| [RegExpCharacterClassEscape] \s
# 35| [RegExpCharacterClassEscape] \s
# 32| [RegExpSequence] \s\S
# 35| [RegExpSequence] \s\S
#-----| 0 -> [RegExpCharacterClassEscape] \s
#-----| 1 -> [RegExpCharacterClassEscape] \S
# 32| [RegExpCharacterClassEscape] \S
# 35| [RegExpCharacterClassEscape] \S
# 33| [RegExpCharacterClassEscape] \d
# 36| [RegExpCharacterClassEscape] \d
# 33| [RegExpSequence] \d\D
# 36| [RegExpSequence] \d\D
#-----| 0 -> [RegExpCharacterClassEscape] \d
#-----| 1 -> [RegExpCharacterClassEscape] \D
# 33| [RegExpCharacterClassEscape] \D
# 36| [RegExpCharacterClassEscape] \D
# 34| [RegExpCharacterClassEscape] \h
# 37| [RegExpCharacterClassEscape] \h
# 34| [RegExpSequence] \h\H
# 37| [RegExpSequence] \h\H
#-----| 0 -> [RegExpCharacterClassEscape] \h
#-----| 1 -> [RegExpCharacterClassEscape] \H
# 34| [RegExpCharacterClassEscape] \H
# 37| [RegExpCharacterClassEscape] \H
# 35| [RegExpConstant, RegExpEscape] \n
# 38| [RegExpConstant, RegExpEscape] \n
# 35| [RegExpSequence] \n\r\t
# 38| [RegExpSequence] \n\r\t
#-----| 0 -> [RegExpConstant, RegExpEscape] \n
#-----| 1 -> [RegExpConstant, RegExpEscape] \r
#-----| 2 -> [RegExpConstant, RegExpEscape] \t
# 35| [RegExpConstant, RegExpEscape] \r
# 38| [RegExpConstant, RegExpEscape] \r
# 35| [RegExpConstant, RegExpEscape] \t
# 38| [RegExpConstant, RegExpEscape] \t
# 38| [RegExpStar] (foo)*
# 41| [RegExpStar] (foo)*
#-----| 0 -> [RegExpGroup] (foo)
# 38| [RegExpGroup] (foo)
# 41| [RegExpGroup] (foo)
#-----| 0 -> [RegExpSequence] foo
# 38| [RegExpSequence] (foo)*bar
# 41| [RegExpSequence] (foo)*bar
#-----| 0 -> [RegExpStar] (foo)*
#-----| 1 -> [RegExpConstant, RegExpNormalChar] b
#-----| 2 -> [RegExpConstant, RegExpNormalChar] a
#-----| 3 -> [RegExpConstant, RegExpNormalChar] r
# 38| [RegExpConstant, RegExpNormalChar] f
# 41| [RegExpConstant, RegExpNormalChar] f
# 38| [RegExpSequence] foo
# 41| [RegExpSequence] foo
#-----| 0 -> [RegExpConstant, RegExpNormalChar] f
#-----| 1 -> [RegExpConstant, RegExpNormalChar] o
#-----| 2 -> [RegExpConstant, RegExpNormalChar] o
# 38| [RegExpConstant, RegExpNormalChar] o
# 41| [RegExpConstant, RegExpNormalChar] o
# 38| [RegExpConstant, RegExpNormalChar] o
# 41| [RegExpConstant, RegExpNormalChar] o
# 38| [RegExpConstant, RegExpNormalChar] b
# 41| [RegExpConstant, RegExpNormalChar] b
# 38| [RegExpConstant, RegExpNormalChar] a
# 41| [RegExpConstant, RegExpNormalChar] a
# 38| [RegExpConstant, RegExpNormalChar] r
# 41| [RegExpConstant, RegExpNormalChar] r
# 39| [RegExpConstant, RegExpNormalChar] f
# 42| [RegExpConstant, RegExpNormalChar] f
# 39| [RegExpSequence] fo(o|b)ar
# 42| [RegExpSequence] fo(o|b)ar
#-----| 0 -> [RegExpConstant, RegExpNormalChar] f
#-----| 1 -> [RegExpConstant, RegExpNormalChar] o
#-----| 2 -> [RegExpGroup] (o|b)
#-----| 3 -> [RegExpConstant, RegExpNormalChar] a
#-----| 4 -> [RegExpConstant, RegExpNormalChar] r
# 39| [RegExpConstant, RegExpNormalChar] o
# 42| [RegExpConstant, RegExpNormalChar] o
# 39| [RegExpGroup] (o|b)
# 42| [RegExpGroup] (o|b)
#-----| 0 -> [RegExpAlt] o|b
# 39| [RegExpAlt] o|b
# 42| [RegExpAlt] o|b
#-----| 0 -> [RegExpConstant, RegExpNormalChar] o
#-----| 1 -> [RegExpConstant, RegExpNormalChar] b
# 39| [RegExpConstant, RegExpNormalChar] o
# 42| [RegExpConstant, RegExpNormalChar] o
# 39| [RegExpConstant, RegExpNormalChar] b
# 42| [RegExpConstant, RegExpNormalChar] b
# 39| [RegExpConstant, RegExpNormalChar] a
# 42| [RegExpConstant, RegExpNormalChar] a
# 39| [RegExpConstant, RegExpNormalChar] r
# 42| [RegExpConstant, RegExpNormalChar] r
# 40| [RegExpGroup] (a|b|cd)
# 43| [RegExpGroup] (a|b|cd)
#-----| 0 -> [RegExpAlt] a|b|cd
# 40| [RegExpSequence] (a|b|cd)e
# 43| [RegExpSequence] (a|b|cd)e
#-----| 0 -> [RegExpGroup] (a|b|cd)
#-----| 1 -> [RegExpConstant, RegExpNormalChar] e
# 40| [RegExpAlt] a|b|cd
# 43| [RegExpAlt] a|b|cd
#-----| 0 -> [RegExpConstant, RegExpNormalChar] a
#-----| 1 -> [RegExpConstant, RegExpNormalChar] b
#-----| 2 -> [RegExpSequence] cd
# 40| [RegExpConstant, RegExpNormalChar] a
# 43| [RegExpConstant, RegExpNormalChar] a
# 40| [RegExpConstant, RegExpNormalChar] b
# 43| [RegExpConstant, RegExpNormalChar] b
# 40| [RegExpConstant, RegExpNormalChar] c
# 43| [RegExpConstant, RegExpNormalChar] c
# 40| [RegExpSequence] cd
# 43| [RegExpSequence] cd
#-----| 0 -> [RegExpConstant, RegExpNormalChar] c
#-----| 1 -> [RegExpConstant, RegExpNormalChar] d
# 40| [RegExpConstant, RegExpNormalChar] d
# 43| [RegExpConstant, RegExpNormalChar] d
# 40| [RegExpConstant, RegExpNormalChar] e
# 43| [RegExpConstant, RegExpNormalChar] e
# 41| [RegExpGroup] (?::+)
# 44| [RegExpGroup] (?::+)
#-----| 0 -> [RegExpPlus] :+
# 41| [RegExpSequence] (?::+)\w
# 44| [RegExpSequence] (?::+)\w
#-----| 0 -> [RegExpGroup] (?::+)
#-----| 1 -> [RegExpCharacterClassEscape] \w
# 41| [RegExpPlus] :+
# 44| [RegExpPlus] :+
#-----| 0 -> [RegExpConstant, RegExpNormalChar] :
# 41| [RegExpConstant, RegExpNormalChar] :
# 41| [RegExpCharacterClassEscape] \w
# 44| [RegExpGroup] (?<id>\w+)
#-----| 0 -> [RegExpPlus] \w+
# 44| [RegExpPlus] \w+
#-----| 0 -> [RegExpCharacterClassEscape] \w
# 44| [RegExpConstant, RegExpNormalChar] :
# 44| [RegExpCharacterClassEscape] \w
# 45| [RegExpGroup] (?'foo'fo+)
# 47| [RegExpGroup] (?<id>\w+)
#-----| 0 -> [RegExpPlus] \w+
# 47| [RegExpPlus] \w+
#-----| 0 -> [RegExpCharacterClassEscape] \w
# 47| [RegExpCharacterClassEscape] \w
# 48| [RegExpGroup] (?'foo'fo+)
#-----| 0 -> [RegExpSequence] fo+
# 45| [RegExpConstant, RegExpNormalChar] f
# 48| [RegExpConstant, RegExpNormalChar] f
# 45| [RegExpSequence] fo+
# 48| [RegExpSequence] fo+
#-----| 0 -> [RegExpConstant, RegExpNormalChar] f
#-----| 1 -> [RegExpPlus] o+
# 45| [RegExpPlus] o+
# 48| [RegExpPlus] o+
#-----| 0 -> [RegExpConstant, RegExpNormalChar] o
# 45| [RegExpConstant, RegExpNormalChar] o
# 48| [RegExpConstant, RegExpNormalChar] o
# 48| [RegExpGroup] (a+)
# 51| [RegExpGroup] (a+)
#-----| 0 -> [RegExpPlus] a+
# 48| [RegExpSequence] (a+)b+\1
# 51| [RegExpSequence] (a+)b+\1
#-----| 0 -> [RegExpGroup] (a+)
#-----| 1 -> [RegExpPlus] b+
#-----| 2 -> [RegExpBackRef] \1
# 48| [RegExpPlus] a+
# 51| [RegExpPlus] a+
#-----| 0 -> [RegExpConstant, RegExpNormalChar] a
# 48| [RegExpConstant, RegExpNormalChar] a
# 51| [RegExpConstant, RegExpNormalChar] a
# 48| [RegExpPlus] b+
# 51| [RegExpPlus] b+
#-----| 0 -> [RegExpConstant, RegExpNormalChar] b
# 48| [RegExpConstant, RegExpNormalChar] b
# 51| [RegExpConstant, RegExpNormalChar] b
# 48| [RegExpBackRef] \1
# 51| [RegExpBackRef] \1
# 49| [RegExpGroup] (?<qux>q+)
# 52| [RegExpGroup] (?<qux>q+)
#-----| 0 -> [RegExpPlus] q+
# 49| [RegExpSequence] (?<qux>q+)\s+\k<qux>+
# 52| [RegExpSequence] (?<qux>q+)\s+\k<qux>+
#-----| 0 -> [RegExpGroup] (?<qux>q+)
#-----| 1 -> [RegExpPlus] \s+
#-----| 2 -> [RegExpPlus] \k<qux>+
# 49| [RegExpPlus] q+
# 52| [RegExpPlus] q+
#-----| 0 -> [RegExpConstant, RegExpNormalChar] q
# 49| [RegExpConstant, RegExpNormalChar] q
# 52| [RegExpConstant, RegExpNormalChar] q
# 49| [RegExpPlus] \s+
# 52| [RegExpPlus] \s+
#-----| 0 -> [RegExpCharacterClassEscape] \s
# 49| [RegExpCharacterClassEscape] \s
# 52| [RegExpCharacterClassEscape] \s
# 49| [RegExpBackRef] \k<qux>
# 52| [RegExpBackRef] \k<qux>
# 49| [RegExpPlus] \k<qux>+
# 52| [RegExpPlus] \k<qux>+
#-----| 0 -> [RegExpBackRef] \k<qux>
# 52| [RegExpNamedCharacterProperty] \p{Word}
# 55| [RegExpNamedCharacterProperty] \p{Word}
# 52| [RegExpStar] \p{Word}*
# 55| [RegExpStar] \p{Word}*
#-----| 0 -> [RegExpNamedCharacterProperty] \p{Word}
# 53| [RegExpNamedCharacterProperty] \P{Digit}
# 56| [RegExpNamedCharacterProperty] \P{Digit}
# 53| [RegExpPlus] \P{Digit}+
# 56| [RegExpPlus] \P{Digit}+
#-----| 0 -> [RegExpNamedCharacterProperty] \P{Digit}
# 54| [RegExpNamedCharacterProperty] \p{^Alnum}
# 57| [RegExpNamedCharacterProperty] \p{^Alnum}
# 54| [RegExpRange] \p{^Alnum}{2,3}
# 57| [RegExpRange] \p{^Alnum}{2,3}
#-----| 0 -> [RegExpNamedCharacterProperty] \p{^Alnum}
# 54| [RegExpNormalChar] 2
# 57| [RegExpNormalChar] 2
# 54| [RegExpNormalChar] ,
# 57| [RegExpNormalChar] ,
# 54| [RegExpNormalChar] 3
# 57| [RegExpNormalChar] 3
# 54| [RegExpNormalChar] }
# 57| [RegExpNormalChar] }
# 55| [RegExpNamedCharacterProperty] [[:alpha:]]
# 58| [RegExpCharacterClass] [a-f\p{Digit}]
#-----| 0 -> [RegExpCharacterRange] a-f
#-----| 1 -> [RegExpNamedCharacterProperty] \p{Digit}
# 55| [RegExpSequence] [[:alpha:]][[:digit:]]+
#-----| 0 -> [RegExpNamedCharacterProperty] [[:alpha:]]
#-----| 1 -> [RegExpPlus] [[:digit:]]+
# 58| [RegExpPlus] [a-f\p{Digit}]+
#-----| 0 -> [RegExpCharacterClass] [a-f\p{Digit}]
# 55| [RegExpNamedCharacterProperty] [[:digit:]]
# 58| [RegExpCharacterRange] a-f
#-----| 0 -> [RegExpConstant, RegExpNormalChar] a
#-----| 1 -> [RegExpConstant, RegExpNormalChar] f
# 55| [RegExpPlus] [[:digit:]]+
#-----| 0 -> [RegExpNamedCharacterProperty] [[:digit:]]
# 58| [RegExpConstant, RegExpNormalChar] a
# 58| [RegExpConstant, RegExpNormalChar] f
# 58| [RegExpNamedCharacterProperty] \p{Digit}
# 61| [RegExpCharacterClass] [[:alpha:]]
#-----| 0 -> [RegExpNamedCharacterProperty] [:alpha:]
# 61| [RegExpSequence] [[:alpha:]][[:digit:]]
#-----| 0 -> [RegExpCharacterClass] [[:alpha:]]
#-----| 1 -> [RegExpCharacterClass] [[:digit:]]
# 61| [RegExpNamedCharacterProperty] [:alpha:]
# 61| [RegExpCharacterClass] [[:digit:]]
#-----| 0 -> [RegExpNamedCharacterProperty] [:digit:]
# 61| [RegExpNamedCharacterProperty] [:digit:]
# 64| [RegExpCharacterClass] [[:alpha:][:digit:]]
#-----| 0 -> [RegExpNamedCharacterProperty] [:alpha:]
#-----| 1 -> [RegExpNamedCharacterProperty] [:digit:]
# 64| [RegExpNamedCharacterProperty] [:alpha:]
# 64| [RegExpNamedCharacterProperty] [:digit:]
# 67| [RegExpCharacterClass] [A-F[:digit:]a-f]
#-----| 0 -> [RegExpCharacterRange] A-F
#-----| 1 -> [RegExpNamedCharacterProperty] [:digit:]
#-----| 2 -> [RegExpCharacterRange] a-f
# 67| [RegExpCharacterRange] A-F
#-----| 0 -> [RegExpConstant, RegExpNormalChar] A
#-----| 1 -> [RegExpConstant, RegExpNormalChar] F
# 67| [RegExpConstant, RegExpNormalChar] A
# 67| [RegExpConstant, RegExpNormalChar] F
# 67| [RegExpNamedCharacterProperty] [:digit:]
# 67| [RegExpCharacterRange] a-f
#-----| 0 -> [RegExpConstant, RegExpNormalChar] a
#-----| 1 -> [RegExpConstant, RegExpNormalChar] f
# 67| [RegExpConstant, RegExpNormalChar] a
# 67| [RegExpConstant, RegExpNormalChar] f
# 70| [RegExpNamedCharacterProperty] [:digit:]

View File

@@ -25,6 +25,9 @@
/[^]]/ # MRI gives a warning, but accepts this as matching anything except ']'
/[^-]/
# Nested character classes
/[[a-f]A-F]/ # BAD - not parsed correctly
# Meta-character classes
/.*/
/.*/m
@@ -48,8 +51,20 @@
/(a+)b+\1/
/(?<qux>q+)\s+\k<qux>+/
# Named character properties
# Named character properties using the p-style syntax
/\p{Word}*/
/\P{Digit}+/
/\p{^Alnum}{2,3}/
/[[:alpha:]][[:digit:]]+/
/[a-f\p{Digit}]+/ # Also valid inside character classes
# Two separate character classes, each containing a single POSIX bracket expression
/[[:alpha:]][[:digit:]]/
# A single character class containing two POSIX bracket expressions
/[[:alpha:][:digit:]]/
# A single character class containing two ranges and one POSIX bracket expression
/[A-F[:digit:]a-f]/
# *Not* a POSIX bracket expression; just a regular character class.
/[:digit:]/