Add support for negated character classes ([^...]) to the regex AST and matcher, although the parser does not produce them yet

This commit is contained in:
2025-10-07 08:57:01 +01:00
parent d6142285da
commit 8eed2ef51a

View File

@@ -4,6 +4,7 @@ struct
CHAR_LITERAL of {char: char, position: int} CHAR_LITERAL of {char: char, position: int}
| WILDCARD of int | WILDCARD of int
| IS_ANY_CHARACTER of {chars: char vector, position: int} | IS_ANY_CHARACTER of {chars: char vector, position: int}
| NOT_ANY_CHARACTER of {chars: char vector, position: int}
| CONCAT of {l: regex, r: regex, leftMaxState: int, rightMaxState: int} | CONCAT of {l: regex, r: regex, leftMaxState: int, rightMaxState: int}
| ALTERNATION of {l: regex, r: regex, leftMaxState: int, rightMaxState: int} | ALTERNATION of {l: regex, r: regex, leftMaxState: int, rightMaxState: int}
| ZERO_OR_ONE of regex | ZERO_OR_ONE of regex
@@ -11,6 +12,8 @@ struct
| ONE_OR_MORE of regex | ONE_OR_MORE of regex
| GROUP of regex | GROUP of regex
val endMarker = #"\^@"
structure Set = structure Set =
struct struct
datatype 'a set = BRANCH of 'a set * int * 'a * 'a set | LEAF datatype 'a set = BRANCH of 'a set * int * 'a * 'a set | LEAF
@@ -276,6 +279,7 @@ struct
CHAR_LITERAL _ => false CHAR_LITERAL _ => false
| WILDCARD _ => false | WILDCARD _ => false
| IS_ANY_CHARACTER _ => false | IS_ANY_CHARACTER _ => false
| NOT_ANY_CHARACTER _ => false
| CONCAT {l, r, ...} => isNullable l andalso isNullable r | CONCAT {l, r, ...} => isNullable l andalso isNullable r
| ALTERNATION {l, r, ...} => isNullable l orelse isNullable r | ALTERNATION {l, r, ...} => isNullable l orelse isNullable r
@@ -290,6 +294,7 @@ struct
case tree of case tree of
CHAR_LITERAL {position, ...} => position :: acc CHAR_LITERAL {position, ...} => position :: acc
| IS_ANY_CHARACTER {position, ...} => position :: acc | IS_ANY_CHARACTER {position, ...} => position :: acc
| NOT_ANY_CHARACTER {position, ...} => position :: acc
| WILDCARD i => i :: acc | WILDCARD i => i :: acc
| CONCAT {l, r, ...} => | CONCAT {l, r, ...} =>
@@ -313,6 +318,7 @@ struct
case tree of case tree of
CHAR_LITERAL {position, ...} => position :: acc CHAR_LITERAL {position, ...} => position :: acc
| IS_ANY_CHARACTER {position, ...} => position :: acc | IS_ANY_CHARACTER {position, ...} => position :: acc
| NOT_ANY_CHARACTER {position, ...} => position :: acc
| WILDCARD i => i :: acc | WILDCARD i => i :: acc
| CONCAT {l, r, ...} => | CONCAT {l, r, ...} =>
@@ -363,11 +369,18 @@ struct
* as an end marker which will not appear anywhere else. * as an end marker which will not appear anywhere else.
* So we don't want to match it, but the wildcard can match * So we don't want to match it, but the wildcard can match
* any other character that has a different ASCII code. *) * any other character that has a different ASCII code. *)
{sawConcat = false, follows = [], charIsMatch = curChr <> #"\^@"} {sawConcat = false, follows = [], charIsMatch = curChr <> endMarker}
| IS_ANY_CHARACTER {chars, ...} => | IS_ANY_CHARACTER {chars, ...} =>
let val chrExists = chrExistsInVec (0, chars, curChr) let val chrExists = chrExistsInVec (0, chars, curChr)
in {sawConcat = false, follows = [], charIsMatch = chrExists} in {sawConcat = false, follows = [], charIsMatch = chrExists}
end end
| NOT_ANY_CHARACTER {chars, ...} =>
let
val charIsValid = chrExistsInVec (0, chars, curChr)
val charIsValid = not charIsValid andalso curChr <> endMarker
in
{sawConcat = false, follows = [], charIsMatch = charIsValid}
end
| ALTERNATION {l, r, leftMaxState, rightMaxState} => | ALTERNATION {l, r, leftMaxState, rightMaxState} =>
let val nodeToFollow = if pos <= leftMaxState then l else r let val nodeToFollow = if pos <= leftMaxState then l else r
in getFollowsForPositionAndChar (nodeToFollow, pos, curChr) in getFollowsForPositionAndChar (nodeToFollow, pos, curChr)
@@ -578,7 +591,8 @@ struct
case ParseDfa.parse (str, 0) of case ParseDfa.parse (str, 0) of
SOME (ast, numStates) => SOME (ast, numStates) =>
let let
val endMarker = CHAR_LITERAL {char = #"\^@", position = numStates + 1} val endMarker =
CHAR_LITERAL {char = endMarker, position = numStates + 1}
val ast = CONCAT val ast = CONCAT
{ l = ast { l = ast
, leftMaxState = numStates , leftMaxState = numStates