add handling for [character class] type (but note that we don't parse a character class yet)

This commit is contained in:
2025-10-07 08:51:46 +01:00
parent 56658a4a70
commit d6142285da

View File

@@ -2,13 +2,14 @@ structure DfaGen =
struct struct
datatype regex = datatype regex =
CHAR_LITERAL of {char: char, position: int} CHAR_LITERAL of {char: char, position: int}
| WILDCARD of int
| IS_ANY_CHARACTER of {chars: char vector, position: int}
| CONCAT of {l: regex, r: regex, leftMaxState: int, rightMaxState: int} | CONCAT of {l: regex, r: regex, leftMaxState: int, rightMaxState: int}
| ALTERNATION of {l: regex, r: regex, leftMaxState: int, rightMaxState: int} | ALTERNATION of {l: regex, r: regex, leftMaxState: int, rightMaxState: int}
| ZERO_OR_ONE of regex | ZERO_OR_ONE of regex
| ZERO_OR_MORE of regex | ZERO_OR_MORE of regex
| ONE_OR_MORE of regex | ONE_OR_MORE of regex
| GROUP of regex | GROUP of regex
| WILDCARD of int
structure Set = structure Set =
struct struct
@@ -274,6 +275,7 @@ struct
case tree of case tree of
CHAR_LITERAL _ => false CHAR_LITERAL _ => false
| WILDCARD _ => false | WILDCARD _ => false
| IS_ANY_CHARACTER _ => false
| CONCAT {l, r, ...} => isNullable l andalso isNullable r | CONCAT {l, r, ...} => isNullable l andalso isNullable r
| ALTERNATION {l, r, ...} => isNullable l orelse isNullable r | ALTERNATION {l, r, ...} => isNullable l orelse isNullable r
@@ -287,6 +289,7 @@ struct
fun firstpos (tree, acc) = fun firstpos (tree, acc) =
case tree of case tree of
CHAR_LITERAL {position, ...} => position :: acc CHAR_LITERAL {position, ...} => position :: acc
| IS_ANY_CHARACTER {position, ...} => position :: acc
| WILDCARD i => i :: acc | WILDCARD i => i :: acc
| CONCAT {l, r, ...} => | CONCAT {l, r, ...} =>
@@ -309,6 +312,7 @@ struct
fun lastpos (tree, acc) = fun lastpos (tree, acc) =
case tree of case tree of
CHAR_LITERAL {position, ...} => position :: acc CHAR_LITERAL {position, ...} => position :: acc
| IS_ANY_CHARACTER {position, ...} => position :: acc
| WILDCARD i => i :: acc | WILDCARD i => i :: acc
| CONCAT {l, r, ...} => | CONCAT {l, r, ...} =>
@@ -335,6 +339,13 @@ struct
| ONE_OR_MORE r => firstpos (r, acc) | ONE_OR_MORE r => firstpos (r, acc)
| _ => acc | _ => acc
(* Linear membership test over a vector.
 * Returns true when 'curChr' equals the element of 'vec' at index 'idx'
 * or at any later index, and false once the scan reaches the end of 'vec'.
 * The scan stops at the first match. *)
fun chrExistsInVec (idx, vec, curChr) =
  let
    (* The vector is immutable, so its length is read once up front. *)
    val len = Vector.length vec

    fun scan i =
      if i = len then
        false
      else if Vector.sub (vec, i) = curChr then
        true
      else
        scan (i + 1)
  in
    scan idx
  end
(* Does two things: (* Does two things:
* 1. Descends to the leaf matching 'pos'. * 1. Descends to the leaf matching 'pos'.
* 2. If the character at 'pos' matches the current character, * 2. If the character at 'pos' matches the current character,
@@ -353,6 +364,10 @@ struct
* So we don't want to match it, but the wildcard can match * So we don't want to match it, but the wildcard can match
* any other character that has a different ASCII code. *) * any other character that has a different ASCII code. *)
{sawConcat = false, follows = [], charIsMatch = curChr <> #"\^@"} {sawConcat = false, follows = [], charIsMatch = curChr <> #"\^@"}
| IS_ANY_CHARACTER {chars, ...} =>
let val chrExists = chrExistsInVec (0, chars, curChr)
in {sawConcat = false, follows = [], charIsMatch = chrExists}
end
| ALTERNATION {l, r, leftMaxState, rightMaxState} => | ALTERNATION {l, r, leftMaxState, rightMaxState} =>
let val nodeToFollow = if pos <= leftMaxState then l else r let val nodeToFollow = if pos <= leftMaxState then l else r
in getFollowsForPositionAndChar (nodeToFollow, pos, curChr) in getFollowsForPositionAndChar (nodeToFollow, pos, curChr)