change dfa-gen to a functor, and use functor to instantiate different structures

This commit is contained in:
2025-10-07 14:05:45 +01:00
parent 075fec02be
commit c62e234d00

View File

@@ -1,18 +1,36 @@
structure DfaGen = signature DFA_GEN_PARAMS =
sig
val endMarker: char
val charIsEqual: char * char -> bool
end
signature DFA_GEN =
sig
type dfa = int vector vector
type dfa_state = int
val fromString: string -> dfa
val nextState: dfa * dfa_state * char -> dfa_state
val isFinal: dfa * dfa_state -> bool
val isDead: dfa_state -> bool
end
functor MakeDfaGen(Fn: DFA_GEN_PARAMS): DFA_GEN =
struct struct
datatype regex = datatype parse_tree =
CHAR_LITERAL of {char: char, position: int} CHAR_LITERAL of {char: char, position: int}
| WILDCARD of int | WILDCARD of int
| IS_ANY_CHARACTER of {chars: char vector, position: int} | IS_ANY_CHARACTER of {chars: char vector, position: int}
| NOT_ANY_CHARACTER of {chars: char vector, position: int} | NOT_ANY_CHARACTER of {chars: char vector, position: int}
| CONCAT of {l: regex, r: regex, leftMaxState: int, rightMaxState: int} | CONCAT of
| ALTERNATION of {l: regex, r: regex, leftMaxState: int, rightMaxState: int} {l: parse_tree, r: parse_tree, leftMaxState: int, rightMaxState: int}
| ZERO_OR_ONE of regex | ALTERNATION of
| ZERO_OR_MORE of regex {l: parse_tree, r: parse_tree, leftMaxState: int, rightMaxState: int}
| ONE_OR_MORE of regex | ZERO_OR_ONE of parse_tree
| GROUP of regex | ZERO_OR_MORE of parse_tree
| ONE_OR_MORE of parse_tree
val endMarker = #"\^@" | GROUP of parse_tree
structure Set = structure Set =
struct struct
@@ -346,7 +364,7 @@ struct
in SOME (pos + 1, chr, stateNum + 1) in SOME (pos + 1, chr, stateNum + 1)
end end
and climb (pos, str, lhs, level, stateNum) : (int * regex * int) option = and climb (pos, str, lhs, level, stateNum) : (int * parse_tree * int) option =
if pos = String.size str then if pos = String.size str then
SOME (pos, lhs, stateNum) SOME (pos, lhs, stateNum)
else else
@@ -511,35 +529,50 @@ struct
if idx = Vector.length vec then if idx = Vector.length vec then
false false
else else
Vector.sub (vec, idx) = curChr let
orelse chrExistsInVec (idx + 1, vec, curChr) val idxChr = Vector.sub (vec, idx)
in
Fn.charIsEqual (idxChr, curChr)
orelse chrExistsInVec (idx + 1, vec, curChr)
end
(* Does two things: (* Does two things:
* 1. Descends to the leaf matching 'pos'. * 1. Descends to the leaf matching 'pos'.
* 2. If the character at 'pos' matches the current character, * 2. If the character at 'pos' matches the current character,
* calls followpos at the appropriate nodes. * calls followpos at the appropriate nodes.
* In the end, we get a list of positions to follow. *) * In the end, we get a list of positions to follow.
fun getFollowsForPositionAndChar (regex: regex, pos, curChr) = * Note: The character #"\^@" is an endmarker
* indicating that this is the final state.
* We say that there is no match,
* even if the curChr is the endmarker. *)
fun getFollowsForPositionAndChar (regex: parse_tree, pos, curChr) =
case regex of case regex of
CHAR_LITERAL {char, position = _} => CHAR_LITERAL {char, position = _} =>
if char = curChr then let
{sawConcat = false, follows = [], charIsMatch = true} val charIsMatch =
else Fn.charIsEqual (char, curChr)
{sawConcat = false, follows = [], charIsMatch = false} andalso not (Fn.charIsEqual (curChr, Fn.endMarker))
in
{sawConcat = false, follows = [], charIsMatch = charIsMatch}
end
| WILDCARD _ => | WILDCARD _ =>
(* we are treating a char that has ASCII code 0 let val charIsMatch = not (Fn.charIsEqual (curChr, Fn.endMarker))
* as an end marker which will not appear anywhere else. in {sawConcat = false, follows = [], charIsMatch = charIsMatch}
* So we don't want to match it, but the wildcard can match end
* any other character that has a different ASCII code. *)
{sawConcat = false, follows = [], charIsMatch = curChr <> endMarker}
| IS_ANY_CHARACTER {chars, ...} => | IS_ANY_CHARACTER {chars, ...} =>
let val chrExists = chrExistsInVec (0, chars, curChr) let
in {sawConcat = false, follows = [], charIsMatch = chrExists} val chrExists = chrExistsInVec (0, chars, curChr)
val chrExists =
chrExists andalso not (Fn.charIsEqual (curChr, Fn.endMarker))
in
{sawConcat = false, follows = [], charIsMatch = chrExists}
end end
| NOT_ANY_CHARACTER {chars, ...} => | NOT_ANY_CHARACTER {chars, ...} =>
let let
val charIsValid = chrExistsInVec (0, chars, curChr) val charIsValid = chrExistsInVec (0, chars, curChr)
val charIsValid = not charIsValid andalso curChr <> endMarker val charIsValid =
charIsValid andalso Fn.charIsEqual (curChr, Fn.endMarker)
val charIsValid = not charIsValid
in in
{sawConcat = false, follows = [], charIsMatch = charIsValid} {sawConcat = false, follows = [], charIsMatch = charIsValid}
end end
@@ -754,7 +787,7 @@ struct
SOME (ast, numStates) => SOME (ast, numStates) =>
let let
val endMarker = val endMarker =
CHAR_LITERAL {char = endMarker, position = numStates + 1} CHAR_LITERAL {char = Fn.endMarker, position = numStates + 1}
val ast = CONCAT val ast = CONCAT
{ l = ast { l = ast
, leftMaxState = numStates , leftMaxState = numStates
@@ -767,18 +800,33 @@ struct
| NONE => Vector.fromList [] | NONE => Vector.fromList []
type dfa = int vector vector type dfa = int vector vector
type dfa_state = int
fun nextState (dfa: dfa, curState, chr) = fun nextState (dfa: dfa, curState: dfa_state, chr) =
let val curTable = Vector.sub (dfa, curState) let val curTable = Vector.sub (dfa, curState)
in Vector.sub (curTable, Char.ord chr) in Vector.sub (curTable, Char.ord chr)
end end
fun isFinal (dfa: dfa, curState) = fun isFinal (dfa: dfa, curState: dfa_state) =
curState <> ~1 curState <> ~1
andalso andalso
let val curTable = Vector.sub (dfa, curState) let val curTable = Vector.sub (dfa, curState)
in Vector.sub (curTable, 0) <> ~1 in Vector.sub (curTable, 0) <> ~1
end end
fun isDead curState = curState = ~1 fun isDead (curState: dfa_state) = curState = ~1
end end
structure CaseInsensitiveDfa =
MakeDfaGen
(struct
val endMarker = #"\^@"
fun charIsEqual (a: char, b: char) = a = b
end)
structure CaseSensitiveDfa =
MakeDfaGen
(struct
val endMarker = #"\^@"
fun charIsEqual (a: char, b: char) = Char.toLower a = Char.toLower b
end)