From a44afca40b8f61c912fb32a1ef35581ece599501 Mon Sep 17 00:00:00 2001 From: Humza Shahid Date: Fri, 10 Oct 2025 11:54:34 +0100 Subject: [PATCH] checkpoint for reimplementing dfa-gen.sml --- fcore/search-list/dfa-gen.sml | 329 +++++++++++++++++++++------------- 1 file changed, 200 insertions(+), 129 deletions(-) diff --git a/fcore/search-list/dfa-gen.sml b/fcore/search-list/dfa-gen.sml index 0a6ace6..39082c3 100644 --- a/fcore/search-list/dfa-gen.sml +++ b/fcore/search-list/dfa-gen.sml @@ -94,6 +94,11 @@ struct struct datatype 'a set = BRANCH of 'a set * int * 'a * 'a set | LEAF + fun isEmpty set = + case set of + BRANCH _ => false + | LEAF => true + fun insertOrReplace (newKey, newVal, tree) = case tree of BRANCH (l, curKey, curVal, r) => @@ -690,130 +695,6 @@ struct end | [] => Set.keysToList followSet - fun appendIfNew (pos, dstates, newStates) = - if pos = Vector.length dstates then - let - val record = {transitions = newStates, marked = false} - val dstates = Vector.concat [dstates, Vector.fromList [record]] - in - (pos, dstates) - end - else - let - val {transitions: int list, marked = _} = Vector.sub (dstates, pos) - in - if transitions = newStates then (pos, dstates) - else appendIfNew (pos + 1, dstates, newStates) - end - - fun getUnmarkedTransitionsIfExists (pos, dstates) = - if pos = Vector.length dstates then - NONE - else - let - val record = Vector.sub (dstates, pos) - in - if #marked record then - getUnmarkedTransitionsIfExists (pos + 1, dstates) - else - SOME (pos, #transitions record) - end - - (* the int key in dtran refers to the char code - * while the int value refers to the idx from dstates - * that this char transitions to *) - type dtran = int Set.set - - fun makeEmptyVec _ = ~1 - - fun convertChar - ( char - , regex - , dstates - , dtran: dtran vector - , curStates - , curStatesIdx - , setForCurStates - ) = - if char < 0 then - let - (* append setForCurStates which was accumulated in this function - * to the end of dtran. *) - val dtran = Vector.concat [dtran, Vector.fromList [setForCurStates]] - in - (dstates, dtran) - end - else - let - (* get union of all follow positions *) - val u = - getFollowPositionsFromList - (curStates, regex, Char.chr char, Set.LEAF) - in - case u of - [] => - (* no follow positions from here, so don't add to dstates *) - convertChar - ( char - 1 - , regex - , dstates - , dtran - , curStates - , curStatesIdx - , setForCurStates - ) - | _ => - let - (* add follow positions to dstates if they are not already inside - * and if follow is not empty *) - val (newStateIdx, dstates) = appendIfNew (0, dstates, u) - - (* update dtran to include transitions for char. *) - val setForCurStates = - Set.insertOrReplace (char, newStateIdx, setForCurStates) - in - convertChar - ( char - 1 - , regex - , dstates - , dtran - , curStates - , curStatesIdx - , setForCurStates - ) - end - end - - fun convertLoop (regex, dstates, dtran) = - case getUnmarkedTransitionsIfExists (0, dstates) of - SOME (unmarkedIdx, unamarkedTransition) => - let - (* mark transition *) - val dstates = - let - val newMark = {marked = true, transitions = unamarkedTransition} - in - Vector.update (dstates, unmarkedIdx, newMark) - end - - val (dstates, dtran) = convertChar - ( 255 - , regex - , dstates - , dtran - , unamarkedTransition - , unmarkedIdx - , Set.LEAF - ) - in - convertLoop (regex, dstates, dtran) - end - | NONE => - Vector.map - (fn set => - Vector.tabulate (256, fn i => Set.getOrDefault (i, set, ~1))) - dtran - fun addKeysToFollowSet (lst, addSet, followSet) = case lst of hd :: tl => @@ -867,13 +748,203 @@ struct | ZERO_OR_ONE child => addToFollowSet (child, followSet) | GROUP child => addToFollowSet (child, followSet) - fun convert (regex, numStates) = + + fun appendIfNew (pos, dstates, newStates) = + if pos = Vector.length dstates then + let + val record = {transitions = newStates, marked = false} + val dstates = Vector.concat [dstates, Vector.fromList [record]] + in + (pos, dstates) + end + else + let + val {transitions: int list, marked = _} = Vector.sub (dstates, pos) + in + if transitions = newStates then (pos, dstates) + else appendIfNew (pos + 1, dstates, newStates) + end + + fun getUnmarkedTransitionsIfExists (pos, dstates) = + if pos = Vector.length dstates then + NONE + else + let + val record = Vector.sub (dstates, pos) + in + if #marked record then + getUnmarkedTransitionsIfExists (pos + 1, dstates) + else + SOME (pos, #transitions record) + end + + (* the int key in dtran refers to the char code + * while the int value refers to the idx from dstates + * that this char transitions to *) + type dtran = int Set.set + + fun isCharMatch (regex, pos, curChr) = + case regex of + CHAR_LITERAL {char, ...} => Fn.charIsEqual (char, curChr) + | WILDCARD _ => Fn.charIsNotEqual (curChr, Fn.endMarker) + | IS_ANY_CHARACTER {chars, ...} => chrExistsInVec (0, chars, curChr) + | NOT_ANY_CHARACTER {chars, ...} => + let val charIsValid = chrExistsInVec (0, chars, curChr) + in not charIsValid andalso Fn.charIsNotEqual (curChr, Fn.endMarker) + end + | ALTERNATION {l, r, leftMaxState, ...} => + if pos > leftMaxState then isCharMatch (r, pos, curChr) + else isCharMatch (l, pos, curChr) + | CONCAT {l, r, leftMaxState, ...} => + if pos > leftMaxState then isCharMatch (r, pos, curChr) + else isCharMatch (l, pos, curChr) + | ZERO_OR_ONE child => isCharMatch (child, pos, curChr) + | ZERO_OR_MORE child => isCharMatch (child, pos, curChr) + | ONE_OR_MORE child => isCharMatch (child, pos, curChr) + | GROUP child => isCharMatch (child, pos, curChr) + + fun positionsThatCorrespondToChar (char, curStates, followsForChar, regex) = + case curStates of + [] => Set.keysToList followsForChar + | pos :: tl => + let + val followsForChar = + if isCharMatch (regex, pos, Char.chr char) then + Set.insertOrReplace (pos, (), followsForChar) + else + followsForChar + in + positionsThatCorrespondToChar (char, tl, followsForChar, regex) + end + + fun convertChar + ( char + , regex + , dstates + , dtran: dtran vector + , curStates + , curStatesIdx + , setForCurStates + , followSet + , followPositionsForAllChars + ) = + if char < 0 then + let + (* append setForCurStates which was accumulated in this function + * to the end of dtran. *) + val dtran = Vector.concat [dtran, Vector.fromList [setForCurStates]] + in + (dstates, dtran) + end + else + let + (* get union of all follow positions that match char *) + val followsForCurrentChr = + positionsThatCorrespondToChar + (char, followPositionsForAllChars, Set.LEAF, regex) + in + case followsForCurrentChr of + [] => + (* no follow positions from here, so don't add to dstates *) + convertChar + ( char - 1 + , regex + , dstates + , dtran + , curStates + , curStatesIdx + , setForCurStates + , followSet + , followPositionsForAllChars + ) + | _ => + let + (* add follow positions to dstates if they are not already inside + * and if follow is not empty *) + val (newStateIdx, dstates) = + appendIfNew (0, dstates, followsForCurrentChr) + + (* update dtran to include transitions for char. *) + val setForCurStates = + Set.insertOrReplace (char, newStateIdx, setForCurStates) + in + convertChar + ( char - 1 + , regex + , dstates + , dtran + , curStates + , curStatesIdx + , setForCurStates + , followSet + , followPositionsForAllChars + ) + end + end + + fun getFollowsForUnmarked (unmarked, followsForUnmarked, followSet) = + case unmarked of + [] => List.concat followsForUnmarked + | hd :: tl => + let + val followForHd = Set.getOrDefault (hd, followSet, []) + in + case followForHd of + [] => getFollowsForUnmarked (tl, followsForUnmarked, followSet) + | _ => + let val followsForUnmarked = followForHd :: followsForUnmarked + in getFollowsForUnmarked (tl, followsForUnmarked, followSet) + end + end + + fun convertLoop (regex, dstates, dtran, followSet) = + case getUnmarkedTransitionsIfExists (0, dstates) of + SOME (unmarkedIdx, unamarkedTransition) => + let + (* mark transition *) + val dstates = + let + val newMark = {marked = true, transitions = unamarkedTransition} + in + Vector.update (dstates, unmarkedIdx, newMark) + end + + (* get follow positions for all chars *) + val followPositionsForAllChars = + getFollowsForUnmarked (unamarkedTransition, [], followSet) + + val (dstates, dtran) = convertChar + ( 255 + , regex + , dstates + , dtran + , unamarkedTransition + , unmarkedIdx + , Set.LEAF + , followSet + , followPositionsForAllChars + ) + in + convertLoop (regex, dstates, dtran, followSet) + end + | NONE => + Vector.map + (fn set => + Vector.tabulate (256, fn i => Set.getOrDefault (i, set, ~1))) + dtran + + fun convert regex = let - val fs = addToFollowSet (regex, Set.LEAF) - val first = List.rev (firstpos (regex, [])) + val followSet = addToFollowSet (regex, Set.LEAF) + + (* get firstpos, sorted *) + val first = firstpos (regex, []) + val first = Set.addFromList (first, Set.LEAF) + val first = Set.keysToList first + val dstates = Vector.fromList [{transitions = first, marked = false}] in - convertLoop (regex, dstates, Vector.fromList []) + convertLoop (regex, dstates, Vector.fromList [], followSet) end end @@ -894,7 +965,7 @@ struct , lastpos = [] } in - ToDfa.convert (ast, numStates + 1) + ToDfa.convert ast end | NONE => Vector.fromList []