add ternary string set file, and code an insert function for it (plus datatype definitions, etc.)

This commit is contained in:
2025-02-23 10:44:39 +00:00
parent 0dda4dc974
commit d056e08ce7
2 changed files with 202 additions and 523 deletions

202
src/ternary-string-set.sml Normal file
View File

@@ -0,0 +1,202 @@
(* An attempt at implementing terneary search trees,
* except that each node contains a full string instead of a char *)
structure TernaryStringSet =
struct
datatype t =
NODE of {left: t, key: string, follow: t, right: t}
(* follow is if insString contains nodeString *)
| FOUND_NODE of {left: t, key: string, follow: t, right: t}
| LEAF of string
| EMPTY
val empty = EMPTY
fun isEmpty t = t = EMPTY
fun fromString str =
if String.size str > 0 then LEAF str else EMPTY
val nodeStringContainsInsString = ~1
val insStringContainsNodeString = ~2
fun getBreakPos (pos, insString, nodeString) =
if pos = String.size insString andalso pos = String.size nodeString then
pos
else if pos = String.size insString then
(* if nodeString contains insString, return ~1 *)
nodeStringContainsInsString
else if pos = String.size nodeString then
(* if insString contains nodeString, return ~2 *)
insStringContainsNodeString
else
let
val insChr = String.sub (insString, pos)
val nodeChr = String.sub (nodeString, pos)
in
if insChr = nodeChr then
(* continue *)
getBreakPos (pos + 1, insString, nodeString)
else
(* return break position *)
pos
end
fun helpIns (pos, insString, t) =
case t of
NODE {left, key = nodeString, follow, right} =>
let
val breakPos = getBreakPos (pos, insString, nodeString)
in
if breakPos = String.size insString then
(* change node tag *)
FOUND_NODE
{left = left, key = nodeString, follow = follow, right = right}
else if breakPos = nodeStringContainsInsString then
(* add new node at current position,
* making current node a child of the new node *)
FOUND_NODE
{left = EMPTY, key = insString, follow = t, right = EMPTY}
else if breakPos = insStringContainsNodeString then
(* follow *)
let
val follow = helpIns (breakPos, insString, follow)
in
NODE
{left = left, key = nodeString, follow = follow, right = right}
end
else
(* we have a difference: do we want to go left or right? *)
let
val insChr = String.sub (insString, breakPos)
val nodeChr = String.sub (nodeString, breakPos)
in
if insChr < nodeChr then
let
val left = helpIns (breakPos, insString, left)
in
NODE
{ left = left
, key = nodeString
, follow = follow
, right = right
}
end
else
let
val right = helpIns (breakPos, insString, right)
in
NODE
{ left = left
, key = nodeString
, follow = follow
, right = right
}
end
end
end
| FOUND_NODE {left, key = nodeString, follow, right} =>
let
val breakPos = getBreakPos (pos, insString, nodeString)
in
if breakPos = String.size insString then
(* return original tree as nothing to do, since key already exists *)
t
else if breakPos = nodeStringContainsInsString then
(* add new node at current position,
* making current node a child of the new node *)
FOUND_NODE
{left = EMPTY, key = insString, follow = t, right = EMPTY}
else if breakPos = insStringContainsNodeString then
(* follow *)
let
val follow = helpIns (breakPos, insString, follow)
in
FOUND_NODE
{left = left, key = nodeString, follow = follow, right = right}
end
else
(* we have a difference: do we want to go left or right? *)
let
val insChr = String.sub (insString, breakPos)
val nodeChr = String.sub (nodeString, breakPos)
in
if insChr < nodeChr then
let
val left = helpIns (breakPos, insString, left)
in
FOUND_NODE
{ left = left
, key = nodeString
, follow = follow
, right = right
}
end
else
let
val right = helpIns (breakPos, insString, right)
in
FOUND_NODE
{ left = left
, key = nodeString
, follow = follow
, right = right
}
end
end
end
| EMPTY => LEAF insString
| LEAF nodeString =>
let
val breakPos = getBreakPos (pos, insString, nodeString)
in
if breakPos = String.size insString then
(* no change as user tried to insert string that already exists *)
t
else if breakPos = nodeStringContainsInsString then
(* add new node at current position,
* making current node a child of the new node *)
FOUND_NODE
{left = EMPTY, key = insString, follow = t, right = EMPTY}
else if breakPos = insStringContainsNodeString then
(* transform this node to a FOUND_NODE,
* and add a new LEAF at follow position *)
let
val newLeaf = LEAF insString
in
FOUND_NODE
{ key = nodeString
, follow = newLeaf
, left = EMPTY
, right = EMPTY
}
end
else
(* we have a difference: break string and add a new NODE.
* The way we insert has implications for searching.
* Search should descend down on all three children
* if searchKey contains nodeKey.
* *)
let
val breakString = String.substring (insString, 0, breakPos)
val insChr = String.sub (insString, breakPos)
val nodeChr = String.sub (nodeString, breakPos)
in
if insChr < nodeChr then
(* insString on left *)
NODE
{ left = LEAF insString
, right = t
, key = breakString
, follow = EMPTY
}
else
(* insString on right *)
NODE
{ left = t
, right = LEAF insString
, key = breakString
, follow = EMPTY
}
end
end
end

View File

@@ -1,523 +0,0 @@
signature STRING_SET =
sig
(* the type of tries *)
type t
(* the empty trie *)
val empty: t
(* StringSet.isEmpty trie
* returns true if the trie is empty *)
val isEmpty: t -> bool
(* StringSet.fromString "hello world"
* creates a trie containing just a string *)
val fromString: string -> t
(* StringSet.exists ("hello world", trie)
* returns true if the key was inserted into the trie *)
val exists: string * t -> bool
(* StringSet.insert ("myNewString", trie)
* inserts a new string into the trie, returning a new trie *)
val insert: string * t -> t
(* StringSet.remove ("stringToRemove", trie)
* removes the key from the trie, returning a new trie *)
val remove: string * t -> t
(* StringSet.getPrefixList ("myPrefix", trie)
* returns a list of all keys matching the specified prefix *)
val getPrefixList: string * t -> string list
(* StringSet.toList trie
* returns a list containing all keys in the trie *)
val toList: t -> string list
(* StringSet.fromList ["hello", "world"]
* returns a trie containing all keys in the string list *)
val fromList: string list -> t
(* StringSet.foldl (fn (key, acc) => String.size key + acc) 0 trie
* folds a value through the trie, from lowest to highest. *)
val foldl: (string * 'b -> 'b) -> 'b -> t -> 'b
(* StringSet.foldlWithPrefix (fn (key, acc) => String.size key + acc) 0 trie "myPrefix"
* folds a value through a subset of the trie containing the specified prefix,
* from lowest to highest. *)
val foldlWithPrefix: (string * 'b -> 'b) -> 'b -> t -> string -> 'b
(* StringSet.foldr (fn (key, acc) => String.size key + acc) 0 trie
* folds a value through the trie, from highest to lowest. *)
val foldr: (string * 'b -> 'b) -> 'b -> t -> 'b
(* StringSet.foldrWithPrefix (fn (key, acc) => String.size key + acc) 0 trie "myPrefix"
* folds a value through a subset of the trie containing the specified prefix,
* from highest to lowest. *)
val foldrWithPrefix: (string * 'b -> 'b) -> 'b -> t -> string -> 'b
end
structure ZipStringSet =
struct
datatype t =
CHILDREN of
{ leftKeys: string vector list
, leftChildren: t vector list
, rightKeys: string vector list
, rightChildren: t vector list
}
| FOUND_WITH_CHILDREN of
{ leftKeys: string vector list
, leftChildren: t vector list
, rightKeys: string vector list
, rightChildren: t vector list
}
| FOUND
val maxSize = 32
val empty = CHILDREN
{leftKeys = [], leftChildren = [], rightKeys = [], rightChildren = []}
fun isEmpty trie =
case trie of
CHILDREN {leftKeys = [], rightKeys = [], ...} => true
| _ => false
fun fromString str =
if String.size str > 0 then
CHILDREN
{ leftKeys = [Vector.fromList [str]]
, leftChildren = [Vector.fromList [FOUND]]
, rightKeys = []
, rightChildren = []
}
else
empty
fun helpBinSearch (findChr, keyPos, children, low, high) =
if high >= low then
let
val mid = low + ((high - low) div 2)
val midStr = Vector.sub (children, mid)
val midChr = String.sub (midStr, keyPos)
in
if midChr = findChr then
SOME mid
else if midChr < findChr then
helpBinSearch (findChr, keyPos, children, mid + 1, high)
else
helpBinSearch (findChr, keyPos, children, low, mid - 1)
end
else
NONE
fun findBinSearch (findChr, keyPos, children) =
helpBinSearch (findChr, keyPos, children, 0, Vector.length children - 1)
datatype search_string_match =
NO_SEARCH_MATCH
| FULL_SEARCH_MATCH
| SEARCH_KEY_CONTAINS_TRIE_KEY
| TRIE_KEY_CONTAINS_SEARCH_KEY
fun searchKeyMatch (searchKey, trieKey, keyPos) =
if
keyPos < String.size searchKey
then
if keyPos < String.size trieKey then
let
val searchChr = String.sub (searchKey, keyPos)
val trieChr = String.sub (trieKey, keyPos)
in
if searchChr = trieChr then
searchKeyMatch (searchKey, trieKey, keyPos + 1)
else
NO_SEARCH_MATCH
end
else
SEARCH_KEY_CONTAINS_TRIE_KEY
else if
keyPos = String.size searchKey
then
if keyPos < String.size trieKey then TRIE_KEY_CONTAINS_SEARCH_KEY
else if keyPos = String.size trieKey then FULL_SEARCH_MATCH
else SEARCH_KEY_CONTAINS_TRIE_KEY
else (* implicit: keyPos > String.size searchKey *) if
keyPos <= String.size trieKey
then
TRIE_KEY_CONTAINS_SEARCH_KEY
else
NO_SEARCH_MATCH
fun isFoundNode node =
case node of
FOUND => true
| FOUND_WITH_CHILDREN _ => true
| CHILDREN _ => false
fun checkNodeExistsMatch (searchKey, trieKey, keyPos, children, idx) =
(case searchKeyMatch (searchKey, trieKey, keyPos + 1) of
NO_SEARCH_MATCH => false
| FULL_SEARCH_MATCH =>
let val trieChild = Vector.sub (children, idx)
in isFoundNode trieChild
end
| SEARCH_KEY_CONTAINS_TRIE_KEY =>
let val trieChild = Vector.sub (children, idx)
in recurseExists (searchKey, String.size trieKey, trieChild)
end
| TRIE_KEY_CONTAINS_SEARCH_KEY => false)
and checkExistsLeft (searchKey, searchChr, keyPos, leftKeys, leftChildren) =
case (leftKeys, leftChildren) of
(khd :: ktl, chd :: ctl) =>
let
val firstNode = Vector.sub (khd, 0)
val firstNodeChr = String.sub (firstNode, keyPos)
in
if firstNodeChr < searchChr then
(* keep checking leftwards *)
checkExistsLeft (searchKey, searchChr, keyPos, ktl, ctl)
else if firstNodeChr = searchChr then
(* check if there is a full/partial key match at this node *)
checkNodeExistsMatch (searchKey, firstNode, keyPos, chd, 0)
else
(* binary search this node to see if there is a matching chr *)
(case findBinSearch (searchChr, keyPos, khd) of
SOME idx =>
checkNodeExistsMatch
(searchKey, Vector.sub (khd, idx), keyPos, chd, idx)
| NONE => false)
end
| (_, _) => false
and checkExistsRight (searchKey, searchChr, keyPos, rightKeys, rightChildren) =
case (rightKeys, rightChildren) of
(khd :: ktl, chd :: ctl) =>
let
val lastNode = Vector.sub (khd, Vector.length khd - 1)
val lastNodeChr = String.sub (lastNode, keyPos)
in
if lastNodeChr > searchChr then
(* keep checking rightwards *)
checkExistsRight (searchKey, searchChr, keyPos, ktl, ctl)
else if lastNodeChr = searchChr then
(* check for full/partial match at this node *)
checkNodeExistsMatch
(searchKey, lastNode, keyPos, chd, Vector.length khd - 1)
else
(* binary search this node to see if there is a matching chr *)
(case findBinSearch (searchChr, keyPos, khd) of
SOME idx =>
checkNodeExistsMatch
(searchKey, Vector.sub (khd, idx), keyPos, chd, idx)
| NONE => false)
end
| (_, _) => false
and decideExistsDirection
(searchKey, keyPos, leftKeys, leftChildren, rightKeys, rightChildren) =
case (leftKeys, leftChildren) of
(khd :: ktl, chd :: ctl) =>
let
val searchChr = String.sub (searchKey, keyPos)
val firstNode = Vector.sub (khd, 0)
val startNodeChr = String.sub (firstNode, keyPos)
in
if searchChr < startNodeChr then
checkExistsLeft (searchKey, searchChr, keyPos, ktl, ctl)
else if searchChr = startNodeChr then
(* check string match on first key/firstNode
* and recurse if full or partial match *)
checkNodeExistsMatch (searchKey, firstNode, keyPos, chd, 0)
else
(* implicit: searchChr > startNodeChr *)
let
val lastNode = Vector.sub (khd, Vector.length khd - 1)
val lastNodeChr = String.sub (lastNode, keyPos)
in
if searchChr > lastNodeChr then
(* check rightKeys/rightChildren *)
checkExistsRight
(searchKey, searchChr, keyPos, rightKeys, rightChildren)
else if searchChr = lastNodeChr then
(* check string match on last key/lastNode
* and recurse if full or partial match *)
checkNodeExistsMatch
(searchKey, lastNode, keyPos, chd, Vector.length khd - 1)
else
(* implicit: searchChr < lastNodeChr
* should perform binary search at this node
* to find if key exists *)
(case findBinSearch (searchChr, keyPos, khd) of
SOME idx =>
checkNodeExistsMatch
(searchKey, Vector.sub (khd, idx), keyPos, chd, idx)
| NONE => false)
end
end
| (_, _) =>
(* leftKeys and leftChildren are both empty
* so check rightKeys/rightChildren *)
checkExistsRight
( searchKey
, String.sub (searchKey, keyPos)
, keyPos
, rightKeys
, rightChildren
)
and recurseExists (searchKey, keyPos, trie) =
case trie of
CHILDREN {leftKeys, leftChildren, rightKeys, rightChildren} =>
decideExistsDirection
(searchKey, keyPos, leftKeys, leftChildren, rightKeys, rightChildren)
| FOUND_WITH_CHILDREN {leftKeys, leftChildren, rightKeys, rightChildren} =>
decideExistsDirection
(searchKey, keyPos, leftKeys, leftChildren, rightKeys, rightChildren)
| FOUND => false
fun exists (searchKey, trie) =
if isEmpty trie orelse String.size searchKey = 0 then false
else recurseExists (searchKey, 0, trie)
fun checkNodeSubtrieMatch (prefix, trieKey, keyPos, children, idx) =
case searchKeyMatch (prefix, trieKey, keyPos + 1) of
NO_SEARCH_MATCH => NONE
| SEARCH_KEY_CONTAINS_TRIE_KEY =>
let val trieChild = Vector.sub (children, idx)
in recurseGetPrefixSubtrie (prefix, String.size trieKey, trieChild)
end
| FULL_SEARCH_MATCH =>
let val node = Vector.sub (children, idx)
in SOME (prefix, node)
end
| TRIE_KEY_CONTAINS_SEARCH_KEY =>
let val node = Vector.sub (children, idx)
in SOME (trieKey, node)
end
and helpGetPrefixSubtrieChildren (prefix, keyPos, keys, children, trie) =
let
val findChr = String.sub (prefix, keyPos)
in
case findBinSearch (findChr, keyPos, keys) of
SOME idx =>
let val trieKey = Vector.sub (keys, idx)
in checkNodeSubtrieMatch (prefix, trieKey, keyPos, children, idx)
end
| NONE => NONE
end
and getPrefixSubtrieLeft (prefix, searchChr, keyPos, leftKeys, leftChildren) =
case (leftKeys, leftChildren) of
(khd :: ktl, chd :: ctl) =>
let
val firstNode = Vector.sub (khd, 0)
val firstNodeChr = String.sub (firstNode, keyPos)
in
if firstNodeChr < searchChr then
(* keep moving leftwards *)
getPrefixSubtrieLeft (prefix, searchChr, keyPos, ktl, ctl)
else if firstNodeChr = searchChr then
(* check if there is a full/partial key match at this node *)
checkNodeSubtrieMatch (prefix, firstNode, keyPos, chd, 0)
else
(* binary search this node to see if there is a matching chr *)
(case findBinSearch (searchChr, keyPos, khd) of
SOME idx =>
checkNodeSubtrieMatch
(prefix, Vector.sub (khd, idx), keyPos, chd, idx)
| NONE => NONE)
end
| (_, _) => NONE
and getPrefixSubtrieRight
(prefix, searchChr, keyPos, rightKeys, rightChildren) =
case (rightKeys, rightChildren) of
(khd :: ktl, chd :: ctl) =>
let
val lastNode = Vector.sub (khd, Vector.length khd - 1)
val lastNodeChr = String.sub (lastNode, keyPos)
in
if lastNodeChr > searchChr then
(* keep checking rightwards *)
getPrefixSubtrieRight (prefix, searchChr, keyPos, ktl, ctl)
else if lastNodeChr = searchChr then
(* check for full/partial match at this node *)
checkNodeSubtrieMatch
(prefix, lastNode, keyPos, chd, Vector.length khd - 1)
else
(* binary search this node to see if there is a matching chr *)
(case findBinSearch (searchChr, keyPos, khd) of
SOME idx =>
checkNodeSubtrieMatch
(prefix, Vector.sub (khd, idx), keyPos, chd, idx)
| NONE => NONE)
end
| (_, _) => NONE
and decideGetPrefixSubtrieDirection
(prefix, keyPos, leftKeys, leftChildren, rightKeys, rightChildren) =
case (leftKeys, leftChildren) of
(khd :: ktl, chd :: ctl) =>
let
val searchChr = String.sub (prefix, keyPos)
val firstNode = Vector.sub (khd, 0)
val startNodeChr = String.sub (firstNode, keyPos)
in
if searchChr < startNodeChr then
getPrefixSubtrieLeft (prefix, searchChr, keyPos, ktl, ctl)
else if searchChr = startNodeChr then
(* check string match on first key/firstNode
* and recurse if full or partial match *)
checkNodeSubtrieMatch (prefix, firstNode, keyPos, chd, 0)
else
(* implicit: searchChr > startNodeChr *)
let
val lastNode = Vector.sub (khd, Vector.length khd - 1)
val lastNodeChr = String.sub (lastNode, keyPos)
in
if searchChr > lastNodeChr then
(* check rightKeys/rightChildren *)
getPrefixSubtrieRight
(prefix, searchChr, keyPos, rightKeys, rightChildren)
else if searchChr = lastNodeChr then
(* check string match on last key/lastNode
* and recurse if full or partial match *)
checkNodeSubtrieMatch
(prefix, lastNode, keyPos, chd, Vector.length khd - 1)
else
(* implicit: searchChr < lastNodeChr
* should perform binary search at this node
* to find if key exists *)
(case findBinSearch (searchChr, keyPos, khd) of
SOME idx =>
checkNodeSubtrieMatch
(prefix, Vector.sub (khd, idx), keyPos, chd, idx)
| NONE => NONE)
end
end
| (_, _) => NONE
and recurseGetPrefixSubtrie (prefix, keyPos, trie) =
case trie of
CHILDREN {leftKeys, leftChildren, rightKeys, rightChildren} =>
decideGetPrefixSubtrieDirection
(prefix, keyPos, leftKeys, leftChildren, rightKeys, rightChildren)
| FOUND_WITH_CHILDREN {leftKeys, leftChildren, rightKeys, rightChildren} =>
decideGetPrefixSubtrieDirection
(prefix, keyPos, leftKeys, leftChildren, rightKeys, rightChildren)
| FOUND => NONE
fun getPrefixSubtrie (prefix, trie) =
if isEmpty trie then NONE else recurseGetPrefixSubtrie (prefix, 0, trie)
datatype insert_string_match =
NO_INSERT_MATCH
| DIFFERENCE_FOUND_AT of int
| FULL_INSERT_MATCH
| INSERT_KEY_CONTAINS_TRIE_KEY
| TRIE_KEY_CONTAINS_INSERT_KEY
fun insertKeyMatch (insertKey, trieKey, keyPos) =
if
keyPos < String.size insertKey
then
if keyPos < String.size trieKey then
let
val searchChr = String.sub (insertKey, keyPos)
val trieChr = String.sub (trieKey, keyPos)
in
if searchChr = trieChr then
insertKeyMatch (insertKey, trieKey, keyPos + 1)
else
DIFFERENCE_FOUND_AT keyPos
end
else
INSERT_KEY_CONTAINS_TRIE_KEY
else if
keyPos = String.size insertKey
then
if keyPos < String.size trieKey then TRIE_KEY_CONTAINS_INSERT_KEY
else if keyPos = String.size trieKey then FULL_INSERT_MATCH
else INSERT_KEY_CONTAINS_TRIE_KEY
else (* implicit: keyPos > String.size insertKey *) if
keyPos <= String.size trieKey
then
TRIE_KEY_CONTAINS_INSERT_KEY
else
NO_INSERT_MATCH
datatype insert_bin_search_result =
INSERT_NEW_CHILD of int
| FOUND_INSERT_POS of int
| APPEND_NEW_CHILD
fun linearSearch (findChr, keyPos, idx, children) =
if idx = Vector.length children then
APPEND_NEW_CHILD
else
let
val curStr = Vector.sub (children, idx)
val curChr = String.sub (curStr, keyPos)
in
if curChr > findChr then INSERT_NEW_CHILD idx
else linearSearch (findChr, keyPos, idx + 1, children)
end
fun helpInsertBinSearch (findChr, keyPos, children, low, high) =
let
val mid = low + ((high - low) div 2)
in
if high >= low then
let
val midStr = Vector.sub (children, mid)
val midChr = String.sub (midStr, keyPos)
in
if midChr = findChr then
FOUND_INSERT_POS mid
else if midChr < findChr then
helpInsertBinSearch (findChr, keyPos, children, mid + 1, high)
else
helpInsertBinSearch (findChr, keyPos, children, low, mid - 1)
end
else
linearSearch (findChr, keyPos, if mid >= 0 then mid else 0, children)
end
fun insertBinSearch (findChr, keyPos, children) =
helpInsertBinSearch
(findChr, keyPos, children, 0, Vector.length children - 1)
fun insertDifferenceFoundAtLeft
( insKey
, insIdx
, splitTrieKeyStart
, trieChild
, childLeftKeys
, childLeftChildren
, childRightKeys
, childRightChildren
, parentKeys
, parentChildren
, parentConstructor
) =
let
(* child node should always have CHILDREN case,
* because we are splitting prefix into two,
* when neither trieKey nor insKey match the prefix. *)
val childNode = CHILDREN {keys = childKeys, children = childChildren}
val keys =
Vector.mapi
(fn (idx, key) => if idx <> insIdx then key else splitTrieKeyStart)
parentKeys
val children =
Vector.mapi (fn (idx, elt) => if idx <> insIdx then elt else childNode)
parentChildren
in
parentConstructor {keys = keys, children = children}
end
end