better word tokenisation algorithm in bench/conv-words.sml
This commit is contained in:
@@ -3,9 +3,30 @@
|
|||||||
val inIo = TextIO.openIn "words.txt"
|
val inIo = TextIO.openIn "words.txt"
|
||||||
val outIO = TextIO.openOut "words.sml"
|
val outIO = TextIO.openOut "words.sml"
|
||||||
|
|
||||||
|
(* Pushes the word spelled by wordChrs onto the front of acc.
 * An empty character list means no word is pending, so acc is
 * returned unchanged and no empty string is ever added. *)
fun consWordChrs ([], acc) = acc
  | consWordChrs (wordChrs, acc) = String.implode wordChrs :: acc
|
||||||
|
|
||||||
|
(* Walks line from index pos down to 0, splitting it into tokens.
 *
 * pos      - index of the next character to inspect; a negative value
 *            means the start of the line has been passed.
 * wordChrs - characters of the token currently being collected. Because
 *            the scan runs right to left, prepending each character
 *            leaves them in reading order.
 * line     - the string being tokenised.
 * acc      - tokens found so far; each completed token is prepended.
 *
 * A character belongs to a token when Char.isGraph holds for it, i.e. it
 * is printable and not whitespace. Every other character (spaces, tabs,
 * line terminators, control and non-ASCII bytes) ends the current token
 * and is itself discarded. *)
fun helpTokeniseLine (pos, wordChrs, line, acc) =
  if pos >= 0 then
    let
      val c = String.sub (line, pos)
    in
      if Char.isGraph c then
        helpTokeniseLine (pos - 1, c :: wordChrs, line, acc)
      else
        (* separator: flush whatever token was pending and start afresh *)
        helpTokeniseLine (pos - 1, [], line, consWordChrs (wordChrs, acc))
    end
  else
    (* ran off the front of the line: flush the final pending token *)
    consWordChrs (wordChrs, acc)
|
||||||
|
|
||||||
|
(* Splits line into tokens and prepends them to acc.
 * The scan starts at the last character and moves towards the first, so
 * the line's leftmost token ends up at the head of the returned list.
 * An empty line yields acc unchanged. *)
fun tokeniseLine (line, acc) =
  let
    val lastIdx = String.size line - 1
  in
    helpTokeniseLine (lastIdx, [], line, acc)
  end
|
||||||
|
|
||||||
(* Reads inIo until end of input and returns every token in input order.
 *
 * inIo - the TextIO input stream to consume.
 * acc  - tokens gathered so far, held in reverse input order; callers
 *        normally pass [].
 *
 * tokeniseLine returns a line's tokens in left-to-right order. Pushing
 * them straight onto the reversed accumulator would make the final
 * List.rev emit each line's tokens right to left. Each line is therefore
 * tokenised into a fresh list and folded in with List.revAppend, which
 * keeps acc in strict reverse input order. *)
fun readLines (inIo, acc) =
  case TextIO.inputLine inIo of
    SOME line =>
      readLines (inIo, List.revAppend (tokeniseLine (line, []), acc))
  | NONE => List.rev acc
|
||||||
|
|
||||||
fun writeLines (outIO, lst) =
|
fun writeLines (outIO, lst) =
|
||||||
@@ -13,12 +34,6 @@ fun writeLines (outIO, lst) =
|
|||||||
[] => ()
|
[] => ()
|
||||||
| word :: tl =>
|
| word :: tl =>
|
||||||
let
|
let
|
||||||
(* remove \r and \n from the word *)
|
|
||||||
val word = Substring.full word
|
|
||||||
val word =
|
|
||||||
Substring.dropr (fn chr => chr = #"\n" orelse chr = #"\r") word
|
|
||||||
val word = Substring.string word
|
|
||||||
|
|
||||||
val isLast = tl = []
|
val isLast = tl = []
|
||||||
val word = if isLast then "\"" ^ word ^ "\"" else "\"" ^ word ^ "\",\n"
|
val word = if isLast then "\"" ^ word ^ "\"" else "\"" ^ word ^ "\",\n"
|
||||||
val _ = TextIO.output (outIO, word)
|
val _ = TextIO.output (outIO, word)
|
||||||
@@ -36,3 +51,5 @@ fun main () =
|
|||||||
in
|
in
|
||||||
()
|
()
|
||||||
end
|
end
|
||||||
|
|
||||||
|
val _ = main ()
|
||||||
|
|||||||
Reference in New Issue
Block a user