refactor strings

This commit is contained in:
jjanzen 2025-03-22 12:23:22 -05:00
parent f16db81fd6
commit 4508879586
2 changed files with 86 additions and 45 deletions

View file

@ -15,51 +15,62 @@ data Token = TokLPar
| TokErr String
deriving (Eq, Show)
isWhitespace :: Char -> Bool
isWhitespace x = x == ' ' || x == '\t' || x == '\n' || x == '\r'
data StrState = StrStart
| StrInside
| StrEscape
identifyString :: String -> StrState -> (String, Maybe String)
identifyString ('"':xs) StrStart = identifyString xs StrInside
identifyString xs StrStart = (xs, Nothing) -- will never occur
identifyString "" _ = ("", Nothing) -- early end of file
identifySymbol :: String -> (String, String)
identifySymbol "" = ("", "")
identifySymbol (x:xs) | isWhitespace x = (x:xs, "")
| otherwise = case identifySymbol xs of
(ys, zs) -> (ys, x:zs)
identifyString :: String -> StrState -> Maybe (String, String)
identifyString ('"':xs) StrStart = case identifyString xs StrInside of
Nothing -> Nothing
y -> y
identifyString _ StrStart = Nothing -- should never occur unless not called on a string
identifyString "" _ = Nothing -- early end of file
identifyString ('\\':xs) StrInside = identifyString xs StrEscape
identifyString ('"':xs) StrInside = (xs, Just "")
identifyString ('n':xs) StrEscape = case identifyString xs StrInside
of (ys, Just b) -> (ys, Just ('\n':b))
identifyString ('"':xs) StrInside = Just (xs, "")
identifyString ('n':xs) StrEscape = case identifyString xs StrInside of
Just (ys, b) -> Just (ys, '\n':b)
y -> y
identifyString ('\\':xs) StrEscape = case identifyString xs StrInside
of (ys, Just b) -> (ys, Just ('\\':b))
identifyString ('\\':xs) StrEscape = case identifyString xs StrInside of
Just (ys, b) -> Just (ys, '\\':b)
y -> y
identifyString ('/':xs) StrEscape = case identifyString xs StrInside
of (ys, Just b) -> (ys, Just ('/':b))
identifyString ('/':xs) StrEscape = case identifyString xs StrInside of
Just (ys, b) -> Just (ys, '/':b)
y -> y
identifyString ('"':xs) StrEscape = case identifyString xs StrInside
of (ys, Just b) -> (ys, Just ('"':b))
identifyString ('"':xs) StrEscape = case identifyString xs StrInside of
Just (ys, b) -> Just (ys, '"':b)
y -> y
identifyString ('b':xs) StrEscape = case identifyString xs StrInside
of (ys, Just b) -> (ys, Just ('\b':b))
identifyString ('b':xs) StrEscape = case identifyString xs StrInside of
Just (ys, b) -> Just (ys, '\b':b)
y -> y
identifyString ('f':xs) StrEscape = case identifyString xs StrInside
of (ys, Just b) -> (ys, Just ('\f':b))
identifyString ('f':xs) StrEscape = case identifyString xs StrInside of
Just (ys, b) -> Just (ys, '\f':b)
y -> y
identifyString ('r':xs) StrEscape = case identifyString xs StrInside
of (ys, Just b) -> (ys, Just ('\r':b))
identifyString ('r':xs) StrEscape = case identifyString xs StrInside of
Just (ys, b) -> Just (ys, '\r':b)
y -> y
identifyString ('t':xs) StrEscape = case identifyString xs StrInside
of (ys, Just b) -> (ys, Just ('\t':b))
identifyString ('t':xs) StrEscape = case identifyString xs StrInside of
Just (ys, b) -> Just (ys, '\t':b)
y -> y
identifyString ('u':w:x:y:z:xs) StrEscape | isHexDigit w &&
isHexDigit x &&
isHexDigit y &&
isHexDigit z =
case identifyString xs StrInside
of (ys, Just b) -> (ys, Just (chr (read ('0':'x':w:x:y:z:"")):b))
case identifyString xs StrInside of
Just (ys, b) -> Just (ys, chr (read ('0':'x':w:x:y:z:"")):b)
a -> a
identifyString (x:xs) StrEscape = (x:xs, Nothing)
identifyString (x:xs) StrInside = case identifyString xs StrInside
of (ys, Nothing) -> (ys, Nothing)
(ys, Just b) -> (ys, Just (x:b))
identifyString _ StrEscape = Nothing
identifyString (x:xs) StrInside = case identifyString xs StrInside of
Just (ys, b) -> Just (ys, x:b)
y -> y
data NumState = NumStart
| NumNeg
@ -141,16 +152,21 @@ lexStr "" = []
lexStr ('(':xs) = TokLPar : lexStr xs
lexStr (')':xs) = TokRPar : lexStr xs
lexStr ('"':xs) = case identifyString ('"':xs) StrStart
of (ys, Nothing) -> TokErr "End of file reached before matching \"" :lexStr ys
(ys, Just b) -> TokStr b:lexStr ys
lexStr ('\'':"") = [TokErr "End of file reached before matching '"]
of Nothing -> case identifySymbol ('"':xs) of (rest, symb) -> TokSymbol symb : lexStr rest
Just (ys, b) -> TokStr b:lexStr ys
lexStr ('\'':"") = [TokSymbol "'"]
lexStr ('\'':'(':xs) = TokQuote : lexStr ('(':xs)
lexStr ('\'':x:'\'':xs) = TokCharacter x : lexStr xs
lexStr ('\'':x:xs) | isWhitespace x = TokSymbol "'" : lexStr (x:xs)
| otherwise = case identifySymbol (x:xs) of
(rest, symb) -> TokQuote : TokSymbol symb : lexStr rest
lexStr (' ':xs) = lexStr xs
lexStr ('\t':xs) = lexStr xs
lexStr ('\n':xs) = lexStr xs
lexStr ('\r':xs) = lexStr xs
lexStr (x:xs) | isDigit x || x == '-' = case identifyNum (x:xs) NumStart
of (ys, Nothing) -> TokErr "Failed to parse number" : lexStr ys
lexStr (x:xs) | isDigit x ||
x == '-' =
case identifyNum (x:xs) NumStart of
(ys, Nothing) -> TokErr "Failed to parse number" : lexStr ys
(ys, Just b) -> TokNumber (read b) : lexStr ys
lexStr _ = []
lexStr xs = case identifySymbol xs of (ys, zs) -> TokSymbol zs : lexStr ys

View file

@ -8,16 +8,31 @@ testPars = TestCase (assertEqual "() -> [TokLPar,TokRPar]"
[Lib.TokLPar, Lib.TokRPar]
(lexStr "()"))
testSymbol :: Test
testSymbol = TestCase (assertEqual "hello world -> [TokSymbol \"hello\", TokSymbol \"world\"]"
[Lib.TokSymbol "hello", Lib.TokSymbol "world"]
(lexStr "hello world"))
testQuotedPars :: Test
testQuotedPars = TestCase (assertEqual "'() -> [TokQuote,TokLPar,TokRPar]"
[Lib.TokQuote, Lib.TokLPar, Lib.TokRPar]
(lexStr "'()"))
testLoneQuoteIsSymbol :: Test
testLoneQuoteIsSymbol = TestCase (assertEqual "' ' -> [TokSymbol \"'\", TokSymbol \"'\"]"
[Lib.TokSymbol "'", Lib.TokSymbol "'"]
(lexStr "' '"))
testCharacter :: Test
testCharacter = TestCase (assertEqual "'a' -> [TokCharacter 'a']"
[Lib.TokCharacter 'a']
(lexStr "'a'"))
testQuotedSymbol :: Test
testQuotedSymbol = TestCase (assertEqual "'hello -> [TokQuote,TokSymbol \"hello\"]"
[Lib.TokQuote, Lib.TokSymbol "hello"]
(lexStr "'hello"))
testUnicodeCharacter :: Test
testUnicodeCharacter = TestCase (assertEqual "'📎' -> [TokCharacter '📎']"
[Lib.TokCharacter '📎']
@ -50,22 +65,32 @@ testNegativeNumber = TestCase (assertEqual "-42.03e-12 -> [TokNumber -42.03e-12]
testStringsAreRecognized :: Test
testStringsAreRecognized = TestCase (assertEqual
"\"\\uabcdasdf\\b\\f\\r\\n\\t\\\\\\/\" -> [Lib.TokStr \"\43981asdf\b\f\r\n\t\\/\""
"can strings be recognized?"
[Lib.TokStr "\43981asdf\b\f\r\n\t\\/"]
(lexStr "\"\\uabcdasdf\\b\\f\\r\\n\\t\\\\\\/\""))
testUnclosedStringsAreSymbols :: Test
testUnclosedStringsAreSymbols = TestCase (assertEqual
"this would make for a terrible symbol, but it can hypothetically start with a double quote"
[Lib.TokSymbol "\"hello", Lib.TokSymbol "world"]
(lexStr "\"hello world"))
tests :: Test
tests = TestList [
TestLabel "test paretheses are detected" testPars,
TestLabel "test symbols are recognized" testSymbol,
TestLabel "test quoted parentheses are detected" testQuotedPars,
TestLabel "test single quotes without symbols or characters are symbols themselves" testLoneQuoteIsSymbol,
TestLabel "test characters are recognized" testCharacter,
TestLabel "test quoted symbols are recognized" testQuotedSymbol,
TestLabel "test unicode characters are recognized" testUnicodeCharacter,
TestLabel "test simple integers are recognized" testSimpleIntegers,
TestLabel "test fractional numbers are recognized" testFractionalNumber,
TestLabel "test integers with exponential component are recognized" testExponentialInteger,
TestLabel "test exponential number is recognized" testExponential,
TestLabel "test negative numbers are recognized" testNegativeNumber,
TestLabel "test strings are recognized" testStringsAreRecognized
TestLabel "test strings are recognized" testStringsAreRecognized,
TestLabel "test unclosed strings are symbols" testUnclosedStringsAreSymbols
]
main :: IO ()