refactor strings

This commit is contained in:
jjanzen 2025-03-22 12:23:22 -05:00
parent f16db81fd6
commit 4508879586
2 changed files with 86 additions and 45 deletions

View file

@ -15,51 +15,62 @@ data Token = TokLPar
| TokErr String | TokErr String
deriving (Eq, Show) deriving (Eq, Show)
isWhitespace :: Char -> Bool
isWhitespace x = x == ' ' || x == '\t' || x == '\n' || x == '\r'
data StrState = StrStart data StrState = StrStart
| StrInside | StrInside
| StrEscape | StrEscape
identifyString :: String -> StrState -> (String, Maybe String) identifySymbol :: String -> (String, String)
identifyString ('"':xs) StrStart = identifyString xs StrInside identifySymbol "" = ("", "")
identifyString xs StrStart = (xs, Nothing) -- will never occur identifySymbol (x:xs) | isWhitespace x = (x:xs, "")
identifyString "" _ = ("", Nothing) -- early end of file | otherwise = case identifySymbol xs of
(ys, zs) -> (ys, x:zs)
identifyString :: String -> StrState -> Maybe (String, String)
identifyString ('"':xs) StrStart = case identifyString xs StrInside of
Nothing -> Nothing
y -> y
identifyString _ StrStart = Nothing -- should never occur unless not called on a string
identifyString "" _ = Nothing -- early end of file
identifyString ('\\':xs) StrInside = identifyString xs StrEscape identifyString ('\\':xs) StrInside = identifyString xs StrEscape
identifyString ('"':xs) StrInside = (xs, Just "") identifyString ('"':xs) StrInside = Just (xs, "")
identifyString ('n':xs) StrEscape = case identifyString xs StrInside identifyString ('n':xs) StrEscape = case identifyString xs StrInside of
of (ys, Just b) -> (ys, Just ('\n':b)) Just (ys, b) -> Just (ys, '\n':b)
y -> y y -> y
identifyString ('\\':xs) StrEscape = case identifyString xs StrInside identifyString ('\\':xs) StrEscape = case identifyString xs StrInside of
of (ys, Just b) -> (ys, Just ('\\':b)) Just (ys, b) -> Just (ys, '\\':b)
y -> y y -> y
identifyString ('/':xs) StrEscape = case identifyString xs StrInside identifyString ('/':xs) StrEscape = case identifyString xs StrInside of
of (ys, Just b) -> (ys, Just ('/':b)) Just (ys, b) -> Just (ys, '/':b)
y -> y y -> y
identifyString ('"':xs) StrEscape = case identifyString xs StrInside identifyString ('"':xs) StrEscape = case identifyString xs StrInside of
of (ys, Just b) -> (ys, Just ('"':b)) Just (ys, b) -> Just (ys, '"':b)
y -> y y -> y
identifyString ('b':xs) StrEscape = case identifyString xs StrInside identifyString ('b':xs) StrEscape = case identifyString xs StrInside of
of (ys, Just b) -> (ys, Just ('\b':b)) Just (ys, b) -> Just (ys, '\b':b)
y -> y y -> y
identifyString ('f':xs) StrEscape = case identifyString xs StrInside identifyString ('f':xs) StrEscape = case identifyString xs StrInside of
of (ys, Just b) -> (ys, Just ('\f':b)) Just (ys, b) -> Just (ys, '\f':b)
y -> y y -> y
identifyString ('r':xs) StrEscape = case identifyString xs StrInside identifyString ('r':xs) StrEscape = case identifyString xs StrInside of
of (ys, Just b) -> (ys, Just ('\r':b)) Just (ys, b) -> Just (ys, '\r':b)
y -> y y -> y
identifyString ('t':xs) StrEscape = case identifyString xs StrInside identifyString ('t':xs) StrEscape = case identifyString xs StrInside of
of (ys, Just b) -> (ys, Just ('\t':b)) Just (ys, b) -> Just (ys, '\t':b)
y -> y y -> y
identifyString ('u':w:x:y:z:xs) StrEscape | isHexDigit w && identifyString ('u':w:x:y:z:xs) StrEscape | isHexDigit w &&
isHexDigit x && isHexDigit x &&
isHexDigit y && isHexDigit y &&
isHexDigit z = isHexDigit z =
case identifyString xs StrInside case identifyString xs StrInside of
of (ys, Just b) -> (ys, Just (chr (read ('0':'x':w:x:y:z:"")):b)) Just (ys, b) -> Just (ys, chr (read ('0':'x':w:x:y:z:"")):b)
a -> a a -> a
identifyString (x:xs) StrEscape = (x:xs, Nothing) identifyString _ StrEscape = Nothing
identifyString (x:xs) StrInside = case identifyString xs StrInside identifyString (x:xs) StrInside = case identifyString xs StrInside of
of (ys, Nothing) -> (ys, Nothing) Just (ys, b) -> Just (ys, x:b)
(ys, Just b) -> (ys, Just (x:b)) y -> y
data NumState = NumStart data NumState = NumStart
| NumNeg | NumNeg
@ -141,16 +152,21 @@ lexStr "" = []
lexStr ('(':xs) = TokLPar : lexStr xs lexStr ('(':xs) = TokLPar : lexStr xs
lexStr (')':xs) = TokRPar : lexStr xs lexStr (')':xs) = TokRPar : lexStr xs
lexStr ('"':xs) = case identifyString ('"':xs) StrStart lexStr ('"':xs) = case identifyString ('"':xs) StrStart
of (ys, Nothing) -> TokErr "End of file reached before matching \"" :lexStr ys of Nothing -> case identifySymbol ('"':xs) of (rest, symb) -> TokSymbol symb : lexStr rest
(ys, Just b) -> TokStr b:lexStr ys Just (ys, b) -> TokStr b:lexStr ys
lexStr ('\'':"") = [TokErr "End of file reached before matching '"] lexStr ('\'':"") = [TokSymbol "'"]
lexStr ('\'':'(':xs) = TokQuote : lexStr ('(':xs) lexStr ('\'':'(':xs) = TokQuote : lexStr ('(':xs)
lexStr ('\'':x:'\'':xs) = TokCharacter x : lexStr xs lexStr ('\'':x:'\'':xs) = TokCharacter x : lexStr xs
lexStr ('\'':x:xs) | isWhitespace x = TokSymbol "'" : lexStr (x:xs)
| otherwise = case identifySymbol (x:xs) of
(rest, symb) -> TokQuote : TokSymbol symb : lexStr rest
lexStr (' ':xs) = lexStr xs lexStr (' ':xs) = lexStr xs
lexStr ('\t':xs) = lexStr xs lexStr ('\t':xs) = lexStr xs
lexStr ('\n':xs) = lexStr xs lexStr ('\n':xs) = lexStr xs
lexStr ('\r':xs) = lexStr xs lexStr ('\r':xs) = lexStr xs
lexStr (x:xs) | isDigit x || x == '-' = case identifyNum (x:xs) NumStart lexStr (x:xs) | isDigit x ||
of (ys, Nothing) -> TokErr "Failed to parse number" : lexStr ys x == '-' =
case identifyNum (x:xs) NumStart of
(ys, Nothing) -> TokErr "Failed to parse number" : lexStr ys
(ys, Just b) -> TokNumber (read b) : lexStr ys (ys, Just b) -> TokNumber (read b) : lexStr ys
lexStr _ = [] lexStr xs = case identifySymbol xs of (ys, zs) -> TokSymbol zs : lexStr ys

View file

@ -8,16 +8,31 @@ testPars = TestCase (assertEqual "() -> [TokLPar,TokRPar]"
[Lib.TokLPar, Lib.TokRPar] [Lib.TokLPar, Lib.TokRPar]
(lexStr "()")) (lexStr "()"))
testSymbol :: Test
testSymbol = TestCase (assertEqual "hello world -> [TokSymbol \"hello\", TokSymbol \"world\"]"
[Lib.TokSymbol "hello", Lib.TokSymbol "world"]
(lexStr "hello world"))
testQuotedPars :: Test testQuotedPars :: Test
testQuotedPars = TestCase (assertEqual "'() -> [TokQuote,TokLPar,TokRPar]" testQuotedPars = TestCase (assertEqual "'() -> [TokQuote,TokLPar,TokRPar]"
[Lib.TokQuote, Lib.TokLPar, Lib.TokRPar] [Lib.TokQuote, Lib.TokLPar, Lib.TokRPar]
(lexStr "'()")) (lexStr "'()"))
testLoneQuoteIsSymbol :: Test
testLoneQuoteIsSymbol = TestCase (assertEqual "' ' -> [TokSymbol \"'\", TokSymbol \"'\"]"
[Lib.TokSymbol "'", Lib.TokSymbol "'"]
(lexStr "' '"))
testCharacter :: Test testCharacter :: Test
testCharacter = TestCase (assertEqual "'a' -> [TokCharacter 'a']" testCharacter = TestCase (assertEqual "'a' -> [TokCharacter 'a']"
[Lib.TokCharacter 'a'] [Lib.TokCharacter 'a']
(lexStr "'a'")) (lexStr "'a'"))
testQuotedSymbol :: Test
testQuotedSymbol = TestCase (assertEqual "'hello -> [TokQuote,TokSymbol \"hello\"]"
[Lib.TokQuote, Lib.TokSymbol "hello"]
(lexStr "'hello"))
testUnicodeCharacter :: Test testUnicodeCharacter :: Test
testUnicodeCharacter = TestCase (assertEqual "'📎' -> [TokCharacter '📎']" testUnicodeCharacter = TestCase (assertEqual "'📎' -> [TokCharacter '📎']"
[Lib.TokCharacter '📎'] [Lib.TokCharacter '📎']
@ -50,22 +65,32 @@ testNegativeNumber = TestCase (assertEqual "-42.03e-12 -> [TokNumber -42.03e-12]
testStringsAreRecognized :: Test testStringsAreRecognized :: Test
testStringsAreRecognized = TestCase (assertEqual testStringsAreRecognized = TestCase (assertEqual
"\"\\uabcdasdf\\b\\f\\r\\n\\t\\\\\\/\" -> [Lib.TokStr \"\43981asdf\b\f\r\n\t\\/\"" "can strings be recognized?"
[Lib.TokStr "\43981asdf\b\f\r\n\t\\/"] [Lib.TokStr "\43981asdf\b\f\r\n\t\\/"]
(lexStr "\"\\uabcdasdf\\b\\f\\r\\n\\t\\\\\\/\"")) (lexStr "\"\\uabcdasdf\\b\\f\\r\\n\\t\\\\\\/\""))
testUnclosedStringsAreSymbols :: Test
testUnclosedStringsAreSymbols = TestCase (assertEqual
"this would make for a terrible symbol, but it can hypothetically start with a double quote"
[Lib.TokSymbol "\"hello", Lib.TokSymbol "world"]
(lexStr "\"hello world"))
tests :: Test tests :: Test
tests = TestList [ tests = TestList [
TestLabel "test paretheses are detected" testPars, TestLabel "test paretheses are detected" testPars,
TestLabel "test symbols are recognized" testSymbol,
TestLabel "test quoted parentheses are detected" testQuotedPars, TestLabel "test quoted parentheses are detected" testQuotedPars,
TestLabel "test single quotes without symbols or characters are symbols themselves" testLoneQuoteIsSymbol,
TestLabel "test characters are recognized" testCharacter, TestLabel "test characters are recognized" testCharacter,
TestLabel "test quoted symbols are recognized" testQuotedSymbol,
TestLabel "test unicode characters are recognized" testUnicodeCharacter, TestLabel "test unicode characters are recognized" testUnicodeCharacter,
TestLabel "test simple integers are recognized" testSimpleIntegers, TestLabel "test simple integers are recognized" testSimpleIntegers,
TestLabel "test fractional numbers are recognized" testFractionalNumber, TestLabel "test fractional numbers are recognized" testFractionalNumber,
TestLabel "test integers with exponential component are recognized" testExponentialInteger, TestLabel "test integers with exponential component are recognized" testExponentialInteger,
TestLabel "test exponential number is recognized" testExponential, TestLabel "test exponential number is recognized" testExponential,
TestLabel "test negative numbers are recognized" testNegativeNumber, TestLabel "test negative numbers are recognized" testNegativeNumber,
TestLabel "test strings are recognized" testStringsAreRecognized TestLabel "test strings are recognized" testStringsAreRecognized,
TestLabel "test unclosed strings are symbols" testUnclosedStringsAreSymbols
] ]
main :: IO () main :: IO ()