diff --git a/src/Lexer.hs b/src/Lexer.hs index 2a445d5..200d76a 100644 --- a/src/Lexer.hs +++ b/src/Lexer.hs @@ -15,51 +15,62 @@ data Token = TokLPar | TokErr String deriving (Eq, Show) +isWhitespace :: Char -> Bool +isWhitespace x = x == ' ' || x == '\t' || x == '\n' || x == '\r' + data StrState = StrStart | StrInside | StrEscape -identifyString :: String -> StrState -> (String, Maybe String) -identifyString ('"':xs) StrStart = identifyString xs StrInside -identifyString xs StrStart = (xs, Nothing) -- will never occur -identifyString "" _ = ("", Nothing) -- early end of file +identifySymbol :: String -> (String, String) +identifySymbol "" = ("", "") +identifySymbol (x:xs) | isWhitespace x = (x:xs, "") + | otherwise = case identifySymbol xs of + (ys, zs) -> (ys, x:zs) + +identifyString :: String -> StrState -> Maybe (String, String) +identifyString ('"':xs) StrStart = case identifyString xs StrInside of + Nothing -> Nothing + y -> y +identifyString _ StrStart = Nothing -- should never occur unless not called on a string +identifyString "" _ = Nothing -- early end of file identifyString ('\\':xs) StrInside = identifyString xs StrEscape -identifyString ('"':xs) StrInside = (xs, Just "") -identifyString ('n':xs) StrEscape = case identifyString xs StrInside - of (ys, Just b) -> (ys, Just ('\n':b)) - y -> y -identifyString ('\\':xs) StrEscape = case identifyString xs StrInside - of (ys, Just b) -> (ys, Just ('\\':b)) - y -> y -identifyString ('/':xs) StrEscape = case identifyString xs StrInside - of (ys, Just b) -> (ys, Just ('/':b)) - y -> y -identifyString ('"':xs) StrEscape = case identifyString xs StrInside - of (ys, Just b) -> (ys, Just ('"':b)) - y -> y -identifyString ('b':xs) StrEscape = case identifyString xs StrInside - of (ys, Just b) -> (ys, Just ('\b':b)) - y -> y -identifyString ('f':xs) StrEscape = case identifyString xs StrInside - of (ys, Just b) -> (ys, Just ('\f':b)) - y -> y -identifyString ('r':xs) StrEscape = case identifyString xs StrInside - of (ys, Just b) -> (ys, Just ('\r':b)) - y -> y -identifyString ('t':xs) StrEscape = case identifyString xs StrInside - of (ys, Just b) -> (ys, Just ('\t':b)) - y -> y +identifyString ('"':xs) StrInside = Just (xs, "") +identifyString ('n':xs) StrEscape = case identifyString xs StrInside of + Just (ys, b) -> Just (ys, '\n':b) + y -> y +identifyString ('\\':xs) StrEscape = case identifyString xs StrInside of + Just (ys, b) -> Just (ys, '\\':b) + y -> y +identifyString ('/':xs) StrEscape = case identifyString xs StrInside of + Just (ys, b) -> Just (ys, '/':b) + y -> y +identifyString ('"':xs) StrEscape = case identifyString xs StrInside of + Just (ys, b) -> Just (ys, '"':b) + y -> y +identifyString ('b':xs) StrEscape = case identifyString xs StrInside of + Just (ys, b) -> Just (ys, '\b':b) + y -> y +identifyString ('f':xs) StrEscape = case identifyString xs StrInside of + Just (ys, b) -> Just (ys, '\f':b) + y -> y +identifyString ('r':xs) StrEscape = case identifyString xs StrInside of + Just (ys, b) -> Just (ys, '\r':b) + y -> y +identifyString ('t':xs) StrEscape = case identifyString xs StrInside of + Just (ys, b) -> Just (ys, '\t':b) + y -> y identifyString ('u':w:x:y:z:xs) StrEscape | isHexDigit w && isHexDigit x && isHexDigit y && isHexDigit z = - case identifyString xs StrInside - of (ys, Just b) -> (ys, Just (chr (read ('0':'x':w:x:y:z:"")):b)) - a -> a -identifyString (x:xs) StrEscape = (x:xs, Nothing) -identifyString (x:xs) StrInside = case identifyString xs StrInside - of (ys, Nothing) -> (ys, Nothing) - (ys, Just b) -> (ys, Just (x:b)) + case identifyString xs StrInside of + Just (ys, b) -> Just (ys, chr (read ('0':'x':w:x:y:z:"")):b) + a -> a +identifyString _ StrEscape = Nothing +identifyString (x:xs) StrInside = case identifyString xs StrInside of + Just (ys, b) -> Just (ys, x:b) + y -> y data NumState = NumStart | NumNeg @@ -141,16 +152,21 @@ lexStr "" = [] lexStr ('(':xs) = TokLPar : lexStr xs lexStr (')':xs) = TokRPar : lexStr xs lexStr ('"':xs) = case identifyString ('"':xs) StrStart - of (ys, Nothing) -> TokErr "End of file reached before matching \"" :lexStr ys - (ys, Just b) -> TokStr b:lexStr ys -lexStr ('\'':"") = [TokErr "End of file reached before matching '"] + of Nothing -> case identifySymbol ('"':xs) of (rest, symb) -> TokSymbol symb : lexStr rest + Just (ys, b) -> TokStr b:lexStr ys +lexStr ('\'':"") = [TokSymbol "'"] lexStr ('\'':'(':xs) = TokQuote : lexStr ('(':xs) lexStr ('\'':x:'\'':xs) = TokCharacter x : lexStr xs +lexStr ('\'':x:xs) | isWhitespace x = TokSymbol "'" : lexStr (x:xs) + | otherwise = case identifySymbol (x:xs) of + (rest, symb) -> TokQuote : TokSymbol symb : lexStr rest lexStr (' ':xs) = lexStr xs lexStr ('\t':xs) = lexStr xs lexStr ('\n':xs) = lexStr xs lexStr ('\r':xs) = lexStr xs -lexStr (x:xs) | isDigit x || x == '-' = case identifyNum (x:xs) NumStart - of (ys, Nothing) -> TokErr "Failed to parse number" : lexStr ys - (ys, Just b) -> TokNumber (read b) : lexStr ys -lexStr _ = [] +lexStr (x:xs) | isDigit x || + x == '-' = + case identifyNum (x:xs) NumStart of + (ys, Nothing) -> TokErr "Failed to parse number" : lexStr ys + (ys, Just b) -> TokNumber (read b) : lexStr ys +lexStr xs = case identifySymbol xs of (ys, zs) -> TokSymbol zs : lexStr ys diff --git a/test/Main.hs b/test/Main.hs index db17f42..4d9fdf6 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -8,16 +8,31 @@ testPars = TestCase (assertEqual "() -> [TokLPar,TokRPar]" [Lib.TokLPar, Lib.TokRPar] (lexStr "()")) +testSymbol :: Test +testSymbol = TestCase (assertEqual "hello world -> [TokSymbol \"hello\", TokSymbol \"world\"]" + [Lib.TokSymbol "hello", Lib.TokSymbol "world"] + (lexStr "hello world")) + testQuotedPars :: Test testQuotedPars = TestCase (assertEqual "'() -> [TokQuote,TokLPar,TokRPar]" [Lib.TokQuote, Lib.TokLPar, Lib.TokRPar] (lexStr "'()")) +testLoneQuoteIsSymbol :: Test +testLoneQuoteIsSymbol = TestCase (assertEqual "' ' -> [TokSymbol \"'\", TokSymbol \"'\"]" + [Lib.TokSymbol "'", Lib.TokSymbol "'"] + (lexStr "' '")) + testCharacter :: Test testCharacter = TestCase (assertEqual "'a' -> [TokCharacter 'a']" [Lib.TokCharacter 'a'] (lexStr "'a'")) +testQuotedSymbol :: Test +testQuotedSymbol = TestCase (assertEqual "'hello -> [TokQuote,TokSymbol \"hello\"]" + [Lib.TokQuote, Lib.TokSymbol "hello"] + (lexStr "'hello")) + testUnicodeCharacter :: Test testUnicodeCharacter = TestCase (assertEqual "'📎' -> [TokCharacter '📎']" [Lib.TokCharacter '📎'] @@ -50,22 +65,32 @@ testNegativeNumber = TestCase (assertEqual "-42.03e-12 -> [TokNumber -42.03e-12] testStringsAreRecognized :: Test testStringsAreRecognized = TestCase (assertEqual - "\"\\uabcdasdf\\b\\f\\r\\n\\t\\\\\\/\" -> [Lib.TokStr \"\43981asdf\b\f\r\n\t\\/\"" + "can strings be recognized?" [Lib.TokStr "\43981asdf\b\f\r\n\t\\/"] (lexStr "\"\\uabcdasdf\\b\\f\\r\\n\\t\\\\\\/\"")) +testUnclosedStringsAreSymbols :: Test +testUnclosedStringsAreSymbols = TestCase (assertEqual + "this would make for a terrible symbol, but it can hypothetically start with a double quote" + [Lib.TokSymbol "\"hello", Lib.TokSymbol "world"] + (lexStr "\"hello world")) + tests :: Test tests = TestList [ TestLabel "test paretheses are detected" testPars, + TestLabel "test symbols are recognized" testSymbol, TestLabel "test quoted parentheses are detected" testQuotedPars, + TestLabel "test single quotes without symbols or characters are symbols themselves" testLoneQuoteIsSymbol, TestLabel "test characters are recognized" testCharacter, + TestLabel "test quoted symbols are recognized" testQuotedSymbol, TestLabel "test unicode characters are recognized" testUnicodeCharacter, TestLabel "test simple integers are recognized" testSimpleIntegers, TestLabel "test fractional numbers are recognized" testFractionalNumber, TestLabel "test integers with exponential component are recognized" testExponentialInteger, TestLabel "test exponential number is recognized" testExponential, TestLabel "test negative numbers are recognized" testNegativeNumber, - TestLabel "test strings are recognized" testStringsAreRecognized + TestLabel "test strings are recognized" testStringsAreRecognized, + TestLabel "test unclosed strings are symbols" testUnclosedStringsAreSymbols ] main :: IO ()