From 0d6277a3a76b510d29b71c6b7fd4026a49fb29ca Mon Sep 17 00:00:00 2001 From: jjanzen Date: Sat, 22 Mar 2025 13:17:12 -0500 Subject: [PATCH] refactor number recognition --- src/Lexer.hs | 131 ++++++++++++++++++++++++--------------------------- test/Main.hs | 40 +++++++++------- 2 files changed, 84 insertions(+), 87 deletions(-) diff --git a/src/Lexer.hs b/src/Lexer.hs index 200d76a..35159a2 100644 --- a/src/Lexer.hs +++ b/src/Lexer.hs @@ -60,13 +60,14 @@ identifyString ('r':xs) StrEscape = case identifyString xs StrInside of identifyString ('t':xs) StrEscape = case identifyString xs StrInside of Just (ys, b) -> Just (ys, '\t':b) y -> y -identifyString ('u':w:x:y:z:xs) StrEscape | isHexDigit w && - isHexDigit x && - isHexDigit y && - isHexDigit z = - case identifyString xs StrInside of - Just (ys, b) -> Just (ys, chr (read ('0':'x':w:x:y:z:"")):b) - a -> a +identifyString ('u':w:x:y:z:xs) StrEscape + | isHexDigit w && + isHexDigit x && + isHexDigit y && + isHexDigit z = + case identifyString xs StrInside of + Just (ys, b) -> Just (ys, chr (read ('0':'x':w:x:y:z:"")):b) + a -> a identifyString _ StrEscape = Nothing identifyString (x:xs) StrInside = case identifyString xs StrInside of Just (ys, b) -> Just (ys, x:b) @@ -78,74 +79,64 @@ data NumState = NumStart | NumFractionalComponent | NumExponentialComponent -identifyNum :: String -> NumState -> (String, Maybe String) -identifyNum "" NumStart = ("", Nothing) -identifyNum ('-':xs) NumStart = identifyNum xs NumNeg -identifyNum ('0':'.':x:xs) NumStart | isDigit x = case identifyNum (x:xs) NumFractionalComponent - of (ys, Just zs) -> (ys, Just ('0':'.':zs)) - y -> y - | otherwise = (x:xs, Nothing) -identifyNum (x:xs) NumStart | x == '0' = (x:xs, Nothing) +identifyNum :: String -> NumState -> Maybe (String, String) +identifyNum "" _ = Nothing +identifyNum (x:"") _ | isDigit x = Just ("", x:"") +identifyNum (x:y:xs) _ | isDigit x && isWhitespace y = Just (y:xs, x:"") +identifyNum ('-':xs) NumStart = case identifyNum xs NumNeg of + Just (ys, zs) -> Just (ys, '-':zs) + y -> y +identifyNum ('0':'.':x:xs) NumStart | isDigit x = case identifyNum (x:xs) NumFractionalComponent of + Just (ys, zs) -> Just (ys, '0':'.':zs) + y -> y +identifyNum (x:xs) NumStart | x == '0' = Nothing | isDigit x = identifyNum (x:xs) NumIntegralComponent - | otherwise = (x:xs, Nothing) -identifyNum "" NumNeg = ("", Nothing) -identifyNum ('0':'.':x:xs) NumNeg | isDigit x = case identifyNum (x:xs) NumFractionalComponent - of (ys, Just zs) -> (ys, Just ('-':'0':'.':zs)) - y -> y -identifyNum (x:xs) NumNeg | x == '0' = (x:xs, Nothing) - | isDigit x = case identifyNum (x:xs) NumIntegralComponent - of (ys, Just zs) -> (ys, Just ('-':zs)) - y -> y - | otherwise = (x:xs, Nothing) -identifyNum ('.':x:xs) NumIntegralComponent | isDigit x = case identifyNum (x:xs) NumFractionalComponent - of (ys, Just zs) -> (ys, Just ('.':zs)) - y -> y - | otherwise = ('.':x:xs, Just "") + | otherwise = Nothing +identifyNum ('0':'.':x:xs) NumNeg | isDigit x = case identifyNum (x:xs) NumFractionalComponent of + Just (ys, zs) -> Just (ys, '0':'.':zs) + y -> y +identifyNum (x:xs) NumNeg | x == '0' = Nothing + | isDigit x = identifyNum (x:xs) NumIntegralComponent + | otherwise = Nothing +identifyNum ('.':x:xs) NumIntegralComponent + | isDigit x = case identifyNum (x:xs) NumFractionalComponent of + Just (ys, zs) -> Just (ys, '.':zs) + y -> y + | otherwise = Nothing identifyNum (w:x:y:xs) NumIntegralComponent | (w == 'e' || w == 'E') && (x == '+' || x == '-') && isDigit y = - case identifyNum (y:xs) NumExponentialComponent - of (ys, Just zs) -> (ys, Just (w:x:zs)) - z -> z - | (w == 'e' || w == 'E') && - isDigit x = - case identifyNum (x:y:xs) NumExponentialComponent - of (ys, Just zs) -> (ys, Just (w:zs)) - z -> z -identifyNum (x:y:"") NumIntegralComponent | (x == 'e' || x == 'E') && - isDigit y = ("", Just (x:y:"")) - | isDigit x && isDigit y = ("", Just (x:y:"")) - | isDigit x = (y:"", Just (x:"")) - | otherwise = (x:y:"", Just "") -identifyNum (x:xs) NumIntegralComponent | isDigit x = case identifyNum xs NumIntegralComponent - of (ys, Just zs) -> (ys, Just (x:zs)) - y -> y - | otherwise = (x:xs, Just "") + case identifyNum (y:xs) NumExponentialComponent of + Just (ys, zs) -> Just (ys, w:x:zs) + z -> z +identifyNum (x:y:xs) NumIntegralComponent | (x == 'e' || x == 'E') && + isDigit y = + case identifyNum (y:xs) NumExponentialComponent of + Just (ys, zs) -> Just (ys, x:zs) + z -> z +identifyNum (x:xs) NumIntegralComponent | isDigit x = case identifyNum xs NumIntegralComponent of + Just (ys, zs) -> Just (ys, x:zs) + y -> y + | otherwise = Nothing identifyNum (w:x:y:xs) NumFractionalComponent | (w == 'e' || w == 'E') && (x == '+' || x == '-') && isDigit y = - case identifyNum (y:xs) NumExponentialComponent - of (ys, Just zs) -> (ys, Just (w:x:zs)) - z -> z - | (w == 'e' || w == 'E') && - isDigit x = - case identifyNum (x:y:xs) NumExponentialComponent - of (ys, Just zs) -> (ys, Just (w:zs)) - z -> z -identifyNum (x:y:"") NumFractionalComponent | (x == 'e' || x == 'E') && - isDigit y = ("", Just (x:y:"")) - | isDigit x && isDigit y = ("", Just (x:y:"")) - | isDigit x = (y:"", Just (x:"")) - | otherwise = (x:y:"", Just "") -identifyNum (x:xs) NumFractionalComponent | isDigit x = case identifyNum xs NumFractionalComponent - of (ys, Just zs) -> (ys, Just (x:zs)) - y -> y - | otherwise = (x:xs, Just "") -identifyNum (x:xs) NumExponentialComponent | isDigit x = case identifyNum xs NumExponentialComponent - of (ys, Just zs) -> (ys, Just (x:zs)) - y -> y - | otherwise = (x:xs, Just "") -identifyNum [] _ = ("", Just "") + case identifyNum (y:xs) NumExponentialComponent of + Just (ys, zs) -> Just (ys, w:x:zs) + z -> z +identifyNum (x:y:xs) NumFractionalComponent | (x == 'e' || x == 'E') && + isDigit y = + case identifyNum (y:xs) NumExponentialComponent of + Just (ys, zs) -> Just (ys, x:zs) + z -> z +identifyNum (x:xs) NumFractionalComponent | isDigit x = case identifyNum xs NumFractionalComponent of + Just (ys, zs) -> Just (ys, x:zs) + y -> y + | otherwise = Nothing +identifyNum (x:xs) NumExponentialComponent | isDigit x = case identifyNum xs NumExponentialComponent of + Just (ys, zs) -> Just (ys, x:zs) + y -> y + | otherwise = Nothing lexStr :: String -> [Token] lexStr "" = [] @@ -167,6 +158,6 @@ lexStr ('\r':xs) = lexStr xs lexStr (x:xs) | isDigit x || x == '-' = case identifyNum (x:xs) NumStart of - (ys, Nothing) -> TokErr "Failed to parse number" : lexStr ys - (ys, Just b) -> TokNumber (read b) : lexStr ys + Nothing -> case identifySymbol (x:xs) of (rest, symb) -> TokSymbol symb : lexStr rest + Just (ys, b) -> TokNumber (read b) : lexStr ys lexStr xs = case identifySymbol xs of (ys, zs) -> TokSymbol zs : lexStr ys diff --git a/test/Main.hs b/test/Main.hs index 4d9fdf6..5d4c8de 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -4,68 +4,73 @@ import Test.HUnit import Lib testPars :: Test -testPars = TestCase (assertEqual "() -> [TokLPar,TokRPar]" +testPars = TestCase (assertEqual "recognize parentheses" [Lib.TokLPar, Lib.TokRPar] (lexStr "()")) testSymbol :: Test -testSymbol = TestCase (assertEqual "hello world -> [TokSymbol \"hello\", TokSymbol \"world\"]" +testSymbol = TestCase (assertEqual "recognize symbols" [Lib.TokSymbol "hello", Lib.TokSymbol "world"] (lexStr "hello world")) testQuotedPars :: Test -testQuotedPars = TestCase (assertEqual "'() -> [TokQuote,TokLPar,TokRPar]" +testQuotedPars = TestCase (assertEqual "recognize quoted parentheses" [Lib.TokQuote, Lib.TokLPar, Lib.TokRPar] (lexStr "'()")) testLoneQuoteIsSymbol :: Test -testLoneQuoteIsSymbol = TestCase (assertEqual "' ' -> [TokSymbol \"'\", TokSymbol \"'\"]" +testLoneQuoteIsSymbol = TestCase (assertEqual "a quote on its own is just a symbol" [Lib.TokSymbol "'", Lib.TokSymbol "'"] (lexStr "' '")) testCharacter :: Test -testCharacter = TestCase (assertEqual "'a' -> [TokCharacter 'a']" +testCharacter = TestCase (assertEqual "recognize characters" [Lib.TokCharacter 'a'] (lexStr "'a'")) testQuotedSymbol :: Test -testQuotedSymbol = TestCase (assertEqual "'hello -> [TokQuote,TokSymbol \"hello\"]" +testQuotedSymbol = TestCase (assertEqual "recognize quoted symbols" [Lib.TokQuote, Lib.TokSymbol "hello"] (lexStr "'hello")) testUnicodeCharacter :: Test -testUnicodeCharacter = TestCase (assertEqual "'📎' -> [TokCharacter '📎']" +testUnicodeCharacter = TestCase (assertEqual "recognize unicode characters" [Lib.TokCharacter '📎'] (lexStr "'📎'")) testSimpleIntegers :: Test -testSimpleIntegers = TestCase (assertEqual "42 -> [TokNumber 42]" +testSimpleIntegers = TestCase (assertEqual "recognize basic integers" [Lib.TokNumber 42] - (lexStr "42")) + (lexStr "42\t")) testFractionalNumber :: Test -testFractionalNumber = TestCase (assertEqual "42.123 -> [TokNumber 42.123]" +testFractionalNumber = TestCase (assertEqual "recognize numbers with fractional components" [Lib.TokNumber 42.123] - (lexStr "42.123")) + (lexStr "42.123 ")) testExponentialInteger :: Test -testExponentialInteger = TestCase (assertEqual "42e123 -> [TokNumber 42e123]" +testExponentialInteger = TestCase (assertEqual "recognize integers with exponents" [Lib.TokNumber 42e123] - (lexStr "42e123")) + (lexStr "42e123\n")) testExponential :: Test -testExponential = TestCase (assertEqual "42.03E+12 -> [TokNumber 42.03e12]" +testExponential = TestCase (assertEqual "recognize fractional numbers with exponents" [Lib.TokNumber 42.03e12] - (lexStr "42.03E+12")) + (lexStr "42.03E+12\r")) testNegativeNumber :: Test -testNegativeNumber = TestCase (assertEqual "-42.03e-12 -> [TokNumber -42.03e-12]" +testNegativeNumber = TestCase (assertEqual "recognize negative numbers" [Lib.TokNumber (-42.03e-12)] (lexStr "-42.03e-12")) +testMalformedNumbersAreSymbols :: Test +testMalformedNumbersAreSymbols = TestCase (assertEqual "recognize symbols that arise from invalid numbers" + [Lib.TokSymbol "-", Lib.TokSymbol "123abc", Lib.TokSymbol "-123.01f23"] + (lexStr "- 123abc -123.01f23")) + testStringsAreRecognized :: Test testStringsAreRecognized = TestCase (assertEqual - "can strings be recognized?" + "recognize strings" [Lib.TokStr "\43981asdf\b\f\r\n\t\\/"] (lexStr "\"\\uabcdasdf\\b\\f\\r\\n\\t\\\\\\/\"")) @@ -89,6 +94,7 @@ tests = TestList [ TestLabel "test integers with exponential component are recognized" testExponentialInteger, TestLabel "test exponential number is recognized" testExponential, TestLabel "test negative numbers are recognized" testNegativeNumber, + TestLabel "test malformed numbers count as symbols" testMalformedNumbersAreSymbols, TestLabel "test strings are recognized" testStringsAreRecognized, TestLabel "test unclosed strings are symbols" testUnclosedStringsAreSymbols ]