refactor number recognition

This commit is contained in:
jjanzen 2025-03-22 13:17:12 -05:00
parent d12ca569e9
commit 0d6277a3a7
2 changed files with 84 additions and 87 deletions

View file

@ -60,7 +60,8 @@ identifyString ('r':xs) StrEscape = case identifyString xs StrInside of
identifyString ('t':xs) StrEscape = case identifyString xs StrInside of identifyString ('t':xs) StrEscape = case identifyString xs StrInside of
Just (ys, b) -> Just (ys, '\t':b) Just (ys, b) -> Just (ys, '\t':b)
y -> y y -> y
identifyString ('u':w:x:y:z:xs) StrEscape | isHexDigit w && identifyString ('u':w:x:y:z:xs) StrEscape
| isHexDigit w &&
isHexDigit x && isHexDigit x &&
isHexDigit y && isHexDigit y &&
isHexDigit z = isHexDigit z =
@ -78,74 +79,64 @@ data NumState = NumStart
| NumFractionalComponent | NumFractionalComponent
| NumExponentialComponent | NumExponentialComponent
identifyNum :: String -> NumState -> (String, Maybe String) identifyNum :: String -> NumState -> Maybe (String, String)
identifyNum "" NumStart = ("", Nothing) identifyNum "" _ = Nothing
identifyNum ('-':xs) NumStart = identifyNum xs NumNeg identifyNum (x:"") _ | isDigit x = Just ("", x:"")
identifyNum ('0':'.':x:xs) NumStart | isDigit x = case identifyNum (x:xs) NumFractionalComponent identifyNum (x:y:xs) _ | isDigit x && isWhitespace y = Just (y:xs, x:"")
of (ys, Just zs) -> (ys, Just ('0':'.':zs)) identifyNum ('-':xs) NumStart = case identifyNum xs NumNeg of
Just (ys, zs) -> Just (ys, '-':zs)
y -> y y -> y
| otherwise = (x:xs, Nothing) identifyNum ('0':'.':x:xs) NumStart | isDigit x = case identifyNum (x:xs) NumFractionalComponent of
identifyNum (x:xs) NumStart | x == '0' = (x:xs, Nothing) Just (ys, zs) -> Just (ys, '0':'.':zs)
y -> y
identifyNum (x:xs) NumStart | x == '0' = Nothing
| isDigit x = identifyNum (x:xs) NumIntegralComponent | isDigit x = identifyNum (x:xs) NumIntegralComponent
| otherwise = (x:xs, Nothing) | otherwise = Nothing
identifyNum "" NumNeg = ("", Nothing) identifyNum ('0':'.':x:xs) NumNeg | isDigit x = case identifyNum (x:xs) NumFractionalComponent of
identifyNum ('0':'.':x:xs) NumNeg | isDigit x = case identifyNum (x:xs) NumFractionalComponent Just (ys, zs) -> Just (ys, '0':'.':zs)
of (ys, Just zs) -> (ys, Just ('-':'0':'.':zs))
y -> y y -> y
identifyNum (x:xs) NumNeg | x == '0' = (x:xs, Nothing) identifyNum (x:xs) NumNeg | x == '0' = Nothing
| isDigit x = case identifyNum (x:xs) NumIntegralComponent | isDigit x = identifyNum (x:xs) NumIntegralComponent
of (ys, Just zs) -> (ys, Just ('-':zs)) | otherwise = Nothing
identifyNum ('.':x:xs) NumIntegralComponent
| isDigit x = case identifyNum (x:xs) NumFractionalComponent of
Just (ys, zs) -> Just (ys, '.':zs)
y -> y y -> y
| otherwise = (x:xs, Nothing) | otherwise = Nothing
identifyNum ('.':x:xs) NumIntegralComponent | isDigit x = case identifyNum (x:xs) NumFractionalComponent
of (ys, Just zs) -> (ys, Just ('.':zs))
y -> y
| otherwise = ('.':x:xs, Just "")
identifyNum (w:x:y:xs) NumIntegralComponent | (w == 'e' || w == 'E') && identifyNum (w:x:y:xs) NumIntegralComponent | (w == 'e' || w == 'E') &&
(x == '+' || x == '-') && (x == '+' || x == '-') &&
isDigit y = isDigit y =
case identifyNum (y:xs) NumExponentialComponent case identifyNum (y:xs) NumExponentialComponent of
of (ys, Just zs) -> (ys, Just (w:x:zs)) Just (ys, zs) -> Just (ys, w:x:zs)
z -> z z -> z
| (w == 'e' || w == 'E') && identifyNum (x:y:xs) NumIntegralComponent | (x == 'e' || x == 'E') &&
isDigit x = isDigit y =
case identifyNum (x:y:xs) NumExponentialComponent case identifyNum (y:xs) NumExponentialComponent of
of (ys, Just zs) -> (ys, Just (w:zs)) Just (ys, zs) -> Just (ys, x:zs)
z -> z z -> z
identifyNum (x:y:"") NumIntegralComponent | (x == 'e' || x == 'E') && identifyNum (x:xs) NumIntegralComponent | isDigit x = case identifyNum xs NumIntegralComponent of
isDigit y = ("", Just (x:y:"")) Just (ys, zs) -> Just (ys, x:zs)
| isDigit x && isDigit y = ("", Just (x:y:""))
| isDigit x = (y:"", Just (x:""))
| otherwise = (x:y:"", Just "")
identifyNum (x:xs) NumIntegralComponent | isDigit x = case identifyNum xs NumIntegralComponent
of (ys, Just zs) -> (ys, Just (x:zs))
y -> y y -> y
| otherwise = (x:xs, Just "") | otherwise = Nothing
identifyNum (w:x:y:xs) NumFractionalComponent | (w == 'e' || w == 'E') && identifyNum (w:x:y:xs) NumFractionalComponent | (w == 'e' || w == 'E') &&
(x == '+' || x == '-') && (x == '+' || x == '-') &&
isDigit y = isDigit y =
case identifyNum (y:xs) NumExponentialComponent case identifyNum (y:xs) NumExponentialComponent of
of (ys, Just zs) -> (ys, Just (w:x:zs)) Just (ys, zs) -> Just (ys, w:x:zs)
z -> z z -> z
| (w == 'e' || w == 'E') && identifyNum (x:y:xs) NumFractionalComponent | (x == 'e' || x == 'E') &&
isDigit x = isDigit y =
case identifyNum (x:y:xs) NumExponentialComponent case identifyNum (y:xs) NumExponentialComponent of
of (ys, Just zs) -> (ys, Just (w:zs)) Just (ys, zs) -> Just (ys, x:zs)
z -> z z -> z
identifyNum (x:y:"") NumFractionalComponent | (x == 'e' || x == 'E') && identifyNum (x:xs) NumFractionalComponent | isDigit x = case identifyNum xs NumFractionalComponent of
isDigit y = ("", Just (x:y:"")) Just (ys, zs) -> Just (ys, x:zs)
| isDigit x && isDigit y = ("", Just (x:y:""))
| isDigit x = (y:"", Just (x:""))
| otherwise = (x:y:"", Just "")
identifyNum (x:xs) NumFractionalComponent | isDigit x = case identifyNum xs NumFractionalComponent
of (ys, Just zs) -> (ys, Just (x:zs))
y -> y y -> y
| otherwise = (x:xs, Just "") | otherwise = Nothing
identifyNum (x:xs) NumExponentialComponent | isDigit x = case identifyNum xs NumExponentialComponent identifyNum (x:xs) NumExponentialComponent | isDigit x = case identifyNum xs NumExponentialComponent of
of (ys, Just zs) -> (ys, Just (x:zs)) Just (ys, zs) -> Just (ys, x:zs)
y -> y y -> y
| otherwise = (x:xs, Just "") | otherwise = Nothing
identifyNum [] _ = ("", Just "")
lexStr :: String -> [Token] lexStr :: String -> [Token]
lexStr "" = [] lexStr "" = []
@ -167,6 +158,6 @@ lexStr ('\r':xs) = lexStr xs
lexStr (x:xs) | isDigit x || lexStr (x:xs) | isDigit x ||
x == '-' = x == '-' =
case identifyNum (x:xs) NumStart of case identifyNum (x:xs) NumStart of
(ys, Nothing) -> TokErr "Failed to parse number" : lexStr ys Nothing -> case identifySymbol (x:xs) of (rest, symb) -> TokSymbol symb : lexStr rest
(ys, Just b) -> TokNumber (read b) : lexStr ys Just (ys, b) -> TokNumber (read b) : lexStr ys
lexStr xs = case identifySymbol xs of (ys, zs) -> TokSymbol zs : lexStr ys lexStr xs = case identifySymbol xs of (ys, zs) -> TokSymbol zs : lexStr ys

View file

@ -4,68 +4,73 @@ import Test.HUnit
import Lib import Lib
testPars :: Test testPars :: Test
testPars = TestCase (assertEqual "() -> [TokLPar,TokRPar]" testPars = TestCase (assertEqual "recognize parentheses"
[Lib.TokLPar, Lib.TokRPar] [Lib.TokLPar, Lib.TokRPar]
(lexStr "()")) (lexStr "()"))
testSymbol :: Test testSymbol :: Test
testSymbol = TestCase (assertEqual "hello world -> [TokSymbol \"hello\", TokSymbol \"world\"]" testSymbol = TestCase (assertEqual "recognize symbols"
[Lib.TokSymbol "hello", Lib.TokSymbol "world"] [Lib.TokSymbol "hello", Lib.TokSymbol "world"]
(lexStr "hello world")) (lexStr "hello world"))
testQuotedPars :: Test testQuotedPars :: Test
testQuotedPars = TestCase (assertEqual "'() -> [TokQuote,TokLPar,TokRPar]" testQuotedPars = TestCase (assertEqual "recognize quoted parentheses"
[Lib.TokQuote, Lib.TokLPar, Lib.TokRPar] [Lib.TokQuote, Lib.TokLPar, Lib.TokRPar]
(lexStr "'()")) (lexStr "'()"))
testLoneQuoteIsSymbol :: Test testLoneQuoteIsSymbol :: Test
testLoneQuoteIsSymbol = TestCase (assertEqual "' ' -> [TokSymbol \"'\", TokSymbol \"'\"]" testLoneQuoteIsSymbol = TestCase (assertEqual "a quote on its own is just a symbol"
[Lib.TokSymbol "'", Lib.TokSymbol "'"] [Lib.TokSymbol "'", Lib.TokSymbol "'"]
(lexStr "' '")) (lexStr "' '"))
testCharacter :: Test testCharacter :: Test
testCharacter = TestCase (assertEqual "'a' -> [TokCharacter 'a']" testCharacter = TestCase (assertEqual "recognize characters"
[Lib.TokCharacter 'a'] [Lib.TokCharacter 'a']
(lexStr "'a'")) (lexStr "'a'"))
testQuotedSymbol :: Test testQuotedSymbol :: Test
testQuotedSymbol = TestCase (assertEqual "'hello -> [TokQuote,TokSymbol \"hello\"]" testQuotedSymbol = TestCase (assertEqual "recognize quoted symbols"
[Lib.TokQuote, Lib.TokSymbol "hello"] [Lib.TokQuote, Lib.TokSymbol "hello"]
(lexStr "'hello")) (lexStr "'hello"))
testUnicodeCharacter :: Test testUnicodeCharacter :: Test
testUnicodeCharacter = TestCase (assertEqual "'📎' -> [TokCharacter '📎']" testUnicodeCharacter = TestCase (assertEqual "recognize unicode characters"
[Lib.TokCharacter '📎'] [Lib.TokCharacter '📎']
(lexStr "'📎'")) (lexStr "'📎'"))
testSimpleIntegers :: Test testSimpleIntegers :: Test
testSimpleIntegers = TestCase (assertEqual "42 -> [TokNumber 42]" testSimpleIntegers = TestCase (assertEqual "recognize basic integers"
[Lib.TokNumber 42] [Lib.TokNumber 42]
(lexStr "42")) (lexStr "42\t"))
testFractionalNumber :: Test testFractionalNumber :: Test
testFractionalNumber = TestCase (assertEqual "42.123 -> [TokNumber 42.123]" testFractionalNumber = TestCase (assertEqual "recognize numbers with fractional components"
[Lib.TokNumber 42.123] [Lib.TokNumber 42.123]
(lexStr "42.123 ")) (lexStr "42.123 "))
testExponentialInteger :: Test testExponentialInteger :: Test
testExponentialInteger = TestCase (assertEqual "42e123 -> [TokNumber 42e123]" testExponentialInteger = TestCase (assertEqual "recognize integers with exponents"
[Lib.TokNumber 42e123] [Lib.TokNumber 42e123]
(lexStr "42e123")) (lexStr "42e123\n"))
testExponential :: Test testExponential :: Test
testExponential = TestCase (assertEqual "42.03E+12 -> [TokNumber 42.03e12]" testExponential = TestCase (assertEqual "recognize fractional numbers with exponents"
[Lib.TokNumber 42.03e12] [Lib.TokNumber 42.03e12]
(lexStr "42.03E+12")) (lexStr "42.03E+12\r"))
testNegativeNumber :: Test testNegativeNumber :: Test
testNegativeNumber = TestCase (assertEqual "-42.03e-12 -> [TokNumber -42.03e-12]" testNegativeNumber = TestCase (assertEqual "recognize negative numbers"
[Lib.TokNumber (-42.03e-12)] [Lib.TokNumber (-42.03e-12)]
(lexStr "-42.03e-12")) (lexStr "-42.03e-12"))
testMalformedNumbersAreSymbols :: Test
testMalformedNumbersAreSymbols = TestCase (assertEqual "recognize symbols that arise from invalid numbers"
[Lib.TokSymbol "-", Lib.TokSymbol "123abc", Lib.TokSymbol "-123.01f23"]
(lexStr "- 123abc -123.01f23"))
testStringsAreRecognized :: Test testStringsAreRecognized :: Test
testStringsAreRecognized = TestCase (assertEqual testStringsAreRecognized = TestCase (assertEqual
"can strings be recognized?" "recognize strings"
[Lib.TokStr "\43981asdf\b\f\r\n\t\\/"] [Lib.TokStr "\43981asdf\b\f\r\n\t\\/"]
(lexStr "\"\\uabcdasdf\\b\\f\\r\\n\\t\\\\\\/\"")) (lexStr "\"\\uabcdasdf\\b\\f\\r\\n\\t\\\\\\/\""))
@ -89,6 +94,7 @@ tests = TestList [
TestLabel "test integers with exponential component are recognized" testExponentialInteger, TestLabel "test integers with exponential component are recognized" testExponentialInteger,
TestLabel "test exponential number is recognized" testExponential, TestLabel "test exponential number is recognized" testExponential,
TestLabel "test negative numbers are recognized" testNegativeNumber, TestLabel "test negative numbers are recognized" testNegativeNumber,
TestLabel "test malformed numbers count as symbols" testMalformedNumbersAreSymbols,
TestLabel "test strings are recognized" testStringsAreRecognized, TestLabel "test strings are recognized" testStringsAreRecognized,
TestLabel "test unclosed strings are symbols" testUnclosedStringsAreSymbols TestLabel "test unclosed strings are symbols" testUnclosedStringsAreSymbols
] ]