From 0c7ae37e3f482255c80796414b708c8bc9f6ee87 Mon Sep 17 00:00:00 2001
From: jjanzen <jjanzen@jjanzen.ca>
Date: Wed, 26 Feb 2025 23:23:37 -0600
Subject: initial commit

---
 src/parser.zig | 535 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 535 insertions(+)
 create mode 100644 src/parser.zig

(limited to 'src/parser.zig')

diff --git a/src/parser.zig b/src/parser.zig
new file mode 100644
index 0000000..253ddf6
--- /dev/null
+++ b/src/parser.zig
@@ -0,0 +1,535 @@
+const std = @import("std");
+const opcodes = @import("opcodes.zig");
+
+/// A symbol's value can be pure or point to a register
+const SymbolValueType = enum {
+    pure,
+    register,
+};
+const SymbolValue = union {
+    pure: u64,
+    register: u8,
+};
+
+/// A constant can be a number of a string
+const ConstantType = enum {
+    number,
+    string,
+};
+const ConstantValue = union(ConstantType) {
+    number: u64,
+    string: []const u8,
+};
+
+/// The Parser reads a provided input and assembles it into MMIX object code
+pub const Parser = struct {
+    allocator: std.mem.Allocator,
+    input: []const u8,
+    location: u64,
+    ch_pos: usize,
+    symbols: std.StringHashMap(SymbolValue),
+    object: std.ArrayList(u8),
+
+    /// Test is a character is whitespace
+    /// Note that newlines are special and not included in this implementation.
+    fn isWhitespace(ch: u8) bool {
+        return ch == ' ' or ch == '\t' or ch == '\r';
+    }
+
+    /// Test if a character is a letter
+    /// Note that underscores are letters for the purposes of symbol recognition.
+    fn isLetter(ch: u8) bool {
+        return ch == '_' or (ch >= 'a' and ch <= 'z') or (ch >= 'A' and ch <= 'Z');
+    }
+
+    /// Test if a character is a decimal digit
+    fn isDecimal(ch: u8) bool {
+        return ch >= '0' and ch <= '9';
+    }
+
+    /// Test if a character is a hexadecimal digit
+    fn isHexadecimal(ch: u8) bool {
+        return isDecimal(ch) or (ch >= 'a' and ch <= 'f') or (ch >= 'A' and ch <= 'F');
+    }
+
+    /// Test if a character is a symbol character
+    /// Note that all valid unicode characters larger than 126 are also valid symbol characters.
+    fn isSymbolChar(ch: u8) bool {
+        return isLetter(ch) or isDecimal(ch) or ch > 126;
+    }
+
+    /// Get a byte from the input at a specified location
+    /// Return 0 if the requested byte is out of range
+    fn getByte(self: *Parser, pos: usize) u8 {
+        if (pos < self.input.len) {
+            return self.input[pos];
+        }
+        return 0;
+    }
+
+    /// Move the cursor forward until it does not point at whitespace
+    fn skipWhitespace(self: *Parser) void {
+        while (isWhitespace(self.getByte(self.ch_pos))) {
+            self.ch_pose += 1;
+        }
+    }
+
+    /// Determine whether the cursor points at a valid integer in base 10
+    /// Move the cursor past the integer and return it
+    fn identifyDecimal(self: *Parser) !u64 {
+        const start = self.ch_pos;
+        while (isDecimal(self.getByte(self.ch_pos))) {
+            self.ch_pos += 1;
+        }
+        const end = self.ch_pos;
+
+        return std.fmt.parseInt(u64, self.input[start..end], 10) catch return error.NoDecimal;
+    }
+
+    /// Determine whether the cursor points at a valid integer in base 16
+    /// Base 16 is identified by a number starting with #
+    /// Move the cursor past the integer and return it
+    fn identifyHexadecimal(self: *Parser) !u64 {
+        if (self.getByte(self.ch_pos) != '#') {
+            return error.NoHexadecimal;
+        }
+
+        self.ch_pos += 1;
+
+        const start = self.ch_pos;
+        while (isHexadecimal(self.getByte(self.ch_pos))) {
+            self.ch_pos += 1;
+        }
+        const end = self.ch_pos;
+
+        return std.fmt.parseInt(u64, self.input[start..end], 16) catch return error.NoHexadecimal;
+    }
+
+    /// Determine whether the cursor points at a valid unicode character wrapped in single quotes
+    /// Move the cursor past the closing quote and return the character
+    fn identifyChar(self: *Parser) ![]const u8 {
+        if (self.getByte(self.ch_pos) != '\'') {
+            return error.NoChar;
+        }
+
+        self.ch_pos += 1;
+
+        const start = self.ch_pos;
+        while (self.getByte(self.ch_pos) != 0 and self.getByte(self.ch_pos) != '\'') {
+            self.ch_pos += 1;
+
+            if (self.ch_pos - start > 4) {
+                return error.NoChar;
+            }
+        }
+        const end = self.ch_pos;
+        self.ch_pos += 1;
+
+        if (end <= start) {
+            return error.NoChar;
+        }
+
+        const view = std.unicode.Utf8View.init(self.input[start..end]) catch return error.NoChar;
+        var iter = view.iterator();
+        var count: u8 = 0;
+        while (iter.nextCodepoint()) |_| {
+            count += 1;
+            if (count > 1) {
+                return error.NoChar;
+            }
+        }
+        if (count != 1) {
+            return error.NoChar;
+        }
+
+        return self.input[start..end];
+    }
+
+    /// Determine whether the cursor points at a valid string wrapped in double quotes
+    /// Note that a string has at least one character in it and that it cannot have " or newlines in it
+    /// Move the cursor past the string and return the string
+    fn identifyString(self: *Parser) ![]const u8 {
+        if (self.getByte(self.ch_pos) != '"') {
+            return error.NoString;
+        }
+
+        self.ch_pos += 1;
+        const start = self.ch_pos;
+        while (self.getByte(self.ch_pos) != 0 and self.getByte(self.ch_pos) != '"') {
+            if (self.getByte(self.ch_pos) == '\n') {
+                return error.NoString;
+            }
+            self.ch_pos += 1;
+        }
+        const end = self.ch_pos;
+        if (self.getByte(self.ch_pos) == '"') {
+            self.ch_pos += 1;
+        }
+
+        if (end <= start) {
+            return error.NoString;
+        }
+
+        return self.input[start..end];
+    }
+
+    /// Determine whether the cursor points at a valid constant
+    /// The constant may be a string or a number
+    /// Move the cursor past the constant and return it
+    fn identifyConstant(self: *Parser) !ConstantValue {
+        switch (self.getByte(self.ch_pos)) {
+            '0'...'9' => {
+                const number = try identifyDecimal(self);
+                return ConstantValue{ .number = number };
+            },
+            '#' => {
+                const number = try identifyHexadecimal(self);
+                return ConstantValue{ .number = number };
+            },
+            '\'' => {
+                const string = try identifyChar(self);
+                return ConstantValue{ .string = string };
+            },
+            '"' => {
+                const string = try identifyString(self);
+                return ConstantValue{ .string = string };
+            },
+            else => return error.NoConstant,
+        }
+    }
+
+    /// Determine whether the cursor points at a symbol
+    /// A symbol starts with a letter and only has symbol characters after that point
+    /// There is an exception that there are 30 special symbols of the form xH, xF, and xB where x is a single decimal digit
+    /// Move the cursor past the symbol and return its name
+    fn identifySymbol(self: *Parser) ![]const u8 {
+        const start = self.ch_pos;
+        if ((isLetter(self.getByte(self.ch_pos)) or self.getByte(self.ch_pos) == '_')) {
+            self.ch_pos += 1;
+
+            while (isSymbolChar(self.getByte(self.ch_pos))) {
+                self.ch_pos += 1;
+            }
+        } else if (isDecimal(self.getByte(self.ch_pos)) and
+            (self.getByte(self.ch_pos + 1) == 'H' or self.getByte(self.ch_pos + 1) == 'F' or self.getByte(self.ch_pos + 1) == 'B'))
+        {
+            self.ch_pos += 2;
+            return self.input[self.ch_pos - 2 .. self.ch_pos];
+        }
+        const end = self.ch_pos;
+
+        if (end > start) {
+            return self.input[start..end];
+        }
+
+        return error.NoSymbol;
+    }
+
+    /// Determine whether the cursor points at a valid opcode or pseudo operation
+    /// An opcode consists solely of symbol characters (letters and numbers in fact)
+    /// Move the cursor past the opcode and return it
+    fn identifyOperation(self: *Parser) !opcodes.Operation {
+        const start = self.ch_pos;
+        while (isSymbolChar(self.getByte(self.ch_pos))) {
+            self.ch_pos += 1;
+        }
+        const end = self.ch_pos;
+
+        return opcodes.parseOp(self.allocator, self.input[start..end]);
+    }
+
+    pub fn init(allocator: std.mem.Allocator, input: []const u8) Parser {
+        return Parser{
+            .allocator = allocator,
+            .input = input,
+            .location = 0,
+            .ch_pos = 0,
+            .symbols = std.StringHashMap(SymbolValue).init(allocator),
+            .object = std.ArrayList(u8).init(allocator),
+        };
+    }
+
+    pub fn deinit(self: *Parser) void {
+        self.symbols.deinit();
+        self.object.deinit();
+    }
+};
+
+test "normal ascii characters are recognized as symbol chars" {
+    const chars = "qwertyuiopasdfghjklzxcvbnm1234567890QWERTYUIOPASDFGHJKLZXCVBNM_";
+
+    for (chars) |c| {
+        try std.testing.expect(Parser.isSymbolChar(c));
+    }
+}
+
+test "large unicode characters are recognized as symbol chars" {
+    const cuneiform = "𒀀𒀁𒀂𒀃𒀄𒀅𒀆𒀇𒀈𒀉𒀊𒀋𒀌𒀍𒀎𒀏𒀐𒀑𒀒𒀓𒀔𒀕𒀖𒀗𒀘𒀙𒀚𒀛𒀜𒀝𒀞𒀟𒀠𒀡𒀢𒀣𒀤𒀥𒀦𒀧𒀨𒀩𒀪𒀫𒀬𒀭𒀮𒀯𒀰𒀱𒈷𒌄";
+
+    for (cuneiform) |c| {
+        try std.testing.expect(Parser.isSymbolChar(c));
+    }
+}
+
+test "non-symbol characters are detected" {
+    const chars = "\n\r \t!@#$%^&*()-=+[]{}\\|;:'\"/?,.<>`~";
+
+    for (chars) |c| {
+        try std.testing.expect(!Parser.isSymbolChar(c));
+    }
+}
+
+test "symbols are identified" {
+    const test_cases = [_][]const u8{
+        "_asdf$%@",
+        "ASFLKJ3332__q5 ;asdf;lk",
+        "asdf𒀤𒀥𒀦\nalsfkd",
+        "2H",
+        "5F",
+        "0B",
+    };
+
+    const expected = [_][]const u8{
+        "_asdf",
+        "ASFLKJ3332__q5",
+        "asdf𒀤𒀥𒀦",
+        "2H",
+        "5F",
+        "0B",
+    };
+
+    for (0..6) |i| {
+        var parser = Parser.init(std.testing.allocator, test_cases[i]);
+        const symbol = try parser.identifySymbol();
+        try std.testing.expect(std.mem.eql(u8, expected[i], symbol));
+        parser.deinit();
+    }
+}
+
+test "no symbols are found successfully" {
+    const test_cases = [_][]const u8{
+        " _asdf",
+        ";ASFLKJ3332__q5",
+        "\nasdf𒀤𒀥𒀦",
+    };
+
+    for (test_cases) |case| {
+        var parser = Parser.init(std.testing.allocator, case);
+        const symbol = parser.identifySymbol();
+        try std.testing.expectEqual(error.NoSymbol, symbol);
+        parser.deinit();
+    }
+}
+
+test "opcodes are identified" {
+    const test_cases = [_][]const u8{
+        "2ADDU%aldfk",
+        "GO ",
+        "ADD\taksfdjas",
+        "GREG\n",
+        "IS",
+    };
+
+    const expected = [_]opcodes.Operation{
+        opcodes.Operation{ .opcode = opcodes.Opcode._2ADDU },
+        opcodes.Operation{ .opcode = opcodes.Opcode.GO },
+        opcodes.Operation{ .opcode = opcodes.Opcode.ADD },
+        opcodes.Operation{ .pseudo_op = opcodes.PseudoOp.GREG },
+        opcodes.Operation{ .pseudo_op = opcodes.PseudoOp.IS },
+    };
+
+    for (0..5) |i| {
+        var parser = Parser.init(std.testing.allocator, test_cases[i]);
+        const op = try parser.identifyOperation();
+        try std.testing.expectEqual(expected[i], op);
+        parser.deinit();
+    }
+}
+
+test "no opcodes are found successfully" {
+    const test_cases = [_][]const u8{
+        " _asdf",
+        ";ASFLKJ3332__q5",
+        "\nasdf𒀤𒀥𒀦",
+        "asdfklajsdfl",
+    };
+
+    for (test_cases) |case| {
+        var parser = Parser.init(std.testing.allocator, case);
+        const symbol = parser.identifyOperation();
+        try std.testing.expectEqual(error.NoOpcode, symbol);
+        parser.deinit();
+    }
+}
+
+test "decimals are recognized" {
+    const test_cases = [_][]const u8{
+        "012314aslkfdj",
+        "1234567890  43",
+        "1234567891234567889\n123124",
+    };
+
+    const expected = [_]u64{
+        12314,
+        1234567890,
+        1234567891234567889,
+    };
+
+    for (0..3) |i| {
+        var parser = Parser.init(std.testing.allocator, test_cases[i]);
+        const symbol = try parser.identifyDecimal();
+        try std.testing.expectEqual(expected[i], symbol);
+        parser.deinit();
+    }
+}
+
+test "malformed decimals are not recognized" {
+    const test_cases = [_][]const u8{
+        "",
+        "asdf123",
+        "  123",
+        "12345678901234567890123456789012345678901234567890",
+    };
+
+    for (test_cases) |case| {
+        var parser = Parser.init(std.testing.allocator, case);
+        const symbol = parser.identifyDecimal();
+        try std.testing.expectEqual(error.NoDecimal, symbol);
+        parser.deinit();
+    }
+}
+
+test "hexadecimals are recognized" {
+    const test_cases = [_][]const u8{
+        "#012314saslkfdj",
+        "#1234567890abcdef  43",
+        "#1234567891\n123124",
+    };
+
+    const expected = [_]u64{
+        0x12314,
+        0x1234567890abcdef,
+        0x1234567891,
+    };
+
+    for (0..3) |i| {
+        var parser = Parser.init(std.testing.allocator, test_cases[i]);
+        const symbol = try parser.identifyHexadecimal();
+        try std.testing.expectEqual(expected[i], symbol);
+        parser.deinit();
+    }
+}
+
+test "malformed hexadecimals are not recognized" {
+    const test_cases = [_][]const u8{
+        "",
+        "sasdf123",
+        "  123",
+        "#12345678901234567890123456789012345678901234567890",
+        "#",
+    };
+
+    for (test_cases) |case| {
+        var parser = Parser.init(std.testing.allocator, case);
+        const symbol = parser.identifyHexadecimal();
+        try std.testing.expectEqual(error.NoHexadecimal, symbol);
+        parser.deinit();
+    }
+}
+
+test "characters are recognized" {
+    const test_cases = [_][]const u8{
+        "'a'",
+        "'1'",
+        "'𒀤'",
+    };
+
+    const expected = [_][]const u8{
+        "a",
+        "1",
+        "𒀤",
+    };
+
+    for (0..3) |i| {
+        var parser = Parser.init(std.testing.allocator, test_cases[i]);
+        const symbol = try parser.identifyChar();
+        try std.testing.expect(std.mem.eql(u8, expected[i], symbol));
+        parser.deinit();
+    }
+}
+
+test "invalid unicode sequences are not characters" {
+    const test_cases = [_][]const u8{
+        "'asdf'",
+        "'asdfg'",
+        "'as'",
+        "''",
+        "'",
+    };
+
+    for (test_cases) |case| {
+        var parser = Parser.init(std.testing.allocator, case);
+        const symbol = parser.identifyChar();
+        try std.testing.expectEqual(error.NoChar, symbol);
+        parser.deinit();
+    }
+}
+
+test "strings are recognized" {
+    const test_cases = [_][]const u8{
+        "\" \"",
+        "\"aslkdfjlaskdfj lkasjflkasjdflaksjfd''12309)($)(#$[[]𒀤\"",
+    };
+
+    const expected = [_][]const u8{
+        " ",
+        "aslkdfjlaskdfj lkasjflkasjdflaksjfd''12309)($)(#$[[]𒀤",
+    };
+
+    for (0..2) |i| {
+        var parser = Parser.init(std.testing.allocator, test_cases[i]);
+        const symbol = try parser.identifyString();
+        try std.testing.expect(std.mem.eql(u8, expected[i], symbol));
+        parser.deinit();
+    }
+}
+
+test "invalid strings are not recognized" {
+    const test_cases = [_][]const u8{
+        "\"\"",
+        "\"",
+        "\"\n\"",
+    };
+
+    for (test_cases) |case| {
+        var parser = Parser.init(std.testing.allocator, case);
+        const symbol = parser.identifyString();
+        try std.testing.expectEqual(error.NoString, symbol);
+        parser.deinit();
+    }
+}
+
+test "constants are recognized" {
+    const test_cases = [_][]const u8{
+        "1234567890 1234",
+        "#1234567890abcdef;%#*(",
+        "'a'uuuuuu",
+        "\"hello \"world",
+    };
+
+    const expected = [_]ConstantValue{
+        ConstantValue{ .number = 1234567890 },
+        ConstantValue{ .number = 0x1234567890abcdef },
+        ConstantValue{ .string = "a" },
+        ConstantValue{ .string = "hello " },
+    };
+
+    for (0..4) |i| {
+        var parser = Parser.init(std.testing.allocator, test_cases[i]);
+        const symbol = try parser.identifyConstant();
+        switch (symbol) {
+            .number => try std.testing.expectEqual(expected[i].number, symbol.number),
+            .string => try std.testing.expect(std.mem.eql(u8, expected[i].string, symbol.string)),
+        }
+    }
+}
-- 
cgit v1.2.3