// Source: src/parser.zig as added by commit 0c7ae37e3f482255c80796414b708c8bc9f6ee87
// ("initial commit", jjanzen, 2025-02-26).

const std = @import("std");
const opcodes = @import("opcodes.zig");

/// A symbol's value can be pure or point to a register
const SymbolValueType = enum {
    pure,
    register,
};

/// The value bound to a symbol, tagged with `SymbolValueType` so the active
/// field can be inspected with a `switch`.
const SymbolValue = union(SymbolValueType) {
    pure: u64,
    register: u8,
};

/// A constant can be a number or a string
const ConstantType = enum {
    number,
    string,
};
const ConstantValue = union(ConstantType) {
    number: u64,
    string: []const u8,
};

/// The Parser reads a provided input and assembles it into MMIX object code
pub const Parser = struct {
    /// Allocator backing `symbols` and `object`; also handed to `opcodes.parseOp`.
    allocator: std.mem.Allocator,
    /// The text being parsed. Slices returned by the `identify*` functions
    /// point into this buffer, so it must outlive them.
    input: []const u8,
    /// Initialised to 0; not read or written by any code in this file yet.
    /// NOTE(review): presumably the current MMIX assembly location -- confirm.
    location: u64,
    /// Cursor: byte index into `input` of the next unread byte.
    ch_pos: usize,
    /// Symbol table keyed by symbol name. Not populated by any code in this file yet.
    symbols: std.StringHashMap(SymbolValue),
    /// Byte buffer; nothing in this file appends to it yet.
    /// NOTE(review): presumably the assembled object code -- confirm.
    object: std.ArrayList(u8),

    /// Test if a character is whitespace
    /// Note that newlines are special and not included in this implementation.
    fn isWhitespace(ch: u8) bool {
        return ch == ' ' or ch == '\t' or ch == '\r';
    }

    /// Test if a character is a letter
    /// Note that underscores are letters for the purposes of symbol recognition.
    fn isLetter(ch: u8) bool {
        return ch == '_' or (ch >= 'a' and ch <= 'z') or (ch >= 'A' and ch <= 'Z');
    }

    /// Test if a character is a decimal digit
    fn isDecimal(ch: u8) bool {
        return ch >= '0' and ch <= '9';
    }

    /// Test if a character is a hexadecimal digit
    fn isHexadecimal(ch: u8) bool {
        return isDecimal(ch) or (ch >= 'a' and ch <= 'f') or (ch >= 'A' and ch <= 'F');
    }

    /// Test if a character is a symbol character
    /// The check is byte-wise: any byte above 126 is accepted, which covers
    /// every byte of a multi-byte UTF-8 sequence.
    fn isSymbolChar(ch: u8) bool {
        return isLetter(ch) or isDecimal(ch) or ch > 126;
    }

    /// Get a byte from the input at a specified location
    /// Return 0 if the requested byte is out of range
    fn getByte(self: *Parser, pos: usize) u8 {
        if (pos < self.input.len) {
            return self.input[pos];
        }
        return 0;
    }

    /// Move the cursor forward until it does not point at whitespace
    /// Newlines are not whitespace here (see `isWhitespace`), so the cursor stops on them.
    fn skipWhitespace(self: *Parser) void {
        while (isWhitespace(self.getByte(self.ch_pos))) {
            self.ch_pos += 1;
        }
    }

    /// Determine whether the cursor points at a valid integer in base 10
    /// Move the cursor past the integer and return it
    /// Returns error.NoDecimal when there are no digits or the value does not fit in a u64.
    fn identifyDecimal(self: *Parser) !u64 {
        const start = self.ch_pos;
        while (isDecimal(self.getByte(self.ch_pos))) {
            self.ch_pos += 1;
        }
        const end = self.ch_pos;

        // parseInt rejects the empty slice and overflowing values.
        return std.fmt.parseInt(u64, self.input[start..end], 10) catch return error.NoDecimal;
    }

    /// Determine whether the cursor points at a valid integer in base 16
    /// Base 16 is identified by a number starting with #
    /// Move the cursor past the integer and return it
    /// Returns error.NoHexadecimal when the '#' is missing, no hex digits
    /// follow it, or the value does not fit in a u64.
    fn identifyHexadecimal(self: *Parser) !u64 {
        if (self.getByte(self.ch_pos) != '#') {
            return error.NoHexadecimal;
        }

        self.ch_pos += 1;

        const start = self.ch_pos;
        while (isHexadecimal(self.getByte(self.ch_pos))) {
            self.ch_pos += 1;
        }
        const end = self.ch_pos;

        return std.fmt.parseInt(u64, self.input[start..end], 16) catch return error.NoHexadecimal;
    }

    /// Determine whether the cursor points at a valid unicode character wrapped in single quotes
    /// Move the cursor past the closing quote and return the character
    /// The returned slice holds the UTF-8 bytes of exactly one code point and points into `input`.
    /// Returns error.NoChar when either quote is missing, the content is
    /// empty, is not valid UTF-8, or holds more than one code point.
    fn identifyChar(self: *Parser) ![]const u8 {
        if (self.getByte(self.ch_pos) != '\'') {
            return error.NoChar;
        }

        self.ch_pos += 1;

        const start = self.ch_pos;
        while (self.getByte(self.ch_pos) != 0 and self.getByte(self.ch_pos) != '\'') {
            self.ch_pos += 1;

            // A single UTF-8 encoded code point is at most 4 bytes long.
            if (self.ch_pos - start > 4) {
                return error.NoChar;
            }
        }
        const end = self.ch_pos;

        // The scan also stops at end of input (getByte yields 0 there). Without
        // this check an unterminated literal would be accepted and the cursor
        // would step past the end of `input`.
        if (self.getByte(end) != '\'') {
            return error.NoChar;
        }
        self.ch_pos += 1;

        if (end <= start) {
            return error.NoChar;
        }

        const view = std.unicode.Utf8View.init(self.input[start..end]) catch return error.NoChar;
        var iter = view.iterator();
        var count: u8 = 0;
        while (iter.nextCodepoint()) |_| {
            count += 1;
            if (count > 1) {
                return error.NoChar;
            }
        }
        if (count != 1) {
            return error.NoChar;
        }

        return self.input[start..end];
    }

    /// Determine whether the cursor points at a valid string wrapped in double quotes
    /// Note that a string has at least one character in it and that it cannot have " or newlines in it
    /// Move the cursor past the string and return the string
    /// The returned slice excludes the quotes and points into `input`.
    /// Returns error.NoString when either quote is missing, the string is
    /// empty, or a newline appears before the closing quote.
    fn identifyString(self: *Parser) ![]const u8 {
        if (self.getByte(self.ch_pos) != '"') {
            return error.NoString;
        }

        self.ch_pos += 1;
        const start = self.ch_pos;
        while (self.getByte(self.ch_pos) != 0 and self.getByte(self.ch_pos) != '"') {
            if (self.getByte(self.ch_pos) == '\n') {
                return error.NoString;
            }
            self.ch_pos += 1;
        }
        const end = self.ch_pos;

        // The scan also stops at end of input; a string without its closing
        // quote is not "wrapped in double quotes" and is rejected.
        if (self.getByte(end) != '"') {
            return error.NoString;
        }
        self.ch_pos += 1;

        if (end <= start) {
            return error.NoString;
        }

        return self.input[start..end];
    }

    /// Determine whether the cursor points at a valid constant
    /// The constant may be a string or a number
    /// Move the cursor past the constant and return it
    /// Dispatches on the byte under the cursor: a digit selects a decimal,
    /// '#' a hexadecimal, a single quote a character and a double quote a string.
    /// Returns error.NoConstant for any other byte; errors from the selected
    /// sub-parser are propagated unchanged.
    fn identifyConstant(self: *Parser) !ConstantValue {
        switch (self.getByte(self.ch_pos)) {
            '0'...'9' => return ConstantValue{ .number = try self.identifyDecimal() },
            '#' => return ConstantValue{ .number = try self.identifyHexadecimal() },
            '\'' => return ConstantValue{ .string = try self.identifyChar() },
            '"' => return ConstantValue{ .string = try self.identifyString() },
            else => return error.NoConstant,
        }
    }

    /// Determine whether the cursor points at a symbol
    /// A symbol starts with a letter and only has symbol characters after that point
    /// There is an exception that there are 30 special symbols of the form xH, xF, and xB where x is a single decimal digit
    /// Move the cursor past the symbol and return its name
    /// Returns error.NoSymbol (cursor unchanged) when neither form matches.
    fn identifySymbol(self: *Parser) ![]const u8 {
        const start = self.ch_pos;
        const first = self.getByte(start);

        // isLetter already accepts '_', so no separate underscore check is needed.
        if (isLetter(first)) {
            self.ch_pos += 1;
            while (isSymbolChar(self.getByte(self.ch_pos))) {
                self.ch_pos += 1;
            }
            return self.input[start..self.ch_pos];
        }

        if (isDecimal(first)) {
            const suffix = self.getByte(start + 1);
            if (suffix == 'H' or suffix == 'F' or suffix == 'B') {
                self.ch_pos += 2;
                return self.input[start..self.ch_pos];
            }
        }

        return error.NoSymbol;
    }

    /// Determine whether the cursor points at a valid opcode or pseudo operation
    /// An opcode consists solely of symbol characters (letters and numbers in fact)
    /// Move the cursor past the opcode and return it
    /// The run of symbol characters under the cursor is handed to
    /// `opcodes.parseOp`; whatever that returns (value or error) is returned as is.
    fn identifyOperation(self: *Parser) !opcodes.Operation {
        const start = self.ch_pos;
        while (isSymbolChar(self.getByte(self.ch_pos))) {
            self.ch_pos += 1;
        }
        const end = self.ch_pos;

        return opcodes.parseOp(self.allocator, self.input[start..end]);
    }

    /// Create a parser over `input` with the cursor at the first byte.
    /// `input` is borrowed, not copied. Call `deinit` to release the symbol
    /// table and object buffer.
    pub fn init(allocator: std.mem.Allocator, input: []const u8) Parser {
        return Parser{
            .allocator = allocator,
            .input = input,
            .location = 0,
            .ch_pos = 0,
            .symbols = std.StringHashMap(SymbolValue).init(allocator),
            .object = std.ArrayList(u8).init(allocator),
        };
    }

    /// Release the symbol table and the object buffer.
    pub fn deinit(self: *Parser) void {
        self.symbols.deinit();
        self.object.deinit();
    }
};

test "normal ascii characters are recognized as symbol chars" {
    const chars = "qwertyuiopasdfghjklzxcvbnm1234567890QWERTYUIOPASDFGHJKLZXCVBNM_";

    for (chars) |c| {
        try std.testing.expect(Parser.isSymbolChar(c));
    }
}

test "large unicode characters are recognized as symbol chars" {
    const cuneiform = "𒀀𒀁𒀂𒀃𒀄𒀅𒀆𒀇𒀈𒀉𒀊𒀋𒀌𒀍𒀎𒀏𒀐𒀑𒀒𒀓𒀔𒀕𒀖𒀗𒀘𒀙𒀚𒀛𒀜𒀝𒀞𒀟𒀠𒀡𒀢𒀣𒀤𒀥𒀦𒀧𒀨𒀩𒀪𒀫𒀬𒀭𒀮𒀯𒀰𒀱𒈷𒌄";

    for (cuneiform) |c| {
        try std.testing.expect(Parser.isSymbolChar(c));
    }
}

test "non-symbol characters are detected" {
    const chars = "\n\r \t!@#$%^&*()-=+[]{}\\|;:'\"/?,.<>`~";

    for (chars) |c| {
        try std.testing.expect(!Parser.isSymbolChar(c));
    }
}

test "whitespace is skipped but newlines are not" {
    var parser = Parser.init(std.testing.allocator, " \t\r x");
    defer parser.deinit();
    parser.skipWhitespace();
    try std.testing.expectEqual(@as(usize, 4), parser.ch_pos);

    var at_newline = Parser.init(std.testing.allocator, "\n x");
    defer at_newline.deinit();
    at_newline.skipWhitespace();
    try std.testing.expectEqual(@as(usize, 0), at_newline.ch_pos);
}

test "symbols are identified" {
    const test_cases = [_][]const u8{
        "_asdf$%@",
        "ASFLKJ3332__q5 ;asdf;lk",
        "asdf𒀤𒀥𒀦\nalsfkd",
        "2H",
        "5F",
        "0B",
    };

    const expected = [_][]const u8{
        "_asdf",
        "ASFLKJ3332__q5",
        "asdf𒀤𒀥𒀦",
        "2H",
        "5F",
        "0B",
    };

    for (test_cases, expected) |case, want| {
        var parser = Parser.init(std.testing.allocator, case);
        defer parser.deinit();
        try std.testing.expectEqualStrings(want, try parser.identifySymbol());
    }
}

test "no symbols are found successfully" {
    const test_cases = [_][]const u8{
        " _asdf",
        ";ASFLKJ3332__q5",
        "\nasdf𒀤𒀥𒀦",
    };

    for (test_cases) |case| {
        var parser = Parser.init(std.testing.allocator, case);
        defer parser.deinit();
        try std.testing.expectError(error.NoSymbol, parser.identifySymbol());
    }
}

test "opcodes are identified" {
    const test_cases = [_][]const u8{
        "2ADDU%aldfk",
        "GO ",
        "ADD\taksfdjas",
        "GREG\n",
        "IS",
    };

    const expected = [_]opcodes.Operation{
        opcodes.Operation{ .opcode = opcodes.Opcode._2ADDU },
        opcodes.Operation{ .opcode = opcodes.Opcode.GO },
        opcodes.Operation{ .opcode = opcodes.Opcode.ADD },
        opcodes.Operation{ .pseudo_op = opcodes.PseudoOp.GREG },
        opcodes.Operation{ .pseudo_op = opcodes.PseudoOp.IS },
    };

    for (test_cases, expected) |case, want| {
        var parser = Parser.init(std.testing.allocator, case);
        defer parser.deinit();
        try std.testing.expectEqual(want, try parser.identifyOperation());
    }
}

test "no opcodes are found successfully" {
    const test_cases = [_][]const u8{
        " _asdf",
        ";ASFLKJ3332__q5",
        "\nasdf𒀤𒀥𒀦",
        "asdfklajsdfl",
    };

    for (test_cases) |case| {
        var parser = Parser.init(std.testing.allocator, case);
        defer parser.deinit();
        try std.testing.expectError(error.NoOpcode, parser.identifyOperation());
    }
}

test "decimals are recognized" {
    const test_cases = [_][]const u8{
        "012314aslkfdj",
        "1234567890 43",
        "1234567891234567889\n123124",
    };

    const expected = [_]u64{
        12314,
        1234567890,
        1234567891234567889,
    };

    for (test_cases, expected) |case, want| {
        var parser = Parser.init(std.testing.allocator, case);
        defer parser.deinit();
        try std.testing.expectEqual(want, try parser.identifyDecimal());
    }
}

test "malformed decimals are not recognized" {
    const test_cases = [_][]const u8{
        "",
        "asdf123",
        " 123",
        "12345678901234567890123456789012345678901234567890",
    };

    for (test_cases) |case| {
        var parser = Parser.init(std.testing.allocator, case);
        defer parser.deinit();
        try std.testing.expectError(error.NoDecimal, parser.identifyDecimal());
    }
}

test "hexadecimals are recognized" {
    const test_cases = [_][]const u8{
        "#012314saslkfdj",
        "#1234567890abcdef 43",
        "#1234567891\n123124",
    };

    const expected = [_]u64{
        0x12314,
        0x1234567890abcdef,
        0x1234567891,
    };

    for (test_cases, expected) |case, want| {
        var parser = Parser.init(std.testing.allocator, case);
        defer parser.deinit();
        try std.testing.expectEqual(want, try parser.identifyHexadecimal());
    }
}

test "malformed hexadecimals are not recognized" {
    const test_cases = [_][]const u8{
        "",
        "sasdf123",
        " 123",
        "#12345678901234567890123456789012345678901234567890",
        "#",
    };

    for (test_cases) |case| {
        var parser = Parser.init(std.testing.allocator, case);
        defer parser.deinit();
        try std.testing.expectError(error.NoHexadecimal, parser.identifyHexadecimal());
    }
}

test "characters are recognized" {
    const test_cases = [_][]const u8{
        "'a'",
        "'1'",
        "'𒀤'",
    };

    const expected = [_][]const u8{
        "a",
        "1",
        "𒀤",
    };

    for (test_cases, expected) |case, want| {
        var parser = Parser.init(std.testing.allocator, case);
        defer parser.deinit();
        try std.testing.expectEqualStrings(want, try parser.identifyChar());
    }
}

test "invalid unicode sequences are not characters" {
    const test_cases = [_][]const u8{
        "'asdf'",
        "'asdfg'",
        "'as'",
        "''",
        "'",
        // Unterminated: no closing quote.
        "'a",
    };

    for (test_cases) |case| {
        var parser = Parser.init(std.testing.allocator, case);
        defer parser.deinit();
        try std.testing.expectError(error.NoChar, parser.identifyChar());
    }
}

test "strings are recognized" {
    const test_cases = [_][]const u8{
        "\" \"",
        "\"aslkdfjlaskdfj lkasjflkasjdflaksjfd''12309)($)(#$[[]𒀤\"",
    };

    const expected = [_][]const u8{
        " ",
        "aslkdfjlaskdfj lkasjflkasjdflaksjfd''12309)($)(#$[[]𒀤",
    };

    for (test_cases, expected) |case, want| {
        var parser = Parser.init(std.testing.allocator, case);
        defer parser.deinit();
        try std.testing.expectEqualStrings(want, try parser.identifyString());
    }
}

test "invalid strings are not recognized" {
    const test_cases = [_][]const u8{
        "\"\"",
        "\"",
        "\"\n\"",
        // Unterminated: no closing quote.
        "\"abc",
    };

    for (test_cases) |case| {
        var parser = Parser.init(std.testing.allocator, case);
        defer parser.deinit();
        try std.testing.expectError(error.NoString, parser.identifyString());
    }
}

test "constants are recognized" {
    const test_cases = [_][]const u8{
        "1234567890 1234",
        "#1234567890abcdef;%#*(",
        "'a'uuuuuu",
        "\"hello \"world",
    };

    const expected = [_]ConstantValue{
        ConstantValue{ .number = 1234567890 },
        ConstantValue{ .number = 0x1234567890abcdef },
        ConstantValue{ .string = "a" },
        ConstantValue{ .string = "hello " },
    };

    for (test_cases, expected) |case, want| {
        var parser = Parser.init(std.testing.allocator, case);
        defer parser.deinit();
        const constant = try parser.identifyConstant();
        switch (constant) {
            .number => |got| try std.testing.expectEqual(want.number, got),
            .string => |got| try std.testing.expectEqualStrings(want.string, got),
        }
    }
}