Cleaned up Lexer

2026-04-04 01:49:05 +00:00 · 2025-05-07 18:10:15 +01:00
parent 6783564f10
commit 98fc4589ab
11 changed files with 246 additions and 203 deletions
--- a/Lexer/src/Lexer.cpp
+++ b/Lexer/src/Lexer.cpp
@@ -4,174 +4,67 @@

 #include <LexerErrors.h>
 #include <LexerInfo.h>
+#include <Constants.h>

 namespace LX
 {
-	InvalidCharInSource::InvalidCharInSource(const LexerInfo& info, const std::string& source, const std::string _file)
-		: col(info.column), line(info.line), file(_file), lineContents{}, invalid(source[info.index])
-	{
-		// Gets the line the error is on //
-		lineContents = GetLineAtIndexOf(source, info.index);
-	}
-
-	void InvalidCharInSource::PrintToConsole() const
-	{
-		// Calculates the length of the line number in the console so it is formatted correctly //
-		std::ostringstream oss;
-		oss << std::setw(3) << line;
-		size_t lineNumberWidthInConsole = std::max(oss.str().size(), (size_t)3);
-
-		// Prints the error with the relevant information to the console //
-		std::cout << "\n";
-		LX::PrintStringAsColor("Error: ", LX::Color::LIGHT_RED);
-		std::cout << "Invalid character found in ";
-		LX::PrintStringAsColor(file, LX::Color::WHITE);
-		std::cout << " {";
-		LX::PrintStringAsColor(std::string(1, invalid), LX::Color::LIGHT_RED);
-		std::cout << "}:\n";
-		std::cout << "Line: " << std::setw(lineNumberWidthInConsole) << line << " | " << lineContents << "\n";
-		std::cout << "      " << std::setw(lineNumberWidthInConsole) << "" << " | " << std::setw(col - 1) << "";
-		LX::PrintStringAsColor("^", LX::Color::LIGHT_RED);
-		std::cout << "\n";
-	}
-
-	const char* InvalidCharInSource::ErrorType() const
-	{
-		return "Invalid char in source";
-	}
-
-	// Helper macro for outputting token type //
-	#define TOKEN_CASE(type) case type: return #type;
-
-	// Helper util function to translate a tokentype to it's enum val //
-	static std::string ToStringNoFormat(Token::TokenType type)
-	{
-		switch (type)
-		{
-			TOKEN_CASE(Token::STRING_LITERAL);
-			TOKEN_CASE(Token::IDENTIFIER);
-			TOKEN_CASE(Token::FOR);
-			TOKEN_CASE(Token::WHILE);
-			TOKEN_CASE(Token::IF);
-			TOKEN_CASE(Token::ELSE);
-			TOKEN_CASE(Token::ELIF);
-			TOKEN_CASE(Token::FUNCTION);
-			TOKEN_CASE(Token::ADD);
-			TOKEN_CASE(Token::SUB);
-			TOKEN_CASE(Token::MUL);
-			TOKEN_CASE(Token::DIV);
-			TOKEN_CASE(Token::NUMBER_LITERAL);
-			TOKEN_CASE(Token::RETURN);
-			TOKEN_CASE(Token::OPEN_BRACE);
-			TOKEN_CASE(Token::CLOSE_BRACE);
-			TOKEN_CASE(Token::OPEN_BRACKET);
-			TOKEN_CASE(Token::CLOSE_BRACKET);
-			TOKEN_CASE(Token::OPEN_PAREN);
-			TOKEN_CASE(Token::CLOSE_PAREN);
-			TOKEN_CASE(Token::ASSIGN);
-			TOKEN_CASE(Token::INT_DEC);
-
-			default:
-				return "Unknown: " + std::to_string(type);
-		}
-	}
-
-	// Logging function to turn a tokentype enum val into a nicely formatted string //
-	std::string ToString(Token::TokenType type)
-	{
-		// Gets the unformated version of the string //
-		std::string unformatted = ToStringNoFormat(type);
-		unformatted = unformatted.substr(7); // Removes the Token:: prefix
-
-		// Formats the string (turns to lowercase and replaces _ with a space //
-		std::string formatted;
-
-		for (char current : unformatted)
-		{
-			// Adding 32 makes it lowercase due to how ASCII works //
-			if ((current >= 'A' && current <= 'Z')) { formatted.push_back(current + 32); }
-
-			// Replaces _ with spaces //
-			else if (current == '_') { formatted.push_back(' '); }
-
-			// Else adds the current character //
-			else { formatted.push_back(current); }
-		}
-
-		// Returns the formatted string //
-		return formatted;
-	}
-	
-	// Stops use outside of the function //
-	#undef TOKEN_CASE
-
 	// Helper function for dealing with floating-point number literals //
 	static constexpr bool CanBePartOfNumberLiteral(const char c) { return (c == '.') || (c == 'f'); }

-	// Helper function to stop printing whitespace as pure whitespace //
-	static std::string PrintChar(const char c)
-	{
-		switch (c)
-		{
-			// Stores them as pure string literals //
-			case '\n': return R"(\n)";
-			case '\t': return R"(\t)";
-			case '\r': return R"(\r)";
-
-			// Else returns a string of length one with the char inside //
-			default: return std::string(1, c);
-		}
-	}
-
-	// All the keywords the lexer currently supports with their token-enum equivalents //
-	static const std::unordered_map<std::string, Token::TokenType> keywords =
-	{
-		{ "for"			, Token::FOR		},
-		{ "while"		, Token::WHILE		},
-		{ "if"			, Token::IF			},
-		{ "else"		, Token::ELSE		},
-		{ "elif"		, Token::ELIF		},
-		{ "func"		, Token::FUNCTION	},
-		{ "return"		, Token::RETURN		},
-		{ "int"			, Token::INT_DEC	}
-	};
-
-	// All the symbols supported by the lexer //
-	static const std::unordered_map<char, Token::TokenType> symbols =
-	{
-		{ '{', Token::OPEN_BRACKET		},
-		{ '}', Token::CLOSE_BRACKET		},
-		{ '[', Token::OPEN_BRACE		},
-		{ ']', Token::CLOSE_BRACE		},
-		{ '(', Token::OPEN_PAREN		},
-		{ ')', Token::CLOSE_PAREN		},
-		{ ',', Token::COMMA				},
-		{ '=', Token::ASSIGN			}
-	};
-
-	// All the single-char operators currently supported by the lexer with their token-enum equivalents //
-	// TODO: Support multi-char operators such as: ==, -> +=, &&
-	static const std::unordered_map<char, Token::TokenType> operators =
-	{
-		{ '+', Token::ADD },
-		{ '-', Token::SUB },
-		{ '*', Token::MUL },
-		{ '/', Token::DIV }
-	};
-
 	// Checks if the given word is a keyword before adding it to the tokens //
-	static void TokenizeWord(const std::string& word, std::vector<Token>& tokens, LexerInfo& info, const std::string& contents)
+	static void TokenizeWord(const std::string& word, std::vector<Token>& tokens, LexerInfo& info)
 	{
 		// Checks the map for a check and if so adds it with its enum equivalent //
 		if (auto keyword = keywords.find(word); keyword != keywords.end())
 		{
-			tokens.push_back({ keyword->second, info, (std::streamsize)word.size(), contents });
+			tokens.push_back({ keyword->second, info, (std::streamsize)word.size(), info.source });
 		}

 		// Else adds it as a type of IDENTIFIER //
 		else
 		{
-			tokens.push_back({ Token::IDENTIFIER, info, (std::streamsize)word.size(), contents });
+			tokens.push_back({ Token::IDENTIFIER, info, (std::streamsize)word.size(), info.source });
+		}
+	}
+
+	// Refreshes the per-character flags stored in `info` for the character at info.index. //
+	// Reads info.source[info.index] unconditionally, so info.index must be a valid index. //
+	// Expected to be called once per character so the "was last" flags describe the preceding one. //
+	static inline void UpdateLexerInfo(LexerInfo& info)
+	{
+		// Remembers the previous character's flags; there is no previous character at index 0 //
+		// (at index 0 the "was last" flags are left untouched) //
+		if (info.index != 0) [[likely]]
+		{
+			info.wasLastCharAlpha = info.isAlpha;
+			info.wasLastCharNumeric = info.isNumeric;
+		}
+
+		// Stores the current character for easy access //
+		const char current = info.source[info.index];
+
+		// Always classifies the current character from the source itself. Reusing the look-ahead //
+		// flags here would be wrong: isNextCharNumeric also accepts '.' and 'f', which must not //
+		// make isNumeric true for the current character //
+		info.isAlpha = (current >= 'a' && current <= 'z') || (current >= 'A' && current <= 'Z');
+		info.isNumeric = (current >= '0' && current <= '9');
+
+		// Only does next character checks when not at the end //
+		if (info.index + 1 < info.len) [[likely]]
+		{
+			// Gets the next character //
+			const char next = info.source[info.index + 1];
+
+			// Sets flags depending on the value of the next character //
+			info.isNextCharAlpha = (next >= 'a' && next <= 'z') || (next >= 'A' && next <= 'Z');
+			info.isNextCharNumeric = (next >= '0' && next <= '9') || CanBePartOfNumberLiteral(next);
+		}
+
+		// Else defaults the flags to false //
+		else
+		{
+			info.isNextCharAlpha = false;
+			info.isNextCharNumeric = false;
+		}
+	}

@@ -180,8 +73,8 @@ namespace LX
 		// Logs that the file is being read //
 		Log::LogNewSection("Reading file: ", path.string());

-		std::string contents = ReadFileToString(path);
-		const std::streamsize len = contents.length();
+		std::string fileContents = ReadFileToString(path);
+		const std::streamsize len = fileContents.length();

 		// Logs the start of the lexical analysis
 		Log::LogNewSection("Lexing file");
@@ -192,36 +85,16 @@ namespace LX
 		tokens.reserve(0xFFFF);

 		// Trackers for when the program is iterating over the file //
-		LexerInfo info;
+		LexerInfo info(fileContents);

 		// Iterates over the file and turns it into tokens //
 		while (info.index < len)
 		{
 			// Stores the current character for easy access
-			const char current = contents[info.index];
+			const char current = info.source[info.index];

-			// Checks if it is not at end //
-			// Predicts it is not at end for microptimsation //
-			if (info.index + 1 < len) [[likely]]
-			{
-				// Gets the next character //
-				const char next = contents[info.index + 1];
-
-				// Sets flags depending on the value of the next character //
-				info.isNextCharAlpha = (next >= 'a' && next <= 'z') || (next >= 'A' && next <= 'Z');
-				info.isNextCharNumeric = (next >= '0' && next <= '9') || CanBePartOfNumberLiteral(next);
-			}
-
-			else
-			{
-				// Else defaults the next character's flags to false //
-				info.isNextCharAlpha = false;
-				info.isNextCharNumeric = false;
-			}
-
-			// Works out if the current character is alphabetic or numeric //
-			info.isAlpha = (current >= 'a' && current <= 'z') || (current >= 'A' && current <= 'Z');
-			info.isNumeric = (current >= '0' && current <= '9');
+			// Updates the LexerInfo //
+			UpdateLexerInfo(info);

 			// Updates string literal tracker and skips over rest if in a string literal //
 			if (current == '"')
@@ -238,8 +111,8 @@ namespace LX
 				else
 				{
 					// Adds the string literal token to the token vector //
-					std::string lit(contents.data() + info.startOfStringLiteral, info.index - info.startOfStringLiteral);
-					tokens.push_back({ Token::STRING_LITERAL, info, (std::streamsize)lit.length() + 2, contents }); // Adding two makes the "" be stored as well
+					std::string lit(info.source.data() + info.startOfStringLiteral, info.index - info.startOfStringLiteral);
+					tokens.push_back({ Token::STRING_LITERAL, info, (std::streamsize)lit.length() + 2, info.source }); // Adding two makes the "" be stored as well

 					// Updates trackers //
 					info.inStringLiteral = false;
@@ -268,8 +141,8 @@ namespace LX
 				if (info.isNextCharNumeric == false)
 				{
 					// Pushes the number to the token vector. Number literals are stored as string in the tokens //
-					std::string num(contents.data() + info.startOfNumberLiteral, (unsigned __int64)(info.index + 1) - info.startOfNumberLiteral);
-					tokens.push_back({ Token::NUMBER_LITERAL, info, (std::streamsize)num.size(), contents });
+					std::string num(info.source.data() + info.startOfNumberLiteral, (unsigned __int64)(info.index + 1) - info.startOfNumberLiteral);
+					tokens.push_back({ Token::NUMBER_LITERAL, info, (std::streamsize)num.size(), info.source });
 				}

 				// Stores it is lexing a number literal //
@@ -280,8 +153,8 @@ namespace LX
 			else if ((info.isNumeric == true || CanBePartOfNumberLiteral(current)) && info.isNextCharNumeric == false && info.lexingNumber == true)
 			{
 				// Pushes the number to the token vector. Number literals are stored as string in the tokens //
-				std::string num(contents.data() + info.startOfNumberLiteral, (unsigned __int64)(info.index + 1) - info.startOfNumberLiteral);
-				tokens.push_back({ Token::NUMBER_LITERAL, info, (std::streamsize)num.size(), contents });
+				std::string num(info.source.data() + info.startOfNumberLiteral, (unsigned __int64)(info.index + 1) - info.startOfNumberLiteral);
+				tokens.push_back({ Token::NUMBER_LITERAL, info, (std::streamsize)num.size(), info.source });
 				info.lexingNumber = false; // Stops storing it is lexing a number
 			}

@@ -299,7 +172,7 @@ namespace LX
 				if (info.isNextCharAlpha == false)
 				{
 					// Calls the function designed to handle the tokenisation of words //
-					TokenizeWord({ contents.data() + info.startOfWord, 1 }, tokens, info, contents);
+					TokenizeWord({ info.source.data() + info.startOfWord, 1 }, tokens, info);
 				}
 			}

@@ -307,7 +180,7 @@ namespace LX
 			else if (info.isAlpha == true && info.isNextCharAlpha == false)
 			{
 				// Calls the function designed to handle the tokenisation of words //
-				TokenizeWord({ contents.data() + info.startOfWord, (unsigned __int64)((info.index + 1) - info.startOfWord) }, tokens, info, contents);
+				TokenizeWord({ info.source.data() + info.startOfWord, (unsigned __int64)((info.index + 1) - info.startOfWord) }, tokens, info);
 			}

 			// During a word //
@@ -316,13 +189,13 @@ namespace LX
 			// Symbols //
 			else if (auto sym = symbols.find(current); sym != symbols.end())
 			{
-				tokens.push_back({ sym->second, info, 1, contents });
+				tokens.push_back({ sym->second, info, 1, info.source });
 			}

 			// Operators (+, -, /, *) //
 			else if (auto op = operators.find(current); op != operators.end())
 			{
-				tokens.push_back({ op->second, info, 1, contents });
+				tokens.push_back({ op->second, info, 1, info.source });
 			}

 			// If it is here and not whitespace that means it's an invalid character //
@@ -345,7 +218,7 @@ namespace LX
 			// Throws an error with all the relevant information //
 			else
 			{
-				ThrowIf<InvalidCharInSource>(true, info, contents, path.string());
+				ThrowIf<InvalidCharInSource>(true, info, path.string());
 			}

 			// Log dumps A LOT of info //
@@ -361,16 +234,13 @@ namespace LX
 				" Next Char Numeric: ", info.wasLastCharNumeric,
 				" Last Char Numeric: ", info.wasLastCharAlpha,
 				" Lexing number: ", info.lexingNumber,
-				" Current: {", PrintChar(current), "}"
+				" Current: {", CharAsStrLit(current), "}"
 			);

-			// Updates trackers to their default state of a new character //
+			// Updates the indices to the next character //

 			info.index++;
 			info.column++;
-
-			info.wasLastCharAlpha = info.isAlpha;
-			info.wasLastCharNumeric = info.isNumeric;
 		}

 		Log::out("\n"); // Puts a space to clean up the log
@@ -383,7 +253,7 @@ namespace LX
 				"{ Line: ", std::setw(3), token.line,
 				", Index: ", std::setw(3), token.index,
 				", Length: ", std::setw(2), token.length, " } ",
-				std::setw(30), ToStringNoFormat(token.type) + ":", "{", token.GetContents(), "}"
+				std::setw(30), ToString(token.type) + ":", "{", token.GetContents(), "}"
 			);
 		}