Added symbols/operators to lexer

2025-07-21 17:34:47 +01:00
parent a6afeff493
commit 24fde1b770
6 changed files with 158 additions and 96 deletions
--- a/Common/File.h
+++ b/Common/File.h
@@ -75,14 +75,14 @@ namespace LXC::Util
 	// Finds the location of a given index within a file //
 	inline bool GetFileLocationAtIndex(FileLocation& location, const std::string& file, __int32 index)
 	{
-		// Returns false if outside the bounds //
-		if (index < 0 || index > file.length())
-			return false;
-
 		// Resets location //
 		location.line = 1;
 		location.col = 1;

+		// Returns false if outside the bounds //
+		if (index < 0 || index > file.length())
+			return false;
+
 		// Finds the location //
 		__int32 localIndex = 0;
 		while (localIndex != index)
--- a/Common/LXC.h
+++ b/Common/LXC.h
@@ -2,6 +2,7 @@

 // Standard libraries //

+#include <unordered_map>
 #include <vector>

 // LXC util files //
--- a/LXC/LXC.cpp
+++ b/LXC/LXC.cpp
@@ -47,6 +47,9 @@ int main(int argc, char** argv)
 		if (err.reason == Lexer::LexerError::InvalidCharacter)
 			Util::PrintLn(": {", fileContents.Result()[err.index], '}');

+		else if (err.reason == Lexer::LexerError::UnknownSymbolOrOperand)
+			Util::PrintLn(": {", err.info, '}');
+		// One if / else-if / else chain so exactly one branch prints; other reasons have no detail to show //
 		else
 			Util::PrintLn();

--- a/Lexer/inc/Lexer.h
+++ b/Lexer/inc/Lexer.h
@@ -27,12 +27,13 @@ namespace LXC::Lexer
 		enum Reason
 		{
 			InvalidCharacter,
-			UnterminatedStringLiteral
+			UnterminatedStringLiteral,
+			UnknownSymbolOrOperand
 		};

 		// Constructor to pass arguments through to the struct //
-		LexerError(Reason _reason, __int32 errorIndex)
-			: reason(_reason), index(errorIndex)
+		LexerError(Reason _reason, __int32 errorIndex, std::string _info = "")
+			: reason(_reason), index(errorIndex), info(_info)
 		{}

 		// Turns the error into a c-string //
@@ -41,7 +42,8 @@ namespace LXC::Lexer
 			static const char* reasons[] =
 			{
 				"Invalid character found in source",
-				"Unterminated string literal in source"
+				"Unterminated string literal in source",
+				"Unknown symbol or operand in source"
 			};

 			return reasons[reason];
@@ -50,6 +52,7 @@ namespace LXC::Lexer
 		// Error information //
 		const Reason reason;
 		const __int32 index;
+		const std::string info;
 	};

 	// Turns a file into a vector of tokens //
--- a/Lexer/src/Lexer.cpp
+++ b/Lexer/src/Lexer.cpp
@@ -5,116 +5,171 @@

 namespace LXC::Internal
 {
-	static constexpr bool IsNumeric(const char c)
-	{
-		return c >= '0' && c <= '9';
-	}
+    static constexpr bool IsNumeric(const char c)
+    {
+        return c >= '0' && c <= '9';
+    }

-	static constexpr bool IsAlpha(const char c)
-	{
-		return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
-	}
+    static constexpr bool IsAlpha(const char c)
+    {
+        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+    }

-	static constexpr bool IsWhitespace(const char c)
-	{
-		return c == ' ' || c == '\t' || c == '\n' || c == '\r';
-	}
+    static constexpr bool IsWhitespace(const char c)
+    {
+        return c == ' ' || c == '\t' || c == '\n' || c == '\r';
+    }
+
+    static constexpr bool IsSymbolOrOperator(const char c)
+    { // True when c is one of the 13 single characters below that can be part of a symbol/operator run
+        return
+            c == '+' || c == '-' ||
+            c == '*' || c == '/' ||
+            c == '%' || c == '=' ||
+            c == ',' || c == '[' ||
+            c == ']' || c == '{' ||
+            c == '}' || c == '(' ||
+            c == ')';
+    }
+
+    static const std::unordered_map<std::string_view, Lexer::Token::TokenType> symbolAndOpMap =
+    { // Key: exact source spelling of a symbol/operator; value: the token type the lexer emits for it
+        { "+",	Lexer::Token::Add				},
+        { "-",	Lexer::Token::Sub				},
+        { "*",	Lexer::Token::Mul				},
+        { "/",	Lexer::Token::Div				},
+        { "%",	Lexer::Token::Mod				},

+        { "=",	Lexer::Token::Assign			},
+        { ",",	Lexer::Token::Comma				},

+        { "[",	Lexer::Token::OpenBracket		},
+        { "]",	Lexer::Token::CloseBracket		},

+        { "{",	Lexer::Token::OpenBrace			},
+        { "}",	Lexer::Token::CloseBrace		},

+        { "(",	Lexer::Token::OpenParen			},
+        { ")",	Lexer::Token::CloseParen		}
+    };
 }

 namespace LXC::Lexer
 {
-	LexerContext::LexerContext(const std::string& _source) :
-		source(_source), index(0), out{}, len((__int32)_source.length()), column(0), line(0)
-	{}
+    LexerContext::LexerContext(const std::string& _source) :
+        source(_source), index(0), out{}, len((__int32)_source.length()), column(0), line(0)
+    {}

-	Util::ReturnVal<LexerOutput, LexerError> TokenizeFile(const std::string& fileContents)
-	{
-		// Creates the context for the lexer //
-		LexerContext ctx(fileContents);
+    Util::ReturnVal<LexerOutput, LexerError> TokenizeFile(const std::string& fileContents)
+    {
+        // Creates the context for the lexer //
+        LexerContext ctx(fileContents);

-		struct
-		{
-			bool inStrLiteral = false;
-			bool inIdentifier = false;
-			bool inNumLiteral = false;
+        struct
+        {
+            bool inStrLiteral = false;
+            bool inIdentifier = false;
+            bool inNumLiteral = false;
+            bool inSymbolOrOp = false;

-			bool inComment = false;
+            bool inComment = false;

-			unsigned __int32 sectionStart = 0;
+            unsigned __int32 sectionStart = 0;

-		} trackers;
+        } trackers;

-		while (ctx.index < ctx.len)
-		{
-			// The current char within the source that is being lexed //
-			const char current = ctx.source[ctx.index];
-			const char next = (ctx.index + 1 < ctx.len) ? ctx.source[ctx.index + 1] : '\0';
+        while (ctx.index < ctx.len)
+        {
+            // The current char within the source that is being lexed //
+            const char current = ctx.source[ctx.index];
+            const char next = (ctx.index + 1 < ctx.len) ? ctx.source[ctx.index + 1] : '\0';

-			// === Comments === //
-			if (current == '#')
-				trackers.inComment = !trackers.inComment;
+            // === Comments === //
+            if (current == '#')
+                trackers.inComment = !trackers.inComment;

-			else if (trackers.inComment) {} // Contents of comments are skipped over
+            else if (trackers.inComment) {} // Contents of comments are skipped over

-			// === String literals === //
-			else if (current == '"')
-			{
-				// Updates trackers //
-				trackers.inStrLiteral = !trackers.inStrLiteral;
-				trackers.sectionStart = trackers.inStrLiteral ? ctx.index : trackers.sectionStart;
+            // === String literals === //
+            else if (current == '"')
+            {
+                // Updates trackers //
+                trackers.inStrLiteral = !trackers.inStrLiteral;
+                trackers.sectionStart = trackers.inStrLiteral ? ctx.index : trackers.sectionStart;

-				// Creates the token (if at the end of the string literal) //
-				if (!trackers.inStrLiteral)
-					ctx.out.emplace_back(ctx, trackers.sectionStart + 1, (USHORT)(ctx.index - trackers.sectionStart - 1), Token::StringLiteral);
-			
-			} else if (trackers.inStrLiteral) {}
+                // Creates the token (if at the end of the string literal) //
+                if (!trackers.inStrLiteral)
+                    ctx.out.emplace_back(ctx, trackers.sectionStart + 1, (USHORT)(ctx.index - trackers.sectionStart - 1), Token::StringLiteral);
+            
+            } else if (trackers.inStrLiteral) {}

-			// === Numbers === //
-			else if (Internal::IsNumeric(current))
-			{
-				// Updates trackers //
-				trackers.sectionStart = trackers.inNumLiteral ? trackers.sectionStart : ctx.index;
-				trackers.inNumLiteral = true;
+            // === Numbers === //
+            else if (Internal::IsNumeric(current))
+            {
+                // Updates trackers //
+                trackers.sectionStart = trackers.inNumLiteral ? trackers.sectionStart : ctx.index;
+                trackers.inNumLiteral = true;

-				// Checks for the end of the number literal to create the token //
-				if (!Internal::IsNumeric(next)) _UNLIKELY
-				{
-					ctx.out.emplace_back(ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart + 1), Token::NumLiteral);
-					trackers.inNumLiteral = false;
-				}
-			}
+                // Checks for the end of the number literal to create the token //
+                if (!Internal::IsNumeric(next)) _UNLIKELY
+                {
+                    ctx.out.emplace_back(ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart + 1), Token::NumLiteral);
+                    trackers.inNumLiteral = false;
+                }
+            }

-			// === Words === //
-			else if (Internal::IsAlpha(current))
-			{
-				// Updates trackers //
-				trackers.sectionStart = trackers.inIdentifier ? trackers.sectionStart : ctx.index;
-				trackers.inIdentifier = true;
+            // === Words === //
+            else if (Internal::IsAlpha(current))
+            {
+                // Updates trackers //
+                trackers.sectionStart = trackers.inIdentifier ? trackers.sectionStart : ctx.index;
+                trackers.inIdentifier = true;

-				// Checks for the end of the word to create the token //
-				if (!Internal::IsAlpha(next)) _UNLIKELY
-				{
-					ctx.out.emplace_back(ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart + 1), Token::Identifier);
-					trackers.inIdentifier = false;
-				}
-			}
+                // Checks for the end of the word to create the token //
+                if (!Internal::IsAlpha(next)) _UNLIKELY
+                {
+                    ctx.out.emplace_back(ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart + 1), Token::Identifier);
+                    trackers.inIdentifier = false;
+                }
+            }

-			// === Whitespace === //
-			else if (Internal::IsWhitespace(current)) {}
+            // === Symbols/Operators === //
+            else if (Internal::IsSymbolOrOperator(current))
+            {
+                // Updates trackers: a run starts at the first symbol char and stays open only while the next char is also one //
+                trackers.sectionStart = trackers.inSymbolOrOp ? trackers.sectionStart : ctx.index;
+                trackers.inSymbolOrOp = Internal::IsSymbolOrOperator(next);

-			// If an if-statement has not been triggered the character must be invalid //
-			else
-				return Util::FunctionFail<LexerError>(LexerError::InvalidCharacter, ctx.index);
+                // Checks for the end of the symbol or operator //
+                if (!Internal::IsSymbolOrOperator(next))
+                {
+                    // Looks the whole run up as one spelling; an unrecognised run is reported with its text //
+                    std::string_view fullSymbol(ctx.source.data() + trackers.sectionStart, ctx.index - trackers.sectionStart + 1);
+                    auto it = Internal::symbolAndOpMap.find(fullSymbol);
+                    if (it != Internal::symbolAndOpMap.end())
+                        ctx.out.emplace_back(ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart + 1), it->second);

-			// Iterates to the next index //
-			ctx.column++;
-			ctx.index++;
-		}
+                    else
+                        return Util::FunctionFail<LexerError>(LexerError::UnknownSymbolOrOperand, trackers.sectionStart, std::string(fullSymbol));
+                }
+            }

-		// Checks for an unterminated string literal //
-		if (trackers.inStrLiteral)
-			return Util::FunctionFail<LexerError>(LexerError::UnterminatedStringLiteral, trackers.sectionStart);
+            // === Whitespace === //
+            else if (Internal::IsWhitespace(current)) {}

-		return ctx.out;
-	}
+            // If an if-statement has not been triggered the character must be invalid //
+            else
+                return Util::FunctionFail<LexerError>(LexerError::InvalidCharacter, ctx.index);
+
+            // Iterates to the next index //
+            ctx.column++;
+            ctx.index++;
+        }
+
+        // Checks for an unterminated string literal //
+        if (trackers.inStrLiteral)
+            return Util::FunctionFail<LexerError>(LexerError::UnterminatedStringLiteral, trackers.sectionStart);
+
+        return ctx.out;
+    }
 }
--- a/example/example.lx
+++ b/example/example.lx
@@ -1 +1 @@
-FILE 4 CONTENTS "A" GO B HERE 34 5 "ELLO THER"
+FILE 4 CONTENTS "A" GO B HERE 34 += 5 "ELLO THER"