Added basic lexer

2025-07-20 16:15:58 +01:00
parent f5bb46788c
commit 903b4da7df
4 changed files with 103 additions and 39 deletions
--- a/Lexer/inc/Lexer.h
+++ b/Lexer/inc/Lexer.h
@@ -11,10 +11,10 @@ namespace LXC::Lexer
 		// Trackers for the Lexer itself //
 		const std::string& source;
-		size_t index;
+		__int32 index;
 		LexerOutput out;
-		const size_t len;
+		const __int32 len;
 		// Trackers for where the Lexer is within the user version of source //
 		unsigned short column;
--- a/Lexer/inc/Token.h
+++ b/Lexer/inc/Token.h
@@ -4,9 +4,6 @@
 namespace LXC::Lexer
 {
 	// Forward declaration so that it can be passed to the Token class //
 	struct LexerContext;
 	namespace TokenClass
 	{
 		// Bitmask for different token classes //
@@ -29,6 +26,8 @@ namespace LXC::Lexer
 		};
 	};
 	struct LexerContext;
 	// Data type for storing the output of the lexer //
 	class Token final
 	{
@@ -56,8 +55,7 @@ namespace LXC::Lexer
 				// === User defined === //
 				String_Literal	= TokenClass::UserDefined,
-				Int_Literal,
+				Num_Literal,
 				Float_Literal,
 				Identifier,
 				// === Symbols === //
@@ -86,8 +84,8 @@ namespace LXC::Lexer
 			template<TokenClass::ClassMask mask> static constexpr bool IsTypeClass(TokenType type) { return type & mask; }
 			template<TokenClass::ClassMask mask> static constexpr bool IsTypeClass(Token token) { return token.type & mask; }
-			// Constructor to set the data of the token //
+			// Constructor to set the data of the token for more complex token types //
-			Token(const LexerContext& context, const unsigned short _length, TokenType _type);
+			Token(const LexerContext& ctx, unsigned __int32 start, unsigned short len, TokenType _type);
 			// Destructor to clean up the allocated memory //
 			~Token();
@@ -102,15 +100,12 @@ namespace LXC::Lexer
 			// The length of the token //
 			const unsigned short length;
-			// The line the token is on (starts on 1) //
+			// Start index of the token //
-			const unsigned short line;
+			const unsigned __int32 index;
 			// The index on the line (starts on 1) //
 			const unsigned short column;
 		private:
 			// The data of the token //
-			const char* contents;
+			char* contents;
 	};
 	// Typedef for the output type of how the Lexer outputs //
--- a/Lexer/src/Lexer.cpp
+++ b/Lexer/src/Lexer.cpp
@@ -5,22 +5,97 @@
 namespace LXC::Lexer
 {
 	// Reports whether c is one of the ASCII decimal digits '0' through '9' //
 	static constexpr bool IsNumeric(const char c)
 	{
 		return !(c < '0' || c > '9');
 	}
 	// Reports whether c is an ASCII letter, either lower case ('a'-'z') or upper case ('A'-'Z') //
 	static constexpr bool IsAlpha(const char c)
 	{
 		const bool isLower = !(c < 'a' || c > 'z');
 		const bool isUpper = !(c < 'A' || c > 'Z');
 		return isLower || isUpper;
 	}
 	// Builds the lexer state over _source: index, column and line all start at 0 and the output starts empty. //
 	// source is held by reference (not copied), so _source has to outlive the context. //
 	// len caches the source length narrowed to __int32. //
 	// NOTE(review): a source longer than __int32 can represent would be truncated by the cast - confirm inputs stay below that //
 	LexerContext::LexerContext(const std::string& _source) :
-		source(_source), index(0), out{}, len(_source.length()), column(0), line(0)
+		source(_source), index(0), out{}, len((__int32)_source.length()), column(0), line(0)
 	{}
 	// Splits fileContents into tokens by scanning it one character at a time. //
 	// Text between two '#' characters is skipped as a comment, text between two '"' characters becomes a //
 	// String_Literal, a run of digits becomes a Num_Literal and a run of ASCII letters becomes an Identifier. //
 	// Any other character is stepped over without producing a token. //
 	// Returns the tokens collected in the context's output. //
 	Util::ReturnVal<LexerOutput, LexerError> TokenizeFile(const std::string& fileContents)
 	{
 		// Creates the context for the lexer //
-		LexerContext context(fileContents);
+		LexerContext ctx(fileContents);
-		while (context.index > context.len)
+		struct
 		{
-			// Iterates to the next index //
+			bool inStrLiteral = false;
-			context.column++;
+			bool inIdentifier = false;
-			context.index++;
+			bool inNumLiteral = false;
 			bool inComment = false;
 			unsigned __int32 sectionStart = 0;
 		} trackers;
 		// Runs while the index is still inside the source (the index starts at 0, so the test has to be "less than") //
 		while (ctx.index < ctx.len)
 		{
 			// The current char within the source that is being lexed //
 			const char current = ctx.source[ctx.index];
 			// One char of lookahead, '\0' when current is the last char of the source //
 			const char next = (ctx.index + 1 < ctx.len) ? ctx.source[ctx.index + 1] : '\0';
 			// === Comments === //
 			// NOTE(review): this check runs before the string literal check, so a '#' inside a string literal also toggles comment mode - confirm that is intended //
 			if (current == '#')
 				trackers.inComment = !trackers.inComment;
 			else if (trackers.inComment) {} // Contents of comments are skipped over
 			// === String literals === //
 			else if (current == '"')
 			{
 				// Updates trackers //
 				trackers.inStrLiteral = !trackers.inStrLiteral;
 				trackers.sectionStart = trackers.inStrLiteral ? ctx.index : trackers.sectionStart;
 				// Creates the token (if at the end of the string literal) //
 				// NOTE(review): the stored span starts on the opening quote and stops before the closing quote - confirm whether the quotes should be part of the token //
 				if (!trackers.inStrLiteral)
 					ctx.out.push_back({ ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart), Token::String_Literal });
 			} else if (trackers.inStrLiteral) {}
 			// === Numbers === //
 			else if (IsNumeric(current))
 			{
 				// Updates trackers //
 				trackers.sectionStart = trackers.inNumLiteral ? trackers.sectionStart : ctx.index;
 				trackers.inNumLiteral = true;
 				// Checks for the end of the number literal to create the token //
 				if (!IsNumeric(next)) _UNLIKELY
 				{
 					// +1 because the index is still on the last digit, which belongs to the token //
 					ctx.out.push_back({ ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart + 1), Token::Num_Literal });
 					trackers.inNumLiteral = false;
 				}
 			}
-		return context.out;
+			// === Words === //
 			else if (IsAlpha(current))
 			{
 				// Updates trackers //
 				trackers.sectionStart = trackers.inIdentifier ? trackers.sectionStart : ctx.index;
 				trackers.inIdentifier = true;
 				// Checks for the end of the word to create the token //
 				if (!IsAlpha(next)) _UNLIKELY
 				{
 					// +1 because the index is still on the last letter, which belongs to the token //
 					ctx.out.push_back({ ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart + 1), Token::Identifier });
 					trackers.inIdentifier = false;
 				}
 			}
 			// Iterates to the next index //
 			ctx.column++;
 			ctx.index++;
 		}
 		return ctx.out;
 	}
 }
--- a/Lexer/src/Token.cpp
+++ b/Lexer/src/Token.cpp
@@ -5,25 +5,19 @@
 namespace LXC::Lexer
 {
 	// Makes a null-terminated heap copy of length chars of the source, starting at the context's current index. //
 	// Returns nullptr (and allocates nothing) for token types outside the user defined class. //
 	static const char* const CopySubstrToMem(const LexerContext& context, const size_t length, Token::TokenType type)
 	{
 		// Only user defined class tokens carry their own text //
 		const bool storesText = Token::IsTypeClass<TokenClass::UserDefined>(type);
 		if (!storesText)
 			return nullptr;
 		// Reserves room for the chars plus the terminator, then copies them across //
 		char* const buffer = new char[length + 1];
 		const char* const from = context.source.data() + context.index;
 		std::memcpy(buffer, from, length);
 		buffer[length] = '\0';
 		return buffer;
 	}
 	// Builds a token covering len chars of ctx.source beginning at index start. //
 	// Tokens in the UserDefined class get their own null-terminated heap copy of that text, for every other type contents stays nullptr. //
 	// column is a const member so it has to be set in the initialiser list; it takes the lexer's current column, as the previous constructor did. //
 	// NOTE(review): contents is an owning raw pointer - presumably released by ~Token; confirm that copies of a Token cannot free it twice //
-	Token::Token(const LexerContext& context, const unsigned short _length, TokenType _type) :
+	Token::Token(const LexerContext& ctx, unsigned __int32 start, unsigned short len, TokenType _type) :
-		type(_type), length(_length), line(context.line), column(context.column),
+		type(_type), length(len), index(start), column(ctx.column), contents(nullptr)
-		contents(CopySubstrToMem(context, _length, _type))
+	{
-	{}
+		// Only user defined class tokens need to store c-string //
 		if (Token::IsTypeClass<TokenClass::UserDefined>(type))
 		{
 			// Copies the memory to a c-string //
 			contents = new char[len + 1]; // +1 for null terminator
 			std::memcpy(contents, ctx.source.data() + start, len);
 			contents[len] = '\0';
 		}
 	}
 	// Destructor to clean up the memory of the token that can be allocated //
 	Token::~Token()