Added basic lexer

This commit is contained in:
Pasha Bibko
2025-07-20 16:15:58 +01:00
parent f5bb46788c
commit 903b4da7df
4 changed files with 103 additions and 39 deletions

View File

@@ -11,10 +11,10 @@ namespace LXC::Lexer
// Trackers for the Lexer itself //
const std::string& source;
size_t index;
__int32 index;
LexerOutput out;
const size_t len;
const __int32 len;
// Trackers for where the Lexer is within the user version of source //
unsigned short column;

View File

@@ -4,9 +4,6 @@
namespace LXC::Lexer
{
// Forward declaration to allow passing it to the Token class //
struct LexerContext;
namespace TokenClass
{
// Bitmask for different token classes //
@@ -29,6 +26,8 @@ namespace LXC::Lexer
};
};
struct LexerContext;
// Data type for storing the output of the lexer //
class Token final
{
@@ -56,8 +55,7 @@ namespace LXC::Lexer
// === User defined === //
String_Literal = TokenClass::UserDefined,
Int_Literal,
Float_Literal,
Num_Literal,
Identifier,
// === Symbols === //
@@ -86,8 +84,8 @@ namespace LXC::Lexer
template<TokenClass::ClassMask mask> static constexpr bool IsTypeClass(TokenType type) { return type & mask; }
template<TokenClass::ClassMask mask> static constexpr bool IsTypeClass(Token token) { return token.type & mask; }
// Constructor to set the data of the token //
Token(const LexerContext& context, const unsigned short _length, TokenType _type);
// Constructor to set the data of the token for more complex token types //
Token(const LexerContext& ctx, unsigned __int32 start, unsigned short len, TokenType _type);
// Destructor to clean up the allocated memory //
~Token();
@@ -102,15 +100,12 @@ namespace LXC::Lexer
// The length of the token //
const unsigned short length;
// The line the token is on (starts on 1) //
const unsigned short line;
// The index on the line (starts on 1) //
const unsigned short column;
// Start index of the token //
const unsigned __int32 index;
private:
// The data of the token //
const char* contents;
char* contents;
};
// Typedef for the output type of how the Lexer outputs //

View File

@@ -5,22 +5,97 @@
namespace LXC::Lexer
{
// Returns true when c is an ASCII decimal digit ('0'..'9') //
static constexpr bool IsNumeric(const char c)
{
    return ('0' <= c) && (c <= '9');
}
// Returns true when c is an ASCII letter ('a'..'z' or 'A'..'Z') //
static constexpr bool IsAlpha(const char c)
{
    const bool lower = ('a' <= c) && (c <= 'z');
    const bool upper = ('A' <= c) && (c <= 'Z');
    return lower || upper;
}
// Constructor: binds the source text and zero-initialises all lexer cursors //
// NOTE(review): len is narrowed from size_t to __int32 -- sources larger than //
// INT32_MAX bytes would overflow; confirm this limit is acceptable //
LexerContext::LexerContext(const std::string& _source) :
    source(_source), index(0), out{}, len((__int32)_source.length()), column(0), line(0)
{}
// Tokenizes the whole of fileContents in a single forward pass.           //
// Produces String_Literal, Num_Literal and Identifier tokens; '#' toggles //
// comment mode (contents of comments are skipped entirely).               //
Util::ReturnVal<LexerOutput, LexerError> TokenizeFile(const std::string& fileContents)
{
    // Creates the context for the lexer //
    LexerContext ctx(fileContents);

    // Per-pass state for the multi-character token currently being built //
    struct
    {
        bool inStrLiteral = false;
        bool inIdentifier = false;
        bool inNumLiteral = false;
        bool inComment = false;
        unsigned __int32 sectionStart = 0;
    } trackers;

    // BUG FIX: condition was 'ctx.index > ctx.len', which is false on entry //
    // (index starts at 0) so the loop body never executed //
    while (ctx.index < ctx.len)
    {
        // The current char within the source that is being lexed //
        const char current = ctx.source[ctx.index];
        const char next = (ctx.index + 1 < ctx.len) ? ctx.source[ctx.index + 1] : '\0';

        // === String literals === //
        // BUG FIX: checked before the comment toggle so a '#' inside a string //
        // literal is treated as string content, not as a comment delimiter //
        if (trackers.inStrLiteral)
        {
            if (current == '"')
            {
                // End of the literal: emit the token. The extent starts at the //
                // opening quote, matching the original behaviour //
                // TODO(review): confirm whether the quote should be included //
                trackers.inStrLiteral = false;
                ctx.out.push_back({ ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart), Token::String_Literal });
            }
            // All other characters are string contents and are skipped //
        }
        // === Comments === //
        else if (trackers.inComment)
        {
            // A second '#' closes the comment; everything else is skipped //
            if (current == '#')
                trackers.inComment = false;
        }
        else if (current == '#')
        {
            trackers.inComment = true;
        }
        else if (current == '"')
        {
            // Start of a string literal //
            trackers.inStrLiteral = true;
            trackers.sectionStart = ctx.index;
        }
        // === Numbers === //
        else if (IsNumeric(current))
        {
            // Updates trackers //
            trackers.sectionStart = trackers.inNumLiteral ? trackers.sectionStart : ctx.index;
            trackers.inNumLiteral = true;

            // Checks for the end of the number literal to create the token //
            if (!IsNumeric(next)) _UNLIKELY
            {
                ctx.out.push_back({ ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart), Token::Num_Literal });
                trackers.inNumLiteral = false;
            }
        }
        // === Words === //
        else if (IsAlpha(current))
        {
            // Updates trackers //
            trackers.sectionStart = trackers.inIdentifier ? trackers.sectionStart : ctx.index;
            trackers.inIdentifier = true;

            // Checks for the end of the word to create the token //
            if (!IsAlpha(next)) _UNLIKELY
            {
                ctx.out.push_back({ ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart), Token::Identifier });
                trackers.inIdentifier = false;
            }
        }

        // Iterates to the next index //
        ctx.column++;
        ctx.index++;
    }

    return ctx.out;
}
}

View File

@@ -5,25 +5,19 @@
namespace LXC::Lexer
{
static const char* const CopySubstrToMem(const LexerContext& context, const size_t length, Token::TokenType type)
{
// Only user defined class tokens need to store their type //
if (!Token::IsTypeClass<TokenClass::UserDefined>(type))
return nullptr;
// Copies the memory to a c-string //
char* cStr = new char[length + 1];
std::memcpy(cStr, context.source.data() + context.index, length);
cStr[length] = '\0';
return cStr;
}
// Constructor to assign the members of the token class //
Token::Token(const LexerContext& context, const unsigned short _length, TokenType _type) :
type(_type), length(_length), line(context.line), column(context.column),
contents(CopySubstrToMem(context, _length, _type))
{}
// Constructor for the more complex token types: records the token's type, //
// extent (start index + length) and, for user-defined tokens only, takes  //
// a null-terminated private copy of the matched source characters.        //
Token::Token(const LexerContext& ctx, unsigned __int32 start, unsigned short len, TokenType _type) :
    type(_type), length(len), index(start), contents(nullptr)
{
    // Non-user-defined tokens (keywords, symbols) need no stored text //
    if (!Token::IsTypeClass<TokenClass::UserDefined>(type))
        return;

    // Copy the matched substring into an owned, null-terminated buffer //
    char* buffer = new char[static_cast<size_t>(len) + 1];
    std::memcpy(buffer, ctx.source.data() + start, len);
    buffer[len] = '\0';
    contents = buffer;
}
// Destructor to clean up the memory of the token that can be allocated //
Token::~Token()