From 903b4da7dfa7cc641ac2423780d9c669989d955e Mon Sep 17 00:00:00 2001
From: Pasha Bibko <156938226+PashaBibko@users.noreply.github.com>
Date: Sun, 20 Jul 2025 16:15:58 +0100
Subject: [PATCH] Added basic lexer

---
Review notes (text between the "---" line and the first "diff --git" is ignored by git-am):
 * Lexer.cpp: the main loop now runs while `ctx.index < ctx.len`; with `>` the
   body never executed because the context is constructed with index 0.
 * Lexer.cpp: Num_Literal / Identifier tokens are emitted while `ctx.index` is
   still on the token's last character, so their length is now
   `ctx.index - trackers.sectionStart + 1`.
 * NOTE(review): String_Literal tokens still start at the opening quote and stop
   before the closing quote - confirm the intended range.
 * NOTE(review): Token stores a raw `char*` and is handed to push_back as a
   temporary - verify that copy/move of Token is handled in the full Token.h.
 * NOTE(review): template argument lists look stripped in this copy of the patch
   (`template static ...`, `Util::ReturnVal`, `Token::IsTypeClass(type)`); they
   are left untouched - verify against the repository.
 * Only added ('+') lines were edited, so hunk line counts and the diffstat are
   unchanged; indentation and blank lines were reconstructed.

 Lexer/inc/Lexer.h   |  4 +--
 Lexer/inc/Token.h   | 21 +++++------
 Lexer/src/Lexer.cpp | 87 +++++++++++++++++++++++++++++++++++++++++----
 Lexer/src/Token.cpp | 30 +++++++---------
 4 files changed, 103 insertions(+), 39 deletions(-)

diff --git a/Lexer/inc/Lexer.h b/Lexer/inc/Lexer.h
index 1b4e5d3..ac11bda 100644
--- a/Lexer/inc/Lexer.h
+++ b/Lexer/inc/Lexer.h
@@ -11,10 +11,10 @@ namespace LXC::Lexer
 
         // Trackers for the Lexer itself //
         const std::string& source;
-        size_t index;
+        __int32 index;
 
         LexerOutput out;
-        const size_t len;
+        const __int32 len;
 
         // Trackers for where the Lexer is within the user version of source //
         unsigned short column;
diff --git a/Lexer/inc/Token.h b/Lexer/inc/Token.h
index 829ba9b..64b3e5c 100644
--- a/Lexer/inc/Token.h
+++ b/Lexer/inc/Token.h
@@ -4,9 +4,6 @@
 
 namespace LXC::Lexer
 {
-    // Foward declaration to allow it passing to the Token class //
-    struct LexerContext;
-
     namespace TokenClass
     {
         // Bitmask for different token classes //
@@ -29,6 +26,8 @@ namespace LXC::Lexer
         };
     };
 
+    struct LexerContext;
+
     // Data type for storing the output of the lexer //
     class Token final
     {
@@ -56,8 +55,7 @@ namespace LXC::Lexer
 
             // === User defined === //
             String_Literal = TokenClass::UserDefined,
-            Int_Literal,
-            Float_Literal,
+            Num_Literal,
             Identifier,
 
             // === Symbols === //
@@ -86,8 +84,8 @@ namespace LXC::Lexer
         template static constexpr bool IsTypeClass(TokenType type) { return type & mask; }
         template static constexpr bool IsTypeClass(Token token) { return token.type & mask; }
 
-        // Constructor to set the data of the token //
-        Token(const LexerContext& context, const unsigned short _length, TokenType _type);
+        // Constructor to set the data of the token for more complex token types //
+        Token(const LexerContext& ctx, unsigned __int32 start, unsigned short len, TokenType _type);
 
         // Deconstructor to clean up the allocated memory //
         ~Token();
@@ -102,15 +100,12 @@ namespace LXC::Lexer
         // The length of the token //
         const unsigned short length;
 
-        // The line the token is on (starts on 1) //
-        const unsigned short line;
-
-        // The index on the line (starts on 1) //
-        const unsigned short column;
+        // Start index of the token //
+        const unsigned __int32 index;
 
     private:
         // The data of the token //
-        const char* contents;
+        char* contents;
     };
 
     // Typedef for the output type of how the Lexer outputs //
diff --git a/Lexer/src/Lexer.cpp b/Lexer/src/Lexer.cpp
index 4dda93e..e28fa0a 100644
--- a/Lexer/src/Lexer.cpp
+++ b/Lexer/src/Lexer.cpp
@@ -5,22 +5,97 @@
 
 namespace LXC::Lexer
 {
+    static constexpr bool IsNumeric(const char c)
+    {
+        return c >= '0' && c <= '9';
+    }
+
+    static constexpr bool IsAlpha(const char c)
+    {
+        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+    }
+
     LexerContext::LexerContext(const std::string& _source) :
-        source(_source), index(0), out{}, len(_source.length()), column(0), line(0)
+        source(_source), index(0), out{}, len((__int32)_source.length()), column(0), line(0)
     {}
 
     Util::ReturnVal TokenizeFile(const std::string& fileContents)
     {
         // Creates the context for the lexer //
-        LexerContext context(fileContents);
+        LexerContext ctx(fileContents);
 
-        while (context.index > context.len)
+        struct
         {
+            bool inStrLiteral = false;
+            bool inIdentifier = false;
+            bool inNumLiteral = false;
+
+            bool inComment = false;
+
+            unsigned __int32 sectionStart = 0;
+
+        } trackers;
+
+        while (ctx.index < ctx.len)
+        {
+            // The current char within the source that is being lexed //
+            const char current = ctx.source[ctx.index];
+            const char next = (ctx.index + 1 < ctx.len) ? ctx.source[ctx.index + 1] : '\0';
+
+            // === Comments === //
+            if (current == '#')
+                trackers.inComment = !trackers.inComment;
+
+            else if (trackers.inComment) {} // Contents of comments are skipped over
+
+            // === String literals === //
+            else if (current == '"')
+            {
+                // Updates trackers //
+                trackers.inStrLiteral = !trackers.inStrLiteral;
+                trackers.sectionStart = trackers.inStrLiteral ? ctx.index : trackers.sectionStart;
+
+                // Creates the token (if at the end of the string literal) //
+                if (!trackers.inStrLiteral)
+                    ctx.out.push_back({ ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart), Token::String_Literal });
+
+            } else if (trackers.inStrLiteral) {}
+
+            // === Numbers === //
+            else if (IsNumeric(current))
+            {
+                // Updates trackers //
+                trackers.sectionStart = trackers.inNumLiteral ? trackers.sectionStart : ctx.index;
+                trackers.inNumLiteral = true;
+
+                // Checks for the end of the number literal to create the token (index is on the last digit, hence the +1) //
+                if (!IsNumeric(next)) _UNLIKELY
+                {
+                    ctx.out.push_back({ ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart + 1), Token::Num_Literal });
+                    trackers.inNumLiteral = false;
+                }
+            }
+
+            // === Words === //
+            else if (IsAlpha(current))
+            {
+                // Updates trackers //
+                trackers.sectionStart = trackers.inIdentifier ? trackers.sectionStart : ctx.index;
+                trackers.inIdentifier = true;
+
+                // Checks for the end of the word to create the token (index is on the last letter, hence the +1) //
+                if (!IsAlpha(next)) _UNLIKELY
+                {
+                    ctx.out.push_back({ ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart + 1), Token::Identifier });
+                    trackers.inIdentifier = false;
+                }
+            }
+
             // Iterates to the next index //
-            context.column++;
-            context.index++;
+            ctx.column++;
+            ctx.index++;
         }
 
-        return context.out;
+        return ctx.out;
     }
 }
diff --git a/Lexer/src/Token.cpp b/Lexer/src/Token.cpp
index 3c2138e..e1e6f87 100644
--- a/Lexer/src/Token.cpp
+++ b/Lexer/src/Token.cpp
@@ -5,25 +5,19 @@
 
 namespace LXC::Lexer
 {
-    static const char* const CopySubstrToMem(const LexerContext& context, const size_t length, Token::TokenType type)
-    {
-        // Only user defined class tokens need to store their type //
-        if (!Token::IsTypeClass(type))
-            return nullptr;
-
-        // Copies the memory to a c-string //
-        char* cStr = new char[length + 1];
-        std::memcpy(cStr, context.source.data() + context.index, length);
-        cStr[length] = '\0';
-
-        return cStr;
-    }
-
     // Constructor to assign the members of the token class //
-    Token::Token(const LexerContext& context, const unsigned short _length, TokenType _type) :
-        type(_type), length(_length), line(context.line), column(context.column),
-        contents(CopySubstrToMem(context, _length, _type))
-    {}
+    Token::Token(const LexerContext& ctx, unsigned __int32 start, unsigned short len, TokenType _type) :
+        type(_type), length(len), index(start), contents(nullptr)
+    {
+        // Only user defined class tokens need to store c-string //
+        if (Token::IsTypeClass(type))
+        {
+            // Copies the memory to a c-string //
+            contents = new char[len + 1]; // +1 for null terminator
+            std::memcpy(contents, ctx.source.data() + start, len);
+            contents[len] = '\0';
+        }
+    }
 
     // Destructor to clean up the memory of the token that can be allocated //
     Token::~Token()