Added basic lexer

This commit is contained in:
Pasha Bibko
2025-07-20 16:15:58 +01:00
parent f5bb46788c
commit 903b4da7df
4 changed files with 103 additions and 39 deletions

View File

@@ -11,10 +11,10 @@ namespace LXC::Lexer
// Trackers for the Lexer itself //
const std::string& source;
size_t index;
__int32 index;
LexerOutput out;
const size_t len;
const __int32 len;
// Trackers for where the Lexer is within the user version of source //
unsigned short column;

View File

@@ -4,9 +4,6 @@
namespace LXC::Lexer
{
// Forward declaration to allow passing it to the Token class //
struct LexerContext;
namespace TokenClass
{
// Bitmask for different token classes //
@@ -29,6 +26,8 @@ namespace LXC::Lexer
};
};
struct LexerContext;
// Data type for storing the output of the lexer //
class Token final
{
@@ -56,8 +55,7 @@ namespace LXC::Lexer
// === User defined === //
String_Literal = TokenClass::UserDefined,
Int_Literal,
Float_Literal,
Num_Literal,
Identifier,
// === Symbols === //
@@ -86,8 +84,8 @@ namespace LXC::Lexer
template<TokenClass::ClassMask mask> static constexpr bool IsTypeClass(TokenType type) { return type & mask; }
template<TokenClass::ClassMask mask> static constexpr bool IsTypeClass(Token token) { return token.type & mask; }
// Constructor to set the data of the token //
Token(const LexerContext& context, const unsigned short _length, TokenType _type);
// Constructor to set the data of the token for more complex token types //
Token(const LexerContext& ctx, unsigned __int32 start, unsigned short len, TokenType _type);
// Destructor to clean up the allocated memory //
~Token();
@@ -102,15 +100,12 @@ namespace LXC::Lexer
// The length of the token //
const unsigned short length;
// The line the token is on (starts on 1) //
const unsigned short line;
// The index on the line (starts on 1) //
const unsigned short column;
// Start index of the token //
const unsigned __int32 index;
private:
// The data of the token //
const char* contents;
char* contents;
};
// Typedef for the output type of how the Lexer outputs //

View File

@@ -5,22 +5,97 @@
namespace LXC::Lexer
{
// Returns true when c is an ASCII decimal digit ('0'..'9') //
static constexpr bool IsNumeric(const char c)
{
    return ('0' <= c) && (c <= '9');
}
// Returns true when c is an ASCII letter ('a'..'z' or 'A'..'Z') //
static constexpr bool IsAlpha(const char c)
{
    const bool lower = ('a' <= c) && (c <= 'z');
    const bool upper = ('A' <= c) && (c <= 'Z');
    return lower || upper;
}
// Constructor: binds the source text and zero-initialises all lexer cursors //
// NOTE(review): len is narrowed from size_t to __int32 -- sources larger than //
// INT32_MAX bytes would overflow; confirm this limit is acceptable //
LexerContext::LexerContext(const std::string& _source) :
    source(_source), index(0), out{}, len((__int32)_source.length()), column(0), line(0)
{}
// Tokenizes the whole of fileContents in a single forward pass.           //
// Produces String_Literal, Num_Literal and Identifier tokens; '#' toggles //
// comment mode (contents of comments are skipped entirely).               //
Util::ReturnVal<LexerOutput, LexerError> TokenizeFile(const std::string& fileContents)
{
    // Creates the context for the lexer //
    LexerContext ctx(fileContents);

    // Per-pass state for the multi-character token currently being built //
    struct
    {
        bool inStrLiteral = false;
        bool inIdentifier = false;
        bool inNumLiteral = false;
        bool inComment = false;
        unsigned __int32 sectionStart = 0;
    } trackers;

    // BUG FIX: condition was 'ctx.index > ctx.len', which is false on entry //
    // (index starts at 0) so the loop body never executed //
    while (ctx.index < ctx.len)
    {
        // The current char within the source that is being lexed //
        const char current = ctx.source[ctx.index];
        const char next = (ctx.index + 1 < ctx.len) ? ctx.source[ctx.index + 1] : '\0';

        // === String literals === //
        // BUG FIX: checked before the comment toggle so a '#' inside a string //
        // literal is treated as string content, not as a comment delimiter //
        if (trackers.inStrLiteral)
        {
            if (current == '"')
            {
                // End of the literal: emit the token. The extent starts at the //
                // opening quote, matching the original behaviour //
                // TODO(review): confirm whether the quote should be included //
                trackers.inStrLiteral = false;
                ctx.out.push_back({ ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart), Token::String_Literal });
            }
            // All other characters are string contents and are skipped //
        }
        // === Comments === //
        else if (trackers.inComment)
        {
            // A second '#' closes the comment; everything else is skipped //
            if (current == '#')
                trackers.inComment = false;
        }
        else if (current == '#')
        {
            trackers.inComment = true;
        }
        else if (current == '"')
        {
            // Start of a string literal //
            trackers.inStrLiteral = true;
            trackers.sectionStart = ctx.index;
        }
        // === Numbers === //
        else if (IsNumeric(current))
        {
            // Updates trackers //
            trackers.sectionStart = trackers.inNumLiteral ? trackers.sectionStart : ctx.index;
            trackers.inNumLiteral = true;

            // Checks for the end of the number literal to create the token //
            if (!IsNumeric(next)) _UNLIKELY
            {
                ctx.out.push_back({ ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart), Token::Num_Literal });
                trackers.inNumLiteral = false;
            }
        }
        // === Words === //
        else if (IsAlpha(current))
        {
            // Updates trackers //
            trackers.sectionStart = trackers.inIdentifier ? trackers.sectionStart : ctx.index;
            trackers.inIdentifier = true;

            // Checks for the end of the word to create the token //
            if (!IsAlpha(next)) _UNLIKELY
            {
                ctx.out.push_back({ ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart), Token::Identifier });
                trackers.inIdentifier = false;
            }
        }

        // Iterates to the next index //
        ctx.column++;
        ctx.index++;
    }

    return ctx.out;
}
}

View File

@@ -5,25 +5,19 @@
namespace LXC::Lexer
{
static const char* const CopySubstrToMem(const LexerContext& context, const size_t length, Token::TokenType type)
{
// Only user defined class tokens need to store their type //
if (!Token::IsTypeClass<TokenClass::UserDefined>(type))
return nullptr;
// Copies the memory to a c-string //
char* cStr = new char[length + 1];
std::memcpy(cStr, context.source.data() + context.index, length);
cStr[length] = '\0';
return cStr;
}
// Constructor to assign the members of the token class //
Token::Token(const LexerContext& context, const unsigned short _length, TokenType _type) :
type(_type), length(_length), line(context.line), column(context.column),
contents(CopySubstrToMem(context, _length, _type))
{}
// Constructor for the more complex token types: records the token's type, //
// extent (start index + length) and, for user-defined tokens only, takes  //
// a null-terminated private copy of the matched source characters.        //
Token::Token(const LexerContext& ctx, unsigned __int32 start, unsigned short len, TokenType _type) :
    type(_type), length(len), index(start), contents(nullptr)
{
    // Non-user-defined tokens (keywords, symbols) need no stored text //
    if (!Token::IsTypeClass<TokenClass::UserDefined>(type))
        return;

    // Copy the matched substring into an owned, null-terminated buffer //
    char* buffer = new char[static_cast<size_t>(len) + 1];
    std::memcpy(buffer, ctx.source.data() + start, len);
    buffer[len] = '\0';
    contents = buffer;
}
// Destructor to clean up the memory of the token that can be allocated //
Token::~Token()