Added basic lexer
This commit is contained in:
@@ -11,10 +11,10 @@ namespace LXC::Lexer
|
|||||||
|
|
||||||
// Trackers for the Lexer itself //
|
// Trackers for the Lexer itself //
|
||||||
const std::string& source;
|
const std::string& source;
|
||||||
size_t index;
|
__int32 index;
|
||||||
|
|
||||||
LexerOutput out;
|
LexerOutput out;
|
||||||
const size_t len;
|
const __int32 len;
|
||||||
|
|
||||||
// Trackers for where the Lexer is within the user version of source //
|
// Trackers for where the Lexer is within the user version of source //
|
||||||
unsigned short column;
|
unsigned short column;
|
||||||
|
|||||||
@@ -4,9 +4,6 @@
|
|||||||
|
|
||||||
namespace LXC::Lexer
|
namespace LXC::Lexer
|
||||||
{
|
{
|
||||||
// Forward declaration to allow passing it to the Token class //
|
|
||||||
struct LexerContext;
|
|
||||||
|
|
||||||
namespace TokenClass
|
namespace TokenClass
|
||||||
{
|
{
|
||||||
// Bitmask for different token classes //
|
// Bitmask for different token classes //
|
||||||
@@ -29,6 +26,8 @@ namespace LXC::Lexer
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct LexerContext;
|
||||||
|
|
||||||
// Data type for storing the output of the lexer //
|
// Data type for storing the output of the lexer //
|
||||||
class Token final
|
class Token final
|
||||||
{
|
{
|
||||||
@@ -56,8 +55,7 @@ namespace LXC::Lexer
|
|||||||
// === User defined === //
|
// === User defined === //
|
||||||
|
|
||||||
String_Literal = TokenClass::UserDefined,
|
String_Literal = TokenClass::UserDefined,
|
||||||
Int_Literal,
|
Num_Literal,
|
||||||
Float_Literal,
|
|
||||||
Identifier,
|
Identifier,
|
||||||
|
|
||||||
// === Symbols === //
|
// === Symbols === //
|
||||||
@@ -86,8 +84,8 @@ namespace LXC::Lexer
|
|||||||
template<TokenClass::ClassMask mask> static constexpr bool IsTypeClass(TokenType type) { return type & mask; }
|
template<TokenClass::ClassMask mask> static constexpr bool IsTypeClass(TokenType type) { return type & mask; }
|
||||||
template<TokenClass::ClassMask mask> static constexpr bool IsTypeClass(Token token) { return token.type & mask; }
|
template<TokenClass::ClassMask mask> static constexpr bool IsTypeClass(Token token) { return token.type & mask; }
|
||||||
|
|
||||||
// Constructor to set the data of the token //
|
// Constructor to set the data of the token for more complex token types //
|
||||||
Token(const LexerContext& context, const unsigned short _length, TokenType _type);
|
Token(const LexerContext& ctx, unsigned __int32 start, unsigned short len, TokenType _type);
|
||||||
|
|
||||||
// Destructor to clean up the allocated memory //
|
// Destructor to clean up the allocated memory //
|
||||||
~Token();
|
~Token();
|
||||||
@@ -102,15 +100,12 @@ namespace LXC::Lexer
|
|||||||
// The length of the token //
|
// The length of the token //
|
||||||
const unsigned short length;
|
const unsigned short length;
|
||||||
|
|
||||||
// The line the token is on (starts on 1) //
|
// Start index of the token //
|
||||||
const unsigned short line;
|
const unsigned __int32 index;
|
||||||
|
|
||||||
// The index on the line (starts on 1) //
|
|
||||||
const unsigned short column;
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// The data of the token //
|
// The data of the token //
|
||||||
const char* contents;
|
char* contents;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Typedef for the output type of how the Lexer outputs //
|
// Typedef for the output type of how the Lexer outputs //
|
||||||
|
|||||||
@@ -5,22 +5,97 @@
|
|||||||
|
|
||||||
namespace LXC::Lexer
|
namespace LXC::Lexer
|
||||||
{
|
{
|
||||||
|
static constexpr bool IsNumeric(const char c)
|
||||||
|
{
|
||||||
|
return c >= '0' && c <= '9';
|
||||||
|
}
|
||||||
|
|
||||||
|
static constexpr bool IsAlpha(const char c)
|
||||||
|
{
|
||||||
|
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
|
||||||
|
}
|
||||||
|
|
||||||
LexerContext::LexerContext(const std::string& _source) :
|
LexerContext::LexerContext(const std::string& _source) :
|
||||||
source(_source), index(0), out{}, len(_source.length()), column(0), line(0)
|
source(_source), index(0), out{}, len((__int32)_source.length()), column(0), line(0)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
Util::ReturnVal<LexerOutput, LexerError> TokenizeFile(const std::string& fileContents)
|
Util::ReturnVal<LexerOutput, LexerError> TokenizeFile(const std::string& fileContents)
|
||||||
{
|
{
|
||||||
// Creates the context for the lexer //
|
// Creates the context for the lexer //
|
||||||
LexerContext context(fileContents);
|
LexerContext ctx(fileContents);
|
||||||
|
|
||||||
while (context.index > context.len)
|
struct
|
||||||
{
|
{
|
||||||
// Iterates to the next index //
|
bool inStrLiteral = false;
|
||||||
context.column++;
|
bool inIdentifier = false;
|
||||||
context.index++;
|
bool inNumLiteral = false;
|
||||||
|
|
||||||
|
bool inComment = false;
|
||||||
|
|
||||||
|
unsigned __int32 sectionStart = 0;
|
||||||
|
|
||||||
|
} trackers;
|
||||||
|
|
||||||
|
while (ctx.index > ctx.len)
|
||||||
|
{
|
||||||
|
// The current char within the source that is being lexed //
|
||||||
|
const char current = ctx.source[ctx.index];
|
||||||
|
const char next = (ctx.index + 1 < ctx.len) ? ctx.source[ctx.index + 1] : '\0';
|
||||||
|
|
||||||
|
// === Comments === //
|
||||||
|
if (current == '#')
|
||||||
|
trackers.inComment = !trackers.inComment;
|
||||||
|
|
||||||
|
else if (trackers.inComment) {} // Contents of comments are skipped over
|
||||||
|
|
||||||
|
// === String literals === //
|
||||||
|
else if (current == '"')
|
||||||
|
{
|
||||||
|
// Updates trackers //
|
||||||
|
trackers.inStrLiteral = !trackers.inStrLiteral;
|
||||||
|
trackers.sectionStart = trackers.inStrLiteral ? ctx.index : trackers.sectionStart;
|
||||||
|
|
||||||
|
// Creates the token (if at the end of the string literal) //
|
||||||
|
if (!trackers.inStrLiteral)
|
||||||
|
ctx.out.push_back({ ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart), Token::String_Literal });
|
||||||
|
|
||||||
|
} else if (trackers.inStrLiteral) {}
|
||||||
|
|
||||||
|
// === Numbers === //
|
||||||
|
else if (IsNumeric(current))
|
||||||
|
{
|
||||||
|
// Updates trackers //
|
||||||
|
trackers.sectionStart = trackers.inNumLiteral ? trackers.sectionStart : ctx.index;
|
||||||
|
trackers.inNumLiteral = true;
|
||||||
|
|
||||||
|
// Checks for the end of the number literal to create the token //
|
||||||
|
if (!IsNumeric(next)) _UNLIKELY
|
||||||
|
{
|
||||||
|
ctx.out.push_back({ ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart), Token::Num_Literal });
|
||||||
|
trackers.inNumLiteral = false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return context.out;
|
// === Words === //
|
||||||
|
else if (IsAlpha(current))
|
||||||
|
{
|
||||||
|
// Updates trackers //
|
||||||
|
trackers.sectionStart = trackers.inIdentifier ? trackers.sectionStart : ctx.index;
|
||||||
|
trackers.inIdentifier = true;
|
||||||
|
|
||||||
|
// Checks for the end of the word to create the token //
|
||||||
|
if (!IsAlpha(next)) _UNLIKELY
|
||||||
|
{
|
||||||
|
ctx.out.push_back({ ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart), Token::Identifier });
|
||||||
|
trackers.inIdentifier = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Iterates to the next index //
|
||||||
|
ctx.column++;
|
||||||
|
ctx.index++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ctx.out;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,25 +5,19 @@
|
|||||||
|
|
||||||
namespace LXC::Lexer
|
namespace LXC::Lexer
|
||||||
{
|
{
|
||||||
static const char* const CopySubstrToMem(const LexerContext& context, const size_t length, Token::TokenType type)
|
|
||||||
{
|
|
||||||
// Only user defined class tokens need to store their type //
|
|
||||||
if (!Token::IsTypeClass<TokenClass::UserDefined>(type))
|
|
||||||
return nullptr;
|
|
||||||
|
|
||||||
// Copies the memory to a c-string //
|
|
||||||
char* cStr = new char[length + 1];
|
|
||||||
std::memcpy(cStr, context.source.data() + context.index, length);
|
|
||||||
cStr[length] = '\0';
|
|
||||||
|
|
||||||
return cStr;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Constructor to assign the members of the token class //
|
// Constructor to assign the members of the token class //
|
||||||
Token::Token(const LexerContext& context, const unsigned short _length, TokenType _type) :
|
Token::Token(const LexerContext& ctx, unsigned __int32 start, unsigned short len, TokenType _type) :
|
||||||
type(_type), length(_length), line(context.line), column(context.column),
|
type(_type), length(len), index(start), contents(nullptr)
|
||||||
contents(CopySubstrToMem(context, _length, _type))
|
{
|
||||||
{}
|
// Only user defined class tokens need to store a c-string //
|
||||||
|
if (Token::IsTypeClass<TokenClass::UserDefined>(type))
|
||||||
|
{
|
||||||
|
// Copies the memory to a c-string //
|
||||||
|
contents = new char[len + 1]; // +1 for null terminator
|
||||||
|
std::memcpy(contents, ctx.source.data() + start, len);
|
||||||
|
contents[len] = '\0';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Destructor to clean up the memory of the token that can be allocated //
|
// Destructor to clean up the memory of the token that can be allocated //
|
||||||
Token::~Token()
|
Token::~Token()
|
||||||
|
|||||||
Reference in New Issue
Block a user