Added symbols/operators to lexer

This commit is contained in:
Pasha Bibko
2025-07-21 17:34:47 +01:00
parent a6afeff493
commit 24fde1b770
6 changed files with 158 additions and 96 deletions

View File

@@ -75,14 +75,14 @@ namespace LXC::Util
// Finds the location of a given index within a file // // Finds the location of a given index within a file //
inline bool GetFileLocationAtIndex(FileLocation& location, const std::string& file, __int32 index) inline bool GetFileLocationAtIndex(FileLocation& location, const std::string& file, __int32 index)
{ {
// Returns false if outside the bounds //
if (index < 0 || index > file.length())
return false;
// Resets location // // Resets location //
location.line = 1; location.line = 1;
location.col = 1; location.col = 1;
// Returns false if outside the bounds //
if (index < 0 || index > file.length())
return false;
// Finds the location // // Finds the location //
__int32 localIndex = 0; __int32 localIndex = 0;
while (localIndex != index) while (localIndex != index)

View File

@@ -2,6 +2,7 @@
// Standard libraries // // Standard libraries //
#include <unordered_map>
#include <vector> #include <vector>
// LXC util files // // LXC util files //

View File

@@ -47,6 +47,9 @@ int main(int argc, char** argv)
if (err.reason == Lexer::LexerError::InvalidCharacter) if (err.reason == Lexer::LexerError::InvalidCharacter)
Util::PrintLn(": {", fileContents.Result()[err.index], '}'); Util::PrintLn(": {", fileContents.Result()[err.index], '}');
if (err.reason == Lexer::LexerError::UnknownSymbolOrOperand)
Util::PrintLn(": {", err.info, '}');
else else
Util::PrintLn(); Util::PrintLn();

View File

@@ -27,12 +27,13 @@ namespace LXC::Lexer
enum Reason enum Reason
{ {
InvalidCharacter, InvalidCharacter,
UnterminatedStringLiteral UnterminatedStringLiteral,
UnknownSymbolOrOperand
}; };
// Constructor to pass arguments through to the struct // // Constructor to pass arguments through to the struct //
LexerError(Reason _reason, __int32 errorIndex) LexerError(Reason _reason, __int32 errorIndex, std::string _info = "")
: reason(_reason), index(errorIndex) : reason(_reason), index(errorIndex), info(_info)
{} {}
// Turns the error into a c-string // // Turns the error into a c-string //
@@ -41,7 +42,8 @@ namespace LXC::Lexer
static const char* reasons[] = static const char* reasons[] =
{ {
"Invalid character found in source", "Invalid character found in source",
"Unterminated string literal in source" "Unterminated string literal in source",
"Unknown symbol or operand in source"
}; };
return reasons[reason]; return reasons[reason];
@@ -50,6 +52,7 @@ namespace LXC::Lexer
// Error information // // Error information //
const Reason reason; const Reason reason;
const __int32 index; const __int32 index;
const std::string info;
}; };
// Turns a file into a vector of tokens // // Turns a file into a vector of tokens //

View File

@@ -5,116 +5,171 @@
namespace LXC::Internal namespace LXC::Internal
{ {
// Returns true if the character is an ASCII decimal digit ('0' to '9') //
static constexpr bool IsNumeric(const char c)
{
    return c >= '0' && c <= '9';
}
// Returns true if the character is an ASCII letter ('a' to 'z' or 'A' to 'Z') //
// Digits and underscores are not counted as alphabetic //
static constexpr bool IsAlpha(const char c)
{
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}
// Returns true if the character is a space, tab, newline or carriage return //
static constexpr bool IsWhitespace(const char c)
{
    return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}
// Returns true if the character can form part of a symbol or operator //
// Recognised characters: + - * / % = , [ ] { } ( ) //
static constexpr bool IsSymbolOrOperator(const char c)
{
    switch (c)
    {
        // Arithmetic and assignment //
        case '+':
        case '-':
        case '*':
        case '/':
        case '%':
        case '=':
        // Separators //
        case ',':
        // Brackets, braces and parentheses //
        case '[':
        case ']':
        case '{':
        case '}':
        case '(':
        case ')':
            return true;

        default:
            return false;
    }
}
// Maps the text of each supported symbol/operator to its token type //
// Every key is currently a single character, so a run of adjacent symbol characters has no entry //
static const std::unordered_map<std::string_view, Lexer::Token::TokenType> symbolAndOpMap =
{
    // Arithmetic operators //
    { "+", Lexer::Token::Add },
    { "-", Lexer::Token::Sub },
    { "*", Lexer::Token::Mul },
    { "/", Lexer::Token::Div },
    { "%", Lexer::Token::Mod },

    // Assignment and separators //
    { "=", Lexer::Token::Assign },
    { ",", Lexer::Token::Comma },

    // Paired delimiters (the opening character maps to the Open* token) //
    { "[", Lexer::Token::OpenBracket },
    { "]", Lexer::Token::CloseBracket },
    { "{", Lexer::Token::OpenBrace },
    { "}", Lexer::Token::CloseBrace },
    { "(", Lexer::Token::OpenParen },
    { ")", Lexer::Token::CloseParen }
};
} }
namespace LXC::Lexer namespace LXC::Lexer
{ {
// Creates a lexer context over the given source text //
// The index, column and line all start at 0, the output starts empty and len caches the source length //
LexerContext::LexerContext(const std::string& _source) :
    source(_source), index(0), out{}, len(static_cast<__int32>(_source.length())), column(0), line(0)
{}
// Turns the contents of a file into tokens //
//
// Walks the source one character at a time and emits string literal, number literal, //
// identifier and symbol/operator tokens into the context's output. //
//
// Fails with: //
// - InvalidCharacter (index of the character) if a character matches no category //
// - UnknownSymbolOrOperand (start of the run, text of the run) if a run of symbol characters is not in Internal::symbolAndOpMap //
// - UnterminatedStringLiteral (index of the opening quote) if the source ends inside a string literal //
Util::ReturnVal<LexerOutput, LexerError> TokenizeFile(const std::string& fileContents)
{
    // Creates the context for the lexer //
    LexerContext ctx(fileContents);

    // Tracks which kind of section the lexer is currently inside and where it started //
    struct
    {
        bool inStrLiteral = false;
        bool inIdentifier = false;
        bool inNumLiteral = false;
        bool inSymbolOrOp = false;
        bool inComment = false;

        unsigned __int32 sectionStart = 0;
    } trackers;

    while (ctx.index < ctx.len)
    {
        // The current char within the source that is being lexed //
        const char current = ctx.source[ctx.index];
        // One character of lookahead ('\0' when at the end of the source) //
        const char next = (ctx.index + 1 < ctx.len) ? ctx.source[ctx.index + 1] : '\0';

        // === Comments === //
        // A '#' toggles comment mode, but not inside a string literal where it is just text. //
        // Without that check a '#' in a string would swallow the closing quote. //
        if (current == '#' && !trackers.inStrLiteral)
            trackers.inComment = !trackers.inComment;

        else if (trackers.inComment) {} // Contents of comments are skipped over

        // === String literals === //
        else if (current == '"')
        {
            // Updates trackers //
            trackers.inStrLiteral = !trackers.inStrLiteral;
            trackers.sectionStart = trackers.inStrLiteral ? ctx.index : trackers.sectionStart;

            // Creates the token (if at the end of the string literal) //
            // The token excludes both of the surrounding quotes //
            if (!trackers.inStrLiteral)
                ctx.out.emplace_back(ctx, trackers.sectionStart + 1, (USHORT)(ctx.index - trackers.sectionStart - 1), Token::StringLiteral);
        }

        else if (trackers.inStrLiteral) {} // Contents of string literals are skipped over

        // === Numbers === //
        else if (Internal::IsNumeric(current))
        {
            // Updates trackers //
            trackers.sectionStart = trackers.inNumLiteral ? trackers.sectionStart : ctx.index;
            trackers.inNumLiteral = true;

            // Checks for the end of the number literal to create the token //
            if (!Internal::IsNumeric(next)) _UNLIKELY
            {
                ctx.out.emplace_back(ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart + 1), Token::NumLiteral);
                trackers.inNumLiteral = false;
            }
        }

        // === Words === //
        else if (Internal::IsAlpha(current))
        {
            // Updates trackers //
            trackers.sectionStart = trackers.inIdentifier ? trackers.sectionStart : ctx.index;
            trackers.inIdentifier = true;

            // Checks for the end of the word to create the token //
            if (!Internal::IsAlpha(next)) _UNLIKELY
            {
                ctx.out.emplace_back(ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart + 1), Token::Identifier);
                trackers.inIdentifier = false;
            }
        }

        // === Symbols/Operators === //
        else if (Internal::IsSymbolOrOperator(current))
        {
            // Updates trackers //
            trackers.sectionStart = trackers.inSymbolOrOp ? trackers.sectionStart : ctx.index;
            trackers.inSymbolOrOp = true;

            // Checks for the end of the symbol or operator //
            if (!Internal::IsSymbolOrOperator(next))
            {
                // The whole run of adjacent symbol characters is looked up as one unit //
                const std::string_view fullSymbol(ctx.source.data() + trackers.sectionStart, ctx.index - trackers.sectionStart + 1);
                const auto it = Internal::symbolAndOpMap.find(fullSymbol);

                // Fails if the run is not a known symbol or operator //
                if (it == Internal::symbolAndOpMap.end())
                    return Util::FunctionFail<LexerError>(LexerError::UnknownSymbolOrOperand, trackers.sectionStart, std::string(fullSymbol));

                ctx.out.emplace_back(ctx, trackers.sectionStart, (USHORT)(ctx.index - trackers.sectionStart + 1), it->second);

                // Ends the section so the next symbol/operator records its own start. //
                // sectionStart is shared with the other trackers, so a stale flag would reuse an unrelated start. //
                trackers.inSymbolOrOp = false;
            }
        }

        // === Whitespace === //
        else if (Internal::IsWhitespace(current)) {}

        // If an if-statement has not been triggered the character must be invalid //
        else
            return Util::FunctionFail<LexerError>(LexerError::InvalidCharacter, ctx.index);

        // Iterates to the next index //
        ctx.column++;
        ctx.index++;
    }

    // Checks for an unterminated string literal //
    if (trackers.inStrLiteral)
        return Util::FunctionFail<LexerError>(LexerError::UnterminatedStringLiteral, trackers.sectionStart);

    return ctx.out;
}
} }

View File

@@ -1 +1 @@
FILE 4 CONTENTS "A" GO B HERE 34 5 "ELLO THER" FILE 4 CONTENTS "A" GO B HERE 34 += 5 "ELLO THER"