Files
MIRROR-LX-OriginalRepo/Lexer/src/Lexer.cpp
2025-05-08 14:32:33 +01:00

266 lines
8.2 KiB
C++

#include <LX-Common.h>
#include <Lexer.h>
#include <LexerErrors.h>
#include <LexerInfo.h>
#include <Constants.h>
namespace LX
{
// Helper function for dealing with floating-point number literals //
static constexpr bool CanBePartOfNumberLiteral(const char c) { return (c == '.') || (c == 'f'); }
// Checks if the given word is a keyword before adding it to the tokens //
static void TokenizeWord(const std::string& word, std::vector<Token>& tokens, LexerInfo& info)
{
// Checks the map for a check and if so adds it with its enum equivalent //
if (auto keyword = keywords.find(word); keyword != keywords.end())
{
tokens.push_back({ keyword->second, info, (std::streamsize)word.size(), info.source });
}
// Else adds it as a type of IDENTIFIER //
else
{
tokens.push_back({ Token::IDENTIFIER, info, (std::streamsize)word.size(), info.source });
}
}
static inline void UpdateLexerInfo(LexerInfo& info)
{
// Transfers the previous state if not at the beginning //
if (info.index != 0) [[likely]]
{
info.wasLastCharAlpha = info.isAlpha;
info.wasLastCharNumeric = info.isNumeric;
info.isAlpha = info.isNextCharAlpha;
info.isNumeric = info.isNextCharNumeric;
}
else
{
// Stores the current character for easy access
const char current = info.source[info.index];
// Works out if the current character is alphabetic or numeric //
info.isAlpha = (current >= 'a' && current <= 'z') || (current >= 'A' && current <= 'Z');
info.isNumeric = (current >= '0' && current <= '9');
}
// Only does next character checks when not at the end //
if (info.index + 1 < info.len) [[likely]]
{
// Gets the next character //
const char next = info.source[info.index + 1];
// Sets flags depending on the value of the next character //
info.isNextCharAlpha = (next >= 'a' && next <= 'z') || (next >= 'A' && next <= 'Z');
info.isNextCharNumeric = (next >= '0' && next <= '9') || CanBePartOfNumberLiteral(next);
}
// Else defaults the flags to false //
else
{
info.isNextCharAlpha = false;
info.isNextCharNumeric = false;
}
}
const std::vector<Token> LX::LexicalAnalyze(const std::filesystem::path& path)
{
// Logs that the file is being read //
Log::LogNewSection("Reading file: ", path.string());
std::string fileContents = ReadFileToString(path);
const std::streamsize len = fileContents.length();
// Logs the start of the lexical analysis
Log::LogNewSection("Lexing file");
// Allocates a large ammount of memory to hold the output //
// Will shrink the size later on to stop excess memory being allocated //
std::vector<Token> tokens = {};
tokens.reserve(0xFFFF);
// Trackers for when the program is iterating over the file //
LexerInfo info(fileContents);
// Iterates over the file and turns it into tokens //
while (info.index < len)
{
// Stores the current character for easy access
const char current = info.source[info.index];
// Updates the LexerInfo //
UpdateLexerInfo(info);
// Updates string literal tracker and skips over rest if in a string literal //
if (current == '"')
{
// Start of string literal //
if (info.inStringLiteral == false)
{
// Updates the neccesarry trackers //
info.startOfStringLiteral = info.index + 1;
info.inStringLiteral = true;
}
// End of string literal //
else
{
// Adds the string literal token to the token vector //
std::string lit(info.source.data() + info.startOfStringLiteral, info.index - info.startOfStringLiteral);
tokens.push_back({ Token::STRING_LITERAL, info, (std::streamsize)lit.length() + 2, info.source }); // Adding two makes the "" be stored as well
// Updates trackers //
info.inStringLiteral = false;
}
}
// Skips over rest if within a string literal //
else if (info.inStringLiteral);
// Updates comment state //
else if (current == '#')
{
info.inComment = !info.inComment;
}
// Skips over if within a comment //
else if (info.inComment);
// Start of a number //
else if (info.isNumeric == true && info.wasLastCharNumeric == false && info.lexingNumber == false)
{
// Stores the start of the number //
info.startOfNumberLiteral = info.index;
// Checks if it as the end (single char numbers) //
if (info.isNextCharNumeric == false)
{
// Pushes the number to the token vector. Number literals are stored as string in the tokens //
std::string num(info.source.data() + info.startOfNumberLiteral, (unsigned __int64)(info.index + 1) - info.startOfNumberLiteral);
tokens.push_back({ Token::NUMBER_LITERAL, info, (std::streamsize)num.size(), info.source });
}
// Stores it is lexing a number literal //
else { info.lexingNumber = true; }
}
// End of a number //
else if ((info.isNumeric == true || CanBePartOfNumberLiteral(current)) && info.isNextCharNumeric == false && info.lexingNumber == true)
{
// Pushes the number to the token vector. Number literals are stored as string in the tokens //
std::string num(info.source.data() + info.startOfNumberLiteral, (unsigned __int64)(info.index + 1) - info.startOfNumberLiteral);
tokens.push_back({ Token::NUMBER_LITERAL, info, (std::streamsize)num.size(), info.source });
info.lexingNumber = false; // Stops storing it is lexing a number
}
// During a number //
else if (info.isNumeric == true);
else if (info.lexingNumber == true && CanBePartOfNumberLiteral(current));
// Start of a word //
else if (info.isAlpha == true && info.wasLastCharAlpha == false)
{
// Stores the start of the word //
info.startOfWord = info.index;
// Checks if it is at the end (single char words) //
if (info.isNextCharAlpha == false)
{
// Calls the function designed to handle the tokenisation of words //
TokenizeWord({ info.source.data() + info.startOfWord, 1 }, tokens, info);
}
}
// End of a word //
else if (info.isAlpha == true && info.isNextCharAlpha == false)
{
// Calls the function designed to handle the tokenisation of words //
TokenizeWord({ info.source.data() + info.startOfWord, (unsigned __int64)((info.index + 1) - info.startOfWord) }, tokens, info);
}
// During a word //
else if (info.isAlpha == true);
// Symbols //
else if (auto sym = symbols.find(current); sym != symbols.end())
{
tokens.push_back({ sym->second, info, 1, info.source });
}
// Operators (+, -, /, *) //
else if (auto op = operators.find(current); op != operators.end())
{
tokens.push_back({ op->second, info, 1, info.source });
}
// If it is here and not whitespace that means it's an invalid character //
else if (current == ' ' || current == '\r');
// Skips over an extra 3 spaces as tabs SHOULD ALWAYS take up 4 spaces //
// Only for the column and not index //
else if (current == '\t')
{
info.column = info.column + 3;
}
// Increments the line number and resets the column on entering a new line //
else if (current == '\n')
{
info.column = 0;
info.line++;
}
// Throws an error with all the relevant information //
else
{
ThrowIf<InvalidCharInSource>(true, info, path.string());
}
// Log dumps A LOT of info //
Log::out
(
"Index: ", std::left, std::setw(3), info.index,
" Is Alpha: ", info.isAlpha,
" Is Numeric: ", info.isNumeric,
" In Comment: ", info.inComment,
" In String: ", info.inStringLiteral,
" Next Char Alpha: ", info.isNextCharAlpha,
" Next Char Numeric: ", info.wasLastCharNumeric,
" Last Char Numeric: ", info.wasLastCharAlpha,
" Lexing number: ", info.lexingNumber,
" Current: {", CharAsStrLit(current), "}"
);
// Updates the indecies to the next character //
info.index++;
info.column++;
}
Log::out("\n"); // Puts a space to clean up the log
for (auto& token : tokens)
{
Log::out<Log::Priority::HIGH>
(
std::left,
"{ Line: ", std::setw(3), token.line,
", Index: ", std::setw(3), token.index,
", Length: ", std::setw(2), token.length, " } ",
std::setw(30), ToString(token.type) + ":", "{", token.GetContents(), "}"
);
}
Log::out("End of tokens");
// Shrinks the vector down to minimum size before returning to avoid excess memory being allocated
tokens.shrink_to_fit();
return tokens;
}
}