mirror of
https://github.com/PashaBibko/LX.git
synced 2026-04-04 01:49:05 +00:00
Small errors with logging parser unexpected token errors. Tokens need to be re-written with how they store memory as it can be halved.
344 lines
11 KiB
C++
344 lines
11 KiB
C++
#include <Lexer.h>
|
|
#include <Util.h>
|
|
|
|
#include <unordered_map>
|
|
#include <string_view>
|
|
#include <fstream>
|
|
#include <vector>
|
|
|
|
#include <iostream>
|
|
#include <iomanip>
|
|
|
|
namespace LX
|
|
{
|
|
// Helper macro for outputting token type //
|
|
#define TOKEN_CASE(type) case type: return #type;
|
|
|
|
// Helper util function to translate a tokentype to it's enum val //
|
|
static std::string ToStringNoFormat(Token::TokenType type)
|
|
{
|
|
switch (type)
|
|
{
|
|
TOKEN_CASE(Token::STRING_LITERAL);
|
|
TOKEN_CASE(Token::IDENTIFIER);
|
|
TOKEN_CASE(Token::FOR);
|
|
TOKEN_CASE(Token::WHILE);
|
|
TOKEN_CASE(Token::IF);
|
|
TOKEN_CASE(Token::ELSE);
|
|
TOKEN_CASE(Token::ELIF);
|
|
TOKEN_CASE(Token::FUNCTION);
|
|
TOKEN_CASE(Token::ADD);
|
|
TOKEN_CASE(Token::SUB);
|
|
TOKEN_CASE(Token::MUL);
|
|
TOKEN_CASE(Token::DIV);
|
|
TOKEN_CASE(Token::NUMBER_LITERAL);
|
|
TOKEN_CASE(Token::RETURN);
|
|
|
|
default:
|
|
return "Unknown: " + std::to_string(type);
|
|
}
|
|
}
|
|
|
|
// Logging function to turn a tokentype enum val into a nicely formatted string //
|
|
std::string ToString(Token::TokenType type)
|
|
{
|
|
// Gets the unformated version of the string //
|
|
std::string unformatted = ToStringNoFormat(type);
|
|
unformatted = unformatted.substr(7); // Removes the Token:: prefix
|
|
|
|
// Formats the string (turns to lowercase and replaces _ with a space //
|
|
std::string formatted;
|
|
|
|
for (char current : unformatted)
|
|
{
|
|
// Adding 32 makes it lowercase due to how ASCII works //
|
|
if ((current >= 'A' && current <= 'Z')) { formatted.push_back(current + 32); }
|
|
|
|
// Replaces _ with spaces //
|
|
else if (current == '_') { formatted.push_back(' '); }
|
|
|
|
// Else adds the current character //
|
|
else { formatted.push_back(current); }
|
|
}
|
|
|
|
// Returns the formatted string //
|
|
return formatted;
|
|
}
|
|
|
|
// Stops use outside of the function //
|
|
#undef TOKEN_CASE
|
|
|
|
// Tells whether a non-digit character may still belong to a number literal //
// Currently that is the decimal point and the 'f' suffix //
static constexpr bool CanBePartOfNumberLiteral(const char c)
{
    switch (c)
    {
        case '.':
        case 'f':
            return true;

        default:
            return false;
    }
}
|
|
|
|
// Renders a character for log output //
// Newline, tab and carriage return are spelled out as their two-character escape //
// sequences so they stay visible; every other character is returned as-is //
static std::string PrintChar(const char c)
{
    if (c == '\n') { return "\\n"; }
    if (c == '\t') { return "\\t"; }
    if (c == '\r') { return "\\r"; }

    // Not a tracked whitespace character: a string of length one holding the char //
    return std::string(1, c);
}
|
|
|
|
// All the keywords the lexer currently supports with their token-enum equivalents //
|
|
static const std::unordered_map<std::string, Token::TokenType> keywords =
|
|
{
|
|
{ "for" , Token::FOR },
|
|
{ "while" , Token::WHILE },
|
|
{ "if" , Token::IF },
|
|
{ "else" , Token::ELSE },
|
|
{ "elif" , Token::ELIF },
|
|
{ "func" , Token::FUNCTION },
|
|
{ "return" , Token::RETURN }
|
|
};
|
|
|
|
// All the single-char operators currently supported by the lexer with their token-enum equivalents //
|
|
// TODO: Support multi-char operators such as: ==, -> +=, &&
|
|
static const std::unordered_map<char, Token::TokenType> operators =
|
|
{
|
|
{ '+', Token::ADD },
|
|
{ '-', Token::SUB },
|
|
{ '*', Token::MUL },
|
|
{ '/', Token::DIV }
|
|
};
|
|
|
|
// Checks if the given word is a keyword before adding it to the tokens //
|
|
static void TokenizeWord(const std::string& word, std::vector<Token>& tokens, LexerInfo& info)
|
|
{
|
|
// Checks the map for a check and if so adds it with its enum equivalent //
|
|
if (auto keyword = keywords.find(word); keyword != keywords.end())
|
|
{
|
|
tokens.push_back({ keyword->second, info, "", (std::streamsize)word.size() });
|
|
}
|
|
|
|
// Else adds it as a type of IDENTIFIER //
|
|
else
|
|
{
|
|
tokens.push_back({ Token::IDENTIFIER, info, word, (std::streamsize)word.size() });
|
|
}
|
|
}
|
|
|
|
const std::vector<Token> LX::LexicalAnalyze(const std::string& contents, std::streamsize len, std::ofstream* log)
|
|
{
|
|
// Logs the start of the lexical analysis
|
|
SafeLog(log, LOG_BREAK, "Started lexing file", LOG_BREAK);
|
|
|
|
// Allocates a large ammount of memory to hold the output //
|
|
// Will shrink the size later on to stop excess memory being allocated //
|
|
std::vector<Token> tokens = {};
|
|
tokens.reserve(0xFFFF);
|
|
|
|
// Trackers for when the program is iterating over the file //
|
|
LexerInfo info;
|
|
|
|
// Iterates over the file and turns it into tokens //
|
|
while (info.index < len)
|
|
{
|
|
// Stores the current character for easy access
|
|
const char current = contents[info.index];
|
|
|
|
// Checks if it is not at end //
|
|
// Predicts it is not at end for microptimsation //
|
|
if (info.index + 1 < len) [[likely]]
|
|
{
|
|
// Gets the next character //
|
|
const char next = contents[info.index + 1];
|
|
|
|
// Sets flags depending on the value of the next character //
|
|
info.isNextCharAlpha = (next >= 'a' && next <= 'z') || (next >= 'A' && next <= 'Z');
|
|
info.isNextCharNumeric = (next >= '0' && next <= '9') || CanBePartOfNumberLiteral(next);
|
|
}
|
|
|
|
else
|
|
{
|
|
// Else defaults the next character's flags to false //
|
|
info.isNextCharAlpha = false;
|
|
info.isNextCharNumeric = false;
|
|
}
|
|
|
|
// Works out if the current character is alphabetic or numeric //
|
|
info.isAlpha = (current >= 'a' && current <= 'z') || (current >= 'A' && current <= 'Z');
|
|
info.isNumeric = (current >= '0' && current <= '9');
|
|
|
|
// Updates string literal tracker and skips over rest if in a string literal //
|
|
if (current == '"')
|
|
{
|
|
// Start of string literal //
|
|
if (info.inStringLiteral == false)
|
|
{
|
|
// Updates the neccesarry trackers //
|
|
info.startOfStringLiteral = info.index + 1;
|
|
info.inStringLiteral = true;
|
|
}
|
|
|
|
// End of string literal //
|
|
else
|
|
{
|
|
// Adds the string literal token to the token vector //
|
|
std::string lit(contents.data() + info.startOfStringLiteral, info.index - info.startOfStringLiteral);
|
|
tokens.push_back({ Token::STRING_LITERAL, info, lit, (std::streamsize)lit.length() + 1 });
|
|
|
|
// Updates trackers //
|
|
info.inStringLiteral = false;
|
|
}
|
|
}
|
|
|
|
// Skips over rest if within a string literal //
|
|
else if (info.inStringLiteral);
|
|
|
|
// Updates comment state //
|
|
else if (current == '#')
|
|
{
|
|
info.inComment = !info.inComment;
|
|
}
|
|
|
|
// Skips over if within a comment //
|
|
else if (info.inComment);
|
|
|
|
// Start of a number //
|
|
else if (info.isNumeric == true && info.wasLastCharNumeric == false && info.lexingNumber == false)
|
|
{
|
|
// Stores the start of the number //
|
|
info.startOfNumberLiteral = info.index;
|
|
|
|
// Checks if it as the end (single char numbers) //
|
|
if (info.isNextCharNumeric == false)
|
|
{
|
|
// Pushes the number to the token vector. Number literals are stored as string in the tokens //
|
|
std::string num(contents.data() + info.startOfNumberLiteral, (unsigned __int64)(info.index + 1) - info.startOfNumberLiteral);
|
|
tokens.push_back({ Token::NUMBER_LITERAL, info, num, (std::streamsize)num.size() });
|
|
}
|
|
|
|
// Stores it is lexing a number literal //
|
|
else { info.lexingNumber = true; }
|
|
}
|
|
|
|
// End of a number //
|
|
else if ((info.isNumeric == true || CanBePartOfNumberLiteral(current)) && info.isNextCharNumeric == false && info.lexingNumber == true)
|
|
{
|
|
// Pushes the number to the token vector. Number literals are stored as string in the tokens //
|
|
std::string num(contents.data() + info.startOfNumberLiteral, (unsigned __int64)(info.index + 1) - info.startOfNumberLiteral);
|
|
tokens.push_back({ Token::NUMBER_LITERAL, info, num, (std::streamsize)num.size() });
|
|
info.lexingNumber = false; // Stops storing it is lexing a number
|
|
}
|
|
|
|
// During a number //
|
|
else if (info.isNumeric == true);
|
|
else if (info.lexingNumber == true && CanBePartOfNumberLiteral(current));
|
|
|
|
// Start of a word //
|
|
else if (info.isAlpha == true && info.wasLastCharAlpha == false)
|
|
{
|
|
// Stores the start of the word //
|
|
info.startOfWord = info.index;
|
|
|
|
// Checks if it is at the end (single char words) //
|
|
if (info.isNextCharAlpha == false)
|
|
{
|
|
// Calls the function designed to handle the tokenisation of words //
|
|
TokenizeWord({ contents.data() + info.startOfWord, 1 }, tokens, info);
|
|
}
|
|
}
|
|
|
|
// End of a word //
|
|
else if (info.isAlpha == true && info.isNextCharAlpha == false)
|
|
{
|
|
// Calls the function designed to handle the tokenisation of words //
|
|
TokenizeWord({ contents.data() + info.startOfWord, (unsigned __int64)((info.index + 1) - info.startOfWord) }, tokens, info);
|
|
}
|
|
|
|
// During a word //
|
|
else if (info.isAlpha == true);
|
|
|
|
// Operators (+, -, /, *) //
|
|
else if (auto op = operators.find(current); op != operators.end())
|
|
{
|
|
tokens.push_back({ op->second, info, "", 1 });
|
|
}
|
|
|
|
// If it is here and not whitespace that means it's an invalid character //
|
|
else if (current == ' ' || current == '\r');
|
|
|
|
// Skips over an extra 3 spaces as tabs SHOULD ALWAYS take up 4 spaces //
|
|
// Only for the column and not index //
|
|
else if (current == '\t')
|
|
{
|
|
info.column = info.column + 3;
|
|
}
|
|
|
|
// Increments the line number and resets the column on entering a new line //
|
|
else if (current == '\n')
|
|
{
|
|
info.column = 0;
|
|
info.line++;
|
|
}
|
|
|
|
// Throws an error with all the relevant information //
|
|
else
|
|
{
|
|
throw InvalidCharInSource(info.column, info.line, info.index, contents[info.index]);
|
|
}
|
|
|
|
// Log dumps A LOT of info //
|
|
|
|
#ifdef LOG_EVERYTHING
|
|
|
|
SafeLog
|
|
(
|
|
log, "Index: ", std::left, std::setw(3), info.index,
|
|
" Is Alpha: ", info.isAlpha,
|
|
" Is Numeric: ", info.isNumeric,
|
|
" In Comment: ", info.inComment,
|
|
" In String: ", info.inStringLiteral,
|
|
" Next Char Alpha: ", info.isNextCharAlpha,
|
|
" Next Char Numeric: ", info.wasLastCharNumeric,
|
|
" Last Char Numeric: ", info.wasLastCharAlpha,
|
|
" Lexing number: ", info.lexingNumber,
|
|
" Current: {", PrintChar(current), "}"
|
|
);
|
|
|
|
#endif // LOG_EVERYTHING
|
|
|
|
// Updates trackers to their default state of a new character //
|
|
|
|
info.index++;
|
|
info.column++;
|
|
|
|
info.wasLastCharAlpha = info.isAlpha;
|
|
info.wasLastCharNumeric = info.isNumeric;
|
|
}
|
|
|
|
// Logs the tokens if logging is on //
|
|
if (log != nullptr)
|
|
{
|
|
#ifdef LOG_EVERYTHING
|
|
SafeLog(log, "\n"); // Puts a space when there is a lot in the log
|
|
#endif // LOG_EVERYTHING
|
|
|
|
for (auto& token : tokens)
|
|
{
|
|
if (token.contents.empty() == false)
|
|
{
|
|
SafeLog(log, std::left, "{ Line: ", std::setw(3), token.line, ", Index: ", std::setw(3), token.index, ", Length: ", std::setw(2), token.length, " } ", std::setw(30), ToStringNoFormat(token.type) + ":", "{", token.contents, "}");
|
|
}
|
|
|
|
else
|
|
{
|
|
SafeLog(log, std::left, "{ Line: ", std::setw(3), token.line, ", Index: ", std::setw(3), token.index, ", Length: ", std::setw(2), token.length, " } ", ToStringNoFormat(token.type));
|
|
}
|
|
}
|
|
|
|
SafeLog(log, "\n END OF TOKENS");
|
|
}
|
|
|
|
// Shrinks the vector down to minimum size before returning to avoid excess memory being allocated
|
|
tokens.shrink_to_fit();
|
|
return tokens;
|
|
}
|
|
} |