diff --git a/Common/inc/IO.h b/Common/inc/IO.h index 7900e0f..805bfdd 100644 --- a/Common/inc/IO.h +++ b/Common/inc/IO.h @@ -57,4 +57,19 @@ namespace LX // Returns the string between start and end // return src.substr(start, end - start); } + + // Util function for turning a a char to a string. Used to stop '\t' being printed as a tab // + inline std::string CharAsStrLit(const char c) + { + switch (c) + { + // Stores them as pure string literals // + case '\n': return R"(\n)"; + case '\t': return R"(\t)"; + case '\r': return R"(\r)"; + + // Else returns a string of length one with the char inside // + default: return std::string(1, c); + } + } } diff --git a/IR-Generator/src/Generator.cpp b/IR-Generator/src/Generator.cpp index a967f2c..7f2aaf3 100644 --- a/IR-Generator/src/Generator.cpp +++ b/IR-Generator/src/Generator.cpp @@ -5,9 +5,6 @@ extern "C" int __declspec(dllexport) GenIR(const char* a_inpPath, const char* a_outPath) { - // Initalises the log // - LX::Log::Init(); - try { // Initalises the log // diff --git a/Lexer/Lexer.vcxproj b/Lexer/Lexer.vcxproj index a55931d..25693d2 100644 --- a/Lexer/Lexer.vcxproj +++ b/Lexer/Lexer.vcxproj @@ -148,9 +148,11 @@ Create + + diff --git a/Lexer/Lexer.vcxproj.filters b/Lexer/Lexer.vcxproj.filters index 2238084..af22fc2 100644 --- a/Lexer/Lexer.vcxproj.filters +++ b/Lexer/Lexer.vcxproj.filters @@ -20,6 +20,9 @@ Header Files + + Source Files + @@ -28,5 +31,8 @@ Header Files + + Header Files + \ No newline at end of file diff --git a/Lexer/inc/Constants.h b/Lexer/inc/Constants.h new file mode 100644 index 0000000..6d6b385 --- /dev/null +++ b/Lexer/inc/Constants.h @@ -0,0 +1,42 @@ +#include + +#include + +namespace LX +{ + // All the keywords the lexer currently supports with their token-enum equivalents // + static const std::unordered_map keywords = + { + { "for" , Token::FOR }, + { "while" , Token::WHILE }, + { "if" , Token::IF }, + { "else" , Token::ELSE }, + { "elif" , Token::ELIF }, + { "func" , Token::FUNCTION }, + { "return" , Token::RETURN }, + { "int" , Token::INT_DEC } + }; + + // All the symbols supported by the lexer // + static const std::unordered_map symbols = + { + { '{', Token::OPEN_BRACKET }, + { '}', Token::CLOSE_BRACKET }, + { '[', Token::OPEN_BRACE }, + { ']', Token::CLOSE_BRACE }, + { '(', Token::OPEN_PAREN }, + { ')', Token::CLOSE_PAREN }, + { ',', Token::COMMA }, + { '=', Token::ASSIGN } + }; + + // All the single-char operators currently supported by the lexer with their token-enum equivalents // + // TODO: Support multi-char operators such as: ==, -> +=, && + static const std::unordered_map operators = + { + { '+', Token::ADD }, + { '-', Token::SUB }, + { '*', Token::MUL }, + { '/', Token::DIV } + }; +} diff --git a/Lexer/inc/LexerErrors.h b/Lexer/inc/LexerErrors.h index baf40ac..1059316 100644 --- a/Lexer/inc/LexerErrors.h +++ b/Lexer/inc/LexerErrors.h @@ -11,7 +11,7 @@ namespace LX { GENERATE_LX_ERROR_REQUIRED_FUNCTION_DECLARATIONS; - InvalidCharInSource(const LexerInfo& info, const std::string& source, const std::string _file); + InvalidCharInSource(const LexerInfo& info, const std::string _file); std::string lineContents; std::string file; diff --git a/Lexer/inc/LexerInfo.h b/Lexer/inc/LexerInfo.h index 86b29a9..00d6abb 100644 --- a/Lexer/inc/LexerInfo.h +++ b/Lexer/inc/LexerInfo.h @@ -7,6 +7,11 @@ namespace LX // Struct to store the current information of the lexer // struct LexerInfo { + // Constructor to set the constants // + LexerInfo(const std::string& _source) + : source(_source), len(_source.length()) + {} + // Current trackers of where in the source it is // std::streamsize line = 1; // <- Lines start on 1 (probably because of non-programmer's) @@ -19,6 +24,11 @@ namespace LX std::streamsize startOfNumberLiteral = 0; std::streamsize startOfStringLiteral = 0; + // Information about the source // + + const std::string& source; + const std::streamsize len; + // Different flags of the lexer // // Stored as a bitset to minimse memory allocated // // - Basically no difference, because only one exists at any given time // diff --git a/Lexer/src/Lexer.cpp b/Lexer/src/Lexer.cpp index 2b724e3..d4e1f8b 100644 --- a/Lexer/src/Lexer.cpp +++ b/Lexer/src/Lexer.cpp @@ -4,174 +4,67 @@ #include #include +#include namespace LX { - InvalidCharInSource::InvalidCharInSource(const LexerInfo& info, const std::string& source, const std::string _file) - : col(info.column), line(info.line), file(_file), lineContents{}, invalid(source[info.index]) - { - // Gets the line the error is on // - lineContents = GetLineAtIndexOf(source, info.index); - } - - void InvalidCharInSource::PrintToConsole() const - { - // Calculates the length of the line number in the console so it is formatted correctly // - std::ostringstream oss; - oss << std::setw(3) << line; - size_t lineNumberWidthInConsole = std::max(oss.str().size(), (size_t)3); - - // Prints the error with the relevant information to the console // - std::cout << "\n"; - LX::PrintStringAsColor("Error: ", LX::Color::LIGHT_RED); - std::cout << "Invalid character found in "; - LX::PrintStringAsColor(file, LX::Color::WHITE); - std::cout << " {"; - LX::PrintStringAsColor(std::string(1, invalid), LX::Color::LIGHT_RED); - std::cout << "}:\n"; - std::cout << "Line: " << std::setw(lineNumberWidthInConsole) << line << " | " << lineContents << "\n"; - std::cout << " " << std::setw(lineNumberWidthInConsole) << "" << " | " << std::setw(col - 1) << ""; - LX::PrintStringAsColor("^", LX::Color::LIGHT_RED); - std::cout << "\n"; - } - - const char* InvalidCharInSource::ErrorType() const - { - return "Invalid char in source"; - } - - // Helper macro for outputting token type // - #define TOKEN_CASE(type) case type: return #type; - - // Helper util function to translate a tokentype to it's enum val // - static std::string ToStringNoFormat(Token::TokenType type) - { - switch (type) - { - TOKEN_CASE(Token::STRING_LITERAL); - TOKEN_CASE(Token::IDENTIFIER); - TOKEN_CASE(Token::FOR); - TOKEN_CASE(Token::WHILE); - TOKEN_CASE(Token::IF); - TOKEN_CASE(Token::ELSE); - TOKEN_CASE(Token::ELIF); - TOKEN_CASE(Token::FUNCTION); - TOKEN_CASE(Token::ADD); - TOKEN_CASE(Token::SUB); - TOKEN_CASE(Token::MUL); - TOKEN_CASE(Token::DIV); - TOKEN_CASE(Token::NUMBER_LITERAL); - TOKEN_CASE(Token::RETURN); - TOKEN_CASE(Token::OPEN_BRACE); - TOKEN_CASE(Token::CLOSE_BRACE); - TOKEN_CASE(Token::OPEN_BRACKET); - TOKEN_CASE(Token::CLOSE_BRACKET); - TOKEN_CASE(Token::OPEN_PAREN); - TOKEN_CASE(Token::CLOSE_PAREN); - TOKEN_CASE(Token::ASSIGN); - TOKEN_CASE(Token::INT_DEC); - - default: - return "Unknown: " + std::to_string(type); - } - } - - // Logging function to turn a tokentype enum val into a nicely formatted string // - std::string ToString(Token::TokenType type) - { - // Gets the unformated version of the string // - std::string unformatted = ToStringNoFormat(type); - unformatted = unformatted.substr(7); // Removes the Token:: prefix - - // Formats the string (turns to lowercase and replaces _ with a space // - std::string formatted; - - for (char current : unformatted) - { - // Adding 32 makes it lowercase due to how ASCII works // - if ((current >= 'A' && current <= 'Z')) { formatted.push_back(current + 32); } - - // Replaces _ with spaces // - else if (current == '_') { formatted.push_back(' '); } - - // Else adds the current character // - else { formatted.push_back(current); } - } - - // Returns the formatted string // - return formatted; - } - - // Stops use outside of the function // - #undef TOKEN_CASE - // Helper function for dealing with floating-point number literals // static constexpr bool CanBePartOfNumberLiteral(const char c) { return (c == '.') || (c == 'f'); } - // Helper function to stop printing whitespace as pure whitespace // - static std::string PrintChar(const char c) - { - switch (c) - { - // Stores them as pure string literals // - case '\n': return R"(\n)"; - case '\t': return R"(\t)"; - case '\r': return R"(\r)"; - - // Else returns a string of length one with the char inside // - default: return std::string(1, c); - } - } - - // All the keywords the lexer currently supports with their token-enum equivalents // - static const std::unordered_map keywords = - { - { "for" , Token::FOR }, - { "while" , Token::WHILE }, - { "if" , Token::IF }, - { "else" , Token::ELSE }, - { "elif" , Token::ELIF }, - { "func" , Token::FUNCTION }, - { "return" , Token::RETURN }, - { "int" , Token::INT_DEC } - }; - - // All the symbols supported by the lexer // - static const std::unordered_map symbols = - { - { '{', Token::OPEN_BRACKET }, - { '}', Token::CLOSE_BRACKET }, - { '[', Token::OPEN_BRACE }, - { ']', Token::CLOSE_BRACE }, - { '(', Token::OPEN_PAREN }, - { ')', Token::CLOSE_PAREN }, - { ',', Token::COMMA }, - { '=', Token::ASSIGN } - }; - - // All the single-char operators currently supported by the lexer with their token-enum equivalents // - // TODO: Support multi-char operators such as: ==, -> +=, && - static const std::unordered_map operators = - { - { '+', Token::ADD }, - { '-', Token::SUB }, - { '*', Token::MUL }, - { '/', Token::DIV } - }; - // Checks if the given word is a keyword before adding it to the tokens // - static void TokenizeWord(const std::string& word, std::vector& tokens, LexerInfo& info, const std::string& contents) + static void TokenizeWord(const std::string& word, std::vector& tokens, LexerInfo& info) { // Checks the map for a check and if so adds it with its enum equivalent // if (auto keyword = keywords.find(word); keyword != keywords.end()) { - tokens.push_back({ keyword->second, info, (std::streamsize)word.size(), contents }); + tokens.push_back({ keyword->second, info, (std::streamsize)word.size(), info.source }); } // Else adds it as a type of IDENTIFIER // else { - tokens.push_back({ Token::IDENTIFIER, info, (std::streamsize)word.size(), contents }); + tokens.push_back({ Token::IDENTIFIER, info, (std::streamsize)word.size(), info.source }); + } + } + + static inline void UpdateLexerInfo(LexerInfo& info) + { + // Transfers the previous state if not at the beginning // + if (info.index != 0) [[likely]] + { + info.wasLastCharAlpha = info.isAlpha; + info.wasLastCharNumeric = info.isNumeric; + + info.isAlpha = info.isNextCharAlpha; + info.isNumeric = info.isNextCharNumeric; + } + + else + { + // Stores the current character for easy access + const char current = info.source[info.index]; + + // Works out if the current character is alphabetic or numeric // + info.isAlpha = (current >= 'a' && current <= 'z') || (current >= 'A' && current <= 'Z'); + info.isNumeric = (current >= '0' && current <= '9'); + } + + // Only does next character checks when not at the end // + if (info.index + 1 < info.len) [[likely]] + { + // Gets the next character // + const char next = info.source[info.index + 1]; + + // Sets flags depending on the value of the next character // + info.isNextCharAlpha = (next >= 'a' && next <= 'z') || (next >= 'A' && next <= 'Z'); + info.isNextCharNumeric = (next >= '0' && next <= '9') || CanBePartOfNumberLiteral(next); + } + + // Else defaults the flags to false // + else + { + info.isNextCharAlpha = false; + info.isNextCharNumeric = false; } } @@ -180,8 +73,8 @@ namespace LX // Logs that the file is being read // Log::LogNewSection("Reading file: ", path.string()); - std::string contents = ReadFileToString(path); - const std::streamsize len = contents.length(); + std::string fileContents = ReadFileToString(path); + const std::streamsize len = fileContents.length(); // Logs the start of the lexical analysis Log::LogNewSection("Lexing file"); @@ -192,36 +85,16 @@ namespace LX tokens.reserve(0xFFFF); // Trackers for when the program is iterating over the file // - LexerInfo info; + LexerInfo info(fileContents); // Iterates over the file and turns it into tokens // while (info.index < len) { // Stores the current character for easy access - const char current = contents[info.index]; + const char current = info.source[info.index]; - // Checks if it is not at end // - // Predicts it is not at end for microptimsation // - if (info.index + 1 < len) [[likely]] - { - // Gets the next character // - const char next = contents[info.index + 1]; - - // Sets flags depending on the value of the next character // - info.isNextCharAlpha = (next >= 'a' && next <= 'z') || (next >= 'A' && next <= 'Z'); - info.isNextCharNumeric = (next >= '0' && next <= '9') || CanBePartOfNumberLiteral(next); - } - - else - { - // Else defaults the next character's flags to false // - info.isNextCharAlpha = false; - info.isNextCharNumeric = false; - } - - // Works out if the current character is alphabetic or numeric // - info.isAlpha = (current >= 'a' && current <= 'z') || (current >= 'A' && current <= 'Z'); - info.isNumeric = (current >= '0' && current <= '9'); + // Updates the LexerInfo // + UpdateLexerInfo(info); // Updates string literal tracker and skips over rest if in a string literal // if (current == '"') @@ -238,8 +111,8 @@ namespace LX else { // Adds the string literal token to the token vector // - std::string lit(contents.data() + info.startOfStringLiteral, info.index - info.startOfStringLiteral); - tokens.push_back({ Token::STRING_LITERAL, info, (std::streamsize)lit.length() + 2, contents }); // Adding two makes the "" be stored as well + std::string lit(info.source.data() + info.startOfStringLiteral, info.index - info.startOfStringLiteral); + tokens.push_back({ Token::STRING_LITERAL, info, (std::streamsize)lit.length() + 2, info.source }); // Adding two makes the "" be stored as well // Updates trackers // info.inStringLiteral = false; @@ -268,8 +141,8 @@ namespace LX if (info.isNextCharNumeric == false) { // Pushes the number to the token vector. Number literals are stored as string in the tokens // - std::string num(contents.data() + info.startOfNumberLiteral, (unsigned __int64)(info.index + 1) - info.startOfNumberLiteral); - tokens.push_back({ Token::NUMBER_LITERAL, info, (std::streamsize)num.size(), contents }); + std::string num(info.source.data() + info.startOfNumberLiteral, (unsigned __int64)(info.index + 1) - info.startOfNumberLiteral); + tokens.push_back({ Token::NUMBER_LITERAL, info, (std::streamsize)num.size(), info.source }); } // Stores it is lexing a number literal // @@ -280,8 +153,8 @@ namespace LX else if ((info.isNumeric == true || CanBePartOfNumberLiteral(current)) && info.isNextCharNumeric == false && info.lexingNumber == true) { // Pushes the number to the token vector. Number literals are stored as string in the tokens // - std::string num(contents.data() + info.startOfNumberLiteral, (unsigned __int64)(info.index + 1) - info.startOfNumberLiteral); - tokens.push_back({ Token::NUMBER_LITERAL, info, (std::streamsize)num.size(), contents }); + std::string num(info.source.data() + info.startOfNumberLiteral, (unsigned __int64)(info.index + 1) - info.startOfNumberLiteral); + tokens.push_back({ Token::NUMBER_LITERAL, info, (std::streamsize)num.size(), info.source }); info.lexingNumber = false; // Stops storing it is lexing a number } @@ -299,7 +172,7 @@ namespace LX if (info.isNextCharAlpha == false) { // Calls the function designed to handle the tokenisation of words // - TokenizeWord({ contents.data() + info.startOfWord, 1 }, tokens, info, contents); + TokenizeWord({ info.source.data() + info.startOfWord, 1 }, tokens, info); } } @@ -307,7 +180,7 @@ namespace LX else if (info.isAlpha == true && info.isNextCharAlpha == false) { // Calls the function designed to handle the tokenisation of words // - TokenizeWord({ contents.data() + info.startOfWord, (unsigned __int64)((info.index + 1) - info.startOfWord) }, tokens, info, contents); + TokenizeWord({ info.source.data() + info.startOfWord, (unsigned __int64)((info.index + 1) - info.startOfWord) }, tokens, info); } // During a word // @@ -316,13 +189,13 @@ namespace LX // Symbols // else if (auto sym = symbols.find(current); sym != symbols.end()) { - tokens.push_back({ sym->second, info, 1, contents }); + tokens.push_back({ sym->second, info, 1, info.source }); } // Operators (+, -, /, *) // else if (auto op = operators.find(current); op != operators.end()) { - tokens.push_back({ op->second, info, 1, contents }); + tokens.push_back({ op->second, info, 1, info.source }); } // If it is here and not whitespace that means it's an invalid character // @@ -345,7 +218,7 @@ namespace LX // Throws an error with all the relevant information // else { - ThrowIf(true, info, contents, path.string()); + ThrowIf(true, info, path.string()); } // Log dumps A LOT of info // @@ -361,16 +234,13 @@ namespace LX " Next Char Numeric: ", info.wasLastCharNumeric, " Last Char Numeric: ", info.wasLastCharAlpha, " Lexing number: ", info.lexingNumber, - " Current: {", PrintChar(current), "}" + " Current: {", CharAsStrLit(current), "}" ); - // Updates trackers to their default state of a new character // + // Updates the indecies to the next character // info.index++; info.column++; - - info.wasLastCharAlpha = info.isAlpha; - info.wasLastCharNumeric = info.isNumeric; } Log::out("\n"); // Puts a space to clean up the log @@ -383,7 +253,7 @@ namespace LX "{ Line: ", std::setw(3), token.line, ", Index: ", std::setw(3), token.index, ", Length: ", std::setw(2), token.length, " } ", - std::setw(30), ToStringNoFormat(token.type) + ":", "{", token.GetContents(), "}" + std::setw(30), ToString(token.type) + ":", "{", token.GetContents(), "}" ); } diff --git a/Lexer/src/LexerErrors.cpp b/Lexer/src/LexerErrors.cpp new file mode 100644 index 0000000..5677d39 --- /dev/null +++ b/Lexer/src/LexerErrors.cpp @@ -0,0 +1,38 @@ +#include + +#include + +#include + +namespace LX +{ + InvalidCharInSource::InvalidCharInSource(const LexerInfo& info, const std::string _file) + : col(info.column), line(info.line), file(_file), lineContents(GetLineAtIndexOf(info.source, info.index)), invalid(info.source[info.index]) + {} + + void InvalidCharInSource::PrintToConsole() const + { + // Calculates the length of the line number in the console so it is formatted correctly // + std::ostringstream oss; + oss << std::setw(3) << line; + size_t lineNumberWidthInConsole = std::max(oss.str().size(), (size_t)3); + + // Prints the error with the relevant information to the console // + std::cout << "\n"; + LX::PrintStringAsColor("Error: ", LX::Color::LIGHT_RED); + std::cout << "Invalid character found in "; + LX::PrintStringAsColor(file, LX::Color::WHITE); + std::cout << " {"; + LX::PrintStringAsColor(std::string(1, invalid), LX::Color::LIGHT_RED); + std::cout << "}:\n"; + std::cout << "Line: " << std::setw(lineNumberWidthInConsole) << line << " | " << lineContents << "\n"; + std::cout << " " << std::setw(lineNumberWidthInConsole) << "" << " | " << std::setw(col - 1) << ""; + LX::PrintStringAsColor("^", LX::Color::LIGHT_RED); + std::cout << "\n"; + } + + const char* InvalidCharInSource::ErrorType() const + { + return "Invalid char in source"; + } +} diff --git a/Lexer/src/Token.cpp b/Lexer/src/Token.cpp index ee54024..d342c13 100644 --- a/Lexer/src/Token.cpp +++ b/Lexer/src/Token.cpp @@ -6,13 +6,78 @@ namespace LX { + // Helper macro for the function below // + // Undefined after the function to stop accidental use // + #define TOKEN_CASE(type) case type: return #type; + + // Static to stop use outside of this file as ToString should be used instead // + static std::string ToStringNoFormat(Token::TokenType type) + { + switch (type) + { + TOKEN_CASE(Token::STRING_LITERAL); + TOKEN_CASE(Token::IDENTIFIER); + TOKEN_CASE(Token::FOR); + TOKEN_CASE(Token::WHILE); + TOKEN_CASE(Token::IF); + TOKEN_CASE(Token::ELSE); + TOKEN_CASE(Token::ELIF); + TOKEN_CASE(Token::FUNCTION); + TOKEN_CASE(Token::ADD); + TOKEN_CASE(Token::SUB); + TOKEN_CASE(Token::MUL); + TOKEN_CASE(Token::DIV); + TOKEN_CASE(Token::NUMBER_LITERAL); + TOKEN_CASE(Token::RETURN); + TOKEN_CASE(Token::OPEN_BRACE); + TOKEN_CASE(Token::CLOSE_BRACE); + TOKEN_CASE(Token::OPEN_BRACKET); + TOKEN_CASE(Token::CLOSE_BRACKET); + TOKEN_CASE(Token::OPEN_PAREN); + TOKEN_CASE(Token::CLOSE_PAREN); + TOKEN_CASE(Token::ASSIGN); + TOKEN_CASE(Token::INT_DEC); + + // Default just returns it as it's numerical value // + default: return "Unknown: " + std::to_string((int)type); + } + } + + #undef TOKEN_CASE + + // Logging function to turn a tokentype enum val into a nicer formatted string than the base equivalent // + std::string ToString(Token::TokenType type) + { + // Gets the unformated version of the string // + std::string unformatted = ToStringNoFormat(type); + unformatted = unformatted.substr(7); // Removes the Token:: prefix + + // Formats the string (turns to lowercase and replaces _ with a space // + std::string formatted; + + for (char current : unformatted) + { + // Adding 32 makes it lowercase due to how ASCII works // + if ((current >= 'A' && current <= 'Z')) { formatted.push_back(current + 32); } + + // Replaces _ with spaces // + else if (current == '_') { formatted.push_back(' '); } + + // Else adds the current character // + else { formatted.push_back(current); } + } + + // Returns the formatted string // + return formatted; + } + // Passes the constructor args to the values // Token::Token(const TokenType _type, const LexerInfo& info, std::streamsize _length, const std::string& source) : type(_type), index(info.index - _length + 1), line(info.line), column(info.column - _length), length(_length), contents(source.data() + index, length) {} - // This function used to have a use // - // Now it exists cause I'm lazy // + // This function used to have a use but now it is just a simple getter // + // Recommended to use in case of future changes // std::string Token::GetContents() const { return contents; diff --git a/example/main.lx b/example/main.lx index 5b58e0e..d56ce30 100644 --- a/example/main.lx +++ b/example/main.lx @@ -3,8 +3,6 @@ func main() int a a = 65465 - / - int b b = 6