From 4b47e803ceb1d98e2eae6ebd3b7cce560928ab91 Mon Sep 17 00:00:00 2001 From: Pasha Bibko <156938226+PashaBibko@users.noreply.github.com> Date: Tue, 22 Apr 2025 00:16:20 +0100 Subject: [PATCH] Improved logging Small errors with logging parser unexpected token errors. Tokens need to be re-written with how they store memory as it can be halved. --- Common/Common.vcxproj | 6 +- Common/Common.vcxproj.filters | 32 ++------- Frontend-Main.cpp | 69 +++++++++++++++++- LX-Compiler.sln | 2 +- LX-LLVM.vcxproj | 2 +- Lexer/src/Lexer.cpp | 131 +++++++++++++++------------------- Lexer/src/Token.cpp | 4 +- Parser/src/Parser.cpp | 6 +- common/Lexer.h | 56 +++++++++++---- common/Parser.h | 14 ++++ common/Util.h | 3 + 11 files changed, 201 insertions(+), 124 deletions(-) diff --git a/Common/Common.vcxproj b/Common/Common.vcxproj index 9b8525d..90b2c99 100644 --- a/Common/Common.vcxproj +++ b/Common/Common.vcxproj @@ -128,6 +128,9 @@ true + + + @@ -135,9 +138,6 @@ - - - diff --git a/Common/Common.vcxproj.filters b/Common/Common.vcxproj.filters index 4db1d4a..b18231f 100644 --- a/Common/Common.vcxproj.filters +++ b/Common/Common.vcxproj.filters @@ -5,35 +5,17 @@ {4FC737F1-C7A5-4376-A066-2A32D752A2FF} cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx - - {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} - rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms - - - {93995380-89BD-4b04-88EB-625FBE52EBFB} - h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd - - - - - inclu - - - inclu - - - inclu - - - inclu - - - inclu - Source Files + + + + + + + \ No newline at end of file diff --git a/Frontend-Main.cpp b/Frontend-Main.cpp index d26b630..b490ba1 100644 --- a/Frontend-Main.cpp +++ b/Frontend-Main.cpp @@ -17,6 +17,22 @@ namespace LX struct InvalidInputFilePath {}; struct InvalidOutputFilePath {}; struct InvalidLogFilePath {}; + + // Util function for getting a line of the source at a given index (used for errors) // + static std::string GetLineAtIndexOf(const std::string src, const std::streamsize index) + { + // Finds the start of the line // + size_t start = src.rfind('\n', index); + if (start == std::string::npos) { start = 0; } // None means first line + else { start = start + 1; } // Skips new line char + + // Finds the end of the line // + size_t end = src.find('\n', index); + if (end == std::string::npos) { end = src.size(); } // None means last line + + // Returns the string between start and end // + return src.substr(start, end - start); + } } int main(int argc, char** argv) @@ -26,6 +42,9 @@ int main(int argc, char** argv) std::filesystem::path outPath; std::filesystem::path logPath; + // Creates the contents string outside of the try-catch so they can be used in errors // + std::string contents; + // Creates the log-file out of the try-catch so it can be closed propely if an error is thrown // std::unique_ptr log = nullptr; @@ -43,6 +62,12 @@ int main(int argc, char** argv) std::ifstream inpFile(inpPath, std::ios::binary | std::ios::ate); // Opens in binary at the end for microptimisation // LX::ThrowIf(inpFile.is_open() == false); + // Copies the file into the string // + const std::streamsize len = inpFile.tellg(); // Gets length of file because it was opened at the end + inpFile.seekg(0, std::ios::beg); // Goes back to the beginning + contents = std::string(len, '\0'); // Allocates all the space for the string + inpFile.read(&contents[0], len); // Transfers file contents to string + // Opens / Creates the output file // std::ofstream outFile(outPath); LX::ThrowIf(outFile.is_open() == false); @@ -60,7 +85,7 @@ int main(int argc, char** argv) std::cout << std::filesystem::absolute(inpPath) << " -> " << std::filesystem::absolute(outPath) << std::endl; // Create tokens out of the input file // - std::vectortokens = LX::LexicalAnalyze(inpFile, log.get()); + std::vectortokens = LX::LexicalAnalyze(contents, len, log.get()); LX::SafeFlush(log.get()); // Turns the tokens into an AST // @@ -144,14 +169,17 @@ int main(int argc, char** argv) oss << std::setw(3) << e.line; size_t lineNumberWidthInConsole = std::max(oss.str().size(), (size_t)3); + // Gets the line of the error // + std::string line = LX::GetLineAtIndexOf(contents, e.index); + // Prints the error with the relevant information to the console // std::cout << "\n"; LX::PrintStringAsColor("Error: ", LX::Color::LIGHT_RED); std::cout << "Invalid character found in "; LX::PrintStringAsColor(inpPath.filename().string(), LX::Color::WHITE); std::cout << ":\n"; - std::cout << "Line: " << std::setw(lineNumberWidthInConsole) << e.line << " | " << e.lineContents << "\n"; - std::cout << " " << std::setw(lineNumberWidthInConsole) << "" << " | " << std::setw(e.index); + std::cout << "Line: " << std::setw(lineNumberWidthInConsole) << e.line << " | " << line << "\n"; + std::cout << " " << std::setw(lineNumberWidthInConsole) << "" << " | " << std::setw(e.col); LX::PrintStringAsColor("^", LX::Color::LIGHT_RED); std::cout << "\n"; @@ -159,6 +187,41 @@ int main(int argc, char** argv) return 5; } + catch (LX::UnexpectedToken& e) + { + // Calculates the length of the line number in the console so it is formatted correctly // + std::ostringstream oss; + oss << std::setw(3) << e.got.line; + size_t lineNumberWidthInConsole = std::max(oss.str().size(), (size_t)3); + + // Gets the line of the error // + std::string line = LX::GetLineAtIndexOf(contents, e.got.index); + + // Prints the error to the console with the relevant info // + std::cout << "\n"; + LX::PrintStringAsColor("Error: ", LX::Color::LIGHT_RED); + std::cout << "Incorrect syntax in "; + LX::PrintStringAsColor(inpPath.filename().string(), LX::Color::WHITE); + std::cout << ", found "; + LX::PrintStringAsColor(LX::ToString(e.got.type), LX::Color::WHITE); + std::cout << " expected: "; + + // Allows the error to have a custom type that is printed to the console // + if (e.expected == LX::Token::UNDEFINED) { LX::PrintStringAsColor(e.override, LX::Color::WHITE); } + else { LX::PrintStringAsColor(LX::ToString(e.expected), LX::Color::WHITE); } + std::cout << "\n"; + + // Prints the code with the error to the console // + std::string errorSquiggle(e.got.length, '^'); + std::cout << "Line: " << std::setw(lineNumberWidthInConsole) << e.got.line << " | " << line << "\n"; + std::cout << " " << std::setw(lineNumberWidthInConsole) << "" << " | " << std::setw(e.got.column - 1) << ""; + LX::PrintStringAsColor(errorSquiggle, LX::Color::LIGHT_RED); + std::cout << "\n"; + + // Returns Exit id of 6 so other process can be alerted of the error // + return 6; + } + // Catches any std errors, there should be none // catch (std::exception& e) { diff --git a/LX-Compiler.sln b/LX-Compiler.sln index 3838c05..98e046c 100644 --- a/LX-Compiler.sln +++ b/LX-Compiler.sln @@ -3,7 +3,7 @@ Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 17 VisualStudioVersion = 17.13.35931.197 MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LX-Compiler", "LX-LLVM.vcxproj", "{CC37E36F-B3B3-41B0-A887-01E8EFE84994}" +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Frontend", "LX-LLVM.vcxproj", "{CC37E36F-B3B3-41B0-A887-01E8EFE84994}" ProjectSection(ProjectDependencies) = postProject {3125CA11-9F6D-4A4F-AFC1-37FEB3BBD9FA} = {3125CA11-9F6D-4A4F-AFC1-37FEB3BBD9FA} {4E4019F5-12E0-4EE2-9658-A0DD3038EEDA} = {4E4019F5-12E0-4EE2-9658-A0DD3038EEDA} diff --git a/LX-LLVM.vcxproj b/LX-LLVM.vcxproj index 9541de2..e10c052 100644 --- a/LX-LLVM.vcxproj +++ b/LX-LLVM.vcxproj @@ -24,7 +24,7 @@ {cc37e36f-b3b3-41b0-a887-01e8efe84994} LXLLVM 10.0 - Compiler-Frontend + Frontend diff --git a/Lexer/src/Lexer.cpp b/Lexer/src/Lexer.cpp index fa76f55..89d6f03 100644 --- a/Lexer/src/Lexer.cpp +++ b/Lexer/src/Lexer.cpp @@ -11,12 +11,11 @@ namespace LX { - // Local macros cause im lazy // - + // Helper macro for outputting token type // #define TOKEN_CASE(type) case type: return #type; - // Logging function to turn a tokentype enum val into it's string // - static std::string ToString(Token::TokenType type) + // Helper util function to translate a tokentype to it's enum val // + static std::string ToStringNoFormat(Token::TokenType type) { switch (type) { @@ -40,51 +39,53 @@ namespace LX } } - static constexpr bool CanBePartOfNumberLiteral(const char c) + // Logging function to turn a tokentype enum val into a nicely formatted string // + std::string ToString(Token::TokenType type) { - return (c == '.') || (c == 'f'); - } + // Gets the unformated version of the string // + std::string unformatted = ToStringNoFormat(type); + unformatted = unformatted.substr(7); // Removes the Token:: prefix + // Formats the string (turns to lowercase and replaces _ with a space // + std::string formatted; + + for (char current : unformatted) + { + // Adding 32 makes it lowercase due to how ASCII works // + if ((current >= 'A' && current <= 'Z')) { formatted.push_back(current + 32); } + + // Replaces _ with spaces // + else if (current == '_') { formatted.push_back(' '); } + + // Else adds the current character // + else { formatted.push_back(current); } + } + + // Returns the formatted string // + return formatted; + } + + // Stops use outside of the function // + #undef TOKEN_CASE + + // Helper function for dealing with floating-point number literals // + static constexpr bool CanBePartOfNumberLiteral(const char c) { return (c == '.') || (c == 'f'); } + + // Helper function to stop printing whitespace as pure whitespace // static std::string PrintChar(const char c) { switch (c) { + // Stores them as pure string literals // case '\n': return R"(\n)"; case '\t': return R"(\t)"; case '\r': return R"(\r)"; + + // Else returns a string of length one with the char inside // default: return std::string(1, c); } } - // Struct to store the current information of the lexer // - struct LexerInfo - { - // Current trackers of where in the source it is // - - std::streamsize line = 1; // <- Lines start on 1 (probably because of non-programmer's) - std::streamsize index = 0; - std::streamsize column = 0; // <- Columns start on 1 (probably because of non-programmer's) - - // Trackers for when a multi-char token started // - - std::streamsize startOfWord = 0; - std::streamsize startOfNumberLiteral = 0; - std::streamsize startOfStringLiteral = 0; - - // Different flags of the lexer // - // Stored as a bitset to minimse memory allocated (basically no difference, because only one exists at any given time) // - - bool isAlpha : 1 = false; - bool isNumeric : 1 = false; - bool inComment : 1 = false; - bool inStringLiteral : 1 = false; - bool isNextCharAlpha : 1 = false; - bool isNextCharNumeric : 1 = false; - bool wasLastCharAlpha : 1 = false; - bool wasLastCharNumeric : 1 = false; - bool lexingNumber : 1 = false; - }; - // All the keywords the lexer currently supports with their token-enum equivalents // static const std::unordered_map keywords = { @@ -113,17 +114,17 @@ namespace LX // Checks the map for a check and if so adds it with its enum equivalent // if (auto keyword = keywords.find(word); keyword != keywords.end()) { - tokens.push_back({ keyword->second, "", info.line, info.column - (std::streamsize)word.size(), (std::streamsize)word.size()}); + tokens.push_back({ keyword->second, info, "", (std::streamsize)word.size() }); } // Else adds it as a type of IDENTIFIER // else { - tokens.push_back({ Token::IDENTIFIER, word, info.line, info.column - (std::streamsize)word.size(), (std::streamsize)word.size()}); + tokens.push_back({ Token::IDENTIFIER, info, word, (std::streamsize)word.size() }); } } - const std::vector LX::LexicalAnalyze(std::ifstream& src, std::ofstream* log) + const std::vector LX::LexicalAnalyze(const std::string& contents, std::streamsize len, std::ofstream* log) { // Logs the start of the lexical analysis SafeLog(log, LOG_BREAK, "Started lexing file", LOG_BREAK); @@ -133,15 +134,6 @@ namespace LX std::vector tokens = {}; tokens.reserve(0xFFFF); - // Turns the contents of the file into a string // - - // Gets length of the file because it is opened at the end - const std::streamsize len = src.tellg(); - src.seekg(0, std::ios::beg); // Goes back to the beginning - - std::string contents(len, '\0'); // Preallocates all space needed - src.read(&contents[0], len); // Transfers file to string - // Trackers for when the program is iterating over the file // LexerInfo info; @@ -154,7 +146,7 @@ namespace LX // Checks if it is not at end // // Predicts it is not at end for microptimsation // if (info.index + 1 < len) [[likely]] - { + { // Gets the next character // const char next = contents[info.index + 1]; @@ -190,7 +182,7 @@ namespace LX { // Adds the string literal token to the token vector // std::string lit(contents.data() + info.startOfStringLiteral, info.index - info.startOfStringLiteral); - tokens.push_back({ Token::STRING_LITERAL, lit, info.line, info.column - (std::streamsize)lit.length(), (std::streamsize)lit.length() }); + tokens.push_back({ Token::STRING_LITERAL, info, lit, (std::streamsize)lit.length() + 1 }); // Updates trackers // info.inStringLiteral = false; @@ -220,7 +212,7 @@ namespace LX { // Pushes the number to the token vector. Number literals are stored as string in the tokens // std::string num(contents.data() + info.startOfNumberLiteral, (unsigned __int64)(info.index + 1) - info.startOfNumberLiteral); - tokens.push_back({ Token::NUMBER_LITERAL, num, info.line, info.column - (std::streamsize)num.size(), (std::streamsize)num.size() }); + tokens.push_back({ Token::NUMBER_LITERAL, info, num, (std::streamsize)num.size() }); } // Stores it is lexing a number literal // @@ -232,7 +224,7 @@ namespace LX { // Pushes the number to the token vector. Number literals are stored as string in the tokens // std::string num(contents.data() + info.startOfNumberLiteral, (unsigned __int64)(info.index + 1) - info.startOfNumberLiteral); - tokens.push_back({ Token::NUMBER_LITERAL, num, info.line, info.column - (std::streamsize)num.size(), (std::streamsize)num.size() }); + tokens.push_back({ Token::NUMBER_LITERAL, info, num, (std::streamsize)num.size() }); info.lexingNumber = false; // Stops storing it is lexing a number } @@ -267,7 +259,7 @@ namespace LX // Operators (+, -, /, *) // else if (auto op = operators.find(current); op != operators.end()) { - tokens.push_back({ op->second, "", info.line, info.column, 1}); + tokens.push_back({ op->second, info, "", 1 }); } // If it is here and not whitespace that means it's an invalid character // @@ -287,31 +279,20 @@ namespace LX info.line++; } - // Throws an error with all the relevant information //s + // Throws an error with all the relevant information // else { - // Finds the start of the line // - size_t start = contents.rfind('\n', info.index); - if (start == std::string::npos) { start = 0; } // std::npos means none was found so defaults to 1 - else { start = start + 1; } // Skips the new line character - - // Finds the end of the line // - size_t end = contents.find('\n', info.index); - if (end == std::string::npos) { end = contents.size(); } // If it reaches the end with no /n it defaults to the length of the string - - // The line where the invalid character is // - std::string line = contents.substr(start, end - start); - - // Throws an error to alert the user of the invalid character // - throw InvalidCharInSource(info.column, info.line, line, contents[info.index]); + throw InvalidCharInSource(info.column, info.line, info.index, contents[info.index]); } // Log dumps A LOT of info // + #ifdef LOG_EVERYTHING + SafeLog ( - log, - "Is Alpha: ", info.isAlpha, + log, "Index: ", std::left, std::setw(3), info.index, + " Is Alpha: ", info.isAlpha, " Is Numeric: ", info.isNumeric, " In Comment: ", info.inComment, " In String: ", info.inStringLiteral, @@ -322,6 +303,8 @@ namespace LX " Current: {", PrintChar(current), "}" ); + #endif // LOG_EVERYTHING + // Updates trackers to their default state of a new character // info.index++; @@ -334,18 +317,20 @@ namespace LX // Logs the tokens if logging is on // if (log != nullptr) { - SafeLog(log, LOG_BREAK, "Tokens", LOG_BREAK); + #ifdef LOG_EVERYTHING + SafeLog(log, "\n"); // Puts a space when there is a lot in the log + #endif // LOG_EVERYTHING for (auto& token : tokens) { if (token.contents.empty() == false) { - SafeLog(log, "{ Line: ", std::left, std::setw(3), token.line, ", Column: ", std::setw(3), token.index, ", Length: ", std::setw(2), token.length, "} ", std::setw(30), ToString(token.type) + ":", "{", token.contents, "}"); + SafeLog(log, std::left, "{ Line: ", std::setw(3), token.line, ", Index: ", std::setw(3), token.index, ", Length: ", std::setw(2), token.length, " } ", std::setw(30), ToStringNoFormat(token.type) + ":", "{", token.contents, "}"); } else { - SafeLog(log, "{ Line: ", std::left, std::setw(3), token.line, ", Column: ", std::setw(3), token.index, ", Length: ", std::setw(2), token.length, "} ", ToString(token.type)); + SafeLog(log, std::left, "{ Line: ", std::setw(3), token.line, ", Index: ", std::setw(3), token.index, ", Length: ", std::setw(2), token.length, " } ", ToStringNoFormat(token.type)); } } diff --git a/Lexer/src/Token.cpp b/Lexer/src/Token.cpp index 9f018c4..a2bd3a4 100644 --- a/Lexer/src/Token.cpp +++ b/Lexer/src/Token.cpp @@ -6,7 +6,7 @@ namespace LX { // Passes the constructor args to the values // - Token::Token(const TokenType _type, std::string _contents, std::streamsize _line, std::streamsize _index, std::streamsize _length) - : type(_type), contents(_contents), line(_line), index(_index), length(_length) + Token::Token(const TokenType _type, const LexerInfo& info, std::string _contents, std::streamsize _length) + : type(_type), contents(_contents), index(info.index - _length + 1), line(info.line), column(info.column - _length), length(_length) {} } diff --git a/Parser/src/Parser.cpp b/Parser/src/Parser.cpp index 28c91ee..550da06 100644 --- a/Parser/src/Parser.cpp +++ b/Parser/src/Parser.cpp @@ -45,10 +45,8 @@ namespace LX case Token::NUMBER_LITERAL: return std::make_unique(p.tokens[p.index++].contents); - // Default just alerts the user of an error // - // TODO: Actually make this error tell the user something useful // + // Returns nullptr, the parsing function that recives that value will decide if that is valid // default: - std::cout << "UNKNOWN TOKEN: " << p.tokens[p.index].type << std::endl; p.index++; return nullptr; } @@ -66,6 +64,7 @@ namespace LX { // Parses the left hand side of the operation // std::unique_ptr lhs = ParsePrimary(p); + ThrowIf(lhs == nullptr, Token::UNDEFINED, "value", p.tokens[p.index - 1]); // Stores the operator to pass into the AST node // Token::TokenType op = p.tokens[p.index].type; @@ -73,6 +72,7 @@ namespace LX // Parses the right hand of the operation // std::unique_ptr rhs = ParseOperation(p); + ThrowIf(rhs == nullptr, Token::UNDEFINED, "value", p.tokens[p.index - 1]); // Returns an AST node as all of the components combined together // return std::make_unique(std::move(lhs), op, std::move(rhs)); diff --git a/common/Lexer.h b/common/Lexer.h index 02dc405..24c97e5 100644 --- a/common/Lexer.h +++ b/common/Lexer.h @@ -9,13 +9,9 @@ namespace std template struct char_traits; - template - class basic_ifstream; - template class basic_ofstream; - using ifstream = basic_ifstream>; using ofstream = basic_ofstream>; } @@ -27,14 +23,42 @@ namespace LX // Error type with index and character to alert the user that LX does not understand that symbol // struct InvalidCharInSource { - std::streamsize index; + std::streamsize col; std::streamsize line; - - std::string lineContents; + std::streamsize index; char invalid; }; + // Struct to store the current information of the lexer // + struct LexerInfo + { + // Current trackers of where in the source it is // + + std::streamsize line = 1; // <- Lines start on 1 (probably because of non-programmer's) + std::streamsize index = 0; + std::streamsize column = 0; // <- Columns start on 1 (probably because of non-programmer's) + + // Trackers for when a multi-char token started // + + std::streamsize startOfWord = 0; + std::streamsize startOfNumberLiteral = 0; + std::streamsize startOfStringLiteral = 0; + + // Different flags of the lexer // + // Stored as a bitset to minimse memory allocated (basically no difference, because only one exists at any given time) // + + bool isAlpha : 1 = false; + bool isNumeric : 1 = false; + bool inComment : 1 = false; + bool inStringLiteral : 1 = false; + bool isNextCharAlpha : 1 = false; + bool isNextCharNumeric : 1 = false; + bool wasLastCharAlpha : 1 = false; + bool wasLastCharNumeric : 1 = false; + bool lexingNumber : 1 = false; + }; + // Data type to store a more computer readable version of files struct __declspec(novtable) Token final { @@ -65,7 +89,7 @@ namespace LX }; // Constructor of the tokens to set their info // - Token(const TokenType _type, std::string _contents, std::streamsize _line, std::streamsize _index, std::streamsize _length); + Token(const TokenType _type, const LexerInfo& info, std::string _contents, std::streamsize _length); // Contents of the token (may be empty if not needed) // // Const to avoid external changes // @@ -74,17 +98,23 @@ namespace LX // Type of the token // // Const to avoid external changes // const TokenType type; - - // The line where the token is located in the source // - const std::streamsize line; - // Index on the line where the token starts // + // Index in the source of the token // const std::streamsize index; // The length of the token on the line, may be different to the length of contents // const std::streamsize length; + + // The line the token is located on // + const std::streamsize line; + + // The column on the line where it is located // + const std::streamsize column; }; + // Logging function to turn a tokentype enum val into it's string // + std::string ToString(Token::TokenType t); + // Lexer function to take in a file and output a vector of tokens // - const std::vector LexicalAnalyze(std::ifstream& src, std::ofstream* log); + const std::vector LexicalAnalyze(const std::string& contents, const std::streamsize len, std::ofstream* log); } diff --git a/common/Parser.h b/common/Parser.h index 42e74ca..d9d98d8 100644 --- a/common/Parser.h +++ b/common/Parser.h @@ -61,6 +61,20 @@ namespace LX // Thrown if there was an error during IR Generation // struct IRGenerationError {}; + // Thrown if there was an unexpected (incorrect) token // + struct UnexpectedToken + { + // The token type that should be there // + Token::TokenType expected; + + // If there are multiple expected types there is an option for a custom message // + std::string override; + + // What token was actually at that position // + // Stored as Token not TokenType to store the location of it within the source // + Token got; + }; + // Holds all needed info about a function // // Currently only holds the body but in the future will hold: name, params, namespace/class-member struct FunctionDefinition diff --git a/common/Util.h b/common/Util.h index 88db0da..6ae60f9 100644 --- a/common/Util.h +++ b/common/Util.h @@ -2,6 +2,9 @@ #include +// Defining this is only if you are at the point where you should be using a debugger // +#define LOG_EVERYTHING + namespace LX { template