From 4ac5061f039fc4fcc031d3671642f362e7625214 Mon Sep 17 00:00:00 2001 From: Pasha Bibko <156938226+PashaBibko@users.noreply.github.com> Date: Wed, 16 Apr 2025 19:32:26 +0100 Subject: [PATCH] Added lexer from previous project --- LX-Compiler.sln | 17 +++- LX-LLVM.vcxproj | 9 +- LX-LLVM.vcxproj.filters | 19 ++-- LX-LLVM.vcxproj.user | 4 +- Lexer/Lexer.vcxproj | 140 ++++++++++++++++++++++++++++ Lexer/Lexer.vcxproj.filters | 21 +++++ Lexer/Lexer.vcxproj.user | 4 + Lexer/src/Lexer.cpp | 181 ++++++++++++++++++++++++++++++++++++ Lexer/src/Token.cpp | 11 +++ Main.cpp | 47 ++++++++-- build-test/Log.txt | 9 ++ build-test/main.lx | 2 + common/Lexer.h | 55 +++++++++++ common/Util.h | 10 ++ 14 files changed, 506 insertions(+), 23 deletions(-) create mode 100644 Lexer/Lexer.vcxproj create mode 100644 Lexer/Lexer.vcxproj.filters create mode 100644 Lexer/Lexer.vcxproj.user create mode 100644 Lexer/src/Lexer.cpp create mode 100644 Lexer/src/Token.cpp create mode 100644 build-test/Log.txt create mode 100644 common/Lexer.h diff --git a/LX-Compiler.sln b/LX-Compiler.sln index 06aa645..e1b0f26 100644 --- a/LX-Compiler.sln +++ b/LX-Compiler.sln @@ -1,9 +1,14 @@  Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 17 -VisualStudioVersion = 17.13.35931.197 d17.13 +VisualStudioVersion = 17.13.35931.197 MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LX-LLVM", "LX-LLVM.vcxproj", "{CC37E36F-B3B3-41B0-A887-01E8EFE84994}" +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LX-Compiler", "LX-LLVM.vcxproj", "{CC37E36F-B3B3-41B0-A887-01E8EFE84994}" + ProjectSection(ProjectDependencies) = postProject + {4E4019F5-12E0-4EE2-9658-A0DD3038EEDA} = {4E4019F5-12E0-4EE2-9658-A0DD3038EEDA} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Lexer", "Lexer\Lexer.vcxproj", "{4E4019F5-12E0-4EE2-9658-A0DD3038EEDA}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -21,6 +26,14 @@ Global {CC37E36F-B3B3-41B0-A887-01E8EFE84994}.Release|x64.Build.0 = Release|x64 {CC37E36F-B3B3-41B0-A887-01E8EFE84994}.Release|x86.ActiveCfg = Release|Win32 {CC37E36F-B3B3-41B0-A887-01E8EFE84994}.Release|x86.Build.0 = Release|Win32 + {4E4019F5-12E0-4EE2-9658-A0DD3038EEDA}.Debug|x64.ActiveCfg = Debug|x64 + {4E4019F5-12E0-4EE2-9658-A0DD3038EEDA}.Debug|x64.Build.0 = Debug|x64 + {4E4019F5-12E0-4EE2-9658-A0DD3038EEDA}.Debug|x86.ActiveCfg = Debug|Win32 + {4E4019F5-12E0-4EE2-9658-A0DD3038EEDA}.Debug|x86.Build.0 = Debug|Win32 + {4E4019F5-12E0-4EE2-9658-A0DD3038EEDA}.Release|x64.ActiveCfg = Release|x64 + {4E4019F5-12E0-4EE2-9658-A0DD3038EEDA}.Release|x64.Build.0 = Release|x64 + {4E4019F5-12E0-4EE2-9658-A0DD3038EEDA}.Release|x86.ActiveCfg = Release|Win32 + {4E4019F5-12E0-4EE2-9658-A0DD3038EEDA}.Release|x86.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/LX-LLVM.vcxproj b/LX-LLVM.vcxproj index 7bad884..44d9d5f 100644 --- a/LX-LLVM.vcxproj +++ b/LX-LLVM.vcxproj @@ -106,11 +106,13 @@ _DEBUG;_CONSOLE;%(PreprocessorDefinitions) true stdcpp20 - $(SolutionDir)/Common;%(AdditionalIncludeDirectories) + $(SolutionDir)common;%(AdditionalIncludeDirectories) Console true + Lexer.lib;%(AdditionalDependencies) + $(SolutionDir)$(Platform)\$(Configuration)\;%(AdditionalLibraryDirectories) @@ -122,19 +124,22 @@ NDEBUG;_CONSOLE;%(PreprocessorDefinitions) true stdcpp20 - $(SolutionDir)/Common;%(AdditionalIncludeDirectories) + $(SolutionDir)common;%(AdditionalIncludeDirectories) Console true true true + Lexer.lib;%(AdditionalDependencies) + $(SolutionDir)$(Platform)\$(Configuration)\;%(AdditionalLibraryDirectories) + diff --git a/LX-LLVM.vcxproj.filters b/LX-LLVM.vcxproj.filters index 805c715..4066ea5 100644 --- a/LX-LLVM.vcxproj.filters +++ b/LX-LLVM.vcxproj.filters @@ -1,23 +1,20 @@  - - {4FC737F1-C7A5-4376-A066-2A32D752A2FF} - cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx - - + {93995380-89BD-4b04-88EB-625FBE52EBFB} h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd - - - Source Files - - - Header Files + Common + + + Common + + + \ No newline at end of file diff --git a/LX-LLVM.vcxproj.user b/LX-LLVM.vcxproj.user index bf3f381..8459cf7 100644 --- a/LX-LLVM.vcxproj.user +++ b/LX-LLVM.vcxproj.user @@ -4,11 +4,11 @@ false - build-test/Main.lx build-test/Main.ll + build-test/Main.lx build-test/Main.ll build-test/Log.txt WindowsLocalDebugger - build-test/Main.lx build-test/Main.ll + build-test/Main.lx build-test/Main.ll build-test/Log.txt WindowsLocalDebugger \ No newline at end of file diff --git a/Lexer/Lexer.vcxproj b/Lexer/Lexer.vcxproj new file mode 100644 index 0000000..6f3b908 --- /dev/null +++ b/Lexer/Lexer.vcxproj @@ -0,0 +1,140 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 17.0 + Win32Proj + {4e4019f5-12e0-4ee2-9658-a0dd3038eeda} + Lexer + 10.0 + + + + Application + true + v143 + Unicode + + + Application + false + v143 + true + Unicode + + + StaticLibrary + true + v143 + Unicode + + + StaticLibrary + false + v143 + true + Unicode + + + + + + + + + + + + + + + + + + + + + + Level3 + true + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + + + + + Level3 + true + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + Level3 + true + _DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + $(ProjectDir)inc;$(SolutionDir)common;%(AdditionalIncludeDirectories) + stdcpp20 + + + Console + true + + + + + Level3 + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + $(ProjectDir)inc;$(SolutionDir)common;%(AdditionalIncludeDirectories) + stdcpp20 + + + Console + true + true + true + + + + + + + + + + \ No newline at end of file diff --git a/Lexer/Lexer.vcxproj.filters b/Lexer/Lexer.vcxproj.filters new file mode 100644 index 0000000..3ec9ec0 --- /dev/null +++ b/Lexer/Lexer.vcxproj.filters @@ -0,0 +1,21 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd + + + + + Source Files + + + Source Files + + + \ No newline at end of file diff --git a/Lexer/Lexer.vcxproj.user b/Lexer/Lexer.vcxproj.user new file mode 100644 index 0000000..88a5509 --- /dev/null +++ b/Lexer/Lexer.vcxproj.user @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/Lexer/src/Lexer.cpp b/Lexer/src/Lexer.cpp new file mode 100644 index 0000000..1c69e7d --- /dev/null +++ b/Lexer/src/Lexer.cpp @@ -0,0 +1,181 @@ +#include +#include + +#include +#include +#include +#include + +#include + +namespace LX +{ + // Local macros cause im lazy // + + #define ITERATE index++; continue + #define TOKEN_CASE(type) case type: return #type; + + static std::string ToString(Token::TokenType type) + { + switch (type) + { + TOKEN_CASE(Token::STRING_LITERAL); + TOKEN_CASE(Token::IDENTIFIER); + TOKEN_CASE(Token::FOR); + TOKEN_CASE(Token::WHILE); + TOKEN_CASE(Token::IF); + TOKEN_CASE(Token::ELSE); + TOKEN_CASE(Token::ELIF); + TOKEN_CASE(Token::FUNCTION); + + default: + return std::string("Unknown: " + (short)type); + } + } + + static const std::unordered_map keywords = + { + { "for", Token::FOR }, + { "while", Token::WHILE }, + { "if", Token::IF }, + { "else", Token::ELSE }, + { "elif", Token::ELIF }, + { "func", Token::FUNCTION }, + }; + + const std::vector LX::LexicalAnalyze(std::ifstream& src, std::ofstream* log) + { + // Logs the start of the lexical analysis + SafeLog(log, LOG_BREAK, "Started lexing file", LOG_BREAK); + + // Allocates a large ammount of memory to hold the output + // Will shrink the size later on to stop excess memory + std::vector tokens = {}; + tokens.reserve(0xFFFF); + + // Turns the contents of the file into a string // + + // Gets length of the file because it is opened at the end + const std::streamsize len = src.tellg(); + src.seekg(0, std::ios::beg); // Goes back to the beginning + + std::string contents(len, '\0'); // Preallocates all space needed + src.read(&contents[0], len); // Transfers file to string + + // Trackers for when the program is iterating over the file // + + std::streamsize index = 0; + + std::streamsize startOfWord = 0; + std::streamsize startOfStringLiteral = 0; + + bool inComment = false; + bool inStringLiteral = false; + bool wasLastCharAlpha = false; + + // Iterates over the file and turns it into tokens // + while (index < len) + { + // Stores the current character for easy access + const char current = contents[index]; + + // Updates string literal tracker and skips over rest if in a string literal + if (current == '"') + { + // Start of string literal + if (inStringLiteral == false) + { + // Updates the neccesarry trackers + startOfStringLiteral = index + 1; + inStringLiteral = true; + ITERATE; + } + + // End of string literal + else + { + // Adds the string literal token to the token vector + std::string lit(contents.data() + startOfStringLiteral, index - startOfStringLiteral); + tokens.push_back({ Token::STRING_LITERAL, lit }); + + // Updates trackers + inStringLiteral = false; + ITERATE; + } + } + + // Skips over rest if within a string literal + if (inStringLiteral) { ITERATE; } + + // Updates comment state + if (current == '#') + { + inComment = !inComment; + + ITERATE; + } + + // Skips over if within a comment + if (inComment) { ITERATE; } + + // Works out if the current character is alphabetic + bool isAlpha = (current >= 'a' && current <= 'z') || (current >= 'A' && current <= 'Z'); + + if (isAlpha == true) + { + // Start of a word + if (wasLastCharAlpha == false) + { + // Updates trackers + wasLastCharAlpha = true; + startOfWord = index; + } + + ITERATE; + } + + // End of a word + if (isAlpha == false && wasLastCharAlpha == true) + { + // Adds the word token to the token vector + std::string word(contents.data() + startOfWord, index - startOfWord); + + if (auto keyword = keywords.find(word); keyword != keywords.end()) + { + tokens.push_back({ keyword->second, word }); + } + + else + { + tokens.push_back({ Token::IDENTIFIER, word }); + } + } + + // Operators will eventually go here + + // If it is here and not whitespace that means it's an invalid character + if (current == ' ' || current == '\t' || current == '\r' || current == '\n') + { + // Updates trackers + wasLastCharAlpha = isAlpha; + ITERATE; + } + + // Throws an error to alert the user + throw InvalidCharInSource(index, current); + } + + // Logs the tokens if logging is on // + if (log != nullptr) + { + for (auto& token : tokens) + { + SafeLog(log, ToString(token.type), ":\t", token.contents); + } + } + + // Shrinks the vector down to minimum size before returning to avoid excess memory being allocated + tokens.shrink_to_fit(); + return tokens; + } +} \ No newline at end of file diff --git a/Lexer/src/Token.cpp b/Lexer/src/Token.cpp new file mode 100644 index 0000000..0acd429 --- /dev/null +++ b/Lexer/src/Token.cpp @@ -0,0 +1,11 @@ +#include + +#include +#include + +namespace LX +{ + Token::Token(const TokenType _type, std::string _contents) + : type(_type), contents(_contents) + {} +} diff --git a/Main.cpp b/Main.cpp index 613bd1a..8ac84dc 100644 --- a/Main.cpp +++ b/Main.cpp @@ -2,7 +2,9 @@ #include #include #include +#include +#include #include namespace LX @@ -12,6 +14,7 @@ namespace LX struct IncorrectCommandLineArgs {}; struct InvalidInputFilePath {}; struct InvalidOutputFilePath {}; + struct InvalidLogFilePath {}; } int main(int argc, char** argv) @@ -19,7 +22,7 @@ int main(int argc, char** argv) try { // Checks there is the correct ammount of arguments - LX::ThrowIf(argc != 3); + LX::ThrowIf((argc == 3 || argc == 4) == false); // Turns the file paths into the C++ type for handling them std::filesystem::path inpPath = argv[1]; @@ -28,18 +31,32 @@ int main(int argc, char** argv) // Prints the full paths to the console to let the user know compiling is being done std::cout << std::filesystem::absolute(inpPath) << " -> " << std::filesystem::absolute(outPath) << std::endl; - // Checks the input file exists + // Checks the input file exists and opens it LX::ThrowIf(std::filesystem::exists(inpPath) == false); + std::ifstream inpFile(inpPath, std::ios::binary | std::ios::ate); // Opens in binary at the end for microptimisation + LX::ThrowIf(inpFile.is_open() == false); - // Opens / Creates the output file and checks if it is open + // Opens / Creates the output file std::ofstream outFile(outPath); LX::ThrowIf(outFile.is_open() == false); + + // Opens / Creates the log file + std::unique_ptr log = nullptr; + + if (argc == 4) + { + log = std::make_unique(argv[3]); + LX::ThrowIf(log->is_open() == false); + } + + // Create tokens out of the input file + std::vectortokens = LX::LexicalAnalyze(inpFile, log.get()); } catch (LX::IncorrectCommandLineArgs) { // Displays to the console of how to use the program - std::cout << "\nUsage: [source file] [output file]\n"; + std::cout << "\nUsage: [source file] [output file] (optional)[log file]\n"; return 1; } @@ -47,7 +64,7 @@ int main(int argc, char** argv) catch (LX::InvalidInputFilePath) { // Tells user the input file could not be found - std::cout << "\nFile path: {" << argv[1] << "} could not be found\n"; + std::cout << "\nFile path: {" << argv[1] << "} could not be opened\n"; return 2; } @@ -55,7 +72,25 @@ int main(int argc, char** argv) catch (LX::InvalidOutputFilePath) { // Tells the user the output file could not be opened - std::cout << "\nCould not open/create {" << argv[2] << "}"; + std::cout << "\nCould not open/create {" << argv[2] << "}\n"; + } + + catch (LX::InvalidCharInSource& e) + { + // + std::cout << "\nInvalid character found in source file: {" << e.invalid << "} at index: " << e.index << "\n"; + } + + catch (std::exception& e) + { + // Prints the std exception to the console + std::cout << "\nAn error occured:\n" << e.what() << std::endl; + } + + catch (...) + { + // Tells the user if an error has happened + std::cout << "\nAn Error occured\n"; } return 0; diff --git a/build-test/Log.txt b/build-test/Log.txt new file mode 100644 index 0000000..0cf4dac --- /dev/null +++ b/build-test/Log.txt @@ -0,0 +1,9 @@ + +-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +Started lexing file +-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + +Token::FUNCTION: func +Token::IDENTIFIER: main +Token::IDENTIFIER: print +Token::STRING_LITERAL: Hello World! diff --git a/build-test/main.lx b/build-test/main.lx index e69de29..f3f3246 100644 --- a/build-test/main.lx +++ b/build-test/main.lx @@ -0,0 +1,2 @@ +func main + print "Hello World!" diff --git a/common/Lexer.h b/common/Lexer.h new file mode 100644 index 0000000..2010860 --- /dev/null +++ b/common/Lexer.h @@ -0,0 +1,55 @@ +#pragma once + +#include +#include +#include + +// This file contains everything that is exported from Lexer.lib +// The rest of the items within the Lexer project are internal only + +namespace LX +{ + struct InvalidCharInSource + { + std::streamsize index; + char invalid; + }; + + // Data type to store a more computer readable version of files + struct __declspec(novtable) Token final + { + // Enum to hold the type of the token + enum TokenType : short + { + // General tokens // + + STRING_LITERAL, + IDENTIFIER, + + // Keywords // + + FOR, WHILE, + IF, ELSE, ELIF, + + FUNCTION, + + // You made a mistake somehow // + + UNDEFINED = -1 + }; + + // Constructor of the tokens to set their info + Token(const TokenType _type, std::string _contents); + + // Contents of the token (may be empty if not needed) + // Const to avoid external changes + const std::string contents; + + // Type of the token + // Const to avoid external changes + const TokenType type; + }; + + // Lexer function to take in a file and output a vector of tokens + const std::vector LexicalAnalyze(std::ifstream& src, std::ofstream* log); +} diff --git a/common/Util.h b/common/Util.h index c72015b..04dff5d 100644 --- a/common/Util.h +++ b/common/Util.h @@ -1,8 +1,18 @@ #pragma once +#include + namespace LX { template inline void ThrowIf(const bool condition, Args... args) { if (condition) [[unlikely]] { throw T(args...); }} + + template + inline void SafeLog(std::ofstream* log, Args... args) + { + if (log != nullptr) { (*log << ... << args); *log << "\n"; } + } + + constexpr const char* LOG_BREAK = "\n-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n"; }