Fixing folder capitalisation [2/2]
This commit is contained in:
18
lexer/CMakeLists.txt
Normal file
18
lexer/CMakeLists.txt
Normal file
@@ -0,0 +1,18 @@
|
||||
# Builds the lexer as a static library from its translation units #
add_library(Lexer STATIC
    src/Lexer.cpp
    src/Token.cpp
)

# Adds the headers in the current directory #
target_include_directories(Lexer PUBLIC
    ${CMAKE_CURRENT_SOURCE_DIR}/inc
)

# Links all needed internal libraries #
target_link_libraries(Lexer PRIVATE PashaBibko-UTIL)

# Creates the precompiled header for the binary #
target_include_directories(Lexer PRIVATE ${CMAKE_SOURCE_DIR}/external/util)
target_precompile_headers(Lexer PRIVATE ${CMAKE_SOURCE_DIR}/external/util/Util.h)
60
lexer/inc/Lexer.h
Normal file
60
lexer/inc/Lexer.h
Normal file
@@ -0,0 +1,60 @@
|
||||
#pragma once
|
||||
|
||||
#include <Token.h>
|
||||
|
||||
namespace PashaBibko::LXC::Lexer
|
||||
{
|
||||
struct LexerContext final
|
||||
{
|
||||
// Constructor to set the information of the context //
|
||||
LexerContext(const std::string& _source);
|
||||
|
||||
// Trackers for the Lexer itself //
|
||||
const std::string& source;
|
||||
uint32_t index;
|
||||
|
||||
LexerOutput out;
|
||||
const uint32_t len;
|
||||
|
||||
// Trackers for where the Lexer is within the user version of source //
|
||||
unsigned short column;
|
||||
unsigned short line;
|
||||
};
|
||||
|
||||
struct LexerError final
|
||||
{
|
||||
// Different reasons why the Lexer can fail //
|
||||
enum Reason
|
||||
{
|
||||
InvalidCharacter,
|
||||
UnterminatedStringLiteral,
|
||||
UnknownSymbolOrOperand
|
||||
};
|
||||
|
||||
// Constructor to pass arguments through to the struct //
|
||||
LexerError(Reason _reason, uint32_t errorIndex, std::string _info = "")
|
||||
: reason(_reason), index(errorIndex), info(_info)
|
||||
{}
|
||||
|
||||
// Turns the error into a c-string //
|
||||
inline static const char* const ReasonStr(Reason reason)
|
||||
{
|
||||
static const char* reasons[] =
|
||||
{
|
||||
"Invalid character found in source",
|
||||
"Unterminated string literal in source",
|
||||
"Unknown symbol or operand in source"
|
||||
};
|
||||
|
||||
return reasons[reason];
|
||||
}
|
||||
|
||||
// Error information //
|
||||
const Reason reason;
|
||||
const uint32_t index;
|
||||
const std::string info;
|
||||
};
|
||||
|
||||
// Turns a file into a vector of tokens //
|
||||
Util::ReturnVal<LexerOutput, LexerError> TokenizeFile(const std::string& fileContents);
|
||||
}
|
||||
136
lexer/inc/Token.h
Normal file
136
lexer/inc/Token.h
Normal file
@@ -0,0 +1,136 @@
|
||||
#pragma once
|
||||
|
||||
#include <Util.h>
|
||||
|
||||
namespace PashaBibko::LXC::Lexer
{
    namespace TokenClass
    {
        // Bitmask for different token classes //
        // Each class owns one bit above the low byte, so a token's class can //
        // be recovered from its enum value with a single bitwise AND //
        enum ClassMask : unsigned short
        {
            // Mathematical and logic operators //
            Operator = 1 << (1 + 8),

            // Special words defined by the compiler //
            Keyword = 1 << (2 + 8),

            // Words such as literals and identifiers //
            UserDefined = 1 << (3 + 8),

            // Symbols in the source like (? , . ! <) //
            Symbols = 1 << (4 + 8),

            // Tokens not defined by previous classes //
            Misc = 1 << (5 + 8)
        };
    }

    struct LexerContext;

    // Data type for storing the output of the lexer //
    class Token final
    {
    public:
        // Enum of token type organised by their token class //
        enum TokenType : unsigned short
        {
            // === Operators === //

            Add = TokenClass::Operator,
            Sub,
            Mul,
            Div,
            Mod,

            Eql,

            // === Keywords === //

            For = TokenClass::Keyword,
            While,
            If,
            ElseIf,
            Else,
            Return,

            FunctionDef,

            // === User defined === //

            StringLiteral = TokenClass::UserDefined,
            NumLiteral,
            Identifier,

            // === Symbols === //

            Assign = TokenClass::Symbols,
            Colon,

            CloseBracket,
            OpenBracket,

            CloseBrace,
            OpenBrace,

            CloseParen,
            OpenParen,

            CloseCrocodile,
            OpenCrocodile,

            Comma,

            // === Misc === //

            End_of_file = TokenClass::Misc,

            UNDEFINED = 65535 // Invalid token type (max number, all class bits set)
        };

        // Util function calculating whether a token is of a given class //
        template<TokenClass::ClassMask mask> static constexpr bool IsTypeClass(TokenType type)
        {
            using T = std::underlying_type_t<TokenType>;
            return static_cast<T>(type) & static_cast<T>(mask);
        }

        // Constructor to set the data of the token for more complex token types //
        Token(const LexerContext& ctx, uint32_t start, unsigned short len, TokenType _type);

        // Copy constructor //
        Token(const Token& other);

        // Move constructor (transfers memory allocated) //
        Token(Token&& other) noexcept;

        // Cannot use these as members are const //
        Token& operator=(const Token&) = delete;
        Token& operator=(Token&&) = delete;

        // Destructor to clean up the allocated memory //
        ~Token();

        // Getter for the c-string to stop it being reassigned (or deleted) //
        inline const char* Str() const { return contents; }

        // Outputs all the relevant information in a string for logging purposes //
        std::string LogStr() const;

        // The type of the token //
        const TokenType type;

        // The length of the token //
        const unsigned short length;

        // Start index of the token //
        const uint32_t index;

    private:
        // The data of the token (only allocated for UserDefined tokens, else nullptr) //
        char* contents;
    };

    // Alias for the output type of the Lexer //
    using LexerOutput = std::vector<Token>;
}
215
lexer/src/Lexer.cpp
Normal file
215
lexer/src/Lexer.cpp
Normal file
@@ -0,0 +1,215 @@
|
||||
#include <Util.h>
|
||||
|
||||
#include <Lexer.h>
|
||||
#include <Token.h>
|
||||
|
||||
namespace PashaBibko::LXC::Internal
|
||||
{
|
||||
static constexpr bool IsNumeric(const char c)
|
||||
{
|
||||
return c >= '0' && c <= '9';
|
||||
}
|
||||
|
||||
static constexpr bool IsAlpha(const char c)
|
||||
{
|
||||
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
|
||||
}
|
||||
|
||||
static constexpr bool IsWhitespace(const char c)
|
||||
{
|
||||
return c == ' ' || c == '\t' || c == '\n' || c == '\r';
|
||||
}
|
||||
|
||||
static constexpr bool IsOperator(const char c)
|
||||
{
|
||||
return
|
||||
c == '+' || c == '-' ||
|
||||
c == '*' || c == '/' ||
|
||||
c == '%' || c == '=';
|
||||
}
|
||||
|
||||
static constexpr bool IsSymbol(const char c)
|
||||
{
|
||||
return
|
||||
c == ',' || c == '[' ||
|
||||
c == ']' || c == '{' ||
|
||||
c == '}' || c == '(' ||
|
||||
c == ')' || c == '<' ||
|
||||
c == '>' || c == ':';
|
||||
}
|
||||
|
||||
static const std::unordered_map<std::string_view, Lexer::Token::TokenType> operatorMap =
|
||||
{
|
||||
{ "+", Lexer::Token::Add },
|
||||
{ "-", Lexer::Token::Sub },
|
||||
{ "*", Lexer::Token::Mul },
|
||||
{ "/", Lexer::Token::Div },
|
||||
{ "%", Lexer::Token::Mod },
|
||||
|
||||
{ "==", Lexer::Token::Eql },
|
||||
|
||||
{ "=", Lexer::Token::Assign }
|
||||
};
|
||||
|
||||
static const std::unordered_map<char, Lexer::Token::TokenType> symbolMap =
|
||||
{
|
||||
{ ',', Lexer::Token::Comma },
|
||||
{ ':', Lexer::Token::Colon },
|
||||
|
||||
{ '[', Lexer::Token::CloseBracket },
|
||||
{ ']', Lexer::Token::OpenBracket },
|
||||
|
||||
{ '}', Lexer::Token::CloseBrace },
|
||||
{ '{', Lexer::Token::OpenBrace },
|
||||
|
||||
{ ')', Lexer::Token::CloseParen },
|
||||
{ '(', Lexer::Token::OpenParen },
|
||||
|
||||
{ '>', Lexer::Token::CloseCrocodile },
|
||||
{ '<', Lexer::Token::OpenCrocodile }
|
||||
};
|
||||
|
||||
static const std::unordered_map<std::string_view, Lexer::Token::TokenType> keywords =
|
||||
{
|
||||
{ "for", Lexer::Token::For },
|
||||
{ "while", Lexer::Token::While },
|
||||
{ "if", Lexer::Token::If },
|
||||
{ "elif", Lexer::Token::ElseIf },
|
||||
{ "else", Lexer::Token::Else },
|
||||
{ "return", Lexer::Token::Return },
|
||||
{ "func", Lexer::Token::FunctionDef },
|
||||
};
|
||||
}
|
||||
|
||||
namespace PashaBibko::LXC::Lexer
|
||||
{
|
||||
LexerContext::LexerContext(const std::string& _source) :
|
||||
source(_source), index(0), out{}, len((uint32_t)_source.length()), column(0), line(0)
|
||||
{}
|
||||
|
||||
Util::ReturnVal<LexerOutput, LexerError> TokenizeFile(const std::string& fileContents)
|
||||
{
|
||||
// Creates the context for the lexer //
|
||||
LexerContext ctx(fileContents);
|
||||
|
||||
struct
|
||||
{
|
||||
bool inStrLiteral = false;
|
||||
bool inIdentifier = false;
|
||||
bool inNumLiteral = false;
|
||||
bool inOperator = false;
|
||||
|
||||
bool inComment = false;
|
||||
|
||||
uint32_t sectionStart = 0;
|
||||
|
||||
} trackers;
|
||||
|
||||
while (ctx.index < ctx.len)
|
||||
{
|
||||
// The current char within the source that is being lexed //
|
||||
const char current = ctx.source[ctx.index];
|
||||
const char next = (ctx.index + 1 < ctx.len) ? ctx.source[ctx.index + 1] : '\0';
|
||||
|
||||
// === Comments === //
|
||||
if (current == '#')
|
||||
trackers.inComment = !trackers.inComment;
|
||||
|
||||
else if (trackers.inComment) {} // Contents of comments are skipped over
|
||||
|
||||
// === String literals === //
|
||||
else if (current == '"')
|
||||
{
|
||||
// Updates trackers //
|
||||
trackers.inStrLiteral = !trackers.inStrLiteral;
|
||||
trackers.sectionStart = trackers.inStrLiteral ? ctx.index : trackers.sectionStart;
|
||||
|
||||
// Creates the token (if at the end of the string literal) //
|
||||
if (!trackers.inStrLiteral)
|
||||
ctx.out.emplace_back(ctx, trackers.sectionStart + 1, (unsigned short)(ctx.index - trackers.sectionStart - 1), Token::StringLiteral);
|
||||
|
||||
} else if (trackers.inStrLiteral) {}
|
||||
|
||||
// === Numbers === //
|
||||
else if (Internal::IsNumeric(current))
|
||||
{
|
||||
// Updates trackers //
|
||||
trackers.sectionStart = trackers.inNumLiteral ? trackers.sectionStart : ctx.index;
|
||||
trackers.inNumLiteral = true;
|
||||
|
||||
// Checks for the end of the number literal to create the token //
|
||||
if (!Internal::IsNumeric(next)) _UNLIKELY
|
||||
{
|
||||
ctx.out.emplace_back(ctx, trackers.sectionStart, (unsigned short)(ctx.index - trackers.sectionStart + 1), Token::NumLiteral);
|
||||
trackers.inNumLiteral = false;
|
||||
}
|
||||
}
|
||||
|
||||
// === Words === //
|
||||
else if (Internal::IsAlpha(current))
|
||||
{
|
||||
// Updates trackers //
|
||||
trackers.sectionStart = trackers.inIdentifier ? trackers.sectionStart : ctx.index;
|
||||
trackers.inIdentifier = true;
|
||||
|
||||
// Checks for the end of the word to create the token //
|
||||
if (!Internal::IsAlpha(next)) _UNLIKELY
|
||||
{
|
||||
// Finds out if the word is a keyword or not //
|
||||
std::string_view fullWord(ctx.source.data() + trackers.sectionStart, ctx.index - trackers.sectionStart + 1);
|
||||
auto it = Internal::keywords.find(fullWord);
|
||||
Token::TokenType tType = (it != Internal::keywords.end()) ? it->second : Token::Identifier;
|
||||
|
||||
ctx.out.emplace_back(ctx, trackers.sectionStart, (unsigned short)(ctx.index - trackers.sectionStart + 1), tType);
|
||||
trackers.inIdentifier = false;
|
||||
}
|
||||
}
|
||||
|
||||
// === Operators === //
|
||||
else if (Internal::IsOperator(current))
|
||||
{
|
||||
// Updates trackers //
|
||||
trackers.sectionStart = trackers.inOperator ? trackers.sectionStart : ctx.index;
|
||||
trackers.inOperator = true;
|
||||
|
||||
// Checks for the end of the symbol or operator //
|
||||
if (!Internal::IsOperator(next)) _LIKELY
|
||||
{
|
||||
trackers.inOperator = false;
|
||||
|
||||
// Finds the operator/symbol if it can //
|
||||
std::string_view fullSymbol(ctx.source.data() + trackers.sectionStart, ctx.index - trackers.sectionStart + 1);
|
||||
auto it = Internal::operatorMap.find(fullSymbol);
|
||||
if (it != Internal::operatorMap.end())
|
||||
ctx.out.emplace_back(ctx, trackers.sectionStart, (unsigned short)(ctx.index - trackers.sectionStart + 1), it->second);
|
||||
|
||||
else
|
||||
return Util::FunctionFail<LexerError>(LexerError::UnknownSymbolOrOperand, trackers.sectionStart, std::string(fullSymbol));
|
||||
}
|
||||
}
|
||||
|
||||
// === Symbols === //
|
||||
else if (Internal::IsSymbol(current))
|
||||
{
|
||||
ctx.out.emplace_back(ctx, ctx.index, 1, Internal::symbolMap.at(current));
|
||||
}
|
||||
|
||||
// === Whitespace === //
|
||||
else if (Internal::IsWhitespace(current)) _LIKELY {}
|
||||
|
||||
// If an if-statement has not been triggered the character must be invalid //
|
||||
else
|
||||
return Util::FunctionFail<LexerError>(LexerError::InvalidCharacter, ctx.index);
|
||||
|
||||
// Iterates to the next index //
|
||||
ctx.column++;
|
||||
ctx.index++;
|
||||
}
|
||||
|
||||
// Checks for an unterminated string literal //
|
||||
if (trackers.inStrLiteral)
|
||||
return Util::FunctionFail<LexerError>(LexerError::UnterminatedStringLiteral, trackers.sectionStart);
|
||||
|
||||
return std::move(ctx.out);
|
||||
}
|
||||
}
|
||||
119
lexer/src/Token.cpp
Normal file
119
lexer/src/Token.cpp
Normal file
@@ -0,0 +1,119 @@
|
||||
#include <Util.h>
|
||||
|
||||
#include <Lexer.h>
|
||||
#include <Token.h>
|
||||
|
||||
#include <iomanip>
|
||||
|
||||
namespace PashaBibko::LXC::Lexer
|
||||
{
|
||||
// Constructor to assign the members of the token class //
|
||||
Token::Token(const LexerContext& ctx, const uint32_t start, unsigned short len, TokenType _type) :
|
||||
type(_type), length(len), index(start), contents(nullptr)
|
||||
{
|
||||
// Only user defined class tokens need to store c-string //
|
||||
if (Token::IsTypeClass<TokenClass::UserDefined>(type))
|
||||
{
|
||||
// Copies the memory to a c-string //
|
||||
contents = new char[len + 1]; // +1 for null terminator
|
||||
std::memcpy(contents, ctx.source.data() + start, len);
|
||||
contents[len] = '\0';
|
||||
}
|
||||
}
|
||||
|
||||
// Copy constructor //
|
||||
Token::Token(const Token& other) :
|
||||
type(other.type), length(other.length), index(other.index), contents(nullptr)
|
||||
{
|
||||
if (other.contents != nullptr)
|
||||
{
|
||||
size_t len = std::strlen(other.contents) + 1; // Adds one for null-terminator
|
||||
contents = new char[len];
|
||||
std::memcpy(contents, other.contents, len);
|
||||
}
|
||||
}
|
||||
|
||||
// Move constructor (transfers memory allocated) //
|
||||
Token::Token(Token&& other) noexcept :
|
||||
type(other.type), length(other.length), index(other.index), contents(other.contents)
|
||||
{
|
||||
// Stops the other from thinking it owns the memory //
|
||||
other.contents = nullptr;
|
||||
}
|
||||
|
||||
// Destructor to clean up the memory of the token that can be allocated //
|
||||
Token::~Token()
|
||||
{
|
||||
// Frees any allocated memory //
|
||||
if (contents != nullptr) _UNLIKELY
|
||||
{
|
||||
delete[] contents;
|
||||
contents = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// Helper macro for converting type to string //
|
||||
#define TOKEN_TYPE_CASE(type) case type: return #type;
|
||||
|
||||
static constexpr const char* TokenTypeToCStr(Token::TokenType type)
|
||||
{
|
||||
switch (type)
|
||||
{
|
||||
// All the different types of tokens //
|
||||
TOKEN_TYPE_CASE(Token::Add);
|
||||
TOKEN_TYPE_CASE(Token::Sub);
|
||||
TOKEN_TYPE_CASE(Token::Mul);
|
||||
TOKEN_TYPE_CASE(Token::Div);
|
||||
TOKEN_TYPE_CASE(Token::Mod);
|
||||
|
||||
TOKEN_TYPE_CASE(Token::Eql);
|
||||
|
||||
TOKEN_TYPE_CASE(Token::For);
|
||||
TOKEN_TYPE_CASE(Token::While);
|
||||
TOKEN_TYPE_CASE(Token::If);
|
||||
TOKEN_TYPE_CASE(Token::ElseIf);
|
||||
TOKEN_TYPE_CASE(Token::Else);
|
||||
TOKEN_TYPE_CASE(Token::Return);
|
||||
|
||||
TOKEN_TYPE_CASE(Token::FunctionDef);
|
||||
|
||||
TOKEN_TYPE_CASE(Token::StringLiteral);
|
||||
TOKEN_TYPE_CASE(Token::NumLiteral);
|
||||
TOKEN_TYPE_CASE(Token::Identifier);
|
||||
|
||||
TOKEN_TYPE_CASE(Token::Assign);
|
||||
TOKEN_TYPE_CASE(Token::Colon);
|
||||
TOKEN_TYPE_CASE(Token::CloseBracket);
|
||||
TOKEN_TYPE_CASE(Token::OpenBracket);
|
||||
TOKEN_TYPE_CASE(Token::CloseBrace);
|
||||
TOKEN_TYPE_CASE(Token::OpenBrace);
|
||||
TOKEN_TYPE_CASE(Token::CloseParen);
|
||||
TOKEN_TYPE_CASE(Token::OpenParen);
|
||||
TOKEN_TYPE_CASE(Token::CloseCrocodile);
|
||||
TOKEN_TYPE_CASE(Token::OpenCrocodile);
|
||||
TOKEN_TYPE_CASE(Token::Comma);
|
||||
|
||||
TOKEN_TYPE_CASE(Token::End_of_file);
|
||||
TOKEN_TYPE_CASE(Token::UNDEFINED);
|
||||
|
||||
// When the case has not been defined yet //
|
||||
default:
|
||||
return "UNKNOWN";
|
||||
}
|
||||
}
|
||||
|
||||
std::string LXC::Lexer::Token::LogStr() const
|
||||
{
|
||||
// Output stream to log to //
|
||||
std::ostringstream os;
|
||||
os << std::setw(25) << std::left << TokenTypeToCStr(type) << " | ";
|
||||
|
||||
// Prints the contents if they are not null //
|
||||
if (contents != nullptr)
|
||||
os << std::setw(25) << std::left << std::string('"' + std::string(contents) + '"');
|
||||
else
|
||||
os << std::setw(25) << std::left << "EMPTY";
|
||||
|
||||
return os.str();
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user