Program Listing for File lexer.c
↰ Return to documentation for file (src/lovejoy/lexer.c)
#include "lexer.h"
#include <string.h>
#include <lovejoy/utf.h>
#ifndef IMPLEMENTATION
// Creates a lexer context in its initial state: the file name is "<stdin>",
// the operator table is whatever DefaultOperators() yields, no line has been
// seen yet (`lineptr` is nil), line numbering starts at 1 and no token has
// been produced so far (TT_NONE).
LexerContext NewLexer()
{
    LexerContext fresh = {
        .last_token_type = TT_NONE,
        .lineno = 1,
        .lineptr = nil,
        .operator_table = DefaultOperators(),
        .filename = "<stdin>"
    };

    return fresh;
}
string lexeme_substring(const Lexeme *lexeme)
{
usize span = lexeme_span(lexeme);
return VIEW(string, (byte *)lexeme->start, 0, span);
}
// Releases a lexeme by handing its pointer to free().
u0 lexeme_free(Lexeme *lexeme)
{
    // Any object pointer converts to free()'s parameter type implicitly,
    // so no cast is required.
    free(lexeme);
}
// Looks ahead `count` tokens from `source` without touching the caller's
// lexer state, and returns the `count`-th one.
//
// Lexing is done on a copy of `*ctx`.  Every token before the last is
// released with lexeme_free(); the returned one belongs to the caller.
// Returns nil when `count` is zero or when lex() yields nil before `count`
// tokens have been read.
Lexeme *peek(LexerContext *ctx, u16 count, const byte *source)
{
    LexerContext scratch = *ctx;
    Lexeme *current = nil;
    u16 remaining = count;

    while (remaining > 0) {
        current = lex(&scratch, source);
        if (current == nil)
            return nil;

        // Continue from where this token stopped.
        source = current->end;

        --remaining;
        if (remaining != 0) // Not the token we were asked for: discard it.
            lexeme_free(current);
    }

    return current;
}
// Classifies a single byte by the kind of token it can begin.
//
// Returns TT_TERM for newline and ';', TT_NONE for NUL, blanks and control
// bytes, TT_IDENT / TT_NUMBER for identifier and digit characters, a
// dedicated type for quotes and brackets, and TT_OPERATOR for anything else.
TokenType character_type(byte chr)
{
    // Statement terminators.
    if (chr == '\n' || chr == ';')
        return TT_TERM;

    // NUL and blank characters never begin a token.
    if (chr == '\0' || chr == '\t' || chr == '\v' || chr == '\f'
     || chr == '\r' || chr == 0xA0 || chr == ' ')
        return TT_NONE;

    // Identifier characters: ASCII letters, '_', '$' and every byte
    // from 0xA1 upwards.
    if ('A' <= chr && chr <= 'Z')
        return TT_IDENT;
    if ('a' <= chr && chr <= 'z')
        return TT_IDENT;
    if (chr == '_' || chr == '$')
        return TT_IDENT;
    if (chr >= 0xA1)
        return TT_IDENT;

    // Decimal digits.
    if ('0' <= chr && chr <= '9')
        return TT_NUMBER;

    // Quotes and brackets each have their own token type.
    switch (chr) {
    case '"':  return TT_STRING;
    case '\'': return TT_CHAR;
    case '(':  return TT_LPAREN;
    case ')':  return TT_RPAREN;
    case '{':  return TT_LCURLY;
    case '}':  return TT_RCURLY;
    case '[':  return TT_LBRACKET;
    case ']':  return TT_RBRACKET;
    default:   break;
    }

    // Remaining control bytes (0x01-0x1f and 0x7f-0x9f).
    // NOTE(review): 0x80-0xA0 can also occur as UTF-8 continuation bytes;
    // confirm this classification is intended for multi-byte input.
    if ((0x01 <= chr && chr <= 0x1f) || (0x7f <= chr && chr <= 0x9f))
        return TT_NONE;

    // Whatever is left is taken to be an operator symbol.
    return TT_OPERATOR;
}
// Advances past any run of blank bytes (tab, vertical tab, form feed,
// carriage return, 0xA0 and space) and returns the first position that holds
// something else.  Newlines are not skipped, and the terminating NUL is never
// stepped over.
static const byte *skip_whitespace(const byte *source)
{
    while (*source == '\t' || *source == '\v' || *source == '\f'
        || *source == '\r' || *source == 0xA0 || *source == ' ')
        ++source;

    return source;
}
// Checks whether the `expected.len` bytes starting at `source` are equal to
// `expected`.  Returns NO_ERROR when they are, LEXER_ERROR_UNEXPECTED
// otherwise.
static ierr expect_string(const byte *source, const string expected)
{
    string candidate = VIEW(string, (byte *)source, 0, expected.len);

    return string_eq(candidate, expected)
        ? NO_ERROR
        : LEXER_ERROR_UNEXPECTED;
}
// Reads one token from `source` and returns it as a newly allocated Lexeme.
//
// Parameters:
//   ctx    - lexer state; its `lineptr`, `lineno` and `last_token_type`
//            fields are updated as input is consumed.
//   source - NUL-terminated input, positioned where scanning should begin.
//
// Returns a Lexeme obtained from emalloc() whose `start`/`end` delimit the
// token inside `source` (release it with lexeme_free()), or `nil` when the
// input is exhausted, the next byte cannot begin a token, or a lexing error
// was reported.  No Lexeme is leaked on the `nil` paths.
Lexeme *lex(LexerContext *ctx, const byte *source)
{
    // First call for this context: remember where the current line begins.
    if (ctx->lineptr == nil)
        ctx->lineptr = source;

    // Skip blanks and comments.  All comments start with '-', (-{-,*}).
    // Looping lets several comments in a row be skipped in one call, and
    // none of the scans below ever steps past the terminating NUL.
    for (;;) {
        source = skip_whitespace(source);
        if (*source != '-')
            break;

        if (source[1] == '-') { // EOL comment.
            source += 2; // Skip the "--".
            // Stop ON the newline so that it is still lexed as a terminator.
            while (*source != '\0' && *source != '\n')
                ++source;
        } else if (source[1] == '*') { // Multiline comment.
            // TODO: Allow for nested multiline comments.
            source += 2; // Skip the '-' and '*'.
            while (*source != '\0'
                && !(source[0] == '*' && source[1] == '-')) {
                if (*source == '\n') { // Keep line bookkeeping in sync.
                    ++ctx->lineno;
                    ctx->lineptr = source + 1;
                }
                ++source;
            }
            if (*source != '\0')
                source += 2; // Skip the closing "*-".
        } else {
            break; // A lone '-' is an ordinary operator character.
        }
    }
    if (*source == '\0') return nil;

    // Collect multiple terminators together: consume the whole run of
    // newlines and semicolons, then step back onto the last one so that a
    // single terminator token is produced for the run.
    bool is_terminal = false;
    while (*source == '\n' || *source == ';') {
        if (*source == '\n') {
            ++ctx->lineno;
            ctx->lineptr = source + 1;
        }
        ++source;
        is_terminal = true;
    }
    if (is_terminal)
        --source;

    TokenType tt = character_type(*source);
    if (tt == TT_NONE) return nil;

    // Stop the lexer from returning many consecutive
    // terminator tokens.
    if (tt == TT_TERM && ctx->last_token_type == TT_TERM)
        return lex(ctx, source + 1); // Tail recursion should be optimised.

    Lexeme *token = emalloc(1, sizeof(Lexeme));
    token->type = tt;
    token->start = source;
    token->line = ctx->lineptr;
    token->lineno = ctx->lineno;

    if (tt == TT_TERM) {
        token->end = source + 1;
        goto return_token;
    }

    if (tt == TT_STRING) {
        // Find the EOS, starting just after the opening quote.
        usize eos = 1;
        while (source[eos] != '"') {
            if (source[eos] == '\0') {
                // Ran out of input before the closing quote.
                // TODO: Proper error reporting.
                eprintln("error: Unterminated string literal.");
                goto discard_token;
            }
            // An escaped quote or backslash does not end the string.
            if (source[eos] == '\\'
            && (source[eos + 1] == '"' || source[eos + 1] == '\\'))
                ++eos;
            ++eos;
        }
        token->end = source + eos + 1; // One past the closing quote.
        goto return_token;
    }

    if (tt == TT_CHAR) {
        // A 'char' is one rune.
        ++source; // Step over the opening quote.
        usize width = 0;
        // NOTE(review): the 16-byte views below may extend beyond the end
        // of the input; confirm read_escape/next_rune stop at NUL.
        if (*source == '\\') { // Rune is denoted by an escape sequence.
            ++source; // Skip backslash.
            string view = VIEW(string, (byte *)source, 0, 16);
            rune ch; UNUSED(ch);
            width += read_escape(view, &ch);
        } else { // Literal rune.
            string view = VIEW(string, (byte *)source, 0, 16);
            next_rune(view, &width);
        }
        source += width;

        ierr err = expect_string(source++, STR("'"));
        if (err != 0) { // TODO: Proper error reporting.
            eprintln("error: Expected single-quote at end of character."
                     " Got `%c' instead.", *(source - 1));
            goto discard_token;
        }
        token->end = source;
        goto return_token;
    }

    switch (tt) {
    case TT_LPAREN:
    case TT_RPAREN:
    case TT_LBRACKET:
    case TT_RBRACKET:
    case TT_LCURLY:
    case TT_RCURLY:
        token->end = source + 1;
        goto return_token;
    case TT_NUMBER:
        // TODO: Decimal point.
        // TODO: Scientific notation, e.g. 3.7e-11.
        // TODO: Hex exponent, e.g. 0xF1.D3p+4A
        // TODO: Radix notation, e.g. 0775r8 == 0o0775.
        //       Mainly we need to accommodate for the
        //       use of '.', '+' and '-' characters, which would
        //       otherwise be considered TT_OPERATORs, as opposed
        //       to being part of TT_NUMBER literals.
        while (tt == TT_NUMBER || tt == TT_IDENT)
            tt = character_type(*(++source));
        token->end = source;
        goto return_token;
    case TT_IDENT:
    case TT_OPERATOR:; // Check for known operators:
        // Check which operators exists, and try and
        // match against the longest operators first.
        // e.g. 2+-3 becomes 2 + (-3), but if an operator '+-' exists,
        // then it becomes 2 +- 3.
        OperatorTable operators = ctx->operator_table;
        foreach (op, operators) {
            string op_substr = VIEW(string, (byte *)source, 0, op->name.len);
            if (string_eq(op->name, op_substr)) {
                token->end = source + op_substr.len;
                goto return_token;
            } // Otherwise continue searching.
        }
        // No operator was found...
        if (tt == TT_IDENT) { // Hence, it's just a regular identifier.
            while (tt == TT_NUMBER || tt == TT_IDENT)
                tt = character_type(*(++source)); // Consolidate characters.
            token->end = source;
            goto return_token;
        }
        // Otherwise, if we have an operator symbol, but it was not covered by
        // the previous check for known operators, then it does not exist.
        // Hence, we throw an error.
        // TODO: Throw proper lexer error.
        eprintln("error: Operator (`%c') is not recognised.", *source);
        goto discard_token;
    default:
        goto discard_token;
    }

return_token:
    // Record the type so the next call can suppress repeated terminators.
    ctx->last_token_type = token->type;
    return token;

discard_token:
    // Error exit: give the half-built token back (the same free() that
    // lexeme_free() performs) so that failed lexes do not leak.
    free(token);
    return nil;
}
#endif