This commit is contained in:
2026-02-13 02:57:01 -05:00
commit 15e5ccb064
23 changed files with 11899 additions and 0 deletions

633
src/lexer.rs Normal file
View File

@@ -0,0 +1,633 @@
//! Lexer for the Lux language
#![allow(dead_code)]
use crate::ast::Span;
use std::fmt;
use std::iter::Peekable;
use std::str::Chars;
/// Token types produced by the lexer.
///
/// Value-carrying variants (`Int`, `Float`, `String`, `Char`, `Bool`,
/// `Ident`) hold the literal's decoded payload; all other variants are
/// fixed spellings (keywords, operators, delimiters).
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // Literals (payload is the decoded value, e.g. escapes already applied)
    Int(i64),
    Float(f64),
    String(String),
    Char(char),
    Bool(bool),
    // Identifiers and keywords
    Ident(String),
    // Keywords
    Fn,
    Let,
    If,
    Then,
    Else,
    Match,
    With,
    Effect,
    Handler,
    Run,
    Resume,
    Type,
    True,
    False,
    Import,
    Pub,
    As,
    From,   // from (for migrations)
    Latest, // latest (for @latest version constraint)
    // Operators
    Plus,       // +
    Minus,      // -
    Star,       // *
    Slash,      // /
    Percent,    // %
    Eq,         // =
    EqEq,       // ==
    Ne,         // !=
    Lt,         // <
    Le,         // <=
    Gt,         // >
    Ge,         // >=
    And,        // &&
    Or,         // ||
    Not,        // !
    Pipe,       // |
    PipeGt,     // |>
    Arrow,      // =>
    ThinArrow,  // ->
    Dot,        // .
    Colon,      // :
    ColonColon, // ::
    Comma,      // ,
    Semi,       // ;
    At,         // @
    // Delimiters
    LParen,   // (
    RParen,   // )
    LBrace,   // {
    RBrace,   // }
    LBracket, // [
    RBracket, // ]
    // Special
    Underscore, // _ (wildcard)
    Newline,    // significant line break (the lexer emits these; parsers may skip them)
    Eof,        // end of input; always the final token from `tokenize`
}
impl fmt::Display for TokenKind {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
TokenKind::Int(n) => write!(f, "{}", n),
TokenKind::Float(n) => write!(f, "{}", n),
TokenKind::String(s) => write!(f, "\"{}\"", s),
TokenKind::Char(c) => write!(f, "'{}'", c),
TokenKind::Bool(b) => write!(f, "{}", b),
TokenKind::Ident(s) => write!(f, "{}", s),
TokenKind::Fn => write!(f, "fn"),
TokenKind::Let => write!(f, "let"),
TokenKind::If => write!(f, "if"),
TokenKind::Then => write!(f, "then"),
TokenKind::Else => write!(f, "else"),
TokenKind::Match => write!(f, "match"),
TokenKind::With => write!(f, "with"),
TokenKind::Effect => write!(f, "effect"),
TokenKind::Handler => write!(f, "handler"),
TokenKind::Run => write!(f, "run"),
TokenKind::Resume => write!(f, "resume"),
TokenKind::Type => write!(f, "type"),
TokenKind::Import => write!(f, "import"),
TokenKind::Pub => write!(f, "pub"),
TokenKind::As => write!(f, "as"),
TokenKind::From => write!(f, "from"),
TokenKind::Latest => write!(f, "latest"),
TokenKind::True => write!(f, "true"),
TokenKind::False => write!(f, "false"),
TokenKind::Plus => write!(f, "+"),
TokenKind::Minus => write!(f, "-"),
TokenKind::Star => write!(f, "*"),
TokenKind::Slash => write!(f, "/"),
TokenKind::Percent => write!(f, "%"),
TokenKind::Eq => write!(f, "="),
TokenKind::EqEq => write!(f, "=="),
TokenKind::Ne => write!(f, "!="),
TokenKind::Lt => write!(f, "<"),
TokenKind::Le => write!(f, "<="),
TokenKind::Gt => write!(f, ">"),
TokenKind::Ge => write!(f, ">="),
TokenKind::And => write!(f, "&&"),
TokenKind::Or => write!(f, "||"),
TokenKind::Not => write!(f, "!"),
TokenKind::Pipe => write!(f, "|"),
TokenKind::PipeGt => write!(f, "|>"),
TokenKind::Arrow => write!(f, "=>"),
TokenKind::ThinArrow => write!(f, "->"),
TokenKind::Dot => write!(f, "."),
TokenKind::Colon => write!(f, ":"),
TokenKind::ColonColon => write!(f, "::"),
TokenKind::Comma => write!(f, ","),
TokenKind::Semi => write!(f, ";"),
TokenKind::At => write!(f, "@"),
TokenKind::LParen => write!(f, "("),
TokenKind::RParen => write!(f, ")"),
TokenKind::LBrace => write!(f, "{{"),
TokenKind::RBrace => write!(f, "}}"),
TokenKind::LBracket => write!(f, "["),
TokenKind::RBracket => write!(f, "]"),
TokenKind::Underscore => write!(f, "_"),
TokenKind::Newline => write!(f, "\\n"),
TokenKind::Eof => write!(f, "EOF"),
}
}
}
/// A token with its source location.
#[derive(Debug, Clone)]
pub struct Token {
    /// What was lexed, including any literal payload.
    pub kind: TokenKind,
    /// Byte range in the source this token was lexed from
    /// (the lexer's position counter advances by `len_utf8`).
    pub span: Span,
}
impl Token {
pub fn new(kind: TokenKind, span: Span) -> Self {
Self { kind, span }
}
}
/// Error produced when the lexer encounters invalid input
/// (unexpected character, lone `&`, or a malformed literal).
#[derive(Debug, Clone)]
pub struct LexError {
    /// Human-readable description of what went wrong.
    pub message: String,
    /// Byte range in the source where the error occurred.
    pub span: Span,
}
impl fmt::Display for LexError {
    /// Formats as `Lexer error at <start>-<end>: <message>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let span = &self.span;
        write!(f, "Lexer error at {}-{}: {}", span.start, span.end, self.message)
    }
}
/// The lexer: a single-pass scanner over a source string.
pub struct Lexer<'a> {
    /// Full source text; identifiers are sliced out of it by byte range.
    source: &'a str,
    /// Peekable character iterator over `source`.
    chars: Peekable<Chars<'a>>,
    /// Current byte offset into `source`, advanced by each char's `len_utf8`.
    pos: usize,
}
impl<'a> Lexer<'a> {
pub fn new(source: &'a str) -> Self {
Self {
source,
chars: source.chars().peekable(),
pos: 0,
}
}
/// Tokenize the entire source
pub fn tokenize(mut self) -> Result<Vec<Token>, LexError> {
let mut tokens = Vec::new();
loop {
let token = self.next_token()?;
let is_eof = token.kind == TokenKind::Eof;
tokens.push(token);
if is_eof {
break;
}
}
Ok(tokens)
}
fn next_token(&mut self) -> Result<Token, LexError> {
self.skip_whitespace_and_comments();
let start = self.pos;
let Some(c) = self.advance() else {
return Ok(Token::new(TokenKind::Eof, Span::new(start, start)));
};
let kind = match c {
// Single-character tokens
'+' => TokenKind::Plus,
'*' => TokenKind::Star,
'%' => TokenKind::Percent,
'(' => TokenKind::LParen,
')' => TokenKind::RParen,
'{' => TokenKind::LBrace,
'}' => TokenKind::RBrace,
'[' => TokenKind::LBracket,
']' => TokenKind::RBracket,
',' => TokenKind::Comma,
';' => TokenKind::Semi,
'@' => TokenKind::At,
'\n' => TokenKind::Newline,
// Multi-character tokens
'-' => {
if self.peek() == Some('>') {
self.advance();
TokenKind::ThinArrow
} else {
TokenKind::Minus
}
}
'/' => {
if self.peek() == Some('/') {
// Line comment
self.skip_line_comment();
return self.next_token();
} else {
TokenKind::Slash
}
}
'=' => {
if self.peek() == Some('=') {
self.advance();
TokenKind::EqEq
} else if self.peek() == Some('>') {
self.advance();
TokenKind::Arrow
} else {
TokenKind::Eq
}
}
'!' => {
if self.peek() == Some('=') {
self.advance();
TokenKind::Ne
} else {
TokenKind::Not
}
}
'<' => {
if self.peek() == Some('=') {
self.advance();
TokenKind::Le
} else {
TokenKind::Lt
}
}
'>' => {
if self.peek() == Some('=') {
self.advance();
TokenKind::Ge
} else {
TokenKind::Gt
}
}
'&' => {
if self.peek() == Some('&') {
self.advance();
TokenKind::And
} else {
return Err(LexError {
message: "Expected '&&'".into(),
span: Span::new(start, self.pos),
});
}
}
'|' => {
if self.peek() == Some('|') {
self.advance();
TokenKind::Or
} else if self.peek() == Some('>') {
self.advance();
TokenKind::PipeGt
} else {
TokenKind::Pipe
}
}
'.' => TokenKind::Dot,
':' => {
if self.peek() == Some(':') {
self.advance();
TokenKind::ColonColon
} else {
TokenKind::Colon
}
}
'_' => {
if self.peek().map_or(false, |c| c.is_alphanumeric()) {
// It's an identifier starting with _
self.scan_ident_rest(start)
} else {
TokenKind::Underscore
}
}
// String literals
'"' => self.scan_string(start)?,
// Char literals
'\'' => self.scan_char(start)?,
// Numbers
c if c.is_ascii_digit() => self.scan_number(c, start)?,
// Identifiers and keywords
c if c.is_alphabetic() || c == '_' => self.scan_ident_rest(start),
_ => {
return Err(LexError {
message: format!("Unexpected character: '{}'", c),
span: Span::new(start, self.pos),
});
}
};
Ok(Token::new(kind, Span::new(start, self.pos)))
}
fn advance(&mut self) -> Option<char> {
let c = self.chars.next()?;
self.pos += c.len_utf8();
Some(c)
}
fn peek(&mut self) -> Option<char> {
self.chars.peek().copied()
}
fn skip_whitespace_and_comments(&mut self) {
while let Some(c) = self.peek() {
if c == ' ' || c == '\t' || c == '\r' {
self.advance();
} else if c == '/' {
// Check for comment
let mut chars = self.chars.clone();
chars.next(); // consume '/'
if chars.peek() == Some(&'/') {
self.skip_line_comment();
} else {
break;
}
} else {
break;
}
}
}
fn skip_line_comment(&mut self) {
while let Some(c) = self.peek() {
if c == '\n' {
break;
}
self.advance();
}
}
fn scan_string(&mut self, _start: usize) -> Result<TokenKind, LexError> {
let mut value = String::new();
loop {
match self.advance() {
Some('"') => break,
Some('\\') => {
let escaped = match self.advance() {
Some('n') => '\n',
Some('r') => '\r',
Some('t') => '\t',
Some('\\') => '\\',
Some('"') => '"',
Some(c) => c,
None => {
return Err(LexError {
message: "Unterminated string".into(),
span: Span::new(_start, self.pos),
});
}
};
value.push(escaped);
}
Some(c) => value.push(c),
None => {
return Err(LexError {
message: "Unterminated string".into(),
span: Span::new(_start, self.pos),
});
}
}
}
Ok(TokenKind::String(value))
}
fn scan_char(&mut self, start: usize) -> Result<TokenKind, LexError> {
let c = match self.advance() {
Some('\\') => match self.advance() {
Some('n') => '\n',
Some('r') => '\r',
Some('t') => '\t',
Some('\\') => '\\',
Some('\'') => '\'',
Some(c) => c,
None => {
return Err(LexError {
message: "Unterminated character literal".into(),
span: Span::new(start, self.pos),
});
}
},
Some(c) => c,
None => {
return Err(LexError {
message: "Unterminated character literal".into(),
span: Span::new(start, self.pos),
});
}
};
if self.advance() != Some('\'') {
return Err(LexError {
message: "Expected closing quote for character literal".into(),
span: Span::new(start, self.pos),
});
}
Ok(TokenKind::Char(c))
}
fn scan_number(&mut self, first: char, start: usize) -> Result<TokenKind, LexError> {
let mut num_str = String::new();
num_str.push(first);
while let Some(c) = self.peek() {
if c.is_ascii_digit() || c == '_' {
if c != '_' {
num_str.push(c);
}
self.advance();
} else {
break;
}
}
// Check for float
if self.peek() == Some('.') {
// Look ahead to make sure it's not a method call
let mut chars = self.chars.clone();
chars.next(); // consume '.'
if chars.peek().map_or(false, |c| c.is_ascii_digit()) {
self.advance(); // consume '.'
num_str.push('.');
while let Some(c) = self.peek() {
if c.is_ascii_digit() || c == '_' {
if c != '_' {
num_str.push(c);
}
self.advance();
} else {
break;
}
}
let f: f64 = num_str.parse().map_err(|_| LexError {
message: "Invalid float literal".into(),
span: Span::new(start, self.pos),
})?;
return Ok(TokenKind::Float(f));
}
}
let n: i64 = num_str.parse().map_err(|_| LexError {
message: "Invalid integer literal".into(),
span: Span::new(start, self.pos),
})?;
Ok(TokenKind::Int(n))
}
fn scan_ident_rest(&mut self, start: usize) -> TokenKind {
while let Some(c) = self.peek() {
if c.is_alphanumeric() || c == '_' {
self.advance();
} else {
break;
}
}
let ident = &self.source[start..self.pos];
match ident {
"fn" => TokenKind::Fn,
"let" => TokenKind::Let,
"if" => TokenKind::If,
"then" => TokenKind::Then,
"else" => TokenKind::Else,
"match" => TokenKind::Match,
"with" => TokenKind::With,
"effect" => TokenKind::Effect,
"handler" => TokenKind::Handler,
"run" => TokenKind::Run,
"resume" => TokenKind::Resume,
"type" => TokenKind::Type,
"import" => TokenKind::Import,
"pub" => TokenKind::Pub,
"as" => TokenKind::As,
"from" => TokenKind::From,
"latest" => TokenKind::Latest,
"true" => TokenKind::Bool(true),
"false" => TokenKind::Bool(false),
_ => TokenKind::Ident(ident.to_string()),
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `source` and returns just the token kinds, with `Newline`
    /// tokens dropped so expectations stay compact.
    fn lex(source: &str) -> Vec<TokenKind> {
        let tokens = Lexer::new(source).tokenize().unwrap();
        tokens
            .into_iter()
            .map(|t| t.kind)
            .filter(|k| *k != TokenKind::Newline)
            .collect()
    }

    #[test]
    fn test_basic_tokens() {
        let expected = vec![
            TokenKind::Fn,
            TokenKind::Let,
            TokenKind::If,
            TokenKind::Else,
            TokenKind::Eof,
        ];
        assert_eq!(lex("fn let if else"), expected);
    }

    #[test]
    fn test_operators() {
        let expected = vec![
            TokenKind::Plus,
            TokenKind::Minus,
            TokenKind::Star,
            TokenKind::Slash,
            TokenKind::EqEq,
            TokenKind::Ne,
            TokenKind::PipeGt,
            TokenKind::Eof,
        ];
        assert_eq!(lex("+ - * / == != |>"), expected);
    }

    #[test]
    fn test_numbers() {
        let expected = vec![TokenKind::Int(42), TokenKind::Float(3.14), TokenKind::Eof];
        assert_eq!(lex("42 3.14"), expected);
    }

    #[test]
    fn test_strings() {
        let expected = vec![
            TokenKind::String(String::from("hello")),
            TokenKind::String(String::from("world")),
            TokenKind::Eof,
        ];
        assert_eq!(lex("\"hello\" \"world\""), expected);
    }

    #[test]
    fn test_function() {
        let ident = |s: &str| TokenKind::Ident(String::from(s));
        let expected = vec![
            TokenKind::Fn,
            ident("add"),
            TokenKind::LParen,
            ident("a"),
            TokenKind::Colon,
            ident("Int"),
            TokenKind::Comma,
            ident("b"),
            TokenKind::Colon,
            ident("Int"),
            TokenKind::RParen,
            TokenKind::Colon,
            ident("Int"),
            TokenKind::Eq,
            ident("a"),
            TokenKind::Plus,
            ident("b"),
            TokenKind::Eof,
        ];
        assert_eq!(lex("fn add(a: Int, b: Int): Int = a + b"), expected);
    }
}