Files
lux/src/lexer.rs
Brandon Lucas c81349d82c fix: resolve all stress test bugs
- Record equality: add Record case to values_equal in interpreter
- Invalid escapes: error on unknown escape sequences in lexer
- Unknown effects: validate effect names in check_function with suggestions
- Circular types: add DFS cycle detection in check_type_cycles
- Parser: require | for enum variants, enabling proper type alias syntax

All 265 tests pass.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-14 02:45:52 -05:00

847 lines
26 KiB
Rust

//! Lexer for the Lux language
#![allow(dead_code)]
use crate::ast::Span;
use std::fmt;
use std::iter::Peekable;
use std::str::Chars;
/// Part of an interpolated string literal.
///
/// `scan_string` splits `"Hello, {name}!"` into alternating `Literal`
/// and `Expr` parts; a plain string with no `{...}` never produces this
/// type (it stays a `TokenKind::String`).
#[derive(Debug, Clone, PartialEq)]
pub enum StringPart {
    /// Literal text, with escape sequences already resolved by the lexer.
    Literal(String),
    /// Embedded expression, stored as raw source text so the parser can
    /// lex and parse it later.
    Expr(String),
}
/// Token types produced by the lexer.
///
/// Data-carrying variants (`Int`, `Float`, `String`, `InterpolatedString`,
/// `Char`, `Bool`, `Ident`, `DocComment`) hold the scanned value; the
/// remaining variants are fixed-spelling keywords, operators, and
/// delimiters.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // Literals
    Int(i64),
    Float(f64),
    String(String),
    /// Interpolated string with embedded expressions: "Hello, {name}!"
    InterpolatedString(Vec<StringPart>),
    Char(char),
    Bool(bool),
    // Identifiers and keywords
    Ident(String),
    // Keywords
    Fn,
    Let,
    If,
    Then,
    Else,
    Match,
    With,
    Effect,
    Handler,
    Run,
    Resume,
    Type,
    True,
    False,
    Import,
    Pub,
    As,
    From,   // from (for migrations)
    Latest, // latest (for @latest version constraint)
    Trait,  // trait (for type classes)
    Impl,   // impl (for trait implementations)
    For,    // for (in impl Trait for Type)
    // Documentation
    DocComment(String), // /// doc comment (text after the slashes, trimmed)
    // Behavioral type keywords
    Is,            // is (for behavioral properties)
    Pure,          // pure
    Total,         // total
    Idempotent,    // idempotent
    Deterministic, // deterministic
    Commutative,   // commutative
    Where,         // where (for constraints)
    Assume,        // assume (for unverified properties)
    // Operators
    Plus,       // +
    Minus,      // -
    Star,       // *
    Slash,      // /
    Percent,    // %
    Eq,         // =
    EqEq,       // ==
    Ne,         // !=
    Lt,         // <
    Le,         // <=
    Gt,         // >
    Ge,         // >=
    And,        // &&
    Or,         // ||
    Not,        // !
    Pipe,       // |
    PipeGt,     // |>
    Arrow,      // =>
    ThinArrow,  // ->
    Dot,        // .
    Colon,      // :
    ColonColon, // ::
    Comma,      // ,
    Semi,       // ;
    At,         // @
    // Delimiters
    LParen,   // (
    RParen,   // )
    LBrace,   // {
    RBrace,   // }
    LBracket, // [
    RBracket, // ]
    // Special
    Underscore, // _ (wildcard pattern)
    /// Significant line break; the lexer emits newlines as tokens rather
    /// than skipping them.
    Newline,
    /// End of input; always the final token emitted by `tokenize`.
    Eof,
}
impl fmt::Display for TokenKind {
    /// Render the token roughly as it would appear in source code
    /// (strings re-quoted, keywords/operators by their spelling,
    /// `Newline` as the two characters `\n`, end of input as `EOF`).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            TokenKind::Int(value) => write!(f, "{}", value),
            TokenKind::Float(value) => write!(f, "{}", value),
            TokenKind::String(text) => write!(f, "\"{}\"", text),
            TokenKind::InterpolatedString(parts) => {
                f.write_str("\"")?;
                for part in parts {
                    match part {
                        StringPart::Literal(text) => f.write_str(text)?,
                        StringPart::Expr(expr) => write!(f, "{{{}}}", expr)?,
                    }
                }
                f.write_str("\"")
            }
            TokenKind::Char(ch) => write!(f, "'{}'", ch),
            TokenKind::Bool(value) => write!(f, "{}", value),
            TokenKind::Ident(name) => f.write_str(name),
            TokenKind::Fn => f.write_str("fn"),
            TokenKind::Let => f.write_str("let"),
            TokenKind::If => f.write_str("if"),
            TokenKind::Then => f.write_str("then"),
            TokenKind::Else => f.write_str("else"),
            TokenKind::Match => f.write_str("match"),
            TokenKind::With => f.write_str("with"),
            TokenKind::Effect => f.write_str("effect"),
            TokenKind::Handler => f.write_str("handler"),
            TokenKind::Run => f.write_str("run"),
            TokenKind::Resume => f.write_str("resume"),
            TokenKind::Type => f.write_str("type"),
            TokenKind::Import => f.write_str("import"),
            TokenKind::Pub => f.write_str("pub"),
            TokenKind::As => f.write_str("as"),
            TokenKind::From => f.write_str("from"),
            TokenKind::Latest => f.write_str("latest"),
            TokenKind::Trait => f.write_str("trait"),
            TokenKind::Impl => f.write_str("impl"),
            TokenKind::For => f.write_str("for"),
            TokenKind::DocComment(text) => write!(f, "/// {}", text),
            TokenKind::Is => f.write_str("is"),
            TokenKind::Pure => f.write_str("pure"),
            TokenKind::Total => f.write_str("total"),
            TokenKind::Idempotent => f.write_str("idempotent"),
            TokenKind::Deterministic => f.write_str("deterministic"),
            TokenKind::Commutative => f.write_str("commutative"),
            TokenKind::Where => f.write_str("where"),
            TokenKind::Assume => f.write_str("assume"),
            TokenKind::True => f.write_str("true"),
            TokenKind::False => f.write_str("false"),
            TokenKind::Plus => f.write_str("+"),
            TokenKind::Minus => f.write_str("-"),
            TokenKind::Star => f.write_str("*"),
            TokenKind::Slash => f.write_str("/"),
            TokenKind::Percent => f.write_str("%"),
            TokenKind::Eq => f.write_str("="),
            TokenKind::EqEq => f.write_str("=="),
            TokenKind::Ne => f.write_str("!="),
            TokenKind::Lt => f.write_str("<"),
            TokenKind::Le => f.write_str("<="),
            TokenKind::Gt => f.write_str(">"),
            TokenKind::Ge => f.write_str(">="),
            TokenKind::And => f.write_str("&&"),
            TokenKind::Or => f.write_str("||"),
            TokenKind::Not => f.write_str("!"),
            TokenKind::Pipe => f.write_str("|"),
            TokenKind::PipeGt => f.write_str("|>"),
            TokenKind::Arrow => f.write_str("=>"),
            TokenKind::ThinArrow => f.write_str("->"),
            TokenKind::Dot => f.write_str("."),
            TokenKind::Colon => f.write_str(":"),
            TokenKind::ColonColon => f.write_str("::"),
            TokenKind::Comma => f.write_str(","),
            TokenKind::Semi => f.write_str(";"),
            TokenKind::At => f.write_str("@"),
            TokenKind::LParen => f.write_str("("),
            TokenKind::RParen => f.write_str(")"),
            TokenKind::LBrace => f.write_str("{"),
            TokenKind::RBrace => f.write_str("}"),
            TokenKind::LBracket => f.write_str("["),
            TokenKind::RBracket => f.write_str("]"),
            TokenKind::Underscore => f.write_str("_"),
            TokenKind::Newline => f.write_str("\\n"),
            TokenKind::Eof => f.write_str("EOF"),
        }
    }
}
/// A token paired with its source location.
#[derive(Debug, Clone)]
pub struct Token {
    // What kind of token this is, including any literal payload.
    pub kind: TokenKind,
    // Byte range in the source text (positions advance by UTF-8 length).
    pub span: Span,
}
impl Token {
pub fn new(kind: TokenKind, span: Span) -> Self {
Self { kind, span }
}
}
/// Lexer error: a human-readable message plus the byte span of the
/// offending input.
#[derive(Debug, Clone)]
pub struct LexError {
    // Description of what went wrong (e.g. "Unterminated string").
    pub message: String,
    // Byte range in the source where the error was detected.
    pub span: Span,
}
impl fmt::Display for LexError {
    /// Render as `Lexer error at <start>-<end>: <message>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "Lexer error at {}-{}: {}", self.span.start, self.span.end, self.message)
    }
}
/// The lexer: a single-pass scanner over the source text.
pub struct Lexer<'a> {
    // Full source text; kept so identifiers/keywords can be sliced out
    // by byte offset instead of re-accumulating characters.
    source: &'a str,
    // Peekable character stream over `source`.
    chars: Peekable<Chars<'a>>,
    // Current byte offset into `source`, kept in sync with `chars`.
    pos: usize,
}
impl<'a> Lexer<'a> {
pub fn new(source: &'a str) -> Self {
Self {
source,
chars: source.chars().peekable(),
pos: 0,
}
}
/// Tokenize the entire source.
///
/// Consumes the lexer and returns the full token stream, always ending
/// with a single `Eof` token. Stops at the first lexical error.
pub fn tokenize(mut self) -> Result<Vec<Token>, LexError> {
    let mut tokens = Vec::new();
    loop {
        let token = self.next_token()?;
        let done = matches!(token.kind, TokenKind::Eof);
        tokens.push(token);
        if done {
            return Ok(tokens);
        }
    }
}
fn next_token(&mut self) -> Result<Token, LexError> {
self.skip_whitespace_and_comments();
let start = self.pos;
let Some(c) = self.advance() else {
return Ok(Token::new(TokenKind::Eof, Span::new(start, start)));
};
let kind = match c {
// Single-character tokens
'+' => TokenKind::Plus,
'*' => TokenKind::Star,
'%' => TokenKind::Percent,
'(' => TokenKind::LParen,
')' => TokenKind::RParen,
'{' => TokenKind::LBrace,
'}' => TokenKind::RBrace,
'[' => TokenKind::LBracket,
']' => TokenKind::RBracket,
',' => TokenKind::Comma,
';' => TokenKind::Semi,
'@' => TokenKind::At,
'\n' => TokenKind::Newline,
// Multi-character tokens
'-' => {
if self.peek() == Some('>') {
self.advance();
TokenKind::ThinArrow
} else {
TokenKind::Minus
}
}
'/' => {
if self.peek() == Some('/') {
self.advance(); // consume second '/'
// Check if this is a doc comment (///)
if self.peek() == Some('/') {
self.advance(); // consume third '/'
return Ok(self.scan_doc_comment(start));
} else {
// Regular line comment
self.skip_line_comment();
return self.next_token();
}
} else {
TokenKind::Slash
}
}
'=' => {
if self.peek() == Some('=') {
self.advance();
TokenKind::EqEq
} else if self.peek() == Some('>') {
self.advance();
TokenKind::Arrow
} else {
TokenKind::Eq
}
}
'!' => {
if self.peek() == Some('=') {
self.advance();
TokenKind::Ne
} else {
TokenKind::Not
}
}
'<' => {
if self.peek() == Some('=') {
self.advance();
TokenKind::Le
} else {
TokenKind::Lt
}
}
'>' => {
if self.peek() == Some('=') {
self.advance();
TokenKind::Ge
} else {
TokenKind::Gt
}
}
'&' => {
if self.peek() == Some('&') {
self.advance();
TokenKind::And
} else {
return Err(LexError {
message: "Expected '&&'".into(),
span: Span::new(start, self.pos),
});
}
}
'|' => {
if self.peek() == Some('|') {
self.advance();
TokenKind::Or
} else if self.peek() == Some('>') {
self.advance();
TokenKind::PipeGt
} else {
TokenKind::Pipe
}
}
'.' => TokenKind::Dot,
':' => {
if self.peek() == Some(':') {
self.advance();
TokenKind::ColonColon
} else {
TokenKind::Colon
}
}
'_' => {
if self.peek().map_or(false, |c| c.is_alphanumeric()) {
// It's an identifier starting with _
self.scan_ident_rest(start)
} else {
TokenKind::Underscore
}
}
// String literals
'"' => self.scan_string(start)?,
// Char literals
'\'' => self.scan_char(start)?,
// Numbers
c if c.is_ascii_digit() => self.scan_number(c, start)?,
// Identifiers and keywords
c if c.is_alphabetic() || c == '_' => self.scan_ident_rest(start),
_ => {
return Err(LexError {
message: format!("Unexpected character: '{}'", c),
span: Span::new(start, self.pos),
});
}
};
Ok(Token::new(kind, Span::new(start, self.pos)))
}
/// Consume one character, keeping `pos` (a byte offset) in sync with
/// the character stream. Returns `None` at end of input.
fn advance(&mut self) -> Option<char> {
    self.chars.next().map(|c| {
        self.pos += c.len_utf8();
        c
    })
}
/// Look at the next character without consuming it.
fn peek(&mut self) -> Option<char> {
    self.chars.peek().map(|&c| c)
}
/// Skip spaces, tabs, carriage returns, and plain `//` line comments.
///
/// Newlines are significant (they become `Newline` tokens) and are not
/// skipped here. `///` doc comments are also left in place so that
/// `next_token` can turn them into `DocComment` tokens; the previous
/// two-character lookahead treated `///` like an ordinary comment and
/// swallowed it, so `scan_doc_comment` was never reached.
fn skip_whitespace_and_comments(&mut self) {
    while let Some(c) = self.peek() {
        match c {
            ' ' | '\t' | '\r' => {
                self.advance();
            }
            '/' => {
                // Look ahead without consuming anything from `self`.
                let mut lookahead = self.chars.clone();
                lookahead.next(); // the first '/'
                if lookahead.peek() != Some(&'/') {
                    break; // lone '/' is the division operator
                }
                lookahead.next(); // the second '/'
                if lookahead.peek() == Some(&'/') {
                    break; // `///` doc comment — tokenized by next_token
                }
                self.skip_line_comment();
            }
            _ => break,
        }
    }
}
/// Consume characters up to, but not including, the next newline so
/// the newline itself is still emitted as a `Newline` token.
fn skip_line_comment(&mut self) {
    while self.peek().map_or(false, |c| c != '\n') {
        self.advance();
    }
}
/// Scan the body of a `///` doc comment (the three slashes are already
/// consumed). `start` is the byte offset of the first slash, so the
/// token's span covers the whole comment.
fn scan_doc_comment(&mut self, start: usize) -> Token {
    // Drop the spaces/tabs that separate `///` from the comment text.
    while matches!(self.peek(), Some(' ') | Some('\t')) {
        self.advance();
    }
    // Everything up to (but excluding) the newline is the comment body.
    let mut text = String::new();
    while let Some(c) = self.peek() {
        if c == '\n' {
            break;
        }
        self.advance();
        text.push(c);
    }
    Token::new(
        TokenKind::DocComment(text.trim_end().to_string()),
        Span::new(start, self.pos),
    )
}
/// Scan a string literal; the opening `"` has already been consumed.
/// `start` is the byte offset of the opening quote, used for error
/// spans (it was misleadingly named `_start` despite being used).
///
/// Returns `TokenKind::String` for plain strings and
/// `TokenKind::InterpolatedString` when `{expr}` interpolations occur.
/// `\{` and `\}` yield literal braces without starting an
/// interpolation; the escapes `\n \r \t \\ \" \0 \'` are resolved
/// here, and any other escape is a `LexError`.
///
/// NOTE(review): interpolation scanning only balances braces — a
/// string literal containing `{`/`}` inside the embedded expression
/// would confuse it. Presumed acceptable for now; confirm against the
/// parser's expectations.
fn scan_string(&mut self, start: usize) -> Result<TokenKind, LexError> {
    let mut parts: Vec<StringPart> = Vec::new();
    let mut current_literal = String::new();
    loop {
        match self.advance() {
            // Closing quote ends the literal.
            Some('"') => break,
            Some('\\') => {
                // `\{` / `\}` escape the interpolation braces.
                match self.peek() {
                    Some('{') => {
                        self.advance();
                        current_literal.push('{');
                    }
                    Some('}') => {
                        self.advance();
                        current_literal.push('}');
                    }
                    _ => {
                        let escape_start = self.pos;
                        let escaped = match self.advance() {
                            Some('n') => '\n',
                            Some('r') => '\r',
                            Some('t') => '\t',
                            Some('\\') => '\\',
                            Some('"') => '"',
                            Some('0') => '\0',
                            Some('\'') => '\'',
                            Some(c) => {
                                return Err(LexError {
                                    message: format!("Invalid escape sequence: \\{}", c),
                                    // -1 points back at the backslash (1 byte).
                                    span: Span::new(escape_start - 1, self.pos),
                                });
                            }
                            None => {
                                return Err(LexError {
                                    message: "Unterminated string".into(),
                                    span: Span::new(start, self.pos),
                                });
                            }
                        };
                        current_literal.push(escaped);
                    }
                }
            }
            Some('{') => {
                // Start of interpolation: flush any accumulated literal text.
                if !current_literal.is_empty() {
                    parts.push(StringPart::Literal(std::mem::take(&mut current_literal)));
                }
                // Capture the raw expression text until the matching '}'.
                let mut expr_text = String::new();
                let mut brace_depth = 1;
                loop {
                    match self.advance() {
                        Some('{') => {
                            brace_depth += 1;
                            expr_text.push('{');
                        }
                        Some('}') => {
                            brace_depth -= 1;
                            if brace_depth == 0 {
                                break;
                            }
                            expr_text.push('}');
                        }
                        Some(c) => expr_text.push(c),
                        None => {
                            return Err(LexError {
                                message: "Unterminated interpolation in string".into(),
                                span: Span::new(start, self.pos),
                            });
                        }
                    }
                }
                parts.push(StringPart::Expr(expr_text));
            }
            Some(c) => current_literal.push(c),
            None => {
                return Err(LexError {
                    message: "Unterminated string".into(),
                    span: Span::new(start, self.pos),
                });
            }
        }
    }
    // No interpolations: this is a plain string token.
    if parts.is_empty() {
        return Ok(TokenKind::String(current_literal));
    }
    // Flush any trailing literal text after the last interpolation.
    if !current_literal.is_empty() {
        parts.push(StringPart::Literal(current_literal));
    }
    Ok(TokenKind::InterpolatedString(parts))
}
/// Scan a character literal; the opening `'` has already been consumed.
///
/// Supports the same escape set as string literals (`\n \r \t \\ \' \"
/// \0`) and rejects unknown escapes. Previously any unrecognized escape
/// silently passed the raw character through, so `'\0'` produced `'0'`
/// and `'\q'` produced `'q'` — inconsistent with `scan_string`, which
/// errors on invalid escapes.
fn scan_char(&mut self, start: usize) -> Result<TokenKind, LexError> {
    let c = match self.advance() {
        Some('\\') => {
            let escape_start = self.pos;
            match self.advance() {
                Some('n') => '\n',
                Some('r') => '\r',
                Some('t') => '\t',
                Some('\\') => '\\',
                Some('\'') => '\'',
                Some('"') => '"',
                Some('0') => '\0',
                Some(c) => {
                    return Err(LexError {
                        message: format!("Invalid escape sequence: \\{}", c),
                        // -1 points back at the backslash (1 byte).
                        span: Span::new(escape_start - 1, self.pos),
                    });
                }
                None => {
                    return Err(LexError {
                        message: "Unterminated character literal".into(),
                        span: Span::new(start, self.pos),
                    });
                }
            }
        }
        Some(c) => c,
        None => {
            return Err(LexError {
                message: "Unterminated character literal".into(),
                span: Span::new(start, self.pos),
            });
        }
    };
    if self.advance() != Some('\'') {
        return Err(LexError {
            message: "Expected closing quote for character literal".into(),
            span: Span::new(start, self.pos),
        });
    }
    Ok(TokenKind::Char(c))
}
fn scan_number(&mut self, first: char, start: usize) -> Result<TokenKind, LexError> {
let mut num_str = String::new();
num_str.push(first);
while let Some(c) = self.peek() {
if c.is_ascii_digit() || c == '_' {
if c != '_' {
num_str.push(c);
}
self.advance();
} else {
break;
}
}
// Check for float
if self.peek() == Some('.') {
// Look ahead to make sure it's not a method call
let mut chars = self.chars.clone();
chars.next(); // consume '.'
if chars.peek().map_or(false, |c| c.is_ascii_digit()) {
self.advance(); // consume '.'
num_str.push('.');
while let Some(c) = self.peek() {
if c.is_ascii_digit() || c == '_' {
if c != '_' {
num_str.push(c);
}
self.advance();
} else {
break;
}
}
let f: f64 = num_str.parse().map_err(|_| LexError {
message: "Invalid float literal".into(),
span: Span::new(start, self.pos),
})?;
return Ok(TokenKind::Float(f));
}
}
let n: i64 = num_str.parse().map_err(|_| LexError {
message: "Invalid integer literal".into(),
span: Span::new(start, self.pos),
})?;
Ok(TokenKind::Int(n))
}
/// Consume the remainder of an identifier (its first character is
/// already consumed) and map reserved words to keyword tokens.
///
/// `start` is the byte offset of the identifier's first character, so
/// the full spelling is recovered by slicing `source` directly rather
/// than accumulating characters.
fn scan_ident_rest(&mut self, start: usize) -> TokenKind {
    while let Some(c) = self.peek() {
        if c.is_alphanumeric() || c == '_' {
            self.advance();
        } else {
            break;
        }
    }
    let ident = &self.source[start..self.pos];
    // Keyword table: anything not listed is an ordinary identifier.
    match ident {
        "fn" => TokenKind::Fn,
        "let" => TokenKind::Let,
        "if" => TokenKind::If,
        "then" => TokenKind::Then,
        "else" => TokenKind::Else,
        "match" => TokenKind::Match,
        "with" => TokenKind::With,
        "effect" => TokenKind::Effect,
        "handler" => TokenKind::Handler,
        "run" => TokenKind::Run,
        "resume" => TokenKind::Resume,
        "type" => TokenKind::Type,
        "import" => TokenKind::Import,
        "pub" => TokenKind::Pub,
        "as" => TokenKind::As,
        "from" => TokenKind::From,
        "latest" => TokenKind::Latest,
        "trait" => TokenKind::Trait,
        "impl" => TokenKind::Impl,
        "for" => TokenKind::For,
        "is" => TokenKind::Is,
        "pure" => TokenKind::Pure,
        "total" => TokenKind::Total,
        "idempotent" => TokenKind::Idempotent,
        "deterministic" => TokenKind::Deterministic,
        "commutative" => TokenKind::Commutative,
        "where" => TokenKind::Where,
        "assume" => TokenKind::Assume,
        // `true`/`false` become Bool literals, not keyword tokens.
        "true" => TokenKind::Bool(true),
        "false" => TokenKind::Bool(false),
        _ => TokenKind::Ident(ident.to_string()),
    }
}
}
#[cfg(test)]
mod tests {
    use super::*;
    // Lex `source` to a flat list of kinds, dropping Newline tokens so
    // expectations don't depend on line layout. Panics on lex errors.
    fn lex(source: &str) -> Vec<TokenKind> {
        Lexer::new(source)
            .tokenize()
            .unwrap()
            .into_iter()
            .map(|t| t.kind)
            .filter(|k| !matches!(k, TokenKind::Newline))
            .collect()
    }
    #[test]
    fn test_basic_tokens() {
        assert_eq!(
            lex("fn let if else"),
            vec![
                TokenKind::Fn,
                TokenKind::Let,
                TokenKind::If,
                TokenKind::Else,
                TokenKind::Eof
            ]
        );
    }
    #[test]
    fn test_operators() {
        assert_eq!(
            lex("+ - * / == != |>"),
            vec![
                TokenKind::Plus,
                TokenKind::Minus,
                TokenKind::Star,
                TokenKind::Slash,
                TokenKind::EqEq,
                TokenKind::Ne,
                TokenKind::PipeGt,
                TokenKind::Eof
            ]
        );
    }
    #[test]
    fn test_numbers() {
        assert_eq!(
            lex("42 3.14"),
            vec![TokenKind::Int(42), TokenKind::Float(3.14), TokenKind::Eof]
        );
    }
    #[test]
    fn test_strings() {
        assert_eq!(
            lex("\"hello\" \"world\""),
            vec![
                TokenKind::String("hello".into()),
                TokenKind::String("world".into()),
                TokenKind::Eof
            ]
        );
    }
    #[test]
    fn test_string_interpolation_simple() {
        assert_eq!(
            lex("\"Hello, {name}!\""),
            vec![
                TokenKind::InterpolatedString(vec![
                    StringPart::Literal("Hello, ".into()),
                    StringPart::Expr("name".into()),
                    StringPart::Literal("!".into()),
                ]),
                TokenKind::Eof
            ]
        );
    }
    #[test]
    fn test_string_interpolation_multiple() {
        assert_eq!(
            lex("\"{x} + {y} = {x + y}\""),
            vec![
                TokenKind::InterpolatedString(vec![
                    StringPart::Expr("x".into()),
                    StringPart::Literal(" + ".into()),
                    StringPart::Expr("y".into()),
                    StringPart::Literal(" = ".into()),
                    StringPart::Expr("x + y".into()),
                ]),
                TokenKind::Eof
            ]
        );
    }
    #[test]
    fn test_string_interpolation_escaped_braces() {
        // `\{` and `\}` produce literal braces, so no interpolation
        // occurs and the token stays a plain String.
        assert_eq!(
            lex("\"literal \\{braces\\}\""),
            vec![
                TokenKind::String("literal {braces}".into()),
                TokenKind::Eof
            ]
        );
    }
    #[test]
    fn test_string_no_interpolation() {
        // Plain strings without interpolation should remain as String tokens
        assert_eq!(
            lex("\"no interpolation here\""),
            vec![
                TokenKind::String("no interpolation here".into()),
                TokenKind::Eof
            ]
        );
    }
    #[test]
    fn test_function() {
        // End-to-end check of a full function signature and body.
        assert_eq!(
            lex("fn add(a: Int, b: Int): Int = a + b"),
            vec![
                TokenKind::Fn,
                TokenKind::Ident("add".into()),
                TokenKind::LParen,
                TokenKind::Ident("a".into()),
                TokenKind::Colon,
                TokenKind::Ident("Int".into()),
                TokenKind::Comma,
                TokenKind::Ident("b".into()),
                TokenKind::Colon,
                TokenKind::Ident("Int".into()),
                TokenKind::RParen,
                TokenKind::Colon,
                TokenKind::Ident("Int".into()),
                TokenKind::Eq,
                TokenKind::Ident("a".into()),
                TokenKind::Plus,
                TokenKind::Ident("b".into()),
                TokenKind::Eof
            ]
        );
    }
}