//! Lexer for the Lux language #![allow(dead_code)] use crate::ast::Span; use std::fmt; use std::iter::Peekable; use std::str::Chars; /// Part of an interpolated string #[derive(Debug, Clone, PartialEq)] pub enum StringPart { /// Literal text Literal(String), /// Expression to be evaluated (stored as source text to be parsed later) Expr(String), } /// Token types #[derive(Debug, Clone, PartialEq)] pub enum TokenKind { // Literals Int(i64), Float(f64), String(String), /// Interpolated string with embedded expressions: "Hello, {name}!" InterpolatedString(Vec), Char(char), Bool(bool), // Identifiers and keywords Ident(String), // Keywords Fn, Let, If, Then, Else, Match, With, Effect, Handler, Run, Resume, Type, True, False, Import, Pub, As, From, // from (for migrations) Latest, // latest (for @latest version constraint) Trait, // trait (for type classes) Impl, // impl (for trait implementations) For, // for (in impl Trait for Type) // Documentation DocComment(String), // /// doc comment // Behavioral type keywords Is, // is (for behavioral properties) Pure, // pure Total, // total Idempotent, // idempotent Deterministic, // deterministic Commutative, // commutative Where, // where (for constraints) Assume, // assume (for unverified properties) // Operators Plus, // + PlusPlus, // ++ Minus, // - Star, // * Slash, // / Percent, // % Eq, // = EqEq, // == Ne, // != Lt, // < Le, // <= Gt, // > Ge, // >= And, // && Or, // || Not, // ! Pipe, // | PipeGt, // |> Arrow, // => ThinArrow, // -> Dot, // . DotDotDot, // ... Colon, // : ColonColon, // :: Comma, // , Semi, // ; At, // @ // Delimiters LParen, // ( RParen, // ) LBrace, // { RBrace, // } LBracket, // [ RBracket, // ] // Special Underscore, // _ Newline, Eof, } impl fmt::Display for TokenKind { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { TokenKind::Int(n) => write!(f, "{}", n), TokenKind::Float(n) => write!(f, "{}", n), TokenKind::String(s) => write!(f, "\"{}\"", s), TokenKind::InterpolatedString(parts) => { write!(f, "\"")?; for part in parts { match part { StringPart::Literal(s) => write!(f, "{}", s)?, StringPart::Expr(e) => write!(f, "{{{}}}", e)?, } } write!(f, "\"") } TokenKind::Char(c) => write!(f, "'{}'", c), TokenKind::Bool(b) => write!(f, "{}", b), TokenKind::Ident(s) => write!(f, "{}", s), TokenKind::Fn => write!(f, "fn"), TokenKind::Let => write!(f, "let"), TokenKind::If => write!(f, "if"), TokenKind::Then => write!(f, "then"), TokenKind::Else => write!(f, "else"), TokenKind::Match => write!(f, "match"), TokenKind::With => write!(f, "with"), TokenKind::Effect => write!(f, "effect"), TokenKind::Handler => write!(f, "handler"), TokenKind::Run => write!(f, "run"), TokenKind::Resume => write!(f, "resume"), TokenKind::Type => write!(f, "type"), TokenKind::Import => write!(f, "import"), TokenKind::Pub => write!(f, "pub"), TokenKind::As => write!(f, "as"), TokenKind::From => write!(f, "from"), TokenKind::Latest => write!(f, "latest"), TokenKind::Trait => write!(f, "trait"), TokenKind::Impl => write!(f, "impl"), TokenKind::For => write!(f, "for"), TokenKind::DocComment(s) => write!(f, "/// {}", s), TokenKind::Is => write!(f, "is"), TokenKind::Pure => write!(f, "pure"), TokenKind::Total => write!(f, "total"), TokenKind::Idempotent => write!(f, "idempotent"), TokenKind::Deterministic => write!(f, "deterministic"), TokenKind::Commutative => write!(f, "commutative"), TokenKind::Where => write!(f, "where"), TokenKind::Assume => write!(f, "assume"), TokenKind::True => write!(f, "true"), TokenKind::False => write!(f, "false"), TokenKind::Plus => write!(f, "+"), TokenKind::PlusPlus => write!(f, "++"), TokenKind::Minus => write!(f, "-"), TokenKind::Star => write!(f, "*"), TokenKind::Slash => write!(f, "/"), TokenKind::Percent => write!(f, "%"), TokenKind::Eq => write!(f, "="), TokenKind::EqEq => write!(f, "=="), TokenKind::Ne => write!(f, "!="), TokenKind::Lt => write!(f, "<"), TokenKind::Le => write!(f, "<="), TokenKind::Gt => write!(f, ">"), TokenKind::Ge => write!(f, ">="), TokenKind::And => write!(f, "&&"), TokenKind::Or => write!(f, "||"), TokenKind::Not => write!(f, "!"), TokenKind::Pipe => write!(f, "|"), TokenKind::PipeGt => write!(f, "|>"), TokenKind::Arrow => write!(f, "=>"), TokenKind::ThinArrow => write!(f, "->"), TokenKind::Dot => write!(f, "."), TokenKind::DotDotDot => write!(f, "..."), TokenKind::Colon => write!(f, ":"), TokenKind::ColonColon => write!(f, "::"), TokenKind::Comma => write!(f, ","), TokenKind::Semi => write!(f, ";"), TokenKind::At => write!(f, "@"), TokenKind::LParen => write!(f, "("), TokenKind::RParen => write!(f, ")"), TokenKind::LBrace => write!(f, "{{"), TokenKind::RBrace => write!(f, "}}"), TokenKind::LBracket => write!(f, "["), TokenKind::RBracket => write!(f, "]"), TokenKind::Underscore => write!(f, "_"), TokenKind::Newline => write!(f, "\\n"), TokenKind::Eof => write!(f, "EOF"), } } } /// A token with its source location #[derive(Debug, Clone)] pub struct Token { pub kind: TokenKind, pub span: Span, } impl Token { pub fn new(kind: TokenKind, span: Span) -> Self { Self { kind, span } } } /// Lexer error #[derive(Debug, Clone)] pub struct LexError { pub message: String, pub span: Span, } impl fmt::Display for LexError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "Lexer error at {}-{}: {}", self.span.start, self.span.end, self.message ) } } /// The lexer pub struct Lexer<'a> { source: &'a str, chars: Peekable>, pos: usize, } impl<'a> Lexer<'a> { pub fn new(source: &'a str) -> Self { Self { source, chars: source.chars().peekable(), pos: 0, } } /// Tokenize the entire source pub fn tokenize(mut self) -> Result, LexError> { let mut tokens = Vec::new(); loop { let token = self.next_token()?; let is_eof = token.kind == TokenKind::Eof; tokens.push(token); if is_eof { break; } } Ok(tokens) } fn next_token(&mut self) -> Result { self.skip_whitespace_and_comments(); let start = self.pos; let Some(c) = self.advance() else { return Ok(Token::new(TokenKind::Eof, Span::new(start, start))); }; let kind = match c { // Single-character tokens '+' => { if self.peek() == Some('+') { self.advance(); TokenKind::PlusPlus } else { TokenKind::Plus } } '*' => TokenKind::Star, '%' => TokenKind::Percent, '(' => TokenKind::LParen, ')' => TokenKind::RParen, '{' => TokenKind::LBrace, '}' => TokenKind::RBrace, '[' => TokenKind::LBracket, ']' => TokenKind::RBracket, ',' => TokenKind::Comma, ';' => TokenKind::Semi, '@' => TokenKind::At, '\n' => TokenKind::Newline, // Multi-character tokens '-' => { if self.peek() == Some('>') { self.advance(); TokenKind::ThinArrow } else { TokenKind::Minus } } '/' => { if self.peek() == Some('/') { self.advance(); // consume second '/' // Check if this is a doc comment (///) if self.peek() == Some('/') { self.advance(); // consume third '/' return Ok(self.scan_doc_comment(start)); } else { // Regular line comment self.skip_line_comment(); return self.next_token(); } } else { TokenKind::Slash } } '=' => { if self.peek() == Some('=') { self.advance(); TokenKind::EqEq } else if self.peek() == Some('>') { self.advance(); TokenKind::Arrow } else { TokenKind::Eq } } '!' => { if self.peek() == Some('=') { self.advance(); TokenKind::Ne } else { TokenKind::Not } } '<' => { if self.peek() == Some('=') { self.advance(); TokenKind::Le } else { TokenKind::Lt } } '>' => { if self.peek() == Some('=') { self.advance(); TokenKind::Ge } else { TokenKind::Gt } } '&' => { if self.peek() == Some('&') { self.advance(); TokenKind::And } else { return Err(LexError { message: "Expected '&&'".into(), span: Span::new(start, self.pos), }); } } '|' => { if self.peek() == Some('|') { self.advance(); TokenKind::Or } else if self.peek() == Some('>') { self.advance(); TokenKind::PipeGt } else { TokenKind::Pipe } } '.' => { if self.peek() == Some('.') { // Check for ... (need to peek past second dot) // We look at source directly since we can only peek one ahead let next_next = self.source[self.pos..].chars().nth(1); if next_next == Some('.') { self.advance(); // consume second '.' self.advance(); // consume third '.' TokenKind::DotDotDot } else { TokenKind::Dot } } else { TokenKind::Dot } } ':' => { if self.peek() == Some(':') { self.advance(); TokenKind::ColonColon } else { TokenKind::Colon } } '_' => { if self.peek().map_or(false, |c| c.is_alphanumeric()) { // It's an identifier starting with _ self.scan_ident_rest(start) } else { TokenKind::Underscore } } // String literals '"' => self.scan_string(start)?, // Char literals '\'' => self.scan_char(start)?, // Numbers c if c.is_ascii_digit() => self.scan_number(c, start)?, // Identifiers and keywords c if c.is_alphabetic() || c == '_' => self.scan_ident_rest(start), _ => { return Err(LexError { message: format!("Unexpected character: '{}'", c), span: Span::new(start, self.pos), }); } }; Ok(Token::new(kind, Span::new(start, self.pos))) } fn advance(&mut self) -> Option { let c = self.chars.next()?; self.pos += c.len_utf8(); Some(c) } fn peek(&mut self) -> Option { self.chars.peek().copied() } fn skip_whitespace_and_comments(&mut self) { while let Some(c) = self.peek() { if c == ' ' || c == '\t' || c == '\r' { self.advance(); } else if c == '/' { // Check for comment let mut chars = self.chars.clone(); chars.next(); // consume '/' if chars.peek() == Some(&'/') { self.skip_line_comment(); } else { break; } } else { break; } } } fn skip_line_comment(&mut self) { while let Some(c) = self.peek() { if c == '\n' { break; } self.advance(); } } fn scan_doc_comment(&mut self, start: usize) -> Token { // Skip leading whitespace after /// while self.peek() == Some(' ') || self.peek() == Some('\t') { self.advance(); } // Collect the rest of the line let mut content = String::new(); while let Some(c) = self.peek() { if c == '\n' { break; } content.push(c); self.advance(); } Token::new( TokenKind::DocComment(content.trim_end().to_string()), Span::new(start, self.pos), ) } fn scan_string(&mut self, _start: usize) -> Result { let mut parts: Vec = Vec::new(); let mut current_literal = String::new(); loop { match self.advance() { Some('"') => break, Some('\\') => { // Check for escaped brace match self.peek() { Some('{') => { self.advance(); current_literal.push('{'); } Some('}') => { self.advance(); current_literal.push('}'); } _ => { let escape_start = self.pos; let escaped = match self.advance() { Some('n') => '\n', Some('r') => '\r', Some('t') => '\t', Some('\\') => '\\', Some('"') => '"', Some('0') => '\0', Some('\'') => '\'', Some('{') => '{', Some('}') => '}', Some('x') => { // Hex escape \xNN let h1 = self.advance().and_then(|c| c.to_digit(16)); let h2 = self.advance().and_then(|c| c.to_digit(16)); match (h1, h2) { (Some(d1), Some(d2)) => { let byte = (d1 * 16 + d2) as u8; byte as char } _ => { return Err(LexError { message: "Invalid hex escape sequence: expected \\xNN".into(), span: Span::new(escape_start - 1, self.pos), }); } } } Some('u') => { // Unicode escape \u{NNNN} or \uNNNN if self.peek() == Some('{') { self.advance(); // consume '{' let mut hex = String::new(); while let Some(c) = self.peek() { if c == '}' { self.advance(); break; } if c.is_ascii_hexdigit() { hex.push(c); self.advance(); } else { return Err(LexError { message: format!("Invalid unicode escape: expected hex digit, got '{}'", c), span: Span::new(escape_start - 1, self.pos), }); } } match u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32) { Some(c) => c, None => { return Err(LexError { message: format!("Invalid unicode escape: \\u{{{}}}", hex), span: Span::new(escape_start - 1, self.pos), }); } } } else { // \uNNNN format (4 hex digits) let mut hex = String::new(); for _ in 0..4 { match self.advance() { Some(c) if c.is_ascii_hexdigit() => hex.push(c), _ => { return Err(LexError { message: "Invalid unicode escape: expected 4 hex digits".into(), span: Span::new(escape_start - 1, self.pos), }); } } } match u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32) { Some(c) => c, None => { return Err(LexError { message: format!("Invalid unicode escape: \\u{}", hex), span: Span::new(escape_start - 1, self.pos), }); } } } } Some(c) => { return Err(LexError { message: format!("Invalid escape sequence: \\{}", c), span: Span::new(escape_start - 1, self.pos), }); } None => { return Err(LexError { message: "Unterminated string".into(), span: Span::new(_start, self.pos), }); } }; current_literal.push(escaped); } } } Some('{') => { // Start of interpolation if !current_literal.is_empty() { parts.push(StringPart::Literal(std::mem::take(&mut current_literal))); } // Scan the expression until matching '}' let mut expr_text = String::new(); let mut brace_depth = 1; loop { match self.advance() { Some('{') => { brace_depth += 1; expr_text.push('{'); } Some('}') => { brace_depth -= 1; if brace_depth == 0 { break; } expr_text.push('}'); } Some(c) => expr_text.push(c), None => { return Err(LexError { message: "Unterminated interpolation in string".into(), span: Span::new(_start, self.pos), }); } } } parts.push(StringPart::Expr(expr_text)); } Some(c) => current_literal.push(c), None => { return Err(LexError { message: "Unterminated string".into(), span: Span::new(_start, self.pos), }); } } } // If we have no interpolations, return a simple string if parts.is_empty() { return Ok(TokenKind::String(current_literal)); } // Add any remaining literal if !current_literal.is_empty() { parts.push(StringPart::Literal(current_literal)); } Ok(TokenKind::InterpolatedString(parts)) } fn scan_char(&mut self, start: usize) -> Result { let c = match self.advance() { Some('\\') => match self.advance() { Some('n') => '\n', Some('r') => '\r', Some('t') => '\t', Some('\\') => '\\', Some('\'') => '\'', Some(c) => c, None => { return Err(LexError { message: "Unterminated character literal".into(), span: Span::new(start, self.pos), }); } }, Some(c) => c, None => { return Err(LexError { message: "Unterminated character literal".into(), span: Span::new(start, self.pos), }); } }; if self.advance() != Some('\'') { return Err(LexError { message: "Expected closing quote for character literal".into(), span: Span::new(start, self.pos), }); } Ok(TokenKind::Char(c)) } fn scan_number(&mut self, first: char, start: usize) -> Result { let mut num_str = String::new(); num_str.push(first); while let Some(c) = self.peek() { if c.is_ascii_digit() || c == '_' { if c != '_' { num_str.push(c); } self.advance(); } else { break; } } // Check for float if self.peek() == Some('.') { // Look ahead to make sure it's not a method call let mut chars = self.chars.clone(); chars.next(); // consume '.' if chars.peek().map_or(false, |c| c.is_ascii_digit()) { self.advance(); // consume '.' num_str.push('.'); while let Some(c) = self.peek() { if c.is_ascii_digit() || c == '_' { if c != '_' { num_str.push(c); } self.advance(); } else { break; } } let f: f64 = num_str.parse().map_err(|_| LexError { message: "Invalid float literal".into(), span: Span::new(start, self.pos), })?; return Ok(TokenKind::Float(f)); } } let n: i64 = num_str.parse().map_err(|_| LexError { message: "Invalid integer literal".into(), span: Span::new(start, self.pos), })?; Ok(TokenKind::Int(n)) } fn scan_ident_rest(&mut self, start: usize) -> TokenKind { while let Some(c) = self.peek() { if c.is_alphanumeric() || c == '_' { self.advance(); } else { break; } } let ident = &self.source[start..self.pos]; match ident { "fn" => TokenKind::Fn, "let" => TokenKind::Let, "if" => TokenKind::If, "then" => TokenKind::Then, "else" => TokenKind::Else, "match" => TokenKind::Match, "with" => TokenKind::With, "effect" => TokenKind::Effect, "handler" => TokenKind::Handler, "run" => TokenKind::Run, "resume" => TokenKind::Resume, "type" => TokenKind::Type, "import" => TokenKind::Import, "pub" => TokenKind::Pub, "as" => TokenKind::As, "from" => TokenKind::From, "latest" => TokenKind::Latest, "trait" => TokenKind::Trait, "impl" => TokenKind::Impl, "for" => TokenKind::For, "is" => TokenKind::Is, "pure" => TokenKind::Pure, "total" => TokenKind::Total, "idempotent" => TokenKind::Idempotent, "deterministic" => TokenKind::Deterministic, "commutative" => TokenKind::Commutative, "where" => TokenKind::Where, "assume" => TokenKind::Assume, "true" => TokenKind::Bool(true), "false" => TokenKind::Bool(false), _ => TokenKind::Ident(ident.to_string()), } } } #[cfg(test)] mod tests { use super::*; fn lex(source: &str) -> Vec { Lexer::new(source) .tokenize() .unwrap() .into_iter() .map(|t| t.kind) .filter(|k| !matches!(k, TokenKind::Newline)) .collect() } #[test] fn test_basic_tokens() { assert_eq!( lex("fn let if else"), vec![ TokenKind::Fn, TokenKind::Let, TokenKind::If, TokenKind::Else, TokenKind::Eof ] ); } #[test] fn test_operators() { assert_eq!( lex("+ - * / == != |>"), vec![ TokenKind::Plus, TokenKind::Minus, TokenKind::Star, TokenKind::Slash, TokenKind::EqEq, TokenKind::Ne, TokenKind::PipeGt, TokenKind::Eof ] ); } #[test] fn test_numbers() { assert_eq!( lex("42 3.14"), vec![TokenKind::Int(42), TokenKind::Float(3.14), TokenKind::Eof] ); } #[test] fn test_strings() { assert_eq!( lex("\"hello\" \"world\""), vec![ TokenKind::String("hello".into()), TokenKind::String("world".into()), TokenKind::Eof ] ); } #[test] fn test_string_interpolation_simple() { assert_eq!( lex("\"Hello, {name}!\""), vec![ TokenKind::InterpolatedString(vec![ StringPart::Literal("Hello, ".into()), StringPart::Expr("name".into()), StringPart::Literal("!".into()), ]), TokenKind::Eof ] ); } #[test] fn test_string_interpolation_multiple() { assert_eq!( lex("\"{x} + {y} = {x + y}\""), vec![ TokenKind::InterpolatedString(vec![ StringPart::Expr("x".into()), StringPart::Literal(" + ".into()), StringPart::Expr("y".into()), StringPart::Literal(" = ".into()), StringPart::Expr("x + y".into()), ]), TokenKind::Eof ] ); } #[test] fn test_string_interpolation_escaped_braces() { assert_eq!( lex("\"literal \\{braces\\}\""), vec![ TokenKind::String("literal {braces}".into()), TokenKind::Eof ] ); } #[test] fn test_string_no_interpolation() { // Plain strings without interpolation should remain as String tokens assert_eq!( lex("\"no interpolation here\""), vec![ TokenKind::String("no interpolation here".into()), TokenKind::Eof ] ); } #[test] fn test_function() { assert_eq!( lex("fn add(a: Int, b: Int): Int = a + b"), vec![ TokenKind::Fn, TokenKind::Ident("add".into()), TokenKind::LParen, TokenKind::Ident("a".into()), TokenKind::Colon, TokenKind::Ident("Int".into()), TokenKind::Comma, TokenKind::Ident("b".into()), TokenKind::Colon, TokenKind::Ident("Int".into()), TokenKind::RParen, TokenKind::Colon, TokenKind::Ident("Int".into()), TokenKind::Eq, TokenKind::Ident("a".into()), TokenKind::Plus, TokenKind::Ident("b".into()), TokenKind::Eof ] ); } }