2 Commits

Author SHA1 Message Date
98605d2b70 feat: add self-hosted Lux lexer as first step toward bootstrapping
The lexer tokenizes Lux source code written entirely in Lux itself.
Supports all token types: keywords, operators, literals, behavioral
properties, doc comments, and delimiters.

This is the first component of the Lux-in-Lux compiler, demonstrating
that Lux's pattern matching, recursion, and string handling are
sufficient for compiler construction.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 08:25:22 -05:00
e3b6f4322a fix: add Char pattern matching and Char comparison operators
- Parser: support Char literals in match patterns (e.g., 'x' => ...)
- Interpreter: add Char comparison for <, <=, >, >= operators
  Previously only Int, Float, and String supported ordering comparisons.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 08:25:15 -05:00
3 changed files with 524 additions and 0 deletions

View File

@@ -0,0 +1,512 @@
// Lux Lexer — Self-hosted lexer for the Lux language
//
// This is the first component of the Lux-in-Lux compiler.
// It tokenizes Lux source code into a list of tokens.
//
// Design:
// - Recursive descent character scanning
// - Immutable state (LexState tracks chars + position)
// - Pattern matching for all token types
// === Token types ===
// TokenKind enumerates every kind of token the lexer can produce.
// Variants with a payload carry the literal's value or text; every other
// variant is a bare tag.
type TokenKind =
// Literals
// TkInt holds the parsed integer value.
| TkInt(Int)
// TkFloat keeps the literal as text in the form "<int digits>.<frac digits>".
| TkFloat(String)
// TkString holds the string contents with recognized escapes already resolved.
| TkString(String)
| TkChar(Char)
// TkBool is produced for the words `true` and `false`.
| TkBool(Bool)
// Identifiers
| TkIdent(String)
// Keywords
| TkFn | TkLet | TkIf | TkThen | TkElse | TkMatch
| TkWith | TkEffect | TkHandler | TkRun | TkResume
| TkType | TkImport | TkPub | TkAs | TkFrom
| TkTrait | TkImpl | TkFor
// Behavioral property keywords (`is pure`, `is total`, ...)
| TkIs | TkPure | TkTotal | TkIdempotent
| TkDeterministic | TkCommutative
| TkWhere | TkAssume
// Operators
| TkPlus | TkMinus | TkStar | TkSlash | TkPercent
| TkEq | TkEqEq | TkNe | TkLt | TkLe | TkGt | TkGe
| TkAnd | TkOr | TkNot
| TkPipe | TkPipeGt | TkArrow | TkThinArrow
| TkDot | TkColon | TkColonColon | TkComma | TkSemi | TkAt
// Delimiters
| TkLParen | TkRParen | TkLBrace | TkRBrace
| TkLBracket | TkRBracket
// Special
| TkUnderscore | TkNewline | TkEof
// Doc comment: the text of a `///` line, without the slashes and without
// one optional leading space.
| TkDocComment(String)
// A lexed token: its kind plus the span it covers in the character list.
// `start` is the index of the token's first character and `end` is one past
// its last character (a one-character token is built as start, start + 1).
type Token =
| Token(TokenKind, Int, Int) // kind, start, end
// Immutable lexer cursor: the full input as a list of characters and the
// index of the next unread character. Advancing builds a new LexState.
type LexState =
| LexState(List<Char>, Int) // chars, position
// Outcome of lexing one token.
// LexOk carries the token and the state positioned after it.
// LexErr carries an error message and the position it refers to.
type LexResult =
| LexOk(Token, LexState)
| LexErr(String, Int)
// === Character utilities ===
// Look at the character under the cursor without consuming it.
// Returns whatever List.get yields for the current position
// (None once the cursor is past the last character).
fn peek(state: LexState): Option<Char> =
  peekAt(state, 0)
// Look `offset` characters ahead of the cursor without consuming anything.
// Offset 0 is the current character.
fn peekAt(state: LexState, offset: Int): Option<Char> =
  match state {
    LexState(input, cursor) =>
      let target = cursor + offset;
      List.get(input, target)
  }
// Return a new state whose cursor is one character further along.
// No bounds check is performed; the cursor may move past the end of input.
fn advance(state: LexState): LexState =
  match state {
    LexState(input, cursor) =>
      let next = cursor + 1;
      LexState(input, next)
  }
// Current cursor index of the lexer state.
fn position(state: LexState): Int =
  match state {
    LexState(_, cursor) => cursor
  }
// True for the ASCII decimal digits '0' through '9'.
fn isDigit(c: Char): Bool =
  c >= '0' && c <= '9'
// True for characters that may start an identifier:
// ASCII letters of either case, or an underscore.
fn isAlpha(c: Char): Bool =
  let isLower = c >= 'a' && c <= 'z';
  let isUpper = c >= 'A' && c <= 'Z';
  isLower || isUpper || c == '_'
// True for characters that may continue an identifier:
// anything isAlpha accepts, or a decimal digit.
fn isAlphaNumeric(c: Char): Bool =
  if isAlpha(c) then true
  else isDigit(c)
// True for space, tab, and carriage return.
// Newline is deliberately not included: the lexer emits it as a token.
fn isWhitespace(c: Char): Bool =
  match c {
    ' ' => true,
    '\t' => true,
    '\r' => true,
    _ => false
  }
// === Core lexing ===
// Advance to the end of the current line.
// Stops ON the newline (it is not consumed) or at end of input.
fn skipLineComment(state: LexState): LexState =
  match peek(state) {
    None => state,
    Some('\n') => state,
    Some(_) => skipLineComment(advance(state))
  }
// Skip everything that does not produce a token: spaces, tabs, carriage
// returns, and ordinary `//` line comments. Newlines are left in place, and
// so are `///` doc comments, because both are turned into tokens by lexToken.
fn skipWhitespaceAndComments(state: LexState): LexState =
  match peek(state) {
    None => state,
    Some(c) =>
      if isWhitespace(c) then
        skipWhitespaceAndComments(advance(state))
      else if c == '/' then
        match peekAt(state, 1) {
          Some('/') =>
            match peekAt(state, 2) {
              // A third slash marks a doc comment: leave it for lexToken.
              Some('/') => state,
              _ =>
                let commentBody = advance(advance(state));
                let lineEnd = skipLineComment(commentBody);
                skipWhitespaceAndComments(lineEnd)
            },
          // A lone slash is the division operator, not a comment.
          _ => state
        }
      else state
  }
// Consume the longest run of identifier characters (letters, digits,
// underscores) starting at the cursor. The consumed characters are appended
// to `acc`; the result pairs them with the state just past the run.
fn collectIdent(state: LexState, acc: List<Char>): (List<Char>, LexState) =
  let finished = (acc, state);
  match peek(state) {
    None => finished,
    Some(c) =>
      if isAlphaNumeric(c) then
        collectIdent(advance(state), List.concat(acc, [c]))
      else finished
  }
// Consume the longest run of decimal digits starting at the cursor.
// The digits are appended to `acc`; the result pairs them with the state
// just past the run. No sign, separator, or decimal point is accepted here.
fn collectDigits(state: LexState, acc: List<Char>): (List<Char>, LexState) =
  let finished = (acc, state);
  match peek(state) {
    None => finished,
    Some(c) =>
      if isDigit(c) then
        collectDigits(advance(state), List.concat(acc, [c]))
      else finished
  }
// Interpret a list of digit characters as a base-10 integer, most
// significant digit first. An empty list yields 0.
fn charsToInt(chars: List<Char>): Int =
  List.fold(chars, 0, fn(total, digit) => total * 10 + charToDigit(digit))
// Numeric value of a decimal digit character.
// Any character other than '0'..'8' maps to 9, so callers are expected to
// pass digits only.
fn charToDigit(c: Char): Int =
  match c {
    '0' => 0,
    '1' => 1,
    '2' => 2,
    '3' => 3,
    '4' => 4,
    '5' => 5,
    '6' => 6,
    '7' => 7,
    '8' => 8,
    _ => 9
  }
// Classify a scanned word: reserved words map to their dedicated token kind,
// `true`/`false` map to TkBool, and anything else becomes TkIdent(name).
// The comparisons are mutually exclusive, so their order does not matter;
// they are grouped here by category.
fn identToToken(name: String): TokenKind =
  if name == "true" then TkBool(true)
  else if name == "false" then TkBool(false)
  // Declarations and modules
  else if name == "fn" then TkFn
  else if name == "let" then TkLet
  else if name == "type" then TkType
  else if name == "trait" then TkTrait
  else if name == "impl" then TkImpl
  else if name == "for" then TkFor
  else if name == "import" then TkImport
  else if name == "pub" then TkPub
  else if name == "as" then TkAs
  else if name == "from" then TkFrom
  // Control flow
  else if name == "if" then TkIf
  else if name == "then" then TkThen
  else if name == "else" then TkElse
  else if name == "match" then TkMatch
  // Effects
  else if name == "with" then TkWith
  else if name == "effect" then TkEffect
  else if name == "handler" then TkHandler
  else if name == "run" then TkRun
  else if name == "resume" then TkResume
  // Behavioral properties
  else if name == "is" then TkIs
  else if name == "pure" then TkPure
  else if name == "total" then TkTotal
  else if name == "idempotent" then TkIdempotent
  else if name == "deterministic" then TkDeterministic
  else if name == "commutative" then TkCommutative
  else if name == "where" then TkWhere
  else if name == "assume" then TkAssume
  else TkIdent(name)
// Append the character denoted by a two-character escape and continue
// lexing after it. `state` points at the backslash.
fn appendStringEscape(state: LexState, acc: List<Char>, value: Char): (List<Char>, LexState) =
  lexStringBody(advance(advance(state)), List.concat(acc, [value]))

// Lex the body of a string literal; the opening quote is already consumed.
// Returns the decoded characters and the state just past the closing quote.
// Recognized escapes: \n, \t, \\ and \". A backslash before any other
// character is kept literally and the following character is then lexed
// normally. If input ends before a closing quote, the characters collected
// so far are returned without an error.
fn lexStringBody(state: LexState, acc: List<Char>): (List<Char>, LexState) =
  match peek(state) {
    None => (acc, state),
    Some('"') => (acc, advance(state)),
    Some('\\') =>
      match peekAt(state, 1) {
        Some('n') => appendStringEscape(state, acc, '\n'),
        Some('t') => appendStringEscape(state, acc, '\t'),
        Some('\\') => appendStringEscape(state, acc, '\\'),
        Some('"') => appendStringEscape(state, acc, '"'),
        _ => lexStringBody(advance(state), List.concat(acc, ['\\']))
      },
    Some(plain) => lexStringBody(advance(state), List.concat(acc, [plain]))
  }
// Finish an escaped char literal whose value is already known.
// `state` points at the backslash, so the escape letter sits at offset 1 and
// the closing quote must sit at offset 2. On success the token ends three
// characters past `state` (backslash, escape letter, closing quote).
fn finishEscapedChar(state: LexState, start: Int, value: Char): LexResult =
  match peekAt(state, 2) {
    Some('\'') =>
      LexOk(Token(TkChar(value), start, position(state) + 3), advance(advance(advance(state)))),
    _ => LexErr("Expected closing quote", position(state))
  }

// Lex a char literal (after opening quote consumed).
//
// `state` points at the first character after the opening quote; the token's
// start is therefore one position earlier.
//
// Recognized escapes: \n, \t, \r, \\, \' and \". The \' and \r escapes are
// required for this lexer to tokenize its own source, which contains the
// literals '\'' and '\r'; \" is accepted for parity with string literals.
//
// Errors:
// - "Unexpected end of input in char literal" when input ends after the quote
// - "Unknown escape sequence" for a backslash followed by anything else
// - "Expected closing quote" when the literal is not terminated where expected
fn lexCharLiteral(state: LexState): LexResult =
  let start = position(state) - 1;
  match peek(state) {
    None => LexErr("Unexpected end of input in char literal", start),
    Some(c) =>
      if c == '\\' then
        match peekAt(state, 1) {
          Some('n') => finishEscapedChar(state, start, '\n'),
          Some('t') => finishEscapedChar(state, start, '\t'),
          Some('r') => finishEscapedChar(state, start, '\r'),
          Some('\\') => finishEscapedChar(state, start, '\\'),
          Some('\'') => finishEscapedChar(state, start, '\''),
          Some('"') => finishEscapedChar(state, start, '"'),
          _ => LexErr("Unknown escape sequence", position(state))
        }
      else
        // Plain character: the closing quote must follow immediately.
        match peekAt(state, 1) {
          Some('\'') => LexOk(Token(TkChar(c), start, position(state) + 2), advance(advance(state))),
          _ => LexErr("Expected closing quote", position(state))
        }
  }
// Collect the text of a doc comment up to, but not including, the end of
// the line. The caller has already consumed the `///` prefix. The newline
// itself is left unconsumed.
fn collectDocComment(state: LexState, acc: List<Char>): (List<Char>, LexState) =
  match peek(state) {
    None => (acc, state),
    Some('\n') => (acc, state),
    Some(c) => collectDocComment(advance(state), List.concat(acc, [c]))
  }
// Join a list of characters into a single String.
fn charsToText(chars: List<Char>): String =
  String.join(List.map(chars, fn(ch) => String.fromChar(ch)), "")

// Successful result for a one-character token starting at the cursor.
fn tokenOfOne(kind: TokenKind, state: LexState): LexResult =
  let start = position(state);
  LexOk(Token(kind, start, start + 1), advance(state))

// Successful result for a two-character token starting at the cursor.
fn tokenOfTwo(kind: TokenKind, state: LexState): LexResult =
  let start = position(state);
  LexOk(Token(kind, start, start + 2), advance(advance(state)))

// Lex an integer or float literal; the cursor is on its first digit.
// A '.' only makes the literal a float when a digit follows it, so `1.foo`
// lexes as the integer 1 followed by a dot.
fn lexNumber(state: LexState): LexResult =
  let start = position(state);
  match collectDigits(state, []) {
    (digits, afterInt) =>
      let hasFraction = match peek(afterInt) {
        Some('.') =>
          match peekAt(afterInt, 1) {
            Some(d) => isDigit(d),
            None => false
          },
        _ => false
      };
      if hasFraction then
        match collectDigits(advance(afterInt), []) {
          (fracDigits, afterFrac) =>
            let text = charsToText(digits) + "." + charsToText(fracDigits);
            LexOk(Token(TkFloat(text), start, position(afterFrac)), afterFrac)
        }
      else
        LexOk(Token(TkInt(charsToInt(digits)), start, position(afterInt)), afterInt)
  }

// Lex an identifier, keyword, or the `_` wildcard; the cursor is on a
// letter or underscore.
fn lexWord(state: LexState): LexResult =
  let start = position(state);
  match collectIdent(state, []) {
    (chars, nextState) =>
      let name = charsToText(chars);
      // A lone underscore is the wildcard token. Longer names that merely
      // start with an underscore (e.g. `_unused`) stay ordinary identifiers.
      let kind = if name == "_" then TkUnderscore else identToToken(name);
      LexOk(Token(kind, start, position(nextState)), nextState)
  }

// Lex a token starting with '/': either a `///` doc comment or the
// division operator.
fn lexSlashOrDoc(state: LexState): LexResult =
  let start = position(state);
  match peekAt(state, 1) {
    Some('/') =>
      match peekAt(state, 2) {
        Some('/') =>
          let afterSlashes = advance(advance(advance(state)));
          // Drop a single space after the slashes, if present.
          let textStart = match peek(afterSlashes) {
            Some(' ') => advance(afterSlashes),
            _ => afterSlashes
          };
          match collectDocComment(textStart, []) {
            (chars, nextState) =>
              LexOk(Token(TkDocComment(charsToText(chars)), start, position(nextState)), nextState)
          },
        _ => tokenOfOne(TkSlash, state)
      },
    _ => tokenOfOne(TkSlash, state)
  }

// Lex a single token.
//
// Skips whitespace and ordinary line comments, then produces exactly one
// token from the character under the cursor. End of input yields a
// zero-width TkEof. Returns LexErr for a lone '&', for malformed char
// literals, and for any character that starts no known token.
//
// The underscore case is handled inside lexWord: '_' satisfies isAlpha, so a
// separate '_' branch placed after the identifier branch could never run and
// TkUnderscore would never be emitted.
fn lexToken(state: LexState): LexResult =
  let state = skipWhitespaceAndComments(state);
  let start = position(state);
  match peek(state) {
    None => LexOk(Token(TkEof, start, start), state),
    Some(c) =>
      if c == '\n' then tokenOfOne(TkNewline, state)
      // Numbers
      else if isDigit(c) then lexNumber(state)
      // Identifiers, keywords, and the `_` wildcard
      else if isAlpha(c) then lexWord(state)
      // String literals
      else if c == '"' then
        match lexStringBody(advance(state), []) {
          (chars, nextState) =>
            LexOk(Token(TkString(charsToText(chars)), start, position(nextState)), nextState)
        }
      // Char literals
      else if c == '\'' then
        lexCharLiteral(advance(state))
      // Doc comments (///) and division
      else if c == '/' then lexSlashOrDoc(state)
      // Operators that may be one or two characters long
      else if c == '=' then
        match peekAt(state, 1) {
          Some('=') => tokenOfTwo(TkEqEq, state),
          Some('>') => tokenOfTwo(TkArrow, state),
          _ => tokenOfOne(TkEq, state)
        }
      else if c == '!' then
        match peekAt(state, 1) {
          Some('=') => tokenOfTwo(TkNe, state),
          _ => tokenOfOne(TkNot, state)
        }
      else if c == '<' then
        match peekAt(state, 1) {
          Some('=') => tokenOfTwo(TkLe, state),
          _ => tokenOfOne(TkLt, state)
        }
      else if c == '>' then
        match peekAt(state, 1) {
          Some('=') => tokenOfTwo(TkGe, state),
          _ => tokenOfOne(TkGt, state)
        }
      else if c == '&' then
        // There is no single-ampersand token.
        match peekAt(state, 1) {
          Some('&') => tokenOfTwo(TkAnd, state),
          _ => LexErr("Expected '&&'", start)
        }
      else if c == '|' then
        match peekAt(state, 1) {
          Some('|') => tokenOfTwo(TkOr, state),
          Some('>') => tokenOfTwo(TkPipeGt, state),
          _ => tokenOfOne(TkPipe, state)
        }
      else if c == '-' then
        match peekAt(state, 1) {
          Some('>') => tokenOfTwo(TkThinArrow, state),
          _ => tokenOfOne(TkMinus, state)
        }
      else if c == ':' then
        match peekAt(state, 1) {
          Some(':') => tokenOfTwo(TkColonColon, state),
          _ => tokenOfOne(TkColon, state)
        }
      // Single-character tokens
      else if c == '+' then tokenOfOne(TkPlus, state)
      else if c == '*' then tokenOfOne(TkStar, state)
      else if c == '%' then tokenOfOne(TkPercent, state)
      else if c == '.' then tokenOfOne(TkDot, state)
      else if c == ',' then tokenOfOne(TkComma, state)
      else if c == ';' then tokenOfOne(TkSemi, state)
      else if c == '@' then tokenOfOne(TkAt, state)
      else if c == '(' then tokenOfOne(TkLParen, state)
      else if c == ')' then tokenOfOne(TkRParen, state)
      else if c == '{' then tokenOfOne(TkLBrace, state)
      else if c == '}' then tokenOfOne(TkRBrace, state)
      else if c == '[' then tokenOfOne(TkLBracket, state)
      else if c == ']' then tokenOfOne(TkRBracket, state)
      else LexErr("Unexpected character: " + String.fromChar(c), start)
  }
// Lex tokens repeatedly from `state`, appending each one to `acc`, until
// end of input. The returned list always ends with a TkEof token.
//
// On a lexing error the function stops immediately: it appends a TkEof at
// the error position and returns. The error message is discarded and the
// rest of the input is not lexed. Every token, including each newline, is
// kept in the output.
fn lexAll(state: LexState, acc: List<Token>): List<Token> =
  match lexToken(state) {
    LexErr(_, pos) => List.concat(acc, [Token(TkEof, pos, pos)]),
    LexOk(token, nextState) =>
      let extended = List.concat(acc, [token]);
      match token {
        Token(TkEof, _, _) => extended,
        _ => lexAll(nextState, extended)
      }
  }
// Public API: tokenize a source string.
// Splits `source` into characters, starts lexing at position 0, and returns
// every token produced by lexAll.
fn tokenize(source: String): List<Token> =
  lexAll(LexState(String.chars(source), 0), [])
// === Token display ===
// Render a token kind for display. Literal and identifier kinds show their
// payload wrapped in a label; keywords, operators, and delimiters show
// their source spelling; TkNewline shows as a backslash followed by `n`.
fn tokenKindToString(kind: TokenKind): String =
  match kind {
    // Literals and identifiers
    TkInt(n) => "Int(" + toString(n) + ")",
    TkFloat(s) => "Float(" + s + ")",
    TkString(s) => "String(\"" + s + "\")",
    TkChar(c) => "Char('" + String.fromChar(c) + "')",
    TkBool(b) => if b then "true" else "false",
    TkIdent(name) => "Ident(" + name + ")",
    // Keywords
    TkFn => "fn",
    TkLet => "let",
    TkIf => "if",
    TkThen => "then",
    TkElse => "else",
    TkMatch => "match",
    TkWith => "with",
    TkEffect => "effect",
    TkHandler => "handler",
    TkRun => "run",
    TkResume => "resume",
    TkType => "type",
    TkImport => "import",
    TkPub => "pub",
    TkAs => "as",
    TkFrom => "from",
    TkTrait => "trait",
    TkImpl => "impl",
    TkFor => "for",
    // Behavioral properties
    TkIs => "is",
    TkPure => "pure",
    TkTotal => "total",
    TkIdempotent => "idempotent",
    TkDeterministic => "deterministic",
    TkCommutative => "commutative",
    TkWhere => "where",
    TkAssume => "assume",
    // Operators
    TkPlus => "+",
    TkMinus => "-",
    TkStar => "*",
    TkSlash => "/",
    TkPercent => "%",
    TkEq => "=",
    TkEqEq => "==",
    TkNe => "!=",
    TkLt => "<",
    TkLe => "<=",
    TkGt => ">",
    TkGe => ">=",
    TkAnd => "&&",
    TkOr => "||",
    TkNot => "!",
    TkPipe => "|",
    TkPipeGt => "|>",
    TkArrow => "=>",
    TkThinArrow => "->",
    TkDot => ".",
    TkColon => ":",
    TkColonColon => "::",
    TkComma => ",",
    TkSemi => ";",
    TkAt => "@",
    // Delimiters
    TkLParen => "(",
    TkRParen => ")",
    TkLBrace => "{",
    TkRBrace => "}",
    TkLBracket => "[",
    TkRBracket => "]",
    // Special
    TkUnderscore => "_",
    TkNewline => "\\n",
    TkEof => "EOF",
    TkDocComment(text) => "DocComment(\"" + text + "\")",
    // Fallback for any kind not listed above.
    _ => "?"
  }
// Render a token as its kind followed by its span, e.g. `fn [0..2]`.
fn tokenToString(token: Token): String =
  match token {
    Token(kind, start, end) =>
      let span = toString(start) + ".." + toString(end);
      tokenKindToString(kind) + " [" + span + "]"
  }
// === Tests ===
// Print each token on its own line, prefixed with a space, then print one
// empty line once the list is exhausted.
fn printTokens(tokens: List<Token>): Unit with {Console} =
  match List.head(tokens) {
    None => Console.print(""),
    Some(first) => {
      let line = " " + tokenToString(first);
      Console.print(line);
      match List.tail(tokens) {
        None => Console.print(""),
        Some(remaining) => printTokens(remaining)
      }
    }
  }
// Run the lexer on one sample: print a header with the label, echo the
// input in quotes, then print the resulting tokens.
fn testLexer(label: String, source: String): Unit with {Console} = {
  let header = "--- " + label + " ---";
  let echoed = " Input: \"" + source + "\"";
  Console.print(header);
  Console.print(echoed);
  printTokens(tokenize(source))
}
// Run one lexer sample and follow it with a blank separator line.
fn runLexerSample(label: String, source: String): Unit with {Console} = {
  testLexer(label, source);
  Console.print("")
}

// Demo driver: prints a banner, lexes a fixed set of sample inputs and
// prints their tokens, then prints a completion banner.
fn main(): Unit with {Console} = {
  Console.print("=== Lux Self-Hosted Lexer ===");
  Console.print("");
  // Basic tokens
  runLexerSample("numbers", "42 3");
  // Identifiers and keywords
  runLexerSample("keywords", "fn main let x");
  // Operators
  runLexerSample("operators", "a + b == c");
  // String literal
  runLexerSample("string", "\"hello world\"");
  // Function declaration
  runLexerSample("function", "fn add(a: Int, b: Int): Int = a + b");
  // Behavioral properties
  runLexerSample("behavioral", "fn add(a: Int): Int is pure = a");
  // Complex expression
  runLexerSample("complex", "let result = if x > 0 then x else 0 - x");
  Console.print("=== Lexer test complete ===")
}
// Script entry point: runs main() when the file is evaluated and discards
// its result. NOTE(review): the empty `with {}` presumably means no
// user-defined handlers are installed and Console is provided by the
// runtime — confirm.
let _ = run main() with {}

View File

@@ -1610,6 +1610,7 @@ impl Interpreter {
(Value::Int(a), Value::Int(b)) => Ok(Value::Bool(a < b)),
(Value::Float(a), Value::Float(b)) => Ok(Value::Bool(a < b)),
(Value::String(a), Value::String(b)) => Ok(Value::Bool(a < b)),
(Value::Char(a), Value::Char(b)) => Ok(Value::Bool(a < b)),
(l, r) => Err(RuntimeError {
message: format!("Cannot compare {} and {}", l.type_name(), r.type_name()),
span: Some(span),
@@ -1619,6 +1620,7 @@ impl Interpreter {
(Value::Int(a), Value::Int(b)) => Ok(Value::Bool(a <= b)),
(Value::Float(a), Value::Float(b)) => Ok(Value::Bool(a <= b)),
(Value::String(a), Value::String(b)) => Ok(Value::Bool(a <= b)),
(Value::Char(a), Value::Char(b)) => Ok(Value::Bool(a <= b)),
(l, r) => Err(RuntimeError {
message: format!("Cannot compare {} and {}", l.type_name(), r.type_name()),
span: Some(span),
@@ -1628,6 +1630,7 @@ impl Interpreter {
(Value::Int(a), Value::Int(b)) => Ok(Value::Bool(a > b)),
(Value::Float(a), Value::Float(b)) => Ok(Value::Bool(a > b)),
(Value::String(a), Value::String(b)) => Ok(Value::Bool(a > b)),
(Value::Char(a), Value::Char(b)) => Ok(Value::Bool(a > b)),
(l, r) => Err(RuntimeError {
message: format!("Cannot compare {} and {}", l.type_name(), r.type_name()),
span: Some(span),
@@ -1637,6 +1640,7 @@ impl Interpreter {
(Value::Int(a), Value::Int(b)) => Ok(Value::Bool(a >= b)),
(Value::Float(a), Value::Float(b)) => Ok(Value::Bool(a >= b)),
(Value::String(a), Value::String(b)) => Ok(Value::Bool(a >= b)),
(Value::Char(a), Value::Char(b)) => Ok(Value::Bool(a >= b)),
(l, r) => Err(RuntimeError {
message: format!("Cannot compare {} and {}", l.type_name(), r.type_name()),
span: Some(span),

View File

@@ -1887,6 +1887,14 @@ impl Parser {
span: token.span,
}))
}
TokenKind::Char(c) => {
let c = *c;
self.advance();
Ok(Pattern::Literal(Literal {
kind: LiteralKind::Char(c),
span: token.span,
}))
}
TokenKind::Ident(name) => {
// Check if it starts with uppercase (constructor) or lowercase (variable)
if name.chars().next().map_or(false, |c| c.is_uppercase()) {