From 98605d2b70a42a702544ddc2449bc4e75abf4431 Mon Sep 17 00:00:00 2001 From: Brandon Lucas Date: Tue, 17 Feb 2026 08:25:22 -0500 Subject: [PATCH] feat: add self-hosted Lux lexer as first step toward bootstrapping The lexer tokenizes Lux source code written entirely in Lux itself. Supports all token types: keywords, operators, literals, behavioral properties, doc comments, and delimiters. This is the first component of the Lux-in-Lux compiler, demonstrating that Lux's pattern matching, recursion, and string handling are sufficient for compiler construction. Co-Authored-By: Claude Opus 4.6 --- projects/lux-compiler/lexer.lux | 512 ++++++++++++++++++++++++++++++++ 1 file changed, 512 insertions(+) create mode 100644 projects/lux-compiler/lexer.lux diff --git a/projects/lux-compiler/lexer.lux b/projects/lux-compiler/lexer.lux new file mode 100644 index 0000000..cf50811 --- /dev/null +++ b/projects/lux-compiler/lexer.lux @@ -0,0 +1,512 @@ +// Lux Lexer — Self-hosted lexer for the Lux language +// +// This is the first component of the Lux-in-Lux compiler. +// It tokenizes Lux source code into a list of tokens. 
+//
+// Design:
+// - Recursive descent character scanning
+// - Immutable state (LexState tracks chars + position)
+// - Pattern matching for all token types
+
+// === Token types ===
+
+type TokenKind =
+    // Literals
+    | TkInt(Int)
+    | TkFloat(String)
+    | TkString(String)
+    | TkChar(Char)
+    | TkBool(Bool)
+    // Identifiers
+    | TkIdent(String)
+    // Keywords
+    | TkFn | TkLet | TkIf | TkThen | TkElse | TkMatch
+    | TkWith | TkEffect | TkHandler | TkRun | TkResume
+    | TkType | TkImport | TkPub | TkAs | TkFrom
+    | TkTrait | TkImpl | TkFor
+    // Behavioral
+    | TkIs | TkPure | TkTotal | TkIdempotent
+    | TkDeterministic | TkCommutative
+    | TkWhere | TkAssume
+    // Operators
+    | TkPlus | TkMinus | TkStar | TkSlash | TkPercent
+    | TkEq | TkEqEq | TkNe | TkLt | TkLe | TkGt | TkGe
+    | TkAnd | TkOr | TkNot
+    | TkPipe | TkPipeGt | TkArrow | TkThinArrow
+    | TkDot | TkColon | TkColonColon | TkComma | TkSemi | TkAt
+    // Delimiters
+    | TkLParen | TkRParen | TkLBrace | TkRBrace
+    | TkLBracket | TkRBracket
+    // Special
+    | TkUnderscore | TkNewline | TkEof
+    // Doc comment
+    | TkDocComment(String)
+
+// A token: its kind plus a half-open [start, end) span of character indices.
+type Token =
+    | Token(TokenKind, Int, Int)
+
+// Scanner state: the source characters and the index of the next unread one.
+type LexState =
+    | LexState(List<Char>, Int)
+
+// Result of lexing one token: the token and following state, or an error message and position.
+type LexResult =
+    | LexOk(Token, LexState)
+    | LexErr(String, Int)
+
+// === Character utilities ===
+
+// Character at the current position; callers treat None as end of input.
+fn peek(state: LexState): Option<Char> =
+    match state { LexState(chars, pos) => List.get(chars, pos) }
+
+// Character `offset` places past the current position, if there is one.
+fn peekAt(state: LexState, offset: Int): Option<Char> =
+    match state { LexState(chars, pos) => List.get(chars, pos + offset) }
+
+// State moved one character forward; the position is not bounds-checked here.
+fn advance(state: LexState): LexState =
+    match state { LexState(chars, pos) => LexState(chars, pos + 1) }
+
+// Index of the next unread character.
+fn position(state: LexState): Int =
+    match state { LexState(_, pos) => pos }
+
+fn isDigit(c: Char): Bool =
+    c >= '0' && c <= '9'
+
+// Characters that may start an identifier: a-z, A-Z and '_'.
+fn isAlpha(c: Char): Bool =
+    (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'
+
+fn isAlphaNumeric(c: Char): Bool =
+    isAlpha(c) || isDigit(c)
+
+// '\n' is deliberately not whitespace here: lexToken emits it as TkNewline.
+fn isWhitespace(c: Char): Bool =
+    c == ' ' || c == '\t' || c == '\r'
+
+// === Core lexing ===
+
+// Join a list of characters into a String.
+fn charsToString(chars: List<Char>): String =
+    String.join(List.map(chars, fn(ch) => String.fromChar(ch)), "")
+
+// Advance to the end of the current line; the '\n' itself is not consumed.
+fn skipLineComment(state: LexState): LexState =
+    match peek(state) {
+        None => state,
+        Some(c) =>
+            if c == '\n' then state
+            else skipLineComment(advance(state))
+    }
+
+// Skip blanks and `//` comments. Stops at newlines and at `///` doc comments,
+// both of which lexToken turns into tokens.
+fn skipWhitespaceAndComments(state: LexState): LexState =
+    match peek(state) {
+        None => state,
+        Some(c) =>
+            if isWhitespace(c) then
+                skipWhitespaceAndComments(advance(state))
+            else if c == '/' then
+                match peekAt(state, 1) {
+                    Some('/') =>
+                        // Check for doc comment (///)
+                        match peekAt(state, 2) {
+                            Some('/') => state, // Don't skip doc comments
+                            _ => skipWhitespaceAndComments(skipLineComment(advance(advance(state))))
+                        },
+                    _ => state
+                }
+            else state
+    }
+
+// Collect identifier characters
+fn collectIdent(state: LexState, acc: List<Char>): (List<Char>, LexState) =
+    match peek(state) {
+        None => (acc, state),
+        Some(c) =>
+            if isAlphaNumeric(c) then
+                collectIdent(advance(state), List.concat(acc, [c]))
+            else (acc, state)
+    }
+
+// Collect number characters (digits only)
+fn collectDigits(state: LexState, acc: List<Char>): (List<Char>, LexState) =
+    match peek(state) {
+        None => (acc, state),
+        Some(c) =>
+            if isDigit(c) then
+                collectDigits(advance(state), List.concat(acc, [c]))
+            else (acc, state)
+    }
+
+// Convert list of digit chars to int
+fn charsToInt(chars: List<Char>): Int =
+    List.fold(chars, 0, fn(acc, c) => acc * 10 + charToDigit(c))
+
+// Value of a decimal digit; callers only pass characters accepted by isDigit.
+fn charToDigit(c: Char): Int =
+    if c == '0' then 0
+    else if c == '1' then 1
+    else if c == '2' then 2
+    else if c == '3' then 3
+    else if c == '4' then 4
+    else if c == '5' then 5
+    else if c == '6' then 6
+    else if c == '7' then 7
+    else if c == '8' then 8
+    else 9
+
+// Map identifier string to keyword token or ident.
+// A lone "_" is the wildcard token TkUnderscore rather than an identifier.
+fn identToToken(name: String): TokenKind =
+    if name == "fn" then TkFn
+    else if name == "let" then TkLet
+    else if name == "if" then TkIf
+    else if name == "then" then TkThen
+    else if name == "else" then TkElse
+    else if name == "match" then TkMatch
+    else if name == "with" then TkWith
+    else if name == "effect" then TkEffect
+    else if name == "handler" then TkHandler
+    else if name == "run" then TkRun
+    else if name == "resume" then TkResume
+    else if name == "type" then TkType
+    else if name == "true" then TkBool(true)
+    else if name == "false" then TkBool(false)
+    else if name == "import" then TkImport
+    else if name == "pub" then TkPub
+    else if name == "as" then TkAs
+    else if name == "from" then TkFrom
+    else if name == "trait" then TkTrait
+    else if name == "impl" then TkImpl
+    else if name == "for" then TkFor
+    else if name == "is" then TkIs
+    else if name == "pure" then TkPure
+    else if name == "total" then TkTotal
+    else if name == "idempotent" then TkIdempotent
+    else if name == "deterministic" then TkDeterministic
+    else if name == "commutative" then TkCommutative
+    else if name == "where" then TkWhere
+    else if name == "assume" then TkAssume
+    else if name == "_" then TkUnderscore
+    else TkIdent(name)
+
+// Character denoted by the escape `\c`, or None when `c` is not a recognised
+// escape. Shared by string and char literals so both accept the same set.
+fn escapeToChar(c: Char): Option<Char> =
+    if c == 'n' then Some('\n')
+    else if c == 't' then Some('\t')
+    else if c == 'r' then Some('\r')
+    else if c == '\\' then Some('\\')
+    else if c == '\'' then Some('\'')
+    else if c == '"' then Some('"')
+    else None
+
+// Lex a string literal (after opening quote consumed).
+// An unrecognised escape keeps its backslash as an ordinary character.
+// NOTE(review): if input ends before the closing quote, the characters read
+// so far are returned as if the string were complete; no error is reported.
+fn lexStringBody(state: LexState, acc: List<Char>): (List<Char>, LexState) =
+    match peek(state) {
+        None => (acc, state),
+        Some(c) =>
+            if c == '"' then (acc, advance(state))
+            else if c == '\\' then
+                match peekAt(state, 1) {
+                    None => lexStringBody(advance(state), List.concat(acc, [c])),
+                    Some(e) =>
+                        match escapeToChar(e) {
+                            Some(ch) => lexStringBody(advance(advance(state)), List.concat(acc, [ch])),
+                            None => lexStringBody(advance(state), List.concat(acc, [c]))
+                        }
+                }
+            else lexStringBody(advance(state), List.concat(acc, [c]))
+    }
+
+// Lex a char literal (after opening quote consumed)
+fn lexCharLiteral(state: LexState): LexResult =
+    let start = position(state) - 1;
+    match peek(state) {
+        None => LexErr("Unexpected end of input in char literal", start),
+        Some(c) =>
+            if c == '\\' then
+                match peekAt(state, 1) {
+                    None => LexErr("Unknown escape sequence", position(state)),
+                    Some(e) =>
+                        match escapeToChar(e) {
+                            None => LexErr("Unknown escape sequence", position(state)),
+                            Some(ch) =>
+                                match peekAt(state, 2) {
+                                    Some('\'') => LexOk(Token(TkChar(ch), start, position(state) + 3), advance(advance(advance(state)))),
+                                    _ => LexErr("Expected closing quote", position(state))
+                                }
+                        }
+                }
+            else
+                match peekAt(state, 1) {
+                    Some('\'') => LexOk(Token(TkChar(c), start, position(state) + 2), advance(advance(state))),
+                    _ => LexErr("Expected closing quote", position(state))
+                }
+    }
+
+// Collect doc comment text (after /// consumed)
+fn collectDocComment(state: LexState, acc: List<Char>): (List<Char>, LexState) =
+    match peek(state) {
+        None => (acc, state),
+        Some(c) =>
+            if c == '\n' then (acc, state)
+            else collectDocComment(advance(state), List.concat(acc, [c]))
+    }
+
+// Lex an operator that is `two` when the current character is followed by
+// `second`, and the single-character token `one` otherwise.
+fn lexOneOrTwo(state: LexState, second: Char, two: TokenKind, one: TokenKind): LexResult =
+    let start = position(state);
+    match peekAt(state, 1) {
+        Some(x) =>
+            if x == second then LexOk(Token(two, start, start + 2), advance(advance(state)))
+            else LexOk(Token(one, start, start + 1), advance(state)),
+        None => LexOk(Token(one, start, start + 1), advance(state))
+    }
+
+// Lex a single token, after skipping leading blanks and ordinary comments
+fn lexToken(state: LexState): LexResult =
+    let state = skipWhitespaceAndComments(state);
+    let start = position(state);
+    match peek(state) {
+        None => LexOk(Token(TkEof, start, start), state),
+        Some(c) =>
+            if c == '\n' then
+                LexOk(Token(TkNewline, start, start + 1), advance(state))
+            // Numbers
+            else if isDigit(c) then
+                let result = collectDigits(state, []);
+                match result {
+                    (digits, nextState) =>
+                        // Check for float
+                        match peek(nextState) {
+                            Some('.') =>
+                                match peekAt(nextState, 1) {
+                                    Some(d) =>
+                                        if isDigit(d) then
+                                            let fracResult = collectDigits(advance(nextState), []);
+                                            match fracResult {
+                                                (fracDigits, finalState) =>
+                                                    let intPart = charsToString(digits);
+                                                    let fracPart = charsToString(fracDigits);
+                                                    LexOk(Token(TkFloat(intPart + "." + fracPart), start, position(finalState)), finalState)
+                                            }
+                                        else
+                                            LexOk(Token(TkInt(charsToInt(digits)), start, position(nextState)), nextState),
+                                    None =>
+                                        LexOk(Token(TkInt(charsToInt(digits)), start, position(nextState)), nextState)
+                                },
+                            _ => LexOk(Token(TkInt(charsToInt(digits)), start, position(nextState)), nextState)
+                        }
+                }
+            // Identifiers and keywords
+            else if isAlpha(c) then
+                let result = collectIdent(state, []);
+                match result {
+                    (chars, nextState) =>
+                        let name = charsToString(chars);
+                        LexOk(Token(identToToken(name), start, position(nextState)), nextState)
+                }
+            // String literals
+            else if c == '"' then
+                let result = lexStringBody(advance(state), []);
+                match result {
+                    (chars, nextState) =>
+                        let str = charsToString(chars);
+                        LexOk(Token(TkString(str), start, position(nextState)), nextState)
+                }
+            // Char literals
+            else if c == '\'' then
+                lexCharLiteral(advance(state))
+            // Doc comments (///)
+            else if c == '/' then
+                match peekAt(state, 1) {
+                    Some('/') =>
+                        match peekAt(state, 2) {
+                            Some('/') =>
+                                // Skip the "/// " prefix
+                                let docState = advance(advance(advance(state)));
+                                let docState = match peek(docState) {
+                                    Some(' ') => advance(docState),
+                                    _ => docState
+                                };
+                                let result = collectDocComment(docState, []);
+                                match result {
+                                    (chars, nextState) =>
+                                        let text = charsToString(chars);
+                                        LexOk(Token(TkDocComment(text), start, position(nextState)), nextState)
+                                },
+                            _ => LexOk(Token(TkSlash, start, start + 1), advance(state))
+                        },
+                    _ => LexOk(Token(TkSlash, start, start + 1), advance(state))
+                }
+            // Two-character operators
+            else if c == '=' then
+                match peekAt(state, 1) {
+                    Some('=') => LexOk(Token(TkEqEq, start, start + 2), advance(advance(state))),
+                    Some('>') => LexOk(Token(TkArrow, start, start + 2), advance(advance(state))),
+                    _ => LexOk(Token(TkEq, start, start + 1), advance(state))
+                }
+            else if c == '!' then lexOneOrTwo(state, '=', TkNe, TkNot)
+            else if c == '<' then lexOneOrTwo(state, '=', TkLe, TkLt)
+            else if c == '>' then lexOneOrTwo(state, '=', TkGe, TkGt)
+            else if c == '&' then
+                match peekAt(state, 1) {
+                    Some('&') => LexOk(Token(TkAnd, start, start + 2), advance(advance(state))),
+                    _ => LexErr("Expected '&&'", start)
+                }
+            else if c == '|' then
+                match peekAt(state, 1) {
+                    Some('|') => LexOk(Token(TkOr, start, start + 2), advance(advance(state))),
+                    Some('>') => LexOk(Token(TkPipeGt, start, start + 2), advance(advance(state))),
+                    _ => LexOk(Token(TkPipe, start, start + 1), advance(state))
+                }
+            else if c == '-' then lexOneOrTwo(state, '>', TkThinArrow, TkMinus)
+            else if c == ':' then lexOneOrTwo(state, ':', TkColonColon, TkColon)
+            // Single-character tokens
+            else if c == '+' then LexOk(Token(TkPlus, start, start + 1), advance(state))
+            else if c == '*' then LexOk(Token(TkStar, start, start + 1), advance(state))
+            else if c == '%' then LexOk(Token(TkPercent, start, start + 1), advance(state))
+            else if c == '.' then LexOk(Token(TkDot, start, start + 1), advance(state))
+            else if c == ',' then LexOk(Token(TkComma, start, start + 1), advance(state))
+            else if c == ';' then LexOk(Token(TkSemi, start, start + 1), advance(state))
+            else if c == '@' then LexOk(Token(TkAt, start, start + 1), advance(state))
+            else if c == '(' then LexOk(Token(TkLParen, start, start + 1), advance(state))
+            else if c == ')' then LexOk(Token(TkRParen, start, start + 1), advance(state))
+            else if c == '{' then LexOk(Token(TkLBrace, start, start + 1), advance(state))
+            else if c == '}' then LexOk(Token(TkRBrace, start, start + 1), advance(state))
+            else if c == '[' then LexOk(Token(TkLBracket, start, start + 1), advance(state))
+            else if c == ']' then LexOk(Token(TkRBracket, start, start + 1), advance(state))
+            else LexErr("Unexpected character: " + String.fromChar(c), start)
+    }
+
+// Lex all tokens from source, appending them to `acc`; the result ends with TkEof.
+// A lexing error stops the scan: the message is dropped and a TkEof is placed
+// at the error position. Newlines are kept as tokens (runs are not collapsed).
+fn lexAll(state: LexState, acc: List<Token>): List<Token> =
+    match lexToken(state) {
+        LexErr(_, pos) =>
+            List.concat(acc, [Token(TkEof, pos, pos)]),
+        LexOk(token, nextState) =>
+            match token {
+                Token(TkEof, _, _) => List.concat(acc, [token]),
+                _ => lexAll(nextState, List.concat(acc, [token]))
+            }
+    }
+
+// Public API: tokenize a source string
+fn tokenize(source: String): List<Token> =
+    let chars = String.chars(source);
+    let state = LexState(chars, 0);
+    lexAll(state, [])
+
+// === Token display ===
+
+// Human-readable rendering of a token kind, used by the test output below.
+fn tokenKindToString(kind: TokenKind): String =
+    match kind {
+        TkInt(n) => "Int(" + toString(n) + ")",
+        TkFloat(s) => "Float(" + s + ")",
+        TkString(s) => "String(\"" + s + "\")",
+        TkChar(c) => "Char('" + String.fromChar(c) + "')",
+        TkBool(b) => if b then "true" else "false",
+        TkIdent(name) => "Ident(" + name + ")",
+        TkFn => "fn", TkLet => "let", TkIf => "if",
+        TkThen => "then", TkElse => "else", TkMatch => "match",
+        TkWith => "with", TkEffect => "effect", TkHandler => "handler",
+        TkRun => "run", TkResume => "resume", TkType => "type",
+        TkImport => "import", TkPub => "pub", TkAs => "as",
+        TkFrom => "from", TkTrait => "trait", TkImpl => "impl", TkFor => "for",
+        TkIs => "is", TkPure => "pure", TkTotal => "total",
+        TkIdempotent => "idempotent", TkDeterministic => "deterministic",
+        TkCommutative => "commutative", TkWhere => "where", TkAssume => "assume",
+        TkPlus => "+", TkMinus => "-", TkStar => "*", TkSlash => "/",
+        TkPercent => "%", TkEq => "=", TkEqEq => "==", TkNe => "!=",
+        TkLt => "<", TkLe => "<=", TkGt => ">", TkGe => ">=",
+        TkAnd => "&&", TkOr => "||", TkNot => "!",
+        TkPipe => "|", TkPipeGt => "|>",
+        TkArrow => "=>", TkThinArrow => "->",
+        TkDot => ".", TkColon => ":", TkColonColon => "::",
+        TkComma => ",", TkSemi => ";", TkAt => "@",
+        TkLParen => "(", TkRParen => ")", TkLBrace => "{", TkRBrace => "}",
+        TkLBracket => "[", TkRBracket => "]",
+        TkUnderscore => "_", TkNewline => "\\n", TkEof => "EOF",
+        TkDocComment(text) => "DocComment(\"" + text + "\")",
+        _ => "?"
+    }
+
+// Render a token as its kind followed by its [start..end] span.
+fn tokenToString(token: Token): String =
+    match token {
+        Token(kind, start, end) =>
+            tokenKindToString(kind) + " [" + toString(start) + ".." + toString(end) + "]"
+    }
+
+// === Tests ===
+
+// Print one token per line, then a final empty line.
+fn printTokens(tokens: List<Token>): Unit with {Console} =
+    match List.head(tokens) {
+        None => Console.print(""),
+        Some(t) => {
+            Console.print(" " + tokenToString(t));
+            match List.tail(tokens) {
+                Some(rest) => printTokens(rest),
+                None => Console.print("")
+            }
+        }
+    }
+
+// Tokenize `source` and print the label, the input and the resulting tokens.
+fn testLexer(label: String, source: String): Unit with {Console} = {
+    Console.print("--- " + label + " ---");
+    Console.print(" Input: \"" + source + "\"");
+    let tokens = tokenize(source);
+    printTokens(tokens)
+}
+
+fn main(): Unit with {Console} = {
+    Console.print("=== Lux Self-Hosted Lexer ===");
+    Console.print("");
+
+    // Basic tokens
+    testLexer("numbers", "42 3");
+    Console.print("");
+
+    // Identifiers and keywords
+    testLexer("keywords", "fn main let x");
+    Console.print("");
+
+    // Operators
+    testLexer("operators", "a + b == c");
+    Console.print("");
+
+    // String literal
+    testLexer("string", "\"hello world\"");
+    Console.print("");
+
+    // Function declaration
+    testLexer("function", "fn add(a: Int, b: Int): Int = a + b");
+    Console.print("");
+
+    // Behavioral properties
+    testLexer("behavioral", "fn add(a: Int): Int is pure = a");
+    Console.print("");
+
+    // Complex expression
+    testLexer("complex", "let result = if x > 0 then x else 0 - x");
+    Console.print("");
+
+    Console.print("=== Lexer test complete ===")
+}
+
+let _ = run main() with {}