// Lux Lexer — Self-hosted lexer for the Lux language
//
// This is the first component of the Lux-in-Lux compiler.
// It tokenizes Lux source code into a list of tokens.
//
// Design:
// - Recursive descent character scanning
// - Immutable state (LexState tracks chars + position)
// - Pattern matching for all token types

// === Token types ===

type TokenKind =
  // Literals
  | TkInt(Int)
  | TkFloat(String)
  | TkString(String)
  | TkChar(Char)
  | TkBool(Bool)
  // Identifiers
  | TkIdent(String)
  // Keywords
  | TkFn | TkLet | TkIf | TkThen | TkElse
  | TkMatch | TkWith | TkEffect | TkHandler | TkRun | TkResume
  | TkType | TkImport | TkPub | TkAs | TkFrom
  | TkTrait | TkImpl | TkFor
  // Behavioral
  | TkIs | TkPure | TkTotal | TkIdempotent | TkDeterministic | TkCommutative
  | TkWhere | TkAssume
  // Operators
  | TkPlus | TkMinus | TkStar | TkSlash | TkPercent
  | TkEq | TkEqEq | TkNe | TkLt | TkLe | TkGt | TkGe
  | TkAnd | TkOr | TkNot
  | TkPipe | TkPipeGt | TkArrow | TkThinArrow
  | TkDot | TkColon | TkColonColon | TkComma | TkSemi | TkAt
  // Delimiters
  | TkLParen | TkRParen | TkLBrace | TkRBrace | TkLBracket | TkRBracket
  // Special
  | TkUnderscore | TkNewline | TkEof
  // Doc comment
  | TkDocComment(String)

type Token =
  | Token(TokenKind, Int, Int) // kind, start, end

type LexState =
  | LexState(List, Int) // chars, position

type LexResult =
  | LexOk(Token, LexState)
  | LexErr(String, Int)

// === Character utilities ===

fn peek(state: LexState): Option =
  match state {
    LexState(chars, pos) => List.get(chars, pos)
  }

fn peekAt(state: LexState, offset: Int): Option =
  match state {
    LexState(chars, pos) => List.get(chars, pos + offset)
  }

fn advance(state: LexState): LexState =
  match state {
    LexState(chars, pos) => LexState(chars, pos + 1)
  }

fn position(state: LexState): Int =
  match state {
    LexState(_, pos) => pos
  }

fn isDigit(c: Char): Bool =
  c == '0' || c == '1' || c == '2' || c == '3' || c == '4' ||
  c == '5' || c == '6' || c == '7' || c == '8' || c == '9'

fn isAlpha(c: Char): Bool =
  (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'

fn isAlphaNumeric(c: Char): Bool =
  isAlpha(c) || isDigit(c)

fn isWhitespace(c: Char): Bool =
  c == ' ' || c == '\t' || c == '\r'

// === Core lexing ===

fn skipLineComment(state: LexState): LexState =
  match peek(state) {
    None => state,
    Some(c) =>
      if c == '\n' then state
      else skipLineComment(advance(state))
  }

fn skipWhitespaceAndComments(state: LexState): LexState =
  match peek(state) {
    None => state,
    Some(c) =>
      if isWhitespace(c) then skipWhitespaceAndComments(advance(state))
      else if c == '/' then
        match peekAt(state, 1) {
          Some('/') =>
            // Check for doc comment (///)
            match peekAt(state, 2) {
              Some('/') => state, // Don't skip doc comments
              _ => skipWhitespaceAndComments(skipLineComment(advance(advance(state))))
            },
          _ => state
        }
      else state
  }

// Collect identifier characters
fn collectIdent(state: LexState, acc: List): (List, LexState) =
  match peek(state) {
    None => (acc, state),
    Some(c) =>
      if isAlphaNumeric(c) then collectIdent(advance(state), List.concat(acc, [c]))
      else (acc, state)
  }

// Collect number characters (digits only)
fn collectDigits(state: LexState, acc: List): (List, LexState) =
  match peek(state) {
    None => (acc, state),
    Some(c) =>
      if isDigit(c) then collectDigits(advance(state), List.concat(acc, [c]))
      else (acc, state)
  }

// Convert list of digit chars to int
fn charsToInt(chars: List): Int =
  List.fold(chars, 0, fn(acc, c) => acc * 10 + charToDigit(c))

fn charToDigit(c: Char): Int =
  if c == '0' then 0
  else if c == '1' then 1
  else if c == '2' then 2
  else if c == '3' then 3
  else if c == '4'
  then 4
  else if c == '5' then 5
  else if c == '6' then 6
  else if c == '7' then 7
  else if c == '8' then 8
  else 9

// Map identifier string to keyword token or ident
fn identToToken(name: String): TokenKind =
  if name == "fn" then TkFn
  else if name == "let" then TkLet
  else if name == "if" then TkIf
  else if name == "then" then TkThen
  else if name == "else" then TkElse
  else if name == "match" then TkMatch
  else if name == "with" then TkWith
  else if name == "effect" then TkEffect
  else if name == "handler" then TkHandler
  else if name == "run" then TkRun
  else if name == "resume" then TkResume
  else if name == "type" then TkType
  else if name == "true" then TkBool(true)
  else if name == "false" then TkBool(false)
  else if name == "import" then TkImport
  else if name == "pub" then TkPub
  else if name == "as" then TkAs
  else if name == "from" then TkFrom
  else if name == "trait" then TkTrait
  else if name == "impl" then TkImpl
  else if name == "for" then TkFor
  else if name == "is" then TkIs
  else if name == "pure" then TkPure
  else if name == "total" then TkTotal
  else if name == "idempotent" then TkIdempotent
  else if name == "deterministic" then TkDeterministic
  else if name == "commutative" then TkCommutative
  else if name == "where" then TkWhere
  else if name == "assume" then TkAssume
  else TkIdent(name)

// Lex a string literal (after opening quote consumed)
fn lexStringBody(state: LexState, acc: List): (List, LexState) =
  match peek(state) {
    None => (acc, state),
    Some(c) =>
      if c == '"' then (acc, advance(state))
      else if c == '\\' then
        match peekAt(state, 1) {
          Some('n') => lexStringBody(advance(advance(state)), List.concat(acc, ['\n'])),
          Some('t') => lexStringBody(advance(advance(state)), List.concat(acc, ['\t'])),
          Some('\\') => lexStringBody(advance(advance(state)), List.concat(acc, ['\\'])),
          Some('"') => lexStringBody(advance(advance(state)), List.concat(acc, ['"'])),
          _ => lexStringBody(advance(state), List.concat(acc, [c]))
        }
      else lexStringBody(advance(state), List.concat(acc, [c]))
  }

// Lex a char literal (after opening quote consumed)
fn lexCharLiteral(state: LexState): LexResult =
  let start = position(state) - 1;
  match peek(state) {
    None => LexErr("Unexpected end of input in char literal", start),
    Some(c) =>
      if c == '\\' then
        match peekAt(state, 1) {
          Some('n') =>
            match peekAt(state, 2) {
              Some('\'') => LexOk(Token(TkChar('\n'), start, position(state) + 3), advance(advance(advance(state)))),
              _ => LexErr("Expected closing quote", position(state))
            },
          Some('t') =>
            match peekAt(state, 2) {
              Some('\'') => LexOk(Token(TkChar('\t'), start, position(state) + 3), advance(advance(advance(state)))),
              _ => LexErr("Expected closing quote", position(state))
            },
          Some('\\') =>
            match peekAt(state, 2) {
              Some('\'') => LexOk(Token(TkChar('\\'), start, position(state) + 3), advance(advance(advance(state)))),
              _ => LexErr("Expected closing quote", position(state))
            },
          _ => LexErr("Unknown escape sequence", position(state))
        }
      else
        match peekAt(state, 1) {
          Some('\'') => LexOk(Token(TkChar(c), start, position(state) + 2), advance(advance(state))),
          _ => LexErr("Expected closing quote", position(state))
        }
  }

// Collect doc comment text (after /// consumed)
fn collectDocComment(state: LexState, acc: List): (List, LexState) =
  match peek(state) {
    None => (acc, state),
    Some(c) =>
      if c == '\n' then (acc, state)
      else collectDocComment(advance(state), List.concat(acc, [c]))
  }

// Lex a single token
fn lexToken(state: LexState): LexResult =
  let state = skipWhitespaceAndComments(state);
  let start = position(state);
  match peek(state) {
    None =>
      LexOk(Token(TkEof, start, start), state),
    Some(c) =>
      if c == '\n' then LexOk(Token(TkNewline, start, start + 1), advance(state))

      // Numbers
      else if isDigit(c) then
        let result = collectDigits(state, []);
        match result {
          (digits, nextState) =>
            // Check for float
            match peek(nextState) {
              Some('.') =>
                match peekAt(nextState, 1) {
                  Some(d) =>
                    if isDigit(d) then
                      let fracResult = collectDigits(advance(nextState), []);
                      match fracResult {
                        (fracDigits, finalState) =>
                          let intPart = String.join(List.map(digits, fn(ch) => String.fromChar(ch)), "");
                          let fracPart = String.join(List.map(fracDigits, fn(ch) => String.fromChar(ch)), "");
                          LexOk(Token(TkFloat(intPart + "." + fracPart), start, position(finalState)), finalState)
                      }
                    else LexOk(Token(TkInt(charsToInt(digits)), start, position(nextState)), nextState),
                  None => LexOk(Token(TkInt(charsToInt(digits)), start, position(nextState)), nextState)
                },
              _ => LexOk(Token(TkInt(charsToInt(digits)), start, position(nextState)), nextState)
            }
        }

      // Identifiers and keywords
      else if isAlpha(c) then
        let result = collectIdent(state, []);
        match result {
          (chars, nextState) =>
            let name = String.join(List.map(chars, fn(ch) => String.fromChar(ch)), "");
            LexOk(Token(identToToken(name), start, position(nextState)), nextState)
        }

      // String literals
      else if c == '"' then
        let result = lexStringBody(advance(state), []);
        match result {
          (chars, nextState) =>
            let str = String.join(List.map(chars, fn(ch) => String.fromChar(ch)), "");
            LexOk(Token(TkString(str), start, position(nextState)), nextState)
        }

      // Char literals
      else if c == '\'' then lexCharLiteral(advance(state))

      // Doc comments (///)
      else if c == '/' then
        match peekAt(state, 1) {
          Some('/') =>
            match peekAt(state, 2) {
              Some('/') =>
                // Skip the "/// " prefix
                let docState = advance(advance(advance(state)));
                let docState = match peek(docState) { Some(' ') => advance(docState), _ => docState };
                let result = collectDocComment(docState, []);
                match result {
                  (chars, nextState) =>
                    let text = String.join(List.map(chars, fn(ch) => String.fromChar(ch)), "");
                    LexOk(Token(TkDocComment(text), start, position(nextState)), nextState)
                },
              _ => LexOk(Token(TkSlash, start, start + 1), advance(state))
            },
          _ => LexOk(Token(TkSlash, start, start + 1), advance(state))
        }

      // Two-character operators
      else if c == '=' then
        match peekAt(state, 1) {
          Some('=') => LexOk(Token(TkEqEq, start, start + 2), advance(advance(state))),
          Some('>') => LexOk(Token(TkArrow, start, start + 2), advance(advance(state))),
          _ => LexOk(Token(TkEq, start, start + 1), advance(state))
        }
      else if c == '!'
      then
        match peekAt(state, 1) {
          Some('=') => LexOk(Token(TkNe, start, start + 2), advance(advance(state))),
          _ => LexOk(Token(TkNot, start, start + 1), advance(state))
        }
      else if c == '<' then
        match peekAt(state, 1) {
          Some('=') => LexOk(Token(TkLe, start, start + 2), advance(advance(state))),
          _ => LexOk(Token(TkLt, start, start + 1), advance(state))
        }
      else if c == '>' then
        match peekAt(state, 1) {
          Some('=') => LexOk(Token(TkGe, start, start + 2), advance(advance(state))),
          _ => LexOk(Token(TkGt, start, start + 1), advance(state))
        }
      else if c == '&' then
        match peekAt(state, 1) {
          Some('&') => LexOk(Token(TkAnd, start, start + 2), advance(advance(state))),
          _ => LexErr("Expected '&&'", start)
        }
      else if c == '|' then
        match peekAt(state, 1) {
          Some('|') => LexOk(Token(TkOr, start, start + 2), advance(advance(state))),
          Some('>') => LexOk(Token(TkPipeGt, start, start + 2), advance(advance(state))),
          _ => LexOk(Token(TkPipe, start, start + 1), advance(state))
        }
      else if c == '-' then
        match peekAt(state, 1) {
          Some('>') => LexOk(Token(TkThinArrow, start, start + 2), advance(advance(state))),
          _ => LexOk(Token(TkMinus, start, start + 1), advance(state))
        }
      else if c == ':' then
        match peekAt(state, 1) {
          Some(':') => LexOk(Token(TkColonColon, start, start + 2), advance(advance(state))),
          _ => LexOk(Token(TkColon, start, start + 1), advance(state))
        }

      // Single-character tokens
      else if c == '+' then LexOk(Token(TkPlus, start, start + 1), advance(state))
      else if c == '*' then LexOk(Token(TkStar, start, start + 1), advance(state))
      else if c == '%' then LexOk(Token(TkPercent, start, start + 1), advance(state))
      else if c == '.' then LexOk(Token(TkDot, start, start + 1), advance(state))
      else if c == ',' then LexOk(Token(TkComma, start, start + 1), advance(state))
      else if c == ';' then LexOk(Token(TkSemi, start, start + 1), advance(state))
      else if c == '@' then LexOk(Token(TkAt, start, start + 1), advance(state))
      else if c == '(' then LexOk(Token(TkLParen, start, start + 1), advance(state))
      else if c == ')' then LexOk(Token(TkRParen, start, start + 1), advance(state))
      else if c == '{' then LexOk(Token(TkLBrace, start, start + 1), advance(state))
      else if c == '}' then LexOk(Token(TkRBrace, start, start + 1), advance(state))
      else if c == '[' then LexOk(Token(TkLBracket, start, start + 1), advance(state))
      else if c == ']' then LexOk(Token(TkRBracket, start, start + 1), advance(state))
      else if c == '_' then
        // Check if it's just underscore or start of ident
        match peekAt(state, 1) {
          Some(next) =>
            if isAlphaNumeric(next) then
              let result = collectIdent(state, []);
              match result {
                (chars, nextState) =>
                  let name = String.join(List.map(chars, fn(ch) => String.fromChar(ch)), "");
                  LexOk(Token(TkIdent(name), start, position(nextState)), nextState)
              }
            else LexOk(Token(TkUnderscore, start, start + 1), advance(state)),
          None => LexOk(Token(TkUnderscore, start, start + 1), advance(state))
        }
      else LexErr("Unexpected character: " + String.fromChar(c), start)
  }

// Lex all tokens from source
fn lexAll(state: LexState, acc: List): List =
  match lexToken(state) {
    LexErr(msg, pos) =>
      // On error, terminate the stream with an EOF token at the error position
      List.concat(acc, [Token(TkEof, pos, pos)]),
    LexOk(token, nextState) =>
      match token {
        Token(TkEof, _, _) => List.concat(acc, [token]),
        Token(TkNewline, _, _) =>
          // Newlines are emitted as tokens, not skipped
          lexAll(nextState, List.concat(acc, [token])),
        _ => lexAll(nextState, List.concat(acc, [token]))
      }
  }

// Public API: tokenize a source string
fn tokenize(source: String): List =
  let chars = String.chars(source);
  let state = LexState(chars, 0);
  lexAll(state, [])

// === Token display ===

fn tokenKindToString(kind: TokenKind): String =
  match kind {
    TkInt(n) => "Int(" + toString(n) + ")",
    TkFloat(s) => "Float(" + s + ")",
    TkString(s) => "String(\"" + s + "\")",
    TkChar(c) => "Char('" + String.fromChar(c) + "')",
    TkBool(b) => if b then "true" else "false",
    TkIdent(name) => "Ident(" + name + ")",
    TkFn => "fn",
    TkLet => "let",
    TkIf => "if",
    TkThen => "then",
    TkElse => "else",
    TkMatch => "match",
    TkWith => "with",
    TkEffect => "effect",
    TkHandler => "handler",
    TkRun => "run",
    TkResume => "resume",
    TkType => "type",
    TkImport => "import",
    TkPub => "pub",
    TkAs => "as",
    TkFrom => "from",
    TkTrait => "trait",
    TkImpl => "impl",
    TkFor => "for",
    TkIs => "is",
    TkPure => "pure",
    TkTotal => "total",
    TkIdempotent => "idempotent",
    TkDeterministic => "deterministic",
    TkCommutative => "commutative",
    TkWhere => "where",
    TkAssume => "assume",
    TkPlus => "+",
    TkMinus => "-",
    TkStar => "*",
    TkSlash => "/",
    TkPercent => "%",
    TkEq => "=",
    TkEqEq => "==",
    TkNe => "!=",
    TkLt => "<",
    TkLe => "<=",
    TkGt => ">",
    TkGe => ">=",
    TkAnd => "&&",
    TkOr => "||",
    TkNot => "!",
    TkPipe => "|",
    TkPipeGt => "|>",
    TkArrow => "=>",
    TkThinArrow => "->",
    TkDot => ".",
    TkColon => ":",
    TkColonColon => "::",
    TkComma => ",",
    TkSemi => ";",
    TkAt => "@",
    TkLParen => "(",
    TkRParen => ")",
    TkLBrace => "{",
    TkRBrace => "}",
    TkLBracket => "[",
    TkRBracket => "]",
    TkUnderscore => "_",
    TkNewline => "\\n",
    TkEof => "EOF",
    TkDocComment(text) => "DocComment(\"" + text + "\")",
    _ => "?"
  }

fn tokenToString(token: Token): String =
  match token {
    Token(kind, start, end) =>
      tokenKindToString(kind) + " [" + toString(start) + ".." + toString(end) + "]"
  }

// === Tests ===

fn printTokens(tokens: List): Unit with {Console} =
  match List.head(tokens) {
    None => Console.print(""),
    Some(t) => {
      Console.print(" " + tokenToString(t));
      match List.tail(tokens) {
        Some(rest) => printTokens(rest),
        None => Console.print("")
      }
    }
  }

fn testLexer(label: String, source: String): Unit with {Console} = {
  Console.print("--- " + label + " ---");
  Console.print(" Input: \"" + source + "\"");
  let tokens = tokenize(source);
  printTokens(tokens)
}

fn main(): Unit with {Console} = {
  Console.print("=== Lux Self-Hosted Lexer ===");
  Console.print("");

  // Basic tokens
  testLexer("numbers", "42 3");
  Console.print("");

  // Identifiers and keywords
  testLexer("keywords", "fn main let x");
  Console.print("");

  // Operators
  testLexer("operators", "a + b == c");
  Console.print("");

  // String literal
  testLexer("string", "\"hello world\"");
  Console.print("");

  // Function declaration
  testLexer("function", "fn add(a: Int, b: Int): Int = a + b");
  Console.print("");

  // Behavioral properties
  testLexer("behavioral", "fn add(a: Int): Int is pure = a");
  Console.print("");

  // Complex expression
  testLexer("complex", "let result = if x > 0 then x else 0 - x");
  Console.print("");

  Console.print("=== Lexer test complete ===")
}

let _ = run main() with {}
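
// For reference, a sketch of the token stream the "operators" test above should
// produce, derived by reading the lexer rather than captured from a run. Spans
// follow the start-inclusive, end-exclusive convention of Token(kind, start, end).
//
//   tokenize("a + b == c")
//     Ident(a) [0..1]
//     + [2..3]
//     Ident(b) [4..5]
//     == [6..8]
//     Ident(c) [9..10]
//     EOF [10..10]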