use multipeek::multipeek; use crate::*;

/// We lex input into a TokenStream containing the position of the token and the Token itself.
pub struct TokenStream(Vec<(usize, Token)>);

/// Iterating over a TokenStream pops both the Tokens and the spans.
/// The parser only *really* cares about Tokens... spans are for use in error reporting.
impl IntoIterator for TokenStream {
    type Item = (usize, Token);
    type IntoIter = std::vec::IntoIter<(usize, Token)>;
    fn into_iter(self) -> Self::IntoIter {
        self.0.into_iter()
    }
}

/// Basic syntax tokens. Form an unambiguous TokenStream.
#[derive(Clone, PartialEq, Debug)]
pub enum Token {
    Key(Keyword),     // keyword identifiers. ex. for, use, func...
    Lit(Literal),     // literal values. ex. for strings/comments.
    Sep(Punctuation), // punctuation. non-word tokens.
    Op(String),       // operators. lexographically separate so the parser can parse infix notation
    Num(String),      // numeric values. ex. 413, 0b101011, 0xabcd, 612.1024
    Ident(String),    // identifiers. variable names, function names, method calls...
    Indent(usize),    // indentation. denotes line breaks and scope at which a line starts
    End,              // explicit scope end marker. inserted in normalization.
}

/// Everything that can go wrong while lexing.
/// Each variant (except EarlyEndOfInput) carries the byte offset it occurred at.
#[derive(Clone, PartialEq, Debug)]
pub enum LexicalError {
    InvalidIndentation(usize),
    MismatchedParens(usize),
    MismatchedBrackets(usize),
    UnknownPunctuation(usize),
    EarlyEndOfInput,
}

impl std::fmt::Display for LexicalError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Debug formatting is good enough for error reporting here.
        write!(f, "{:?}", self)
    }
}

impl std::error::Error for LexicalError {}

/// Literal text. This can be chars, strings, and comments.
#[derive(Clone, PartialEq, Debug)]
pub enum Literal {
    Char(String),
    SingleLineString(String),
    MultiLineString(String),
    Comment(String),
    DocComment(String),
    MultiLineComment(String),
}

/// Keywords, made explicit for easier use with Rust.
/// (static strings inside match patterns are fucky!!)
#[derive(Clone, PartialEq)] pub enum Keyword { Pub, Let, Var, Const, Func, Macro, Type, Mod, Use, As, For, While, Loop, Block, Static, If, When, Elif, Else, Match, Where, Try, Catch, Finally, Struct, Tuple, Enum, Union, Interface, Distinct, Ref, Ptr, Mut, Break, Continue, Return, In, Is, Of, } /// All punctuation recognized by the lexer. /// Note the distinction between FuncLeftParen and TupleLeftParen. #[derive(Clone, PartialEq)] pub enum Punctuation { Comma, // , Period, // . Semicolon, // ; Colon, // : BackTick, // ` SingleQuote, // ' DoubleQuote, // " FuncLeftParen, // ( FuncRightParen, // ) TupleLeftParen, // ( TupleRightParen, // ) GenericLeftBracket, // [ GenericRightBracket, // ] ArrayLeftBracket, // [ ArrayRightBracket, // ] DependentLeftBrace, // { (not currently used) DependentRightBrace, // } StructLeftBrace, // { StructRightBrace, // } Equals, // = Plus, // + Minus, // distinction between minus and negative. Negative, // negative binds tightly: there is no whitespace following. Times, // * Slash, // / LessThan, // < GreaterThan, // > At, // @ Sha, // $ Tilde, // ~ And, // & Percent, // % Or, // | Exclamation, // ! Question, // ? 
Caret, // ^ Backslash, // \ } struct Input<'a>(multipeek::MultiPeek>); impl<'a> Input<'a> { /// Map input.next() to return Results for use with the propagation operator fn next(&mut self) -> Result { self.0.next().map(|x| x.1).ok_or(EarlyEndOfInput.into()) } fn peek_eat(&mut self, expected: char) -> bool { if self.peek().is_ok_and(|x| x == expected) { self.0.next(); true } else { false } } fn peek(&mut self) -> Result { self.0.peek().copied().map(|x| x.1).ok_or(EarlyEndOfInput.into()) } fn peek_nth(&mut self, n: usize) -> Result { self.0.peek_nth(n).copied().map(|x| x.1).ok_or(EarlyEndOfInput.into()) } fn consume_indentation(&mut self) -> usize { let mut count = 0; while self.peek_eat(' ') { count += 1; } if self.peek_eat('\n') { self.consume_indentation() } else { count } } fn strip(&mut self) { loop { if !self.peek_eat(' ') { break } } } } use Token::*; use Literal::*; use Punctuation::*; use LexicalError::*; enum Paren { Func, Tuple } enum Bracket { Generic, Array } struct State { start_of_line: bool, paren_stack: Vec, bracket_stack: Vec, next_token_is_parameter_punctuation: bool, } /// Parses whitespace-sensitive code into an unambiguous TokenStream. 
pub fn lex(input: &str) -> Result { let mut input = Input(multipeek(input.char_indices())); let mut res = TokenStream(Vec::new()); let mut state = State { start_of_line: true, paren_stack: vec!(), bracket_stack: vec!(), next_token_is_parameter_punctuation: false, }; res.0.push((0, Indent(input.consume_indentation()))); loop { input.strip(); match input.0.next() { Some((pos, c)) => res.0.push((pos, lex_token(&mut input, &mut state, pos, c)?)), None => break } } Ok(res) } fn lex_token(input: &mut Input, state: &mut State, pos: usize, c: char) -> Result { Ok(match c { '\n' => Indent(input.consume_indentation()), '\t' => return Err(InvalidIndentation(pos).into()), ' ' => unreachable!(), ':' => Sep(Colon), '=' => Sep(Equals), '\'' => lex_char(input)?, '"' => lex_string(input)?, '#' => lex_comment(input)?, c if c.is_alphabetic() || c == '_' => lex_identifier(input, state, c), // valid identifiers! c if c.is_digit(10) => lex_number(input, c), // numeric literals! '-' => { // `-` is special. it can be the *prefix* operator "Negative", or part of a regular operator. match input.peek()? 
{ ' ' => Sep(Minus), _ => Sep(Negative) } }, '(' => { if state.next_token_is_parameter_punctuation { state.next_token_is_parameter_punctuation = false; state.paren_stack.push(Paren::Func); Sep(FuncLeftParen) } else { state.paren_stack.push(Paren::Tuple); Sep(TupleLeftParen) } }, '[' => { if state.next_token_is_parameter_punctuation { state.next_token_is_parameter_punctuation = false; state.bracket_stack.push(Bracket::Generic); Sep(GenericLeftBracket) } else { state.bracket_stack.push(Bracket::Array); Sep(ArrayLeftBracket) } }, ')' => { match state.paren_stack.pop() { Some(Paren::Func) => Sep(FuncRightParen), Some(Paren::Tuple) => Sep(TupleRightParen), None => return Err(MismatchedParens(pos).into()), } }, ']' => { match state.bracket_stack.pop() { Some(Bracket::Generic) => { if let Ok('(') = input.peek() { state.next_token_is_parameter_punctuation = true; } Sep(GenericRightBracket) }, Some(Bracket::Array) => Sep(ArrayRightBracket), None => return Err(MismatchedBrackets(pos).into()), } }, '`' => { // backticks are used for injections, so generics/parameters may follow // todo: SERIOUS ambiguity issue with `injection`[] match input.peek() { Ok('(') | Ok('[') => { state.next_token_is_parameter_punctuation = true; }, _ => () } Sep(BackTick) }, ',' => Sep(Comma), '.' => Sep(Period), ';' => Sep(Semicolon), '{' => Sep(StructLeftBrace), '}' => Sep(StructRightBrace), '+' => Sep(Plus), '*' => Sep(Times), '/' => Sep(Slash), '<' => Sep(LessThan), '>' => Sep(GreaterThan), '@' => Sep(At), '$' => Sep(Sha), '~' => Sep(Tilde), '&' => Sep(And), '|' => Sep(Or), '!' => Sep(Exclamation), '?' => Sep(Question), '^' => Sep(Caret), '\\' => Sep(Backslash), _ => return Err(UnknownPunctuation(pos).into()) }) } /// Char ::= '\'' '\'' fn lex_char(input: &mut Input) -> Result { let mut buf = String::new(); loop { match input.next()? { '\'' => break, '\\' => { if let y = input.next()? 
{ buf.push('\\'); buf.push(y); } }, x => buf.push(x) } } Ok(Lit(Char(buf))) } /// String ::= '"' '"' | '"""' '"""' fn lex_string(input: &mut Input) -> Result { let mut buf = String::new(); match (input.peek_nth(0), input.peek_nth(1)) { (Ok('"'), Ok('"')) => { // triple quoted strings input.next()?; input.next()?; loop { match input.next()? { '"' if input.peek_nth(0)? == '"' && input.peek_nth(1)? == '"' => { input.next()?; input.next()?; break; }, x => buf.push(x) } } Ok(Lit(MultiLineString(buf))) }, _ => { // single quoted strings loop { match input.next()? { '"' => break, '\\' => if let y = input.next()? { buf.push(y) // FIXME }, x => buf.push(x) } } Ok(Lit(SingleLineString(buf))) } } } /// Comment ::= '#' '\n' | '#[' Comment | ']#' fn lex_comment(input: &mut Input) -> Result { let mut buf = String::new(); match input.peek() { Ok('[') => { // block comment, can be nested input.next()?; let mut comment_level = 1; while comment_level > 0 { match input.next()? { '#' if input.peek_eat('[') => { comment_level += 1; buf.push_str("#["); }, ']' if input.peek_eat('#') => { comment_level -= 1; buf.push_str("]#"); }, x => buf.push(x) }; } Ok(Lit(MultiLineComment(buf))) }, Ok('#') => { // documentation comment input.next()?; while let Ok(x) = input.peek() { match x { '\n' => break, _ => { buf.push(x); input.next()?; } } } Ok(Lit(DocComment(buf))) }, _ => { // standard comment, runs til EOL while let Ok(x) = input.peek() { match x { '\n' => break, _ => { buf.push(x); input.next()?; } } } Ok(Lit(Comment(buf))) } } } #[allow(unused_assignments)] fn lex_identifier(input: &mut Input, state: &mut State, c: char) -> Token { use Keyword::*; let mut buf = String::new(); let mut keyword = None; buf.push(c); loop { match input.peek() { Ok(x) if x.is_alphanumeric() || x == '_' => { buf.push(x); let _ = input.next(); }, _ => { keyword = Some(match buf.as_str() { // keywords! 
"pub" => Key(Pub), "let" => Key(Let), "var" => Key(Var), "const" => Key(Const), "func" => Key(Func), "macro" => Key(Macro), "type" => Key(Type), "mod" => Key(Mod), "use" => Key(Use), "as" => Key(As), "for" => Key(For), "while" => Key(While), "loop" => Key(Loop), "block" => Key(Block), "static" => Key(Static), "if" => Key(If), "when" => Key(When), "elif" => Key(Elif), "else" => Key(Else), "match" => Key(Match), "where" => Key(Where), "try" => Key(Try), "catch" => Key(Catch), "finally" => Key(Finally), "struct" => Key(Struct), "tuple" => Key(Tuple), "enum" => Key(Enum), "union" => Key(Union), "interface" => Key(Interface), "distinct" => Key(Distinct), "ref" => Key(Ref), "ptr" => Key(Ptr), "mut" => Key(Mut), "in" => Key(In), "is" => Key(Is), "of" => Key(Of), "break" => Key(Break), "continue" => Key(Continue), "return" => Key(Return), _ => Ident(String::from(&buf)) }); break } } } // () and [] denote both parameters/generics and tuples/arrays // we must disambiguate by treating those *directly* after words as such match input.peek() { Ok('(') | Ok('[') => state.next_token_is_parameter_punctuation = true, _ => (), } keyword.unwrap() } fn lex_number(input: &mut Input, c: char) -> Token { let mut buf = String::new(); buf.push(c); loop { match input.peek() { Ok(x) if x.is_alphanumeric() => { // FIXME buf.push(x); let _ = input.next(); }, _ => break } } Num(buf) } impl std::fmt::Display for TokenStream { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use Token::*; let mut prev_token = Indent(0); for (_, token) in &self.0 { match (prev_token, token) { // fixme (Ident(_), Ident(_)) | (Ident(_), Num(_)) | (Num(_), Ident(_)) | (Num(_), Num(_)) | (Key(_), Key(_)) | (Key(_), Ident(_)) | (Ident(_), Key(_)) => write!(f, " {}", token)?, _ => write!(f, "{}", token)?, } prev_token = token.clone(); } Ok(()) } } impl std::fmt::Display for Token { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use Token::*; match self { Key(word) => write!(f, 
"{}", word), Ident(val) | Op(val) | Num(val) => write!(f, "{}", val), Lit(lit) => write!(f, "{}", lit), Sep(sep) => write!(f, "{}", sep), Indent(i) => write!(f, "\n{}", " ".repeat(*i)), End => write!(f, "end"), } } } impl std::fmt::Display for Literal { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use Literal::*; match self { Char(val) => write!(f, "'{}'", val), SingleLineString(val) => write!(f, "\"{}\"", val), MultiLineString(val) => write!(f, "\"\"\"{}\"\"\"", val), Comment(val) => write!(f, "#{}", val), DocComment(val) => write!(f, "##{}", val), MultiLineComment(val) => write!(f, "#[{}]#", val), } } } impl std::fmt::Display for Keyword { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use Keyword::*; match self { Pub => write!(f, "pub"), Let => write!(f, "let"), Var => write!(f, "var"), Const => write!(f, "const"), Func => write!(f, "func"), Macro => write!(f, "macro"), Type => write!(f, "type"), Mod => write!(f, "mod"), Use => write!(f, "use"), As => write!(f, "as"), For => write!(f, "for"), While => write!(f, "while"), Loop => write!(f, "loop"), Block => write!(f, "block"), Static => write!(f, "static"), If => write!(f, "if"), When => write!(f, "when"), Elif => write!(f, "elif"), Else => write!(f, "else"), Match => write!(f, "match"), Where => write!(f, "where"), Try => write!(f, "try"), Catch => write!(f, "catch"), Finally => write!(f, "finally"), Struct => write!(f, "struct"), Tuple => write!(f, "tuple"), Enum => write!(f, "enum"), Union => write!(f, "union"), Interface => write!(f, "interface"), Distinct => write!(f, "distinct"), Ref => write!(f, "ref"), Ptr => write!(f, "ptr"), Mut => write!(f, "mut"), Break => write!(f, "break"), Continue => write!(f, "continue"), Return => write!(f, "return"), In => write!(f, "in"), Is => write!(f, "is"), Of => write!(f, "of"), } } } impl std::fmt::Display for Punctuation { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use Punctuation::*; match self { 
Comma => write!(f, ","), Period => write!(f, "."), Semicolon => write!(f, ";"), Colon => write!(f, ":"), BackTick => write!(f, "`"), SingleQuote => write!(f, "'"), DoubleQuote => write!(f, "\""), FuncLeftParen => write!(f, "("), FuncRightParen => write!(f, ")"), TupleLeftParen => write!(f, " ("), TupleRightParen => write!(f, ")"), GenericLeftBracket => write!(f, "["), GenericRightBracket => write!(f, "]"), ArrayLeftBracket => write!(f, " ["), ArrayRightBracket => write!(f, "]"), DependentLeftBrace => write!(f, "{{"), DependentRightBrace => write!(f, "}}"), StructLeftBrace => write!(f, "{{"), StructRightBrace => write!(f, "}}"), Equals => write!(f, "="), Plus => write!(f, "+"), Minus => write!(f, "- "), Negative => write!(f, "-"), Times => write!(f, "*"), Slash => write!(f, "/"), LessThan => write!(f, "<"), GreaterThan => write!(f, ">"), At => write!(f, "@"), Sha => write!(f, "$"), Tilde => write!(f, "~"), And => write!(f, "&"), Percent => write!(f, "%"), Or => write!(f, "|"), Exclamation => write!(f, "!"), Question => write!(f, "?"), Caret => write!(f, "^"), Backslash => write!(f, "\\"), } } } pub fn test() { let input = std::fs::read_to_string("std/ast.pk").unwrap(); let lexed = lex(&input).unwrap(); println!("{}", lexed); }