use multipeek::multipeek; pub type Result = core::result::Result>; pub struct TokenStream(Vec); #[derive(Clone, PartialEq, Debug)] pub enum LexicalError { InvalidIndentation, MismatchedParens, MismatchedBrackets, UnknownPunctuation, } impl std::fmt::Display for LexicalError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:?}", self) } } impl std::error::Error for LexicalError {} /// **Basic** syntax tokens. Form an unambiguous TokenStream. #[derive(Clone, PartialEq)] pub enum Token { Word(String), // identifiers. Num(String), // numeric value, ex. 413, 0b101011, 0xabcd Lit(Literal), // literal value, ex. for strings/comments. Sep(Punctuation), // punctuation. non-word tokens. operators are lexed as this and later transformed to words. Indent(usize), // indentation. denotes line breaks and scope at which a line starts. } #[derive(Clone, PartialEq)] pub enum Literal { Char(String), SingleLineString(String), MultiLineString(String), Comment(String), DocComment(String), MultiLineComment(String), } /// All punctuation recognized by the lexer. /// Note the distinction between FuncLeftParen and TupleLeftParen. #[derive(Clone, PartialEq)] pub enum Punctuation { Comma, // , Period, // . Semicolon, // ; Colon, // : BackTick, // ` SingleQuote, // ' DoubleQuote, // " FuncLeftParen, // ( FuncRightParen, // ) TupleLeftParen, // ( TupleRightParen, // ) GenericLeftBracket, // [ GenericRightBracket, // ] ArrayLeftBracket, // [ ArrayRightBracket, // ] StructLeftBrace, // } StructRightBrace, // } Equals, // = Plus, // + Minus, // distinction between minus and negative. Negative, // negative binds tightly: there is no whitespace following. Times, // * Slash, // / LessThan, // < GreaterThan, // > At, // @ Sha, // $ Tilde, // ~ And, // & Percent, // % Or, // | Exclamation, // ! Question, // ? Caret, // ^ Backslash, // \ } /// Parses whitespace-sensitive code into an unambiguous TokenStream. /// Also useful for formatting. pub fn tokenize(input: &str) -> Result { // The design of this lexer utilizes to great extent multipeek's arbitrary peeking. // Tokens are matched by looping within their case until complete. // This then eliminates the need for most global parser state. (i hate state) use Token::*; use Literal::*; use Punctuation::*; use LexicalError::*; enum Paren { Func, Tuple } enum Bracket { Generic, Array } struct State { start_of_line: bool, paren_stack: Vec, bracket_stack: Vec, } let mut state = State { start_of_line: true, paren_stack: vec!(), bracket_stack: vec!(), }; let mut buf = String::new(); let mut res = Vec::new(); // `char` in rust is four bytes it's fine let mut input = multipeek(input.chars()); while let Some(c) = input.next() { match c { ' ' => { // indentation! and whitespace match res.last() { Some(Indent(_)) => { // indentation! res.pop(); // discard previous empty or useless Indent token let mut current_indent_level = 1; while let Some(x) = input.peek() { match x { ' ' => { current_indent_level += 1; input.next(); }, _ => match res.last() { // indentation ends Some(Word(a)) if a == "==" || a == "and" || a == "or" || a == "xor" || a == "in" || a == "is" => break, Some(Sep(FuncLeftParen)) | Some(Sep(TupleLeftParen)) | Some(Sep(GenericLeftBracket)) | Some(Sep(ArrayLeftBracket)) | Some(Sep(StructLeftBrace)) | Some(Sep(Comma)) => break, _ => { res.push(Indent(current_indent_level)); break; } } } } }, _ => { // get rid of excess (all) whitespace between words/operators while input.peek().is_some_and(|x| x.is_whitespace() && x != &'\n') { input.next(); } } } }, '\t' => return Err(InvalidIndentation.into()), '\n' => res.push(Indent(0)), '\'' => { // chars! while let Some(x) = input.next() { match x { '\'' => break, '\\' => if let Some(y) = input.next() { buf.push(y) }, _ => buf.push(x) } } res.push(Lit(Char(String::from(&buf)))); }, '"' => { // strings! match (input.peek_nth(0).copied(), input.peek_nth(1).copied()) { (Some('"'), Some('"')) => { // triple quoted strings input.next(); input.next(); while let Some(x) = input.next() { match x { '"' if input.peek_nth(0) == Some(&'"') && input.peek_nth(1) == Some(&'"') => { input.next(); input.next(); break; }, _ => buf.push(x) } } res.push(Lit(MultiLineString(String::from(&buf)))); }, (_, _) => { // single quoted strings while let Some(x) = input.next() { match x { '"' => break, '\\' => if let Some(y) = input.next() { buf.push(y) }, _ => buf.push(x) } } res.push(Lit(SingleLineString(String::from(&buf)))); } } }, '#' => { // comments! match input.peek() { Some('[') => { // block comment, can be nested input.next(); let mut comment_level = 1; while let Some(x) = input.next() && comment_level > 0 { match x { '#' if input.peek() == Some(&'[') => { comment_level += 1; input.next(); }, ']' if input.peek() == Some(&'#') => { comment_level -= 1; input.next(); }, _ => buf.push(x) } } res.push(Lit(MultiLineComment(String::from(&buf)))); }, Some(&'#') => { // documentation comment input.next(); while let Some(x) = input.next() { match x { '\n' => break, _ => { buf.push(x); } } } res.push(Lit(DocComment(String::from(&buf)))); }, _ => { // standard comment, runs til EOL while let Some(x) = input.next() { match x { '\n' => break, _ => { buf.push(x); } } } res.push(Lit(Comment(String::from(&buf)))); } } }, c if c.is_alphabetic() || c == '_' => { // valid identifiers! buf.push(c); while let Some(x) = input.next() { match x { x if x.is_alphanumeric() || x == '_' => buf.push(x), _ => { res.push(Word(String::from(&buf))); match x { // () and [] denote both parameters/generics and tuples/arrays '(' => { // we must disambiguate by treating those *directly* after words as such res.push(Sep(FuncLeftParen)); state.paren_stack.push(Paren::Func); }, '[' => { res.push(Sep(GenericLeftBracket)); state.bracket_stack.push(Bracket::Generic); }, _ => {}, } break; } } } }, '0'..='9' => { // numeric literals! buf.push(c); while let Some(x) = input.next() { match x { 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => { buf.push(x); input.next(); }, _ => break } } res.push(Num(String::from(&buf))) }, '-' => { // `-` is special. it can be the *prefix* operator "Negative", or part of a regular operator. match input.peek() { Some(' ') => res.push(Sep(Minus)), _ => res.push(Sep(Negative)) } }, '(' => { // note: FuncParens were matched above, directly after identifiers res.push(Sep(TupleLeftParen)); state.paren_stack.push(Paren::Tuple); }, '[' => { // note: GenericBrackets were matched above, directly after identifiers res.push(Sep(ArrayLeftBracket)); state.bracket_stack.push(Bracket::Array); }, ')' => { match state.paren_stack.pop() { Some(Paren::Func) => res.push(Sep(FuncRightParen)), Some(Paren::Tuple) => res.push(Sep(TupleRightParen)), None => return Err(MismatchedParens.into()), } }, ']' => { match state.bracket_stack.pop() { Some(Bracket::Generic) => res.push(Sep(GenericRightBracket)), Some(Bracket::Array) => res.push(Sep(ArrayRightBracket)), None => return Err(MismatchedBrackets.into()), } if input.peek() == Some(&'[') { // parameters following generics res.push(Sep(FuncLeftParen)); state.paren_stack.push(Paren::Func); input.next(); } }, ',' => res.push(Sep(Comma)), '.' => res.push(Sep(Period)), ';' => res.push(Sep(Semicolon)), ':' => res.push(Sep(Colon)), '`' => res.push(Sep(BackTick)), '{' => res.push(Sep(StructLeftBrace)), '}' => res.push(Sep(StructRightBrace)), '=' => res.push(Sep(Equals)), '+' => res.push(Sep(Plus)), '*' => res.push(Sep(Times)), '/' => res.push(Sep(Slash)), '<' => res.push(Sep(LessThan)), '>' => res.push(Sep(GreaterThan)), '@' => res.push(Sep(At)), '$' => res.push(Sep(Sha)), '~' => res.push(Sep(Tilde)), '&' => res.push(Sep(And)), '|' => res.push(Sep(Or)), '!' => res.push(Sep(Exclamation)), '?' => res.push(Sep(Question)), '^' => res.push(Sep(Caret)), '\\' => res.push(Sep(Backslash)), _ => return Err(UnknownPunctuation.into()) } buf.clear(); } Ok(TokenStream(res)) } impl std::fmt::Display for TokenStream { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use Token::*; let mut prev_token = Indent(0); for token in &self.0 { match (&prev_token, &token) { (Word(_), Word(_)) | (Word(_), Num(_)) | (Num(_), Word(_)) | (Num(_), Num(_)) => write!(f, " {}", token)?, _ => write!(f, "{}", token)?, } prev_token = token.clone(); } Ok(()) } } impl std::fmt::Display for Token { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use Token::*; match self { Word(token) => write!(f, "{}", token), Num(token) => write!(f, "{}", token), Lit(lit) => write!(f, "{}", lit), Sep(sep) => write!(f, "{}", sep), Indent(i) => write!(f, "\n{}", " ".repeat(*i)), } } } impl std::fmt::Display for Literal { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use Literal::*; match self { Char(token) => write!(f, "'{}'", token), SingleLineString(token) => write!(f, "\"{}\"", token), MultiLineString(token) => write!(f, "\"\"\"{}\"\"\"", token), Comment(token) => write!(f, "#{}", token), DocComment(token) => write!(f, "##{}", token), MultiLineComment(token) => write!(f, "#[{}]#", token), } } } impl std::fmt::Display for Punctuation { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use Punctuation::*; match self { Comma => write!(f, ","), Period => write!(f, "."), Semicolon => write!(f, ";"), Colon => write!(f, ":"), BackTick => write!(f, "`"), SingleQuote => write!(f, "'"), DoubleQuote => write!(f, "\""), FuncLeftParen => write!(f, "("), FuncRightParen => write!(f, ")"), TupleLeftParen => write!(f, " ("), TupleRightParen => write!(f, ")"), GenericLeftBracket => write!(f, "["), GenericRightBracket => write!(f, "]"), ArrayLeftBracket => write!(f, " ["), ArrayRightBracket => write!(f, "]"), StructLeftBrace => write!(f, "{{"), StructRightBrace => write!(f, "}}"), Equals => write!(f, "="), Plus => write!(f, "+"), Minus => write!(f, "- "), Negative => write!(f, "-"), Times => write!(f, "*"), Slash => write!(f, "/"), LessThan => write!(f, "<"), GreaterThan => write!(f, ">"), At => write!(f, "@"), Sha => write!(f, "$"), Tilde => write!(f, "~"), And => write!(f, "&"), Percent => write!(f, "%"), Or => write!(f, "|"), Exclamation => write!(f, "!"), Question => write!(f, "?"), Caret => write!(f, "^"), Backslash => write!(f, "\\"), } } }