diff options
Diffstat (limited to 'src/frontend/lex.rs')
-rw-r--r-- | src/frontend/lex.rs | 542 |
1 files changed, 542 insertions, 0 deletions
//! Lexer for the whitespace-sensitive front end.
//!
//! [`tokenize`] turns source text into a flat [`TokenStream`]; the `Display`
//! implementations in this file print a stream back out as text.

use std::iter::Peekable;
use std::str::Chars;

/// Result type used by the lexer; every error is boxed as a trait object.
pub type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;

/// Character source used by the lexer helpers.
///
/// `Peekable<Chars>` is cheap to clone, which is how the lexer looks more than
/// one character ahead (see [`next_two_are`]).
type CharStream<'a> = Peekable<Chars<'a>>;

/// The ordered sequence of tokens produced by [`tokenize`].
#[derive(Clone, PartialEq, Debug)]
pub struct TokenStream(Vec<Token>);

impl IntoIterator for TokenStream {
    type Item = Token;
    type IntoIter = std::vec::IntoIter<Token>;

    /// Consumes the stream and yields its tokens in source order.
    fn into_iter(self) -> Self::IntoIter {
        self.0.into_iter()
    }
}

/// Errors reported by [`tokenize`].
#[derive(Clone, PartialEq, Debug)]
pub enum LexicalError {
    /// A tab character was found where a token was expected.
    InvalidIndentation,
    /// A `)` was found with no open parenthesis to match it.
    MismatchedParens,
    /// A `]` was found with no open bracket to match it.
    MismatchedBrackets,
    /// A character that starts no known token was found.
    UnknownPunctuation,
}

impl std::fmt::Display for LexicalError {
    /// Prints the variant name (same text as the `Debug` output).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{:?}", self)
    }
}

impl std::error::Error for LexicalError {}

/// **Basic** syntax tokens. Form an unambiguous TokenStream.
#[derive(Clone, PartialEq, Debug)]
pub enum Token {
    /// Keyword identifiers.
    Key(Keyword),
    /// Non-keyword identifiers.
    Word(String),
    /// Numeric value, ex. 413, 0b101011, 0xabcd. Stored as written.
    Num(String),
    /// Literal value, ex. for strings/comments.
    Lit(Literal),
    /// Punctuation: non-word tokens. Operators are lexed as this and later
    /// transformed to words.
    Sep(Punctuation),
    /// Indentation. Denotes a line break and the column at which the next
    /// line starts.
    Indent(usize),
}

/// Literal tokens. Each variant holds the text between its delimiters;
/// backslash escapes in `Char` and `SingleLineString` are kept exactly as
/// written (backslash included).
#[derive(Clone, PartialEq, Debug)]
pub enum Literal {
    /// `'...'`
    Char(String),
    /// `"..."`
    SingleLineString(String),
    /// `"""..."""`
    MultiLineString(String),
    /// `# ...` up to (not including) the end of the line.
    Comment(String),
    /// `## ...` up to (not including) the end of the line.
    DocComment(String),
    /// `#[ ... ]#`, may be nested.
    MultiLineComment(String),
}

/// Keywords, made explicit so they can be matched as enum variants rather
/// than as strings.
#[derive(Clone, PartialEq, Debug)]
pub enum Keyword {
    Pub, Let, Var, Const,
    Func, Macro, Type,
    Mod, From, Import,
    For, While, Loop,
    Block, Static,
    If, When, Elif, Else, Match,
    Try, Catch, Finally,
    Struct, Tuple, Enum, Union, Interface,
    Distinct, Ref, // todo: Mut once figured out
    Break, Continue, Return,
    In, Is, Of, As,
}

impl Keyword {
    /// Returns the keyword spelled by `word`, or `None` for any other identifier.
    fn from_word(word: &str) -> Option<Keyword> {
        use self::Keyword::*;
        Some(match word {
            "pub" => Pub,
            "let" => Let,
            "var" => Var,
            "const" => Const,
            "func" => Func,
            "macro" => Macro,
            "type" => Type,
            "mod" => Mod,
            "from" => From,
            "import" => Import,
            "for" => For,
            "while" => While,
            "loop" => Loop,
            "block" => Block,
            "static" => Static,
            "if" => If,
            "when" => When,
            "elif" => Elif,
            "else" => Else,
            "match" => Match,
            "try" => Try,
            "catch" => Catch,
            "finally" => Finally,
            "struct" => Struct,
            "tuple" => Tuple,
            "enum" => Enum,
            "union" => Union,
            "interface" => Interface,
            "distinct" => Distinct,
            "ref" => Ref,
            "break" => Break,
            "continue" => Continue,
            "return" => Return,
            "in" => In,
            "is" => Is,
            "of" => Of,
            "as" => As,
            _ => return None,
        })
    }

    /// Returns the source spelling of this keyword.
    fn as_str(&self) -> &'static str {
        use self::Keyword::*;
        match self {
            Pub => "pub",
            Let => "let",
            Var => "var",
            Const => "const",
            Func => "func",
            Macro => "macro",
            Type => "type",
            Mod => "mod",
            From => "from",
            Import => "import",
            For => "for",
            While => "while",
            Loop => "loop",
            Block => "block",
            Static => "static",
            If => "if",
            When => "when",
            Elif => "elif",
            Else => "else",
            Match => "match",
            Try => "try",
            Catch => "catch",
            Finally => "finally",
            Struct => "struct",
            Tuple => "tuple",
            Enum => "enum",
            Union => "union",
            Interface => "interface",
            Distinct => "distinct",
            Ref => "ref",
            Break => "break",
            Continue => "continue",
            Return => "return",
            In => "in",
            Is => "is",
            Of => "of",
            As => "as",
        }
    }
}

/// All punctuation recognized by the lexer.
/// Note the distinction between FuncLeftParen and TupleLeftParen.
#[derive(Clone, PartialEq, Debug)]
pub enum Punctuation {
    Comma,               // ,
    Period,              // .
    Semicolon,           // ;
    Colon,               // :
    BackTick,            // `
    SingleQuote,         // '
    DoubleQuote,         // "
    FuncLeftParen,       // (
    FuncRightParen,      // )
    TupleLeftParen,      // (
    TupleRightParen,     // )
    GenericLeftBracket,  // [
    GenericRightBracket, // ]
    ArrayLeftBracket,    // [
    ArrayRightBracket,   // ]
    StructLeftBrace,     // {
    StructRightBrace,    // }
    Equals,              // =
    Plus,                // +
    Minus,               // `-` followed by a space: the binary operator.
    Negative,            // `-` not followed by a space: binds tightly as a prefix.
    Times,               // *
    Slash,               // /
    LessThan,            // <
    GreaterThan,         // >
    At,                  // @
    Sha,                 // $
    Tilde,               // ~
    And,                 // &
    Percent,             // %
    Or,                  // |
    Exclamation,         // !
    Question,            // ?
    Caret,               // ^
    Backslash,           // \
}

/// Maps a character that is always a single, context-free punctuation token
/// to that token. Returns `None` for everything else (including `-`, quotes,
/// parentheses and brackets, which need context and are handled by
/// [`tokenize`] directly).
fn simple_punctuation(c: char) -> Option<Punctuation> {
    use self::Punctuation::*;
    Some(match c {
        ',' => Comma,
        '.' => Period,
        ';' => Semicolon,
        ':' => Colon,
        '{' => StructLeftBrace,
        '}' => StructRightBrace,
        '=' => Equals,
        '+' => Plus,
        '*' => Times,
        '/' => Slash,
        '<' => LessThan,
        '>' => GreaterThan,
        '@' => At,
        '$' => Sha,
        '~' => Tilde,
        '&' => And,
        '%' => Percent,
        '|' => Or,
        '!' => Exclamation,
        '?' => Question,
        '^' => Caret,
        '\\' => Backslash,
        _ => return None,
    })
}

/// Returns true when the next two characters of `chars` are both `c`.
/// Nothing is consumed: the look-ahead runs on a clone of the iterator.
fn next_two_are(chars: &CharStream<'_>, c: char) -> bool {
    let mut ahead = chars.clone();
    ahead.next() == Some(c) && ahead.next() == Some(c)
}

/// Reads the body of a `quote`-delimited literal whose opening quote has
/// already been consumed, and consumes the closing quote.
///
/// A backslash and the character after it are copied through verbatim, so an
/// escaped quote does not end the literal and no escape information is lost.
/// An unterminated literal simply runs to the end of the input.
fn read_quoted(chars: &mut CharStream<'_>, quote: char) -> String {
    let mut text = String::new();
    while let Some(c) = chars.next() {
        match c {
            q if q == quote => break,
            '\\' => {
                text.push('\\');
                if let Some(escaped) = chars.next() {
                    text.push(escaped);
                }
            }
            _ => text.push(c),
        }
    }
    text
}

/// Reads everything up to, but not including, the next `'\n'` (or the end of
/// the input). The newline is left in the stream so it still produces an
/// `Indent` token.
fn read_until_eol(chars: &mut CharStream<'_>) -> String {
    let mut text = String::new();
    while let Some(c) = chars.next_if(|c| *c != '\n') {
        text.push(c);
    }
    text
}

/// Reads the body of a `#[ ... ]#` comment whose opening `#[` has already
/// been consumed, and consumes the matching `]#`.
///
/// Comments nest; the delimiters of inner comments are kept in the returned
/// text, only the outermost pair is stripped. Nothing past the closing `]#`
/// is consumed. An unterminated comment runs to the end of the input.
fn read_block_comment(chars: &mut CharStream<'_>) -> String {
    let mut text = String::new();
    let mut depth = 1usize;
    while depth > 0 {
        let c = match chars.next() {
            Some(c) => c,
            None => break,
        };
        match c {
            '#' if chars.peek() == Some(&'[') => {
                chars.next();
                depth += 1;
                text.push_str("#[");
            }
            ']' if chars.peek() == Some(&'#') => {
                chars.next();
                depth -= 1;
                if depth > 0 {
                    text.push_str("]#");
                }
            }
            _ => text.push(c),
        }
    }
    text
}

/// Decides whether an indented line continues the previous one, given the
/// last token of that previous line. When it does, no `Indent` token is
/// emitted for the new line.
fn continues_previous_line(last: Option<&Token>) -> bool {
    use self::Punctuation::*;
    match last {
        // Word-shaped operators. `in` and `is` are lexed as keywords, so they
        // are matched as `Key` below; the strings are kept for tokens that a
        // later pass may have rewritten into words.
        Some(Token::Word(w)) => {
            matches!(w.as_str(), "==" | "and" | "or" | "xor" | "in" | "is")
        }
        Some(Token::Key(Keyword::In)) | Some(Token::Key(Keyword::Is)) => true,
        Some(Token::Sep(p)) => matches!(
            p,
            FuncLeftParen
                | TupleLeftParen
                | GenericLeftBracket
                | ArrayLeftBracket
                | StructLeftBrace
                | Comma
        ),
        _ => false,
    }
}

/// Parses whitespace-sensitive code into an unambiguous TokenStream.
/// Also useful for formatting.
///
/// Every `'\n'` produces an `Indent` token; leading spaces on the following
/// line replace it with `Indent(n)`, unless the previous line ended in a
/// token that continues onto the next line (see
/// [`continues_previous_line`]). All other whitespace is discarded.
///
/// `(` and `[` directly after an identifier, keyword, or backtick are lexed
/// as `FuncLeftParen` / `GenericLeftBracket`, and `(` directly after `]` as
/// `FuncLeftParen`; anywhere else they are `TupleLeftParen` /
/// `ArrayLeftBracket`. The matching closer takes the flavour of its opener.
///
/// # Errors
///
/// * [`LexicalError::InvalidIndentation`] for a tab where a token is expected.
/// * [`LexicalError::MismatchedParens`] / [`LexicalError::MismatchedBrackets`]
///   for a closing `)` / `]` without an opener. Openers that are never closed
///   are not reported.
/// * [`LexicalError::UnknownPunctuation`] for any other unrecognized character.
pub fn tokenize(input: &str) -> Result<TokenStream> {
    // Each token kind is consumed to completion inside its own match arm,
    // peeking ahead on the character stream as needed. The only state carried
    // between tokens is the two delimiter stacks below.
    use self::LexicalError::*;
    use self::Literal::*;
    use self::Punctuation::*;
    use self::Token::*;

    enum Paren { Func, Tuple }
    enum Bracket { Generic, Array }

    let mut paren_stack: Vec<Paren> = Vec::new();
    let mut bracket_stack: Vec<Bracket> = Vec::new();
    let mut res: Vec<Token> = Vec::new();

    let mut input = input.chars().peekable();
    while let Some(c) = input.next() {
        match c {
            ' ' => {
                if matches!(res.last(), Some(Indent(_))) {
                    // Start of a line: swap the placeholder Indent pushed by
                    // the newline for the real indentation level.
                    res.pop();
                    let mut level = 1;
                    while input.next_if_eq(&' ').is_some() {
                        level += 1;
                    }
                    // Trailing spaces at end of input produce no Indent.
                    if input.peek().is_some() && !continues_previous_line(res.last()) {
                        res.push(Indent(level));
                    }
                } else {
                    // Whitespace between tokens carries no meaning: drop it
                    // all, but leave the newline for the '\n' arm.
                    while input.next_if(|x| x.is_whitespace() && *x != '\n').is_some() {}
                }
            }
            '\t' => return Err(InvalidIndentation.into()),
            '\n' => res.push(Indent(0)),
            '\'' => res.push(Lit(Char(read_quoted(&mut input, '\'')))),
            '"' => {
                if next_two_are(&input, '"') {
                    // Triple-quoted string: runs until the next `"""`.
                    input.next();
                    input.next();
                    let mut text = String::new();
                    while let Some(x) = input.next() {
                        if x == '"' && next_two_are(&input, '"') {
                            input.next();
                            input.next();
                            break;
                        }
                        text.push(x);
                    }
                    res.push(Lit(MultiLineString(text)));
                } else {
                    res.push(Lit(SingleLineString(read_quoted(&mut input, '"'))));
                }
            }
            '#' => match input.peek() {
                Some('[') => {
                    input.next();
                    res.push(Lit(MultiLineComment(read_block_comment(&mut input))));
                }
                Some('#') => {
                    input.next();
                    res.push(Lit(DocComment(read_until_eol(&mut input))));
                }
                _ => res.push(Lit(Comment(read_until_eol(&mut input)))),
            },
            c if c.is_alphabetic() || c == '_' => {
                let mut ident = String::from(c);
                while let Some(x) = input.next_if(|x| x.is_alphanumeric() || *x == '_') {
                    ident.push(x);
                }
                // Pushed unconditionally, so an identifier that ends the
                // input is not lost.
                res.push(match Keyword::from_word(&ident) {
                    Some(keyword) => Key(keyword),
                    None => Word(ident),
                });
                // () and [] denote both parameters/generics and tuples/arrays;
                // those *directly* after a word are parameters/generics.
                match input.peek() {
                    Some('(') => {
                        input.next();
                        res.push(Sep(FuncLeftParen));
                        paren_stack.push(Paren::Func);
                    }
                    Some('[') => {
                        input.next();
                        res.push(Sep(GenericLeftBracket));
                        bracket_stack.push(Bracket::Generic);
                    }
                    _ => {}
                }
            }
            '0'..='9' => {
                // Letters are accepted so that prefixes and hex digits
                // (0b101, 0xabcd) stay part of the same token.
                let mut number = String::from(c);
                while let Some(x) = input.next_if(|x| x.is_ascii_alphanumeric() || *x == '_') {
                    number.push(x);
                }
                res.push(Num(number));
            }
            // `-` is the binary operator only when a space follows it.
            '-' => res.push(Sep(if input.peek() == Some(&' ') { Minus } else { Negative })),
            '(' => {
                // FuncLeftParen is produced where an identifier, `]` or
                // backtick is lexed; a paren seen here is a tuple.
                res.push(Sep(TupleLeftParen));
                paren_stack.push(Paren::Tuple);
            }
            '[' => {
                // Likewise, a bracket seen here is an array.
                res.push(Sep(ArrayLeftBracket));
                bracket_stack.push(Bracket::Array);
            }
            ')' => {
                let closer = match paren_stack.pop().ok_or(MismatchedParens)? {
                    Paren::Func => FuncRightParen,
                    Paren::Tuple => TupleRightParen,
                };
                res.push(Sep(closer));
            }
            ']' => {
                let closer = match bracket_stack.pop().ok_or(MismatchedBrackets)? {
                    Bracket::Generic => GenericRightBracket,
                    Bracket::Array => ArrayRightBracket,
                };
                res.push(Sep(closer));
                // Parameters following generics: `name[T](args)`.
                if input.next_if_eq(&'(').is_some() {
                    res.push(Sep(FuncLeftParen));
                    paren_stack.push(Paren::Func);
                }
            }
            '`' => {
                res.push(Sep(BackTick));
                match input.peek() {
                    Some('(') => {
                        input.next();
                        res.push(Sep(FuncLeftParen));
                        paren_stack.push(Paren::Func);
                    }
                    Some('[') => {
                        input.next();
                        res.push(Sep(GenericLeftBracket));
                        bracket_stack.push(Bracket::Generic);
                    }
                    _ => {}
                }
            }
            other => res.push(Sep(simple_punctuation(other).ok_or(UnknownPunctuation)?)),
        }
    }
    Ok(TokenStream(res))
}

/// True for tokens that would fuse into a single token if printed
/// back-to-back without a separator.
fn is_wordlike(token: &Token) -> bool {
    matches!(token, Token::Key(_) | Token::Word(_) | Token::Num(_))
}

impl std::fmt::Display for TokenStream {
    /// Prints the tokens in order, inserting a single space only between two
    /// adjacent keyword/word/number tokens.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut prev: Option<&Token> = None;
        for token in &self.0 {
            if prev.is_some_and(is_wordlike) && is_wordlike(token) {
                f.write_str(" ")?;
            }
            write!(f, "{}", token)?;
            prev = Some(token);
        }
        Ok(())
    }
}

impl std::fmt::Display for Token {
    /// Prints the token's source text; `Indent(n)` prints a newline followed
    /// by `n` spaces.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        use self::Token::*;
        match self {
            Key(word) => write!(f, "{}", word),
            Word(val) => write!(f, "{}", val),
            Num(val) => write!(f, "{}", val),
            Lit(lit) => write!(f, "{}", lit),
            Sep(sep) => write!(f, "{}", sep),
            Indent(i) => write!(f, "\n{}", " ".repeat(*i)),
        }
    }
}

impl std::fmt::Display for Literal {
    /// Prints the literal's text wrapped in the delimiters of its kind.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        use self::Literal::*;
        match self {
            Char(val) => write!(f, "'{}'", val),
            SingleLineString(val) => write!(f, "\"{}\"", val),
            MultiLineString(val) => write!(f, "\"\"\"{}\"\"\"", val),
            Comment(val) => write!(f, "#{}", val),
            DocComment(val) => write!(f, "##{}", val),
            MultiLineComment(val) => write!(f, "#[{}]#", val),
        }
    }
}

impl std::fmt::Display for Keyword {
    /// Prints the keyword as it is spelled in source.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}

impl std::fmt::Display for Punctuation {
    /// Prints the punctuation character. Tuple/array openers are preceded by
    /// a space and `Minus` is followed by one, which keeps them
    /// distinguishable from their Func/Generic/`Negative` counterparts when
    /// the output is lexed again.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        use self::Punctuation::*;
        f.write_str(match self {
            Comma => ",",
            Period => ".",
            Semicolon => ";",
            Colon => ":",
            BackTick => "`",
            SingleQuote => "'",
            DoubleQuote => "\"",
            FuncLeftParen => "(",
            FuncRightParen => ")",
            TupleLeftParen => " (",
            TupleRightParen => ")",
            GenericLeftBracket => "[",
            GenericRightBracket => "]",
            ArrayLeftBracket => " [",
            ArrayRightBracket => "]",
            StructLeftBrace => "{",
            StructRightBrace => "}",
            Equals => "=",
            Plus => "+",
            Minus => "- ",
            Negative => "-",
            Times => "*",
            Slash => "/",
            LessThan => "<",
            GreaterThan => ">",
            At => "@",
            Sha => "$",
            Tilde => "~",
            And => "&",
            Percent => "%",
            Or => "|",
            Exclamation => "!",
            Question => "?",
            Caret => "^",
            Backslash => "\\",
        })
    }
}