1 files changed, 542 insertions, 0 deletions
diff --git a/src/frontend/lex.rs b/src/frontend/lex.rs
new file mode 100644
index 0000000..771ba38
--- /dev/null
+++ b/src/frontend/lex.rs
@@ -0,0 +1,542 @@
+use multipeek::multipeek;
+
+/// Result alias used throughout the lexer: the error side is any boxed `std::error::Error`.
+pub type Result<T> = core::result::Result<T, Box<dyn std::error::Error>>;
+/// An ordered sequence of lexed [`Token`]s, stored as a plain `Vec<Token>`.
+pub struct TokenStream(Vec<Token>);
+
+// Consuming iteration hands out the wrapped tokens in their stored order.
+impl IntoIterator for TokenStream {
+    type Item = Token;
+    type IntoIter = std::vec::IntoIter<Token>;
+
+    // Delegates directly to the inner vector's owning iterator.
+    fn into_iter(self) -> Self::IntoIter {
+        self.0.into_iter()
+    }
+}
+
+/// Failures the lexer can report while scanning source text.
+#[derive(Clone, PartialEq, Debug)]
+pub enum LexicalError {
+    InvalidIndentation,
+    MismatchedParens,
+    MismatchedBrackets,
+    UnknownPunctuation,
+}
+
+impl std::error::Error for LexicalError {}
+
+impl std::fmt::Display for LexicalError {
+    /// Renders the error as its bare variant name (the same text `Debug` produces).
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let variant_name = match self {
+            Self::InvalidIndentation => "InvalidIndentation",
+            Self::MismatchedParens => "MismatchedParens",
+            Self::MismatchedBrackets => "MismatchedBrackets",
+            Self::UnknownPunctuation => "UnknownPunctuation",
+        };
+        f.write_str(variant_name)
+    }
+}
+
+/// **Basic** syntax tokens. Form an unambiguous TokenStream.
+///
+/// `Debug` and `Eq` are derived so tokens can be printed with `{:?}` and compared
+/// in assertions.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub enum Token {
+    Key(Keyword),   // keyword identifiers.
+    Word(String),   // non-keyword identifiers.
+    Num(String),    // numeric value, ex. 413, 0b101011, 0xabcd
+    Lit(Literal),   // literal value, ex. for strings/comments.
+    Sep(Punctuation),   // punctuation. non-word tokens. operators are lexed as this and later transformed to words.
+    Indent(usize),      // indentation. denotes line breaks and scope at which a line starts.
+}
+
+/// Literal tokens. Each variant carries the text found between its delimiters;
+/// the delimiters themselves are not stored.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub enum Literal {
+    Char(String),             // '...'
+    SingleLineString(String), // "..."
+    MultiLineString(String),  // """..."""
+    Comment(String),          // #...   (runs to end of line)
+    DocComment(String),       // ##...  (runs to end of line)
+    MultiLineComment(String), // #[...]#
+}
+
+/// Keywords, made explicit for easier use with Rust.
+/// (Matching on enum variants is less error-prone than matching on string literals.)
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum Keyword {
+    Pub, Let, Var, Const,
+    Func, Macro, Type,
+    Mod, From, Import,
+    For, While, Loop,
+    Block, Static,
+    If, When, Elif, Else, Match,
+    Try, Catch, Finally,
+    Struct, Tuple, Enum, Union, Interface,
+    Distinct, Ref, // todo: Mut once figured out
+    Break, Continue, Return,
+    In, Is, Of, As,
+}
+
+/// All punctuation recognized by the lexer.
+/// Note the distinction between FuncLeftParen and TupleLeftParen.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum Punctuation {
+    Comma,               // ,
+    Period,              // .
+    Semicolon,           // ;
+    Colon,               // :
+    BackTick,            // `
+    SingleQuote,         // '
+    DoubleQuote,         // "
+    FuncLeftParen,       // (
+    FuncRightParen,      // )
+    TupleLeftParen,      // (
+    TupleRightParen,     // )
+    GenericLeftBracket,  // [
+    GenericRightBracket, // ]
+    ArrayLeftBracket,    // [
+    ArrayRightBracket,   // ]
+    StructLeftBrace,     // {
+    StructRightBrace,    // }
+    Equals,      // =
+    Plus,        // +
+    Minus,       // - followed by a space. distinction between minus and negative.
+    Negative,    // - with no space following: negative binds tightly.
+    Times,       // *
+    Slash,       // /
+    LessThan,    // <
+    GreaterThan, // >
+    At,          // @
+    Sha,         // $
+    Tilde,       // ~
+    And,         // &
+    Percent,     // %
+    Or,          // |
+    Exclamation, // !
+    Question,    // ?
+    Caret,       // ^
+    Backslash,   // \
+}
+
+/// Parses whitespace-sensitive code into an unambiguous TokenStream.
+/// Also useful for formatting.
+///
+/// Scanning is a single pass over the characters of `input`. Every token kind is
+/// consumed to completion inside its own match arm using the iterator's lookahead,
+/// so the only state shared between tokens is the output vector and the two stacks
+/// that pair each closing delimiter with the kind of opener it ends.
+///
+/// # Errors
+///
+/// * `LexicalError::InvalidIndentation` - a tab is encountered where a token is expected.
+/// * `LexicalError::MismatchedParens` - a `)` appears with no open `(`.
+/// * `LexicalError::MismatchedBrackets` - a `]` appears with no open `[`.
+/// * `LexicalError::UnknownPunctuation` - any other character no arm recognizes.
+///
+/// Openers that are still unclosed at end of input are not reported here.
+pub fn tokenize(input: &str) -> Result<TokenStream> {
+    use Token::*;
+    use Literal::*;
+    use Punctuation::*;
+    use LexicalError::*;
+    enum Paren { Func, Tuple }
+    enum Bracket { Generic, Array }
+
+    // Openers seen so far; each closer pops one entry to learn which flavour it ends.
+    let mut paren_stack: Vec<Paren> = Vec::new();
+    let mut bracket_stack: Vec<Bracket> = Vec::new();
+
+    // Scratch text for the token currently being built; cleared after every iteration.
+    let mut buf = String::new();
+    let mut res: Vec<Token> = Vec::new();
+
+    let mut input = multipeek(input.chars());
+    while let Some(c) = input.next() {
+        match c {
+            ' ' => { // indentation! and whitespace
+                if matches!(res.last(), Some(Indent(_))) {
+                    // Spaces right after a line break are indentation. The Indent pushed
+                    // for the newline is replaced by one carrying the measured width.
+                    res.pop();
+                    let mut indent_level = 1;
+                    while input.peek() == Some(&' ') {
+                        indent_level += 1;
+                        input.next();
+                    }
+                    // A line that follows an opener, a comma or a trailing word operator
+                    // continues the previous line, so it gets no Indent token at all.
+                    // NOTE(review): "==", "in" and "is" can never match here - "in"/"is"
+                    // are lexed as Key and "==" is never lexed as a Word. Left as-is
+                    // pending a decision on the intended continuation rules.
+                    let continues_previous_line = match res.last() {
+                        Some(Word(a)) => matches!(a.as_str(), "==" | "and" | "or" | "xor" | "in" | "is"),
+                        Some(Sep(FuncLeftParen | TupleLeftParen | GenericLeftBracket |
+                                 ArrayLeftBracket | StructLeftBrace | Comma)) => true,
+                        _ => false,
+                    };
+                    // Spaces that run to end of input produce no Indent.
+                    if input.peek().is_some() && !continues_previous_line {
+                        res.push(Indent(indent_level));
+                    }
+                } else {
+                    // get rid of excess (all) whitespace between words/operators
+                    while input.peek().is_some_and(|x| x.is_whitespace() && x != &'\n') { input.next(); }
+                }
+            },
+            '\t' => return Err(InvalidIndentation.into()),
+            '\n' => res.push(Indent(0)),
+            '\'' => { // chars!
+                // A backslash makes the next character literal; the backslash itself is
+                // not stored. An unterminated literal runs to end of input.
+                while let Some(x) = input.next() {
+                    match x {
+                        '\'' => break,
+                        '\\' => if let Some(y) = input.next() { buf.push(y) },
+                        _ => buf.push(x)
+                    }
+                }
+                res.push(Lit(Char(buf.clone())));
+            },
+            '"' => { // strings!
+                if input.peek_nth(0) == Some(&'"') && input.peek_nth(1) == Some(&'"') {
+                    // triple quoted strings: raw text up to the next `"""`
+                    input.next(); input.next();
+                    while let Some(x) = input.next() {
+                        if x == '"' && input.peek_nth(0) == Some(&'"') && input.peek_nth(1) == Some(&'"') {
+                            input.next(); input.next();
+                            break;
+                        }
+                        buf.push(x);
+                    }
+                    res.push(Lit(MultiLineString(buf.clone())));
+                } else {
+                    // single quoted strings: same escape rule as char literals
+                    while let Some(x) = input.next() {
+                        match x {
+                            '"' => break,
+                            '\\' => if let Some(y) = input.next() { buf.push(y) },
+                            _ => buf.push(x)
+                        }
+                    }
+                    res.push(Lit(SingleLineString(buf.clone())));
+                }
+            },
+            '#' => { // comments!
+                match input.peek() {
+                    Some('[') => { // block comment, can be nested
+                        input.next();
+                        let mut comment_level = 1;
+                        // The depth is tested BEFORE pulling the next character, so the
+                        // character following the final `]#` stays in the input.
+                        // Nested `#[` / `]#` markers only adjust the depth; they are not
+                        // copied into the comment text.
+                        while comment_level > 0 {
+                            match input.next() {
+                                Some('#') if input.peek() == Some(&'[') => {
+                                    comment_level += 1;
+                                    input.next();
+                                },
+                                Some(']') if input.peek() == Some(&'#') => {
+                                    comment_level -= 1;
+                                    input.next();
+                                },
+                                Some(x) => buf.push(x),
+                                None => break, // unterminated comment: stop at end of input
+                            }
+                        }
+                        res.push(Lit(MultiLineComment(buf.clone())));
+                    },
+                    Some('#') => { // documentation comment, runs til EOL
+                        input.next();
+                        while let Some(&x) = input.peek() {
+                            if x == '\n' { break; } // the newline is left for the '\n' arm
+                            buf.push(x);
+                            input.next();
+                        }
+                        res.push(Lit(DocComment(buf.clone())));
+                    },
+                    _ => { // standard comment, runs til EOL
+                        while let Some(&x) = input.peek() {
+                            if x == '\n' { break; } // the newline is left for the '\n' arm
+                            buf.push(x);
+                            input.next();
+                        }
+                        res.push(Lit(Comment(buf.clone())));
+                    }
+                }
+            },
+            c if c.is_alphabetic() || c == '_' => { // valid identifiers!
+                buf.push(c);
+                while let Some(&x) = input.peek() {
+                    if !(x.is_alphanumeric() || x == '_') { break; }
+                    buf.push(x);
+                    input.next();
+                }
+                // Emitted after the scan loop so an identifier that ends exactly at end
+                // of input is still produced.
+                res.push(match lookup_keyword(&buf) {
+                    Some(keyword) => Key(keyword),
+                    None => Word(buf.clone()),
+                });
+                // () and [] denote both parameters/generics and tuples/arrays.
+                // we must disambiguate by treating those *directly* after words as such
+                match input.peek() {
+                    Some('(') => {
+                        res.push(Sep(FuncLeftParen));
+                        paren_stack.push(Paren::Func);
+                        input.next();
+                    },
+                    Some('[') => {
+                        res.push(Sep(GenericLeftBracket));
+                        bracket_stack.push(Bracket::Generic);
+                        input.next();
+                    },
+                    _ => {},
+                }
+            },
+            '0'..='9' => { // numeric literals!
+                buf.push(c);
+                // Letters are accepted so prefixes and hex digits (0b.., 0x..) stay in
+                // the same token; the text is not validated here.
+                while let Some(&x) = input.peek() {
+                    match x {
+                        'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => {
+                            buf.push(x);
+                            input.next();
+                        },
+                        _ => break
+                    }
+                }
+                res.push(Num(buf.clone()))
+            },
+            '-' => { // `-` is special. it can be the *prefix* operator "Negative", or part of a regular operator.
+                let sep = if input.peek() == Some(&' ') { Minus } else { Negative };
+                res.push(Sep(sep));
+            },
+            '(' => { // note: FuncParens were matched above, directly after identifiers
+                res.push(Sep(TupleLeftParen));
+                paren_stack.push(Paren::Tuple);
+            },
+            '[' => { // note: GenericBrackets were matched above, directly after identifiers
+                res.push(Sep(ArrayLeftBracket));
+                bracket_stack.push(Bracket::Array);
+            },
+            ')' => {
+                match paren_stack.pop() {
+                    Some(Paren::Func) => res.push(Sep(FuncRightParen)),
+                    Some(Paren::Tuple) => res.push(Sep(TupleRightParen)),
+                    None => return Err(MismatchedParens.into()),
+                }
+            },
+            ']' => {
+                match bracket_stack.pop() {
+                    Some(Bracket::Generic) => res.push(Sep(GenericRightBracket)),
+                    Some(Bracket::Array) => res.push(Sep(ArrayRightBracket)),
+                    None => return Err(MismatchedBrackets.into()),
+                }
+                if input.peek() == Some(&'(') { // parameters following generics
+                    res.push(Sep(FuncLeftParen));
+                    paren_stack.push(Paren::Func);
+                    input.next();
+                }
+            },
+            '`' => {
+                res.push(Sep(BackTick));
+                // An opener directly after a backtick is treated like one directly
+                // after an identifier.
+                match input.peek() {
+                    Some('(') => {
+                        res.push(Sep(FuncLeftParen));
+                        paren_stack.push(Paren::Func);
+                        input.next();
+                    },
+                    Some('[') => {
+                        res.push(Sep(GenericLeftBracket));
+                        bracket_stack.push(Bracket::Generic);
+                        input.next();
+                    },
+                    _ => {}
+                }
+            },
+            ',' => res.push(Sep(Comma)),
+            '.' => res.push(Sep(Period)),
+            ';' => res.push(Sep(Semicolon)),
+            ':' => res.push(Sep(Colon)),
+            '{' => res.push(Sep(StructLeftBrace)),
+            '}' => res.push(Sep(StructRightBrace)),
+            '=' => res.push(Sep(Equals)),
+            '+' => res.push(Sep(Plus)),
+            '*' => res.push(Sep(Times)),
+            '/' => res.push(Sep(Slash)),
+            '<' => res.push(Sep(LessThan)),
+            '>' => res.push(Sep(GreaterThan)),
+            '@' => res.push(Sep(At)),
+            '$' => res.push(Sep(Sha)),
+            '~' => res.push(Sep(Tilde)),
+            '&' => res.push(Sep(And)),
+            '%' => res.push(Sep(Percent)),
+            '|' => res.push(Sep(Or)),
+            '!' => res.push(Sep(Exclamation)),
+            '?' => res.push(Sep(Question)),
+            '^' => res.push(Sep(Caret)),
+            '\\' => res.push(Sep(Backslash)),
+            _ => return Err(UnknownPunctuation.into())
+        }
+        buf.clear();
+    }
+    Ok(TokenStream(res))
+}
+
+/// Maps an identifier to its reserved [`Keyword`], or `None` for an ordinary word.
+fn lookup_keyword(word: &str) -> Option<Keyword> {
+    use Keyword::*;
+    Some(match word {
+        "pub" => Pub,
+        "let" => Let,
+        "var" => Var,
+        "const" => Const,
+        "func" => Func,
+        "macro" => Macro,
+        "type" => Type,
+        "mod" => Mod,
+        "from" => From,
+        "import" => Import,
+        "for" => For,
+        "while" => While,
+        "loop" => Loop,
+        "block" => Block,
+        "static" => Static,
+        "if" => If,
+        "when" => When,
+        "elif" => Elif,
+        "else" => Else,
+        "match" => Match,
+        "try" => Try,
+        "catch" => Catch,
+        "finally" => Finally,
+        "struct" => Struct,
+        "tuple" => Tuple,
+        "enum" => Enum,
+        "union" => Union,
+        "interface" => Interface,
+        "distinct" => Distinct,
+        "ref" => Ref,
+        "break" => Break,
+        "continue" => Continue,
+        "return" => Return,
+        "in" => In,
+        "is" => Is,
+        "of" => Of,
+        "as" => As,
+        _ => return None,
+    })
+}
+
+impl std::fmt::Display for TokenStream {
+    /// Writes the tokens back out as source text.
+    ///
+    /// Tokens are printed back to back, with two context-dependent adjustments:
+    ///
+    /// * A single space separates two adjacent word-like tokens (keywords, words and
+    ///   numbers), so `let x` is not fused into `letx`.
+    /// * `TupleLeftParen` / `ArrayLeftBracket` normally print with a leading space.
+    ///   At the start of the output or directly after an `Indent` that space is
+    ///   dropped, because it would otherwise read back as one extra column of
+    ///   indentation.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        use Token::*;
+        // Borrow the previous token instead of cloning every token just to remember it.
+        let mut prev: Option<&Token> = None;
+        for token in &self.0 {
+            match (prev, token) {
+                (Some(Key(_) | Word(_) | Num(_)), Key(_) | Word(_) | Num(_)) => write!(f, " {}", token)?,
+                (None | Some(Indent(_)), Sep(Punctuation::TupleLeftParen)) => f.write_str("(")?,
+                (None | Some(Indent(_)), Sep(Punctuation::ArrayLeftBracket)) => f.write_str("[")?,
+                _ => write!(f, "{}", token)?,
+            }
+            prev = Some(token);
+        }
+        Ok(())
+    }
+}
+
+impl std::fmt::Display for Token {
+    /// Prints a single token as source text. An `Indent` becomes a line break
+    /// followed by that many spaces.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Token::Key(keyword) => write!(f, "{}", keyword),
+            // Words and numbers both carry their text verbatim.
+            Token::Word(text) | Token::Num(text) => f.write_str(text),
+            Token::Lit(literal) => write!(f, "{}", literal),
+            Token::Sep(mark) => write!(f, "{}", mark),
+            // Padding an empty string to `width` yields exactly `width` spaces.
+            Token::Indent(depth) => write!(f, "\n{:width$}", "", width = *depth),
+        }
+    }
+}
+
+impl std::fmt::Display for Literal {
+    /// Prints the literal's text wrapped in the delimiters of its kind.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // (opening delimiter, stored text, closing delimiter)
+        let (open, text, close) = match self {
+            Literal::Char(text) => ("'", text, "'"),
+            Literal::SingleLineString(text) => ("\"", text, "\""),
+            Literal::MultiLineString(text) => ("\"\"\"", text, "\"\"\""),
+            Literal::Comment(text) => ("#", text, ""),
+            Literal::DocComment(text) => ("##", text, ""),
+            Literal::MultiLineComment(text) => ("#[", text, "]#"),
+        };
+        f.write_str(open)?;
+        f.write_str(text)?;
+        f.write_str(close)
+    }
+}
+
+impl std::fmt::Display for Keyword {
+    /// Prints the keyword exactly as it is spelled in source code.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let spelling = match self {
+            Keyword::Pub => "pub",
+            Keyword::Let => "let",
+            Keyword::Var => "var",
+            Keyword::Const => "const",
+            Keyword::Func => "func",
+            Keyword::Macro => "macro",
+            Keyword::Type => "type",
+            Keyword::Mod => "mod",
+            Keyword::From => "from",
+            Keyword::Import => "import",
+            Keyword::For => "for",
+            Keyword::While => "while",
+            Keyword::Loop => "loop",
+            Keyword::Block => "block",
+            Keyword::Static => "static",
+            Keyword::If => "if",
+            Keyword::When => "when",
+            Keyword::Elif => "elif",
+            Keyword::Else => "else",
+            Keyword::Match => "match",
+            Keyword::Try => "try",
+            Keyword::Catch => "catch",
+            Keyword::Finally => "finally",
+            Keyword::Struct => "struct",
+            Keyword::Tuple => "tuple",
+            Keyword::Enum => "enum",
+            Keyword::Union => "union",
+            Keyword::Interface => "interface",
+            Keyword::Distinct => "distinct",
+            Keyword::Ref => "ref",
+            Keyword::Break => "break",
+            Keyword::Continue => "continue",
+            Keyword::Return => "return",
+            Keyword::In => "in",
+            Keyword::Is => "is",
+            Keyword::Of => "of",
+            Keyword::As => "as",
+        };
+        f.write_str(spelling)
+    }
+}
+impl std::fmt::Display for Punctuation {
+    /// Prints the punctuation mark as source text.
+    ///
+    /// The tuple/array openers carry a leading space and `Minus` a trailing one.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let symbol = match self {
+            Punctuation::Comma => ",",
+            Punctuation::Period => ".",
+            Punctuation::Semicolon => ";",
+            Punctuation::Colon => ":",
+            Punctuation::BackTick => "`",
+            Punctuation::SingleQuote => "'",
+            Punctuation::DoubleQuote => "\"",
+            Punctuation::FuncLeftParen => "(",
+            Punctuation::FuncRightParen => ")",
+            Punctuation::TupleLeftParen => " (",
+            Punctuation::TupleRightParen => ")",
+            Punctuation::GenericLeftBracket => "[",
+            Punctuation::GenericRightBracket => "]",
+            Punctuation::ArrayLeftBracket => " [",
+            Punctuation::ArrayRightBracket => "]",
+            Punctuation::StructLeftBrace => "{",
+            Punctuation::StructRightBrace => "}",
+            Punctuation::Equals => "=",
+            Punctuation::Plus => "+",
+            Punctuation::Minus => "- ",
+            Punctuation::Negative => "-",
+            Punctuation::Times => "*",
+            Punctuation::Slash => "/",
+            Punctuation::LessThan => "<",
+            Punctuation::GreaterThan => ">",
+            Punctuation::At => "@",
+            Punctuation::Sha => "$",
+            Punctuation::Tilde => "~",
+            Punctuation::And => "&",
+            Punctuation::Percent => "%",
+            Punctuation::Or => "|",
+            Punctuation::Exclamation => "!",
+            Punctuation::Question => "?",
+            Punctuation::Caret => "^",
+            Punctuation::Backslash => "\\",
+        };
+        f.write_str(symbol)
+    }
+}