compiler: redesign lexer. still needs unicode support & expr indentation

author: JJ 2023-10-26 08:57:33 +0000
committer: JJ 2023-10-26 08:57:33 +0000
commit: 7473fb09ed89e5e2fde8a7d6e89bc5d28151bdae (patch)
tree: fa1be73a3b87f30ca685c3a6c6f213b2c7d3ee50
parent: d5f89ba0e2fd0458f4e51e13233069583d5a89ac (diff)
2 files changed, 263 insertions, 102 deletions
diff --git a/src/lex.rs b/src/lex.rs
index 8999aea..d3d5a7e 100644
--- a/src/lex.rs
+++ b/src/lex.rs
@@ -3,41 +3,119 @@ use multipeek::multipeek;
 pub type Result<T> = core::result::Result<T, Box<dyn std::error::Error>>;
 pub type TokenStream = Vec<Token>;
 
+#[derive(Clone, PartialEq, Debug)]
+pub enum LexicalError {
+    InvalidIndentation,
+    MismatchedParens,
+    MismatchedBrackets,
+}
+
+impl std::fmt::Display for LexicalError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:?}", self)
+    }
+}
+impl std::error::Error for LexicalError {}
+
 /// **Basic** syntax tokens. Form an unambiguous TokenStream.
 #[derive(Clone, PartialEq)]
 pub enum Token {
     Word(String),   // identifiers.
-    Lit(String),    // literal value, ex. for strings/comments.
-    Sep(char),      // punctuation. non-word tokens.
-    Begin, End      // scope indicators.
+    Num(String),    // numeric value, ex. 413, 0b101011, 0xabcd
+    Lit(Literal),   // literal value, ex. for strings/comments.
+    Sep(Punctuation),   // punctuation. non-word tokens. operators are lexed as this and later transformed to words.
+    Begin, End, Newline // scope indicators. can i use trees instead? should i use trees instead?
+}
+
+#[derive(Clone, PartialEq)]
+pub enum Literal {
+    Char(String),
+    SingleLineString(String),
+    MultiLineString(String),
+    Comment(String),
+    DocComment(String),
 }
 
-/// All keywords that may continue a line. For knowing valid line splits.
-const valid_continuations: [&str; 7] = ["and", "or", "xor", "in", "notin", "is", "isnot"];
+/// All punctuation recognized by the lexer.
+/// Note the distinction between FuncLeftParen and TupleLeftParen.
+#[derive(Clone, PartialEq)]
+pub enum Punctuation {
+    Comma,               // ,
+    Period,              // .
+    Semicolon,           // ;
+    Colon,               // :
+    BackTick,            // `
+    SingleQuote,         // '
+    DoubleQuote,         // "
+    FuncLeftParen,       // (
+    FuncRightParen,      // )
+    TupleLeftParen,      // (
+    TupleRightParen,     // )
+    GenericLeftBracket,  // [
+    GenericRightBracket, // ]
+    ArrayLeftBracket,    // [
+    ArrayRightBracket,   // ]
+    StructLeftBrace,     // {
+    StructRightBrace,    // }
+    Equals,      // =
+    Plus,        // +
+    Minus,       // - as a regular operator: lexed when a space follows the dash.
+    Negative,    // - as a prefix: binds tightly, there is no space following the dash.
+    Times,       // *
+    Slash,       // /
+    LessThan,    // <
+    GreaterThan, // >
+    At,          // @
+    Sha,         // $
+    Tilde,       // ~
+    And,         // &
+    Percent,     // %
+    Or,          // |
+    Exclamation, // !
+    Question,    // ?
+    Caret,       // ^
+    Backslash,   // \
+}
 
 /// Parses whitespace-sensitive code into an unambiguous TokenStream.
 /// Also useful for formatting.
-// todo: support indentation within expressions
-// nim: "As a rule of thumb, indentation within expressions is
-// allowed after operators, an open parenthesis and after commas."
+// todo: rewrite indentation parsing to do what nim does, annotate tokens with indentation preceding
 pub fn tokenize(input: &str) -> Result<TokenStream> {
     // The design of this lexer utilizes to great extent multipeek's arbitrary peeking.
     // Tokens are matched by looping within their case until complete.
-    // This then eliminates the need for almost all global parser state.
+    // This then eliminates the need for most global parser state. (i hate state)
 
     use Token::*;
-    let mut start_of_line = true;   // state
-    let mut indent_level = 0;       // state
-    let mut indent_width = None;    // state
-    let mut buf = String::new();    // buffer
-    let mut res = Vec::new();       // result
+    use Literal::*;
+    use Punctuation::*;
+    use LexicalError::*;
+    enum Paren { Func, Tuple }
+    enum Bracket { Generic, Array }
+    struct State {
+        start_of_line: bool,
+        indent_level: isize,
+        indent_width: isize,
+        paren_stack: Vec<Paren>,
+        bracket_stack: Vec<Bracket>,
+    }
+
+    let mut state = State {
+        start_of_line: true,
+        indent_level: 0,
+        indent_width: 0,
+        paren_stack: vec!(),
+        bracket_stack: vec!(),
+    };
+
+    let mut buf = String::new();
+    let mut res = Vec::new();
 
     // `char` in rust is four bytes it's fine
     let mut input = multipeek(input.chars());
     while let Some(c) = input.next() {
         match c {
             ' ' => {
-                if start_of_line { // indentation
+                if state.start_of_line { // indentation
                     let mut current_indent_level = 1;
                     while let Some(x) = input.peek() {
                         match x {
@@ -47,38 +125,42 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
                                 // really gross. this just checks if the previous token was a newline,
                                 // and that the token before it was punctuation or a known "operator",
                                 // and if so disregards indentation and treats it as a line continuation.
-                                if let Some(&Sep('\n')) = res.get(res.len() - 1) {
-                                    if let Some(y) = res.get(res.len() - 2) {
-                                        if let Word(z) = y {
-                                            if valid_continuations.contains(&&z[..]) {
+                                if let Some(Newline) = res.get(res.len() - 1) {
+                                    if let Some(prev) = res.get(res.len() - 2) {
+                                        match prev { // all keywords and punctuation that may continue a line
+                                            // workaround for https://github.com/rust-lang/rust/issues/87121
+                                            Word(a) if a == "==" || a == "and" || a == "or" ||
+                                                       a == "xor" || a == "in" || a == "is" => {
+                                                res.pop();
+                                                break;
+                                            },
+                                            &Sep(FuncLeftParen) | Sep(GenericLeftBracket) | Sep(StructLeftBrace) |
+                                            Sep(TupleLeftParen) | Sep(ArrayLeftBracket) | Sep(Comma) => {
                                                 res.pop();
                                                 break;
                                             }
-                                        } else if let Sep(_) = y {
-                                            res.pop();
-                                            break;
+                                            _ => {}
                                         }
                                     }
                                 }
 
                                 // will only fire once. allows us to support X number of spaces so long as it's consistent
-                                if indent_width.is_none() {
-                                    indent_width = Some(current_indent_level);
+                                if state.indent_width == 0 {
+                                    state.indent_width = current_indent_level;
                                 }
 
-                                let indent_width = indent_width.unwrap(); // safe. see above
-                                if current_indent_level % indent_width != 0 {
-                                    return Err("indentation is offset".into());
+                                if current_indent_level % state.indent_width != 0 {
+                                    return Err(InvalidIndentation.into());
                                 }
 
-                                let diff = (current_indent_level as isize - indent_level as isize) / indent_width as isize;
+                                let diff = (current_indent_level - state.indent_level) / state.indent_width;
                                 match diff {
                                     0 => (),                // same level of indentation
                                     1 => res.push(Begin),   // new level of indentation
                                     -1 => res.push(End),    // old level of indentation
-                                    _ => return Err("indentation stepped by too much in one go".into())
+                                    _ => return Err(InvalidIndentation.into()) // todo: support indentation in exprs
                                 }
-                                indent_level = current_indent_level;
+                                state.indent_level = current_indent_level;
                                 break;
                             }
                         }
@@ -87,13 +169,12 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
                     while input.peek() == Some(&' ') { input.next(); }
                 }
             },
-            '\n' => { // newlines are separators
-                start_of_line = true;
-                res.push(Sep('\n'))
+            '\t' => return Err(InvalidIndentation.into()),
+            '\n' => {
+                state.start_of_line = true;
+                res.push(Newline)
             },
-            c if c.is_whitespace() => return Err("tabs etc are not supported".into()),
-            '\'' => { // single quoted strings, i.e. chars
-                res.push(Sep('\''));
+            '\'' => { // chars!
                 while let Some(x) = input.next() {
                     match x {
                         '\'' => break,
@@ -101,88 +182,170 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
                         _ => buf.push(x)
                     }
                 }
-                res.push(Lit(String::from(&buf)));
-                res.push(Sep('\''));
+                res.push(Lit(Char(String::from(&buf))));
             },
-            '"' => { // triple quoted strings
-                if input.peek_nth(0) == Some(&'"') &&
-                   input.peek_nth(1) == Some(&'"') {
-                    input.next(); input.next();
-                    res.push(Sep('"')); res.push(Sep('"')); res.push(Sep('"'));
-                    while let Some(x) = input.next() {
-                        match x {
-                            '"' if input.peek_nth(1) == Some(&'"') &&
-                                   input.peek_nth(2) == Some(&'"') => {
-                                break;
-                           },
-                           _ => buf.push(x)
+            '"' => { // strings!
+                match (input.peek_nth(0).copied(), input.peek_nth(1).copied()) {
+                    (Some('"'), Some('"')) => { // triple quoted strings
+                        input.next(); input.next();
+                        while let Some(x) = input.next() {
+                            match x {
+                                '"' if input.peek_nth(1) == Some(&'"') &&
+                                       input.peek_nth(2) == Some(&'"') => {
+                                    input.next(); input.next();
+                                    break;
+                               },
+                               _ => buf.push(x)
+                            }
                         }
-                    }
-                    res.push(Lit(String::from(&buf)));
-                    input.next(); input.next();
-                    res.push(Sep('"')); res.push(Sep('"')); res.push(Sep('"'));
-                } else { // regular strings
-                    res.push(Sep('"'));
-                    while let Some(x) = input.next() {
-                        match x {
-                            '"' => break,
-                            '\\' => if let Some(y) = input.next() { buf.push(y) },
-                            _ => buf.push(x)
+                        res.push(Lit(MultiLineString(String::from(&buf))));
+                    },
+                    (_, _) => { // regular strings, delimited by one double quote
+                        while let Some(x) = input.next() {
+                            match x {
+                                '"' => break,
+                                '\\' => if let Some(y) = input.next() { buf.push(y) },
+                                _ => buf.push(x)
+                            }
                         }
+                        res.push(Lit(SingleLineString(String::from(&buf))));
                     }
-                    res.push(Lit(String::from(&buf)));
-                    res.push(Sep('"'));
                 }
             },
-            '#' => { // block comment, can be nested
-                if input.peek() == Some(&'[') {
-                    input.next();
-                    res.push(Sep('#')); res.push(Sep('['));
-                    let mut comment_level = 1;
-                    while let Some(x) = input.next() && comment_level > 0 {
-                        match x {
-                            '#' if input.peek() == Some(&'[') => {
-                                comment_level += 1;
-                                input.next();
-                            },
-                            ']' if input.peek() == Some(&'#') => {
-                                comment_level -= 1;
-                                input.next();
-                            },
-                            _ => buf.push(x)
+            '#' => { // comments!
+                match input.peek() {
+                    Some('[') => { // block comment, can be nested
+                        input.next();
+                        let mut comment_level = 1;
+                        while let Some(x) = input.next() && comment_level > 0 {
+                            match x {
+                                '#' if input.peek() == Some(&'[') => {
+                                    comment_level += 1;
+                                    input.next();
+                                },
+                                ']' if input.peek() == Some(&'#') => {
+                                    comment_level -= 1;
+                                    input.next();
+                                },
+                                _ => buf.push(x)
+                            }
                         }
-                    }
-                    res.push(Lit(String::from(&buf)));
-                    res.push(Sep(']')); res.push(Sep('#'));
-                } else { // standard comment, runs until eol
-                    res.push(Sep('#'));
-                    while let Some(x) = input.peek() {
-                        match x {
-                            '\n' => break,
-                            _ => {
-                                buf.push(*x);
-                                input.next();
+                        res.push(Lit(Comment(String::from(&buf))));
+                    },
+                    Some(&'#') => { // documentation comment
+                        input.next();
+                        while let Some(x) = input.next() {
+                            match x {
+                                '\n' => break,
+                                _ => {
+                                    buf.push(x);
+                                }
+                            }
+                        }
+                        res.push(Lit(DocComment(String::from(&buf))));
+                    },
+                    _ => { // standard comment, runs til EOL
+                        while let Some(x) = input.next() {
+                            match x {
+                                '\n' => break,
+                                _ => {
+                                    buf.push(x);
+                                }
                             }
                         }
+                        res.push(Lit(Comment(String::from(&buf))));
                     }
-                    res.push(Lit(String::from(&buf)));
                 }
             },
-            'a'..'z' | 'A'..'Z' | '0'..'9' | '_' => { // valid identifier
-                while let Some(x) = input.peek() {
+            'a'..='z' | 'A'..='Z' | '_' => { // valid identifiers!
+                buf.push(c); // todo: unicode support
+                while let Some(x) = input.next() {
                     match x {
-                        'a'..'z' | 'A'..'Z' | '0'..'9' | '_' => {
-                            buf.push(*x);
+                        'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => {
+                            buf.push(x);
+                        },
+                        _ => {
+                            res.push(Word(String::from(&buf)));
+                            match x { // () and [] denote both parameters/generics and tuples/arrays
+                                '(' => { // we must disambiguate by treating those *directly* after words as such
+                                    res.push(Sep(FuncLeftParen));
+                                    state.paren_stack.push(Paren::Func);
+                                },
+                                '[' => {
+                                    res.push(Sep(GenericLeftBracket));
+                                    state.bracket_stack.push(Bracket::Generic);
+                                },
+                                _ => {},
+                            }
+                            break;
+                        }
+                    }
+                }
+            },
+            '0'..='9' => { // numeric literals!
+                buf.push(c);
+                while let Some(x) = input.next() {
+                    match x { // todo: unicode support
+                        'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => {
+                            buf.push(x);
                             input.next();
                         },
                         _ => break
                     }
                 }
-                res.push(Word(String::from(&buf)));
+                res.push(Num(String::from(&buf)))
+            },
+            '-' => { // `-` is special. it can be the *prefix* operator "Negative", or part of a regular operator.
+                match input.peek() {
+                    Some(' ') => res.push(Sep(Minus)),
+                    _ => res.push(Sep(Negative))
+                }
+            },
+            '(' => { // note: FuncParens were matched above, directly after identifiers
+                res.push(Sep(TupleLeftParen));
+                state.paren_stack.push(Paren::Tuple);
+            },
+            '[' => { // note: GenericBrackets were matched above, directly after identifiers
+                res.push(Sep(ArrayLeftBracket));
+                state.bracket_stack.push(Bracket::Array);
+            },
+            ')' => {
+                match state.paren_stack.pop() {
+                    Some(Paren::Func) => res.push(Sep(FuncRightParen)),
+                    Some(Paren::Tuple) => res.push(Sep(TupleRightParen)),
+                    None => return Err(MismatchedParens.into()),
+                }
+            },
+            ']' => {
+                match state.bracket_stack.pop() {
+                    Some(Bracket::Generic) => res.push(Sep(GenericRightBracket)),
+                    Some(Bracket::Array) => res.push(Sep(ArrayRightBracket)),
+                    None => return Err(MismatchedBrackets.into()),
+                }
             },
-            '.' | ',' | ':' | ';' | // punctuation
-            '(' | ')' | '[' | ']' | '{' | '}' => res.push(Sep(c)),
-            _ => res.push(Sep(c))   // for now: treat unknown chars as Sep
+            ',' => res.push(Sep(Comma)),
+            '.' => res.push(Sep(Period)),
+            ';' => res.push(Sep(Semicolon)),
+            ':' => res.push(Sep(Colon)),
+            '`' => res.push(Sep(BackTick)),
+            '{' => res.push(Sep(StructLeftBrace)),
+            '}' => res.push(Sep(StructRightBrace)),
+            '=' => res.push(Sep(Equals)),
+            '+' => res.push(Sep(Plus)),
+            '*' => res.push(Sep(Times)),
+            '/' => res.push(Sep(Slash)),
+            '<' => res.push(Sep(LessThan)),
+            '>' => res.push(Sep(GreaterThan)),
+            '@' => res.push(Sep(At)),
+            '$' => res.push(Sep(Sha)),
+            '~' => res.push(Sep(Tilde)),
+            '&' => res.push(Sep(And)),
+            '|' => res.push(Sep(Or)),
+            '!' => res.push(Sep(Exclamation)),
+            '?' => res.push(Sep(Question)),
+            '^' => res.push(Sep(Caret)),
+            '\\' => res.push(Sep(Backslash)),
+            _ => return Err("unknown character".into()) // todo: support unicode!
         }
         buf.clear();
     }
diff --git a/src/main.rs b/src/main.rs
index c366367..5cbdf00 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -2,8 +2,6 @@
 #![feature(exclusive_range_pattern, let_chains)]
 
 mod lex;
-// mod parse;
-// mod check;
 mod tree;
 
-fn main() { }
+fn main() {}
author	JJ	2023-10-26 08:57:33 +0000
committer	JJ	2023-10-26 08:57:33 +0000
commit	7473fb09ed89e5e2fde8a7d6e89bc5d28151bdae (patch)
tree	fa1be73a3b87f30ca685c3a6c6f213b2c7d3ee50
parent	d5f89ba0e2fd0458f4e51e13233069583d5a89ac (diff)