compiler: implement a borrow-checker-free tree and begin lexing

author: JJ 2023-07-18 05:11:40 +0000
committer: JJ 2023-07-18 06:48:33 +0000
commit: fbfeffad5d85b73102a0f4e5ed607c33319f09e6 (patch)
tree: 6c2a9e53e838cbcc78a3638ca7b9347c88853f0c /src/lex.rs
parent: 4a3796c1ff54b44b1493f3afed18d0dd71b5f19f (diff)
1 files changed, 121 insertions, 0 deletions
diff --git a/src/lex.rs b/src/lex.rs
new file mode 100644
index 0000000..7c0cc21
--- /dev/null
+++ b/src/lex.rs
@@ -0,0 +1,121 @@
+use std::todo;
+use crate::tree::Tree;
+use multipeek::multipeek;
+
+/// **Basic** syntax tokens.
+///
+/// These are the flat building blocks emitted by `tokenize`; no keyword,
+/// operator or numeric classification happens at this level.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum Token {
+    /// An identifier-like word (letters, digits and underscores).
+    Word(String),
+    /// The raw text between the delimiters of a string literal, character
+    /// literal or comment. The delimiters themselves are emitted as `Sep`.
+    Lit(String),
+    /// A single delimiter, punctuation mark or otherwise unclassified character.
+    Sep(char),
+}
+
+/// Parses whitespace-sensitive code into an unambiguous syntax tree.
+/// Also useful for formatting.
+///
+/// All tokens are currently appended to the root node of the returned tree:
+///
+/// * `'...'` and `"..."` literals become `Sep(quote)`, `Lit(contents)`, `Sep(quote)`.
+///   A backslash is kept verbatim together with the character it escapes, so an
+///   escaped quote does not terminate the literal.
+/// * `"""..."""` literals become three `Sep('"')`, `Lit(contents)`, three `Sep('"')`.
+/// * `#[ ... ]#` block comments (which may nest) become `Sep('#')`, `Sep('[')`,
+///   `Lit(contents)`, `Sep(']')`, `Sep('#')`.
+/// * `# ...` line comments become `Sep('#')`, `Lit(rest of line)`.
+/// * Runs of ASCII letters, digits and `_` become a single `Word`.
+/// * Every other character becomes a `Sep`.
+///
+/// Unterminated literals and comments are not reported: they simply run to the
+/// end of the input, and the closing separators are still emitted.
+///
+/// # Panics
+///
+/// Whitespace handling is not implemented yet. A space, tab or newline outside
+/// of a literal or comment hits `todo!()`.
+pub fn tokenize(input: &str) -> Tree<Vec<Token>> {
+    use Token::*;
+    let mut buffer = String::new();
+    let mut result = Tree::new(Vec::new());
+    let ctx = result.data(result.root());
+
+    // `char` in Rust is four bytes, so iterating by `char` is fine for any input.
+    let mut input = multipeek(input.chars());
+    while let Some(c) = input.next() {
+        match c {
+            ' ' => todo!(),
+            '\n' => todo!(),
+            '\t' => todo!(),
+            '\'' => {
+                ctx.push(Sep('\''));
+                read_quoted(&mut input, '\'', &mut buffer);
+                ctx.push(Lit(buffer.clone()));
+                ctx.push(Sep('\''));
+            },
+            '"' => { // triple quoted strings and regular strings
+                // The first quote is already consumed, so two more quotes at
+                // peek offsets 0 and 1 open a triple quoted string.
+                if input.peek_nth(0) == Some(&'"') &&
+                   input.peek_nth(1) == Some(&'"') {
+                    input.next(); input.next();
+                    ctx.push(Sep('"')); ctx.push(Sep('"')); ctx.push(Sep('"'));
+                    while let Some(x) = input.next() {
+                        // Same offsets as above: `x` has been consumed, so the
+                        // rest of a closing `"""` sits at offsets 0 and 1.
+                        if x == '"' &&
+                           input.peek_nth(0) == Some(&'"') &&
+                           input.peek_nth(1) == Some(&'"') {
+                            input.next(); input.next();
+                            break;
+                        }
+                        buffer.push(x);
+                    }
+                    ctx.push(Lit(buffer.clone()));
+                    ctx.push(Sep('"')); ctx.push(Sep('"')); ctx.push(Sep('"'));
+                } else {
+                    ctx.push(Sep('"'));
+                    read_quoted(&mut input, '"', &mut buffer);
+                    ctx.push(Lit(buffer.clone()));
+                    ctx.push(Sep('"'));
+                }
+            },
+            '#' => {
+                if input.peek() == Some(&'[') {
+                    input.next();
+                    ctx.push(Sep('#')); ctx.push(Sep('['));
+                    let mut comment_level = 1;
+                    // Check the nesting level *before* pulling the next
+                    // character, so nothing after the final `]#` is swallowed.
+                    while comment_level > 0 {
+                        let Some(x) = input.next() else { break };
+                        // NOTE(review): nested `#[` / `]#` markers are dropped
+                        // from the literal text rather than preserved; confirm
+                        // that this is intended.
+                        if x == '#' && input.peek() == Some(&'[') {
+                            comment_level += 1;
+                            input.next();
+                        } else if x == ']' && input.peek() == Some(&'#') {
+                            comment_level -= 1;
+                            input.next();
+                        } else {
+                            buffer.push(x);
+                        }
+                    }
+                    ctx.push(Lit(buffer.clone()));
+                    ctx.push(Sep(']')); ctx.push(Sep('#'));
+                } else {
+                    ctx.push(Sep('#'));
+                    // `take_while` also consumes the terminating newline, so
+                    // the newline is not seen by the main loop.
+                    buffer.extend(input.by_ref().take_while(|&x| x != '\n'));
+                    ctx.push(Lit(buffer.clone()));
+                }
+            },
+            c if is_word_char(c) => {
+                // The first character was already taken off the stream.
+                buffer.push(c);
+                // Peek first so the character that ends the word stays in the
+                // stream for the next iteration of the main loop.
+                while let Some(&x) = input.peek() {
+                    if !is_word_char(x) {
+                        break;
+                    }
+                    buffer.push(x);
+                    input.next();
+                }
+                ctx.push(Word(buffer.clone()));
+            },
+            // Punctuation, brackets and anything else not recognised above.
+            _ => ctx.push(Sep(c)),
+        }
+        buffer.clear();
+    }
+    result
+}
+
+/// Returns whether `c` may appear in a `Word`: an ASCII letter, digit or `_`.
+fn is_word_char(c: char) -> bool {
+    matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')
+}
+
+/// Consumes characters up to and including the closing `quote`, appending the
+/// literal's contents (without the closing quote) to `buffer`.
+///
+/// A backslash is stored verbatim together with the character that follows it,
+/// so an escaped quote does not end the literal. Stops quietly at end of input.
+fn read_quoted(input: &mut impl Iterator<Item = char>, quote: char, buffer: &mut String) {
+    while let Some(x) = input.next() {
+        if x == quote {
+            break;
+        }
+        buffer.push(x);
+        if x == '\\' {
+            // Take the escaped character unconditionally (if there is one).
+            buffer.extend(input.next());
+        }
+    }
+}
+
+// Note: we can't have a TokenStream because whitespace is significant. So we construct the tree structure here, although we don't do anything fancy with the tokens.
author	JJ	2023-07-18 05:11:40 +0000
committer	JJ	2023-07-18 06:48:33 +0000
commit	fbfeffad5d85b73102a0f4e5ed607c33319f09e6 (patch)
tree	6c2a9e53e838cbcc78a3638ca7b9347c88853f0c /src/lex.rs
parent	4a3796c1ff54b44b1493f3afed18d0dd71b5f19f (diff)